view src/goodjava/html/Html.java @ 1771:c62324841dfb

improve NumberFieldParser and add key map lucene query
author Franklin Schmidt <fschmidt@gmail.com>
date Sun, 25 Jun 2023 15:05:09 -0600
parents 31a82b0d0a87
children a045f30fa67d
line wrap: on
line source

package goodjava.html;

import java.util.List;
import java.util.ArrayList;
import java.util.Set;
import java.util.HashSet;
import java.util.Map;
import java.util.HashMap;
import java.util.Collections;
import java.util.Locale;
import java.util.regex.Pattern;
import java.util.regex.Matcher;
import goodjava.parser.Parser;


public final class Html {

	private static final Pattern entityPtn = Pattern.compile(
		"&(#?[0-9a-zA-Z]+;)"
	);

	/**
	 * Escapes a string for use in HTML text or a double-quoted attribute value.
	 * <p>
	 * Only ampersands that start something shaped like an entity reference
	 * (for example {@code &amp;} or {@code &#65;}) are rewritten to {@code &amp;};
	 * a bare {@code &} is left as is.  {@code <}, {@code >} and {@code "} are
	 * always replaced by {@code &lt;}, {@code &gt;} and {@code &quot;}.
	 *
	 * @param s the text to escape; must not be null
	 * @return the escaped text
	 */
	public static String encode(String s) {
		// Ampersands go first so the entities emitted below are not escaped again.
		String ampEscaped = entityPtn.matcher(s).replaceAll("&amp;$1");
		StringBuilder out = new StringBuilder(ampEscaped.length());
		for( int i=0; i<ampEscaped.length(); i++ ) {
			char c = ampEscaped.charAt(i);
			switch(c) {
			case '<':
				out.append("&lt;");
				break;
			case '>':
				out.append("&gt;");
				break;
			case '"':
				out.append("&quot;");
				break;
			default:
				out.append(c);
			}
		}
		return out.toString();
	}

	private static final Pattern entityNumPtn = Pattern.compile(
		"&#(\\d+);"
	);

	/**
	 * Replaces HTML entities in a string with the characters they stand for.
	 * <p>
	 * Decimal numeric references ({@code &#NNN;}) are decoded first, then the
	 * named entities {@code &nbsp;} (to a plain space), {@code &quot;},
	 * {@code &gt;}, {@code &lt;} and finally {@code &amp;}.  Because the numeric
	 * pass runs first, text produced by it is still subject to the named
	 * replacements that follow.
	 * <p>
	 * A numeric reference that does not denote a valid Unicode code point
	 * (too many digits, or above U+10FFFF) is left in the output unchanged.
	 *
	 * @param s the text to decode; must not be null
	 * @return the decoded text
	 */
	public static String decode(String s) {
		Matcher m = entityNumPtn.matcher(s);
		if( m.find() ) {
			StringBuffer buf = new StringBuffer();
			do {
				String replacement = decodeNumericEntity(m.group(1));
				if( replacement == null )
					replacement = m.group();  // not decodable, keep the original text
				// quoteReplacement is required: a decoded '$' or '\' would otherwise
				// be interpreted as a group reference or escape by appendReplacement.
				m.appendReplacement(buf,Matcher.quoteReplacement(replacement));
			} while( m.find() );
			m.appendTail(buf);
			s = buf.toString();
		}
		s = s.replace("&nbsp;"," ");
		s = s.replace("&quot;","\"");
		s = s.replace("&gt;",">");
		s = s.replace("&lt;","<");
		// "&amp;" must be last so that "&amp;lt;" decodes to "&lt;" and not "<".
		s = s.replace("&amp;","&");
		return s;
	}

	/**
	 * Converts the digits of a decimal character reference to a string.
	 *
	 * @param digits the decimal digits between "&#" and ";"
	 * @return the character(s) for that code point, or null if the digits do
	 *         not fit in an int or are not a valid Unicode code point
	 */
	private static String decodeNumericEntity(String digits) {
		int codePoint;
		try {
			codePoint = Integer.parseInt(digits);
		} catch(NumberFormatException e) {
			return null;  // more digits than an int can hold
		}
		if( !Character.isValidCodePoint(codePoint) )
			return null;
		// toChars yields a surrogate pair for code points above U+FFFF.
		return new String(Character.toChars(codePoint));
	}


	/**
	 * An HTML comment found by the parser.
	 * Instances are only created inside {@code Html}.
	 */
	public static final class Comment {
		/** The characters between the opening {@code <!--} and the closing {@code -->}. */
		public final String text;

		private Comment(String body) {
			text = body;
		}
	}

	/**
	 * A {@code <![CDATA[ ... ]]>} section found by the parser.
	 * Instances are only created inside {@code Html}.
	 */
	public static final class CData {
		/** The characters between the opening {@code <![CDATA[} and the closing {@code ]]>}. */
		public final String text;

		private CData(String body) {
			text = body;
		}
	}

	/**
	 * One opening, closing or self-closing tag as read from the source.
	 * Instances are only created inside {@code Html}.
	 */
	public static final class Tag {
		/** Lower-cased tag name; a closing tag keeps its leading {@code /} (e.g. {@code "/div"}). */
		public final String name;
		/** Lower-cased attribute name to its decoded String value, or {@code true} when the attribute has no value. */
		public final Map<String,Object> attributes;
		/** True when a {@code /} immediately precedes the closing {@code >}. */
		public final boolean isEmpty;
		/** The exact source text of the tag, from {@code <} through {@code >}. */
		public final String raw;
		/** Result of {@code Css.style} for the tag's style attribute, or null when none was parsed. */
		public final Map<String,String> style;

		// Convenience form for a tag without parsed style information.
		private Tag(String tagName,Map<String,Object> attrs,boolean selfClosing,String source) {
			this(tagName,attrs,selfClosing,source,null);
		}

		private Tag(String tagName,Map<String,Object> attrs,boolean selfClosing,String source,Map<String,String> css) {
			name = tagName;
			attributes = attrs;
			isEmpty = selfClosing;
			raw = source;
			style = css;
		}
	}

	/**
	 * A container element (by default script, style or textarea) whose content
	 * is kept as one raw string instead of being parsed into tags.
	 * Instances are only created inside {@code Html}.
	 */
	public static final class Container {
		/** The opening tag of the element. */
		public final Tag tag;
		/** The undecoded source text between the opening tag and the matching end tag. */
		public final String text;

		private Container(Tag openTag,String content) {
			tag = openTag;
			text = content;
		}
	}

	// Tags treated as containers when the caller does not supply its own set.
	private static Set<String> defaultContainerTags = new HashSet<String>();
	static {
		for( String tagName : new String[]{ "script", "style", "textarea" } ) {
			defaultContainerTags.add(tagName);
		}
	}

	/**
	 * Parses HTML using the default container tags (script, style, textarea).
	 *
	 * @param text the HTML source
	 * @return the parsed pieces in source order, as produced by {@link #parse(String,Set)}
	 */
	public static List parse(String text) {
		List pieces = parse(text,defaultContainerTags);
		return pieces;
	}

	/**
	 * Parses HTML into a flat list of pieces in source order.
	 *
	 * @param text the HTML source
	 * @param containerTags lower-case names of tags whose content should be
	 *        captured as raw text in a {@link Container} rather than parsed
	 * @return a list holding decoded text Strings together with the tag,
	 *         container and other element objects recognised by the parser
	 */
	public static List parse(String text,Set<String> containerTags) {
		Html html = new Html(text,containerTags);
		return html.parse();
	}

	// Cursor over the HTML source; every parse* and match* method advances it.
	private final Parser parser;
	// Lower-case names of tags whose content is captured raw as a Container.
	private final Set<String> containerTags;

	// One instance is created per parse call; see the static parse methods.
	private Html(String source,Set<String> rawContentTags) {
		containerTags = rawContentTags;
		parser = new Parser(source);
	}

	/**
	 * Walks the whole input and splits it into a flat list of pieces.
	 * Each piece is either a decoded text String or one of Tag, Container,
	 * Comment or CData, in source order.
	 *
	 * @return the list of pieces; never null
	 */
	private List parse() {
		List list = new ArrayList();
		StringBuilder sb = new StringBuilder();
		while( !parser.endOfInput() ) {
			if( parser.test('<') ) {
				Object el = parseElement();
				if( el != null ) {
					add(list,sb);  // flush the text collected before this element
					list.add(el);
					continue;
				}
			}
			// Not the start of anything recognised: keep the character as text.
			sb.append( parser.currentChar() );
			parser.anyChar();
		}
		add(list,sb);
		return list;
	}

	/**
	 * Tries to read one non-text element at the current '<'.
	 * A tag is tried first; only when that fails are a comment and then a
	 * CDATA section attempted, since neither of those can parse as a tag.
	 *
	 * @return a Tag, Container, Comment or CData, or null if none matches
	 */
	private Object parseElement() {
		Tag tag = parseTag();
		if( tag != null ) {
			if( containerTags.contains(tag.name) ) {
				Container container = parseContainer(tag);
				if( container != null )
					return container;
				// No matching end tag was found; fall back to the bare tag.
			}
			return tag;
		}
		Object el = parseComment();
		if( el == null )
			el = parseCdata();
		return el;
	}

	/**
	 * Appends the pending text, entity-decoded, to the list and clears the buffer.
	 * Does nothing when the buffer is empty.
	 */
	private static void add(List list,StringBuilder sb) {
		if( sb.length() > 0 ) {
			list.add(decode(sb.toString()));
			sb.setLength(0);
		}
	}

	/**
	 * Parses a comment of the form <!-- ... --> at the current position.
	 *
	 * @return the comment, or null if the input does not start with "<!--"
	 *         or no closing "-->" is found
	 */
	private Comment parseComment() {
		parser.begin();
		if( !parser.match("<!--") )
			return parser.failure(null);
		int start = parser.currentIndex();
		while( !parser.test("-->") ) {
			if( !parser.anyChar() )
				return parser.failure(null);
		}
		String text = parser.textFrom(start);
		// test() only peeks, so the terminator must be consumed explicitly;
		// otherwise "-->" would be read again as ordinary text.
		parser.match("-->");
		Comment comment = new Comment(text);
		return parser.success(comment);
	}

	/**
	 * Parses a CDATA section of the form <![CDATA[ ... ]]> at the current position.
	 *
	 * @return the section, or null if the input does not start with
	 *         "<![CDATA[" or no closing "]]>" is found
	 */
	private CData parseCdata() {
		parser.begin();
		if( !parser.match("<![CDATA[") )
			return parser.failure(null);
		int start = parser.currentIndex();
		while( !parser.test("]]>") ) {
			if( !parser.anyChar() )
				return parser.failure(null);
		}
		String text = parser.textFrom(start);
		// test() only peeks, so the terminator must be consumed explicitly.
		parser.match("]]>");
		CData cdata = new CData(text);
		return parser.success(cdata);
	}

	/**
	 * Reads the raw content of a container element up to its matching end tag.
	 * Called with the parser positioned just after the opening tag.
	 *
	 * @param tag the opening tag that was just parsed
	 * @return the container holding the undecoded text between the opening
	 *         tag and the end tag, or null if the input ends before a
	 *         matching end tag is found
	 */
	private Container parseContainer(Tag tag) {
		// parseTag keeps the leading '/' in the name of a closing tag.
		String endTagName = '/' + tag.name;
		int start = parser.begin();
		int end;
		while(true) {
			if( parser.test('<') ) {
				end = parser.currentIndex();
				Tag tag2 = parseTag();
				// tag2 is null for a '<' that does not start a tag (e.g. "a<b" in a script).
				if( tag2 != null ) {
					if( tag2.name.equals(endTagName) )
						break;
					// Some other tag was consumed whole; do not also skip the
					// character after it, which may be the '<' of the end tag.
					continue;
				}
			}
			if( !parser.anyChar() )
				return parser.failure(null);
		}
		String text = parser.text.substring(start,end);
		Container container = new Container(tag,text);
		return parser.success(container);
	}

	/**
	 * Parses one tag (opening, closing or self-closing) at the current position.
	 * <p>
	 * The tag name is lower-cased and, for a closing tag, keeps its leading '/'.
	 * Attribute names are mapped to their decoded value, or to {@code true}
	 * when the attribute has no value.  The first style attribute that has a
	 * value is additionally parsed with {@code Css.style}.
	 *
	 * @return the tag, or null if the input at this position is not a
	 *         well-formed tag
	 */
	private Tag parseTag() {
		int tagStart = parser.begin();
		if( !parser.match('<') )
			return parser.failure(null);
		int start = parser.currentIndex();
		parser.match('/');  // optional: marks a closing tag and stays part of the name
		if( !matchNameChar() )
			return parser.failure(null);
		while( matchNameChar() );
		// Locale.ROOT keeps the result locale-independent: under a Turkish
		// default locale "SCRIPT" would otherwise become "scr\u0131pt".
		String name = parser.textFrom(start).toLowerCase(Locale.ROOT);
		Map<String,Object> attributes = new HashMap<String,Object>();
		String attrName;
		Map<String,String> style = null;
		while( (attrName = parseAttrName()) != null ) {
			String attrValue = parseAttrValue();
			attributes.put( attrName, attrValue!=null ? attrValue : true );
			if( attrName.equals("style") && attrValue!=null && style==null ) {
				style = Css.style(attrValue);
			}
		}
		while( matchSpace() );
		boolean isEmpty = parser.match('/');
		if( !parser.match('>') )
			return parser.failure(null);
		String raw = parser.textFrom(tagStart);
		Tag tag = new Tag(name,attributes,isEmpty,raw,style);
		return parser.success(tag);
	}

	/**
	 * Parses an attribute name, which must be preceded by at least one
	 * whitespace character.
	 *
	 * @return the lower-cased attribute name, or null if no attribute
	 *         starts at the current position
	 */
	private String parseAttrName() {
		parser.begin();
		if( !matchSpace() )
			return parser.failure(null);
		while( matchSpace() );
		int start = parser.currentIndex();
		if( !matchNameChar() )
			return parser.failure(null);
		while( matchNameChar() );
		// Locale.ROOT keeps the result locale-independent: under a Turkish
		// default locale "TITLE" would otherwise become "t\u0131tle".
		String name = parser.textFrom(start).toLowerCase(Locale.ROOT);
		return parser.success(name);
	}

	/**
	 * Parses the "= value" part that may follow an attribute name.
	 * The value is either quoted with ' or " (and then runs to the next
	 * occurrence of the same quote) or an unquoted run of value characters.
	 *
	 * @return the entity-decoded value, or null if there is no '=' here,
	 *         a quoted value is never closed, or an unquoted value is empty
	 */
	private String parseAttrValue() {
		parser.begin();
		while( matchSpace() );
		if( !parser.match('=') )
			return parser.failure(null);
		while( matchSpace() );
		final String rawValue;
		if( parser.anyOf("\"'") ) {
			// Quoted: everything up to the same quote character belongs to the value.
			final char delimiter = parser.lastChar();
			final int from = parser.currentIndex();
			for(;;) {
				if( parser.test(delimiter) )
					break;
				if( !parser.anyChar() )
					return parser.failure(null);
			}
			rawValue = parser.textFrom(from);
			parser.match(delimiter);  // step over the closing quote
		} else {
			// Unquoted: needs at least one value character.
			final int from = parser.currentIndex();
			if( !matchValueChar() )
				return parser.failure(null);
			while( matchValueChar() );
			rawValue = parser.textFrom(from);
		}
		return parser.success(decode(rawValue));
	}

	/**
	 * Tries to match one character allowed in a tag or attribute name:
	 * an ASCII letter, an ASCII digit, or one of '_', '.', '-', ':'.
	 *
	 * @return true if such a character was matched
	 */
	private boolean matchNameChar() {
		if( parser.inCharRange('a','z') )
			return true;
		if( parser.inCharRange('A','Z') )
			return true;
		if( parser.inCharRange('0','9') )
			return true;
		return parser.anyOf("_.-:");
	}

	/**
	 * Tries to match one character of an unquoted attribute value, i.e. any
	 * character other than whitespace, a quote, '>', '/' or '='.
	 *
	 * @return true if such a character was matched
	 */
	private boolean matchValueChar() {
		final String excluded = " \t\r\n\"'>/=";
		return parser.noneOf(excluded);
	}

	/**
	 * Tries to match one whitespace character: space, tab, CR or LF.
	 *
	 * @return true if a whitespace character was matched
	 */
	private boolean matchSpace() {
		final String whitespace = " \t\r\n";
		return parser.anyOf(whitespace);
	}

}