diff src/goodjava/html/Html.java @ 1712:36c28be6d432

improve html and bbcode
author Franklin Schmidt <fschmidt@gmail.com>
date Thu, 14 Jul 2022 22:14:21 -0600
parents
children 31a82b0d0a87
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/goodjava/html/Html.java	Thu Jul 14 22:14:21 2022 -0600
@@ -0,0 +1,288 @@
+package goodjava.html;
+
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.HashMap;
+import java.util.HashSet;
+import java.util.List;
+import java.util.Locale;
+import java.util.Map;
+import java.util.Set;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+import goodjava.parser.Parser;
+
+
+public final class Html {
+
+	private static final Pattern entityPtn = Pattern.compile(
+		"&(#?[0-9a-zA-Z]+;)"
+	);
+
+	public static String encode(String s) {
+		//s = s.replace("&","&amp;");
+		s = entityPtn.matcher(s).replaceAll("&amp;$1");
+		s = s.replace("<","&lt;");
+		s = s.replace(">","&gt;");
+		s = s.replace("\"","&quot;");
+		return s;
+	}
+
+	/**
+	 * Matches the character references that {@link #decode(String)} understands:
+	 * decimal numeric references (group 1), hexadecimal numeric references
+	 * (group 2) and the named entities nbsp, quot, gt, lt and amp (group 3).
+	 */
+	private static final Pattern entityDecodePtn = Pattern.compile(
+		"&(?:#(\\d+)|#[xX]([0-9a-fA-F]+)|(nbsp|quot|gt|lt|amp));"
+	);
+
+	/**
+	 * Replaces HTML character references in a string with the characters they stand for.
+	 * <p>
+	 * Handles decimal ({@code &#38;}) and hexadecimal ({@code &#x26;}) numeric
+	 * references plus {@code &nbsp;} (decoded to an ordinary space),
+	 * {@code &quot;}, {@code &gt;}, {@code &lt;} and {@code &amp;}.
+	 * The input is scanned exactly once, so the output of one replacement is
+	 * never decoded a second time: {@code &#38;lt;} becomes {@code &lt;}, not {@code <}.
+	 * Numeric references that do not denote a valid Unicode code point are left unchanged.
+	 *
+	 * @param s the text to decode
+	 * @return the decoded text; {@code s} itself when it contains no recognized reference
+	 */
+	public static String decode(String s) {
+		Matcher m = entityDecodePtn.matcher(s);
+		if( !m.find() )
+			return s;
+		StringBuffer buf = new StringBuffer();
+		do {
+			// quoteReplacement keeps '$' and '\' in the decoded text from being
+			// interpreted as group references / escapes by appendReplacement
+			m.appendReplacement(buf,Matcher.quoteReplacement(decodeEntity(m)));
+		} while( m.find() );
+		m.appendTail(buf);
+		return buf.toString();
+	}
+
+	/** Returns the text for the reference just matched by entityDecodePtn, or the reference itself if it cannot be decoded. */
+	private static String decodeEntity(Matcher m) {
+		String name = m.group(3);
+		if( name != null ) {
+			if( name.equals("nbsp") )
+				return " ";
+			if( name.equals("quot") )
+				return "\"";
+			if( name.equals("gt") )
+				return ">";
+			if( name.equals("lt") )
+				return "<";
+			return "&";  // the only remaining alternative is "amp"
+		}
+		try {
+			String decimal = m.group(1);
+			int codePoint = decimal != null
+				? Integer.parseInt(decimal)
+				: Integer.parseInt(m.group(2),16);
+			// toChars yields a surrogate pair for code points above U+FFFF
+			if( Character.isValidCodePoint(codePoint) )
+				return new String(Character.toChars(codePoint));
+		} catch(NumberFormatException e) {
+			// more digits than fit in an int, so this cannot be a code point
+		}
+		return m.group();
+	}
+
+
+	/** An HTML comment found while parsing. */
+	public static final class Comment {
+		/** The comment body: the text following the opening {@code <!--} up to the closing delimiter. */
+		public final String text;
+
+		private Comment(String body) {
+			text = body;
+		}
+	}
+
+	/** A CDATA section found while parsing. */
+	public static final class CData {
+		/** The section content: the text following the opening {@code <![CDATA[} up to the closing delimiter. */
+		public final String text;
+
+		private CData(String content) {
+			text = content;
+		}
+	}
+
+	/** A single start, end or self-closing tag. */
+	public static final class Tag {
+		/** Lower-cased tag name; an end tag keeps its leading '/' (for example "/div"). */
+		public final String name;
+		/** Lower-cased attribute names mapped to the decoded String value, or to {@code true} when the attribute has no value. */
+		public final Map<String,Object> attributes;
+		/** True when the tag is closed with "/>". */
+		public final boolean isEmpty;
+		/** The tag exactly as it appeared in the source text, from '<' through '>'. */
+		public final String raw;
+
+		private Tag(String tagName,Map<String,Object> attrs,boolean selfClosing,String source) {
+			name = tagName;
+			attributes = attrs;
+			isEmpty = selfClosing;
+			raw = source;
+		}
+	}
+
+	/** A container tag (such as script) together with its unparsed content. */
+	public static final class Container {
+		/** The opening tag of the container. */
+		public final Tag tag;
+		/** The raw source text between the opening tag and the matching end tag. */
+		public final String text;
+
+		private Container(Tag openTag,String content) {
+			tag = openTag;
+			text = content;
+		}
+	}
+
+	/** Tag names treated as containers when parse(String) is called without an explicit set. */
+	private static Set<String> defaultContainerTags = new HashSet<String>();
+	static {
+		defaultContainerTags.add("script");
+		defaultContainerTags.add("style");
+		defaultContainerTags.add("textarea");
+	}
+
+	public static List parse(String text) {
+		return parse(text,defaultContainerTags);
+	}
+
+	public static List parse(String text,Set<String> containerTags) {
+		return new Html(text,containerTags).parse();
+	}
+
+	/** Cursor over the text being parsed. */
+	private final Parser parser;
+	/** Names of the tags whose content is captured raw as a Container. */
+	private final Set<String> containerTags;
+
+	/** Creates a parser instance for one piece of text; use the static parse methods. */
+	private Html(String text,Set<String> containerTags) {
+		this.containerTags = containerTags;
+		this.parser = new Parser(text);
+	}
+
+	/**
+	 * Walks the whole input and splits it into text and elements.
+	 *
+	 * @return a list holding, in source order, decoded text Strings and
+	 *         Tag, Container, Comment and CData objects
+	 */
+	private List parse() {
+		List list = new ArrayList();
+		StringBuilder sb = new StringBuilder();
+		while( !parser.endOfInput() ) {
+			if( parser.test('<') ) {
+				Object el = parseElement();
+				if( el != null ) {
+					add(list,sb);
+					list.add(el);
+					continue;
+				}
+			}
+			// not the start of anything recognized: plain text
+			sb.append( parser.currentChar() );
+			parser.anyChar();
+		}
+		add(list,sb);
+		return list;
+	}
+
+	/**
+	 * Parses the element starting at the current '<'.
+	 * A tag is tried first (upgraded to a Container when its name is a
+	 * container tag and the end tag is found), then a comment, then a CDATA
+	 * section.  The comment and CDATA attempts must run when no tag matched,
+	 * because "<!" can never start a tag.
+	 *
+	 * @return the parsed element, or null if the text here is none of these
+	 */
+	private Object parseElement() {
+		Tag tag = parseTag();
+		if( tag != null ) {
+			if( containerTags.contains(tag.name) ) {
+				Container container = parseContainer(tag);
+				if( container != null )
+					return container;
+			}
+			return tag;
+		}
+		Object el = parseComment();
+		if( el == null )
+			el = parseCdata();
+		return el;
+	}
+
+	/** Flushes pending text, if any, into the list as one decoded String and clears the buffer. */
+	private static void add(List list,StringBuilder sb) {
+		if( sb.length() > 0 ) {
+			list.add(decode(sb.toString()));
+			sb.setLength(0);
+		}
+	}
+
+	/** Parses {@code <!-- ... -->} at the current position; returns null if there is no complete comment here. */
+	private Comment parseComment() {
+		String text = parseDelimited("<!--","-->");
+		return text==null ? null : new Comment(text);
+	}
+
+	/** Parses {@code <![CDATA[ ... ]]>} at the current position; returns null if there is no complete section here. */
+	private CData parseCdata() {
+		String text = parseDelimited("<![CDATA[","]]>");
+		return text==null ? null : new CData(text);
+	}
+
+	/**
+	 * Parses a section that starts with {@code open} and runs up to the first
+	 * {@code close}.  The closing delimiter is consumed as well, so it does
+	 * not leak into the text that follows.
+	 *
+	 * @return the text between the delimiters, or null if the section does
+	 *         not start here or is never closed
+	 */
+	private String parseDelimited(String open,String close) {
+		parser.begin();
+		if( !parser.match(open) )
+			return parser.failure(null);
+		int start = parser.currentIndex();
+		while( !parser.test(close) ) {
+			if( !parser.anyChar() )
+				return parser.failure(null);
+		}
+		String text = parser.textFrom(start);
+		parser.match(close);
+		return parser.success(text);
+	}
+
+	/**
+	 * Captures the raw content of a container tag, starting right after its
+	 * opening tag and ending at the matching end tag, which is consumed.
+	 *
+	 * @param tag the opening tag that was just parsed
+	 * @return the container, or null if the end tag is never found
+	 */
+	private Container parseContainer(Tag tag) {
+		String endTagName = '/' + tag.name;
+		int start = parser.begin();
+		int end;
+		while(true) {
+			if( parser.test('<') ) {
+				end = parser.currentIndex();
+				Tag tag2 = parseTag();
+				// parseTag() returns null for a '<' that does not start a tag,
+				// e.g. a less-than operator inside a script
+				if( tag2 != null ) {
+					if( tag2.name.equals(endTagName) )
+						break;
+					// some other tag: parseTag() already moved past it, so do not
+					// skip an extra character (it could be the '<' of the end tag)
+					continue;
+				}
+			}
+			if( !parser.anyChar() )
+				return parser.failure(null);
+		}
+		String text = parser.text.substring(start,end);
+		Container container = new Container(tag,text);
+		return parser.success(container);
+	}
+
+	/**
+	 * Parses one tag ({@code <name attr=value ...>}, {@code </name>} or
+	 * {@code <name ... />}) at the current position.
+	 *
+	 * @return the tag, or null if the text here is not a well-formed tag
+	 */
+	private Tag parseTag() {
+		int tagStart = parser.begin();
+		if( !parser.match('<') )
+			return parser.failure(null);
+		int start = parser.currentIndex();
+		// optional slash of an end tag; it stays part of the name ("/div")
+		parser.match('/');
+		if( !matchNameChar() )
+			return parser.failure(null);
+		while( matchNameChar() );
+		// Locale.ROOT keeps lower-casing independent of the default locale
+		// (a Turkish locale would turn "I" into a dotless i)
+		String name = parser.textFrom(start).toLowerCase(Locale.ROOT);
+		Map<String,Object> attributes = new HashMap<String,Object>();
+		String attrName;
+		while( (attrName = parseAttrName()) != null ) {
+			String attrValue = parseAttrValue();
+			// an attribute without a value is stored as Boolean true
+			attributes.put( attrName, attrValue!=null ? attrValue : true );
+		}
+		while( matchSpace() );
+		boolean isEmpty = parser.match('/');
+		if( !parser.match('>') )
+			return parser.failure(null);
+		String raw = parser.textFrom(tagStart);
+		Tag tag = new Tag(name,attributes,isEmpty,raw);
+		return parser.success(tag);
+	}
+
+	/**
+	 * Parses an attribute name, which must be preceded by at least one
+	 * whitespace character.
+	 *
+	 * @return the lower-cased name, or null if no attribute starts here
+	 */
+	private String parseAttrName() {
+		parser.begin();
+		if( !matchSpace() )
+			return parser.failure(null);
+		while( matchSpace() );
+		int start = parser.currentIndex();
+		if( !matchNameChar() )
+			return parser.failure(null);
+		while( matchNameChar() );
+		// Locale.ROOT keeps lower-casing independent of the default locale
+		String name = parser.textFrom(start).toLowerCase(Locale.ROOT);
+		return parser.success(name);
+	}
+
+	private String parseAttrValue() {
+		parser.begin();
+		while( matchSpace() );
+		if( !parser.match('=') )
+			return parser.failure(null);
+		while( matchSpace() );
+		if( parser.anyOf("\"'") ) {
+			char quote = parser.lastChar();
+			int start = parser.currentIndex();
+			while( !parser.test(quote) ) {
+				if( !parser.anyChar() )
+					return parser.failure(null);
+			}
+			String value = parser.textFrom(start);
+			parser.match(quote);
+			value = decode(value);
+			return parser.success(value);
+		}
+		int start = parser.currentIndex();
+		if( !matchValueChar() )
+			return parser.failure(null);
+		while( matchValueChar() );
+		String value = parser.textFrom(start);
+		value = decode(value);
+		return parser.success(value);
+	}
+
+	/** Matches one tag/attribute name character: an ASCII letter, a digit, or one of "_.-:". */
+	private boolean matchNameChar() {
+		if( parser.inCharRange('a','z') )
+			return true;
+		if( parser.inCharRange('A','Z') )
+			return true;
+		if( parser.inCharRange('0','9') )
+			return true;
+		return parser.anyOf("_.-:");
+	}
+
+	/** Matches one character of an unquoted attribute value: anything except whitespace, quotes, '>', '/' and '='. */
+	private boolean matchValueChar() {
+		String excluded = " \t\r\n\"'>/=";
+		return parser.noneOf(excluded);
+	}
+
+	/** Matches one whitespace character: space, tab, carriage return or newline. */
+	private boolean matchSpace() {
+		String whitespace = " \t\r\n";
+		return parser.anyOf(whitespace);
+	}
+
+}