diff core/src/luan/modules/parsers/Html.java @ 625:a3c1e11fb6aa

rewrite much of Html to be more understandable; add Lucene html_highlighter();
author Franklin Schmidt <fschmidt@gmail.com>
date Tue, 12 Jan 2016 23:52:56 -0700
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/core/src/luan/modules/parsers/Html.java	Tue Jan 12 23:52:56 2016 -0700
@@ -0,0 +1,197 @@
+package luan.modules.parsers;
+
+import java.util.List;
+import java.util.ArrayList;
+import java.util.Set;
+import java.util.HashSet;
+import luan.LuanTable;
+
+
+/**
+ * Lenient HTML tokenizer that turns a string into a flat Luan list.
+ *
+ * Each element of the resulting list is either a String (a run of plain text)
+ * or a LuanTable whose "type" entry is one of:
+ *   "tag"       - fields "name", "attributes", "is_empty"
+ *   "comment"   - field "text" (the content between the comment delimiters)
+ *   "cdata"     - field "text" (the content between the CDATA delimiters)
+ *   "container" - fields "tag" (the opening tag table) and "text" (the raw
+ *                 content up to, but not including, the matching end tag)
+ *
+ * Anything that cannot be parsed as one of these is kept as text.
+ *
+ * NOTE(review): this class relies on Parser.begin()/success()/failure()
+ * bracketing each attempt, with failure() presumably restoring the position
+ * saved by begin(), and on test() peeking without consuming while match()
+ * consumes - confirm against Parser.
+ */
+public final class Html {
+
+	/**
+	 * Tokenizes text into a Luan list of strings and tables.
+	 *
+	 * @param text the HTML source to tokenize
+	 * @param containerTagsTbl list of tag names whose content is captured raw
+	 *        as a "container" element; names are compared against the
+	 *        lower-cased tag name, so entries should be lower case.  May be
+	 *        null, meaning no container tags.
+	 * @return a LuanTable built from the list of tokens
+	 * @throws ParseException declared for the underlying parser
+	 */
+	public static LuanTable toList(String text,LuanTable containerTagsTbl) throws ParseException {
+		return new Html(text,containerTagsTbl).parse();
+	}
+
+	private final Parser parser;
+	private final Set<String> containerTags = new HashSet<String>();
+
+	private Html(String text,LuanTable containerTagsTbl) {
+		this.parser = new Parser(text);
+		if( containerTagsTbl != null ) {
+			for( Object v : containerTagsTbl.asList() ) {
+				containerTags.add((String)v);
+			}
+		}
+	}
+
+	/**
+	 * Main loop: at every '<' try a tag (promoted to a container when its name
+	 * is a container tag), then a comment, then a CDATA section.  Characters
+	 * that start none of these are accumulated as text.
+	 */
+	private LuanTable parse() throws ParseException {
+		List<Object> list = new ArrayList<Object>();
+		StringBuilder sb = new StringBuilder();
+		while( !parser.endOfInput() ) {
+			if( parser.test('<') ) {
+				LuanTable tbl = parseTag();
+				if( tbl != null ) {
+					String tagName = (String)tbl.rawGet("name");
+					if( containerTags.contains(tagName) ) {
+						// no matching end tag => keep the plain tag
+						LuanTable container = parseContainer(tbl);
+						if( container != null )
+							tbl = container;
+					}
+				} else {
+					// The fallbacks must run exactly when parseTag() failed;
+					// nesting them under "tbl != null" would make them dead code.
+					tbl = parseComment();
+					if( tbl == null )
+						tbl = parseCdata();
+				}
+				if( tbl != null ) {
+					// flush pending text so list order follows source order
+					if( sb.length() > 0 ) {
+						list.add(sb.toString());
+						sb.setLength(0);
+					}
+					list.add(tbl);
+					continue;
+				}
+			}
+			sb.append( parser.currentChar() );
+			parser.anyChar();
+		}
+		if( sb.length() > 0 )
+			list.add(sb.toString());
+		return new LuanTable(list);
+	}
+
+	/** Parses "<!--" ... "-->" into a "comment" table, or returns null. */
+	private LuanTable parseComment() {
+		return parseDelimited("<!--","-->","comment");
+	}
+
+	/** Parses "<![CDATA[" ... "]]>" into a "cdata" table, or returns null. */
+	private LuanTable parseCdata() {
+		return parseDelimited("<![CDATA[","]]>","cdata");
+	}
+
+	/**
+	 * Parses a section bracketed by fixed delimiters.
+	 *
+	 * @param open the opening delimiter, which must match at the current position
+	 * @param close the closing delimiter; if it never occurs the parse fails
+	 * @param type value stored under "type" in the resulting table
+	 * @return a table with "type" and "text" (content between the delimiters),
+	 *         or null if the section is absent or unterminated
+	 */
+	private LuanTable parseDelimited(String open,String close,String type) {
+		parser.begin();
+		if( !parser.match(open) )
+			return parser.failure(null);
+		int start = parser.currentIndex();
+		while( !parser.test(close) ) {
+			if( !parser.anyChar() )
+				return parser.failure(null);
+		}
+		String text = parser.textFrom(start);
+		// consume the closing delimiter so it does not leak into the following text
+		parser.match(close);
+		LuanTable tbl = new LuanTable();
+		tbl.rawPut("type",type);
+		tbl.rawPut("text",text);
+		return parser.success(tbl);
+	}
+
+	/**
+	 * Having just parsed the opening tag of a container element, scans forward
+	 * for the matching end tag and captures everything before it as raw text.
+	 *
+	 * @param tag the already parsed opening tag
+	 * @return a "container" table, or null if no matching end tag is found
+	 */
+	private LuanTable parseContainer(LuanTable tag) {
+		String endTagName = '/' + (String)tag.rawGet("name");
+		int start = parser.begin();
+		int end;
+		while(true) {
+			if( parser.test('<') ) {
+				end = parser.currentIndex();
+				LuanTable tag2 = parseTag();
+				if( tag2 != null ) {
+					if( endTagName.equals(tag2.rawGet("name")) )
+						break;
+					// parseTag() already consumed this tag; advancing again
+					// could swallow the '<' of an end tag that follows directly
+					continue;
+				}
+				// a bare '<' (e.g. "a < b" in a script) is ordinary content
+			}
+			if( !parser.anyChar() )
+				return parser.failure(null);
+		}
+		String text = parser.text.substring(start,end);
+		LuanTable tbl = new LuanTable();
+		tbl.rawPut("type","container");
+		tbl.rawPut("tag",tag);
+		tbl.rawPut("text",text);
+		return parser.success(tbl);
+	}
+
+	/**
+	 * Parses a start or end tag.  For an end tag the leading '/' is part of
+	 * the reported name (for example "/div").  The name is lower-cased;
+	 * attribute names are kept as written.  An attribute without a value is
+	 * stored with the value true.
+	 *
+	 * @return a "tag" table, or null if the input at the current position is
+	 *         not a well formed tag
+	 */
+	private LuanTable parseTag() {
+		parser.begin();
+		if( !parser.match('<') )
+			return parser.failure(null);
+		int start = parser.currentIndex();
+		parser.match('/');
+		if( !matchNameChar() )
+			return parser.failure(null);
+		while( matchNameChar() );
+		// Locale.ROOT: the default locale may map 'I' to a dotless i (Turkish),
+		// which would stop names like "SCRIPT" from matching the container set
+		String name = parser.textFrom(start).toLowerCase(java.util.Locale.ROOT);
+		LuanTable attributes = new LuanTable();
+		String attrName;
+		while( (attrName = parseAttrName()) != null ) {
+			String attrValue = parseAttrValue();
+			attributes.rawPut( attrName, attrValue!=null ? attrValue : true );
+		}
+		while( matchSpace() );
+		boolean isEmpty = parser.match('/');
+		if( !parser.match('>') )
+			return parser.failure(null);
+		LuanTable tbl = new LuanTable();
+		tbl.rawPut("type","tag");
+		tbl.rawPut("name",name);
+		tbl.rawPut("attributes",attributes);
+		tbl.rawPut("is_empty",isEmpty);
+		return parser.success(tbl);
+	}
+
+	/**
+	 * Parses an attribute name, which must be preceded by at least one
+	 * whitespace character.
+	 *
+	 * @return the name as written, or null if no attribute follows
+	 */
+	private String parseAttrName() {
+		parser.begin();
+		if( !matchSpace() )
+			return parser.failure(null);
+		while( matchSpace() );
+		int start = parser.currentIndex();
+		if( !matchNameChar() )
+			return parser.failure(null);
+		while( matchNameChar() );
+		String name = parser.textFrom(start);
+		return parser.success(name);
+	}
+
+	/**
+	 * Parses "= value" after an attribute name.  The value may be quoted with
+	 * single or double quotes (quotes are not part of the result) or be an
+	 * unquoted run of value characters.
+	 *
+	 * @return the value, or null if there is no '=' or no well formed value
+	 */
+	private String parseAttrValue() {
+		parser.begin();
+		while( matchSpace() );
+		if( !parser.match('=') )
+			return parser.failure(null);
+		while( matchSpace() );
+		if( parser.anyOf("\"'") ) {
+			char quote = parser.lastChar();
+			int start = parser.currentIndex();
+			while( !parser.test(quote) ) {
+				if( !parser.anyChar() )
+					return parser.failure(null);
+			}
+			String value = parser.textFrom(start);
+			parser.match(quote);
+			return parser.success(value);
+		}
+		int start = parser.currentIndex();
+		if( !matchValueChar() )
+			return parser.failure(null);
+		while( matchValueChar() );
+		String value = parser.textFrom(start);
+		return parser.success(value);
+	}
+
+	/** Consumes one tag or attribute name character: ASCII letter, digit or one of "_.-:". */
+	private boolean matchNameChar() {
+		return parser.inCharRange('a','z')
+			|| parser.inCharRange('A','Z')
+			|| parser.inCharRange('0','9')
+			|| parser.anyOf("_.-:")
+		;
+	}
+
+	/** Consumes one character of an unquoted attribute value. */
+	private boolean matchValueChar() {
+		return parser.noneOf(" \t\r\n\"'>/=");
+	}
+
+	/** Consumes one whitespace character (space, tab, CR or LF). */
+	private boolean matchSpace() {
+		return parser.anyOf(" \t\r\n");
+	}
+
+}