changeset 625:a3c1e11fb6aa

rewrite much of Html to be more understandable; add Lucene html_highlighter();
author Franklin Schmidt <fschmidt@gmail.com>
date Tue, 12 Jan 2016 23:52:56 -0700
parents 8281a248c47e
children 53a50c70c5e2
files core/src/luan/modules/Html.luan core/src/luan/modules/HtmlLuan.java core/src/luan/modules/parsers/Html.java lucene/src/luan/modules/lucene/Lucene.luan
diffstat 4 files changed, 274 insertions(+), 30 deletions(-) [+]
line wrap: on
line diff
--- a/core/src/luan/modules/Html.luan	Thu Jan 07 18:46:07 2016 -0700
+++ b/core/src/luan/modules/Html.luan	Tue Jan 12 23:52:56 2016 -0700
@@ -1,48 +1,81 @@
 java()
 local HtmlLuan = require "java:luan.modules.HtmlLuan"
+local HtmlParser = require "java:luan.modules.parsers.Html"
+local URLEncoder = require "java:java.net.URLEncoder"
+local Luan = require "luan:Luan"
+local error = Luan.error
+local ipairs = Luan.ipairs or error()
+local pairs = Luan.pairs or error()
+local type = Luan.type or error()
+local Io = require "luan:Io"
+local output_of = Io.output_of or error()
+
 
 local M = {}
 
 M.encode = HtmlLuan.encode
-M.parse = HtmlLuan.parse
-M.to_string = HtmlLuan.to_string
 
-
+local quote = HtmlLuan.quote
+M.quote = quote
 
--- extras
-
-local Luan = require "luan:Luan"
-local ipairs = Luan.ipairs
-local type = Luan.type
-local Io = require "luan:Io"
-local URLEncoder = require "java:java.net.URLEncoder"
+-- Parses HTML text into a list via HtmlParser.toList.  text is required;
+-- container_tags defaults to {"script","style","textarea"}.
+function M.parse(text,container_tags)
+	text or error "text required"
+	local tags = container_tags or {"script","style","textarea"}
+	return HtmlParser.toList(text,tags)
+end
 
 function M.url_encode(s)
 	return URLEncoder.encode(s,"UTF-8")
 end
 
-function M.process_url_tags(html)
-	for i, v in ipairs(html) do
-		if type(v) == "table" and v.type == "tag" then
-			if v.name == "url" then
-				local url = v.attributes.url or html[i+1]
-				v.name = "a"
-				v.attributes.url = nil
-				v.attributes.href = url
-			elseif v.name == "/url" then
-				v.name = "/a"
-			end
+-- Emits a tag: "<", its name, each attribute, "/" when tag.is_empty, then ">".
+-- An attribute whose value is true is emitted as a bare name, any other value is quoted.
+local function output_tag(tag)
+	%><<%= tag.name %><%
+	for attr_name, attr_value in pairs(tag.attributes) do
+		%> <%= attr_name %><%
+		if attr_value ~= true then
+			%>=<%= quote(attr_value) %><%
 		end
 	end
+	if tag.is_empty then
+		%>/<%
+	end
+	%>><%
 end
 
-function M.add_nofollow(html)
-	for i, v in ipairs(html) do
-		if type(v) == "table" and v.type == "tag" and v.name == "a" then
-			v.attributes.rel = "nofollow"
+-- Renders a list of strings and element tables as one HTML string.  Strings are
+-- emitted without encoding; tables are dispatched on their "type" field.
+function M.to_string(list)
+	return output_of( function()
+		for _, item in ipairs(list) do
+			local kind = type(item)
+			if kind == "string" then
+				%><%= item %><%
+			elseif kind == "table" then
+				local elem_type = item.type
+				if elem_type == nil then
+					error "no type in element of table for 'Html.to_string'"
+				elseif elem_type == "comment" then
+					%><!--<%= item.text %>--><%
+				elseif elem_type == "cdata" then
+					%><![CDATA[<%= item.text %>]]><%
+				elseif elem_type == "tag" then
+					output_tag(item)
+				elseif elem_type == "container" then
+					local tag = item.tag
+					output_tag(tag)
+					%><%= item.text %></<%= tag.name %>><%
+				else
+					error "invalid element type for 'Html.to_string'"
+				end
+			else
+				error("invalid value ("..kind..") in list for 'Html.to_string'")
+			end
 		end
-	end
+	end )
 end
 
-
 return M
--- a/core/src/luan/modules/HtmlLuan.java	Thu Jan 07 18:46:07 2016 -0700
+++ b/core/src/luan/modules/HtmlLuan.java	Tue Jan 12 23:52:56 2016 -0700
@@ -47,6 +47,7 @@
 
 	public static Set<String> containerTags = new HashSet<String>(Arrays.asList(SCRIPT,STYLE));
 */
+/*
 	public static LuanTable parse(LuanState luan,String text,LuanTable containerTagsTbl)
 		throws LuanException
 	{
@@ -270,10 +271,10 @@
 		return (c=='"' || c=='\'') && s.charAt(s.length()-1)==c
 			? s.substring(1,s.length()-1) : s;
 	}
+*/
 
 
-
-
+/*
 	public static String to_string(LuanState luan,LuanTable tbl) throws LuanException {
 		List<Object> html = tbl.asList();
 		StringBuilder buf = new StringBuilder();
@@ -324,7 +325,7 @@
 		buf.append('>');
 		return buf.toString();
 	}
-
+*/
 	public static String quote(String s) {
 		StringBuilder buf = new StringBuilder();
 		buf.append('"');
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/core/src/luan/modules/parsers/Html.java	Tue Jan 12 23:52:56 2016 -0700
@@ -0,0 +1,232 @@
+package luan.modules.parsers;
+
+import java.util.List;
+import java.util.ArrayList;
+import java.util.Set;
+import java.util.HashSet;
+import luan.LuanTable;
+
+
+// Splits HTML text into a flat list: plain-text runs stay strings, while tags,
+// containers, comments and CDATA sections become tables with a "type" entry.
+// Input that does not parse as markup is kept as text.
+public final class Html {
+
+	// Entry point.  containerTagsTbl lists the tag names whose content is captured
+	// as raw text in a "container" element.
+	public static LuanTable toList(String text,LuanTable containerTagsTbl) throws ParseException {
+		return new Html(text,containerTagsTbl).parse();
+	}
+
+	private final Parser parser;
+	private final Set<String> containerTags = new HashSet<String>();
+
+	private Html(String text,LuanTable containerTagsTbl) {
+		this.parser = new Parser(text);
+		for( Object v : containerTagsTbl.asList() ) {
+			containerTags.add((String)v);
+		}
+	}
+
+	// Main loop: each parsed element becomes one list entry; all other characters
+	// are buffered and flushed as a single string before the next element.
+	private LuanTable parse() throws ParseException {
+		List<Object> list = new ArrayList<Object>();
+		StringBuilder sb = new StringBuilder();
+		while( !parser.endOfInput() ) {
+			if( parser.test('<') ) {
+				LuanTable tbl = parseElement();
+				if( tbl != null ) {
+					if( sb.length() > 0 ) {
+						list.add(sb.toString());
+						sb.setLength(0);
+					}
+					list.add(tbl);
+					continue;
+				}
+			}
+			sb.append( parser.currentChar() );
+			parser.anyChar();
+		}
+		if( sb.length() > 0 )
+			list.add(sb.toString());
+		return new LuanTable(list);
+	}
+
+	// Tries a tag (upgraded to a container when its name is a container tag), then
+	// a comment, then a CDATA section; returns null when none of them matches.
+	// The comment and CDATA attempts have to run when parseTag() fails, because
+	// "<!--" and "<![CDATA[" never parse as tags ('!' is not a name character).
+	private LuanTable parseElement() {
+		LuanTable tbl = parseTag();
+		if( tbl == null ) {
+			tbl = parseComment();
+			return tbl != null ? tbl : parseCdata();
+		}
+		String tagName = (String)tbl.rawGet("name");
+		if( containerTags.contains(tagName) ) {
+			LuanTable container = parseContainer(tbl);
+			if( container != null )
+				return container;
+		}
+		return tbl;
+	}
+
+	// Parses "<!--" ... "-->" into a table with type "comment" and the inner text.
+	private LuanTable parseComment() {
+		parser.begin();
+		if( !parser.match("<!--") )
+			return parser.failure(null);
+		int start = parser.currentIndex();
+		while( !parser.test("-->") ) {
+			if( !parser.anyChar() )
+				return parser.failure(null);
+		}
+		String text = parser.textFrom(start);
+		parser.match("-->");  // consume the terminator so it is not re-read as text
+		LuanTable tbl = new LuanTable();
+		tbl.rawPut("type","comment");
+		tbl.rawPut("text",text);
+		return parser.success(tbl);
+	}
+
+	// Parses "<![CDATA[" ... "]]>" into a table with type "cdata" and the inner text.
+	private LuanTable parseCdata() {
+		parser.begin();
+		if( !parser.match("<![CDATA[") )
+			return parser.failure(null);
+		int start = parser.currentIndex();
+		while( !parser.test("]]>") ) {
+			if( !parser.anyChar() )
+				return parser.failure(null);
+		}
+		String text = parser.textFrom(start);
+		parser.match("]]>");  // consume the terminator so it is not re-read as text
+		LuanTable tbl = new LuanTable();
+		tbl.rawPut("type","cdata");
+		tbl.rawPut("text",text);
+		return parser.success(tbl);
+	}
+
+	// Called right after the start tag of a container: collects everything up to
+	// the matching end tag as raw text.  Fails when the input ends first.
+	private LuanTable parseContainer(LuanTable tag) {
+		String endTagName = '/' + (String)tag.rawGet("name");
+		int start = parser.begin();
+		int end;
+		while(true) {
+			if( parser.test('<') ) {
+				end = parser.currentIndex();
+				LuanTable tag2 = parseTag();
+				if( tag2 != null ) {
+					if( endTagName.equals(tag2.rawGet("name")) )
+						break;
+					// parseTag() already consumed this tag, so test again at the new
+					// position instead of skipping one more character
+					continue;
+				}
+				// a '<' that does not start a tag is ordinary content
+			}
+			if( !parser.anyChar() )
+				return parser.failure(null);
+		}
+		String text = parser.text.substring(start,end);
+		LuanTable tbl = new LuanTable();
+		tbl.rawPut("type","container");
+		tbl.rawPut("tag",tag);
+		tbl.rawPut("text",text);
+		return parser.success(tbl);
+	}
+
+	// Parses a start or end tag into a table with type "tag", the lower-cased
+	// name (an end tag keeps its leading '/'), the attributes and is_empty
+	// (true when '/' precedes the closing '>').  An attribute without a value
+	// is stored as true.
+	private LuanTable parseTag() {
+		parser.begin();
+		if( !parser.match('<') )
+			return parser.failure(null);
+		int start = parser.currentIndex();
+		parser.match('/');
+		if( !matchNameChar() )
+			return parser.failure(null);
+		while( matchNameChar() );
+		String name = parser.textFrom(start).toLowerCase();
+		LuanTable attributes = new LuanTable();
+		String attrName;
+		while( (attrName = parseAttrName()) != null ) {
+			String attrValue = parseAttrValue();
+			attributes.rawPut( attrName, attrValue!=null ? attrValue : true );
+		}
+		while( matchSpace() );
+		boolean isEmpty = parser.match('/');
+		if( !parser.match('>') )
+			return parser.failure(null);
+		LuanTable tbl = new LuanTable();
+		tbl.rawPut("type","tag");
+		tbl.rawPut("name",name);
+		tbl.rawPut("attributes",attributes);
+		tbl.rawPut("is_empty",isEmpty);
+		return parser.success(tbl);
+	}
+
+	// Parses an attribute name, which must follow at least one whitespace character.
+	private String parseAttrName() {
+		parser.begin();
+		if( !matchSpace() )
+			return parser.failure(null);
+		while( matchSpace() );
+		int start = parser.currentIndex();
+		if( !matchNameChar() )
+			return parser.failure(null);
+		while( matchNameChar() );
+		String name = parser.textFrom(start);
+		return parser.success(name);
+	}
+
+	// Parses '=' followed by a quoted or bare value; the quotes are not part of
+	// the returned value.
+	private String parseAttrValue() {
+		parser.begin();
+		while( matchSpace() );
+		if( !parser.match('=') )
+			return parser.failure(null);
+		while( matchSpace() );
+		if( parser.anyOf("\"'") ) {
+			char quote = parser.lastChar();
+			int start = parser.currentIndex();
+			while( !parser.test(quote) ) {
+				if( !parser.anyChar() )
+					return parser.failure(null);
+			}
+			String value = parser.textFrom(start);
+			parser.match(quote);
+			return parser.success(value);
+		}
+		int start = parser.currentIndex();
+		if( !matchValueChar() )
+			return parser.failure(null);
+		while( matchValueChar() );
+		String value = parser.textFrom(start);
+		return parser.success(value);
+	}
+
+	// Characters allowed in tag and attribute names.
+	private boolean matchNameChar() {
+		return parser.inCharRange('a','z')
+			|| parser.inCharRange('A','Z')
+			|| parser.inCharRange('0','9')
+			|| parser.anyOf("_.-:")
+		;
+	}
+
+	// Characters allowed in an unquoted attribute value.
+	private boolean matchValueChar() {
+		return parser.noneOf(" \t\r\n\"'>/=");
+	}
+
+	private boolean matchSpace() {
+		return parser.anyOf(" \t\r\n");
+	}
+
+}
--- a/lucene/src/luan/modules/lucene/Lucene.luan	Thu Jan 07 18:46:07 2016 -0700
+++ b/lucene/src/luan/modules/lucene/Lucene.luan	Tue Jan 12 23:52:56 2016 -0700
@@ -1,6 +1,9 @@
 java()
 local Luan = require "luan:Luan"
 local error = Luan.error
+local ipairs = Luan.ipairs or error()
+local type = Luan.type or error()
+local Html = require "luan:Html"
 local LuceneIndex = require "java:luan.modules.lucene.LuceneIndex"
 local NumberFieldParser = require "java:sane.lucene.queryparser.NumberFieldParser"
 local StringFieldParser = require "java:sane.lucene.queryparser.StringFieldParser"
@@ -69,6 +72,22 @@
 		return index.advanced_search(query)
 	end
 
+	-- Returns a function that applies index.highlighter(query,formatter) to the
+	-- string entries of a parsed HTML document and re-serializes the result;
+	-- entries that are not strings are passed through unchanged.
+	function index.html_highlighter(query,formatter,container_tags)
+		local highlight = index.highlighter(query,formatter)
+		return function(html)
+			local out = {}
+			for _, part in ipairs(Html.parse(html,container_tags)) do
+				if type(part) == "string" then
+					part = highlight(part)
+				end
+				out[#out+1] = part
+			end
+			return Html.to_string(out)
+		end
+	end
+
 	return index
 end