Mercurial Hosting > luan
changeset 625:a3c1e11fb6aa
rewrite much of Html to be more understandable;
add Lucene html_highlighter();
author | Franklin Schmidt <fschmidt@gmail.com> |
---|---|
date | Tue, 12 Jan 2016 23:52:56 -0700 |
parents | 8281a248c47e |
children | 53a50c70c5e2 |
files | core/src/luan/modules/Html.luan core/src/luan/modules/HtmlLuan.java core/src/luan/modules/parsers/Html.java lucene/src/luan/modules/lucene/Lucene.luan |
diffstat | 4 files changed, 274 insertions(+), 30 deletions(-) [+] |
line wrap: on
line diff
diff -r 8281a248c47e -r a3c1e11fb6aa core/src/luan/modules/Html.luan --- a/core/src/luan/modules/Html.luan Thu Jan 07 18:46:07 2016 -0700 +++ b/core/src/luan/modules/Html.luan Tue Jan 12 23:52:56 2016 -0700 @@ -1,48 +1,76 @@ java() local HtmlLuan = require "java:luan.modules.HtmlLuan" +local HtmlParser = require "java:luan.modules.parsers.Html" +local URLEncoder = require "java:java.net.URLEncoder" +local Luan = require "luan:Luan" +local error = Luan.error +local ipairs = Luan.ipairs or error() +local pairs = Luan.pairs or error() +local type = Luan.type or error() +local Io = require "luan:Io" +local output_of = Io.output_of or error() + local M = {} M.encode = HtmlLuan.encode -M.parse = HtmlLuan.parse -M.to_string = HtmlLuan.to_string - +local quote = HtmlLuan.quote +M.quote = quote --- extras - -local Luan = require "luan:Luan" -local ipairs = Luan.ipairs -local type = Luan.type -local Io = require "luan:Io" -local URLEncoder = require "java:java.net.URLEncoder" +function M.parse(text,container_tags) + text or error "text required" + container_tags = container_tags or {"script","style","textarea"} + return HtmlParser.toList(text,container_tags) +end function M.url_encode(s) return URLEncoder.encode(s,"UTF-8") end -function M.process_url_tags(html) - for i, v in ipairs(html) do - if type(v) == "table" and v.type == "tag" then - if v.name == "url" then - local url = v.attributes.url or html[i+1] - v.name = "a" - v.attributes.url = nil - v.attributes.href = url - elseif v.name == "/url" then - v.name = "/a" - end +local function output_tag(tag) + %><<%= tag.name %><% + local attributes = tag.attributes + for name, value in pairs(attributes) do + %> <%= name %><% + if value ~= true then + %>=<%= quote(value) %><% end end + if tag.is_empty then + %>/<% + end + %>><% end -function M.add_nofollow(html) - for i, v in ipairs(html) do - if type(v) == "table" and v.type == "tag" and v.name == "a" then - v.attributes.rel = "nofollow" +function M.to_string(list) + return output_of( function() + for _, obj in ipairs(list) do + local tp = type(obj) + if tp == "string" then + %><%= obj %><% + elseif tp == "table" then + tp = obj.type + if tp == nil then + error "no type in element of table for 'Html.to_string'" + elseif tp == "comment" then + %><!--<%= obj.text %>--><% + elseif tp == "cdata" then + %><![CDATA[<%= obj.text %>]]><% + elseif tp == "tag" then + output_tag(obj) + elseif tp == "container" then + local tag = obj.tag + output_tag(tag) + %><%= obj.text %></<%= tag.name %>><% + else + error "invalid element type for 'Html.to_string'" + end + else + error("invalid value ("..tp..") in list for 'Html.to_string'") + end end - end + end ) end - return M
diff -r 8281a248c47e -r a3c1e11fb6aa core/src/luan/modules/HtmlLuan.java --- a/core/src/luan/modules/HtmlLuan.java Thu Jan 07 18:46:07 2016 -0700 +++ b/core/src/luan/modules/HtmlLuan.java Tue Jan 12 23:52:56 2016 -0700 @@ -47,6 +47,7 @@ public static Set<String> containerTags = new HashSet<String>(Arrays.asList(SCRIPT,STYLE)); */ +/* public static LuanTable parse(LuanState luan,String text,LuanTable containerTagsTbl) throws LuanException { @@ -270,10 +271,10 @@ return (c=='"' || c=='\'') && s.charAt(s.length()-1)==c ? s.substring(1,s.length()-1) : s; } +*/ - - +/* public static String to_string(LuanState luan,LuanTable tbl) throws LuanException { List<Object> html = tbl.asList(); StringBuilder buf = new StringBuilder(); @@ -324,7 +325,7 @@ buf.append('>'); return buf.toString(); } - +*/ public static String quote(String s) { StringBuilder buf = new StringBuilder(); buf.append('"');
diff -r 8281a248c47e -r a3c1e11fb6aa core/src/luan/modules/parsers/Html.java --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/core/src/luan/modules/parsers/Html.java Tue Jan 12 23:52:56 2016 -0700 @@ -0,0 +1,197 @@ +package luan.modules.parsers; + +import java.util.List; +import java.util.ArrayList; +import java.util.Set; +import java.util.HashSet; +import luan.LuanTable; + + +public final class Html { + + public static LuanTable toList(String text,LuanTable containerTagsTbl) throws ParseException { + return new Html(text,containerTagsTbl).parse(); + } + + private final Parser parser; + private final Set<String> containerTags = new HashSet<String>(); + + private Html(String text,LuanTable containerTagsTbl) { + this.parser = new Parser(text); + for( Object v : containerTagsTbl.asList() ) { + containerTags.add((String)v); + } + } + + private LuanTable parse() throws ParseException { + List list = new ArrayList(); + StringBuilder sb = new StringBuilder(); + while( !parser.endOfInput() ) { + if( parser.test('<') ) { + LuanTable tbl = parseTag(); + if( tbl != null ) { + String tagName = (String)tbl.rawGet("name"); + if( containerTags.contains(tagName) ) { + LuanTable container = parseContainer(tbl); + if( container != null ) + tbl = container; + } + if( tbl != null + || (tbl = parseComment()) != null + || (tbl = parseCdata()) != null + ) { + if( sb.length() > 0 ) { + list.add(sb.toString()); + sb.setLength(0); + } + list.add(tbl); + continue; + } + } + } + sb.append( parser.currentChar() ); + parser.anyChar(); + } + if( sb.length() > 0 ) + list.add(sb.toString()); + return new LuanTable(list); + } + + private LuanTable parseComment() { + parser.begin(); + if( !parser.match("<!--") ) + return parser.failure(null); + int start = parser.currentIndex(); + while( !parser.test("-->") ) { + if( !parser.anyChar() ) + return parser.failure(null); + } + String text = parser.textFrom(start); + LuanTable tbl = new LuanTable(); + tbl.rawPut("type","comment"); + tbl.rawPut("text",text); + return parser.success(tbl); + } + + private LuanTable parseCdata() { + parser.begin(); + if( !parser.match("<![CDATA[") ) + return parser.failure(null); + int start = parser.currentIndex(); + while( !parser.test("]]>") ) { + if( !parser.anyChar() ) + return parser.failure(null); + } + String text = parser.textFrom(start); + LuanTable tbl = new LuanTable(); + tbl.rawPut("type","cdata"); + tbl.rawPut("text",text); + return parser.success(tbl); + } + + private LuanTable parseContainer(LuanTable tag) { + String endTagName = '/' + (String)tag.rawGet("name"); + int start = parser.begin(); + int end; + while(true) { + if( parser.test('<') ) { + end = parser.currentIndex(); + LuanTable tag2 = parseTag(); + String s = (String)tag2.rawGet("name"); + if( s.equals(endTagName) ) + break; + } + if( !parser.anyChar() ) + return parser.failure(null); + } + String text = parser.text.substring(start,end); + LuanTable tbl = new LuanTable(); + tbl.rawPut("type","container"); + tbl.rawPut("tag",tag); + tbl.rawPut("text",text); + return parser.success(tbl); + } + + private LuanTable parseTag() { + parser.begin(); + if( !parser.match('<') ) + return parser.failure(null); + int start = parser.currentIndex(); + parser.match('/'); + if( !matchNameChar() ) + return parser.failure(null); + while( matchNameChar() ); + String name = parser.textFrom(start).toLowerCase(); + LuanTable attributes = new LuanTable(); + String attrName; + while( (attrName = parseAttrName()) != null ) { + String attrValue = parseAttrValue(); + attributes.rawPut( attrName, attrValue!=null ? attrValue : true ); + } + while( matchSpace() ); + boolean isEmpty = parser.match('/'); + if( !parser.match('>') ) + return parser.failure(null); + LuanTable tbl = new LuanTable(); + tbl.rawPut("type","tag"); + tbl.rawPut("name",name); + tbl.rawPut("attributes",attributes); + tbl.rawPut("is_empty",isEmpty); + return parser.success(tbl); + } + + private String parseAttrName() { + parser.begin(); + if( !matchSpace() ) + return parser.failure(null); + while( matchSpace() ); + int start = parser.currentIndex(); + if( !matchNameChar() ) + return parser.failure(null); + while( matchNameChar() ); + String name = parser.textFrom(start); + return parser.success(name); + } + + private String parseAttrValue() { + parser.begin(); + while( matchSpace() ); + if( !parser.match('=') ) + return parser.failure(null); + while( matchSpace() ); + if( parser.anyOf("\"'") ) { + char quote = parser.lastChar(); + int start = parser.currentIndex(); + while( !parser.test(quote) ) { + if( !parser.anyChar() ) + return parser.failure(null); + } + String value = parser.textFrom(start); + parser.match(quote); + return parser.success(value); + } + int start = parser.currentIndex(); + if( !matchValueChar() ) + return parser.failure(null); + while( matchValueChar() ); + String value = parser.textFrom(start); + return parser.success(value); + } + + private boolean matchNameChar() { + return parser.inCharRange('a','z') + || parser.inCharRange('A','Z') + || parser.inCharRange('0','9') + || parser.anyOf("_.-:") + ; + } + + private boolean matchValueChar() { + return parser.noneOf(" \t\r\n\"'>/="); + } + + private boolean matchSpace() { + return parser.anyOf(" \t\r\n"); + } + +}
diff -r 8281a248c47e -r a3c1e11fb6aa lucene/src/luan/modules/lucene/Lucene.luan --- a/lucene/src/luan/modules/lucene/Lucene.luan Thu Jan 07 18:46:07 2016 -0700 +++ b/lucene/src/luan/modules/lucene/Lucene.luan Tue Jan 12 23:52:56 2016 -0700 @@ -1,6 +1,9 @@ java() local Luan = require "luan:Luan" local error = Luan.error +local ipairs = Luan.ipairs or error() +local type = Luan.type or error() +local Html = require "luan:Html" local LuceneIndex = require "java:luan.modules.lucene.LuceneIndex" local NumberFieldParser = require "java:sane.lucene.queryparser.NumberFieldParser" local StringFieldParser = require "java:sane.lucene.queryparser.StringFieldParser" @@ -69,6 +72,21 @@ return index.advanced_search(query) end + function index.html_highlighter(query,formatter,container_tags) + local highlighter = index.highlighter(query,formatter) + return function(html) + local list = Html.parse(html,container_tags) + local result = {} + for _, obj in ipairs(list) do + if type(obj) == "string" then + obj = highlighter(obj) + end + result[#result+1] = obj + end + return Html.to_string(result) + end + end + return index end