comparison core/src/luan/modules/parsers/Html.java @ 625:a3c1e11fb6aa

rewrite much of Html to be more understandable; add Lucene html_highlighter();
author Franklin Schmidt <fschmidt@gmail.com>
date Tue, 12 Jan 2016 23:52:56 -0700
parents
children
comparison
equal deleted inserted replaced
624:8281a248c47e 625:a3c1e11fb6aa
1 package luan.modules.parsers;
2
3 import java.util.List;
4 import java.util.ArrayList;
5 import java.util.Set;
6 import java.util.HashSet;
7 import luan.LuanTable;
8
9
10 public final class Html {
11
12 public static LuanTable toList(String text,LuanTable containerTagsTbl) throws ParseException {
13 return new Html(text,containerTagsTbl).parse();
14 }
15
16 private final Parser parser;
17 private final Set<String> containerTags = new HashSet<String>();
18
19 private Html(String text,LuanTable containerTagsTbl) {
20 this.parser = new Parser(text);
21 for( Object v : containerTagsTbl.asList() ) {
22 containerTags.add((String)v);
23 }
24 }
25
26 private LuanTable parse() throws ParseException {
27 List list = new ArrayList();
28 StringBuilder sb = new StringBuilder();
29 while( !parser.endOfInput() ) {
30 if( parser.test('<') ) {
31 LuanTable tbl = parseTag();
32 if( tbl != null ) {
33 String tagName = (String)tbl.rawGet("name");
34 if( containerTags.contains(tagName) ) {
35 LuanTable container = parseContainer(tbl);
36 if( container != null )
37 tbl = container;
38 }
39 if( tbl != null
40 || (tbl = parseComment()) != null
41 || (tbl = parseCdata()) != null
42 ) {
43 if( sb.length() > 0 ) {
44 list.add(sb.toString());
45 sb.setLength(0);
46 }
47 list.add(tbl);
48 continue;
49 }
50 }
51 }
52 sb.append( parser.currentChar() );
53 parser.anyChar();
54 }
55 if( sb.length() > 0 )
56 list.add(sb.toString());
57 return new LuanTable(list);
58 }
59
60 private LuanTable parseComment() {
61 parser.begin();
62 if( !parser.match("<!--") )
63 return parser.failure(null);
64 int start = parser.currentIndex();
65 while( !parser.test("-->") ) {
66 if( !parser.anyChar() )
67 return parser.failure(null);
68 }
69 String text = parser.textFrom(start);
70 LuanTable tbl = new LuanTable();
71 tbl.rawPut("type","comment");
72 tbl.rawPut("text",text);
73 return parser.success(tbl);
74 }
75
76 private LuanTable parseCdata() {
77 parser.begin();
78 if( !parser.match("<![CDATA[") )
79 return parser.failure(null);
80 int start = parser.currentIndex();
81 while( !parser.test("]]>") ) {
82 if( !parser.anyChar() )
83 return parser.failure(null);
84 }
85 String text = parser.textFrom(start);
86 LuanTable tbl = new LuanTable();
87 tbl.rawPut("type","cdata");
88 tbl.rawPut("text",text);
89 return parser.success(tbl);
90 }
91
92 private LuanTable parseContainer(LuanTable tag) {
93 String endTagName = '/' + (String)tag.rawGet("name");
94 int start = parser.begin();
95 int end;
96 while(true) {
97 if( parser.test('<') ) {
98 end = parser.currentIndex();
99 LuanTable tag2 = parseTag();
100 String s = (String)tag2.rawGet("name");
101 if( s.equals(endTagName) )
102 break;
103 }
104 if( !parser.anyChar() )
105 return parser.failure(null);
106 }
107 String text = parser.text.substring(start,end);
108 LuanTable tbl = new LuanTable();
109 tbl.rawPut("type","container");
110 tbl.rawPut("tag",tag);
111 tbl.rawPut("text",text);
112 return parser.success(tbl);
113 }
114
115 private LuanTable parseTag() {
116 parser.begin();
117 if( !parser.match('<') )
118 return parser.failure(null);
119 int start = parser.currentIndex();
120 parser.match('/');
121 if( !matchNameChar() )
122 return parser.failure(null);
123 while( matchNameChar() );
124 String name = parser.textFrom(start).toLowerCase();
125 LuanTable attributes = new LuanTable();
126 String attrName;
127 while( (attrName = parseAttrName()) != null ) {
128 String attrValue = parseAttrValue();
129 attributes.rawPut( attrName, attrValue!=null ? attrValue : true );
130 }
131 while( matchSpace() );
132 boolean isEmpty = parser.match('/');
133 if( !parser.match('>') )
134 return parser.failure(null);
135 LuanTable tbl = new LuanTable();
136 tbl.rawPut("type","tag");
137 tbl.rawPut("name",name);
138 tbl.rawPut("attributes",attributes);
139 tbl.rawPut("is_empty",isEmpty);
140 return parser.success(tbl);
141 }
142
143 private String parseAttrName() {
144 parser.begin();
145 if( !matchSpace() )
146 return parser.failure(null);
147 while( matchSpace() );
148 int start = parser.currentIndex();
149 if( !matchNameChar() )
150 return parser.failure(null);
151 while( matchNameChar() );
152 String name = parser.textFrom(start);
153 return parser.success(name);
154 }
155
156 private String parseAttrValue() {
157 parser.begin();
158 while( matchSpace() );
159 if( !parser.match('=') )
160 return parser.failure(null);
161 while( matchSpace() );
162 if( parser.anyOf("\"'") ) {
163 char quote = parser.lastChar();
164 int start = parser.currentIndex();
165 while( !parser.test(quote) ) {
166 if( !parser.anyChar() )
167 return parser.failure(null);
168 }
169 String value = parser.textFrom(start);
170 parser.match(quote);
171 return parser.success(value);
172 }
173 int start = parser.currentIndex();
174 if( !matchValueChar() )
175 return parser.failure(null);
176 while( matchValueChar() );
177 String value = parser.textFrom(start);
178 return parser.success(value);
179 }
180
181 private boolean matchNameChar() {
182 return parser.inCharRange('a','z')
183 || parser.inCharRange('A','Z')
184 || parser.inCharRange('0','9')
185 || parser.anyOf("_.-:")
186 ;
187 }
188
189 private boolean matchValueChar() {
190 return parser.noneOf(" \t\r\n\"'>/=");
191 }
192
193 private boolean matchSpace() {
194 return parser.anyOf(" \t\r\n");
195 }
196
197 }