Mercurial Hosting > luan
annotate src/goodjava/html/Html.java @ 1764:527c53b91a50
lucene error handling
author | Franklin Schmidt <fschmidt@gmail.com> |
---|---|
date | Mon, 22 May 2023 20:43:52 -0600 |
parents | 31a82b0d0a87 |
children | a045f30fa67d |
rev | line source |
---|---|
1712 | 1 package goodjava.html; |
2 | |
3 import java.util.List; | |
4 import java.util.ArrayList; | |
5 import java.util.Set; | |
6 import java.util.HashSet; | |
7 import java.util.Map; | |
8 import java.util.HashMap; | |
9 import java.util.Collections; | |
10 import java.util.regex.Pattern; | |
11 import java.util.regex.Matcher; | |
12 import goodjava.parser.Parser; | |
13 | |
14 | |
15 public final class Html { | |
16 | |
17 private static final Pattern entityPtn = Pattern.compile( | |
18 "&(#?[0-9a-zA-Z]+;)" | |
19 ); | |
20 | |
21 public static String encode(String s) { | |
22 //s = s.replace("&","&"); | |
23 s = entityPtn.matcher(s).replaceAll("&$1"); | |
24 s = s.replace("<","<"); | |
25 s = s.replace(">",">"); | |
26 s = s.replace("\"","""); | |
27 return s; | |
28 } | |
29 | |
30 private static final Pattern entityNumPtn = Pattern.compile( | |
31 "&#(\\d+);" | |
32 ); | |
33 | |
34 public static String decode(String s) { | |
35 Matcher m = entityNumPtn.matcher(s); | |
36 if( m.find() ) { | |
37 StringBuffer buf = new StringBuffer(); | |
38 do { | |
39 String entity = new String(new char[]{(char)Integer.parseInt(m.group(1))}); | |
40 m.appendReplacement(buf,entity); | |
41 } while( m.find() ); | |
42 m.appendTail(buf); | |
43 s = buf.toString(); | |
44 } | |
45 s = s.replace(" "," "); | |
46 s = s.replace(""","\""); | |
47 s = s.replace(">",">"); | |
48 s = s.replace("<","<"); | |
49 s = s.replace("&","&"); | |
50 return s; | |
51 } | |
52 | |
53 | |
54 public static final class Comment { | |
55 public final String text; | |
56 | |
57 private Comment(String text) { | |
58 this.text = text; | |
59 } | |
60 } | |
61 | |
62 public static final class CData { | |
63 public final String text; | |
64 | |
65 private CData(String text) { | |
66 this.text = text; | |
67 } | |
68 } | |
69 | |
70 public static final class Tag { | |
71 public final String name; | |
72 public final Map<String,Object> attributes; | |
73 public final boolean isEmpty; | |
74 public final String raw; | |
1714
31a82b0d0a87
bbcode and html work
Franklin Schmidt <fschmidt@gmail.com>
parents:
1712
diff
changeset
|
75 public final Map<String,String> style; |
1712 | 76 |
77 private Tag(String name,Map<String,Object> attributes,boolean isEmpty,String raw) { | |
1714
31a82b0d0a87
bbcode and html work
Franklin Schmidt <fschmidt@gmail.com>
parents:
1712
diff
changeset
|
78 this(name,attributes,isEmpty,raw,null); |
31a82b0d0a87
bbcode and html work
Franklin Schmidt <fschmidt@gmail.com>
parents:
1712
diff
changeset
|
79 } |
31a82b0d0a87
bbcode and html work
Franklin Schmidt <fschmidt@gmail.com>
parents:
1712
diff
changeset
|
80 |
31a82b0d0a87
bbcode and html work
Franklin Schmidt <fschmidt@gmail.com>
parents:
1712
diff
changeset
|
81 private Tag(String name,Map<String,Object> attributes,boolean isEmpty,String raw,Map<String,String> style) { |
1712 | 82 this.name = name; |
83 this.attributes = attributes; | |
84 this.isEmpty = isEmpty; | |
85 this.raw = raw; | |
1714
31a82b0d0a87
bbcode and html work
Franklin Schmidt <fschmidt@gmail.com>
parents:
1712
diff
changeset
|
86 this.style = style; |
1712 | 87 } |
88 } | |
89 | |
90 public static final class Container { | |
91 public final Tag tag; | |
92 public final String text; | |
93 | |
94 private Container(Tag tag,String text) { | |
95 this.tag = tag; | |
96 this.text = text; | |
97 } | |
98 } | |
99 | |
100 private static Set<String> defaultContainerTags = new HashSet<String>(); | |
101 static { | |
102 Collections.addAll( defaultContainerTags, "script", "style", "textarea" ); | |
103 } | |
104 | |
105 public static List parse(String text) { | |
106 return parse(text,defaultContainerTags); | |
107 } | |
108 | |
109 public static List parse(String text,Set<String> containerTags) { | |
110 return new Html(text,containerTags).parse(); | |
111 } | |
112 | |
/** Cursor over the input text; every parse method reads through it. */
private final Parser parser;

/** Tag names whose bodies are captured verbatim as {@code Container} text. */
private final Set<String> containerTags;

/** Creates a one-shot parser over {@code text}; use the static {@code parse} methods. */
private Html(String text,Set<String> containerTags) {
    this.containerTags = containerTags;
    this.parser = new Parser(text);
}
120 | |
121 private List parse() { | |
122 List list = new ArrayList(); | |
123 StringBuilder sb = new StringBuilder(); | |
124 while( !parser.endOfInput() ) { | |
125 if( parser.test('<') ) { | |
126 Tag tag = parseTag(); | |
127 if( tag != null ) { | |
128 Object el = tag; | |
129 String tagName = tag.name; | |
130 if( containerTags.contains(tagName) ) { | |
131 Container container = parseContainer(tag); | |
132 if( container != null ) | |
133 el = container; | |
134 } | |
135 if( el != null | |
136 || (el = parseComment()) != null | |
137 || (el = parseCdata()) != null | |
138 ) { | |
139 add(list,sb); | |
140 list.add(el); | |
141 continue; | |
142 } | |
143 } | |
144 } | |
145 sb.append( parser.currentChar() ); | |
146 parser.anyChar(); | |
147 } | |
148 add(list,sb); | |
149 return list; | |
150 } | |
151 | |
152 private static void add(List list,StringBuilder sb) { | |
153 if( sb.length() > 0 ) { | |
154 list.add(decode(sb.toString())); | |
155 sb.setLength(0); | |
156 } | |
157 } | |
158 | |
159 private Comment parseComment() { | |
160 parser.begin(); | |
161 if( !parser.match("<!--") ) | |
162 return parser.failure(null); | |
163 int start = parser.currentIndex(); | |
164 while( !parser.test("-->") ) { | |
165 if( !parser.anyChar() ) | |
166 return parser.failure(null); | |
167 } | |
168 String text = parser.textFrom(start); | |
169 Comment comment = new Comment(text); | |
170 return parser.success(comment); | |
171 } | |
172 | |
173 private CData parseCdata() { | |
174 parser.begin(); | |
175 if( !parser.match("<![CDATA[") ) | |
176 return parser.failure(null); | |
177 int start = parser.currentIndex(); | |
178 while( !parser.test("]]>") ) { | |
179 if( !parser.anyChar() ) | |
180 return parser.failure(null); | |
181 } | |
182 String text = parser.textFrom(start); | |
183 CData cdata = new CData(text); | |
184 return parser.success(cdata); | |
185 } | |
186 | |
187 private Container parseContainer(Tag tag) { | |
188 String endTagName = '/' + tag.name; | |
189 int start = parser.begin(); | |
190 int end; | |
191 while(true) { | |
192 if( parser.test('<') ) { | |
193 end = parser.currentIndex(); | |
194 Tag tag2 = parseTag(); | |
195 if( tag2.name.equals(endTagName) ) | |
196 break; | |
197 } | |
198 if( !parser.anyChar() ) | |
199 return parser.failure(null); | |
200 } | |
201 String text = parser.text.substring(start,end); | |
202 Container container = new Container(tag,text); | |
203 return parser.success(container); | |
204 } | |
205 | |
206 private Tag parseTag() { | |
207 int tagStart = parser.begin(); | |
208 if( !parser.match('<') ) | |
209 return parser.failure(null); | |
210 int start = parser.currentIndex(); | |
211 parser.match('/'); | |
212 if( !matchNameChar() ) | |
213 return parser.failure(null); | |
214 while( matchNameChar() ); | |
215 String name = parser.textFrom(start).toLowerCase(); | |
216 Map<String,Object> attributes = new HashMap<String,Object>(); | |
217 String attrName; | |
1714
31a82b0d0a87
bbcode and html work
Franklin Schmidt <fschmidt@gmail.com>
parents:
1712
diff
changeset
|
218 Map<String,String> style = null; |
1712 | 219 while( (attrName = parseAttrName()) != null ) { |
220 String attrValue = parseAttrValue(); | |
221 attributes.put( attrName, attrValue!=null ? attrValue : true ); | |
1714
31a82b0d0a87
bbcode and html work
Franklin Schmidt <fschmidt@gmail.com>
parents:
1712
diff
changeset
|
222 if( attrName.equals("style") && attrValue!=null && style==null ) { |
31a82b0d0a87
bbcode and html work
Franklin Schmidt <fschmidt@gmail.com>
parents:
1712
diff
changeset
|
223 style = Css.style(attrValue); |
1712 | 224 } |
225 } | |
226 while( matchSpace() ); | |
227 boolean isEmpty = parser.match('/'); | |
228 if( !parser.match('>') ) | |
229 return parser.failure(null); | |
230 String raw = parser.textFrom(tagStart); | |
1714
31a82b0d0a87
bbcode and html work
Franklin Schmidt <fschmidt@gmail.com>
parents:
1712
diff
changeset
|
231 Tag tag = new Tag(name,attributes,isEmpty,raw,style); |
1712 | 232 return parser.success(tag); |
233 } | |
234 | |
235 private String parseAttrName() { | |
236 parser.begin(); | |
237 if( !matchSpace() ) | |
238 return parser.failure(null); | |
239 while( matchSpace() ); | |
240 int start = parser.currentIndex(); | |
241 if( !matchNameChar() ) | |
242 return parser.failure(null); | |
243 while( matchNameChar() ); | |
244 String name = parser.textFrom(start).toLowerCase(); | |
245 return parser.success(name); | |
246 } | |
247 | |
248 private String parseAttrValue() { | |
249 parser.begin(); | |
250 while( matchSpace() ); | |
251 if( !parser.match('=') ) | |
252 return parser.failure(null); | |
253 while( matchSpace() ); | |
254 if( parser.anyOf("\"'") ) { | |
255 char quote = parser.lastChar(); | |
256 int start = parser.currentIndex(); | |
257 while( !parser.test(quote) ) { | |
258 if( !parser.anyChar() ) | |
259 return parser.failure(null); | |
260 } | |
261 String value = parser.textFrom(start); | |
262 parser.match(quote); | |
263 value = decode(value); | |
264 return parser.success(value); | |
265 } | |
266 int start = parser.currentIndex(); | |
267 if( !matchValueChar() ) | |
268 return parser.failure(null); | |
269 while( matchValueChar() ); | |
270 String value = parser.textFrom(start); | |
271 value = decode(value); | |
272 return parser.success(value); | |
273 } | |
274 | |
275 private boolean matchNameChar() { | |
276 return parser.inCharRange('a','z') | |
277 || parser.inCharRange('A','Z') | |
278 || parser.inCharRange('0','9') | |
279 || parser.anyOf("_.-:") | |
280 ; | |
281 } | |
282 | |
283 private boolean matchValueChar() { | |
284 return parser.noneOf(" \t\r\n\"'>/="); | |
285 } | |
286 | |
287 private boolean matchSpace() { | |
288 return parser.anyOf(" \t\r\n"); | |
289 } | |
290 | |
291 } |