Mercurial Hosting > luan
annotate src/goodjava/html/Html.java @ 1800:a045f30fa67d
html fix
author | Franklin Schmidt <fschmidt@gmail.com> |
---|---|
date | Fri, 08 Mar 2024 10:24:00 -0700 |
parents | 31a82b0d0a87 |
children |
rev | line source |
---|---|
1712 | 1 package goodjava.html; |
2 | |
3 import java.util.List; | |
4 import java.util.ArrayList; | |
5 import java.util.Set; | |
6 import java.util.HashSet; | |
7 import java.util.Map; | |
8 import java.util.HashMap; | |
9 import java.util.Collections; | |
10 import java.util.regex.Pattern; | |
11 import java.util.regex.Matcher; | |
12 import goodjava.parser.Parser; | |
13 | |
14 | |
15 public final class Html { | |
16 | |
17 private static final Pattern entityPtn = Pattern.compile( | |
18 "&(#?[0-9a-zA-Z]+;)" | |
19 ); | |
20 | |
21 public static String encode(String s) { | |
22 //s = s.replace("&","&"); | |
23 s = entityPtn.matcher(s).replaceAll("&$1"); | |
24 s = s.replace("<","<"); | |
25 s = s.replace(">",">"); | |
26 s = s.replace("\"","""); | |
27 return s; | |
28 } | |
29 | |
30 private static final Pattern entityNumPtn = Pattern.compile( | |
31 "&#(\\d+);" | |
32 ); | |
33 | |
34 public static String decode(String s) { | |
35 Matcher m = entityNumPtn.matcher(s); | |
36 if( m.find() ) { | |
37 StringBuffer buf = new StringBuffer(); | |
38 do { | |
39 String entity = new String(new char[]{(char)Integer.parseInt(m.group(1))}); | |
40 m.appendReplacement(buf,entity); | |
41 } while( m.find() ); | |
42 m.appendTail(buf); | |
43 s = buf.toString(); | |
44 } | |
45 s = s.replace(" "," "); | |
46 s = s.replace(""","\""); | |
47 s = s.replace(">",">"); | |
48 s = s.replace("<","<"); | |
49 s = s.replace("&","&"); | |
50 return s; | |
51 } | |
52 | |
53 | |
54 public static final class Comment { | |
55 public final String text; | |
56 | |
57 private Comment(String text) { | |
58 this.text = text; | |
59 } | |
60 } | |
61 | |
62 public static final class CData { | |
63 public final String text; | |
64 | |
65 private CData(String text) { | |
66 this.text = text; | |
67 } | |
68 } | |
69 | |
70 public static final class Tag { | |
71 public final String name; | |
72 public final Map<String,Object> attributes; | |
73 public final boolean isEmpty; | |
74 public final String raw; | |
1714
31a82b0d0a87
bbcode and html work
Franklin Schmidt <fschmidt@gmail.com>
parents:
1712
diff
changeset
|
75 public final Map<String,String> style; |
1712 | 76 |
77 private Tag(String name,Map<String,Object> attributes,boolean isEmpty,String raw) { | |
1714
31a82b0d0a87
bbcode and html work
Franklin Schmidt <fschmidt@gmail.com>
parents:
1712
diff
changeset
|
78 this(name,attributes,isEmpty,raw,null); |
31a82b0d0a87
bbcode and html work
Franklin Schmidt <fschmidt@gmail.com>
parents:
1712
diff
changeset
|
79 } |
31a82b0d0a87
bbcode and html work
Franklin Schmidt <fschmidt@gmail.com>
parents:
1712
diff
changeset
|
80 |
31a82b0d0a87
bbcode and html work
Franklin Schmidt <fschmidt@gmail.com>
parents:
1712
diff
changeset
|
81 private Tag(String name,Map<String,Object> attributes,boolean isEmpty,String raw,Map<String,String> style) { |
1712 | 82 this.name = name; |
83 this.attributes = attributes; | |
84 this.isEmpty = isEmpty; | |
85 this.raw = raw; | |
1714
31a82b0d0a87
bbcode and html work
Franklin Schmidt <fschmidt@gmail.com>
parents:
1712
diff
changeset
|
86 this.style = style; |
1712 | 87 } |
88 } | |
89 | |
90 public static final class Container { | |
91 public final Tag tag; | |
92 public final String text; | |
93 | |
94 private Container(Tag tag,String text) { | |
95 this.tag = tag; | |
96 this.text = text; | |
97 } | |
98 } | |
99 | |
100 private static Set<String> defaultContainerTags = new HashSet<String>(); | |
101 static { | |
102 Collections.addAll( defaultContainerTags, "script", "style", "textarea" ); | |
103 } | |
104 | |
105 public static List parse(String text) { | |
106 return parse(text,defaultContainerTags); | |
107 } | |
108 | |
109 public static List parse(String text,Set<String> containerTags) { | |
110 return new Html(text,containerTags).parse(); | |
111 } | |
112 | |
// Cursor over the input text; every parse method below reads from and advances this parser.
private final Parser parser;
// Names of the tags whose content is captured raw as a Container.
private final Set<String> containerTags;

// Instances exist only for the duration of one static parse(...) call.
private Html(String text,Set<String> containerTags) {
    this.containerTags = containerTags;
    this.parser = new Parser(text);
}
120 | |
121 private List parse() { | |
122 List list = new ArrayList(); | |
123 StringBuilder sb = new StringBuilder(); | |
124 while( !parser.endOfInput() ) { | |
125 if( parser.test('<') ) { | |
126 Tag tag = parseTag(); | |
127 if( tag != null ) { | |
128 Object el = tag; | |
129 String tagName = tag.name; | |
130 if( containerTags.contains(tagName) ) { | |
131 Container container = parseContainer(tag); | |
132 if( container != null ) | |
133 el = container; | |
134 } | |
1800 | 135 add(list,sb); |
136 list.add(el); | |
137 continue; | |
138 } else { | |
139 Object el = parseComment(); | |
140 if( el == null ) | |
141 el = parseCdata(); | |
142 if( el != null ) { | |
1712 | 143 add(list,sb); |
144 list.add(el); | |
145 continue; | |
146 } | |
147 } | |
148 } | |
149 sb.append( parser.currentChar() ); | |
150 parser.anyChar(); | |
151 } | |
152 add(list,sb); | |
153 return list; | |
154 } | |
155 | |
156 private static void add(List list,StringBuilder sb) { | |
157 if( sb.length() > 0 ) { | |
158 list.add(decode(sb.toString())); | |
159 sb.setLength(0); | |
160 } | |
161 } | |
162 | |
163 private Comment parseComment() { | |
164 parser.begin(); | |
165 if( !parser.match("<!--") ) | |
166 return parser.failure(null); | |
167 int start = parser.currentIndex(); | |
168 while( !parser.test("-->") ) { | |
169 if( !parser.anyChar() ) | |
170 return parser.failure(null); | |
171 } | |
172 String text = parser.textFrom(start); | |
1800 | 173 parser.match("-->"); |
1712 | 174 Comment comment = new Comment(text); |
175 return parser.success(comment); | |
176 } | |
177 | |
178 private CData parseCdata() { | |
179 parser.begin(); | |
180 if( !parser.match("<![CDATA[") ) | |
181 return parser.failure(null); | |
182 int start = parser.currentIndex(); | |
183 while( !parser.test("]]>") ) { | |
184 if( !parser.anyChar() ) | |
185 return parser.failure(null); | |
186 } | |
187 String text = parser.textFrom(start); | |
1800 | 188 parser.match("]]>"); |
1712 | 189 CData cdata = new CData(text); |
190 return parser.success(cdata); | |
191 } | |
192 | |
193 private Container parseContainer(Tag tag) { | |
194 String endTagName = '/' + tag.name; | |
195 int start = parser.begin(); | |
196 int end; | |
197 while(true) { | |
198 if( parser.test('<') ) { | |
199 end = parser.currentIndex(); | |
200 Tag tag2 = parseTag(); | |
201 if( tag2.name.equals(endTagName) ) | |
202 break; | |
203 } | |
204 if( !parser.anyChar() ) | |
205 return parser.failure(null); | |
206 } | |
207 String text = parser.text.substring(start,end); | |
208 Container container = new Container(tag,text); | |
209 return parser.success(container); | |
210 } | |
211 | |
212 private Tag parseTag() { | |
213 int tagStart = parser.begin(); | |
214 if( !parser.match('<') ) | |
215 return parser.failure(null); | |
216 int start = parser.currentIndex(); | |
217 parser.match('/'); | |
218 if( !matchNameChar() ) | |
219 return parser.failure(null); | |
220 while( matchNameChar() ); | |
221 String name = parser.textFrom(start).toLowerCase(); | |
222 Map<String,Object> attributes = new HashMap<String,Object>(); | |
223 String attrName; | |
1714
31a82b0d0a87
bbcode and html work
Franklin Schmidt <fschmidt@gmail.com>
parents:
1712
diff
changeset
|
224 Map<String,String> style = null; |
1712 | 225 while( (attrName = parseAttrName()) != null ) { |
226 String attrValue = parseAttrValue(); | |
227 attributes.put( attrName, attrValue!=null ? attrValue : true ); | |
1714
31a82b0d0a87
bbcode and html work
Franklin Schmidt <fschmidt@gmail.com>
parents:
1712
diff
changeset
|
228 if( attrName.equals("style") && attrValue!=null && style==null ) { |
31a82b0d0a87
bbcode and html work
Franklin Schmidt <fschmidt@gmail.com>
parents:
1712
diff
changeset
|
229 style = Css.style(attrValue); |
1712 | 230 } |
231 } | |
232 while( matchSpace() ); | |
233 boolean isEmpty = parser.match('/'); | |
234 if( !parser.match('>') ) | |
235 return parser.failure(null); | |
236 String raw = parser.textFrom(tagStart); | |
1714
31a82b0d0a87
bbcode and html work
Franklin Schmidt <fschmidt@gmail.com>
parents:
1712
diff
changeset
|
237 Tag tag = new Tag(name,attributes,isEmpty,raw,style); |
1712 | 238 return parser.success(tag); |
239 } | |
240 | |
241 private String parseAttrName() { | |
242 parser.begin(); | |
243 if( !matchSpace() ) | |
244 return parser.failure(null); | |
245 while( matchSpace() ); | |
246 int start = parser.currentIndex(); | |
247 if( !matchNameChar() ) | |
248 return parser.failure(null); | |
249 while( matchNameChar() ); | |
250 String name = parser.textFrom(start).toLowerCase(); | |
251 return parser.success(name); | |
252 } | |
253 | |
254 private String parseAttrValue() { | |
255 parser.begin(); | |
256 while( matchSpace() ); | |
257 if( !parser.match('=') ) | |
258 return parser.failure(null); | |
259 while( matchSpace() ); | |
260 if( parser.anyOf("\"'") ) { | |
261 char quote = parser.lastChar(); | |
262 int start = parser.currentIndex(); | |
263 while( !parser.test(quote) ) { | |
264 if( !parser.anyChar() ) | |
265 return parser.failure(null); | |
266 } | |
267 String value = parser.textFrom(start); | |
268 parser.match(quote); | |
269 value = decode(value); | |
270 return parser.success(value); | |
271 } | |
272 int start = parser.currentIndex(); | |
273 if( !matchValueChar() ) | |
274 return parser.failure(null); | |
275 while( matchValueChar() ); | |
276 String value = parser.textFrom(start); | |
277 value = decode(value); | |
278 return parser.success(value); | |
279 } | |
280 | |
281 private boolean matchNameChar() { | |
282 return parser.inCharRange('a','z') | |
283 || parser.inCharRange('A','Z') | |
284 || parser.inCharRange('0','9') | |
285 || parser.anyOf("_.-:") | |
286 ; | |
287 } | |
288 | |
289 private boolean matchValueChar() { | |
290 return parser.noneOf(" \t\r\n\"'>/="); | |
291 } | |
292 | |
293 private boolean matchSpace() { | |
294 return parser.anyOf(" \t\r\n"); | |
295 } | |
296 | |
297 } |