1712
|
1 package goodjava.html;
|
|
2
|
|
3 import java.util.List;
|
|
4 import java.util.ArrayList;
|
|
5 import java.util.Set;
|
|
6 import java.util.HashSet;
|
|
7 import java.util.Map;
|
|
8 import java.util.HashMap;
|
|
9 import java.util.Collections;
|
|
10 import java.util.regex.Pattern;
|
|
11 import java.util.regex.Matcher;
|
|
12 import goodjava.parser.Parser;
|
|
13
|
|
14
|
|
15 public final class Html {
|
|
16
|
|
17 private static final Pattern entityPtn = Pattern.compile(
|
|
18 "&(#?[0-9a-zA-Z]+;)"
|
|
19 );
|
|
20
|
|
21 public static String encode(String s) {
|
|
22 //s = s.replace("&","&");
|
|
23 s = entityPtn.matcher(s).replaceAll("&$1");
|
|
24 s = s.replace("<","<");
|
|
25 s = s.replace(">",">");
|
|
26 s = s.replace("\"",""");
|
|
27 return s;
|
|
28 }
|
|
29
|
|
30 private static final Pattern entityNumPtn = Pattern.compile(
|
|
31 "&#(\\d+);"
|
|
32 );
|
|
33
|
|
34 public static String decode(String s) {
|
|
35 Matcher m = entityNumPtn.matcher(s);
|
|
36 if( m.find() ) {
|
|
37 StringBuffer buf = new StringBuffer();
|
|
38 do {
|
|
39 String entity = new String(new char[]{(char)Integer.parseInt(m.group(1))});
|
|
40 m.appendReplacement(buf,entity);
|
|
41 } while( m.find() );
|
|
42 m.appendTail(buf);
|
|
43 s = buf.toString();
|
|
44 }
|
|
45 s = s.replace(" "," ");
|
|
46 s = s.replace(""","\"");
|
|
47 s = s.replace(">",">");
|
|
48 s = s.replace("<","<");
|
|
49 s = s.replace("&","&");
|
|
50 return s;
|
|
51 }
|
|
52
|
|
53
|
|
54 public static final class Comment {
|
|
55 public final String text;
|
|
56
|
|
57 private Comment(String text) {
|
|
58 this.text = text;
|
|
59 }
|
|
60 }
|
|
61
|
|
62 public static final class CData {
|
|
63 public final String text;
|
|
64
|
|
65 private CData(String text) {
|
|
66 this.text = text;
|
|
67 }
|
|
68 }
|
|
69
|
|
70 public static final class Tag {
|
|
71 public final String name;
|
|
72 public final Map<String,Object> attributes;
|
|
73 public final boolean isEmpty;
|
|
74 public final String raw;
|
|
75
|
|
76 private Tag(String name,Map<String,Object> attributes,boolean isEmpty,String raw) {
|
|
77 this.name = name;
|
|
78 this.attributes = attributes;
|
|
79 this.isEmpty = isEmpty;
|
|
80 this.raw = raw;
|
|
81 }
|
|
82 }
|
|
83
|
|
84 public static final class Container {
|
|
85 public final Tag tag;
|
|
86 public final String text;
|
|
87
|
|
88 private Container(Tag tag,String text) {
|
|
89 this.tag = tag;
|
|
90 this.text = text;
|
|
91 }
|
|
92 }
|
|
93
|
|
94 private static Set<String> defaultContainerTags = new HashSet<String>();
|
|
95 static {
|
|
96 Collections.addAll( defaultContainerTags, "script", "style", "textarea" );
|
|
97 }
|
|
98
|
|
99 public static List parse(String text) {
|
|
100 return parse(text,defaultContainerTags);
|
|
101 }
|
|
102
|
|
103 public static List parse(String text,Set<String> containerTags) {
|
|
104 return new Html(text,containerTags).parse();
|
|
105 }
|
|
106
|
|
// Parser over the text being parsed.
private final Parser parser;
// Tag names that are handled as containers.
private final Set<String> containerTags;

/**
 * Creates a parsing session over the given text.
 *
 * @param text the HTML source
 * @param containerTags tag names to be handled as containers
 */
private Html(String text,Set<String> containerTags) {
	this.containerTags = containerTags;
	this.parser = new Parser(text);
}
|
|
114
|
|
115 private List parse() {
|
|
116 List list = new ArrayList();
|
|
117 StringBuilder sb = new StringBuilder();
|
|
118 while( !parser.endOfInput() ) {
|
|
119 if( parser.test('<') ) {
|
|
120 Tag tag = parseTag();
|
|
121 if( tag != null ) {
|
|
122 Object el = tag;
|
|
123 String tagName = tag.name;
|
|
124 if( containerTags.contains(tagName) ) {
|
|
125 Container container = parseContainer(tag);
|
|
126 if( container != null )
|
|
127 el = container;
|
|
128 }
|
|
129 if( el != null
|
|
130 || (el = parseComment()) != null
|
|
131 || (el = parseCdata()) != null
|
|
132 ) {
|
|
133 add(list,sb);
|
|
134 list.add(el);
|
|
135 continue;
|
|
136 }
|
|
137 }
|
|
138 }
|
|
139 sb.append( parser.currentChar() );
|
|
140 parser.anyChar();
|
|
141 }
|
|
142 add(list,sb);
|
|
143 return list;
|
|
144 }
|
|
145
|
|
146 private static void add(List list,StringBuilder sb) {
|
|
147 if( sb.length() > 0 ) {
|
|
148 list.add(decode(sb.toString()));
|
|
149 sb.setLength(0);
|
|
150 }
|
|
151 }
|
|
152
|
|
153 private Comment parseComment() {
|
|
154 parser.begin();
|
|
155 if( !parser.match("<!--") )
|
|
156 return parser.failure(null);
|
|
157 int start = parser.currentIndex();
|
|
158 while( !parser.test("-->") ) {
|
|
159 if( !parser.anyChar() )
|
|
160 return parser.failure(null);
|
|
161 }
|
|
162 String text = parser.textFrom(start);
|
|
163 Comment comment = new Comment(text);
|
|
164 return parser.success(comment);
|
|
165 }
|
|
166
|
|
167 private CData parseCdata() {
|
|
168 parser.begin();
|
|
169 if( !parser.match("<![CDATA[") )
|
|
170 return parser.failure(null);
|
|
171 int start = parser.currentIndex();
|
|
172 while( !parser.test("]]>") ) {
|
|
173 if( !parser.anyChar() )
|
|
174 return parser.failure(null);
|
|
175 }
|
|
176 String text = parser.textFrom(start);
|
|
177 CData cdata = new CData(text);
|
|
178 return parser.success(cdata);
|
|
179 }
|
|
180
|
|
181 private Container parseContainer(Tag tag) {
|
|
182 String endTagName = '/' + tag.name;
|
|
183 int start = parser.begin();
|
|
184 int end;
|
|
185 while(true) {
|
|
186 if( parser.test('<') ) {
|
|
187 end = parser.currentIndex();
|
|
188 Tag tag2 = parseTag();
|
|
189 if( tag2.name.equals(endTagName) )
|
|
190 break;
|
|
191 }
|
|
192 if( !parser.anyChar() )
|
|
193 return parser.failure(null);
|
|
194 }
|
|
195 String text = parser.text.substring(start,end);
|
|
196 Container container = new Container(tag,text);
|
|
197 return parser.success(container);
|
|
198 }
|
|
199
|
|
200 private Tag parseTag() {
|
|
201 int tagStart = parser.begin();
|
|
202 if( !parser.match('<') )
|
|
203 return parser.failure(null);
|
|
204 int start = parser.currentIndex();
|
|
205 parser.match('/');
|
|
206 if( !matchNameChar() )
|
|
207 return parser.failure(null);
|
|
208 while( matchNameChar() );
|
|
209 String name = parser.textFrom(start).toLowerCase();
|
|
210 Map<String,Object> attributes = new HashMap<String,Object>();
|
|
211 String attrName;
|
|
212 while( (attrName = parseAttrName()) != null ) {
|
|
213 String attrValue = parseAttrValue();
|
|
214 attributes.put( attrName, attrValue!=null ? attrValue : true );
|
|
215 /*
|
|
216 if( attrName.equals("style") && attrValue!=null ) {
|
|
217 LuanTable style = Css.style(attrValue);
|
|
218 if( style!=null )
|
|
219 tbl.rawPut("style",style);
|
|
220 }
|
|
221 */
|
|
222 }
|
|
223 while( matchSpace() );
|
|
224 boolean isEmpty = parser.match('/');
|
|
225 if( !parser.match('>') )
|
|
226 return parser.failure(null);
|
|
227 String raw = parser.textFrom(tagStart);
|
|
228 Tag tag = new Tag(name,attributes,isEmpty,raw);
|
|
229 return parser.success(tag);
|
|
230 }
|
|
231
|
|
232 private String parseAttrName() {
|
|
233 parser.begin();
|
|
234 if( !matchSpace() )
|
|
235 return parser.failure(null);
|
|
236 while( matchSpace() );
|
|
237 int start = parser.currentIndex();
|
|
238 if( !matchNameChar() )
|
|
239 return parser.failure(null);
|
|
240 while( matchNameChar() );
|
|
241 String name = parser.textFrom(start).toLowerCase();
|
|
242 return parser.success(name);
|
|
243 }
|
|
244
|
|
245 private String parseAttrValue() {
|
|
246 parser.begin();
|
|
247 while( matchSpace() );
|
|
248 if( !parser.match('=') )
|
|
249 return parser.failure(null);
|
|
250 while( matchSpace() );
|
|
251 if( parser.anyOf("\"'") ) {
|
|
252 char quote = parser.lastChar();
|
|
253 int start = parser.currentIndex();
|
|
254 while( !parser.test(quote) ) {
|
|
255 if( !parser.anyChar() )
|
|
256 return parser.failure(null);
|
|
257 }
|
|
258 String value = parser.textFrom(start);
|
|
259 parser.match(quote);
|
|
260 value = decode(value);
|
|
261 return parser.success(value);
|
|
262 }
|
|
263 int start = parser.currentIndex();
|
|
264 if( !matchValueChar() )
|
|
265 return parser.failure(null);
|
|
266 while( matchValueChar() );
|
|
267 String value = parser.textFrom(start);
|
|
268 value = decode(value);
|
|
269 return parser.success(value);
|
|
270 }
|
|
271
|
|
272 private boolean matchNameChar() {
|
|
273 return parser.inCharRange('a','z')
|
|
274 || parser.inCharRange('A','Z')
|
|
275 || parser.inCharRange('0','9')
|
|
276 || parser.anyOf("_.-:")
|
|
277 ;
|
|
278 }
|
|
279
|
|
280 private boolean matchValueChar() {
|
|
281 return parser.noneOf(" \t\r\n\"'>/=");
|
|
282 }
|
|
283
|
|
284 private boolean matchSpace() {
|
|
285 return parser.anyOf(" \t\r\n");
|
|
286 }
|
|
287
|
|
288 }
|