/*
Copyright (c) 2008 Franklin Schmidt <fschmidt@gmail.com>

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/

package fschmidt.html;

import java.io.InputStreamReader;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashSet;
import java.util.Iterator;
import java.util.Locale;
import java.util.Set;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import fschmidt.util.java.HtmlUtils;


36 public final class Html extends ArrayList<Object> {
|
|
37 private static final Logger logger = LoggerFactory.getLogger(Html.class);
|
|
38
|
|
39 public static final String TEXTAREA = "textarea";
|
|
40 public static final String SCRIPT = "script";
|
|
41 public static final String STYLE = "style";
|
|
42
|
|
43 private int startingLine = 0;
|
|
44 private boolean removeBadTags = false;
|
|
45 private Set<String> containerTags = new HashSet<String>(Arrays.asList(SCRIPT,STYLE));
|
|
46
|
|
47 public Html() {}
|
|
48
|
|
49 public Html(String text) {
|
|
50 parse(text);
|
|
51 }
|
|
52
|
|
53 public Set<String> containerTags() {
|
|
54 return containerTags;
|
|
55 }
|
|
56
|
|
57 public void setStartingLine(int startingLine) {
|
|
58 this.startingLine = startingLine;
|
|
59 }
|
|
60
|
|
61 public void removeBadTags(boolean removeBadTags) {
|
|
62 this.removeBadTags = removeBadTags;
|
|
63 }
|
|
64
|
|
65 public void parse(String text) {
|
|
66 int len = text.length();
|
|
67 int i = 0;
|
|
68 int i2Prev = 0;
|
|
69 int line = startingLine;
|
|
70 outer:
|
|
71 while( i < len ) {
|
|
72 int i2 = text.indexOf('<',i);
|
|
73 while( i2 != -1 && i2+1 < len ) {
|
|
74 char c = text.charAt(i2+1);
|
|
75 if( Character.isLetter(c) || c=='/' || c=='!' )
|
|
76 break;
|
|
77 i2 = text.indexOf('<',i2+1);
|
|
78 }
|
|
79 if( i2 == -1 ) {
|
|
80 add( text.substring(i) );
|
|
81 break;
|
|
82 }
|
|
83 if( i < i2 )
|
|
84 add( text.substring(i,i2) );
|
|
85 if( text.startsWith("<!--",i2) ) {
|
|
86 i = text.indexOf("-->",i2+4);
|
|
87 if( i == -1 ) {
|
|
88 add( text.substring(i2) );
|
|
89 break;
|
|
90 }
|
|
91 add( new HtmlComment( text.substring(i2+4,i) ) );
|
|
92 i += 3;
|
|
93 } else if( text.startsWith("<![CDATA[",i2) ) {
|
|
94 i = text.indexOf("]]>",i2+9);
|
|
95 if( i == -1 ) {
|
|
96 add( text.substring(i2) );
|
|
97 break;
|
|
98 }
|
|
99 add( new HtmlCdata( text.substring(i2+9,i) ) );
|
|
100 i += 3;
|
|
101 } else {
|
|
102 i = text.indexOf('>',i2);
|
|
103 if( i == -1 ) {
|
|
104 add( text.substring(i2) );
|
|
105 break;
|
|
106 }
|
|
107 line += lines(text,i2Prev,i2);
|
|
108 i2Prev = i2;
|
|
109 String tagText = text.substring(i2+1,i);
|
|
110 try {
|
|
111 HtmlTag tag = new HtmlTag(tagText);
|
|
112 tag.lineNumber = line;
|
|
113 String tagName = tag.getName().toLowerCase();
|
|
114 if( containerTags.contains(tagName) ) {
|
|
115 i2 = i;
|
|
116 String endTagName = '/' + tagName;
|
|
117 while(true) {
|
|
118 i2 = text.indexOf('<',i2+1);
|
|
119 if( i2 == -1 )
|
|
120 break;
|
|
121 int i3 = text.indexOf('>',i2);
|
|
122 if( i3 == -1 )
|
|
123 break;
|
|
124 int j = i2+1;
|
|
125 while( j<i3 && !Character.isWhitespace(text.charAt(j)) ) j++;
|
|
126 String s = text.substring(i2+1,j);
|
|
127 if( s.equalsIgnoreCase(endTagName) ) {
|
|
128 HtmlTag tag2 = new HtmlTag( text.substring(i2+1,i3) );
|
|
129 line += lines(text,i2Prev,i2);
|
|
130 tag2.lineNumber = line;
|
|
131 i2Prev = i2;
|
|
132 String text2 = text.substring(i+1,i2);
|
|
133 HtmlTextContainer textContainer =
|
|
134 tagName.equals(TEXTAREA) ?
|
|
135 new HtmlTextarea(tag,text2,tag2)
|
|
136 : tagName.equals(SCRIPT) ?
|
|
137 new HtmlScript(tag,text2,tag2)
|
|
138 : tagName.equals(STYLE) ?
|
|
139 new HtmlStyle(tag,text2,tag2)
|
|
140 :
|
|
141 new HtmlTextContainer(tag,text2,tag2)
|
|
142 ;
|
|
143 add( textContainer );
|
|
144 i = i3 + 1;
|
|
145 continue outer;
|
|
146 }
|
|
147 }
|
|
148 logger.warn("unclosed "+tagName);
|
|
149 }
|
|
150 i += 1;
|
|
151 add( tag );
|
|
152 } catch(HtmlTag.BadTag e) {
|
|
153 // logger.debug("bad tag",e);
|
|
154 i += 1;
|
|
155 if( !removeBadTags ) {
|
|
156 add( "<" );
|
|
157 add( HtmlUtils.htmlEncode(tagText) );
|
|
158 add( ">" );
|
|
159 }
|
|
160 }
|
|
161 }
|
|
162 }
|
|
163 }
|
|
164
|
|
165 @Override public String toString() {
|
|
166 StringBuilder buf = new StringBuilder();
|
|
167 for( Object o : this ) {
|
|
168 buf.append( o );
|
|
169 }
|
|
170 return buf.toString();
|
|
171 }
|
|
172
|
|
173 private static int lines(String text,int start,int end) {
|
|
174 int n = 0;
|
|
175 int i = start - 1;
|
|
176 while(true) {
|
|
177 i = text.indexOf('\n',i+1);
|
|
178 if( i == -1 || i >= end )
|
|
179 return n;
|
|
180 n++;
|
|
181 }
|
|
182 }
|
|
183
|
|
184 public Html flatten() {
|
|
185 Html html = new Html();
|
|
186 flattenTo(html);
|
|
187 return html;
|
|
188 }
|
|
189
|
|
190 void flattenTo(Html html) {
|
|
191 for( Object obj : this ) {
|
|
192 if( obj instanceof HtmlNode ) {
|
|
193 ((HtmlNode)obj).flattenTo(html);
|
|
194 } else {
|
|
195 html.add(obj);
|
|
196 }
|
|
197 }
|
|
198 }
|
|
199
|
|
200 public Html deepen() {
|
|
201 Iterator iter = iterator();
|
|
202 Html html = deepen(iter);
|
|
203 if( iter.hasNext() )
|
|
204 throw new RuntimeException("unmatched end tag:\n"+html);
|
|
205 return html;
|
|
206 }
|
|
207
|
|
208 private static Html deepen(Iterator iter) {
|
|
209 Html html = new Html();
|
|
210 while( iter.hasNext() ) {
|
|
211 Object obj = iter.next();
|
|
212 if( obj instanceof HtmlTag && !(obj instanceof HtmlNode) ) {
|
|
213 HtmlTag tag = (HtmlTag)obj;
|
|
214 if( !tag.isEmpty() ) {
|
|
215 String name = tag.getName();
|
|
216 if( name.startsWith("/") ) {
|
|
217 html.add(tag);
|
|
218 return html;
|
|
219 }
|
|
220 Html children = deepen(iter);
|
|
221 HtmlTag endTag = (HtmlTag)children.get(children.size()-1);
|
|
222 if( endTag.getName().equals("/"+name) ) {
|
|
223 children.remove(children.size()-1);
|
|
224 html.add( new HtmlNode(tag,children) );
|
|
225 continue;
|
|
226 } else {
|
|
227 html.add(tag);
|
|
228 html.addAll(children);
|
|
229 return html;
|
|
230 }
|
|
231 }
|
|
232 }
|
|
233 html.add(obj);
|
|
234 }
|
|
235 return html;
|
|
236 }
|
|
237
|
|
238 public static void main(String[] args) throws Exception {
|
|
239 /*
|
|
240 String page = fschmidt.util.java.IoUtils.readPage("http://www.yahoo.com/");
|
|
241 Html html = new Html(page);
|
|
242 String s = html.toString();
|
|
243 System.out.print(s);
|
|
244 // System.out.println(html.size());
|
|
245 */
|
|
246 String page = fschmidt.util.java.IoUtils.readAll(new InputStreamReader(System.in));
|
|
247 Html html = new Html(page);
|
|
248 for( Iterator i=html.iterator(); i.hasNext(); ) {
|
|
249 Object o = i.next();
|
|
250 System.out.println(o.getClass().getName()+" - "+o);
|
|
251 }
|
|
252 }
|
|
253 }
|