Mercurial Hosting > nabble
comparison src/fschmidt/html/Html.java @ 68:00520880ad02
add fschmidt source
author | Franklin Schmidt <fschmidt@gmail.com> |
---|---|
date | Sun, 05 Oct 2025 17:24:15 -0600 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
67:9d0fefce6985 | 68:00520880ad02 |
---|---|
1 /* | |
2 Copyright (c) 2008 Franklin Schmidt <fschmidt@gmail.com> | |
3 | |
4 Permission is hereby granted, free of charge, to any person obtaining a copy | |
5 of this software and associated documentation files (the "Software"), to deal | |
6 in the Software without restriction, including without limitation the rights | |
7 to use, copy, modify, merge, publish, distribute, sublicense, and/or sell | |
8 copies of the Software, and to permit persons to whom the Software is | |
9 furnished to do so, subject to the following conditions: | |
10 | |
11 The above copyright notice and this permission notice shall be included in | |
12 all copies or substantial portions of the Software. | |
13 | |
14 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR | |
15 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, | |
16 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE | |
17 AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER | |
18 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, | |
19 OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN | |
20 THE SOFTWARE. | |
21 */ | |
22 | |
23 package fschmidt.html; | |
24 | |
25 import java.io.InputStreamReader; | |
26 import java.util.ArrayList; | |
27 import java.util.Iterator; | |
28 import java.util.Arrays; | |
29 import java.util.Set; | |
30 import java.util.HashSet; | |
31 import org.slf4j.Logger; | |
32 import org.slf4j.LoggerFactory; | |
33 import fschmidt.util.java.HtmlUtils; | |
34 | |
35 | |
36 public final class Html extends ArrayList<Object> { | |
37 private static final Logger logger = LoggerFactory.getLogger(Html.class); | |
38 | |
39 public static final String TEXTAREA = "textarea"; | |
40 public static final String SCRIPT = "script"; | |
41 public static final String STYLE = "style"; | |
42 | |
43 private int startingLine = 0; | |
44 private boolean removeBadTags = false; | |
45 private Set<String> containerTags = new HashSet<String>(Arrays.asList(SCRIPT,STYLE)); | |
46 | |
47 public Html() {} | |
48 | |
49 public Html(String text) { | |
50 parse(text); | |
51 } | |
52 | |
53 public Set<String> containerTags() { | |
54 return containerTags; | |
55 } | |
56 | |
57 public void setStartingLine(int startingLine) { | |
58 this.startingLine = startingLine; | |
59 } | |
60 | |
61 public void removeBadTags(boolean removeBadTags) { | |
62 this.removeBadTags = removeBadTags; | |
63 } | |
64 | |
65 public void parse(String text) { | |
66 int len = text.length(); | |
67 int i = 0; | |
68 int i2Prev = 0; | |
69 int line = startingLine; | |
70 outer: | |
71 while( i < len ) { | |
72 int i2 = text.indexOf('<',i); | |
73 while( i2 != -1 && i2+1 < len ) { | |
74 char c = text.charAt(i2+1); | |
75 if( Character.isLetter(c) || c=='/' || c=='!' ) | |
76 break; | |
77 i2 = text.indexOf('<',i2+1); | |
78 } | |
79 if( i2 == -1 ) { | |
80 add( text.substring(i) ); | |
81 break; | |
82 } | |
83 if( i < i2 ) | |
84 add( text.substring(i,i2) ); | |
85 if( text.startsWith("<!--",i2) ) { | |
86 i = text.indexOf("-->",i2+4); | |
87 if( i == -1 ) { | |
88 add( text.substring(i2) ); | |
89 break; | |
90 } | |
91 add( new HtmlComment( text.substring(i2+4,i) ) ); | |
92 i += 3; | |
93 } else if( text.startsWith("<![CDATA[",i2) ) { | |
94 i = text.indexOf("]]>",i2+9); | |
95 if( i == -1 ) { | |
96 add( text.substring(i2) ); | |
97 break; | |
98 } | |
99 add( new HtmlCdata( text.substring(i2+9,i) ) ); | |
100 i += 3; | |
101 } else { | |
102 i = text.indexOf('>',i2); | |
103 if( i == -1 ) { | |
104 add( text.substring(i2) ); | |
105 break; | |
106 } | |
107 line += lines(text,i2Prev,i2); | |
108 i2Prev = i2; | |
109 String tagText = text.substring(i2+1,i); | |
110 try { | |
111 HtmlTag tag = new HtmlTag(tagText); | |
112 tag.lineNumber = line; | |
113 String tagName = tag.getName().toLowerCase(); | |
114 if( containerTags.contains(tagName) ) { | |
115 i2 = i; | |
116 String endTagName = '/' + tagName; | |
117 while(true) { | |
118 i2 = text.indexOf('<',i2+1); | |
119 if( i2 == -1 ) | |
120 break; | |
121 int i3 = text.indexOf('>',i2); | |
122 if( i3 == -1 ) | |
123 break; | |
124 int j = i2+1; | |
125 while( j<i3 && !Character.isWhitespace(text.charAt(j)) ) j++; | |
126 String s = text.substring(i2+1,j); | |
127 if( s.equalsIgnoreCase(endTagName) ) { | |
128 HtmlTag tag2 = new HtmlTag( text.substring(i2+1,i3) ); | |
129 line += lines(text,i2Prev,i2); | |
130 tag2.lineNumber = line; | |
131 i2Prev = i2; | |
132 String text2 = text.substring(i+1,i2); | |
133 HtmlTextContainer textContainer = | |
134 tagName.equals(TEXTAREA) ? | |
135 new HtmlTextarea(tag,text2,tag2) | |
136 : tagName.equals(SCRIPT) ? | |
137 new HtmlScript(tag,text2,tag2) | |
138 : tagName.equals(STYLE) ? | |
139 new HtmlStyle(tag,text2,tag2) | |
140 : | |
141 new HtmlTextContainer(tag,text2,tag2) | |
142 ; | |
143 add( textContainer ); | |
144 i = i3 + 1; | |
145 continue outer; | |
146 } | |
147 } | |
148 logger.warn("unclosed "+tagName); | |
149 } | |
150 i += 1; | |
151 add( tag ); | |
152 } catch(HtmlTag.BadTag e) { | |
153 // logger.debug("bad tag",e); | |
154 i += 1; | |
155 if( !removeBadTags ) { | |
156 add( "<" ); | |
157 add( HtmlUtils.htmlEncode(tagText) ); | |
158 add( ">" ); | |
159 } | |
160 } | |
161 } | |
162 } | |
163 } | |
164 | |
165 @Override public String toString() { | |
166 StringBuilder buf = new StringBuilder(); | |
167 for( Object o : this ) { | |
168 buf.append( o ); | |
169 } | |
170 return buf.toString(); | |
171 } | |
172 | |
173 private static int lines(String text,int start,int end) { | |
174 int n = 0; | |
175 int i = start - 1; | |
176 while(true) { | |
177 i = text.indexOf('\n',i+1); | |
178 if( i == -1 || i >= end ) | |
179 return n; | |
180 n++; | |
181 } | |
182 } | |
183 | |
184 public Html flatten() { | |
185 Html html = new Html(); | |
186 flattenTo(html); | |
187 return html; | |
188 } | |
189 | |
190 void flattenTo(Html html) { | |
191 for( Object obj : this ) { | |
192 if( obj instanceof HtmlNode ) { | |
193 ((HtmlNode)obj).flattenTo(html); | |
194 } else { | |
195 html.add(obj); | |
196 } | |
197 } | |
198 } | |
199 | |
200 public Html deepen() { | |
201 Iterator iter = iterator(); | |
202 Html html = deepen(iter); | |
203 if( iter.hasNext() ) | |
204 throw new RuntimeException("unmatched end tag:\n"+html); | |
205 return html; | |
206 } | |
207 | |
208 private static Html deepen(Iterator iter) { | |
209 Html html = new Html(); | |
210 while( iter.hasNext() ) { | |
211 Object obj = iter.next(); | |
212 if( obj instanceof HtmlTag && !(obj instanceof HtmlNode) ) { | |
213 HtmlTag tag = (HtmlTag)obj; | |
214 if( !tag.isEmpty() ) { | |
215 String name = tag.getName(); | |
216 if( name.startsWith("/") ) { | |
217 html.add(tag); | |
218 return html; | |
219 } | |
220 Html children = deepen(iter); | |
221 HtmlTag endTag = (HtmlTag)children.get(children.size()-1); | |
222 if( endTag.getName().equals("/"+name) ) { | |
223 children.remove(children.size()-1); | |
224 html.add( new HtmlNode(tag,children) ); | |
225 continue; | |
226 } else { | |
227 html.add(tag); | |
228 html.addAll(children); | |
229 return html; | |
230 } | |
231 } | |
232 } | |
233 html.add(obj); | |
234 } | |
235 return html; | |
236 } | |
237 | |
238 public static void main(String[] args) throws Exception { | |
239 /* | |
240 String page = fschmidt.util.java.IoUtils.readPage("http://www.yahoo.com/"); | |
241 Html html = new Html(page); | |
242 String s = html.toString(); | |
243 System.out.print(s); | |
244 // System.out.println(html.size()); | |
245 */ | |
246 String page = fschmidt.util.java.IoUtils.readAll(new InputStreamReader(System.in)); | |
247 Html html = new Html(page); | |
248 for( Iterator i=html.iterator(); i.hasNext(); ) { | |
249 Object o = i.next(); | |
250 System.out.println(o.getClass().getName()+" - "+o); | |
251 } | |
252 } | |
253 } |