comparison src/fschmidt/html/Html.java @ 68:00520880ad02

add fschmidt source
author Franklin Schmidt <fschmidt@gmail.com>
date Sun, 05 Oct 2025 17:24:15 -0600
parents
children
comparison
equal deleted inserted replaced
67:9d0fefce6985 68:00520880ad02
1 /*
2 Copyright (c) 2008 Franklin Schmidt <fschmidt@gmail.com>
3
4 Permission is hereby granted, free of charge, to any person obtaining a copy
5 of this software and associated documentation files (the "Software"), to deal
6 in the Software without restriction, including without limitation the rights
7 to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
8 copies of the Software, and to permit persons to whom the Software is
9 furnished to do so, subject to the following conditions:
10
11 The above copyright notice and this permission notice shall be included in
12 all copies or substantial portions of the Software.
13
14 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
15 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
16 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
17 AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
18 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
19 OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
20 THE SOFTWARE.
21 */
22
23 package fschmidt.html;
24
25 import java.io.InputStreamReader;
26 import java.util.ArrayList;
27 import java.util.Iterator;
28 import java.util.Arrays;
29 import java.util.Set;
30 import java.util.HashSet;
31 import org.slf4j.Logger;
32 import org.slf4j.LoggerFactory;
33 import fschmidt.util.java.HtmlUtils;
34
35
36 public final class Html extends ArrayList<Object> {
37 private static final Logger logger = LoggerFactory.getLogger(Html.class);
38
39 public static final String TEXTAREA = "textarea";
40 public static final String SCRIPT = "script";
41 public static final String STYLE = "style";
42
43 private int startingLine = 0;
44 private boolean removeBadTags = false;
45 private Set<String> containerTags = new HashSet<String>(Arrays.asList(SCRIPT,STYLE));
46
/** Creates an empty list; adjust the settings if needed, then call {@link #parse(String)}. */
public Html() {
	super();
}
48
/**
 * Creates a list containing the parse of {@code text}.
 * Parsing happens immediately, so the initial settings apply (starting line 0, bad tags kept
 * as escaped text, script and style as container tags).
 *
 * @param text the HTML to parse
 */
public Html(String text) {
	this.parse(text);
}
52
53 public Set<String> containerTags() {
54 return containerTags;
55 }
56
57 public void setStartingLine(int startingLine) {
58 this.startingLine = startingLine;
59 }
60
61 public void removeBadTags(boolean removeBadTags) {
62 this.removeBadTags = removeBadTags;
63 }
64
65 public void parse(String text) {
66 int len = text.length();
67 int i = 0;
68 int i2Prev = 0;
69 int line = startingLine;
70 outer:
71 while( i < len ) {
72 int i2 = text.indexOf('<',i);
73 while( i2 != -1 && i2+1 < len ) {
74 char c = text.charAt(i2+1);
75 if( Character.isLetter(c) || c=='/' || c=='!' )
76 break;
77 i2 = text.indexOf('<',i2+1);
78 }
79 if( i2 == -1 ) {
80 add( text.substring(i) );
81 break;
82 }
83 if( i < i2 )
84 add( text.substring(i,i2) );
85 if( text.startsWith("<!--",i2) ) {
86 i = text.indexOf("-->",i2+4);
87 if( i == -1 ) {
88 add( text.substring(i2) );
89 break;
90 }
91 add( new HtmlComment( text.substring(i2+4,i) ) );
92 i += 3;
93 } else if( text.startsWith("<![CDATA[",i2) ) {
94 i = text.indexOf("]]>",i2+9);
95 if( i == -1 ) {
96 add( text.substring(i2) );
97 break;
98 }
99 add( new HtmlCdata( text.substring(i2+9,i) ) );
100 i += 3;
101 } else {
102 i = text.indexOf('>',i2);
103 if( i == -1 ) {
104 add( text.substring(i2) );
105 break;
106 }
107 line += lines(text,i2Prev,i2);
108 i2Prev = i2;
109 String tagText = text.substring(i2+1,i);
110 try {
111 HtmlTag tag = new HtmlTag(tagText);
112 tag.lineNumber = line;
113 String tagName = tag.getName().toLowerCase();
114 if( containerTags.contains(tagName) ) {
115 i2 = i;
116 String endTagName = '/' + tagName;
117 while(true) {
118 i2 = text.indexOf('<',i2+1);
119 if( i2 == -1 )
120 break;
121 int i3 = text.indexOf('>',i2);
122 if( i3 == -1 )
123 break;
124 int j = i2+1;
125 while( j<i3 && !Character.isWhitespace(text.charAt(j)) ) j++;
126 String s = text.substring(i2+1,j);
127 if( s.equalsIgnoreCase(endTagName) ) {
128 HtmlTag tag2 = new HtmlTag( text.substring(i2+1,i3) );
129 line += lines(text,i2Prev,i2);
130 tag2.lineNumber = line;
131 i2Prev = i2;
132 String text2 = text.substring(i+1,i2);
133 HtmlTextContainer textContainer =
134 tagName.equals(TEXTAREA) ?
135 new HtmlTextarea(tag,text2,tag2)
136 : tagName.equals(SCRIPT) ?
137 new HtmlScript(tag,text2,tag2)
138 : tagName.equals(STYLE) ?
139 new HtmlStyle(tag,text2,tag2)
140 :
141 new HtmlTextContainer(tag,text2,tag2)
142 ;
143 add( textContainer );
144 i = i3 + 1;
145 continue outer;
146 }
147 }
148 logger.warn("unclosed "+tagName);
149 }
150 i += 1;
151 add( tag );
152 } catch(HtmlTag.BadTag e) {
153 // logger.debug("bad tag",e);
154 i += 1;
155 if( !removeBadTags ) {
156 add( "&lt;" );
157 add( HtmlUtils.htmlEncode(tagText) );
158 add( "&gt;" );
159 }
160 }
161 }
162 }
163 }
164
165 @Override public String toString() {
166 StringBuilder buf = new StringBuilder();
167 for( Object o : this ) {
168 buf.append( o );
169 }
170 return buf.toString();
171 }
172
173 private static int lines(String text,int start,int end) {
174 int n = 0;
175 int i = start - 1;
176 while(true) {
177 i = text.indexOf('\n',i+1);
178 if( i == -1 || i >= end )
179 return n;
180 n++;
181 }
182 }
183
184 public Html flatten() {
185 Html html = new Html();
186 flattenTo(html);
187 return html;
188 }
189
190 void flattenTo(Html html) {
191 for( Object obj : this ) {
192 if( obj instanceof HtmlNode ) {
193 ((HtmlNode)obj).flattenTo(html);
194 } else {
195 html.add(obj);
196 }
197 }
198 }
199
200 public Html deepen() {
201 Iterator iter = iterator();
202 Html html = deepen(iter);
203 if( iter.hasNext() )
204 throw new RuntimeException("unmatched end tag:\n"+html);
205 return html;
206 }
207
208 private static Html deepen(Iterator iter) {
209 Html html = new Html();
210 while( iter.hasNext() ) {
211 Object obj = iter.next();
212 if( obj instanceof HtmlTag && !(obj instanceof HtmlNode) ) {
213 HtmlTag tag = (HtmlTag)obj;
214 if( !tag.isEmpty() ) {
215 String name = tag.getName();
216 if( name.startsWith("/") ) {
217 html.add(tag);
218 return html;
219 }
220 Html children = deepen(iter);
221 HtmlTag endTag = (HtmlTag)children.get(children.size()-1);
222 if( endTag.getName().equals("/"+name) ) {
223 children.remove(children.size()-1);
224 html.add( new HtmlNode(tag,children) );
225 continue;
226 } else {
227 html.add(tag);
228 html.addAll(children);
229 return html;
230 }
231 }
232 }
233 html.add(obj);
234 }
235 return html;
236 }
237
238 public static void main(String[] args) throws Exception {
239 /*
240 String page = fschmidt.util.java.IoUtils.readPage("http://www.yahoo.com/");
241 Html html = new Html(page);
242 String s = html.toString();
243 System.out.print(s);
244 // System.out.println(html.size());
245 */
246 String page = fschmidt.util.java.IoUtils.readAll(new InputStreamReader(System.in));
247 Html html = new Html(page);
248 for( Iterator i=html.iterator(); i.hasNext(); ) {
249 Object o = i.next();
250 System.out.println(o.getClass().getName()+" - "+o);
251 }
252 }
253 }