Mercurial Hosting > nabble
comparison src/fschmidt/html/Html.java @ 68:00520880ad02
add fschmidt source
| author | Franklin Schmidt <fschmidt@gmail.com> | 
|---|---|
| date | Sun, 05 Oct 2025 17:24:15 -0600 | 
| parents | |
| children | 
   comparison
  equal
  deleted
  inserted
  replaced
| 67:9d0fefce6985 | 68:00520880ad02 | 
|---|---|
| 1 /* | |
| 2 Copyright (c) 2008 Franklin Schmidt <fschmidt@gmail.com> | |
| 3 | |
| 4 Permission is hereby granted, free of charge, to any person obtaining a copy | |
| 5 of this software and associated documentation files (the "Software"), to deal | |
| 6 in the Software without restriction, including without limitation the rights | |
| 7 to use, copy, modify, merge, publish, distribute, sublicense, and/or sell | |
| 8 copies of the Software, and to permit persons to whom the Software is | |
| 9 furnished to do so, subject to the following conditions: | |
| 10 | |
| 11 The above copyright notice and this permission notice shall be included in | |
| 12 all copies or substantial portions of the Software. | |
| 13 | |
| 14 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR | |
| 15 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, | |
| 16 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE | |
| 17 AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER | |
| 18 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, | |
| 19 OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN | |
| 20 THE SOFTWARE. | |
| 21 */ | |
| 22 | |
| 23 package fschmidt.html; | |
| 24 | |
| 25 import java.io.InputStreamReader; | |
| 26 import java.util.ArrayList; | |
| 27 import java.util.Iterator; | |
| 28 import java.util.Arrays; | |
| 29 import java.util.Set; | |
| 30 import java.util.HashSet; | |
| 31 import org.slf4j.Logger; | |
| 32 import org.slf4j.LoggerFactory; | |
| 33 import fschmidt.util.java.HtmlUtils; | |
| 34 | |
| 35 | |
| 36 public final class Html extends ArrayList<Object> { | |
| 37 private static final Logger logger = LoggerFactory.getLogger(Html.class); | |
| 38 | |
| 39 public static final String TEXTAREA = "textarea"; | |
| 40 public static final String SCRIPT = "script"; | |
| 41 public static final String STYLE = "style"; | |
| 42 | |
| 43 private int startingLine = 0; | |
| 44 private boolean removeBadTags = false; | |
| 45 private Set<String> containerTags = new HashSet<String>(Arrays.asList(SCRIPT,STYLE)); | |
| 46 | |
| 47 public Html() {} | |
| 48 | |
| 49 public Html(String text) { | |
| 50 parse(text); | |
| 51 } | |
| 52 | |
| 53 public Set<String> containerTags() { | |
| 54 return containerTags; | |
| 55 } | |
| 56 | |
| 57 public void setStartingLine(int startingLine) { | |
| 58 this.startingLine = startingLine; | |
| 59 } | |
| 60 | |
| 61 public void removeBadTags(boolean removeBadTags) { | |
| 62 this.removeBadTags = removeBadTags; | |
| 63 } | |
| 64 | |
| 65 public void parse(String text) { | |
| 66 int len = text.length(); | |
| 67 int i = 0; | |
| 68 int i2Prev = 0; | |
| 69 int line = startingLine; | |
| 70 outer: | |
| 71 while( i < len ) { | |
| 72 int i2 = text.indexOf('<',i); | |
| 73 while( i2 != -1 && i2+1 < len ) { | |
| 74 char c = text.charAt(i2+1); | |
| 75 if( Character.isLetter(c) || c=='/' || c=='!' ) | |
| 76 break; | |
| 77 i2 = text.indexOf('<',i2+1); | |
| 78 } | |
| 79 if( i2 == -1 ) { | |
| 80 add( text.substring(i) ); | |
| 81 break; | |
| 82 } | |
| 83 if( i < i2 ) | |
| 84 add( text.substring(i,i2) ); | |
| 85 if( text.startsWith("<!--",i2) ) { | |
| 86 i = text.indexOf("-->",i2+4); | |
| 87 if( i == -1 ) { | |
| 88 add( text.substring(i2) ); | |
| 89 break; | |
| 90 } | |
| 91 add( new HtmlComment( text.substring(i2+4,i) ) ); | |
| 92 i += 3; | |
| 93 } else if( text.startsWith("<![CDATA[",i2) ) { | |
| 94 i = text.indexOf("]]>",i2+9); | |
| 95 if( i == -1 ) { | |
| 96 add( text.substring(i2) ); | |
| 97 break; | |
| 98 } | |
| 99 add( new HtmlCdata( text.substring(i2+9,i) ) ); | |
| 100 i += 3; | |
| 101 } else { | |
| 102 i = text.indexOf('>',i2); | |
| 103 if( i == -1 ) { | |
| 104 add( text.substring(i2) ); | |
| 105 break; | |
| 106 } | |
| 107 line += lines(text,i2Prev,i2); | |
| 108 i2Prev = i2; | |
| 109 String tagText = text.substring(i2+1,i); | |
| 110 try { | |
| 111 HtmlTag tag = new HtmlTag(tagText); | |
| 112 tag.lineNumber = line; | |
| 113 String tagName = tag.getName().toLowerCase(); | |
| 114 if( containerTags.contains(tagName) ) { | |
| 115 i2 = i; | |
| 116 String endTagName = '/' + tagName; | |
| 117 while(true) { | |
| 118 i2 = text.indexOf('<',i2+1); | |
| 119 if( i2 == -1 ) | |
| 120 break; | |
| 121 int i3 = text.indexOf('>',i2); | |
| 122 if( i3 == -1 ) | |
| 123 break; | |
| 124 int j = i2+1; | |
| 125 while( j<i3 && !Character.isWhitespace(text.charAt(j)) ) j++; | |
| 126 String s = text.substring(i2+1,j); | |
| 127 if( s.equalsIgnoreCase(endTagName) ) { | |
| 128 HtmlTag tag2 = new HtmlTag( text.substring(i2+1,i3) ); | |
| 129 line += lines(text,i2Prev,i2); | |
| 130 tag2.lineNumber = line; | |
| 131 i2Prev = i2; | |
| 132 String text2 = text.substring(i+1,i2); | |
| 133 HtmlTextContainer textContainer = | |
| 134 tagName.equals(TEXTAREA) ? | |
| 135 new HtmlTextarea(tag,text2,tag2) | |
| 136 : tagName.equals(SCRIPT) ? | |
| 137 new HtmlScript(tag,text2,tag2) | |
| 138 : tagName.equals(STYLE) ? | |
| 139 new HtmlStyle(tag,text2,tag2) | |
| 140 : | |
| 141 new HtmlTextContainer(tag,text2,tag2) | |
| 142 ; | |
| 143 add( textContainer ); | |
| 144 i = i3 + 1; | |
| 145 continue outer; | |
| 146 } | |
| 147 } | |
| 148 logger.warn("unclosed "+tagName); | |
| 149 } | |
| 150 i += 1; | |
| 151 add( tag ); | |
| 152 } catch(HtmlTag.BadTag e) { | |
| 153 // logger.debug("bad tag",e); | |
| 154 i += 1; | |
| 155 if( !removeBadTags ) { | |
| 156 add( "<" ); | |
| 157 add( HtmlUtils.htmlEncode(tagText) ); | |
| 158 add( ">" ); | |
| 159 } | |
| 160 } | |
| 161 } | |
| 162 } | |
| 163 } | |
| 164 | |
| 165 @Override public String toString() { | |
| 166 StringBuilder buf = new StringBuilder(); | |
| 167 for( Object o : this ) { | |
| 168 buf.append( o ); | |
| 169 } | |
| 170 return buf.toString(); | |
| 171 } | |
| 172 | |
| 173 private static int lines(String text,int start,int end) { | |
| 174 int n = 0; | |
| 175 int i = start - 1; | |
| 176 while(true) { | |
| 177 i = text.indexOf('\n',i+1); | |
| 178 if( i == -1 || i >= end ) | |
| 179 return n; | |
| 180 n++; | |
| 181 } | |
| 182 } | |
| 183 | |
| 184 public Html flatten() { | |
| 185 Html html = new Html(); | |
| 186 flattenTo(html); | |
| 187 return html; | |
| 188 } | |
| 189 | |
| 190 void flattenTo(Html html) { | |
| 191 for( Object obj : this ) { | |
| 192 if( obj instanceof HtmlNode ) { | |
| 193 ((HtmlNode)obj).flattenTo(html); | |
| 194 } else { | |
| 195 html.add(obj); | |
| 196 } | |
| 197 } | |
| 198 } | |
| 199 | |
| 200 public Html deepen() { | |
| 201 Iterator iter = iterator(); | |
| 202 Html html = deepen(iter); | |
| 203 if( iter.hasNext() ) | |
| 204 throw new RuntimeException("unmatched end tag:\n"+html); | |
| 205 return html; | |
| 206 } | |
| 207 | |
| 208 private static Html deepen(Iterator iter) { | |
| 209 Html html = new Html(); | |
| 210 while( iter.hasNext() ) { | |
| 211 Object obj = iter.next(); | |
| 212 if( obj instanceof HtmlTag && !(obj instanceof HtmlNode) ) { | |
| 213 HtmlTag tag = (HtmlTag)obj; | |
| 214 if( !tag.isEmpty() ) { | |
| 215 String name = tag.getName(); | |
| 216 if( name.startsWith("/") ) { | |
| 217 html.add(tag); | |
| 218 return html; | |
| 219 } | |
| 220 Html children = deepen(iter); | |
| 221 HtmlTag endTag = (HtmlTag)children.get(children.size()-1); | |
| 222 if( endTag.getName().equals("/"+name) ) { | |
| 223 children.remove(children.size()-1); | |
| 224 html.add( new HtmlNode(tag,children) ); | |
| 225 continue; | |
| 226 } else { | |
| 227 html.add(tag); | |
| 228 html.addAll(children); | |
| 229 return html; | |
| 230 } | |
| 231 } | |
| 232 } | |
| 233 html.add(obj); | |
| 234 } | |
| 235 return html; | |
| 236 } | |
| 237 | |
| 238 public static void main(String[] args) throws Exception { | |
| 239 /* | |
| 240 String page = fschmidt.util.java.IoUtils.readPage("http://www.yahoo.com/"); | |
| 241 Html html = new Html(page); | |
| 242 String s = html.toString(); | |
| 243 System.out.print(s); | |
| 244 // System.out.println(html.size()); | |
| 245 */ | |
| 246 String page = fschmidt.util.java.IoUtils.readAll(new InputStreamReader(System.in)); | |
| 247 Html html = new Html(page); | |
| 248 for( Iterator i=html.iterator(); i.hasNext(); ) { | |
| 249 Object o = i.next(); | |
| 250 System.out.println(o.getClass().getName()+" - "+o); | |
| 251 } | |
| 252 } | |
| 253 } | 
