| 68 | 1 /* | 
|  | 2 Copyright (c) 2008  Franklin Schmidt <fschmidt@gmail.com> | 
|  | 3 | 
|  | 4 Permission is hereby granted, free of charge, to any person obtaining a copy | 
|  | 5 of this software and associated documentation files (the "Software"), to deal | 
|  | 6 in the Software without restriction, including without limitation the rights | 
|  | 7 to use, copy, modify, merge, publish, distribute, sublicense, and/or sell | 
|  | 8 copies of the Software, and to permit persons to whom the Software is | 
|  | 9 furnished to do so, subject to the following conditions: | 
|  | 10 | 
|  | 11 The above copyright notice and this permission notice shall be included in | 
|  | 12 all copies or substantial portions of the Software. | 
|  | 13 | 
|  | 14 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR | 
|  | 15 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, | 
|  | 16 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE | 
|  | 17 AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER | 
|  | 18 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, | 
|  | 19 OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN | 
|  | 20 THE SOFTWARE. | 
|  | 21 */ | 
|  | 22 | 
|  | 23 package fschmidt.html; | 
|  | 24 | 
|  | 25 import java.io.InputStreamReader; | 
|  | 26 import java.util.ArrayList; | 
|  | 27 import java.util.Iterator; | 
|  | 28 import java.util.Arrays; | 
|  | 29 import java.util.Set; | 
|  | 30 import java.util.HashSet; | 
|  | 31 import org.slf4j.Logger; | 
|  | 32 import org.slf4j.LoggerFactory; | 
|  | 33 import fschmidt.util.java.HtmlUtils; | 
|  | 34 | 
|  | 35 | 
|  | 36 public final class Html extends ArrayList<Object> { | 
|  | 37 	private static final Logger logger = LoggerFactory.getLogger(Html.class); | 
|  | 38 | 
|  | 39 	public static final String TEXTAREA = "textarea"; | 
|  | 40 	public static final String SCRIPT = "script"; | 
|  | 41 	public static final String STYLE = "style"; | 
|  | 42 | 
|  | 43 	private int startingLine = 0; | 
|  | 44 	private boolean removeBadTags = false; | 
|  | 45 	private Set<String> containerTags = new HashSet<String>(Arrays.asList(SCRIPT,STYLE)); | 
|  | 46 | 
|  | 47 	public Html() {} | 
|  | 48 | 
|  | 49 	public Html(String text) { | 
|  | 50 		parse(text); | 
|  | 51 	} | 
|  | 52 | 
|  | 53 	public Set<String> containerTags() { | 
|  | 54 		return containerTags; | 
|  | 55 	} | 
|  | 56 | 
|  | 57 	public void setStartingLine(int startingLine) { | 
|  | 58 		this.startingLine = startingLine; | 
|  | 59 	} | 
|  | 60 | 
|  | 61 	public void removeBadTags(boolean removeBadTags) { | 
|  | 62 		this.removeBadTags = removeBadTags; | 
|  | 63 	} | 
|  | 64 | 
|  | 65 	public void parse(String text) { | 
|  | 66 		int len = text.length(); | 
|  | 67 		int i = 0; | 
|  | 68 		int i2Prev = 0; | 
|  | 69 		int line = startingLine; | 
|  | 70 outer: | 
|  | 71 		while( i < len ) { | 
|  | 72 			int i2 = text.indexOf('<',i); | 
|  | 73 			while( i2 != -1 && i2+1 < len ) { | 
|  | 74 				char c = text.charAt(i2+1); | 
|  | 75 				if( Character.isLetter(c) || c=='/' || c=='!' ) | 
|  | 76 					break; | 
|  | 77 				i2 = text.indexOf('<',i2+1); | 
|  | 78 			} | 
|  | 79 			if( i2 == -1 ) { | 
|  | 80 				add( text.substring(i) ); | 
|  | 81 				break; | 
|  | 82 			} | 
|  | 83 			if( i < i2 ) | 
|  | 84 				add( text.substring(i,i2) ); | 
|  | 85 			if( text.startsWith("<!--",i2) ) { | 
|  | 86 				i = text.indexOf("-->",i2+4); | 
|  | 87 				if( i == -1 ) { | 
|  | 88 					add( text.substring(i2) ); | 
|  | 89 					break; | 
|  | 90 				} | 
|  | 91 				add( new HtmlComment( text.substring(i2+4,i) ) ); | 
|  | 92 				i += 3; | 
|  | 93 			} else if( text.startsWith("<![CDATA[",i2) ) { | 
|  | 94 				i = text.indexOf("]]>",i2+9); | 
|  | 95 				if( i == -1 ) { | 
|  | 96 					add( text.substring(i2) ); | 
|  | 97 					break; | 
|  | 98 				} | 
|  | 99 				add( new HtmlCdata( text.substring(i2+9,i) ) ); | 
|  | 100 				i += 3; | 
|  | 101 			} else { | 
|  | 102 				i = text.indexOf('>',i2); | 
|  | 103 				if( i == -1 ) { | 
|  | 104 					add( text.substring(i2) ); | 
|  | 105 					break; | 
|  | 106 				} | 
|  | 107 				line += lines(text,i2Prev,i2); | 
|  | 108 				i2Prev = i2; | 
|  | 109 				String tagText = text.substring(i2+1,i); | 
|  | 110 				try { | 
|  | 111 					HtmlTag tag = new HtmlTag(tagText); | 
|  | 112 					tag.lineNumber = line; | 
|  | 113 					String tagName = tag.getName().toLowerCase(); | 
|  | 114 					if( containerTags.contains(tagName) ) { | 
|  | 115 						i2 = i; | 
|  | 116 						String endTagName = '/' + tagName; | 
|  | 117 						while(true) { | 
|  | 118 							i2 = text.indexOf('<',i2+1); | 
|  | 119 							if( i2 == -1 ) | 
|  | 120 								break; | 
|  | 121 							int i3 = text.indexOf('>',i2); | 
|  | 122 							if( i3 == -1 ) | 
|  | 123 								break; | 
|  | 124 							int j = i2+1; | 
|  | 125 							while( j<i3 && !Character.isWhitespace(text.charAt(j)) )  j++; | 
|  | 126 							String s = text.substring(i2+1,j); | 
|  | 127 							if( s.equalsIgnoreCase(endTagName) ) { | 
|  | 128 								HtmlTag tag2 = new HtmlTag( text.substring(i2+1,i3) ); | 
|  | 129 								line += lines(text,i2Prev,i2); | 
|  | 130 								tag2.lineNumber = line; | 
|  | 131 								i2Prev = i2; | 
|  | 132 								String text2 = text.substring(i+1,i2); | 
|  | 133 								HtmlTextContainer textContainer = | 
|  | 134 									tagName.equals(TEXTAREA) ? | 
|  | 135 										new HtmlTextarea(tag,text2,tag2) | 
|  | 136 									: tagName.equals(SCRIPT) ? | 
|  | 137 										new HtmlScript(tag,text2,tag2) | 
|  | 138 									: tagName.equals(STYLE) ? | 
|  | 139 										new HtmlStyle(tag,text2,tag2) | 
|  | 140 									: | 
|  | 141 										new HtmlTextContainer(tag,text2,tag2) | 
|  | 142 								; | 
|  | 143 								add( textContainer ); | 
|  | 144 								i = i3 + 1; | 
|  | 145 								continue outer; | 
|  | 146 							} | 
|  | 147 						} | 
|  | 148 						logger.warn("unclosed "+tagName); | 
|  | 149 					} | 
|  | 150 					i += 1; | 
|  | 151 					add( tag ); | 
|  | 152 				} catch(HtmlTag.BadTag e) { | 
|  | 153 //					logger.debug("bad tag",e); | 
|  | 154 					i += 1; | 
|  | 155 					if( !removeBadTags ) { | 
|  | 156 						add( "<" ); | 
|  | 157 						add( HtmlUtils.htmlEncode(tagText) ); | 
|  | 158 						add( ">" ); | 
|  | 159 					} | 
|  | 160 				} | 
|  | 161 			} | 
|  | 162 		} | 
|  | 163 	} | 
|  | 164 | 
|  | 165 	@Override public String toString() { | 
|  | 166 		StringBuilder buf = new StringBuilder(); | 
|  | 167 		for( Object o : this ) { | 
|  | 168 			buf.append( o ); | 
|  | 169 		} | 
|  | 170 		return buf.toString(); | 
|  | 171 	} | 
|  | 172 | 
|  | 173 	private static int lines(String text,int start,int end) { | 
|  | 174 		int n = 0; | 
|  | 175 		int i = start - 1; | 
|  | 176 		while(true) { | 
|  | 177 			i = text.indexOf('\n',i+1); | 
|  | 178 			if( i == -1 || i >= end ) | 
|  | 179 				return n; | 
|  | 180 			n++; | 
|  | 181 		} | 
|  | 182 	} | 
|  | 183 | 
|  | 184 	public Html flatten() { | 
|  | 185 		Html html = new Html(); | 
|  | 186 		flattenTo(html); | 
|  | 187 		return html; | 
|  | 188 	} | 
|  | 189 | 
|  | 190 	void flattenTo(Html html) { | 
|  | 191 		for( Object obj : this ) { | 
|  | 192 			if( obj instanceof HtmlNode ) { | 
|  | 193 				((HtmlNode)obj).flattenTo(html); | 
|  | 194 			} else { | 
|  | 195 				html.add(obj); | 
|  | 196 			} | 
|  | 197 		} | 
|  | 198 	} | 
|  | 199 | 
|  | 200 	public Html deepen() { | 
|  | 201 		Iterator iter = iterator(); | 
|  | 202 		Html html = deepen(iter); | 
|  | 203 		if( iter.hasNext() ) | 
|  | 204 			throw new RuntimeException("unmatched end tag:\n"+html); | 
|  | 205 		return html; | 
|  | 206 	} | 
|  | 207 | 
|  | 208 	private static Html deepen(Iterator iter) { | 
|  | 209 		Html html = new Html(); | 
|  | 210 		while( iter.hasNext() ) { | 
|  | 211 			Object obj = iter.next(); | 
|  | 212 			if( obj instanceof HtmlTag && !(obj instanceof HtmlNode) ) { | 
|  | 213 				HtmlTag tag = (HtmlTag)obj; | 
|  | 214 				if( !tag.isEmpty() ) { | 
|  | 215 					String name = tag.getName(); | 
|  | 216 					if( name.startsWith("/") ) { | 
|  | 217 						html.add(tag); | 
|  | 218 						return html; | 
|  | 219 					} | 
|  | 220 					Html children = deepen(iter); | 
|  | 221 					HtmlTag endTag = (HtmlTag)children.get(children.size()-1); | 
|  | 222 					if( endTag.getName().equals("/"+name) ) { | 
|  | 223 						children.remove(children.size()-1); | 
|  | 224 						html.add( new HtmlNode(tag,children) ); | 
|  | 225 						continue; | 
|  | 226 					} else { | 
|  | 227 						html.add(tag); | 
|  | 228 						html.addAll(children); | 
|  | 229 						return html; | 
|  | 230 					} | 
|  | 231 				} | 
|  | 232 			} | 
|  | 233 			html.add(obj); | 
|  | 234 		} | 
|  | 235 		return html; | 
|  | 236 	} | 
|  | 237 | 
|  | 238 	public static void main(String[] args) throws Exception { | 
|  | 239 /* | 
|  | 240 		String page = fschmidt.util.java.IoUtils.readPage("http://www.yahoo.com/"); | 
|  | 241 		Html html = new Html(page); | 
|  | 242 		String s = html.toString(); | 
|  | 243 		System.out.print(s); | 
|  | 244 //		System.out.println(html.size()); | 
|  | 245 */ | 
|  | 246 		String page = fschmidt.util.java.IoUtils.readAll(new InputStreamReader(System.in)); | 
|  | 247 		Html html = new Html(page); | 
|  | 248 		for( Iterator i=html.iterator(); i.hasNext(); ) { | 
|  | 249 			Object o = i.next(); | 
|  | 250 			System.out.println(o.getClass().getName()+" - "+o); | 
|  | 251 		} | 
|  | 252 	} | 
|  | 253 } |