| 
68
 | 
     1 /*
 | 
| 
 | 
     2 Copyright (c) 2008  Franklin Schmidt <fschmidt@gmail.com>
 | 
| 
 | 
     3 
 | 
| 
 | 
     4 Permission is hereby granted, free of charge, to any person obtaining a copy
 | 
| 
 | 
     5 of this software and associated documentation files (the "Software"), to deal
 | 
| 
 | 
     6 in the Software without restriction, including without limitation the rights
 | 
| 
 | 
     7 to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 | 
| 
 | 
     8 copies of the Software, and to permit persons to whom the Software is
 | 
| 
 | 
     9 furnished to do so, subject to the following conditions:
 | 
| 
 | 
    10 
 | 
| 
 | 
    11 The above copyright notice and this permission notice shall be included in
 | 
| 
 | 
    12 all copies or substantial portions of the Software.
 | 
| 
 | 
    13 
 | 
| 
 | 
    14 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 | 
| 
 | 
    15 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 | 
| 
 | 
    16 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 | 
| 
 | 
    17 AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 | 
| 
 | 
    18 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 | 
| 
 | 
    19 OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 | 
| 
 | 
    20 THE SOFTWARE.
 | 
| 
 | 
    21 */
 | 
| 
 | 
    22 
 | 
| 
 | 
    23 package fschmidt.html;
 | 
| 
 | 
    24 
 | 
| 
 | 
    25 import java.io.InputStreamReader;
 | 
| 
 | 
    26 import java.util.ArrayList;
 | 
| 
 | 
    27 import java.util.Iterator;
 | 
| 
 | 
    28 import java.util.Arrays;
 | 
| 
 | 
    29 import java.util.Set;
 | 
| 
 | 
    30 import java.util.HashSet;
 | 
| 
 | 
    31 import org.slf4j.Logger;
 | 
| 
 | 
    32 import org.slf4j.LoggerFactory;
 | 
| 
 | 
    33 import fschmidt.util.java.HtmlUtils;
 | 
| 
 | 
    34 
 | 
| 
 | 
    35 
 | 
| 
 | 
    36 public final class Html extends ArrayList<Object> {
 | 
| 
 | 
    37 	private static final Logger logger = LoggerFactory.getLogger(Html.class);
 | 
| 
 | 
    38 
 | 
| 
 | 
    39 	public static final String TEXTAREA = "textarea";
 | 
| 
 | 
    40 	public static final String SCRIPT = "script";
 | 
| 
 | 
    41 	public static final String STYLE = "style";
 | 
| 
 | 
    42 
 | 
| 
 | 
    43 	private int startingLine = 0;
 | 
| 
 | 
    44 	private boolean removeBadTags = false;
 | 
| 
 | 
    45 	private Set<String> containerTags = new HashSet<String>(Arrays.asList(SCRIPT,STYLE));
 | 
| 
 | 
    46 
 | 
| 
 | 
    47 	public Html() {}
 | 
| 
 | 
    48 
 | 
| 
 | 
    49 	public Html(String text) {
 | 
| 
 | 
    50 		parse(text);
 | 
| 
 | 
    51 	}
 | 
| 
 | 
    52 
 | 
| 
 | 
    53 	public Set<String> containerTags() {
 | 
| 
 | 
    54 		return containerTags;
 | 
| 
 | 
    55 	}
 | 
| 
 | 
    56 
 | 
| 
 | 
    57 	public void setStartingLine(int startingLine) {
 | 
| 
 | 
    58 		this.startingLine = startingLine;
 | 
| 
 | 
    59 	}
 | 
| 
 | 
    60 
 | 
| 
 | 
    61 	public void removeBadTags(boolean removeBadTags) {
 | 
| 
 | 
    62 		this.removeBadTags = removeBadTags;
 | 
| 
 | 
    63 	}
 | 
| 
 | 
    64 
 | 
| 
 | 
    65 	public void parse(String text) {
 | 
| 
 | 
    66 		int len = text.length();
 | 
| 
 | 
    67 		int i = 0;
 | 
| 
 | 
    68 		int i2Prev = 0;
 | 
| 
 | 
    69 		int line = startingLine;
 | 
| 
 | 
    70 outer:
 | 
| 
 | 
    71 		while( i < len ) {
 | 
| 
 | 
    72 			int i2 = text.indexOf('<',i);
 | 
| 
 | 
    73 			while( i2 != -1 && i2+1 < len ) {
 | 
| 
 | 
    74 				char c = text.charAt(i2+1);
 | 
| 
 | 
    75 				if( Character.isLetter(c) || c=='/' || c=='!' )
 | 
| 
 | 
    76 					break;
 | 
| 
 | 
    77 				i2 = text.indexOf('<',i2+1);
 | 
| 
 | 
    78 			}
 | 
| 
 | 
    79 			if( i2 == -1 ) {
 | 
| 
 | 
    80 				add( text.substring(i) );
 | 
| 
 | 
    81 				break;
 | 
| 
 | 
    82 			}
 | 
| 
 | 
    83 			if( i < i2 )
 | 
| 
 | 
    84 				add( text.substring(i,i2) );
 | 
| 
 | 
    85 			if( text.startsWith("<!--",i2) ) {
 | 
| 
 | 
    86 				i = text.indexOf("-->",i2+4);
 | 
| 
 | 
    87 				if( i == -1 ) {
 | 
| 
 | 
    88 					add( text.substring(i2) );
 | 
| 
 | 
    89 					break;
 | 
| 
 | 
    90 				}
 | 
| 
 | 
    91 				add( new HtmlComment( text.substring(i2+4,i) ) );
 | 
| 
 | 
    92 				i += 3;
 | 
| 
 | 
    93 			} else if( text.startsWith("<![CDATA[",i2) ) {
 | 
| 
 | 
    94 				i = text.indexOf("]]>",i2+9);
 | 
| 
 | 
    95 				if( i == -1 ) {
 | 
| 
 | 
    96 					add( text.substring(i2) );
 | 
| 
 | 
    97 					break;
 | 
| 
 | 
    98 				}
 | 
| 
 | 
    99 				add( new HtmlCdata( text.substring(i2+9,i) ) );
 | 
| 
 | 
   100 				i += 3;
 | 
| 
 | 
   101 			} else {
 | 
| 
 | 
   102 				i = text.indexOf('>',i2);
 | 
| 
 | 
   103 				if( i == -1 ) {
 | 
| 
 | 
   104 					add( text.substring(i2) );
 | 
| 
 | 
   105 					break;
 | 
| 
 | 
   106 				}
 | 
| 
 | 
   107 				line += lines(text,i2Prev,i2);
 | 
| 
 | 
   108 				i2Prev = i2;
 | 
| 
 | 
   109 				String tagText = text.substring(i2+1,i);
 | 
| 
 | 
   110 				try {
 | 
| 
 | 
   111 					HtmlTag tag = new HtmlTag(tagText);
 | 
| 
 | 
   112 					tag.lineNumber = line;
 | 
| 
 | 
   113 					String tagName = tag.getName().toLowerCase();
 | 
| 
 | 
   114 					if( containerTags.contains(tagName) ) {
 | 
| 
 | 
   115 						i2 = i;
 | 
| 
 | 
   116 						String endTagName = '/' + tagName;
 | 
| 
 | 
   117 						while(true) {
 | 
| 
 | 
   118 							i2 = text.indexOf('<',i2+1);
 | 
| 
 | 
   119 							if( i2 == -1 )
 | 
| 
 | 
   120 								break;
 | 
| 
 | 
   121 							int i3 = text.indexOf('>',i2);
 | 
| 
 | 
   122 							if( i3 == -1 )
 | 
| 
 | 
   123 								break;
 | 
| 
 | 
   124 							int j = i2+1;
 | 
| 
 | 
   125 							while( j<i3 && !Character.isWhitespace(text.charAt(j)) )  j++;
 | 
| 
 | 
   126 							String s = text.substring(i2+1,j);
 | 
| 
 | 
   127 							if( s.equalsIgnoreCase(endTagName) ) {
 | 
| 
 | 
   128 								HtmlTag tag2 = new HtmlTag( text.substring(i2+1,i3) );
 | 
| 
 | 
   129 								line += lines(text,i2Prev,i2);
 | 
| 
 | 
   130 								tag2.lineNumber = line;
 | 
| 
 | 
   131 								i2Prev = i2;
 | 
| 
 | 
   132 								String text2 = text.substring(i+1,i2);
 | 
| 
 | 
   133 								HtmlTextContainer textContainer =
 | 
| 
 | 
   134 									tagName.equals(TEXTAREA) ?
 | 
| 
 | 
   135 										new HtmlTextarea(tag,text2,tag2)
 | 
| 
 | 
   136 									: tagName.equals(SCRIPT) ?
 | 
| 
 | 
   137 										new HtmlScript(tag,text2,tag2)
 | 
| 
 | 
   138 									: tagName.equals(STYLE) ?
 | 
| 
 | 
   139 										new HtmlStyle(tag,text2,tag2)
 | 
| 
 | 
   140 									:
 | 
| 
 | 
   141 										new HtmlTextContainer(tag,text2,tag2)
 | 
| 
 | 
   142 								;
 | 
| 
 | 
   143 								add( textContainer );
 | 
| 
 | 
   144 								i = i3 + 1;
 | 
| 
 | 
   145 								continue outer;
 | 
| 
 | 
   146 							}
 | 
| 
 | 
   147 						}
 | 
| 
 | 
   148 						logger.warn("unclosed "+tagName);
 | 
| 
 | 
   149 					}
 | 
| 
 | 
   150 					i += 1;
 | 
| 
 | 
   151 					add( tag );
 | 
| 
 | 
   152 				} catch(HtmlTag.BadTag e) {
 | 
| 
 | 
   153 //					logger.debug("bad tag",e);
 | 
| 
 | 
   154 					i += 1;
 | 
| 
 | 
   155 					if( !removeBadTags ) {
 | 
| 
 | 
   156 						add( "<" );
 | 
| 
 | 
   157 						add( HtmlUtils.htmlEncode(tagText) );
 | 
| 
 | 
   158 						add( ">" );
 | 
| 
 | 
   159 					}
 | 
| 
 | 
   160 				}
 | 
| 
 | 
   161 			}
 | 
| 
 | 
   162 		}
 | 
| 
 | 
   163 	}
 | 
| 
 | 
   164 
 | 
| 
 | 
   165 	@Override public String toString() {
 | 
| 
 | 
   166 		StringBuilder buf = new StringBuilder();
 | 
| 
 | 
   167 		for( Object o : this ) {
 | 
| 
 | 
   168 			buf.append( o );
 | 
| 
 | 
   169 		}
 | 
| 
 | 
   170 		return buf.toString();
 | 
| 
 | 
   171 	}
 | 
| 
 | 
   172 
 | 
| 
 | 
   173 	private static int lines(String text,int start,int end) {
 | 
| 
 | 
   174 		int n = 0;
 | 
| 
 | 
   175 		int i = start - 1;
 | 
| 
 | 
   176 		while(true) {
 | 
| 
 | 
   177 			i = text.indexOf('\n',i+1);
 | 
| 
 | 
   178 			if( i == -1 || i >= end )
 | 
| 
 | 
   179 				return n;
 | 
| 
 | 
   180 			n++;
 | 
| 
 | 
   181 		}
 | 
| 
 | 
   182 	}
 | 
| 
 | 
   183 
 | 
| 
 | 
   184 	public Html flatten() {
 | 
| 
 | 
   185 		Html html = new Html();
 | 
| 
 | 
   186 		flattenTo(html);
 | 
| 
 | 
   187 		return html;
 | 
| 
 | 
   188 	}
 | 
| 
 | 
   189 
 | 
| 
 | 
   190 	void flattenTo(Html html) {
 | 
| 
 | 
   191 		for( Object obj : this ) {
 | 
| 
 | 
   192 			if( obj instanceof HtmlNode ) {
 | 
| 
 | 
   193 				((HtmlNode)obj).flattenTo(html);
 | 
| 
 | 
   194 			} else {
 | 
| 
 | 
   195 				html.add(obj);
 | 
| 
 | 
   196 			}
 | 
| 
 | 
   197 		}
 | 
| 
 | 
   198 	}
 | 
| 
 | 
   199 
 | 
| 
 | 
   200 	public Html deepen() {
 | 
| 
 | 
   201 		Iterator iter = iterator();
 | 
| 
 | 
   202 		Html html = deepen(iter);
 | 
| 
 | 
   203 		if( iter.hasNext() )
 | 
| 
 | 
   204 			throw new RuntimeException("unmatched end tag:\n"+html);
 | 
| 
 | 
   205 		return html;
 | 
| 
 | 
   206 	}
 | 
| 
 | 
   207 
 | 
| 
 | 
   208 	private static Html deepen(Iterator iter) {
 | 
| 
 | 
   209 		Html html = new Html();
 | 
| 
 | 
   210 		while( iter.hasNext() ) {
 | 
| 
 | 
   211 			Object obj = iter.next();
 | 
| 
 | 
   212 			if( obj instanceof HtmlTag && !(obj instanceof HtmlNode) ) {
 | 
| 
 | 
   213 				HtmlTag tag = (HtmlTag)obj;
 | 
| 
 | 
   214 				if( !tag.isEmpty() ) {
 | 
| 
 | 
   215 					String name = tag.getName();
 | 
| 
 | 
   216 					if( name.startsWith("/") ) {
 | 
| 
 | 
   217 						html.add(tag);
 | 
| 
 | 
   218 						return html;
 | 
| 
 | 
   219 					}
 | 
| 
 | 
   220 					Html children = deepen(iter);
 | 
| 
 | 
   221 					HtmlTag endTag = (HtmlTag)children.get(children.size()-1);
 | 
| 
 | 
   222 					if( endTag.getName().equals("/"+name) ) {
 | 
| 
 | 
   223 						children.remove(children.size()-1);
 | 
| 
 | 
   224 						html.add( new HtmlNode(tag,children) );
 | 
| 
 | 
   225 						continue;
 | 
| 
 | 
   226 					} else {
 | 
| 
 | 
   227 						html.add(tag);
 | 
| 
 | 
   228 						html.addAll(children);
 | 
| 
 | 
   229 						return html;
 | 
| 
 | 
   230 					}
 | 
| 
 | 
   231 				}
 | 
| 
 | 
   232 			}
 | 
| 
 | 
   233 			html.add(obj);
 | 
| 
 | 
   234 		}
 | 
| 
 | 
   235 		return html;
 | 
| 
 | 
   236 	}
 | 
| 
 | 
   237 
 | 
| 
 | 
   238 	public static void main(String[] args) throws Exception {
 | 
| 
 | 
   239 /*
 | 
| 
 | 
   240 		String page = fschmidt.util.java.IoUtils.readPage("http://www.yahoo.com/");
 | 
| 
 | 
   241 		Html html = new Html(page);
 | 
| 
 | 
   242 		String s = html.toString();
 | 
| 
 | 
   243 		System.out.print(s);
 | 
| 
 | 
   244 //		System.out.println(html.size());
 | 
| 
 | 
   245 */
 | 
| 
 | 
   246 		String page = fschmidt.util.java.IoUtils.readAll(new InputStreamReader(System.in));
 | 
| 
 | 
   247 		Html html = new Html(page);
 | 
| 
 | 
   248 		for( Iterator i=html.iterator(); i.hasNext(); ) {
 | 
| 
 | 
   249 			Object o = i.next();
 | 
| 
 | 
   250 			System.out.println(o.getClass().getName()+" - "+o);
 | 
| 
 | 
   251 		}
 | 
| 
 | 
   252 	}
 | 
| 
 | 
   253 }
 |