diff src/fschmidt/html/Html.java @ 68:00520880ad02

add fschmidt source
author Franklin Schmidt <fschmidt@gmail.com>
date Sun, 05 Oct 2025 17:24:15 -0600
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/fschmidt/html/Html.java	Sun Oct 05 17:24:15 2025 -0600
@@ -0,0 +1,253 @@
+/*
+Copyright (c) 2008  Franklin Schmidt <fschmidt@gmail.com>
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+package fschmidt.html;
+
+import java.io.InputStreamReader;
+import java.util.ArrayList;
+import java.util.Iterator;
+import java.util.Arrays;
+import java.util.Set;
+import java.util.HashSet;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+import fschmidt.util.java.HtmlUtils;
+
+
+public final class Html extends ArrayList<Object> {
+	private static final Logger logger = LoggerFactory.getLogger(Html.class);
+
+	/**
+	 * Tag name for textarea elements. It is not a container tag by default; add it to
+	 * {@link #containerTags()} to have parse() capture textarea bodies as raw text.
+	 */
+	public static final String TEXTAREA = "textarea";
+	/** Tag name for script elements; a container tag by default. */
+	public static final String SCRIPT = "script";
+	/** Tag name for style elements; a container tag by default. */
+	public static final String STYLE = "style";
+
+	// Line number that parse() starts counting from; see setStartingLine().
+	private int startingLine = 0;
+	// When true, parse() drops text that failed to parse as a tag instead of emitting it escaped.
+	private boolean removeBadTags = false;
+	// Lower-case names of tags whose content parse() captures verbatim up to the matching end tag.
+	// The reference is never reassigned, hence final; callers change the contents via containerTags().
+	private final Set<String> containerTags = new HashSet<String>(Arrays.asList(SCRIPT,STYLE));
+
+	/** Creates an empty list; call {@link #parse(String)} to populate it. */
+	public Html() {}
+
+	/**
+	 * Creates a list holding the parsed form of {@code text}.
+	 * Parsing runs inside this constructor, so it uses the initial settings
+	 * (starting line 0, bad tags kept, container tags script and style).
+	 *
+	 * @param text the markup to parse
+	 */
+	public Html(String text) {
+		parse(text);
+	}
+
+	/**
+	 * Returns the set of tag names that parse() treats as text containers.
+	 * This is the internal set itself, not a copy, so adding or removing names
+	 * changes how later parse() calls behave. parse() looks names up in lower case.
+	 *
+	 * @return the live, mutable set of container tag names
+	 */
+	public Set<String> containerTags() {
+		return containerTags;
+	}
+
+	/**
+	 * Sets the line number that parse() starts counting from; the number recorded on
+	 * each tag is this value plus the count of newline characters before the tag.
+	 *
+	 * @param startingLine the line number given to tags that precede any newline
+	 */
+	public void setStartingLine(int startingLine) {
+		this.startingLine = startingLine;
+	}
+
+	/**
+	 * Chooses what parse() does with text between '<' and '>' that HtmlTag rejects.
+	 *
+	 * @param removeBadTags {@code true} to drop such text; {@code false} (the initial value)
+	 *        to keep it in the output as HTML-escaped text
+	 */
+	public void removeBadTags(boolean removeBadTags) {
+		this.removeBadTags = removeBadTags;
+	}
+
+	/**
+	 * Parses {@code text} and appends the resulting pieces to this list in document order.
+	 * <p>
+	 * The appended elements are:
+	 * <ul>
+	 * <li>{@code String} for plain text, including any '<' not followed by a letter, '/' or '!';</li>
+	 * <li>{@code HtmlComment} for {@code <!-- ... -->};</li>
+	 * <li>{@code HtmlCdata} for {@code <![CDATA[ ... ]]>};</li>
+	 * <li>{@code HtmlTag} for every other {@code < ... >};</li>
+	 * <li>{@code HtmlTextContainer} (or its textarea/script/style variant) for a tag whose
+	 *     lower-cased name is in {@link #containerTags()}, together with the raw text up to
+	 *     its matching end tag and the end tag itself.</li>
+	 * </ul>
+	 * Text that HtmlTag rejects is appended as the three strings {@code "&lt;"}, the
+	 * HTML-encoded tag text and {@code "&gt;"}, or dropped when removeBadTags is set.
+	 * An unterminated comment, CDATA section or tag is appended as plain text and ends parsing.
+	 * <p>
+	 * Each tag's {@code lineNumber} is the starting line plus the number of newline
+	 * characters that precede the tag's '<'.
+	 *
+	 * @param text the markup to parse; must not be null
+	 */
+	public void parse(String text) {
+		int len = text.length();
+		int i = 0;  // start of the not-yet-consumed input
+		int i2Prev = 0;  // newlines before this position are already counted in line
+		int line = startingLine;
+outer:
+		while( i < len ) {
+			// Find the next '<' that can start markup; other '<' characters stay part of the text.
+			int i2 = text.indexOf('<',i);
+			while( i2 != -1 && i2+1 < len ) {
+				char c = text.charAt(i2+1);
+				if( Character.isLetter(c) || c=='/' || c=='!' )
+					break;
+				i2 = text.indexOf('<',i2+1);
+			}
+			if( i2 == -1 ) {
+				// No more markup: the remainder is plain text.
+				add( text.substring(i) );
+				break;
+			}
+			if( i < i2 )
+				add( text.substring(i,i2) );
+			if( text.startsWith("<!--",i2) ) {
+				i = text.indexOf("-->",i2+4);
+				if( i == -1 ) {
+					// Unterminated comment: keep the rest as text.
+					add( text.substring(i2) );
+					break;
+				}
+				add( new HtmlComment( text.substring(i2+4,i) ) );
+				i += 3;
+			} else if( text.startsWith("<![CDATA[",i2) ) {
+				i = text.indexOf("]]>",i2+9);
+				if( i == -1 ) {
+					// Unterminated CDATA section: keep the rest as text.
+					add( text.substring(i2) );
+					break;
+				}
+				add( new HtmlCdata( text.substring(i2+9,i) ) );
+				i += 3;
+			} else {
+				i = text.indexOf('>',i2);
+				if( i == -1 ) {
+					// Unterminated tag: keep the rest as text.
+					add( text.substring(i2) );
+					break;
+				}
+				line += lines(text,i2Prev,i2);
+				i2Prev = i2;
+				String tagText = text.substring(i2+1,i);
+				try {
+					HtmlTag tag = new HtmlTag(tagText);
+					tag.lineNumber = line;
+					// Locale.ROOT keeps the lookup independent of the default locale; under a
+					// Turkish locale "SCRIPT".toLowerCase() would not yield "script".
+					String tagName = tag.getName().toLowerCase(java.util.Locale.ROOT);
+					if( containerTags.contains(tagName) ) {
+						// Scan forward for the matching end tag, treating everything before it as raw text.
+						i2 = i;
+						String endTagName = '/' + tagName;
+						while(true) {
+							i2 = text.indexOf('<',i2+1);
+							if( i2 == -1 )
+								break;
+							int i3 = text.indexOf('>',i2);
+							if( i3 == -1 )
+								break;
+							// The candidate name runs from after '<' to the first whitespace or the '>'.
+							int j = i2+1;
+							while( j<i3 && !Character.isWhitespace(text.charAt(j)) )  j++;
+							String s = text.substring(i2+1,j);
+							if( s.equalsIgnoreCase(endTagName) ) {
+								// NOTE(review): if HtmlTag rejects the end tag text, the BadTag handler
+								// below reports the opening tag as bad - confirm this is intended.
+								HtmlTag tag2 = new HtmlTag( text.substring(i2+1,i3) );
+								line += lines(text,i2Prev,i2);
+								tag2.lineNumber = line;
+								i2Prev = i2;
+								String text2 = text.substring(i+1,i2);
+								HtmlTextContainer textContainer =
+									tagName.equals(TEXTAREA) ?
+										new HtmlTextarea(tag,text2,tag2)
+									: tagName.equals(SCRIPT) ?
+										new HtmlScript(tag,text2,tag2)
+									: tagName.equals(STYLE) ?
+										new HtmlStyle(tag,text2,tag2)
+									:
+										new HtmlTextContainer(tag,text2,tag2)
+								;
+								add( textContainer );
+								i = i3 + 1;
+								continue outer;
+							}
+						}
+						// No end tag found: fall through and treat the opening tag as an ordinary tag.
+						logger.warn("unclosed {}",tagName);
+					}
+					i += 1;
+					add( tag );
+				} catch(HtmlTag.BadTag e) {
+					// A rejected tag is not an error here: skip past its '>' and either drop it
+					// or emit it as escaped text.
+					i += 1;
+					if( !removeBadTags ) {
+						add( "&lt;" );
+						add( HtmlUtils.htmlEncode(tagText) );
+						add( "&gt;" );
+					}
+				}
+			}
+		}
+	}
+
+	/**
+	 * Concatenates the string form of every element, in list order, with no separator.
+	 *
+	 * @return the joined text of all elements
+	 */
+	@Override public String toString() {
+		StringBuilder out = new StringBuilder();
+		Iterator<Object> elements = iterator();
+		while( elements.hasNext() )
+			out.append( elements.next() );
+		return out.toString();
+	}
+
+	/**
+	 * Counts the newline characters of {@code text} located at positions
+	 * {@code start} (inclusive) through {@code end} (exclusive).
+	 *
+	 * @param text the text to scan
+	 * @param start first position examined
+	 * @param end position at which counting stops
+	 * @return the number of '\n' characters found in that range
+	 */
+	private static int lines(String text,int start,int end) {
+		int count = 0;
+		int pos = text.indexOf('\n',start);
+		while( pos != -1 && pos < end ) {
+			count++;
+			pos = text.indexOf('\n',pos+1);
+		}
+		return count;
+	}
+
+	/**
+	 * Builds a new list by passing this list through {@code flattenTo}, which expands
+	 * each HtmlNode element and copies every other element unchanged.
+	 *
+	 * @return a newly created list; this list is not modified
+	 */
+	public Html flatten() {
+		Html flat = new Html();
+		flattenTo(flat);
+		return flat;
+	}
+
+	/**
+	 * Appends the flattened form of this list to {@code html}: HtmlNode elements
+	 * append themselves through their own flattenTo, everything else is added as is.
+	 *
+	 * @param html the list that receives the elements
+	 */
+	void flattenTo(Html html) {
+		for( Object element : this ) {
+			if( !(element instanceof HtmlNode) ) {
+				html.add(element);
+				continue;
+			}
+			HtmlNode node = (HtmlNode)element;
+			node.flattenTo(html);
+		}
+	}
+
+	/**
+	 * Builds a nested copy of this list in which each open tag, the elements up to
+	 * its matching end tag, and that end tag are replaced by a single HtmlNode.
+	 * This list itself is not modified.
+	 *
+	 * @return the nested list
+	 * @throws RuntimeException if the nesting pass stopped early and left elements
+	 *         unprocessed, which happens when an end tag has no open tag to close
+	 */
+	public Html deepen() {
+		Iterator<Object> iter = iterator();
+		Html html = deepen(iter);
+		if( iter.hasNext() )
+			throw new RuntimeException("unmatched end tag:\n"+html);
+		return html;
+	}
+
+	/**
+	 * Recursive worker for {@link #deepen()}. Consumes elements from {@code iter} and
+	 * collects them into a new list until the input runs out or an end tag is reached.
+	 * <p>
+	 * When an end tag is reached it is returned as the LAST element of the result so
+	 * the caller can check whether it closes the caller's open tag. A matched pair
+	 * becomes one HtmlNode holding the open tag and its children. An open tag with no
+	 * matching end tag is left flat, followed by its would-be children, and the result
+	 * is returned at once so that any trailing end tag can be matched further up.
+	 * Tags whose isEmpty() is true, HtmlNode instances and non-tag elements are copied as is.
+	 *
+	 * @param iter shared iterator over the flat element list; advanced by this call
+	 * @return the elements collected at this nesting level
+	 */
+	private static Html deepen(Iterator<Object> iter) {
+		Html html = new Html();
+		while( iter.hasNext() ) {
+			Object obj = iter.next();
+			if( obj instanceof HtmlTag && !(obj instanceof HtmlNode) ) {
+				HtmlTag tag = (HtmlTag)obj;
+				if( !tag.isEmpty() ) {
+					String name = tag.getName();
+					if( name.startsWith("/") ) {
+						// End tag: close this level and hand the tag back to the caller.
+						html.add(tag);
+						return html;
+					}
+					Html children = deepen(iter);
+					// The nested result ends with an end tag only if one was found. If the input
+					// ran out first it may be empty or end with text, a comment or a node, so the
+					// last element must be checked before it is treated as a tag.
+					Object last = children.isEmpty() ? null : children.get(children.size()-1);
+					// Tag names are compared without regard to case, as parse() does for end tags.
+					if( last instanceof HtmlTag && ((HtmlTag)last).getName().equalsIgnoreCase("/"+name) ) {
+						children.remove(children.size()-1);
+						html.add( new HtmlNode(tag,children) );
+						continue;
+					}
+					// No matching end tag: keep the open tag and its followers flat.
+					html.add(tag);
+					html.addAll(children);
+					return html;
+				}
+			}
+			html.add(obj);
+		}
+		return html;
+	}
+
+	/**
+	 * Debugging entry point: reads all of standard input, parses it, and prints one
+	 * line per parsed element in the form {@code <class name> - <element>}.
+	 *
+	 * @param args ignored
+	 * @throws Exception if reading standard input fails
+	 */
+	public static void main(String[] args) throws Exception {
+/*
+		String page = fschmidt.util.java.IoUtils.readPage("http://www.yahoo.com/");
+		Html html = new Html(page);
+		String s = html.toString();
+		System.out.print(s);
+//		System.out.println(html.size());
+*/
+		InputStreamReader stdin = new InputStreamReader(System.in);
+		String input = fschmidt.util.java.IoUtils.readAll(stdin);
+		Html parsed = new Html(input);
+		for( Object element : parsed ) {
+			String className = element.getClass().getName();
+			System.out.println(className+" - "+element);
+		}
+	}
+}