Mercurial Hosting > nabble
diff src/fschmidt/html/Html.java @ 68:00520880ad02
add fschmidt source
author | Franklin Schmidt <fschmidt@gmail.com> |
---|---|
date | Sun, 05 Oct 2025 17:24:15 -0600 |
parents | |
children |
line wrap: on
line diff
/*
Copyright (c) 2008 Franklin Schmidt <fschmidt@gmail.com>

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/

package fschmidt.html;

import java.io.InputStreamReader;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.Arrays;
import java.util.Locale;
import java.util.Set;
import java.util.HashSet;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import fschmidt.util.java.HtmlUtils;


/**
 * A list of the pieces that make up an HTML document.
 *
 * <p>After {@link #parse(String)} the list contains, in document order:
 * plain {@code String}s for text, {@code HtmlComment}, {@code HtmlCdata},
 * {@code HtmlTag}, and {@code HtmlTextContainer} (or one of its
 * textarea/script/style variants) for tags listed in
 * {@link #containerTags()}.  {@link #deepen()} additionally groups matching
 * start/end tags into {@code HtmlNode} elements.
 */
public final class Html extends ArrayList<Object> {
    private static final Logger logger = LoggerFactory.getLogger(Html.class);

    public static final String TEXTAREA = "textarea";
    public static final String SCRIPT = "script";
    public static final String STYLE = "style";

    /** Line number assigned to tags that appear before the first newline. */
    private int startingLine = 0;

    /** When true, tags rejected by {@code HtmlTag} are dropped instead of being kept as encoded text. */
    private boolean removeBadTags = false;

    /** Lower-case names of tags whose content is captured verbatim up to the matching end tag. */
    private final Set<String> containerTags = new HashSet<String>(Arrays.asList(SCRIPT,STYLE));

    /** Creates an empty list. */
    public Html() {}

    /**
     * Creates a list holding the parsed pieces of {@code text}.
     *
     * @param text the HTML source to parse
     */
    public Html(String text) {
        parse(text);
    }

    /**
     * Returns the set of container tag names used by {@link #parse(String)}.
     * The returned set is the internal, mutable instance, so changes made to
     * it affect subsequent parsing.  Names are compared in lower case.
     *
     * @return the live set of container tag names
     */
    public Set<String> containerTags() {
        return containerTags;
    }

    /**
     * Sets the line number from which {@link #parse(String)} starts counting.
     *
     * @param startingLine line number of the first line of the parsed text
     */
    public void setStartingLine(int startingLine) {
        this.startingLine = startingLine;
    }

    /**
     * Chooses what {@link #parse(String)} does with a tag that {@code HtmlTag}
     * rejects: {@code true} drops it, {@code false} keeps it as the three
     * strings {@code "<"}, the HTML-encoded tag text, and {@code ">"}.
     *
     * @param removeBadTags whether rejected tags are dropped
     */
    public void removeBadTags(boolean removeBadTags) {
        this.removeBadTags = removeBadTags;
    }

    /**
     * Parses {@code text} and appends the resulting pieces to this list.
     * Existing elements are kept.
     *
     * @param text the HTML source to parse
     */
    public void parse(String text) {
        int len = text.length();
        int pos = 0;        // next unconsumed offset in text
        int lineMark = 0;   // offset up to which newlines are already counted in 'line'
        int line = startingLine;
outer:
        while( pos < len ) {
            // Find the next '<' that can start markup: it must be followed by
            // a letter, '/' or '!'.  A '<' that is the last character is not
            // filtered here; it reaches the generic branch below, finds no
            // '>' and is emitted as text.
            int lt = text.indexOf('<',pos);
            while( lt != -1 && lt+1 < len ) {
                char c = text.charAt(lt+1);
                if( Character.isLetter(c) || c=='/' || c=='!' )
                    break;
                lt = text.indexOf('<',lt+1);
            }
            if( lt == -1 ) {
                // no more markup: the rest is text
                add( text.substring(pos) );
                break;
            }
            if( pos < lt )
                add( text.substring(pos,lt) );
            if( text.startsWith("<!--",lt) ) {
                pos = text.indexOf("-->",lt+4);
                if( pos == -1 ) {
                    // unterminated comment: keep the remainder as text
                    add( text.substring(lt) );
                    break;
                }
                add( new HtmlComment( text.substring(lt+4,pos) ) );
                pos += 3;
            } else if( text.startsWith("<![CDATA[",lt) ) {
                pos = text.indexOf("]]>",lt+9);
                if( pos == -1 ) {
                    // unterminated CDATA section: keep the remainder as text
                    add( text.substring(lt) );
                    break;
                }
                add( new HtmlCdata( text.substring(lt+9,pos) ) );
                pos += 3;
            } else {
                pos = text.indexOf('>',lt);
                if( pos == -1 ) {
                    // unterminated tag: keep the remainder as text
                    add( text.substring(lt) );
                    break;
                }
                line += lines(text,lineMark,lt);
                lineMark = lt;
                String tagText = text.substring(lt+1,pos);
                try {
                    HtmlTag tag = new HtmlTag(tagText);
                    tag.lineNumber = line;
                    // Locale.ROOT keeps the lookup independent of the default
                    // locale (e.g. Turkish maps 'I' to dotless 'ı').
                    String tagName = tag.getName().toLowerCase(Locale.ROOT);
                    if( containerTags.contains(tagName) ) {
                        // Scan forward for the matching end tag; everything in
                        // between is taken verbatim as the container's text.
                        int scan = pos;
                        String endTagName = '/' + tagName;
                        while(true) {
                            scan = text.indexOf('<',scan+1);
                            if( scan == -1 )
                                break;
                            int gt = text.indexOf('>',scan);
                            if( gt == -1 )
                                break;
                            // the candidate name ends at the first whitespace or at '>'
                            int nameEnd = scan+1;
                            while( nameEnd<gt && !Character.isWhitespace(text.charAt(nameEnd)) )
                                nameEnd++;
                            String candidate = text.substring(scan+1,nameEnd);
                            if( candidate.equalsIgnoreCase(endTagName) ) {
                                HtmlTag endTag = new HtmlTag( text.substring(scan+1,gt) );
                                line += lines(text,lineMark,scan);
                                endTag.lineNumber = line;
                                lineMark = scan;
                                String content = text.substring(pos+1,scan);
                                HtmlTextContainer textContainer =
                                    tagName.equals(TEXTAREA) ?
                                        new HtmlTextarea(tag,content,endTag)
                                    : tagName.equals(SCRIPT) ?
                                        new HtmlScript(tag,content,endTag)
                                    : tagName.equals(STYLE) ?
                                        new HtmlStyle(tag,content,endTag)
                                    :
                                        new HtmlTextContainer(tag,content,endTag)
                                ;
                                add( textContainer );
                                pos = gt + 1;
                                continue outer;
                            }
                        }
                        // no end tag found: fall through and keep the start tag as an ordinary tag
                        logger.warn("unclosed {}",tagName);
                    }
                    pos += 1;
                    add( tag );
                } catch(HtmlTag.BadTag e) {
                    // Rejected tag: deliberately not logged.  Either drop it or
                    // keep it as encoded text, depending on removeBadTags.
                    pos += 1;
                    if( !removeBadTags ) {
                        add( "<" );
                        add( HtmlUtils.htmlEncode(tagText) );
                        add( ">" );
                    }
                }
            }
        }
    }

    /**
     * Concatenates the string form of every element, in order.
     *
     * @return the concatenated string
     */
    @Override public String toString() {
        StringBuilder buf = new StringBuilder();
        for( Object o : this ) {
            buf.append( o );
        }
        return buf.toString();
    }

    /**
     * Counts the newline characters in {@code text} at offsets
     * {@code start} (inclusive) to {@code end} (exclusive).
     */
    private static int lines(String text,int start,int end) {
        int n = 0;
        int i = start - 1;
        while(true) {
            i = text.indexOf('\n',i+1);
            if( i == -1 || i >= end )
                return n;
            n++;
        }
    }

    /**
     * Returns a new list in which every {@code HtmlNode} element has been
     * expanded through its own {@code flattenTo}; all other elements are
     * copied as they are.  This list is not modified.
     *
     * @return the flattened copy
     */
    public Html flatten() {
        Html html = new Html();
        flattenTo(html);
        return html;
    }

    /** Appends the flattened form of this list to {@code html}. */
    void flattenTo(Html html) {
        for( Object obj : this ) {
            if( obj instanceof HtmlNode ) {
                ((HtmlNode)obj).flattenTo(html);
            } else {
                html.add(obj);
            }
        }
    }

    /**
     * Returns a new list in which each start tag and its matching end tag,
     * together with the elements between them, are replaced by a single
     * {@code HtmlNode}.  Start tags without a matching end tag are kept as
     * plain tags followed by their would-be children.  This list is not
     * modified.
     *
     * @return the nested copy
     * @throws RuntimeException if an end tag matching no open start tag is
     *         followed by further elements
     */
    public Html deepen() {
        Iterator<Object> iter = iterator();
        Html html = deepen(iter);
        if( iter.hasNext() )
            throw new RuntimeException("unmatched end tag:\n"+html);
        return html;
    }

    /**
     * Consumes elements from {@code iter} until it is exhausted or an end tag
     * is met.  When it stops at an end tag, that tag is the last element of
     * the returned list so the caller can check it against its start tag.
     */
    private static Html deepen(Iterator<Object> iter) {
        Html html = new Html();
        while( iter.hasNext() ) {
            Object obj = iter.next();
            if( obj instanceof HtmlTag && !(obj instanceof HtmlNode) ) {
                HtmlTag tag = (HtmlTag)obj;
                // NOTE(review): isEmpty() presumably marks self-closing tags - confirm in HtmlTag
                if( !tag.isEmpty() ) {
                    String name = tag.getName();
                    if( name.startsWith("/") ) {
                        html.add(tag);
                        return html;
                    }
                    Html children = deepen(iter);
                    if( endsWithEndTag(children,name) ) {
                        children.remove(children.size()-1);
                        html.add( new HtmlNode(tag,children) );
                        continue;
                    }
                    // Unmatched start tag: either a different end tag came
                    // back, or the input ran out (children may be empty or
                    // end in text).  Keep the tag flat and pass the children
                    // up so an enclosing start tag can still match.
                    html.add(tag);
                    html.addAll(children);
                    return html;
                }
            }
            html.add(obj);
        }
        return html;
    }

    /**
     * Tells whether the last element of {@code children} is the end tag for
     * {@code name}.  Returns false for an empty list or a non-tag last
     * element instead of throwing.
     */
    private static boolean endsWithEndTag(Html children,String name) {
        if( children.isEmpty() )
            return false;
        Object last = children.get(children.size()-1);
        if( !(last instanceof HtmlTag) || last instanceof HtmlNode )
            return false;
        return ((HtmlTag)last).getName().equals("/"+name);
    }

    /**
     * Reads HTML from standard input, parses it and prints one line per
     * element: the element's class name, {@code " - "}, and its string form.
     */
    public static void main(String[] args) throws Exception {
        String page = fschmidt.util.java.IoUtils.readAll(new InputStreamReader(System.in));
        Html html = new Html(page);
        for( Object o : html ) {
            System.out.println(o.getClass().getName()+" - "+o);
        }
    }
}