Mercurial Hosting > luan
view src/goodjava/html/Html.java @ 1719:2f3a8f16f583
add regex.split
author | Franklin Schmidt <fschmidt@gmail.com> |
---|---|
date | Mon, 25 Jul 2022 14:31:48 -0600 |
parents | 31a82b0d0a87 |
children | a045f30fa67d |
line wrap: on
line source
package goodjava.html;

import java.util.List;
import java.util.ArrayList;
import java.util.Locale;
import java.util.Set;
import java.util.HashSet;
import java.util.Map;
import java.util.HashMap;
import java.util.Collections;
import java.util.regex.Pattern;
import java.util.regex.Matcher;
import goodjava.parser.Parser;


/**
 * Minimal HTML tokenizer plus entity encode/decode helpers.
 *
 * <p>{@link #parse(String)} splits a document into a flat list whose elements
 * are decoded text ({@link String}), {@link Tag}, {@link Container},
 * {@link Comment} and {@link CData} objects, in document order.
 */
public final class Html {

    /** An ampersand that starts something shaped like a character reference. */
    private static final Pattern entityPtn = Pattern.compile( "&(#?[0-9a-zA-Z]+;)" );

    /** A decimal numeric character reference such as {@code &#65;}. */
    private static final Pattern entityNumPtn = Pattern.compile( "&#(\\d+);" );

    /**
     * Escapes text for inclusion in HTML.
     *
     * <p>Only an ampersand that begins something that looks like an entity
     * (matched by {@code entityPtn}) is escaped; a bare {@code &} is left
     * alone.  {@code <}, {@code >} and {@code "} are always escaped.
     *
     * @param s the raw text
     * @return the escaped text
     */
    public static String encode(String s) {
        // Deliberately not a blanket replace of "&": see entityPtn.
        s = entityPtn.matcher(s).replaceAll("&amp;$1");
        s = s.replace("<", "&lt;");
        s = s.replace(">", "&gt;");
        s = s.replace("\"", "&quot;");
        return s;
    }

    /**
     * Reverses {@link #encode(String)} and additionally resolves decimal
     * numeric references and {@code &nbsp;}.
     *
     * <p>Numeric references that do not denote a valid Unicode code point are
     * left in the output unchanged.
     *
     * @param s the HTML-escaped text
     * @return the decoded text
     */
    public static String decode(String s) {
        Matcher m = entityNumPtn.matcher(s);
        if (m.find()) {
            StringBuffer buf = new StringBuffer();
            do {
                String replacement = decodeNumeric(m.group(), m.group(1));
                // quoteReplacement: a decoded '$' or '\' must not be read as
                // a group reference / escape by appendReplacement.
                m.appendReplacement(buf, Matcher.quoteReplacement(replacement));
            } while (m.find());
            m.appendTail(buf);
            s = buf.toString();
        }
        // NOTE(review): &nbsp; is mapped to a plain space here; confirm that
        // U+00A0 was not the intended replacement.
        s = s.replace("&nbsp;", " ");
        s = s.replace("&quot;", "\"");
        s = s.replace("&gt;", ">");
        s = s.replace("&lt;", "<");
        // Must be last so that "&amp;lt;" decodes to "&lt;" and not "<".
        s = s.replace("&amp;", "&");
        return s;
    }

    /**
     * Converts the digits of a numeric reference to the character(s) it names.
     *
     * @param raw    the complete reference text, returned when it cannot be decoded
     * @param digits the decimal digits between {@code &#} and {@code ;}
     */
    private static String decodeNumeric(String raw, String digits) {
        int codePoint;
        try {
            codePoint = Integer.parseInt(digits);
        } catch (NumberFormatException e) {
            return raw;  // more digits than fit in an int
        }
        if (!Character.isValidCodePoint(codePoint)) {
            return raw;
        }
        // toChars yields a surrogate pair for supplementary code points
        // instead of truncating them to 16 bits.
        return new String(Character.toChars(codePoint));
    }


    /** An HTML comment; {@code text} is what lies between {@code <!--} and {@code -->}. */
    public static final class Comment {
        public final String text;

        private Comment(String text) {
            this.text = text;
        }
    }

    /** A CDATA section; {@code text} is what lies between {@code <![CDATA[} and {@code ]]>}. */
    public static final class CData {
        public final String text;

        private CData(String text) {
            this.text = text;
        }
    }

    /** A single opening, closing or self-closing tag. */
    public static final class Tag {
        /** Lower-cased tag name; closing tags keep their leading {@code '/'}. */
        public final String name;
        /** Lower-cased attribute name to decoded value, or {@code true} for a value-less attribute. */
        public final Map<String,Object> attributes;
        /** Whether the tag was written with a trailing {@code '/'} before {@code '>'}. */
        public final boolean isEmpty;
        /** The exact source text of the tag, from {@code '<'} through {@code '>'}. */
        public final String raw;
        /** Result of {@code Css.style} on the first valued {@code style} attribute, else {@code null}. */
        public final Map<String,String> style;

        private Tag(String name,Map<String,Object> attributes,boolean isEmpty,String raw) {
            this(name,attributes,isEmpty,raw,null);
        }

        private Tag(String name,Map<String,Object> attributes,boolean isEmpty,String raw,Map<String,String> style) {
            this.name = name;
            this.attributes = attributes;
            this.isEmpty = isEmpty;
            this.raw = raw;
            this.style = style;
        }
    }

    /** An opening tag together with the unparsed text up to its matching closing tag. */
    public static final class Container {
        public final Tag tag;
        public final String text;

        private Container(Tag tag,String text) {
            this.tag = tag;
            this.text = text;
        }
    }


    /** Tags whose content is captured verbatim by {@link #parse(String)}. */
    private static final Set<String> defaultContainerTags;
    static {
        Set<String> tags = new HashSet<String>();
        Collections.addAll( tags, "script", "style", "textarea" );
        defaultContainerTags = Collections.unmodifiableSet(tags);
    }

    /**
     * Parses {@code text} using {@code script}, {@code style} and
     * {@code textarea} as container tags.
     *
     * @param text the HTML source
     * @return the parsed elements in document order
     */
    public static List parse(String text) {
        return parse(text,defaultContainerTags);
    }

    /**
     * Parses {@code text} into a flat list of elements.
     *
     * @param text          the HTML source
     * @param containerTags lower-case names of tags whose body is captured raw as a {@link Container}
     * @return the parsed elements in document order
     */
    public static List parse(String text,Set<String> containerTags) {
        return new Html(text,containerTags).parse();
    }


    private final Parser parser;
    private final Set<String> containerTags;

    private Html(String text,Set<String> containerTags) {
        this.parser = new Parser(text);
        this.containerTags = containerTags;
    }

    /** Main loop: emits markup elements and accumulates everything else as text. */
    private List<Object> parse() {
        List<Object> list = new ArrayList<Object>();
        StringBuilder sb = new StringBuilder();
        while (!parser.endOfInput()) {
            if (parser.test('<')) {
                Object el = parseElement();
                if (el != null) {
                    add(list,sb);  // flush text that preceded this element
                    list.add(el);
                    continue;
                }
            }
            // Not markup (or a stray '<'): keep the character as text.
            sb.append( parser.currentChar() );
            parser.anyChar();
        }
        add(list,sb);
        return list;
    }

    /**
     * Parses whatever markup starts at the current {@code '<'}: a tag (promoted
     * to a container when applicable), a comment, or a CDATA section.
     *
     * @return the element, or {@code null} if none of them matches here
     */
    private Object parseElement() {
        Tag tag = parseTag();
        if (tag != null) {
            if (containerTags.contains(tag.name)) {
                Container container = parseContainer(tag);
                if (container != null) {
                    return container;
                }
            }
            return tag;
        }
        Comment comment = parseComment();
        if (comment != null) {
            return comment;
        }
        return parseCdata();
    }

    /** Appends the pending text (entity-decoded) to {@code list} and clears the buffer. */
    private static void add(List<Object> list,StringBuilder sb) {
        if (sb.length() > 0) {
            list.add(decode(sb.toString()));
            sb.setLength(0);
        }
    }

    /** Parses {@code <!-- ... -->}; fails if the comment is never closed. */
    private Comment parseComment() {
        parser.begin();
        if (!parser.match("<!--")) {
            return parser.failure(null);
        }
        int start = parser.currentIndex();
        while (!parser.test("-->")) {
            if (!parser.anyChar()) {
                return parser.failure(null);
            }
        }
        String text = parser.textFrom(start);
        parser.match("-->");  // consume the terminator so it is not emitted as text
        return parser.success(new Comment(text));
    }

    /** Parses {@code <![CDATA[ ... ]]>}; fails if the section is never closed. */
    private CData parseCdata() {
        parser.begin();
        if (!parser.match("<![CDATA[")) {
            return parser.failure(null);
        }
        int start = parser.currentIndex();
        while (!parser.test("]]>")) {
            if (!parser.anyChar()) {
                return parser.failure(null);
            }
        }
        String text = parser.textFrom(start);
        parser.match("]]>");  // consume the terminator so it is not emitted as text
        return parser.success(new CData(text));
    }

    /**
     * Having just parsed the opening {@code tag}, captures everything up to the
     * matching closing tag as raw text.
     *
     * @return the container, or {@code null} if no closing tag is found
     */
    private Container parseContainer(Tag tag) {
        String endTagName = '/' + tag.name;
        int start = parser.begin();
        int end;
        while (true) {
            if (parser.test('<')) {
                end = parser.currentIndex();
                Tag inner = parseTag();
                if (inner != null) {
                    if (inner.name.equals(endTagName)) {
                        break;
                    }
                    // parseTag already consumed this tag; skipping another
                    // character here could step over the '<' of the end tag.
                    continue;
                }
            }
            if (!parser.anyChar()) {
                return parser.failure(null);
            }
        }
        String text = parser.text.substring(start,end);
        return parser.success(new Container(tag,text));
    }

    /**
     * Parses one tag: {@code '<'}, optional {@code '/'}, a name, attributes,
     * optional {@code '/'}, {@code '>'}.
     *
     * @return the tag, or {@code null} if the input here is not a well-formed tag
     */
    private Tag parseTag() {
        int tagStart = parser.begin();
        if (!parser.match('<')) {
            return parser.failure(null);
        }
        int start = parser.currentIndex();
        parser.match('/');  // closing tags keep the slash as part of the name
        if (!matchNameChar()) {
            return parser.failure(null);
        }
        while (matchNameChar()) {
            // consume the rest of the name
        }
        // Locale.ROOT keeps the result independent of the default locale.
        String name = parser.textFrom(start).toLowerCase(Locale.ROOT);

        Map<String,Object> attributes = new HashMap<String,Object>();
        Map<String,String> style = null;
        String attrName;
        while ((attrName = parseAttrName()) != null) {
            String attrValue = parseAttrValue();
            attributes.put( attrName, attrValue != null ? attrValue : Boolean.TRUE );
            if (attrName.equals("style") && attrValue != null && style == null) {
                style = Css.style(attrValue);
            }
        }
        while (matchSpace()) {
            // skip whitespace before the end of the tag
        }
        boolean isEmpty = parser.match('/');
        if (!parser.match('>')) {
            return parser.failure(null);
        }
        String raw = parser.textFrom(tagStart);
        return parser.success(new Tag(name,attributes,isEmpty,raw,style));
    }

    /** Parses mandatory whitespace followed by an attribute name; returns it lower-cased or {@code null}. */
    private String parseAttrName() {
        parser.begin();
        if (!matchSpace()) {
            return parser.failure(null);
        }
        while (matchSpace()) {
            // skip remaining whitespace
        }
        int start = parser.currentIndex();
        if (!matchNameChar()) {
            return parser.failure(null);
        }
        while (matchNameChar()) {
            // consume the rest of the name
        }
        String name = parser.textFrom(start).toLowerCase(Locale.ROOT);
        return parser.success(name);
    }

    /**
     * Parses {@code = value} where the value is single-quoted, double-quoted or
     * unquoted.
     *
     * @return the entity-decoded value, or {@code null} if no value follows
     */
    private String parseAttrValue() {
        parser.begin();
        while (matchSpace()) {
            // skip whitespace before '='
        }
        if (!parser.match('=')) {
            return parser.failure(null);
        }
        while (matchSpace()) {
            // skip whitespace after '='
        }
        if (parser.anyOf("\"'")) {
            char quote = parser.lastChar();
            int start = parser.currentIndex();
            while (!parser.test(quote)) {
                if (!parser.anyChar()) {
                    return parser.failure(null);  // unterminated quoted value
                }
            }
            String value = parser.textFrom(start);
            parser.match(quote);
            return parser.success(decode(value));
        }
        int start = parser.currentIndex();
        if (!matchValueChar()) {
            return parser.failure(null);
        }
        while (matchValueChar()) {
            // consume the rest of the unquoted value
        }
        String value = parser.textFrom(start);
        return parser.success(decode(value));
    }

    /** Consumes one character allowed in a tag or attribute name. */
    private boolean matchNameChar() {
        return parser.inCharRange('a','z')
            || parser.inCharRange('A','Z')
            || parser.inCharRange('0','9')
            || parser.anyOf("_.-:")
        ;
    }

    /** Consumes one character allowed in an unquoted attribute value. */
    private boolean matchValueChar() {
        return parser.noneOf(" \t\r\n\"'>/=");
    }

    /** Consumes one whitespace character. */
    private boolean matchSpace() {
        return parser.anyOf(" \t\r\n");
    }

}