changeset 1712:36c28be6d432

improve html and bbcode
author Franklin Schmidt <fschmidt@gmail.com>
date Thu, 14 Jul 2022 22:14:21 -0600
parents 05d14db623b6
children 4d70e9543ef2
files src/goodjava/bbcode/BBCode.java src/goodjava/html/Html.java src/luan/modules/Html.luan src/luan/modules/HtmlLuan.java src/luan/modules/Parsers.luan
diffstat 5 files changed, 359 insertions(+), 71 deletions(-) [+]
line wrap: on
line diff
--- a/src/goodjava/bbcode/BBCode.java	Wed Jul 13 21:50:41 2022 -0600
+++ b/src/goodjava/bbcode/BBCode.java	Thu Jul 14 22:14:21 2022 -0600
@@ -11,6 +11,15 @@
 
 public final class BBCode {
 
+	// Matches anything shaped like a BBCode tag: "[name]", "[/name]" or
+	// "[name=param]", where the parameter holds no space, newline, tab or bracket.
+	private static final Pattern tagPtn = Pattern.compile(
+		"\\[(/?[a-zA-Z]+(=[^ \\n\\t\\[\\]]*)?)\\]"
+	);
+
+	/**
+	 * Rewrites every tag-shaped sequence in {@code s} as
+	 * {@code [brackets]...[/brackets]} around the tag's inner text
+	 * (the part between the square brackets).
+	 *
+	 * @param s the text to encode
+	 * @return {@code s} with each tag-shaped sequence wrapped as described
+	 */
+	public static String encode(String s) {
+		final String wrapped = "[brackets]$1[/brackets]";
+		return tagPtn.matcher(s).replaceAll(wrapped);
+	}
+
+
 	public final class Element {
 		public final String name;
 		public final String param;
@@ -216,7 +225,7 @@
 		parser.matchIgnoreCase("s");
 		if( !parser.matchIgnoreCase("://") )
 			return parser.failure(null);
-		while( parser.noneOf(" []'") );
+		while( parser.noneOf(" \n\t[]") );
 		String url = parser.textFrom(start);
 		while( parser.match(' ') );
 		return parser.success(url);
@@ -233,7 +242,7 @@
 			end = "[/code]";
 		} else if( parser.match('=') ) {
 			int start = parser.currentIndex();
-			while( parser.noneOf("[]") );
+			while( parser.noneOf("[]\n") );
 			param = parser.textFrom(start);
 			if( !parser.match(']') )
 				return parser.failure(null);
@@ -347,8 +356,8 @@
 			name = null;
 		} else if( parser.match('=') ) {
 			int start = parser.currentIndex();
-			while( parser.noneOf("[]") );
-			name = parser.textFrom(start).trim();
+			while( parser.noneOf("[]\n") );
+			name = parser.textFrom(start);
 			if( !parser.match(']') )
 				return parser.failure(null);
 		} else
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/goodjava/html/Html.java	Thu Jul 14 22:14:21 2022 -0600
@@ -0,0 +1,288 @@
+package goodjava.html;
+
+import java.util.List;
+import java.util.ArrayList;
+import java.util.Set;
+import java.util.HashSet;
+import java.util.Map;
+import java.util.HashMap;
+import java.util.Collections;
+import java.util.regex.Pattern;
+import java.util.regex.Matcher;
+import goodjava.parser.Parser;
+
+
+public final class Html {
+
+	private static final Pattern entityPtn = Pattern.compile(
+		"&(#?[0-9a-zA-Z]+;)"
+	);
+
+	public static String encode(String s) {
+		//s = s.replace("&","&amp;");
+		s = entityPtn.matcher(s).replaceAll("&amp;$1");
+		s = s.replace("<","&lt;");
+		s = s.replace(">","&gt;");
+		s = s.replace("\"","&quot;");
+		return s;
+	}
+
+	// Group 1: digits of a decimal numeric reference ("&#65;").
+	// Group 2: name of one of the supported named entities.
+	private static final Pattern decodePtn = Pattern.compile(
+		"&(?:#(\\d+)|(nbsp|quot|gt|lt|amp));"
+	);
+
+	/**
+	 * Replaces decimal numeric character references and the named entities
+	 * {@code &nbsp;} (decoded to a plain space), {@code &quot;}, {@code &gt;},
+	 * {@code &lt;} and {@code &amp;} with the characters they stand for.
+	 * <p>
+	 * The input is decoded in a single pass, so text produced by one
+	 * replacement is never decoded a second time ({@code &#38;lt;} yields
+	 * {@code &lt;}, not {@code <}).  Numeric references that do not denote a
+	 * valid Unicode code point are left untouched.
+	 *
+	 * @param s the text to decode
+	 * @return the decoded text
+	 */
+	public static String decode(String s) {
+		Matcher m = decodePtn.matcher(s);
+		if( !m.find() )
+			return s;
+		StringBuffer buf = new StringBuffer();
+		do {
+			String digits = m.group(1);
+			String text = digits != null
+				? numericEntity(digits,m.group())
+				: namedEntity(m.group(2));
+			// quoteReplacement: a decoded '$' or '\' must not be read as a
+			// group reference or escape by appendReplacement.
+			m.appendReplacement(buf,Matcher.quoteReplacement(text));
+		} while( m.find() );
+		m.appendTail(buf);
+		return buf.toString();
+	}
+
+	// Decodes the digits of a numeric reference; returns raw unchanged when
+	// the number is not a valid code point.
+	private static String numericEntity(String digits,String raw) {
+		try {
+			int codePoint = Integer.parseInt(digits);
+			if( Character.isValidCodePoint(codePoint) )
+				return new String(Character.toChars(codePoint));
+		} catch(NumberFormatException e) {
+			// too many digits to fit an int, so certainly not a code point
+		}
+		return raw;
+	}
+
+	// Maps a name matched by group 2 of decodePtn to its character.
+	private static String namedEntity(String name) {
+		if( name.equals("nbsp") )
+			return " ";
+		if( name.equals("quot") )
+			return "\"";
+		if( name.equals("gt") )
+			return ">";
+		if( name.equals("lt") )
+			return "<";
+		return "&";  // "amp" is the only remaining alternative
+	}
+
+
+	/** A parsed HTML comment; {@code text} is what lies between the opening and closing delimiters. */
+	public static final class Comment {
+		public final String text;
+
+		private Comment(String commentText) {
+			text = commentText;
+		}
+	}
+
+	/** A parsed CDATA section; {@code text} is what follows the opening {@code <![CDATA[} marker. */
+	public static final class CData {
+		public final String text;
+
+		private CData(String cdataText) {
+			text = cdataText;
+		}
+	}
+
+	/**
+	 * A single start or end tag.
+	 * <ul>
+	 * <li>{@code name} - lower-cased tag name; an end tag keeps its leading {@code /}</li>
+	 * <li>{@code attributes} - lower-cased attribute name mapped to its decoded
+	 *     value, or to {@code true} when the attribute has no value</li>
+	 * <li>{@code isEmpty} - whether the tag was closed with {@code />}</li>
+	 * <li>{@code raw} - the tag exactly as it appeared in the input</li>
+	 * </ul>
+	 */
+	public static final class Tag {
+		public final String name;
+		public final Map<String,Object> attributes;
+		public final boolean isEmpty;
+		public final String raw;
+
+		private Tag(String tagName,Map<String,Object> attrs,boolean selfClosed,String source) {
+			name = tagName;
+			attributes = attrs;
+			isEmpty = selfClosed;
+			raw = source;
+		}
+	}
+
+	/**
+	 * An element whose body is kept as raw text instead of being parsed:
+	 * {@code tag} is the opening tag and {@code text} is the unparsed source
+	 * between it and the matching end tag.
+	 */
+	public static final class Container {
+		public final Tag tag;
+		public final String text;
+
+		private Container(Tag openTag,String body) {
+			tag = openTag;
+			text = body;
+		}
+	}
+
+	// Tags whose bodies are kept as raw text when the caller supplies no set.
+	// Final and unmodifiable: the set is shared by every parse(String) call.
+	private static final Set<String> defaultContainerTags;
+	static {
+		Set<String> tags = new HashSet<String>();
+		Collections.addAll( tags, "script", "style", "textarea" );
+		defaultContainerTags = Collections.unmodifiableSet(tags);
+	}
+
+	/**
+	 * Parses {@code text} treating {@code script}, {@code style} and
+	 * {@code textarea} as container tags.
+	 *
+	 * @param text the HTML source
+	 * @return the parsed nodes in document order
+	 */
+	public static List parse(String text) {
+		return parse(text,defaultContainerTags);
+	}
+
+	/**
+	 * Parses {@code text} into a flat list of nodes.
+	 *
+	 * @param text the HTML source
+	 * @param containerTags names of tags whose bodies are kept as raw text;
+	 *        they are compared against lower-cased tag names
+	 * @return the parsed nodes in document order
+	 */
+	public static List parse(String text,Set<String> containerTags) {
+		return new Html(text,containerTags).parse();
+	}
+
+	// Cursor over the text being parsed.
+	private final Parser parser;
+	// Names of tags whose bodies are kept as raw text.
+	private final Set<String> containerTags;
+
+	// One instance per parse; created only by the static parse() methods.
+	private Html(String source,Set<String> containers) {
+		containerTags = containers;
+		parser = new Parser(source);
+	}
+
+	/**
+	 * Splits the input into a flat list of nodes in document order.
+	 * Each node is a {@code String} (entity-decoded text), a {@link Tag},
+	 * a {@link Container}, a {@link Comment} or a {@link CData}.
+	 * A {@code <} that starts none of these is kept as ordinary text.
+	 */
+	private List parse() {
+		List list = new ArrayList();
+		StringBuilder sb = new StringBuilder();
+		while( !parser.endOfInput() ) {
+			if( parser.test('<') ) {
+				Object el = parseElement();
+				if( el != null ) {
+					add(list,sb);
+					list.add(el);
+					continue;
+				}
+			}
+			// not markup: accumulate as text
+			sb.append( parser.currentChar() );
+			parser.anyChar();
+		}
+		add(list,sb);
+		return list;
+	}
+
+	// Tries each kind of markup at the current '<'; returns null if none matches.
+	// Comments and CDATA sections are tried whenever tag parsing fails, since
+	// "<!" can never start a tag ('!' is not a name character).
+	private Object parseElement() {
+		Tag tag = parseTag();
+		if( tag != null ) {
+			if( containerTags.contains(tag.name) ) {
+				Container container = parseContainer(tag);
+				if( container != null )
+					return container;
+			}
+			return tag;
+		}
+		Comment comment = parseComment();
+		if( comment != null )
+			return comment;
+		return parseCdata();
+	}
+
+	// Flushes pending text (entity-decoded) into the list and resets the buffer.
+	private static void add(List list,StringBuilder sb) {
+		if( sb.length() > 0 ) {
+			list.add(decode(sb.toString()));
+			sb.setLength(0);
+		}
+	}
+
+	/**
+	 * Parses {@code <!-- ... -->} at the current position.
+	 *
+	 * @return the comment, or null (position unchanged) if there is no
+	 *         complete comment here
+	 */
+	private Comment parseComment() {
+		parser.begin();
+		if( !parser.match("<!--") )
+			return parser.failure(null);
+		int start = parser.currentIndex();
+		while( !parser.test("-->") ) {
+			if( !parser.anyChar() )
+				return parser.failure(null);
+		}
+		String text = parser.textFrom(start);
+		// consume the terminator so it is not emitted as text afterwards
+		parser.match("-->");
+		Comment comment = new Comment(text);
+		return parser.success(comment);
+	}
+
+	/**
+	 * Parses {@code <![CDATA[ ... ]]>} at the current position.
+	 *
+	 * @return the CDATA section, or null (position unchanged) if there is no
+	 *         complete section here
+	 */
+	private CData parseCdata() {
+		parser.begin();
+		if( !parser.match("<![CDATA[") )
+			return parser.failure(null);
+		int start = parser.currentIndex();
+		while( !parser.test("]]>") ) {
+			if( !parser.anyChar() )
+				return parser.failure(null);
+		}
+		String text = parser.textFrom(start);
+		// consume the terminator so it is not emitted as text afterwards
+		parser.match("]]>");
+		CData cdata = new CData(text);
+		return parser.success(cdata);
+	}
+
+	/**
+	 * Reads the raw body of a container element whose opening tag has just
+	 * been parsed, up to and including the matching end tag.
+	 *
+	 * @param tag the opening tag
+	 * @return the container, or null (position restored to just after the
+	 *         opening tag) if no matching end tag is found
+	 */
+	private Container parseContainer(Tag tag) {
+		String endTagName = '/' + tag.name;
+		int start = parser.begin();
+		int end;
+		while(true) {
+			if( parser.test('<') ) {
+				end = parser.currentIndex();
+				Tag tag2 = parseTag();
+				// parseTag() returns null when this '<' does not start a
+				// well-formed tag (e.g. "a<b" in a script), so check first.
+				if( tag2 != null ) {
+					if( tag2.name.equals(endTagName) )
+						break;
+					// Some other tag was consumed whole; resume right after it
+					// so an end tag that follows immediately is not skipped.
+					continue;
+				}
+			}
+			if( !parser.anyChar() )
+				return parser.failure(null);
+		}
+		String text = parser.text.substring(start,end);
+		Container container = new Container(tag,text);
+		return parser.success(container);
+	}
+
+	/**
+	 * Parses a start or end tag at the current position.
+	 * The resulting name is lower-cased and, for an end tag, keeps its
+	 * leading {@code /}.  An attribute without a value maps to {@code true}.
+	 *
+	 * @return the tag, or null (position unchanged) if the input here is not
+	 *         a well-formed tag
+	 */
+	private Tag parseTag() {
+		int tagStart = parser.begin();
+		if( !parser.match('<') )
+			return parser.failure(null);
+		int start = parser.currentIndex();
+		parser.match('/');
+		if( !matchNameChar() )
+			return parser.failure(null);
+		while( matchNameChar() );
+		// Locale.ROOT: the default-locale toLowerCase() maps 'I' to a dotless
+		// i under a Turkish locale, which would break names such as "SCRIPT".
+		String name = parser.textFrom(start).toLowerCase(java.util.Locale.ROOT);
+		Map<String,Object> attributes = new HashMap<String,Object>();
+		String attrName;
+		while( (attrName = parseAttrName()) != null ) {
+			String attrValue = parseAttrValue();
+			attributes.put( attrName, attrValue!=null ? attrValue : Boolean.TRUE );
+		}
+		while( matchSpace() );
+		boolean isEmpty = parser.match('/');
+		if( !parser.match('>') )
+			return parser.failure(null);
+		String raw = parser.textFrom(tagStart);
+		Tag tag = new Tag(name,attributes,isEmpty,raw);
+		return parser.success(tag);
+	}
+
+	/**
+	 * Parses one attribute name, which must be preceded by at least one
+	 * whitespace character.
+	 *
+	 * @return the lower-cased name, or null (position unchanged) if no
+	 *         attribute name follows
+	 */
+	private String parseAttrName() {
+		parser.begin();
+		if( !matchSpace() )
+			return parser.failure(null);
+		while( matchSpace() );
+		int start = parser.currentIndex();
+		if( !matchNameChar() )
+			return parser.failure(null);
+		while( matchNameChar() );
+		// Locale.ROOT keeps the result independent of the default locale
+		// (a Turkish locale would turn 'I' into a dotless i).
+		String name = parser.textFrom(start).toLowerCase(java.util.Locale.ROOT);
+		return parser.success(name);
+	}
+
+	private String parseAttrValue() {
+		parser.begin();
+		while( matchSpace() );
+		if( !parser.match('=') )
+			return parser.failure(null);
+		while( matchSpace() );
+		if( parser.anyOf("\"'") ) {
+			char quote = parser.lastChar();
+			int start = parser.currentIndex();
+			while( !parser.test(quote) ) {
+				if( !parser.anyChar() )
+					return parser.failure(null);
+			}
+			String value = parser.textFrom(start);
+			parser.match(quote);
+			value = decode(value);
+			return parser.success(value);
+		}
+		int start = parser.currentIndex();
+		if( !matchValueChar() )
+			return parser.failure(null);
+		while( matchValueChar() );
+		String value = parser.textFrom(start);
+		value = decode(value);
+		return parser.success(value);
+	}
+
+	// Consumes one character allowed in a tag or attribute name:
+	// an ASCII letter or digit, or one of "_.-:".
+	private boolean matchNameChar() {
+		if( parser.inCharRange('a','z') )
+			return true;
+		if( parser.inCharRange('A','Z') )
+			return true;
+		if( parser.inCharRange('0','9') )
+			return true;
+		return parser.anyOf("_.-:");
+	}
+
+	// Consumes one character of an unquoted attribute value: anything except
+	// whitespace, a quote, '>', '/' or '='.
+	private boolean matchValueChar() {
+		boolean consumed = parser.noneOf(" \t\r\n\"'>/=");
+		return consumed;
+	}
+
+	// Consumes one whitespace character (space, tab, CR or LF).
+	private boolean matchSpace() {
+		boolean consumed = parser.anyOf(" \t\r\n");
+		return consumed;
+	}
+
+}
--- a/src/luan/modules/Html.luan	Wed Jul 13 21:50:41 2022 -0600
+++ b/src/luan/modules/Html.luan	Thu Jul 14 22:14:21 2022 -0600
@@ -1,6 +1,5 @@
 require "java"
 local HtmlLuan = require "java:luan.modules.HtmlLuan"
-local HtmlParser = require "java:luan.modules.parsers.Html"
 local URLEncoder = require "java:java.net.URLEncoder"
 local URLDecoder = require "java:java.net.URLDecoder"
 local JsonToString = require "java:goodjava.json.JsonToString"
@@ -16,16 +15,14 @@
 local Html = {}
 
 Html.decode = HtmlLuan.decode
-Html.encode = HtmlLuan.encode
+local encode = HtmlLuan.encode
+Html.encode = encode
 Html.javascript_encode = JsonToString.javascriptEncode
 
-local quote = HtmlLuan.quote
-Html.quote = quote
-
 function Html.parse(text,container_tags)
 	text or error "text required"
 	container_tags = container_tags or {"script","style","textarea"}
-	return HtmlParser.toList(text,container_tags)
+	return HtmlLuan.parse(text,container_tags)
 end
 
 function Html.url_encode(s)
@@ -41,7 +38,7 @@
 	for name, value in pairs(tag.attributes) do
 		%> <%= name %><%
 		if value ~= true then
-			%>=<%= quote(value) %><%
+			%>="<%= encode(value) %>"<%
 		end
 	end
 	if tag.is_empty then
@@ -55,7 +52,7 @@
 		for _, obj in ipairs(list) do
 			local tp = type(obj)
 			if tp == "string" then
-				%><%= obj %><%
+				%><%= encode(obj) %><%
 			elseif tp == "table" then
 				tp = obj.type
 				if tp == nil then
--- a/src/luan/modules/HtmlLuan.java	Wed Jul 13 21:50:41 2022 -0600
+++ b/src/luan/modules/HtmlLuan.java	Thu Jul 14 22:14:21 2022 -0600
@@ -2,13 +2,9 @@
 
 import java.util.List;
 import java.util.ArrayList;
-import java.util.Arrays;
 import java.util.Set;
 import java.util.HashSet;
-import java.util.Map;
-import java.util.regex.Pattern;
-import java.util.regex.Matcher;
-import luan.Luan;
+import goodjava.html.Html;
 import luan.LuanTable;
 import luan.LuanException;
 
@@ -17,67 +13,63 @@
 
 	public static String encode(String s) throws LuanException {
 		Utils.checkNotNull(s);
-		final char[] a = s.toCharArray();
-		StringBuilder buf = new StringBuilder();
-		for( char c : a ) {
-			switch(c) {
-			case '&':
-				buf.append("&amp;");
-				break;
-			case '<':
-				buf.append("&lt;");
-				break;
-			case '>':
-				buf.append("&gt;");
-				break;
-			case '"':
-				buf.append("&quot;");
-				break;
-			default:
-				buf.append(c);
-			}
-		}
-		return buf.toString();
+		return Html.encode(s);
 	}
 
-	private static final Pattern entityPtn = Pattern.compile(
-		"&#(\\d+);"
-	);
-
 	public static String decode(String s) {
-		StringBuffer buf = new StringBuffer();
-		Matcher m = entityPtn.matcher(s);
-		while( m.find() ) {
-			String entity = new String(new char[]{(char)Integer.parseInt(m.group(1))});
-			m.appendReplacement(buf,entity);
-		}
-		m.appendTail(buf);
-		s = buf.toString();
-		s = s.replace("&nbsp;"," ");
-		s = s.replace("&quot;","\"");
-		s = s.replace("&gt;",">");
-		s = s.replace("&lt;","<");
-		s = s.replace("&amp;","&");
-		return s;
+		return Html.decode(s);
 	}
 
-	public static String quote(String s) {
-		StringBuilder buf = new StringBuilder();
-		buf.append('"');
-		int i = 0;
-		while(true) {
-			int i2 = s.indexOf('"',i);
-			if( i2 == -1 ) {
-				buf.append(s.substring(i));
-				break;
-			} else {
-				buf.append(s.substring(i,i2));
-				buf.append("&quot;");
-				i = i2 + 1;
+	// Converts a parsed tag into a table with the keys
+	// "type" (always "tag"), "name", "attributes", "is_empty" and "raw".
+	private static LuanTable tag(Html.Tag tag) throws LuanException {
+		LuanTable result = new LuanTable();
+		result.rawPut("type","tag");
+		result.rawPut("name",tag.name);
+		LuanTable attributes = new LuanTable(tag.attributes);
+		result.rawPut("attributes",attributes);
+		result.rawPut("is_empty",tag.isEmpty);
+		result.rawPut("raw",tag.raw);
+		return result;
+	}
+	public static LuanTable parse(String text,LuanTable containerTagsTbl) {
+		try {
+			Set<String> containerTags = new HashSet();
+			for( Object s : containerTagsTbl.asList() ) {
+				containerTags.add((String)s);
 			}
+			List list = Html.parse(text,containerTags);
+			List rtn = new ArrayList();
+			for( Object el : list ) {
+				if( el instanceof String ) {
+					rtn.add(el);
+				} else if( el instanceof Html.Tag ) {
+					Html.Tag tag = (Html.Tag)el;
+					rtn.add(tag(tag));
+				} else if( el instanceof Html.Comment ) {
+					Html.Comment comment = (Html.Comment)el;
+					LuanTable tbl = new LuanTable();
+					tbl.rawPut("type","comment");
+					tbl.rawPut("text",comment.text);
+					rtn.add(tbl);
+				} else if( el instanceof Html.CData ) {
+					Html.CData cdata = (Html.CData)el;
+					LuanTable tbl = new LuanTable();
+					tbl.rawPut("type","cdata");
+					tbl.rawPut("text",cdata.text);
+					rtn.add(tbl);
+				} else if( el instanceof Html.Container ) {
+					Html.Container container = (Html.Container)el;
+					LuanTable tbl = new LuanTable();
+					tbl.rawPut("type","container");
+					tbl.rawPut("tag",tag(container.tag));
+					tbl.rawPut("text",container.text);
+					rtn.add(tbl);
+				} else {
+					throw new RuntimeException("invalid el "+el);
+				}
+			}
+			return new LuanTable(rtn);
+		} catch(LuanException e) {
+			throw new RuntimeException(e);
 		}
-		buf.append('"');
-		return buf.toString();
 	}
 
 }
--- a/src/luan/modules/Parsers.luan	Wed Jul 13 21:50:41 2022 -0600
+++ b/src/luan/modules/Parsers.luan	Thu Jul 14 22:14:21 2022 -0600
@@ -1,4 +1,5 @@
 require "java"
+local BBCode = require "java:goodjava.bbcode.BBCode"
 local BBCodeLuan = require "java:luan.modules.parsers.BBCodeLuan"
 local Csv = require "java:luan.modules.parsers.Csv"
 local Theme = require "java:luan.modules.parsers.Theme"
@@ -9,6 +10,7 @@
 
 local Parsers = {}
 
+Parsers.bbcode_encode = BBCode.encode
 Parsers.bbcode_parse = BBCodeLuan.parse
 Parsers.csv_to_list = Csv.toList
 Parsers.json_string = BasicLuan.json_string