comparison src/luan/modules/HtmlLuan.java @ 1341:a015a0b5c388

add Html.decode(), Lucene.count_tokens(), lucene boosts, Sql.database.set()
author Franklin Schmidt <fschmidt@gmail.com>
date Tue, 19 Feb 2019 08:14:40 -0700
parents 25746915a241
children 21f5edab1fbf
comparison
equal deleted inserted replaced
1340:b3c4fcf29a53 1341:a015a0b5c388
4 import java.util.ArrayList; 4 import java.util.ArrayList;
5 import java.util.Arrays; 5 import java.util.Arrays;
6 import java.util.Set; 6 import java.util.Set;
7 import java.util.HashSet; 7 import java.util.HashSet;
8 import java.util.Map; 8 import java.util.Map;
9 import java.util.regex.Pattern;
10 import java.util.regex.Matcher;
9 import luan.Luan; 11 import luan.Luan;
10 import luan.LuanTable; 12 import luan.LuanTable;
11 import luan.LuanException; 13 import luan.LuanException;
12 14
13 15
37 } 39 }
38 } 40 }
39 return buf.toString(); 41 return buf.toString();
40 } 42 }
41 43
42 /* 44 private static final Pattern entityPtn = Pattern.compile(
43 // public static final String TEXTAREA = "textarea"; 45 "&#(\\d+);"
44 public static final String SCRIPT = "script"; 46 );
45 public static final String STYLE = "style";
46 47
47 public static Set<String> containerTags = new HashSet<String>(Arrays.asList(SCRIPT,STYLE)); 48 public static String decode(String s) {
48 */ 49 StringBuffer buf = new StringBuffer();
49 /* 50 Matcher m = entityPtn.matcher(s);
50 public static LuanTable parse(Luan luan,String text,LuanTable containerTagsTbl) 51 while( m.find() ) {
51 throws LuanException 52 String entity = new String(new char[]{(char)Integer.parseInt(m.group(1))});
52 { 53 m.appendReplacement(buf,entity);
53 Utils.checkNotNull(luan,text);
54 Utils.checkNotNull(luan,containerTagsTbl);
55 Set<String> containerTags = new HashSet<String>();
56 for( Object v : containerTagsTbl.asList() ) {
57 containerTags.add((String)v);
58 } 54 }
59 List<Object> html = new ArrayList<Object>(); 55 m.appendTail(buf);
60 int len = text.length(); 56 s = buf.toString();
61 int i = 0; 57 s = s.replace("&nbsp;"," ");
62 outer: 58 s = s.replace("&quot;","\"");
63 while( i < len ) { 59 s = s.replace("&gt;",">");
64 int i2 = text.indexOf('<',i); 60 s = s.replace("&lt;","<");
65 while( i2 != -1 && i2+1 < len ) { 61 s = s.replace("&amp;","&");
66 char c = text.charAt(i2+1); 62 return s;
67 if( Character.isLetter(c) || c=='/' || c=='!' )
68 break;
69 i2 = text.indexOf('<',i2+1);
70 }
71 if( i2 == -1 ) {
72 html.add( text.substring(i) );
73 break;
74 }
75 if( i < i2 )
76 html.add( text.substring(i,i2) );
77 if( text.startsWith("<!--",i2) ) {
78 i = text.indexOf("-->",i2+4);
79 if( i == -1 ) {
80 html.add( text.substring(i2) );
81 break;
82 }
83 html.add( comment( text.substring(i2+4,i) ) );
84 i += 3;
85 } else if( text.startsWith("<![CDATA[",i2) ) {
86 i = text.indexOf("]]>",i2+9);
87 if( i == -1 ) {
88 html.add( text.substring(i2) );
89 break;
90 }
91 html.add( cdata( text.substring(i2+9,i) ) );
92 i += 3;
93 } else {
94 i = text.indexOf('>',i2);
95 if( i == -1 ) {
96 html.add( text.substring(i2) );
97 break;
98 }
99 String tagText = text.substring(i2+1,i);
100 try {
101 LuanTable tag = parseTag(tagText);
102 String tagName = (String)tag.rawGet("name");
103 if( containerTags.contains(tagName) ) {
104 i2 = i;
105 String endTagName = '/' + tagName;
106 while(true) {
107 i2 = text.indexOf('<',i2+1);
108 if( i2 == -1 )
109 break;
110 int i3 = text.indexOf('>',i2);
111 if( i3 == -1 )
112 break;
113 int j = i2+1;
114 while( j<i3 && !Character.isWhitespace(text.charAt(j)) ) j++;
115 String s = text.substring(i2+1,j);
116 if( s.equalsIgnoreCase(endTagName) ) {
117 String text2 = text.substring(i+1,i2);
118 LuanTable textContainer = textContainer(tag,text2);
119 html.add( textContainer );
120 i = i3 + 1;
121 continue outer;
122 }
123 }
124 // logger.warn("unclosed "+tagName);
125 }
126 i += 1;
127 html.add( tag );
128 } catch(BadTag e) {
129 // logger.debug("bad tag",e);
130 i += 1;
131 // if( !removeBadTags ) {
132 html.add( "&lt;" );
133 html.add( encode(luan,tagText) );
134 html.add( "&gt;" );
135 // }
136 }
137 }
138 }
139 return new LuanTable(html);
140 } 63 }
141 64
142 static LuanTable comment(String text) {
143 LuanTable tbl = new LuanTable();
144 tbl.rawPut("type","comment");
145 tbl.rawPut("text",text);
146 return tbl;
147 }
148
149 static LuanTable cdata(String text) {
150 LuanTable tbl = new LuanTable();
151 tbl.rawPut("type","cdata");
152 tbl.rawPut("text",text);
153 return tbl;
154 }
155
156 static LuanTable textContainer(LuanTable tag,String text) {
157 LuanTable tbl = new LuanTable();
158 tbl.rawPut("type","container");
159 tbl.rawPut("tag",tag);
160 tbl.rawPut("text",text);
161 return tbl;
162 }
163
164
165
166 static final class BadTag extends RuntimeException {
167 private BadTag(String msg) {
168 super(msg);
169 }
170 }
171
172 static LuanTable parseTag(String text) {
173 LuanTable tbl = new LuanTable();
174 tbl.rawPut("type","tag");
175 if( text.endsWith("/") ) {
176 text = text.substring(0,text.length()-1);
177 tbl.rawPut("is_empty",true);
178 } else {
179 tbl.rawPut("is_empty",false);
180 }
181 int len = text.length();
182 int i = 0;
183 int i2 = i;
184 if( i2<len && text.charAt(i2)=='/' )
185 i2++;
186 while( i2<len ) {
187 char c = text.charAt(i2);
188 if( Character.isWhitespace(c) )
189 break;
190 if( !( Character.isLetterOrDigit(c) || c=='_' || c=='.' || c=='-' || c==':' ) )
191 throw new BadTag("invalid tag name for <"+text+">");
192 i2++;
193 }
194 String name = text.substring(i,i2).toLowerCase();
195 tbl.rawPut("name",name);
196 LuanTable attributes = new LuanTable();
197 tbl.rawPut("attributes",attributes);
198 i = i2;
199 while( i<len && Character.isWhitespace(text.charAt(i)) ) i++;
200 while( i<len ) {
201 i2 = toEndName(text,i,len);
202 String attrName = unquote(text.substring(i,i2).toLowerCase());
203 if( attributes.rawGet(attrName) != null )
204 throw new BadTag("duplicate attribute: "+attrName);
205 i = i2;
206 while( i<len && Character.isWhitespace(text.charAt(i)) ) i++;
207 if( i<len && text.charAt(i) == '=' ) {
208 i++;
209 i2 = i;
210 while( i<len && Character.isWhitespace(text.charAt(i)) ) i++;
211 i2 = toEndValue(text,i,len);
212 String attrValue = text.substring(i,i2);
213 if( attrValue.indexOf('<') != -1 || attrValue.indexOf('>') != -1 )
214 throw new BadTag("invalid attribute value: "+attrValue);
215 attrValue = unquote(attrValue);
216 attributes.rawPut(attrName,attrValue);
217 i = i2;
218 while( i<len && Character.isWhitespace(text.charAt(i)) ) i++;
219 } else {
220 attributes.rawPut(attrName,true);
221 }
222 }
223 return tbl;
224 }
225
226 private static int toEndName(String text,int i,int len) {
227 if( i==len )
228 return i;
229 char c = text.charAt(i);
230 switch(c) {
231 case '"':
232 case '\'':
233 i = text.indexOf(c,i+1);
234 return i==-1 ? len : i+1;
235 default:
236 if( Character.isWhitespace(c) ) {
237 throw new RuntimeException("text="+text+" i="+i);
238 }
239 do {
240 i++;
241 } while( i<len && (c=text.charAt(i))!='=' && !Character.isWhitespace(c) );
242 return i;
243 }
244 }
245
246 private static int toEndValue(String text,int i,int len) {
247 if( i==len )
248 return i;
249 char c = text.charAt(i);
250 switch(c) {
251 case '"':
252 case '\'':
253 i = text.indexOf(c,i+1);
254 return i==-1 ? len : i+1;
255 default:
256 if( Character.isWhitespace(c) ) {
257 throw new RuntimeException("text="+text+" i="+i);
258 }
259 do {
260 i++;
261 } while( i<len && !Character.isWhitespace(text.charAt(i)) );
262 return i;
263 }
264 }
265
266 public static String unquote(String s) {
267 if( s==null || s.length()<=1 )
268 return s;
269 char c = s.charAt(0);
270 return (c=='"' || c=='\'') && s.charAt(s.length()-1)==c
271 ? s.substring(1,s.length()-1) : s;
272 }
273 */
274
275
276 /*
277 public static String to_string(Luan luan,LuanTable tbl) throws LuanException {
278 List<Object> html = tbl.asList();
279 StringBuilder buf = new StringBuilder();
280 for( Object o : html ) {
281 if( o instanceof String ) {
282 buf.append( o );
283 } else if( o instanceof LuanTable ) {
284 LuanTable t = (LuanTable)o;
285 String type = (String)t.get(luan,"type");
286 if( type==null )
287 throw new LuanException(luan, "no type in element of table for 'Html.to_string'" );
288 if( type.equals("comment") ) {
289 buf.append( "<!--" ).append( t.get(luan,"text") ).append( "-->" );
290 } else if( type.equals("cdata") ) {
291 buf.append( "<![CDATA[" ).append( t.get(luan,"text") ).append( "]]" );
292 } else if( type.equals("tag") ) {
293 buf.append( tagToString(luan,t) );
294 } else if( type.equals("container") ) {
295 LuanTable tag = (LuanTable)t.get(luan,"tag");
296 buf.append( tagToString(luan,tag) );
297 buf.append( t.get(luan,"text") );
298 buf.append( "</" ).append( tag.get(luan,"name") ).append( ">" );
299 } else {
300 throw new LuanException(luan, "invalid element type for 'Html.to_string'" );
301 }
302 } else
303 throw new LuanException(luan, "invalid value ("+Luan.type(o)+") in table for 'Html.to_string'" );
304 }
305 return buf.toString();
306 }
307
308 private static String tagToString(Luan luan,LuanTable tbl) throws LuanException {
309 StringBuilder buf = new StringBuilder();
310 buf.append('<');
311 buf.append(tbl.get(luan,"name"));
312 LuanTable attributes = (LuanTable)tbl.get(luan,"attributes");
313 for( Map.Entry<Object,Object> attr : attributes.iterable(luan) ) {
314 buf.append( ' ' );
315 buf.append( attr.getKey() );
316 Object val = attr.getValue();
317 if( !val.equals(Boolean.TRUE) ) {
318 buf.append( '=' );
319 buf.append( quote((String)val) );
320 }
321 }
322 if( tbl.get(luan,"is_empty").equals(Boolean.TRUE) )
323 buf.append('/');
324 buf.append('>');
325 return buf.toString();
326 }
327 */
328 public static String quote(String s) { 65 public static String quote(String s) {
329 StringBuilder buf = new StringBuilder(); 66 StringBuilder buf = new StringBuilder();
330 buf.append('"'); 67 buf.append('"');
331 int i = 0; 68 int i = 0;
332 while(true) { 69 while(true) {