comparison src/nabble/model/Lucene.java @ 0:7ecd1a4ef557

add content
author Franklin Schmidt <fschmidt@gmail.com>
date Thu, 21 Mar 2019 19:15:52 -0600
parents
children abe0694e9849
comparison
equal deleted inserted replaced
-1:000000000000 0:7ecd1a4ef557
1 /*
2
3 Copyright (C) 2004 Franklin Schmidt <frank@gustos.com>
4
5 */
6
7 package nabble.model;
8
9 import fschmidt.db.Listener;
10 import fschmidt.util.java.CollectionUtils;
11 import fschmidt.util.mail.MailEncodingException;
12 import nabble.model.lucene.HitCollector;
13 import nabble.model.lucene.IndexCache;
14 import nabble.model.lucene.LuceneSearcher;
15 import nabble.view.lib.Permissions;
16 import nabble.view.lib.help.Help;
17 import org.apache.lucene.analysis.Analyzer;
18 import org.apache.lucene.analysis.snowball.SnowballAnalyzer;
19 import org.apache.lucene.document.Document;
20 import org.apache.lucene.document.Field;
21 import org.apache.lucene.document.NumericField;
22 import org.apache.lucene.index.IndexReader;
23 import org.apache.lucene.index.IndexWriter;
24 import org.apache.lucene.index.Term;
25 import org.apache.lucene.queryParser.ParseException;
26 import org.apache.lucene.search.BooleanClause;
27 import org.apache.lucene.search.BooleanFilter;
28 import org.apache.lucene.search.BooleanQuery;
29 import org.apache.lucene.search.CachingWrapperFilter;
30 import org.apache.lucene.search.Filter;
31 import org.apache.lucene.search.FilterClause;
32 import org.apache.lucene.search.IndexSearcher;
33 import org.apache.lucene.search.NumericRangeFilter;
34 import org.apache.lucene.search.Query;
35 import org.apache.lucene.search.Searcher;
36 import org.apache.lucene.search.TermQuery;
37 import org.apache.lucene.search.TopDocs;
38 import org.apache.lucene.store.Directory;
39 import org.apache.lucene.store.RAMDirectory;
40 import org.apache.lucene.util.Version;
41 import org.slf4j.Logger;
42 import org.slf4j.LoggerFactory;
43
44 import java.io.File;
45 import java.io.IOException;
46 import java.sql.Connection;
47 import java.sql.PreparedStatement;
48 import java.sql.ResultSet;
49 import java.sql.SQLException;
50 import java.sql.Statement;
51 import java.text.DateFormat;
52 import java.text.SimpleDateFormat;
53 import java.util.ArrayList;
54 import java.util.Collection;
55 import java.util.Date;
56 import java.util.LinkedHashMap;
57 import java.util.List;
58 import java.util.Map;
59 import java.util.Set;
60
61
62 public final class Lucene {
63 private static final Logger logger = LoggerFactory.getLogger(Lucene.class);
64
65 public static interface DocumentListener {
66 public void event(Node node,Document doc);
67 }
68
69 private static final int nodeIndexVersion = 3;
70
71 private static final String NODE_ID_FLD = "nodeId";
72 static final String KIND_FLD = "kind";
73 static final String SUBJECT_FLD = "subject";
74 static final String MESSAGE_FLD = "message";
75 static final String ANCESTORS_FLD = "ancestors";
76 static final String PARENT_ID_FLD = "parentId";
77 static final String DATE_FLD = "date";
78 private static final String RANGE_SEARCH_DATE_FLD = "rangeSearchDate";
79 private static final String DAY_FLD = "day";
80 static final String USER_ID_FLD = "userId";
81 static final String AUTHOR_FLD = "author";
82 static final String PRIVATE_NODE_FLD = "privateNode";
83 static final String MAILING_LIST_FLD = "mailingList";
84
85 static final Analyzer analyzer = new SnowballAnalyzer(Version.LUCENE_CURRENT,"English");
86
87 private static final List<DocumentListener> documentListeners = new ArrayList<DocumentListener>();
88
89 private Lucene() {} // never
90
91 static LuceneSearcher newSearcher(Site site) throws IOException {
92 return nodeIndex.openSearcher(site.getId());
93 }
94
95 static long getNodeId(Document doc) {
96 return Long.parseLong(doc.get(NODE_ID_FLD));
97 }
98
99 static NodeImpl getNode(SiteImpl site, LuceneSearcher searcher, int docId) throws IOException {
100 return getNode( site, searcher.doc(docId) );
101 }
102
103 static NodeImpl getNode(SiteImpl site,Document doc) {
104 long nodeId = getNodeId(doc);
105 NodeImpl node = NodeImpl.getNode(site.siteKey,nodeId);
106 if( node==null ) {
107 logger.error("missing node "+nodeId+", removing from lucene");
108 removeNode(site,nodeId);
109 }
110 return node;
111 }
112
113 private static void add(final Node node) {
114 Document doc = document(node);
115 try {
116 IndexWriter indexWriter = nodeIndex.openIndexWriter(node.getSite().getId());
117 try {
118 indexWriter.addDocument(doc);
119 } finally {
120 indexWriter.close();
121 }
122 } catch(IOException e) {
123 throw new RuntimeException(e);
124 }
125 }
126 /*
127 private static void removeSite(long siteId) {
128 try {
129 nodeIndex.delete(siteId);
130 } catch(IOException e) {
131 throw new RuntimeException(e);
132 }
133 }
134 */
135 private static void removeNode(Site site,long nodeId) {
136 Term term = new Term(NODE_ID_FLD,Long.toString(nodeId));
137 try {
138 IndexWriter indexWriter = nodeIndex.openIndexWriter(site.getId());
139 try {
140 indexWriter.deleteDocuments(term);
141 } finally {
142 indexWriter.close();
143 }
144 } catch(IOException e) {
145 throw new RuntimeException(e);
146 }
147 }
148
149 public static void update(final Node node) {
150 try {
151 Document doc = document(node);
152 if( doc==null ) {
153 removeNode(node.getSite(),node.getId());
154 } else {
155 IndexWriter indexWriter = nodeIndex.openIndexWriter(node.getSite().getId());
156 try {
157 indexWriter.updateDocument( new Term(NODE_ID_FLD,doc.get(NODE_ID_FLD)), doc );
158 } finally {
159 indexWriter.close();
160 }
161 }
162 } catch (IOException e) {
163 throw new RuntimeException(e);
164 }
165 }
166
167 static void updateNode(SiteImpl site,long nodeId) {
168 Node node = NodeImpl.getNode(site.siteKey,nodeId);
169 if( node == null ) {
170 removeNode(site,nodeId);
171 } else {
172 update(node);
173 }
174 }
175
// Registers the database listeners that keep the Lucene index in step with
// node and mailing-list changes.
static {
/*
    SiteImpl.table.getPostDeleteListeners().add(new Listener<SiteImpl>(){
        public void event(SiteImpl site) {
            removeSite(site.getId());
        }
    });
*/
    // Node deleted: drop all of its descendants from the index.
    NodeImpl.postDeleteListeners.add(new Listener<NodeImpl>(){
        public void event(NodeImpl node) {
            // remove descendants
            Term term = new Term(ANCESTORS_FLD,Long.toString(node.getId()));
            try {
                IndexWriter indexWriter = nodeIndex.openIndexWriter(node.siteKey.getId());
                try {
                    indexWriter.deleteDocuments(term);
                } finally {
                    indexWriter.close();
                }
            } catch(IOException e) {
                throw new RuntimeException(e);
            }
        }
    });
    // Node inserted: index it once the transaction has committed.
    NodeImpl.postInsertListeners.add(new Listener<NodeImpl>(){
        public void event(final NodeImpl node) {
            node.siteKey.getDb().runAfterCommit(new Runnable(){public void run(){
                try {
                    add(node);
                } catch(MailEncodingException e) {
                    logger.warn(node.toString(),e);
                }
            }});
        }
    });
    // Node updated: re-index after commit, but only if an indexed column changed.
    NodeImpl.preUpdateListeners.add(new Listener<NodeImpl>(){
        public void event(NodeImpl node) {
            Set fields = node.getDbRecord().fields().keySet();
            if( CollectionUtils.intersects(fields,nodeDbFields) ) {
                final long nodeId = node.getId();
                final SiteKey siteKey = node.siteKey;
                siteKey.getDb().runAfterCommit(new Runnable() {
                    public void run() {
                        // reload: the node may have been deleted in the meantime
                        NodeImpl node = NodeImpl.getNode(siteKey,nodeId);
                        if (node != null) update(node);
                    }
                });
            }
        }
    });
    // Mailing list deleted: re-index its forum so the mailing-list field is dropped.
    MailingListImpl.postDeleteListeners.add(new Listener<MailingListImpl>(){
        public void event(MailingListImpl mailingList) {
            update(mailingList.getForum());
        }
    });
    // Mailing list inserted: re-index its forum after commit.
    MailingListImpl.postInsertListeners.add(new Listener<MailingListImpl>(){
        public void event(final MailingListImpl mailingList) {
            mailingList.siteKey.getDb().runAfterCommit(new Runnable(){public void run(){
                update(mailingList.getForum());
            }});
        }
    });
    // Mailing list updated: re-index its forum after commit if an indexed column changed.
    MailingListImpl.preUpdateListeners.add(new Listener<MailingListImpl>(){
        public void event(MailingListImpl mailingList) {
            Set fields = mailingList.getDbRecord().fields().keySet();
            if( CollectionUtils.intersects(fields,mailingListDbFields) ) {
                final long nodeId = mailingList.getForum().getId();
                final SiteKey siteKey = mailingList.siteKey;
                siteKey.getDb().runAfterCommit(new Runnable() {
                    public void run() {
                        NodeImpl node = NodeImpl.getNode(siteKey,nodeId);
                        // Same guard as the node pre-update listener: the forum may be
                        // gone by commit time, and update(null) would throw an NPE.
                        if (node != null) update(node);
                    }
                });
            }
        }
    });
}
254
255 static void staleNode(NodeImpl node) throws IOException {
256 if( node==null )
257 return;
258 logger.debug("staleNode update");
259 updateNodes( node.getSiteImpl(), descendants(node) );
260 logger.debug("staleNode done");
261 }
262
263 static void nop() {}
264
265 public static void addDocumentListener(DocumentListener documentListener) {
266 documentListeners.add(documentListener);
267 }
268
269 static Document document(Node node) {
270 Document doc = new Document();
271 doc.add( new Field(NODE_ID_FLD, Long.toString(node.getId()), Field.Store.YES, Field.Index.NOT_ANALYZED_NO_NORMS) );
272 doc.add( new Field(KIND_FLD, node.getKind().toString(), Field.Store.NO, Field.Index.NOT_ANALYZED_NO_NORMS) );
273 String subject = node.getSubject();
274 Field subjectFld = new Field(SUBJECT_FLD, subject, Field.Store.NO, Field.Index.ANALYZED);
275 subjectFld.setBoost(2.0f);
276 doc.add(subjectFld);
277 try {
278 String message = MessageUtils.htmlToSearchText(node.getMessage().parse());
279 doc.add( new Field(MESSAGE_FLD, message, Field.Store.NO, Field.Index.ANALYZED) );
280 } catch(RuntimeException e) {
281 logger.error("nodeId="+node.getId(),e);
282 }
283
284 for( Node f : node.getAncestors() ) {
285 doc.add( new Field(ANCESTORS_FLD, Long.toString(f.getId()), Field.Store.NO, Field.Index.NOT_ANALYZED_NO_NORMS) );
286 }
287 Node parent = node.getParent();
288 if (parent != null)
289 doc.add(new Field(PARENT_ID_FLD, Long.toString(parent.getId()), Field.Store.YES, Field.Index.NOT_ANALYZED_NO_NORMS));
290
291 int date = (int)(-node.getWhenCreated().getTime()/1000);
292 doc.add( new NumericField(DATE_FLD).setIntValue(date) );
293 int rangeSearchDate = formatRangeSearchDate(node.getWhenCreated());
294 doc.add( new NumericField(RANGE_SEARCH_DATE_FLD).setIntValue(rangeSearchDate) );
295 String day = formatDay(node.getWhenCreated());
296 doc.add( new Field(DAY_FLD, day, Field.Store.NO, Field.Index.NOT_ANALYZED_NO_NORMS) );
297
298 Person owner = node.getOwner();
299 String userId = owner.getSearchId();
300 doc.add( new Field(USER_ID_FLD, userId, Field.Store.NO, Field.Index.NOT_ANALYZED_NO_NORMS) );
301 String author = owner.getName();
302 doc.add( new Field(AUTHOR_FLD, author, Field.Store.NO, Field.Index.ANALYZED) );
303 doc.add( new Field(PRIVATE_NODE_FLD, formatPrivateNode(node), Field.Store.NO, Field.Index.NOT_ANALYZED_NO_NORMS));
304 MailingList mailingList = node.getMailingList();
305 if (mailingList != null) { // only for forums
306 Field listAddrFld = new Field(MAILING_LIST_FLD, mailingList.getListAddress().toLowerCase(), Field.Store.NO, Field.Index.NOT_ANALYZED_NO_NORMS);
307 doc.add( listAddrFld );
308 }
309 for( DocumentListener documentListener : documentListeners ) {
310 documentListener.event(node,doc);
311 }
312 return doc;
313 }
314
315 private static final String[] nodeDbFields =
316 {"subject", "when_created", "msg_fmt", "parent_id", "is_app", "owner_id", "cookie", "anonymous_name"};
317
318 private static final String[] mailingListDbFields = {"mailing_list"};
319
320
321 public static void updateRecursively(Node node) {
322 update(node);
323 for (Node n : node.getChildren()) {
324 updateRecursively(n);
325 }
326 }
327
328
329
330
331
332
333
334 // from SearchServer
335
336 static NodeImpl node(SiteImpl site,Document doc) {
337 long nodeId = getNodeId(doc);
338 NodeImpl node = NodeImpl.getNode(site.siteKey,nodeId);
339 if (node==null)
340 logger.error("invalid node_id in lucene index: "+nodeId);
341 return node;
342 }
343
344 private static final IndexCache.Builder<Long> builder = new IndexCache.Builder<Long>() {
345
346 public void build(Long siteId) throws SQLException, IOException {
347 SiteKey siteKey = SiteKey.getInstance(siteId);
348 Connection con = siteKey.getDb().getConnection();
349 long[] nodeIds;
350 {
351 Statement stmt = con.createStatement();
352 ResultSet rs = stmt.executeQuery(
353 "select count(*) as n from node"
354 );
355 rs.next();
356 nodeIds = new long[rs.getInt("n")];
357 rs.close();
358 stmt.close();
359 }
360 {
361 PreparedStatement stmt = con.prepareStatement(
362 "select node_id from node order by node_id limit ?"
363 );
364 stmt.setInt(1,nodeIds.length);
365 ResultSet rs = stmt.executeQuery();
366 for( int i=0; rs.next(); i++ ) {
367 nodeIds[i] = rs.getLong("node_id");
368 }
369 rs.close();
370 stmt.close();
371 }
372 logger.error("Lucene started - site_id = " + siteId + " / " + nodeIds.length + " nodes");
373 IndexWriter indexWriter = nodeIndex.openIndexWriter(siteId);
374 int count = 0;
375 int lastPercent = 0;
376 try {
377 for( long nodeId : nodeIds ) {
378 Node node = NodeImpl.getNode(siteKey,nodeId);
379 if( node != null ) {
380 Document doc = document(node);
381 indexWriter.updateDocument( new Term(NODE_ID_FLD,doc.get(NODE_ID_FLD)), doc );
382 }
383 count++;
384 int percent = Math.round(100f * count / (float) nodeIds.length);
385 if (percent > lastPercent) {
386 logger.error("Lucene build " + percent + "% completed");
387 lastPercent = percent;
388 }
389 }
390 } finally {
391 indexWriter.close();
392 }
393 con.close();
394 }
395
396 public boolean exists(String keyString) {
397 long id;
398 try {
399 id = Long.parseLong(keyString);
400 } catch(NumberFormatException e) {
401 return false;
402 }
403 return SiteKey.getInstance(id).siteGlobal() != null;
404 }
405 };
406
/** Per-site node indexes, stored under the "lucene/" directory inside the configured local_dir. */
private static final IndexCache<Long> nodeIndex;
static {
    logger.info("Starting search server");
    Init.luceneStarted = true;
    final String indexPath = (String)Init.get("local_dir") + "lucene/";
    nodeIndex = new IndexCache<Long>(new File(indexPath), analyzer, nodeIndexVersion, builder);
}
416
417 private static void updateNodes(final SiteImpl site,Query query) {
418 try {
419 final LuceneSearcher searcher = newSearcher(site);
420 try {
421 searcher.search(query,new HitCollector() {
422 protected void process(Document doc) {
423 Node node = getNode(site,doc);
424 if( node != null )
425 update(node);
426 }
427 });
428 } finally {
429 searcher.close();
430 }
431 } catch(IOException e) {
432 throw new RuntimeException(e);
433 }
434 }
435
436
437 public static boolean isReady(Site site) {
438 return nodeIndex.isReady(site.getId());
439 }
440
441 public static void rebuild(Site site) throws IOException {
442 nodeIndex.rebuild(site.getId());
443 }
444
445 static synchronized void shutdown() {
446 nodeIndex.shutdown();
447 }
448
449
450
451
452
453 private static final long tenMinutes = 1000L*60*10;
454
455 static int formatRangeSearchDate(Date date) {
456 return (int)(date.getTime()/tenMinutes);
457 }
458
459
460 private static final DateFormat dayFormat = new SimpleDateFormat("yyyyMMdd");
461
462 static String formatDay(Date date) {
463 synchronized(dayFormat) {
464 return dayFormat.format(date);
465 }
466 }
467
468 static String formatPrivateNode(Node node) {
469 Node privateNode = Permissions.getPrivateNodeForSearch(node);
470 return privateNode==null ? "none" : Long.toString(privateNode.getId());
471 }
472
473
474 public static Filter and(Filter f1,Filter f2) {
475 BooleanFilter f = new BooleanFilter();
476 f.add(new FilterClause(f1,BooleanClause.Occur.MUST));
477 f.add(new FilterClause(f2,BooleanClause.Occur.MUST));
478 return f;
479 }
480
481 public static Filter getRangeFilter(Date from, Date to) {
482 Integer lowerDateTerm = (from==null)?null:formatRangeSearchDate(from);
483 Integer upperDateTerm = (to==null)?null:formatRangeSearchDate(to);
484 return NumericRangeFilter.newIntRange(RANGE_SEARCH_DATE_FLD, lowerDateTerm, upperDateTerm, true,true);
485 }
486
487
488 private static final int maxCachedFilters = Init.get("maxCachedFilters", 20);
489
490 private static Map<Filter,CachingWrapperFilter> filterCache = new LinkedHashMap<Filter,CachingWrapperFilter>() {
491 protected boolean removeEldestEntry(Map.Entry eldest) {
492 return size() > maxCachedFilters;
493 }
494 };
495
496 public static synchronized CachingWrapperFilter getCachedFilter(Filter filter) {
497 CachingWrapperFilter f = filterCache.get(filter);
498 if( f == null ) {
499 f = new CachingWrapperFilter(filter);
500 filterCache.put(filter,f);
501 }
502 return f;
503 }
504
505
506 static Query descendants(Node node) {
507 return descendants(node.getId());
508 }
509
510 private static Query descendants(long nodeId) {
511 return new TermQuery(new Term(ANCESTORS_FLD,Long.toString(nodeId)));
512 }
513
514 static Query children(Node node) {
515 return new TermQuery(new Term(PARENT_ID_FLD,Long.toString(node.getId())));
516 }
517
518 static Query node(Node node) {
519 return node(node.getId());
520 }
521
522 static Query node(long nodeId) {
523 return new TermQuery(new Term(NODE_ID_FLD,Long.toString(nodeId)));
524 }
525
526 static Query day(Date date) {
527 return new TermQuery(new Term(DAY_FLD,formatDay(date)));
528 }
529
530
531 private static final Directory helpDir = new RAMDirectory();
532 private static IndexReader helpIndexReader;
533
534 private static final String[] helpSearchFields = new String[] {
535 "answer", "question"
536 };
537
538 public static Help[] searchHelp(String line) throws ParseException {
539 try {
540 Query query = NodeSearcher.parse(line,helpSearchFields);
541 Searcher searcher = new IndexSearcher(helpIndexReader);
542 try {
543 TopDocs hits = searcher.search(query,helpIndexReader.numDocs());
544 Help[] helps = new Help[hits.scoreDocs.length];
545 for( int i=0; i<helps.length; i++ ) {
546 helps[i] = Help.getHelp(Integer.parseInt(searcher.doc(hits.scoreDocs[i].doc).get("id")));
547 }
548 return helps;
549 } catch (BooleanQuery.TooManyClauses e) {
550 throw new RuntimeException("Your search will give too many matches.");
551 } finally {
552 searcher.close();
553 }
554 } catch (IOException e) {
555 throw new RuntimeException(e);
556 }
557 }
558
559 public static void addHelp(final Collection<Help> helps) {
560 try {
561 IndexWriter writer = new IndexWriter(helpDir,analyzer,true,IndexWriter.MaxFieldLength.LIMITED);
562 for( Help help : helps ) {
563 writer.addDocument(document(help));
564 }
565 writer.close();
566 helpIndexReader = IndexReader.open(helpDir,true);
567 } catch (IOException e) {
568 throw new RuntimeException(e);
569 }
570 }
571
572 private static Document document(Help help) {
573 Document doc = new Document();
574 String id = Integer.toString(help.id);
575 doc.add( new Field("id", id, Field.Store.YES, Field.Index.NOT_ANALYZED_NO_NORMS));
576 Field answer = new Field("answer", help.answer(), Field.Store.NO, Field.Index.ANALYZED);
577 doc.add(answer);
578 Field question = new Field("question", help.question, Field.Store.NO, Field.Index.ANALYZED);
579 doc.add(question);
580 return doc;
581 }
582
583 }