Mercurial Hosting > nabble
diff src/nabble/model/Lucene.java @ 0:7ecd1a4ef557
add content
author | Franklin Schmidt <fschmidt@gmail.com> |
---|---|
date | Thu, 21 Mar 2019 19:15:52 -0600 |
parents | |
children | abe0694e9849 |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/src/nabble/model/Lucene.java Thu Mar 21 19:15:52 2019 -0600 @@ -0,0 +1,583 @@ +/* + +Copyright (C) 2004 Franklin Schmidt <frank@gustos.com> + +*/ + +package nabble.model; + +import fschmidt.db.Listener; +import fschmidt.util.java.CollectionUtils; +import fschmidt.util.mail.MailEncodingException; +import nabble.model.lucene.HitCollector; +import nabble.model.lucene.IndexCache; +import nabble.model.lucene.LuceneSearcher; +import nabble.view.lib.Permissions; +import nabble.view.lib.help.Help; +import org.apache.lucene.analysis.Analyzer; +import org.apache.lucene.analysis.snowball.SnowballAnalyzer; +import org.apache.lucene.document.Document; +import org.apache.lucene.document.Field; +import org.apache.lucene.document.NumericField; +import org.apache.lucene.index.IndexReader; +import org.apache.lucene.index.IndexWriter; +import org.apache.lucene.index.Term; +import org.apache.lucene.queryParser.ParseException; +import org.apache.lucene.search.BooleanClause; +import org.apache.lucene.search.BooleanFilter; +import org.apache.lucene.search.BooleanQuery; +import org.apache.lucene.search.CachingWrapperFilter; +import org.apache.lucene.search.Filter; +import org.apache.lucene.search.FilterClause; +import org.apache.lucene.search.IndexSearcher; +import org.apache.lucene.search.NumericRangeFilter; +import org.apache.lucene.search.Query; +import org.apache.lucene.search.Searcher; +import org.apache.lucene.search.TermQuery; +import org.apache.lucene.search.TopDocs; +import org.apache.lucene.store.Directory; +import org.apache.lucene.store.RAMDirectory; +import org.apache.lucene.util.Version; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.io.File; +import java.io.IOException; +import java.sql.Connection; +import java.sql.PreparedStatement; +import java.sql.ResultSet; +import java.sql.SQLException; +import java.sql.Statement; +import java.text.DateFormat; +import java.text.SimpleDateFormat; +import java.util.ArrayList; +import java.util.Collection; +import java.util.Date; +import java.util.LinkedHashMap; +import java.util.List; +import java.util.Map; +import java.util.Set; + + +public final class Lucene { + private static final Logger logger = LoggerFactory.getLogger(Lucene.class); + + public static interface DocumentListener { + public void event(Node node,Document doc); + } + + private static final int nodeIndexVersion = 3; + + private static final String NODE_ID_FLD = "nodeId"; + static final String KIND_FLD = "kind"; + static final String SUBJECT_FLD = "subject"; + static final String MESSAGE_FLD = "message"; + static final String ANCESTORS_FLD = "ancestors"; + static final String PARENT_ID_FLD = "parentId"; + static final String DATE_FLD = "date"; + private static final String RANGE_SEARCH_DATE_FLD = "rangeSearchDate"; + private static final String DAY_FLD = "day"; + static final String USER_ID_FLD = "userId"; + static final String AUTHOR_FLD = "author"; + static final String PRIVATE_NODE_FLD = "privateNode"; + static final String MAILING_LIST_FLD = "mailingList"; + + static final Analyzer analyzer = new SnowballAnalyzer(Version.LUCENE_CURRENT,"English"); + + private static final List<DocumentListener> documentListeners = new ArrayList<DocumentListener>(); + + private Lucene() {} // never + + static LuceneSearcher newSearcher(Site site) throws IOException { + return nodeIndex.openSearcher(site.getId()); + } + + static long getNodeId(Document doc) { + return Long.parseLong(doc.get(NODE_ID_FLD)); + } + + static NodeImpl getNode(SiteImpl site, LuceneSearcher searcher, int docId) throws IOException { + return getNode( site, searcher.doc(docId) ); + } + + static NodeImpl getNode(SiteImpl site,Document doc) { + long nodeId = getNodeId(doc); + NodeImpl node = NodeImpl.getNode(site.siteKey,nodeId); + if( node==null ) { + logger.error("missing node "+nodeId+", removing from lucene"); + removeNode(site,nodeId); + } + return node; + } + + private static void add(final Node node) { + Document doc = document(node); + try { + IndexWriter indexWriter = nodeIndex.openIndexWriter(node.getSite().getId()); + try { + indexWriter.addDocument(doc); + } finally { + indexWriter.close(); + } + } catch(IOException e) { + throw new RuntimeException(e); + } + } +/* + private static void removeSite(long siteId) { + try { + nodeIndex.delete(siteId); + } catch(IOException e) { + throw new RuntimeException(e); + } + } +*/ + private static void removeNode(Site site,long nodeId) { + Term term = new Term(NODE_ID_FLD,Long.toString(nodeId)); + try { + IndexWriter indexWriter = nodeIndex.openIndexWriter(site.getId()); + try { + indexWriter.deleteDocuments(term); + } finally { + indexWriter.close(); + } + } catch(IOException e) { + throw new RuntimeException(e); + } + } + + public static void update(final Node node) { + try { + Document doc = document(node); + if( doc==null ) { + removeNode(node.getSite(),node.getId()); + } else { + IndexWriter indexWriter = nodeIndex.openIndexWriter(node.getSite().getId()); + try { + indexWriter.updateDocument( new Term(NODE_ID_FLD,doc.get(NODE_ID_FLD)), doc ); + } finally { + indexWriter.close(); + } + } + } catch (IOException e) { + throw new RuntimeException(e); + } + } + + static void updateNode(SiteImpl site,long nodeId) { + Node node = NodeImpl.getNode(site.siteKey,nodeId); + if( node == null ) { + removeNode(site,nodeId); + } else { + update(node); + } + } + + static { +/* + SiteImpl.table.getPostDeleteListeners().add(new Listener<SiteImpl>(){ + public void event(SiteImpl site) { + removeSite(site.getId()); + } + }); +*/ + NodeImpl.postDeleteListeners.add(new Listener<NodeImpl>(){ + public void event(NodeImpl node) { + // remove descendants + Term term = new Term(ANCESTORS_FLD,Long.toString(node.getId())); + try { + IndexWriter indexWriter = nodeIndex.openIndexWriter(node.siteKey.getId()); + try { + indexWriter.deleteDocuments(term); + } finally { + indexWriter.close(); + } + } catch(IOException e) { + throw new RuntimeException(e); + } + } + }); + NodeImpl.postInsertListeners.add(new Listener<NodeImpl>(){ + public void event(final NodeImpl node) { + node.siteKey.getDb().runAfterCommit(new Runnable(){public void run(){ + try { + add(node); + } catch(MailEncodingException e) { + logger.warn(node.toString(),e); + } + }}); + } + }); + NodeImpl.preUpdateListeners.add(new Listener<NodeImpl>(){ + public void event(NodeImpl node) { + Set fields = node.getDbRecord().fields().keySet(); + if( CollectionUtils.intersects(fields,nodeDbFields) ) { + final long nodeId = node.getId(); + final SiteKey siteKey = node.siteKey; + siteKey.getDb().runAfterCommit(new Runnable() { + public void run() { + NodeImpl node = NodeImpl.getNode(siteKey,nodeId); + if (node != null) update(node); + } + }); + } + } + }); + MailingListImpl.postDeleteListeners.add(new Listener<MailingListImpl>(){ + public void event(MailingListImpl mailingList) { + update(mailingList.getForum()); + } + }); + MailingListImpl.postInsertListeners.add(new Listener<MailingListImpl>(){ + public void event(final MailingListImpl mailingList) { + mailingList.siteKey.getDb().runAfterCommit(new Runnable(){public void run(){ + update(mailingList.getForum()); + }}); + } + }); + MailingListImpl.preUpdateListeners.add(new Listener<MailingListImpl>(){ + public void event(MailingListImpl mailingList) { + Set fields = mailingList.getDbRecord().fields().keySet(); + if( CollectionUtils.intersects(fields,mailingListDbFields) ) { + final long nodeId = mailingList.getForum().getId(); + final SiteKey siteKey = mailingList.siteKey; + siteKey.getDb().runAfterCommit(new Runnable() { + public void run() { + NodeImpl node = NodeImpl.getNode(siteKey,nodeId); + update(node); + } + }); + } + } + }); + } + + static void staleNode(NodeImpl node) throws IOException { + if( node==null ) + return; + logger.debug("staleNode update"); + updateNodes( node.getSiteImpl(), descendants(node) ); + logger.debug("staleNode done"); + } + + static void nop() {} + + public static void addDocumentListener(DocumentListener documentListener) { + documentListeners.add(documentListener); + } + + static Document document(Node node) { + Document doc = new Document(); + doc.add( new Field(NODE_ID_FLD, Long.toString(node.getId()), Field.Store.YES, Field.Index.NOT_ANALYZED_NO_NORMS) ); + doc.add( new Field(KIND_FLD, node.getKind().toString(), Field.Store.NO, Field.Index.NOT_ANALYZED_NO_NORMS) ); + String subject = node.getSubject(); + Field subjectFld = new Field(SUBJECT_FLD, subject, Field.Store.NO, Field.Index.ANALYZED); + subjectFld.setBoost(2.0f); + doc.add(subjectFld); + try { + String message = MessageUtils.htmlToSearchText(node.getMessage().parse()); + doc.add( new Field(MESSAGE_FLD, message, Field.Store.NO, Field.Index.ANALYZED) ); + } catch(RuntimeException e) { + logger.error("nodeId="+node.getId(),e); + } + + for( Node f : node.getAncestors() ) { + doc.add( new Field(ANCESTORS_FLD, Long.toString(f.getId()), Field.Store.NO, Field.Index.NOT_ANALYZED_NO_NORMS) ); + } + Node parent = node.getParent(); + if (parent != null) + doc.add(new Field(PARENT_ID_FLD, Long.toString(parent.getId()), Field.Store.YES, Field.Index.NOT_ANALYZED_NO_NORMS)); + + int date = (int)(-node.getWhenCreated().getTime()/1000); + doc.add( new NumericField(DATE_FLD).setIntValue(date) ); + int rangeSearchDate = formatRangeSearchDate(node.getWhenCreated()); + doc.add( new NumericField(RANGE_SEARCH_DATE_FLD).setIntValue(rangeSearchDate) ); + String day = formatDay(node.getWhenCreated()); + doc.add( new Field(DAY_FLD, day, Field.Store.NO, Field.Index.NOT_ANALYZED_NO_NORMS) ); + + Person owner = node.getOwner(); + String userId = owner.getSearchId(); + doc.add( new Field(USER_ID_FLD, userId, Field.Store.NO, Field.Index.NOT_ANALYZED_NO_NORMS) ); + String author = owner.getName(); + doc.add( new Field(AUTHOR_FLD, author, Field.Store.NO, Field.Index.ANALYZED) ); + doc.add( new Field(PRIVATE_NODE_FLD, formatPrivateNode(node), Field.Store.NO, Field.Index.NOT_ANALYZED_NO_NORMS)); + MailingList mailingList = node.getMailingList(); + if (mailingList != null) { // only for forums + Field listAddrFld = new Field(MAILING_LIST_FLD, mailingList.getListAddress().toLowerCase(), Field.Store.NO, Field.Index.NOT_ANALYZED_NO_NORMS); + doc.add( listAddrFld ); + } + for( DocumentListener documentListener : documentListeners ) { + documentListener.event(node,doc); + } + return doc; + } + + private static final String[] nodeDbFields = + {"subject", "when_created", "msg_fmt", "parent_id", "is_app", "owner_id", "cookie", "anonymous_name"}; + + private static final String[] mailingListDbFields = {"mailing_list"}; + + + public static void updateRecursively(Node node) { + update(node); + for (Node n : node.getChildren()) { + updateRecursively(n); + } + } + + + + + + + + // from SearchServer + + static NodeImpl node(SiteImpl site,Document doc) { + long nodeId = getNodeId(doc); + NodeImpl node = NodeImpl.getNode(site.siteKey,nodeId); + if (node==null) + logger.error("invalid node_id in lucene index: "+nodeId); + return node; + } + + private static final IndexCache.Builder<Long> builder = new IndexCache.Builder<Long>() { + + public void build(Long siteId) throws SQLException, IOException { + SiteKey siteKey = SiteKey.getInstance(siteId); + Connection con = siteKey.getDb().getConnection(); + long[] nodeIds; + { + Statement stmt = con.createStatement(); + ResultSet rs = stmt.executeQuery( + "select count(*) as n from node" + ); + rs.next(); + nodeIds = new long[rs.getInt("n")]; + rs.close(); + stmt.close(); + } + { + PreparedStatement stmt = con.prepareStatement( + "select node_id from node order by node_id limit ?" + ); + stmt.setInt(1,nodeIds.length); + ResultSet rs = stmt.executeQuery(); + for( int i=0; rs.next(); i++ ) { + nodeIds[i] = rs.getLong("node_id"); + } + rs.close(); + stmt.close(); + } + logger.error("Lucene started - site_id = " + siteId + " / " + nodeIds.length + " nodes"); + IndexWriter indexWriter = nodeIndex.openIndexWriter(siteId); + int count = 0; + int lastPercent = 0; + try { + for( long nodeId : nodeIds ) { + Node node = NodeImpl.getNode(siteKey,nodeId); + if( node != null ) { + Document doc = document(node); + indexWriter.updateDocument( new Term(NODE_ID_FLD,doc.get(NODE_ID_FLD)), doc ); + } + count++; + int percent = Math.round(100f * count / (float) nodeIds.length); + if (percent > lastPercent) { + logger.error("Lucene build " + percent + "% completed"); + lastPercent = percent; + } + } + } finally { + indexWriter.close(); + } + con.close(); + } + + public boolean exists(String keyString) { + long id; + try { + id = Long.parseLong(keyString); + } catch(NumberFormatException e) { + return false; + } + return SiteKey.getInstance(id).siteGlobal() != null; + } + }; + + private static final IndexCache<Long> nodeIndex; + static { + logger.info("Starting search server"); + Init.luceneStarted = true; + String localDir = (String)Init.get("local_dir"); + String luceneDir = localDir + "lucene/"; + File dirFile = new File(luceneDir); + nodeIndex = new IndexCache<Long>(dirFile,analyzer,nodeIndexVersion,builder); + } + + private static void updateNodes(final SiteImpl site,Query query) { + try { + final LuceneSearcher searcher = newSearcher(site); + try { + searcher.search(query,new HitCollector() { + protected void process(Document doc) { + Node node = getNode(site,doc); + if( node != null ) + update(node); + } + }); + } finally { + searcher.close(); + } + } catch(IOException e) { + throw new RuntimeException(e); + } + } + + + public static boolean isReady(Site site) { + return nodeIndex.isReady(site.getId()); + } + + public static void rebuild(Site site) throws IOException { + nodeIndex.rebuild(site.getId()); + } + + static synchronized void shutdown() { + nodeIndex.shutdown(); + } + + + + + + private static final long tenMinutes = 1000L*60*10; + + static int formatRangeSearchDate(Date date) { + return (int)(date.getTime()/tenMinutes); + } + + + private static final DateFormat dayFormat = new SimpleDateFormat("yyyyMMdd"); + + static String formatDay(Date date) { + synchronized(dayFormat) { + return dayFormat.format(date); + } + } + + static String formatPrivateNode(Node node) { + Node privateNode = Permissions.getPrivateNodeForSearch(node); + return privateNode==null ? "none" : Long.toString(privateNode.getId()); + } + + + public static Filter and(Filter f1,Filter f2) { + BooleanFilter f = new BooleanFilter(); + f.add(new FilterClause(f1,BooleanClause.Occur.MUST)); + f.add(new FilterClause(f2,BooleanClause.Occur.MUST)); + return f; + } + + public static Filter getRangeFilter(Date from, Date to) { + Integer lowerDateTerm = (from==null)?null:formatRangeSearchDate(from); + Integer upperDateTerm = (to==null)?null:formatRangeSearchDate(to); + return NumericRangeFilter.newIntRange(RANGE_SEARCH_DATE_FLD, lowerDateTerm, upperDateTerm, true,true); + } + + + private static final int maxCachedFilters = Init.get("maxCachedFilters", 20); + + private static Map<Filter,CachingWrapperFilter> filterCache = new LinkedHashMap<Filter,CachingWrapperFilter>() { + protected boolean removeEldestEntry(Map.Entry eldest) { + return size() > maxCachedFilters; + } + }; + + public static synchronized CachingWrapperFilter getCachedFilter(Filter filter) { + CachingWrapperFilter f = filterCache.get(filter); + if( f == null ) { + f = new CachingWrapperFilter(filter); + filterCache.put(filter,f); + } + return f; + } + + + static Query descendants(Node node) { + return descendants(node.getId()); + } + + private static Query descendants(long nodeId) { + return new TermQuery(new Term(ANCESTORS_FLD,Long.toString(nodeId))); + } + + static Query children(Node node) { + return new TermQuery(new Term(PARENT_ID_FLD,Long.toString(node.getId()))); + } + + static Query node(Node node) { + return node(node.getId()); + } + + static Query node(long nodeId) { + return new TermQuery(new Term(NODE_ID_FLD,Long.toString(nodeId))); + } + + static Query day(Date date) { + return new TermQuery(new Term(DAY_FLD,formatDay(date))); + } + + + private static final Directory helpDir = new RAMDirectory(); + private static IndexReader helpIndexReader; + + private static final String[] helpSearchFields = new String[] { + "answer", "question" + }; + + public static Help[] searchHelp(String line) throws ParseException { + try { + Query query = NodeSearcher.parse(line,helpSearchFields); + Searcher searcher = new IndexSearcher(helpIndexReader); + try { + TopDocs hits = searcher.search(query,helpIndexReader.numDocs()); + Help[] helps = new Help[hits.scoreDocs.length]; + for( int i=0; i<helps.length; i++ ) { + helps[i] = Help.getHelp(Integer.parseInt(searcher.doc(hits.scoreDocs[i].doc).get("id"))); + } + return helps; + } catch (BooleanQuery.TooManyClauses e) { + throw new RuntimeException("Your search will give too many matches."); + } finally { + searcher.close(); + } + } catch (IOException e) { + throw new RuntimeException(e); + } + } + + public static void addHelp(final Collection<Help> helps) { + try { + IndexWriter writer = new IndexWriter(helpDir,analyzer,true,IndexWriter.MaxFieldLength.LIMITED); + for( Help help : helps ) { + writer.addDocument(document(help)); + } + writer.close(); + helpIndexReader = IndexReader.open(helpDir,true); + } catch (IOException e) { + throw new RuntimeException(e); + } + } + + private static Document document(Help help) { + Document doc = new Document(); + String id = Integer.toString(help.id); + doc.add( new Field("id", id, Field.Store.YES, Field.Index.NOT_ANALYZED_NO_NORMS)); + Field answer = new Field("answer", help.answer(), Field.Store.NO, Field.Index.ANALYZED); + doc.add(answer); + Field question = new Field("question", help.question, Field.Store.NO, Field.Index.ANALYZED); + doc.add(question); + return doc; + } + +}