diff src/nabble/model/Lucene.java @ 0:7ecd1a4ef557

add content
author Franklin Schmidt <fschmidt@gmail.com>
date Thu, 21 Mar 2019 19:15:52 -0600
parents
children abe0694e9849
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/nabble/model/Lucene.java	Thu Mar 21 19:15:52 2019 -0600
@@ -0,0 +1,583 @@
+/*
+
+Copyright (C) 2004  Franklin Schmidt <frank@gustos.com>
+
+*/
+
+package nabble.model;
+
+import fschmidt.db.Listener;
+import fschmidt.util.java.CollectionUtils;
+import fschmidt.util.mail.MailEncodingException;
+import nabble.model.lucene.HitCollector;
+import nabble.model.lucene.IndexCache;
+import nabble.model.lucene.LuceneSearcher;
+import nabble.view.lib.Permissions;
+import nabble.view.lib.help.Help;
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.snowball.SnowballAnalyzer;
+import org.apache.lucene.document.Document;
+import org.apache.lucene.document.Field;
+import org.apache.lucene.document.NumericField;
+import org.apache.lucene.index.IndexReader;
+import org.apache.lucene.index.IndexWriter;
+import org.apache.lucene.index.Term;
+import org.apache.lucene.queryParser.ParseException;
+import org.apache.lucene.search.BooleanClause;
+import org.apache.lucene.search.BooleanFilter;
+import org.apache.lucene.search.BooleanQuery;
+import org.apache.lucene.search.CachingWrapperFilter;
+import org.apache.lucene.search.Filter;
+import org.apache.lucene.search.FilterClause;
+import org.apache.lucene.search.IndexSearcher;
+import org.apache.lucene.search.NumericRangeFilter;
+import org.apache.lucene.search.Query;
+import org.apache.lucene.search.Searcher;
+import org.apache.lucene.search.TermQuery;
+import org.apache.lucene.search.TopDocs;
+import org.apache.lucene.store.Directory;
+import org.apache.lucene.store.RAMDirectory;
+import org.apache.lucene.util.Version;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.io.File;
+import java.io.IOException;
+import java.sql.Connection;
+import java.sql.PreparedStatement;
+import java.sql.ResultSet;
+import java.sql.SQLException;
+import java.sql.Statement;
+import java.text.DateFormat;
+import java.text.SimpleDateFormat;
+import java.util.ArrayList;
+import java.util.Collection;
+import java.util.Date;
+import java.util.LinkedHashMap;
+import java.util.List;
+import java.util.Map;
+import java.util.Set;
+
+
+public final class Lucene {
+	private static final Logger logger = LoggerFactory.getLogger(Lucene.class);
+
+	/**
+	 * Callback invoked by {@link #document(Node)} for each node document it builds,
+	 * once the standard fields have been added.
+	 */
+	public interface DocumentListener {
+		/**
+		 * @param node the node the document was built from
+		 * @param doc the Lucene document built for that node
+		 */
+		void event(Node node, Document doc);
+	}
+
+	private static final int nodeIndexVersion = 3;  // handed to the IndexCache constructor for the node index
+
+	private static final String NODE_ID_FLD = "nodeId";  // stored, un-analyzed node id; key for update/delete
+	static final String KIND_FLD = "kind";  // node.getKind().toString(), un-analyzed
+	static final String SUBJECT_FLD = "subject";  // analyzed subject, boosted by 2.0
+	static final String MESSAGE_FLD = "message";  // analyzed search text of the message
+	static final String ANCESTORS_FLD = "ancestors";  // one un-analyzed value per ancestor id
+	static final String PARENT_ID_FLD = "parentId";  // stored parent id; only present when the node has a parent
+	static final String DATE_FLD = "date";  // numeric: negated creation time in seconds
+	private static final String RANGE_SEARCH_DATE_FLD = "rangeSearchDate";  // numeric: creation time in ten-minute units
+	private static final String DAY_FLD = "day";  // creation day formatted as yyyyMMdd
+	static final String USER_ID_FLD = "userId";  // owner.getSearchId(), un-analyzed
+	static final String AUTHOR_FLD = "author";  // analyzed owner name
+	static final String PRIVATE_NODE_FLD = "privateNode";  // id of the private node, or "none"
+	static final String MAILING_LIST_FLD = "mailingList";  // lower-cased list address; only when the node has a mailing list
+
+	static final Analyzer analyzer = new SnowballAnalyzer(Version.LUCENE_CURRENT,"English");  // shared by the node index and the help index
+
+	private static final List<DocumentListener> documentListeners = new ArrayList<DocumentListener>();  // invoked at the end of document(Node)
+
+	private Lucene() {}  // never instantiated; every member is static
+
+	static LuceneSearcher newSearcher(Site site) throws IOException {
+		return nodeIndex.openSearcher(site.getId());
+	}
+
+	static long getNodeId(Document doc) {
+		return Long.parseLong(doc.get(NODE_ID_FLD));
+	}
+
+	static NodeImpl getNode(SiteImpl site, LuceneSearcher searcher, int docId) throws IOException {
+		return getNode( site, searcher.doc(docId) );
+	}
+
+	static NodeImpl getNode(SiteImpl site,Document doc) {
+		long nodeId = getNodeId(doc);
+		NodeImpl node = NodeImpl.getNode(site.siteKey,nodeId);
+		if( node==null ) {
+			logger.error("missing node "+nodeId+", removing from lucene");
+			removeNode(site,nodeId);
+		}
+		return node;
+	}
+
+	private static void add(final Node node) {
+		Document doc = document(node);
+		try {
+			IndexWriter indexWriter = nodeIndex.openIndexWriter(node.getSite().getId());
+			try {
+				indexWriter.addDocument(doc);
+			} finally {
+				indexWriter.close();
+			}
+		} catch(IOException e) {
+			throw new RuntimeException(e);
+		}
+	}
+/*
+	private static void removeSite(long siteId) {
+		try {
+			nodeIndex.delete(siteId);
+		} catch(IOException e) {
+			throw new RuntimeException(e);
+		}
+	}
+*/
+	private static void removeNode(Site site,long nodeId) {
+		Term term = new Term(NODE_ID_FLD,Long.toString(nodeId));
+		try {
+			IndexWriter indexWriter = nodeIndex.openIndexWriter(site.getId());
+			try {
+				indexWriter.deleteDocuments(term);
+			} finally {
+				indexWriter.close();
+			}
+		} catch(IOException e) {
+			throw new RuntimeException(e);
+		}
+	}
+
+	/**
+	 * Re-indexes a node: its document is rebuilt and replaces whatever document
+	 * currently carries the same node id.  If no document is produced the node is
+	 * removed from the index instead.
+	 * An IOException from the index is rethrown wrapped in a RuntimeException.
+	 */
+	public static void update(final Node node) {
+		try {
+			final Document freshDoc = document(node);
+			if( freshDoc == null ) {
+				removeNode(node.getSite(),node.getId());
+				return;
+			}
+			final IndexWriter writer = nodeIndex.openIndexWriter(node.getSite().getId());
+			try {
+				final Term idTerm = new Term(NODE_ID_FLD,freshDoc.get(NODE_ID_FLD));
+				writer.updateDocument( idTerm, freshDoc );
+			} finally {
+				writer.close();
+			}
+		} catch (IOException ioe) {
+			throw new RuntimeException(ioe);
+		}
+	}
+
+	static void updateNode(SiteImpl site,long nodeId) {
+		Node node = NodeImpl.getNode(site.siteKey,nodeId);
+		if( node == null ) {
+			removeNode(site,nodeId);
+		} else {
+			update(node);
+		}
+	}
+
+	// Registers the database listeners that keep the node index in step with
+	// NodeImpl and MailingListImpl changes.  Insert and update handlers defer
+	// their index work with runAfterCommit; delete handlers act immediately.
+	static {
+/*
+		SiteImpl.table.getPostDeleteListeners().add(new Listener<SiteImpl>(){
+			public void event(SiteImpl site) {
+				removeSite(site.getId());
+			}
+		});
+*/
+		// Node deleted: drop every document that lists the node among its ancestors.
+		NodeImpl.postDeleteListeners.add(new Listener<NodeImpl>(){
+			public void event(NodeImpl node) {
+				// remove descendants
+				Term term = new Term(ANCESTORS_FLD,Long.toString(node.getId()));
+				try {
+					IndexWriter indexWriter = nodeIndex.openIndexWriter(node.siteKey.getId());
+					try {
+						indexWriter.deleteDocuments(term);
+					} finally {
+						indexWriter.close();
+					}
+				} catch(IOException e) {
+					throw new RuntimeException(e);
+				}
+			}
+		});
+		// Node inserted: index it after commit; a mail encoding problem is only logged.
+		NodeImpl.postInsertListeners.add(new Listener<NodeImpl>(){
+			public void event(final NodeImpl node) {
+				node.siteKey.getDb().runAfterCommit(new Runnable(){public void run(){
+					try {
+						add(node);
+					} catch(MailEncodingException e) {
+						logger.warn(node.toString(),e);
+					}
+				}});
+			}
+		});
+		// Node updated: re-index after commit, but only when an indexed field changed.
+		NodeImpl.preUpdateListeners.add(new Listener<NodeImpl>(){
+			public void event(NodeImpl node) {
+				Set fields = node.getDbRecord().fields().keySet();
+				if( CollectionUtils.intersects(fields,nodeDbFields) ) {
+					final long nodeId = node.getId();
+					final SiteKey siteKey = node.siteKey;
+					siteKey.getDb().runAfterCommit(new Runnable() {
+						public void run() {
+							// reload: the node is looked up again once the commit is done
+							NodeImpl node = NodeImpl.getNode(siteKey,nodeId);
+							if (node != null) update(node);
+						}
+					});
+				}
+			}
+		});
+		// Mailing list deleted: re-index its forum immediately.
+		MailingListImpl.postDeleteListeners.add(new Listener<MailingListImpl>(){
+			public void event(MailingListImpl mailingList) {
+				update(mailingList.getForum());
+			}
+		});
+		// Mailing list inserted: re-index its forum after commit.
+		MailingListImpl.postInsertListeners.add(new Listener<MailingListImpl>(){
+			public void event(final MailingListImpl mailingList) {
+				mailingList.siteKey.getDb().runAfterCommit(new Runnable(){public void run(){
+					update(mailingList.getForum());
+				}});
+			}
+		});
+		// Mailing list updated: re-index its forum after commit when an indexed field changed.
+		MailingListImpl.preUpdateListeners.add(new Listener<MailingListImpl>(){
+			public void event(MailingListImpl mailingList) {
+				Set fields = mailingList.getDbRecord().fields().keySet();
+				if( CollectionUtils.intersects(fields,mailingListDbFields) ) {
+					final long nodeId = mailingList.getForum().getId();
+					final SiteKey siteKey = mailingList.siteKey;
+					siteKey.getDb().runAfterCommit(new Runnable() {
+						public void run() {
+							NodeImpl node = NodeImpl.getNode(siteKey,nodeId);
+							// The forum may no longer exist by the time the commit has run;
+							// update(null) would throw inside document(), so skip it just
+							// like the NodeImpl pre-update listener does.
+							if (node != null) update(node);
+						}
+					});
+				}
+			}
+		});
+	}
+
+	static void staleNode(NodeImpl node) throws IOException {
+		if( node==null )
+			return;
+		logger.debug("staleNode update");
+		updateNodes( node.getSiteImpl(), descendants(node) );
+		logger.debug("staleNode done");
+	}
+
+	/**
+	 * Does nothing.  Presumably called only so that this class gets loaded and
+	 * its static initializers run (TODO confirm against callers).
+	 */
+	static void nop() {
+		// intentionally empty
+	}
+
+	/**
+	 * Registers a listener that is called for every node document built from now on.
+	 *
+	 * @param documentListener the listener to append to the listener list
+	 */
+	public static void addDocumentListener(DocumentListener documentListener) {
+		final List<DocumentListener> registered = documentListeners;
+		registered.add(documentListener);
+	}
+
+	static Document document(Node node) {
+		Document doc = new Document();
+		doc.add( new Field(NODE_ID_FLD, Long.toString(node.getId()), Field.Store.YES, Field.Index.NOT_ANALYZED_NO_NORMS) );
+		doc.add( new Field(KIND_FLD, node.getKind().toString(), Field.Store.NO, Field.Index.NOT_ANALYZED_NO_NORMS) );
+		String subject = node.getSubject();
+		Field subjectFld = new Field(SUBJECT_FLD, subject, Field.Store.NO, Field.Index.ANALYZED);
+		subjectFld.setBoost(2.0f);
+		doc.add(subjectFld);
+		try {
+			String message = MessageUtils.htmlToSearchText(node.getMessage().parse());
+			doc.add( new Field(MESSAGE_FLD, message, Field.Store.NO, Field.Index.ANALYZED) );
+		} catch(RuntimeException e) {
+			logger.error("nodeId="+node.getId(),e);
+		}
+
+		for( Node f : node.getAncestors() ) {
+			doc.add( new Field(ANCESTORS_FLD, Long.toString(f.getId()), Field.Store.NO, Field.Index.NOT_ANALYZED_NO_NORMS) );
+		}
+		Node parent = node.getParent();
+		if (parent != null)
+			doc.add(new Field(PARENT_ID_FLD, Long.toString(parent.getId()), Field.Store.YES, Field.Index.NOT_ANALYZED_NO_NORMS));
+
+		int date = (int)(-node.getWhenCreated().getTime()/1000);
+		doc.add( new NumericField(DATE_FLD).setIntValue(date) );
+		int rangeSearchDate = formatRangeSearchDate(node.getWhenCreated());
+		doc.add( new NumericField(RANGE_SEARCH_DATE_FLD).setIntValue(rangeSearchDate) );
+		String day = formatDay(node.getWhenCreated());
+		doc.add( new Field(DAY_FLD, day, Field.Store.NO, Field.Index.NOT_ANALYZED_NO_NORMS) );
+
+		Person owner = node.getOwner();
+		String userId = owner.getSearchId();
+		doc.add( new Field(USER_ID_FLD, userId, Field.Store.NO, Field.Index.NOT_ANALYZED_NO_NORMS) );
+		String author = owner.getName();
+		doc.add( new Field(AUTHOR_FLD, author, Field.Store.NO, Field.Index.ANALYZED) );
+		doc.add( new Field(PRIVATE_NODE_FLD, formatPrivateNode(node), Field.Store.NO, Field.Index.NOT_ANALYZED_NO_NORMS));
+		MailingList mailingList = node.getMailingList();
+		if (mailingList != null) { // only for forums
+			Field listAddrFld = new Field(MAILING_LIST_FLD, mailingList.getListAddress().toLowerCase(), Field.Store.NO, Field.Index.NOT_ANALYZED_NO_NORMS);
+			doc.add( listAddrFld );
+		}
+		for( DocumentListener documentListener : documentListeners ) {
+			documentListener.event(node,doc);
+		}
+		return doc;
+	}
+
+	private static final String[] nodeDbFields =  // DB record fields of a node whose change makes the pre-update listener re-index it
+		{"subject", "when_created", "msg_fmt", "parent_id", "is_app", "owner_id", "cookie", "anonymous_name"};
+
+	private static final String[] mailingListDbFields = {"mailing_list"};  // DB record fields of a mailing list whose change re-indexes its forum
+
+
+	/**
+	 * Re-indexes a node and then, depth first, all of its children and their descendants.
+	 *
+	 * @param node the root of the subtree to re-index
+	 */
+	public static void updateRecursively(Node node) {
+		update(node);
+		for( Node child : node.getChildren() )
+			updateRecursively(child);
+	}
+
+
+
+
+
+
+
+	// from SearchServer
+
+	static NodeImpl node(SiteImpl site,Document doc) {
+		long nodeId = getNodeId(doc);
+		NodeImpl node = NodeImpl.getNode(site.siteKey,nodeId);
+		if (node==null)
+			logger.error("invalid node_id in lucene index: "+nodeId);
+		return node;
+	}
+
+	private static final IndexCache.Builder<Long> builder = new IndexCache.Builder<Long>() {
+
+		public void build(Long siteId) throws SQLException, IOException {
+			SiteKey siteKey = SiteKey.getInstance(siteId);
+			Connection con = siteKey.getDb().getConnection();
+			long[] nodeIds;
+			{
+				Statement stmt = con.createStatement();
+				ResultSet rs = stmt.executeQuery(
+					"select count(*) as n from node"
+				);
+				rs.next();
+				nodeIds = new long[rs.getInt("n")];
+				rs.close();
+				stmt.close();
+			}
+			{
+				PreparedStatement stmt = con.prepareStatement(
+					"select node_id from node order by node_id limit ?"
+				);
+				stmt.setInt(1,nodeIds.length);
+				ResultSet rs = stmt.executeQuery();
+				for( int i=0; rs.next(); i++ ) {
+					nodeIds[i] = rs.getLong("node_id");
+				}
+				rs.close();
+				stmt.close();
+			}
+			logger.error("Lucene started - site_id = " + siteId + " / " + nodeIds.length + " nodes");
+			IndexWriter indexWriter = nodeIndex.openIndexWriter(siteId);
+			int count = 0;
+			int lastPercent = 0;
+			try {
+				for( long nodeId : nodeIds ) {
+					Node node = NodeImpl.getNode(siteKey,nodeId);
+					if( node != null ) {
+						Document doc = document(node);
+						indexWriter.updateDocument( new Term(NODE_ID_FLD,doc.get(NODE_ID_FLD)), doc );
+					}
+					count++;
+					int percent = Math.round(100f * count / (float) nodeIds.length);
+					if (percent > lastPercent) {
+						logger.error("Lucene build " + percent + "% completed");
+						lastPercent = percent;
+					}
+				}
+			} finally {
+				indexWriter.close();
+			}
+			con.close();
+		}
+
+		public boolean exists(String keyString) {
+			long id;
+			try {
+				id = Long.parseLong(keyString);
+			} catch(NumberFormatException e) {
+				return false;
+			}
+			return SiteKey.getInstance(id).siteGlobal() != null;
+		}
+	};
+
+	/** Index cache for the node index, keyed by site id; its files live under "lucene/" in the configured local directory. */
+	private static final IndexCache<Long> nodeIndex;
+	static {
+		logger.info("Starting search server");
+		Init.luceneStarted = true;
+		final String localDir = (String)Init.get("local_dir");
+		// localDir is concatenated as-is, so it is expected to end with a path separator
+		final File indexRoot = new File(localDir + "lucene/");
+		nodeIndex = new IndexCache<Long>(indexRoot,analyzer,nodeIndexVersion,builder);
+	}
+
+	private static void updateNodes(final SiteImpl site,Query query) {
+		try {
+			final LuceneSearcher searcher = newSearcher(site);
+			try {
+				searcher.search(query,new HitCollector() {
+					protected void process(Document doc) {
+						Node node = getNode(site,doc);
+						if( node != null )
+							update(node);
+					}
+				});
+			} finally {
+				searcher.close();
+			}
+		} catch(IOException e) {
+			throw new RuntimeException(e);
+		}
+	}
+
+
+	/**
+	 * @param site the site to ask about
+	 * @return whatever the index cache reports as the ready state of that site's index
+	 */
+	public static boolean isReady(Site site) {
+		final long siteId = site.getId();
+		return nodeIndex.isReady(siteId);
+	}
+
+	/**
+	 * Asks the index cache to rebuild the node index of the given site.
+	 *
+	 * @throws IOException if the index cache reports an I/O failure
+	 */
+	public static void rebuild(Site site) throws IOException {
+		final long siteId = site.getId();
+		nodeIndex.rebuild(siteId);
+	}
+
+	static synchronized void shutdown() {
+		nodeIndex.shutdown();
+	}
+
+
+
+
+
+	/** Number of milliseconds in ten minutes. */
+	private static final long tenMinutes = 10L*60L*1000L;
+
+	/**
+	 * Converts a date to the value indexed in the range-search date field.
+	 *
+	 * @param date the date to convert
+	 * @return the date's epoch time expressed in whole ten-minute units
+	 */
+	static int formatRangeSearchDate(Date date) {
+		final long tenMinuteUnits = date.getTime() / tenMinutes;
+		return (int) tenMinuteUnits;
+	}
+
+
+	/** Formatter for the day field; every use is synchronized on the formatter itself. */
+	private static final DateFormat dayFormat = new SimpleDateFormat("yyyyMMdd");
+
+	/**
+	 * Formats a date as the value indexed in the day field.
+	 *
+	 * @param date the date to format
+	 * @return the date rendered with the pattern yyyyMMdd
+	 */
+	static String formatDay(Date date) {
+		final String formatted;
+		synchronized(dayFormat) {
+			formatted = dayFormat.format(date);
+		}
+		return formatted;
+	}
+
+	static String formatPrivateNode(Node node) {
+		Node privateNode = Permissions.getPrivateNodeForSearch(node);
+		return privateNode==null ? "none" : Long.toString(privateNode.getId());
+	}
+
+
+	/**
+	 * Combines two filters into one in which both are required clauses.
+	 *
+	 * @param f1 first required filter
+	 * @param f2 second required filter
+	 * @return a BooleanFilter holding both as MUST clauses, in that order
+	 */
+	public static Filter and(Filter f1,Filter f2) {
+		final BooleanFilter both = new BooleanFilter();
+		for( Filter required : new Filter[]{f1,f2} ) {
+			both.add(new FilterClause(required,BooleanClause.Occur.MUST));
+		}
+		return both;
+	}
+
+	/**
+	 * Builds a filter on the range-search date field with both ends inclusive.
+	 *
+	 * @param from lower bound, or null to pass a null lower bound
+	 * @param to upper bound, or null to pass a null upper bound
+	 * @return a numeric range filter over the bounds converted to ten-minute units
+	 */
+	public static Filter getRangeFilter(Date from, Date to) {
+		Integer lowerBound = null;
+		if( from != null ) {
+			lowerBound = formatRangeSearchDate(from);
+		}
+		Integer upperBound = null;
+		if( to != null ) {
+			upperBound = formatRangeSearchDate(to);
+		}
+		return NumericRangeFilter.newIntRange(RANGE_SEARCH_DATE_FLD, lowerBound, upperBound, true,true);
+	}
+
+
+	private static final int maxCachedFilters = Init.get("maxCachedFilters", 20);
+
+	private static Map<Filter,CachingWrapperFilter> filterCache = new LinkedHashMap<Filter,CachingWrapperFilter>() {
+	     protected boolean removeEldestEntry(Map.Entry eldest) {
+	        return size() > maxCachedFilters;
+	     }
+	};
+
+	public static synchronized CachingWrapperFilter getCachedFilter(Filter filter) {
+		CachingWrapperFilter f = filterCache.get(filter);
+		if( f == null ) {
+			f = new CachingWrapperFilter(filter);
+			filterCache.put(filter,f);
+		}
+		return f;
+	}
+
+
+	static Query descendants(Node node) {
+		return descendants(node.getId());
+	}
+
+	private static Query descendants(long nodeId) {
+		return new TermQuery(new Term(ANCESTORS_FLD,Long.toString(nodeId)));
+	}
+
+	static Query children(Node node) {
+		return new TermQuery(new Term(PARENT_ID_FLD,Long.toString(node.getId())));
+	}
+
+	static Query node(Node node) {
+		return node(node.getId());
+	}
+
+	static Query node(long nodeId) {
+		return new TermQuery(new Term(NODE_ID_FLD,Long.toString(nodeId)));
+	}
+
+	static Query day(Date date) {
+		return new TermQuery(new Term(DAY_FLD,formatDay(date)));
+	}
+
+
+	/** In-memory directory holding the help index; rewritten by every addHelp call. */
+	private static final Directory helpDir = new RAMDirectory();
+	/** Reader over the help index; null until addHelp has been called. */
+	private static IndexReader helpIndexReader;
+
+	/** Help document fields that a help search looks in. */
+	private static final String[] helpSearchFields = new String[] {
+		"answer", "question"
+	};
+
+	/**
+	 * Searches the help index.
+	 *
+	 * @param line the search text, parsed by NodeSearcher.parse over the help search fields
+	 * @return the matching help entries in hit order; empty when no help has been
+	 *         indexed yet or the help index contains no documents
+	 * @throws ParseException if the search text cannot be parsed
+	 * @throws RuntimeException if the query expands to too many clauses, or wrapping an IOException
+	 */
+	public static Help[] searchHelp(String line) throws ParseException {
+		try {
+			// parse first so that an invalid query is reported even when the index is empty
+			Query query = NodeSearcher.parse(line,helpSearchFields);
+			// read the field once so the hit limit and the searcher use the same reader
+			IndexReader reader = helpIndexReader;
+			if( reader == null )
+				return new Help[0];
+			int maxHits = reader.numDocs();
+			// Lucene rejects a hit limit of zero, so an empty index is answered directly
+			if( maxHits == 0 )
+				return new Help[0];
+			Searcher searcher = new IndexSearcher(reader);
+			try {
+				TopDocs hits = searcher.search(query,maxHits);
+				Help[] helps = new Help[hits.scoreDocs.length];
+				for( int i=0; i<helps.length; i++ ) {
+					helps[i] = Help.getHelp(Integer.parseInt(searcher.doc(hits.scoreDocs[i].doc).get("id")));
+				}
+				return helps;
+			} catch (BooleanQuery.TooManyClauses e) {
+				// same message as before, now with the cause attached
+				throw new RuntimeException("Your search will give too many matches.", e);
+			} finally {
+				searcher.close();
+			}
+		} catch (IOException e) {
+			throw new RuntimeException(e);
+		}
+	}
+
+	/**
+	 * Rebuilds the help index from the given entries and opens a fresh read-only
+	 * reader on it.  The writer is created with create=true, so earlier content
+	 * is replaced.
+	 *
+	 * @param helps the help entries to index
+	 * @throws RuntimeException wrapping any IOException from the index
+	 */
+	public static void addHelp(final Collection<Help> helps) {
+		try {
+			IndexWriter writer = new IndexWriter(helpDir,analyzer,true,IndexWriter.MaxFieldLength.LIMITED);
+			try {
+				for( Help help : helps ) {
+					writer.addDocument(document(help));
+				}
+			} finally {
+				// close even on failure; an unclosed writer keeps the write lock on
+				// helpDir and would make every later addHelp call fail
+				writer.close();
+			}
+			// NOTE(review): the previous reader, if any, is replaced without being closed
+			helpIndexReader = IndexReader.open(helpDir,true);
+		} catch (IOException e) {
+			throw new RuntimeException(e);
+		}
+	}
+
+	private static Document document(Help help) {
+		Document doc = new Document();
+		String id = Integer.toString(help.id);
+		doc.add( new Field("id", id, Field.Store.YES, Field.Index.NOT_ANALYZED_NO_NORMS));
+		Field answer = new Field("answer", help.answer(), Field.Store.NO, Field.Index.ANALYZED);
+		doc.add(answer);
+		Field question = new Field("question", help.question, Field.Store.NO, Field.Index.ANALYZED);
+		doc.add(question);
+		return doc;
+	}
+
+}