Mercurial Hosting > luan
view src/luan/modules/lucene/LuceneIndex.java @ 1818:d3e6c7f64559 default tip
docs - Math
author | Franklin Schmidt <fschmidt@gmail.com> |
---|---|
date | Sun, 16 Jun 2024 20:26:12 -0600 |
parents | c62324841dfb |
children |
line wrap: on
line source
package luan.modules.lucene; import java.io.File; import java.io.IOException; import java.lang.ref.Reference; import java.lang.ref.WeakReference; import java.util.Map; import java.util.HashMap; import java.util.ArrayList; import java.util.concurrent.atomic.AtomicInteger; import java.util.concurrent.locks.ReentrantLock; import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.core.KeywordAnalyzer; import org.apache.lucene.analysis.core.SimpleAnalyzer; import org.apache.lucene.analysis.en.EnglishAnalyzer; import org.apache.lucene.document.Document; import org.apache.lucene.index.DirectoryReader; import org.apache.lucene.index.Term; import org.apache.lucene.index.SnapshotDeletionPolicy; import org.apache.lucene.index.IndexCommit; import org.apache.lucene.index.AtomicReaderContext; import org.apache.lucene.index.CheckIndex; import org.apache.lucene.store.FSDirectory; import org.apache.lucene.util.Version; import org.apache.lucene.util.BytesRef; import org.apache.lucene.util.NumericUtils; import org.apache.lucene.search.Query; import org.apache.lucene.search.MatchAllDocsQuery; import org.apache.lucene.search.BooleanQuery; import org.apache.lucene.search.BooleanClause; import org.apache.lucene.search.TermQuery; import org.apache.lucene.search.TopDocs; import org.apache.lucene.search.Sort; import org.apache.lucene.search.SortField; import org.apache.lucene.search.IndexSearcher; import org.apache.lucene.search.TotalHitCountCollector; import org.apache.lucene.search.ScoreDoc; import org.apache.lucene.search.Collector; import org.apache.lucene.search.Scorer; import org.apache.lucene.search.Explanation; import org.apache.lucene.search.highlight.Formatter; import org.apache.lucene.search.highlight.Highlighter; import org.apache.lucene.search.highlight.InvalidTokenOffsetsException; import org.apache.lucene.search.highlight.NullFragmenter; import org.apache.lucene.search.highlight.SimpleSpanFragmenter; import org.apache.lucene.search.highlight.QueryScorer; import org.apache.lucene.search.highlight.TokenGroup; import goodjava.lucene.analysis.LowercaseAnalyzer; import goodjava.lucene.queryparser.GoodQueryParser; import goodjava.lucene.queryparser.FieldParser; import goodjava.lucene.queryparser.MultiFieldParser; import goodjava.lucene.queryparser.StringFieldParser; import goodjava.lucene.queryparser.NumberFieldParser; import goodjava.lucene.api.GoodIndexWriter; import goodjava.lucene.api.LuceneIndexWriter; import goodjava.lucene.api.GoodIndexWriterConfig; import goodjava.lucene.api.LuceneUtils; import goodjava.lucene.logging.LoggingIndexWriter; import goodjava.lucene.logging.OpDoer; import goodjava.lucene.backup.BackupIndexWriter; import goodjava.parser.ParseException; import luan.modules.Utils; import luan.Luan; import luan.LuanTable; import luan.LuanFunction; import luan.LuanException; import luan.LuanRuntimeException; import goodjava.logging.Logger; import goodjava.logging.LoggerFactory; public final class LuceneIndex { private static final Logger logger = LoggerFactory.getLogger(LuceneIndex.class); private static Map<String,Reference<LuceneIndex>> indexes = new HashMap<String,Reference<LuceneIndex>>(); public static LuceneIndex getLuceneIndex(Luan luan,File indexDir,LuanTable options) throws LuanException, IOException, ClassNotFoundException { String key = indexDir.getCanonicalPath(); synchronized(indexes) { Reference<LuceneIndex> ref = indexes.get(key); if( ref != null ) { LuceneIndex li = ref.get(); if( li != null ) { Object version = options.get(luan,"version"); if( (version==null || version.equals(li.version)) && li.indexDir.exists() ) return li; li.closeWriter(); } } LuceneIndex li = new LuceneIndex(luan,indexDir,options); indexes.put(key, new WeakReference<LuceneIndex>(li)); return li; } } private static final Version luceneVersion = Version.LUCENE_4_9; public static final StringFieldParser STRING_FIELD_PARSER = new StringFieldParser(new KeywordAnalyzer()); public static final StringFieldParser LOWERCASE_FIELD_PARSER = new StringFieldParser(new LowercaseAnalyzer(luceneVersion)); public static final StringFieldParser ENGLISH_FIELD_PARSER = new StringFieldParser(new EnglishAnalyzer(luceneVersion)); public static final StringFieldParser SIMPLE_FIELD_PARSER = new StringFieldParser(new SimpleAnalyzer(luceneVersion)); private static final SortField ID_SORT = new SortField("id",SortField.Type.LONG); private static final SortField ID_DESC_SORT = new SortField("id",SortField.Type.LONG,true); private final Object version; private final ReentrantLock writeLock = new ReentrantLock(); private final File indexDir; private GoodIndexWriter writer; private DirectoryReader reader; private IndexSearcher searcher; private final ThreadLocal<IndexSearcher> threadLocalSearcher = new ThreadLocal<IndexSearcher>(); private final MultiFieldParser mfp; private final Analyzer analyzer; // ??? private FSDirectory fsDir; private int writeCount; private AtomicInteger writeCounter = new AtomicInteger(); private final GoodIndexWriterConfig config; private boolean wasCreated; private final File logDir; private final long logTime; private final String name; private final String domain; private LuceneIndex(Luan luan,File indexDir,LuanTable options) throws LuanException, IOException, ClassNotFoundException { options = new LuanTable(options); this.version = options.remove("version"); FieldParser defaultFieldParser = (FieldParser)options.remove("default_type"); LuanTable defaultFieldsTbl = Utils.removeTable(options,"default_fields"); String[] defaultFields = defaultFieldsTbl==null ? null : (String[])defaultFieldsTbl.asList().toArray(new String[0]); LuanFunction supplementer = Utils.removeFunction(options,"supplementer"); logDir = (File)options.remove("log_dir"); logTime = (Long)options.remove("log_time"); name = (String)options.remove("name"); Utils.checkEmpty(options); { LuanTable module = (LuanTable)luan.require("luan:http/Http.luan"); String domain = (String)module.get(luan,"domain"); if( domain == null ) domain = "localhost"; this.domain = domain; } mfp = defaultFieldParser==null ? new MultiFieldParser() : new MultiFieldParser(defaultFieldParser,defaultFields); mfp.fields.put( "type", STRING_FIELD_PARSER ); mfp.fields.put( "id", NumberFieldParser.LONG ); this.indexDir = indexDir; Analyzer analyzer = STRING_FIELD_PARSER.analyzer; if( defaultFieldParser instanceof StringFieldParser ) { StringFieldParser sfp = (StringFieldParser)defaultFieldParser; analyzer = sfp.analyzer; } this.analyzer = analyzer; this.config = new SupplementingConfig(luceneVersion,mfp,supplementer); wasCreated = reopen(); } public boolean reopen() throws IOException { fsDir = FSDirectory.open(indexDir); boolean wasCreated = !fsDir.getDirectory().exists(); writer = new LuceneIndexWriter(fsDir,config); try { if( logDir != null ) { if( BackupIndexWriter.backupDomains == null ) { writer = new LoggingIndexWriter((LuceneIndexWriter)writer,logDir,logTime); } else { writer = BackupIndexWriter.newWithRestore((LuceneIndexWriter)writer,logDir,logTime,domain,name); } } reader = DirectoryReader.open(fsDir); searcher = new IndexSearcher(reader); initId(); return wasCreated; } catch(IOException e) { writer.close(); throw e; } } private void wrote() { writeCounter.incrementAndGet(); } public void delete_all() throws IOException { boolean commit = !writeLock.isHeldByCurrentThread(); writeLock.lock(); try { writer.deleteAll(); id = 0; if(commit) writer.commit(); } finally { wrote(); writeLock.unlock(); } } public void delete(String queryStr) throws IOException, ParseException, LuanException { Query query = GoodQueryParser.parseQuery(mfp,queryStr); boolean commit = !writeLock.isHeldByCurrentThread(); writeLock.lock(); try { writer.deleteDocuments(query); if(commit) writer.commit(); } finally { wrote(); writeLock.unlock(); } } public void reindex(String queryStr) throws IOException, ParseException { Query query = GoodQueryParser.parseQuery(mfp,queryStr); boolean commit = !writeLock.isHeldByCurrentThread(); writeLock.lock(); try { writer.reindexDocuments("id",query); if(commit) writer.commit(); } finally { wrote(); writeLock.unlock(); } } public void save( Luan luan, LuanTable doc, LuanTable unstored, Map<String,Float> boosts ) throws LuanException, IOException { Object obj = doc.get(luan,"id"); Long id; try { id = (Long)obj; } catch(ClassCastException e) { throw new LuanException("id should be Long but is "+obj.getClass().getSimpleName()); } boolean commit = !writeLock.isHeldByCurrentThread(); writeLock.lock(); try { if( unstored!=null || boosts!=null ) { if( unstored == null ) throw new LuanException("unstored required with boosts"); if( boosts == null ) throw new LuanException("boosts required with unstored"); if( id != null ) throw new LuanException("update not supported"); if( !(writer instanceof LuceneIndexWriter) ) throw new LuanException("not supported with index logging"); id = ++this.id; doc.put(luan,"id",id); LuceneIndexWriter liw = (LuceneIndexWriter)writer; liw.addDocument( toLucene(doc), toLucene(unstored), boosts ); } else if( id == null ) { id = ++this.id; doc.put(luan,"id",id); writer.addDocument(toLucene(doc)); } else { writer.updateDocument( "id", toLucene(doc) ); } if(commit) writer.commit(); } finally { wrote(); writeLock.unlock(); } } public boolean is_in_transaction() { return writeLock.isHeldByCurrentThread(); } public Object run_in_transaction(Luan luan,LuanFunction fn) throws IOException, LuanException { boolean commit = !writeLock.isHeldByCurrentThread(); writeLock.lock(); boolean ok = false; try { Object rtn = fn.call(luan); ok = true; if(commit) { writer.commit(); } return rtn; } finally { if( !ok && commit ) { writer.rollback(); reopen(); } wrote(); writeLock.unlock(); } } // ??? public Object run_in_lock(Luan luan,LuanFunction fn) throws IOException, LuanException { if( writeLock.isHeldByCurrentThread() ) throw new RuntimeException(); writeLock.lock(); try { synchronized(this) { return fn.call(luan); } } finally { wrote(); writeLock.unlock(); } } private long id; private void initId() throws IOException { TopDocs td = searcher.search(new MatchAllDocsQuery(),1,new Sort(ID_DESC_SORT)); switch(td.scoreDocs.length) { case 0: id = 0; break; case 1: id = (Long)searcher.doc(td.scoreDocs[0].doc).getField("id").numericValue(); break; default: throw new RuntimeException(); } } public SnapshotDeletionPolicy snapshotDeletionPolicy() { return (SnapshotDeletionPolicy)writer.getLuceneIndexWriter().getConfig().getIndexDeletionPolicy(); } public Object snapshot(Luan luan,LuanFunction fn) throws LuanException, IOException { SnapshotDeletionPolicy snapshotDeletionPolicy = snapshotDeletionPolicy(); IndexCommit ic = snapshotDeletionPolicy.snapshot(); try { String dir = fsDir.getDirectory().toString(); LuanTable fileNames = new LuanTable(new ArrayList(ic.getFileNames())); return fn.call(luan,dir,fileNames); } finally { snapshotDeletionPolicy.release(ic); } } public void tag(String tag) throws IOException { boolean commit = !writeLock.isHeldByCurrentThread(); writeLock.lock(); try { writer.tag(tag); if(commit) writer.commit(); } finally { writeLock.unlock(); } } public String to_string() { return writer.getLuceneIndexWriter().getDirectory().toString(); } protected void finalize() throws Throwable { close(); super.finalize(); } public void close_down() throws IOException { String key = indexDir.getCanonicalPath(); synchronized(indexes) { indexes.remove(key); } close(); } public void close() throws IOException { closeWriter(); reader.close(); } private void closeWriter() throws IOException { writeLock.lock(); try { writer.close(); } finally { writeLock.unlock(); } } private static class DocFn extends LuanFunction { final IndexSearcher searcher; final Query query; int docID; DocFn(IndexSearcher searcher,Query query) { this.searcher = searcher; this.query = query; } @Override public Object call(Luan luan,Object[] args) throws LuanException { try { LuanTable doc = toTable(searcher.doc(docID)); if( args.length > 0 && "explain".equals(args[0]) ) { Explanation explanation = searcher.explain(query,docID); return new Object[]{doc,explanation}; } else { return doc; } } catch(IOException e) { throw new LuanException(e); } } } private static abstract class MyCollector extends Collector { int docBase; int i = 0; @Override public void setScorer(Scorer scorer) {} @Override public void setNextReader(AtomicReaderContext context) { this.docBase = context.docBase; } @Override public boolean acceptsDocsOutOfOrder() { return true; } } private synchronized IndexSearcher openSearcher() throws IOException { int gwc = writeCounter.get(); if( writeCount != gwc ) { writeCount = gwc; DirectoryReader newReader = DirectoryReader.openIfChanged(reader); // DirectoryReader newReader = DirectoryReader.openIfChanged(reader,writer.getLuceneIndexWriter(),true); if( newReader != null ) { reader.decRef(); reader = newReader; searcher = new IndexSearcher(reader); } } reader.incRef(); return searcher; } // call in finally block private static void close(IndexSearcher searcher) throws IOException { searcher.getIndexReader().decRef(); } public void ensure_open() throws IOException { close(openSearcher()); } public int advanced_search( final Luan luan, Object queryStr, LuanFunction fn, Integer n, String sortStr ) throws LuanException, IOException, ParseException { Query query; if( queryStr instanceof String ) { query = GoodQueryParser.parseQuery(mfp,(String)queryStr); } else if( queryStr instanceof LuanTable ) { LuanTable t = (LuanTable)queryStr; BooleanQuery bq = new BooleanQuery(); for( Map.Entry<Object,Object> entry : t.rawIterable() ) { String name = (String)entry.getKey(); Object value = entry.getValue(); Query q = new TermQuery( LuceneUtils.term(name,value) ); bq.add( q, BooleanClause.Occur.MUST ); } query = bq; } else throw new LuanException("query must be string or table"); IndexSearcher searcher = threadLocalSearcher.get(); boolean inTransaction = searcher != null; if( !inTransaction ) searcher = openSearcher(); try { if( fn!=null && n==null ) { if( sortStr != null ) throw new LuanException("sort must be nil when n is nil"); final DocFn docFn = new DocFn(searcher,query); MyCollector col = new MyCollector() { @Override public void collect(int doc) { try { docFn.docID = docBase + doc; fn.call(luan,++i,docFn); } catch(LuanException e) { throw new LuanRuntimeException(e); } } }; try { searcher.search(query,col); } catch(LuanRuntimeException e) { throw (LuanException)e.getCause(); } return col.i; } if( fn==null || n==0 ) { TotalHitCountCollector thcc = new TotalHitCountCollector(); searcher.search(query,thcc); return thcc.getTotalHits(); } Sort sort = sortStr==null ? null : GoodQueryParser.parseSort(mfp,sortStr); TopDocs td = sort==null ? searcher.search(query,n) : searcher.search(query,n,sort); final ScoreDoc[] scoreDocs = td.scoreDocs; DocFn docFn = new DocFn(searcher,query); for( int i=0; i<scoreDocs.length; i++ ) { ScoreDoc scoreDoc = scoreDocs[i]; docFn.docID = scoreDoc.doc; fn.call(luan,i+1,docFn,scoreDoc.score); } return td.totalHits; } finally { if( !inTransaction ) close(searcher); } } public Object search_in_transaction(Luan luan,LuanFunction fn) throws LuanException, IOException { if( threadLocalSearcher.get() != null ) throw new LuanException("can't nest search_in_transaction calls"); IndexSearcher searcher = openSearcher(); threadLocalSearcher.set(searcher); try { return fn.call(luan); } finally { threadLocalSearcher.set(null); close(searcher); } } public FieldParser getIndexedFieldParser(String field) { return mfp.fields.get(field); } public void setIndexedFieldParser(String field,FieldParser fp) { if( fp==null ) { // delete mfp.fields.remove(field); return; } mfp.fields.put( field, fp ); } static Map<String,Object> toLucene(LuanTable table) throws LuanException { return SupplementingConfig.toLucene(table); } private static LuanTable toTable(Document doc) throws LuanException { return doc==null ? null : SupplementingConfig.toTable(LuceneUtils.toMap(doc)); } private static final Formatter nullFormatter = new Formatter() { public String highlightTerm(String originalText,TokenGroup tokenGroup) { return originalText; } }; public LuanFunction highlighter(final Luan luan,String queryStr,final LuanFunction formatter,final Integer fragmentSize,String dotdotdot) throws ParseException { Query query = GoodQueryParser.parseQuery(mfp,queryStr); Formatter fmt = new Formatter() { public String highlightTerm(String originalText,TokenGroup tokenGroup) { if( tokenGroup.getTotalScore() <= 0 ) return originalText; try { return (String)Luan.first(formatter.call(luan,originalText)); } catch(LuanException e) { throw new LuanRuntimeException(e); } } }; QueryScorer queryScorer = new QueryScorer(query); final Highlighter chooser = fragmentSize==null ? null : new Highlighter(nullFormatter,queryScorer); if( chooser != null ) chooser.setTextFragmenter( new SimpleSpanFragmenter(queryScorer,fragmentSize) ); final Highlighter hl = new Highlighter(fmt,queryScorer); hl.setTextFragmenter( new NullFragmenter() ); return new LuanFunction() { @Override public String call(Luan luan,Object[] args) throws LuanException { String text = (String)args[0]; try { if( chooser != null ) { String s = chooser.getBestFragment(analyzer,null,text); if( s != null ) { if( dotdotdot != null ) { boolean atStart = text.startsWith(s); boolean atEnd = text.endsWith(s); if( !atStart ) s = dotdotdot + s; if( !atEnd ) s = s + dotdotdot; } text = s; } else if( text.length() > fragmentSize ) { text = text.substring(0,fragmentSize); if( dotdotdot != null ) text += "..."; } } String s = hl.getBestFragment(analyzer,null,text); return s!=null ? s : text; } catch(LuanRuntimeException e) { throw (LuanException)e.getCause(); } catch(IOException e) { throw new RuntimeException(e); } catch(InvalidTokenOffsetsException e) { throw new RuntimeException(e); } } }; } public int count_tokens(String text) throws IOException { int n = 0; TokenStream ts = analyzer.tokenStream(null,text); ts.reset(); while( ts.incrementToken() ) { n++; } ts.close(); return n; } void restore(LuanTable doc) throws LuanException, IOException { writer.addDocument(toLucene(doc)); } public void relog() throws IOException, LuanException { logger.info("start relog"); writeLock.lock(); try { LoggingIndexWriter loggingWriter = (LoggingIndexWriter)writer; loggingWriter.logLucene(); } finally { writeLock.unlock(); } logger.info("end relog"); } public void restore_from_log(Luan luan,LuanFunction handler) throws IOException, LuanException, ParseException { LoggingIndexWriter loggingWriter = (LoggingIndexWriter)writer; if( wasCreated && !loggingWriter.wasCreated ) { logger.error("restoring from log"); force_restore_from_log(luan,handler); } } public void force_restore_from_log(Luan luan,LuanFunction handler) throws IOException { logger.warn("start force_restore_from_log"); if( writeLock.isHeldByCurrentThread() ) throw new RuntimeException(); OpDoer opDoer = handler==null ? null : new LuanOpDoer(writer,luan,handler); writeLock.lock(); boolean ok = false; try { LoggingIndexWriter loggingWriter = (LoggingIndexWriter)writer; loggingWriter.playLogs(opDoer); ok = true; wrote(); ensure_open(); // refresh searcher initId(); wasCreated = false; } finally { if( !ok ) { writer.rollback(); reopen(); } wrote(); writeLock.unlock(); } logger.warn("end force_restore_from_log"); } public void check() throws IOException, LuanException, ParseException { String msg = "start check"; logger.info(msg); CheckIndex.Status status = new CheckIndex(fsDir).checkIndex(); if( !status.clean ) logger.error("index not clean"); if( writer instanceof LoggingIndexWriter ) { LoggingIndexWriter loggingWriter = (LoggingIndexWriter)writer; logger.info("log check"); boolean ok = loggingWriter.check(ID_SORT); } logger.info("end check"); } public String explain_query(String queryStr) throws ParseException { return GoodQueryParser.parseQuery(mfp,queryStr).toString(); } }