Mercurial Hosting > nabble
view src/global/Site.java @ 38:04d4c291484b
handle missing schema
author | Franklin Schmidt <fschmidt@gmail.com> |
---|---|
date | Sat, 11 Jul 2020 21:30:58 -0600 |
parents | abe0694e9849 |
children | 56accc959f8c |
line wrap: on
line source
package global; import fschmidt.util.java.HtmlUtils; import nabble.view.lib.ViewUtils; import nabble.model.Init; import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.snowball.SnowballAnalyzer; import org.apache.lucene.document.Document; import org.apache.lucene.document.Field; import org.apache.lucene.document.NumericField; import org.apache.lucene.index.IndexReader; import org.apache.lucene.index.IndexWriter; import org.apache.lucene.index.Term; import org.apache.lucene.search.Sort; import org.apache.lucene.search.SortField; import org.apache.lucene.store.FSDirectory; import org.apache.lucene.util.Version; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import java.io.File; import java.io.IOException; import java.io.BufferedReader; import java.io.FileReader; import java.sql.Connection; import java.sql.ResultSet; import java.sql.SQLException; import java.sql.Statement; import java.util.ArrayList; import java.util.Date; import java.util.List; import java.util.Map; import java.util.HashMap; import java.util.Set; import java.util.HashSet; import java.util.Arrays; import java.util.regex.Pattern; import java.util.regex.Matcher; public final class Site { private static final Logger logger = LoggerFactory.getLogger(Site.class); public static volatile String status = ""; public static final String SERVER_FLD = "server"; public static final String SITE_FLD = "site"; public static final String DOMAIN_FLD = "domain"; public static final String SUBJECT_FLD = "subject"; public static final String MESSAGE_FLD = "message"; public static final String TYPE_FLD = "type"; public static final String ACTIVITY_FLD = "activity"; public static final String EMBARRASSING_FLD = "embarrassing"; public static final String PRIVATE_FLD = "private"; public static final String OWNER_EMAIL_FLD = "owner_email"; public static final String OWNER_EMAIL_DOMAIN_FLD = "owner_email_domain"; public static final String NODE_COUNT_FLD = "node_count"; public static final String WHEN_CREATED_FLD = "when_created"; public static final String TWEAKS_FLD = "tweaks"; public static final String HAS_TWEAKS_FLD = "has_tweaks"; public static final String FILE_COUNT_FLD = "file_count"; public static final String FILE_NODE_RATIO_FLD = "file_node_ratio"; public static final String MONTHLY_VIEWS_FLD = "monthly_views"; public static final String VALUE_FLD = "value"; public static final Sort SORT_BY_ACTIVITY = new Sort(new SortField(ACTIVITY_FLD, SortField.INT, true)); public static final Sort SORT_BY_FILE_NODE_RATIO = new Sort(new SortField(FILE_NODE_RATIO_FLD, SortField.INT, true)); public static final Sort SORT_BY_VALUE = new Sort( new SortField(VALUE_FLD, SortField.INT, true), new SortField(NODE_COUNT_FLD, SortField.INT, true) ); public static final Analyzer analyzer = new SnowballAnalyzer(Version.LUCENE_CURRENT,"English"); private static final FSDirectory dir1; private static final FSDirectory dir2; private static volatile Thread thread = null; private static final boolean skipReindex1 = Init.get("skipReindex1",false); static { try { String localDir = (String)Init.get("home_dir")+"local/"; dir1 = FSDirectory.open(new File(localDir+"lucene_raw")); dir2 = FSDirectory.open(new File(localDir+"lucene_global")); } catch(IOException e) { throw new RuntimeException(e); } } public static FSDirectory dir() { return dir2; } /* static synchronized void clear() throws IOException { new IndexWriter(dir,analyzer,true,IndexWriter.MaxFieldLength.LIMITED).close(); } */ public static boolean isReindexing() { return thread != null && thread.isAlive(); } public static void startReindexing(){ status = "starting reindex"; thread = new Thread(new Runnable(){public void run(){ try { reindex(); } catch(SQLException e) { logger.error("",e); status = "Error: " + e.getMessage(); } catch(IOException e) { logger.error("",e); status = "Error: " + e.getMessage(); } catch(RuntimeException e) { logger.error("",e); status = "Error: " + e.getMessage(); } }},"reindex"); thread.start(); } static synchronized void reindex() throws SQLException, IOException { if( !skipReindex1 ) reindex1(); status = "done indexing servers"; logger.info("reindex2"); reindex2(); logger.info("done reindexing"); status = "done reindexing"; } private static void reindex1() throws SQLException, IOException { IndexWriter indexWriter = new IndexWriter(dir1,analyzer,true,IndexWriter.MaxFieldLength.LIMITED); try { for( Server server : Server.getServers() ) { reindex1(server,indexWriter); } } finally { indexWriter.close(); } } /* public static synchronized void removeBySubject(String subject) throws IOException { Term term = new Term( SUBJECT_FLD, subject ); IndexWriter indexWriter = new IndexWriter(dir2,analyzer,IndexWriter.MaxFieldLength.LIMITED); indexWriter.deleteDocuments(term); indexWriter.close(); } */ private static synchronized void reindex1(Server server,IndexWriter indexWriter) throws SQLException, IOException { logger.info("reindex "+server.name); status = "reindexing "+server.name; List<Long> siteIds = new ArrayList<Long>(); { Connection con = server.getConnection(); Statement stmt = con.createStatement(); ResultSet rs = stmt.executeQuery( "select site_id from global.site_global" ); while( rs.next() ) { long siteId = rs.getLong("site_id"); siteIds.add(siteId); } rs.close(); stmt.close(); con.close(); } final int n = siteIds.size(); int count = 0; for( long siteId : siteIds ) { update(indexWriter,server,siteId); count++; logger.info("reindexed "+count+" of "+n+" sites on "+server.name); status = "reindexed "+count+" of "+n+" sites on "+server.name; } } private static void update(IndexWriter indexWriter,Server server,long siteId) throws IOException, SQLException { Connection con = server.getConnection(); try { String schema = "s" + siteId; Statement stmt = con.createStatement(); ResultSet rs = stmt.executeQuery( "select site_global.*, site.*, node.*, node_msg.message, priv.label as priv, user_.email, tweak.tweaks" +", (select count(*) from " + schema + ".file_node) as file_node_count" +" from global.site_global" +", " + schema + ".site" +" join " + schema + ".node on site.root_node_id = node.node_id" +" join " + schema + ".node_msg on site.root_node_id = node_msg.node_id" +" left join " + schema + ".tag as priv on site.root_node_id = priv.node_id and priv.user_id is null and priv.label='permission:View'" +" left join " + schema + ".user_ on node.owner_id = user_.user_id" +" , (select string_agg(content,' ') as tweaks from " + schema + ".tweak) as tweak" +" where site_global.site_id = " + siteId ); if( !rs.next() ) { logger.error("site not found: "+siteId); return; } Document doc = new Document(); doc.add( new Field(SERVER_FLD, server.name, Field.Store.YES, Field.Index.NO) ); doc.add( new Field(SITE_FLD, Long.toString(siteId), Field.Store.YES, Field.Index.NO) ); String subject = rs.getString("subject"); String domain = rs.getString("custom_domain"); if( domain == null ) domain = ViewUtils.getDefaultBaseUrl(siteId,subject,server.host); doc.add( new Field(DOMAIN_FLD, domain, Field.Store.YES, Field.Index.NO) ); doc.add( new Field(SUBJECT_FLD, subject, Field.Store.YES, Field.Index.NO) ); String message = rs.getString("message"); doc.add( new Field(MESSAGE_FLD, message, Field.Store.YES, Field.Index.NO) ); String type = "" + rs.getString("type"); doc.add( new Field(TYPE_FLD, type, Field.Store.YES, Field.Index.NO) ); int activity = rs.getInt("activity"); doc.add( new NumericField(ACTIVITY_FLD,Field.Store.YES,false).setIntValue(activity) ); String embarrassing = Boolean.toString( rs.getBoolean("is_embarrassing") ); doc.add( new Field(EMBARRASSING_FLD, embarrassing, Field.Store.YES, Field.Index.NO) ); String privS = rs.getString("priv"); String priv = Boolean.toString( privS != null ); doc.add( new Field(PRIVATE_FLD, priv, Field.Store.YES, Field.Index.NO) ); String email = rs.getString("email"); if( email != null ) { doc.add( new Field(OWNER_EMAIL_FLD, email, Field.Store.YES, Field.Index.NO) ); } int nodeCount = rs.getInt("node_count"); doc.add( new NumericField(NODE_COUNT_FLD,Field.Store.YES,false).setIntValue(nodeCount) ); Date whenCreated = rs.getTimestamp("when_created"); doc.add( new NumericField(WHEN_CREATED_FLD,Field.Store.YES,false).setLongValue(whenCreated.getTime()) ); String tweaks = rs.getString("tweaks"); if( tweaks != null ) { doc.add( new Field(TWEAKS_FLD, tweaks, Field.Store.YES, Field.Index.NO) ); } int fileCount = rs.getInt("file_node_count"); doc.add( new NumericField(FILE_COUNT_FLD,Field.Store.YES,false).setIntValue(fileCount) ); int monthlyViews = rs.getInt("monthly_views"); doc.add( new NumericField(MONTHLY_VIEWS_FLD,Field.Store.YES,false).setIntValue(monthlyViews) ); rs.close(); stmt.close(); indexWriter.addDocument(doc); } catch(SQLException e) { logger.error("failed to index site "+siteId,e); String msg = e.getMessage(); if( !(msg.contains("schema") && msg.contains("does not exist")) ) throw e; } finally { con.close(); } } private static final Pattern ptn = Pattern.compile("([^,]+),(\\d+|\"[0-9,]+\")"); private static Map<String,Integer> domainMap() throws IOException { Map<String,Integer> map = new HashMap<String,Integer>(); File siteFile = new File("data/sites.csv"); if( !siteFile.exists() ) return map; BufferedReader in = new BufferedReader(new FileReader(siteFile)); String line = in.readLine(); try { while (!ptn.matcher(line).matches()) line = in.readLine(); while (true) { if (line == null) break; if (line.length() > 0) { if( line.startsWith(",") ) break; Matcher m = ptn.matcher(line); if (!m.matches()) throw new RuntimeException(line); String domain = m.group(1); String amt = m.group(2); amt = amt.replaceAll("[,\"]", ""); int sessions = Integer.parseInt(amt); map.put(domain, sessions); } line = in.readLine(); } in.close(); } catch (RuntimeException e) { logger.error("Error in line: " + line, e); throw e; } return map; } private static void reindex2() throws IOException { Map<String,Integer> domainMap = domainMap(); IndexReader reader = IndexReader.open(dir1); IndexWriter indexWriter = new IndexWriter(dir2,analyzer,true,IndexWriter.MaxFieldLength.LIMITED); try { int n = reader.numDocs(); if( n != reader.maxDoc() ) throw new RuntimeException(); for( int i=0; i<n; i++ ) { Document data = reader.document(i); Document doc = new Document(); doc.add( new Field(SERVER_FLD, data.get(SERVER_FLD), Field.Store.YES, Field.Index.NOT_ANALYZED_NO_NORMS) ); doc.add( new Field(SITE_FLD, data.get(SITE_FLD), Field.Store.YES, Field.Index.NOT_ANALYZED_NO_NORMS) ); String domain = data.get(DOMAIN_FLD); doc.add( new Field(DOMAIN_FLD, domain, Field.Store.YES, Field.Index.NOT_ANALYZED_NO_NORMS) ); doc.add( new Field(SUBJECT_FLD, data.get(SUBJECT_FLD), Field.Store.YES, Field.Index.ANALYZED) ); doc.add( new Field(MESSAGE_FLD, data.get(MESSAGE_FLD), Field.Store.YES, Field.Index.ANALYZED) ); doc.add( new Field(TYPE_FLD, data.get(TYPE_FLD), Field.Store.YES, Field.Index.NOT_ANALYZED_NO_NORMS) ); int activity = ((NumericField)data.getFieldable(ACTIVITY_FLD)).getNumericValue().intValue(); doc.add( new NumericField(ACTIVITY_FLD,Field.Store.YES,true).setIntValue(activity) ); doc.add( new Field(EMBARRASSING_FLD, data.get(EMBARRASSING_FLD), Field.Store.YES, Field.Index.NOT_ANALYZED_NO_NORMS) ); doc.add( new Field(PRIVATE_FLD, data.get(PRIVATE_FLD), Field.Store.NO, Field.Index.NOT_ANALYZED_NO_NORMS) ); String email = data.get(OWNER_EMAIL_FLD); if( email != null ) { doc.add( new Field(OWNER_EMAIL_FLD, email, Field.Store.YES, Field.Index.NOT_ANALYZED_NO_NORMS) ); String emailDomain = email.substring( email.indexOf('@') + 1 ); doc.add( new Field(OWNER_EMAIL_DOMAIN_FLD, emailDomain, Field.Store.NO, Field.Index.NOT_ANALYZED_NO_NORMS) ); } int nodeCount = ((NumericField)data.getFieldable(NODE_COUNT_FLD)).getNumericValue().intValue(); doc.add( new NumericField(NODE_COUNT_FLD,Field.Store.YES,true).setIntValue(nodeCount) ); long whenCreated = ((NumericField)data.getFieldable(WHEN_CREATED_FLD)).getNumericValue().longValue(); doc.add( new NumericField(WHEN_CREATED_FLD,Field.Store.YES,false).setLongValue(whenCreated) ); String tweaks = data.get(TWEAKS_FLD); if( tweaks != null ) { doc.add( new Field(TWEAKS_FLD, tweaks, Field.Store.YES, Field.Index.NO) ); doc.add( new Field(HAS_TWEAKS_FLD, "true", Field.Store.NO, Field.Index.NOT_ANALYZED_NO_NORMS) ); } int fileCount = ((NumericField)data.getFieldable(FILE_COUNT_FLD)).getNumericValue().intValue(); doc.add( new NumericField(FILE_COUNT_FLD,Field.Store.YES,false).setIntValue(fileCount) ); if( nodeCount > 0 ) { int fileNodeRatio = fileCount*1000/nodeCount; doc.add( new NumericField(FILE_NODE_RATIO_FLD,Field.Store.NO,true).setIntValue(fileNodeRatio) ); } int monthlyViews = ((NumericField)data.getFieldable(MONTHLY_VIEWS_FLD)).getNumericValue().intValue(); doc.add( new NumericField(MONTHLY_VIEWS_FLD,Field.Store.YES,true).setIntValue(monthlyViews) ); Integer sessions = domainMap.get(domain); if( sessions == null ) sessions = 0; doc.add( new NumericField(VALUE_FLD,Field.Store.YES,true).setIntValue(sessions) ); indexWriter.addDocument(doc); } } finally { indexWriter.close(); reader.close(); } } // class here private final Document doc; public Site(Document doc) { this.doc = doc; } public String serverName() { return doc.get(SERVER_FLD); } public Server server() { return Server.getServer(serverName()); } public String id() { return doc.get(SITE_FLD); } private static final Set https = new HashSet( Arrays.asList( "www.postgresql-archive.org", "ffq.38.me.nabble.com" ) ); public String url() { String domain = doc.get(DOMAIN_FLD); String scheme = https.contains(domain) ? "https" : "http"; return scheme + "://" + domain + "/"; } public String subject() { return doc.get(SUBJECT_FLD); } public String subjectHtml() { return HtmlUtils.htmlEncode(subject()); } public String link() { return "<a href=\"" + url() + "\">" + subjectHtml() + "</a>"; } public String message() { return doc.get(MESSAGE_FLD); } public String type() { return doc.get(TYPE_FLD); } public int activity() { NumericField fld = (NumericField)doc.getFieldable(ACTIVITY_FLD); return fld.getNumericValue().intValue(); } public int nodeCount() { NumericField fld = (NumericField)doc.getFieldable(NODE_COUNT_FLD); return fld.getNumericValue().intValue(); } public Date whenCreated() { NumericField fld = (NumericField)doc.getFieldable(WHEN_CREATED_FLD); return new Date(fld.getNumericValue().longValue()); } public boolean isEmbarrassing() { return Boolean.parseBoolean( doc.get(EMBARRASSING_FLD) ); } public String tweaks() { return doc.get(TWEAKS_FLD); } public int fileCount() { NumericField fld = (NumericField)doc.getFieldable(FILE_COUNT_FLD); return fld.getNumericValue().intValue(); } public String ownerEmail() { return doc.get(OWNER_EMAIL_FLD); } public int monthlyViews() { NumericField fld = (NumericField)doc.getFieldable(MONTHLY_VIEWS_FLD); return fld.getNumericValue().intValue(); } public int value() { NumericField fld = (NumericField)doc.getFieldable(VALUE_FLD); return fld.getNumericValue().intValue(); } }