Mercurial Hosting > nabble
diff src/global/Site.java @ 0:7ecd1a4ef557
add content
author | Franklin Schmidt <fschmidt@gmail.com> |
---|---|
date | Thu, 21 Mar 2019 19:15:52 -0600 |
parents | |
children | abe0694e9849 |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/src/global/Site.java Thu Mar 21 19:15:52 2019 -0600 @@ -0,0 +1,450 @@ +package global; + +import fschmidt.util.java.HtmlUtils; +import nabble.view.lib.ViewUtils; +import nabble.model.Init; +import org.apache.lucene.analysis.Analyzer; +import org.apache.lucene.analysis.snowball.SnowballAnalyzer; +import org.apache.lucene.document.Document; +import org.apache.lucene.document.Field; +import org.apache.lucene.document.NumericField; +import org.apache.lucene.index.IndexReader; +import org.apache.lucene.index.IndexWriter; +import org.apache.lucene.index.Term; +import org.apache.lucene.search.Sort; +import org.apache.lucene.search.SortField; +import org.apache.lucene.store.FSDirectory; +import org.apache.lucene.util.Version; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.io.File; +import java.io.IOException; +import java.io.BufferedReader; +import java.io.FileReader; +import java.sql.Connection; +import java.sql.ResultSet; +import java.sql.SQLException; +import java.sql.Statement; +import java.util.ArrayList; +import java.util.Date; +import java.util.List; +import java.util.Map; +import java.util.HashMap; +import java.util.Set; +import java.util.HashSet; +import java.util.Arrays; +import java.util.regex.Pattern; +import java.util.regex.Matcher; + + +public final class Site { + private static final Logger logger = LoggerFactory.getLogger(Site.class); + + public static volatile String status = ""; + + public static final String SERVER_FLD = "server"; + public static final String SITE_FLD = "site"; + public static final String DOMAIN_FLD = "domain"; + public static final String SUBJECT_FLD = "subject"; + public static final String MESSAGE_FLD = "message"; + public static final String TYPE_FLD = "type"; + public static final String ACTIVITY_FLD = "activity"; + public static final String EMBARRASSING_FLD = "embarrassing"; + public static final String PRIVATE_FLD = "private"; + public static final String OWNER_EMAIL_FLD = "owner_email"; + public static final String OWNER_EMAIL_DOMAIN_FLD = "owner_email_domain"; + public static final String NODE_COUNT_FLD = "node_count"; + public static final String WHEN_CREATED_FLD = "when_created"; + public static final String TWEAKS_FLD = "tweaks"; + public static final String HAS_TWEAKS_FLD = "has_tweaks"; + public static final String FILE_COUNT_FLD = "file_count"; + public static final String FILE_NODE_RATIO_FLD = "file_node_ratio"; + public static final String MONTHLY_VIEWS_FLD = "monthly_views"; + public static final String VALUE_FLD = "value"; + + public static final Sort SORT_BY_ACTIVITY = new Sort(new SortField(ACTIVITY_FLD, SortField.INT, true)); + public static final Sort SORT_BY_FILE_NODE_RATIO = new Sort(new SortField(FILE_NODE_RATIO_FLD, SortField.INT, true)); + public static final Sort SORT_BY_VALUE = new Sort( + new SortField(VALUE_FLD, SortField.INT, true), + new SortField(NODE_COUNT_FLD, SortField.INT, true) + ); + + public static final Analyzer analyzer = new SnowballAnalyzer(Version.LUCENE_CURRENT,"English"); + private static final FSDirectory dir1; + private static final FSDirectory dir2; + private static volatile Thread thread = null; + private static final boolean skipReindex1 = Init.get("skipReindex1",false); + + static { + try { + String localDir = (String)Init.get("local_dir"); + dir1 = FSDirectory.open(new File(localDir+"lucene_raw")); + dir2 = FSDirectory.open(new File(localDir+"lucene_global")); + } catch(IOException e) { + throw new RuntimeException(e); + } + } + + public static FSDirectory dir() { + return dir2; + } +/* + static synchronized void clear() + throws IOException + { + new IndexWriter(dir,analyzer,true,IndexWriter.MaxFieldLength.LIMITED).close(); + } +*/ + public static boolean isReindexing() { + return thread != null && thread.isAlive(); + } + + public static void startReindexing(){ + status = "starting reindex"; + thread = new Thread(new Runnable(){public void run(){ + try { + reindex(); + } catch(SQLException e) { + logger.error("",e); + status = "Error: " + e.getMessage(); + } catch(IOException e) { + logger.error("",e); + status = "Error: " + e.getMessage(); + } catch(RuntimeException e) { + logger.error("",e); + status = "Error: " + e.getMessage(); + } + }},"reindex"); + thread.start(); + } + + static synchronized void reindex() + throws SQLException, IOException + { + if( !skipReindex1 ) + reindex1(); + status = "done indexing servers"; + logger.info("reindex2"); + reindex2(); + logger.info("done reindexing"); + status = "done reindexing"; + } + + private static void reindex1() + throws SQLException, IOException + { + IndexWriter indexWriter = new IndexWriter(dir1,analyzer,true,IndexWriter.MaxFieldLength.LIMITED); + try { + for( Server server : Server.getServers() ) { + reindex1(server,indexWriter); + } + } finally { + indexWriter.close(); + } + } +/* + public static synchronized void removeBySubject(String subject) + throws IOException + { + Term term = new Term( SUBJECT_FLD, subject ); + IndexWriter indexWriter = new IndexWriter(dir2,analyzer,IndexWriter.MaxFieldLength.LIMITED); + indexWriter.deleteDocuments(term); + indexWriter.close(); + } +*/ + private static synchronized void reindex1(Server server,IndexWriter indexWriter) + throws SQLException, IOException + { + logger.info("reindex "+server.name); + status = "reindexing "+server.name; + + List<Long> siteIds = new ArrayList<Long>(); + { + Connection con = server.getConnection(); + Statement stmt = con.createStatement(); + ResultSet rs = stmt.executeQuery( + "select site_id from global.site_global" + ); + while( rs.next() ) { + long siteId = rs.getLong("site_id"); + siteIds.add(siteId); + } + rs.close(); + stmt.close(); + con.close(); + } + final int n = siteIds.size(); + int count = 0; + for( long siteId : siteIds ) { + update(indexWriter,server,siteId); + count++; + logger.info("reindexed "+count+" of "+n+" sites on "+server.name); + status = "reindexed "+count+" of "+n+" sites on "+server.name; + } + } + + private static void update(IndexWriter indexWriter,Server server,long siteId) + throws IOException, SQLException + { + Connection con = server.getConnection(); + try { + String schema = "s" + siteId; + Statement stmt = con.createStatement(); + ResultSet rs = stmt.executeQuery( + "select site_global.*, site.*, node.*, node_msg.message, priv.label as priv, user_.email, tweak.tweaks" + +", (select count(*) from " + schema + ".file_node) as file_node_count" + +" from global.site_global" + +", " + schema + ".site" + +" join " + schema + ".node on site.root_node_id = node.node_id" + +" join " + schema + ".node_msg on site.root_node_id = node_msg.node_id" + +" left join " + schema + ".tag as priv on site.root_node_id = priv.node_id and priv.user_id is null and priv.label='permission:View'" + +" left join " + schema + ".user_ on node.owner_id = user_.user_id" + +" , (select string_agg(content,' ') as tweaks from " + schema + ".tweak) as tweak" + +" where site_global.site_id = " + siteId + ); + if( !rs.next() ) { + logger.error("site not found: "+siteId); + return; + } + Document doc = new Document(); + doc.add( new Field(SERVER_FLD, server.name, Field.Store.YES, Field.Index.NO) ); + doc.add( new Field(SITE_FLD, Long.toString(siteId), Field.Store.YES, Field.Index.NO) ); + String subject = rs.getString("subject"); + String domain = rs.getString("custom_domain"); + if( domain == null ) + domain = ViewUtils.getDefaultBaseUrl(siteId,subject,server.host); + doc.add( new Field(DOMAIN_FLD, domain, Field.Store.YES, Field.Index.NO) ); + doc.add( new Field(SUBJECT_FLD, subject, Field.Store.YES, Field.Index.NO) ); + String message = rs.getString("message"); + doc.add( new Field(MESSAGE_FLD, message, Field.Store.YES, Field.Index.NO) ); + String type = "" + rs.getString("type"); + doc.add( new Field(TYPE_FLD, type, Field.Store.YES, Field.Index.NO) ); + int activity = rs.getInt("activity"); + doc.add( new NumericField(ACTIVITY_FLD,Field.Store.YES,false).setIntValue(activity) ); + String embarrassing = Boolean.toString( rs.getBoolean("is_embarrassing") ); + doc.add( new Field(EMBARRASSING_FLD, embarrassing, Field.Store.YES, Field.Index.NO) ); + String privS = rs.getString("priv"); + String priv = Boolean.toString( privS != null ); + doc.add( new Field(PRIVATE_FLD, priv, Field.Store.YES, Field.Index.NO) ); + String email = rs.getString("email"); + if( email != null ) { + doc.add( new Field(OWNER_EMAIL_FLD, email, Field.Store.YES, Field.Index.NO) ); + } + int nodeCount = rs.getInt("node_count"); + doc.add( new NumericField(NODE_COUNT_FLD,Field.Store.YES,false).setIntValue(nodeCount) ); + Date whenCreated = rs.getTimestamp("when_created"); + doc.add( new NumericField(WHEN_CREATED_FLD,Field.Store.YES,false).setLongValue(whenCreated.getTime()) ); + String tweaks = rs.getString("tweaks"); + if( tweaks != null ) { + doc.add( new Field(TWEAKS_FLD, tweaks, Field.Store.YES, Field.Index.NO) ); + } + int fileCount = rs.getInt("file_node_count"); + doc.add( new NumericField(FILE_COUNT_FLD,Field.Store.YES,false).setIntValue(fileCount) ); + int monthlyViews = rs.getInt("monthly_views"); + doc.add( new NumericField(MONTHLY_VIEWS_FLD,Field.Store.YES,false).setIntValue(monthlyViews) ); + rs.close(); + stmt.close(); + indexWriter.addDocument(doc); + } catch(SQLException e) { + logger.error("failed to index site "+siteId,e); + String msg = e.getMessage(); + if( !(msg.contains("schema") && msg.contains("does not exist")) ) + throw e; + } finally { + con.close(); + } + } + + private static final Pattern ptn = Pattern.compile("([^,]+),(\\d+|\"[0-9,]+\")"); + + private static Map<String,Integer> domainMap() + throws IOException + { + Map<String,Integer> map = new HashMap<String,Integer>(); + File siteFile = new File("data/sites.csv"); + if( !siteFile.exists() ) + return map; + BufferedReader in = new BufferedReader(new FileReader(siteFile)); + String line = in.readLine(); + try { + while (!ptn.matcher(line).matches()) + line = in.readLine(); + while (true) { + if (line == null) + break; + if (line.length() > 0) { + if( line.startsWith(",") ) + break; + Matcher m = ptn.matcher(line); + if (!m.matches()) + throw new RuntimeException(line); + String domain = m.group(1); + String amt = m.group(2); + amt = amt.replaceAll("[,\"]", ""); + int sessions = Integer.parseInt(amt); + map.put(domain, sessions); + } + line = in.readLine(); + } + in.close(); + } catch (RuntimeException e) { + logger.error("Error in line: " + line, e); + throw e; + } + return map; + } + + private static void reindex2() + throws IOException + { + Map<String,Integer> domainMap = domainMap(); + IndexReader reader = IndexReader.open(dir1); + IndexWriter indexWriter = new IndexWriter(dir2,analyzer,true,IndexWriter.MaxFieldLength.LIMITED); + try { + int n = reader.numDocs(); + if( n != reader.maxDoc() ) + throw new RuntimeException(); + for( int i=0; i<n; i++ ) { + Document data = reader.document(i); + Document doc = new Document(); + doc.add( new Field(SERVER_FLD, data.get(SERVER_FLD), Field.Store.YES, Field.Index.NOT_ANALYZED_NO_NORMS) ); + doc.add( new Field(SITE_FLD, data.get(SITE_FLD), Field.Store.YES, Field.Index.NOT_ANALYZED_NO_NORMS) ); + String domain = data.get(DOMAIN_FLD); + doc.add( new Field(DOMAIN_FLD, domain, Field.Store.YES, Field.Index.NOT_ANALYZED_NO_NORMS) ); + doc.add( new Field(SUBJECT_FLD, data.get(SUBJECT_FLD), Field.Store.YES, Field.Index.ANALYZED) ); + doc.add( new Field(MESSAGE_FLD, data.get(MESSAGE_FLD), Field.Store.YES, Field.Index.ANALYZED) ); + doc.add( new Field(TYPE_FLD, data.get(TYPE_FLD), Field.Store.YES, Field.Index.NOT_ANALYZED_NO_NORMS) ); + int activity = ((NumericField)data.getFieldable(ACTIVITY_FLD)).getNumericValue().intValue(); + doc.add( new NumericField(ACTIVITY_FLD,Field.Store.YES,true).setIntValue(activity) ); + doc.add( new Field(EMBARRASSING_FLD, data.get(EMBARRASSING_FLD), Field.Store.YES, Field.Index.NOT_ANALYZED_NO_NORMS) ); + doc.add( new Field(PRIVATE_FLD, data.get(PRIVATE_FLD), Field.Store.NO, Field.Index.NOT_ANALYZED_NO_NORMS) ); + String email = data.get(OWNER_EMAIL_FLD); + if( email != null ) { + doc.add( new Field(OWNER_EMAIL_FLD, email, Field.Store.YES, Field.Index.NOT_ANALYZED_NO_NORMS) ); + String emailDomain = email.substring( email.indexOf('@') + 1 ); + doc.add( new Field(OWNER_EMAIL_DOMAIN_FLD, emailDomain, Field.Store.NO, Field.Index.NOT_ANALYZED_NO_NORMS) ); + } + int nodeCount = ((NumericField)data.getFieldable(NODE_COUNT_FLD)).getNumericValue().intValue(); + doc.add( new NumericField(NODE_COUNT_FLD,Field.Store.YES,true).setIntValue(nodeCount) ); + long whenCreated = ((NumericField)data.getFieldable(WHEN_CREATED_FLD)).getNumericValue().longValue(); + doc.add( new NumericField(WHEN_CREATED_FLD,Field.Store.YES,false).setLongValue(whenCreated) ); + String tweaks = data.get(TWEAKS_FLD); + if( tweaks != null ) { + doc.add( new Field(TWEAKS_FLD, tweaks, Field.Store.YES, Field.Index.NO) ); + doc.add( new Field(HAS_TWEAKS_FLD, "true", Field.Store.NO, Field.Index.NOT_ANALYZED_NO_NORMS) ); + } + int fileCount = ((NumericField)data.getFieldable(FILE_COUNT_FLD)).getNumericValue().intValue(); + doc.add( new NumericField(FILE_COUNT_FLD,Field.Store.YES,false).setIntValue(fileCount) ); + if( nodeCount > 0 ) { + int fileNodeRatio = fileCount*1000/nodeCount; + doc.add( new NumericField(FILE_NODE_RATIO_FLD,Field.Store.NO,true).setIntValue(fileNodeRatio) ); + } + int monthlyViews = ((NumericField)data.getFieldable(MONTHLY_VIEWS_FLD)).getNumericValue().intValue(); + doc.add( new NumericField(MONTHLY_VIEWS_FLD,Field.Store.YES,true).setIntValue(monthlyViews) ); + Integer sessions = domainMap.get(domain); + if( sessions == null ) + sessions = 0; + doc.add( new NumericField(VALUE_FLD,Field.Store.YES,true).setIntValue(sessions) ); + indexWriter.addDocument(doc); + } + } finally { + indexWriter.close(); + reader.close(); + } + } + + // class here + + private final Document doc; + + public Site(Document doc) { + this.doc = doc; + } + + public String serverName() { + return doc.get(SERVER_FLD); + } + + public Server server() { + return Server.getServer(serverName()); + } + + public String id() { + return doc.get(SITE_FLD); + } + + private static final Set https = new HashSet( Arrays.asList( + "www.postgresql-archive.org", + "ffq.38.me.nabble.com" + ) ); + + public String url() { + String domain = doc.get(DOMAIN_FLD); + String scheme = https.contains(domain) ? "https" : "http"; + return scheme + "://" + domain + "/"; + } + + public String subject() { + return doc.get(SUBJECT_FLD); + } + + public String subjectHtml() { + return HtmlUtils.htmlEncode(subject()); + } + + public String link() { + return "<a href=\"" + url() + "\">" + subjectHtml() + "</a>"; + } + + public String message() { + return doc.get(MESSAGE_FLD); + } + + public String type() { + return doc.get(TYPE_FLD); + } + + public int activity() { + NumericField fld = (NumericField)doc.getFieldable(ACTIVITY_FLD); + return fld.getNumericValue().intValue(); + } + + public int nodeCount() { + NumericField fld = (NumericField)doc.getFieldable(NODE_COUNT_FLD); + return fld.getNumericValue().intValue(); + } + + public Date whenCreated() { + NumericField fld = (NumericField)doc.getFieldable(WHEN_CREATED_FLD); + return new Date(fld.getNumericValue().longValue()); + } + + public boolean isEmbarrassing() { + return Boolean.parseBoolean( doc.get(EMBARRASSING_FLD) ); + } + + public String tweaks() { + return doc.get(TWEAKS_FLD); + } + + public int fileCount() { + NumericField fld = (NumericField)doc.getFieldable(FILE_COUNT_FLD); + return fld.getNumericValue().intValue(); + } + + public String ownerEmail() { + return doc.get(OWNER_EMAIL_FLD); + } + + public int monthlyViews() { + NumericField fld = (NumericField)doc.getFieldable(MONTHLY_VIEWS_FLD); + return fld.getNumericValue().intValue(); + } + + public int value() { + NumericField fld = (NumericField)doc.getFieldable(VALUE_FLD); + return fld.getNumericValue().intValue(); + } + +}