diff src/global/Site.java @ 0:7ecd1a4ef557

add content
author Franklin Schmidt <fschmidt@gmail.com>
date Thu, 21 Mar 2019 19:15:52 -0600
parents
children abe0694e9849
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/global/Site.java	Thu Mar 21 19:15:52 2019 -0600
@@ -0,0 +1,450 @@
+package global;
+
+import fschmidt.util.java.HtmlUtils;
+import nabble.view.lib.ViewUtils;
+import nabble.model.Init;
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.snowball.SnowballAnalyzer;
+import org.apache.lucene.document.Document;
+import org.apache.lucene.document.Field;
+import org.apache.lucene.document.NumericField;
+import org.apache.lucene.index.IndexReader;
+import org.apache.lucene.index.IndexWriter;
+import org.apache.lucene.index.Term;
+import org.apache.lucene.search.Sort;
+import org.apache.lucene.search.SortField;
+import org.apache.lucene.store.FSDirectory;
+import org.apache.lucene.util.Version;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.io.File;
+import java.io.IOException;
+import java.io.BufferedReader;
+import java.io.FileReader;
+import java.sql.Connection;
+import java.sql.ResultSet;
+import java.sql.SQLException;
+import java.sql.Statement;
+import java.util.ArrayList;
+import java.util.Date;
+import java.util.List;
+import java.util.Map;
+import java.util.HashMap;
+import java.util.Set;
+import java.util.HashSet;
+import java.util.Arrays;
+import java.util.regex.Pattern;
+import java.util.regex.Matcher;
+
+
+public final class Site {
+	private static final Logger logger = LoggerFactory.getLogger(Site.class);
+
+	// Human-readable progress or error text of the current/most recent reindex run;
+	// assigned by startReindexing(), reindex() and reindex1(Server,IndexWriter).
+	public static volatile String status = "";
+
+	// Names of the Lucene document fields.
+	// OWNER_EMAIL_DOMAIN_FLD, HAS_TWEAKS_FLD, FILE_NODE_RATIO_FLD and VALUE_FLD are only
+	// added by reindex2(), i.e. they exist in the second index but not in the raw one.
+	public static final String SERVER_FLD = "server";
+	public static final String SITE_FLD = "site";
+	public static final String DOMAIN_FLD = "domain";
+	public static final String SUBJECT_FLD = "subject";
+	public static final String MESSAGE_FLD = "message";
+	public static final String TYPE_FLD = "type";
+	public static final String ACTIVITY_FLD = "activity";
+	public static final String EMBARRASSING_FLD = "embarrassing";
+	public static final String PRIVATE_FLD = "private";
+	public static final String OWNER_EMAIL_FLD = "owner_email";
+	public static final String OWNER_EMAIL_DOMAIN_FLD = "owner_email_domain";
+	public static final String NODE_COUNT_FLD = "node_count";
+	public static final String WHEN_CREATED_FLD = "when_created";
+	public static final String TWEAKS_FLD = "tweaks";
+	public static final String HAS_TWEAKS_FLD = "has_tweaks";
+	public static final String FILE_COUNT_FLD = "file_count";
+	public static final String FILE_NODE_RATIO_FLD = "file_node_ratio";
+	public static final String MONTHLY_VIEWS_FLD = "monthly_views";
+	public static final String VALUE_FLD = "value";
+
+	// Predefined sort orders on int fields; the third SortField argument (true) requests reverse order.
+	public static final Sort SORT_BY_ACTIVITY = new Sort(new SortField(ACTIVITY_FLD, SortField.INT, true));
+	public static final Sort SORT_BY_FILE_NODE_RATIO = new Sort(new SortField(FILE_NODE_RATIO_FLD, SortField.INT, true));
+	// Sorts by VALUE_FLD first and uses NODE_COUNT_FLD as the secondary key.
+	public static final Sort SORT_BY_VALUE = new Sort(
+		new SortField(VALUE_FLD, SortField.INT, true),
+		new SortField(NODE_COUNT_FLD, SortField.INT, true)
+	);
+
+	// Analyzer passed to every IndexWriter created in this class.
+	public static final Analyzer analyzer = new SnowballAnalyzer(Version.LUCENE_CURRENT,"English");
+	// dir1 ("lucene_raw") is written by reindex1() and read by reindex2();
+	// dir2 ("lucene_global") is written by reindex2() and exposed through dir().
+	private static final FSDirectory dir1;
+	private static final FSDirectory dir2;
+	// Thread created by the latest startReindexing() call; null until the first call.
+	private static volatile Thread thread = null;
+	// When true, reindex() does not call reindex1() and only rebuilds the second index.
+	// NOTE(review): the second argument of Init.get is presumably the default value - confirm.
+	private static final boolean skipReindex1 = Init.get("skipReindex1",false);
+
+	static {
+		try {
+			String localDir = (String)Init.get("local_dir");
+			dir1 = FSDirectory.open(new File(localDir+"lucene_raw"));
+			dir2 = FSDirectory.open(new File(localDir+"lucene_global"));
+		} catch(IOException e) {
+			throw new RuntimeException(e);
+		}
+	}
+
+	/**
+	 * Returns the directory of the second ("lucene_global") index, the one written by reindex2().
+	 */
+	public static FSDirectory dir() {
+		final FSDirectory globalIndex = dir2;
+		return globalIndex;
+	}
+/*
+	static synchronized void clear()
+		throws IOException
+	{
+		new IndexWriter(dir,analyzer,true,IndexWriter.MaxFieldLength.LIMITED).close();
+	}
+*/
+	/**
+	 * Tells whether a reindex thread has been started and is still alive.
+	 *
+	 * @return false if startReindexing() was never called or its thread has finished
+	 */
+	public static boolean isReindexing() {
+		if( thread == null )
+			return false;
+		return thread.isAlive();
+	}
+
+	public static void startReindexing(){
+		status = "starting reindex";
+		thread = new Thread(new Runnable(){public void run(){
+			try {
+				reindex();
+			} catch(SQLException e) {
+				logger.error("",e);
+				status = "Error: " + e.getMessage();
+			} catch(IOException e) {
+				logger.error("",e);
+				status = "Error: " + e.getMessage();
+			} catch(RuntimeException e) {
+				logger.error("",e);
+				status = "Error: " + e.getMessage();
+			}
+		}},"reindex");
+		thread.start();
+	}
+
+	static synchronized void reindex()
+		throws SQLException, IOException
+	{
+		if( !skipReindex1 )
+			reindex1();
+		status = "done indexing servers";
+		logger.info("reindex2");
+		reindex2();
+		logger.info("done reindexing");
+		status = "done reindexing";
+	}
+
+	private static void reindex1()
+		throws SQLException, IOException
+	{
+		IndexWriter indexWriter = new IndexWriter(dir1,analyzer,true,IndexWriter.MaxFieldLength.LIMITED);
+		try {
+			for( Server server : Server.getServers() ) {
+				reindex1(server,indexWriter);
+			}
+		} finally {
+			indexWriter.close();
+		}
+	}
+/*
+	public static synchronized void removeBySubject(String subject)
+		throws IOException
+	{
+		Term term = new Term( SUBJECT_FLD, subject );
+		IndexWriter indexWriter = new IndexWriter(dir2,analyzer,IndexWriter.MaxFieldLength.LIMITED);
+		indexWriter.deleteDocuments(term);
+		indexWriter.close();
+	}
+*/
+	/**
+	 * Adds all sites of one server to the raw index.
+	 * <p>
+	 * The site ids are first collected from <code>global.site_global</code>; the connection
+	 * used for that is released before the per-site work starts.  Each site is then indexed
+	 * through {@link #update}, and the progress is written to the log and to {@link #status}.
+	 *
+	 * @param server the server whose sites are indexed
+	 * @param indexWriter writer of the raw index; not closed by this method
+	 * @throws SQLException if listing the sites or indexing a site fails
+	 * @throws IOException if writing to the index fails
+	 */
+	private static synchronized void reindex1(Server server,IndexWriter indexWriter)
+		throws SQLException, IOException
+	{
+		logger.info("reindex "+server.name);
+		status = "reindexing "+server.name;
+
+		List<Long> siteIds = new ArrayList<Long>();
+		Connection con = server.getConnection();
+		try {
+			Statement stmt = con.createStatement();
+			try {
+				ResultSet rs = stmt.executeQuery(
+					"select site_id from global.site_global"
+				);
+				try {
+					while( rs.next() ) {
+						long siteId = rs.getLong("site_id");
+						siteIds.add(siteId);
+					}
+				} finally {
+					rs.close();
+				}
+			} finally {
+				stmt.close();
+			}
+		} finally {
+			// released even when the query fails, so a broken server does not leak connections
+			con.close();
+		}
+		final int n = siteIds.size();
+		int count = 0;
+		for( long siteId : siteIds ) {
+			update(indexWriter,server,siteId);
+			count++;
+			logger.info("reindexed "+count+" of "+n+" sites on "+server.name);
+			status = "reindexed "+count+" of "+n+" sites on "+server.name;
+		}
+	}
+
+	/**
+	 * Reads the data of one site from its server and adds it as one document to the raw index.
+	 * <p>
+	 * If the query returns no row, an error is logged and nothing is added.  Every
+	 * SQLException is logged; it is swallowed only when its message mentions both "schema"
+	 * and "does not exist" (the site's schema is missing), otherwise it is rethrown.
+	 * The result set, statement and connection are closed on every path.
+	 *
+	 * @param indexWriter writer of the raw index; not closed by this method
+	 * @param server the server that hosts the site
+	 * @param siteId id of the site; its schema name is "s" followed by this id
+	 * @throws IOException if adding the document to the index fails
+	 * @throws SQLException if the database access fails for a reason other than a missing schema
+	 */
+	private static void update(IndexWriter indexWriter,Server server,long siteId)
+		throws IOException, SQLException
+	{
+		Connection con = server.getConnection();
+		try {
+			String schema = "s" + siteId;
+			Document doc;
+			Statement stmt = con.createStatement();
+			try {
+				ResultSet rs = stmt.executeQuery(
+					"select site_global.*, site.*, node.*, node_msg.message, priv.label as priv, user_.email, tweak.tweaks"
+					+", (select count(*) from " + schema + ".file_node) as file_node_count"
+					+" from global.site_global"
+						+", " + schema + ".site"
+						+" join " + schema + ".node on site.root_node_id = node.node_id"
+						+" join " + schema + ".node_msg on site.root_node_id = node_msg.node_id"
+						+" left join " + schema + ".tag as priv on site.root_node_id = priv.node_id and priv.user_id is null and priv.label='permission:View'"
+						+" left join " + schema + ".user_ on node.owner_id = user_.user_id"
+						+" , (select string_agg(content,' ') as tweaks from " + schema + ".tweak) as tweak"
+						+" where site_global.site_id = " + siteId
+				);
+				try {
+					if( !rs.next() ) {
+						logger.error("site not found: "+siteId);
+						return;
+					}
+					doc = rawDocument(rs,server,siteId);
+				} finally {
+					rs.close();
+				}
+			} finally {
+				stmt.close();
+			}
+			// as before, the document is added only after the JDBC objects have been closed
+			indexWriter.addDocument(doc);
+		} catch(SQLException e) {
+			logger.error("failed to index site "+siteId,e);
+			String msg = e.getMessage();
+			// getMessage() may be null; a null message can never denote the missing-schema case
+			boolean missingSchema = msg != null && msg.contains("schema") && msg.contains("does not exist");
+			if( !missingSchema )
+				throw e;
+		} finally {
+			con.close();
+		}
+	}
+
+	// Converts the current row of the site query into a stored-only document for the raw index.
+	// NOTE(review): subject, message and when_created are used without null checks - confirm
+	// that these columns can never be null.
+	private static Document rawDocument(ResultSet rs,Server server,long siteId)
+		throws SQLException
+	{
+		Document doc = new Document();
+		doc.add( new Field(SERVER_FLD, server.name, Field.Store.YES, Field.Index.NO) );
+		doc.add( new Field(SITE_FLD, Long.toString(siteId), Field.Store.YES, Field.Index.NO) );
+		String subject = rs.getString("subject");
+		String domain = rs.getString("custom_domain");
+		if( domain == null )
+			domain = ViewUtils.getDefaultBaseUrl(siteId,subject,server.host);
+		doc.add( new Field(DOMAIN_FLD, domain, Field.Store.YES, Field.Index.NO) );
+		doc.add( new Field(SUBJECT_FLD, subject, Field.Store.YES, Field.Index.NO) );
+		String message = rs.getString("message");
+		doc.add( new Field(MESSAGE_FLD, message, Field.Store.YES, Field.Index.NO) );
+		// the concatenation turns a null type into the literal text "null"
+		String type = "" + rs.getString("type");
+		doc.add( new Field(TYPE_FLD, type, Field.Store.YES, Field.Index.NO) );
+		int activity = rs.getInt("activity");
+		doc.add( new NumericField(ACTIVITY_FLD,Field.Store.YES,false).setIntValue(activity) );
+		String embarrassing = Boolean.toString( rs.getBoolean("is_embarrassing") );
+		doc.add( new Field(EMBARRASSING_FLD, embarrassing, Field.Store.YES, Field.Index.NO) );
+		// the site counts as private when the left-joined 'permission:View' tag row exists
+		String privS = rs.getString("priv");
+		String priv = Boolean.toString( privS != null );
+		doc.add( new Field(PRIVATE_FLD, priv, Field.Store.YES, Field.Index.NO) );
+		String email = rs.getString("email");
+		if( email != null ) {
+			doc.add( new Field(OWNER_EMAIL_FLD, email, Field.Store.YES, Field.Index.NO) );
+		}
+		int nodeCount = rs.getInt("node_count");
+		doc.add( new NumericField(NODE_COUNT_FLD,Field.Store.YES,false).setIntValue(nodeCount) );
+		Date whenCreated = rs.getTimestamp("when_created");
+		doc.add( new NumericField(WHEN_CREATED_FLD,Field.Store.YES,false).setLongValue(whenCreated.getTime()) );
+		String tweaks = rs.getString("tweaks");
+		if( tweaks != null ) {
+			doc.add( new Field(TWEAKS_FLD, tweaks, Field.Store.YES, Field.Index.NO) );
+		}
+		int fileCount = rs.getInt("file_node_count");
+		doc.add( new NumericField(FILE_COUNT_FLD,Field.Store.YES,false).setIntValue(fileCount) );
+		int monthlyViews = rs.getInt("monthly_views");
+		doc.add( new NumericField(MONTHLY_VIEWS_FLD,Field.Store.YES,false).setIntValue(monthlyViews) );
+		return doc;
+	}
+
+	// One data line of data/sites.csv: a domain, a comma, then either plain digits or a
+	// double-quoted number that may contain thousands separators.
+	private static final Pattern ptn = Pattern.compile("([^,]+),(\\d+|\"[0-9,]+\")");
+
+	/**
+	 * Loads the domain-to-sessions table from <code>data/sites.csv</code>.
+	 * <p>
+	 * Leading lines that do not look like data (for example a header) are skipped.  From the
+	 * first data line on, empty lines are ignored, a line starting with a comma ends the
+	 * table, and any other line that does not match the data pattern is an error.
+	 *
+	 * @return map from domain to session count; empty if the file does not exist or
+	 *         contains no data line
+	 * @throws IOException if reading the file fails
+	 * @throws RuntimeException if a malformed line or an unparsable number is found; the
+	 *         offending line is logged first
+	 */
+	private static Map<String,Integer> domainMap()
+		throws IOException
+	{
+		Map<String,Integer> map = new HashMap<String,Integer>();
+		File siteFile = new File("data/sites.csv");
+		if( !siteFile.exists() )
+			return map;
+		BufferedReader in = new BufferedReader(new FileReader(siteFile));
+		String line = null;
+		try {
+			line = in.readLine();
+			// skip the preamble; stop at end of file instead of matching against null
+			while( line != null && !ptn.matcher(line).matches() )
+				line = in.readLine();
+			while( line != null ) {
+				if (line.length() > 0) {
+					if( line.startsWith(",") )
+						break;
+					Matcher m = ptn.matcher(line);
+					if (!m.matches())
+						throw new RuntimeException(line);
+					String domain = m.group(1);
+					String amt = m.group(2);
+					amt = amt.replaceAll("[,\"]", "");
+					int sessions = Integer.parseInt(amt);
+					map.put(domain, sessions);
+				}
+				line = in.readLine();
+			}
+		} catch (RuntimeException e) {
+			logger.error("Error in line: " + line, e);
+			throw e;
+		} finally {
+			// close the file on every path, including failures
+			in.close();
+		}
+		return map;
+	}
+
+	/**
+	 * Second phase: recreates the index in dir2 from the stored documents of the raw index.
+	 * <p>
+	 * Each raw document is copied into a new document whose fields are indexed for searching.
+	 * Fields derived here are the owner's e-mail domain, a has-tweaks marker, the
+	 * file/node ratio and the session value looked up in {@link #domainMap()} by domain
+	 * (0 when the domain is not listed).
+	 * The reader and the writer are both closed on every path.
+	 *
+	 * @throws IOException if reading the raw index, reading the sessions file or writing
+	 *         the new index fails
+	 * @throws RuntimeException if the raw index contains deleted documents
+	 */
+	private static void reindex2()
+		throws IOException
+	{
+		Map<String,Integer> domainMap = domainMap();
+		IndexReader reader = IndexReader.open(dir1);
+		try {
+			IndexWriter indexWriter = new IndexWriter(dir2,analyzer,true,IndexWriter.MaxFieldLength.LIMITED);
+			try {
+				int n = reader.numDocs();
+				// the loop below reads documents 0..n-1 directly, which is only valid without deletions
+				if( n != reader.maxDoc() )
+					throw new RuntimeException("raw index contains deleted documents: numDocs="+n+", maxDoc="+reader.maxDoc());
+				for( int i=0; i<n; i++ ) {
+					Document data = reader.document(i);
+					Document doc = new Document();
+					doc.add( new Field(SERVER_FLD, data.get(SERVER_FLD), Field.Store.YES, Field.Index.NOT_ANALYZED_NO_NORMS) );
+					doc.add( new Field(SITE_FLD, data.get(SITE_FLD), Field.Store.YES, Field.Index.NOT_ANALYZED_NO_NORMS) );
+					String domain = data.get(DOMAIN_FLD);
+					doc.add( new Field(DOMAIN_FLD, domain, Field.Store.YES, Field.Index.NOT_ANALYZED_NO_NORMS) );
+					doc.add( new Field(SUBJECT_FLD, data.get(SUBJECT_FLD), Field.Store.YES, Field.Index.ANALYZED) );
+					doc.add( new Field(MESSAGE_FLD, data.get(MESSAGE_FLD), Field.Store.YES, Field.Index.ANALYZED) );
+					doc.add( new Field(TYPE_FLD, data.get(TYPE_FLD), Field.Store.YES, Field.Index.NOT_ANALYZED_NO_NORMS) );
+					int activity = ((NumericField)data.getFieldable(ACTIVITY_FLD)).getNumericValue().intValue();
+					doc.add( new NumericField(ACTIVITY_FLD,Field.Store.YES,true).setIntValue(activity) );
+					doc.add( new Field(EMBARRASSING_FLD, data.get(EMBARRASSING_FLD), Field.Store.YES, Field.Index.NOT_ANALYZED_NO_NORMS) );
+					doc.add( new Field(PRIVATE_FLD, data.get(PRIVATE_FLD), Field.Store.NO, Field.Index.NOT_ANALYZED_NO_NORMS) );
+					String email = data.get(OWNER_EMAIL_FLD);
+					if( email != null ) {
+						doc.add( new Field(OWNER_EMAIL_FLD, email, Field.Store.YES, Field.Index.NOT_ANALYZED_NO_NORMS) );
+						// without an '@' indexOf yields -1, so the whole address is used as the domain
+						String emailDomain = email.substring( email.indexOf('@') + 1 );
+						doc.add( new Field(OWNER_EMAIL_DOMAIN_FLD, emailDomain, Field.Store.NO, Field.Index.NOT_ANALYZED_NO_NORMS) );
+					}
+					int nodeCount = ((NumericField)data.getFieldable(NODE_COUNT_FLD)).getNumericValue().intValue();
+					doc.add( new NumericField(NODE_COUNT_FLD,Field.Store.YES,true).setIntValue(nodeCount) );
+					long whenCreated = ((NumericField)data.getFieldable(WHEN_CREATED_FLD)).getNumericValue().longValue();
+					doc.add( new NumericField(WHEN_CREATED_FLD,Field.Store.YES,false).setLongValue(whenCreated) );
+					String tweaks = data.get(TWEAKS_FLD);
+					if( tweaks != null ) {
+						doc.add( new Field(TWEAKS_FLD, tweaks, Field.Store.YES, Field.Index.NO) );
+						doc.add( new Field(HAS_TWEAKS_FLD, "true", Field.Store.NO, Field.Index.NOT_ANALYZED_NO_NORMS) );
+					}
+					int fileCount = ((NumericField)data.getFieldable(FILE_COUNT_FLD)).getNumericValue().intValue();
+					doc.add( new NumericField(FILE_COUNT_FLD,Field.Store.YES,false).setIntValue(fileCount) );
+					if( nodeCount > 0 ) {
+						// files per thousand nodes; multiply as long so large file counts cannot overflow
+						int fileNodeRatio = (int)(fileCount*1000L/nodeCount);
+						doc.add( new NumericField(FILE_NODE_RATIO_FLD,Field.Store.NO,true).setIntValue(fileNodeRatio) );
+					}
+					int monthlyViews = ((NumericField)data.getFieldable(MONTHLY_VIEWS_FLD)).getNumericValue().intValue();
+					doc.add( new NumericField(MONTHLY_VIEWS_FLD,Field.Store.YES,true).setIntValue(monthlyViews) );
+					Integer sessions = domainMap.get(domain);
+					if( sessions == null )
+						sessions = 0;
+					doc.add( new NumericField(VALUE_FLD,Field.Store.YES,true).setIntValue(sessions) );
+					indexWriter.addDocument(doc);
+				}
+			} finally {
+				indexWriter.close();
+			}
+		} finally {
+			// runs even if creating or closing the writer fails
+			reader.close();
+		}
+	}
+
+	// Instance part of the class: a Site wraps one Lucene Document and exposes its fields.
+
+	// The wrapped Lucene document; all accessors below read from it.
+	private final Document doc;
+
+	/**
+	 * Wraps the given document without copying it.
+	 *
+	 * @param doc the document whose fields the accessors read
+	 */
+	public Site(Document doc) {
+		this.doc = doc;
+	}
+
+	/** Returns the value of the document's "server" field. */
+	public String serverName() {
+		final String name = doc.get(SERVER_FLD);
+		return name;
+	}
+
+	/** Looks up the Server registered under {@link #serverName()}. */
+	public Server server() {
+		final String name = serverName();
+		return Server.getServer(name);
+	}
+
+	/** Returns the value of the document's "site" field, i.e. the site id as text. */
+	public String id() {
+		final String siteId = doc.get(SITE_FLD);
+		return siteId;
+	}
+
+	// Domains for which url() uses the https scheme; every other domain gets http.
+	private static final Set<String> https = new HashSet<String>( Arrays.asList(
+		"www.postgresql-archive.org",
+		"ffq.38.me.nabble.com"
+	) );
+
+	/**
+	 * Builds the site's root URL from the document's "domain" field.
+	 *
+	 * @return "https://" or "http://" (https only for the domains listed above),
+	 *         followed by the domain and a trailing slash
+	 */
+	public String url() {
+		String domain = doc.get(DOMAIN_FLD);
+		String scheme = https.contains(domain) ? "https" : "http";
+		return scheme + "://" + domain + "/";
+	}
+
+	/** Returns the value of the document's "subject" field. */
+	public String subject() {
+		final String text = doc.get(SUBJECT_FLD);
+		return text;
+	}
+
+	/** Returns {@link #subject()} passed through HtmlUtils.htmlEncode. */
+	public String subjectHtml() {
+		final String raw = subject();
+		return HtmlUtils.htmlEncode(raw);
+	}
+
+	/**
+	 * Builds an HTML anchor whose target is {@link #url()} and whose text is
+	 * {@link #subjectHtml()}.
+	 */
+	public String link() {
+		// NOTE(review): the url is inserted into the href attribute without HTML encoding - confirm
+		// that domains can never contain characters that need escaping.
+		final StringBuilder html = new StringBuilder();
+		html.append("<a href=\"").append(url()).append("\">");
+		html.append(subjectHtml()).append("</a>");
+		return html.toString();
+	}
+
+	/** Returns the value of the document's "message" field. */
+	public String message() {
+		final String text = doc.get(MESSAGE_FLD);
+		return text;
+	}
+
+	/** Returns the value of the document's "type" field. */
+	public String type() {
+		final String kind = doc.get(TYPE_FLD);
+		return kind;
+	}
+
+	/** Returns the int value of the document's numeric "activity" field. */
+	public int activity() {
+		final Number stored = ((NumericField)doc.getFieldable(ACTIVITY_FLD)).getNumericValue();
+		return stored.intValue();
+	}
+
+	/** Returns the int value of the document's numeric "node_count" field. */
+	public int nodeCount() {
+		final Number stored = ((NumericField)doc.getFieldable(NODE_COUNT_FLD)).getNumericValue();
+		return stored.intValue();
+	}
+
+	/**
+	 * Returns the creation time, built from the long value of the numeric "when_created"
+	 * field interpreted as milliseconds since the epoch.
+	 */
+	public Date whenCreated() {
+		final Number millis = ((NumericField)doc.getFieldable(WHEN_CREATED_FLD)).getNumericValue();
+		return new Date(millis.longValue());
+	}
+
+	/** Returns the document's "embarrassing" field parsed with Boolean.parseBoolean. */
+	public boolean isEmbarrassing() {
+		final String flag = doc.get(EMBARRASSING_FLD);
+		return Boolean.parseBoolean(flag);
+	}
+
+	/**
+	 * Returns the value of the document's "tweaks" field.  The indexing code in this class
+	 * adds that field only when the site has tweak content.
+	 */
+	public String tweaks() {
+		final String content = doc.get(TWEAKS_FLD);
+		return content;
+	}
+
+	/** Returns the int value of the document's numeric "file_count" field. */
+	public int fileCount() {
+		final Number stored = ((NumericField)doc.getFieldable(FILE_COUNT_FLD)).getNumericValue();
+		return stored.intValue();
+	}
+
+	/**
+	 * Returns the value of the document's "owner_email" field.  The indexing code in this
+	 * class adds that field only when an owner e-mail address was found.
+	 */
+	public String ownerEmail() {
+		final String address = doc.get(OWNER_EMAIL_FLD);
+		return address;
+	}
+
+	/** Returns the int value of the document's numeric "monthly_views" field. */
+	public int monthlyViews() {
+		final Number stored = ((NumericField)doc.getFieldable(MONTHLY_VIEWS_FLD)).getNumericValue();
+		return stored.intValue();
+	}
+
+	/**
+	 * Returns the int value of the document's numeric "value" field, which reindex2() fills
+	 * with the session count found for the site's domain (0 if none).
+	 */
+	public int value() {
+		final Number stored = ((NumericField)doc.getFieldable(VALUE_FLD)).getNumericValue();
+		return stored.intValue();
+	}
+
+}