view src/global/Site.java @ 28:03e68185c2f5

block spam forums
author Franklin Schmidt <fschmidt@gmail.com>
date Thu, 02 Jul 2020 21:19:24 -0600
parents abe0694e9849
children 56accc959f8c
line wrap: on
line source

package global;

import fschmidt.util.java.HtmlUtils;
import nabble.view.lib.ViewUtils;
import nabble.model.Init;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.snowball.SnowballAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.NumericField;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.Sort;
import org.apache.lucene.search.SortField;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.Version;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.File;
import java.io.IOException;
import java.io.BufferedReader;
import java.io.FileReader;
import java.sql.Connection;
import java.sql.ResultSet;
import java.sql.SQLException;
import java.sql.Statement;
import java.util.ArrayList;
import java.util.Date;
import java.util.List;
import java.util.Map;
import java.util.HashMap;
import java.util.Set;
import java.util.HashSet;
import java.util.Arrays;
import java.util.regex.Pattern;
import java.util.regex.Matcher;


public final class Site {
	private static final Logger logger = LoggerFactory.getLogger(Site.class);

	// Human-readable progress/error text for the reindex job. Written by
	// startReindexing(), reindex() and reindex1(); volatile so that other
	// threads reading it see the latest value.
	public static volatile String status = "";

	// Names of the Lucene document fields written by the indexing methods below.
	public static final String SERVER_FLD = "server";
	public static final String SITE_FLD = "site";
	public static final String DOMAIN_FLD = "domain";
	public static final String SUBJECT_FLD = "subject";
	public static final String MESSAGE_FLD = "message";
	public static final String TYPE_FLD = "type";
	public static final String ACTIVITY_FLD = "activity";
	public static final String EMBARRASSING_FLD = "embarrassing";
	public static final String PRIVATE_FLD = "private";
	public static final String OWNER_EMAIL_FLD = "owner_email";
	public static final String OWNER_EMAIL_DOMAIN_FLD = "owner_email_domain";
	public static final String NODE_COUNT_FLD = "node_count";
	public static final String WHEN_CREATED_FLD = "when_created";
	public static final String TWEAKS_FLD = "tweaks";
	public static final String HAS_TWEAKS_FLD = "has_tweaks";
	public static final String FILE_COUNT_FLD = "file_count";
	public static final String FILE_NODE_RATIO_FLD = "file_node_ratio";
	public static final String MONTHLY_VIEWS_FLD = "monthly_views";
	public static final String VALUE_FLD = "value";

	// Predefined sort orders on int fields; the third SortField argument
	// (true) requests reversed, i.e. descending, order.
	public static final Sort SORT_BY_ACTIVITY = new Sort(new SortField(ACTIVITY_FLD, SortField.INT, true));
	public static final Sort SORT_BY_FILE_NODE_RATIO = new Sort(new SortField(FILE_NODE_RATIO_FLD, SortField.INT, true));
	// Sorts by value first, then by node count as the tie-breaker.
	public static final Sort SORT_BY_VALUE = new Sort(
		new SortField(VALUE_FLD, SortField.INT, true),
		new SortField(NODE_COUNT_FLD, SortField.INT, true)
	);

	// Analyzer shared by both IndexWriters in this class.
	// NOTE(review): Version.LUCENE_CURRENT follows whatever Lucene jar is on
	// the classpath, so analysis behavior may change on a library upgrade — confirm this is intended.
	public static final Analyzer analyzer = new SnowballAnalyzer(Version.LUCENE_CURRENT,"English");
	// dir1: the "lucene_raw" index, written by reindex1() and read by reindex2().
	private static final FSDirectory dir1;
	// dir2: the "lucene_global" index, written by reindex2() and returned by dir().
	private static final FSDirectory dir2;
	// The most recently started reindex thread, or null if none has been started.
	private static volatile Thread thread = null;
	// When true, reindex() skips reindex1() and only runs reindex2().
	// Presumably the second argument to Init.get is the default value — verify against Init.
	private static final boolean skipReindex1 = Init.get("skipReindex1",false);

	static {
		try {
			String localDir = (String)Init.get("home_dir")+"local/";
			dir1 = FSDirectory.open(new File(localDir+"lucene_raw"));
			dir2 = FSDirectory.open(new File(localDir+"lucene_global"));
		} catch(IOException e) {
			throw new RuntimeException(e);
		}
	}

	/**
	 * Returns the directory of the "lucene_global" index, i.e. the index that
	 * reindex2() writes.
	 *
	 * @return the shared FSDirectory instance opened during class initialization
	 */
	public static FSDirectory dir() {
		return dir2;
	}
/*
	static synchronized void clear()
		throws IOException
	{
		new IndexWriter(dir,analyzer,true,IndexWriter.MaxFieldLength.LIMITED).close();
	}
*/
	/**
	 * Reports whether a reindex thread has been started and is still running.
	 *
	 * @return false if no reindex was ever started or the last one has
	 *         finished; true while its thread is alive
	 */
	public static boolean isReindexing() {
		if( thread == null ) {
			return false;
		}
		return thread.isAlive();
	}

	/**
	 * Starts a full reindex on a new background thread named "reindex" and
	 * returns immediately. Progress and failures are published through
	 * {@link #status}; SQLException, IOException and RuntimeException raised
	 * by the job are logged and not propagated.
	 */
	public static void startReindexing(){
		status = "starting reindex";
		final Runnable job = new Runnable(){
			public void run(){
				try {
					reindex();
				} catch(SQLException e) {
					reindexFailed(e);
				} catch(IOException e) {
					reindexFailed(e);
				} catch(RuntimeException e) {
					reindexFailed(e);
				}
			}
		};
		thread = new Thread(job,"reindex");
		thread.start();
	}

	// Logs a reindex failure and exposes its message through the status field.
	private static void reindexFailed(Exception e) {
		logger.error("",e);
		status = "Error: " + e.getMessage();
	}

	/**
	 * Runs the complete reindex: first rebuilds the raw index from the servers
	 * (unless skipReindex1 is set), then rebuilds the global index from the
	 * raw one. Synchronized on the class, so only one run executes at a time.
	 *
	 * @throws SQLException if reading site data from a server fails
	 * @throws IOException if reading or writing an index fails
	 */
	static synchronized void reindex()
		throws SQLException, IOException
	{
		final boolean rebuildRawIndex = !skipReindex1;
		if( rebuildRawIndex ) {
			reindex1();
		}
		status = "done indexing servers";

		logger.info("reindex2");
		reindex2();

		logger.info("done reindexing");
		status = "done reindexing";
	}

	/**
	 * Rebuilds the raw index (dir1) from scratch: the writer is opened with
	 * create=true, then every server returned by Server.getServers() is
	 * indexed into it. The writer is closed even if indexing fails.
	 *
	 * @throws SQLException if reading site data from a server fails
	 * @throws IOException if writing the index fails
	 */
	private static void reindex1()
		throws SQLException, IOException
	{
		final IndexWriter rawWriter = new IndexWriter(dir1,analyzer,true,IndexWriter.MaxFieldLength.LIMITED);
		try {
			for( Server srv : Server.getServers() )
				reindex1(srv,rawWriter);
		} finally {
			rawWriter.close();
		}
	}
/*
	public static synchronized void removeBySubject(String subject)
		throws IOException
	{
		Term term = new Term( SUBJECT_FLD, subject );
		IndexWriter indexWriter = new IndexWriter(dir2,analyzer,IndexWriter.MaxFieldLength.LIMITED);
		indexWriter.deleteDocuments(term);
		indexWriter.close();
	}
*/
	/**
	 * Indexes every site of one server into the raw index.
	 *
	 * All site ids are first read from global.site_global and the connection
	 * is released; each site is then indexed through
	 * {@link #update(IndexWriter,Server,long)}, which opens its own connection.
	 * Progress is logged and published through {@link #status} after each site.
	 *
	 * @param server the server whose sites are indexed
	 * @param indexWriter writer for the raw index; not closed by this method
	 * @throws SQLException if listing or reading sites fails
	 * @throws IOException if writing to the index fails
	 */
	private static synchronized void reindex1(Server server,IndexWriter indexWriter)
		throws SQLException, IOException
	{
		logger.info("reindex "+server.name);
		status = "reindexing "+server.name;

		List<Long> siteIds = new ArrayList<Long>();
		// Nested try/finally so the result set, statement and connection are
		// released even when the query or the iteration throws.
		Connection con = server.getConnection();
		try {
			Statement stmt = con.createStatement();
			try {
				ResultSet rs = stmt.executeQuery(
					"select site_id from global.site_global"
				);
				try {
					while( rs.next() ) {
						long siteId = rs.getLong("site_id");
						siteIds.add(siteId);
					}
				} finally {
					rs.close();
				}
			} finally {
				stmt.close();
			}
		} finally {
			con.close();
		}

		final int n = siteIds.size();
		int count = 0;
		for( long siteId : siteIds ) {
			update(indexWriter,server,siteId);
			count++;
			logger.info("reindexed "+count+" of "+n+" sites on "+server.name);
			status = "reindexed "+count+" of "+n+" sites on "+server.name;
		}
	}

	/**
	 * Reads one site's data from its server and adds it to the raw index as a
	 * single stored-only document.
	 *
	 * If the query returns no row, an error is logged and nothing is added.
	 * Any SQLException is logged; one whose message mentions both "schema"
	 * and "does not exist" is then swallowed (presumably so that a site whose
	 * schema has been dropped does not abort the whole reindex), every other
	 * SQLException is rethrown.
	 *
	 * @param indexWriter writer for the raw index; not closed by this method
	 * @param server the server hosting the site
	 * @param siteId id of the site; its tables live in schema "s" + siteId
	 * @throws IOException if adding the document to the index fails
	 * @throws SQLException on database errors other than a missing schema
	 */
	private static void update(IndexWriter indexWriter,Server server,long siteId)
		throws IOException, SQLException
	{
		Connection con = server.getConnection();
		try {
			Document doc;
			Statement stmt = con.createStatement();
			try {
				ResultSet rs = stmt.executeQuery( siteQuery(siteId) );
				try {
					if( !rs.next() ) {
						logger.error("site not found: "+siteId);
						return;
					}
					doc = rawDocument(server,siteId,rs);
				} finally {
					// also reached on the early return above and on exceptions
					rs.close();
				}
			} finally {
				stmt.close();
			}
			indexWriter.addDocument(doc);
		} catch(SQLException e) {
			logger.error("failed to index site "+siteId,e);
			// getMessage() may be null; a null message is never a "missing schema" error
			String msg = e.getMessage();
			boolean missingSchema = msg != null && msg.contains("schema") && msg.contains("does not exist");
			if( !missingSchema )
				throw e;
		} finally {
			con.close();
		}
	}

	// Builds the single-row query that joins the global site record with the
	// site's own schema (root node, its message, view-permission tag, owner, tweaks, file count).
	private static String siteQuery(long siteId) {
		String schema = "s" + siteId;
		return
			"select site_global.*, site.*, node.*, node_msg.message, priv.label as priv, user_.email, tweak.tweaks"
			+", (select count(*) from " + schema + ".file_node) as file_node_count"
			+" from global.site_global"
				+", " + schema + ".site"
				+" join " + schema + ".node on site.root_node_id = node.node_id"
				+" join " + schema + ".node_msg on site.root_node_id = node_msg.node_id"
				+" left join " + schema + ".tag as priv on site.root_node_id = priv.node_id and priv.user_id is null and priv.label='permission:View'"
				+" left join " + schema + ".user_ on node.owner_id = user_.user_id"
				+" , (select string_agg(content,' ') as tweaks from " + schema + ".tweak) as tweak"
				+" where site_global.site_id = " + siteId
		;
	}

	// Converts the current row of rs into a raw-index document. All fields are
	// stored but not indexed; reindex2() decides later how each one is indexed.
	// NOTE(review): assumes subject, message and when_created are never null in the database — confirm.
	private static Document rawDocument(Server server,long siteId,ResultSet rs)
		throws SQLException
	{
		Document doc = new Document();
		doc.add( new Field(SERVER_FLD, server.name, Field.Store.YES, Field.Index.NO) );
		doc.add( new Field(SITE_FLD, Long.toString(siteId), Field.Store.YES, Field.Index.NO) );
		String subject = rs.getString("subject");
		String domain = rs.getString("custom_domain");
		if( domain == null )
			domain = ViewUtils.getDefaultBaseUrl(siteId,subject,server.host);
		doc.add( new Field(DOMAIN_FLD, domain, Field.Store.YES, Field.Index.NO) );
		doc.add( new Field(SUBJECT_FLD, subject, Field.Store.YES, Field.Index.NO) );
		String message = rs.getString("message");
		doc.add( new Field(MESSAGE_FLD, message, Field.Store.YES, Field.Index.NO) );
		// "" + ... turns a SQL NULL type into the string "null" instead of a null reference
		String type = "" + rs.getString("type");
		doc.add( new Field(TYPE_FLD, type, Field.Store.YES, Field.Index.NO) );
		int activity = rs.getInt("activity");
		doc.add( new NumericField(ACTIVITY_FLD,Field.Store.YES,false).setIntValue(activity) );
		String embarrassing = Boolean.toString( rs.getBoolean("is_embarrassing") );
		doc.add( new Field(EMBARRASSING_FLD, embarrassing, Field.Store.YES, Field.Index.NO) );
		// the site counts as private when the 'permission:View' tag row exists
		String privS = rs.getString("priv");
		String priv = Boolean.toString( privS != null );
		doc.add( new Field(PRIVATE_FLD, priv, Field.Store.YES, Field.Index.NO) );
		String email = rs.getString("email");
		if( email != null ) {
			doc.add( new Field(OWNER_EMAIL_FLD, email, Field.Store.YES, Field.Index.NO) );
		}
		int nodeCount = rs.getInt("node_count");
		doc.add( new NumericField(NODE_COUNT_FLD,Field.Store.YES,false).setIntValue(nodeCount) );
		Date whenCreated = rs.getTimestamp("when_created");
		doc.add( new NumericField(WHEN_CREATED_FLD,Field.Store.YES,false).setLongValue(whenCreated.getTime()) );
		String tweaks = rs.getString("tweaks");
		if( tweaks != null ) {
			doc.add( new Field(TWEAKS_FLD, tweaks, Field.Store.YES, Field.Index.NO) );
		}
		int fileCount = rs.getInt("file_node_count");
		doc.add( new NumericField(FILE_COUNT_FLD,Field.Store.YES,false).setIntValue(fileCount) );
		int monthlyViews = rs.getInt("monthly_views");
		doc.add( new NumericField(MONTHLY_VIEWS_FLD,Field.Store.YES,false).setIntValue(monthlyViews) );
		return doc;
	}

	// One CSV data row: "<domain>,<count>", where the count is either plain
	// digits or a double-quoted number with thousands separators (e.g. "1,234").
	private static final Pattern ptn = Pattern.compile("([^,]+),(\\d+|\"[0-9,]+\")");

	/**
	 * Loads the per-domain session counts from data/sites.csv.
	 *
	 * Leading lines that are not data rows are skipped. From the first data
	 * row on, empty lines are ignored, a line starting with "," ends the data
	 * section, and any other line that is not a data row is an error.
	 *
	 * @return map from domain to session count; empty if the file does not
	 *         exist or contains no data rows
	 * @throws IOException if the file cannot be read
	 * @throws RuntimeException if a line inside the data section is malformed
	 *         (including a count that does not fit in an int); the offending
	 *         line is logged first
	 */
	private static Map<String,Integer> domainMap()
		throws IOException
	{
		Map<String,Integer> map = new HashMap<String,Integer>();
		File siteFile = new File("data/sites.csv");
		if( !siteFile.exists() )
			return map;
		BufferedReader in = new BufferedReader(new FileReader(siteFile));
		String line = null;
		try {
			// Skip the preamble. readLine() returns null at end of file, so an
			// empty or header-only file must end this loop instead of being matched.
			line = in.readLine();
			while( line != null && !ptn.matcher(line).matches() )
				line = in.readLine();
			while( line != null ) {
				if( line.length() > 0 ) {
					if( line.startsWith(",") )
						break;
					Matcher m = ptn.matcher(line);
					if( !m.matches() )
						throw new RuntimeException(line);
					String domain = m.group(1);
					String amt = m.group(2);
					// strip quotes and thousands separators before parsing
					amt = amt.replaceAll("[,\"]", "");
					int sessions = Integer.parseInt(amt);
					map.put(domain, sessions);
				}
				line = in.readLine();
			}
		} catch (RuntimeException e) {
			logger.error("Error in line: " + line, e);
			throw e;
		} finally {
			// release the file handle on every path, not only on success
			in.close();
		}
		return map;
	}

	/**
	 * Rebuilds the global index (dir2) from scratch out of the raw index (dir1).
	 *
	 * Every raw document is copied into a new document whose fields carry the
	 * indexing options needed for searching and sorting, plus derived fields:
	 * owner e-mail domain, has-tweaks flag, file/node ratio and the session
	 * count ("value") looked up by domain in data/sites.csv.
	 *
	 * @throws IOException if reading the raw index, reading the CSV or writing
	 *         the global index fails
	 * @throws RuntimeException if the raw index contains deleted documents
	 */
	private static void reindex2()
		throws IOException
	{
		Map<String,Integer> domainMap = domainMap();
		IndexReader reader = IndexReader.open(dir1);
		try {
			// opened inside the reader's try so the reader is closed if this constructor throws
			IndexWriter indexWriter = new IndexWriter(dir2,analyzer,true,IndexWriter.MaxFieldLength.LIMITED);
			try {
				int n = reader.numDocs();
				// numDocs == maxDoc means there are no deletions, so every doc id in 0..n-1 is readable
				if( n != reader.maxDoc() )
					throw new RuntimeException("raw index contains deleted documents: numDocs="+n+", maxDoc="+reader.maxDoc());
				for( int i=0; i<n; i++ ) {
					indexWriter.addDocument( globalDocument(reader.document(i),domainMap) );
				}
			} finally {
				indexWriter.close();
			}
		} finally {
			// runs even if closing the writer failed
			reader.close();
		}
	}

	// Builds the global-index document for one raw-index document.
	private static Document globalDocument(Document data,Map<String,Integer> domainMap) {
		Document doc = new Document();
		doc.add( new Field(SERVER_FLD, data.get(SERVER_FLD), Field.Store.YES, Field.Index.NOT_ANALYZED_NO_NORMS) );
		doc.add( new Field(SITE_FLD, data.get(SITE_FLD), Field.Store.YES, Field.Index.NOT_ANALYZED_NO_NORMS) );
		String domain = data.get(DOMAIN_FLD);
		doc.add( new Field(DOMAIN_FLD, domain, Field.Store.YES, Field.Index.NOT_ANALYZED_NO_NORMS) );
		doc.add( new Field(SUBJECT_FLD, data.get(SUBJECT_FLD), Field.Store.YES, Field.Index.ANALYZED) );
		doc.add( new Field(MESSAGE_FLD, data.get(MESSAGE_FLD), Field.Store.YES, Field.Index.ANALYZED) );
		doc.add( new Field(TYPE_FLD, data.get(TYPE_FLD), Field.Store.YES, Field.Index.NOT_ANALYZED_NO_NORMS) );
		int activity = storedNumber(data,ACTIVITY_FLD).intValue();
		doc.add( new NumericField(ACTIVITY_FLD,Field.Store.YES,true).setIntValue(activity) );
		doc.add( new Field(EMBARRASSING_FLD, data.get(EMBARRASSING_FLD), Field.Store.YES, Field.Index.NOT_ANALYZED_NO_NORMS) );
		doc.add( new Field(PRIVATE_FLD, data.get(PRIVATE_FLD), Field.Store.NO, Field.Index.NOT_ANALYZED_NO_NORMS) );
		String email = data.get(OWNER_EMAIL_FLD);
		if( email != null ) {
			doc.add( new Field(OWNER_EMAIL_FLD, email, Field.Store.YES, Field.Index.NOT_ANALYZED_NO_NORMS) );
			// without an '@' indexOf returns -1 and the whole address is used
			String emailDomain = email.substring( email.indexOf('@') + 1 );
			doc.add( new Field(OWNER_EMAIL_DOMAIN_FLD, emailDomain, Field.Store.NO, Field.Index.NOT_ANALYZED_NO_NORMS) );
		}
		int nodeCount = storedNumber(data,NODE_COUNT_FLD).intValue();
		doc.add( new NumericField(NODE_COUNT_FLD,Field.Store.YES,true).setIntValue(nodeCount) );
		long whenCreated = storedNumber(data,WHEN_CREATED_FLD).longValue();
		doc.add( new NumericField(WHEN_CREATED_FLD,Field.Store.YES,false).setLongValue(whenCreated) );
		String tweaks = data.get(TWEAKS_FLD);
		if( tweaks != null ) {
			doc.add( new Field(TWEAKS_FLD, tweaks, Field.Store.YES, Field.Index.NO) );
			doc.add( new Field(HAS_TWEAKS_FLD, "true", Field.Store.NO, Field.Index.NOT_ANALYZED_NO_NORMS) );
		}
		int fileCount = storedNumber(data,FILE_COUNT_FLD).intValue();
		doc.add( new NumericField(FILE_COUNT_FLD,Field.Store.YES,false).setIntValue(fileCount) );
		if( nodeCount > 0 ) {
			// files per 1000 nodes; computed in long because fileCount*1000 can overflow int
			long ratio = (long)fileCount*1000/nodeCount;
			int fileNodeRatio = (int)Math.min(ratio,(long)Integer.MAX_VALUE);
			doc.add( new NumericField(FILE_NODE_RATIO_FLD,Field.Store.NO,true).setIntValue(fileNodeRatio) );
		}
		int monthlyViews = storedNumber(data,MONTHLY_VIEWS_FLD).intValue();
		doc.add( new NumericField(MONTHLY_VIEWS_FLD,Field.Store.YES,true).setIntValue(monthlyViews) );
		// domains missing from the CSV get a value of 0
		Integer sessions = domainMap.get(domain);
		if( sessions == null )
			sessions = 0;
		doc.add( new NumericField(VALUE_FLD,Field.Store.YES,true).setIntValue(sessions) );
		return doc;
	}

	// Reads the numeric value of a stored NumericField from a loaded document.
	private static Number storedNumber(Document data,String fieldName) {
		return ((NumericField)data.getFieldable(fieldName)).getNumericValue();
	}

	// Instance members: each Site wraps one Lucene Document and exposes its fields.

	// The wrapped Lucene document; every accessor below reads from it.
	private final Document doc;

	/**
	 * Wraps a Lucene document so its fields can be read through typed accessors.
	 *
	 * @param doc the document to read from; accessors expect the fields they
	 *        read to be present as stored fields
	 */
	public Site(Document doc) {
		this.doc = doc;
	}

	// Returns the numeric value of the named field; the field must be a NumericField.
	private Number numberField(String fieldName) {
		NumericField fld = (NumericField)doc.getFieldable(fieldName);
		return fld.getNumericValue();
	}

	// Returns the named numeric field as an int.
	private int intField(String fieldName) {
		return numberField(fieldName).intValue();
	}

	/** Returns the name of the server hosting this site. */
	public String serverName() {
		return doc.get(SERVER_FLD);
	}

	/** Looks up the Server registered under {@link #serverName()}. */
	public Server server() {
		return Server.getServer(serverName());
	}

	/** Returns the site id as the string stored in the document. */
	public String id() {
		return doc.get(SITE_FLD);
	}

	// Domains whose links are generated with the https scheme; all others use http.
	private static final Set<String> https = new HashSet<String>( Arrays.asList(
		"www.postgresql-archive.org",
		"ffq.38.me.nabble.com"
	) );

	/** Returns the site's root URL: scheme, "://", the stored domain and a trailing "/". */
	public String url() {
		String domain = doc.get(DOMAIN_FLD);
		String scheme = https.contains(domain) ? "https" : "http";
		return scheme + "://" + domain + "/";
	}

	/** Returns the site's subject as stored (not HTML-encoded). */
	public String subject() {
		return doc.get(SUBJECT_FLD);
	}

	/** Returns the subject passed through HtmlUtils.htmlEncode. */
	public String subjectHtml() {
		return HtmlUtils.htmlEncode(subject());
	}

	/**
	 * Returns an HTML anchor linking to {@link #url()} with the encoded subject as text.
	 * NOTE(review): the URL is inserted into the href attribute without
	 * encoding — confirm that stored domains can never contain quote characters.
	 */
	public String link() {
		return "<a href=\"" + url() + "\">" + subjectHtml() + "</a>";
	}

	/** Returns the stored message text. */
	public String message() {
		return doc.get(MESSAGE_FLD);
	}

	/** Returns the stored type string. */
	public String type() {
		return doc.get(TYPE_FLD);
	}

	/** Returns the stored activity value. */
	public int activity() {
		return intField(ACTIVITY_FLD);
	}

	/** Returns the stored node count. */
	public int nodeCount() {
		return intField(NODE_COUNT_FLD);
	}

	/** Returns the creation time, stored as milliseconds since the epoch. */
	public Date whenCreated() {
		return new Date(numberField(WHEN_CREATED_FLD).longValue());
	}

	/** Returns true only if the stored flag parses as "true". */
	public boolean isEmbarrassing() {
		return Boolean.parseBoolean( doc.get(EMBARRASSING_FLD) );
	}

	/** Returns the stored tweaks text, or null if the document has none. */
	public String tweaks() {
		return doc.get(TWEAKS_FLD);
	}

	/** Returns the stored file count. */
	public int fileCount() {
		return intField(FILE_COUNT_FLD);
	}

	/** Returns the owner's e-mail address, or null if the document has none. */
	public String ownerEmail() {
		return doc.get(OWNER_EMAIL_FLD);
	}

	/** Returns the stored monthly view count. */
	public int monthlyViews() {
		return intField(MONTHLY_VIEWS_FLD);
	}

	/** Returns the stored value (the session count assigned by reindex2). */
	public int value() {
		return intField(VALUE_FLD);
	}

}