comparison src/global/Site.java @ 0:7ecd1a4ef557

add content
author Franklin Schmidt <fschmidt@gmail.com>
date Thu, 21 Mar 2019 19:15:52 -0600
parents
children abe0694e9849
comparison
equal deleted inserted replaced
-1:000000000000 0:7ecd1a4ef557
1 package global;
2
3 import fschmidt.util.java.HtmlUtils;
4 import nabble.view.lib.ViewUtils;
5 import nabble.model.Init;
6 import org.apache.lucene.analysis.Analyzer;
7 import org.apache.lucene.analysis.snowball.SnowballAnalyzer;
8 import org.apache.lucene.document.Document;
9 import org.apache.lucene.document.Field;
10 import org.apache.lucene.document.NumericField;
11 import org.apache.lucene.index.IndexReader;
12 import org.apache.lucene.index.IndexWriter;
13 import org.apache.lucene.index.Term;
14 import org.apache.lucene.search.Sort;
15 import org.apache.lucene.search.SortField;
16 import org.apache.lucene.store.FSDirectory;
17 import org.apache.lucene.util.Version;
18 import org.slf4j.Logger;
19 import org.slf4j.LoggerFactory;
20
21 import java.io.File;
22 import java.io.IOException;
23 import java.io.BufferedReader;
24 import java.io.FileReader;
25 import java.sql.Connection;
26 import java.sql.ResultSet;
27 import java.sql.SQLException;
28 import java.sql.Statement;
29 import java.util.ArrayList;
30 import java.util.Date;
31 import java.util.List;
32 import java.util.Map;
33 import java.util.HashMap;
34 import java.util.Set;
35 import java.util.HashSet;
36 import java.util.Arrays;
37 import java.util.regex.Pattern;
38 import java.util.regex.Matcher;
39
40
41 public final class Site {
42 private static final Logger logger = LoggerFactory.getLogger(Site.class);
43
44 public static volatile String status = "";
45
46 public static final String SERVER_FLD = "server";
47 public static final String SITE_FLD = "site";
48 public static final String DOMAIN_FLD = "domain";
49 public static final String SUBJECT_FLD = "subject";
50 public static final String MESSAGE_FLD = "message";
51 public static final String TYPE_FLD = "type";
52 public static final String ACTIVITY_FLD = "activity";
53 public static final String EMBARRASSING_FLD = "embarrassing";
54 public static final String PRIVATE_FLD = "private";
55 public static final String OWNER_EMAIL_FLD = "owner_email";
56 public static final String OWNER_EMAIL_DOMAIN_FLD = "owner_email_domain";
57 public static final String NODE_COUNT_FLD = "node_count";
58 public static final String WHEN_CREATED_FLD = "when_created";
59 public static final String TWEAKS_FLD = "tweaks";
60 public static final String HAS_TWEAKS_FLD = "has_tweaks";
61 public static final String FILE_COUNT_FLD = "file_count";
62 public static final String FILE_NODE_RATIO_FLD = "file_node_ratio";
63 public static final String MONTHLY_VIEWS_FLD = "monthly_views";
64 public static final String VALUE_FLD = "value";
65
66 public static final Sort SORT_BY_ACTIVITY = new Sort(new SortField(ACTIVITY_FLD, SortField.INT, true));
67 public static final Sort SORT_BY_FILE_NODE_RATIO = new Sort(new SortField(FILE_NODE_RATIO_FLD, SortField.INT, true));
68 public static final Sort SORT_BY_VALUE = new Sort(
69 new SortField(VALUE_FLD, SortField.INT, true),
70 new SortField(NODE_COUNT_FLD, SortField.INT, true)
71 );
72
73 public static final Analyzer analyzer = new SnowballAnalyzer(Version.LUCENE_CURRENT,"English");
74 private static final FSDirectory dir1;
75 private static final FSDirectory dir2;
76 private static volatile Thread thread = null;
77 private static final boolean skipReindex1 = Init.get("skipReindex1",false);
78
79 static {
80 try {
81 String localDir = (String)Init.get("local_dir");
82 dir1 = FSDirectory.open(new File(localDir+"lucene_raw"));
83 dir2 = FSDirectory.open(new File(localDir+"lucene_global"));
84 } catch(IOException e) {
85 throw new RuntimeException(e);
86 }
87 }
88
89 public static FSDirectory dir() {
90 return dir2;
91 }
92 /*
93 static synchronized void clear()
94 throws IOException
95 {
96 new IndexWriter(dir,analyzer,true,IndexWriter.MaxFieldLength.LIMITED).close();
97 }
98 */
99 public static boolean isReindexing() {
100 return thread != null && thread.isAlive();
101 }
102
103 public static void startReindexing(){
104 status = "starting reindex";
105 thread = new Thread(new Runnable(){public void run(){
106 try {
107 reindex();
108 } catch(SQLException e) {
109 logger.error("",e);
110 status = "Error: " + e.getMessage();
111 } catch(IOException e) {
112 logger.error("",e);
113 status = "Error: " + e.getMessage();
114 } catch(RuntimeException e) {
115 logger.error("",e);
116 status = "Error: " + e.getMessage();
117 }
118 }},"reindex");
119 thread.start();
120 }
121
122 static synchronized void reindex()
123 throws SQLException, IOException
124 {
125 if( !skipReindex1 )
126 reindex1();
127 status = "done indexing servers";
128 logger.info("reindex2");
129 reindex2();
130 logger.info("done reindexing");
131 status = "done reindexing";
132 }
133
134 private static void reindex1()
135 throws SQLException, IOException
136 {
137 IndexWriter indexWriter = new IndexWriter(dir1,analyzer,true,IndexWriter.MaxFieldLength.LIMITED);
138 try {
139 for( Server server : Server.getServers() ) {
140 reindex1(server,indexWriter);
141 }
142 } finally {
143 indexWriter.close();
144 }
145 }
146 /*
147 public static synchronized void removeBySubject(String subject)
148 throws IOException
149 {
150 Term term = new Term( SUBJECT_FLD, subject );
151 IndexWriter indexWriter = new IndexWriter(dir2,analyzer,IndexWriter.MaxFieldLength.LIMITED);
152 indexWriter.deleteDocuments(term);
153 indexWriter.close();
154 }
155 */
156 private static synchronized void reindex1(Server server,IndexWriter indexWriter)
157 throws SQLException, IOException
158 {
159 logger.info("reindex "+server.name);
160 status = "reindexing "+server.name;
161
162 List<Long> siteIds = new ArrayList<Long>();
163 {
164 Connection con = server.getConnection();
165 Statement stmt = con.createStatement();
166 ResultSet rs = stmt.executeQuery(
167 "select site_id from global.site_global"
168 );
169 while( rs.next() ) {
170 long siteId = rs.getLong("site_id");
171 siteIds.add(siteId);
172 }
173 rs.close();
174 stmt.close();
175 con.close();
176 }
177 final int n = siteIds.size();
178 int count = 0;
179 for( long siteId : siteIds ) {
180 update(indexWriter,server,siteId);
181 count++;
182 logger.info("reindexed "+count+" of "+n+" sites on "+server.name);
183 status = "reindexed "+count+" of "+n+" sites on "+server.name;
184 }
185 }
186
187 private static void update(IndexWriter indexWriter,Server server,long siteId)
188 throws IOException, SQLException
189 {
190 Connection con = server.getConnection();
191 try {
192 String schema = "s" + siteId;
193 Statement stmt = con.createStatement();
194 ResultSet rs = stmt.executeQuery(
195 "select site_global.*, site.*, node.*, node_msg.message, priv.label as priv, user_.email, tweak.tweaks"
196 +", (select count(*) from " + schema + ".file_node) as file_node_count"
197 +" from global.site_global"
198 +", " + schema + ".site"
199 +" join " + schema + ".node on site.root_node_id = node.node_id"
200 +" join " + schema + ".node_msg on site.root_node_id = node_msg.node_id"
201 +" left join " + schema + ".tag as priv on site.root_node_id = priv.node_id and priv.user_id is null and priv.label='permission:View'"
202 +" left join " + schema + ".user_ on node.owner_id = user_.user_id"
203 +" , (select string_agg(content,' ') as tweaks from " + schema + ".tweak) as tweak"
204 +" where site_global.site_id = " + siteId
205 );
206 if( !rs.next() ) {
207 logger.error("site not found: "+siteId);
208 return;
209 }
210 Document doc = new Document();
211 doc.add( new Field(SERVER_FLD, server.name, Field.Store.YES, Field.Index.NO) );
212 doc.add( new Field(SITE_FLD, Long.toString(siteId), Field.Store.YES, Field.Index.NO) );
213 String subject = rs.getString("subject");
214 String domain = rs.getString("custom_domain");
215 if( domain == null )
216 domain = ViewUtils.getDefaultBaseUrl(siteId,subject,server.host);
217 doc.add( new Field(DOMAIN_FLD, domain, Field.Store.YES, Field.Index.NO) );
218 doc.add( new Field(SUBJECT_FLD, subject, Field.Store.YES, Field.Index.NO) );
219 String message = rs.getString("message");
220 doc.add( new Field(MESSAGE_FLD, message, Field.Store.YES, Field.Index.NO) );
221 String type = "" + rs.getString("type");
222 doc.add( new Field(TYPE_FLD, type, Field.Store.YES, Field.Index.NO) );
223 int activity = rs.getInt("activity");
224 doc.add( new NumericField(ACTIVITY_FLD,Field.Store.YES,false).setIntValue(activity) );
225 String embarrassing = Boolean.toString( rs.getBoolean("is_embarrassing") );
226 doc.add( new Field(EMBARRASSING_FLD, embarrassing, Field.Store.YES, Field.Index.NO) );
227 String privS = rs.getString("priv");
228 String priv = Boolean.toString( privS != null );
229 doc.add( new Field(PRIVATE_FLD, priv, Field.Store.YES, Field.Index.NO) );
230 String email = rs.getString("email");
231 if( email != null ) {
232 doc.add( new Field(OWNER_EMAIL_FLD, email, Field.Store.YES, Field.Index.NO) );
233 }
234 int nodeCount = rs.getInt("node_count");
235 doc.add( new NumericField(NODE_COUNT_FLD,Field.Store.YES,false).setIntValue(nodeCount) );
236 Date whenCreated = rs.getTimestamp("when_created");
237 doc.add( new NumericField(WHEN_CREATED_FLD,Field.Store.YES,false).setLongValue(whenCreated.getTime()) );
238 String tweaks = rs.getString("tweaks");
239 if( tweaks != null ) {
240 doc.add( new Field(TWEAKS_FLD, tweaks, Field.Store.YES, Field.Index.NO) );
241 }
242 int fileCount = rs.getInt("file_node_count");
243 doc.add( new NumericField(FILE_COUNT_FLD,Field.Store.YES,false).setIntValue(fileCount) );
244 int monthlyViews = rs.getInt("monthly_views");
245 doc.add( new NumericField(MONTHLY_VIEWS_FLD,Field.Store.YES,false).setIntValue(monthlyViews) );
246 rs.close();
247 stmt.close();
248 indexWriter.addDocument(doc);
249 } catch(SQLException e) {
250 logger.error("failed to index site "+siteId,e);
251 String msg = e.getMessage();
252 if( !(msg.contains("schema") && msg.contains("does not exist")) )
253 throw e;
254 } finally {
255 con.close();
256 }
257 }
258
259 private static final Pattern ptn = Pattern.compile("([^,]+),(\\d+|\"[0-9,]+\")");
260
261 private static Map<String,Integer> domainMap()
262 throws IOException
263 {
264 Map<String,Integer> map = new HashMap<String,Integer>();
265 File siteFile = new File("data/sites.csv");
266 if( !siteFile.exists() )
267 return map;
268 BufferedReader in = new BufferedReader(new FileReader(siteFile));
269 String line = in.readLine();
270 try {
271 while (!ptn.matcher(line).matches())
272 line = in.readLine();
273 while (true) {
274 if (line == null)
275 break;
276 if (line.length() > 0) {
277 if( line.startsWith(",") )
278 break;
279 Matcher m = ptn.matcher(line);
280 if (!m.matches())
281 throw new RuntimeException(line);
282 String domain = m.group(1);
283 String amt = m.group(2);
284 amt = amt.replaceAll("[,\"]", "");
285 int sessions = Integer.parseInt(amt);
286 map.put(domain, sessions);
287 }
288 line = in.readLine();
289 }
290 in.close();
291 } catch (RuntimeException e) {
292 logger.error("Error in line: " + line, e);
293 throw e;
294 }
295 return map;
296 }
297
298 private static void reindex2()
299 throws IOException
300 {
301 Map<String,Integer> domainMap = domainMap();
302 IndexReader reader = IndexReader.open(dir1);
303 IndexWriter indexWriter = new IndexWriter(dir2,analyzer,true,IndexWriter.MaxFieldLength.LIMITED);
304 try {
305 int n = reader.numDocs();
306 if( n != reader.maxDoc() )
307 throw new RuntimeException();
308 for( int i=0; i<n; i++ ) {
309 Document data = reader.document(i);
310 Document doc = new Document();
311 doc.add( new Field(SERVER_FLD, data.get(SERVER_FLD), Field.Store.YES, Field.Index.NOT_ANALYZED_NO_NORMS) );
312 doc.add( new Field(SITE_FLD, data.get(SITE_FLD), Field.Store.YES, Field.Index.NOT_ANALYZED_NO_NORMS) );
313 String domain = data.get(DOMAIN_FLD);
314 doc.add( new Field(DOMAIN_FLD, domain, Field.Store.YES, Field.Index.NOT_ANALYZED_NO_NORMS) );
315 doc.add( new Field(SUBJECT_FLD, data.get(SUBJECT_FLD), Field.Store.YES, Field.Index.ANALYZED) );
316 doc.add( new Field(MESSAGE_FLD, data.get(MESSAGE_FLD), Field.Store.YES, Field.Index.ANALYZED) );
317 doc.add( new Field(TYPE_FLD, data.get(TYPE_FLD), Field.Store.YES, Field.Index.NOT_ANALYZED_NO_NORMS) );
318 int activity = ((NumericField)data.getFieldable(ACTIVITY_FLD)).getNumericValue().intValue();
319 doc.add( new NumericField(ACTIVITY_FLD,Field.Store.YES,true).setIntValue(activity) );
320 doc.add( new Field(EMBARRASSING_FLD, data.get(EMBARRASSING_FLD), Field.Store.YES, Field.Index.NOT_ANALYZED_NO_NORMS) );
321 doc.add( new Field(PRIVATE_FLD, data.get(PRIVATE_FLD), Field.Store.NO, Field.Index.NOT_ANALYZED_NO_NORMS) );
322 String email = data.get(OWNER_EMAIL_FLD);
323 if( email != null ) {
324 doc.add( new Field(OWNER_EMAIL_FLD, email, Field.Store.YES, Field.Index.NOT_ANALYZED_NO_NORMS) );
325 String emailDomain = email.substring( email.indexOf('@') + 1 );
326 doc.add( new Field(OWNER_EMAIL_DOMAIN_FLD, emailDomain, Field.Store.NO, Field.Index.NOT_ANALYZED_NO_NORMS) );
327 }
328 int nodeCount = ((NumericField)data.getFieldable(NODE_COUNT_FLD)).getNumericValue().intValue();
329 doc.add( new NumericField(NODE_COUNT_FLD,Field.Store.YES,true).setIntValue(nodeCount) );
330 long whenCreated = ((NumericField)data.getFieldable(WHEN_CREATED_FLD)).getNumericValue().longValue();
331 doc.add( new NumericField(WHEN_CREATED_FLD,Field.Store.YES,false).setLongValue(whenCreated) );
332 String tweaks = data.get(TWEAKS_FLD);
333 if( tweaks != null ) {
334 doc.add( new Field(TWEAKS_FLD, tweaks, Field.Store.YES, Field.Index.NO) );
335 doc.add( new Field(HAS_TWEAKS_FLD, "true", Field.Store.NO, Field.Index.NOT_ANALYZED_NO_NORMS) );
336 }
337 int fileCount = ((NumericField)data.getFieldable(FILE_COUNT_FLD)).getNumericValue().intValue();
338 doc.add( new NumericField(FILE_COUNT_FLD,Field.Store.YES,false).setIntValue(fileCount) );
339 if( nodeCount > 0 ) {
340 int fileNodeRatio = fileCount*1000/nodeCount;
341 doc.add( new NumericField(FILE_NODE_RATIO_FLD,Field.Store.NO,true).setIntValue(fileNodeRatio) );
342 }
343 int monthlyViews = ((NumericField)data.getFieldable(MONTHLY_VIEWS_FLD)).getNumericValue().intValue();
344 doc.add( new NumericField(MONTHLY_VIEWS_FLD,Field.Store.YES,true).setIntValue(monthlyViews) );
345 Integer sessions = domainMap.get(domain);
346 if( sessions == null )
347 sessions = 0;
348 doc.add( new NumericField(VALUE_FLD,Field.Store.YES,true).setIntValue(sessions) );
349 indexWriter.addDocument(doc);
350 }
351 } finally {
352 indexWriter.close();
353 reader.close();
354 }
355 }
356
357 // class here
358
359 private final Document doc;
360
361 public Site(Document doc) {
362 this.doc = doc;
363 }
364
365 public String serverName() {
366 return doc.get(SERVER_FLD);
367 }
368
369 public Server server() {
370 return Server.getServer(serverName());
371 }
372
373 public String id() {
374 return doc.get(SITE_FLD);
375 }
376
377 private static final Set https = new HashSet( Arrays.asList(
378 "www.postgresql-archive.org",
379 "ffq.38.me.nabble.com"
380 ) );
381
382 public String url() {
383 String domain = doc.get(DOMAIN_FLD);
384 String scheme = https.contains(domain) ? "https" : "http";
385 return scheme + "://" + domain + "/";
386 }
387
388 public String subject() {
389 return doc.get(SUBJECT_FLD);
390 }
391
392 public String subjectHtml() {
393 return HtmlUtils.htmlEncode(subject());
394 }
395
396 public String link() {
397 return "<a href=\"" + url() + "\">" + subjectHtml() + "</a>";
398 }
399
400 public String message() {
401 return doc.get(MESSAGE_FLD);
402 }
403
404 public String type() {
405 return doc.get(TYPE_FLD);
406 }
407
408 public int activity() {
409 NumericField fld = (NumericField)doc.getFieldable(ACTIVITY_FLD);
410 return fld.getNumericValue().intValue();
411 }
412
413 public int nodeCount() {
414 NumericField fld = (NumericField)doc.getFieldable(NODE_COUNT_FLD);
415 return fld.getNumericValue().intValue();
416 }
417
418 public Date whenCreated() {
419 NumericField fld = (NumericField)doc.getFieldable(WHEN_CREATED_FLD);
420 return new Date(fld.getNumericValue().longValue());
421 }
422
423 public boolean isEmbarrassing() {
424 return Boolean.parseBoolean( doc.get(EMBARRASSING_FLD) );
425 }
426
427 public String tweaks() {
428 return doc.get(TWEAKS_FLD);
429 }
430
431 public int fileCount() {
432 NumericField fld = (NumericField)doc.getFieldable(FILE_COUNT_FLD);
433 return fld.getNumericValue().intValue();
434 }
435
436 public String ownerEmail() {
437 return doc.get(OWNER_EMAIL_FLD);
438 }
439
440 public int monthlyViews() {
441 NumericField fld = (NumericField)doc.getFieldable(MONTHLY_VIEWS_FLD);
442 return fld.getNumericValue().intValue();
443 }
444
445 public int value() {
446 NumericField fld = (NumericField)doc.getFieldable(VALUE_FLD);
447 return fld.getNumericValue().intValue();
448 }
449
450 }