0
|
1 package global;
|
|
2
|
|
3 import fschmidt.util.java.HtmlUtils;
|
|
4 import nabble.view.lib.ViewUtils;
|
|
5 import nabble.model.Init;
|
|
6 import org.apache.lucene.analysis.Analyzer;
|
|
7 import org.apache.lucene.analysis.snowball.SnowballAnalyzer;
|
|
8 import org.apache.lucene.document.Document;
|
|
9 import org.apache.lucene.document.Field;
|
|
10 import org.apache.lucene.document.NumericField;
|
|
11 import org.apache.lucene.index.IndexReader;
|
|
12 import org.apache.lucene.index.IndexWriter;
|
|
13 import org.apache.lucene.index.Term;
|
|
14 import org.apache.lucene.search.Sort;
|
|
15 import org.apache.lucene.search.SortField;
|
|
16 import org.apache.lucene.store.FSDirectory;
|
|
17 import org.apache.lucene.util.Version;
|
|
18 import org.slf4j.Logger;
|
|
19 import org.slf4j.LoggerFactory;
|
|
20
|
|
21 import java.io.File;
|
|
22 import java.io.IOException;
|
|
23 import java.io.BufferedReader;
|
|
24 import java.io.FileReader;
|
|
25 import java.sql.Connection;
|
|
26 import java.sql.ResultSet;
|
|
27 import java.sql.SQLException;
|
|
28 import java.sql.Statement;
|
|
29 import java.util.ArrayList;
|
|
30 import java.util.Date;
|
|
31 import java.util.List;
|
|
32 import java.util.Map;
|
|
33 import java.util.HashMap;
|
|
34 import java.util.Set;
|
|
35 import java.util.HashSet;
|
|
36 import java.util.Arrays;
|
|
37 import java.util.regex.Pattern;
|
|
38 import java.util.regex.Matcher;
|
|
39
|
|
40
|
|
41 public final class Site {
|
|
42 private static final Logger logger = LoggerFactory.getLogger(Site.class);
|
|
43
|
|
44 public static volatile String status = "";
|
|
45
|
|
46 public static final String SERVER_FLD = "server";
|
|
47 public static final String SITE_FLD = "site";
|
|
48 public static final String DOMAIN_FLD = "domain";
|
|
49 public static final String SUBJECT_FLD = "subject";
|
|
50 public static final String MESSAGE_FLD = "message";
|
|
51 public static final String TYPE_FLD = "type";
|
|
52 public static final String ACTIVITY_FLD = "activity";
|
|
53 public static final String EMBARRASSING_FLD = "embarrassing";
|
|
54 public static final String PRIVATE_FLD = "private";
|
|
55 public static final String OWNER_EMAIL_FLD = "owner_email";
|
|
56 public static final String OWNER_EMAIL_DOMAIN_FLD = "owner_email_domain";
|
|
57 public static final String NODE_COUNT_FLD = "node_count";
|
|
58 public static final String WHEN_CREATED_FLD = "when_created";
|
|
59 public static final String TWEAKS_FLD = "tweaks";
|
|
60 public static final String HAS_TWEAKS_FLD = "has_tweaks";
|
|
61 public static final String FILE_COUNT_FLD = "file_count";
|
|
62 public static final String FILE_NODE_RATIO_FLD = "file_node_ratio";
|
|
63 public static final String MONTHLY_VIEWS_FLD = "monthly_views";
|
|
64 public static final String VALUE_FLD = "value";
|
|
65
|
|
66 public static final Sort SORT_BY_ACTIVITY = new Sort(new SortField(ACTIVITY_FLD, SortField.INT, true));
|
|
67 public static final Sort SORT_BY_FILE_NODE_RATIO = new Sort(new SortField(FILE_NODE_RATIO_FLD, SortField.INT, true));
|
|
68 public static final Sort SORT_BY_VALUE = new Sort(
|
|
69 new SortField(VALUE_FLD, SortField.INT, true),
|
|
70 new SortField(NODE_COUNT_FLD, SortField.INT, true)
|
|
71 );
|
|
72
|
|
/** Analyzer passed to every IndexWriter opened by this class. */
public static final Analyzer analyzer = new SnowballAnalyzer(Version.LUCENE_CURRENT,"English");

/** First-pass index directory ("lucene_raw"): written by reindex1(), read by reindex2(). */
private static final FSDirectory dir1;

/** Second-pass index directory ("lucene_global"): written by reindex2(), returned by dir(). */
private static final FSDirectory dir2;

/** The most recently started reindex thread, or null when none has been started. */
private static volatile Thread thread = null;

/** When true, reindex() skips the first pass; read from the "skipReindex1" init value (default false). */
private static final boolean skipReindex1 = Init.get("skipReindex1",false);

// Opens both index directories once, at class-load time; the directory names are
// appended directly to the "local_dir" init value.
static {
    try {
        String baseDir = (String)Init.get("local_dir");
        File rawIndexDir = new File(baseDir+"lucene_raw");
        File globalIndexDir = new File(baseDir+"lucene_global");
        dir1 = FSDirectory.open(rawIndexDir);
        dir2 = FSDirectory.open(globalIndexDir);
    } catch(IOException e) {
        // A static initializer cannot throw a checked exception.
        throw new RuntimeException(e);
    }
}
|
|
88
|
|
89 public static FSDirectory dir() {
|
|
90 return dir2;
|
|
91 }
|
|
92 /*
|
|
93 static synchronized void clear()
|
|
94 throws IOException
|
|
95 {
|
|
96 new IndexWriter(dir,analyzer,true,IndexWriter.MaxFieldLength.LIMITED).close();
|
|
97 }
|
|
98 */
|
|
99 public static boolean isReindexing() {
|
|
100 return thread != null && thread.isAlive();
|
|
101 }
|
|
102
|
|
103 public static void startReindexing(){
|
|
104 status = "starting reindex";
|
|
105 thread = new Thread(new Runnable(){public void run(){
|
|
106 try {
|
|
107 reindex();
|
|
108 } catch(SQLException e) {
|
|
109 logger.error("",e);
|
|
110 status = "Error: " + e.getMessage();
|
|
111 } catch(IOException e) {
|
|
112 logger.error("",e);
|
|
113 status = "Error: " + e.getMessage();
|
|
114 } catch(RuntimeException e) {
|
|
115 logger.error("",e);
|
|
116 status = "Error: " + e.getMessage();
|
|
117 }
|
|
118 }},"reindex");
|
|
119 thread.start();
|
|
120 }
|
|
121
|
|
122 static synchronized void reindex()
|
|
123 throws SQLException, IOException
|
|
124 {
|
|
125 if( !skipReindex1 )
|
|
126 reindex1();
|
|
127 status = "done indexing servers";
|
|
128 logger.info("reindex2");
|
|
129 reindex2();
|
|
130 logger.info("done reindexing");
|
|
131 status = "done reindexing";
|
|
132 }
|
|
133
|
|
134 private static void reindex1()
|
|
135 throws SQLException, IOException
|
|
136 {
|
|
137 IndexWriter indexWriter = new IndexWriter(dir1,analyzer,true,IndexWriter.MaxFieldLength.LIMITED);
|
|
138 try {
|
|
139 for( Server server : Server.getServers() ) {
|
|
140 reindex1(server,indexWriter);
|
|
141 }
|
|
142 } finally {
|
|
143 indexWriter.close();
|
|
144 }
|
|
145 }
|
|
146 /*
|
|
147 public static synchronized void removeBySubject(String subject)
|
|
148 throws IOException
|
|
149 {
|
|
150 Term term = new Term( SUBJECT_FLD, subject );
|
|
151 IndexWriter indexWriter = new IndexWriter(dir2,analyzer,IndexWriter.MaxFieldLength.LIMITED);
|
|
152 indexWriter.deleteDocuments(term);
|
|
153 indexWriter.close();
|
|
154 }
|
|
155 */
|
|
156 private static synchronized void reindex1(Server server,IndexWriter indexWriter)
|
|
157 throws SQLException, IOException
|
|
158 {
|
|
159 logger.info("reindex "+server.name);
|
|
160 status = "reindexing "+server.name;
|
|
161
|
|
162 List<Long> siteIds = new ArrayList<Long>();
|
|
163 {
|
|
164 Connection con = server.getConnection();
|
|
165 Statement stmt = con.createStatement();
|
|
166 ResultSet rs = stmt.executeQuery(
|
|
167 "select site_id from global.site_global"
|
|
168 );
|
|
169 while( rs.next() ) {
|
|
170 long siteId = rs.getLong("site_id");
|
|
171 siteIds.add(siteId);
|
|
172 }
|
|
173 rs.close();
|
|
174 stmt.close();
|
|
175 con.close();
|
|
176 }
|
|
177 final int n = siteIds.size();
|
|
178 int count = 0;
|
|
179 for( long siteId : siteIds ) {
|
|
180 update(indexWriter,server,siteId);
|
|
181 count++;
|
|
182 logger.info("reindexed "+count+" of "+n+" sites on "+server.name);
|
|
183 status = "reindexed "+count+" of "+n+" sites on "+server.name;
|
|
184 }
|
|
185 }
|
|
186
|
|
187 private static void update(IndexWriter indexWriter,Server server,long siteId)
|
|
188 throws IOException, SQLException
|
|
189 {
|
|
190 Connection con = server.getConnection();
|
|
191 try {
|
|
192 String schema = "s" + siteId;
|
|
193 Statement stmt = con.createStatement();
|
|
194 ResultSet rs = stmt.executeQuery(
|
|
195 "select site_global.*, site.*, node.*, node_msg.message, priv.label as priv, user_.email, tweak.tweaks"
|
|
196 +", (select count(*) from " + schema + ".file_node) as file_node_count"
|
|
197 +" from global.site_global"
|
|
198 +", " + schema + ".site"
|
|
199 +" join " + schema + ".node on site.root_node_id = node.node_id"
|
|
200 +" join " + schema + ".node_msg on site.root_node_id = node_msg.node_id"
|
|
201 +" left join " + schema + ".tag as priv on site.root_node_id = priv.node_id and priv.user_id is null and priv.label='permission:View'"
|
|
202 +" left join " + schema + ".user_ on node.owner_id = user_.user_id"
|
|
203 +" , (select string_agg(content,' ') as tweaks from " + schema + ".tweak) as tweak"
|
|
204 +" where site_global.site_id = " + siteId
|
|
205 );
|
|
206 if( !rs.next() ) {
|
|
207 logger.error("site not found: "+siteId);
|
|
208 return;
|
|
209 }
|
|
210 Document doc = new Document();
|
|
211 doc.add( new Field(SERVER_FLD, server.name, Field.Store.YES, Field.Index.NO) );
|
|
212 doc.add( new Field(SITE_FLD, Long.toString(siteId), Field.Store.YES, Field.Index.NO) );
|
|
213 String subject = rs.getString("subject");
|
|
214 String domain = rs.getString("custom_domain");
|
|
215 if( domain == null )
|
|
216 domain = ViewUtils.getDefaultBaseUrl(siteId,subject,server.host);
|
|
217 doc.add( new Field(DOMAIN_FLD, domain, Field.Store.YES, Field.Index.NO) );
|
|
218 doc.add( new Field(SUBJECT_FLD, subject, Field.Store.YES, Field.Index.NO) );
|
|
219 String message = rs.getString("message");
|
|
220 doc.add( new Field(MESSAGE_FLD, message, Field.Store.YES, Field.Index.NO) );
|
|
221 String type = "" + rs.getString("type");
|
|
222 doc.add( new Field(TYPE_FLD, type, Field.Store.YES, Field.Index.NO) );
|
|
223 int activity = rs.getInt("activity");
|
|
224 doc.add( new NumericField(ACTIVITY_FLD,Field.Store.YES,false).setIntValue(activity) );
|
|
225 String embarrassing = Boolean.toString( rs.getBoolean("is_embarrassing") );
|
|
226 doc.add( new Field(EMBARRASSING_FLD, embarrassing, Field.Store.YES, Field.Index.NO) );
|
|
227 String privS = rs.getString("priv");
|
|
228 String priv = Boolean.toString( privS != null );
|
|
229 doc.add( new Field(PRIVATE_FLD, priv, Field.Store.YES, Field.Index.NO) );
|
|
230 String email = rs.getString("email");
|
|
231 if( email != null ) {
|
|
232 doc.add( new Field(OWNER_EMAIL_FLD, email, Field.Store.YES, Field.Index.NO) );
|
|
233 }
|
|
234 int nodeCount = rs.getInt("node_count");
|
|
235 doc.add( new NumericField(NODE_COUNT_FLD,Field.Store.YES,false).setIntValue(nodeCount) );
|
|
236 Date whenCreated = rs.getTimestamp("when_created");
|
|
237 doc.add( new NumericField(WHEN_CREATED_FLD,Field.Store.YES,false).setLongValue(whenCreated.getTime()) );
|
|
238 String tweaks = rs.getString("tweaks");
|
|
239 if( tweaks != null ) {
|
|
240 doc.add( new Field(TWEAKS_FLD, tweaks, Field.Store.YES, Field.Index.NO) );
|
|
241 }
|
|
242 int fileCount = rs.getInt("file_node_count");
|
|
243 doc.add( new NumericField(FILE_COUNT_FLD,Field.Store.YES,false).setIntValue(fileCount) );
|
|
244 int monthlyViews = rs.getInt("monthly_views");
|
|
245 doc.add( new NumericField(MONTHLY_VIEWS_FLD,Field.Store.YES,false).setIntValue(monthlyViews) );
|
|
246 rs.close();
|
|
247 stmt.close();
|
|
248 indexWriter.addDocument(doc);
|
|
249 } catch(SQLException e) {
|
|
250 logger.error("failed to index site "+siteId,e);
|
|
251 String msg = e.getMessage();
|
|
252 if( !(msg.contains("schema") && msg.contains("does not exist")) )
|
|
253 throw e;
|
|
254 } finally {
|
|
255 con.close();
|
|
256 }
|
|
257 }
|
|
258
|
|
259 private static final Pattern ptn = Pattern.compile("([^,]+),(\\d+|\"[0-9,]+\")");
|
|
260
|
|
261 private static Map<String,Integer> domainMap()
|
|
262 throws IOException
|
|
263 {
|
|
264 Map<String,Integer> map = new HashMap<String,Integer>();
|
|
265 File siteFile = new File("data/sites.csv");
|
|
266 if( !siteFile.exists() )
|
|
267 return map;
|
|
268 BufferedReader in = new BufferedReader(new FileReader(siteFile));
|
|
269 String line = in.readLine();
|
|
270 try {
|
|
271 while (!ptn.matcher(line).matches())
|
|
272 line = in.readLine();
|
|
273 while (true) {
|
|
274 if (line == null)
|
|
275 break;
|
|
276 if (line.length() > 0) {
|
|
277 if( line.startsWith(",") )
|
|
278 break;
|
|
279 Matcher m = ptn.matcher(line);
|
|
280 if (!m.matches())
|
|
281 throw new RuntimeException(line);
|
|
282 String domain = m.group(1);
|
|
283 String amt = m.group(2);
|
|
284 amt = amt.replaceAll("[,\"]", "");
|
|
285 int sessions = Integer.parseInt(amt);
|
|
286 map.put(domain, sessions);
|
|
287 }
|
|
288 line = in.readLine();
|
|
289 }
|
|
290 in.close();
|
|
291 } catch (RuntimeException e) {
|
|
292 logger.error("Error in line: " + line, e);
|
|
293 throw e;
|
|
294 }
|
|
295 return map;
|
|
296 }
|
|
297
|
|
298 private static void reindex2()
|
|
299 throws IOException
|
|
300 {
|
|
301 Map<String,Integer> domainMap = domainMap();
|
|
302 IndexReader reader = IndexReader.open(dir1);
|
|
303 IndexWriter indexWriter = new IndexWriter(dir2,analyzer,true,IndexWriter.MaxFieldLength.LIMITED);
|
|
304 try {
|
|
305 int n = reader.numDocs();
|
|
306 if( n != reader.maxDoc() )
|
|
307 throw new RuntimeException();
|
|
308 for( int i=0; i<n; i++ ) {
|
|
309 Document data = reader.document(i);
|
|
310 Document doc = new Document();
|
|
311 doc.add( new Field(SERVER_FLD, data.get(SERVER_FLD), Field.Store.YES, Field.Index.NOT_ANALYZED_NO_NORMS) );
|
|
312 doc.add( new Field(SITE_FLD, data.get(SITE_FLD), Field.Store.YES, Field.Index.NOT_ANALYZED_NO_NORMS) );
|
|
313 String domain = data.get(DOMAIN_FLD);
|
|
314 doc.add( new Field(DOMAIN_FLD, domain, Field.Store.YES, Field.Index.NOT_ANALYZED_NO_NORMS) );
|
|
315 doc.add( new Field(SUBJECT_FLD, data.get(SUBJECT_FLD), Field.Store.YES, Field.Index.ANALYZED) );
|
|
316 doc.add( new Field(MESSAGE_FLD, data.get(MESSAGE_FLD), Field.Store.YES, Field.Index.ANALYZED) );
|
|
317 doc.add( new Field(TYPE_FLD, data.get(TYPE_FLD), Field.Store.YES, Field.Index.NOT_ANALYZED_NO_NORMS) );
|
|
318 int activity = ((NumericField)data.getFieldable(ACTIVITY_FLD)).getNumericValue().intValue();
|
|
319 doc.add( new NumericField(ACTIVITY_FLD,Field.Store.YES,true).setIntValue(activity) );
|
|
320 doc.add( new Field(EMBARRASSING_FLD, data.get(EMBARRASSING_FLD), Field.Store.YES, Field.Index.NOT_ANALYZED_NO_NORMS) );
|
|
321 doc.add( new Field(PRIVATE_FLD, data.get(PRIVATE_FLD), Field.Store.NO, Field.Index.NOT_ANALYZED_NO_NORMS) );
|
|
322 String email = data.get(OWNER_EMAIL_FLD);
|
|
323 if( email != null ) {
|
|
324 doc.add( new Field(OWNER_EMAIL_FLD, email, Field.Store.YES, Field.Index.NOT_ANALYZED_NO_NORMS) );
|
|
325 String emailDomain = email.substring( email.indexOf('@') + 1 );
|
|
326 doc.add( new Field(OWNER_EMAIL_DOMAIN_FLD, emailDomain, Field.Store.NO, Field.Index.NOT_ANALYZED_NO_NORMS) );
|
|
327 }
|
|
328 int nodeCount = ((NumericField)data.getFieldable(NODE_COUNT_FLD)).getNumericValue().intValue();
|
|
329 doc.add( new NumericField(NODE_COUNT_FLD,Field.Store.YES,true).setIntValue(nodeCount) );
|
|
330 long whenCreated = ((NumericField)data.getFieldable(WHEN_CREATED_FLD)).getNumericValue().longValue();
|
|
331 doc.add( new NumericField(WHEN_CREATED_FLD,Field.Store.YES,false).setLongValue(whenCreated) );
|
|
332 String tweaks = data.get(TWEAKS_FLD);
|
|
333 if( tweaks != null ) {
|
|
334 doc.add( new Field(TWEAKS_FLD, tweaks, Field.Store.YES, Field.Index.NO) );
|
|
335 doc.add( new Field(HAS_TWEAKS_FLD, "true", Field.Store.NO, Field.Index.NOT_ANALYZED_NO_NORMS) );
|
|
336 }
|
|
337 int fileCount = ((NumericField)data.getFieldable(FILE_COUNT_FLD)).getNumericValue().intValue();
|
|
338 doc.add( new NumericField(FILE_COUNT_FLD,Field.Store.YES,false).setIntValue(fileCount) );
|
|
339 if( nodeCount > 0 ) {
|
|
340 int fileNodeRatio = fileCount*1000/nodeCount;
|
|
341 doc.add( new NumericField(FILE_NODE_RATIO_FLD,Field.Store.NO,true).setIntValue(fileNodeRatio) );
|
|
342 }
|
|
343 int monthlyViews = ((NumericField)data.getFieldable(MONTHLY_VIEWS_FLD)).getNumericValue().intValue();
|
|
344 doc.add( new NumericField(MONTHLY_VIEWS_FLD,Field.Store.YES,true).setIntValue(monthlyViews) );
|
|
345 Integer sessions = domainMap.get(domain);
|
|
346 if( sessions == null )
|
|
347 sessions = 0;
|
|
348 doc.add( new NumericField(VALUE_FLD,Field.Store.YES,true).setIntValue(sessions) );
|
|
349 indexWriter.addDocument(doc);
|
|
350 }
|
|
351 } finally {
|
|
352 indexWriter.close();
|
|
353 reader.close();
|
|
354 }
|
|
355 }
|
|
356
|
|
// Instance side: a Site wraps one Lucene Document and exposes its stored fields through the accessors below.
|
|
358
|
|
/** The Lucene document whose fields the accessors of this instance read. */
private final Document doc;

/**
 * Wraps the given document; the reference is kept as-is, not copied.
 *
 * @param doc the document to read fields from
 */
public Site(Document doc) {
    this.doc = doc;
}
|
|
364
|
|
365 public String serverName() {
|
|
366 return doc.get(SERVER_FLD);
|
|
367 }
|
|
368
|
|
369 public Server server() {
|
|
370 return Server.getServer(serverName());
|
|
371 }
|
|
372
|
|
373 public String id() {
|
|
374 return doc.get(SITE_FLD);
|
|
375 }
|
|
376
|
|
377 private static final Set https = new HashSet( Arrays.asList(
|
|
378 "www.postgresql-archive.org",
|
|
379 "ffq.38.me.nabble.com"
|
|
380 ) );
|
|
381
|
|
382 public String url() {
|
|
383 String domain = doc.get(DOMAIN_FLD);
|
|
384 String scheme = https.contains(domain) ? "https" : "http";
|
|
385 return scheme + "://" + domain + "/";
|
|
386 }
|
|
387
|
|
388 public String subject() {
|
|
389 return doc.get(SUBJECT_FLD);
|
|
390 }
|
|
391
|
|
392 public String subjectHtml() {
|
|
393 return HtmlUtils.htmlEncode(subject());
|
|
394 }
|
|
395
|
|
396 public String link() {
|
|
397 return "<a href=\"" + url() + "\">" + subjectHtml() + "</a>";
|
|
398 }
|
|
399
|
|
400 public String message() {
|
|
401 return doc.get(MESSAGE_FLD);
|
|
402 }
|
|
403
|
|
404 public String type() {
|
|
405 return doc.get(TYPE_FLD);
|
|
406 }
|
|
407
|
|
408 public int activity() {
|
|
409 NumericField fld = (NumericField)doc.getFieldable(ACTIVITY_FLD);
|
|
410 return fld.getNumericValue().intValue();
|
|
411 }
|
|
412
|
|
413 public int nodeCount() {
|
|
414 NumericField fld = (NumericField)doc.getFieldable(NODE_COUNT_FLD);
|
|
415 return fld.getNumericValue().intValue();
|
|
416 }
|
|
417
|
|
418 public Date whenCreated() {
|
|
419 NumericField fld = (NumericField)doc.getFieldable(WHEN_CREATED_FLD);
|
|
420 return new Date(fld.getNumericValue().longValue());
|
|
421 }
|
|
422
|
|
423 public boolean isEmbarrassing() {
|
|
424 return Boolean.parseBoolean( doc.get(EMBARRASSING_FLD) );
|
|
425 }
|
|
426
|
|
427 public String tweaks() {
|
|
428 return doc.get(TWEAKS_FLD);
|
|
429 }
|
|
430
|
|
431 public int fileCount() {
|
|
432 NumericField fld = (NumericField)doc.getFieldable(FILE_COUNT_FLD);
|
|
433 return fld.getNumericValue().intValue();
|
|
434 }
|
|
435
|
|
436 public String ownerEmail() {
|
|
437 return doc.get(OWNER_EMAIL_FLD);
|
|
438 }
|
|
439
|
|
440 public int monthlyViews() {
|
|
441 NumericField fld = (NumericField)doc.getFieldable(MONTHLY_VIEWS_FLD);
|
|
442 return fld.getNumericValue().intValue();
|
|
443 }
|
|
444
|
|
445 public int value() {
|
|
446 NumericField fld = (NumericField)doc.getFieldable(VALUE_FLD);
|
|
447 return fld.getNumericValue().intValue();
|
|
448 }
|
|
449
|
|
450 }
|