Mercurial Hosting > nabble
annotate src/global/Site.java @ 65:3d7067a23eff
fix
author | Franklin Schmidt <fschmidt@gmail.com> |
---|---|
date | Tue, 17 Sep 2024 05:01:59 -0600 |
parents | 56accc959f8c |
children |
rev | line source |
---|---|
0 | 1 package global; |
2 | |
3 import fschmidt.util.java.HtmlUtils; | |
4 import nabble.view.lib.ViewUtils; | |
5 import nabble.model.Init; | |
6 import org.apache.lucene.analysis.Analyzer; | |
7 import org.apache.lucene.analysis.snowball.SnowballAnalyzer; | |
8 import org.apache.lucene.document.Document; | |
9 import org.apache.lucene.document.Field; | |
10 import org.apache.lucene.document.NumericField; | |
11 import org.apache.lucene.index.IndexReader; | |
12 import org.apache.lucene.index.IndexWriter; | |
13 import org.apache.lucene.index.Term; | |
14 import org.apache.lucene.search.Sort; | |
15 import org.apache.lucene.search.SortField; | |
16 import org.apache.lucene.store.FSDirectory; | |
17 import org.apache.lucene.util.Version; | |
18 import org.slf4j.Logger; | |
19 import org.slf4j.LoggerFactory; | |
20 | |
21 import java.io.File; | |
22 import java.io.IOException; | |
23 import java.io.BufferedReader; | |
24 import java.io.FileReader; | |
25 import java.sql.Connection; | |
26 import java.sql.ResultSet; | |
27 import java.sql.SQLException; | |
28 import java.sql.Statement; | |
29 import java.util.ArrayList; | |
30 import java.util.Date; | |
31 import java.util.List; | |
32 import java.util.Map; | |
33 import java.util.HashMap; | |
34 import java.util.Set; | |
35 import java.util.HashSet; | |
36 import java.util.Arrays; | |
37 import java.util.regex.Pattern; | |
38 import java.util.regex.Matcher; | |
39 | |
40 | |
41 public final class Site { | |
// Logger used for reindex progress and failures.
42 private static final Logger logger = LoggerFactory.getLogger(Site.class); | |
43 | |
// Progress/error text for the current or most recent reindex. It is assigned by
// startReindexing(), by reindex() and its helpers, and by the "reindex" thread on failure.
44 public static volatile String status = ""; | |
45 | |
// Names of the Lucene document fields written by update() and/or reindex2().
46 public static final String SERVER_FLD = "server"; | |
47 public static final String SITE_FLD = "site"; | |
48 public static final String DOMAIN_FLD = "domain"; | |
49 public static final String SUBJECT_FLD = "subject"; | |
50 public static final String MESSAGE_FLD = "message"; | |
51 public static final String TYPE_FLD = "type"; | |
52 public static final String ACTIVITY_FLD = "activity"; | |
53 public static final String EMBARRASSING_FLD = "embarrassing"; | |
54 public static final String PRIVATE_FLD = "private"; | |
55 public static final String OWNER_EMAIL_FLD = "owner_email"; | |
56 public static final String OWNER_EMAIL_DOMAIN_FLD = "owner_email_domain"; | |
57 public static final String NODE_COUNT_FLD = "node_count"; | |
58 public static final String WHEN_CREATED_FLD = "when_created"; | |
59 public static final String TWEAKS_FLD = "tweaks"; | |
60 public static final String HAS_TWEAKS_FLD = "has_tweaks"; | |
61 public static final String FILE_COUNT_FLD = "file_count"; | |
62 public static final String FILE_NODE_RATIO_FLD = "file_node_ratio"; | |
63 public static final String MONTHLY_VIEWS_FLD = "monthly_views"; | |
64 | |
// Sort orders over int-typed fields; both are built with the SortField reverse flag set to true.
65 public static final Sort SORT_BY_ACTIVITY = new Sort(new SortField(ACTIVITY_FLD, SortField.INT, true)); | |
66 public static final Sort SORT_BY_FILE_NODE_RATIO = new Sort(new SortField(FILE_NODE_RATIO_FLD, SortField.INT, true)); | |
67 | |
// Analyzer handed to both IndexWriters (reindex1() and reindex2()).
68 public static final Analyzer analyzer = new SnowballAnalyzer(Version.LUCENE_CURRENT,"English"); | |
// dir1 ("lucene_raw") is written by reindex1() and read back by reindex2().
69 private static final FSDirectory dir1; | |
// dir2 ("lucene_global") is written by reindex2() and returned by dir().
70 private static final FSDirectory dir2; | |
// Thread created by the most recent startReindexing() call; null until the first call.
71 private static volatile Thread thread = null; | |
// When true, reindex() skips reindex1() and rebuilds dir2 from whatever dir1 already contains.
72 private static final boolean skipReindex1 = Init.get("skipReindex1",false); | |
73 | |
74 static { | |
75 try { | |
2
abe0694e9849
replace local_dir with home_dir
Franklin Schmidt <fschmidt@gmail.com>
parents:
0
diff
changeset
|
76 String localDir = (String)Init.get("home_dir")+"local/"; |
0 | 77 dir1 = FSDirectory.open(new File(localDir+"lucene_raw")); |
78 dir2 = FSDirectory.open(new File(localDir+"lucene_global")); | |
79 } catch(IOException e) { | |
80 throw new RuntimeException(e); | |
81 } | |
82 } | |
83 | |
84 public static FSDirectory dir() { | |
85 return dir2; | |
86 } | |
53 | 87 |
0 | 88 public static boolean isReindexing() { |
89 return thread != null && thread.isAlive(); | |
90 } | |
91 | |
92 public static void startReindexing(){ | |
93 status = "starting reindex"; | |
94 thread = new Thread(new Runnable(){public void run(){ | |
95 try { | |
96 reindex(); | |
97 } catch(SQLException e) { | |
98 logger.error("",e); | |
99 status = "Error: " + e.getMessage(); | |
100 } catch(IOException e) { | |
101 logger.error("",e); | |
102 status = "Error: " + e.getMessage(); | |
103 } catch(RuntimeException e) { | |
104 logger.error("",e); | |
105 status = "Error: " + e.getMessage(); | |
106 } | |
107 }},"reindex"); | |
108 thread.start(); | |
109 } | |
110 | |
111 static synchronized void reindex() | |
112 throws SQLException, IOException | |
113 { | |
114 if( !skipReindex1 ) | |
115 reindex1(); | |
116 status = "done indexing servers"; | |
117 logger.info("reindex2"); | |
118 reindex2(); | |
119 logger.info("done reindexing"); | |
120 status = "done reindexing"; | |
121 } | |
122 | |
123 private static void reindex1() | |
124 throws SQLException, IOException | |
125 { | |
126 IndexWriter indexWriter = new IndexWriter(dir1,analyzer,true,IndexWriter.MaxFieldLength.LIMITED); | |
127 try { | |
128 for( Server server : Server.getServers() ) { | |
129 reindex1(server,indexWriter); | |
130 } | |
131 } finally { | |
132 indexWriter.close(); | |
133 } | |
134 } | |
53 | 135 |
0 | 136 private static synchronized void reindex1(Server server,IndexWriter indexWriter) |
137 throws SQLException, IOException | |
138 { | |
139 logger.info("reindex "+server.name); | |
140 status = "reindexing "+server.name; | |
141 | |
142 List<Long> siteIds = new ArrayList<Long>(); | |
143 { | |
144 Connection con = server.getConnection(); | |
145 Statement stmt = con.createStatement(); | |
146 ResultSet rs = stmt.executeQuery( | |
147 "select site_id from global.site_global" | |
148 ); | |
149 while( rs.next() ) { | |
150 long siteId = rs.getLong("site_id"); | |
151 siteIds.add(siteId); | |
152 } | |
153 rs.close(); | |
154 stmt.close(); | |
155 con.close(); | |
156 } | |
157 final int n = siteIds.size(); | |
158 int count = 0; | |
159 for( long siteId : siteIds ) { | |
160 update(indexWriter,server,siteId); | |
161 count++; | |
162 logger.info("reindexed "+count+" of "+n+" sites on "+server.name); | |
163 status = "reindexed "+count+" of "+n+" sites on "+server.name; | |
164 } | |
165 } | |
166 | |
167 private static void update(IndexWriter indexWriter,Server server,long siteId) | |
168 throws IOException, SQLException | |
169 { | |
170 Connection con = server.getConnection(); | |
171 try { | |
172 String schema = "s" + siteId; | |
173 Statement stmt = con.createStatement(); | |
174 ResultSet rs = stmt.executeQuery( | |
175 "select site_global.*, site.*, node.*, node_msg.message, priv.label as priv, user_.email, tweak.tweaks" | |
176 +", (select count(*) from " + schema + ".file_node) as file_node_count" | |
177 +" from global.site_global" | |
178 +", " + schema + ".site" | |
179 +" join " + schema + ".node on site.root_node_id = node.node_id" | |
180 +" join " + schema + ".node_msg on site.root_node_id = node_msg.node_id" | |
181 +" left join " + schema + ".tag as priv on site.root_node_id = priv.node_id and priv.user_id is null and priv.label='permission:View'" | |
182 +" left join " + schema + ".user_ on node.owner_id = user_.user_id" | |
183 +" , (select string_agg(content,' ') as tweaks from " + schema + ".tweak) as tweak" | |
184 +" where site_global.site_id = " + siteId | |
185 ); | |
186 if( !rs.next() ) { | |
187 logger.error("site not found: "+siteId); | |
188 return; | |
189 } | |
190 Document doc = new Document(); | |
191 doc.add( new Field(SERVER_FLD, server.name, Field.Store.YES, Field.Index.NO) ); | |
192 doc.add( new Field(SITE_FLD, Long.toString(siteId), Field.Store.YES, Field.Index.NO) ); | |
193 String subject = rs.getString("subject"); | |
194 String domain = rs.getString("custom_domain"); | |
195 if( domain == null ) | |
196 domain = ViewUtils.getDefaultBaseUrl(siteId,subject,server.host); | |
197 doc.add( new Field(DOMAIN_FLD, domain, Field.Store.YES, Field.Index.NO) ); | |
198 doc.add( new Field(SUBJECT_FLD, subject, Field.Store.YES, Field.Index.NO) ); | |
199 String message = rs.getString("message"); | |
200 doc.add( new Field(MESSAGE_FLD, message, Field.Store.YES, Field.Index.NO) ); | |
201 String type = "" + rs.getString("type"); | |
202 doc.add( new Field(TYPE_FLD, type, Field.Store.YES, Field.Index.NO) ); | |
203 int activity = rs.getInt("activity"); | |
204 doc.add( new NumericField(ACTIVITY_FLD,Field.Store.YES,false).setIntValue(activity) ); | |
205 String embarrassing = Boolean.toString( rs.getBoolean("is_embarrassing") ); | |
206 doc.add( new Field(EMBARRASSING_FLD, embarrassing, Field.Store.YES, Field.Index.NO) ); | |
207 String privS = rs.getString("priv"); | |
208 String priv = Boolean.toString( privS != null ); | |
209 doc.add( new Field(PRIVATE_FLD, priv, Field.Store.YES, Field.Index.NO) ); | |
210 String email = rs.getString("email"); | |
211 if( email != null ) { | |
212 doc.add( new Field(OWNER_EMAIL_FLD, email, Field.Store.YES, Field.Index.NO) ); | |
213 } | |
214 int nodeCount = rs.getInt("node_count"); | |
215 doc.add( new NumericField(NODE_COUNT_FLD,Field.Store.YES,false).setIntValue(nodeCount) ); | |
216 Date whenCreated = rs.getTimestamp("when_created"); | |
217 doc.add( new NumericField(WHEN_CREATED_FLD,Field.Store.YES,false).setLongValue(whenCreated.getTime()) ); | |
218 String tweaks = rs.getString("tweaks"); | |
219 if( tweaks != null ) { | |
220 doc.add( new Field(TWEAKS_FLD, tweaks, Field.Store.YES, Field.Index.NO) ); | |
221 } | |
222 int fileCount = rs.getInt("file_node_count"); | |
223 doc.add( new NumericField(FILE_COUNT_FLD,Field.Store.YES,false).setIntValue(fileCount) ); | |
224 int monthlyViews = rs.getInt("monthly_views"); | |
225 doc.add( new NumericField(MONTHLY_VIEWS_FLD,Field.Store.YES,false).setIntValue(monthlyViews) ); | |
226 rs.close(); | |
227 stmt.close(); | |
228 indexWriter.addDocument(doc); | |
229 } catch(SQLException e) { | |
230 logger.error("failed to index site "+siteId,e); | |
231 String msg = e.getMessage(); | |
232 if( !(msg.contains("schema") && msg.contains("does not exist")) ) | |
233 throw e; | |
234 } finally { | |
235 con.close(); | |
236 } | |
237 } | |
238 | |
239 private static void reindex2() | |
240 throws IOException | |
241 { | |
242 IndexReader reader = IndexReader.open(dir1); | |
243 IndexWriter indexWriter = new IndexWriter(dir2,analyzer,true,IndexWriter.MaxFieldLength.LIMITED); | |
244 try { | |
245 int n = reader.numDocs(); | |
246 if( n != reader.maxDoc() ) | |
247 throw new RuntimeException(); | |
248 for( int i=0; i<n; i++ ) { | |
249 Document data = reader.document(i); | |
250 Document doc = new Document(); | |
251 doc.add( new Field(SERVER_FLD, data.get(SERVER_FLD), Field.Store.YES, Field.Index.NOT_ANALYZED_NO_NORMS) ); | |
252 doc.add( new Field(SITE_FLD, data.get(SITE_FLD), Field.Store.YES, Field.Index.NOT_ANALYZED_NO_NORMS) ); | |
253 String domain = data.get(DOMAIN_FLD); | |
254 doc.add( new Field(DOMAIN_FLD, domain, Field.Store.YES, Field.Index.NOT_ANALYZED_NO_NORMS) ); | |
255 doc.add( new Field(SUBJECT_FLD, data.get(SUBJECT_FLD), Field.Store.YES, Field.Index.ANALYZED) ); | |
256 doc.add( new Field(MESSAGE_FLD, data.get(MESSAGE_FLD), Field.Store.YES, Field.Index.ANALYZED) ); | |
257 doc.add( new Field(TYPE_FLD, data.get(TYPE_FLD), Field.Store.YES, Field.Index.NOT_ANALYZED_NO_NORMS) ); | |
258 int activity = ((NumericField)data.getFieldable(ACTIVITY_FLD)).getNumericValue().intValue(); | |
259 doc.add( new NumericField(ACTIVITY_FLD,Field.Store.YES,true).setIntValue(activity) ); | |
260 doc.add( new Field(EMBARRASSING_FLD, data.get(EMBARRASSING_FLD), Field.Store.YES, Field.Index.NOT_ANALYZED_NO_NORMS) ); | |
261 doc.add( new Field(PRIVATE_FLD, data.get(PRIVATE_FLD), Field.Store.NO, Field.Index.NOT_ANALYZED_NO_NORMS) ); | |
262 String email = data.get(OWNER_EMAIL_FLD); | |
263 if( email != null ) { | |
264 doc.add( new Field(OWNER_EMAIL_FLD, email, Field.Store.YES, Field.Index.NOT_ANALYZED_NO_NORMS) ); | |
265 String emailDomain = email.substring( email.indexOf('@') + 1 ); | |
266 doc.add( new Field(OWNER_EMAIL_DOMAIN_FLD, emailDomain, Field.Store.NO, Field.Index.NOT_ANALYZED_NO_NORMS) ); | |
267 } | |
268 int nodeCount = ((NumericField)data.getFieldable(NODE_COUNT_FLD)).getNumericValue().intValue(); | |
269 doc.add( new NumericField(NODE_COUNT_FLD,Field.Store.YES,true).setIntValue(nodeCount) ); | |
270 long whenCreated = ((NumericField)data.getFieldable(WHEN_CREATED_FLD)).getNumericValue().longValue(); | |
271 doc.add( new NumericField(WHEN_CREATED_FLD,Field.Store.YES,false).setLongValue(whenCreated) ); | |
272 String tweaks = data.get(TWEAKS_FLD); | |
273 if( tweaks != null ) { | |
274 doc.add( new Field(TWEAKS_FLD, tweaks, Field.Store.YES, Field.Index.NO) ); | |
275 doc.add( new Field(HAS_TWEAKS_FLD, "true", Field.Store.NO, Field.Index.NOT_ANALYZED_NO_NORMS) ); | |
276 } | |
277 int fileCount = ((NumericField)data.getFieldable(FILE_COUNT_FLD)).getNumericValue().intValue(); | |
278 doc.add( new NumericField(FILE_COUNT_FLD,Field.Store.YES,false).setIntValue(fileCount) ); | |
279 if( nodeCount > 0 ) { | |
280 int fileNodeRatio = fileCount*1000/nodeCount; | |
281 doc.add( new NumericField(FILE_NODE_RATIO_FLD,Field.Store.NO,true).setIntValue(fileNodeRatio) ); | |
282 } | |
283 int monthlyViews = ((NumericField)data.getFieldable(MONTHLY_VIEWS_FLD)).getNumericValue().intValue(); | |
284 doc.add( new NumericField(MONTHLY_VIEWS_FLD,Field.Store.YES,true).setIntValue(monthlyViews) ); | |
285 indexWriter.addDocument(doc); | |
286 } | |
287 } finally { | |
288 indexWriter.close(); | |
289 reader.close(); | |
290 } | |
291 } | |
292 | |
293 // Instance side of the class: a Site wraps one Lucene Document and exposes its fields. | |
294 | |
/** The Lucene document whose fields back every accessor of this instance. */
private final Document doc;

/**
 * Wraps a document so its fields can be read through typed accessors.
 *
 * @param doc the document to wrap; stored as-is, not copied
 */
public Site(Document doc) {
	this.doc = doc;
}
300 | |
301 public String serverName() { | |
302 return doc.get(SERVER_FLD); | |
303 } | |
304 | |
305 public Server server() { | |
306 return Server.getServer(serverName()); | |
307 } | |
308 | |
309 public String id() { | |
310 return doc.get(SITE_FLD); | |
311 } | |
312 | |
313 private static final Set https = new HashSet( Arrays.asList( | |
314 "www.postgresql-archive.org", | |
315 "ffq.38.me.nabble.com" | |
316 ) ); | |
317 | |
318 public String url() { | |
319 String domain = doc.get(DOMAIN_FLD); | |
320 String scheme = https.contains(domain) ? "https" : "http"; | |
321 return scheme + "://" + domain + "/"; | |
322 } | |
323 | |
324 public String subject() { | |
325 return doc.get(SUBJECT_FLD); | |
326 } | |
327 | |
328 public String subjectHtml() { | |
329 return HtmlUtils.htmlEncode(subject()); | |
330 } | |
331 | |
332 public String link() { | |
333 return "<a href=\"" + url() + "\">" + subjectHtml() + "</a>"; | |
334 } | |
335 | |
336 public String message() { | |
337 return doc.get(MESSAGE_FLD); | |
338 } | |
339 | |
340 public String type() { | |
341 return doc.get(TYPE_FLD); | |
342 } | |
343 | |
344 public int activity() { | |
345 NumericField fld = (NumericField)doc.getFieldable(ACTIVITY_FLD); | |
346 return fld.getNumericValue().intValue(); | |
347 } | |
348 | |
349 public int nodeCount() { | |
350 NumericField fld = (NumericField)doc.getFieldable(NODE_COUNT_FLD); | |
351 return fld.getNumericValue().intValue(); | |
352 } | |
353 | |
354 public Date whenCreated() { | |
355 NumericField fld = (NumericField)doc.getFieldable(WHEN_CREATED_FLD); | |
356 return new Date(fld.getNumericValue().longValue()); | |
357 } | |
358 | |
359 public boolean isEmbarrassing() { | |
360 return Boolean.parseBoolean( doc.get(EMBARRASSING_FLD) ); | |
361 } | |
362 | |
363 public String tweaks() { | |
364 return doc.get(TWEAKS_FLD); | |
365 } | |
366 | |
367 public int fileCount() { | |
368 NumericField fld = (NumericField)doc.getFieldable(FILE_COUNT_FLD); | |
369 return fld.getNumericValue().intValue(); | |
370 } | |
371 | |
372 public String ownerEmail() { | |
373 return doc.get(OWNER_EMAIL_FLD); | |
374 } | |
375 | |
376 public int monthlyViews() { | |
377 NumericField fld = (NumericField)doc.getFieldable(MONTHLY_VIEWS_FLD); | |
378 return fld.getNumericValue().intValue(); | |
379 } | |
380 | |
381 } |