@@ -209,11 +209,12 @@ def rebuild_feeds(events: pd.DataFrame) -> int:
     # change is not too heavy
     cnt = 0
     sql = """SELECT test_name, probe_cc, probe_asn, input, time, status
-    FROM blocking_events
-    WHERE test_name = %(test_name)s AND input = %(inp)s
-    AND probe_cc = %(cc)s AND probe_asn = %(asn)s
-    ORDER BY time
+        FROM blocking_events
+        WHERE test_name = %(test_name)s AND input = %(inp)s
+        AND probe_cc = %(cc)s AND probe_asn = %(asn)s
+        ORDER BY time
     """
+    events = events.reset_index()
     unique_tcais = events[TCAI].drop_duplicates()
     update_time = datetime.utcnow()
     for x in unique_tcais.itertuples():
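Below this hunk, rebuild_feeds generates one feed per unique (test_name, probe_cc, probe_asn, input) tuple. A minimal sketch of how the parameterized SELECT is presumably consumed per tuple, assuming `click` is a clickhouse_driver Client and that TCAI names those four columns (neither is shown in this hunk):

    # Sketch only: TCAI and `click` are assumptions about code outside this hunk.
    TCAI = ["test_name", "probe_cc", "probe_asn", "input"]
    for x in unique_tcais.itertuples():
        params = dict(test_name=x.test_name, cc=x.probe_cc, asn=x.probe_asn, inp=x.input)
        history = click.execute(sql, params)  # one feed's rows, ordered by time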
@@ -224,6 +225,7 @@ def rebuild_feeds(events: pd.DataFrame) -> int:
         write_feed(feed_data, path)
         cnt += len(history)
 
+    log.info(f"[re]created {cnt} feeds")
     return cnt
 
 
@@ -259,8 +261,8 @@ def load_country_name_map(devel: bool) -> dict:
 def create_tables() -> None:
     # Requires admin privileges
     sql = """
-    CREATE TABLE IF NOT EXISTS blocking_status
-    (
+CREATE TABLE IF NOT EXISTS blocking_status
+(
     `test_name` String,
     `input` String,
     `probe_cc` String,
@@ -274,27 +276,27 @@ def create_tables() -> None:
     `change` Float32,
     `stability` Float32,
     `update_time` DateTime64(0) MATERIALIZED now64()
-    )
-    ENGINE = ReplacingMergeTree
-    ORDER BY (test_name, input, probe_cc, probe_asn)
-    SETTINGS index_granularity = 4
-    """
+)
+ENGINE = ReplacingMergeTree
+ORDER BY (test_name, input, probe_cc, probe_asn)
+SETTINGS index_granularity = 4
+"""
     query(sql)
     sql = """
-    CREATE TABLE IF NOT EXISTS blocking_events
-    (
+CREATE TABLE IF NOT EXISTS blocking_events
+(
     `test_name` String,
     `input` String,
     `probe_cc` String,
     `probe_asn` Int32,
     `status` String,
     `time` DateTime64(3),
     `detection_time` DateTime64(0) MATERIALIZED now64()
-    )
-    ENGINE = ReplacingMergeTree
-    ORDER BY (test_name, input, probe_cc, probe_asn, time)
-    SETTINGS index_granularity = 4
-    """
+)
+ENGINE = ReplacingMergeTree
+ORDER BY (test_name, input, probe_cc, probe_asn, time)
+SETTINGS index_granularity = 4
+"""
     query(sql)
     sql = "CREATE USER IF NOT EXISTS detector IDENTIFIED WITH plaintext_password BY 'detector'"
     query(sql)
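Both tables use ReplacingMergeTree, which collapses rows sharing the same ORDER BY key only during background merges, so a re-inserted status row supersedes the old one eventually rather than immediately. A read-side sketch (not part of the patch): adding FINAL forces deduplication at query time, at some extra cost, yielding one row per key:

    # Not from the patch: FINAL collapses not-yet-merged duplicates at read time.
    rows = click.execute(
        "SELECT test_name, input, probe_cc, probe_asn, status FROM blocking_status FINAL"
    )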
@@ -342,15 +344,15 @@ def reprocess_inner(
         status, events = process_data(status, new)
         if events is not None and len(events):
             events_tmp.append(events)
-            if collect_hist:
-                status_history_tmp.append(status)
+        if collect_hist:
+            status_history_tmp.append(status)
 
     if events_tmp:
         events = pd.concat(events_tmp)
     else:
         events = None
-        status_history = pd.concat(status_history_tmp) if collect_hist else None
-        return events, status, status_history
+    status_history = pd.concat(status_history_tmp) if collect_hist else None
+    return events, status, status_history
 
 
 @metrics.timer("process_historical_data")
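The dedent in this hunk is a genuine control-flow fix: with `status_history = ...` and the `return` nested under `else:`, any run where events_tmp was non-empty fell off the end of the function and implicitly returned None. After the change, every path reduces to the equivalent of:

    # Post-fix control flow, condensed:
    events = pd.concat(events_tmp) if events_tmp else None
    status_history = pd.concat(status_history_tmp) if collect_hist else None
    return events, status, status_history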
@@ -402,6 +404,7 @@ def process_fresh_data(
     urls = sorted(set(u for urls in services.values() for u in urls))
 
     status = load_blocking_status()
+    metrics.gauge("blocking_status_tblsize", len(status))
 
     gen = gen_input(click, start_date, end_date, interval, urls)
     new = None
@@ -443,8 +446,20 @@ def process_fresh_data(
 
     if events is not None and len(events):
         log.debug(f"Appending {len(events)} events to blocking_events table")
-        sql = "INSERT INTO blocking_events VALUES"
-        click.insert_dataframe(sql, events.reset_index(drop=True))
+        ev = events.reset_index()
+        ev = ev.drop(columns=["old_status"])
+        ev["time"] = end_date  # event detection time
+        log.info(ev)
+        assert ev.columns.values.tolist() == [
+            "test_name",
+            "probe_cc",
+            "probe_asn",
+            "input",
+            "status",
+            "time",
+        ]
+        sql = "INSERT INTO blocking_events (test_name, probe_cc, probe_asn, input, status, time) VALUES"
+        click.insert_dataframe(sql, ev)
 
     log.info("Done")
     return events, status
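Naming the columns in the INSERT and asserting the dataframe layout first pins the write to the blocking_events schema instead of relying on positional column order. One caveat, assuming `click` is a clickhouse_driver Client: insert_dataframe only works when the client was created with use_numpy enabled, roughly:

    # Assumption: clickhouse_driver is the client library behind `click`.
    from clickhouse_driver import Client
    click = Client("localhost", settings={"use_numpy": True})
    click.insert_dataframe(
        "INSERT INTO blocking_events (test_name, probe_cc, probe_asn, input, status, time) VALUES",
        ev,
    )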
@@ -673,6 +688,26 @@ def gen():
     return events, status, status_history
 
 
+def process(start, end, interval, services) -> None:
+    events, status = process_fresh_data(start, end, interval, services)
+    log.info(f"Events: {len(events) if events is not None else 0}")
+    if events is not None and len(events):
+        log.info("Rebuilding feeds")
+        rebuild_feeds(events)
+        # TODO: create an index of available RSS feeds
+
+
+def reprocess(conf, services) -> None:
+    click.execute("TRUNCATE TABLE blocking_status SYNC")
+    click.execute("TRUNCATE TABLE blocking_events SYNC")
+
+    t = conf.start_date
+    while t < conf.end_date:
+        te = t + conf.interval
+        process(t, te, conf.interval, services)
+        t += conf.interval
+
+
 def main():
     global click
     setup()
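reprocess() is destructive by design: it truncates both tables, then replays the configured date range one interval-sized window at a time through process(). A usage sketch with hypothetical conf values (hourly windows over one day):

    from datetime import datetime, timedelta

    conf.start_date = datetime(2023, 1, 1)  # hypothetical values
    conf.end_date = datetime(2023, 1, 2)
    conf.interval = timedelta(hours=1)
    reprocess(conf, services)  # truncate, then 24 process() calls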
@@ -692,31 +727,24 @@ def main():
         "Instagram": ["https://www.instagram.com/"],
     }
     if conf.reprocess:
+        # Destructive reprocessing
         assert conf.start_date and conf.end_date, "Dates not set"
-        events, status, _ = process_historical_data(
-            conf.start_date, conf.end_date, conf.interval, services
-        )
-        s = status.reset_index()
-        # log.info((s.accessible_perc, s.cnt, s.status))
+        reprocess(conf, services)
         return
+        # assert conf.start_date and conf.end_date, "Dates not set"
+        # events, status, _ = process_historical_data(
+        #     conf.start_date, conf.end_date, conf.interval, services
+        # )
+        # s = status.reset_index()
+        # log.info((s.accessible_perc, s.cnt, s.status))
 
     else:
         # Process fresh data
         if conf.end_date is None:
             # Beginning of current UTC hour
             conf.end_date = datetime(*datetime.utcnow().timetuple()[:4])
         conf.start_date = conf.end_date - conf.interval
-        events, status = process_fresh_data(
-            conf.start_date, conf.end_date, conf.interval, services
-        )
-        log.info(f"Events: {len(events)}")
-        # s = status.reset_index()
-        # log.info((s.accessible_perc, s.cnt, s.status))
-
-        if events is not None and len(events):
-            log.info("Rebuilding feeds")
-            rebuild_feeds(events)
-            # TODO: create an index of available RSS feeds
+        process(conf.start_date, conf.end_date, conf.interval, services)
 
     gen_stats()
     log.info("Done")
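The end_date default leans on `timetuple()[:4]` being (year, month, day, hour), so the expression rounds the current UTC time down to the start of the hour:

    from datetime import datetime

    now = datetime(2023, 5, 1, 14, 37, 22)  # example instant
    assert datetime(*now.timetuple()[:4]) == datetime(2023, 5, 1, 14)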