-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathdump_events_to_nvd3.py
66 lines (53 loc) · 1.93 KB
/
dump_events_to_nvd3.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
import cPickle as pkl
import pandas as pd
import ujson as json
from datetime import datetime as dt
from check_k_best_trees import k_best_trees
def main():
    """Dump per-event and non-event message counts over time as JSON series.

    Command-line options:
      --result_path         pickled result object passed to ``k_best_trees``.
      --interactions_path   interactions table; read with ``pd.read_json``
                            and, if that raises ``ValueError``, with
                            ``pd.read_pickle``.  It must provide the
                            ``datetime`` and ``message_id`` columns.
      --output_path         file the JSON output is written to.
      --non_event_sample_n  optional number of non-event rows to sample
                            before counting.
      --freq                pandas frequency string used to bucket
                            ``datetime``.
      --k                   number of trees requested from ``k_best_trees``.

    The output is a list of ``{'key': ..., 'values': [...]}`` dicts: one
    ``event-<i>`` series per tree followed by a single ``non-event`` series.
    Each value is ``{'ts': <bucket timestamp>, 'c': <message count>}`` and
    every series covers the same buckets (missing buckets get ``c == 0``).
    """
    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument('--result_path')
    parser.add_argument('--interactions_path')
    parser.add_argument('--output_path')
    parser.add_argument('--non_event_sample_n', type=int)
    parser.add_argument('--freq')
    parser.add_argument('--k', type=int)
    args = parser.parse_args()

    # NOTE(security): unpickling executes arbitrary code embedded in the
    # file; only point --result_path at files you trust.
    # Pickles are binary data, so open in 'rb' and close the handle promptly.
    with open(args.result_path, 'rb') as result_file:
        result = pkl.load(result_file)
    trees = k_best_trees(result, args.k)

    # The interactions file may be JSON or a pickled DataFrame.
    try:
        df = pd.read_json(args.interactions_path)
    except ValueError:
        df = pd.read_pickle(args.interactions_path)

    # for enron:
    # df = df[df['datetime'] > dt(2000, 6, 1)]

    def count_per_bucket(frame):
        """Count messages of *frame* per ``--freq`` bucket of ``datetime``."""
        grouper = pd.Grouper(key='datetime', freq=args.freq)
        return frame.groupby(grouper)['message_id'].count()

    # Buckets of the full data set define the shared x-axis of all series.
    timestamps = count_per_bucket(df).index

    def values(counts):
        """Align *counts* with the shared buckets, filling gaps with 0."""
        # Timestamp.value is in nanoseconds, so this yields milliseconds.
        # '//' spells out the integer division that Python 2's '/' performs
        # on ints, keeping 'ts' integral under any division semantics.
        return [{'ts': ts.value // 1000000,
                 'c': counts[ts] if ts in counts else 0}
                for ts in timestamps]

    data = []
    event_nodes = set()
    for i, tree in enumerate(trees):
        nids = set(tree.nodes())
        # isin() always produces a boolean mask, unlike a per-row lambda.
        event_df = df[df['message_id'].isin(nids)]
        data.append({
            'key': 'event-{}'.format(i + 1),
            'values': values(count_per_bucket(event_df))
        })
        event_nodes |= nids

    # Everything that is not part of any selected tree is "non-event".
    non_event_df = df[~df['message_id'].isin(event_nodes)]
    sample_n = args.non_event_sample_n
    # Asking for at least as many rows as remain would either raise inside
    # sample() or merely permute the rows, which cannot change the counts;
    # in that case keep every remaining row.
    if sample_n and sample_n < len(non_event_df):
        non_event_df = non_event_df.sample(n=sample_n)
    data.append({
        'key': 'non-event',
        'values': values(count_per_bucket(non_event_df))
    })

    with open(args.output_path, 'w') as output_file:
        json.dump(data, output_file)
# Run the export only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()