-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathPredictor.py
156 lines (138 loc) · 4.71 KB
/
Predictor.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
from __future__ import print_function
import random
from collections import defaultdict, OrderedDict
import dill
from tqdm import tqdm
from GCN import *
class Graph(object):
    """Static graph wrapper: adjacency sets, node count, and node features.

    The adjacency structure is cached in ``data/<dataset>/graph.pkl`` so the
    edge list is only parsed once across runs.
    """
    def __init__(self, params):
        self.params = params
        self.data_dir = 'data/' + self.params.dataset + '/'
        path = self.data_dir + 'graph.pkl'
        try:
            # Reuse the cached adjacency sets if a previous run serialized them.
            with open(path, 'rb') as f:
                self.nbs = dill.load(f)
        except Exception:
            # Cache missing or unreadable: rebuild from the edge list and
            # serialize for the next run. (Original code used a bare except;
            # Exception keeps that best-effort behavior without swallowing
            # KeyboardInterrupt/SystemExit.)
            self.init_nbs()
            with open(path, 'wb') as f:
                dill.dump(self.nbs, f)
        with open(self.data_dir + self.params.graph, 'r') as f:
            # The first line of the graph file holds the node count.
            self.num_node = int(f.readline().strip())
        self.feature = self.read_feature()
        self.feat_dim = len(self.feature[0])

    def init_nbs(self):
        """Parse the edge list into undirected adjacency sets (self.nbs)."""
        with open(self.data_dir + self.params.graph, 'r') as f:
            next(f)  # skip the node-count header line
            # defaultdict(set) instead of defaultdict(lambda: set()):
            # identical behavior, and picklable without dill's lambda support.
            self.nbs = defaultdict(set)
            for line in f:
                n1, n2 = map(int, line.rstrip().split())
                self.nbs[n1].add(n2)
                self.nbs[n2].add(n1)

    def subgraph_es(self, ns):
        """Return directed edge pairs (n, nb) of the subgraph induced by ns."""
        ns_set = set(ns)
        es = []
        for n in ns:
            # Only keep neighbors that are themselves inside the subgraph.
            es += [(n, nb) for nb in self.nbs[n] & ns_set]
        return es

    def read_feature(self):
        """Load node features as float32 rows; row 0 is all-zero padding."""
        feature = []
        with open(self.data_dir + self.params.feature) as f:
            for line in f:
                feature.append(np.array(list(map(int, line.strip().split()))).astype(np.float32))
        # Index 0 indicates an invalid/padding node.
        feature = [np.zeros(len(feature[0]))] + feature
        return np.array(feature)
class SubGraph(object):
    """Kernel matrices for one cascade, parsed from a precomputed meta file.

    kernels[0] is the cascade's node ids as a column vector; each later entry
    is one '#'-delimited block from the file. An empty block is replaced by a
    single all-zero row so downstream shapes stay valid.
    """
    def __init__(self, ns, path):
        self.ns = ns
        self.init(path)

    def init(self, path):
        self.kernels = []
        # Seed with the cascade nodes as a (len(ns), 1) column.
        rows = np.expand_dims(np.array(self.ns), axis=1)
        width = 1
        with open(path, 'r') as meta:
            for raw in meta:
                tokens = raw.rstrip().split()
                if tokens[0] == '#':
                    # A '#' header closes the previous block and opens a new
                    # one whose row width is given by the third token.
                    if len(rows) == 0:
                        rows = np.expand_dims(np.array([0] * width), axis=0)
                    self.kernels.append(np.array(rows))
                    width = int(tokens[2])
                    rows = []
                else:
                    # Drop duplicate ids while keeping first-seen order
                    # (dict/OrderedDict insertion order is guaranteed).
                    deduped = list(OrderedDict.fromkeys(tokens))
                    rows.append(np.array(list(map(int, deduped))))
        # Flush the final block.
        if len(rows) == 0:
            rows = np.expand_dims(np.array([0] * width), axis=0)
        self.kernels.append(np.array(rows))
class Data(object):
    """One training example: a SubGraph together with its label vector."""

    def __init__(self, subgraph, label):
        self.subgraph, self.label = subgraph, label
'''
train.pkl and test.pkl are pre-serialized training and test data;
remember to delete them after running preprocess.py.
'''
class Predictor(object):
    """End-to-end driver: load cascades and labels, train a GCN, report accuracy."""

    def __init__(self, params):
        self.params = params
        self.data_dir = 'data/' + self.params.dataset + '/'
        self.graph = Graph(params)
        data_dump = self.data_dir + 'data.pkl'
        try:
            # Reuse the serialized dataset if a previous run produced it.
            with open(data_dump, 'rb') as f:
                self.data = dill.load(f)
        except Exception:
            # Missing or unreadable cache: parse the raw files, then cache.
            self.data = self.read_data()
            with open(data_dump, 'wb') as f:
                dill.dump(self.data, f)
        # Kernel widths are constant across examples, so probe the first one.
        self.kernel_sizes = [len(kernel[0]) for kernel in self.data[0].subgraph.kernels]
        self.num_kernel = len(self.kernel_sizes)
        self.num_label = len(self.data[0].label)

    def read_data(self):
        """Parse cascades and labels; the meta file for cascade i is '<meta>g<i>'."""
        data = []
        idx = 0  # renamed from 'id' to avoid shadowing the builtin
        with open(self.data_dir + self.params.data, 'r') as f1, \
                open(self.data_dir + self.params.label, 'r') as f2:
            for line1, line2 in zip(f1, f2):
                cascade = list(map(int, line1.strip().split()))
                subgraph = SubGraph(cascade, self.data_dir + self.params.meta + 'g' + str(idx))
                label = np.array(list(map(int, line2.strip().split())))
                idx += 1
                data.append(Data(subgraph, label))
        return data

    def feed_dict(self, data, training):
        """Build the TF feed dict for one example (kernels, label, train flag)."""
        subgraph, label = data.subgraph, data.label
        feed = {k: kernel for k, kernel in zip(self.model.kernel, subgraph.kernels)}
        feed[self.model.label] = label
        feed[self.model.training] = training
        return feed

    def fit(self):
        """Train for params.epoch epochs on a ~90/10 split; return (train_acc, test_acc)."""
        self.params.feat_dim = self.graph.feat_dim
        self.params.num_node = self.graph.num_node
        self.params.kernel_sizes = self.kernel_sizes
        self.params.num_kernel = self.num_kernel
        self.params.num_label = self.num_label
        random.shuffle(self.data)
        # Guard against tiny datasets: with fewer than 10 examples the old
        # int(len/10) gave split_idx == 0, so data[:-0] made the training set
        # EMPTY and the test set the whole dataset.
        split_idx = max(1, len(self.data) // 10)
        train, test = self.data[:-split_idx], self.data[-split_idx:]
        with tf.Session(config=tf.ConfigProto(
                allow_soft_placement=True,
                gpu_options=tf.GPUOptions(per_process_gpu_memory_fraction=self.params.memory_fraction,
                                          allow_growth=True))) as sess:
            self.model = GCN(self.params, self.graph)
            sess.run(tf.global_variables_initializer())
            for _ in tqdm(range(self.params.epoch), ncols=100):
                # One SGD step per example (batch size 1).
                for i in tqdm(range(len(train)), ncols=100):
                    sess.run(self.model.gradient_descent,
                             feed_dict=self.feed_dict(train[i], True))
            train_accuracy = self.eval(sess, train)
            test_accuracy = self.eval(sess, test)
            return train_accuracy, test_accuracy

    def eval(self, sess, test):
        """Top-1 accuracy: fraction of examples whose prediction hits the 1-hot label."""
        correct = 0.0
        for i in tqdm(range(len(test)), ncols=100):
            data = test[i]
            # The label is one-hot; its single 1-index is the ground truth.
            truth = np.where(data.label == 1)[0][0]
            predict = sess.run(self.model.predict, feed_dict=self.feed_dict(data, False))
            if predict == truth:
                correct += 1
        return correct / len(test)