fzeval.py
# -*- coding: utf-8 -*-
"""malcluster: malware clustering analysis tool"""
__version__ = "0.1.0"
import os
import shutil
import cPickle
import itertools
import cProfile
import numpy as np
import sklearn.metrics as sm
import multiprocessing as mp
import matplotlib.pyplot as plt
import mvhash
import nghash
import sdhash
import bshash
import imphash
import rlhash
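
# Typical workflow (inferred from main() below): generate_data() computes the
# pairwise fingerprint distances for each hash algorithm and pickles them to
# "<hash_name>.same" / "<hash_name>.diff"; the display_* functions then load
# those pickles to plot distance distributions, CDFs, and ROC curves.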
malorder = []
fingerprints = {}
cwd = os.getcwd() # @UndefinedVariable
myhash = None # @UndefinedVariable
algorithms = [
    bshash.BsHash(81920, 7),       # BsHash works on original whole samples
    nghash.NgHash(7),              # NgHash works on original whole samples
    imphash.ImpHash(1),            # ImpHash works on original whole samples
    rlhash.RlHash(16807, 256, 1),
    mvhash.MvHash(512, 20, 0.7),   # MvHash works on python-extracted code sequences
    sdhash.SdHash()                # SdHash works on python-extracted code sequences
]
hash_names = ["bshash", "nghash", "imphash", "rlhash", "mvhash", "sdhash"]
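
# Samples are read from the "samples/" directory under the current working
# directory; generate_data() assumes file names follow a
# "<family>-<identifier>" convention (the family label is everything before
# the first '-').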

def hash_gen():
    """Wrapper for generating fingerprints from malware samples.

    Populates the module-level fingerprints dict, keyed by sample file name.
    """
    shutil.rmtree(os.path.join(cwd, "hashs/"), ignore_errors=True)  # @UndefinedVariable
    os.mkdir(os.path.join(cwd, "hashs"))
    mallist = (x for x in os.listdir(os.path.join(cwd, "samples/")))
    for malware in mallist:
        malpath = os.path.join(cwd, "samples/" + malware)
        fingerprints[malware] = myhash.generateHash(malpath)
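
# multiprocessing.Pool.map can only dispatch picklable, module-level callables,
# so the comparison wrapper below is a top-level function rather than a lambda.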
def hash_comp(ab):
    return myhash.compareHash(ab[0], ab[1])

def get_dmentry_same():
    """Get dmentry for same-family samples; malorder holds a single family set.

    :return: fingerprint pairs from the same family
    """
    for item in itertools.combinations(malorder, 2):  # @UndefinedVariable
        yield map(fingerprints.get, item)

def get_dmentry_different(mal_families):
    """Get dmentry for samples from different families.

    :return: fingerprint pairs drawn from two different families
    """
    for (i, j) in itertools.combinations(mal_families.keys(), 2):
        for temp_i in mal_families[i]:
            for temp_j in mal_families[j]:
                yield (fingerprints.get(temp_i), fingerprints.get(temp_j))

def get_dmlist(mal_families):
    """Compute all pairwise distances for one batch of sample pairs.

    :rtype: list of distances
    """
    number_per_round = 10000
    result = []
    if not mal_families:
        getdm = get_dmentry_same()
    else:
        getdm = get_dmentry_different(mal_families)
    pool = mp.Pool(processes=mp.cpu_count())
    while True:  # @UndefinedVariable
        # Feed the workers in fixed-size slices so the pair generator is never
        # materialized all at once.
        tempresult = pool.map(hash_comp, itertools.islice(getdm, number_per_round))
        if tempresult:
            result.extend(tempresult)
        else:
            break
    # Release the worker processes; get_dmlist is called once per family.
    pool.close()
    pool.join()
    return result

def generate_data(hashindex=-1):
    global malorder
    global myhash
    if hashindex == -1:
        hash_algos = algorithms
    else:
        hash_algos = [algorithms[hashindex]]
    # if len(os.listdir("samples_whole")) == 1146 and hash_algo:
    #     for file_name in os.listdir("samples"):
    #         shutil.move("samples/" + file_name, "samples_code/" + file_name)
    #         shutil.move("samples_whole/" + file_name, "samples/" + file_name)
    for hash_type in hash_algos:
        # Look the name up in the full algorithm list so it stays correct
        # when a single algorithm is selected via hashindex.
        hash_name = hash_names[algorithms.index(hash_type)]
        # if j == 2:
        #     if len(os.listdir("samples_code")) == 1146:
        #         for file_name in os.listdir("samples"):
        #             shutil.move("samples/" + file_name, "samples_whole/" + file_name)
        #             shutil.move("samples_code/" + file_name, "samples/" + file_name)
        print "Generating fingerprint lists for %s (%d)." % (hash_name, len(os.listdir("samples")))  # @UndefinedVariable
        myhash = hash_type
        hash_gen()
        # Group samples into families by the "<family>-..." prefix of the file name
        mal_families = {}
        for mal in fingerprints:
            mal_family = mal.split("-")[0]
            if mal_family not in mal_families:
                mal_families[mal_family] = [mal]
            else:
                mal_families[mal_family].append(mal)
        # Calculate the distances within each single family
        same_family_dm = []
        for family in mal_families:
            malorder = mal_families[family]
            print "Calculating pairwise distance for family %s (%d)." % (family, len(mal_families[family]))
            same_family_dm.extend(get_dmlist(None))
        with open(hash_name + ".same", 'w+b') as f:
            cPickle.dump(same_family_dm, f)
        # Calculate the pairwise distances between every two families
        print "Calculating pairwise distance for samples from different families."
        diff_family_dm = get_dmlist(mal_families)
        with open(hash_name + ".diff", 'w+b') as f:
            cPickle.dump(diff_family_dm, f)

def display_original():
    for hash_name in hash_names:
        with open(hash_name + ".same", 'r+b') as f:
            same_family_dm = cPickle.load(f)
        same_family_uniqw, same_family_inverse = np.unique(same_family_dm, return_inverse=True)
        plt.figure(0)
        plt.plot(same_family_uniqw, np.bincount(same_family_inverse), label=hash_name)
        plt.legend(loc='upper right')
        plt.title("Same Family Distances Distribution")
        plt.xlabel("Distance")
        plt.ylabel("Number Count")
        with open(hash_name + ".diff", 'r+b') as f:
            diff_family_dm = cPickle.load(f)
        diff_family_uniqw, diff_family_inverse = np.unique(diff_family_dm, return_inverse=True)
        plt.figure(1)
        plt.plot(diff_family_uniqw, np.bincount(diff_family_inverse), label=hash_name)
        plt.legend(loc='upper left')
        plt.title("Diff Family Distances Distribution")
        plt.xlabel("Distance")
        plt.ylabel("Number Count")
    plt.show()

def display_roc():
    thresholds = np.linspace(0, 1, 21)
    for hash_name in hash_names:
        tpr = []
        fpr = []
        with open(hash_name + ".same", 'r+b') as f:
            same_family_dm = np.array(cPickle.load(f))
        same_family_uniqw, same_family_inverse = np.unique(same_family_dm, return_inverse=True)
        same_family_dmlist = dict(zip(same_family_uniqw, np.bincount(same_family_inverse)))
        with open(hash_name + ".diff", 'r+b') as f:
            diff_family_dm = np.array(cPickle.load(f))
        diff_family_uniqw, diff_family_inverse = np.unique(diff_family_dm, return_inverse=True)
        diff_family_dmlist = dict(zip(diff_family_uniqw, np.bincount(diff_family_inverse)))
        for threshold in thresholds:
            tp = fp = 0
            for dm in same_family_dmlist:
                if dm <= threshold:
                    tp += same_family_dmlist[dm]
            for dm in diff_family_dmlist:
                if dm <= threshold:
                    fp += diff_family_dmlist[dm]
            tpr.append(tp * 1.0 / same_family_dm.size)
            fpr.append(fp * 1.0 / diff_family_dm.size)
        print "Fuzzy hashing algorithm: %s, AUC: %f" % (hash_name, sm.auc(fpr, tpr))
        plt.figure(0)
        plt.plot(fpr, tpr, label=hash_name)
    plt.ylim(0.75, 1)
    plt.legend(loc='best')
    plt.title("ROC curve for different algorithms")
    plt.xlabel("False positive rate")
    plt.ylabel("True positive rate")
    plt.show()

def display_cdf():
    for hash_name in hash_names:
        with open(hash_name + ".same", 'r+b') as f:
            same_family_dm = cPickle.load(f)
        print "Calculating same-family dmcount metrics for hash", hash_name
        dmcount_total = len(same_family_dm)
        same_family_uniqw, same_family_inverse = np.unique(same_family_dm, return_inverse=True)
        same_family_dmcount = dict(zip(same_family_uniqw, np.bincount(same_family_inverse) * 1.0 / dmcount_total))
        plt.figure(0)
        plt.subplot(1, 2, 1)
        same_family_x = np.sort(np.array(same_family_dmcount.keys()))
        same_family_y = np.zeros(same_family_x.size)
        same_family_y[0] = same_family_dmcount[same_family_x[0]]
        for i in xrange(1, same_family_x.size):
            same_family_y[i] = same_family_y[i-1] + same_family_dmcount[same_family_x[i]]
        plt.plot(same_family_x, same_family_y, label=hash_name)
        plt.legend(loc='lower right')
        plt.title("Same Family Evaluation (recall)")
        plt.xlabel("Distance")
        plt.ylabel("Cumulative Probability")
        with open(hash_name + ".diff", 'r+b') as f:
            diff_family_dm = cPickle.load(f)
        print "Calculating diff-family dmcount metrics for hash", hash_name
        dmcount_total = len(diff_family_dm)
        diff_family_uniqw, diff_family_inverse = np.unique(diff_family_dm, return_inverse=True)
        diff_family_dmcount = dict(zip(diff_family_uniqw, np.bincount(diff_family_inverse) * 1.0 / dmcount_total))
        # plt.figure(1)
        plt.subplot(1, 2, 2)
        diff_family_x = np.sort(np.array(diff_family_dmcount.keys()))
        diff_family_y = np.zeros(diff_family_x.size)
        diff_family_y[0] = diff_family_dmcount[diff_family_x[0]]
        for i in xrange(1, diff_family_x.size):
            diff_family_y[i] = diff_family_y[i-1] + diff_family_dmcount[diff_family_x[i]]
        plt.plot(diff_family_x, diff_family_y, label=hash_name)
        plt.legend(loc='upper left')
        plt.title("Different Families Evaluation (precision)")
        plt.xlabel("Distance")
        plt.ylabel("Cumulative Probability")
        plt.ylim(0, 1)
    plt.show()

def main():
    # generate_data()
    # display_original()
    # display_cdf()
    display_roc()
    print "Finished fuzzy hashing evaluation analysis"


if __name__ == "__main__":
    # cProfile.run('main()')
    main()