This repository was archived by the owner on Mar 8, 2020. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 17
Expand file tree
/
Copy pathex14_3.py
More file actions
40 lines (32 loc) · 1.4 KB
/
ex14_3.py
File metadata and controls
40 lines (32 loc) · 1.4 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
#!/usr/bin/python3
import os
import glob #The glob module finds all the pathnames matching a specified pattern according to the rules used by the Unix shell, although results are returned in arbitrary order. No tilde expansion is done, but *, ?, and character ranges expressed with [] will be correctly matched.
# The author asked to write a md5sum check to find duplicate without providing anymore information. It is interesting to learn how md5sum actually works. So for reference: https://en.wikipedia.org/wiki/Md5sum
import hashlib
import itertools
#following code is copied from http://stackoverflow.com/questions/3431825/generating-an-md5-checksum-of-a-file
def md5(fname):
    """Return the hexadecimal MD5 digest of the file at *fname*.

    Reads the file in binary mode in fixed-size chunks so that
    arbitrarily large files can be hashed without loading them
    fully into memory.
    """
    digest = hashlib.md5()
    with open(fname, "rb") as stream:
        while True:
            block = stream.read(4096)
            if not block:  # EOF: read() returned b""
                break
            digest.update(block)
    return digest.hexdigest()
def main():
    """Report duplicate .py files under the current directory tree.

    Walks './**/*.py' recursively, hashes each file with MD5, prints
    each file's absolute path and digest as it goes, then prints the
    set of filenames whose digest is shared by more than one file.
    """
    d = {}      # relative filename -> md5 hexdigest
    rev_d = {}  # md5 hexdigest -> set of filenames with that digest
    for filename in glob.glob('./**/*.py', recursive=True):
        path = os.path.abspath(filename)
        print(path)
        # Hash each file exactly once (the original hashed every file
        # twice: once for the print and once for the dict entry).
        checksum = md5(path)
        print(checksum)
        d[filename] = checksum
    # Invert the mapping so files with identical content group together.
    for k, v in d.items():
        rev_d.setdefault(v, set()).add(k)
    print("Are there any duplicate files?")
    # Flatten every group that has more than one member into one set.
    print(set(itertools.chain.from_iterable(
        files for files in rev_d.values() if len(files) > 1)))
# Run the duplicate-file scan only when executed as a script, not on import.
if __name__ == '__main__':
    main()