#!/usr/bin/env python3
# Dupinator
# Original script by Bill Bumgarner: see
# http://www.pycs.net/bbum/2004/12/29/
#
# Updated by Andrew Shearer on 2004-12-31: see
# http://www.shearersoftware.com/personal/weblog/2005/01/14/dupinator-ii
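#
# Usage (the script takes one or more directory trees as arguments; the
# example paths are illustrative):
#   python3 dupinator.Andrew_Shearer.py ~/Pictures /Volumes/Backup
# Deletion is disabled by default: the os.remove() call near the end is
# commented out, so the script only reports what it would delete.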
import hashlib
import os
import sys

filesBySize = {}           # size in bytes -> list of paths with that size
requireEqualNames = False  # if True, duplicates must also share a basename
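
# Illustration of the pass-1 index built below (made-up values):
#   filesBySize = {52417: ['/pics/a.jpg', '/backup/a.jpg'],
#                  98304: ['/pics/b.png']}
# Only groups with more than one file are worth hashing in later passes.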
# Pass 1: group files by size. Only files of equal size can be duplicates,
# so this pass cheaply narrows the candidate set without reading any data.
for x in sys.argv[1:]:
    print('Scanning directory "%s"....' % x)
    for dirname, dirnames, fnames in os.walk(x):
        if 'Thumbs' in dirnames:  # skip 'Thumbs' thumbnail-cache directories
            dirnames.remove('Thumbs')
        for f in fnames:
            if f == '.DS_Store':
                continue
            path = os.path.join(dirname, f)
            if not os.path.isfile(path) or os.path.islink(path):
                continue
            size = os.path.getsize(path)
            if size < 100:  # ignore tiny files
                continue
            filesBySize.setdefault(size, []).append(path)
FIRST_SCAN_BYTES = 1024  # pass 2 hashes only this many leading bytes per file
# Pass 2: within each size group, hash only the first FIRST_SCAN_BYTES of
# each file. Files whose leading bytes differ cannot be duplicates.
print('Finding potential dupes...')
dupes = []  # ashearer
potentialDupes = []
potentialCount = 0
for k in sorted(filesBySize):
    inFiles = filesBySize[k]
    hashes = {}
    if len(inFiles) == 1:
        continue
    print('Testing %d files of size %d...' % (len(inFiles), k))
    if requireEqualNames:
        # Keep only files that share a basename with at least one other file.
        for fileName in inFiles:
            hashes.setdefault(os.path.basename(fileName), []).append(fileName)
        inFiles = []
        for nameGroup in hashes.values():
            if len(nameGroup) > 1:
                inFiles.extend(nameGroup)
        hashes = {}
    for fileName in inFiles:
        with open(fileName, 'rb') as aFile:  # binary mode: hash raw bytes
            hashValue = hashlib.md5(aFile.read(FIRST_SCAN_BYTES)).digest()
        hashes.setdefault(hashValue, []).append(fileName)
    outFileGroups = [fileGroup for fileGroup in hashes.values() if len(fileGroup) > 1]  # ashearer
    if k <= FIRST_SCAN_BYTES:  # we already scanned the whole file; put into definite dupes list (ashearer)
        dupes.extend(outFileGroups)
    else:
        potentialDupes.extend(outFileGroups)
        potentialCount = potentialCount + len(outFileGroups)
del filesBySize  # free the size index; it is no longer needed
print('Found %d sets of potential dupes...' % potentialCount)
print('Scanning for real dupes...')
# Pass 3: fully hash each remaining candidate in 4 KB chunks; files with
# identical digests are treated as real duplicates.
for aSet in potentialDupes:
    hashes = {}
    for fileName in aSet:
        print('Scanning file "%s"...' % fileName)
        hasher = hashlib.md5()
        with open(fileName, 'rb') as aFile:
            while True:
                r = aFile.read(4096)
                if not r:
                    break
                hasher.update(r)
        hashes.setdefault(hasher.digest(), []).append(fileName)  # ashearer
    outFileGroups = [fileGroup for fileGroup in hashes.values() if len(fileGroup) > 1]  # ashearer
    dupes.extend(outFileGroups)
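
# MD5 is used here for speed, not security: files with equal digests are
# assumed identical. For untrusted input, a final byte-by-byte comparison
# of each matching pair would rule out hash collisions.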
# Report: the first file in each set is kept as the "original"; the rest
# would be deleted. Uncomment os.remove(f) to actually delete them.
i = 0
bytesSaved = 0
for d in dupes:
    print('Original is %s' % d[0])
    for f in d[1:]:
        i = i + 1
        print('Would have deleted %s' % f)
        bytesSaved += os.path.getsize(f)
        # os.remove(f)
    print()
print("Would have saved %.1fK; %d duplicate file(s) in %d set(s)." % (bytesSaved / 1024.0, i, len(dupes)))