Lowell Kirsh
I have a script which I use to find all duplicates of files within a
given directory and all its subdirectories. It seems like it's longer
than it needs to be but I can't figure out how to shorten it. Perhaps
there are some python features or libraries I'm not taking advantage of.
The way it works is that it puts references to all the files in a
dictionary with file size being the key. The dictionary can hold
multiple values per key. Then it looks at each key and all the
associated files (which are the same size). Then it uses filecmp to see
if they are actually byte-for-byte copies.
It's not 100% complete but it's pretty close.
Lowell
#!/usr/bin/env python
import os, os.path, filecmp, operator, sys
def partitionDuplicates(args, eq=operator.eq):
    """Group the elements of *args* that are duplicates of one another.

    Each element that has not already been placed in a group is compared,
    using *eq*, against every later element that is also still ungrouped.
    Elements without any duplicate are left out of the result.

    Example:
        partitionDuplicates([1, 2, 3, 1, 1, 3]) -> [[1, 1, 1], [3, 3]]

    Args:
        args: an indexable sequence of items to compare.
        eq: two-argument callable returning a true value when both items
            count as duplicates (defaults to ``operator.eq``).

    Returns:
        A list of lists. Every inner list holds two or more mutually
        matching items; groups appear in order of their first member.
    """
    numFiles = len(args)
    if numFiles <= 1:
        return []

    partitions = []
    # flags[k] is True once args[k] has been claimed by an earlier group.
    flags = [False] * numFiles
    for i in range(numFiles - 1):
        if flags[i]:
            continue  # already matched to an earlier item
        matches = [args[i]]  # the current item we're comparing others to
        for j in range(i + 1, numFiles):
            if not flags[j] and eq(args[i], args[j]):
                matches.append(args[j])
                flags[j] = True
        if len(matches) > 1:
            partitions.append(matches)

    # Sanity checks: no item is reported twice, and neighbours in a group
    # really do compare equal.
    assert sum(len(group) for group in partitions) <= len(args)
    for partition in partitions:
        for i in range(len(partition) - 1):
            assert eq(partition[i], partition[i + 1])
    return partitions


# Import-time self-check of the documented example.
assert partitionDuplicates([1, 2, 3, 1, 1, 3], operator.eq) == [[1,1,1],[3,3]]
def enumerateMatches(dir):
    """Find groups of files under *dir* whose contents are identical.

    Files are first bucketed by size, since files of different sizes
    cannot be copies of each other. Each bucket is then split into groups
    of true duplicates by comparing file contents.

    Args:
        dir: path of the directory to scan; subdirectories are included.

    Returns:
        A list of lists of full path names. Each inner list contains two
        or more files that compared equal.
    """
    by_size = {}
    # first, find duplicates strictly by their sizes
    for root, dirs, files in os.walk(dir):
        for name in files:
            fullname = os.path.join(root, name)
            try:
                size = os.path.getsize(fullname)
            except OSError:
                # e.g. a dangling symlink, or a file removed during the walk
                continue
            by_size.setdefault(size, []).append(fullname)

    def same_contents(first, second):
        # shallow=False forces a real content comparison; the default only
        # compares os.stat() signatures (type, size, mtime) when they match.
        return filecmp.cmp(first, second, shallow=False)

    # now check which files are really duplicates of each other
    toreturn = []
    for candidates in by_size.values():
        toreturn.extend(partitionDuplicates(candidates, same_contents))
    return toreturn
def fakeDelete(file):
    """Stand-in for a real delete: only report the file that would go."""
    print('simulating deleting %s\n' % (file,))


def dealWithMatches(matches, delete=fakeDelete):
    """Ask the user which file of a duplicate group to keep.

    The group is listed with an index per file and a line is read from
    standard input. Entering a valid index calls *delete* on every other
    file; entering "s" skips the group; anything else re-prompts.

    Args:
        matches: list of file names that are duplicates of each other.
        delete: one-argument callable invoked for each file to remove
            (defaults to ``fakeDelete``, which only prints).

    Returns:
        True if files were deleted, False if the group was skipped.
    """
    try:
        read_line = raw_input  # Python 2
    except NameError:
        read_line = input  # Python 3

    while True:
        print('Which file do you want to keep? (all others will be deleted)')
        print('or press "s" to skip this group')
        for index, name in enumerate(matches):
            print('%d) %s' % (index, name))
        answer = read_line().strip().lower()
        if answer == 's':
            print('')
            return False
        # Only the conversion is guarded, so a ValueError raised by
        # *delete* is not mistaken for a bad entry.
        try:
            choice = int(answer)
        except ValueError:
            choice = None
        if choice is not None and 0 <= choice < len(matches):
            for index, name in enumerate(matches):
                if index != choice:
                    delete(name)
            return True
        print('Invalid entry. Try again\n')
if __name__ == '__main__':
    # Script entry point: expects exactly one argument, the directory to
    # scan, then walks the user through every group of duplicates found.
    if len(sys.argv) != 2:
        print('usage: %s dirname' % os.path.basename(sys.argv[0]))
    else:
        matcheses = enumerateMatches(sys.argv[1])
        print('Found at least %d matches' % len(matcheses))
        for matches in matcheses:
            dealWithMatches(matches)
I have a script which I use to find all duplicates of files within a
given directory and all its subdirectories. It seems like it's longer
than it needs to be but I can't figure out how to shorten it. Perhaps
there are some python features or libraries I'm not taking advantage of.
The way it works is that it puts references to all the files in a
dictionary with file size being the key. The dictionary can hold
multiple values per key. Then it looks at each key and all the
associated files (which are the same size). Then it uses filecmp to see
if they are actually byte-for-byte copies.
It's not 100% complete but it's pretty close.
Lowell
#!/usr/bin/env python
import os, os.path, filecmp, operator, sys
def partitionDuplicates(args, eq=operator.eq):
    """Group the elements of *args* that are duplicates of one another.

    Each element that has not already been placed in a group is compared,
    using *eq*, against every later element that is also still ungrouped.
    Elements without any duplicate are left out of the result.

    Example:
        partitionDuplicates([1, 2, 3, 1, 1, 3]) -> [[1, 1, 1], [3, 3]]

    Args:
        args: an indexable sequence of items to compare.
        eq: two-argument callable returning a true value when both items
            count as duplicates (defaults to ``operator.eq``).

    Returns:
        A list of lists. Every inner list holds two or more mutually
        matching items; groups appear in order of their first member.
    """
    numFiles = len(args)
    if numFiles <= 1:
        return []

    partitions = []
    # flags[k] is True once args[k] has been claimed by an earlier group.
    flags = [False] * numFiles
    for i in range(numFiles - 1):
        if flags[i]:
            continue  # already matched to an earlier item
        matches = [args[i]]  # the current item we're comparing others to
        for j in range(i + 1, numFiles):
            if not flags[j] and eq(args[i], args[j]):
                matches.append(args[j])
                flags[j] = True
        if len(matches) > 1:
            partitions.append(matches)

    # Sanity checks: no item is reported twice, and neighbours in a group
    # really do compare equal.
    assert sum(len(group) for group in partitions) <= len(args)
    for partition in partitions:
        for i in range(len(partition) - 1):
            assert eq(partition[i], partition[i + 1])
    return partitions


# Import-time self-check of the documented example.
assert partitionDuplicates([1, 2, 3, 1, 1, 3], operator.eq) == [[1,1,1],[3,3]]
def enumerateMatches(dir):
    """Find groups of files under *dir* whose contents are identical.

    Files are first bucketed by size, since files of different sizes
    cannot be copies of each other. Each bucket is then split into groups
    of true duplicates by comparing file contents.

    Args:
        dir: path of the directory to scan; subdirectories are included.

    Returns:
        A list of lists of full path names. Each inner list contains two
        or more files that compared equal.
    """
    by_size = {}
    # first, find duplicates strictly by their sizes
    for root, dirs, files in os.walk(dir):
        for name in files:
            fullname = os.path.join(root, name)
            try:
                size = os.path.getsize(fullname)
            except OSError:
                # e.g. a dangling symlink, or a file removed during the walk
                continue
            by_size.setdefault(size, []).append(fullname)

    def same_contents(first, second):
        # shallow=False forces a real content comparison; the default only
        # compares os.stat() signatures (type, size, mtime) when they match.
        return filecmp.cmp(first, second, shallow=False)

    # now check which files are really duplicates of each other
    toreturn = []
    for candidates in by_size.values():
        toreturn.extend(partitionDuplicates(candidates, same_contents))
    return toreturn
def fakeDelete(file):
    """Stand-in for a real delete: only report the file that would go."""
    print('simulating deleting %s\n' % (file,))


def dealWithMatches(matches, delete=fakeDelete):
    """Ask the user which file of a duplicate group to keep.

    The group is listed with an index per file and a line is read from
    standard input. Entering a valid index calls *delete* on every other
    file; entering "s" skips the group; anything else re-prompts.

    Args:
        matches: list of file names that are duplicates of each other.
        delete: one-argument callable invoked for each file to remove
            (defaults to ``fakeDelete``, which only prints).

    Returns:
        True if files were deleted, False if the group was skipped.
    """
    try:
        read_line = raw_input  # Python 2
    except NameError:
        read_line = input  # Python 3

    while True:
        print('Which file do you want to keep? (all others will be deleted)')
        print('or press "s" to skip this group')
        for index, name in enumerate(matches):
            print('%d) %s' % (index, name))
        answer = read_line().strip().lower()
        if answer == 's':
            return False
        # Only the conversion is guarded, so a ValueError raised by
        # *delete* is not mistaken for a bad entry.
        try:
            choice = int(answer)
        except ValueError:
            choice = None
        if choice is not None and 0 <= choice < len(matches):
            for index, name in enumerate(matches):
                if index != choice:
                    delete(name)
            return True
        print('Invalid entry. Try again\n')
if __name__ == '__main__':
    # Script entry point: expects exactly one argument, the directory to
    # scan, then walks the user through every group of duplicates found.
    if len(sys.argv) != 2:
        print('usage: %s dirname' % os.path.basename(sys.argv[0]))
    else:
        matcheses = enumerateMatches(sys.argv[1])
        print('Found at least %d matches' % len(matcheses))
        for matches in matcheses:
            dealWithMatches(matches)