Ubuntu下有一个fdupes程序可以用来打印当前目录下有多少文件是重复的。因为是二进制程序,所以效率上更高。
而用Python实现的,效率就很低了,同样的目录,fdupes只用了2秒的样子,而Python程序就用了15s,大约7-8倍速度差距。
所以这个代码仅仅只是一个演示罢了。代码参考《Python for Unix and Linux System Administration》一书。
| 以下是代码片段: #!/usr/bin/python -tt # -*- coding:utf-8 -*- __DOC__ = ’’’ identifies duplicate files within given directories including subdirectories pydupes.py uses files’ size and md5sums to find duplicate files within a set of directories. ’’’ __Author__ = ’wgzhao,wgzhao##gmail.com’ import os from hashlib import md5 from sys import exit, argv class diskwalk(object): """API for getting directory walking collections""" def __init__(self, path): self.path = path def enumeratePaths(self): """Returns the path to all the files in a directory as a list""" path_collection = [] for dirpath, dirnames, filenames in os.walk(self.path): for file in filenames: fullpath = os.path.join(dirpath, file) path_collection.append(fullpath) return path_collection def enumerateFiles(self): """Returns all the files in a directory as a list""" file_collection = [] for dirpath, dirnames, filenames in os.walk(self.path): for file in filenames: file_collection.append(file) return file_collection def enumerateDir(self): """Returns all the directories in a directory as a list""" dir_collection = [] for dirpath, dirnames, filenames in os.walk(self.path): for dir in dirnames: dir_collection.append(dir) return dir_collection def create_checksum(path): """ Reads in file. Creates checksum of file line by line. Returns complete checksum total for file. """ fp = open(path) checksum = md5() while True: buffer = fp.read(8192) if not buffer: break checksum.update(buffer) fp.close() checksum = checksum.digest() return checksum def findDupes(path = ’/tmp’): dup = {} record = {} d = diskwalk(path) files = d.enumeratePaths() for file in files: compound_key = (os.path.getsize(file),create_checksum(file)) if compound_key in record: if compound_key in dup: dup[compound_key].append(record[compound_key]) else: dup[compound_key] = [ record[compound_key] ] dup[compound_key].append(file) else: #print "Creating compound key record:", compound_key record[compound_key] = file return dup if __name__ == "__main__": if len(argv) < 2: try: path = raw_input(’pls input dir:’) except: path = ’’ else: path = argv[1] if not path: exit(1) dupes = findDupes(path) for k,v in dupes.items(): for filename in v: print filename |
代码运行类似如下:
以下是代码片段: |