Python script that finds files with identical content (duplicate files)

Posted 3 years ago, written in Python, viewed 210 times.
URL https://pastebin.vip/view/95e62984
# Hello, this script is written in Python - http://www.python.org
# doublesdetector.py 1.0p
import hashlib
import os
import os.path
import sha
import string
import sys

  5. message = """
  6. doublesdetector.py 1.0p
  7.  
  8. This script will search for files that are identical
  9. (whatever their name/date/time).
  10.  
  11.  Syntax : python %s <directories>
  12.  
  13.      where <directories> is a directory or a list of directories
  14.      separated by a semicolon (;)
  15.  
  16. Examples : python %s c:\windows
  17.           python %s c:\;d:\;e:\ > doubles.txt
  18.           python %s c:\program files > doubles.txt
  19.  
  20. This script is public domain. Feel free to reuse and tweak it.
  21. The author of this script Sebastien SAUVAGE <sebsauvage at sebsauvage dot net>
  22. http://sebsauvage.net/python/
  23. """ % ((sys.argv[0], )*4)
  24.  
  25. def fileSHA ( filepath ) :
  26.     """ Compute SHA (Secure Hash Algorythm) of a file.
  27.        Input : filepath : full path and name of file (eg. 'c:\windows\emm386.exe')
  28.        Output : string : contains the hexadecimal representation of the SHA of the file.
  29.                          returns '0' if file could not be read (file not found, no read rights...)
  30.    """
  31.     try:
  32.         file = open(filepath,'rb')
  33.         digest = sha.new()
  34.         data = file.read(65536)
  35.         while len(data) != 0:
  36.             digest.update(data)
  37.             data = file.read(65536)
  38.         file.close()
  39.     except:
  40.         return '0'
  41.     else:
  42.         return digest.hexdigest()
  43.  
  44. def detectDoubles( directories ):
  45.     fileslist = {}
  46.     # Group all files by size (in the fileslist dictionnary)
  47.     for directory in directories.split(';'):
  48.         directory = os.path.abspath(directory)
  49.         sys.stderr.write('Scanning directory '+directory+'...')
  50.         os.path.walk(directory,callback,fileslist)
  51.         sys.stderr.write('\n')
  52.  
  53.     sys.stderr.write('Comparing files...')
  54.     # Remove keys (filesize) in the dictionnary which have only 1 file
  55.     for (filesize,listoffiles) in fileslist.items():
  56.         if len(listoffiles) == 1:
  57.             del fileslist[filesize]
  58.  
  59.     # Now compute SHA of files that have the same size,
  60.     # and group files by SHA (in the filessha dictionnary)
  61.     filessha = {}
  62.     while len(fileslist)>0:
  63.         (filesize,listoffiles) = fileslist.popitem()
  64.         for filepath in listoffiles:
  65.             sys.stderr.write('.')
  66.             sha = fileSHA(filepath)
  67.             if filessha.has_key(sha):
  68.                 filessha[sha].append(filepath)
  69.             else:
  70.                 filessha[sha] = [filepath]
  71.     if filessha.has_key('0'):
  72.         del filessha['0']
  73.  
  74.     # Remove keys (sha) in the dictionnary which have only 1 file
  75.     for (sha,listoffiles) in filessha.items():
  76.         if len(listoffiles) == 1:
  77.             del filessha[sha]
  78.     sys.stderr.write('\n')
  79.     return filessha
  80.  
  81. def callback(fileslist,directory,files):
  82.     sys.stderr.write('.')
  83.     for fileName in files:
  84.         filepath = os.path.join(directory,fileName)
  85.         if os.path.isfile(filepath):
  86.             filesize = os.stat(filepath)[6]
  87.             if fileslist.has_key(filesize):
  88.                 fileslist[filesize].append(filepath)
  89.             else:
  90.                 fileslist[filesize] = [filepath]
  91.  
  92. if len(sys.argv)>1 :
  93.     doubles = detectDoubles(" ".join(sys.argv[1:]))
  94.     print 'The following files are identical:'
  95.     print '\n'.join(["----\n%s" % '\n'.join(doubles[filesha]) for filesha in doubles.keys()])
  96.     print '----'
  97. else:
  98.     print message
  99. #//python/5176

Reply to "Python script that finds files with identical content (duplicate files)"

Here you can reply to the paste above

captcha

https://burned.cc - Burn After Reading Website