#!/usr/bin/env python
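"""Find duplicate files under the given paths.

Candidates are narrowed down in three passes: files are grouped by size, then
by a hash of their first 1024 bytes, and finally confirmed with a full-file hash.
"""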
# If running under Python 3, change the shebang accordingly; the next import can
# then be dropped for readability (it does no harm in Python 3).
from __future__ import print_function  # py2 compatibility
from collections import defaultdict
import hashlib
import os
import sys


def chunk_reader(fobj, chunk_size=1024):
    """Generator that reads a file in chunks of bytes"""
    while True:
        chunk = fobj.read(chunk_size)
        if not chunk:
            return
        yield chunk


def get_hash(filename, first_chunk_only=False, hash=hashlib.sha1):
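    """Return the digest of a file's contents, or of its first 1024 bytes only."""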
    hashobj = hash()
    # a context manager closes the file even if reading raises
    with open(filename, 'rb') as file_object:
        if first_chunk_only:
            hashobj.update(file_object.read(1024))
        else:
            for chunk in chunk_reader(file_object):
                hashobj.update(chunk)
    return hashobj.digest()


def check_for_duplicates(paths, hash=hashlib.sha1):
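    """Walk the given paths and print pairs of files with identical contents.

    Candidates are narrowed in three passes: group by file size, then by a
    hash of the first 1024 bytes, then confirm with a full-file hash.
    """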
    hashes_by_size = defaultdict(list)  # dict of size_in_bytes: [full_path_to_file1, full_path_to_file2, ...]
    hashes_on_1k = defaultdict(list)    # dict of (hash1k, size_in_bytes): [full_path_to_file1, full_path_to_file2, ...]
    hashes_full = {}                    # dict of full_file_hash: full_path_to_file_string

    for path in paths:
        for dirpath, dirnames, filenames in os.walk(path):
            # get all files that have the same size - they are the collision candidates
            for filename in filenames:
                full_path = os.path.join(dirpath, filename)
                try:
                    # if the entry is a symbolic link, resolve it so that the
                    # actual target file is the one measured and hashed
                    full_path = os.path.realpath(full_path)
                    file_size = os.path.getsize(full_path)
                    hashes_by_size[file_size].append(full_path)
                except OSError:
                    # not accessible (permissions, etc.) - skip it
                    continue

    # For all files with the same size, hash only their first 1024 bytes
    for size_in_bytes, files in hashes_by_size.items():
        if len(files) < 2:
            continue    # this file size is unique, no need to spend CPU cycles on it

        for filename in files:
            try:
                small_hash = get_hash(filename, first_chunk_only=True, hash=hash)
                # the key is the hash of the first 1024 bytes plus the size, to
                # avoid collisions between equal hashes of the first part of the file
                # credits to @Futal for the optimization
                hashes_on_1k[(small_hash, size_in_bytes)].append(filename)
            except OSError:
                # the file may have become inaccessible since the walk above
                continue

    # For all files with the same hash of the first 1024 bytes, hash the full
    # file - collisions at this stage are real duplicates
    for __, files_list in hashes_on_1k.items():
        if len(files_list) < 2:
            continue    # this hash of the first 1k bytes is unique, no need to spend CPU cycles on it

        for filename in files_list:
            try:
                full_hash = get_hash(filename, first_chunk_only=False, hash=hash)
                duplicate = hashes_full.get(full_hash)
                if duplicate:
                    print("Duplicate found: {} and {}".format(filename, duplicate))
                else:
                    hashes_full[full_hash] = filename
            except OSError:
                # the file may have become inaccessible since the walk above
                continue


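# Example invocation (script name and paths are illustrative):
#   python find_duplicates.py /home/alice/photos /mnt/backup/photos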
if __name__ == "__main__":
    if sys.argv[1:]:
        check_for_duplicates(sys.argv[1:])
    else:
        print("Please pass the paths to check as parameters to the script")