duplicate_exact_photos.py
import os
import time
import argparse

import numpy as np
from PIL import Image
from tqdm import tqdm


# Flag duplicates using cosine distance
def find_duplicates(images, thresh=1e-3):
    """
    Find pairs of duplicates in an array of flattened images.

    Since every pair of samples must be compared, cosine distance is used
    because it is fast to compute for the whole matrix at once. A threshold
    decides whether two samples are the same; 1e-3 was found to work best.

    Parameters:
        images (numpy array) : 2D array of shape (n_files, flattened_image_pixels)
        thresh (float) : cosine-distance threshold for deciding duplicate or not.

    Returns:
        bool numpy array of shape (n_files, n_files)
    """
    images_unit = images / np.linalg.norm(images, ord=2, axis=1, keepdims=True)  # Convert images to unit vectors
    image_distances = 1 - images_unit @ images_unit.T  # Calculate cosine distance between every pair of images
    return image_distances < thresh  # Pairs closer than the threshold are flagged as duplicates
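

# A minimal sketch of how find_duplicates behaves on toy data (hypothetical
# values: two identical rows and one distinct row):
#
#     >>> imgs = np.array([[1., 2., 3.], [1., 2., 3.], [3., 1., 0.]])
#     >>> find_duplicates(imgs)
#     array([[ True,  True, False],
#            [ True,  True, False],
#            [False, False,  True]])
#
# Rows 0 and 1 are flagged as mutual duplicates; the diagonal is always True
# because every image has zero cosine distance to itself.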


def check_folder(folder, compare_size=300, keep_largest=0):
    """
    Read all the files inside a folder and compare them.

    Parameters:
        folder (directory path) : Path to the folder that needs to be checked.
        compare_size (int) : All images are resized to compare_size x compare_size
            for comparison. Higher values consume more memory and lower values
            can lead to inaccurate results; 300 is a good middle ground.
        keep_largest (int) : 1 keeps the largest file among duplicates, 0 the smallest.

    Returns:
        list of the file paths that are duplicates.
    """
    print("Checking folder → " + folder)
    files = os.listdir(folder)  # Get all files in the folder
    files.sort()
    images_name = []
    all_images = []
    to_delete = []

    # Read all images in the folder
    time.sleep(0.1)  # Otherwise tqdm's progress bar tends to interleave with print output
    for file_name in tqdm(files):
        # Try to read each file as an image; skip files that are not images
        try:
            image = Image.open(os.path.join(folder, file_name)).convert('L')  # Read and convert to grayscale
            image = image.resize((compare_size, compare_size))  # Resize image to the comparison size
            image = np.array(image).reshape(-1)  # Flatten the 2D image to a 1D array for easier computation
            all_images.append(image)
            images_name.append(file_name)
        except Exception:
            pass
    time.sleep(0.1)

    m = len(all_images)
    print("Total images found = " + str(m))
    if m < 2:  # Duplicates are not possible when there are 0 or 1 images.
        print()
        return to_delete  # Return empty list

    # Combine all image arrays together and flag duplicates
    all_images = np.stack(all_images)
    print(f"Finding duplicate images within the folder {folder}...")
    images_duplicates = find_duplicates(all_images)
    def file_size(image_idx):
        """
        Get the file size of an image, used to decide which duplicate to keep.

        Parameters:
            image_idx : index of the image in images_name

        Returns:
            file size in bytes
        """
        return os.path.getsize(os.path.join(folder, images_name[image_idx]))
    # Collect all flagged duplicates and decide which files to keep.
    visited = [False] * m  # Avoid reprocessing images already grouped as duplicates
    files_duplicates = []  # Store the group of duplicates for each file
    for i in tqdm(range(m)):
        if visited[i]:
            continue
        visited[i] = True
        image_duplicates = images_duplicates[i]  # Duplicate flags for the current file
        if np.count_nonzero(image_duplicates) > 1:  # More than one file similar to the current file (including the file itself)
            duplicate_idx = np.where(image_duplicates)[0].tolist()  # Get indexes of all duplicate images
            duplicate_idx.sort(key=file_size, reverse=keep_largest == 1)  # Sort by file size; reverse depends on which size is preferred
            for j, idx in enumerate(duplicate_idx):
                visited[idx] = True  # Mark duplicates as visited to avoid reprocessing them
                if j > 0:
                    to_delete.append(os.path.join(folder, images_name[idx]))  # Mark every duplicate for deletion except the preferred file
            files_duplicates.append(duplicate_idx)
    time.sleep(0.1)
    # Display all duplicate files within the folder.
    if len(files_duplicates) > 0:
        print("\nDuplicates:")
        for group in files_duplicates:
            for idx in group:
                print(images_name[idx], end="\t")
            print()
    else:
        print("No duplicates found.")
    print()
    return to_delete
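

# As a sketch, suppose a folder holds photo.jpg and photo_copy.jpg with
# identical pixel content but different file sizes (hypothetical names).
# check_folder would then return the rejected file's path, e.g.
# ['photos/photo_copy.jpg'] when keep_largest=0 keeps the smaller file.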


def main(args):
    print(f"Folder to be explored: {args.folder}\n\n")
    folders = [folder[0] for folder in os.walk(args.folder)]  # Find all folders to be searched recursively
    folders.sort()
    all_deletes = []  # Stores all files to be deleted
    for folder in folders:
        all_deletes += check_folder(folder, args.compare_size, args.keep_largest)

    print('-----------------------------Overall Report----------------------------------')
    if len(all_deletes) == 0:
        print("No duplicates found.")
        return

    # Display all files marked for deletion
    print("\nFiles marked for deletion:")
    for file in all_deletes:
        print(file)

    # Prompt the user to decide whether to delete
    print("Enter Y to delete")
    inp = input()
    if inp.lower() == 'y':
        for file in all_deletes:
            os.remove(file)
        print("Done.")
    else:
        print("Files not deleted.")


if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='find-exact-duplicate-images')
    parser.add_argument('--folder', type=str, default=".", help='Directory of images. Default is the current directory.')
    parser.add_argument('--keep_largest', type=int, default=0, help='0: keep the smallest file among duplicates; 1: keep the largest file among duplicates.')
    parser.add_argument('--compare_size', type=int, default=300, help='Size used for comparison; 300 was found to balance performance and accuracy.')
    args = parser.parse_args()
    main(args)
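
# Example invocation (hypothetical path), scanning a photo library recursively
# and keeping the largest file in each duplicate group:
#
#     python duplicate_exact_photos.py --folder ./photos --keep_largest 1
#
# The script only deletes files after listing them and receiving a "Y"
# confirmation at the prompt.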