#!/usr/bin/perl -w
# dupfinder is a small tool to find duplicate
# files in a filesystem.
# You can provide zero, one, or several
# directories where it will search (if no directory
# is provided, "." will be used)
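# Example invocation (paths are hypothetical):
#   ./dupfinder.pl ~/photos /mnt/backup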
# How it works: not much magic here.
# First,
# it finds all the files in the specified directories and
# creates a hash (file_size_hash) of the form:
# "filename" => file size
# Then, the hash is inverted. Multiple files
# can have the same size (our first candidates for duplicates!).
# The inverted hash (size_files_hash) contains an array as value,
# with all the filenames of a given size:
# file size => ["filename1", ..., "filenameN"]
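# A minimal sketch of that inversion (the real code lives in main()):
#   my %size_files_hash;
#   push @{ $size_files_hash{ $file_size_hash{$_} } }, $_
#       for keys %file_size_hash;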
#
# Second,
# another hash is created (file_md5_hash).
# This hash maps each filename to its MD5 checksum.
# Note that we only need to hash the files in
# size_files_hash, since we are only interested
# in files with the same length (take a look at
# sub process_checksums()). The current implementation
# detects whether we need to compute the MD5 checksum
# for the current file by searching for its size
# in size_files_hash. This is "slow", since we have
# to look the size up in a hash. We could improve speed
# by iterating directly over the values of size_files_hash.
# However, in practice, this does not seem to be a bottleneck.
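# A sketch of that alternative (not used here): walk the candidate
# lists directly instead of traversing the directories a second time.
#   for my $size (keys %size_files_hash){
#       for my $file (@{ $size_files_hash{$size} }){
#           # ... compute the MD5 checksum of $file here ...
#       }
#   }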
#
# Finally, as fdupes (http://premium.caribe.net/~adrian2/fdupes.html) does,
# we should do a bit-by-bit comparison of files with the same md5sum.
# This is NOT implemented yet.
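# A rough sketch of such a comparison (hypothetical helper, not part
# of this script): read both files in fixed-size chunks and compare.
#   sub files_identical {
#       my ($left, $right) = @_;
#       open(my $fha, "<", $left) or return 0;
#       open(my $fhb, "<", $right) or return 0;
#       binmode($fha); binmode($fhb);
#       while (1){
#           my $na = read($fha, my $bufa, 65536);
#           my $nb = read($fhb, my $bufb, 65536);
#           return 0 if !defined($na) || !defined($nb) || $na != $nb;
#           return 1 if $na == 0;  # both files reached EOF together
#           return 0 if $bufa ne $bufb;
#       }
#   }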
# TODO:
# * more option support; fdupes can serve as a reference.
#   For instance: exclude certain patterns, scan only certain patterns (e.g., .mp3),
#   skip files smaller/larger than a given size (see the sketch below), hard links.
# * better warnings and messages
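# A sketch for the size-limit item above (hypothetical --minsize option,
# not implemented in this script):
#   my $min_size = 0;
#   GetOptions(..., "minsize=i" => \$min_size);
#   # file_setup() would then skip files with stat($_)->size < $min_size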
#
# READ other TODOs in the code
# Known BUGS:
# I can't find a way to prevent File::Find from trying to change into a dir.
# This is a problem when the user has no permission to cd into that dir,
# since a message will be printed to STDERR. It would be
# nice to capture that error
#
# Changelog:
# Version 0.01. Initial release
# Maximiliano Combina -- maxicombina@gmail.com
# 29 January 2008
# COPYING:
# Distributed under GPLv2 of June 1991, which you can find at
# http://www.gnu.org/licenses/gpl-2.0.html
use strict;
use Data::Dumper;
use warnings;
use File::Find;
use Digest::MD5;
use File::stat;
use Cwd 'abs_path';
use Cwd 'getcwd';
# The idea with bundling is to support short and long options, such as
# --dumpinfo and -d. This seems not to work.
use Getopt::Long qw(:config bundling no_getopt_compat autoversion);
# Variables used to hold file information
# Hash to map filenames to byte size:
# "filename" => stat("filename")->size
my %file_size_hash = ();
# Inverse hash of %file_size_hash.
# It may contain duplicate values, thus the structure is:
# size => ["file1", "file2", ..., "fileN"]
my %size_files_hash = ();
# The hash to map filenames to md5 checksum.
# The structure is:
# "filename" => "md5 checksum"
my %file_md5_hash = ();
# Inverse of file_md5_hash
# Thus, the hash structure is:
# "md5 checksum" => ["filename1", "filename2", ..., "filenameN"]
my %md5_files_hash = ();
# Variables used for general work
my $md5 = Digest::MD5->new;
my $file_count = 0;
my $current_file_count = 0;
my @indicator = ('-', '\\', '|', '/');
my $progress_indicator = 0;
my $errors_processing_checksums = '';
my $errors_chdir = '';
# Command line variables
my $follow_symlinks = 0; # default: not to follow symlinks
my $dump_info = 0; # default: not to dump info
my $help_requested = 0;
# Everything ready. Let's start with main()
&main;
sub file_setup
{
if ( -l $_ && ! $follow_symlinks ){
#print "Discarding $_!!\n";
return;
}
if (-f $_){
print STDERR "\rBuilding file list $indicator[$progress_indicator]";
$progress_indicator = ($progress_indicator + 1) % 4;
$file_count++;
$file_size_hash{$File::Find::name} = stat($_)->size;
}
}
sub process_checksums
{
my $filename = $_;
if ( -l $_ && ! $follow_symlinks ){
#printf "Skipping $_!!\n";
return;
}
if ( -f $filename){
$current_file_count++;
my $percentage = int(100 * $current_file_count / $file_count);
print STDERR "\rProgress [$current_file_count/$file_count] $percentage%";
if (exists $size_files_hash{stat($filename)->size} ){
if (open(FILE, "<", $filename)){
#printf "\nProcessing file $filename\n";
binmode(FILE);
$file_md5_hash{$File::Find::name}=$md5->addfile(*FILE)->hexdigest;
close(FILE);
} else {
# TODO: this code was reached because the size of this file is the same
# as that of some other file(s)! Log the other file(s)!
# NEWS: when a filename ends in a space (' '), open() also fails.
# It is weird to have such a filename, but...
$errors_processing_checksums .= "Warning! Could not open file '$File::Find::name': $!\n";
}
}
}
}
sub dump_info
{
if (! $dump_info ){
return;
}
print "\nDUMPING file_size_hash\n";
print Dumper \%file_size_hash;
print "\n";
print "\nDUMPING size_files_hash\n";
print Dumper \%size_files_hash;
print "\n";
print "\nDUMPING file_md5_hash\n";
print Dumper \%file_md5_hash;
print "\n";
print "\nDUMPING md5_files_hash\n";
print Dumper \%md5_files_hash;
print "\n";
print "\nDUMPING more stats:\n";
print "file_count = $file_count\n";
print "current_file_count = $current_file_count\n";
}
sub print_help_and_exit
{
print STDERR "Usage: $0 [options] [dir1] [dir2] ...\n";
print STDERR "\nOptions are:\n\n";
print STDERR " -s --symlinks\tfollow symlinks. Default: disabled\n";
print STDERR " -d --dumpinfo\tDump interesting variables after processing.\n";
print STDERR "\t\tUseful for debugging only. Default: disabled\n";
print STDERR " -? -h --help\tDisplay this help and exit\n";
print STDERR "\n";
exit;
}
sub main
{
my $result = GetOptions ("symlinks|s" => \$follow_symlinks,
"dumpinfo|d" => \$dump_info,
"help|h|?" => \$help_requested);
if ( $help_requested || ! $result ){
&print_help_and_exit;
}
my @dirs = ();
if (scalar(@ARGV) == 0){
@dirs = ('.');
} else {
# We eliminate duplicate directories by
# normalizing all the paths on the command line (using Cwd::abs_path)
# and inserting them in a hash.
# The inspiring technique is described in 'Perl Cookbook':
# Recipe 4.6: Extracting Unique Elements from a List,
# available at http://www.unix.org.ua/orelly/perl/cookbook/ch04_07.htm
my %seen = ();
foreach my $item (@ARGV){
if (! -e $item && ! -l $item){
print STDERR "Warning! $item does not exist\n";
next;
} elsif ( ! -e $item && -l $item ){
print STDERR "Warning! $item points nowhere\n";
next;
}
if ( -d $item && -r $item && -x $item ){
my $abs_path_to_item = abs_path($item);
$seen{$abs_path_to_item}++;
} else {
if ( ! -d $item ){
print STDERR "Warning! '$item' does not seem to be a directory\n";
} else {
print STDERR "Warning! directory '$item' can't be accessed\n";
}
}
}
@dirs = sort keys %seen;
# If the user specified the current directory,
# it is nice that they see the duplicates
# in the form "./path/to/dup".
# Up to here, the directory was converted to an absolute path,
# and the user would see "/abs/path/to/here/path/to/dup".
# We revert this here.
for (my $i = 0 ; $i < scalar (@dirs); $i++){
if (getcwd() eq $dirs[$i]){
$dirs[$i] = ".";
}
}
}
# print "follow_symlinks = $follow_symlinks\n";
# print "dump_info = $dump_info\n";
# print Dumper \@dirs;
# First pass: count the total number of files,
# and detect those with the same size
foreach my $dir (@dirs){
if ( -d $dir ){
find ({wanted => \&file_setup, follow=>$follow_symlinks}, $dir);
} else {
print STDERR "$dir is not a directory! skipped\n";
}
}
print STDERR " done!\n";
## printf STDERR "\r%40s\r", " "; # Clean the console, stolen from fdupes.c version 1.40
# print Dumper \%file_size_hash;
for my $file (keys %file_size_hash){
push(@{$size_files_hash{$file_size_hash{$file}}}, $file);
}
for my $size (keys %size_files_hash){
my @possible_dups = @{$size_files_hash{$size}};
if ( scalar (@possible_dups) == 1 ){
delete $size_files_hash{$size};
}
}
# Now %size_files_hash only contains sizes with 2 or
# more matching files. It is enough to search only these.
# print Dumper \%size_files_hash;
# die "fake done\n";
foreach my $dir (@dirs){
if ( -d $dir ){
find ({wanted => \&process_checksums, follow=>$follow_symlinks}, $dir);
} else {
print STDERR "$dir is not a directory! skipped\n";
}
}
print STDERR " done!\n";
## printf STDERR "\r%40s\r", " "; # Clean the console, stolen from fdupes.c version 1.40
print STDERR $errors_processing_checksums."\n" if ($errors_processing_checksums);
for my $file (keys %file_md5_hash){
push(@{$md5_files_hash{$file_md5_hash{$file}}}, $file);
}
# We now iterate over the values. When we detect
# that some value has more than 1 filename (i.e.,
# the array has more than 1 element), we print
# the filenames: these are the duplicate files.
# TODO: this relies only on the MD5 sum; we
# should do a more exhaustive comparison, just
# to be sure
for my $duplicates (values %md5_files_hash){
my $dup_size = scalar( @{$duplicates} );
# more than one file shares this checksum: a duplicate group
if ( $dup_size > 1 ){
my @dups_files = @{$duplicates}; # copy the group's filenames
# print the size of the group, in bytes; any file in the group works, since all share the same size
print "size: ".$file_size_hash{$duplicates->[0]}."\n";
# print sorted output
for my $filename (sort @dups_files){
print "$filename\n";
}
print "\n";
}
}
&dump_info;
}