#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# Author: Adrien Demarez (adrien.demarez@free.fr)
# License: GPLv3
# Prerequisites: pip install xxhash numpy python-magic termcolor
# Beware: this software only hashes portions of files for speed (and may therefore consider some files/dirs identical when they are not). Use this program at your own risk and only when you know what you are doing! (double-check with filenames and, if unsure, triple-check with a full md5sum or diff -r!)
# FIXME: in some cases, it makes parentdir_len=1 and parentdir="/"
# Parameters
DB_COMMIT_PERIODICITY=0.3 # flush/commit DB every x seconds. Higher values implies more RAM usage but higher I/O performance
DISPLAY_PERIODICITY=0.05
FILE_HASH_CHUNKSIZE=1<<20 # default is to hash 1MB data at begin/middle/end of files i.e. 3MB total. Smaller values means faster scan but more risk to miss differences in files. set "None" if you want to scan 100% of the contents of your files for more safety (at the expense of scanning speed)
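# Typical invocations (illustrative sketch based on the argparse subcommands defined in main()
# below; the DB filename and paths are hypothetical):
#   ./kindfs.py mydb.sqlite scan /mnt/data              # scan a tree and fill/resume the DB
#   ./kindfs.py mydb.sqlite showdups -m /mnt/data       # list duplicate files/dirs, biggest first
#   ./kindfs.py mydb.sqlite isincluded /dirA /dirB -n   # files under /dirA with no copy under /dirB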
import sqlite3,xxhash
#import fnmatch
import math
import os, errno, sys, stat
from array import array # More lightweight than numpy
from collections import defaultdict
import time
import magic
import re
from termcolor import colored
import argparse
import shutil
try:
from os import scandir, walk
except ImportError:
from scandir import scandir, walk
#import bisect
#import chardet
#import codecs
#import cProfile
def xxhash_file(filename, filesize=None, chunksize=FILE_HASH_CHUNKSIZE, inclsize=False, inclname=False):
"""Return pseudo-hash of a file using xxhash64 on 3 MBytes of that file at its beginning/middle/end. Optionally include the size and filename into the pseudo-hash"""
if filesize==None:
filesize=int(os.stat(filename).st_size)
if filesize==0:
return 0
CHUNKSIZE=filesize if (chunksize==None or chunksize<1) else chunksize # default value == 1<<20 i.e. 1 MByte chunk size
digest = xxhash.xxh64()
if inclsize==True:
digest.update(str(filesize)) # xxhash update() expects str/bytes, not int
if inclname==True:
digest.update(os.path.basename(filename))
with open(filename,'rb') as fh:
if(filesize<=3*CHUNKSIZE):
data = fh.read() # FIXME: what if CHUNKSIZE==filesize and filesize is bigger than RAM ? use mmap()...
is_there_data = any(data)
digest.update(data)
else:
data = fh.read(CHUNKSIZE)
is_there_data = any(data)
digest.update(data)
fh.seek(math.floor(filesize/2-CHUNKSIZE/2))
data = fh.read(CHUNKSIZE)
is_there_data |= any(data)
digest.update(data)
fh.seek(filesize-CHUNKSIZE)
data = fh.read(CHUNKSIZE)
is_there_data |= any(data)
digest.update(data)
if not is_there_data and (filesize<=3*CHUNKSIZE or check_zerofile(filename)):
return 0 # File with size >0 but totally filled with zeros...
return digest.intdigest() - (1<<63) # return integer rather than hexdigest because it is more efficient. "- (1<<63)" is there because SQLite3 unfortunately only supports signed 64 bits integers and doesn't support unsigned
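# Illustrative sketch (hypothetical path): the default sampling vs. a full-content hash.
#   h_fast = xxhash_file("/tmp/big.iso")                  # samples 3 x 1 MB (begin/middle/end)
#   h_full = xxhash_file("/tmp/big.iso", chunksize=None)  # hashes the whole file
# h_fast can collide for files that only differ in unsampled regions; h_full cannot (beyond
# ordinary xxh64 collisions), at the cost of reading the entire file.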
def check_zerofile(filename):
with open(filename,'rb') as fh:
k = not any(fh.read())
return k
def mydecode_path(pathbytes,fixparts=False):
# In the worst cases a path may mix encodings (e.g. UTF-8 at the top, some ISO-8859 subdirs further down, then UTF-8 again deeper in the path).
# The clean fix is to run convmv on the filesystem beforehand, but in real life that is not always possible, and this script should not fail miserably in those cases.
# We therefore return two decoded strings:
# - one decoded with errors="surrogateescape", usable by os.* file methods (but not safely printable or usable as a regular string),
# - one printable/storable string (which does not correspond to a valid path on the filesystem) that can be manipulated and inserted in the DB.
# For the latter there are two options: errors="replace" (default behavior), or fixparts=True which re-decodes every path component
# (slower, converts every part nicely to UTF-8 for printing, but also hides under the carpet that this section of the filesystem deserves a convmv fix).
# TODO: check os.fsdecode() ?
#print(chardet.detect(pathbytes))
if fixparts:
pathlist=pathbytes.split(b'/')
path_decoded=""
for k in pathlist:
try:
k2=k.decode('utf-8')
except UnicodeDecodeError:
k2=k.decode('8859')
path_decoded += '/'+k2
path_printable = path_decoded[1:]
else:
path_printable = pathbytes.decode('utf-8',errors="replace")
return pathbytes.decode('utf-8',errors="surrogateescape"), path_printable
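# Illustrative sketch (hypothetical byte path with an ISO-8859-1 component inside a UTF-8 tree):
#   fs_path, printable = mydecode_path(b'/data/caf\xe9/file.txt')
#   # fs_path   == '/data/caf\udce9/file.txt'  -> surrogate-escaped, usable with os.* calls
#   # printable == '/data/caf\ufffd/file.txt'  -> replacement char, safe to print / store in the DB
#   # with fixparts=True the printable form becomes '/data/café/file.txt' (component re-decoded as 8859)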
def regmulti(regfile):
regstr = ""
with open(regfile) as fh:
for reg in fh:
reg=reg.replace("\n","")
regstr += f'(?:{reg})|'
regex = re.compile(regstr[:-1]) # FIXME: max length ?
print(regex)
return regex
def globmulti(globfile,var='path'):
"""Generate the 'where' part of an SQL query excluding all the entries in globfile"""
globarr = []
with open(globfile) as fh:
for g in fh:
g=g.replace("\n","")
globarr.append(f"not {var} glob '{g}'")
globstr = "(" + " and ".join(globarr) + ")"
return globstr
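# Illustrative sketch (hypothetical ignore file): if 'ignore.glob' contains the two lines
#   */.git/*
#   *.pyc
# then globmulti('ignore.glob') returns "(not path glob '*/.git/*' and not path glob '*.pyc')",
# which isincluded() splices into its WHERE clause, while regmulti() builds a single combined
# regex of the form (?:pattern1)|(?:pattern2) from a file of regular expressions.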
######################################################
# DB class
class DDB():
def __init__(self, dbname, domagic=False, rw=False, regignore=None, globignore=None,chunksize=FILE_HASH_CHUNKSIZE):
self.dbname=dbname
self.conn = sqlite3.connect(dbname)
self.conn.create_function("get_parentdir_len", 1, lambda x: 0 if os.path.dirname(x)=='/' else len(os.path.dirname(x)))
self.cur = self.conn.cursor()
self.processedfiles = 0
self.processedsize = 0
if rw==True:
print('Looking for resume') # FIXME: make it also work to load the previous version of a db and only scan dirs with updated mtime
tables = [k[0] for k in self.cur.execute("select name from sqlite_master where type='table'").fetchall()]
if 'entries' in tables:
mydirs = self.cur.execute("select path,size,hash,coalesce(nsubfiles_rec,nsubfiles) from entries where type='D'").fetchall()
if len(mydirs)>0:
print("Resuming from previous scan")
self.dbcache_dirs = {k[0]:[k[1],k[2],k[3]] for k in mydirs}
self.cur.execute("delete from entries where type in ('F', 'S') and not parentdir in (select path from entries where type='D')")
#emptyf = self.cur.execute("select count(*),sum(size) from files").fetchall()
#self.processedfiles=emptyf[0][0]
#self.processedsize=emptyf[0][1]
else:
print('No existing entries')
elif 'dirs' in tables or 'files' in tables:
sys.exit('Old DB schema. Migrate first !')
else:
print('No existing tables')
self.createdb()
else:
pass
#self.conn.execute("PRAGMA temp_store = MEMORY") # "pragma query_only = ON" does not enable temp views...
#self.init_path=init_path.rstrip('/')
self.magictypes = {}
self.domagic=domagic
self.param_id = 0
self.timer_print=time.time()
self.timer_insert=time.time()
self.dbcache_insert=[]
if self.domagic==True:
for line in self.cur.execute('select id,magictype from magictypes'):
magicid,magictype = line
self.magictypes[magictype] = magicid
self.regex = regmulti(regignore) if regignore else None
self.globignore= globmulti(globignore) if globignore else ''
self.chunksize=chunksize
#print(f"Hash chunk size is {chunksize} bytes")
#print(self.globignore)
def createdb(self):
print("Creating / resetting DB")
cur = self.conn.cursor()
with open("kindfs.sql", encoding='utf-8') as schema:
cur.executescript(schema.read())
def magicid(self, path, dofull=True, domime=True):
"""Compute 'magic' filetype from libmagic, insert it in DB (if not already there) and return the ID of that entry"""
if self.domagic==False or (dofull==False and domime==False):
return 0
magictype = re.sub(', BuildID\\[sha1\\]=[0-9a-f]*','',magic.from_file(path)) if dofull else None
magicmime = magic.from_file(path, mime=True) if domime else None
if magictype in self.magictypes:
return self.magictypes[magictype]
cur = self.conn.cursor()
rs = cur.execute('insert into magictypes values(null,?,?)', (magictype,magicmime))
magic_id = cur.lastrowid
self.magictypes[magictype] = magic_id
return magic_id
def sync_db(self):
vec=self.dbcache_insert
if len(vec)>0 and (isinstance(vec[0],tuple) or isinstance(vec[0],list)):
self.cur.executemany(f'insert or replace into entries values ({",".join("?" for k in vec[0])})', vec)
vec.clear()
self.conn.commit()
def insert_db(self,vec, sync=False):
"""Insert line in DB with some caching in order to perform the real insert/commit in batch (for performance) rather than one-by-one"""
if len(vec)==0:
return
self.dbcache_insert.append(vec)
mytime2=time.time()
if sync or mytime2-self.timer_insert>DB_COMMIT_PERIODICITY:
self.sync_db()
self.timer_insert=time.time() # Not 'mytime2' since sync_db() itself might take a few seconds, maybe more than DB_COMMIT_PERIODICITY
#self.cur.execute(f'insert or replace into {table} values ({",".join("?" for k in vec)})', vec) #q = '(' + '?,' * (len(vec)-1) + '?)'
def insert_db1(self,vec): # For debug purposes, yet slower than the batch insert (even with pragmas and caching)
self.cur.execute(f'insert or replace into entries values ({",".join("?" for k in vec)})', vec)
def dirscan(self, bdir, init_path=None, parentdir=None, dirstat=None, dirlevel=0, prevdb=None):
"""Recursively scan a dir/ (taking care of encoding issues), compute checksums and store metadata in the DB"""
if isinstance(bdir,str):
bdir=bytes(bdir, encoding='utf-8') # This avoids issues when walking through a filesystem with various encodings...
def dbpath(path):
if os.sep != '/': path=path.replace(os.sep, '/')
return path.replace(init_path, '') if path!=init_path else "/"
def printprogress():
mytime2=time.time()
if mytime2-self.timer_print>DISPLAY_PERIODICITY:
k=dbpath(dir_printable)
ld=len(k) - (os.get_terminal_size()[0]-40)
if ld>0:
k=colored("...",'red')+k[ld:]
sys.stderr.write(f"\033[2K\rScanning: [{self.processedsize>>20} MB, {self.processedfiles} files] {k}")
sys.stderr.flush()
self.timer_print=mytime2
#print((bdir,processed,init_path,parentdir,self.mytime))
dir,dir_printable = mydecode_path(bdir)
if init_path==None: # root call (before recursion)
init_path=dir.rstrip('/')
print("\n==== Starting scan ====\n")
self.cur.execute('insert or replace into dbsessions values (null, ?,?)', (int(self.timer_print), init_path))
self.param_id=self.cur.lastrowid
parentdir_len = len(dbpath(parentdir)) if parentdir!=None else 0
curdir_len = len(dbpath(dir))
if hasattr(self,'dbcache_dirs'): # Resume / speedup scan
mypath=dbpath(dir_printable)
if mypath in self.dbcache_dirs:
mysize,myxxh,mysubfiles = self.dbcache_dirs[mypath]
self.processedfiles+=mysubfiles
self.processedsize+=mysize
return mysize,myxxh,mysubfiles # 3-tuple matching the recursive unpacking below
#else: # FIXME: seems counter productive (sqlite bottleneck ?)
# refdb_alreadythere={k[0]:k[1] for k in self.cur_ref.execute("select path,size from files where parentdir=?", (dbpath(dir_printable),)).fetchall() }
if prevdb is not None:
connprev = sqlite3.connect(prevdb)
curprev = connprev.cursor()
dirsize=0 # size of current dir including subdirs
dircontents = array('q') # Array of hashes for the contents of current dir. array('q') is more space-efficient than linked list, and better than numpy in this phase as it can easily grow without destroying/recreating the array
dir_numfiles = 0
dir_numdirs = 0
dir_nsubfiles_rec = 0
for entry in os.scandir(bdir):
path,path_printable = mydecode_path(entry.path)
name,name_printable = mydecode_path(entry.name)
path_in_db = dbpath(path_printable)
if not os.path.exists(path) or not os.access(path, os.R_OK):
continue
if entry.is_dir(follow_symlinks=False):
try:
entrysize,dxxh,nsr = self.dirscan(entry.path,init_path=init_path,parentdir=dir_printable,dirstat=entry.stat(follow_symlinks=False),dirlevel=dirlevel+1)
# Insertion in DB is below at dir toplevel (and this is a recursive call)
dircontents.append(dxxh)
dir_numdirs+=1
dir_nsubfiles_rec += nsr
except Exception as e:
sys.stderr.write(f"\n=> Error in {path_printable}: {e}\n")
elif entry.is_symlink():
ltarget = os.readlink(path)
lxxh = xxhash.xxh64(name + ' -> ' + ltarget).intdigest() - (1<<63)
dircontents.append(lxxh)
ext_len = path_in_db.rindex('.') if "." in path_in_db[curdir_len:] else None
self.insert_db((
None, # id integer primary key autoincrement
'S', # type: symlink
path_in_db, # path
curdir_len, # parentdir_len
ext_len, # ext_len
None, # size
lxxh, # hash
None, None, None, None, # magictype, nsubdirs, nsubfiles, nsubfiles_rec
ltarget, # symtarget
None,None,None,None,None,None,None, # struct stat is not needed
self.param_id # dbsession
))
entrysize=0
#dir_numfiles += 1 # FIXME: should we do it ?
#dir_nsubfiles_rec += 1
elif entry.is_file(follow_symlinks=False): # regular file. FIXME: sort by inode (like in https://github.com/pixelb/fslint/blob/master/fslint/findup) in order to speed up scanning ?
filestat = entry.stat(follow_symlinks=False)
entrysize = int(filestat.st_size)
fxxh = None
if prevdb is not None:
row = curprev.execute("select hash from entries where type='F' and path=? and size=? and st_mtime=?", (path_in_db, entrysize, int(filestat.st_mtime))).fetchone()
fxxh = row[0] if row else None
if fxxh is None: # no reusable hash from the previous DB => hash the file now
fxxh = xxhash_file(path, entrysize, chunksize=self.chunksize)
ext_len = path_in_db.rindex('.') if "." in path_in_db[curdir_len:] else None
mymagicid = self.magicid(path)
self.insert_db((
None, # id integer primary key autoincrement
'F', # type: file
path_in_db, # path
curdir_len, # parentdir_len
ext_len, # ext_len
entrysize, # size
fxxh, # hash
mymagicid, # magictype
None, None, None, None, # nsubdirs, nsubfiles, nsubfiles_rec, symtarget
int(filestat.st_mtime), filestat.st_mode, filestat.st_uid, filestat.st_gid, filestat.st_ino, filestat.st_nlink, filestat.st_dev-(1<<63),
self.param_id # dbsession
))
dircontents.append(fxxh) #bisect.insort(dircontents[dir], xxh)
self.processedfiles+=1
self.processedsize+=entrysize
dir_numfiles += 1
dir_nsubfiles_rec += 1
else:
continue # e.g. named pipes...
#print("__error__: " + path)
#entrysize=0
dirsize += entrysize
printprogress()
dircontents = array('q', sorted(dircontents))
dxxh = 0 if dirsize==0 else xxhash.xxh64(dircontents.tobytes()).intdigest() - (1<<63)
#bisect.insort(dircontents[os.path.dirname(dir)], dirxxh)
if dirstat==None:
dirstat = os.lstat(dir)
path_in_db = dbpath(dir_printable)
self.insert_db((
None, # id integer primary key autoincrement
'D', # type: dir
path_in_db, # path
parentdir_len, # parentdir_len
None, # ext_len
dirsize, # size
dxxh, # hash
None, # magictype
dir_numdirs, # nsubdirs
dir_numfiles, # nsubfiles
dir_nsubfiles_rec, # nsubfiles_rec
None, # symtarget
int(dirstat.st_mtime), dirstat.st_mode, dirstat.st_uid, dirstat.st_gid, None ,dirstat.st_nlink, dirstat.st_dev-(1<<63),
self.param_id # dbsession
))
return dirsize,dxxh,dir_nsubfiles_rec
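# Note on directory hashes (as computed above): a directory's hash is xxh64 over the sorted
# array of its children's hashes (files, symlinks, subdirs) packed as signed 64-bit integers,
# so two trees compare equal when their sampled contents match, regardless of scan order and
# (for regular files) regardless of file names; symlink hashes do include the link name.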
def walkupdate(self, init_path="/mnt/raid"):
def fspath(dbpath):
return init_path+'/'+dbpath
if not os.path.exists(init_path) or not os.access(init_path, os.R_OK):
return
cur = self.conn.cursor()
cur2 = self.conn.cursor()
for dir in cur.execute("select path from entries where path like ? and type='D' order by path", (init_path+'/%',)):
fsdir = fspath(dir[0])
if not os.path.exists(fsdir) or not os.access(fsdir, os.R_OK) or not os.path.isdir(fsdir):
print(f"Deleting {dir} from DB")
cur2.execute("delete from entries where path like ?", (fsdir+'/%',)) # FIXME: will it affect current readings ?
for file in cur.execute("select path from entries where path like ? and (type='F' or type='S') order by path", (init_path+'/%',)):
fsfile = fspath(file[0]) # cursor rows are 1-tuples
if not os.path.exists(fsfile) or not os.access(fsfile, os.R_OK): # or not os.path.isfile(fsfile):
print(f"Deleting {file[0]} from DB")
cur2.execute("delete from entries where path=?", (fsfile,))
def compute_cachedups(self,basedir=""):
cur = self.conn.cursor()
print("\nComputing duplicates...")
wbasedir = f"where path like '{basedir}/%'" if basedir!='' else ''
wbasedir2 = f"and path like '{basedir}/%'" if basedir!='' else ''
cur.executescript(f'''
drop table if exists cachedups_h;
create table cachedups_h (hash integer, size integer, ndups integer,type char(1));
insert into cachedups_h select hash,size,count(*),type from entries {wbasedir} group by hash,size,type having count(*)>1 and size>0;
drop table if exists cachedups;
create table cachedups (entry_id integer not null, size integer, ndups integer, totaldupsize integer GENERATED ALWAYS AS (size*(ndups-1)) VIRTUAL);
create index cachedups_totaldupsize_idx on cachedups(totaldupsize);
create index cachedups_entry_id_idx on cachedups(entry_id);
create index cachedups_size_idx on cachedups(size);
insert into cachedups select entries.id,entries.size,ndups from entries inner join cachedups_h
on entries.hash=cachedups_h.hash where entries.size=cachedups_h.size {wbasedir2} and entries.type=cachedups_h.type
order by entries.size desc;
''')
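# Example query (sketch) against the tables created above: largest duplicates by wasted space,
# using the generated totaldupsize column.
#   select path, ndups, totaldupsize from cachedups
#     inner join entries on entry_id = entries.id
#     order by totaldupsize desc limit 20;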
def get_rec_pcdup(self, dir, entry_id=None):
# compute the % of dups within dirs
cur = self.conn.cursor()
if entry_id is None:
cur.execute("create table cachedups_d (entry_id integer not null, szdup integer, ndupsubs integer)")
entry_id = cur.execute("select id from entries where path=?", (dir,)).fetchall()[0][0]
sz_n_dup = 0 + 0j
rs = cur.execute("select id, path, type, size, nsubfiles_rec from entries where parentdir=?",(dir,))
for rowid, path, entrytype, size, nrec in rs:
szdup_row = cur.execute("select size from cachedups inner join entries on entry_id=entries.id where path=?", (path,)).fetchone()
if szdup_row is not None:
sz_n_dup += szdup_row[0] + 1j*(nrec if entrytype == "D" else 1)
elif entrytype == "D":
sz_n_dup += self.get_rec_pcdup(path, entry_id=rowid)
cur.execute("insert into cachedups_d (id, szdup, ndupsubs) values (?,?)", entry_id, sz_n_dup.real, sz_n_dup.imag)
return sz_n_dup
def compute_cachedups_dpartial(self,basedir=""):
cur = self.conn.cursor()
cur2 = self.conn.cursor()
wbasedir = f"where path like '{basedir}/%'" if basedir!='' else ''
orderbysize=True
orderby = "entries.size" if orderbysize else "nsubfiles_rec"
rs = cur.execute(f"select type,path,cachedups.size,hash,ndups,parentdir,nsubfiles_rec from cachedups inner join entries on entry_id=entries.id where cachedups.size>0 {wbasedir} order by {orderby} desc") #where not parentdir in (select path from cachedups)
for type,path,size,hash,ndups,parentdir,nsubfiles_rec in rs:
rs2 = cur2.execute("select entry_id from cachedups inner join entries on entry_id=entries.id where parentdir=?", (parentdir,)).fetchall()
if len(rs2):
continue
def compute_cachedups_d(self,basedir=""): # FIXME: Experimental function supporting getincluded()
cur = self.conn.cursor()
cur2 = self.conn.cursor()
#cur3 = self.conn.cursor()
cur.executescript(f'''
drop table if exists cachedups_d;
create table cachedups_d (entry_id integer not null, size integer, nsubdups integer);
create index cachedups_d_entry_id_idx on cachedups_d(entry_id);
create index cachedups_d_size_idx on cachedups_d(size);
''')
cachedups_d = defaultdict(int)
n=0 ; ncount = cur.execute("select count(*) from cachedups_h where type='F'").fetchall()[0][0]
rs = cur.execute("select hash,ndups from cachedups_h where type='F' and hash!=0 and hash!=-1<<63 and size>0") # and ndups<1000
for hash,ndups in rs:
n+=1
paths=[k[0] for k in cur2.execute("select path from entries where type='F' and hash=?",(hash,)).fetchall()]
for path in paths:
dir_tmp = os.path.dirname(path)
#pdir_isdup = cur3.execute("select count(*) from cachedups inner join entries on entry_id=entries.id where path=?", (dir_tmp,)).fetchall()[0][0]
while dir_tmp!='/':
if any([not dir_tmp in k for k in paths]):
cachedups_d[dir_tmp] += 1
dir_tmp = os.path.dirname(dir_tmp)
else: break
#sys.stderr.write(f".");sys.stderr.flush()
sys.stderr.write(f"\033[2K\r{int(100*n/ncount)}%");sys.stderr.flush()
#rs = cur.execute("select path,hash from cachedups inner join entries on entry_id=entries.id where type='F'")
#for path,hash in rs:
#path_tmp = os.path.dirname(path)
#while path_tmp!='/':
#cachedirs_dupratio[path_tmp] += 1
#path_tmp = os.path.dirname(path_tmp)
print("Phase 3")
cur.executemany("insert into cachedups_d select id,size,? from entries where path=?", [(v,k) for k,v in cachedups_d.items()])
self.conn.commit()
# select size,ndups,path,type,hash from cachedups where not parentdir in (select path from cachedups) order by size desc
def showdups(self,basedir="",mountpoint="",nres=None,orderbysize=True):
"""Main function to display the duplicate entries (file or dirs) sorted by decreasing size"""
#orderby = "totaldupsize" if orderbysize else "nsubfiles_rec"
orderby = "entries.size" if orderbysize else "nsubfiles_rec"
#orderby = "cachedups.size" if orderbysize else "nsubfiles_rec"
cur = self.conn.cursor()
cur2 = self.conn.cursor()
tables = [k[0] for k in self.cur.execute("select name from sqlite_master where type='table'").fetchall()]
if not 'cachedups' in tables:
self.compute_cachedups()
wbasedir = f"and path like '{basedir}/%'" if basedir!='' else "" #"where type='D'"
limit_nres = f'limit {int(nres)}' if nres else ''
rs = cur.execute(f"select type,path,cachedups.size,hash,ndups,parentdir,nsubfiles_rec from cachedups inner join entries on entry_id=entries.id where cachedups.size>0 {wbasedir} order by {orderby} desc {limit_nres}") #where not parentdir in (select path from cachedups)
for ftype,path,size,hash,ndups,parentdir,nsubfiles_rec in rs:
pdir_isdup = cur2.execute("select count(*) from cachedups inner join entries on entry_id=entries.id where path=?", (parentdir,)).fetchall()[0][0]
path_real = mountpoint+path
if mountpoint!='' and mountpoint!=None and not os.path.exists(mountpoint+path):
path_real = colored(path_real, 'red')
elif pdir_isdup>0:
path_real = colored(path_real, 'cyan') + colored(' [parent dir already in dups]', 'yellow')
elif 'syncthing' in path_real or 'lost+found' in path_real:
path_real = colored(path_real, 'cyan')
print(colored(f"{ftype} 0x{hash+(1<<63):0>16x}, {ndups} * {size>>20} Mo | {nsubfiles_rec} files : ", 'yellow') + path_real)
#print(colored(f"{ftype} {hash+(1<<63)}, {ndups} * {size} | {nsubfiles_rec} files : ", 'yellow') + path_real)
def show_same_inode(self,basedir="",nres=None):
"""Same as showdups() but only return entries with identical inodes"""
cur = self.conn.cursor()
cur2 = self.conn.cursor()
limit_nres = f'limit {int(nres)}' if nres else ''
rs = cur.execute(f"select size,path,hash,st_ino,parentdir from entries where st_ino in (select st_ino from entries group by st_ino having count(*)>1 and type='F') order by size desc {limit_nres}")
for size,path,hash,inode,parentdir in rs:
pdir_isdup = cur2.execute("select count(*) from cachedups where path=?", (parentdir,)).fetchall()[0][0]
path_real = basedir+path
if basedir!='' and basedir!=None and not os.path.exists(basedir+path):
path_real = colored(path_real, 'red')
elif pdir_isdup>0:
path_real = colored(path_real, 'cyan') + colored(' [parent dir already in dups]', 'yellow')
elif 'syncthing' in path_real or 'lost+found' in path_real:
path_real = colored(path_real, 'cyan')
print(colored(f"{inode} 0x{hash+(1<<63):0>16x}, {size>>20} Mo : ", 'yellow') + path_real)
def compute_dupfiles(self,basedir=None,nres=None):
# This function will probably be superseded soon (by showdups())
if basedir==None:
basedir=""
limit_nres = f'limit {int(nres)}' if nres else ''
cur = self.conn.cursor()
cur2 = self.conn.cursor()
print("Computing duplicates...")
rs1 = cur.execute(f'select xxh64be,size,count(*) from files group by xxh64be,size having count(*)>1 order by size desc {limit_nres}')
print("Second phase")
for xxh,size,ndups in rs1:
paths = []
rs2 = cur2.execute('select xxh64be,size,path from files where xxh64be=? and size=?', (xxh,size))
for xxh2,size2,path in rs2:
if basedir!='' and basedir!=None and not os.path.exists(basedir+path):
path_real = colored(basedir+path, 'red')
else:
path_real = basedir+path
if not 'syncthing' in path_real and not 'lost+found' in path_real:
paths.append(path_real)
print(colored(f"0x{xxh+(1<<63):0>16x}, {ndups} * {size>>20} Mo :", 'yellow'))
print('\t'+'\n\t'.join(paths))
def compute_dupdirs(self,basedir=None,dirsorfiles="dirs", wherepathlike='/%', nres=None):
# This function will probably be superseded soon (by showdups())
# select (dups-1)*size/1048576 as sz, * from dirdups where not parentdir in (select path from dirdups)
cur = self.conn.cursor()
cur2 = self.conn.cursor()
limit_nres = f'limit {int(nres)}' if nres else ''
print("Computing duplicates...")
cur.executescript("""
create temp view duphashes_dirs as select type,hash,size,count(*) ndups from entries where type='D' group by hash,size,type having count(*)>1;
create temp view dupentries_dirs as select entries.type,parentdir,path,duphashes_dirs.hash,duphashes_dirs.size,ndups from entries inner join duphashes_dirs on duphashes_dirs.hash=entries.hash where entries.type='D';
create temp view duphashes_filtered as select type,hash,size,ndups from dupentries_dirs where parentdir not in (select path from dupentries_dirs) group by hash,size,type
""")
rs = cur.execute(f"select hash,size,ndups from duphashes_filtered where type='D' order by size desc {limit_nres}")
for xxh,size,ndups in rs:
rs2=cur2.execute(f"select path from entries where type='D' and hash=? and size=? and not path like '/syncthing/%'", (xxh,size)).fetchall()
paths = []
l=0
for k in rs2:
if basedir!='' and basedir!=None:
mystr = basedir+k[0] if os.path.exists(basedir+k[0]) else colored(basedir+k[0], 'red')
l += 1 if os.path.exists(basedir+k[0]) else 0 # and not "syncthing" in (basedir+k[0])
paths.append(mystr)
elif not 'lost+found' in k[0]: # and not 'syncthing' in k[0]
mystr=k[0]
l+=1
paths.append(mystr)
else:
print(f"Not inserting {k[0]}")
if l>1:
print(colored(f"0x{xxh+(1<<63):0>16x}, {ndups} * {size>>20} Mo :", 'yellow'))
print('\t'+'\n\t'.join(paths))
def walk(self,init_path=''): # FIXME: self.init_path ?
"""Same function as os.walk() for filesystems"""
cur = self.conn.cursor()
cur2 = self.conn.cursor()
for res in cur.execute("select path from entries where type='D' and path like ? order by path", (init_path+'/%',)):
dir=res[0]
dirs = [k[0] for k in cur2.execute("select name from entries where type='D' and parentdir=?",(dir,))]
files = [k[0] for k in cur2.execute("select name from entries where parentdir=? and (type='F' or type='S')",(dir,))]
#files.append([k[0] for k in cur2.execute('select name from symlinks where parentdir=?',(dir,))])
yield dir,dirs,files
def grepext(self, wordlist, ext='.txt', init_path='', mountpoint='', case_insensitive=True):
cur = self.conn.cursor()
count = cur.execute("select count(path) from entries where type='F' and name like ?", ('%'+ext,)).fetchall()[0][0]
k=0
pct=0
print(wordlist)
if case_insensitive:
for idx in range(len(wordlist)):
wordlist[idx]=wordlist[idx].lower()
for res in cur.execute("select path from entries where type='F' and name like ?", ('%'+ext,)):
path=res[0]
try: btext = open(mountpoint+'/'+path,"rb").read()
except (FileNotFoundError,PermissionError):
print('\033[2K\r__error file__: '+path)
continue # skip unreadable file, otherwise btext may be unbound below
for w in wordlist:
try: text = btext.decode()
except UnicodeDecodeError: text = btext.decode(encoding="latin1")
if case_insensitive: text=text.lower()
#if w.encode() in open(bytes(mountpoint+'/'+path, encoding='utf-8'),"rb").read():
if w in text:
print('\033[2K\r'+path)
k+=1
pct = 100*k//count
sys.stderr.write(f"\033[2K\r{pct} % : {k} / {count}")
def dumpdir(self, adir=''):
"""Dumps the contents of a dir (and all subdirs) with hash and size"""
cur = self.conn.cursor()
for line in cur.execute("select path,xxh64be,size from dirs where path like ? order by path", (adir+'/%',)):
(path,xxh64be,size) = line
print("0x%016x, %d : %s" % (xxh64be+(1<<63), size, path.replace(adir,'')))
def dbgsize(self):
cur = self.conn.cursor()
return cur.execute("select sum(size) from files").fetchall()[0][0]
def isincluded(self, path_test, path_ref, otherddbfs=None, docount=True,display_included=False,display_notincluded=True,basedir="", checkfs=True, raw=False):
"""Checks whether every file under path_test/ (and subdirs) has a copy somewhere in path_ref (regardless of the directory structure in path_ref/ )"""
cur = self.conn.cursor()
if otherddbfs:
conn2 = sqlite3.connect(otherddbfs)
cur2 = conn2.cursor()
else:
cur2 = self.conn.cursor()
ignorestr = f'and {self.globignore}' if self.globignore else ''
mycount=cur.execute(f"select count(*) from (select path from files where size>0 and path like ? order by id)", (path_test+"/%",)).fetchone()[0] if docount else 1 # FIXME: putting {ignorestr} here would make the result accurate but would significantly slow-down the query... I think it is OK if the number/progressbar is overestimated
rs = cur.execute(f"select name,xxh64be,size,path from files where size>0 and path like ? {ignorestr} order by id", (path_test+'/%',))
k=1
if not rs or mycount==0:
print('No results !')
if basedir=='':
checkfs=False
for line in rs:
name,xxh,size,path=line
if xxh==0 or size==0: # skip null files or files made of zeros
continue
if checkfs and not os.path.exists(basedir+path):
if not raw:
sys.stderr.write(colored(f"\033[2K\r{basedir+path} ({size>>20} Mo) is deleted\n",'red'))
continue
if not otherddbfs and path_ref=='':
rs2=cur2.execute("select path from files where xxh64be=? and size=? and not path like ?", (xxh, size, path_test+'/%')).fetchall()
elif otherddbfs:
#rs2=cur2.execute("select path from files where xxh64be=? and size=? and path like ?", (xxh, size, path_ref+'/%')).fetchall()
rs2=cur2.execute("select path from files where xxh64be=? and size=? and path like ? limit 1", (xxh, size, path_ref+'/%')).fetchall()
else:
rs2=cur2.execute("select path from files where xxh64be=? and size=? and path!=? and path like ?", (xxh, size, path, path_ref+'/%')).fetchall()
#rs2=cur2.execute("select path from files where xxh64be=? and size=? and path!=?", (xxh, size, path)).fetchall()
if not rs2:
if display_notincluded:
if(raw): print(path)
else:
sys.stderr.write(f"\033[2K\r")
#print(colored(f"No equivalent for {xxh+(1<<63):0>16x}, {size>>20} Mo : {self.dbname}:{path}",'yellow'))
print(colored(f"No equivalent for {xxh+(1<<63)}, {size>>20} Mo : {path}",'yellow'))
else:
if checkfs and not any([os.path.exists(basedir+dup[0]) for dup in rs2[:20]]):
# Even if there are results, we check here whether they still exist (when checkfs==True) in case they might have been deleted since previous scan
# FIXME: we restrict to the 20 first dups as otherwise there is a performance issue for rare cases with large number of results (e.g. small system/compilation files that are identical among many projects). But this workaround is suboptimal.
if display_notincluded:
if(raw): print(path)
else:
sys.stderr.write(f"\033[2K\r")
print(colored(f"No equivalent anymore for ({size>>20} Mo) : {path}",'red'))
elif display_included: # in that case we only display results for dirA that _are_ in dirB
if(raw): print(path)
else:
sys.stderr.write(f"\033[2K\r")
print(colored(f"{path} ({size>>20} Mo) has the equivalents: {rs2}",'green'))
if not raw:
mytime2=time.time()
if mytime2-self.timer_print>0.05:
sys.stderr.write(f"\033[2K\rScanning: [{k} / {mycount} entries, {int(100*k/mycount)}%] ")
sys.stderr.flush()
self.timer_print=mytime2
k+=1
if otherddbfs:
conn2.close()
def diff(self,dir1,dir2):
self.isincluded(dir1,dir2)
self.isincluded(dir2,dir1)
def getincluded(self,basedir=''):
cur = self.conn.cursor()
cur2 = self.conn.cursor()
rs = cur.execute("select path,nsubfiles_rec,size from entries where type='D' and path like ? order by size desc", (basedir+'/%',))
#rs = cur.execute("select path,size from entries where type='D' and path like ?", (basedir+'/%',))
for parentdir,nsubfiles_rec,size in rs:
#nsubfiles_rec = make_nsubfiles_rec(parentdir)
dir_isdup = cur2.execute("select count(*) from cachedups where path=?", (parentdir,)).fetchall()[0][0]
if dir_isdup:
print(f"{parentdir} is dup")
continue
nsubfiles_rec_dups = cur2.execute("select count(*) from cachedups where type='F' and path like ?", (parentdir+'/%',)).fetchall()[0][0]
print(f"{size>>20} MB, {100*nsubfiles_rec_dups/nsubfiles_rec:.1f}% dups : {parentdir}")
def nsubfiles_rec(self,adir,k=0):
if adir=='/' or adir=='':
return None
#rs1=cur_tmp.execute("select sum(nsubfiles) ns from entries where path like ? and type='D'", (adir+'%',)).fetchall()[0][0]
cur = self.conn.cursor()
nfiles,ndirs,nfiles_r=cur.execute("select nsubfiles,nsubdirs,nsubfiles_rec from entries where path=? and type='D'", (adir,)).fetchall()[0]
if nfiles_r!=None:
return nfiles_r
if ndirs>0:
rs = cur.execute("select path,nsubfiles,nsubdirs from entries where parentdir=? and type='D'", (adir,)).fetchall()
for path,nsubfiles,nsubdirs in rs:
nfiles += self.nsubfiles_rec(path,k+1) if nsubdirs>0 else nsubfiles
cur.execute("update entries set nsubfiles_rec=? where path=?",(nfiles,adir))
return nfiles
def schema2(self): # unfinished. Problem: no details of the contents of virtual columns
cur = self.conn.cursor()
rs = cur.execute("select name from sqlite_master where type='table' order by name").fetchall()
tables = [k[0] for k in rs if k[0] is not None]
for table in tables:
rs = cur.execute(f"select * from pragma_table_xinfo('{table}') order by name").fetchall()
def schema(self, dosort=True):
def schemasort(a): # make the schema sorted and determinist for future comparisons
if not dosort:
return a
a = re.sub("--.*", "", a) # FIXME: it was nice to have a -- version: XXX in SQL comments. This line discards it
a = re.sub("\n *", "", a)
table = re.sub("(CREATE TABLE [^ \\(]*).*", "\\1",a)
a = re.sub("CREATE TABLE [^\\(]*\\(", "", a)[:-1]
a = a.replace(', ',',')
c = 0 ; out = ""
for k in a: # we need to distinguish ',' separating fields and ',' as part of functions...
if k=='(': c+=1
elif k==')': c-=1
elif k==',' and c>0: k=';'
out += k
alist = [k.replace(';', ',') for k in out.split(',')]
a = ",\n\t".join(sorted(alist))
return table + '(\n\t' + a + '\n)'
cur = self.conn.cursor()
rs = cur.execute("select sql from sqlite_master where type='table' order by name").fetchall()
schema_text = ";\n".join([schemasort(k[0]) for k in rs if k[0] is not None])
schema_hash = xxhash.xxh64(schema_text).hexdigest()
return schema_text, schema_hash
def migrate2(self):
print("Migrating DB") # FIXME: nsbubfiles_rec is not processed yet
self.cur.executescript('''
drop index entries_name_idx;
alter table entries drop column name;
update entries set parentdir_len=1 where parentdir_len=0 and path!='/';
alter table entries add column name text GENERATED ALWAYS AS (substr(path,parentdir_len+iif(parentdir_len<2,1,2))) VIRTUAL;
create index entries_name_idx on entries(name);
''')
def migrate(self):
"""Migrate from old to new DB schema (table 'entries' instead of tables 'files', 'dirs' and 'symlinks')"""
tables = [k[0] for k in self.cur.execute("select name from sqlite_master where type='table'").fetchall()]
populate_nsubfiles_rec=False
if 'files' in tables and 'dirs' in tables and not 'entries' in tables:
print("Migrating DB") # FIXME: nsbubfiles_rec is not processed yet
self.cur.executescript('''
drop table if exists entries;
create table entries(
id integer primary key autoincrement,
type CHAR(1) NOT NULL,
path text UNIQUE NOT NULL,
parentdir_len integer,
parentdir text GENERATED ALWAYS AS (substr(path,1,parentdir_len)) VIRTUAL,
name text GENERATED ALWAYS AS (substr(path,parentdir_len+iif(parentdir_len<2,1,2))) VIRTUAL,
size integer,
hash integer,
magictype integer,
nsubdirs integer,
nsubfiles integer,
nsubfiles_rec integer,
symtarget text,
st_mtime integer, st_mode integer, st_uid integer, st_gid integer, st_ino integer, st_nlink integer, st_dev integer,
dbsession integer not null
);
create index entries_parentdir_idx on entries(parentdir);
create index entries_path_idx on entries(path);
create index entries_size_idx on entries(size);
create index entries_hash_idx on entries(hash);
insert into entries(type, path, parentdir_len, size, hash, magictype, st_mtime, st_mode, st_uid, st_gid, st_ino, st_nlink, st_dev, dbsession)
select 'F', path, get_parentdir_len(path), size, xxh64be, magictype, st_mtime, st_mode, st_uid, st_gid, st_ino, st_nlink, st_dev, dbsession from files;
insert into entries(type, path, parentdir_len, hash, symtarget, dbsession)
select 'S', path, get_parentdir_len(path), xxh64be, target, dbsession from symlinks;
insert into entries(type, path, parentdir_len, size, hash, st_mtime, st_mode, st_uid, st_gid, st_nlink, st_dev, nsubfiles, nsubdirs, dbsession)
select 'D', path, get_parentdir_len(path), size, xxh64be, st_mtime, st_mode, st_uid, st_gid, st_nlink, st_dev, nsubfiles, nsubdirs, dbsession from dirs;
drop table files;
drop table symlinks;
drop table dirs;
create view files as select id,parentdir,name,path,size,hash as xxh64be,st_mtime, st_mode, st_uid, st_gid, st_ino, st_nlink, st_dev,dbsession,magictype from entries where type='F';
create view dirs as select id,parentdir,name,path,size,nsubfiles,nsubdirs,hash as xxh64be,st_mtime, st_mode, st_uid, st_gid, st_nlink, st_dev,dbsession,magictype from entries where type='D';
create view symlinks as select id,parentdir,name,path,symtarget as target,NULL as type,hash as xxh64be,dbsession,magictype from entries where type='S';
''')
populate_nsubfiles_rec = True
else:
sql = self.cur.execute("select sql from sqlite_master where type='table' and name='entries'").fetchall()[0]
if not 'nsubfiles_rec integer' in sql:
print("Adding column nsubfiles_rec")
self.cur.execute("alter table entries add nsubfiles_rec integer")
populate_nsubfiles_rec = True
if populate_nsubfiles_rec:
print("Generating nsubfiles_rec")
rs = self.cur.execute("select path,nsubfiles,nsubdirs from entries where type='D' and not path in ('','/')")
for path,nsubfiles,nsubdirs in rs:
if nsubdirs>0:
nsubfiles_r = self.nsubfiles_rec(path)
self.cur.execute("update entries set nsubfiles_rec=nsubfiles where nsubdirs=0 and type='D'")
self.conn.commit()
############################################
# Main
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument("dbfile", help="DB path")
subparsers = parser.add_subparsers(dest="subcommand", required=True)
parser_scan = subparsers.add_parser('scan', help="Scan directory")
parser_scan.add_argument("path", help="path to scan")
parser_scan.add_argument("--resetdb", "-R", help="Reset DB", action='store_true', default=False)
parser_scan.add_argument("--previousdb", "-p", help="Previous DB (speed up scan)", default=None)
parser_scan.add_argument("--chunksize", "-C", help="file chunk size in bytes for xxhash (default is 1 MB)", default=FILE_HASH_CHUNKSIZE>>10)
# FIXME: add option to use a previous db, and skip dirs (+ copy the previous db contents for those subdirs) when mtime has not changed
parser_mount = subparsers.add_parser('mount')
parser_mount.add_argument("mountpoint", help="Mount point")
parser_isincluded = subparsers.add_parser('isincluded', help="Check whether all files in dirA/ are included in dirB/")
parser_isincluded.add_argument('dirA', help="source dir")
parser_isincluded.add_argument('dirB', help="dest dir")
parser_isincluded.add_argument("--otherdb", "-o", help="otherdb", default=None)
parser_isincluded.add_argument("--mountpoint", "-m", help="mountpoint for checking whether files are still present", default='')
parser_isincluded.add_argument("--display_included", "-i", help="Display files from dirA that are in dirB", action='store_true', default=False)
parser_isincluded.add_argument("--display_notincluded", "-n", help="Display files from dirA that are not in dirB", action='store_true', default=False)
#parser_isincluded.add_argument("--raw", "-r", help="Display files from dirA that are not in dirB", action='store_true', default=False)
parser_isincluded.add_argument("--globignore", "-g", help="Glob file to ignore results", default=None)
parser_comparedb = subparsers.add_parser('comparedb', help="Compare two DB")
parser_comparedb.add_argument('otherdb', help="DB to compare with")
parser_comparedb.add_argument("--globignore", "-z", help="Glob file to ignore results", default=None)
parser_migrate = subparsers.add_parser('migrate', help="Migrate DB schema")
parser_migrate.add_argument('newdb', help="New DB filename with updated schema")
parser_diff = subparsers.add_parser('diff', help="Show diffs between dirA/ and dirB/")
parser_diff.add_argument('dirA', help="source dir")
parser_diff.add_argument('dirB', help="dest dir")
parser_dump=subparsers.add_parser('dump', help="dump DB")
parser_dump.add_argument("--basedir", "-b", help="Basedir", default='')
subparsers.add_parser('computehash', help="Compute hash")
subparsers.add_parser('check_zerofile', help="Check whether file is made only of zeros (e.g. corrupted)")
parser_schema = subparsers.add_parser('schema', help="Print DB schema")
parser_schema.add_argument("--sql", "-s", help="Display SQL schema", action='store_true', default=False)
parser_schema.add_argument("--hash", "-n", help="Display schema hash", action='store_false', default=True)
subparsers.add_parser('compute_cachedups', help="Compute cachedups")
parser_showdups = subparsers.add_parser('showdups', help="show duplicates")
parser_showdups.add_argument("--mountpoint", "-m", help="mountpoint for checking whether files are still present", default='')
parser_showdups.add_argument("--basedir", "-b", help="Basedir", default='')
parser_showdups.add_argument("--limit", "-l", help="Max number of results", default=None)
parser_showdups.add_argument("--nsubfiles_first", "-n", help="sort by number of subfiles rather than size", action='store_true', default=False)
parser_grep = subparsers.add_parser('grep', help="grep text")
parser_grep.add_argument("--mountpoint", "-m", help="mountpoint", default='')
parser_grep.add_argument("--wordlist", "-w", help="mountpoint", nargs="+", required=True)
# Legacy
parser_dupfiles = subparsers.add_parser('dupfiles', help="show duplicate files")
parser_dupfiles.add_argument("--mountpoint", "-m", help="mountpoint for checking whether files are still present", default=None)
parser_dupfiles.add_argument("--limit", "-l", help="Max number of results", default=None)
parser_dupdirs = subparsers.add_parser('dupdirs', help="show duplicate dirs")
parser_dupdirs.add_argument("--mountpoint", "-m", help="mountpoint for checking whether files are still present", default=None)
parser_dupdirs.add_argument("--limit", "-l", help="Max number of results", default=None)
parser_inodes = subparsers.add_parser('inodes', help="show entries sharing the same inode (hardlinks)")
parser_inodes.add_argument("--mountpoint", "-m", help="mountpoint for checking whether files are still present", default=None)
parser_inodes.add_argument("--limit", "-l", help="Max number of results", default=None)
parser_getincluded = subparsers.add_parser('getincluded', help="show per-directory duplication ratio (experimental)")
parser_getincluded.add_argument("--basedir", "-b", help="Basedir", default='')
parser_testreg = subparsers.add_parser('testreg', help="test the regex-ignore file (passed as dbfile) against a string")
parser_testreg.add_argument("teststring", default="")
args = parser.parse_args()
if args.subcommand=='scan':
if args.resetdb and os.path.exists(args.dbfile):
os.remove(args.dbfile)
#ddb.createdb()
#ddb.conn.commit()
ddb=DDB(args.dbfile, rw=True, chunksize=int(args.chunksize)<<10)
try:
ddb.dirscan(args.path, prevdb = args.previousdb)
ddb.sync_db()
ddb.compute_cachedups()
ddb.conn.close()
except(KeyboardInterrupt):
ddb.sync_db()
ddb.conn.close()
print("\n_________________\nkeyboard interrupt !")
#allsize = ddb.dbgsize()
#print("\n_________________\nkeyboard interrupt, %d stored" % (allsize>>20))
elif args.subcommand=='dump': # FIXME: change it to table "entries" instead of "dirs"
ddb=DDB(args.dbfile)
ddb.dumpdir(args.basedir)
elif args.subcommand=='isincluded':
ddb=DDB(args.dbfile, globignore=args.globignore)
if not (args.display_notincluded or args.display_included):
print('Choose -n or -i !')
exit()
ddb.isincluded(args.dirA, args.dirB,
otherddbfs=args.otherdb, basedir=args.mountpoint,
display_included=args.display_included,
display_notincluded=args.display_notincluded)
elif args.subcommand=='diff':
ddb=DDB(args.dbfile)
ddb.diff(args.dirA, args.dirB)
elif args.subcommand=='dupdirs':
ddb=DDB(args.dbfile)
ddb.compute_dupdirs(basedir=args.mountpoint, nres=args.limit)
elif args.subcommand=='dupfiles':
ddb=DDB(args.dbfile)
ddb.compute_dupfiles(basedir=args.mountpoint, nres=args.limit)
elif args.subcommand=='comparedb':
print(f"Files from {args.dbfile} that are not in {args.otherdb} (i.e. deleted files)")
ddb=DDB(args.dbfile, globignore=args.globignore)
ddb.isincluded('', '', otherddbfs=args.otherdb, basedir='', checkfs=False)
#print(f"\n_________\nFiles from {args.otherdb} that are not in {args.dbfile} (i.e. new files)")
#ddb=DDB(args.otherdb)
#ddb.isincluded('', '', otherddbfs=args.dbfile, basedir='', checkfs=False)
elif args.subcommand=='migrate':
print(f"Copying {args.dbfile} -> {args.newdb}")
shutil.copyfile(args.dbfile, args.newdb)
ddb=DDB(args.newdb)
ddb.migrate()
elif args.subcommand=='computehash':
filestat = os.stat(args.dbfile)
entrysize = int(filestat.st_size)
fxxh = xxhash_file(args.dbfile, entrysize)
print(f"0x{fxxh+(1<<63):0>16x}, {fxxh+(1<<63)} {fxxh}, {entrysize>>20} Mo : {args.dbfile}")
elif args.subcommand=='check_zerofile':
print(check_zerofile(args.dbfile))
elif args.subcommand=='schema':
ddb=DDB(args.dbfile)
schema,shash = ddb.schema()
if args.sql: print(schema)
if args.hash: print(f"Schema hash: {shash}")
elif args.subcommand=='compute_cachedups':
ddb=DDB(args.dbfile)
ddb.compute_cachedups()
elif args.subcommand=='showdups':
ddb=DDB(args.dbfile)
ddb.showdups(basedir=args.basedir, mountpoint=args.mountpoint, nres=args.limit, orderbysize=not args.nsubfiles_first)
elif args.subcommand=='inodes':
ddb=DDB(args.dbfile)
ddb.show_same_inode(basedir=args.mountpoint, nres=args.limit)
elif args.subcommand=='getincluded':
ddb=DDB(args.dbfile)
ddb.getincluded(basedir=args.basedir)
elif args.subcommand=='testreg':
regex = regmulti(args.dbfile)
print(re.match(regex,args.teststring))
elif args.subcommand=='grep':
ddb=DDB(args.dbfile)
ddb.grepext(wordlist=args.wordlist, mountpoint=args.mountpoint)
print()