-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Use the xxHash algorithm instead of crc32, CLI flags, Go module.
xxHash is an extremely fast non-cryptographic hash algorithm https://github.com/Cyan4973/xxHash "output" and "config" flags, to better CLI usage. Go module, to control dependencies.
- Loading branch information
1 parent
bae0513
commit 4665092
Showing
9 changed files
with
157 additions
and
108 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,19 +1,37 @@ | ||
# go-file-copies | ||
go-file-copies | ||
============== | ||
|
||
A Go package to fetch duplicate files | ||
A Go program that finds duplicate files in the specified paths. | ||
|
||
Table of Contents | ||
================= | ||
|
||
+ [Images](#Images) | ||
+ [Flags](#Flags) | ||
+ [Install](#Install) | ||
+ [Run](#Run) | ||
|
||
### Images | ||
|
||
 | ||
 | ||
|
||
### Flags | ||
|
||
* config - _path to your config file, which lists the directories to scan for duplicates._ | ||
* output - _specify the path to the output file with results._ | ||
|
||
### Install | ||
|
||
##### Compile it yourself | ||
Install [Go](https://golang.org/) and run [compile.sh](compile.sh) from the terminal. | ||
Binaries will be placed in the "build" directory. | ||
|
||
 | ||
##### Use precompiled binaries | ||
Download the binary for your system from [releases](https://github.com/Dmitriy-Vas/go-file-copies/releases). | ||
|
||
### Usage | ||
### Run | ||
|
||
Clone repository and write in terminal: `cd go-dungeon-gen && go build main.go && main.exe` | ||
<br> | ||
Or you can use precompiled binaries from [releases](https://github.com/Dmitriy-Vas/go-file-copies/releases) and run as default program. | ||
<br> | ||
You'll get the file `duplicates.json` in the directory with program. | ||
<br> | ||
**Files with 0 hash are empty.** | ||
Rename [config-sample.json](config-sample.json) to config.json and list the directories to scan for duplicates under "dirs".<br> | ||
The program collects __all files recursively__ from the specified directories.<br> | ||
You can specify the paths to the config and output files via [flags](#Flags).<br> | ||
Go to the directory with the program and run it like any other binary. |
This file was deleted.
Oops, something went wrong.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,11 @@ | ||
#!/bin/bash
# Cross-compiles go-file-copies for every OS in OsArray and places the binary,
# together with a copy of config-sample.json, under build/<os>/.
#
# Abort on the first failing command (or unset variable) so a broken build is
# never followed by a copy step that makes the output directory look complete.
set -euo pipefail

OsArray=("linux" "darwin" "windows")

# "${OsArray[@]}" expands to one word per element, immune to word splitting.
for os in "${OsArray[@]}"; do
    name="go-file-copies"
    # Windows executables need the .exe suffix.
    if [ "$os" = "windows" ]; then
        name+=".exe"
    fi
    mkdir -p "build/$os"
    GOOS="$os" go build -o "build/$os/$name"
    cp config-sample.json "build/$os/"
done
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,6 +1,7 @@ | ||
{ | ||
"directories": [ | ||
"C:\\Program Files", | ||
"C:\\Path\\To\\Your\\Directory" | ||
"dirs": [ | ||
"/mnt/Data/Pictures", | ||
"/home/Dmitriy/Pictures", | ||
"./../../Downloads" | ||
] | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,5 @@ | ||
module github.com/Dmitriy-Vas/go-file-copies | ||
|
||
go 1.13 | ||
|
||
require github.com/cespare/xxhash/v2 v2.1.1 |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,3 @@ | ||
github.com/cespare/xxhash v1.1.0 h1:a6HrQnmkObjyL+Gs60czilIUGqrzKutQD6XZog3p+ko= | ||
github.com/cespare/xxhash/v2 v2.1.1 h1:6MnRN8NT7+YBpUIWxHtefFZOKTAPgGjpQSxqLNn0+qY= | ||
github.com/cespare/xxhash/v2 v2.1.1/go.mod h1:VGX0DQ3Q6kWi7AoAeZDth3/j3BFtOZR5XLFGgcrjCOs= |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,131 +1,150 @@ | ||
package main | ||
|
||
//#region Header | ||
import ( | ||
"encoding/json" | ||
"fmt" | ||
"hash/crc32" | ||
"flag" | ||
"github.com/cespare/xxhash/v2" | ||
"io/ioutil" | ||
"log" | ||
"os" | ||
"os/signal" | ||
"path/filepath" | ||
"sync" | ||
"syscall" | ||
) | ||
|
||
const ( | ||
OS_PERMISSIONS os.FileMode = 0644 | ||
var ( | ||
config *Config | ||
) | ||
|
||
var ( | ||
config Config | ||
duplicates map[uint32][]File | ||
const ( | ||
OsFilePermissions os.FileMode = 0644 | ||
DefaultConfigPath string = "./config.json" | ||
DefaultOutputPath string = "./output.json" | ||
) | ||
|
||
type Config struct { | ||
Directories []string `json:"directories"` | ||
} | ||
// Config carries the program settings.
type Config struct {
	// ConfigPath is the location of the JSON config file.
	ConfigPath string
	// OutputPath is the location of the JSON results file.
	OutputPath string

	// Directories lists the directories that contain the duplicates to look for.
	Directories []string `json:"dirs"`
}

// File is the path of a file on disk.
type File string

// Executor collects file hashes produced by concurrently running goroutines.
type Executor struct {
	// mutex is locked while Results is being updated.
	mutex *sync.Mutex
	// wg counts the SaveFileHash calls that have not finished yet.
	wg *sync.WaitGroup

	// Results maps an xxHash digest to every file path that produced it.
	// This map is what gets saved as JSON.
	Results map[uint64][]File
}
|
||
// isErr aborts the program through log.Fatal when err is non-nil.
// A nil error is a no-op.
func isErr(err error) {
	if err == nil {
		return
	}
	log.Fatal(err)
}
|
||
//#region Storage | ||
func getConfig() { | ||
data, err := ioutil.ReadFile("config.json") | ||
isErr(err) | ||
err = json.Unmarshal(data, &config) | ||
isErr(err) | ||
} | ||
|
||
func saveConfig() { | ||
data, err := json.Marshal(&config) | ||
isErr(err) | ||
err = ioutil.WriteFile("config.json", data, OS_PERMISSIONS) | ||
isErr(err) | ||
fmt.Println("Config successfully saved!") | ||
} | ||
|
||
func saveResult() { | ||
data, err := json.Marshal(&duplicates) | ||
isErr(err) | ||
err = ioutil.WriteFile("duplicates.json", data, OS_PERMISSIONS) | ||
isErr(err) | ||
} | ||
|
||
//#endregion | ||
|
||
func getCRCHash(name string) uint32 { | ||
data, err := ioutil.ReadFile(name) | ||
isErr(err) | ||
return crc32.ChecksumIEEE(data) | ||
// Check directories, which specified as paths with duplicates, to exists | ||
// Returns non-existing directory | ||
func (c *Config) IsDirsExists() (dir string) { | ||
for _, dir := range config.Directories { | ||
if _, err := os.Stat(dir); os.IsNotExist(err) { | ||
return dir | ||
} | ||
} | ||
return | ||
} | ||
|
||
func getFilesToScan() { | ||
duplicates = make(map[uint32][]File) | ||
// По очереди берём пути из конфига и проходим по ним, собирая все найденные файлы | ||
// Walkthrough specified directories and save paths to all found files | ||
// Returns slice of found paths | ||
func (c *Config) GetFiles() []File { | ||
output := make([]File, 0) | ||
for _, dir := range config.Directories { | ||
err := filepath.Walk(dir, func(path string, info os.FileInfo, err error) error { | ||
// Проверяем путь на директорию, добавляем только файлы | ||
if !info.IsDir() { | ||
// Получаем хэш файла | ||
hash := getCRCHash(path) | ||
f := File{ | ||
Path: path, | ||
Hash: hash, | ||
} | ||
duplicates[hash] = append(duplicates[hash], f) | ||
filepath.Walk(dir, func(path string, info os.FileInfo, err error) error { | ||
if info.IsDir() { | ||
return nil | ||
} | ||
output = append(output, File(path)) | ||
return nil | ||
}) | ||
isErr(err) | ||
} | ||
return output | ||
} | ||
|
||
fmt.Println("Found", len(duplicates), "hashes") | ||
// Get uint64 hash from a file, using xxHash algorithm | ||
// https://github.com/Cyan4973/xxHash#benchmarks | ||
func (f File) GetHash() uint64 { | ||
raw, err := ioutil.ReadFile(string(f)) | ||
isErr(err) | ||
return xxhash.Sum64(raw) | ||
} | ||
|
||
func checkFiles() { | ||
result := make(map[uint32][]File) | ||
// Collect all files and save their hashes to the mapping | ||
func (e *Executor) SaveFileHash(file File) { | ||
e.mutex.Lock() | ||
defer e.mutex.Unlock() | ||
defer e.wg.Done() | ||
|
||
for h, v := range duplicates { | ||
if len(v) == 1 { | ||
continue | ||
} | ||
result[h] = duplicates[h] | ||
} | ||
duplicates = result | ||
hash := file.GetHash() | ||
e.Results[hash] = append(e.Results[hash], file) | ||
} | ||
|
||
func init() { | ||
config = new(Config) | ||
|
||
flag.StringVar(&config.ConfigPath, "config", DefaultConfigPath, "Use this flag to specify the path to your config file, which has paths to directories with duplicates.") | ||
flag.StringVar(&config.OutputPath, "output", DefaultOutputPath, "Use this flag to specify the path to the output file with results.") | ||
flag.Parse() | ||
|
||
fmt.Println("Found", len(duplicates), "copies") | ||
raw, err := ioutil.ReadFile(config.ConfigPath) | ||
isErr(err) | ||
isErr(json.Unmarshal(raw, &config)) | ||
} | ||
|
||
func main() { | ||
// Загружаем конфиг с диска | ||
getConfig() | ||
|
||
// Отлавливаем Ctrl^C для сохранения конфига | ||
go func() { | ||
signalChan := make(chan os.Signal, 1) | ||
signal.Notify(signalChan, os.Interrupt) | ||
signal.Notify(signalChan, os.Interrupt, syscall.SIGKILL, syscall.SIGHUP) | ||
defer close(signalChan) | ||
|
||
<-signalChan | ||
|
||
saveConfig() | ||
log.Println("Shutting down the program...") | ||
os.Exit(0) | ||
}() | ||
// Сохраняем конфиг при ошибке или если программа закончила выполнение | ||
defer saveConfig() | ||
|
||
getFilesToScan() | ||
if dir := config.IsDirsExists(); dir != "" { | ||
log.Fatalf("Directory %s can not be found.", dir) | ||
} | ||
|
||
checkFiles() | ||
files := config.GetFiles() | ||
log.Printf("Found %d files.", len(files)) | ||
|
||
saveResult() | ||
exec := &Executor{ | ||
mutex: new(sync.Mutex), | ||
wg: new(sync.WaitGroup), | ||
Results: make(map[uint64][]File), | ||
} | ||
for _, file := range files { | ||
exec.wg.Add(1) | ||
go exec.SaveFileHash(file) | ||
} | ||
exec.wg.Wait() | ||
|
||
for hash, files := range exec.Results { | ||
if len(files) <= 1 { | ||
delete(exec.Results, hash) | ||
} | ||
} | ||
log.Printf("Found %d hashes of duplicates.", len(exec.Results)) | ||
|
||
raw, err := json.Marshal(exec.Results) | ||
isErr(err) | ||
isErr(ioutil.WriteFile(config.OutputPath, raw, OsFilePermissions)) | ||
log.Printf("All results saved to %s", config.OutputPath) | ||
} |