diff --git a/.gitignore b/.gitignore index 84d6087..e32e1df 100644 --- a/.gitignore +++ b/.gitignore @@ -4,8 +4,6 @@ *.dll *.so *.dylib -Linux* -Darwin* # Config file config.json @@ -14,8 +12,8 @@ config.json *.test # Output -*.out -duplicates.json +build/ +output.json # IDEA configuration files *.iws diff --git a/LICENSE b/LICENSE index 9ef0225..d5e90db 100644 --- a/LICENSE +++ b/LICENSE @@ -1,6 +1,6 @@ MIT License -Copyright (c) 2019 Дмитрий Васильев +Copyright (c) 2019, 2020 Дмитрий Васильев Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal diff --git a/README.md b/README.md index bbf89d2..eb56034 100644 --- a/README.md +++ b/README.md @@ -1,19 +1,37 @@ -# go-file-copies +go-file-copies +============== -A Go package to fetch duplicate files +A Go program to get duplicates from specified paths. + +Table of Contents +================= + ++ [Images](#Images) ++ [Flags](#Flags) ++ [Install](#Install) ++ [Run](#Run) ### Images -![](https://i.imgur.com/vDyJmYs.png) +![](https://i.imgur.com/mY3fIni.png) + +### Flags + + * config - _specify the path to your config file, which has paths to directories with duplicates._ + * output - _specify the path to the output file with results._ + +### Install + +##### Compile for yourself +Install [Go](https://golang.org/) and run [compile.sh](compile.sh) from the terminal. +Binaries will be placed in the "build" directory. -![](https://i.imgur.com/IJvNUtF.png) +##### Use precompiled binaries +Download binary for your system from [releases](https://github.com/Dmitriy-Vas/go-file-copies/releases) -### Usage +### Run -Clone repository and write in terminal: `cd go-dungeon-gen && go build main.go && main.exe` -
-Or you can use precompiled binaries from [releases](https://github.com/Dmitriy-Vas/go-file-copies/releases) and run as default program. -
-You'll get the file `duplicates.json` in the directory with program. -
-**Files with 0 hash are empty.** +Rename [config-sample.json](config-sample.json) to config.json and add the paths of the directories to scan for duplicates to "dirs".
+The program will take __all files recursively__ from the specified directories.
+You can specify the paths to the config and output files via [flags](#Flags).
+Go to the directory with the program and run it like usual binary. diff --git a/compile.bat b/compile.bat deleted file mode 100644 index 233d998..0000000 --- a/compile.bat +++ /dev/null @@ -1,6 +0,0 @@ -set GOARCH=386 -go build -o WindowsCopies.exe main.go -set GOOS=darwin -go build -o DarwinCopies main.go -set GOOS=linux -go build -o LinuxCopies main.go diff --git a/compile.sh b/compile.sh new file mode 100644 index 0000000..9f2acb8 --- /dev/null +++ b/compile.sh @@ -0,0 +1,11 @@ +#!/bin/bash +OsArray=("linux" "darwin" "windows") + +for os in ${OsArray[*]}; do + name="go-file-copies" + if [ "$os" = "windows" ]; then + name+=".exe" + fi + GOOS=$os go build -o build/"$os"/$name + cp config-sample.json build/"$os"/ +done diff --git a/config-sample.json b/config-sample.json index e86f70c..4282c30 100644 --- a/config-sample.json +++ b/config-sample.json @@ -1,6 +1,7 @@ { - "directories": [ - "C:\\Program Files", - "C:\\Path\\To\\Your\\Directory" + "dirs": [ + "/mnt/Data/Pictures", + "/home/Dmitriy/Pictures", + "./../../Downloads" ] } diff --git a/go.mod b/go.mod new file mode 100644 index 0000000..c933533 --- /dev/null +++ b/go.mod @@ -0,0 +1,5 @@ +module github.com/Dmitriy-Vas/go-file-copies + +go 1.13 + +require github.com/cespare/xxhash/v2 v2.1.1 diff --git a/go.sum b/go.sum new file mode 100644 index 0000000..3f603af --- /dev/null +++ b/go.sum @@ -0,0 +1,3 @@ +github.com/cespare/xxhash v1.1.0 h1:a6HrQnmkObjyL+Gs60czilIUGqrzKutQD6XZog3p+ko= +github.com/cespare/xxhash/v2 v2.1.1 h1:6MnRN8NT7+YBpUIWxHtefFZOKTAPgGjpQSxqLNn0+qY= +github.com/cespare/xxhash/v2 v2.1.1/go.mod h1:VGX0DQ3Q6kWi7AoAeZDth3/j3BFtOZR5XLFGgcrjCOs= diff --git a/main.go b/main.go index acda3dd..b2cfe5d 100644 --- a/main.go +++ b/main.go @@ -1,36 +1,48 @@ package main -//#region Header import ( "encoding/json" - "fmt" - "hash/crc32" + "flag" + "github.com/cespare/xxhash/v2" "io/ioutil" "log" "os" "os/signal" "path/filepath" + "sync" + "syscall" ) -const ( - OS_PERMISSIONS os.FileMode = 0644 +var ( + 
config *Config ) -var ( - config Config - duplicates map[uint32][]File +const ( + OsFilePermissions os.FileMode = 0644 + DefaultConfigPath string = "./config.json" + DefaultOutputPath string = "./output.json" ) -type Config struct { - Directories []string `json:"directories"` -} +type ( + Config struct { + ConfigPath string + OutputPath string -type File struct { - Hash uint32 - Path string -} + // Paths to directories, which has duplicates + Directories []string `json:"dirs"` + } + + File string + + Executor struct { + mutex *sync.Mutex + wg *sync.WaitGroup -//#endregion + // Results, which will be saved in JSON + // map[xxHash][]Path + Results map[uint64][]File + } +) func isErr(err error) { if err != nil { @@ -38,94 +50,101 @@ func isErr(err error) { } } -//#region Storage -func getConfig() { - data, err := ioutil.ReadFile("config.json") - isErr(err) - err = json.Unmarshal(data, &config) - isErr(err) -} - -func saveConfig() { - data, err := json.Marshal(&config) - isErr(err) - err = ioutil.WriteFile("config.json", data, OS_PERMISSIONS) - isErr(err) - fmt.Println("Config successfully saved!") -} - -func saveResult() { - data, err := json.Marshal(&duplicates) - isErr(err) - err = ioutil.WriteFile("duplicates.json", data, OS_PERMISSIONS) - isErr(err) -} - -//#endregion - -func getCRCHash(name string) uint32 { - data, err := ioutil.ReadFile(name) - isErr(err) - return crc32.ChecksumIEEE(data) +// Check directories, which specified as paths with duplicates, to exists +// Returns non-existing directory +func (c *Config) IsDirsExists() (dir string) { + for _, dir := range config.Directories { + if _, err := os.Stat(dir); os.IsNotExist(err) { + return dir + } + } + return } -func getFilesToScan() { - duplicates = make(map[uint32][]File) - // По очереди берём пути из конфига и проходим по ним, собирая все найденные файлы +// Walkthrough specified directories and save paths to all found files +// Returns slice of found paths +func (c *Config) GetFiles() []File { + output 
:= make([]File, 0) for _, dir := range config.Directories { - err := filepath.Walk(dir, func(path string, info os.FileInfo, err error) error { - // Проверяем путь на директорию, добавляем только файлы - if !info.IsDir() { - // Получаем хэш файла - hash := getCRCHash(path) - f := File{ - Path: path, - Hash: hash, - } - duplicates[hash] = append(duplicates[hash], f) + filepath.Walk(dir, func(path string, info os.FileInfo, err error) error { + if info.IsDir() { + return nil } + output = append(output, File(path)) return nil }) - isErr(err) } + return output +} - fmt.Println("Found", len(duplicates), "hashes") +// Get uint64 hash from a file, using xxHash algorithm +// https://github.com/Cyan4973/xxHash#benchmarks +func (f File) GetHash() uint64 { + raw, err := ioutil.ReadFile(string(f)) + isErr(err) + return xxhash.Sum64(raw) } -func checkFiles() { - result := make(map[uint32][]File) +// Collect all files and save their hashes to the mapping +func (e *Executor) SaveFileHash(file File) { + e.mutex.Lock() + defer e.mutex.Unlock() + defer e.wg.Done() - for h, v := range duplicates { - if len(v) == 1 { - continue - } - result[h] = duplicates[h] - } - duplicates = result + hash := file.GetHash() + e.Results[hash] = append(e.Results[hash], file) +} + +func init() { + config = new(Config) + + flag.StringVar(&config.ConfigPath, "config", DefaultConfigPath, "Use this flag to specify the path to your config file, which has paths to directories with duplicates.") + flag.StringVar(&config.OutputPath, "output", DefaultOutputPath, "Use this flag to specify the path to the output file with results.") + flag.Parse() - fmt.Println("Found", len(duplicates), "copies") + raw, err := ioutil.ReadFile(config.ConfigPath) + isErr(err) + isErr(json.Unmarshal(raw, &config)) } func main() { - // Загружаем конфиг с диска - getConfig() - - // Отлавливаем Ctrl^C для сохранения конфига go func() { signalChan := make(chan os.Signal, 1) - signal.Notify(signalChan, os.Interrupt) + 
signal.Notify(signalChan, os.Interrupt, syscall.SIGKILL, syscall.SIGHUP) + defer close(signalChan) <-signalChan - - saveConfig() + log.Println("Shutting down the program...") os.Exit(0) }() - // Сохраняем конфиг при ошибке или если программа закончила выполнение - defer saveConfig() - getFilesToScan() + if dir := config.IsDirsExists(); dir != "" { + log.Fatalf("Directory %s can not be found.", dir) + } - checkFiles() + files := config.GetFiles() + log.Printf("Found %d files.", len(files)) - saveResult() + exec := &Executor{ + mutex: new(sync.Mutex), + wg: new(sync.WaitGroup), + Results: make(map[uint64][]File), + } + for _, file := range files { + exec.wg.Add(1) + go exec.SaveFileHash(file) + } + exec.wg.Wait() + + for hash, files := range exec.Results { + if len(files) <= 1 { + delete(exec.Results, hash) + } + } + log.Printf("Found %d hashes of duplicates.", len(exec.Results)) + + raw, err := json.Marshal(exec.Results) + isErr(err) + isErr(ioutil.WriteFile(config.OutputPath, raw, OsFilePermissions)) + log.Printf("All results saved to %s", config.OutputPath) }