Skip to content

Commit

Permalink
Use the xxHash algorithm instead of crc32, CLI flags, Go module.
Browse files Browse the repository at this point in the history
xxHash is an extremely fast non-cryptographic hash algorithm
https://github.com/Cyan4973/xxHash

"output" and "config" flags, to better CLI usage.

Go module, to control dependencies.
  • Loading branch information
Dmitriy-Vas committed Jan 21, 2020
1 parent bae0513 commit 4665092
Show file tree
Hide file tree
Showing 9 changed files with 157 additions and 108 deletions.
6 changes: 2 additions & 4 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -4,8 +4,6 @@
*.dll
*.so
*.dylib
Linux*
Darwin*

# Config file
config.json
Expand All @@ -14,8 +12,8 @@ config.json
*.test

# Output
*.out
duplicates.json
build/
output.json

# IDEA configuration files
*.iws
Expand Down
2 changes: 1 addition & 1 deletion LICENSE
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
MIT License

Copyright (c) 2019 Дмитрий Васильев
Copyright (c) 2019, 2020 Дмитрий Васильев

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
Expand Down
42 changes: 30 additions & 12 deletions README.md
Original file line number Diff line number Diff line change
@@ -1,19 +1,37 @@
# go-file-copies
go-file-copies
==============

A Go package to fetch duplicate files
A Go program to get duplicates from specified paths.

Table of Contents
=================

+ [Images](#Images)
+ [Flags](#Flags)
+ [Install](#Install)
+ [Run](#Run)

### Images

![](https://i.imgur.com/vDyJmYs.png)
![](https://i.imgur.com/mY3fIni.png)

### Flags

* config - _specify the path to your config file, which lists the directories to scan for duplicates._
* output - _specify the path to the output file with results._

### Install

##### Compile for yourself
Install [Go](https://golang.org/) and run [compile.sh](compile.sh) from the terminal.
Binaries will be placed in the "build" directory.

![](https://i.imgur.com/IJvNUtF.png)
##### Use precompiled binaries
Download the binary for your system from [releases](https://github.com/Dmitriy-Vas/go-file-copies/releases).

### Usage
### Run

Clone repository and write in terminal: `cd go-dungeon-gen && go build main.go && main.exe`
<br>
Or you can use precompiled binaries from [releases](https://github.com/Dmitriy-Vas/go-file-copies/releases) and run as default program.
<br>
You'll get the file `duplicates.json` in the directory with program.
<br>
**Files with 0 hash are empty.**
Rename [config-sample.json](config-sample.json) to config.json and add the paths to scan for duplicates to "dirs".<br>
The program will take __all files recursively__ from the specified directories.<br>
You can specify the paths to the config and output files via [flags](#Flags).<br>
Go to the directory with the program and run it like any other binary.
6 changes: 0 additions & 6 deletions compile.bat

This file was deleted.

11 changes: 11 additions & 0 deletions compile.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
#!/bin/bash
# Cross-compiles go-file-copies for every target OS listed in OsArray.
# Each binary is written to build/<os>/ and config-sample.json is copied
# next to it.

# Stop at the first failing command instead of continuing with a partial build.
set -euo pipefail

OsArray=("linux" "darwin" "windows")

# "${OsArray[@]}" expands to one word per element, without word splitting.
for os in "${OsArray[@]}"; do
    name="go-file-copies"
    # Windows binaries get an .exe suffix.
    if [ "$os" = "windows" ]; then
        name+=".exe"
    fi

    out_dir="build/$os"
    mkdir -p "$out_dir"

    GOOS="$os" go build -o "$out_dir/$name"
    cp config-sample.json "$out_dir/"
done
7 changes: 4 additions & 3 deletions config-sample.json
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
{
"directories": [
"C:\\Program Files",
"C:\\Path\\To\\Your\\Directory"
"dirs": [
"/mnt/Data/Pictures",
"/home/Dmitriy/Pictures",
"./../../Downloads"
]
}
5 changes: 5 additions & 0 deletions go.mod
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
module github.com/Dmitriy-Vas/go-file-copies

go 1.13

require github.com/cespare/xxhash/v2 v2.1.1
3 changes: 3 additions & 0 deletions go.sum
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
github.com/cespare/xxhash v1.1.0 h1:a6HrQnmkObjyL+Gs60czilIUGqrzKutQD6XZog3p+ko=
github.com/cespare/xxhash/v2 v2.1.1 h1:6MnRN8NT7+YBpUIWxHtefFZOKTAPgGjpQSxqLNn0+qY=
github.com/cespare/xxhash/v2 v2.1.1/go.mod h1:VGX0DQ3Q6kWi7AoAeZDth3/j3BFtOZR5XLFGgcrjCOs=
183 changes: 101 additions & 82 deletions main.go
Original file line number Diff line number Diff line change
@@ -1,131 +1,150 @@
package main

//#region Header
import (
"encoding/json"
"fmt"
"hash/crc32"
"flag"
"github.com/cespare/xxhash/v2"
"io/ioutil"
"log"
"os"
"os/signal"
"path/filepath"
"sync"
"syscall"
)

const (
OS_PERMISSIONS os.FileMode = 0644
var (
config *Config
)

var (
config Config
duplicates map[uint32][]File
const (
OsFilePermissions os.FileMode = 0644
DefaultConfigPath string = "./config.json"
DefaultOutputPath string = "./output.json"
)

type Config struct {
Directories []string `json:"directories"`
}
type (
Config struct {
ConfigPath string
OutputPath string

type File struct {
Hash uint32
Path string
}
// Paths to directories, which has duplicates
Directories []string `json:"dirs"`
}

File string

Executor struct {
mutex *sync.Mutex
wg *sync.WaitGroup

//#endregion
// Results, which will be saved in JSON
// map[xxHash][]Path
Results map[uint64][]File
}
)

// isErr hands a non-nil err to log.Fatal, which logs it and exits the program.
// A nil err is a no-op.
func isErr(err error) {
	if err == nil {
		return
	}
	log.Fatal(err)
}

//#region Storage
// getConfig reads "config.json" from the working directory and decodes its
// JSON into the package-level config. Read and decode errors go to isErr.
func getConfig() {
	raw, err := ioutil.ReadFile("config.json")
	isErr(err)
	isErr(json.Unmarshal(raw, &config))
}

// saveConfig encodes the package-level config as JSON and writes it to
// "config.json" with OS_PERMISSIONS. Encode and write errors go to isErr;
// a confirmation line is printed once the write has gone through.
func saveConfig() {
	encoded, err := json.Marshal(&config)
	isErr(err)
	isErr(ioutil.WriteFile("config.json", encoded, OS_PERMISSIONS))
	fmt.Println("Config successfully saved!")
}

// saveResult encodes the package-level duplicates map as JSON and writes it
// to "duplicates.json" with OS_PERMISSIONS. Encode and write errors go to
// isErr.
func saveResult() {
	encoded, err := json.Marshal(&duplicates)
	isErr(err)
	isErr(ioutil.WriteFile("duplicates.json", encoded, OS_PERMISSIONS))
}

//#endregion

func getCRCHash(name string) uint32 {
data, err := ioutil.ReadFile(name)
isErr(err)
return crc32.ChecksumIEEE(data)
// IsDirsExists checks every path in c.Directories, in order, and returns the
// first one that does not exist. It returns an empty string when no path is
// reported missing.
//
// Only "not exist" errors from os.Stat count as missing; any other Stat error
// is ignored here.
func (c *Config) IsDirsExists() string {
	// Read from the receiver rather than the package-level config so the
	// method reports on the Config it was called on.
	for _, dir := range c.Directories {
		if _, err := os.Stat(dir); os.IsNotExist(err) {
			return dir
		}
	}
	return ""
}

func getFilesToScan() {
duplicates = make(map[uint32][]File)
// По очереди берём пути из конфига и проходим по ним, собирая все найденные файлы
// Walkthrough specified directories and save paths to all found files
// Returns slice of found paths
func (c *Config) GetFiles() []File {
output := make([]File, 0)
for _, dir := range config.Directories {
err := filepath.Walk(dir, func(path string, info os.FileInfo, err error) error {
// Проверяем путь на директорию, добавляем только файлы
if !info.IsDir() {
// Получаем хэш файла
hash := getCRCHash(path)
f := File{
Path: path,
Hash: hash,
}
duplicates[hash] = append(duplicates[hash], f)
filepath.Walk(dir, func(path string, info os.FileInfo, err error) error {
if info.IsDir() {
return nil
}
output = append(output, File(path))
return nil
})
isErr(err)
}
return output
}

fmt.Println("Found", len(duplicates), "hashes")
// GetHash reads the whole file whose path is f and returns the 64-bit xxHash
// of its contents. A read error goes to isErr.
// https://github.com/Cyan4973/xxHash#benchmarks
func (f File) GetHash() uint64 {
	contents, readErr := ioutil.ReadFile(string(f))
	isErr(readErr)
	digest := xxhash.Sum64(contents)
	return digest
}

func checkFiles() {
result := make(map[uint32][]File)
// Collect all files and save their hashes to the mapping
func (e *Executor) SaveFileHash(file File) {
e.mutex.Lock()
defer e.mutex.Unlock()
defer e.wg.Done()

for h, v := range duplicates {
if len(v) == 1 {
continue
}
result[h] = duplicates[h]
}
duplicates = result
hash := file.GetHash()
e.Results[hash] = append(e.Results[hash], file)
}

func init() {
config = new(Config)

flag.StringVar(&config.ConfigPath, "config", DefaultConfigPath, "Use this flag to specify the path to your config file, which has paths to directories with duplicates.")
flag.StringVar(&config.OutputPath, "output", DefaultOutputPath, "Use this flag to specify the path to the output file with results.")
flag.Parse()

fmt.Println("Found", len(duplicates), "copies")
raw, err := ioutil.ReadFile(config.ConfigPath)
isErr(err)
isErr(json.Unmarshal(raw, &config))
}

func main() {
// Загружаем конфиг с диска
getConfig()

// Отлавливаем Ctrl^C для сохранения конфига
go func() {
signalChan := make(chan os.Signal, 1)
signal.Notify(signalChan, os.Interrupt)
signal.Notify(signalChan, os.Interrupt, syscall.SIGKILL, syscall.SIGHUP)
defer close(signalChan)

<-signalChan

saveConfig()
log.Println("Shutting down the program...")
os.Exit(0)
}()
// Сохраняем конфиг при ошибке или если программа закончила выполнение
defer saveConfig()

getFilesToScan()
if dir := config.IsDirsExists(); dir != "" {
log.Fatalf("Directory %s can not be found.", dir)
}

checkFiles()
files := config.GetFiles()
log.Printf("Found %d files.", len(files))

saveResult()
exec := &Executor{
mutex: new(sync.Mutex),
wg: new(sync.WaitGroup),
Results: make(map[uint64][]File),
}
for _, file := range files {
exec.wg.Add(1)
go exec.SaveFileHash(file)
}
exec.wg.Wait()

for hash, files := range exec.Results {
if len(files) <= 1 {
delete(exec.Results, hash)
}
}
log.Printf("Found %d hashes of duplicates.", len(exec.Results))

raw, err := json.Marshal(exec.Results)
isErr(err)
isErr(ioutil.WriteFile(config.OutputPath, raw, OsFilePermissions))
log.Printf("All results saved to %s", config.OutputPath)
}

0 comments on commit 4665092

Please sign in to comment.