146 lines
2.8 KiB
Go
Executable File
146 lines
2.8 KiB
Go
Executable File
package main
|
|
|
|
import (
|
|
"crypto/md5"
|
|
"database/sql"
|
|
"flag"
|
|
"fmt"
|
|
"io"
|
|
"os"
|
|
"path/filepath"
|
|
|
|
_ "github.com/ClickHouse/clickhouse-go/v2"
|
|
)
|
|
|
|
// Command-line flags. The pointed-to values hold their defaults until
// flag.Parse runs in main.
var (
	// serverName (-server) selects which server's rows of files.inventory
	// are processed; main refuses to run when it is empty.
	serverName = flag.String("server", "", "Server name to process")

	// chHost (-ch) is the ClickHouse endpoint, written as host:port, that is
	// spliced into the connection DSN.
	chHost = flag.String("ch", "192.168.1.253:9000", "ClickHouse host:port")

	// verbose (-v) turns on per-file progress lines and per-file error
	// reports.
	verbose = flag.Bool("v", false, "Verbose output")
)
|
|
|
|
// quickHash returns a fast, partial MD5 fingerprint of the file at path as a
// lowercase hex string.
//
// Only the first 64 KiB of the file are hashed. When the file is larger than
// 128 KiB the last 64 KiB are hashed as well. Because the middle of large
// files is never read, equal fingerprints mark files as duplicate candidates
// only; they do not prove the contents are identical.
//
// An error is returned when the file cannot be opened, stat'ed, seeked, or
// read. The returned string is empty in that case.
func quickHash(path string) (string, error) {
	// chunk is the number of bytes hashed from each end of the file.
	const chunk = 64 * 1024

	f, err := os.Open(path)
	if err != nil {
		return "", err
	}
	defer f.Close()

	info, err := f.Stat()
	if err != nil {
		return "", err
	}

	h := md5.New()
	buf := make([]byte, chunk)

	// io.ReadFull keeps reading until buf is full or the file ends. A single
	// Read call may return fewer bytes than requested, which would make the
	// fingerprint depend on how the read happened to be chunked. io.EOF and
	// io.ErrUnexpectedEOF only signal a file shorter than chunk.
	n, err := io.ReadFull(f, buf)
	if err != nil && err != io.EOF && err != io.ErrUnexpectedEOF {
		return "", err
	}
	h.Write(buf[:n]) // hash.Hash.Write never returns an error

	// Head and tail chunks only stay disjoint once the file exceeds two
	// chunks; smaller files are fingerprinted by their head alone.
	if info.Size() > 2*chunk {
		if _, err := f.Seek(-chunk, io.SeekEnd); err != nil {
			return "", err
		}
		n, err = io.ReadFull(f, buf)
		if err != nil && err != io.EOF && err != io.ErrUnexpectedEOF {
			return "", err
		}
		h.Write(buf[:n])
	}

	return fmt.Sprintf("%x", h.Sum(nil)), nil
}
|
|
|
|
func main() {
|
|
flag.Parse()
|
|
|
|
if *serverName == "" {
|
|
fmt.Fprintln(os.Stderr, "Server name required: -server <n>")
|
|
os.Exit(1)
|
|
}
|
|
|
|
dsn := fmt.Sprintf("clickhouse://%s/files", *chHost)
|
|
db, err := sql.Open("clickhouse", dsn)
|
|
if err != nil {
|
|
fmt.Fprintf(os.Stderr, "DB error: %v\n", err)
|
|
os.Exit(1)
|
|
}
|
|
defer db.Close()
|
|
|
|
// Find sizes that appear more than once (potential dupes)
|
|
rows, err := db.Query(`
|
|
SELECT DISTINCT size
|
|
FROM files.inventory
|
|
WHERE server = ? AND hash = '' AND size > 0
|
|
GROUP BY size
|
|
HAVING count(*) > 1
|
|
ORDER BY size DESC
|
|
`, *serverName)
|
|
if err != nil {
|
|
fmt.Fprintf(os.Stderr, "Query error: %v\n", err)
|
|
os.Exit(1)
|
|
}
|
|
|
|
var sizes []int64
|
|
for rows.Next() {
|
|
var size int64
|
|
rows.Scan(&size)
|
|
sizes = append(sizes, size)
|
|
}
|
|
rows.Close()
|
|
|
|
fmt.Printf("Found %d file sizes with potential duplicates\n", len(sizes))
|
|
|
|
// Get files to hash
|
|
var totalHashed int64
|
|
for _, size := range sizes {
|
|
fileRows, err := db.Query(`
|
|
SELECT folder, filename
|
|
FROM files.inventory
|
|
WHERE server = ? AND size = ? AND hash = ''
|
|
`, *serverName, size)
|
|
if err != nil {
|
|
continue
|
|
}
|
|
|
|
for fileRows.Next() {
|
|
var folder, filename string
|
|
fileRows.Scan(&folder, &filename)
|
|
|
|
fullPath := filepath.Join(folder, filename)
|
|
hash, err := quickHash(fullPath)
|
|
if err != nil {
|
|
if *verbose {
|
|
fmt.Fprintf(os.Stderr, "Hash error %s: %v\n", fullPath, err)
|
|
}
|
|
continue
|
|
}
|
|
|
|
_, err = db.Exec(`
|
|
ALTER TABLE files.inventory
|
|
UPDATE hash = ?
|
|
WHERE server = ? AND folder = ? AND filename = ?
|
|
`, hash, *serverName, folder, filename)
|
|
|
|
if err != nil {
|
|
if *verbose {
|
|
fmt.Fprintf(os.Stderr, "Update error: %v\n", err)
|
|
}
|
|
continue
|
|
}
|
|
|
|
totalHashed++
|
|
if *verbose {
|
|
fmt.Printf("Hashed: %s -> %s\n", fullPath, hash)
|
|
}
|
|
}
|
|
fileRows.Close()
|
|
}
|
|
|
|
fmt.Printf("Hashed %d files\n", totalHashed)
|
|
}
|