filescanner/cmd/hashupdate/main.go

146 lines
2.8 KiB
Go
Executable File

package main
import (
"crypto/md5"
"database/sql"
"flag"
"fmt"
"io"
"os"
"path/filepath"
_ "github.com/ClickHouse/clickhouse-go/v2"
)
// Command-line flags. Each variable is a pointer returned by the flag package
// and holds its final value only after flag.Parse has been called.
var (
// serverName is matched against the server column of files.inventory;
// main exits with an error if it is left empty.
serverName = flag.String("server", "", "Server name to process")
// chHost is the ClickHouse host:port that is embedded in the connection DSN.
chHost = flag.String("ch", "192.168.1.253:9000", "ClickHouse host:port")
// verbose enables per-file progress and error messages.
verbose = flag.Bool("v", false, "Verbose output")
)
// quickHash returns a hex-encoded MD5 fingerprint of the file at path that is
// computed from at most the first and last 64 KiB instead of the whole file.
//
// The first 64 KiB (or the entire file when it is shorter) is always hashed.
// The final 64 KiB is additionally hashed only when the file is larger than
// 128 KiB, so the two windows never overlap. A file between 64 KiB and
// 128 KiB is therefore fingerprinted by its first 64 KiB alone.
//
// An error is returned when the file cannot be opened, inspected, seeked or
// read; in that case the returned string is empty.
func quickHash(path string) (string, error) {
	const window = 65536

	f, err := os.Open(path)
	if err != nil {
		return "", err
	}
	defer f.Close()

	stat, err := f.Stat()
	if err != nil {
		return "", err
	}

	h := md5.New()
	buf := make([]byte, window)

	// io.ReadFull keeps reading until buf is full or the file ends. A bare
	// Read is allowed to return fewer bytes without error, which would make
	// the fingerprint depend on how the read happened to be chunked.
	n, err := io.ReadFull(f, buf)
	if err != nil && err != io.EOF && err != io.ErrUnexpectedEOF {
		return "", err
	}
	h.Write(buf[:n])

	if stat.Size() > 2*window {
		if _, err := f.Seek(-window, io.SeekEnd); err != nil {
			return "", err
		}
		n, err = io.ReadFull(f, buf)
		if err != nil && err != io.EOF && err != io.ErrUnexpectedEOF {
			return "", err
		}
		h.Write(buf[:n])
	}

	return fmt.Sprintf("%x", h.Sum(nil)), nil
}
// fileRef identifies one files.inventory row by its folder and filename columns.
type fileRef struct {
	folder   string
	filename string
}

// main parses the command-line flags, validates that a server name was given,
// and runs the hash update. It exits with status 1 on a usage or fatal error.
func main() {
	flag.Parse()
	if *serverName == "" {
		fmt.Fprintln(os.Stderr, "Server name required: -server <n>")
		os.Exit(1)
	}
	// All work happens in run so that its deferred cleanup executes before
	// os.Exit, which would otherwise skip deferred calls.
	if err := run(*serverName, *chHost, *verbose); err != nil {
		fmt.Fprintln(os.Stderr, err)
		os.Exit(1)
	}
}

// run opens the ClickHouse database at host, finds file sizes shared by more
// than one unhashed inventory row for server, and stores a quick hash for each
// such file. Per-file failures are skipped (and reported when verboseOutput is
// set); a failure to open the database or to list the sizes is returned.
func run(server, host string, verboseOutput bool) error {
	dsn := fmt.Sprintf("clickhouse://%s/files", host)
	db, err := sql.Open("clickhouse", dsn)
	if err != nil {
		// Message prefix kept as-is: it is the tool's established output.
		return fmt.Errorf("DB error: %w", err)
	}
	defer db.Close()

	sizes, err := duplicateSizes(db, server)
	if err != nil {
		return fmt.Errorf("Query error: %w", err)
	}
	fmt.Printf("Found %d file sizes with potential duplicates\n", len(sizes))

	var totalHashed int64
	for _, size := range sizes {
		files, err := unhashedFiles(db, server, size)
		if err != nil {
			// Best effort: report the failure and move on to the next size.
			// The rows keep hash = '' and are picked up by a later run.
			fmt.Fprintf(os.Stderr, "Query error for size %d: %v\n", size, err)
			continue
		}
		for _, f := range files {
			if hashAndStore(db, server, f, verboseOutput) {
				totalHashed++
			}
		}
	}
	fmt.Printf("Hashed %d files\n", totalHashed)
	return nil
}

// duplicateSizes returns, largest first, every non-zero size that occurs on
// more than one unhashed inventory row for server (potential duplicates).
func duplicateSizes(db *sql.DB, server string) ([]int64, error) {
	rows, err := db.Query(`
SELECT DISTINCT size
FROM files.inventory
WHERE server = ? AND hash = '' AND size > 0
GROUP BY size
HAVING count(*) > 1
ORDER BY size DESC
`, server)
	if err != nil {
		return nil, err
	}
	defer rows.Close()

	var sizes []int64
	for rows.Next() {
		var size int64
		if err := rows.Scan(&size); err != nil {
			return nil, err
		}
		sizes = append(sizes, size)
	}
	// Next returns false on error as well as at end of data; Err tells them apart.
	if err := rows.Err(); err != nil {
		return nil, err
	}
	return sizes, nil
}

// unhashedFiles returns the unhashed inventory rows for server with the given
// size. The result set is read completely and closed before returning, so no
// query cursor is held open while the caller issues updates.
func unhashedFiles(db *sql.DB, server string, size int64) ([]fileRef, error) {
	rows, err := db.Query(`
SELECT folder, filename
FROM files.inventory
WHERE server = ? AND size = ? AND hash = ''
`, server, size)
	if err != nil {
		return nil, err
	}
	defer rows.Close()

	var files []fileRef
	for rows.Next() {
		var f fileRef
		if err := rows.Scan(&f.folder, &f.filename); err != nil {
			return nil, err
		}
		files = append(files, f)
	}
	if err := rows.Err(); err != nil {
		return nil, err
	}
	return files, nil
}

// hashAndStore computes the quick hash of f and writes it to the matching
// inventory row. It reports whether both steps succeeded; failures are printed
// to stderr only when verboseOutput is set.
func hashAndStore(db *sql.DB, server string, f fileRef, verboseOutput bool) bool {
	fullPath := filepath.Join(f.folder, f.filename)
	hash, err := quickHash(fullPath)
	if err != nil {
		if verboseOutput {
			fmt.Fprintf(os.Stderr, "Hash error %s: %v\n", fullPath, err)
		}
		return false
	}

	_, err = db.Exec(`
ALTER TABLE files.inventory
UPDATE hash = ?
WHERE server = ? AND folder = ? AND filename = ?
`, hash, server, f.folder, f.filename)
	if err != nil {
		if verboseOutput {
			fmt.Fprintf(os.Stderr, "Update error: %v\n", err)
		}
		return false
	}

	if verboseOutput {
		fmt.Printf("Hashed: %s -> %s\n", fullPath, hash)
	}
	return true
}