// This program computes quick MD5-based fingerprints for files listed in the
// files.inventory ClickHouse table. For the server given with -server it
// looks up every size shared by more than one not-yet-hashed row, hashes the
// corresponding files on the local filesystem and writes the hash back to the
// table.
package main

import (
	"crypto/md5"
	"database/sql"
	"errors"
	"flag"
	"fmt"
	"io"
	"os"
	"path/filepath"

	_ "github.com/ClickHouse/clickhouse-go/v2"
)

// chunkSize is the number of bytes quickHash samples from the head of a file
// and, for files larger than two chunks, from its tail.
const chunkSize = 64 * 1024

var (
	serverName = flag.String("server", "", "Server name to process")
	chHost     = flag.String("ch", "192.168.1.253:9000", "ClickHouse host:port")
	verbose    = flag.Bool("v", false, "Verbose output")
)

// inventoryFile identifies one row of files.inventory by its folder and
// filename columns.
type inventoryFile struct {
	folder   string
	filename string
}

// readChunk fills buf from r as far as possible and returns the number of
// bytes read. Reaching the end of the input before buf is full is not an
// error; any other read failure is returned.
func readChunk(r io.Reader, buf []byte) (int, error) {
	n, err := io.ReadFull(r, buf)
	if errors.Is(err, io.EOF) || errors.Is(err, io.ErrUnexpectedEOF) {
		return n, nil
	}
	return n, err
}

// quickHash returns a hex-encoded MD5 digest computed over a sample of the
// file at path: the first chunkSize bytes and, when the file is larger than
// 2*chunkSize bytes, also the last chunkSize bytes. Files shorter than
// chunkSize are hashed in full.
//
// The digest is a cheap duplicate-candidate fingerprint, not a full content
// hash: files that differ only outside the sampled regions produce the same
// value.
//
// It returns an error if the file cannot be opened, inspected, positioned or
// read.
func quickHash(path string) (string, error) {
	f, err := os.Open(path)
	if err != nil {
		return "", err
	}
	defer f.Close()

	stat, err := f.Stat()
	if err != nil {
		return "", err
	}

	h := md5.New()
	buf := make([]byte, chunkSize)

	// io.ReadFull (via readChunk) guards against short reads, which a bare
	// Read is allowed to return and which would make the digest depend on
	// how the OS happened to split the read.
	n, err := readChunk(f, buf)
	if err != nil {
		return "", err
	}
	h.Write(buf[:n])

	// Only sample the tail when it cannot overlap the head sample.
	if stat.Size() > 2*chunkSize {
		if _, err := f.Seek(-chunkSize, io.SeekEnd); err != nil {
			return "", err
		}
		n, err = readChunk(f, buf)
		if err != nil {
			return "", err
		}
		h.Write(buf[:n])
	}

	return fmt.Sprintf("%x", h.Sum(nil)), nil
}

// duplicateSizes returns, largest first, every non-zero size that occurs on
// more than one unhashed (hash = '') row of files.inventory for server.
func duplicateSizes(db *sql.DB, server string) ([]int64, error) {
	rows, err := db.Query(`
		SELECT DISTINCT size
		FROM files.inventory
		WHERE server = ? AND hash = '' AND size > 0
		GROUP BY size
		HAVING count(*) > 1
		ORDER BY size DESC
	`, server)
	if err != nil {
		return nil, err
	}
	defer rows.Close()

	var sizes []int64
	for rows.Next() {
		var size int64
		if err := rows.Scan(&size); err != nil {
			return nil, err
		}
		sizes = append(sizes, size)
	}
	// Next returns false both at the end of the result set and on failure;
	// Err distinguishes the two.
	if err := rows.Err(); err != nil {
		return nil, err
	}
	return sizes, nil
}

// unhashedFiles returns the folder/filename pairs of all unhashed rows of
// files.inventory for server that have exactly the given size.
func unhashedFiles(db *sql.DB, server string, size int64) ([]inventoryFile, error) {
	rows, err := db.Query(`
		SELECT folder, filename
		FROM files.inventory
		WHERE server = ? AND size = ? AND hash = ''
	`, server, size)
	if err != nil {
		return nil, err
	}
	defer rows.Close()

	var files []inventoryFile
	for rows.Next() {
		var f inventoryFile
		if err := rows.Scan(&f.folder, &f.filename); err != nil {
			return nil, err
		}
		files = append(files, f)
	}
	if err := rows.Err(); err != nil {
		return nil, err
	}
	return files, nil
}

// storeHash writes hash into the files.inventory row identified by server,
// folder and filename.
//
// NOTE(review): ALTER TABLE ... UPDATE is a ClickHouse mutation; mutations
// are normally applied asynchronously, so a row may still show hash = ''
// for a while after this returns - confirm against the server settings.
func storeHash(db *sql.DB, server string, f inventoryFile, hash string) error {
	_, err := db.Exec(`
		ALTER TABLE files.inventory
		UPDATE hash = ?
		WHERE server = ? AND folder = ? AND filename = ?
	`, hash, server, f.folder, f.filename)
	return err
}

// run parses the command-line flags, hashes every duplicate-size candidate
// of the selected server and returns the process exit code (0 on success,
// 1 on a fatal error). Failures affecting a single size or file are
// reported and skipped rather than aborting the whole run.
func run() int {
	flag.Parse()
	if *serverName == "" {
		fmt.Fprintln(os.Stderr, "Server name required: -server ")
		return 1
	}

	dsn := fmt.Sprintf("clickhouse://%s/files", *chHost)
	db, err := sql.Open("clickhouse", dsn)
	if err != nil {
		fmt.Fprintf(os.Stderr, "DB error: %v\n", err)
		return 1
	}
	defer db.Close()

	// Find sizes that appear more than once (potential dupes).
	sizes, err := duplicateSizes(db, *serverName)
	if err != nil {
		fmt.Fprintf(os.Stderr, "Query error: %v\n", err)
		return 1
	}
	fmt.Printf("Found %d file sizes with potential duplicates\n", len(sizes))

	var totalHashed int64
	for _, size := range sizes {
		// The candidate list is read completely before hashing starts so
		// that no SELECT cursor is held open while files are read and
		// mutations are issued.
		files, err := unhashedFiles(db, *serverName, size)
		if err != nil {
			fmt.Fprintf(os.Stderr, "Query error for size %d: %v\n", size, err)
			continue
		}

		for _, f := range files {
			fullPath := filepath.Join(f.folder, f.filename)

			hash, err := quickHash(fullPath)
			if err != nil {
				if *verbose {
					fmt.Fprintf(os.Stderr, "Hash error %s: %v\n", fullPath, err)
				}
				continue
			}

			if err := storeHash(db, *serverName, f, hash); err != nil {
				if *verbose {
					fmt.Fprintf(os.Stderr, "Update error: %v\n", err)
				}
				continue
			}

			totalHashed++
			if *verbose {
				fmt.Printf("Hashed: %s -> %s\n", fullPath, hash)
			}
		}
	}

	fmt.Printf("Hashed %d files\n", totalHashed)
	return 0
}

// main delegates to run so that deferred cleanup (closing the database
// handle) executes before the process exits with a non-zero status.
func main() {
	os.Exit(run())
}