package main import ( "crypto/md5" "database/sql" "flag" "fmt" "io" "os" "path/filepath" "runtime" "strings" "time" _ "github.com/ClickHouse/clickhouse-go/v2" ) var ( serverName = flag.String("server", "", "Server name (required)") rootPath = flag.String("path", ".", "Path to scan") chHost = flag.String("ch", "192.168.1.253:9000", "ClickHouse host:port") source = flag.String("source", "local", "Source type: local, gdrive, onedrive, proton") dryRun = flag.Bool("dry-run", false, "Print files without inserting to DB") verbose = flag.Bool("v", false, "Verbose output") ) // Platform-specific directories to exclude (conservative - only true junk) var excludeDirs = map[string]bool{ // Windows system "$RECYCLE.BIN": true, "System Volume Information": true, "Windows": true, // macOS system ".Spotlight-V100": true, ".fseventsd": true, // Linux system "proc": true, "sys": true, "dev": true, "run": true, "lost+found": true, // Dev artifacts (large, reproducible) "node_modules": true, ".git": true, "__pycache__": true, } // Additional root-level excludes per OS func shouldExcludeRoot(path string) bool { switch runtime.GOOS { case "darwin": excludeRoots := []string{"/System", "/Library", "/private/var", "/Volumes/.timemachine"} for _, ex := range excludeRoots { if strings.HasPrefix(path, ex) { return true } } case "linux": excludeRoots := []string{"/proc", "/sys", "/dev", "/run", "/snap", "/boot"} for _, ex := range excludeRoots { if strings.HasPrefix(path, ex) { return true } } } return false } type FileEntry struct { Server string Source string Folder string Filename string Ext string Size int64 Created time.Time Modified time.Time Hash string } var currentFolder string func truncatePath(path string, maxLen int) string { if len(path) <= maxLen { return path } // Show beginning and end half := (maxLen - 3) / 2 return path[:half] + "..." + path[len(path)-half:] } func quickHash(path string) (string, error) { f, err := os.Open(path) if err != nil { return "", err } defer f.Close() stat, err := f.Stat() if err != nil { return "", err } size := stat.Size() h := md5.New() buf := make([]byte, 65536) // First 64KB n, err := f.Read(buf) if err != nil && err != io.EOF { return "", err } h.Write(buf[:n]) // Last 64KB (if file > 128KB) if size > 131072 { _, err = f.Seek(-65536, io.SeekEnd) if err != nil { return "", err } n, err = f.Read(buf) if err != nil && err != io.EOF { return "", err } h.Write(buf[:n]) } return fmt.Sprintf("%x", h.Sum(nil)), nil } func getFileTimes(info os.FileInfo, path string) (created, modified time.Time) { modified = info.ModTime() created = modified // fallback - platform-specific code can override // Note: Getting birth time is OS-specific and complex // For MVP, we use modified time for both return } func scanFiles(root string, entries chan<- FileEntry) error { return filepath.Walk(root, func(path string, info os.FileInfo, err error) error { if err != nil { if *verbose { fmt.Fprintf(os.Stderr, "Error accessing %s: %v\n", path, err) } return nil // continue scanning } // Skip excluded directories if info.IsDir() { name := info.Name() if excludeDirs[name] { currentFolder = "[skip] " + path return filepath.SkipDir } if shouldExcludeRoot(path) { currentFolder = "[skip] " + path return filepath.SkipDir } if strings.Contains(strings.ToLower(path), "cache") { currentFolder = "[skip] " + path return filepath.SkipDir } currentFolder = path return nil } // Skip symlinks if info.Mode()&os.ModeSymlink != 0 { return nil } // Skip empty files if info.Size() == 0 { return nil } created, modified := getFileTimes(info, path) ext := strings.ToLower(filepath.Ext(info.Name())) if ext != "" { ext = ext[1:] // remove leading dot } entry := FileEntry{ Server: *serverName, Source: *source, Folder: filepath.Dir(path), Filename: info.Name(), Ext: ext, Size: info.Size(), Created: created, Modified: modified, } entries <- entry return nil }) } func initDB(host string) (*sql.DB, error) { dsn := fmt.Sprintf("clickhouse://%s/files", host) db, err := sql.Open("clickhouse", dsn) if err != nil { return nil, err } // Create database and table _, err = db.Exec(`CREATE DATABASE IF NOT EXISTS files`) if err != nil { return nil, fmt.Errorf("create database: %w", err) } _, err = db.Exec(` CREATE TABLE IF NOT EXISTS files.inventory ( scan_id String, scan_time DateTime64(3), server LowCardinality(String), source LowCardinality(String), folder String, filename String, ext LowCardinality(String), size UInt64, created DateTime64(3), modified DateTime64(3), hash String DEFAULT '' ) ENGINE = MergeTree ORDER BY (server, folder, filename) `) if err != nil { return nil, fmt.Errorf("create table: %w", err) } return db, nil } func insertBatch(db *sql.DB, scanID string, scanTime time.Time, entries []FileEntry) error { if len(entries) == 0 { return nil } tx, err := db.Begin() if err != nil { return err } stmt, err := tx.Prepare(` INSERT INTO files.inventory (scan_id, scan_time, server, source, folder, filename, ext, size, created, modified) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?) `) if err != nil { tx.Rollback() return err } defer stmt.Close() for _, e := range entries { _, err = stmt.Exec(scanID, scanTime, e.Server, e.Source, e.Folder, e.Filename, e.Ext, e.Size, e.Created, e.Modified) if err != nil { tx.Rollback() return err } } return tx.Commit() } func main() { flag.Parse() if *serverName == "" { hostname, _ := os.Hostname() *serverName = hostname } absPath, err := filepath.Abs(*rootPath) if err != nil { fmt.Fprintf(os.Stderr, "Invalid path: %v\n", err) os.Exit(1) } fmt.Printf("Scanning: %s on server %s\n", absPath, *serverName) var db *sql.DB if !*dryRun { db, err = initDB(*chHost) if err != nil { fmt.Fprintf(os.Stderr, "Database error: %v\n", err) os.Exit(1) } defer db.Close() } scanID := fmt.Sprintf("%s-%d", *serverName, time.Now().Unix()) scanTime := time.Now() entries := make(chan FileEntry, 1000) done := make(chan error) // Scanner goroutine go func() { err := scanFiles(absPath, entries) close(entries) done <- err }() // Collector var batch []FileEntry var totalFiles int64 var totalSize int64 batchSize := 1000 for entry := range entries { totalFiles++ totalSize += entry.Size if *dryRun { fmt.Printf("%s/%s (%d bytes)\n", entry.Folder, entry.Filename, entry.Size) } else { batch = append(batch, entry) if len(batch) >= batchSize { if err := insertBatch(db, scanID, scanTime, batch); err != nil { fmt.Fprintf(os.Stderr, "Insert error: %v\n", err) } batch = batch[:0] } if totalFiles%100 == 0 { folder := truncatePath(currentFolder, 90) fmt.Printf("\r%-120s", fmt.Sprintf("%d files (%.2f GB) %s", totalFiles, float64(totalSize)/(1024*1024*1024), folder)) } } } // Insert remaining if !*dryRun && len(batch) > 0 { if err := insertBatch(db, scanID, scanTime, batch); err != nil { fmt.Fprintf(os.Stderr, "Insert error: %v\n", err) } } // Wait for scanner if err := <-done; err != nil { fmt.Fprintf(os.Stderr, "Scan error: %v\n", err) } fmt.Printf("\r%-120s\r", "") // clear progress line fmt.Printf("Scan complete: %d files, %.2f GB\n", totalFiles, float64(totalSize)/(1024*1024*1024)) fmt.Printf("Scan ID: %s\n", scanID) }