filescanner/main.go

349 lines
7.6 KiB
Go
Executable File

package main
import (
"crypto/md5"
"database/sql"
"flag"
"fmt"
"io"
"os"
"path/filepath"
"runtime"
"strings"
"time"
_ "github.com/ClickHouse/clickhouse-go/v2"
)
// Command-line flags. The pointers are filled in by flag.Parse, which main
// calls before anything else reads them.
var (
	// What to scan and how the results are labelled.
	rootPath = flag.String("path", ".", "Path to scan")
	// Despite the usage text, main substitutes the OS hostname when this
	// flag is left empty.
	serverName = flag.String("server", "", "Server name (required)")
	source     = flag.String("source", "local", "Source type: local, gdrive, onedrive, proton")

	// Where the results go.
	chHost = flag.String("ch", "192.168.1.253:9000", "ClickHouse host:port")
	dryRun = flag.Bool("dry-run", false, "Print files without inserting to DB")

	// Diagnostics.
	verbose = flag.Bool("v", false, "Verbose output")
)
// excludeDirs is the set of directory names that are never descended into
// (conservative - only true junk).
//
// NOTE(review): scanFiles looks a directory's base name up in this set, so
// generic entries such as "dev", "run" or "Windows" also match ordinary
// user folders with those names below the scan root, not just the system
// directories they were added for.
var excludeDirs = func() map[string]bool {
	groups := [][]string{
		// Windows system
		{"$RECYCLE.BIN", "System Volume Information", "Windows"},
		// macOS system
		{".Spotlight-V100", ".fseventsd"},
		// Linux system
		{"proc", "sys", "dev", "run", "lost+found"},
		// Dev artifacts (large, reproducible)
		{"node_modules", ".git", "__pycache__"},
	}

	set := make(map[string]bool)
	for _, names := range groups {
		for _, name := range names {
			set[name] = true
		}
	}
	return set
}()
// shouldExcludeRoot reports whether path is one of the system roots that
// are excluded on the running OS (runtime.GOOS), or lies beneath one.
//
// Matching is done on whole path components: "/run" and "/run/user/1000"
// match the "/run" root, while a sibling such as "/runner" does not. On
// operating systems other than darwin and linux nothing is excluded.
func shouldExcludeRoot(path string) bool {
	var roots []string
	switch runtime.GOOS {
	case "darwin":
		roots = []string{"/System", "/Library", "/private/var", "/Volumes/.timemachine"}
	case "linux":
		roots = []string{"/proc", "/sys", "/dev", "/run", "/snap", "/boot"}
	}

	for _, root := range roots {
		// A bare prefix test would also match "/devel" for "/dev"; require
		// either an exact match or a separator right after the root.
		if path == root || strings.HasPrefix(path, root+"/") {
			return true
		}
	}
	return false
}
// FileEntry describes one scanned file. scanFiles builds these and sends
// them over a channel; insertBatch writes them to files.inventory.
type FileEntry struct {
// Server is copied from *serverName when the entry is built.
Server string
// Source is copied from *source when the entry is built.
Source string
// Folder is the directory part of the file's path (filepath.Dir).
Folder string
// Filename is the file's base name as reported by os.FileInfo.
Filename string
// Ext is the lower-cased extension without the leading dot, or "" when
// the name has no extension.
Ext string
// Size is the file size in bytes as reported by os.FileInfo.
Size int64
// Created and Modified come from getFileTimes.
Created time.Time
Modified time.Time
// Hash is not set by scanFiles and is not written by insertBatch.
// NOTE(review): presumably reserved for quickHash output - confirm.
Hash string
}
// currentFolder is the directory most recently visited by the scanFiles
// walk callback, prefixed with "[skip] " when that directory was excluded.
// NOTE(review): it is assigned from whichever goroutine runs scanFiles and
// is not protected by any synchronization, so code on another goroutine
// must not read it without adding a lock or handing the value over a
// channel.
var currentFolder string
// truncatePath shortens path for display so that it is at most maxLen
// characters long, keeping the beginning and the end and putting "..." in
// the middle.
//
// Length is measured in runes once the string is known to exceed maxLen
// bytes, so multi-byte UTF-8 characters are never cut in half. A path that
// already fits is returned unchanged. When maxLen is too small to hold the
// ellipsis (below 3) the leading maxLen characters are returned, and a
// non-positive maxLen yields "".
func truncatePath(path string, maxLen int) string {
	// Fast path: byte length is an upper bound on the rune count.
	if len(path) <= maxLen {
		return path
	}
	if maxLen <= 0 {
		return ""
	}

	runes := []rune(path)
	if len(runes) <= maxLen {
		return path
	}
	if maxLen < 3 {
		// No room for "..."; a plain cut keeps the result within maxLen.
		return string(runes[:maxLen])
	}

	// Show beginning and end.
	half := (maxLen - 3) / 2
	return string(runes[:half]) + "..." + string(runes[len(runes)-half:])
}
// quickHash returns a hex-encoded MD5 fingerprint of the file at path,
// computed from a sample of its contents rather than the whole file.
//
// The first 64 KiB are always hashed. If the file is larger than 128 KiB
// the last 64 KiB are hashed as well; files between 64 KiB and 128 KiB
// contribute only their first 64 KiB. The result is a cheap change/duplicate
// hint, not a content checksum.
//
// It returns an error if the file cannot be opened, stat'ed, seeked or read.
func quickHash(path string) (string, error) {
	const chunk = 64 * 1024

	f, err := os.Open(path)
	if err != nil {
		return "", err
	}
	defer f.Close()

	stat, err := f.Stat()
	if err != nil {
		return "", err
	}

	h := md5.New()
	buf := make([]byte, chunk)

	// readChunk fills buf as far as the file allows and feeds it to the
	// hash. io.ReadFull keeps reading after a short read, so the sample size
	// does not depend on how the underlying filesystem splits reads; hitting
	// the end of the file early is not an error here.
	readChunk := func() error {
		n, err := io.ReadFull(f, buf)
		if err != nil && err != io.EOF && err != io.ErrUnexpectedEOF {
			return err
		}
		h.Write(buf[:n])
		return nil
	}

	// First 64KB
	if err := readChunk(); err != nil {
		return "", err
	}

	// Last 64KB (if file > 128KB)
	if stat.Size() > 2*chunk {
		if _, err := f.Seek(-chunk, io.SeekEnd); err != nil {
			return "", err
		}
		if err := readChunk(); err != nil {
			return "", err
		}
	}

	return fmt.Sprintf("%x", h.Sum(nil)), nil
}
// getFileTimes returns the creation and modification timestamps to record
// for a file.
//
// Only the modification time from info is used: reading a true birth time
// is OS-specific, so for the MVP both results carry info.ModTime(). The
// path argument is currently unused.
func getFileTimes(info os.FileInfo, path string) (created, modified time.Time) {
	mtime := info.ModTime()
	return mtime, mtime
}
// scanFiles walks the tree rooted at root with filepath.Walk and sends one
// FileEntry on entries for every non-empty, non-symlink file it finds. The
// Hash field of the entries is left empty.
//
// Directories rejected by skipDirectory are not descended into. Errors on
// individual files or sub-directories are reported on stderr (only with -v)
// and skipped so the rest of the tree is still scanned; an error on root
// itself is returned, because nothing can be scanned in that case.
//
// The send on entries blocks while the channel is full. The caller is
// responsible for closing entries after scanFiles returns.
func scanFiles(root string, entries chan<- FileEntry) error {
	return filepath.Walk(root, func(path string, info os.FileInfo, err error) error {
		if err != nil {
			if path == root {
				// A missing or unreadable root would otherwise look like
				// a successful scan of an empty tree.
				return fmt.Errorf("accessing scan root %q: %w", path, err)
			}
			if *verbose {
				fmt.Fprintf(os.Stderr, "Error accessing %s: %v\n", path, err)
			}
			return nil // continue scanning
		}

		if info.IsDir() {
			if skipDirectory(root, path, info.Name()) {
				currentFolder = "[skip] " + path
				return filepath.SkipDir
			}
			currentFolder = path
			return nil
		}

		// Skip symlinks and empty files.
		if info.Mode()&os.ModeSymlink != 0 || info.Size() == 0 {
			return nil
		}

		created, modified := getFileTimes(info, path)
		// Extension is stored lower-cased and without the leading dot.
		ext := strings.TrimPrefix(strings.ToLower(filepath.Ext(info.Name())), ".")

		entries <- FileEntry{
			Server:   *serverName,
			Source:   *source,
			Folder:   filepath.Dir(path),
			Filename: info.Name(),
			Ext:      ext,
			Size:     info.Size(),
			Created:  created,
			Modified: modified,
		}
		return nil
	})
}

// skipDirectory reports whether the directory at path (base name name)
// should be left out of a scan that started at root.
//
// OS-level system roots are always skipped. The name-based rules - the
// excludeDirs set and any name containing "cache" - apply only below root:
// they look at the directory's own name rather than its full path, so
// neither the name of the requested root nor a component above it can
// silently empty the scan.
func skipDirectory(root, path, name string) bool {
	if shouldExcludeRoot(path) {
		return true
	}
	if path == root {
		return false
	}
	return excludeDirs[name] || strings.Contains(strings.ToLower(name), "cache")
}
// initDB opens a ClickHouse connection pool for host ("host:port") and makes
// sure the files database and the files.inventory table exist.
//
// On success the caller owns the returned *sql.DB and must Close it. On
// failure the pool is closed before returning, and the error says which
// step failed.
func initDB(host string) (*sql.DB, error) {
	// Connect without selecting a database: every statement in this file
	// uses the fully qualified files.inventory name, so none is needed.
	// NOTE(review): ClickHouse is understood to reject a connection whose
	// DSN names a database that does not exist yet, which would make the
	// CREATE DATABASE below unreachable on a fresh server - confirm against
	// the server version in use.
	dsn := fmt.Sprintf("clickhouse://%s", host)
	db, err := sql.Open("clickhouse", dsn)
	if err != nil {
		return nil, err
	}

	// Create database and table.
	steps := []struct {
		what  string
		query string
	}{
		{"create database", `CREATE DATABASE IF NOT EXISTS files`},
		{"create table", `
			CREATE TABLE IF NOT EXISTS files.inventory (
				scan_id String,
				scan_time DateTime64(3),
				server LowCardinality(String),
				source LowCardinality(String),
				folder String,
				filename String,
				ext LowCardinality(String),
				size UInt64,
				created DateTime64(3),
				modified DateTime64(3),
				hash String DEFAULT ''
			) ENGINE = MergeTree
			ORDER BY (server, folder, filename)
		`},
	}
	for _, step := range steps {
		if _, err := db.Exec(step.query); err != nil {
			// Do not leak the pool when setup fails.
			db.Close()
			return nil, fmt.Errorf("%s: %w", step.what, err)
		}
	}
	return db, nil
}
// insertBatch writes entries to files.inventory inside a single
// transaction, tagging every row with scanID and scanTime. The hash column
// is not part of the insert.
//
// An empty batch is a no-op that returns nil without touching db. If
// preparing the statement or executing any row fails, the transaction is
// rolled back and that error is returned; otherwise the result of Commit is
// returned.
func insertBatch(db *sql.DB, scanID string, scanTime time.Time, entries []FileEntry) error {
	if len(entries) == 0 {
		return nil
	}

	tx, beginErr := db.Begin()
	if beginErr != nil {
		return beginErr
	}

	insert, prepErr := tx.Prepare(`
INSERT INTO files.inventory
(scan_id, scan_time, server, source, folder, filename, ext, size, created, modified)
VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
`)
	if prepErr != nil {
		tx.Rollback()
		return prepErr
	}
	defer insert.Close()

	for i := range entries {
		row := entries[i]
		if _, execErr := insert.Exec(
			scanID, scanTime,
			row.Server, row.Source, row.Folder, row.Filename, row.Ext,
			row.Size, row.Created, row.Modified,
		); execErr != nil {
			tx.Rollback()
			return execErr
		}
	}

	return tx.Commit()
}
// main runs the scanner and exits with the status returned by runScan.
func main() {
	os.Exit(runScan())
}

// runScan parses the flags, walks the requested tree on a background
// goroutine and either prints every file (-dry-run) or inserts the entries
// into ClickHouse in batches of 1000.
//
// It returns the process exit status: 0 on success, 1 if setup fails, the
// walk reports an error, or any batch could not be inserted. Insert failures
// do not stop the scan; they are reported on stderr and summarized at the
// end. Returning the status (instead of calling os.Exit here) lets the
// deferred db.Close run on every path.
func runScan() int {
	flag.Parse()

	if *serverName == "" {
		hostname, err := os.Hostname()
		if err != nil {
			fmt.Fprintf(os.Stderr, "Cannot determine server name, pass -server: %v\n", err)
			return 1
		}
		*serverName = hostname
	}

	absPath, err := filepath.Abs(*rootPath)
	if err != nil {
		fmt.Fprintf(os.Stderr, "Invalid path: %v\n", err)
		return 1
	}
	fmt.Printf("Scanning: %s on server %s\n", absPath, *serverName)

	var db *sql.DB
	if !*dryRun {
		db, err = initDB(*chHost)
		if err != nil {
			fmt.Fprintf(os.Stderr, "Database error: %v\n", err)
			return 1
		}
		defer db.Close()
	}

	// Take the clock once so the scan ID and the stored scan time agree.
	scanTime := time.Now()
	scanID := fmt.Sprintf("%s-%d", *serverName, scanTime.Unix())

	entries := make(chan FileEntry, 1000)
	// Buffered so the scanner goroutine can always finish, even if the
	// result is read late.
	done := make(chan error, 1)

	// Scanner goroutine
	go func() {
		err := scanFiles(absPath, entries)
		close(entries)
		done <- err
	}()

	// Collector
	const batchSize = 1000
	batch := make([]FileEntry, 0, batchSize)
	var totalFiles, totalSize, failedFiles int64
	exitCode := 0

	// flush inserts the pending batch (if any) and resets it. A failed
	// batch is counted and reflected in the exit status, but scanning
	// continues with the next batch.
	flush := func() {
		if len(batch) == 0 {
			return
		}
		if err := insertBatch(db, scanID, scanTime, batch); err != nil {
			fmt.Fprintf(os.Stderr, "Insert error: %v\n", err)
			failedFiles += int64(len(batch))
			exitCode = 1
		}
		batch = batch[:0]
	}

	for entry := range entries {
		totalFiles++
		totalSize += entry.Size

		if *dryRun {
			fmt.Printf("%s/%s (%d bytes)\n", entry.Folder, entry.Filename, entry.Size)
			continue
		}

		batch = append(batch, entry)
		if len(batch) >= batchSize {
			flush()
		}
		if totalFiles%100 == 0 {
			// Use the folder of the entry just received. currentFolder is
			// written by the scanner goroutine, so reading it here would
			// be an unsynchronized cross-goroutine access.
			folder := truncatePath(entry.Folder, 90)
			fmt.Printf("\r%-120s", fmt.Sprintf("%d files (%.2f GB) %s", totalFiles, float64(totalSize)/(1024*1024*1024), folder))
		}
	}

	// Insert remaining
	if !*dryRun {
		flush()
	}

	// Wait for scanner
	if err := <-done; err != nil {
		fmt.Fprintf(os.Stderr, "Scan error: %v\n", err)
		exitCode = 1
	}

	fmt.Printf("\r%-120s\r", "") // clear progress line
	fmt.Printf("Scan complete: %d files, %.2f GB\n", totalFiles, float64(totalSize)/(1024*1024*1024))
	if failedFiles > 0 {
		fmt.Fprintf(os.Stderr, "%d files could not be inserted\n", failedFiles)
	}
	fmt.Printf("Scan ID: %s\n", scanID)
	return exitCode
}