// Listing metadata: 349 lines, 7.6 KiB, Go, executable file.

package main
|
|
|
|
import (
|
|
"crypto/md5"
|
|
"database/sql"
|
|
"flag"
|
|
"fmt"
|
|
"io"
|
|
"os"
|
|
"path/filepath"
|
|
"runtime"
|
|
"strings"
|
|
"time"
|
|
|
|
_ "github.com/ClickHouse/clickhouse-go/v2"
|
|
)
|
|
|
|
// Command-line flags. Each variable holds the pointer returned by the flag
// package, so its value is only meaningful after flag.Parse has been called.
//
// -server is optional: main substitutes the local hostname when it is left
// empty, which is why the help text advertises a default rather than calling
// the flag required.
var (
	serverName = flag.String("server", "", "Server name (default: local hostname)")
	rootPath   = flag.String("path", ".", "Path to scan")
	chHost     = flag.String("ch", "192.168.1.253:9000", "ClickHouse host:port")
	source     = flag.String("source", "local", "Source type: local, gdrive, onedrive, proton")
	dryRun     = flag.Bool("dry-run", false, "Print files without inserting to DB")
	verbose    = flag.Bool("v", false, "Verbose output")
)
|
|
|
|
// excludeDirs lists directory base names that are skipped wherever they
// appear in the tree (conservative - only true junk).
//
// Matching is by name alone, at any depth, so only names that are junk
// regardless of location belong here. The Linux pseudo-filesystems (/proc,
// /sys, /dev, /run) are deliberately NOT listed: shouldExcludeRoot already
// excludes them at the filesystem root, and matching the bare names "dev",
// "run", "sys" or "proc" anywhere would silently drop ordinary user folders
// such as ~/dev.
var excludeDirs = map[string]bool{
	// Windows system
	"$RECYCLE.BIN":              true,
	"System Volume Information": true,
	"Windows":                   true,

	// macOS system
	".Spotlight-V100": true,
	".fseventsd":      true,

	// Linux system
	"lost+found": true,

	// Dev artifacts (large, reproducible)
	"node_modules": true,
	".git":         true,
	"__pycache__":  true,
}
|
|
|
|
// shouldExcludeRoot reports whether path is, or lies beneath, one of the
// system roots excluded for the operating system this binary is running on
// (runtime.GOOS). Only "darwin" and "linux" have such roots; on any other
// OS the function always returns false.
//
// A root matches only on a whole path component: "/dev" and "/dev/null" are
// excluded, while "/devel" is not. path is compared textually, so it is
// expected to be an absolute, slash-separated path.
func shouldExcludeRoot(path string) bool {
	var roots []string
	switch runtime.GOOS {
	case "darwin":
		roots = []string{"/System", "/Library", "/private/var", "/Volumes/.timemachine"}
	case "linux":
		roots = []string{"/proc", "/sys", "/dev", "/run", "/snap", "/boot"}
	}

	for _, root := range roots {
		// Require an exact match or a separator right after the root so a
		// sibling that merely shares the prefix is not excluded.
		if path == root || strings.HasPrefix(path, root+"/") {
			return true
		}
	}
	return false
}
|
|
|
|
// FileEntry describes one scanned file. scanFiles produces the values and
// insertBatch writes them to the inventory table.
type FileEntry struct {
	Server   string    // server label taken from the -server flag
	Source   string    // source type taken from the -source flag
	Folder   string    // directory containing the file (filepath.Dir of its path)
	Filename string    // base name of the file
	Ext      string    // lower-cased extension without the leading dot; "" if none
	Size     int64     // size in bytes as reported by os.FileInfo
	Created  time.Time // creation time as returned by getFileTimes
	Modified time.Time // modification time as returned by getFileTimes
	Hash     string    // content hash; not populated by the code in this file
}
|
|
|
|
// currentFolder is the directory most recently visited by scanFiles, prefixed
// with "[skip] " when that directory was excluded. main reads it to build the
// progress line.
// NOTE(review): scanFiles runs in its own goroutine and assigns this variable
// while main reads it, with no synchronization between the two.
var currentFolder string
|
|
|
|
// truncatePath shortens path for display so that it is at most maxLen
// characters long, keeping the beginning and the end and replacing the middle
// with "...".
//
// Length is measured in runes and the cut is made on rune boundaries, so
// multi-byte UTF-8 names are never split into invalid byte sequences. A path
// that already fits is returned unchanged. When maxLen leaves no room for
// anything besides the ellipsis, the ellipsis itself is clipped to maxLen
// (the empty string for maxLen <= 0) instead of slicing with a negative
// index.
func truncatePath(path string, maxLen int) string {
	// Byte length bounds rune count, so this is a cheap fast path.
	if len(path) <= maxLen {
		return path
	}

	runes := []rune(path)
	if len(runes) <= maxLen {
		return path
	}

	const ellipsis = "..."
	if maxLen <= len(ellipsis) {
		if maxLen <= 0 {
			return ""
		}
		return ellipsis[:maxLen]
	}

	// Show beginning and end.
	half := (maxLen - len(ellipsis)) / 2
	return string(runes[:half]) + ellipsis + string(runes[len(runes)-half:])
}
|
|
|
|
// quickHash returns a hex-encoded MD5 fingerprint of the file at path that is
// cheap to compute for large files: it hashes the first 64 KiB and, only when
// the file is larger than 128 KiB, the last 64 KiB as well. Bytes in between
// are not read, so this is a change-detection fingerprint, not a digest of
// the whole content.
//
// Each chunk is read with io.ReadFull so that a short read from the OS cannot
// change which bytes end up in the hash. An error is returned if the file
// cannot be opened, stat'ed, seeked or read.
func quickHash(path string) (string, error) {
	const chunkSize = 64 * 1024

	f, err := os.Open(path)
	if err != nil {
		return "", err
	}
	defer f.Close()

	info, err := f.Stat()
	if err != nil {
		return "", err
	}

	h := md5.New()
	buf := make([]byte, chunkSize)

	// hashChunk feeds up to chunkSize bytes from the current offset into h.
	// Reaching end of file early is expected for small files and is not an
	// error.
	hashChunk := func() error {
		n, err := io.ReadFull(f, buf)
		if err != nil && err != io.EOF && err != io.ErrUnexpectedEOF {
			return err
		}
		h.Write(buf[:n])
		return nil
	}

	// First 64KB
	if err := hashChunk(); err != nil {
		return "", err
	}

	// Last 64KB (if file > 128KB), so head and tail never overlap.
	if info.Size() > 2*chunkSize {
		if _, err := f.Seek(-chunkSize, io.SeekEnd); err != nil {
			return "", err
		}
		if err := hashChunk(); err != nil {
			return "", err
		}
	}

	return fmt.Sprintf("%x", h.Sum(nil)), nil
}
|
|
|
|
// getFileTimes returns the creation and modification times to record for a
// file. Birth time is OS-specific and not implemented here, so for the MVP
// both results are the modification time reported by info. The path argument
// is accepted but not used.
func getFileTimes(info os.FileInfo, path string) (created, modified time.Time) {
	mtime := info.ModTime()
	return mtime, mtime
}
|
|
|
|
func scanFiles(root string, entries chan<- FileEntry) error {
|
|
return filepath.Walk(root, func(path string, info os.FileInfo, err error) error {
|
|
if err != nil {
|
|
if *verbose {
|
|
fmt.Fprintf(os.Stderr, "Error accessing %s: %v\n", path, err)
|
|
}
|
|
return nil // continue scanning
|
|
}
|
|
|
|
// Skip excluded directories
|
|
if info.IsDir() {
|
|
name := info.Name()
|
|
if excludeDirs[name] {
|
|
currentFolder = "[skip] " + path
|
|
return filepath.SkipDir
|
|
}
|
|
if shouldExcludeRoot(path) {
|
|
currentFolder = "[skip] " + path
|
|
return filepath.SkipDir
|
|
}
|
|
if strings.Contains(strings.ToLower(path), "cache") {
|
|
currentFolder = "[skip] " + path
|
|
return filepath.SkipDir
|
|
}
|
|
currentFolder = path
|
|
return nil
|
|
}
|
|
|
|
// Skip symlinks
|
|
if info.Mode()&os.ModeSymlink != 0 {
|
|
return nil
|
|
}
|
|
|
|
// Skip empty files
|
|
if info.Size() == 0 {
|
|
return nil
|
|
}
|
|
|
|
created, modified := getFileTimes(info, path)
|
|
ext := strings.ToLower(filepath.Ext(info.Name()))
|
|
if ext != "" {
|
|
ext = ext[1:] // remove leading dot
|
|
}
|
|
|
|
entry := FileEntry{
|
|
Server: *serverName,
|
|
Source: *source,
|
|
Folder: filepath.Dir(path),
|
|
Filename: info.Name(),
|
|
Ext: ext,
|
|
Size: info.Size(),
|
|
Created: created,
|
|
Modified: modified,
|
|
}
|
|
|
|
entries <- entry
|
|
return nil
|
|
})
|
|
}
|
|
|
|
// initDB opens a ClickHouse connection to host (host:port) and makes sure the
// files database and the files.inventory table exist.
//
// The connection is opened against the server's "default" database rather
// than "files": the DSN's database is selected when the connection is
// established, so pointing it at "files" would fail before the
// CREATE DATABASE statement below ever ran on a fresh server. Every statement
// in this file names the table as files.inventory, so the session database
// does not matter afterwards.
//
// On failure the returned *sql.DB is nil and any handle that was opened has
// been closed.
func initDB(host string) (*sql.DB, error) {
	dsn := fmt.Sprintf("clickhouse://%s/default", host)
	db, err := sql.Open("clickhouse", dsn)
	if err != nil {
		return nil, err
	}

	// Create database and table
	if _, err := db.Exec(`CREATE DATABASE IF NOT EXISTS files`); err != nil {
		db.Close()
		return nil, fmt.Errorf("create database: %w", err)
	}

	_, err = db.Exec(`
CREATE TABLE IF NOT EXISTS files.inventory (
scan_id String,
scan_time DateTime64(3),
server LowCardinality(String),
source LowCardinality(String),
folder String,
filename String,
ext LowCardinality(String),
size UInt64,
created DateTime64(3),
modified DateTime64(3),
hash String DEFAULT ''
) ENGINE = MergeTree
ORDER BY (server, folder, filename)
`)
	if err != nil {
		db.Close()
		return nil, fmt.Errorf("create table: %w", err)
	}

	return db, nil
}
|
|
|
|
func insertBatch(db *sql.DB, scanID string, scanTime time.Time, entries []FileEntry) error {
|
|
if len(entries) == 0 {
|
|
return nil
|
|
}
|
|
|
|
tx, err := db.Begin()
|
|
if err != nil {
|
|
return err
|
|
}
|
|
|
|
stmt, err := tx.Prepare(`
|
|
INSERT INTO files.inventory
|
|
(scan_id, scan_time, server, source, folder, filename, ext, size, created, modified)
|
|
VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
|
|
`)
|
|
if err != nil {
|
|
tx.Rollback()
|
|
return err
|
|
}
|
|
defer stmt.Close()
|
|
|
|
for _, e := range entries {
|
|
_, err = stmt.Exec(scanID, scanTime, e.Server, e.Source, e.Folder, e.Filename, e.Ext, e.Size, e.Created, e.Modified)
|
|
if err != nil {
|
|
tx.Rollback()
|
|
return err
|
|
}
|
|
}
|
|
|
|
return tx.Commit()
|
|
}
|
|
|
|
func main() {
|
|
flag.Parse()
|
|
|
|
if *serverName == "" {
|
|
hostname, _ := os.Hostname()
|
|
*serverName = hostname
|
|
}
|
|
|
|
absPath, err := filepath.Abs(*rootPath)
|
|
if err != nil {
|
|
fmt.Fprintf(os.Stderr, "Invalid path: %v\n", err)
|
|
os.Exit(1)
|
|
}
|
|
|
|
fmt.Printf("Scanning: %s on server %s\n", absPath, *serverName)
|
|
|
|
var db *sql.DB
|
|
if !*dryRun {
|
|
db, err = initDB(*chHost)
|
|
if err != nil {
|
|
fmt.Fprintf(os.Stderr, "Database error: %v\n", err)
|
|
os.Exit(1)
|
|
}
|
|
defer db.Close()
|
|
}
|
|
|
|
scanID := fmt.Sprintf("%s-%d", *serverName, time.Now().Unix())
|
|
scanTime := time.Now()
|
|
|
|
entries := make(chan FileEntry, 1000)
|
|
done := make(chan error)
|
|
|
|
// Scanner goroutine
|
|
go func() {
|
|
err := scanFiles(absPath, entries)
|
|
close(entries)
|
|
done <- err
|
|
}()
|
|
|
|
// Collector
|
|
var batch []FileEntry
|
|
var totalFiles int64
|
|
var totalSize int64
|
|
batchSize := 1000
|
|
|
|
for entry := range entries {
|
|
totalFiles++
|
|
totalSize += entry.Size
|
|
|
|
if *dryRun {
|
|
fmt.Printf("%s/%s (%d bytes)\n", entry.Folder, entry.Filename, entry.Size)
|
|
} else {
|
|
batch = append(batch, entry)
|
|
if len(batch) >= batchSize {
|
|
if err := insertBatch(db, scanID, scanTime, batch); err != nil {
|
|
fmt.Fprintf(os.Stderr, "Insert error: %v\n", err)
|
|
}
|
|
batch = batch[:0]
|
|
}
|
|
if totalFiles%100 == 0 {
|
|
folder := truncatePath(currentFolder, 90)
|
|
fmt.Printf("\r%-120s", fmt.Sprintf("%d files (%.2f GB) %s", totalFiles, float64(totalSize)/(1024*1024*1024), folder))
|
|
}
|
|
}
|
|
}
|
|
|
|
// Insert remaining
|
|
if !*dryRun && len(batch) > 0 {
|
|
if err := insertBatch(db, scanID, scanTime, batch); err != nil {
|
|
fmt.Fprintf(os.Stderr, "Insert error: %v\n", err)
|
|
}
|
|
}
|
|
|
|
// Wait for scanner
|
|
if err := <-done; err != nil {
|
|
fmt.Fprintf(os.Stderr, "Scan error: %v\n", err)
|
|
}
|
|
|
|
fmt.Printf("\r%-120s\r", "") // clear progress line
|
|
fmt.Printf("Scan complete: %d files, %.2f GB\n", totalFiles, float64(totalSize)/(1024*1024*1024))
|
|
fmt.Printf("Scan ID: %s\n", scanID)
|
|
}
|