commit e8d9a31216d694233da6889d64c2d7549e1abfa7 Author: Johan Date: Sun Feb 1 02:00:29 2026 -0500 Initial commit diff --git a/._Makefile b/._Makefile new file mode 100755 index 0000000..2565469 Binary files /dev/null and b/._Makefile differ diff --git a/._README.md b/._README.md new file mode 100755 index 0000000..2565469 Binary files /dev/null and b/._README.md differ diff --git a/._cmd b/._cmd new file mode 100755 index 0000000..2565469 Binary files /dev/null and b/._cmd differ diff --git a/._go.mod b/._go.mod new file mode 100755 index 0000000..2565469 Binary files /dev/null and b/._go.mod differ diff --git a/._main.go b/._main.go new file mode 100755 index 0000000..2565469 Binary files /dev/null and b/._main.go differ diff --git a/._queries.sql b/._queries.sql new file mode 100755 index 0000000..2565469 Binary files /dev/null and b/._queries.sql differ diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..0c64ecb --- /dev/null +++ b/.gitignore @@ -0,0 +1,31 @@ +# Binaries +*.exe +*.exe~ +*.dll +*.so +*.dylib +/bin/ +/dist/ + +# Test binary +*.test + +# Output +*.out + +# Dependency directories +vendor/ + +# IDE +.idea/ +.vscode/ +*.swp +*.swo + +# Env files +.env +.env.* + +# Databases +*.db +*.sqlite diff --git a/Makefile b/Makefile new file mode 100755 index 0000000..3f48a68 --- /dev/null +++ b/Makefile @@ -0,0 +1,62 @@ +.PHONY: all clean mac linux windows deps deploy + +LINUX_SERVERS = 192.168.1.240 192.168.1.252 192.168.1.253 192.168.1.254 +LINUX_USER = johan +LINUX_PASS = Helder06 +WIN_SERVER = 192.168.1.251 +WIN_USER = johan + +all: deps mac linux windows deploy + +deps: + go mod tidy + +mac: + GOOS=darwin GOARCH=amd64 go build -o bin/filescan-mac-amd64 . + GOOS=darwin GOARCH=arm64 go build -o bin/filescan-mac-arm64 . + GOOS=darwin GOARCH=amd64 go build -o bin/hashupdate-mac-amd64 ./cmd/hashupdate + GOOS=darwin GOARCH=arm64 go build -o bin/hashupdate-mac-arm64 ./cmd/hashupdate + +linux: + GOOS=linux GOARCH=amd64 go build -o bin/filescan-linux . + GOOS=linux GOARCH=amd64 go build -o bin/hashupdate-linux ./cmd/hashupdate + +windows: + GOOS=windows GOARCH=amd64 go build -o bin/filescan.exe . + GOOS=windows GOARCH=amd64 go build -o bin/hashupdate.exe ./cmd/hashupdate + +clean: + rm -rf bin/ + +deploy: deploy-linux deploy-windows + +deploy-linux: + @echo "Deploying to Linux servers..." + @for server in $(LINUX_SERVERS); do \ + echo " -> $$server"; \ + sshpass -p '$(LINUX_PASS)' scp -o StrictHostKeyChecking=no bin/filescan-linux bin/hashupdate-linux $(LINUX_USER)@$$server:~/ 2>/dev/null || echo " [offline]"; \ + done + +deploy-windows: + @echo "Deploying to Windows server $(WIN_SERVER)..." + @mkdir -p /tmp/251c 2>/dev/null || true + @umount /tmp/251c 2>/dev/null || true + @mount_smbfs '//johan:%21%21Lekker69@192.168.1.251/C' /tmp/251c 2>/dev/null && \ + cp bin/filescan.exe bin/hashupdate.exe /tmp/251c/Users/johan/ && \ + echo " -> $(WIN_SERVER) [ok]" && \ + umount /tmp/251c || echo " -> $(WIN_SERVER) [failed]" + +# Quick build for current platform +build: + go build -o bin/filescan . + go build -o bin/hashupdate ./cmd/hashupdate + +# Run scanner locally (example) +run: + go run . -server test -path /tmp -dry-run + +# Find duplicates query +dupes: + @echo "Run this in ClickHouse:" + @echo "SELECT hash, count(*) as cnt, groupArray(concat(folder, '/', filename)) as files" + @echo "FROM files.inventory WHERE hash != '' GROUP BY hash HAVING cnt > 1 ORDER BY cnt DESC" diff --git a/README.md b/README.md new file mode 100755 index 0000000..c6807ec --- /dev/null +++ b/README.md @@ -0,0 +1,67 @@ +# Filescanner + +Cross-platform file inventory scanner with ClickHouse backend. + +## Quick Start + +```bash +# Get dependencies +go mod tidy + +# Build all platforms +make all + +# Or build current platform only +make build +``` + +## Usage + +### Scan files +```bash +# Scan with dry-run (no DB) +./filescan -server myserver -path /home -dry-run + +# Scan to ClickHouse +./filescan -server myserver -path /home -ch 192.168.1.253:9000 + +# Verbose +./filescan -server myserver -path /home -v +``` + +### Add hashes for duplicate detection +```bash +# Only hashes files with non-unique sizes +./hashupdate -server myserver -ch 192.168.1.253:9000 +``` + +### Find duplicates +```sql +SELECT hash, count(*) as cnt, + groupArray(concat(server, ':', folder, '/', filename)) as files +FROM files.inventory +WHERE hash != '' +GROUP BY hash +HAVING cnt > 1 +ORDER BY any(size) DESC; +``` + +## Binaries + +After `make all`: +- `bin/filescan-mac-arm64` - Mac M1/M2/M3 +- `bin/filescan-mac-amd64` - Mac Intel +- `bin/filescan-linux` - Linux +- `bin/filescan.exe` - Windows + +## Excluded Directories + +Automatically skips: +- Windows: `$RECYCLE.BIN`, `Windows`, `Program Files`, `AppData`, etc. +- macOS: `.Trash`, `Library`, `.Spotlight-V100`, etc. +- Linux: `/proc`, `/sys`, `/dev`, `/run`, etc. +- Common: `node_modules`, `.git`, `__pycache__` + +## ClickHouse Schema + +See `queries.sql` for schema and useful queries. diff --git a/cmd/._hashupdate b/cmd/._hashupdate new file mode 100755 index 0000000..2565469 Binary files /dev/null and b/cmd/._hashupdate differ diff --git a/cmd/hashupdate/._main.go b/cmd/hashupdate/._main.go new file mode 100755 index 0000000..2565469 Binary files /dev/null and b/cmd/hashupdate/._main.go differ diff --git a/cmd/hashupdate/main.go b/cmd/hashupdate/main.go new file mode 100755 index 0000000..da6708a --- /dev/null +++ b/cmd/hashupdate/main.go @@ -0,0 +1,145 @@ +package main + +import ( + "crypto/md5" + "database/sql" + "flag" + "fmt" + "io" + "os" + "path/filepath" + + _ "github.com/ClickHouse/clickhouse-go/v2" +) + +var ( + serverName = flag.String("server", "", "Server name to process") + chHost = flag.String("ch", "192.168.1.253:9000", "ClickHouse host:port") + verbose = flag.Bool("v", false, "Verbose output") +) + +func quickHash(path string) (string, error) { + f, err := os.Open(path) + if err != nil { + return "", err + } + defer f.Close() + + stat, err := f.Stat() + if err != nil { + return "", err + } + size := stat.Size() + + h := md5.New() + buf := make([]byte, 65536) + + n, err := f.Read(buf) + if err != nil && err != io.EOF { + return "", err + } + h.Write(buf[:n]) + + if size > 131072 { + _, err = f.Seek(-65536, io.SeekEnd) + if err != nil { + return "", err + } + n, err = f.Read(buf) + if err != nil && err != io.EOF { + return "", err + } + h.Write(buf[:n]) + } + + return fmt.Sprintf("%x", h.Sum(nil)), nil +} + +func main() { + flag.Parse() + + if *serverName == "" { + fmt.Fprintln(os.Stderr, "Server name required: -server ") + os.Exit(1) + } + + dsn := fmt.Sprintf("clickhouse://%s/files", *chHost) + db, err := sql.Open("clickhouse", dsn) + if err != nil { + fmt.Fprintf(os.Stderr, "DB error: %v\n", err) + os.Exit(1) + } + defer db.Close() + + // Find sizes that appear more than once (potential dupes) + rows, err := db.Query(` + SELECT DISTINCT size + FROM files.inventory + WHERE server = ? AND hash = '' AND size > 0 + GROUP BY size + HAVING count(*) > 1 + ORDER BY size DESC + `, *serverName) + if err != nil { + fmt.Fprintf(os.Stderr, "Query error: %v\n", err) + os.Exit(1) + } + + var sizes []int64 + for rows.Next() { + var size int64 + rows.Scan(&size) + sizes = append(sizes, size) + } + rows.Close() + + fmt.Printf("Found %d file sizes with potential duplicates\n", len(sizes)) + + // Get files to hash + var totalHashed int64 + for _, size := range sizes { + fileRows, err := db.Query(` + SELECT folder, filename + FROM files.inventory + WHERE server = ? AND size = ? AND hash = '' + `, *serverName, size) + if err != nil { + continue + } + + for fileRows.Next() { + var folder, filename string + fileRows.Scan(&folder, &filename) + + fullPath := filepath.Join(folder, filename) + hash, err := quickHash(fullPath) + if err != nil { + if *verbose { + fmt.Fprintf(os.Stderr, "Hash error %s: %v\n", fullPath, err) + } + continue + } + + _, err = db.Exec(` + ALTER TABLE files.inventory + UPDATE hash = ? + WHERE server = ? AND folder = ? AND filename = ? + `, hash, *serverName, folder, filename) + + if err != nil { + if *verbose { + fmt.Fprintf(os.Stderr, "Update error: %v\n", err) + } + continue + } + + totalHashed++ + if *verbose { + fmt.Printf("Hashed: %s -> %s\n", fullPath, hash) + } + } + fileRows.Close() + } + + fmt.Printf("Hashed %d files\n", totalHashed) +} diff --git a/go.mod b/go.mod new file mode 100755 index 0000000..f09eae2 --- /dev/null +++ b/go.mod @@ -0,0 +1,23 @@ +module filescanner + +go 1.22 + +require github.com/ClickHouse/clickhouse-go/v2 v2.23.0 + +require ( + github.com/ClickHouse/ch-go v0.61.5 // indirect + github.com/andybalholm/brotli v1.1.0 // indirect + github.com/go-faster/city v1.0.1 // indirect + github.com/go-faster/errors v0.7.1 // indirect + github.com/google/uuid v1.6.0 // indirect + github.com/klauspost/compress v1.17.7 // indirect + github.com/paulmach/orb v0.11.1 // indirect + github.com/pierrec/lz4/v4 v4.1.21 // indirect + github.com/pkg/errors v0.9.1 // indirect + github.com/segmentio/asm v1.2.0 // indirect + github.com/shopspring/decimal v1.3.1 // indirect + go.opentelemetry.io/otel v1.24.0 // indirect + go.opentelemetry.io/otel/trace v1.24.0 // indirect + golang.org/x/sys v0.18.0 // indirect + gopkg.in/yaml.v3 v3.0.1 // indirect +) diff --git a/go.sum b/go.sum new file mode 100755 index 0000000..c03066c --- /dev/null +++ b/go.sum @@ -0,0 +1,110 @@ +github.com/ClickHouse/ch-go v0.61.5 h1:zwR8QbYI0tsMiEcze/uIMK+Tz1D3XZXLdNrlaOpeEI4= +github.com/ClickHouse/ch-go v0.61.5/go.mod h1:s1LJW/F/LcFs5HJnuogFMta50kKDO0lf9zzfrbl0RQg= +github.com/ClickHouse/clickhouse-go/v2 v2.23.0 h1:srmRrkS0BR8gEut87u8jpcZ7geOob6nGj9ifrb+aKmg= +github.com/ClickHouse/clickhouse-go/v2 v2.23.0/go.mod h1:tBhdF3f3RdP7sS59+oBAtTyhWpy0024ZxDMhgxra0QE= +github.com/andybalholm/brotli v1.1.0 h1:eLKJA0d02Lf0mVpIDgYnqXcUn0GqVmEFny3VuID1U3M= +github.com/andybalholm/brotli v1.1.0/go.mod h1:sms7XGricyQI9K10gOSf56VKKWS4oLer58Q+mhRPtnY= +github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= +github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c= +github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= +github.com/go-faster/city v1.0.1 h1:4WAxSZ3V2Ws4QRDrscLEDcibJY8uf41H6AhXDrNDcGw= +github.com/go-faster/city v1.0.1/go.mod h1:jKcUJId49qdW3L1qKHH/3wPeUstCVpVSXTM6vO3VcTw= +github.com/go-faster/errors v0.7.1 h1:MkJTnDoEdi9pDabt1dpWf7AA8/BaSYZqibYyhZ20AYg= +github.com/go-faster/errors v0.7.1/go.mod h1:5ySTjWFiphBs07IKuiL69nxdfd5+fzh1u7FPGZP2quo= +github.com/gogo/protobuf v1.3.2/go.mod h1:P1XiOD3dCwIKUDQYPy72D8LYyHL2YPYrpS2s69NZV8Q= +github.com/golang/protobuf v1.5.0/go.mod h1:FsONVRAS9T7sI+LIUmWTfcYkHO4aIWwzhcaSAoJOfIk= +github.com/golang/snappy v0.0.1/go.mod h1:/XxbfmMg8lxefKM7IXC3fBNl/7bRcc72aCRzEWrmP2Q= +github.com/google/go-cmp v0.5.2/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE= +github.com/google/go-cmp v0.5.5/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE= +github.com/google/go-cmp v0.6.0 h1:ofyhxvXcZhMsU5ulbFiLKl/XBFqE1GSq7atu8tAmTRI= +github.com/google/go-cmp v0.6.0/go.mod h1:17dUlkBOakJ0+DkrSSNjCkIjxS6bF9zb3elmeNGIjoY= +github.com/google/uuid v1.6.0 h1:NIvaJDMOsjHA8n1jAhLSgzrAzy1Hgr+hNrb57e+94F0= +github.com/google/uuid v1.6.0/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo= +github.com/kisielk/errcheck v1.5.0/go.mod h1:pFxgyoBC7bSaBwPgfKdkLd5X25qrDl4LWUI2bnpBCr8= +github.com/kisielk/gotool v1.0.0/go.mod h1:XhKaO+MFFWcvkIS/tQcRk01m1F5IRFswLeQ+oQHNcck= +github.com/klauspost/compress v1.13.6/go.mod h1:/3/Vjq9QcHkK5uEr5lBEmyoZ1iFhe47etQ6QUkpK6sk= +github.com/klauspost/compress v1.17.7 h1:ehO88t2UGzQK66LMdE8tibEd1ErmzZjNEqWkjLAKQQg= +github.com/klauspost/compress v1.17.7/go.mod h1:Di0epgTjJY877eYKx5yC51cX2A2Vl2ibi7bDH9ttBbw= +github.com/kr/pretty v0.1.0/go.mod h1:dAy3ld7l9f0ibDNOQOHHMYYIIbhfbHSm3C4ZsoJORNo= +github.com/kr/pretty v0.3.1 h1:flRD4NNwYAUpkphVc1HcthR4KEIFJ65n8Mw5qdRn3LE= +github.com/kr/pretty v0.3.1/go.mod h1:hoEshYVHaxMs3cyo3Yncou5ZscifuDolrwPKZanG3xk= +github.com/kr/pty v1.1.1/go.mod h1:pFQYn66WHrOpPYNljwOMqo10TkYh1fy3cYio2l3bCsQ= +github.com/kr/text v0.1.0 h1:45sCR5RtlFHMR4UwH9sdQ5TC8v0qDQCHnXt+kaKSTVE= +github.com/kr/text v0.1.0/go.mod h1:4Jbv+DJW3UT/LiOwJeYQe1efqtUx/iVham/4vfdArNI= +github.com/montanaflynn/stats v0.0.0-20171201202039-1bf9dbcd8cbe/go.mod h1:wL8QJuTMNUDYhXwkmfOly8iTdp5TEcJFWZD2D7SIkUc= +github.com/paulmach/orb v0.11.1 h1:3koVegMC4X/WeiXYz9iswopaTwMem53NzTJuTF20JzU= +github.com/paulmach/orb v0.11.1/go.mod h1:5mULz1xQfs3bmQm63QEJA6lNGujuRafwA5S/EnuLaLU= +github.com/paulmach/protoscan v0.2.1/go.mod h1:SpcSwydNLrxUGSDvXvO0P7g7AuhJ7lcKfDlhJCDw2gY= +github.com/pierrec/lz4/v4 v4.1.21 h1:yOVMLb6qSIDP67pl/5F7RepeKYu/VmTyEXvuMI5d9mQ= +github.com/pierrec/lz4/v4 v4.1.21/go.mod h1:gZWDp/Ze/IJXGXf23ltt2EXimqmTUXEy0GFuRQyBid4= +github.com/pkg/errors v0.9.1 h1:FEBLx1zS214owpjy7qsBeixbURkuhQAwrK5UwLGTwt4= +github.com/pkg/errors v0.9.1/go.mod h1:bwawxfHBFNV+L2hUp1rHADufV3IMtnDRdf1r5NINEl0= +github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM= +github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= +github.com/rogpeppe/go-internal v1.10.0 h1:TMyTOH3F/DB16zRVcYyreMH6GnZZrwQVAoYjRBZyWFQ= +github.com/rogpeppe/go-internal v1.10.0/go.mod h1:UQnix2H7Ngw/k4C5ijL5+65zddjncjaFoBhdsK/akog= +github.com/segmentio/asm v1.2.0 h1:9BQrFxC+YOHJlTlHGkTrFWf59nbL3XnCoFLTwDCI7ys= +github.com/segmentio/asm v1.2.0/go.mod h1:BqMnlJP91P8d+4ibuonYZw9mfnzI9HfxselHZr5aAcs= +github.com/shopspring/decimal v1.3.1 h1:2Usl1nmF/WZucqkFZhnfFYxxxu8LG21F6nPQBE5gKV8= +github.com/shopspring/decimal v1.3.1/go.mod h1:DKyhrW/HYNuLGql+MJL6WCR6knT2jwCFRcu2hWCYk4o= +github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME= +github.com/stretchr/testify v1.6.1/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg= +github.com/stretchr/testify v1.9.0 h1:HtqpIVDClZ4nwg75+f6Lvsy/wHu+3BoSGCbBAcpTsTg= +github.com/stretchr/testify v1.9.0/go.mod h1:r2ic/lqez/lEtzL7wO/rwa5dbSLXVDPFyf8C91i36aY= +github.com/tidwall/pretty v1.0.0/go.mod h1:XNkn88O1ChpSDQmQeStsy+sBenx6DDtFZJxhVysOjyk= +github.com/xdg-go/pbkdf2 v1.0.0/go.mod h1:jrpuAogTd400dnrH08LKmI/xc1MbPOebTwRqcT5RDeI= +github.com/xdg-go/scram v1.1.1/go.mod h1:RaEWvsqvNKKvBPvcKeFjrG2cJqOkHTiyTpzz23ni57g= +github.com/xdg-go/stringprep v1.0.3/go.mod h1:W3f5j4i+9rC0kuIEJL0ky1VpHXQU3ocBgklLGvcBnW8= +github.com/youmark/pkcs8 v0.0.0-20181117223130-1be2e3e5546d/go.mod h1:rHwXgn7JulP+udvsHwJoVG1YGAP6VLg4y9I5dyZdqmA= +github.com/yuin/goldmark v1.1.27/go.mod h1:3hX8gzYuyVAZsxl0MRgGTJEmQBFcNTphYh9decYSb74= +github.com/yuin/goldmark v1.2.1/go.mod h1:3hX8gzYuyVAZsxl0MRgGTJEmQBFcNTphYh9decYSb74= +go.mongodb.org/mongo-driver v1.11.4/go.mod h1:PTSz5yu21bkT/wXpkS7WR5f0ddqw5quethTUn9WM+2g= +go.opentelemetry.io/otel v1.24.0 h1:0LAOdjNmQeSTzGBzduGe/rU4tZhMwL5rWgtp9Ku5Jfo= +go.opentelemetry.io/otel v1.24.0/go.mod h1:W7b9Ozg4nkF5tWI5zsXkaKKDjdVjpD4oAt9Qi/MArHo= +go.opentelemetry.io/otel/trace v1.24.0 h1:CsKnnL4dUAr/0llH9FKuc698G04IrpWV0MQA/Y1YELI= +go.opentelemetry.io/otel/trace v1.24.0/go.mod h1:HPc3Xr/cOApsBI154IU0OI0HJexz+aw5uPdbs3UCjNU= +golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w= +golang.org/x/crypto v0.0.0-20191011191535-87dc89f01550/go.mod h1:yigFU9vqHzYiE8UmvKecakEJjdnWj3jj499lnFckfCI= +golang.org/x/crypto v0.0.0-20200622213623-75b288015ac9/go.mod h1:LzIPMQfyMNhhGPhUkYOs5KpL4U8rLKemX1yGLhDgUto= +golang.org/x/crypto v0.0.0-20220622213112-05595931fe9d/go.mod h1:IxCIyHEi3zRg3s0A5j5BB6A9Jmi73HwBIUl50j+osU4= +golang.org/x/mod v0.2.0/go.mod h1:s0Qsj1ACt9ePp/hMypM3fl4fZqREWJwdYDEqhRiZZUA= +golang.org/x/mod v0.3.0/go.mod h1:s0Qsj1ACt9ePp/hMypM3fl4fZqREWJwdYDEqhRiZZUA= +golang.org/x/net v0.0.0-20190404232315-eb5bcb51f2a3/go.mod h1:t9HGtf8HONx5eT2rtn7q6eTqICYqUVnKs3thJo3Qplg= +golang.org/x/net v0.0.0-20190620200207-3b0461eec859/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= +golang.org/x/net v0.0.0-20200226121028-0de0cce0169b/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= +golang.org/x/net v0.0.0-20201021035429-f5854403a974/go.mod h1:sp8m0HH+o8qH0wwXwYZr8TS3Oi6o0r6Gce1SSxlDquU= +golang.org/x/net v0.0.0-20211112202133-69e39bad7dc2/go.mod h1:9nx3DQGgdP8bBQD5qxJ1jj9UTztislL4KSBs9R2vV5Y= +golang.org/x/sync v0.0.0-20190423024810-112230192c58/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= +golang.org/x/sync v0.0.0-20190911185100-cd5d95a43a6e/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= +golang.org/x/sync v0.0.0-20201020160332-67f06af15bc9/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= +golang.org/x/sync v0.0.0-20210220032951-036812b2e83c/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= +golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= +golang.org/x/sys v0.0.0-20190412213103-97732733099d/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= +golang.org/x/sys v0.0.0-20200930185726-fdedc70b468f/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= +golang.org/x/sys v0.0.0-20201119102817-f84b799fce68/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= +golang.org/x/sys v0.0.0-20210423082822-04245dca01da/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= +golang.org/x/sys v0.0.0-20210615035016-665e8c7367d1/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= +golang.org/x/sys v0.18.0 h1:DBdB3niSjOA/O0blCZBqDefyWNYveAYMNF1Wum0DYQ4= +golang.org/x/sys v0.18.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA= +golang.org/x/term v0.0.0-20201126162022-7de9c90e9dd1/go.mod h1:bj7SfCRtBDWHUb9snDiAeCFNEtKQo2Wmx5Cou7ajbmo= +golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ= +golang.org/x/text v0.3.3/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ= +golang.org/x/text v0.3.6/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ= +golang.org/x/text v0.3.7/go.mod h1:u+2+/6zg+i71rQMx5EYifcz6MCKuco9NR6JIITiCfzQ= +golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ= +golang.org/x/tools v0.0.0-20191119224855-298f0cb1881e/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo= +golang.org/x/tools v0.0.0-20200619180055-7c47624df98f/go.mod h1:EkVYQZoAsY45+roYkvgYkIh4xh/qjgUK9TdY2XT94GE= +golang.org/x/tools v0.0.0-20210106214847-113979e3529a/go.mod h1:emZCQorbCU4vsT4fOWvOPXz4eW1wZW4PmDk9uLelYpA= +golang.org/x/xerrors v0.0.0-20190717185122-a985d3407aa7/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= +golang.org/x/xerrors v0.0.0-20191011141410-1b5146add898/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= +golang.org/x/xerrors v0.0.0-20191204190536-9bdfabe68543/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= +golang.org/x/xerrors v0.0.0-20200804184101-5ec99f83aff1/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= +google.golang.org/protobuf v1.26.0-rc.1/go.mod h1:jlhhOSvTdKEhbULTjvd4ARK9grFBp09yW+WbY/TyQbw= +google.golang.org/protobuf v1.27.1/go.mod h1:9q0QmTI4eRPtz6boOQmLYwt+qCgq0jsYwAQnmE0givc= +gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= +gopkg.in/check.v1 v1.0.0-20180628173108-788fd7840127/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= +gopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c h1:Hei/4ADfdWqJk1ZMxUNpqntNwaWcugrBjAiHlqqRiVk= +gopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c/go.mod h1:JHkPIbrfpd72SG/EVd6muEfDQjcINNoR0C8j2r3qZ4Q= +gopkg.in/yaml.v3 v3.0.0-20200313102051-9f266ea9e77c/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= +gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA= +gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= diff --git a/main.go b/main.go new file mode 100755 index 0000000..7a52d76 --- /dev/null +++ b/main.go @@ -0,0 +1,348 @@ +package main + +import ( + "crypto/md5" + "database/sql" + "flag" + "fmt" + "io" + "os" + "path/filepath" + "runtime" + "strings" + "time" + + _ "github.com/ClickHouse/clickhouse-go/v2" +) + +var ( + serverName = flag.String("server", "", "Server name (required)") + rootPath = flag.String("path", ".", "Path to scan") + chHost = flag.String("ch", "192.168.1.253:9000", "ClickHouse host:port") + source = flag.String("source", "local", "Source type: local, gdrive, onedrive, proton") + dryRun = flag.Bool("dry-run", false, "Print files without inserting to DB") + verbose = flag.Bool("v", false, "Verbose output") +) + +// Platform-specific directories to exclude (conservative - only true junk) +var excludeDirs = map[string]bool{ + // Windows system + "$RECYCLE.BIN": true, + "System Volume Information": true, + "Windows": true, + // macOS system + ".Spotlight-V100": true, + ".fseventsd": true, + // Linux system + "proc": true, + "sys": true, + "dev": true, + "run": true, + "lost+found": true, + // Dev artifacts (large, reproducible) + "node_modules": true, + ".git": true, + "__pycache__": true, +} + +// Additional root-level excludes per OS +func shouldExcludeRoot(path string) bool { + switch runtime.GOOS { + case "darwin": + excludeRoots := []string{"/System", "/Library", "/private/var", "/Volumes/.timemachine"} + for _, ex := range excludeRoots { + if strings.HasPrefix(path, ex) { + return true + } + } + case "linux": + excludeRoots := []string{"/proc", "/sys", "/dev", "/run", "/snap", "/boot"} + for _, ex := range excludeRoots { + if strings.HasPrefix(path, ex) { + return true + } + } + } + return false +} + +type FileEntry struct { + Server string + Source string + Folder string + Filename string + Ext string + Size int64 + Created time.Time + Modified time.Time + Hash string +} + +var currentFolder string + +func truncatePath(path string, maxLen int) string { + if len(path) <= maxLen { + return path + } + // Show beginning and end + half := (maxLen - 3) / 2 + return path[:half] + "..." + path[len(path)-half:] +} + +func quickHash(path string) (string, error) { + f, err := os.Open(path) + if err != nil { + return "", err + } + defer f.Close() + + stat, err := f.Stat() + if err != nil { + return "", err + } + size := stat.Size() + + h := md5.New() + buf := make([]byte, 65536) + + // First 64KB + n, err := f.Read(buf) + if err != nil && err != io.EOF { + return "", err + } + h.Write(buf[:n]) + + // Last 64KB (if file > 128KB) + if size > 131072 { + _, err = f.Seek(-65536, io.SeekEnd) + if err != nil { + return "", err + } + n, err = f.Read(buf) + if err != nil && err != io.EOF { + return "", err + } + h.Write(buf[:n]) + } + + return fmt.Sprintf("%x", h.Sum(nil)), nil +} + +func getFileTimes(info os.FileInfo, path string) (created, modified time.Time) { + modified = info.ModTime() + created = modified // fallback - platform-specific code can override + // Note: Getting birth time is OS-specific and complex + // For MVP, we use modified time for both + return +} + +func scanFiles(root string, entries chan<- FileEntry) error { + return filepath.Walk(root, func(path string, info os.FileInfo, err error) error { + if err != nil { + if *verbose { + fmt.Fprintf(os.Stderr, "Error accessing %s: %v\n", path, err) + } + return nil // continue scanning + } + + // Skip excluded directories + if info.IsDir() { + name := info.Name() + if excludeDirs[name] { + currentFolder = "[skip] " + path + return filepath.SkipDir + } + if shouldExcludeRoot(path) { + currentFolder = "[skip] " + path + return filepath.SkipDir + } + if strings.Contains(strings.ToLower(path), "cache") { + currentFolder = "[skip] " + path + return filepath.SkipDir + } + currentFolder = path + return nil + } + + // Skip symlinks + if info.Mode()&os.ModeSymlink != 0 { + return nil + } + + // Skip empty files + if info.Size() == 0 { + return nil + } + + created, modified := getFileTimes(info, path) + ext := strings.ToLower(filepath.Ext(info.Name())) + if ext != "" { + ext = ext[1:] // remove leading dot + } + + entry := FileEntry{ + Server: *serverName, + Source: *source, + Folder: filepath.Dir(path), + Filename: info.Name(), + Ext: ext, + Size: info.Size(), + Created: created, + Modified: modified, + } + + entries <- entry + return nil + }) +} + +func initDB(host string) (*sql.DB, error) { + dsn := fmt.Sprintf("clickhouse://%s/files", host) + db, err := sql.Open("clickhouse", dsn) + if err != nil { + return nil, err + } + + // Create database and table + _, err = db.Exec(`CREATE DATABASE IF NOT EXISTS files`) + if err != nil { + return nil, fmt.Errorf("create database: %w", err) + } + + _, err = db.Exec(` + CREATE TABLE IF NOT EXISTS files.inventory ( + scan_id String, + scan_time DateTime64(3), + server LowCardinality(String), + source LowCardinality(String), + folder String, + filename String, + ext LowCardinality(String), + size UInt64, + created DateTime64(3), + modified DateTime64(3), + hash String DEFAULT '' + ) ENGINE = MergeTree + ORDER BY (server, folder, filename) + `) + if err != nil { + return nil, fmt.Errorf("create table: %w", err) + } + + return db, nil +} + +func insertBatch(db *sql.DB, scanID string, scanTime time.Time, entries []FileEntry) error { + if len(entries) == 0 { + return nil + } + + tx, err := db.Begin() + if err != nil { + return err + } + + stmt, err := tx.Prepare(` + INSERT INTO files.inventory + (scan_id, scan_time, server, source, folder, filename, ext, size, created, modified) + VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?) + `) + if err != nil { + tx.Rollback() + return err + } + defer stmt.Close() + + for _, e := range entries { + _, err = stmt.Exec(scanID, scanTime, e.Server, e.Source, e.Folder, e.Filename, e.Ext, e.Size, e.Created, e.Modified) + if err != nil { + tx.Rollback() + return err + } + } + + return tx.Commit() +} + +func main() { + flag.Parse() + + if *serverName == "" { + hostname, _ := os.Hostname() + *serverName = hostname + } + + absPath, err := filepath.Abs(*rootPath) + if err != nil { + fmt.Fprintf(os.Stderr, "Invalid path: %v\n", err) + os.Exit(1) + } + + fmt.Printf("Scanning: %s on server %s\n", absPath, *serverName) + + var db *sql.DB + if !*dryRun { + db, err = initDB(*chHost) + if err != nil { + fmt.Fprintf(os.Stderr, "Database error: %v\n", err) + os.Exit(1) + } + defer db.Close() + } + + scanID := fmt.Sprintf("%s-%d", *serverName, time.Now().Unix()) + scanTime := time.Now() + + entries := make(chan FileEntry, 1000) + done := make(chan error) + + // Scanner goroutine + go func() { + err := scanFiles(absPath, entries) + close(entries) + done <- err + }() + + // Collector + var batch []FileEntry + var totalFiles int64 + var totalSize int64 + batchSize := 1000 + + for entry := range entries { + totalFiles++ + totalSize += entry.Size + + if *dryRun { + fmt.Printf("%s/%s (%d bytes)\n", entry.Folder, entry.Filename, entry.Size) + } else { + batch = append(batch, entry) + if len(batch) >= batchSize { + if err := insertBatch(db, scanID, scanTime, batch); err != nil { + fmt.Fprintf(os.Stderr, "Insert error: %v\n", err) + } + batch = batch[:0] + } + if totalFiles%100 == 0 { + folder := truncatePath(currentFolder, 90) + fmt.Printf("\r%-120s", fmt.Sprintf("%d files (%.2f GB) %s", totalFiles, float64(totalSize)/(1024*1024*1024), folder)) + } + } + } + + // Insert remaining + if !*dryRun && len(batch) > 0 { + if err := insertBatch(db, scanID, scanTime, batch); err != nil { + fmt.Fprintf(os.Stderr, "Insert error: %v\n", err) + } + } + + // Wait for scanner + if err := <-done; err != nil { + fmt.Fprintf(os.Stderr, "Scan error: %v\n", err) + } + + fmt.Printf("\r%-120s\r", "") // clear progress line + fmt.Printf("Scan complete: %d files, %.2f GB\n", totalFiles, float64(totalSize)/(1024*1024*1024)) + fmt.Printf("Scan ID: %s\n", scanID) +} diff --git a/queries.sql b/queries.sql new file mode 100755 index 0000000..ee143e5 --- /dev/null +++ b/queries.sql @@ -0,0 +1,98 @@ +-- ClickHouse queries for file inventory analysis + +-- Create database (done automatically by scanner) +CREATE DATABASE IF NOT EXISTS files; + +-- Schema (done automatically by scanner) +CREATE TABLE IF NOT EXISTS files.inventory ( + scan_id String, + scan_time DateTime64(3), + server LowCardinality(String), + source LowCardinality(String), + folder String, + filename String, + ext LowCardinality(String), + size UInt64, + created DateTime64(3), + modified DateTime64(3), + hash String DEFAULT '' +) ENGINE = MergeTree +ORDER BY (server, folder, filename); + +-- Summary by server +SELECT server, count(*) as files, formatReadableSize(sum(size)) as total_size +FROM files.inventory +GROUP BY server +ORDER BY sum(size) DESC; + +-- Find exact duplicates (after running hashupdate) +SELECT + hash, + count(*) as cnt, + formatReadableSize(any(size)) as size, + groupArray(concat(server, ':', folder, '/', filename)) as files +FROM files.inventory +WHERE hash != '' +GROUP BY hash +HAVING cnt > 1 +ORDER BY any(size) DESC +LIMIT 100; + +-- Find files by extension +SELECT ext, count(*) as cnt, formatReadableSize(sum(size)) as total +FROM files.inventory +GROUP BY ext +ORDER BY sum(size) DESC +LIMIT 20; + +-- Largest files +SELECT server, folder, filename, formatReadableSize(size) as size +FROM files.inventory +ORDER BY size DESC +LIMIT 50; + +-- Find files by name pattern +SELECT server, folder, filename, formatReadableSize(size) as size +FROM files.inventory +WHERE filename ILIKE '%pattern%' +ORDER BY size DESC; + +-- Compare two servers - files only on server A +SELECT a.folder, a.filename, formatReadableSize(a.size) as size +FROM files.inventory a +LEFT JOIN files.inventory b ON a.filename = b.filename AND a.size = b.size AND b.server = 'serverB' +WHERE a.server = 'serverA' AND b.filename IS NULL +ORDER BY a.size DESC; + +-- Files modified in last 30 days +SELECT server, folder, filename, modified, formatReadableSize(size) as size +FROM files.inventory +WHERE modified > now() - INTERVAL 30 DAY +ORDER BY modified DESC +LIMIT 100; + +-- Potential duplicate sizes (before hashing) +SELECT size, count(*) as cnt +FROM files.inventory +WHERE hash = '' AND size > 1000000 -- > 1MB +GROUP BY size +HAVING cnt > 1 +ORDER BY size DESC +LIMIT 100; + +-- Storage by folder (top level) +SELECT + server, + arrayElement(splitByChar('/', folder), 2) as top_folder, + count(*) as files, + formatReadableSize(sum(size)) as total +FROM files.inventory +GROUP BY server, top_folder +ORDER BY sum(size) DESC +LIMIT 50; + +-- Clear old scans (keep latest per server) +-- DELETE FROM files.inventory +-- WHERE (server, scan_time) NOT IN ( +-- SELECT server, max(scan_time) FROM files.inventory GROUP BY server +-- );