Initial commit

This commit is contained in:
Johan 2026-02-01 02:00:29 -05:00
commit e8d9a31216
16 changed files with 884 additions and 0 deletions

BIN
._Makefile Executable file

Binary file not shown.

BIN
._README.md Executable file

Binary file not shown.

BIN
._cmd Executable file

Binary file not shown.

BIN
._go.mod Executable file

Binary file not shown.

BIN
._main.go Executable file

Binary file not shown.

BIN
._queries.sql Executable file

Binary file not shown.

31
.gitignore vendored Normal file
View File

@ -0,0 +1,31 @@
# Binaries
*.exe
*.exe~
*.dll
*.so
*.dylib
/bin/
/dist/
# Test binary
*.test
# Output
*.out
# Dependency directories
vendor/
# IDE
.idea/
.vscode/
*.swp
*.swo
# Env files
.env
.env.*
# Databases
*.db
*.sqlite
# macOS metadata (AppleDouble resource forks, Finder state).
# Files already tracked must be removed once with `git rm --cached`.
._*
.DS_Store

62
Makefile Executable file
View File

@ -0,0 +1,62 @@
# Every target in this file is a command, not a file it produces.
.PHONY: all clean mac linux windows deps deploy deploy-linux deploy-windows build run dupes

# Deployment settings. Each one can be overridden from the environment or on
# the command line, e.g. `LINUX_PASS=... make deploy-linux`.
LINUX_SERVERS ?= 192.168.1.240 192.168.1.252 192.168.1.253 192.168.1.254
LINUX_USER ?= johan
# Secrets must not live in version control: the password has no default here.
# Any password that was ever committed should be rotated.
LINUX_PASS ?=
WIN_SERVER ?= 192.168.1.251
WIN_USER ?= johan

# Fail fast, before anything is built, when a goal that deploys to the Linux
# hosts is requested without a password. An empty goal list means `all`.
ifneq ($(filter all deploy deploy-linux,$(or $(MAKECMDGOALS),all)),)
ifeq ($(strip $(LINUX_PASS)),)
$(error LINUX_PASS is not set; export it or pass LINUX_PASS=... (use `make build` or `make mac linux windows` to build without deploying))
endif
endif
# go_build expands to one cross-compilation command line.
#   $(1) GOOS   $(2) GOARCH   $(3) output file under bin/   $(4) package path
go_build = GOOS=$(1) GOARCH=$(2) go build -o bin/$(3) $(4)

# Default goal: resolve modules, build every platform, then deploy.
all: deps mac linux windows deploy

# Synchronise go.mod/go.sum with the imports used by the code.
deps:
	go mod tidy

# macOS binaries for Intel (amd64) and arm64.
mac:
	$(call go_build,darwin,amd64,filescan-mac-amd64,.)
	$(call go_build,darwin,arm64,filescan-mac-arm64,.)
	$(call go_build,darwin,amd64,hashupdate-mac-amd64,./cmd/hashupdate)
	$(call go_build,darwin,arm64,hashupdate-mac-arm64,./cmd/hashupdate)

# Linux amd64 binaries.
linux:
	$(call go_build,linux,amd64,filescan-linux,.)
	$(call go_build,linux,amd64,hashupdate-linux,./cmd/hashupdate)

# Windows amd64 binaries.
windows:
	$(call go_build,windows,amd64,filescan.exe,.)
	$(call go_build,windows,amd64,hashupdate.exe,./cmd/hashupdate)

# Remove everything the build targets produce.
clean:
	rm -rf bin/
# Deploy to every configured host.
deploy: deploy-linux deploy-windows

# Copy the Linux binaries into the home directory of $(LINUX_USER) on every
# host in $(LINUX_SERVERS).
#
# - Depends on `linux` so the binaries exist and are current even when this
#   target is run on its own or under `make -j`.
# - The password reaches sshpass through the SSHPASS environment variable
#   (`sshpass -e`) instead of the command line. $(value ...) keeps a `$` in the
#   password from being expanded by make.
# - A host that cannot be reached (or rejects the login) is reported and
#   skipped; the loop is deliberately best-effort.
# NOTE(review): StrictHostKeyChecking=no accepts any host key; consider
# pre-populating known_hosts instead.
deploy-linux: export SSHPASS = $(value LINUX_PASS)
deploy-linux: linux
	@: $(if $(strip $(value LINUX_PASS)),,$(error LINUX_PASS is not set; export it or pass LINUX_PASS=...))
	@echo "Deploying to Linux servers..."
	@for server in $(LINUX_SERVERS); do \
		echo " -> $$server"; \
		sshpass -e scp -o StrictHostKeyChecking=no bin/filescan-linux bin/hashupdate-linux $(LINUX_USER)@$$server:~/ 2>/dev/null || echo " [offline]"; \
	done
# Copy the Windows binaries to C:\Users\$(WIN_USER) on $(WIN_SERVER) through a
# temporary SMB mount (mount_smbfs, i.e. this target runs on macOS).
#
# - Credentials are no longer embedded in the recipe. Supply WIN_PASS from the
#   environment or the command line; it is placed in the SMB URL verbatim, so
#   it must already be URL-encoded (e.g. `!` as `%21`). Any password that was
#   previously committed here should be rotated.
# - Depends on `windows` so the binaries exist and are current.
# - The share is unmounted whether or not the copy succeeded.
# - Failures are reported but do not fail the build (best-effort, as before).
deploy-windows: windows
	@: $(if $(strip $(WIN_PASS)),,$(error WIN_PASS is not set; export it (URL-encoded) or pass WIN_PASS=...))
	@echo "Deploying to Windows server $(WIN_SERVER)..."
	@mkdir -p /tmp/251c 2>/dev/null || true
	@umount /tmp/251c 2>/dev/null || true
	@if mount_smbfs '//$(WIN_USER):$(WIN_PASS)@$(WIN_SERVER)/C' /tmp/251c 2>/dev/null; then \
		if cp bin/filescan.exe bin/hashupdate.exe /tmp/251c/Users/$(WIN_USER)/; then \
			echo " -> $(WIN_SERVER) [ok]"; \
		else \
			echo " -> $(WIN_SERVER) [failed]"; \
		fi; \
		umount /tmp/251c || echo " -> $(WIN_SERVER) [unmount failed]"; \
	else \
		echo " -> $(WIN_SERVER) [failed]"; \
	fi
# Build both tools for the platform make is running on.
build:
	go build -o bin/filescan .
	go build -o bin/hashupdate ./cmd/hashupdate

# Example invocation: list what a scan of /tmp would record, without a database.
run:
	go run . -server test -path /tmp -dry-run

# Print the duplicate-finder query so it can be pasted into a ClickHouse client.
dupes:
	@printf '%s\n' \
		"Run this in ClickHouse:" \
		"SELECT hash, count(*) as cnt, groupArray(concat(folder, '/', filename)) as files" \
		"FROM files.inventory WHERE hash != '' GROUP BY hash HAVING cnt > 1 ORDER BY cnt DESC"

67
README.md Executable file
View File

@ -0,0 +1,67 @@
# Filescanner
Cross-platform file inventory scanner with ClickHouse backend.
## Quick Start
```bash
# Get dependencies
go mod tidy
# Build all platforms (note: `all` also runs `deps` and then `deploy`)
make all
# Or build current platform only
make build
```
## Usage
### Scan files
```bash
# Scan with dry-run (no DB)
./filescan -server myserver -path /home -dry-run
# Scan to ClickHouse
./filescan -server myserver -path /home -ch 192.168.1.253:9000
# Verbose
./filescan -server myserver -path /home -v
```
### Add hashes for duplicate detection
```bash
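# Only hashes files whose size is shared by at least one other unhashed file on the same server.
# Run it on the machine that holds the files: they are opened by their recorded local path.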
./hashupdate -server myserver -ch 192.168.1.253:9000
```
### Find duplicates
```sql
SELECT hash, count(*) as cnt,
groupArray(concat(server, ':', folder, '/', filename)) as files
FROM files.inventory
WHERE hash != ''
GROUP BY hash
HAVING cnt > 1
ORDER BY any(size) DESC;
```
## Binaries
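After `make all`:
- `bin/filescan-mac-arm64` - Mac with Apple Silicon (M1/M2/M3)
- `bin/filescan-mac-amd64` - Mac Intel
- `bin/filescan-linux` - Linux
- `bin/filescan.exe` - Windows

A matching `hashupdate` binary is built next to each of these (`bin/hashupdate-mac-arm64`, `bin/hashupdate-mac-amd64`, `bin/hashupdate-linux`, `bin/hashupdate.exe`).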
## Excluded Directories
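Automatically skips:
- Windows: `$RECYCLE.BIN`, `System Volume Information`, `Windows`
- macOS: `.Spotlight-V100`, `.fseventsd`, and everything under `/System`, `/Library`, `/private/var`, `/Volumes/.timemachine`
- Linux: `/proc`, `/sys`, `/dev`, `/run`, `/snap`, `/boot`, `lost+found`
- Common: `node_modules`, `.git`, `__pycache__`, and any directory whose path contains `cache` (case-insensitive)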
## ClickHouse Schema
See `queries.sql` for schema and useful queries.

BIN
cmd/._hashupdate Executable file

Binary file not shown.

BIN
cmd/hashupdate/._main.go Executable file

Binary file not shown.

145
cmd/hashupdate/main.go Executable file
View File

@ -0,0 +1,145 @@
package main
import (
"crypto/md5"
"database/sql"
"flag"
"fmt"
"io"
"os"
"path/filepath"
_ "github.com/ClickHouse/clickhouse-go/v2"
)
// Command-line flags.
var (
	// serverName selects whose inventory rows are hashed; main exits when it is empty.
	serverName = flag.String("server", "", "Server name to process")
	// chHost is the ClickHouse endpoint used to build the connection DSN.
	chHost = flag.String("ch", "192.168.1.253:9000", "ClickHouse host:port")
	// verbose enables per-file progress and error messages.
	verbose = flag.Bool("v", false, "Verbose output")
)
func quickHash(path string) (string, error) {
f, err := os.Open(path)
if err != nil {
return "", err
}
defer f.Close()
stat, err := f.Stat()
if err != nil {
return "", err
}
size := stat.Size()
h := md5.New()
buf := make([]byte, 65536)
n, err := f.Read(buf)
if err != nil && err != io.EOF {
return "", err
}
h.Write(buf[:n])
if size > 131072 {
_, err = f.Seek(-65536, io.SeekEnd)
if err != nil {
return "", err
}
n, err = f.Read(buf)
if err != nil && err != io.EOF {
return "", err
}
h.Write(buf[:n])
}
return fmt.Sprintf("%x", h.Sum(nil)), nil
}
// pendingFile identifies one inventory row that still lacks a hash.
type pendingFile struct {
	folder   string
	filename string
}

// duplicateSizes returns, largest first, every file size that occurs more
// than once among the not-yet-hashed rows of server.
func duplicateSizes(db *sql.DB, server string) ([]int64, error) {
	rows, err := db.Query(`
		SELECT DISTINCT size
		FROM files.inventory
		WHERE server = ? AND hash = '' AND size > 0
		GROUP BY size
		HAVING count(*) > 1
		ORDER BY size DESC
	`, server)
	if err != nil {
		return nil, err
	}
	defer rows.Close()

	var sizes []int64
	for rows.Next() {
		var size int64
		if err := rows.Scan(&size); err != nil {
			return nil, err
		}
		sizes = append(sizes, size)
	}
	return sizes, rows.Err()
}

// unhashedFiles returns the not-yet-hashed rows of server with the given size.
func unhashedFiles(db *sql.DB, server string, size int64) ([]pendingFile, error) {
	rows, err := db.Query(`
		SELECT folder, filename
		FROM files.inventory
		WHERE server = ? AND size = ? AND hash = ''
	`, server, size)
	if err != nil {
		return nil, err
	}
	defer rows.Close()

	var files []pendingFile
	for rows.Next() {
		var pf pendingFile
		if err := rows.Scan(&pf.folder, &pf.filename); err != nil {
			return nil, err
		}
		files = append(files, pf)
	}
	return files, rows.Err()
}

// main hashes every file of the given server whose size is shared with at
// least one other unhashed file, and stores the hash in files.inventory.
//
// Files are opened by their recorded local path, so the program has to run on
// the machine that holds them. A file that cannot be hashed or updated is
// skipped (and reported with -v); a failed query is always reported.
func main() {
	flag.Parse()
	if *serverName == "" {
		fmt.Fprintln(os.Stderr, "Server name required: -server <n>")
		os.Exit(1)
	}

	dsn := fmt.Sprintf("clickhouse://%s/files", *chHost)
	db, err := sql.Open("clickhouse", dsn)
	if err != nil {
		fmt.Fprintf(os.Stderr, "DB error: %v\n", err)
		os.Exit(1)
	}
	defer db.Close()

	// Find sizes that appear more than once (potential dupes).
	sizes, err := duplicateSizes(db, *serverName)
	if err != nil {
		fmt.Fprintf(os.Stderr, "Query error: %v\n", err)
		db.Close() // os.Exit skips deferred calls
		os.Exit(1)
	}
	fmt.Printf("Found %d file sizes with potential duplicates\n", len(sizes))

	var totalHashed int64
	for _, size := range sizes {
		// Read the whole candidate list first so no result set stays open
		// while the updates below are issued.
		files, err := unhashedFiles(db, *serverName, size)
		if err != nil {
			fmt.Fprintf(os.Stderr, "Query error (size %d): %v\n", size, err)
			continue
		}

		for _, pf := range files {
			fullPath := filepath.Join(pf.folder, pf.filename)
			hash, err := quickHash(fullPath)
			if err != nil {
				if *verbose {
					fmt.Fprintf(os.Stderr, "Hash error %s: %v\n", fullPath, err)
				}
				continue
			}

			// NOTE(review): every ALTER TABLE ... UPDATE is a ClickHouse
			// mutation; one per file may be slow on large inventories.
			_, err = db.Exec(`
				ALTER TABLE files.inventory
				UPDATE hash = ?
				WHERE server = ? AND folder = ? AND filename = ?
			`, hash, *serverName, pf.folder, pf.filename)
			if err != nil {
				if *verbose {
					fmt.Fprintf(os.Stderr, "Update error: %v\n", err)
				}
				continue
			}

			totalHashed++
			if *verbose {
				fmt.Printf("Hashed: %s -> %s\n", fullPath, hash)
			}
		}
	}
	fmt.Printf("Hashed %d files\n", totalHashed)
}

23
go.mod Executable file
View File

@ -0,0 +1,23 @@
module filescanner
go 1.22
require github.com/ClickHouse/clickhouse-go/v2 v2.23.0
require (
github.com/ClickHouse/ch-go v0.61.5 // indirect
github.com/andybalholm/brotli v1.1.0 // indirect
github.com/go-faster/city v1.0.1 // indirect
github.com/go-faster/errors v0.7.1 // indirect
github.com/google/uuid v1.6.0 // indirect
github.com/klauspost/compress v1.17.7 // indirect
github.com/paulmach/orb v0.11.1 // indirect
github.com/pierrec/lz4/v4 v4.1.21 // indirect
github.com/pkg/errors v0.9.1 // indirect
github.com/segmentio/asm v1.2.0 // indirect
github.com/shopspring/decimal v1.3.1 // indirect
go.opentelemetry.io/otel v1.24.0 // indirect
go.opentelemetry.io/otel/trace v1.24.0 // indirect
golang.org/x/sys v0.18.0 // indirect
gopkg.in/yaml.v3 v3.0.1 // indirect
)

110
go.sum Executable file
View File

@ -0,0 +1,110 @@
github.com/ClickHouse/ch-go v0.61.5 h1:zwR8QbYI0tsMiEcze/uIMK+Tz1D3XZXLdNrlaOpeEI4=
github.com/ClickHouse/ch-go v0.61.5/go.mod h1:s1LJW/F/LcFs5HJnuogFMta50kKDO0lf9zzfrbl0RQg=
github.com/ClickHouse/clickhouse-go/v2 v2.23.0 h1:srmRrkS0BR8gEut87u8jpcZ7geOob6nGj9ifrb+aKmg=
github.com/ClickHouse/clickhouse-go/v2 v2.23.0/go.mod h1:tBhdF3f3RdP7sS59+oBAtTyhWpy0024ZxDMhgxra0QE=
github.com/andybalholm/brotli v1.1.0 h1:eLKJA0d02Lf0mVpIDgYnqXcUn0GqVmEFny3VuID1U3M=
github.com/andybalholm/brotli v1.1.0/go.mod h1:sms7XGricyQI9K10gOSf56VKKWS4oLer58Q+mhRPtnY=
github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c=
github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
github.com/go-faster/city v1.0.1 h1:4WAxSZ3V2Ws4QRDrscLEDcibJY8uf41H6AhXDrNDcGw=
github.com/go-faster/city v1.0.1/go.mod h1:jKcUJId49qdW3L1qKHH/3wPeUstCVpVSXTM6vO3VcTw=
github.com/go-faster/errors v0.7.1 h1:MkJTnDoEdi9pDabt1dpWf7AA8/BaSYZqibYyhZ20AYg=
github.com/go-faster/errors v0.7.1/go.mod h1:5ySTjWFiphBs07IKuiL69nxdfd5+fzh1u7FPGZP2quo=
github.com/gogo/protobuf v1.3.2/go.mod h1:P1XiOD3dCwIKUDQYPy72D8LYyHL2YPYrpS2s69NZV8Q=
github.com/golang/protobuf v1.5.0/go.mod h1:FsONVRAS9T7sI+LIUmWTfcYkHO4aIWwzhcaSAoJOfIk=
github.com/golang/snappy v0.0.1/go.mod h1:/XxbfmMg8lxefKM7IXC3fBNl/7bRcc72aCRzEWrmP2Q=
github.com/google/go-cmp v0.5.2/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE=
github.com/google/go-cmp v0.5.5/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE=
github.com/google/go-cmp v0.6.0 h1:ofyhxvXcZhMsU5ulbFiLKl/XBFqE1GSq7atu8tAmTRI=
github.com/google/go-cmp v0.6.0/go.mod h1:17dUlkBOakJ0+DkrSSNjCkIjxS6bF9zb3elmeNGIjoY=
github.com/google/uuid v1.6.0 h1:NIvaJDMOsjHA8n1jAhLSgzrAzy1Hgr+hNrb57e+94F0=
github.com/google/uuid v1.6.0/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo=
github.com/kisielk/errcheck v1.5.0/go.mod h1:pFxgyoBC7bSaBwPgfKdkLd5X25qrDl4LWUI2bnpBCr8=
github.com/kisielk/gotool v1.0.0/go.mod h1:XhKaO+MFFWcvkIS/tQcRk01m1F5IRFswLeQ+oQHNcck=
github.com/klauspost/compress v1.13.6/go.mod h1:/3/Vjq9QcHkK5uEr5lBEmyoZ1iFhe47etQ6QUkpK6sk=
github.com/klauspost/compress v1.17.7 h1:ehO88t2UGzQK66LMdE8tibEd1ErmzZjNEqWkjLAKQQg=
github.com/klauspost/compress v1.17.7/go.mod h1:Di0epgTjJY877eYKx5yC51cX2A2Vl2ibi7bDH9ttBbw=
github.com/kr/pretty v0.1.0/go.mod h1:dAy3ld7l9f0ibDNOQOHHMYYIIbhfbHSm3C4ZsoJORNo=
github.com/kr/pretty v0.3.1 h1:flRD4NNwYAUpkphVc1HcthR4KEIFJ65n8Mw5qdRn3LE=
github.com/kr/pretty v0.3.1/go.mod h1:hoEshYVHaxMs3cyo3Yncou5ZscifuDolrwPKZanG3xk=
github.com/kr/pty v1.1.1/go.mod h1:pFQYn66WHrOpPYNljwOMqo10TkYh1fy3cYio2l3bCsQ=
github.com/kr/text v0.1.0 h1:45sCR5RtlFHMR4UwH9sdQ5TC8v0qDQCHnXt+kaKSTVE=
github.com/kr/text v0.1.0/go.mod h1:4Jbv+DJW3UT/LiOwJeYQe1efqtUx/iVham/4vfdArNI=
github.com/montanaflynn/stats v0.0.0-20171201202039-1bf9dbcd8cbe/go.mod h1:wL8QJuTMNUDYhXwkmfOly8iTdp5TEcJFWZD2D7SIkUc=
github.com/paulmach/orb v0.11.1 h1:3koVegMC4X/WeiXYz9iswopaTwMem53NzTJuTF20JzU=
github.com/paulmach/orb v0.11.1/go.mod h1:5mULz1xQfs3bmQm63QEJA6lNGujuRafwA5S/EnuLaLU=
github.com/paulmach/protoscan v0.2.1/go.mod h1:SpcSwydNLrxUGSDvXvO0P7g7AuhJ7lcKfDlhJCDw2gY=
github.com/pierrec/lz4/v4 v4.1.21 h1:yOVMLb6qSIDP67pl/5F7RepeKYu/VmTyEXvuMI5d9mQ=
github.com/pierrec/lz4/v4 v4.1.21/go.mod h1:gZWDp/Ze/IJXGXf23ltt2EXimqmTUXEy0GFuRQyBid4=
github.com/pkg/errors v0.9.1 h1:FEBLx1zS214owpjy7qsBeixbURkuhQAwrK5UwLGTwt4=
github.com/pkg/errors v0.9.1/go.mod h1:bwawxfHBFNV+L2hUp1rHADufV3IMtnDRdf1r5NINEl0=
github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM=
github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4=
github.com/rogpeppe/go-internal v1.10.0 h1:TMyTOH3F/DB16zRVcYyreMH6GnZZrwQVAoYjRBZyWFQ=
github.com/rogpeppe/go-internal v1.10.0/go.mod h1:UQnix2H7Ngw/k4C5ijL5+65zddjncjaFoBhdsK/akog=
github.com/segmentio/asm v1.2.0 h1:9BQrFxC+YOHJlTlHGkTrFWf59nbL3XnCoFLTwDCI7ys=
github.com/segmentio/asm v1.2.0/go.mod h1:BqMnlJP91P8d+4ibuonYZw9mfnzI9HfxselHZr5aAcs=
github.com/shopspring/decimal v1.3.1 h1:2Usl1nmF/WZucqkFZhnfFYxxxu8LG21F6nPQBE5gKV8=
github.com/shopspring/decimal v1.3.1/go.mod h1:DKyhrW/HYNuLGql+MJL6WCR6knT2jwCFRcu2hWCYk4o=
github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME=
github.com/stretchr/testify v1.6.1/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg=
github.com/stretchr/testify v1.9.0 h1:HtqpIVDClZ4nwg75+f6Lvsy/wHu+3BoSGCbBAcpTsTg=
github.com/stretchr/testify v1.9.0/go.mod h1:r2ic/lqez/lEtzL7wO/rwa5dbSLXVDPFyf8C91i36aY=
github.com/tidwall/pretty v1.0.0/go.mod h1:XNkn88O1ChpSDQmQeStsy+sBenx6DDtFZJxhVysOjyk=
github.com/xdg-go/pbkdf2 v1.0.0/go.mod h1:jrpuAogTd400dnrH08LKmI/xc1MbPOebTwRqcT5RDeI=
github.com/xdg-go/scram v1.1.1/go.mod h1:RaEWvsqvNKKvBPvcKeFjrG2cJqOkHTiyTpzz23ni57g=
github.com/xdg-go/stringprep v1.0.3/go.mod h1:W3f5j4i+9rC0kuIEJL0ky1VpHXQU3ocBgklLGvcBnW8=
github.com/youmark/pkcs8 v0.0.0-20181117223130-1be2e3e5546d/go.mod h1:rHwXgn7JulP+udvsHwJoVG1YGAP6VLg4y9I5dyZdqmA=
github.com/yuin/goldmark v1.1.27/go.mod h1:3hX8gzYuyVAZsxl0MRgGTJEmQBFcNTphYh9decYSb74=
github.com/yuin/goldmark v1.2.1/go.mod h1:3hX8gzYuyVAZsxl0MRgGTJEmQBFcNTphYh9decYSb74=
go.mongodb.org/mongo-driver v1.11.4/go.mod h1:PTSz5yu21bkT/wXpkS7WR5f0ddqw5quethTUn9WM+2g=
go.opentelemetry.io/otel v1.24.0 h1:0LAOdjNmQeSTzGBzduGe/rU4tZhMwL5rWgtp9Ku5Jfo=
go.opentelemetry.io/otel v1.24.0/go.mod h1:W7b9Ozg4nkF5tWI5zsXkaKKDjdVjpD4oAt9Qi/MArHo=
go.opentelemetry.io/otel/trace v1.24.0 h1:CsKnnL4dUAr/0llH9FKuc698G04IrpWV0MQA/Y1YELI=
go.opentelemetry.io/otel/trace v1.24.0/go.mod h1:HPc3Xr/cOApsBI154IU0OI0HJexz+aw5uPdbs3UCjNU=
golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w=
golang.org/x/crypto v0.0.0-20191011191535-87dc89f01550/go.mod h1:yigFU9vqHzYiE8UmvKecakEJjdnWj3jj499lnFckfCI=
golang.org/x/crypto v0.0.0-20200622213623-75b288015ac9/go.mod h1:LzIPMQfyMNhhGPhUkYOs5KpL4U8rLKemX1yGLhDgUto=
golang.org/x/crypto v0.0.0-20220622213112-05595931fe9d/go.mod h1:IxCIyHEi3zRg3s0A5j5BB6A9Jmi73HwBIUl50j+osU4=
golang.org/x/mod v0.2.0/go.mod h1:s0Qsj1ACt9ePp/hMypM3fl4fZqREWJwdYDEqhRiZZUA=
golang.org/x/mod v0.3.0/go.mod h1:s0Qsj1ACt9ePp/hMypM3fl4fZqREWJwdYDEqhRiZZUA=
golang.org/x/net v0.0.0-20190404232315-eb5bcb51f2a3/go.mod h1:t9HGtf8HONx5eT2rtn7q6eTqICYqUVnKs3thJo3Qplg=
golang.org/x/net v0.0.0-20190620200207-3b0461eec859/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s=
golang.org/x/net v0.0.0-20200226121028-0de0cce0169b/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s=
golang.org/x/net v0.0.0-20201021035429-f5854403a974/go.mod h1:sp8m0HH+o8qH0wwXwYZr8TS3Oi6o0r6Gce1SSxlDquU=
golang.org/x/net v0.0.0-20211112202133-69e39bad7dc2/go.mod h1:9nx3DQGgdP8bBQD5qxJ1jj9UTztislL4KSBs9R2vV5Y=
golang.org/x/sync v0.0.0-20190423024810-112230192c58/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
golang.org/x/sync v0.0.0-20190911185100-cd5d95a43a6e/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
golang.org/x/sync v0.0.0-20201020160332-67f06af15bc9/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
golang.org/x/sync v0.0.0-20210220032951-036812b2e83c/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY=
golang.org/x/sys v0.0.0-20190412213103-97732733099d/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
golang.org/x/sys v0.0.0-20200930185726-fdedc70b468f/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
golang.org/x/sys v0.0.0-20201119102817-f84b799fce68/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
golang.org/x/sys v0.0.0-20210423082822-04245dca01da/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
golang.org/x/sys v0.0.0-20210615035016-665e8c7367d1/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
golang.org/x/sys v0.18.0 h1:DBdB3niSjOA/O0blCZBqDefyWNYveAYMNF1Wum0DYQ4=
golang.org/x/sys v0.18.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA=
golang.org/x/term v0.0.0-20201126162022-7de9c90e9dd1/go.mod h1:bj7SfCRtBDWHUb9snDiAeCFNEtKQo2Wmx5Cou7ajbmo=
golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ=
golang.org/x/text v0.3.3/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ=
golang.org/x/text v0.3.6/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ=
golang.org/x/text v0.3.7/go.mod h1:u+2+/6zg+i71rQMx5EYifcz6MCKuco9NR6JIITiCfzQ=
golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ=
golang.org/x/tools v0.0.0-20191119224855-298f0cb1881e/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo=
golang.org/x/tools v0.0.0-20200619180055-7c47624df98f/go.mod h1:EkVYQZoAsY45+roYkvgYkIh4xh/qjgUK9TdY2XT94GE=
golang.org/x/tools v0.0.0-20210106214847-113979e3529a/go.mod h1:emZCQorbCU4vsT4fOWvOPXz4eW1wZW4PmDk9uLelYpA=
golang.org/x/xerrors v0.0.0-20190717185122-a985d3407aa7/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0=
golang.org/x/xerrors v0.0.0-20191011141410-1b5146add898/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0=
golang.org/x/xerrors v0.0.0-20191204190536-9bdfabe68543/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0=
golang.org/x/xerrors v0.0.0-20200804184101-5ec99f83aff1/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0=
google.golang.org/protobuf v1.26.0-rc.1/go.mod h1:jlhhOSvTdKEhbULTjvd4ARK9grFBp09yW+WbY/TyQbw=
google.golang.org/protobuf v1.27.1/go.mod h1:9q0QmTI4eRPtz6boOQmLYwt+qCgq0jsYwAQnmE0givc=
gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0=
gopkg.in/check.v1 v1.0.0-20180628173108-788fd7840127/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0=
gopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c h1:Hei/4ADfdWqJk1ZMxUNpqntNwaWcugrBjAiHlqqRiVk=
gopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c/go.mod h1:JHkPIbrfpd72SG/EVd6muEfDQjcINNoR0C8j2r3qZ4Q=
gopkg.in/yaml.v3 v3.0.0-20200313102051-9f266ea9e77c/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM=
gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA=
gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM=

348
main.go Executable file
View File

@ -0,0 +1,348 @@
package main
import (
"crypto/md5"
"database/sql"
"flag"
"fmt"
"io"
"os"
"path/filepath"
"runtime"
"strings"
"time"
_ "github.com/ClickHouse/clickhouse-go/v2"
)
// Command-line flags.
var (
	// serverName labels every inventory row; main falls back to the OS
	// hostname when it is left empty.
	serverName = flag.String("server", "", "Server name (required)")
	// rootPath is the directory tree to scan; main converts it to an absolute path.
	rootPath = flag.String("path", ".", "Path to scan")
	// chHost is the ClickHouse endpoint used to build the connection DSN.
	chHost = flag.String("ch", "192.168.1.253:9000", "ClickHouse host:port")
	// source is stored verbatim in each row's source column.
	source = flag.String("source", "local", "Source type: local, gdrive, onedrive, proton")
	// dryRun prints each file instead of connecting to the database.
	dryRun = flag.Bool("dry-run", false, "Print files without inserting to DB")
	// verbose reports paths that could not be accessed during the walk.
	verbose = flag.Bool("v", false, "Verbose output")
)
// Platform-specific directories to exclude (conservative - only true junk)
var excludeDirs = map[string]bool{
// Windows system
"$RECYCLE.BIN": true,
"System Volume Information": true,
"Windows": true,
// macOS system
".Spotlight-V100": true,
".fseventsd": true,
// Linux system
"proc": true,
"sys": true,
"dev": true,
"run": true,
"lost+found": true,
// Dev artifacts (large, reproducible)
"node_modules": true,
".git": true,
"__pycache__": true,
}
// Additional root-level excludes per OS
func shouldExcludeRoot(path string) bool {
switch runtime.GOOS {
case "darwin":
excludeRoots := []string{"/System", "/Library", "/private/var", "/Volumes/.timemachine"}
for _, ex := range excludeRoots {
if strings.HasPrefix(path, ex) {
return true
}
}
case "linux":
excludeRoots := []string{"/proc", "/sys", "/dev", "/run", "/snap", "/boot"}
for _, ex := range excludeRoots {
if strings.HasPrefix(path, ex) {
return true
}
}
}
return false
}
// FileEntry describes one scanned file; scanFiles produces the values and
// insertBatch writes them to files.inventory.
type FileEntry struct {
	Server   string    // value of the -server flag (or the hostname fallback)
	Source   string    // value of the -source flag
	Folder   string    // directory containing the file (filepath.Dir of its path)
	Filename string    // base name of the file
	Ext      string    // lower-cased extension without the leading dot; "" if none
	Size     int64     // size in bytes as reported by the file info
	Created  time.Time // currently identical to Modified (see getFileTimes)
	Modified time.Time // modification time from the file info
	Hash     string    // not set by scanFiles and not written by insertBatch
}
// currentFolder is the directory scanFiles visited most recently, prefixed
// with "[skip] " when that directory was skipped; main shows it in the
// progress line.
// NOTE(review): scanFiles writes it from the scanner goroutine while main
// reads it without synchronization - a data race, tolerable only because the
// value is purely cosmetic.
var currentFolder string
// truncatePath shortens path to at most maxLen characters for display by
// keeping its beginning and end and putting "..." in between.
//
// Lengths are counted in runes, so multi-byte UTF-8 characters are never cut
// in half. A path that already fits is returned unchanged. For maxLen < 3
// there is no room for the ellipsis: the first maxLen characters are returned
// (the empty string when maxLen <= 0).
func truncatePath(path string, maxLen int) string {
	runes := []rune(path)
	if len(runes) <= maxLen {
		return path
	}
	if maxLen < 3 {
		if maxLen < 0 {
			maxLen = 0
		}
		return string(runes[:maxLen])
	}
	// Show beginning and end.
	half := (maxLen - 3) / 2
	return string(runes[:half]) + "..." + string(runes[len(runes)-half:])
}
func quickHash(path string) (string, error) {
f, err := os.Open(path)
if err != nil {
return "", err
}
defer f.Close()
stat, err := f.Stat()
if err != nil {
return "", err
}
size := stat.Size()
h := md5.New()
buf := make([]byte, 65536)
// First 64KB
n, err := f.Read(buf)
if err != nil && err != io.EOF {
return "", err
}
h.Write(buf[:n])
// Last 64KB (if file > 128KB)
if size > 131072 {
_, err = f.Seek(-65536, io.SeekEnd)
if err != nil {
return "", err
}
n, err = f.Read(buf)
if err != nil && err != io.EOF {
return "", err
}
h.Write(buf[:n])
}
return fmt.Sprintf("%x", h.Sum(nil)), nil
}
func getFileTimes(info os.FileInfo, path string) (created, modified time.Time) {
modified = info.ModTime()
created = modified // fallback - platform-specific code can override
// Note: Getting birth time is OS-specific and complex
// For MVP, we use modified time for both
return
}
// scanFiles walks the tree below root and sends one FileEntry per non-empty
// regular file to entries. It does not close the channel.
//
// Directories are skipped when their base name is listed in excludeDirs, when
// shouldExcludeRoot matches their path, or when their name contains "cache"
// (case-insensitive). These rules apply to directories found during the walk,
// never to root itself: a root that happens to match (e.g. /data/cache-backup)
// was explicitly requested and would otherwise give a silently empty scan.
// For the same reason the system-path rule is switched off entirely when root
// already lies inside an excluded system directory.
//
// Paths that cannot be accessed are reported with -v and skipped, so the walk
// itself does not fail on them.
func scanFiles(root string, entries chan<- FileEntry) error {
	rootIsSystemPath := shouldExcludeRoot(root)

	return filepath.Walk(root, func(path string, info os.FileInfo, err error) error {
		if err != nil {
			if *verbose {
				fmt.Fprintf(os.Stderr, "Error accessing %s: %v\n", path, err)
			}
			return nil // continue scanning
		}

		if info.IsDir() {
			if path != root {
				name := info.Name()
				// Testing the name is equivalent to testing the whole path for
				// everything below root, because a matching ancestor would
				// already have been skipped.
				skip := excludeDirs[name] ||
					(!rootIsSystemPath && shouldExcludeRoot(path)) ||
					strings.Contains(strings.ToLower(name), "cache")
				if skip {
					currentFolder = "[skip] " + path
					return filepath.SkipDir
				}
			}
			currentFolder = path
			return nil
		}

		// Only regular files are inventoried; this drops symlinks as well as
		// devices, sockets and named pipes.
		if !info.Mode().IsRegular() {
			return nil
		}
		// Skip empty files
		if info.Size() == 0 {
			return nil
		}

		created, modified := getFileTimes(info, path)
		ext := strings.ToLower(filepath.Ext(info.Name()))
		if ext != "" {
			ext = ext[1:] // remove leading dot
		}

		entries <- FileEntry{
			Server:   *serverName,
			Source:   *source,
			Folder:   filepath.Dir(path),
			Filename: info.Name(),
			Ext:      ext,
			Size:     info.Size(),
			Created:  created,
			Modified: modified,
		}
		return nil
	})
}
// initDB opens a ClickHouse connection to host ("host:port") and makes sure
// the files database and the files.inventory table exist.
//
// The connection selects the built-in "default" database rather than "files":
// "files" does not exist before the CREATE DATABASE below has run, and every
// statement in this program names files.inventory fully qualified anyway.
//
// On failure the connection is closed again and a wrapped error is returned.
func initDB(host string) (*sql.DB, error) {
	dsn := fmt.Sprintf("clickhouse://%s/default", host)
	db, err := sql.Open("clickhouse", dsn)
	if err != nil {
		return nil, err
	}

	// Create database and table
	if _, err = db.Exec(`CREATE DATABASE IF NOT EXISTS files`); err != nil {
		db.Close()
		return nil, fmt.Errorf("create database: %w", err)
	}

	_, err = db.Exec(`
		CREATE TABLE IF NOT EXISTS files.inventory (
			scan_id String,
			scan_time DateTime64(3),
			server LowCardinality(String),
			source LowCardinality(String),
			folder String,
			filename String,
			ext LowCardinality(String),
			size UInt64,
			created DateTime64(3),
			modified DateTime64(3),
			hash String DEFAULT ''
		) ENGINE = MergeTree
		ORDER BY (server, folder, filename)
	`)
	if err != nil {
		db.Close()
		return nil, fmt.Errorf("create table: %w", err)
	}
	return db, nil
}
// insertBatch writes entries to files.inventory in a single transaction, each
// row tagged with scanID and scanTime. An empty batch is a no-op. When
// preparing the statement or inserting a row fails, the transaction is rolled
// back and that error is returned; otherwise the result of the commit is
// returned.
func insertBatch(db *sql.DB, scanID string, scanTime time.Time, entries []FileEntry) error {
	if len(entries) == 0 {
		return nil
	}

	tx, beginErr := db.Begin()
	if beginErr != nil {
		return beginErr
	}
	// abort rolls the transaction back and hands the triggering error on.
	abort := func(cause error) error {
		tx.Rollback()
		return cause
	}

	insert, prepErr := tx.Prepare(`
		INSERT INTO files.inventory
		(scan_id, scan_time, server, source, folder, filename, ext, size, created, modified)
		VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
	`)
	if prepErr != nil {
		return abort(prepErr)
	}
	defer insert.Close()

	for i := range entries {
		row := &entries[i]
		_, execErr := insert.Exec(
			scanID, scanTime,
			row.Server, row.Source, row.Folder, row.Filename, row.Ext,
			row.Size, row.Created, row.Modified,
		)
		if execErr != nil {
			return abort(execErr)
		}
	}
	return tx.Commit()
}
// main scans the tree given by -path and records every file either on stdout
// (-dry-run) or in ClickHouse, in batches of 1000 rows.
//
// The server name defaults to the OS hostname; the program stops when neither
// is available, because rows without a server cannot be told apart later.
// A failed batch insert is reported and the scan continues, but the process
// then exits with status 1 so callers can tell the inventory is incomplete.
func main() {
	flag.Parse()

	if *serverName == "" {
		hostname, err := os.Hostname()
		if err != nil || hostname == "" {
			fmt.Fprintln(os.Stderr, "Server name required: -server <name> (hostname lookup failed)")
			os.Exit(1)
		}
		*serverName = hostname
	}

	absPath, err := filepath.Abs(*rootPath)
	if err != nil {
		fmt.Fprintf(os.Stderr, "Invalid path: %v\n", err)
		os.Exit(1)
	}
	fmt.Printf("Scanning: %s on server %s\n", absPath, *serverName)

	var db *sql.DB
	if !*dryRun {
		db, err = initDB(*chHost)
		if err != nil {
			fmt.Fprintf(os.Stderr, "Database error: %v\n", err)
			os.Exit(1)
		}
		defer db.Close()
	}

	// One timestamp for both values, so the scan ID and scan time agree.
	startedAt := time.Now()
	scanID := fmt.Sprintf("%s-%d", *serverName, startedAt.Unix())
	scanTime := startedAt

	entries := make(chan FileEntry, 1000)
	done := make(chan error, 1)

	// Scanner goroutine
	go func() {
		err := scanFiles(absPath, entries)
		close(entries)
		done <- err
	}()

	// Collector
	const batchSize = 1000
	var (
		batch      []FileEntry
		totalFiles int64
		totalSize  int64
		lostFiles  int64 // rows belonging to batches that failed to insert
	)
	flush := func() {
		if err := insertBatch(db, scanID, scanTime, batch); err != nil {
			fmt.Fprintf(os.Stderr, "Insert error: %v\n", err)
			lostFiles += int64(len(batch))
		}
		batch = batch[:0]
	}

	for entry := range entries {
		totalFiles++
		totalSize += entry.Size
		if *dryRun {
			fmt.Printf("%s/%s (%d bytes)\n", entry.Folder, entry.Filename, entry.Size)
			continue
		}
		batch = append(batch, entry)
		if len(batch) >= batchSize {
			flush()
		}
		if totalFiles%100 == 0 {
			folder := truncatePath(currentFolder, 90)
			fmt.Printf("\r%-120s", fmt.Sprintf("%d files (%.2f GB) %s", totalFiles, float64(totalSize)/(1024*1024*1024), folder))
		}
	}

	// Insert remaining
	if !*dryRun && len(batch) > 0 {
		flush()
	}

	// Wait for scanner
	scanErr := <-done
	if scanErr != nil {
		fmt.Fprintf(os.Stderr, "Scan error: %v\n", scanErr)
	}

	fmt.Printf("\r%-120s\r", "") // clear progress line
	fmt.Printf("Scan complete: %d files, %.2f GB\n", totalFiles, float64(totalSize)/(1024*1024*1024))
	fmt.Printf("Scan ID: %s\n", scanID)

	if lostFiles > 0 || scanErr != nil {
		if lostFiles > 0 {
			fmt.Fprintf(os.Stderr, "%d files could not be inserted\n", lostFiles)
		}
		if db != nil {
			db.Close() // os.Exit skips deferred calls
		}
		os.Exit(1)
	}
}

98
queries.sql Executable file
View File

@ -0,0 +1,98 @@
-- ClickHouse queries for file inventory analysis
-- Create database (done automatically by scanner)
CREATE DATABASE IF NOT EXISTS files;
-- Schema (done automatically by scanner)
-- One row per file per scan; rows of earlier scans are kept until deleted.
CREATE TABLE IF NOT EXISTS files.inventory (
-- "<server>-<unix seconds>" identifier of the scan run that produced the row
scan_id String,
scan_time DateTime64(3),
server LowCardinality(String),
-- value of the scanner's -source flag (local, gdrive, onedrive, proton)
source LowCardinality(String),
folder String,
filename String,
-- lower-cased extension without the leading dot
ext LowCardinality(String),
size UInt64,
-- the scanner currently stores the modification time here as well
created DateTime64(3),
modified DateTime64(3),
-- empty until hashupdate has processed the file
hash String DEFAULT ''
) ENGINE = MergeTree
ORDER BY (server, folder, filename);
-- Summary by server
SELECT server, count(*) as files, formatReadableSize(sum(size)) as total_size
FROM files.inventory
GROUP BY server
ORDER BY sum(size) DESC;
-- Find exact duplicates (after running hashupdate)
-- The formatted size has its own alias (size_readable). Reusing the column
-- name `size` as an alias would make `any(size)` in ORDER BY refer to the
-- formatted string expression instead of the numeric column.
SELECT
    hash,
    count(*) AS cnt,
    formatReadableSize(any(size)) AS size_readable,
    groupArray(concat(server, ':', folder, '/', filename)) AS files
FROM files.inventory
WHERE hash != ''
GROUP BY hash
HAVING cnt > 1
ORDER BY any(size) DESC
LIMIT 100;
-- Find files by extension
-- Top 20 extensions by total bytes; files without an extension appear as ''.
SELECT ext, count(*) as cnt, formatReadableSize(sum(size)) as total
FROM files.inventory
GROUP BY ext
ORDER BY sum(size) DESC
LIMIT 20;
-- Largest files
-- The readable size is aliased as size_readable so that ORDER BY size sorts
-- on the numeric column. With the alias named `size`, ORDER BY picked up the
-- formatted string and sorted lexicographically ('99.00 KiB' > '1.00 GiB').
SELECT server, folder, filename, formatReadableSize(size) AS size_readable
FROM files.inventory
ORDER BY size DESC
LIMIT 50;
-- Find files by name pattern
SELECT server, folder, filename, formatReadableSize(size) AS size_readable
FROM files.inventory
WHERE filename ILIKE '%pattern%'
ORDER BY size DESC;
-- Compare two servers - files only on server A
-- A file counts as present on server B when B has a row with the same
-- filename and size. Written as NOT IN rather than LEFT JOIN ... IS NULL:
-- with ClickHouse's default join_use_nulls = 0 the unmatched side of a
-- LEFT JOIN is filled with default values ('' / 0), not NULL, so an IS NULL
-- filter never matches.
SELECT folder, filename, formatReadableSize(size) AS size_readable
FROM files.inventory
WHERE server = 'serverA'
  AND (filename, size) NOT IN (
      SELECT filename, size
      FROM files.inventory
      WHERE server = 'serverB'
  )
ORDER BY size DESC;
-- Files modified in last 30 days
SELECT server, folder, filename, modified, formatReadableSize(size) as size
FROM files.inventory
WHERE modified > now() - INTERVAL 30 DAY
ORDER BY modified DESC
LIMIT 100;
-- Potential duplicate sizes (before hashing)
-- Counts are taken across all servers, whereas hashupdate looks for repeated
-- sizes within a single server.
SELECT size, count(*) as cnt
FROM files.inventory
WHERE hash = '' AND size > 1000000 -- > 1MB
GROUP BY size
HAVING cnt > 1
ORDER BY size DESC
LIMIT 100;
-- Storage by folder (top level)
-- Element 2 of the '/'-split is the first component of an absolute POSIX path
-- ('/home/x' -> 'home').
-- NOTE(review): folders recorded with '\' separators (Windows) presumably all
-- end up in one empty top_folder group - confirm.
SELECT
server,
arrayElement(splitByChar('/', folder), 2) as top_folder,
count(*) as files,
formatReadableSize(sum(size)) as total
FROM files.inventory
GROUP BY server, top_folder
ORDER BY sum(size) DESC
LIMIT 50;
-- Clear old scans (keep latest per server)
-- Left commented out on purpose: it is destructive.
-- DELETE FROM files.inventory
-- WHERE (server, scan_time) NOT IN (
-- SELECT server, max(scan_time) FROM files.inventory GROUP BY server
-- );