Initial commit
This commit is contained in:
commit
e8d9a31216
Binary file not shown.
Binary file not shown.
Binary file not shown.
|
|
@ -0,0 +1,31 @@
|
||||||
|
# Binaries
|
||||||
|
*.exe
|
||||||
|
*.exe~
|
||||||
|
*.dll
|
||||||
|
*.so
|
||||||
|
*.dylib
|
||||||
|
/bin/
|
||||||
|
/dist/
|
||||||
|
|
||||||
|
# Test binary
|
||||||
|
*.test
|
||||||
|
|
||||||
|
# Output
|
||||||
|
*.out
|
||||||
|
|
||||||
|
# Dependency directories
|
||||||
|
vendor/
|
||||||
|
|
||||||
|
# IDE
|
||||||
|
.idea/
|
||||||
|
.vscode/
|
||||||
|
*.swp
|
||||||
|
*.swo
|
||||||
|
|
||||||
|
# Env files
|
||||||
|
.env
|
||||||
|
.env.*
|
||||||
|
|
||||||
|
# Databases
|
||||||
|
*.db
|
||||||
|
*.sqlite
|
||||||
|
|
@ -0,0 +1,62 @@
|
||||||
|
# Build, deploy and helper targets for the filescan / hashupdate tools.
#
# Deployment credentials are intentionally NOT stored in this file. Supply
# them from the environment or on the command line, for example:
#
#   make deploy LINUX_PASS='...' WIN_PASS='...'
#
# WIN_PASS is placed inside an SMB URL, so it must be URL-encoded
# (for example "!" becomes "%21").
#
# NOTE(review): passwords that were committed in earlier revisions of this
# file remain in the git history and should be rotated.

.PHONY: all clean mac linux windows deps deploy deploy-linux deploy-windows build run dupes

LINUX_SERVERS = 192.168.1.240 192.168.1.252 192.168.1.253 192.168.1.254
LINUX_USER = johan
# Password for LINUX_USER; taken from the environment / command line.
LINUX_PASS ?=
WIN_SERVER = 192.168.1.251
WIN_USER = johan
# URL-encoded SMB password for WIN_USER; taken from the environment / command line.
WIN_PASS ?=
# Local mount point used while copying to the Windows share.
WIN_MOUNT = /tmp/251c

# Note: "all" builds every platform AND deploys to the configured servers.
all: deps mac linux windows deploy

deps:
	go mod tidy

mac:
	GOOS=darwin GOARCH=amd64 go build -o bin/filescan-mac-amd64 .
	GOOS=darwin GOARCH=arm64 go build -o bin/filescan-mac-arm64 .
	GOOS=darwin GOARCH=amd64 go build -o bin/hashupdate-mac-amd64 ./cmd/hashupdate
	GOOS=darwin GOARCH=arm64 go build -o bin/hashupdate-mac-arm64 ./cmd/hashupdate

linux:
	GOOS=linux GOARCH=amd64 go build -o bin/filescan-linux .
	GOOS=linux GOARCH=amd64 go build -o bin/hashupdate-linux ./cmd/hashupdate

windows:
	GOOS=windows GOARCH=amd64 go build -o bin/filescan.exe .
	GOOS=windows GOARCH=amd64 go build -o bin/hashupdate.exe ./cmd/hashupdate

clean:
	rm -rf bin/

deploy: deploy-linux deploy-windows

# Copies the Linux binaries to every host in LINUX_SERVERS. Hosts that cannot
# be reached are reported as offline and do not fail the target.
# The password is handed to sshpass through the SSHPASS environment variable
# (-e) so that it does not appear in the process argument list.
deploy-linux:
	@test -n "$(LINUX_PASS)" || { echo "LINUX_PASS is not set"; exit 1; }
	@echo "Deploying to Linux servers..."
	@for server in $(LINUX_SERVERS); do \
		echo " -> $$server"; \
		SSHPASS='$(LINUX_PASS)' sshpass -e scp -o StrictHostKeyChecking=no bin/filescan-linux bin/hashupdate-linux $(LINUX_USER)@$$server:~/ 2>/dev/null || echo " [offline]"; \
	done

# Mounts the C share of WIN_SERVER (macOS mount_smbfs), copies the Windows
# binaries into the user's home directory and unmounts again.
deploy-windows:
	@test -n "$(WIN_PASS)" || { echo "WIN_PASS is not set (URL-encoded SMB password)"; exit 1; }
	@echo "Deploying to Windows server $(WIN_SERVER)..."
	@mkdir -p $(WIN_MOUNT) 2>/dev/null || true
	@umount $(WIN_MOUNT) 2>/dev/null || true
	@mount_smbfs '//$(WIN_USER):$(WIN_PASS)@$(WIN_SERVER)/C' $(WIN_MOUNT) 2>/dev/null && \
		cp bin/filescan.exe bin/hashupdate.exe $(WIN_MOUNT)/Users/$(WIN_USER)/ && \
		echo " -> $(WIN_SERVER) [ok]" && \
		umount $(WIN_MOUNT) || echo " -> $(WIN_SERVER) [failed]"

# Quick build for current platform
build:
	go build -o bin/filescan .
	go build -o bin/hashupdate ./cmd/hashupdate

# Run scanner locally (example)
run:
	go run . -server test -path /tmp -dry-run

# Find duplicates query
dupes:
	@echo "Run this in ClickHouse:"
	@echo "SELECT hash, count(*) as cnt, groupArray(concat(folder, '/', filename)) as files"
	@echo "FROM files.inventory WHERE hash != '' GROUP BY hash HAVING cnt > 1 ORDER BY cnt DESC"
|
||||||
|
|
@ -0,0 +1,67 @@
|
||||||
|
# Filescanner
|
||||||
|
|
||||||
|
Cross-platform file inventory scanner with ClickHouse backend.
|
||||||
|
|
||||||
|
## Quick Start
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# Get dependencies
|
||||||
|
go mod tidy
|
||||||
|
|
||||||
|
# Build all platforms (note: `make all` also runs the deploy targets)
|
||||||
|
make all
|
||||||
|
|
||||||
|
# Or build current platform only
|
||||||
|
make build
|
||||||
|
```
|
||||||
|
|
||||||
|
## Usage
|
||||||
|
|
||||||
|
### Scan files
|
||||||
|
```bash
|
||||||
|
# Scan with dry-run (no DB)
|
||||||
|
./filescan -server myserver -path /home -dry-run
|
||||||
|
|
||||||
|
# Scan to ClickHouse
|
||||||
|
./filescan -server myserver -path /home -ch 192.168.1.253:9000
|
||||||
|
|
||||||
|
# Verbose
|
||||||
|
./filescan -server myserver -path /home -v
|
||||||
|
```
|
||||||
|
|
||||||
|
### Add hashes for duplicate detection
|
||||||
|
```bash
|
||||||
|
# Only hashes files with non-unique sizes
|
||||||
|
./hashupdate -server myserver -ch 192.168.1.253:9000
|
||||||
|
```
|
||||||
|
|
||||||
|
### Find duplicates
|
||||||
|
```sql
|
||||||
|
SELECT hash, count(*) as cnt,
|
||||||
|
groupArray(concat(server, ':', folder, '/', filename)) as files
|
||||||
|
FROM files.inventory
|
||||||
|
WHERE hash != ''
|
||||||
|
GROUP BY hash
|
||||||
|
HAVING cnt > 1
|
||||||
|
ORDER BY any(size) DESC;
|
||||||
|
```
|
||||||
|
|
||||||
|
## Binaries
|
||||||
|
|
||||||
|
After `make all`:
|
||||||
|
- `bin/filescan-mac-arm64` - Mac M1/M2/M3
|
||||||
|
- `bin/filescan-mac-amd64` - Mac Intel
|
||||||
|
- `bin/filescan-linux` - Linux
|
||||||
|
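- `bin/filescan.exe` - Windows
- `bin/hashupdate-mac-arm64`, `bin/hashupdate-mac-amd64`, `bin/hashupdate-linux`, `bin/hashupdate.exe` - hash updater for the same platforms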
|
||||||
|
|
||||||
|
## Excluded Directories
|
||||||
|
|
||||||
|
Automatically skips:
|
||||||
|
- Windows: `$RECYCLE.BIN`, `System Volume Information`, `Windows`
|
||||||
|
- macOS: `.Spotlight-V100`, `.fseventsd`, and the roots `/System`, `/Library`, `/private/var`, `/Volumes/.timemachine`
|
||||||
|
- Linux: `/proc`, `/sys`, `/dev`, `/run`, `/snap`, `/boot`, `lost+found`
|
||||||
|
- Common: `node_modules`, `.git`, `__pycache__`, and directories with `cache` in their name (case-insensitive)
|
||||||
|
|
||||||
|
## ClickHouse Schema
|
||||||
|
|
||||||
|
See `queries.sql` for schema and useful queries.
|
||||||
Binary file not shown.
Binary file not shown.
|
|
@ -0,0 +1,145 @@
|
||||||
|
package main
|
||||||
|
|
||||||
|
import (
|
||||||
|
"crypto/md5"
|
||||||
|
"database/sql"
|
||||||
|
"flag"
|
||||||
|
"fmt"
|
||||||
|
"io"
|
||||||
|
"os"
|
||||||
|
"path/filepath"
|
||||||
|
|
||||||
|
_ "github.com/ClickHouse/clickhouse-go/v2"
|
||||||
|
)
|
||||||
|
|
||||||
|
var (
|
||||||
|
serverName = flag.String("server", "", "Server name to process")
|
||||||
|
chHost = flag.String("ch", "192.168.1.253:9000", "ClickHouse host:port")
|
||||||
|
verbose = flag.Bool("v", false, "Verbose output")
|
||||||
|
)
|
||||||
|
|
||||||
|
func quickHash(path string) (string, error) {
|
||||||
|
f, err := os.Open(path)
|
||||||
|
if err != nil {
|
||||||
|
return "", err
|
||||||
|
}
|
||||||
|
defer f.Close()
|
||||||
|
|
||||||
|
stat, err := f.Stat()
|
||||||
|
if err != nil {
|
||||||
|
return "", err
|
||||||
|
}
|
||||||
|
size := stat.Size()
|
||||||
|
|
||||||
|
h := md5.New()
|
||||||
|
buf := make([]byte, 65536)
|
||||||
|
|
||||||
|
n, err := f.Read(buf)
|
||||||
|
if err != nil && err != io.EOF {
|
||||||
|
return "", err
|
||||||
|
}
|
||||||
|
h.Write(buf[:n])
|
||||||
|
|
||||||
|
if size > 131072 {
|
||||||
|
_, err = f.Seek(-65536, io.SeekEnd)
|
||||||
|
if err != nil {
|
||||||
|
return "", err
|
||||||
|
}
|
||||||
|
n, err = f.Read(buf)
|
||||||
|
if err != nil && err != io.EOF {
|
||||||
|
return "", err
|
||||||
|
}
|
||||||
|
h.Write(buf[:n])
|
||||||
|
}
|
||||||
|
|
||||||
|
return fmt.Sprintf("%x", h.Sum(nil)), nil
|
||||||
|
}
|
||||||
|
|
||||||
|
func main() {
|
||||||
|
flag.Parse()
|
||||||
|
|
||||||
|
if *serverName == "" {
|
||||||
|
fmt.Fprintln(os.Stderr, "Server name required: -server <n>")
|
||||||
|
os.Exit(1)
|
||||||
|
}
|
||||||
|
|
||||||
|
dsn := fmt.Sprintf("clickhouse://%s/files", *chHost)
|
||||||
|
db, err := sql.Open("clickhouse", dsn)
|
||||||
|
if err != nil {
|
||||||
|
fmt.Fprintf(os.Stderr, "DB error: %v\n", err)
|
||||||
|
os.Exit(1)
|
||||||
|
}
|
||||||
|
defer db.Close()
|
||||||
|
|
||||||
|
// Find sizes that appear more than once (potential dupes)
|
||||||
|
rows, err := db.Query(`
|
||||||
|
SELECT DISTINCT size
|
||||||
|
FROM files.inventory
|
||||||
|
WHERE server = ? AND hash = '' AND size > 0
|
||||||
|
GROUP BY size
|
||||||
|
HAVING count(*) > 1
|
||||||
|
ORDER BY size DESC
|
||||||
|
`, *serverName)
|
||||||
|
if err != nil {
|
||||||
|
fmt.Fprintf(os.Stderr, "Query error: %v\n", err)
|
||||||
|
os.Exit(1)
|
||||||
|
}
|
||||||
|
|
||||||
|
var sizes []int64
|
||||||
|
for rows.Next() {
|
||||||
|
var size int64
|
||||||
|
rows.Scan(&size)
|
||||||
|
sizes = append(sizes, size)
|
||||||
|
}
|
||||||
|
rows.Close()
|
||||||
|
|
||||||
|
fmt.Printf("Found %d file sizes with potential duplicates\n", len(sizes))
|
||||||
|
|
||||||
|
// Get files to hash
|
||||||
|
var totalHashed int64
|
||||||
|
for _, size := range sizes {
|
||||||
|
fileRows, err := db.Query(`
|
||||||
|
SELECT folder, filename
|
||||||
|
FROM files.inventory
|
||||||
|
WHERE server = ? AND size = ? AND hash = ''
|
||||||
|
`, *serverName, size)
|
||||||
|
if err != nil {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
|
||||||
|
for fileRows.Next() {
|
||||||
|
var folder, filename string
|
||||||
|
fileRows.Scan(&folder, &filename)
|
||||||
|
|
||||||
|
fullPath := filepath.Join(folder, filename)
|
||||||
|
hash, err := quickHash(fullPath)
|
||||||
|
if err != nil {
|
||||||
|
if *verbose {
|
||||||
|
fmt.Fprintf(os.Stderr, "Hash error %s: %v\n", fullPath, err)
|
||||||
|
}
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
|
||||||
|
_, err = db.Exec(`
|
||||||
|
ALTER TABLE files.inventory
|
||||||
|
UPDATE hash = ?
|
||||||
|
WHERE server = ? AND folder = ? AND filename = ?
|
||||||
|
`, hash, *serverName, folder, filename)
|
||||||
|
|
||||||
|
if err != nil {
|
||||||
|
if *verbose {
|
||||||
|
fmt.Fprintf(os.Stderr, "Update error: %v\n", err)
|
||||||
|
}
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
|
||||||
|
totalHashed++
|
||||||
|
if *verbose {
|
||||||
|
fmt.Printf("Hashed: %s -> %s\n", fullPath, hash)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
fileRows.Close()
|
||||||
|
}
|
||||||
|
|
||||||
|
fmt.Printf("Hashed %d files\n", totalHashed)
|
||||||
|
}
|
||||||
|
|
@ -0,0 +1,23 @@
|
||||||
|
module filescanner
|
||||||
|
|
||||||
|
go 1.22
|
||||||
|
|
||||||
|
require github.com/ClickHouse/clickhouse-go/v2 v2.23.0
|
||||||
|
|
||||||
|
require (
|
||||||
|
github.com/ClickHouse/ch-go v0.61.5 // indirect
|
||||||
|
github.com/andybalholm/brotli v1.1.0 // indirect
|
||||||
|
github.com/go-faster/city v1.0.1 // indirect
|
||||||
|
github.com/go-faster/errors v0.7.1 // indirect
|
||||||
|
github.com/google/uuid v1.6.0 // indirect
|
||||||
|
github.com/klauspost/compress v1.17.7 // indirect
|
||||||
|
github.com/paulmach/orb v0.11.1 // indirect
|
||||||
|
github.com/pierrec/lz4/v4 v4.1.21 // indirect
|
||||||
|
github.com/pkg/errors v0.9.1 // indirect
|
||||||
|
github.com/segmentio/asm v1.2.0 // indirect
|
||||||
|
github.com/shopspring/decimal v1.3.1 // indirect
|
||||||
|
go.opentelemetry.io/otel v1.24.0 // indirect
|
||||||
|
go.opentelemetry.io/otel/trace v1.24.0 // indirect
|
||||||
|
golang.org/x/sys v0.18.0 // indirect
|
||||||
|
gopkg.in/yaml.v3 v3.0.1 // indirect
|
||||||
|
)
|
||||||
|
|
@ -0,0 +1,110 @@
|
||||||
|
github.com/ClickHouse/ch-go v0.61.5 h1:zwR8QbYI0tsMiEcze/uIMK+Tz1D3XZXLdNrlaOpeEI4=
|
||||||
|
github.com/ClickHouse/ch-go v0.61.5/go.mod h1:s1LJW/F/LcFs5HJnuogFMta50kKDO0lf9zzfrbl0RQg=
|
||||||
|
github.com/ClickHouse/clickhouse-go/v2 v2.23.0 h1:srmRrkS0BR8gEut87u8jpcZ7geOob6nGj9ifrb+aKmg=
|
||||||
|
github.com/ClickHouse/clickhouse-go/v2 v2.23.0/go.mod h1:tBhdF3f3RdP7sS59+oBAtTyhWpy0024ZxDMhgxra0QE=
|
||||||
|
github.com/andybalholm/brotli v1.1.0 h1:eLKJA0d02Lf0mVpIDgYnqXcUn0GqVmEFny3VuID1U3M=
|
||||||
|
github.com/andybalholm/brotli v1.1.0/go.mod h1:sms7XGricyQI9K10gOSf56VKKWS4oLer58Q+mhRPtnY=
|
||||||
|
github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
|
||||||
|
github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c=
|
||||||
|
github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
|
||||||
|
github.com/go-faster/city v1.0.1 h1:4WAxSZ3V2Ws4QRDrscLEDcibJY8uf41H6AhXDrNDcGw=
|
||||||
|
github.com/go-faster/city v1.0.1/go.mod h1:jKcUJId49qdW3L1qKHH/3wPeUstCVpVSXTM6vO3VcTw=
|
||||||
|
github.com/go-faster/errors v0.7.1 h1:MkJTnDoEdi9pDabt1dpWf7AA8/BaSYZqibYyhZ20AYg=
|
||||||
|
github.com/go-faster/errors v0.7.1/go.mod h1:5ySTjWFiphBs07IKuiL69nxdfd5+fzh1u7FPGZP2quo=
|
||||||
|
github.com/gogo/protobuf v1.3.2/go.mod h1:P1XiOD3dCwIKUDQYPy72D8LYyHL2YPYrpS2s69NZV8Q=
|
||||||
|
github.com/golang/protobuf v1.5.0/go.mod h1:FsONVRAS9T7sI+LIUmWTfcYkHO4aIWwzhcaSAoJOfIk=
|
||||||
|
github.com/golang/snappy v0.0.1/go.mod h1:/XxbfmMg8lxefKM7IXC3fBNl/7bRcc72aCRzEWrmP2Q=
|
||||||
|
github.com/google/go-cmp v0.5.2/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE=
|
||||||
|
github.com/google/go-cmp v0.5.5/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE=
|
||||||
|
github.com/google/go-cmp v0.6.0 h1:ofyhxvXcZhMsU5ulbFiLKl/XBFqE1GSq7atu8tAmTRI=
|
||||||
|
github.com/google/go-cmp v0.6.0/go.mod h1:17dUlkBOakJ0+DkrSSNjCkIjxS6bF9zb3elmeNGIjoY=
|
||||||
|
github.com/google/uuid v1.6.0 h1:NIvaJDMOsjHA8n1jAhLSgzrAzy1Hgr+hNrb57e+94F0=
|
||||||
|
github.com/google/uuid v1.6.0/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo=
|
||||||
|
github.com/kisielk/errcheck v1.5.0/go.mod h1:pFxgyoBC7bSaBwPgfKdkLd5X25qrDl4LWUI2bnpBCr8=
|
||||||
|
github.com/kisielk/gotool v1.0.0/go.mod h1:XhKaO+MFFWcvkIS/tQcRk01m1F5IRFswLeQ+oQHNcck=
|
||||||
|
github.com/klauspost/compress v1.13.6/go.mod h1:/3/Vjq9QcHkK5uEr5lBEmyoZ1iFhe47etQ6QUkpK6sk=
|
||||||
|
github.com/klauspost/compress v1.17.7 h1:ehO88t2UGzQK66LMdE8tibEd1ErmzZjNEqWkjLAKQQg=
|
||||||
|
github.com/klauspost/compress v1.17.7/go.mod h1:Di0epgTjJY877eYKx5yC51cX2A2Vl2ibi7bDH9ttBbw=
|
||||||
|
github.com/kr/pretty v0.1.0/go.mod h1:dAy3ld7l9f0ibDNOQOHHMYYIIbhfbHSm3C4ZsoJORNo=
|
||||||
|
github.com/kr/pretty v0.3.1 h1:flRD4NNwYAUpkphVc1HcthR4KEIFJ65n8Mw5qdRn3LE=
|
||||||
|
github.com/kr/pretty v0.3.1/go.mod h1:hoEshYVHaxMs3cyo3Yncou5ZscifuDolrwPKZanG3xk=
|
||||||
|
github.com/kr/pty v1.1.1/go.mod h1:pFQYn66WHrOpPYNljwOMqo10TkYh1fy3cYio2l3bCsQ=
|
||||||
|
github.com/kr/text v0.1.0 h1:45sCR5RtlFHMR4UwH9sdQ5TC8v0qDQCHnXt+kaKSTVE=
|
||||||
|
github.com/kr/text v0.1.0/go.mod h1:4Jbv+DJW3UT/LiOwJeYQe1efqtUx/iVham/4vfdArNI=
|
||||||
|
github.com/montanaflynn/stats v0.0.0-20171201202039-1bf9dbcd8cbe/go.mod h1:wL8QJuTMNUDYhXwkmfOly8iTdp5TEcJFWZD2D7SIkUc=
|
||||||
|
github.com/paulmach/orb v0.11.1 h1:3koVegMC4X/WeiXYz9iswopaTwMem53NzTJuTF20JzU=
|
||||||
|
github.com/paulmach/orb v0.11.1/go.mod h1:5mULz1xQfs3bmQm63QEJA6lNGujuRafwA5S/EnuLaLU=
|
||||||
|
github.com/paulmach/protoscan v0.2.1/go.mod h1:SpcSwydNLrxUGSDvXvO0P7g7AuhJ7lcKfDlhJCDw2gY=
|
||||||
|
github.com/pierrec/lz4/v4 v4.1.21 h1:yOVMLb6qSIDP67pl/5F7RepeKYu/VmTyEXvuMI5d9mQ=
|
||||||
|
github.com/pierrec/lz4/v4 v4.1.21/go.mod h1:gZWDp/Ze/IJXGXf23ltt2EXimqmTUXEy0GFuRQyBid4=
|
||||||
|
github.com/pkg/errors v0.9.1 h1:FEBLx1zS214owpjy7qsBeixbURkuhQAwrK5UwLGTwt4=
|
||||||
|
github.com/pkg/errors v0.9.1/go.mod h1:bwawxfHBFNV+L2hUp1rHADufV3IMtnDRdf1r5NINEl0=
|
||||||
|
github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM=
|
||||||
|
github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4=
|
||||||
|
github.com/rogpeppe/go-internal v1.10.0 h1:TMyTOH3F/DB16zRVcYyreMH6GnZZrwQVAoYjRBZyWFQ=
|
||||||
|
github.com/rogpeppe/go-internal v1.10.0/go.mod h1:UQnix2H7Ngw/k4C5ijL5+65zddjncjaFoBhdsK/akog=
|
||||||
|
github.com/segmentio/asm v1.2.0 h1:9BQrFxC+YOHJlTlHGkTrFWf59nbL3XnCoFLTwDCI7ys=
|
||||||
|
github.com/segmentio/asm v1.2.0/go.mod h1:BqMnlJP91P8d+4ibuonYZw9mfnzI9HfxselHZr5aAcs=
|
||||||
|
github.com/shopspring/decimal v1.3.1 h1:2Usl1nmF/WZucqkFZhnfFYxxxu8LG21F6nPQBE5gKV8=
|
||||||
|
github.com/shopspring/decimal v1.3.1/go.mod h1:DKyhrW/HYNuLGql+MJL6WCR6knT2jwCFRcu2hWCYk4o=
|
||||||
|
github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME=
|
||||||
|
github.com/stretchr/testify v1.6.1/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg=
|
||||||
|
github.com/stretchr/testify v1.9.0 h1:HtqpIVDClZ4nwg75+f6Lvsy/wHu+3BoSGCbBAcpTsTg=
|
||||||
|
github.com/stretchr/testify v1.9.0/go.mod h1:r2ic/lqez/lEtzL7wO/rwa5dbSLXVDPFyf8C91i36aY=
|
||||||
|
github.com/tidwall/pretty v1.0.0/go.mod h1:XNkn88O1ChpSDQmQeStsy+sBenx6DDtFZJxhVysOjyk=
|
||||||
|
github.com/xdg-go/pbkdf2 v1.0.0/go.mod h1:jrpuAogTd400dnrH08LKmI/xc1MbPOebTwRqcT5RDeI=
|
||||||
|
github.com/xdg-go/scram v1.1.1/go.mod h1:RaEWvsqvNKKvBPvcKeFjrG2cJqOkHTiyTpzz23ni57g=
|
||||||
|
github.com/xdg-go/stringprep v1.0.3/go.mod h1:W3f5j4i+9rC0kuIEJL0ky1VpHXQU3ocBgklLGvcBnW8=
|
||||||
|
github.com/youmark/pkcs8 v0.0.0-20181117223130-1be2e3e5546d/go.mod h1:rHwXgn7JulP+udvsHwJoVG1YGAP6VLg4y9I5dyZdqmA=
|
||||||
|
github.com/yuin/goldmark v1.1.27/go.mod h1:3hX8gzYuyVAZsxl0MRgGTJEmQBFcNTphYh9decYSb74=
|
||||||
|
github.com/yuin/goldmark v1.2.1/go.mod h1:3hX8gzYuyVAZsxl0MRgGTJEmQBFcNTphYh9decYSb74=
|
||||||
|
go.mongodb.org/mongo-driver v1.11.4/go.mod h1:PTSz5yu21bkT/wXpkS7WR5f0ddqw5quethTUn9WM+2g=
|
||||||
|
go.opentelemetry.io/otel v1.24.0 h1:0LAOdjNmQeSTzGBzduGe/rU4tZhMwL5rWgtp9Ku5Jfo=
|
||||||
|
go.opentelemetry.io/otel v1.24.0/go.mod h1:W7b9Ozg4nkF5tWI5zsXkaKKDjdVjpD4oAt9Qi/MArHo=
|
||||||
|
go.opentelemetry.io/otel/trace v1.24.0 h1:CsKnnL4dUAr/0llH9FKuc698G04IrpWV0MQA/Y1YELI=
|
||||||
|
go.opentelemetry.io/otel/trace v1.24.0/go.mod h1:HPc3Xr/cOApsBI154IU0OI0HJexz+aw5uPdbs3UCjNU=
|
||||||
|
golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w=
|
||||||
|
golang.org/x/crypto v0.0.0-20191011191535-87dc89f01550/go.mod h1:yigFU9vqHzYiE8UmvKecakEJjdnWj3jj499lnFckfCI=
|
||||||
|
golang.org/x/crypto v0.0.0-20200622213623-75b288015ac9/go.mod h1:LzIPMQfyMNhhGPhUkYOs5KpL4U8rLKemX1yGLhDgUto=
|
||||||
|
golang.org/x/crypto v0.0.0-20220622213112-05595931fe9d/go.mod h1:IxCIyHEi3zRg3s0A5j5BB6A9Jmi73HwBIUl50j+osU4=
|
||||||
|
golang.org/x/mod v0.2.0/go.mod h1:s0Qsj1ACt9ePp/hMypM3fl4fZqREWJwdYDEqhRiZZUA=
|
||||||
|
golang.org/x/mod v0.3.0/go.mod h1:s0Qsj1ACt9ePp/hMypM3fl4fZqREWJwdYDEqhRiZZUA=
|
||||||
|
golang.org/x/net v0.0.0-20190404232315-eb5bcb51f2a3/go.mod h1:t9HGtf8HONx5eT2rtn7q6eTqICYqUVnKs3thJo3Qplg=
|
||||||
|
golang.org/x/net v0.0.0-20190620200207-3b0461eec859/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s=
|
||||||
|
golang.org/x/net v0.0.0-20200226121028-0de0cce0169b/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s=
|
||||||
|
golang.org/x/net v0.0.0-20201021035429-f5854403a974/go.mod h1:sp8m0HH+o8qH0wwXwYZr8TS3Oi6o0r6Gce1SSxlDquU=
|
||||||
|
golang.org/x/net v0.0.0-20211112202133-69e39bad7dc2/go.mod h1:9nx3DQGgdP8bBQD5qxJ1jj9UTztislL4KSBs9R2vV5Y=
|
||||||
|
golang.org/x/sync v0.0.0-20190423024810-112230192c58/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
|
||||||
|
golang.org/x/sync v0.0.0-20190911185100-cd5d95a43a6e/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
|
||||||
|
golang.org/x/sync v0.0.0-20201020160332-67f06af15bc9/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
|
||||||
|
golang.org/x/sync v0.0.0-20210220032951-036812b2e83c/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
|
||||||
|
golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY=
|
||||||
|
golang.org/x/sys v0.0.0-20190412213103-97732733099d/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
|
||||||
|
golang.org/x/sys v0.0.0-20200930185726-fdedc70b468f/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
|
||||||
|
golang.org/x/sys v0.0.0-20201119102817-f84b799fce68/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
|
||||||
|
golang.org/x/sys v0.0.0-20210423082822-04245dca01da/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
|
||||||
|
golang.org/x/sys v0.0.0-20210615035016-665e8c7367d1/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
|
||||||
|
golang.org/x/sys v0.18.0 h1:DBdB3niSjOA/O0blCZBqDefyWNYveAYMNF1Wum0DYQ4=
|
||||||
|
golang.org/x/sys v0.18.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA=
|
||||||
|
golang.org/x/term v0.0.0-20201126162022-7de9c90e9dd1/go.mod h1:bj7SfCRtBDWHUb9snDiAeCFNEtKQo2Wmx5Cou7ajbmo=
|
||||||
|
golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ=
|
||||||
|
golang.org/x/text v0.3.3/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ=
|
||||||
|
golang.org/x/text v0.3.6/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ=
|
||||||
|
golang.org/x/text v0.3.7/go.mod h1:u+2+/6zg+i71rQMx5EYifcz6MCKuco9NR6JIITiCfzQ=
|
||||||
|
golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ=
|
||||||
|
golang.org/x/tools v0.0.0-20191119224855-298f0cb1881e/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo=
|
||||||
|
golang.org/x/tools v0.0.0-20200619180055-7c47624df98f/go.mod h1:EkVYQZoAsY45+roYkvgYkIh4xh/qjgUK9TdY2XT94GE=
|
||||||
|
golang.org/x/tools v0.0.0-20210106214847-113979e3529a/go.mod h1:emZCQorbCU4vsT4fOWvOPXz4eW1wZW4PmDk9uLelYpA=
|
||||||
|
golang.org/x/xerrors v0.0.0-20190717185122-a985d3407aa7/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0=
|
||||||
|
golang.org/x/xerrors v0.0.0-20191011141410-1b5146add898/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0=
|
||||||
|
golang.org/x/xerrors v0.0.0-20191204190536-9bdfabe68543/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0=
|
||||||
|
golang.org/x/xerrors v0.0.0-20200804184101-5ec99f83aff1/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0=
|
||||||
|
google.golang.org/protobuf v1.26.0-rc.1/go.mod h1:jlhhOSvTdKEhbULTjvd4ARK9grFBp09yW+WbY/TyQbw=
|
||||||
|
google.golang.org/protobuf v1.27.1/go.mod h1:9q0QmTI4eRPtz6boOQmLYwt+qCgq0jsYwAQnmE0givc=
|
||||||
|
gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0=
|
||||||
|
gopkg.in/check.v1 v1.0.0-20180628173108-788fd7840127/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0=
|
||||||
|
gopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c h1:Hei/4ADfdWqJk1ZMxUNpqntNwaWcugrBjAiHlqqRiVk=
|
||||||
|
gopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c/go.mod h1:JHkPIbrfpd72SG/EVd6muEfDQjcINNoR0C8j2r3qZ4Q=
|
||||||
|
gopkg.in/yaml.v3 v3.0.0-20200313102051-9f266ea9e77c/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM=
|
||||||
|
gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA=
|
||||||
|
gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM=
|
||||||
|
|
@ -0,0 +1,348 @@
|
||||||
|
package main
|
||||||
|
|
||||||
|
import (
|
||||||
|
"crypto/md5"
|
||||||
|
"database/sql"
|
||||||
|
"flag"
|
||||||
|
"fmt"
|
||||||
|
"io"
|
||||||
|
"os"
|
||||||
|
"path/filepath"
|
||||||
|
"runtime"
|
||||||
|
"strings"
|
||||||
|
"time"
|
||||||
|
|
||||||
|
_ "github.com/ClickHouse/clickhouse-go/v2"
|
||||||
|
)
|
||||||
|
|
||||||
|
var (
|
||||||
|
serverName = flag.String("server", "", "Server name (required)")
|
||||||
|
rootPath = flag.String("path", ".", "Path to scan")
|
||||||
|
chHost = flag.String("ch", "192.168.1.253:9000", "ClickHouse host:port")
|
||||||
|
source = flag.String("source", "local", "Source type: local, gdrive, onedrive, proton")
|
||||||
|
dryRun = flag.Bool("dry-run", false, "Print files without inserting to DB")
|
||||||
|
verbose = flag.Bool("v", false, "Verbose output")
|
||||||
|
)
|
||||||
|
|
||||||
|
// excludeDirs is the set of directory base names that scanFiles refuses to
// descend into (conservative - only true junk). The lookup is by base name,
// so an entry applies at any depth of the scanned tree.
//
// NOTE(review): because matching is by name only, ordinary user folders
// called "dev", "run", "sys", "proc" or "Windows" are skipped too — confirm
// this is intended.
var excludeDirs = func() map[string]bool {
	names := []string{
		// Windows system
		"$RECYCLE.BIN", "System Volume Information", "Windows",
		// macOS system
		".Spotlight-V100", ".fseventsd",
		// Linux system
		"proc", "sys", "dev", "run", "lost+found",
		// Dev artifacts (large, reproducible)
		"node_modules", ".git", "__pycache__",
	}
	set := make(map[string]bool, len(names))
	for _, name := range names {
		set[name] = true
	}
	return set
}()
|
||||||
|
|
||||||
|
// Additional root-level excludes per OS
|
||||||
|
func shouldExcludeRoot(path string) bool {
|
||||||
|
switch runtime.GOOS {
|
||||||
|
case "darwin":
|
||||||
|
excludeRoots := []string{"/System", "/Library", "/private/var", "/Volumes/.timemachine"}
|
||||||
|
for _, ex := range excludeRoots {
|
||||||
|
if strings.HasPrefix(path, ex) {
|
||||||
|
return true
|
||||||
|
}
|
||||||
|
}
|
||||||
|
case "linux":
|
||||||
|
excludeRoots := []string{"/proc", "/sys", "/dev", "/run", "/snap", "/boot"}
|
||||||
|
for _, ex := range excludeRoots {
|
||||||
|
if strings.HasPrefix(path, ex) {
|
||||||
|
return true
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return false
|
||||||
|
}
|
||||||
|
|
||||||
|
type FileEntry struct {
|
||||||
|
Server string
|
||||||
|
Source string
|
||||||
|
Folder string
|
||||||
|
Filename string
|
||||||
|
Ext string
|
||||||
|
Size int64
|
||||||
|
Created time.Time
|
||||||
|
Modified time.Time
|
||||||
|
Hash string
|
||||||
|
}
|
||||||
|
|
||||||
|
// currentFolder holds the directory most recently visited by scanFiles
// (prefixed with "[skip] " when that directory was excluded). main reads it
// to render the progress line.
// NOTE(review): it is written from the scanner goroutine and read from main
// without synchronization — confirm whether this race is acceptable.
var currentFolder string
|
||||||
|
|
||||||
|
// truncatePath shortens path for display so that it is at most maxLen
// characters long.
//
// Paths that already fit are returned unchanged. Longer paths keep their
// beginning and end with "..." in between. When maxLen is too small to hold
// the ellipsis (maxLen < 3) the first maxLen characters are returned, and a
// non-positive maxLen yields the empty string.
//
// Lengths are counted in runes, not bytes, so multi-byte UTF-8 characters
// are never cut in half.
func truncatePath(path string, maxLen int) string {
	runes := []rune(path)
	if len(runes) <= maxLen {
		return path
	}
	if maxLen <= 0 {
		return ""
	}
	if maxLen < 3 {
		// No room for "..."; (maxLen-3)/2 would be negative and panic below.
		return string(runes[:maxLen])
	}

	// Show beginning and end
	half := (maxLen - 3) / 2
	return string(runes[:half]) + "..." + string(runes[len(runes)-half:])
}
|
||||||
|
|
||||||
|
func quickHash(path string) (string, error) {
|
||||||
|
f, err := os.Open(path)
|
||||||
|
if err != nil {
|
||||||
|
return "", err
|
||||||
|
}
|
||||||
|
defer f.Close()
|
||||||
|
|
||||||
|
stat, err := f.Stat()
|
||||||
|
if err != nil {
|
||||||
|
return "", err
|
||||||
|
}
|
||||||
|
size := stat.Size()
|
||||||
|
|
||||||
|
h := md5.New()
|
||||||
|
buf := make([]byte, 65536)
|
||||||
|
|
||||||
|
// First 64KB
|
||||||
|
n, err := f.Read(buf)
|
||||||
|
if err != nil && err != io.EOF {
|
||||||
|
return "", err
|
||||||
|
}
|
||||||
|
h.Write(buf[:n])
|
||||||
|
|
||||||
|
// Last 64KB (if file > 128KB)
|
||||||
|
if size > 131072 {
|
||||||
|
_, err = f.Seek(-65536, io.SeekEnd)
|
||||||
|
if err != nil {
|
||||||
|
return "", err
|
||||||
|
}
|
||||||
|
n, err = f.Read(buf)
|
||||||
|
if err != nil && err != io.EOF {
|
||||||
|
return "", err
|
||||||
|
}
|
||||||
|
h.Write(buf[:n])
|
||||||
|
}
|
||||||
|
|
||||||
|
return fmt.Sprintf("%x", h.Sum(nil)), nil
|
||||||
|
}
|
||||||
|
|
||||||
|
func getFileTimes(info os.FileInfo, path string) (created, modified time.Time) {
|
||||||
|
modified = info.ModTime()
|
||||||
|
created = modified // fallback - platform-specific code can override
|
||||||
|
// Note: Getting birth time is OS-specific and complex
|
||||||
|
// For MVP, we use modified time for both
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
|
func scanFiles(root string, entries chan<- FileEntry) error {
|
||||||
|
return filepath.Walk(root, func(path string, info os.FileInfo, err error) error {
|
||||||
|
if err != nil {
|
||||||
|
if *verbose {
|
||||||
|
fmt.Fprintf(os.Stderr, "Error accessing %s: %v\n", path, err)
|
||||||
|
}
|
||||||
|
return nil // continue scanning
|
||||||
|
}
|
||||||
|
|
||||||
|
// Skip excluded directories
|
||||||
|
if info.IsDir() {
|
||||||
|
name := info.Name()
|
||||||
|
if excludeDirs[name] {
|
||||||
|
currentFolder = "[skip] " + path
|
||||||
|
return filepath.SkipDir
|
||||||
|
}
|
||||||
|
if shouldExcludeRoot(path) {
|
||||||
|
currentFolder = "[skip] " + path
|
||||||
|
return filepath.SkipDir
|
||||||
|
}
|
||||||
|
if strings.Contains(strings.ToLower(path), "cache") {
|
||||||
|
currentFolder = "[skip] " + path
|
||||||
|
return filepath.SkipDir
|
||||||
|
}
|
||||||
|
currentFolder = path
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
|
||||||
|
// Skip symlinks
|
||||||
|
if info.Mode()&os.ModeSymlink != 0 {
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
|
||||||
|
// Skip empty files
|
||||||
|
if info.Size() == 0 {
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
|
||||||
|
created, modified := getFileTimes(info, path)
|
||||||
|
ext := strings.ToLower(filepath.Ext(info.Name()))
|
||||||
|
if ext != "" {
|
||||||
|
ext = ext[1:] // remove leading dot
|
||||||
|
}
|
||||||
|
|
||||||
|
entry := FileEntry{
|
||||||
|
Server: *serverName,
|
||||||
|
Source: *source,
|
||||||
|
Folder: filepath.Dir(path),
|
||||||
|
Filename: info.Name(),
|
||||||
|
Ext: ext,
|
||||||
|
Size: info.Size(),
|
||||||
|
Created: created,
|
||||||
|
Modified: modified,
|
||||||
|
}
|
||||||
|
|
||||||
|
entries <- entry
|
||||||
|
return nil
|
||||||
|
})
|
||||||
|
}
|
||||||
|
|
||||||
|
// initDB opens a ClickHouse connection to host (host:port) and makes sure
// the "files" database and the "files.inventory" table exist.
//
// On success the caller owns the returned handle and must Close it. When
// schema creation fails the handle is closed before the error is returned,
// so nothing leaks.
//
// NOTE(review): the DSN already selects the "files" database before it is
// created here; verify that a fresh server accepts the connection.
func initDB(host string) (*sql.DB, error) {
	dsn := fmt.Sprintf("clickhouse://%s/files", host)
	db, err := sql.Open("clickhouse", dsn)
	if err != nil {
		return nil, err
	}

	// Create database and table
	if _, err = db.Exec(`CREATE DATABASE IF NOT EXISTS files`); err != nil {
		db.Close()
		return nil, fmt.Errorf("create database: %w", err)
	}

	_, err = db.Exec(`
		CREATE TABLE IF NOT EXISTS files.inventory (
			scan_id String,
			scan_time DateTime64(3),
			server LowCardinality(String),
			source LowCardinality(String),
			folder String,
			filename String,
			ext LowCardinality(String),
			size UInt64,
			created DateTime64(3),
			modified DateTime64(3),
			hash String DEFAULT ''
		) ENGINE = MergeTree
		ORDER BY (server, folder, filename)
	`)
	if err != nil {
		db.Close()
		return nil, fmt.Errorf("create table: %w", err)
	}

	return db, nil
}
|
||||||
|
|
||||||
|
func insertBatch(db *sql.DB, scanID string, scanTime time.Time, entries []FileEntry) error {
|
||||||
|
if len(entries) == 0 {
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
|
||||||
|
tx, err := db.Begin()
|
||||||
|
if err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
|
||||||
|
stmt, err := tx.Prepare(`
|
||||||
|
INSERT INTO files.inventory
|
||||||
|
(scan_id, scan_time, server, source, folder, filename, ext, size, created, modified)
|
||||||
|
VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
|
||||||
|
`)
|
||||||
|
if err != nil {
|
||||||
|
tx.Rollback()
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
defer stmt.Close()
|
||||||
|
|
||||||
|
for _, e := range entries {
|
||||||
|
_, err = stmt.Exec(scanID, scanTime, e.Server, e.Source, e.Folder, e.Filename, e.Ext, e.Size, e.Created, e.Modified)
|
||||||
|
if err != nil {
|
||||||
|
tx.Rollback()
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return tx.Commit()
|
||||||
|
}
|
||||||
|
|
||||||
|
func main() {
|
||||||
|
flag.Parse()
|
||||||
|
|
||||||
|
if *serverName == "" {
|
||||||
|
hostname, _ := os.Hostname()
|
||||||
|
*serverName = hostname
|
||||||
|
}
|
||||||
|
|
||||||
|
absPath, err := filepath.Abs(*rootPath)
|
||||||
|
if err != nil {
|
||||||
|
fmt.Fprintf(os.Stderr, "Invalid path: %v\n", err)
|
||||||
|
os.Exit(1)
|
||||||
|
}
|
||||||
|
|
||||||
|
fmt.Printf("Scanning: %s on server %s\n", absPath, *serverName)
|
||||||
|
|
||||||
|
var db *sql.DB
|
||||||
|
if !*dryRun {
|
||||||
|
db, err = initDB(*chHost)
|
||||||
|
if err != nil {
|
||||||
|
fmt.Fprintf(os.Stderr, "Database error: %v\n", err)
|
||||||
|
os.Exit(1)
|
||||||
|
}
|
||||||
|
defer db.Close()
|
||||||
|
}
|
||||||
|
|
||||||
|
scanID := fmt.Sprintf("%s-%d", *serverName, time.Now().Unix())
|
||||||
|
scanTime := time.Now()
|
||||||
|
|
||||||
|
entries := make(chan FileEntry, 1000)
|
||||||
|
done := make(chan error)
|
||||||
|
|
||||||
|
// Scanner goroutine
|
||||||
|
go func() {
|
||||||
|
err := scanFiles(absPath, entries)
|
||||||
|
close(entries)
|
||||||
|
done <- err
|
||||||
|
}()
|
||||||
|
|
||||||
|
// Collector
|
||||||
|
var batch []FileEntry
|
||||||
|
var totalFiles int64
|
||||||
|
var totalSize int64
|
||||||
|
batchSize := 1000
|
||||||
|
|
||||||
|
for entry := range entries {
|
||||||
|
totalFiles++
|
||||||
|
totalSize += entry.Size
|
||||||
|
|
||||||
|
if *dryRun {
|
||||||
|
fmt.Printf("%s/%s (%d bytes)\n", entry.Folder, entry.Filename, entry.Size)
|
||||||
|
} else {
|
||||||
|
batch = append(batch, entry)
|
||||||
|
if len(batch) >= batchSize {
|
||||||
|
if err := insertBatch(db, scanID, scanTime, batch); err != nil {
|
||||||
|
fmt.Fprintf(os.Stderr, "Insert error: %v\n", err)
|
||||||
|
}
|
||||||
|
batch = batch[:0]
|
||||||
|
}
|
||||||
|
if totalFiles%100 == 0 {
|
||||||
|
folder := truncatePath(currentFolder, 90)
|
||||||
|
fmt.Printf("\r%-120s", fmt.Sprintf("%d files (%.2f GB) %s", totalFiles, float64(totalSize)/(1024*1024*1024), folder))
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Insert remaining
|
||||||
|
if !*dryRun && len(batch) > 0 {
|
||||||
|
if err := insertBatch(db, scanID, scanTime, batch); err != nil {
|
||||||
|
fmt.Fprintf(os.Stderr, "Insert error: %v\n", err)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Wait for scanner
|
||||||
|
if err := <-done; err != nil {
|
||||||
|
fmt.Fprintf(os.Stderr, "Scan error: %v\n", err)
|
||||||
|
}
|
||||||
|
|
||||||
|
fmt.Printf("\r%-120s\r", "") // clear progress line
|
||||||
|
fmt.Printf("Scan complete: %d files, %.2f GB\n", totalFiles, float64(totalSize)/(1024*1024*1024))
|
||||||
|
fmt.Printf("Scan ID: %s\n", scanID)
|
||||||
|
}
|
||||||
|
|
@ -0,0 +1,98 @@
|
||||||
|
-- ClickHouse queries for file inventory analysis
|
||||||
|
|
||||||
|
-- Create database (done automatically by scanner)
|
||||||
|
-- Safe to re-run: IF NOT EXISTS makes this a no-op when the database already exists.
CREATE DATABASE IF NOT EXISTS files;
|
||||||
|
|
||||||
|
-- Schema (done automatically by scanner)
-- One row per file recorded by a scan; rows are sorted on disk by
-- (server, folder, filename).
CREATE TABLE IF NOT EXISTS files.inventory
(
    scan_id   String,                  -- identifier of the scan run that wrote the row
    scan_time DateTime64(3),           -- time of the scan run, millisecond precision
    server    LowCardinality(String),
    source    LowCardinality(String),
    folder    String,
    filename  String,
    ext       LowCardinality(String),
    size      UInt64,                  -- file size in bytes
    created   DateTime64(3),
    modified  DateTime64(3),
    hash      String DEFAULT ''        -- empty until the file has been hashed (hashupdate)
)
ENGINE = MergeTree
ORDER BY (server, folder, filename);
|
||||||
|
|
||||||
|
-- Per-server totals: number of files and combined size, largest server first
SELECT
    server,
    count()                       AS files,
    formatReadableSize(sum(size)) AS total_size
FROM files.inventory
GROUP BY server
ORDER BY sum(size) DESC;
|
||||||
|
|
||||||
|
-- Find exact duplicates (after running hashupdate)
-- Rows whose hash is still empty have not been hashed and are skipped.
-- The formatted size is aliased as readable_size, not size: ClickHouse by
-- default substitutes an alias for a same-named column, which would turn
-- ORDER BY any(size) into an aggregate nested inside another aggregate.
SELECT
    hash,
    count(*)                                               AS cnt,
    formatReadableSize(any(size))                          AS readable_size,
    groupArray(concat(server, ':', folder, '/', filename)) AS files
FROM files.inventory
WHERE hash != ''
GROUP BY hash
HAVING cnt > 1
ORDER BY any(size) DESC
LIMIT 100;
|
||||||
|
|
||||||
|
-- Top 20 extensions by combined size, with the file count for each
SELECT
    ext,
    count()                       AS cnt,
    formatReadableSize(sum(size)) AS total
FROM files.inventory
GROUP BY ext
ORDER BY sum(size) DESC
LIMIT 20;
|
||||||
|
|
||||||
|
-- Largest files
-- The formatted size gets its own alias (readable_size) so that ORDER BY size
-- sorts on the numeric column. Aliasing the formatted string as "size" makes
-- ORDER BY pick up the string and sort it lexicographically.
SELECT
    server,
    folder,
    filename,
    formatReadableSize(size) AS readable_size
FROM files.inventory
ORDER BY size DESC
LIMIT 50;
|
||||||
|
|
||||||
|
-- Find files by name pattern (case-insensitive; replace 'pattern' as needed)
-- The formatted size gets its own alias (readable_size) so that ORDER BY size
-- sorts on the numeric column rather than on the formatted string.
SELECT
    server,
    folder,
    filename,
    formatReadableSize(size) AS readable_size
FROM files.inventory
WHERE filename ILIKE '%pattern%'
ORDER BY size DESC;
|
||||||
|
|
||||||
|
-- Compare two servers - files only on server A
-- A file counts as present on server B when a row with the same filename and
-- size exists there. LEFT ANTI JOIN keeps only the left rows without a match.
-- The LEFT JOIN ... WHERE b.filename IS NULL form does not work with
-- ClickHouse defaults: unmatched right-side String columns are filled with ''
-- rather than NULL (join_use_nulls = 0), so the IS NULL test never matches.
SELECT
    a.folder,
    a.filename,
    formatReadableSize(a.size) AS readable_size
FROM files.inventory AS a
LEFT ANTI JOIN
(
    SELECT filename, size
    FROM files.inventory
    WHERE server = 'serverB'
) AS b ON a.filename = b.filename AND a.size = b.size
WHERE a.server = 'serverA'
ORDER BY a.size DESC;
|
||||||
|
|
||||||
|
-- 100 most recently modified files among those changed in the last 30 days
SELECT
    server,
    folder,
    filename,
    modified,
    formatReadableSize(size) AS size
FROM files.inventory
WHERE modified > now() - INTERVAL 30 DAY
ORDER BY modified DESC
LIMIT 100;
|
||||||
|
|
||||||
|
-- Potential duplicate sizes (before hashing)
-- Sizes shared by more than one not-yet-hashed file; only files above
-- 1,000,000 bytes are considered.
SELECT
    size,
    count() AS cnt
FROM files.inventory
WHERE size > 1000000 -- > 1MB
  AND hash = ''
GROUP BY size
HAVING cnt > 1
ORDER BY size DESC
LIMIT 100;
|
||||||
|
|
||||||
|
-- Storage by folder (top level)
-- Takes the second '/'-separated element of folder (ClickHouse arrays are
-- 1-based). NOTE(review): this presumably expects folder to start with '/',
-- which makes element 1 the empty string - confirm for non-Unix paths.
SELECT
    server,
    splitByChar('/', folder)[2]   AS top_folder,
    count()                       AS files,
    formatReadableSize(sum(size)) AS total
FROM files.inventory
GROUP BY
    server,
    top_folder
ORDER BY sum(size) DESC
LIMIT 50;
|
||||||
|
|
||||||
|
-- Clear old scans (keep latest per server)
|
||||||
|
-- DELETE FROM files.inventory
|
||||||
|
-- WHERE (server, scan_time) NOT IN (
|
||||||
|
-- SELECT server, max(scan_time) FROM files.inventory GROUP BY server
|
||||||
|
-- );
|
||||||
Loading…
Reference in New Issue