99 lines
2.7 KiB
SQL
Executable File
99 lines
2.7 KiB
SQL
Executable File
-- ClickHouse queries for file inventory analysis
|
|
|
|
-- Create database (done automatically by scanner)
|
|
CREATE DATABASE IF NOT EXISTS files;
|
|
|
|
-- Schema (done automatically by scanner)
|
|
CREATE TABLE IF NOT EXISTS files.inventory (
|
|
scan_id String,
|
|
scan_time DateTime64(3),
|
|
server LowCardinality(String),
|
|
source LowCardinality(String),
|
|
folder String,
|
|
filename String,
|
|
ext LowCardinality(String),
|
|
size UInt64,
|
|
created DateTime64(3),
|
|
modified DateTime64(3),
|
|
hash String DEFAULT ''
|
|
) ENGINE = MergeTree
|
|
ORDER BY (server, folder, filename);
|
|
|
|
-- Summary by server
|
|
SELECT server, count(*) as files, formatReadableSize(sum(size)) as total_size
|
|
FROM files.inventory
|
|
GROUP BY server
|
|
ORDER BY sum(size) DESC;
|
|
|
|
-- Find exact duplicates (after running hashupdate)
|
|
SELECT
|
|
hash,
|
|
count(*) as cnt,
|
|
formatReadableSize(any(size)) as size,
|
|
groupArray(concat(server, ':', folder, '/', filename)) as files
|
|
FROM files.inventory
|
|
WHERE hash != ''
|
|
GROUP BY hash
|
|
HAVING cnt > 1
|
|
ORDER BY any(size) DESC
|
|
LIMIT 100;
|
|
|
|
-- Find files by extension
|
|
SELECT ext, count(*) as cnt, formatReadableSize(sum(size)) as total
|
|
FROM files.inventory
|
|
GROUP BY ext
|
|
ORDER BY sum(size) DESC
|
|
LIMIT 20;
|
|
|
|
-- Largest files
|
|
SELECT server, folder, filename, formatReadableSize(size) as size
|
|
FROM files.inventory
|
|
ORDER BY size DESC
|
|
LIMIT 50;
|
|
|
|
-- Find files by name pattern
|
|
SELECT server, folder, filename, formatReadableSize(size) as size
|
|
FROM files.inventory
|
|
WHERE filename ILIKE '%pattern%'
|
|
ORDER BY size DESC;
|
|
|
|
-- Compare two servers - files only on server A
|
|
SELECT a.folder, a.filename, formatReadableSize(a.size) as size
|
|
FROM files.inventory a
|
|
LEFT JOIN files.inventory b ON a.filename = b.filename AND a.size = b.size AND b.server = 'serverB'
|
|
WHERE a.server = 'serverA' AND b.filename IS NULL
|
|
ORDER BY a.size DESC;
|
|
|
|
-- Files modified in last 30 days
|
|
SELECT server, folder, filename, modified, formatReadableSize(size) as size
|
|
FROM files.inventory
|
|
WHERE modified > now() - INTERVAL 30 DAY
|
|
ORDER BY modified DESC
|
|
LIMIT 100;
|
|
|
|
-- Potential duplicate sizes (before hashing)
|
|
SELECT size, count(*) as cnt
|
|
FROM files.inventory
|
|
WHERE hash = '' AND size > 1000000 -- > 1MB
|
|
GROUP BY size
|
|
HAVING cnt > 1
|
|
ORDER BY size DESC
|
|
LIMIT 100;
|
|
|
|
-- Storage by folder (top level)
|
|
SELECT
|
|
server,
|
|
arrayElement(splitByChar('/', folder), 2) as top_folder,
|
|
count(*) as files,
|
|
formatReadableSize(sum(size)) as total
|
|
FROM files.inventory
|
|
GROUP BY server, top_folder
|
|
ORDER BY sum(size) DESC
|
|
LIMIT 50;
|
|
|
|
-- Clear old scans (keep latest per server)
|
|
-- DELETE FROM files.inventory
|
|
-- WHERE (server, scan_time) NOT IN (
|
|
-- SELECT server, max(scan_time) FROM files.inventory GROUP BY server
|
|
-- );
|