-- ClickHouse queries for file inventory analysis -- Create database (done automatically by scanner) CREATE DATABASE IF NOT EXISTS files; -- Schema (done automatically by scanner) CREATE TABLE IF NOT EXISTS files.inventory ( scan_id String, scan_time DateTime64(3), server LowCardinality(String), source LowCardinality(String), folder String, filename String, ext LowCardinality(String), size UInt64, created DateTime64(3), modified DateTime64(3), hash String DEFAULT '' ) ENGINE = MergeTree ORDER BY (server, folder, filename); -- Summary by server SELECT server, count(*) as files, formatReadableSize(sum(size)) as total_size FROM files.inventory GROUP BY server ORDER BY sum(size) DESC; -- Find exact duplicates (after running hashupdate) SELECT hash, count(*) as cnt, formatReadableSize(any(size)) as size, groupArray(concat(server, ':', folder, '/', filename)) as files FROM files.inventory WHERE hash != '' GROUP BY hash HAVING cnt > 1 ORDER BY any(size) DESC LIMIT 100; -- Find files by extension SELECT ext, count(*) as cnt, formatReadableSize(sum(size)) as total FROM files.inventory GROUP BY ext ORDER BY sum(size) DESC LIMIT 20; -- Largest files SELECT server, folder, filename, formatReadableSize(size) as size FROM files.inventory ORDER BY size DESC LIMIT 50; -- Find files by name pattern SELECT server, folder, filename, formatReadableSize(size) as size FROM files.inventory WHERE filename ILIKE '%pattern%' ORDER BY size DESC; -- Compare two servers - files only on server A SELECT a.folder, a.filename, formatReadableSize(a.size) as size FROM files.inventory a LEFT JOIN files.inventory b ON a.filename = b.filename AND a.size = b.size AND b.server = 'serverB' WHERE a.server = 'serverA' AND b.filename IS NULL ORDER BY a.size DESC; -- Files modified in last 30 days SELECT server, folder, filename, modified, formatReadableSize(size) as size FROM files.inventory WHERE modified > now() - INTERVAL 30 DAY ORDER BY modified DESC LIMIT 100; -- Potential duplicate sizes (before hashing) SELECT size, count(*) as cnt FROM files.inventory WHERE hash = '' AND size > 1000000 -- > 1MB GROUP BY size HAVING cnt > 1 ORDER BY size DESC LIMIT 100; -- Storage by folder (top level) SELECT server, arrayElement(splitByChar('/', folder), 2) as top_folder, count(*) as files, formatReadableSize(sum(size)) as total FROM files.inventory GROUP BY server, top_folder ORDER BY sum(size) DESC LIMIT 50; -- Clear old scans (keep latest per server) -- DELETE FROM files.inventory -- WHERE (server, scan_time) NOT IN ( -- SELECT server, max(scan_time) FROM files.inventory GROUP BY server -- );