filescanner/queries.sql

99 lines
2.7 KiB
SQL
Executable File

-- ClickHouse queries for file inventory analysis
-- Create database (done automatically by scanner)
CREATE DATABASE IF NOT EXISTS files;
-- Schema (done automatically by scanner)
CREATE TABLE IF NOT EXISTS files.inventory (
scan_id String,
scan_time DateTime64(3),
server LowCardinality(String),
source LowCardinality(String),
folder String,
filename String,
ext LowCardinality(String),
size UInt64,
created DateTime64(3),
modified DateTime64(3),
hash String DEFAULT ''
) ENGINE = MergeTree
ORDER BY (server, folder, filename);
-- Summary by server
SELECT server, count(*) as files, formatReadableSize(sum(size)) as total_size
FROM files.inventory
GROUP BY server
ORDER BY sum(size) DESC;
-- Find exact duplicates (after running hashupdate)
SELECT
hash,
count(*) as cnt,
formatReadableSize(any(size)) as size,
groupArray(concat(server, ':', folder, '/', filename)) as files
FROM files.inventory
WHERE hash != ''
GROUP BY hash
HAVING cnt > 1
ORDER BY any(size) DESC
LIMIT 100;
-- Find files by extension
SELECT ext, count(*) as cnt, formatReadableSize(sum(size)) as total
FROM files.inventory
GROUP BY ext
ORDER BY sum(size) DESC
LIMIT 20;
-- Largest files
SELECT server, folder, filename, formatReadableSize(size) as size
FROM files.inventory
ORDER BY size DESC
LIMIT 50;
-- Find files by name pattern
SELECT server, folder, filename, formatReadableSize(size) as size
FROM files.inventory
WHERE filename ILIKE '%pattern%'
ORDER BY size DESC;
-- Compare two servers - files only on server A
SELECT a.folder, a.filename, formatReadableSize(a.size) as size
FROM files.inventory a
LEFT JOIN files.inventory b ON a.filename = b.filename AND a.size = b.size AND b.server = 'serverB'
WHERE a.server = 'serverA' AND b.filename IS NULL
ORDER BY a.size DESC;
-- Files modified in last 30 days
SELECT server, folder, filename, modified, formatReadableSize(size) as size
FROM files.inventory
WHERE modified > now() - INTERVAL 30 DAY
ORDER BY modified DESC
LIMIT 100;
-- Potential duplicate sizes (before hashing)
SELECT size, count(*) as cnt
FROM files.inventory
WHERE hash = '' AND size > 1000000 -- > 1MB
GROUP BY size
HAVING cnt > 1
ORDER BY size DESC
LIMIT 100;
-- Storage by folder (top level)
SELECT
server,
arrayElement(splitByChar('/', folder), 2) as top_folder,
count(*) as files,
formatReadableSize(sum(size)) as total
FROM files.inventory
GROUP BY server, top_folder
ORDER BY sum(size) DESC
LIMIT 50;
-- Clear old scans (keep latest per server)
-- DELETE FROM files.inventory
-- WHERE (server, scan_time) NOT IN (
-- SELECT server, max(scan_time) FROM files.inventory GROUP BY server
-- );