#!/usr/bin/env python3
"""Build a hash index using the first 64KB of files for fast duplicate detection.

Walks a set of photo/video library roots, hashes the head of each media file,
and stores (source, filename, filepath, filesize, hash64k) rows in a SQLite
database so cross-source duplicates can be found with cheap indexed queries.
"""
import sqlite3
import hashlib
import os
import sys
from pathlib import Path

# Output database; rebuilt from scratch on every run (see create_db).
DB_PATH = "/home/johan/immich-compare/hash_index.db"

# Media file extensions worth indexing (lowercase, with leading dot).
EXTENSIONS = {'.jpg', '.jpeg', '.png', '.heic', '.gif',
              '.mp4', '.mov', '.avi', '.m4v', '.3gp'}


def hash_file_head(filepath, size=65536):
    """Return the hex MD5 of the first *size* bytes of *filepath*.

    MD5 is fine here: the hash is used only for duplicate bucketing,
    not for security. Returns None if the file cannot be read.
    """
    try:
        with open(filepath, 'rb') as f:
            return hashlib.md5(f.read(size)).hexdigest()
    except OSError:
        # Unreadable file (permissions, vanished mid-scan, I/O error):
        # treat as "no hash" so the caller can skip it.
        return None


def create_db():
    """(Re)create the index database and return an open connection.

    Any existing database at DB_PATH is deleted so each run starts clean.
    """
    if os.path.exists(DB_PATH):
        os.remove(DB_PATH)
    conn = sqlite3.connect(DB_PATH)
    c = conn.cursor()
    c.execute('''CREATE TABLE files (
        id INTEGER PRIMARY KEY,
        source TEXT,
        filename TEXT,
        filepath TEXT,
        filesize INTEGER,
        hash64k TEXT
    )''')
    # Indexes chosen to support the duplicate queries in main():
    # lookup by hash, per-source counts, and cross-source joins.
    c.execute('CREATE INDEX idx_hash ON files(hash64k)')
    c.execute('CREATE INDEX idx_source ON files(source)')
    c.execute('CREATE INDEX idx_filename ON files(filename)')
    c.execute('CREATE INDEX idx_hash_source ON files(hash64k, source)')
    conn.commit()
    return conn


def scan_directory(conn, source_name, base_path):
    """Scan *base_path* recursively and insert matching files into the DB.

    Only files whose extension is in EXTENSIONS and whose size is non-zero
    are indexed. Errors on individual files are reported and skipped so a
    single bad file cannot abort a long scan. Returns the number of files
    indexed for this source.
    """
    c = conn.cursor()
    count = 0
    for root, dirs, files in os.walk(base_path):
        for fname in files:
            ext = os.path.splitext(fname)[1].lower()
            if ext not in EXTENSIONS:
                continue
            filepath = os.path.join(root, fname)
            try:
                filesize = os.path.getsize(filepath)
                if filesize == 0:
                    # Empty files all hash identically; not useful for matching.
                    continue
                hash64k = hash_file_head(filepath)
                if hash64k:
                    c.execute(
                        'INSERT INTO files (source, filename, filepath, filesize, hash64k) '
                        'VALUES (?, ?, ?, ?, ?)',
                        (source_name, fname, filepath, filesize, hash64k))
                    count += 1
                    if count % 1000 == 0:
                        # Periodic commit keeps memory/journal bounded and
                        # preserves progress if the scan is interrupted.
                        print(f"  {source_name}: {count} files...")
                        conn.commit()
            except OSError as e:
                # Best-effort scan: skip the file but don't hide the failure.
                print(f"  {source_name}: skipping {filepath}: {e}", file=sys.stderr)
    conn.commit()
    print(f"  {source_name}: {count} files total")
    return count


def main():
    """Index all configured sources, then print a summary and sample dupes."""
    print("Building hash index for all sources...")
    conn = create_db()

    # (label, root directory) pairs; missing roots are skipped with a notice.
    sources = [
        ("mylio", "/tank/mylio-backup/Mylio"),
        ("immich", "/tank/immich/library/library/admin"),
        ("takeout", "/tank/staging/Data/Google Takeout 20210421/Takeout/Google Photos"),
    ]

    total = 0
    for name, path in sources:
        if os.path.exists(path):
            print(f"\nScanning {name}: {path}")
            total += scan_directory(conn, name, path)
        else:
            print(f"\nSkipping {name}: {path} not found")

    # Summary: total and per-source row counts.
    c = conn.cursor()
    print(f"\n{'='*50}")
    print(f"Total files indexed: {total}")
    for name, _ in sources:
        c.execute("SELECT COUNT(*) FROM files WHERE source = ?", (name,))
        cnt = c.fetchone()[0]
        print(f"  {name}: {cnt}")

    # Show a few hashes that appear in more than one source.
    c.execute("""
        SELECT hash64k, COUNT(*) as cnt, GROUP_CONCAT(DISTINCT source) as sources
        FROM files
        GROUP BY hash64k
        HAVING COUNT(DISTINCT source) > 1
        LIMIT 10
    """)
    print(f"\nSample cross-source duplicates:")
    for row in c.fetchall():
        print(f"  {row[0][:12]}... appears in: {row[2]}")

    conn.close()
    print(f"\nDone! Database: {DB_PATH}")


if __name__ == "__main__":
    main()