#!/usr/bin/env python3
"""Build hash index using first 64KB of files for fast duplicate detection."""
import sqlite3
import hashlib
import os
import sys
from pathlib import Path

# SQLite database holding the index; recreated from scratch on every run.
# NOTE(review): `sys` and `Path` appear unused in this file — confirm before removing.
DB_PATH = "/home/johan/immich-compare/hash_index.db"
# Media extensions included in the scan (compared lower-cased, dot included).
EXTENSIONS = {'.jpg', '.jpeg', '.png', '.heic', '.gif', '.mp4', '.mov', '.avi', '.m4v', '.3gp'}
def hash_file_head(filepath, size=65536):
    """Return the MD5 hex digest of the first `size` bytes of `filepath`.

    Hashing only a 64 KiB prefix is a fast duplicate pre-screen, not a
    cryptographic integrity check; files shorter than `size` hash their
    entire content.

    Args:
        filepath: Path of the file to hash.
        size: Number of leading bytes to read (default 64 KiB).

    Returns:
        Hex digest string, or None if the file cannot be opened/read.
    """
    try:
        with open(filepath, 'rb') as f:
            return hashlib.md5(f.read(size)).hexdigest()
    except OSError:
        # Narrowed from a bare `except:` (which also swallowed
        # KeyboardInterrupt/SystemExit). Unreadable file -> no hash.
        return None
def create_db(db_path=None):
    """Create a fresh hash-index database and return an open connection.

    Any existing database file at `db_path` is deleted first so every run
    starts from an empty index.

    Args:
        db_path: SQLite file to (re)create; defaults to the module-level
            DB_PATH when None, preserving the original no-argument call.

    Returns:
        Open sqlite3.Connection with the `files` table and its indexes.
    """
    if db_path is None:
        db_path = DB_PATH
    if os.path.exists(db_path):
        os.remove(db_path)
    conn = sqlite3.connect(db_path)
    c = conn.cursor()
    c.execute('''CREATE TABLE files (
        id INTEGER PRIMARY KEY,
        source TEXT,
        filename TEXT,
        filepath TEXT,
        filesize INTEGER,
        hash64k TEXT
    )''')
    # Indexes match the query patterns used downstream: duplicate grouping
    # by hash, per-source counts, filename matches, and hash+source lookups.
    c.execute('CREATE INDEX idx_hash ON files(hash64k)')
    c.execute('CREATE INDEX idx_source ON files(source)')
    c.execute('CREATE INDEX idx_filename ON files(filename)')
    c.execute('CREATE INDEX idx_hash_source ON files(hash64k, source)')
    conn.commit()
    return conn
def scan_directory(conn, source_name, base_path):
    """Walk `base_path` and index every supported media file.

    For each file whose extension is in EXTENSIONS, stores name, path, size
    and 64KB-prefix hash in the `files` table under `source_name`. Empty
    files and files that cannot be read are skipped.

    Args:
        conn: Open sqlite3 connection containing the `files` table.
        source_name: Label written to the `source` column.
        base_path: Root of the directory tree to scan.

    Returns:
        Number of files successfully indexed.
    """
    c = conn.cursor()
    count = 0
    for root, _dirs, files in os.walk(base_path):
        for fname in files:
            ext = os.path.splitext(fname)[1].lower()
            if ext not in EXTENSIONS:
                continue
            filepath = os.path.join(root, fname)
            # Only the filesystem call sits in the try: the original broad
            # `except Exception: pass` also hid real DB errors.
            try:
                filesize = os.path.getsize(filepath)
            except OSError:
                continue  # vanished or unreadable mid-scan
            if filesize == 0:
                continue
            hash64k = hash_file_head(filepath)
            if hash64k is None:
                continue  # hash_file_head already handled the read error
            c.execute(
                'INSERT INTO files (source, filename, filepath, filesize, hash64k) VALUES (?, ?, ?, ?, ?)',
                (source_name, fname, filepath, filesize, hash64k))
            count += 1
            # Periodic progress report + commit so a crash loses at most
            # ~1000 rows of work.
            if count % 1000 == 0:
                print(f"  {source_name}: {count} files...")
                conn.commit()
    conn.commit()
    print(f"  {source_name}: {count} files total")
    return count
def main():
    """Build the hash index for all configured sources and report results.

    Recreates the database, scans each source tree that exists on disk,
    then prints per-source row counts and a small sample of hashes that
    appear in more than one source.
    """
    print("Building hash index for all sources...")
    conn = create_db()
    # (label, root directory) pairs; roots missing on this host are skipped.
    sources = [
        ("mylio", "/tank/mylio-backup/Mylio"),
        ("immich", "/tank/immich/library/library/admin"),
        ("takeout", "/tank/staging/Data/Google Takeout 20210421/Takeout/Google Photos"),
    ]
    total = 0
    for name, path in sources:
        if os.path.exists(path):
            print(f"\nScanning {name}: {path}")
            total += scan_directory(conn, name, path)
        else:
            print(f"\nSkipping {name}: {path} not found")
    # Summary
    c = conn.cursor()
    print(f"\n{'='*50}")
    print(f"Total files indexed: {total}")
    for name, _ in sources:
        c.execute("SELECT COUNT(*) FROM files WHERE source = ?", (name,))
        cnt = c.fetchone()[0]
        print(f"  {name}: {cnt}")
    # Find duplicates: hashes present in more than one source (first 10).
    c.execute("""
        SELECT hash64k, COUNT(*) as cnt, GROUP_CONCAT(DISTINCT source) as sources
        FROM files
        GROUP BY hash64k
        HAVING COUNT(DISTINCT source) > 1
        LIMIT 10
    """)
    print(f"\nSample cross-source duplicates:")
    for row in c.fetchall():
        # row = (hash64k, total count, comma-joined source names).
        print(f"  {row[0][:12]}... appears in: {row[2]}")
    conn.close()
    print(f"\nDone! Database: {DB_PATH}")


if __name__ == "__main__":
    main()