#!/usr/bin/env python3
"""Build hash index using first 64KB of files for fast duplicate detection."""
import sqlite3
import hashlib
import os
import sys
from pathlib import Path

# SQLite database holding the index; recreated from scratch on every run.
# NOTE(review): `sys` and `Path` appear unused in this file — confirm before removing.
DB_PATH = "/home/johan/immich-compare/hash_index.db"
# Media extensions included in the scan (compared lower-cased, dot included).
EXTENSIONS = {'.jpg', '.jpeg', '.png', '.heic', '.gif', '.mp4', '.mov', '.avi', '.m4v', '.3gp'}
def hash_file_head(filepath, size=65536):
    """Return the MD5 hex digest of the first `size` bytes of `filepath`.

    Hashing only a 64 KiB prefix is a fast duplicate pre-screen, not a
    cryptographic integrity check; files shorter than `size` hash their
    entire content.

    Args:
        filepath: Path of the file to hash.
        size: Number of leading bytes to read (default 64 KiB).

    Returns:
        Hex digest string, or None if the file cannot be opened/read.
    """
    try:
        with open(filepath, 'rb') as f:
            return hashlib.md5(f.read(size)).hexdigest()
    except OSError:
        # Narrowed from a bare `except:` (which also swallowed
        # KeyboardInterrupt/SystemExit). Unreadable file -> no hash.
        return None
def create_db(db_path=None):
    """Create a fresh hash-index database and return an open connection.

    Any existing database file at `db_path` is deleted first so every run
    starts from an empty index.

    Args:
        db_path: SQLite file to (re)create; defaults to the module-level
            DB_PATH when None, preserving the original no-argument call.

    Returns:
        Open sqlite3.Connection with the `files` table and its indexes.
    """
    if db_path is None:
        db_path = DB_PATH
    if os.path.exists(db_path):
        os.remove(db_path)
    conn = sqlite3.connect(db_path)
    c = conn.cursor()
    c.execute('''CREATE TABLE files (
        id INTEGER PRIMARY KEY,
        source TEXT,
        filename TEXT,
        filepath TEXT,
        filesize INTEGER,
        hash64k TEXT
    )''')
    # Indexes match the query patterns used downstream: duplicate grouping
    # by hash, per-source counts, filename matches, and hash+source lookups.
    c.execute('CREATE INDEX idx_hash ON files(hash64k)')
    c.execute('CREATE INDEX idx_source ON files(source)')
    c.execute('CREATE INDEX idx_filename ON files(filename)')
    c.execute('CREATE INDEX idx_hash_source ON files(hash64k, source)')
    conn.commit()
    return conn
def scan_directory(conn, source_name, base_path):
    """Walk `base_path` and index every supported media file.

    For each file whose extension is in EXTENSIONS, stores name, path, size
    and 64KB-prefix hash in the `files` table under `source_name`. Empty
    files and files that cannot be read are skipped.

    Args:
        conn: Open sqlite3 connection containing the `files` table.
        source_name: Label written to the `source` column.
        base_path: Root of the directory tree to scan.

    Returns:
        Number of files successfully indexed.
    """
    c = conn.cursor()
    count = 0
    for root, _dirs, files in os.walk(base_path):
        for fname in files:
            ext = os.path.splitext(fname)[1].lower()
            if ext not in EXTENSIONS:
                continue
            filepath = os.path.join(root, fname)
            # Only the filesystem call sits in the try: the original broad
            # `except Exception: pass` also hid real DB errors.
            try:
                filesize = os.path.getsize(filepath)
            except OSError:
                continue  # vanished or unreadable mid-scan
            if filesize == 0:
                continue
            hash64k = hash_file_head(filepath)
            if hash64k is None:
                continue  # hash_file_head already handled the read error
            c.execute(
                'INSERT INTO files (source, filename, filepath, filesize, hash64k) VALUES (?, ?, ?, ?, ?)',
                (source_name, fname, filepath, filesize, hash64k))
            count += 1
            # Periodic progress report + commit so a crash loses at most
            # ~1000 rows of work.
            if count % 1000 == 0:
                print(f"  {source_name}: {count} files...")
                conn.commit()
    conn.commit()
    print(f"  {source_name}: {count} files total")
    return count
def main():
    """Build the hash index for all configured sources and report results.

    Recreates the database, scans each source tree that exists on disk,
    then prints per-source row counts and a small sample of hashes that
    appear in more than one source.
    """
    print("Building hash index for all sources...")
    conn = create_db()
    # (label, root directory) pairs; roots missing on this host are skipped.
    sources = [
        ("mylio", "/tank/mylio-backup/Mylio"),
        ("immich", "/tank/immich/library/library/admin"),
        ("takeout", "/tank/staging/Data/Google Takeout 20210421/Takeout/Google Photos"),
    ]
    total = 0
    for name, path in sources:
        if os.path.exists(path):
            print(f"\nScanning {name}: {path}")
            total += scan_directory(conn, name, path)
        else:
            print(f"\nSkipping {name}: {path} not found")
    # Summary
    c = conn.cursor()
    print(f"\n{'='*50}")
    print(f"Total files indexed: {total}")
    for name, _ in sources:
        c.execute("SELECT COUNT(*) FROM files WHERE source = ?", (name,))
        cnt = c.fetchone()[0]
        print(f"  {name}: {cnt}")
    # Find duplicates: hashes present in more than one source (first 10).
    c.execute("""
        SELECT hash64k, COUNT(*) as cnt, GROUP_CONCAT(DISTINCT source) as sources
        FROM files
        GROUP BY hash64k
        HAVING COUNT(DISTINCT source) > 1
        LIMIT 10
    """)
    print(f"\nSample cross-source duplicates:")
    for row in c.fetchall():
        # row = (hash64k, total count, comma-joined source names).
        print(f"  {row[0][:12]}... appears in: {row[2]}")
    conn.close()
    print(f"\nDone! Database: {DB_PATH}")


if __name__ == "__main__":
    main()