#!/usr/bin/env python3
"""Build hash index using first 64KB of files for fast duplicate detection."""

import sqlite3
import hashlib
import os
import sys
from pathlib import Path

DB_PATH = "/home/johan/immich-compare/hash_index.db"

EXTENSIONS = {'.jpg', '.jpeg', '.png', '.heic', '.gif', '.mp4', '.mov', '.avi', '.m4v', '.3gp'}
def hash_file_head(filepath, size=65536):
    """Return the MD5 hex digest of the first *size* bytes of *filepath*.

    Hashing only the head (64 KB by default) is a fast heuristic for
    duplicate detection: files with different heads cannot be identical.

    Args:
        filepath: Path of the file to hash.
        size: Number of leading bytes to hash (default 64 KB).

    Returns:
        Hex digest string, or None when the file cannot be read.
    """
    try:
        with open(filepath, 'rb') as f:
            return hashlib.md5(f.read(size)).hexdigest()
    except OSError:
        # Narrowed from a bare `except:` which also swallowed
        # KeyboardInterrupt/SystemExit. Unreadable files are skipped
        # by callers, so best-effort None is the right contract here.
        return None
def create_db():
    """Create a fresh hash-index database at DB_PATH and return a connection.

    Any pre-existing database file is deleted first so every run starts
    from a clean slate.

    Returns:
        An open sqlite3.Connection with the `files` table and its indexes.
    """
    if os.path.exists(DB_PATH):
        os.remove(DB_PATH)

    conn = sqlite3.connect(DB_PATH)
    cur = conn.cursor()
    cur.execute('''CREATE TABLE files (
        id INTEGER PRIMARY KEY,
        source TEXT,
        filename TEXT,
        filepath TEXT,
        filesize INTEGER,
        hash64k TEXT
    )''')
    # Indexes cover the lookups done later: duplicate grouping by hash,
    # per-source counts, filename matching, and hash+source joins.
    index_statements = (
        'CREATE INDEX idx_hash ON files(hash64k)',
        'CREATE INDEX idx_source ON files(source)',
        'CREATE INDEX idx_filename ON files(filename)',
        'CREATE INDEX idx_hash_source ON files(hash64k, source)',
    )
    for statement in index_statements:
        cur.execute(statement)
    conn.commit()
    return conn
def scan_directory(conn, source_name, base_path):
    """Recursively scan *base_path* and index matching media files.

    Each file whose extension is in EXTENSIONS gets one row in `files`
    recording its source label, name, path, size and 64KB-head hash.
    Empty and unreadable files are skipped.

    Args:
        conn: Open sqlite3.Connection from create_db().
        source_name: Label stored in the `source` column.
        base_path: Directory tree to walk.

    Returns:
        Number of files indexed.
    """
    c = conn.cursor()
    count = 0

    for root, _dirs, files in os.walk(base_path):
        for fname in files:
            ext = os.path.splitext(fname)[1].lower()
            if ext not in EXTENSIONS:
                continue

            filepath = os.path.join(root, fname)
            # Only the filesystem call sits in the try: the original wrapped
            # the INSERT in a broad `except Exception: pass`, which silently
            # dropped rows on DB errors and hid programming bugs.
            try:
                filesize = os.path.getsize(filepath)
            except OSError:
                # File vanished mid-walk or is unreadable; best-effort skip.
                continue
            if filesize == 0:
                continue

            hash64k = hash_file_head(filepath)
            if not hash64k:
                continue

            c.execute('INSERT INTO files (source, filename, filepath, filesize, hash64k) VALUES (?, ?, ?, ?, ?)',
                      (source_name, fname, filepath, filesize, hash64k))
            count += 1
            # Periodic progress + commit keeps memory flat and the DB
            # recoverable if a long scan is interrupted.
            if count % 1000 == 0:
                print(f" {source_name}: {count} files...")
                conn.commit()

    conn.commit()
    print(f" {source_name}: {count} files total")
    return count
def main():
    """Index every configured source tree and print a duplicate summary."""
    print("Building hash index for all sources...")
    conn = create_db()

    sources = [
        ("mylio", "/tank/mylio-backup/Mylio"),
        ("immich", "/tank/immich/library/library/admin"),
        ("takeout", "/tank/staging/Data/Google Takeout 20210421/Takeout/Google Photos"),
    ]

    total = 0
    for name, path in sources:
        # Guard clause: a missing mount just logs and moves on.
        if not os.path.exists(path):
            print(f"\nSkipping {name}: {path} not found")
            continue
        print(f"\nScanning {name}: {path}")
        total += scan_directory(conn, name, path)

    # Summary
    c = conn.cursor()
    print(f"\n{'='*50}")
    print(f"Total files indexed: {total}")

    for name, _ in sources:
        c.execute("SELECT COUNT(*) FROM files WHERE source = ?", (name,))
        cnt = c.fetchone()[0]
        print(f" {name}: {cnt}")

    # Find duplicates: hashes that occur under more than one source label.
    c.execute("""
        SELECT hash64k, COUNT(*) as cnt, GROUP_CONCAT(DISTINCT source) as sources
        FROM files
        GROUP BY hash64k
        HAVING COUNT(DISTINCT source) > 1
        LIMIT 10
    """)

    print(f"\nSample cross-source duplicates:")
    for hash64k, _cnt, dup_sources in c.fetchall():
        print(f" {hash64k[:12]}... appears in: {dup_sources}")

    conn.close()
    print(f"\nDone! Database: {DB_PATH}")
# Run the indexer only when executed as a script, not on import.
if __name__ == "__main__":
    main()