#!/usr/bin/env python3
"""
Document Processor for ~/documents/inbox/

Watches for new documents, OCRs them, classifies, and files them.
"""

import os
import sys
import json
import hashlib
import subprocess
import shutil
import sqlite3
import csv
import tempfile
from datetime import datetime
from pathlib import Path
from typing import Optional, Dict, Any
import re
import time

# Paths
DOCUMENTS_ROOT = Path.home() / "documents"
INBOX = DOCUMENTS_ROOT / "inbox"
STORE = DOCUMENTS_ROOT / "store"
RECORDS = DOCUMENTS_ROOT / "records"
INDEX = DOCUMENTS_ROOT / "index"
EXPORTS = DOCUMENTS_ROOT / "exports"

# Categories
CATEGORIES = [
    "taxes", "bills", "medical", "insurance", "legal", "financial",
    "expenses", "vehicles", "home", "personal", "contacts", "uncategorized"
]

# Ensure directories exist. INBOX is included so that process_inbox() does
# not fail with FileNotFoundError on a fresh installation.
for d in [INBOX, STORE, INDEX, EXPORTS]:
    d.mkdir(parents=True, exist_ok=True)
for cat in CATEGORIES:
    (RECORDS / cat).mkdir(parents=True, exist_ok=True)

# Errors that the external-tool helpers treat as "extraction failed" rather
# than letting them abort the pipeline.
_TOOL_ERRORS = (OSError, subprocess.SubprocessError, UnicodeError)


def _keyword_regex(keywords):
    """Compile a pattern matching any of *keywords* as a whole token.

    A keyword only matches when it is not embedded in a longer alphanumeric
    run (so 'irs' does not match inside 'first'). A single trailing 's' is
    tolerated so simple plurals ('invoices', 'bills') still match.
    The pattern is meant to be searched against lower-cased text.
    """
    alternation = "|".join(re.escape(k) for k in keywords)
    return re.compile(rf"(?<![a-z0-9])(?:{alternation})s?(?![a-z0-9])")


# Date patterns, tried in order against lower-cased text. The ISO form goes
# first and every numeric pattern is digit-bounded; otherwise the D/M/Y
# pattern would match the tail of '2024-01-15' and yield '24-01-15'.
_DATE_PATTERNS = [
    re.compile(r'(?<!\d)(\d{4}[/-]\d{1,2}[/-]\d{1,2})(?!\d)'),
    re.compile(r'(?<!\d)(\d{1,2}[/-]\d{1,2}[/-]\d{2,4})(?!\d)'),
    re.compile(
        r'(?<![a-z])((?:jan|feb|mar|apr|may|jun|jul|aug|sep|oct|nov|dec)'
        r'[a-z]* \d{1,2},? \d{4})'
    ),
]

# Dollar amount: must start with a digit, and a decimal point is only
# consumed when digits follow it (so '$50.' at a sentence end yields '$50').
_AMOUNT_PATTERN = re.compile(r'\$\d[\d,]*(?:\.\d+)?')

# (category, doc_type, keyword pattern); the first matching rule wins.
_CLASSIFICATION_RULES = [
    ("taxes", "tax_form", _keyword_regex(
        ['w-2', 'w2', '1099', 'tax return', 'irs', '1040', 'schedule c',
         'form 1098'])),
    ("bills", "bill", _keyword_regex(
        ['invoice', 'bill', 'amount due', 'payment due', 'account number',
         'autopay'])),
    ("medical", "medical_record", _keyword_regex(
        ['patient', 'diagnosis', 'prescription', 'medical', 'physician',
         'hospital', 'clinic', 'dr.', 'md'])),
    ("insurance", "insurance_doc", _keyword_regex(
        ['policy', 'coverage', 'premium', 'deductible', 'insurance',
         'claim'])),
    ("legal", "legal_doc", _keyword_regex(
        ['agreement', 'contract', 'terms', 'hereby', 'whereas', 'attorney',
         'legal'])),
    ("financial", "financial_statement", _keyword_regex(
        ['bank', 'statement', 'account', 'balance', 'deposit', 'withdrawal',
         'investment', 'portfolio'])),
    ("expenses", "receipt", _keyword_regex(
        ['receipt', 'purchase', 'order', 'subtotal', 'total', 'qty',
         'item'])),
    ("vehicles", "vehicle_doc", _keyword_regex(
        ['vin', 'vehicle', 'registration', 'dmv', 'license plate',
         'odometer'])),
    ("home", "property_doc", _keyword_regex(
        ['mortgage', 'deed', 'property', 'hoa', 'homeowner'])),
]

# Known bill vendors, checked in this order; the first hit is reported.
_VENDOR_PATTERNS = [
    (name, _keyword_regex([name]))
    for name in ['duke energy', 'fpl', 'florida power', 'spectrum', 'at&t',
                 'verizon', 't-mobile', 'comcast', 'xfinity']
]


def file_hash(filepath: Path) -> str:
    """Return the first 16 hex characters of the file's SHA256 digest.

    The file is read in 8 KiB chunks so large documents are not loaded
    into memory at once. The shortened digest is used as the document ID
    and as the stored file's base name.
    """
    h = hashlib.sha256()
    with open(filepath, 'rb') as f:
        for chunk in iter(lambda: f.read(8192), b''):
            h.update(chunk)
    return h.hexdigest()[:16]  # Short hash for filename


def extract_text_pdf(filepath: Path) -> str:
    """Extract text from a PDF using pdftotext, falling back to OCR.

    The pdftotext output is used only when it is longer than 50 characters
    after stripping; shorter output (typical for scanned PDFs) or a tool
    failure triggers ocr_document().
    """
    try:
        result = subprocess.run(
            ['pdftotext', '-layout', str(filepath), '-'],
            capture_output=True, text=True, timeout=30
        )
        text = result.stdout.strip()
        if len(text) > 50:  # Got meaningful text
            return text
    except _TOOL_ERRORS as e:
        print(f"pdftotext failed: {e}")

    # Fallback to OCR
    return ocr_document(filepath)


def ocr_document(filepath: Path) -> str:
    """OCR a document using tesseract.

    PDFs are first rendered to 300 dpi PNG pages with pdftoppm and each
    page is OCR'd in order; other files are passed to tesseract directly.
    Returns the stripped text, or "" if a tool fails or times out.
    """
    try:
        if filepath.suffix.lower() != '.pdf':
            # Direct image OCR
            result = subprocess.run(
                ['tesseract', str(filepath), 'stdout'],
                capture_output=True, text=True, timeout=60
            )
            return result.stdout.strip()

        # Render pages into a private temporary directory. A fixed prefix in
        # shared /tmp would pick up leftover or concurrently written pages
        # from other runs, and would leak images if OCR failed part-way.
        with tempfile.TemporaryDirectory(prefix="docproc_") as tmp_dir:
            page_prefix = Path(tmp_dir) / "doc_page"
            subprocess.run(
                ['pdftoppm', '-png', '-r', '300', str(filepath),
                 str(page_prefix)],
                capture_output=True, timeout=60
            )
            text_parts = []
            for img in sorted(Path(tmp_dir).glob('doc_page-*.png')):
                page = subprocess.run(
                    ['tesseract', str(img), 'stdout'],
                    capture_output=True, text=True, timeout=60
                )
                text_parts.append(page.stdout)
            return '\n'.join(text_parts).strip()
    except _TOOL_ERRORS as e:
        print(f"OCR failed: {e}")
        return ""


def extract_text(filepath: Path) -> str:
    """Extract text from a document, dispatching on its file extension.

    PDFs go through extract_text_pdf(), raster images through
    ocr_document(), and .txt/.md files are read directly. Any other
    extension yields "".
    """
    suffix = filepath.suffix.lower()
    if suffix == '.pdf':
        return extract_text_pdf(filepath)
    if suffix in ('.png', '.jpg', '.jpeg', '.tiff', '.tif', '.bmp'):
        return ocr_document(filepath)
    if suffix in ('.txt', '.md'):
        # Decode explicitly and tolerate bad bytes: the locale default may
        # not be UTF-8 and one undecodable byte should not abort processing.
        return filepath.read_text(encoding="utf-8", errors="replace")
    return ""


def classify_document(text: str, filename: str) -> Dict[str, Any]:
    """
    Classify document based on content.

    Matching is done on lower-cased, whitespace-normalized text so that
    multi-word keywords still match across line breaks. Keywords match
    only as whole tokens (optionally with a trailing 's'). The *filename*
    argument is accepted for interface compatibility but is not used.

    Returns: {category, doc_type, date, vendor, amount, summary}
    """
    clean_text = ' '.join(text.split())
    text_lower = clean_text.lower()

    result = {
        "category": "uncategorized",
        "doc_type": "unknown",
        "date": None,
        "vendor": None,
        "amount": None,
        "summary": None,
    }

    # Date extraction (various formats); first pattern with a hit wins.
    for pattern in _DATE_PATTERNS:
        match = pattern.search(text_lower)
        if match:
            result["date"] = match.group(1)
            break

    # Amount extraction (first dollar amount in the text)
    amount_match = _AMOUNT_PATTERN.search(text)
    if amount_match:
        result["amount"] = amount_match.group(0)

    # Classification rules
    for category, doc_type, keywords in _CLASSIFICATION_RULES:
        if keywords.search(text_lower):
            result["category"] = category
            result["doc_type"] = doc_type
            break

    # Vendor is only looked up for bills.
    if result["category"] == "bills":
        for name, pattern in _VENDOR_PATTERNS:
            if pattern.search(text_lower):
                result["vendor"] = name.title()
                break

    # Generate summary (first 200 chars, cleaned)
    result["summary"] = clean_text[:200]

    return result


def store_document(filepath: Path, hash_id: str) -> Path:
    """Copy document to store with hash-based name.

    The stored name is ``<hash_id><lower-cased suffix>``. An existing file
    with that name is left untouched. Returns the store path.
    """
    suffix = filepath.suffix.lower()
    store_path = STORE / f"{hash_id}{suffix}"

    if not store_path.exists():
        shutil.copy2(filepath, store_path)

    return store_path


def create_record(filepath: Path, hash_id: str, text: str,
                  classification: Dict) -> Path:
    """Create markdown record in appropriate category folder.

    The record is written to ``RECORDS/<category>/<YYYYMMDD>_<hash_id>.md``
    and contains the extracted fields, the summary, the first 5000
    characters of the text and a relative link to the stored file.
    Returns the record path.
    """
    cat = classification["category"]
    now = datetime.now()

    record_name = f"{now.strftime('%Y%m%d')}_{hash_id}.md"
    record_path = RECORDS / cat / record_name

    # The classification dict carries these keys with value None when
    # nothing was found, so a .get() default would never apply; use `or`.
    doc_type = classification.get('doc_type') or 'unknown'
    date = classification.get('date') or 'N/A'
    vendor = classification.get('vendor') or 'N/A'
    amount = classification.get('amount') or 'N/A'
    summary = classification.get('summary') or 'No summary available.'

    # store_document() lower-cases the suffix; the link must do the same or
    # it is broken for files such as SCAN.PDF.
    stored_name = f"{hash_id}{filepath.suffix.lower()}"

    content = f"""# Document Record

**ID:** {hash_id}
**Original File:** {filepath.name}
**Processed:** {now.isoformat()}
**Category:** {cat}
**Type:** {doc_type}

## Extracted Info

| Field | Value |
|-------|-------|
| Date | {date} |
| Vendor | {vendor} |
| Amount | {amount} |

## Summary

{summary}

## Full Text

```
{text[:5000]}
```

## Files

- **PDF:** [store/{stored_name}](../../store/{stored_name})
"""

    record_path.write_text(content, encoding="utf-8")
    return record_path


def update_master_index(hash_id: str, filepath: Path,
                        classification: Dict) -> None:
    """Update the master.json index.

    Adds an entry for the document unless one with the same ID is already
    present, and keeps ``stats.total`` and ``stats.by_type`` in step with
    the entries actually added. The file is replaced atomically so an
    interrupted write cannot leave a truncated index behind.
    """
    index_path = INDEX / "master.json"

    if index_path.exists():
        with open(index_path, encoding="utf-8") as f:
            data = json.load(f)
    else:
        data = {
            "version": "1.0",
            "created": datetime.now().strftime("%Y-%m-%d"),
            "documents": [],
            "stats": {"total": 0, "by_type": {}, "by_year": {}},
        }

    documents = data.setdefault("documents", [])

    # Check for duplicate: nothing to add and, importantly, nothing to count.
    if any(entry.get("id") == hash_id for entry in documents):
        return

    dtype = classification.get("doc_type") or "unknown"
    documents.append({
        "id": hash_id,
        "filename": filepath.name,
        "category": classification["category"],
        "type": dtype,
        "date": classification.get("date"),
        "amount": classification.get("amount"),
        "processed": datetime.now().isoformat(),
    })

    stats = data.setdefault("stats", {})
    stats["total"] = len(documents)
    by_type = stats.setdefault("by_type", {})
    by_type[dtype] = by_type.get(dtype, 0) + 1

    # Write to a sibling file first, then swap it into place.
    tmp_path = index_path.with_name(index_path.name + ".tmp")
    with open(tmp_path, 'w', encoding="utf-8") as f:
        json.dump(data, f, indent=2)
    os.replace(tmp_path, index_path)


def export_expense(hash_id: str, classification: Dict,
                   filepath: Path) -> None:
    """Append to expenses.csv if it's an expense/receipt.

    Only documents in the "expenses" or "bills" category are exported.
    The header row is written when the CSV is missing or empty.
    """
    if classification["category"] not in ("expenses", "bills"):
        return

    csv_path = EXPORTS / "expenses.csv"
    needs_header = not csv_path.exists() or csv_path.stat().st_size == 0

    with open(csv_path, 'a', newline='', encoding="utf-8") as f:
        writer = csv.writer(f)
        if needs_header:
            writer.writerow(["date", "vendor", "amount", "category", "type",
                             "doc_id", "filename"])
        # csv.writer renders None as an empty field.
        writer.writerow([
            classification.get("date", ""),
            classification.get("vendor", ""),
            classification.get("amount", ""),
            classification["category"],
            classification.get("doc_type", ""),
            hash_id,
            filepath.name,
        ])


def process_document(filepath: Path) -> bool:
    """Process a single document through the full pipeline.

    Steps: hash, duplicate check against the store, text extraction,
    classification, copy to store, markdown record, index update, expense
    export, and finally removal of the source file.

    Returns True when the file was handled (including the case where it
    was already in the store and was just removed), False when it was
    skipped as hidden or unsupported.
    """
    print(f"Processing: {filepath.name}")

    # Skip hidden files and non-documents
    if filepath.name.startswith('.'):
        return False

    # NOTE(review): extract_text() also handles '.md', but '.md' is not
    # accepted here; confirm whether that is intentional before adding it.
    valid_extensions = {'.pdf', '.png', '.jpg', '.jpeg', '.tiff', '.tif',
                        '.bmp', '.txt'}
    if filepath.suffix.lower() not in valid_extensions:
        print(f" Skipping unsupported format: {filepath.suffix}")
        return False

    # 1. Generate hash
    hash_id = file_hash(filepath)
    print(f" Hash: {hash_id}")

    # 2. Check if already processed
    store_path = STORE / f"{hash_id}{filepath.suffix.lower()}"
    if store_path.exists():
        print(" Already processed, removing from inbox")
        filepath.unlink()
        return True

    # 3. Extract text (OCR if needed)
    print(" Extracting text...")
    text = extract_text(filepath)
    if not text:
        print(" Warning: No text extracted")
        text = "(No text could be extracted)"
    else:
        print(f" Extracted {len(text)} characters")

    # 4. Classify
    print(" Classifying...")
    classification = classify_document(text, filepath.name)
    print(f" Category: {classification['category']}, "
          f"Type: {classification.get('doc_type')}")

    # 5. Store PDF
    print(" Storing document...")
    store_document(filepath, hash_id)

    # 6. Create record
    print(" Creating record...")
    record_path = create_record(filepath, hash_id, text, classification)
    print(f" Record: {record_path}")

    # 7. Update index
    print(" Updating index...")
    update_master_index(hash_id, filepath, classification)

    # 8. Export if expense
    export_expense(hash_id, classification, filepath)

    # 9. Remove from inbox
    print(" Removing from inbox...")
    filepath.unlink()

    print(f" ✓ Done: {classification['category']}/{hash_id}")
    return True


def process_inbox() -> int:
    """Process all documents in inbox. Returns count processed.

    Hidden files and non-files are ignored. A failure on one document is
    reported and does not stop the remaining documents.
    """
    count = 0
    # Snapshot and sort the listing: files are unlinked while we iterate,
    # and a fixed order makes runs reproducible.
    for filepath in sorted(INBOX.iterdir()):
        if not filepath.is_file() or filepath.name.startswith('.'):
            continue
        try:
            if process_document(filepath):
                count += 1
        except Exception as e:  # boundary: keep the batch going
            print(f"Error processing {filepath}: {e}")
    return count


def watch_inbox(interval: int = 30) -> None:
    """Watch inbox continuously.

    Calls process_inbox() and then sleeps *interval* seconds, forever.
    The loop only ends through an exception such as KeyboardInterrupt.
    """
    print(f"Watching {INBOX} (interval: {interval}s)")
    print("Press Ctrl+C to stop")

    while True:
        count = process_inbox()
        if count:
            print(f"Processed {count} document(s)")
        time.sleep(interval)


def main():
    """Command-line entry point.

    ``--file PATH`` processes one file (exit status 1 if it is missing),
    ``--watch`` polls the inbox every ``--interval`` seconds, and with no
    options the inbox is processed once.
    """
    import argparse

    parser = argparse.ArgumentParser(description="Document processor")
    parser.add_argument("--watch", action="store_true",
                        help="Watch inbox continuously")
    parser.add_argument("--interval", type=int, default=30,
                        help="Watch interval in seconds")
    parser.add_argument("--file", type=Path, help="Process single file")

    args = parser.parse_args()

    if args.file:
        if args.file.exists():
            process_document(args.file)
        else:
            print(f"File not found: {args.file}")
            sys.exit(1)
    elif args.watch:
        # The banner promises Ctrl+C stops the watcher; exit without a
        # traceback when it does.
        try:
            watch_inbox(args.interval)
        except KeyboardInterrupt:
            print("Stopped")
    else:
        count = process_inbox()
        print(f"Processed {count} document(s)")


if __name__ == "__main__":
    main()