# doc-processor/processor.py  (407 lines, 13 KiB, executable Python script)
#!/usr/bin/env python3
"""
Document Processor for ~/documents/inbox/
Watches for new documents, OCRs them, classifies, and files them.
"""
import csv
import hashlib
import json
import os
import re
import shutil
import sqlite3
import subprocess
import sys
import tempfile
import time
from datetime import datetime
from pathlib import Path
from typing import Any, Dict, Optional
# ---------------------------------------------------------------------------
# Filesystem layout: everything lives under ~/documents.
# ---------------------------------------------------------------------------
DOCUMENTS_ROOT = Path.home() / "documents"
INBOX = DOCUMENTS_ROOT / "inbox"        # new documents land here
STORE = DOCUMENTS_ROOT / "store"        # immutable, hash-named originals
RECORDS = DOCUMENTS_ROOT / "records"    # per-category markdown records
INDEX = DOCUMENTS_ROOT / "index"        # master.json index
EXPORTS = DOCUMENTS_ROOT / "exports"    # CSV exports (expenses.csv)

# Categories a document can be filed under; "uncategorized" is the fallback.
CATEGORIES = [
    "taxes", "bills", "medical", "insurance", "legal",
    "financial", "expenses", "vehicles", "home",
    "personal", "contacts", "uncategorized",
]

# Ensure directories exist (import-time side effect, kept from the original).
# INBOX is included here: process_inbox() iterates it, and on a fresh machine
# the original crashed with FileNotFoundError because it was never created.
for d in [INBOX, STORE, INDEX, EXPORTS]:
    d.mkdir(parents=True, exist_ok=True)
for cat in CATEGORIES:
    (RECORDS / cat).mkdir(parents=True, exist_ok=True)
def file_hash(filepath: Path) -> str:
    """Return the first 16 hex digits of the SHA-256 of *filepath*'s bytes."""
    digest = hashlib.sha256()
    with open(filepath, "rb") as handle:
        while True:
            chunk = handle.read(8192)
            if not chunk:
                break
            digest.update(chunk)
    # 16 hex chars (64 bits) is plenty of collision resistance for a filename.
    return digest.hexdigest()[:16]
def extract_text_pdf(filepath: Path) -> str:
    """Extract text from a PDF with pdftotext, falling back to OCR.

    A result of 50 characters or fewer is treated as a scanned or empty
    PDF and sent through OCR instead.
    """
    try:
        proc = subprocess.run(
            ['pdftotext', '-layout', str(filepath), '-'],
            capture_output=True, text=True, timeout=30,
        )
        extracted = proc.stdout.strip()
        if len(extracted) > 50:  # Got meaningful text
            return extracted
    except Exception as exc:
        print(f"pdftotext failed: {exc}")
    # Too little (or no) text: assume a scanned document and OCR it.
    return ocr_document(filepath)
def ocr_document(filepath: Path) -> str:
    """OCR *filepath* with tesseract and return the recognized text.

    PDFs are first rasterized to 300-dpi PNGs with pdftoppm; each page is
    OCR'd and the page texts joined with newlines.  Returns "" on any
    failure (missing tools, timeout, unreadable file).
    """
    try:
        if filepath.suffix.lower() == '.pdf':
            # Rasterize into a private temp directory.  The original wrote
            # pages to a fixed /tmp/doc_page-* prefix, which swept up stale
            # pages left by earlier failed runs and collided with concurrent
            # processes; a per-call TemporaryDirectory fixes both and cleans
            # itself up.
            with tempfile.TemporaryDirectory(prefix="doc_ocr_") as tmpdir:
                prefix = str(Path(tmpdir) / "page")
                subprocess.run(
                    ['pdftoppm', '-png', '-r', '300', str(filepath), prefix],
                    capture_output=True, timeout=60,
                )
                # OCR each page in order.
                text_parts = []
                for img in sorted(Path(tmpdir).glob('page-*.png')):
                    result = subprocess.run(
                        ['tesseract', str(img), 'stdout'],
                        capture_output=True, text=True, timeout=60,
                    )
                    text_parts.append(result.stdout)
                return '\n'.join(text_parts).strip()
        else:
            # Direct image OCR.
            result = subprocess.run(
                ['tesseract', str(filepath), 'stdout'],
                capture_output=True, text=True, timeout=60,
            )
            return result.stdout.strip()
    except Exception as e:
        # Best-effort: callers treat "" as "no text extracted".
        print(f"OCR failed: {e}")
        return ""
def extract_text(filepath: Path) -> str:
    """Dispatch text extraction based on file extension.

    PDFs go through pdftotext (with OCR fallback), images through OCR,
    and plain-text/markdown files are read directly.  Unknown extensions
    yield "".
    """
    suffix = filepath.suffix.lower()
    if suffix == '.pdf':
        return extract_text_pdf(filepath)
    if suffix in ('.png', '.jpg', '.jpeg', '.tiff', '.tif', '.bmp'):
        return ocr_document(filepath)
    if suffix in ('.txt', '.md'):
        # errors="replace" keeps a single stray non-UTF-8 byte from raising
        # UnicodeDecodeError and aborting the whole pipeline run.
        return filepath.read_text(encoding="utf-8", errors="replace")
    return ""
def classify_document(text: str, filename: str) -> Dict[str, Any]:
    """
    Classify document based on content.
    Returns: {category, doc_type, date, vendor, amount, summary}
    """
    lowered = text.lower()
    result: Dict[str, Any] = {
        "category": "uncategorized",
        "doc_type": "unknown",
        "date": None,
        "vendor": None,
        "amount": None,
        "summary": None,
    }

    # Date extraction: first matching pattern wins.
    for pattern in (
        r'(\d{1,2}[/-]\d{1,2}[/-]\d{2,4})',
        r'(\d{4}[/-]\d{1,2}[/-]\d{1,2})',
        r'((?:jan|feb|mar|apr|may|jun|jul|aug|sep|oct|nov|dec)[a-z]* \d{1,2},? \d{4})',
    ):
        found = re.search(pattern, lowered)
        if found:
            result["date"] = found.group(1)
            break

    # First dollar amount in the document, if any.
    money = re.search(r'\$[\d,]+\.?\d*', text)
    if money is not None:
        result["amount"] = money.group(0)

    # Keyword rules in priority order; the first category hit wins.
    # NOTE(review): these are plain substring checks (e.g. 'md' matches
    # inside other words) — preserved from the original behavior.
    rules = [
        ("taxes", "tax_form",
         ['w-2', 'w2', '1099', 'tax return', 'irs', '1040', 'schedule c', 'form 1098']),
        ("bills", "bill",
         ['invoice', 'bill', 'amount due', 'payment due', 'account number', 'autopay']),
        ("medical", "medical_record",
         ['patient', 'diagnosis', 'prescription', 'medical', 'physician', 'hospital', 'clinic', 'dr.', 'md']),
        ("insurance", "insurance_doc",
         ['policy', 'coverage', 'premium', 'deductible', 'insurance', 'claim']),
        ("legal", "legal_doc",
         ['agreement', 'contract', 'terms', 'hereby', 'whereas', 'attorney', 'legal']),
        ("financial", "financial_statement",
         ['bank', 'statement', 'account', 'balance', 'deposit', 'withdrawal', 'investment', 'portfolio']),
        ("expenses", "receipt",
         ['receipt', 'purchase', 'order', 'subtotal', 'total', 'qty', 'item']),
        ("vehicles", "vehicle_doc",
         ['vin', 'vehicle', 'registration', 'dmv', 'license plate', 'odometer']),
        ("home", "property_doc",
         ['mortgage', 'deed', 'property', 'hoa', 'homeowner']),
    ]
    for category, doc_type, keywords in rules:
        if any(k in lowered for k in keywords):
            result["category"] = category
            result["doc_type"] = doc_type
            break

    # Bills only: try to identify a known vendor.
    if result["category"] == "bills":
        for vendor in ['duke energy', 'fpl', 'florida power', 'spectrum', 'at&t',
                       'verizon', 't-mobile', 'comcast', 'xfinity']:
            if vendor in lowered:
                result["vendor"] = vendor.title()
                break

    # Summary: first 200 chars with whitespace collapsed.
    result["summary"] = ' '.join(text.split())[:200]
    return result
def store_document(filepath: Path, hash_id: str) -> Path:
    """Copy the document into STORE as <hash_id><suffix>; no-op if present."""
    destination = STORE / f"{hash_id}{filepath.suffix.lower()}"
    if not destination.exists():
        # copy2 preserves the original's timestamps/metadata.
        shutil.copy2(filepath, destination)
    return destination
def create_record(filepath: Path, hash_id: str, text: str, classification: Dict) -> Path:
    """Write a markdown record into records/<category>/<YYYYMMDD>_<id>.md.

    Returns the path of the record written.
    """
    cat = classification["category"]
    now = datetime.now()
    record_name = f"{now.strftime('%Y%m%d')}_{hash_id}.md"
    record_path = RECORDS / cat / record_name

    # classify_document() always populates these keys (often with None), so
    # dict.get() defaults never fired and records showed the string "None".
    # Coalesce falsy values explicitly instead.
    date = classification.get("date") or "N/A"
    vendor = classification.get("vendor") or "N/A"
    amount = classification.get("amount") or "N/A"
    summary = classification.get("summary") or "No summary available."
    # Lowercase the suffix to match store_document(), which stores the file
    # as <hash_id><suffix.lower()> — the original link broke for ".PDF" etc.
    stored_name = f"{hash_id}{filepath.suffix.lower()}"

    content = f"""# Document Record
**ID:** {hash_id}
**Original File:** {filepath.name}
**Processed:** {now.isoformat()}
**Category:** {cat}
**Type:** {classification.get('doc_type', 'unknown')}
## Extracted Info
| Field | Value |
|-------|-------|
| Date | {date} |
| Vendor | {vendor} |
| Amount | {amount} |
## Summary
{summary}
## Full Text
```
{text[:5000]}
```
## Files
- **PDF:** [store/{stored_name}](../../store/{stored_name})
"""
    record_path.write_text(content)
    return record_path
def update_master_index(hash_id: str, filepath: Path, classification: Dict) -> None:
    """Add the document to index/master.json; idempotent per hash_id.

    Loads the existing index (or seeds a fresh one), appends the document
    entry, and refreshes the stats counters.  Duplicates return early so
    reprocessing can never double-count stats.
    """
    index_path = INDEX / "master.json"
    if index_path.exists():
        with open(index_path) as f:
            data = json.load(f)
    else:
        data = {
            "version": "1.0",
            "created": datetime.now().strftime("%Y-%m-%d"),
            "documents": [],
            "stats": {"total": 0, "by_type": {}, "by_year": {}},
        }

    # Already indexed: nothing to add, and the file on disk is correct.
    if any(d["id"] == hash_id for d in data["documents"]):
        return

    data["documents"].append({
        "id": hash_id,
        "filename": filepath.name,
        "category": classification["category"],
        "type": classification.get("doc_type", "unknown"),
        "date": classification.get("date"),
        "amount": classification.get("amount"),
        "processed": datetime.now().isoformat(),
    })
    data["stats"]["total"] = len(data["documents"])
    # setdefault tolerates index files written before "by_type" existed.
    dtype = classification.get("doc_type", "unknown")
    by_type = data["stats"].setdefault("by_type", {})
    by_type[dtype] = by_type.get(dtype, 0) + 1

    with open(index_path, 'w') as f:
        json.dump(data, f, indent=2)
def export_expense(hash_id: str, classification: Dict, filepath: Path) -> None:
    """Append a row to exports/expenses.csv for bills and expense receipts."""
    if classification["category"] not in ("expenses", "bills"):
        return
    csv_path = EXPORTS / "expenses.csv"
    # Header goes in only on first creation of the CSV.
    write_header = not csv_path.exists()
    with open(csv_path, 'a', newline='') as handle:
        writer = csv.writer(handle)
        if write_header:
            writer.writerow(["date", "vendor", "amount", "category", "type", "doc_id", "filename"])
        writer.writerow([
            classification.get("date", ""),
            classification.get("vendor", ""),
            classification.get("amount", ""),
            classification["category"],
            classification.get("doc_type", ""),
            hash_id,
            filepath.name,
        ])
def process_document(filepath: Path) -> bool:
    """Run one file through the full pipeline.

    Steps: hash -> dedup check -> extract text -> classify -> store ->
    record -> index -> expense export -> remove from inbox.

    Returns True when the file was handled (including the already-processed
    case), False when it was skipped (hidden or unsupported).
    """
    print(f"Processing: {filepath.name}")
    # Skip hidden files and non-documents.
    if filepath.name.startswith('.'):
        return False
    # '.md' added for consistency: extract_text() reads markdown, but the
    # original set rejected it here.
    valid_extensions = {'.pdf', '.png', '.jpg', '.jpeg', '.tiff', '.tif', '.bmp', '.txt', '.md'}
    if filepath.suffix.lower() not in valid_extensions:
        print(f" Skipping unsupported format: {filepath.suffix}")
        return False

    # 1. Content hash doubles as document ID and dedup key.
    hash_id = file_hash(filepath)
    print(f" Hash: {hash_id}")

    # 2. Already in the store -> just clear it from the inbox.
    store_path = STORE / f"{hash_id}{filepath.suffix.lower()}"
    if store_path.exists():
        print(f" Already processed, removing from inbox")
        filepath.unlink()
        return True

    # 3. Extract text (OCR if needed).
    print(" Extracting text...")
    text = extract_text(filepath)
    if not text:
        print(" Warning: No text extracted")
        text = "(No text could be extracted)"
    else:
        print(f" Extracted {len(text)} characters")

    # 4. Classify.
    print(" Classifying...")
    classification = classify_document(text, filepath.name)
    print(f" Category: {classification['category']}, Type: {classification.get('doc_type')}")

    # 5. Store original.
    print(" Storing document...")
    store_document(filepath, hash_id)

    # 6. Create markdown record.
    print(" Creating record...")
    record_path = create_record(filepath, hash_id, text, classification)
    print(f" Record: {record_path}")

    # 7. Update master index.
    print(" Updating index...")
    update_master_index(hash_id, filepath, classification)

    # 8. Export if expense/bill.
    export_expense(hash_id, classification, filepath)

    # 9. Remove from inbox only after everything above succeeded.
    print(" Removing from inbox...")
    filepath.unlink()
    print(f" ✓ Done: {classification['category']}/{hash_id}")
    return True
def process_inbox() -> int:
    """Process every visible regular file in INBOX; return how many succeeded."""
    processed = 0
    for entry in INBOX.iterdir():
        if not entry.is_file() or entry.name.startswith('.'):
            continue
        try:
            if process_document(entry):
                processed += 1
        except Exception as exc:
            # One bad document must not stop the rest of the batch.
            print(f"Error processing {entry}: {exc}")
    return processed
def watch_inbox(interval: int = 30) -> None:
    """Poll the inbox forever, processing new arrivals every *interval* seconds.

    The banner promises Ctrl+C stops the watcher, so KeyboardInterrupt is
    caught for a clean exit instead of dumping a traceback (the original
    let it propagate).
    """
    print(f"Watching {INBOX} (interval: {interval}s)")
    print("Press Ctrl+C to stop")
    try:
        while True:
            count = process_inbox()
            if count:
                print(f"Processed {count} document(s)")
            time.sleep(interval)
    except KeyboardInterrupt:
        print("Stopped.")
def main():
    """CLI entry point: process a single file, watch mode, or one inbox sweep."""
    import argparse

    parser = argparse.ArgumentParser(description="Document processor")
    parser.add_argument("--watch", action="store_true", help="Watch inbox continuously")
    parser.add_argument("--interval", type=int, default=30, help="Watch interval in seconds")
    parser.add_argument("--file", type=Path, help="Process single file")
    args = parser.parse_args()

    if args.file:
        # Single-file mode: fail loudly if the path does not exist.
        if not args.file.exists():
            print(f"File not found: {args.file}")
            sys.exit(1)
        process_document(args.file)
    elif args.watch:
        watch_inbox(args.interval)
    else:
        count = process_inbox()
        print(f"Processed {count} document(s)")


if __name__ == "__main__":
    main()