# doc-processor/search.py

#!/usr/bin/env python3
"""
Search documents in the document management system.
"""

import argparse
import json
import os
import sys
from collections import Counter
from datetime import datetime
from pathlib import Path

# Root of the document store, located inside the current user's home directory.
DOCUMENTS_ROOT = Path.home().joinpath("documents")
# Directory that contains the master index file (master.json).
INDEX = DOCUMENTS_ROOT.joinpath("index")
# Directory whose per-category subdirectories hold the record files.
RECORDS = DOCUMENTS_ROOT.joinpath("records")


def load_index() -> dict:
    """Load the master index from ``INDEX / "master.json"``.

    Returns:
        The parsed index. When the index file does not exist, the empty
        index ``{"documents": []}`` is returned instead. If the parsed JSON
        object has no ``"documents"`` key, an empty list is stored under it
        so callers can always iterate ``data["documents"]``.

    Raises:
        json.JSONDecodeError: If the index file is not valid JSON.
    """
    index_path = INDEX / "master.json"
    try:
        # JSON text is UTF-8 by specification; do not depend on the locale.
        with open(index_path, encoding="utf-8") as f:
            data = json.load(f)
    except (FileNotFoundError, NotADirectoryError):
        # EAFP: opening directly avoids the race between exists() and open().
        return {"documents": []}

    if isinstance(data, dict):
        data.setdefault("documents", [])
    return data


def search_documents(query: str, category: str = None, doc_type: str = None) -> list:
    """Search documents by query, optionally filtered by category/type.

    Matching is a case-insensitive substring test. Each document that passes
    the filters is checked first against its indexed fields (filename,
    category, type, date, amount) and, if that fails, against the text of
    its record file as located by ``find_record``.

    Args:
        query: Text to look for. When empty or ``None``, every document
            that passes the filters is returned.
        category: If given, keep only documents whose ``category`` equals it.
        doc_type: If given, keep only documents whose ``type`` equals it.

    Returns:
        The matching index entries, in index order.
    """
    data = load_index()
    needle = query.lower() if query else ""
    indexed_fields = ("filename", "category", "type", "date", "amount")
    results = []

    for doc in data["documents"]:
        # Apply filters
        if category and doc.get("category") != category:
            continue
        if doc_type and doc.get("type") != doc_type:
            continue

        # If no query, return all matching filters
        if not needle:
            results.append(doc)
            continue

        # Search in indexed fields
        searchable = " ".join(str(doc.get(field, "")) for field in indexed_fields).lower()
        if needle in searchable:
            results.append(doc)
            continue

        # Search in full text record. Entries without an id or category
        # cannot be mapped to a record file, so they are skipped rather
        # than raising KeyError.
        doc_id = doc.get("id")
        doc_category = doc.get("category")
        if not doc_id or not doc_category:
            continue

        record_path = find_record(doc_id, doc_category)
        if record_path is None:
            continue

        try:
            # errors="replace" keeps one undecodable record from aborting
            # the whole search.
            content = record_path.read_text(errors="replace")
        except OSError as exc:
            # Report and keep going: a single unreadable record should not
            # hide the remaining results.
            print(f"Warning: could not read {record_path}: {exc}", file=sys.stderr)
            continue

        if needle in content.lower():
            results.append(doc)

    return results


def find_record(doc_id: str, category: str) -> "Path | None":
    """Find the record file for a document.

    Looks in ``RECORDS / category`` for a regular file whose name contains
    ``doc_id``. A file whose stem equals ``doc_id`` exactly is preferred, so
    that a short ID does not pick up a record of a longer ID that merely
    contains it. Candidates are examined in sorted order so the result does
    not depend on directory listing order.

    Args:
        doc_id: Document identifier to look for in the file names.
        category: Name of the category subdirectory to search.

    Returns:
        The path of the matching record file, or ``None`` when ``doc_id`` or
        ``category`` is empty, the category directory does not exist, or no
        file matches.
    """
    # An empty ID is a substring of every name; treat it as "no match".
    if not doc_id or not category:
        return None

    cat_dir = RECORDS / category
    if not cat_dir.is_dir():
        return None

    candidates = sorted(
        entry for entry in cat_dir.iterdir()
        if doc_id in entry.name and entry.is_file()
    )
    for entry in candidates:
        if entry.stem == doc_id:
            return entry
    # Fall back to the first file that embeds the ID somewhere in its name.
    return candidates[0] if candidates else None


def show_document(doc_id: str) -> None:
    """Show full details of a document.

    The document is selected by exact ID; only if no entry has that ID is
    the first entry whose filename contains ``doc_id`` used instead. The
    index fields are printed, followed by the content of the record file
    when ``find_record`` locates one. If nothing matches, a "not found"
    message is printed.

    Args:
        doc_id: A document ID, or a fragment of a filename.
    """
    data = load_index()
    documents = data["documents"]

    # An exact ID match anywhere in the index wins over a filename substring
    # hit on an earlier entry.
    doc = next((d for d in documents if d.get("id") == doc_id), None)
    if doc is None:
        doc = next(
            (d for d in documents if doc_id in (d.get("filename") or "")),
            None,
        )
    if doc is None:
        print(f"Document not found: {doc_id}")
        return

    rule = "=" * 60
    print(f"\n{rule}")
    print(f"Document: {doc.get('filename', 'N/A')}")
    print(f"ID: {doc.get('id', 'N/A')}")
    print(f"Category: {doc.get('category', 'N/A')}")
    print(f"Type: {doc.get('type', 'unknown')}")
    print(f"Date: {doc.get('date', 'N/A')}")
    print(f"Amount: {doc.get('amount', 'N/A')}")
    print(f"Processed: {doc.get('processed', 'N/A')}")
    print(rule)

    # Show record content. Without both an id and a category there is no
    # record location to look up.
    record_id = doc.get("id")
    record_category = doc.get("category")
    record_path = None
    if record_id and record_category:
        record_path = find_record(record_id, record_category)

    if record_path:
        print(f"\nRecord: {record_path}")
        print("-" * 60)
        try:
            print(record_path.read_text(errors="replace"))
        except OSError as exc:
            print(f"(could not read record: {exc})")


def list_stats() -> None:
    """Show document statistics.

    Prints the total document count, the per-type counts stored under the
    index's ``"stats"`` entry, and per-category counts computed from the
    document list. When the index has no ``"stats"`` entry (for example
    because the index file does not exist yet), the total falls back to the
    number of listed documents and the per-type section is empty.
    """
    data = load_index()
    documents = data.get("documents") or []
    stats = data.get("stats") or {}

    print("\n📊 Document Statistics")
    print("=" * 40)
    print(f"Total documents: {stats.get('total', len(documents))}")

    print("\nBy type:")
    for dtype, count in sorted((stats.get("by_type") or {}).items()):
        print(f"  {dtype}: {count}")

    print("\nBy category:")
    by_category = Counter(doc.get("category", "unknown") for doc in documents)
    for cat, count in sorted(by_category.items()):
        print(f"  {cat}: {count}")


def _format_result_row(doc: dict) -> str:
    """Render one index entry as a fixed-width line for the result list."""
    # Missing keys and JSON nulls are shown as blanks; formatting None with
    # a width spec would raise TypeError.
    doc_id = str(doc.get("id") or "")[:8]
    category = doc.get("category") or ""
    doc_type = doc.get("type") or ""
    date = doc.get("date", "")[:10] if doc.get("date") else ""
    amount = doc.get("amount")
    if amount is None:
        amount = ""
    filename = doc.get("filename") or ""
    return f"  [{doc_id}] {category:12} {doc_type:15} {date:12} {amount:10} {filename}"


def main():
    """Parse the command line and run the requested action.

    ``--stats`` takes precedence over ``--show``, which takes precedence
    over searching/listing. With no arguments at all, the help text is
    printed.
    """
    parser = argparse.ArgumentParser(description="Search documents")
    parser.add_argument("query", nargs="?", help="Search query")
    parser.add_argument("-c", "--category", help="Filter by category")
    parser.add_argument("-t", "--type", help="Filter by document type")
    parser.add_argument("-s", "--show", help="Show full document by ID")
    parser.add_argument("--stats", action="store_true", help="Show statistics")
    parser.add_argument("-l", "--list", action="store_true", help="List all documents")
    args = parser.parse_args()

    if args.stats:
        list_stats()
        return

    if args.show:
        show_document(args.show)
        return

    wants_search = args.list or args.query or args.category or args.type
    if not wants_search:
        parser.print_help()
        return

    results = search_documents(args.query, args.category, args.type)
    if not results:
        print("No documents found")
        return

    print(f"\nFound {len(results)} document(s):\n")
    for doc in results:
        print(_format_result_row(doc))


# Run the command-line interface only when executed as a script, not on import.
if __name__ == "__main__":
    main()