doc-processor/search.py

156 lines
4.8 KiB
Python
Executable File

#!/usr/bin/env python3
"""
Search documents in the document management system.
"""
import argparse
import json
import os
import sys
from datetime import datetime
from pathlib import Path
from typing import Optional
# Root of the document store, kept under the current user's home directory.
DOCUMENTS_ROOT = Path.home().joinpath("documents")
# Directory that holds the index files (load_index reads "master.json" here).
INDEX = DOCUMENTS_ROOT.joinpath("index")
# Directory that holds record files, in one subdirectory per category.
RECORDS = DOCUMENTS_ROOT.joinpath("records")
def load_index(index_path: Optional[Path] = None) -> dict:
    """Load the master index.

    Args:
        index_path: Location of the index JSON file. When omitted, the
            file ``INDEX / "master.json"`` is used.

    Returns:
        The parsed index. The result always contains a ``"documents"``
        list, so callers can iterate it without checking for the key.
        If the file does not exist, ``{"documents": []}`` is returned.

    Raises:
        json.JSONDecodeError: The file exists but is not valid JSON.
        ValueError: The file is valid JSON but its top level is not an
            object.
    """
    if index_path is None:
        index_path = INDEX / "master.json"

    # EAFP: open directly instead of exists()-then-open, which is racy.
    try:
        with open(index_path, encoding="utf-8") as f:
            data = json.load(f)
    except FileNotFoundError:
        return {"documents": []}

    if not isinstance(data, dict):
        raise ValueError(f"Index file {index_path} must contain a JSON object")

    # Normalise a missing or null document list so callers never hit KeyError.
    if data.get("documents") is None:
        data["documents"] = []
    return data
def _searchable_text(doc: dict) -> str:
    """Return the lower-cased, space-joined metadata fields of *doc*."""
    fields = ("filename", "category", "type", "date", "amount")
    # Missing or null fields contribute an empty string rather than the
    # literal text "None", which would otherwise match a query of "none".
    values = ("" if doc.get(key) is None else str(doc.get(key)) for key in fields)
    return " ".join(values).lower()


def _record_contains(doc: dict, needle: str) -> bool:
    """Return True when the record file of *doc* contains *needle*."""
    doc_id = doc.get("id")
    category = doc.get("category")
    if not doc_id or not category:
        # Without both values there is no record to look up.
        return False

    record_path = find_record(doc_id, category)
    if record_path is None or not record_path.is_file():
        return False

    try:
        # errors="replace" keeps one undecodable record from aborting
        # the whole search.
        content = record_path.read_text(errors="replace")
    except OSError as exc:
        print(f"warning: cannot read record {record_path}: {exc}", file=sys.stderr)
        return False
    return needle in content.lower()


def search_documents(
    query: Optional[str],
    category: Optional[str] = None,
    doc_type: Optional[str] = None,
) -> list:
    """Search documents by query, optionally filtered by category/type.

    Args:
        query: Case-insensitive substring to look for. It is matched first
            against the indexed metadata (filename, category, type, date,
            amount) and then against the text of the document's record
            file. ``None`` or an empty string selects every document that
            passes the filters.
        category: When given, only documents whose ``category`` equals
            this value are considered.
        doc_type: When given, only documents whose ``type`` equals this
            value are considered.

    Returns:
        The matching index entries (dicts), in index order.
    """
    data = load_index()
    query_lower = query.lower() if query else ""
    results = []

    for doc in data.get("documents", []):
        # Apply filters
        if category and doc.get("category") != category:
            continue
        if doc_type and doc.get("type") != doc_type:
            continue

        # If no query, return all documents matching the filters
        if not query_lower:
            results.append(doc)
            continue

        # Cheap metadata check first; the record file is only read when
        # the metadata does not match.
        if query_lower in _searchable_text(doc) or _record_contains(doc, query_lower):
            results.append(doc)

    return results
def find_record(
    doc_id: str,
    category: str,
    records_root: Optional[Path] = None,
) -> Optional[Path]:
    """Find the record file for a document.

    The record is looked up in the ``<records_root>/<category>`` directory
    as the first regular file (in sorted name order) whose name contains
    ``doc_id``.

    Args:
        doc_id: Identifier expected to appear in the record's file name.
        category: Name of the category subdirectory to search.
        records_root: Directory containing the category subdirectories.
            Defaults to the module-level ``RECORDS`` path.

    Returns:
        Path of the matching record file, or ``None`` when ``doc_id`` or
        ``category`` is empty, the category directory does not exist, or
        no file name contains ``doc_id``.
    """
    if not doc_id or not category:
        # An empty id would match every file; a missing category cannot
        # be joined onto the records path.
        return None

    root = RECORDS if records_root is None else records_root
    cat_dir = root / category
    if not cat_dir.is_dir():
        return None

    # iterdir() yields entries in arbitrary order; sorting makes the
    # choice deterministic when several names contain doc_id.
    for entry in sorted(cat_dir.iterdir()):
        if doc_id in entry.name and entry.is_file():
            return entry
    return None
def _match_document(documents: list, doc_id: str) -> Optional[dict]:
    """Return the best match for *doc_id* among *documents*, or None."""
    if not doc_id:
        return None

    # Ordered from most to least specific, so an exact ID always wins over
    # a filename that merely contains the same text. The prefix rule lets
    # the shortened 8-character IDs printed in result listings be used.
    matchers = (
        lambda d: d.get("id") == doc_id,
        lambda d: str(d.get("id") or "").startswith(doc_id),
        lambda d: doc_id in str(d.get("filename") or ""),
    )
    for matches in matchers:
        for doc in documents:
            if matches(doc):
                return doc
    return None


def show_document(doc_id: str) -> None:
    """Show full details of a document.

    The document is selected by exact ID, then by ID prefix, then by a
    substring of its filename. Its metadata is printed, followed by the
    content of its record file when one is found. If nothing matches,
    a "Document not found" message is printed instead.

    Args:
        doc_id: Full ID, ID prefix, or part of the filename.
    """
    data = load_index()
    doc = _match_document(data.get("documents", []), doc_id)
    if doc is None:
        print(f"Document not found: {doc_id}")
        return

    print(f"\n{'='*60}")
    print(f"Document: {doc.get('filename', 'N/A')}")
    print(f"ID: {doc.get('id', 'N/A')}")
    print(f"Category: {doc.get('category', 'N/A')}")
    print(f"Type: {doc.get('type', 'unknown')}")
    print(f"Date: {doc.get('date', 'N/A')}")
    print(f"Amount: {doc.get('amount', 'N/A')}")
    print(f"Processed: {doc.get('processed', 'N/A')}")
    print(f"{'='*60}")

    # Show record content
    record_id = doc.get("id")
    category = doc.get("category")
    record_path = find_record(record_id, category) if record_id and category else None
    if record_path is not None and record_path.is_file():
        print(f"\nRecord: {record_path}")
        print("-"*60)
        # errors="replace" avoids a crash on records with undecodable bytes.
        print(record_path.read_text(errors="replace"))
def list_stats() -> None:
    """Show document statistics.

    Prints the total document count, the per-type counts stored in the
    index's ``stats`` section, and per-category counts computed from the
    document list. An index without a ``stats`` section (for example the
    empty index returned when no index file exists) is reported with the
    total taken from the document list instead of raising ``KeyError``.
    """
    from collections import Counter

    data = load_index()
    documents = data.get("documents") or []
    stats = data.get("stats") or {}

    print("\n📊 Document Statistics")
    print("="*40)
    print(f"Total documents: {stats.get('total', len(documents))}")

    print("\nBy type:")
    for dtype, count in sorted((stats.get("by_type") or {}).items()):
        print(f" {dtype}: {count}")

    print("\nBy category:")
    # Missing and null categories are both grouped under "unknown" so the
    # keys stay sortable strings.
    by_cat = Counter(doc.get("category") or "unknown" for doc in documents)
    for cat, count in sorted(by_cat.items()):
        print(f" {cat}: {count}")
def _format_result_row(doc: dict) -> str:
    """Format one search result as a fixed-width listing line."""

    def text(key: str) -> str:
        # Missing and null fields render as empty text; format specs such
        # as ":12" raise TypeError when applied to None.
        value = doc.get(key)
        return "" if value is None else str(value)

    # The amount is deliberately not converted to str, so numeric amounts
    # keep their default right-aligned formatting.
    amount = doc.get("amount")
    if amount is None:
        amount = ""
    date = text("date")[:10]
    return (
        f" [{text('id')[:8]}] {text('category'):12} {text('type'):15} "
        f"{date:12} {amount:10} {text('filename')}"
    )


def main(argv: Optional[list] = None) -> None:
    """Run the command-line interface.

    ``--stats`` takes precedence over ``--show``, which takes precedence
    over searching. A search runs when ``--list``, a query, ``--category``
    or ``--type`` is given; with no arguments the help text is printed.

    Args:
        argv: Argument list to parse. Defaults to ``sys.argv[1:]``
            (argparse's behavior when given ``None``).
    """
    parser = argparse.ArgumentParser(description="Search documents")
    parser.add_argument("query", nargs="?", help="Search query")
    parser.add_argument("-c", "--category", help="Filter by category")
    parser.add_argument("-t", "--type", help="Filter by document type")
    parser.add_argument("-s", "--show", help="Show full document by ID")
    parser.add_argument("--stats", action="store_true", help="Show statistics")
    parser.add_argument("-l", "--list", action="store_true", help="List all documents")
    args = parser.parse_args(argv)

    if args.stats:
        list_stats()
        return

    if args.show:
        show_document(args.show)
        return

    if not (args.list or args.query or args.category or args.type):
        parser.print_help()
        return

    results = search_documents(args.query, args.category, args.type)
    if not results:
        print("No documents found")
        return

    print(f"\nFound {len(results)} document(s):\n")
    for doc in results:
        print(_format_result_row(doc))
# Run the command-line interface only when executed as a script,
# not when the module is imported.
if __name__ == "__main__":
    main()