#!/usr/bin/env python3
"""
Search documents in the document management system.

Matching is done with SQLite ``LIKE`` substring comparisons against the
``documents`` table of the embeddings database.
"""

import os
import sys
import json
import sqlite3
import argparse
from contextlib import closing
from pathlib import Path
from datetime import datetime
from typing import Optional

DOCUMENTS_ROOT = Path.home() / "documents"
INDEX = DOCUMENTS_ROOT / "index"
RECORDS = DOCUMENTS_ROOT / "records"
EMBEDDINGS_DB = INDEX / "embeddings.db"


def get_db() -> sqlite3.Connection:
    """Open a connection to the embeddings database.

    Prints a hint and exits the process with status 1 when the database file
    does not exist. The caller is responsible for closing the connection.
    """
    if not EMBEDDINGS_DB.exists():
        print(f"Database not found: {EMBEDDINGS_DB}")
        print("Run the processor first to create the database.")
        sys.exit(1)
    return sqlite3.connect(EMBEDDINGS_DB)


def _escape_like(text: str) -> str:
    """Escape LIKE wildcards in *text* so it is matched literally.

    Must be paired with a backslash ESCAPE clause in the SQL statement.
    """
    return (
        text.replace("\\", "\\\\")
        .replace("%", "\\%")
        .replace("_", "\\_")
    )


def _text(value, default: str = "", width: Optional[int] = None) -> str:
    """Render a database value for display.

    NULL and empty strings become *default*; everything else is converted
    with ``str`` (so numeric columns are safe) and cut to *width* if given.
    """
    if value is None or value == "":
        return default
    text = str(value)
    return text if width is None else text[:width]


def _count_by(column: str) -> dict:
    """Return ``{value: count}`` for *column*, highest count first.

    *column* is interpolated into the SQL text, so it must be a trusted
    identifier supplied by this module, never user input.
    """
    with closing(get_db()) as conn:
        cursor = conn.execute(f"""
            SELECT {column}, COUNT(*) as count
            FROM documents
            GROUP BY {column}
            ORDER BY count DESC
        """)
        return {row[0]: row[1] for row in cursor.fetchall()}


def search_documents(query: Optional[str], category: Optional[str] = None,
                     doc_type: Optional[str] = None, limit: int = 20) -> list:
    """
    Search documents with SQLite LIKE substring matching.

    *query* is matched literally (wildcard characters are escaped) against
    full_text, summary, vendor and filename. *category* and *doc_type* are
    exact-match filters. Falsy arguments are ignored, so calling with no
    filters returns the most recently processed documents.

    Returns a list of dicts (one per document, without full_text), newest
    processed_at first, at most *limit* entries.
    """
    conditions = []
    params = []

    if query:
        conditions.append(
            "(full_text LIKE ? ESCAPE '\\'"
            " OR summary LIKE ? ESCAPE '\\'"
            " OR vendor LIKE ? ESCAPE '\\'"
            " OR filename LIKE ? ESCAPE '\\')"
        )
        pattern = f"%{_escape_like(query)}%"
        params.extend([pattern] * 4)

    if category:
        conditions.append("category = ?")
        params.append(category)

    if doc_type:
        conditions.append("doc_type = ?")
        params.append(doc_type)

    # Only fixed SQL fragments are interpolated; all values are bound.
    where_clause = " AND ".join(conditions) if conditions else "1=1"
    sql = f"""
        SELECT doc_id, filename, category, doc_type, date, vendor, amount,
               summary, processed_at
        FROM documents
        WHERE {where_clause}
        ORDER BY processed_at DESC
        LIMIT ?
    """
    params.append(limit)

    # closing() guarantees the connection is released even if the query fails.
    with closing(get_db()) as conn:
        conn.row_factory = sqlite3.Row
        return [dict(row) for row in conn.execute(sql, params).fetchall()]


def get_document(doc_id: str) -> Optional[dict]:
    """Get full document details by ID or ID prefix.

    An exact ID match always wins over a prefix match; among several prefix
    matches the lowest doc_id is returned. Returns None when nothing matches
    or when *doc_id* is empty.
    """
    if not doc_id:
        # An empty prefix would otherwise match every document.
        return None

    with closing(get_db()) as conn:
        conn.row_factory = sqlite3.Row
        cursor = conn.execute("""
            SELECT * FROM documents
            WHERE doc_id = ? OR doc_id LIKE ? ESCAPE '\\'
            ORDER BY (doc_id = ?) DESC, doc_id
            LIMIT 1
        """, (doc_id, f"{_escape_like(doc_id)}%", doc_id))
        row = cursor.fetchone()

    return dict(row) if row else None


def list_categories() -> dict:
    """List all categories with document counts, highest count first."""
    return _count_by("category")


def list_types() -> dict:
    """List all document types with counts, highest count first."""
    return _count_by("doc_type")


def show_stats() -> None:
    """Print the total document count and the per-category/per-type counts."""
    with closing(get_db()) as conn:
        total = conn.execute("SELECT COUNT(*) FROM documents").fetchone()[0]

    print("\n📊 Document Statistics")
    print("=" * 40)
    print(f"Total documents: {total}")

    print("\nBy category:")
    for cat, count in list_categories().items():
        print(f"  {cat}: {count}")

    print("\nBy type:")
    for dtype, count in list_types().items():
        print(f"  {dtype}: {count}")


def show_document(doc_id: str) -> None:
    """Print the metadata, summary and first 2000 characters of a document."""
    doc = get_document(doc_id)

    if not doc:
        print(f"Document not found: {doc_id}")
        return

    print(f"\n{'=' * 60}")
    print(f"Document: {doc['filename']}")
    print(f"ID: {doc['doc_id']}")
    print(f"Category: {doc['category']}")
    print(f"Type: {_text(doc['doc_type'], 'unknown')}")
    print(f"Date: {_text(doc['date'], 'N/A')}")
    print(f"Vendor: {_text(doc['vendor'], 'N/A')}")
    # _text keeps a numeric amount of 0 visible instead of showing N/A.
    print(f"Amount: {_text(doc['amount'], 'N/A')}")
    print(f"Processed: {doc['processed_at']}")
    print(f"{'=' * 60}")

    if doc['summary']:
        print(f"\nSummary:\n{doc['summary']}")

    full_text = doc['full_text']
    if full_text:
        print("\n--- Full Text (first 2000 chars) ---\n")
        print(full_text[:2000])
        if len(full_text) > 2000:
            print(f"\n... [{len(full_text) - 2000} more characters]")


def format_results(results: list) -> None:
    """Print search results as a fixed-width table.

    Each entry of *results* is a dict with the keys doc_id, category,
    doc_type, date, amount and filename. NULL cells are shown as blanks
    (doc_type as "unknown"); non-string cells are converted with ``str``.
    """
    if not results:
        print("No documents found")
        return

    print(f"\nFound {len(results)} document(s):\n")

    # Header
    print(f"{'ID':<10} {'Category':<12} {'Type':<18} {'Date':<12} {'Amount':<10} {'Filename'}")
    print("-" * 90)

    for doc in results:
        doc_id = _text(doc['doc_id'], width=8)
        cat = _text(doc['category'], width=12)
        dtype = _text(doc['doc_type'], 'unknown', width=18)
        date = _text(doc['date'], width=12)
        amount = _text(doc['amount'], width=10)
        filename = _text(doc['filename'], width=30)

        print(f"{doc_id:<10} {cat:<12} {dtype:<18} {date:<12} {amount:<10} {filename}")


def main():
    """Parse command-line arguments and run the requested action."""
    parser = argparse.ArgumentParser(description="Search documents")
    parser.add_argument("query", nargs="?", help="Search query")
    parser.add_argument("-c", "--category", help="Filter by category")
    parser.add_argument("-t", "--type", help="Filter by document type")
    parser.add_argument("-s", "--show", help="Show full document by ID")
    parser.add_argument("--stats", action="store_true", help="Show statistics")
    parser.add_argument("-l", "--list", action="store_true", help="List all documents")
    parser.add_argument("-n", "--limit", type=int, default=20, help="Max results (default: 20)")
    # NOTE: --full-text is accepted but nothing in this file acts on it yet.
    parser.add_argument("--full-text", action="store_true", help="Show full text in results")

    args = parser.parse_args()

    if args.stats:
        show_stats()
        return

    if args.show:
        show_document(args.show)
        return

    if args.list or args.query or args.category or args.type:
        results = search_documents(args.query, args.category, args.type, args.limit)
        format_results(results)
    else:
        parser.print_help()


if __name__ == "__main__":
    main()