Initial commit
This commit is contained in:
commit
880f9dab9d
|
|
@ -0,0 +1,5 @@
|
|||
# Binaries
|
||||
*.exe
|
||||
node_modules/
|
||||
.venv/
|
||||
__pycache__/
|
||||
|
|
@ -0,0 +1,51 @@
|
|||
.PHONY: build run install clean dev deps
|
||||
|
||||
BINARY=docman
|
||||
VERSION=$(shell git describe --tags --always --dirty 2>/dev/null || echo "dev")
|
||||
BUILD_DIR=build
|
||||
|
||||
# Build flags
|
||||
LDFLAGS=-ldflags "-s -w -X main.Version=$(VERSION)"
|
||||
CGO_CFLAGS=-DSQLITE_ENABLE_FTS5
|
||||
TAGS=-tags "fts5"
|
||||
|
||||
build:
|
||||
@mkdir -p $(BUILD_DIR)
|
||||
CGO_CFLAGS="$(CGO_CFLAGS)" go build $(TAGS) $(LDFLAGS) -o $(BUILD_DIR)/$(BINARY) ./cmd/docman
|
||||
|
||||
run: build
|
||||
./$(BUILD_DIR)/$(BINARY)
|
||||
|
||||
dev:
|
||||
go run ./cmd/docman -port 8200
|
||||
|
||||
deps:
|
||||
go mod download
|
||||
go mod tidy
|
||||
|
||||
# Install to ~/bin
|
||||
install: build
|
||||
@mkdir -p $(HOME)/bin
|
||||
cp $(BUILD_DIR)/$(BINARY) $(HOME)/bin/
|
||||
@echo "Installed to $(HOME)/bin/$(BINARY)"
|
||||
|
||||
# Install systemd service
|
||||
install-service:
|
||||
@mkdir -p $(HOME)/.config/systemd/user
|
||||
@envsubst < deploy/docman.service > $(HOME)/.config/systemd/user/docman.service
|
||||
systemctl --user daemon-reload
|
||||
systemctl --user enable docman
|
||||
@echo "Service installed. Start with: systemctl --user start docman"
|
||||
|
||||
clean:
|
||||
rm -rf $(BUILD_DIR)
|
||||
|
||||
# Development helpers
|
||||
test:
|
||||
go test -v ./...
|
||||
|
||||
lint:
|
||||
golangci-lint run
|
||||
|
||||
fmt:
|
||||
go fmt ./...
|
||||
|
|
@ -0,0 +1,116 @@
|
|||
# DocMan - Document Management System
|
||||
|
||||
AI-powered document scanning, OCR, classification, and search.
|
||||
|
||||
## Quick Start
|
||||
|
||||
```bash
|
||||
# Run directly (dev mode)
|
||||
cd ~/dev/docman
|
||||
make dev
|
||||
|
||||
# Or run the installed binary
|
||||
~/bin/docman -port 8200
|
||||
```
|
||||
|
||||
Open http://localhost:8200
|
||||
|
||||
## Features
|
||||
|
||||
- **Auto-processing**: Drop PDFs/images into `~/documents/inbox/` → auto-classified and indexed
|
||||
- **PDF Preview**: Built-in PDF viewer
|
||||
- **Full-text Search**: FTS5 + semantic search
|
||||
- **Categories**: taxes, expenses, bills, medical, contacts, legal, insurance, banking, receipts
|
||||
- **Expense Tracking**: Filter by date, export to CSV, track tax-deductible items
|
||||
- **Markdown Records**: Each document gets a searchable markdown file
|
||||
|
||||
## Scanner Setup
|
||||
|
||||
1. **Set scanner to save to SMB share:**
|
||||
- Share: `\\192.168.1.16\documents\inbox` (adjust the host to wherever your documents directory is served from)
|
||||
- Or use your scanner's app to save to `~/documents/inbox/`
|
||||
|
||||
2. **Workflow:**
|
||||
- Scan document → lands in inbox
|
||||
- DocMan auto-processes (OCR → classify → store)
|
||||
- View/search in web UI
|
||||
|
||||
## Directory Structure
|
||||
|
||||
```
|
||||
~/documents/
|
||||
├── inbox/ # Drop scans here (auto-processed)
|
||||
├── store/ # PDF storage (by checksum)
|
||||
├── records/ # Markdown records by category
|
||||
│ ├── taxes/
|
||||
│ ├── expenses/
|
||||
│ ├── bills/
|
||||
│ └── ...
|
||||
└── index/ # SQLite database
|
||||
```
|
||||
|
||||
## Configuration
|
||||
|
||||
### Environment Variables
|
||||
|
||||
```bash
|
||||
FIREWORKS_API_KEY=fw_xxx # Required for AI classification
|
||||
```
|
||||
|
||||
### Command Line Options
|
||||
|
||||
```
|
||||
-port HTTP port (default: 8200)
|
||||
-data Data directory (default: ~/documents)
|
||||
-ai-endpoint AI API endpoint (default: Fireworks)
|
||||
-ai-key AI API key
|
||||
-ai-model Classification model
|
||||
-embed-model Embedding model
|
||||
-watch Only watch inbox, don't start web server
|
||||
```
|
||||
|
||||
## Systemd Service
|
||||
|
||||
```bash
|
||||
# Edit to add your Fireworks API key
|
||||
nano ~/.config/systemd/user/docman.service
|
||||
|
||||
# Enable and start
|
||||
systemctl --user daemon-reload
|
||||
systemctl --user enable docman
|
||||
systemctl --user start docman
|
||||
|
||||
# Check status
|
||||
systemctl --user status docman
|
||||
journalctl --user -u docman -f
|
||||
```
|
||||
|
||||
## API Endpoints
|
||||
|
||||
| Method | Path | Description |
|
||||
|--------|------|-------------|
|
||||
| GET | `/` | Dashboard |
|
||||
| GET | `/browse` | Browse documents |
|
||||
| GET | `/doc/:id` | View document |
|
||||
| GET | `/search` | Search page |
|
||||
| GET | `/expenses` | Expenses tracker |
|
||||
| GET | `/upload` | Upload page |
|
||||
| POST | `/api/upload` | Upload document |
|
||||
| GET | `/api/documents/:id` | Get document JSON |
|
||||
| PATCH | `/api/documents/:id` | Update document |
|
||||
| DELETE | `/api/documents/:id` | Delete document |
|
||||
| GET | `/api/search?q=` | Search API |
|
||||
| GET | `/api/expenses/export` | Export CSV |
|
||||
| GET | `/api/stats` | Dashboard stats |
|
||||
|
||||
## Dependencies
|
||||
|
||||
System packages (already installed):
|
||||
- `poppler-utils` (pdftotext, pdfinfo, pdftoppm)
|
||||
- `tesseract-ocr` (OCR for scanned images)
|
||||
|
||||
## Notes
|
||||
|
||||
- Without FIREWORKS_API_KEY, documents will be categorized as "uncategorized"
|
||||
- The inbox watcher runs continuously, processing new files automatically
|
||||
- Markdown files are searchable even without embeddings
|
||||
Binary file not shown.
|
|
@ -0,0 +1,164 @@
|
|||
package main
|
||||
|
||||
import (
|
||||
"context"
|
||||
"flag"
|
||||
"fmt"
|
||||
"io"
|
||||
"log"
|
||||
"os"
|
||||
"os/signal"
|
||||
"path/filepath"
|
||||
"syscall"
|
||||
|
||||
"docman/internal/db"
|
||||
"docman/internal/handlers"
|
||||
"docman/internal/processor"
|
||||
|
||||
"github.com/labstack/echo/v4"
|
||||
"github.com/labstack/echo/v4/middleware"
|
||||
)
|
||||
|
||||
// main wires together the DocMan service: it parses flags, prepares the
// on-disk data layout, opens the SQLite index, starts the inbox watcher,
// and (unless -watch is given) serves the Echo HTTP UI/API.
func main() {
	var (
		port       = flag.Int("port", 8200, "HTTP port")
		dataDir    = flag.String("data", "", "Data directory (default: ~/documents)")
		aiEndpoint = flag.String("ai-endpoint", "https://api.fireworks.ai/inference/v1", "AI API endpoint")
		aiKey      = flag.String("ai-key", "", "AI API key (or FIREWORKS_API_KEY env)")
		aiModel    = flag.String("ai-model", "accounts/fireworks/models/qwen2-vl-72b-instruct", "AI model for classification")
		embedModel = flag.String("embed-model", "nomic-ai/nomic-embed-text-v1.5", "Embedding model")
		watchOnly  = flag.Bool("watch", false, "Only watch inbox, don't start web server")
	)
	flag.Parse()

	// Resolve data directory (defaults to ~/documents).
	// NOTE(review): the UserHomeDir error is discarded; on failure home is
	// "" and the data dir becomes "documents" relative to the CWD — confirm
	// that is acceptable.
	if *dataDir == "" {
		home, _ := os.UserHomeDir()
		*dataDir = filepath.Join(home, "documents")
	}

	// Resolve AI key: the -ai-key flag wins, then FIREWORKS_API_KEY.
	if *aiKey == "" {
		*aiKey = os.Getenv("FIREWORKS_API_KEY")
	}
	if *aiKey == "" {
		log.Println("Warning: No AI API key provided. Classification will fail.")
	}

	// Create the directory layout: inbox (drop zone), store (originals),
	// records (markdown), index (SQLite database).
	dirs := map[string]string{
		"inbox":   filepath.Join(*dataDir, "inbox"),
		"store":   filepath.Join(*dataDir, "store"),
		"records": filepath.Join(*dataDir, "records"),
		"index":   filepath.Join(*dataDir, "index"),
	}
	for _, dir := range dirs {
		if err := os.MkdirAll(dir, 0755); err != nil {
			log.Fatalf("Failed to create directory %s: %v", dir, err)
		}
	}

	// Open database and apply the schema (Init is idempotent).
	dbPath := filepath.Join(dirs["index"], "docman.db")
	database, err := db.Open(dbPath)
	if err != nil {
		log.Fatalf("Failed to open database: %v", err)
	}
	defer database.Close()

	if err := database.Init(); err != nil {
		log.Fatalf("Failed to initialize database: %v", err)
	}

	// Create the inbox processor (OCR + AI classification pipeline).
	proc := processor.New(processor.Config{
		InboxDir:   dirs["inbox"],
		StoreDir:   dirs["store"],
		RecordsDir: dirs["records"],
		AIEndpoint: *aiEndpoint,
		AIKey:      *aiKey,
		AIModel:    *aiModel,
		EmbedModel: *embedModel,
	}, database)

	// Context for graceful shutdown of the watcher goroutine.
	ctx, cancel := context.WithCancel(context.Background())
	defer cancel()

	// Start inbox watcher in the background; context.Canceled is the
	// normal exit path and is not reported as an error.
	go func() {
		if err := proc.Watch(ctx); err != nil && err != context.Canceled {
			log.Printf("Watcher error: %v", err)
		}
	}()

	// Watch-only mode: no HTTP server, just block until SIGINT/SIGTERM.
	if *watchOnly {
		log.Printf("Watching inbox: %s", dirs["inbox"])
		// Wait for signal
		sigCh := make(chan os.Signal, 1)
		signal.Notify(sigCh, syscall.SIGINT, syscall.SIGTERM)
		<-sigCh
		cancel()
		return
	}

	// Create Echo server
	e := echo.New()
	e.HideBanner = true

	// Middleware
	e.Use(middleware.Logger())
	e.Use(middleware.Recover())
	e.Use(middleware.CORS())

	// Template renderer (delegates to the handlers package).
	e.Renderer = &templateRenderer{}

	// Handlers share the database, processor, and storage paths.
	h := handlers.New(database, proc, dirs["store"], dirs["records"])

	// Page routes (HTML UI).
	e.GET("/", h.Dashboard)
	e.GET("/browse", h.Browse)
	e.GET("/doc/:id", h.Document)
	e.GET("/search", h.Search)
	e.GET("/expenses", h.Expenses)
	e.GET("/upload", h.Upload)

	// API routes (JSON).
	e.POST("/api/upload", h.APIUpload)
	e.GET("/api/documents/:id", h.APIDocument)
	e.PATCH("/api/documents/:id", h.APIUpdateDocument)
	e.DELETE("/api/documents/:id", h.APIDeleteDocument)
	e.GET("/api/search", h.APISearch)
	e.GET("/api/expenses/export", h.APIExportExpenses)
	e.GET("/api/stats", h.APIStats)

	// Static file serving (stored PDFs and markdown records).
	e.GET("/pdf/:filename", h.ServePDF)
	e.GET("/markdown", h.ServeMarkdown)

	// Graceful shutdown: on SIGINT/SIGTERM stop the watcher and close the
	// server, which unblocks e.Start below.
	go func() {
		sigCh := make(chan os.Signal, 1)
		signal.Notify(sigCh, syscall.SIGINT, syscall.SIGTERM)
		<-sigCh
		log.Println("Shutting down...")
		cancel()
		e.Close()
	}()

	// Start server; e.Start blocks until the server stops.
	log.Printf("DocMan starting on http://localhost:%d", *port)
	log.Printf("Data directory: %s", *dataDir)
	log.Printf("Inbox: %s", dirs["inbox"])
	if err := e.Start(fmt.Sprintf(":%d", *port)); err != nil {
		log.Printf("Server stopped: %v", err)
	}
}
|
||||
|
||||
// templateRenderer adapts the handlers package's renderer to Echo's
// echo.Renderer interface so HTML pages can be rendered by route handlers.
type templateRenderer struct{}

// Render delegates to a renderer obtained from handlers.NewTemplateRenderer.
// NOTE(review): a renderer is constructed on every call — confirm that
// NewTemplateRenderer does not re-parse templates per request.
func (t *templateRenderer) Render(w io.Writer, name string, data interface{}, c echo.Context) error {
	return handlers.NewTemplateRenderer().Render(w, name, data, c)
}
|
||||
|
|
@ -0,0 +1,15 @@
|
|||
[Unit]
|
||||
Description=DocMan - Document Management System
|
||||
After=network.target
|
||||
|
||||
[Service]
|
||||
Type=simple
|
||||
ExecStart=${HOME}/bin/docman -port 8200 -ai-key ${FIREWORKS_API_KEY}
|
||||
Restart=on-failure
|
||||
RestartSec=5
|
||||
Environment=HOME=${HOME}
|
||||
Environment=FIREWORKS_API_KEY=${FIREWORKS_API_KEY}
|
||||
WorkingDirectory=${HOME}
|
||||
|
||||
[Install]
|
||||
WantedBy=default.target
|
||||
|
|
@ -0,0 +1,25 @@
|
|||
module docman
|
||||
|
||||
go 1.22
|
||||
|
||||
require (
|
||||
github.com/fsnotify/fsnotify v1.7.0
|
||||
github.com/google/uuid v1.6.0
|
||||
github.com/labstack/echo/v4 v4.12.0
|
||||
github.com/mattn/go-sqlite3 v1.14.22
|
||||
github.com/sashabaranov/go-openai v1.29.0
|
||||
)
|
||||
|
||||
require (
|
||||
github.com/golang-jwt/jwt v3.2.2+incompatible // indirect
|
||||
github.com/labstack/gommon v0.4.2 // indirect
|
||||
github.com/mattn/go-colorable v0.1.13 // indirect
|
||||
github.com/mattn/go-isatty v0.0.20 // indirect
|
||||
github.com/valyala/bytebufferpool v1.0.0 // indirect
|
||||
github.com/valyala/fasttemplate v1.2.2 // indirect
|
||||
golang.org/x/crypto v0.26.0 // indirect
|
||||
golang.org/x/net v0.28.0 // indirect
|
||||
golang.org/x/sys v0.23.0 // indirect
|
||||
golang.org/x/text v0.17.0 // indirect
|
||||
golang.org/x/time v0.5.0 // indirect
|
||||
)
|
||||
|
|
@ -0,0 +1,43 @@
|
|||
github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c=
|
||||
github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
|
||||
github.com/fsnotify/fsnotify v1.7.0 h1:8JEhPFa5W2WU7YfeZzPNqzMP6Lwt7L2715Ggo0nosvA=
|
||||
github.com/fsnotify/fsnotify v1.7.0/go.mod h1:40Bi/Hjc2AVfZrqy+aj+yEI+/bRxZnMJyTJwOpGvigM=
|
||||
github.com/golang-jwt/jwt v3.2.2+incompatible h1:IfV12K8xAKAnZqdXVzCZ+TOjboZ2keLg81eXfW3O+oY=
|
||||
github.com/golang-jwt/jwt v3.2.2+incompatible/go.mod h1:8pz2t5EyA70fFQQSrl6XZXzqecmYZeUEB8OUGHkxJ+I=
|
||||
github.com/google/uuid v1.6.0 h1:NIvaJDMOsjHA8n1jAhLSgzrAzy1Hgr+hNrb57e+94F0=
|
||||
github.com/google/uuid v1.6.0/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo=
|
||||
github.com/labstack/echo/v4 v4.12.0 h1:IKpw49IMryVB2p1a4dzwlhP1O2Tf2E0Ir/450lH+kI0=
|
||||
github.com/labstack/echo/v4 v4.12.0/go.mod h1:UP9Cr2DJXbOK3Kr9ONYzNowSh7HP0aG0ShAyycHSJvM=
|
||||
github.com/labstack/gommon v0.4.2 h1:F8qTUNXgG1+6WQmqoUWnz8WiEU60mXVVw0P4ht1WRA0=
|
||||
github.com/labstack/gommon v0.4.2/go.mod h1:QlUFxVM+SNXhDL/Z7YhocGIBYOiwB0mXm1+1bAPHPyU=
|
||||
github.com/mattn/go-colorable v0.1.13 h1:fFA4WZxdEF4tXPZVKMLwD8oUnCTTo08duU7wxecdEvA=
|
||||
github.com/mattn/go-colorable v0.1.13/go.mod h1:7S9/ev0klgBDR4GtXTXX8a3vIGJpMovkB8vQcUbaXHg=
|
||||
github.com/mattn/go-isatty v0.0.16/go.mod h1:kYGgaQfpe5nmfYZH+SKPsOc2e4SrIfOl2e/yFXSvRLM=
|
||||
github.com/mattn/go-isatty v0.0.20 h1:xfD0iDuEKnDkl03q4limB+vH+GxLEtL/jb4xVJSWWEY=
|
||||
github.com/mattn/go-isatty v0.0.20/go.mod h1:W+V8PltTTMOvKvAeJH7IuucS94S2C6jfK/D7dTCTo3Y=
|
||||
github.com/mattn/go-sqlite3 v1.14.22 h1:2gZY6PC6kBnID23Tichd1K+Z0oS6nE/XwU+Vz/5o4kU=
|
||||
github.com/mattn/go-sqlite3 v1.14.22/go.mod h1:Uh1q+B4BYcTPb+yiD3kU8Ct7aC0hY9fxUwlHK0RXw+Y=
|
||||
github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM=
|
||||
github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4=
|
||||
github.com/sashabaranov/go-openai v1.29.0 h1:eBH6LSjtX4md5ImDCX8hNhHQvaRf22zujiERoQpsvLo=
|
||||
github.com/sashabaranov/go-openai v1.29.0/go.mod h1:lj5b/K+zjTSFxVLijLSTDZuP7adOgerWeFyZLUhAKRg=
|
||||
github.com/stretchr/testify v1.8.4 h1:CcVxjf3Q8PM0mHUKJCdn+eZZtm5yQwehR5yeSVQQcUk=
|
||||
github.com/stretchr/testify v1.8.4/go.mod h1:sz/lmYIOXD/1dqDmKjjqLyZ2RngseejIcXlSw2iwfAo=
|
||||
github.com/valyala/bytebufferpool v1.0.0 h1:GqA5TC/0021Y/b9FG4Oi9Mr3q7XYx6KllzawFIhcdPw=
|
||||
github.com/valyala/bytebufferpool v1.0.0/go.mod h1:6bBcMArwyJ5K/AmCkWv1jt77kVWyCJ6HpOuEn7z0Csc=
|
||||
github.com/valyala/fasttemplate v1.2.2 h1:lxLXG0uE3Qnshl9QyaK6XJxMXlQZELvChBOCmQD0Loo=
|
||||
github.com/valyala/fasttemplate v1.2.2/go.mod h1:KHLXt3tVN2HBp8eijSv/kGJopbvo7S+qRAEEKiv+SiQ=
|
||||
golang.org/x/crypto v0.26.0 h1:RrRspgV4mU+YwB4FYnuBoKsUapNIL5cohGAmSH3azsw=
|
||||
golang.org/x/crypto v0.26.0/go.mod h1:GY7jblb9wI+FOo5y8/S2oY4zWP07AkOJ4+jxCqdqn54=
|
||||
golang.org/x/net v0.28.0 h1:a9JDOJc5GMUJ0+UDqmLT86WiEy7iWyIhz8gz8E4e5hE=
|
||||
golang.org/x/net v0.28.0/go.mod h1:yqtgsTWOOnlGLG9GFRrK3++bGOUEkNBoHZc8MEDWPNg=
|
||||
golang.org/x/sys v0.0.0-20220811171246-fbc7d0a398ab/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
|
||||
golang.org/x/sys v0.6.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
|
||||
golang.org/x/sys v0.23.0 h1:YfKFowiIMvtgl1UERQoTPPToxltDeZfbj4H7dVUCwmM=
|
||||
golang.org/x/sys v0.23.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA=
|
||||
golang.org/x/text v0.17.0 h1:XtiM5bkSOt+ewxlOE/aE/AKEHibwj/6gvWMl9Rsh0Qc=
|
||||
golang.org/x/text v0.17.0/go.mod h1:BuEKDfySbSR4drPmRPG/7iBdf8hvFMuRexcpahXilzY=
|
||||
golang.org/x/time v0.5.0 h1:o7cqy6amK/52YcAKIPlM3a+Fpj35zvRj2TP+e1xFSfk=
|
||||
golang.org/x/time v0.5.0/go.mod h1:3BpzKBy/shNhVucY/MWOyx10tF3SFh9QdLuxbVysPQM=
|
||||
gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA=
|
||||
gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM=
|
||||
|
|
@ -0,0 +1,406 @@
|
|||
package db
|
||||
|
||||
import (
|
||||
"database/sql"
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
"time"
|
||||
|
||||
_ "github.com/mattn/go-sqlite3"
|
||||
)
|
||||
|
||||
// DB wraps *sql.DB with DocMan-specific query helpers. All *sql.DB
// methods remain available through embedding.
type DB struct {
	*sql.DB
}
|
||||
|
||||
// Document is one indexed document: file identity, AI-derived
// classification fields, OCR text, and storage/bookkeeping metadata.
// It maps 1:1 (in column order) onto the documents table created in Init.
type Document struct {
	ID            string          `json:"id"`                    // UUID primary key
	Filename      string          `json:"filename"`              // stored filename
	OriginalName  string          `json:"original_name"`         // name the file arrived with
	Category      string          `json:"category"`              // e.g. taxes, expenses, bills; 'uncategorized' by default
	Subcategory   string          `json:"subcategory,omitempty"`
	Title         string          `json:"title"`                 // human-readable title (AI-derived)
	Date          *time.Time      `json:"date,omitempty"`        // document date (stored as YYYY-MM-DD text)
	Vendor        string          `json:"vendor,omitempty"`
	Amount        *float64        `json:"amount,omitempty"`      // monetary amount, nil when not applicable
	Currency      string          `json:"currency,omitempty"`    // defaults to 'USD' in the schema
	TaxDeductible bool            `json:"tax_deductible"`
	OCRText       string          `json:"ocr_text"`              // extracted full text, feeds the FTS index
	Metadata      json.RawMessage `json:"metadata,omitempty"`    // free-form JSON, stored as TEXT
	Embedding     []byte          `json:"embedding,omitempty"`   // embedding vector blob
	StoragePath   string          `json:"storage_path"`          // location of the original file
	MarkdownPath  string          `json:"markdown_path"`         // location of the markdown record
	PageCount     int             `json:"page_count"`
	FileSize      int64           `json:"file_size"`             // bytes
	MimeType      string          `json:"mime_type"`
	Checksum      string          `json:"checksum"`              // content hash of the original file
	ProcessedAt   time.Time       `json:"processed_at"`          // stored as RFC3339 text
	CreatedAt     time.Time       `json:"created_at"`            // stored as RFC3339 text
	UpdatedAt     time.Time       `json:"updated_at"`            // stored as RFC3339 text
}
|
||||
|
||||
// SearchResult is a Document enriched with full-text-search relevance
// data: the bm25 score and a highlighted snippet from the OCR text.
type SearchResult struct {
	Document
	Score   float64 `json:"score"`   // bm25(documents_fts); lower is more relevant
	Snippet string  `json:"snippet"` // excerpt with <mark>…</mark> highlighting
}
|
||||
|
||||
func Open(path string) (*DB, error) {
|
||||
db, err := sql.Open("sqlite3", path+"?_journal_mode=WAL&_busy_timeout=5000")
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
if err := db.Ping(); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
return &DB{db}, nil
|
||||
}
|
||||
|
||||
func (db *DB) Init() error {
|
||||
schema := `
|
||||
CREATE TABLE IF NOT EXISTS documents (
|
||||
id TEXT PRIMARY KEY,
|
||||
filename TEXT NOT NULL,
|
||||
original_name TEXT NOT NULL,
|
||||
category TEXT NOT NULL DEFAULT 'uncategorized',
|
||||
subcategory TEXT,
|
||||
title TEXT NOT NULL,
|
||||
date TEXT,
|
||||
vendor TEXT,
|
||||
amount REAL,
|
||||
currency TEXT DEFAULT 'USD',
|
||||
tax_deductible INTEGER DEFAULT 0,
|
||||
ocr_text TEXT,
|
||||
metadata TEXT,
|
||||
embedding BLOB,
|
||||
storage_path TEXT NOT NULL,
|
||||
markdown_path TEXT,
|
||||
page_count INTEGER DEFAULT 1,
|
||||
file_size INTEGER,
|
||||
mime_type TEXT,
|
||||
checksum TEXT,
|
||||
processed_at TEXT,
|
||||
created_at TEXT DEFAULT CURRENT_TIMESTAMP,
|
||||
updated_at TEXT DEFAULT CURRENT_TIMESTAMP
|
||||
);
|
||||
|
||||
CREATE INDEX IF NOT EXISTS idx_documents_category ON documents(category);
|
||||
CREATE INDEX IF NOT EXISTS idx_documents_date ON documents(date);
|
||||
CREATE INDEX IF NOT EXISTS idx_documents_vendor ON documents(vendor);
|
||||
CREATE INDEX IF NOT EXISTS idx_documents_amount ON documents(amount);
|
||||
|
||||
CREATE VIRTUAL TABLE IF NOT EXISTS documents_fts USING fts5(
|
||||
id,
|
||||
title,
|
||||
ocr_text,
|
||||
vendor,
|
||||
category,
|
||||
content='documents',
|
||||
content_rowid='rowid'
|
||||
);
|
||||
|
||||
CREATE TRIGGER IF NOT EXISTS documents_ai AFTER INSERT ON documents BEGIN
|
||||
INSERT INTO documents_fts(id, title, ocr_text, vendor, category)
|
||||
VALUES (new.id, new.title, new.ocr_text, new.vendor, new.category);
|
||||
END;
|
||||
|
||||
CREATE TRIGGER IF NOT EXISTS documents_ad AFTER DELETE ON documents BEGIN
|
||||
INSERT INTO documents_fts(documents_fts, id, title, ocr_text, vendor, category)
|
||||
VALUES ('delete', old.id, old.title, old.ocr_text, old.vendor, old.category);
|
||||
END;
|
||||
|
||||
CREATE TRIGGER IF NOT EXISTS documents_au AFTER UPDATE ON documents BEGIN
|
||||
INSERT INTO documents_fts(documents_fts, id, title, ocr_text, vendor, category)
|
||||
VALUES ('delete', old.id, old.title, old.ocr_text, old.vendor, old.category);
|
||||
INSERT INTO documents_fts(id, title, ocr_text, vendor, category)
|
||||
VALUES (new.id, new.title, new.ocr_text, new.vendor, new.category);
|
||||
END;
|
||||
|
||||
CREATE TABLE IF NOT EXISTS settings (
|
||||
key TEXT PRIMARY KEY,
|
||||
value TEXT
|
||||
);
|
||||
`
|
||||
_, err := db.Exec(schema)
|
||||
return err
|
||||
}
|
||||
|
||||
// InsertDocument writes a fully-populated Document as a new row. The
// document date is stored as YYYY-MM-DD (NULL when unset); the three
// audit timestamps are stored as RFC3339 text. The column list must stay
// in sync with the documents table in Init and the positional Scan calls
// in scanDocument/scanDocuments.
func (db *DB) InsertDocument(doc *Document) error {
	// A nil *string binds as SQL NULL when the document has no date.
	var dateStr *string
	if doc.Date != nil {
		s := doc.Date.Format("2006-01-02")
		dateStr = &s
	}

	_, err := db.Exec(`
		INSERT INTO documents (
			id, filename, original_name, category, subcategory, title, date,
			vendor, amount, currency, tax_deductible, ocr_text, metadata,
			embedding, storage_path, markdown_path, page_count, file_size,
			mime_type, checksum, processed_at, created_at, updated_at
		) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
	`, doc.ID, doc.Filename, doc.OriginalName, doc.Category, doc.Subcategory,
		doc.Title, dateStr, doc.Vendor, doc.Amount, doc.Currency, doc.TaxDeductible,
		doc.OCRText, doc.Metadata, doc.Embedding, doc.StoragePath, doc.MarkdownPath,
		doc.PageCount, doc.FileSize, doc.MimeType, doc.Checksum,
		doc.ProcessedAt.Format(time.RFC3339),
		doc.CreatedAt.Format(time.RFC3339),
		doc.UpdatedAt.Format(time.RFC3339))
	return err
}
|
||||
|
||||
func (db *DB) UpdateDocument(doc *Document) error {
|
||||
var dateStr *string
|
||||
if doc.Date != nil {
|
||||
s := doc.Date.Format("2006-01-02")
|
||||
dateStr = &s
|
||||
}
|
||||
|
||||
_, err := db.Exec(`
|
||||
UPDATE documents SET
|
||||
category = ?, subcategory = ?, title = ?, date = ?,
|
||||
vendor = ?, amount = ?, currency = ?, tax_deductible = ?,
|
||||
metadata = ?, updated_at = ?
|
||||
WHERE id = ?
|
||||
`, doc.Category, doc.Subcategory, doc.Title, dateStr,
|
||||
doc.Vendor, doc.Amount, doc.Currency, doc.TaxDeductible,
|
||||
doc.Metadata, time.Now().Format(time.RFC3339), doc.ID)
|
||||
return err
|
||||
}
|
||||
|
||||
func (db *DB) GetDocument(id string) (*Document, error) {
|
||||
row := db.QueryRow(`SELECT * FROM documents WHERE id = ?`, id)
|
||||
return scanDocument(row)
|
||||
}
|
||||
|
||||
func (db *DB) DeleteDocument(id string) error {
|
||||
_, err := db.Exec(`DELETE FROM documents WHERE id = ?`, id)
|
||||
return err
|
||||
}
|
||||
|
||||
func (db *DB) ListDocuments(category string, limit, offset int) ([]*Document, error) {
|
||||
var rows *sql.Rows
|
||||
var err error
|
||||
|
||||
if category != "" {
|
||||
rows, err = db.Query(`
|
||||
SELECT * FROM documents
|
||||
WHERE category = ?
|
||||
ORDER BY COALESCE(date, created_at) DESC
|
||||
LIMIT ? OFFSET ?
|
||||
`, category, limit, offset)
|
||||
} else {
|
||||
rows, err = db.Query(`
|
||||
SELECT * FROM documents
|
||||
ORDER BY COALESCE(date, created_at) DESC
|
||||
LIMIT ? OFFSET ?
|
||||
`, limit, offset)
|
||||
}
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
defer rows.Close()
|
||||
|
||||
return scanDocuments(rows)
|
||||
}
|
||||
|
||||
func (db *DB) RecentDocuments(limit int) ([]*Document, error) {
|
||||
rows, err := db.Query(`
|
||||
SELECT * FROM documents
|
||||
ORDER BY created_at DESC
|
||||
LIMIT ?
|
||||
`, limit)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
defer rows.Close()
|
||||
|
||||
return scanDocuments(rows)
|
||||
}
|
||||
|
||||
func (db *DB) SearchFTS(query string, limit int) ([]*SearchResult, error) {
|
||||
rows, err := db.Query(`
|
||||
SELECT d.*,
|
||||
bm25(documents_fts) as score,
|
||||
snippet(documents_fts, 2, '<mark>', '</mark>', '...', 32) as snippet
|
||||
FROM documents_fts f
|
||||
JOIN documents d ON f.id = d.id
|
||||
WHERE documents_fts MATCH ?
|
||||
ORDER BY bm25(documents_fts)
|
||||
LIMIT ?
|
||||
`, query, limit)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
defer rows.Close()
|
||||
|
||||
var results []*SearchResult
|
||||
for rows.Next() {
|
||||
var doc Document
|
||||
var dateStr, processedStr, createdStr, updatedStr sql.NullString
|
||||
var score float64
|
||||
var snippet string
|
||||
|
||||
err := rows.Scan(
|
||||
&doc.ID, &doc.Filename, &doc.OriginalName, &doc.Category, &doc.Subcategory,
|
||||
&doc.Title, &dateStr, &doc.Vendor, &doc.Amount, &doc.Currency, &doc.TaxDeductible,
|
||||
&doc.OCRText, &doc.Metadata, &doc.Embedding, &doc.StoragePath, &doc.MarkdownPath,
|
||||
&doc.PageCount, &doc.FileSize, &doc.MimeType, &doc.Checksum,
|
||||
&processedStr, &createdStr, &updatedStr, &score, &snippet,
|
||||
)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
if dateStr.Valid {
|
||||
t, _ := time.Parse("2006-01-02", dateStr.String)
|
||||
doc.Date = &t
|
||||
}
|
||||
if createdStr.Valid {
|
||||
t, _ := time.Parse(time.RFC3339, createdStr.String)
|
||||
doc.CreatedAt = t
|
||||
}
|
||||
|
||||
results = append(results, &SearchResult{Document: doc, Score: score, Snippet: snippet})
|
||||
}
|
||||
|
||||
return results, nil
|
||||
}
|
||||
|
||||
func (db *DB) GetStats() (map[string]interface{}, error) {
|
||||
stats := make(map[string]interface{})
|
||||
|
||||
// Total documents
|
||||
var total int
|
||||
db.QueryRow(`SELECT COUNT(*) FROM documents`).Scan(&total)
|
||||
stats["total"] = total
|
||||
|
||||
// This month
|
||||
var thisMonth int
|
||||
db.QueryRow(`SELECT COUNT(*) FROM documents WHERE created_at >= date('now', 'start of month')`).Scan(&thisMonth)
|
||||
stats["this_month"] = thisMonth
|
||||
|
||||
// Total size
|
||||
var totalSize int64
|
||||
db.QueryRow(`SELECT COALESCE(SUM(file_size), 0) FROM documents`).Scan(&totalSize)
|
||||
stats["total_size"] = totalSize
|
||||
|
||||
// By category
|
||||
rows, err := db.Query(`SELECT category, COUNT(*) FROM documents GROUP BY category`)
|
||||
if err == nil {
|
||||
categories := make(map[string]int)
|
||||
for rows.Next() {
|
||||
var cat string
|
||||
var count int
|
||||
rows.Scan(&cat, &count)
|
||||
categories[cat] = count
|
||||
}
|
||||
rows.Close()
|
||||
stats["by_category"] = categories
|
||||
}
|
||||
|
||||
return stats, nil
|
||||
}
|
||||
|
||||
func (db *DB) GetExpenses(year int, month int) ([]*Document, error) {
|
||||
query := `
|
||||
SELECT * FROM documents
|
||||
WHERE category = 'expenses' AND amount IS NOT NULL
|
||||
`
|
||||
args := []interface{}{}
|
||||
|
||||
if year > 0 {
|
||||
query += ` AND strftime('%Y', date) = ?`
|
||||
args = append(args, fmt.Sprintf("%04d", year))
|
||||
}
|
||||
if month > 0 {
|
||||
query += ` AND strftime('%m', date) = ?`
|
||||
args = append(args, fmt.Sprintf("%02d", month))
|
||||
}
|
||||
|
||||
query += ` ORDER BY date DESC`
|
||||
|
||||
rows, err := db.Query(query, args...)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
defer rows.Close()
|
||||
|
||||
return scanDocuments(rows)
|
||||
}
|
||||
|
||||
// scanDocument maps a single row (SELECT * column order of the documents
// table) into a Document. Text timestamp columns are parsed here: date as
// YYYY-MM-DD, the rest as RFC3339; parse failures silently leave zero
// values.
// NOTE(review): nullable TEXT columns (subcategory, vendor, mime_type,
// checksum, ...) are scanned into plain strings — a NULL in any of them
// makes Scan return an error. InsertDocument always binds non-nil values
// for these, but rows written by other paths would break; confirm, or
// switch the fields to sql.NullString.
func scanDocument(row *sql.Row) (*Document, error) {
	var doc Document
	var dateStr, processedStr, createdStr, updatedStr sql.NullString

	// Positional scan: must stay in sync with the documents table schema
	// in Init and with InsertDocument's column list.
	err := row.Scan(
		&doc.ID, &doc.Filename, &doc.OriginalName, &doc.Category, &doc.Subcategory,
		&doc.Title, &dateStr, &doc.Vendor, &doc.Amount, &doc.Currency, &doc.TaxDeductible,
		&doc.OCRText, &doc.Metadata, &doc.Embedding, &doc.StoragePath, &doc.MarkdownPath,
		&doc.PageCount, &doc.FileSize, &doc.MimeType, &doc.Checksum,
		&processedStr, &createdStr, &updatedStr,
	)
	if err != nil {
		return nil, err
	}

	if dateStr.Valid {
		t, _ := time.Parse("2006-01-02", dateStr.String)
		doc.Date = &t
	}
	if processedStr.Valid {
		t, _ := time.Parse(time.RFC3339, processedStr.String)
		doc.ProcessedAt = t
	}
	if createdStr.Valid {
		t, _ := time.Parse(time.RFC3339, createdStr.String)
		doc.CreatedAt = t
	}
	if updatedStr.Valid {
		t, _ := time.Parse(time.RFC3339, updatedStr.String)
		doc.UpdatedAt = t
	}

	return &doc, nil
}
|
||||
|
||||
func scanDocuments(rows *sql.Rows) ([]*Document, error) {
|
||||
var docs []*Document
|
||||
for rows.Next() {
|
||||
var doc Document
|
||||
var dateStr, processedStr, createdStr, updatedStr sql.NullString
|
||||
|
||||
err := rows.Scan(
|
||||
&doc.ID, &doc.Filename, &doc.OriginalName, &doc.Category, &doc.Subcategory,
|
||||
&doc.Title, &dateStr, &doc.Vendor, &doc.Amount, &doc.Currency, &doc.TaxDeductible,
|
||||
&doc.OCRText, &doc.Metadata, &doc.Embedding, &doc.StoragePath, &doc.MarkdownPath,
|
||||
&doc.PageCount, &doc.FileSize, &doc.MimeType, &doc.Checksum,
|
||||
&processedStr, &createdStr, &updatedStr,
|
||||
)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
if dateStr.Valid {
|
||||
t, _ := time.Parse("2006-01-02", dateStr.String)
|
||||
doc.Date = &t
|
||||
}
|
||||
if processedStr.Valid {
|
||||
t, _ := time.Parse(time.RFC3339, processedStr.String)
|
||||
doc.ProcessedAt = t
|
||||
}
|
||||
if createdStr.Valid {
|
||||
t, _ := time.Parse(time.RFC3339, createdStr.String)
|
||||
doc.CreatedAt = t
|
||||
}
|
||||
if updatedStr.Valid {
|
||||
t, _ := time.Parse(time.RFC3339, updatedStr.String)
|
||||
doc.UpdatedAt = t
|
||||
}
|
||||
|
||||
docs = append(docs, &doc)
|
||||
}
|
||||
return docs, nil
|
||||
}
|
||||
File diff suppressed because it is too large
Load Diff
|
|
@ -0,0 +1,700 @@
|
|||
package processor
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"context"
|
||||
"crypto/sha256"
|
||||
"encoding/hex"
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
"log"
|
||||
"os"
|
||||
"os/exec"
|
||||
"path/filepath"
|
||||
"regexp"
|
||||
"strings"
|
||||
"time"
|
||||
|
||||
"docman/internal/db"
|
||||
|
||||
"github.com/fsnotify/fsnotify"
|
||||
"github.com/google/uuid"
|
||||
openai "github.com/sashabaranov/go-openai"
|
||||
)
|
||||
|
||||
// Processor turns files dropped into the inbox into indexed documents:
// Watch picks them up, ProcessFile runs OCR and AI classification, and
// results are written to the database. The store and records directories
// are created by Watch alongside the inbox.
type Processor struct {
	db         *db.DB         // document index
	inboxDir   string         // watched drop directory
	storeDir   string         // storage directory (created by Watch)
	recordsDir string         // records directory (created by Watch)
	aiClient   *openai.Client // OpenAI-compatible client (Fireworks endpoint)
	aiModel    string         // model used for classification
	embedModel string         // model used for embeddings
}
|
||||
|
||||
// Config carries the directory layout and AI settings needed to build a
// Processor via New.
type Config struct {
	InboxDir   string // directory watched for incoming scans
	StoreDir   string // directory for stored originals
	RecordsDir string // directory for per-document records
	AIEndpoint string // Fireworks API endpoint (OpenAI-compatible base URL)
	AIKey      string // API key for the AI endpoint
	AIModel    string // e.g., "accounts/fireworks/models/qwen2-vl-72b-instruct"
	EmbedModel string // embedding model name
}
|
||||
|
||||
// Classification is the structured result returned by the AI classifier
// for one document; the JSON tags define the contract with the model's
// response format.
type Classification struct {
	Category      string            `json:"category"`              // primary category (taxes, expenses, ...)
	Subcategory   string            `json:"subcategory,omitempty"`
	Title         string            `json:"title"`                 // human-readable document title
	Date          string            `json:"date,omitempty"`        // document date as a string from the model
	Vendor        string            `json:"vendor,omitempty"`
	Amount        *float64          `json:"amount,omitempty"`      // monetary amount, nil when absent
	Currency      string            `json:"currency,omitempty"`
	TaxDeductible bool              `json:"tax_deductible"`
	Summary       string            `json:"summary"`               // short summary of the document
	KeyFields     map[string]string `json:"key_fields,omitempty"`  // extra extracted key/value pairs
}
|
||||
|
||||
func New(cfg Config, database *db.DB) *Processor {
|
||||
config := openai.DefaultConfig(cfg.AIKey)
|
||||
config.BaseURL = cfg.AIEndpoint
|
||||
|
||||
return &Processor{
|
||||
db: database,
|
||||
inboxDir: cfg.InboxDir,
|
||||
storeDir: cfg.StoreDir,
|
||||
recordsDir: cfg.RecordsDir,
|
||||
aiClient: openai.NewClientWithConfig(config),
|
||||
aiModel: cfg.AIModel,
|
||||
embedModel: cfg.EmbedModel,
|
||||
}
|
||||
}
|
||||
|
||||
func (p *Processor) Watch(ctx context.Context) error {
|
||||
// Ensure directories exist
|
||||
for _, dir := range []string{p.inboxDir, p.storeDir, p.recordsDir} {
|
||||
if err := os.MkdirAll(dir, 0755); err != nil {
|
||||
return fmt.Errorf("create directory %s: %w", dir, err)
|
||||
}
|
||||
}
|
||||
|
||||
// Process existing files first
|
||||
entries, _ := os.ReadDir(p.inboxDir)
|
||||
for _, entry := range entries {
|
||||
if entry.IsDir() || strings.HasPrefix(entry.Name(), ".") {
|
||||
continue
|
||||
}
|
||||
path := filepath.Join(p.inboxDir, entry.Name())
|
||||
if err := p.ProcessFile(ctx, path); err != nil {
|
||||
log.Printf("Error processing %s: %v", path, err)
|
||||
}
|
||||
}
|
||||
|
||||
// Watch for new files
|
||||
watcher, err := fsnotify.NewWatcher()
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
defer watcher.Close()
|
||||
|
||||
if err := watcher.Add(p.inboxDir); err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
log.Printf("Watching inbox: %s", p.inboxDir)
|
||||
|
||||
for {
|
||||
select {
|
||||
case <-ctx.Done():
|
||||
return ctx.Err()
|
||||
case event, ok := <-watcher.Events:
|
||||
if !ok {
|
||||
return nil
|
||||
}
|
||||
if event.Op&fsnotify.Create == fsnotify.Create {
|
||||
// Wait a moment for file to be fully written
|
||||
time.Sleep(500 * time.Millisecond)
|
||||
if err := p.ProcessFile(ctx, event.Name); err != nil {
|
||||
log.Printf("Error processing %s: %v", event.Name, err)
|
||||
}
|
||||
}
|
||||
case err, ok := <-watcher.Errors:
|
||||
if !ok {
|
||||
return nil
|
||||
}
|
||||
log.Printf("Watcher error: %v", err)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// ProcessFile runs the full ingestion pipeline for one inbox file:
// OCR -> AI classification -> content-addressed storage -> markdown record
// -> database row, then removes the original from the inbox. OCR and
// classification failures are non-fatal (the document is still stored, as
// "uncategorized" if needed); storage and database failures abort and
// leave the inbox file in place for a later retry.
func (p *Processor) ProcessFile(ctx context.Context, path string) error {
	// Skip hidden files and non-PDFs/images
	base := filepath.Base(path)
	if strings.HasPrefix(base, ".") {
		return nil
	}

	ext := strings.ToLower(filepath.Ext(path))
	if ext != ".pdf" && ext != ".jpg" && ext != ".jpeg" && ext != ".png" {
		log.Printf("Skipping non-document file: %s", path)
		return nil
	}

	log.Printf("Processing: %s", path)

	// Read file
	data, err := os.ReadFile(path)
	if err != nil {
		return err
	}

	// Compute checksum (SHA-256 of the raw bytes; its prefix also names the stored copy)
	hash := sha256.Sum256(data)
	checksum := hex.EncodeToString(hash[:])

	// Generate ID
	id := uuid.New().String()

	// Extract text via OCR; failure is tolerated and leaves the text empty.
	ocrText, pageCount, err := p.extractText(path, ext)
	if err != nil {
		log.Printf("OCR failed for %s: %v", path, err)
		ocrText = ""
	}

	// Classify with AI; on failure fall back to an "uncategorized" stub so
	// the document is still stored and findable by its original filename.
	classification, err := p.classify(ctx, ocrText, base)
	if err != nil {
		log.Printf("Classification failed for %s: %v", path, err)
		classification = &Classification{
			Category: "uncategorized",
			Title:    base,
		}
	}

	// Store PDF under a content-derived name: first 16 hex chars of the
	// checksum plus the original extension.
	storageName := fmt.Sprintf("%s%s", checksum[:16], ext)
	storagePath := filepath.Join(p.storeDir, storageName)
	if err := os.WriteFile(storagePath, data, 0644); err != nil {
		return err
	}

	// Parse the date string the classifier extracted, if any; an unparsable
	// date simply leaves the document undated.
	var docDate *time.Time
	if classification.Date != "" {
		if t, err := parseDate(classification.Date); err == nil {
			docDate = &t
		}
	}

	// Create document record
	doc := &db.Document{
		ID:            id,
		Filename:      storageName,
		OriginalName:  base,
		Category:      classification.Category,
		Subcategory:   classification.Subcategory,
		Title:         classification.Title,
		Date:          docDate,
		Vendor:        classification.Vendor,
		Amount:        classification.Amount,
		Currency:      classification.Currency,
		TaxDeductible: classification.TaxDeductible,
		OCRText:       ocrText,
		StoragePath:   storagePath,
		PageCount:     pageCount,
		FileSize:      int64(len(data)),
		MimeType:      getMimeType(ext),
		Checksum:      checksum,
		ProcessedAt:   time.Now(),
		CreatedAt:     time.Now(),
		UpdatedAt:     time.Now(),
	}

	// Generate markdown record (best-effort; MarkdownPath stays empty on failure)
	mdPath, err := p.writeMarkdown(doc, classification)
	if err != nil {
		log.Printf("Failed to write markdown: %v", err)
	} else {
		doc.MarkdownPath = mdPath
	}

	// Generate embedding (best-effort; errors leave doc.Embedding nil)
	if embedding, err := p.generateEmbedding(ctx, ocrText); err == nil {
		doc.Embedding = embedding
	}

	// Store metadata as JSON
	if meta, err := json.Marshal(classification.KeyFields); err == nil {
		doc.Metadata = meta
	}

	// Insert into database
	if err := p.db.InsertDocument(doc); err != nil {
		return err
	}

	// Remove from inbox; a failure here only logs, since the document is
	// already safely stored and recorded.
	if err := os.Remove(path); err != nil {
		log.Printf("Failed to remove inbox file: %v", err)
	}

	log.Printf("Processed: %s -> %s (%s)", base, classification.Title, classification.Category)
	return nil
}
|
||||
|
||||
func (p *Processor) extractText(path, ext string) (string, int, error) {
|
||||
if ext == ".pdf" {
|
||||
return p.extractPDFText(path)
|
||||
}
|
||||
return p.extractImageText(path)
|
||||
}
|
||||
|
||||
// extractPDFText extracts text from a PDF, returning (text, pageCount, error).
// It first tries the embedded text layer via pdftotext (poppler-utils); if
// that yields 100 bytes or less — likely a scanned, image-only PDF — it falls
// back to rasterizing every page with pdftoppm at 300 DPI and running
// tesseract OCR on each page image.
func (p *Processor) extractPDFText(path string) (string, int, error) {
	// Try pdftotext first (poppler-utils)
	cmd := exec.Command("pdftotext", "-layout", path, "-")
	output, err := cmd.Output()
	if err == nil && len(output) > 100 {
		// Count pages by parsing pdfinfo's "Pages: N" line; default to 1
		// if the tool fails or the line is missing.
		pageCmd := exec.Command("pdfinfo", path)
		pageOut, _ := pageCmd.Output()
		pages := 1
		if match := regexp.MustCompile(`Pages:\s+(\d+)`).FindSubmatch(pageOut); len(match) > 1 {
			fmt.Sscanf(string(match[1]), "%d", &pages)
		}
		return string(output), pages, nil
	}

	// Fallback to OCR via tesseract
	// Convert PDF to images first
	tmpDir, err := os.MkdirTemp("", "docman-ocr-")
	if err != nil {
		return "", 0, err
	}
	defer os.RemoveAll(tmpDir)

	// Use pdftoppm to convert to images named page-*.png in the temp dir
	cmd = exec.Command("pdftoppm", "-png", "-r", "300", path, filepath.Join(tmpDir, "page"))
	if err := cmd.Run(); err != nil {
		return "", 0, fmt.Errorf("pdftoppm failed: %w", err)
	}

	// OCR each page; per-page OCR errors are ignored (their text is empty),
	// and pages are joined with an explicit page-break marker.
	var textBuf bytes.Buffer
	pages, _ := filepath.Glob(filepath.Join(tmpDir, "page-*.png"))
	for _, pagePath := range pages {
		text, _, _ := p.extractImageText(pagePath)
		textBuf.WriteString(text)
		textBuf.WriteString("\n\n--- Page Break ---\n\n")
	}

	return textBuf.String(), len(pages), nil
}
|
||||
|
||||
func (p *Processor) extractImageText(path string) (string, int, error) {
|
||||
cmd := exec.Command("tesseract", path, "stdout", "-l", "eng+nld")
|
||||
output, err := cmd.Output()
|
||||
if err != nil {
|
||||
return "", 1, err
|
||||
}
|
||||
return string(output), 1, nil
|
||||
}
|
||||
|
||||
// classify asks the AI model to categorize the document and extract
// structured fields from its OCR text, returning the decoded
// Classification. The OCR text is truncated to 4000 bytes to bound the
// prompt size, and the model reply is trimmed to its outermost JSON object
// before decoding.
func (p *Processor) classify(ctx context.Context, text, filename string) (*Classification, error) {
	prompt := fmt.Sprintf(`Analyze this scanned document and extract structured information.

Document filename: %s

OCR Text:
%s

Classify and extract the following JSON structure:
{
  "category": "taxes|expenses|bills|medical|contacts|legal|insurance|banking|receipts|correspondence|uncategorized",
  "subcategory": "more specific category if applicable",
  "title": "descriptive title for this document",
  "date": "YYYY-MM-DD if found",
  "vendor": "company/person name if applicable",
  "amount": numeric amount if this is a financial document,
  "currency": "USD" or other currency code,
  "tax_deductible": true/false if this is a deductible expense,
  "summary": "one paragraph summary of the document",
  "key_fields": {"field_name": "value"} for any other important extracted data
}

Categories:
- taxes: W-2, 1099, tax returns, deductions
- expenses: receipts, invoices for purchases
- bills: utility bills, service bills
- medical: medical records, prescriptions, EOBs
- contacts: business cards, contact info
- legal: contracts, agreements, legal documents
- insurance: policies, claims
- banking: statements, checks
- receipts: general purchase receipts
- correspondence: letters, emails

Return ONLY valid JSON.`, filename, truncate(text, 4000))

	resp, err := p.aiClient.CreateChatCompletion(ctx, openai.ChatCompletionRequest{
		Model: p.aiModel,
		Messages: []openai.ChatCompletionMessage{
			{Role: openai.ChatMessageRoleUser, Content: prompt},
		},
		// Low temperature keeps the extraction near-deterministic.
		Temperature: 0.1,
	})
	if err != nil {
		return nil, err
	}

	if len(resp.Choices) == 0 {
		return nil, fmt.Errorf("no response from AI")
	}

	content := resp.Choices[0].Message.Content
	// Extract JSON from response (the model may wrap it in prose or fences)
	content = extractJSON(content)

	var classification Classification
	if err := json.Unmarshal([]byte(content), &classification); err != nil {
		return nil, fmt.Errorf("parse classification: %w", err)
	}

	return &classification, nil
}
|
||||
|
||||
func (p *Processor) generateEmbedding(ctx context.Context, text string) ([]byte, error) {
|
||||
if p.embedModel == "" {
|
||||
return nil, nil
|
||||
}
|
||||
|
||||
resp, err := p.aiClient.CreateEmbeddings(ctx, openai.EmbeddingRequest{
|
||||
Model: openai.EmbeddingModel(p.embedModel),
|
||||
Input: []string{truncate(text, 8000)},
|
||||
})
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
if len(resp.Data) == 0 {
|
||||
return nil, fmt.Errorf("no embedding returned")
|
||||
}
|
||||
|
||||
// Serialize embedding to bytes
|
||||
embData, err := json.Marshal(resp.Data[0].Embedding)
|
||||
return embData, err
|
||||
}
|
||||
|
||||
func (p *Processor) writeMarkdown(doc *db.Document, class *Classification) (string, error) {
|
||||
// Create category subdirectory
|
||||
catDir := filepath.Join(p.recordsDir, doc.Category)
|
||||
if err := os.MkdirAll(catDir, 0755); err != nil {
|
||||
return "", err
|
||||
}
|
||||
|
||||
// Generate filename
|
||||
dateStr := "undated"
|
||||
if doc.Date != nil {
|
||||
dateStr = doc.Date.Format("2006-01-02")
|
||||
}
|
||||
safeName := sanitizeFilename(doc.Title)
|
||||
mdName := fmt.Sprintf("%s_%s.md", dateStr, safeName)
|
||||
mdPath := filepath.Join(catDir, mdName)
|
||||
|
||||
// Build markdown content
|
||||
var buf bytes.Buffer
|
||||
buf.WriteString(fmt.Sprintf("# %s\n\n", doc.Title))
|
||||
buf.WriteString("## Metadata\n\n")
|
||||
buf.WriteString(fmt.Sprintf("- **ID:** %s\n", doc.ID))
|
||||
buf.WriteString(fmt.Sprintf("- **Category:** %s", doc.Category))
|
||||
if doc.Subcategory != "" {
|
||||
buf.WriteString(fmt.Sprintf(" / %s", doc.Subcategory))
|
||||
}
|
||||
buf.WriteString("\n")
|
||||
if doc.Date != nil {
|
||||
buf.WriteString(fmt.Sprintf("- **Date:** %s\n", doc.Date.Format("2006-01-02")))
|
||||
}
|
||||
if doc.Vendor != "" {
|
||||
buf.WriteString(fmt.Sprintf("- **Vendor:** %s\n", doc.Vendor))
|
||||
}
|
||||
if doc.Amount != nil {
|
||||
buf.WriteString(fmt.Sprintf("- **Amount:** %s %.2f\n", doc.Currency, *doc.Amount))
|
||||
}
|
||||
if doc.TaxDeductible {
|
||||
buf.WriteString("- **Tax Deductible:** Yes\n")
|
||||
}
|
||||
buf.WriteString(fmt.Sprintf("- **Original File:** %s\n", doc.OriginalName))
|
||||
buf.WriteString(fmt.Sprintf("- **PDF:** [View](%s)\n", doc.StoragePath))
|
||||
buf.WriteString(fmt.Sprintf("- **Processed:** %s\n", doc.ProcessedAt.Format(time.RFC3339)))
|
||||
|
||||
if class.Summary != "" {
|
||||
buf.WriteString("\n## Summary\n\n")
|
||||
buf.WriteString(class.Summary)
|
||||
buf.WriteString("\n")
|
||||
}
|
||||
|
||||
if len(class.KeyFields) > 0 {
|
||||
buf.WriteString("\n## Key Fields\n\n")
|
||||
for k, v := range class.KeyFields {
|
||||
buf.WriteString(fmt.Sprintf("- **%s:** %s\n", k, v))
|
||||
}
|
||||
}
|
||||
|
||||
buf.WriteString("\n## Full Text (OCR)\n\n```\n")
|
||||
buf.WriteString(doc.OCRText)
|
||||
buf.WriteString("\n```\n")
|
||||
|
||||
if err := os.WriteFile(mdPath, buf.Bytes(), 0644); err != nil {
|
||||
return "", err
|
||||
}
|
||||
|
||||
return mdPath, nil
|
||||
}
|
||||
|
||||
// Helper functions
|
||||
|
||||
// truncate returns s limited to at most max bytes. Because s is UTF-8,
// a plain byte slice could cut a multi-byte rune in half and produce
// invalid text (which is then sent to the AI APIs); instead the cut
// point backs up to the nearest rune boundary.
func truncate(s string, max int) string {
	if len(s) <= max {
		return s
	}
	cut := max
	// 0b10xxxxxx bytes are UTF-8 continuation bytes; step back past them
	// so the cut lands on the first byte of a rune.
	for cut > 0 && s[cut]&0xC0 == 0x80 {
		cut--
	}
	return s[:cut]
}
|
||||
|
||||
// extractJSON trims s down to its outermost JSON object — everything from
// the first '{' through the last '}'. If no such span exists, s is
// returned unchanged.
func extractJSON(s string) string {
	first := strings.Index(s, "{")
	last := strings.LastIndex(s, "}")
	if first < 0 || last <= first {
		return s
	}
	return s[first : last+1]
}
|
||||
|
||||
// parseDate attempts to parse s against a list of common date layouts
// (ISO, US slash forms, long/short month names), returning the first
// successful parse. Layout order matters for ambiguous inputs: the US
// month/day interpretation wins for slash dates.
func parseDate(s string) (time.Time, error) {
	layouts := [...]string{
		"2006-01-02",
		"01/02/2006",
		"1/2/2006",
		"January 2, 2006",
		"Jan 2, 2006",
		"2006/01/02",
	}
	for _, layout := range layouts {
		t, err := time.Parse(layout, s)
		if err == nil {
			return t, nil
		}
	}
	return time.Time{}, fmt.Errorf("cannot parse date: %s", s)
}
|
||||
|
||||
// getMimeType maps a lowercase file extension (including the dot) to its
// MIME type; anything unrecognized falls back to the generic binary type.
func getMimeType(ext string) string {
	switch ext {
	case ".png":
		return "image/png"
	case ".jpg", ".jpeg":
		return "image/jpeg"
	case ".pdf":
		return "application/pdf"
	}
	return "application/octet-stream"
}
|
||||
|
||||
// sanitizeRe matches runs of characters that are not lowercase ASCII
// letters or digits; compiled once instead of on every call.
var sanitizeRe = regexp.MustCompile(`[^a-z0-9]+`)

// sanitizeFilename converts a document title into a filesystem-safe slug:
// lowercase, non-alphanumeric runs collapsed to single hyphens, capped at
// 50 bytes (safe because only ASCII remains after the replace), and with
// no leading or trailing hyphen — including after the length cut, which
// previously could leave a dangling "-".
func sanitizeFilename(s string) string {
	s = strings.ToLower(s)
	s = sanitizeRe.ReplaceAllString(s, "-")
	s = strings.Trim(s, "-")
	if len(s) > 50 {
		s = strings.TrimRight(s[:50], "-")
	}
	return s
}
|
||||
|
||||
// ProcessSingle processes a single in-memory upload (for API uploads): it
// runs the same OCR -> classify -> store -> markdown -> database pipeline
// as ProcessFile, but takes raw bytes instead of an inbox path and returns
// the created document record. OCR and classification failures are
// tolerated; storage and database failures abort with an error.
func (p *Processor) ProcessSingle(ctx context.Context, data []byte, filename string) (*db.Document, error) {
	// Write to temp file for processing (the external OCR tools need a path)
	ext := strings.ToLower(filepath.Ext(filename))
	tmpFile, err := os.CreateTemp("", "docman-upload-*"+ext)
	if err != nil {
		return nil, err
	}
	defer os.Remove(tmpFile.Name())

	if _, err := tmpFile.Write(data); err != nil {
		return nil, err
	}
	tmpFile.Close()

	// Use existing process logic but don't delete the temp file
	// Compute checksum (its prefix also names the stored copy)
	hash := sha256.Sum256(data)
	checksum := hex.EncodeToString(hash[:])
	id := uuid.New().String()

	// OCR failure is tolerated; the document is stored with empty text.
	ocrText, pageCount, err := p.extractText(tmpFile.Name(), ext)
	if err != nil {
		ocrText = ""
	}

	// Classification failure falls back to an "uncategorized" stub.
	classification, err := p.classify(ctx, ocrText, filename)
	if err != nil {
		classification = &Classification{
			Category: "uncategorized",
			Title:    filename,
		}
	}

	// Store the original bytes under a content-derived name.
	storageName := fmt.Sprintf("%s%s", checksum[:16], ext)
	storagePath := filepath.Join(p.storeDir, storageName)
	if err := os.WriteFile(storagePath, data, 0644); err != nil {
		return nil, err
	}

	// Parse the classifier-extracted date, if any.
	var docDate *time.Time
	if classification.Date != "" {
		if t, err := parseDate(classification.Date); err == nil {
			docDate = &t
		}
	}

	doc := &db.Document{
		ID:            id,
		Filename:      storageName,
		OriginalName:  filename,
		Category:      classification.Category,
		Subcategory:   classification.Subcategory,
		Title:         classification.Title,
		Date:          docDate,
		Vendor:        classification.Vendor,
		Amount:        classification.Amount,
		Currency:      classification.Currency,
		TaxDeductible: classification.TaxDeductible,
		OCRText:       ocrText,
		StoragePath:   storagePath,
		PageCount:     pageCount,
		FileSize:      int64(len(data)),
		MimeType:      getMimeType(ext),
		Checksum:      checksum,
		ProcessedAt:   time.Now(),
		CreatedAt:     time.Now(),
		UpdatedAt:     time.Now(),
	}

	// Markdown record is best-effort; MarkdownPath is empty on failure.
	mdPath, _ := p.writeMarkdown(doc, classification)
	doc.MarkdownPath = mdPath

	// Embedding and metadata are also best-effort.
	if embedding, err := p.generateEmbedding(ctx, ocrText); err == nil {
		doc.Embedding = embedding
	}

	if meta, err := json.Marshal(classification.KeyFields); err == nil {
		doc.Metadata = meta
	}

	if err := p.db.InsertDocument(doc); err != nil {
		return nil, err
	}

	return doc, nil
}
|
||||
|
||||
// SearchMarkdown searches markdown files directly (fallback when embeddings unavailable)
|
||||
func SearchMarkdown(recordsDir, query string, limit int) ([]*db.SearchResult, error) {
|
||||
var results []*db.SearchResult
|
||||
query = strings.ToLower(query)
|
||||
terms := strings.Fields(query)
|
||||
|
||||
err := filepath.Walk(recordsDir, func(path string, info os.FileInfo, err error) error {
|
||||
if err != nil || info.IsDir() || !strings.HasSuffix(path, ".md") {
|
||||
return nil
|
||||
}
|
||||
|
||||
data, err := os.ReadFile(path)
|
||||
if err != nil {
|
||||
return nil
|
||||
}
|
||||
|
||||
content := strings.ToLower(string(data))
|
||||
score := 0.0
|
||||
for _, term := range terms {
|
||||
if strings.Contains(content, term) {
|
||||
score += 1.0
|
||||
}
|
||||
}
|
||||
|
||||
if score > 0 {
|
||||
// Extract title from first line
|
||||
lines := strings.Split(string(data), "\n")
|
||||
title := strings.TrimPrefix(lines[0], "# ")
|
||||
|
||||
// Find snippet around first match
|
||||
snippet := findSnippet(string(data), terms[0], 100)
|
||||
|
||||
results = append(results, &db.SearchResult{
|
||||
Document: db.Document{
|
||||
Title: title,
|
||||
MarkdownPath: path,
|
||||
},
|
||||
Score: score / float64(len(terms)),
|
||||
Snippet: snippet,
|
||||
})
|
||||
}
|
||||
return nil
|
||||
})
|
||||
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
// Sort by score descending
|
||||
for i := 0; i < len(results)-1; i++ {
|
||||
for j := i + 1; j < len(results); j++ {
|
||||
if results[j].Score > results[i].Score {
|
||||
results[i], results[j] = results[j], results[i]
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if len(results) > limit {
|
||||
results = results[:limit]
|
||||
}
|
||||
|
||||
return results, nil
|
||||
}
|
||||
|
||||
// findSnippet returns a window of text around the first case-insensitive
// occurrence of term, padded by radius bytes on each side and marked with
// "..." wherever the window is clipped. When term is absent, the head of
// the text (up to radius*2 bytes) is returned instead.
func findSnippet(text, term string, radius int) string {
	idx := strings.Index(strings.ToLower(text), strings.ToLower(term))
	if idx < 0 {
		// No hit: fall back to the start of the document.
		if len(text) > radius*2 {
			return text[:radius*2] + "..."
		}
		return text
	}

	// Clamp the window to the text bounds.
	start, end := idx-radius, idx+len(term)+radius
	if start < 0 {
		start = 0
	}
	if end > len(text) {
		end = len(text)
	}

	snippet := text[start:end]
	if start > 0 {
		snippet = "..." + snippet
	}
	if end < len(text) {
		snippet += "..."
	}
	return snippet
}
|
||||
|
||||
// GetRecordsDir returns the directory where per-document markdown records
// are written.
func (p *Processor) GetRecordsDir() string {
	return p.recordsDir
}
|
||||
|
||||
// GetStoreDir returns the directory where original document files are
// stored.
func (p *Processor) GetStoreDir() string {
	return p.storeDir
}
|
||||
Loading…
Reference in New Issue