commit 880f9dab9d56cc1c25a0ab5bbef8c6b65a83d10f
Author: James
Date:   Wed Feb 4 13:35:03 2026 -0500

    Initial commit

diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..15cd156
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,5 @@
+# Binaries
+*.exe
+node_modules/
+.venv/
+__pycache__/
diff --git a/Makefile b/Makefile
new file mode 100644
index 0000000..6c2cc9e
--- /dev/null
+++ b/Makefile
@@ -0,0 +1,51 @@
+.PHONY: build run install install-service clean dev deps test lint fmt
+
+BINARY=docman
+VERSION=$(shell git describe --tags --always --dirty 2>/dev/null || echo "dev")
+BUILD_DIR=build
+
+# Build flags
+LDFLAGS=-ldflags "-s -w -X main.Version=$(VERSION)"
+CGO_CFLAGS=-DSQLITE_ENABLE_FTS5
+TAGS=-tags "fts5"
+
+build:
+	@mkdir -p $(BUILD_DIR)
+	CGO_CFLAGS="$(CGO_CFLAGS)" go build $(TAGS) $(LDFLAGS) -o $(BUILD_DIR)/$(BINARY) ./cmd/docman
+
+run: build
+	./$(BUILD_DIR)/$(BINARY)
+
+dev:
+	CGO_CFLAGS="$(CGO_CFLAGS)" go run $(TAGS) ./cmd/docman -port 8200
+
+deps:
+	go mod download
+	go mod tidy
+
+# Install to ~/bin
+install: build
+	@mkdir -p $(HOME)/bin
+	cp $(BUILD_DIR)/$(BINARY) $(HOME)/bin/
+	@echo "Installed to $(HOME)/bin/$(BINARY)"
+
+# Install systemd service
+install-service:
+	@mkdir -p $(HOME)/.config/systemd/user
+	@envsubst < deploy/docman.service > $(HOME)/.config/systemd/user/docman.service
+	systemctl --user daemon-reload
+	systemctl --user enable docman
+	@echo "Service installed. Start with: systemctl --user start docman"
+
+clean:
+	rm -rf $(BUILD_DIR)
+
+# Development helpers
+test:
+	CGO_CFLAGS="$(CGO_CFLAGS)" go test $(TAGS) -v ./...
+
+lint:
+	golangci-lint run
+
+fmt:
+	go fmt ./...
diff --git a/README.md b/README.md
new file mode 100644
index 0000000..918c0eb
--- /dev/null
+++ b/README.md
@@ -0,0 +1,116 @@
+# DocMan - Document Management System
+
+AI-powered document scanning, OCR, classification, and search.
+ +## Quick Start + +```bash +# Run directly (dev mode) +cd ~/dev/docman +make dev + +# Or run the installed binary +~/bin/docman -port 8200 +``` + +Open http://localhost:8200 + +## Features + +- **Auto-processing**: Drop PDFs/images into `~/documents/inbox/` → auto-classified and indexed +- **PDF Preview**: Built-in PDF viewer +- **Full-text Search**: FTS5 + semantic search +- **Categories**: taxes, expenses, bills, medical, contacts, legal, insurance, banking, receipts +- **Expense Tracking**: Filter by date, export to CSV, track tax-deductible items +- **Markdown Records**: Each document gets a searchable markdown file + +## Scanner Setup + +1. **Set scanner to save to SMB share:** + - Share: `\\192.168.1.16\documents\inbox` (or wherever james is) + - Or use your scanner's app to save to `~/documents/inbox/` + +2. **Workflow:** + - Scan document → lands in inbox + - DocMan auto-processes (OCR → classify → store) + - View/search in web UI + +## Directory Structure + +``` +~/documents/ +├── inbox/ # Drop scans here (auto-processed) +├── store/ # PDF storage (by checksum) +├── records/ # Markdown records by category +│ ├── taxes/ +│ ├── expenses/ +│ ├── bills/ +│ └── ... 
+└── index/ # SQLite database +``` + +## Configuration + +### Environment Variables + +```bash +FIREWORKS_API_KEY=fw_xxx # Required for AI classification +``` + +### Command Line Options + +``` +-port HTTP port (default: 8200) +-data Data directory (default: ~/documents) +-ai-endpoint AI API endpoint (default: Fireworks) +-ai-key AI API key +-ai-model Classification model +-embed-model Embedding model +-watch Only watch inbox, don't start web server +``` + +## Systemd Service + +```bash +# Edit to add your Fireworks API key +nano ~/.config/systemd/user/docman.service + +# Enable and start +systemctl --user daemon-reload +systemctl --user enable docman +systemctl --user start docman + +# Check status +systemctl --user status docman +journalctl --user -u docman -f +``` + +## API Endpoints + +| Method | Path | Description | +|--------|------|-------------| +| GET | `/` | Dashboard | +| GET | `/browse` | Browse documents | +| GET | `/doc/:id` | View document | +| GET | `/search` | Search page | +| GET | `/expenses` | Expenses tracker | +| GET | `/upload` | Upload page | +| POST | `/api/upload` | Upload document | +| GET | `/api/documents/:id` | Get document JSON | +| PATCH | `/api/documents/:id` | Update document | +| DELETE | `/api/documents/:id` | Delete document | +| GET | `/api/search?q=` | Search API | +| GET | `/api/expenses/export` | Export CSV | +| GET | `/api/stats` | Dashboard stats | + +## Dependencies + +System packages (already installed): +- `poppler-utils` (pdftotext, pdfinfo, pdftoppm) +- `tesseract-ocr` (OCR for scanned images) + +## Notes + +- Without FIREWORKS_API_KEY, documents will be categorized as "uncategorized" +- The inbox watcher runs continuously, processing new files automatically +- Markdown files are searchable even without embeddings diff --git a/build/docman b/build/docman new file mode 100755 index 0000000..2615008 Binary files /dev/null and b/build/docman differ diff --git a/cmd/docman/main.go b/cmd/docman/main.go new file mode 100644 
index 0000000..1092d89
--- /dev/null
+++ b/cmd/docman/main.go
@@ -0,0 +1,197 @@
+package main
+
+import (
+	"context"
+	"errors"
+	"flag"
+	"fmt"
+	"io"
+	"log"
+	"net/http"
+	"os"
+	"os/signal"
+	"path/filepath"
+	"syscall"
+	"time"
+
+	"docman/internal/db"
+	"docman/internal/handlers"
+	"docman/internal/processor"
+
+	"github.com/labstack/echo/v4"
+	"github.com/labstack/echo/v4/middleware"
+)
+
+// shutdownTimeout bounds how long in-flight HTTP requests may keep running
+// after a termination signal before the server stops waiting for them.
+const shutdownTimeout = 10 * time.Second
+
+// main parses the command-line flags, prepares the data directories and the
+// SQLite index, starts the inbox watcher and, unless -watch is given, serves
+// the web UI and JSON API until SIGINT or SIGTERM arrives.
+func main() {
+	var (
+		port       = flag.Int("port", 8200, "HTTP port")
+		dataDir    = flag.String("data", "", "Data directory (default: ~/documents)")
+		aiEndpoint = flag.String("ai-endpoint", "https://api.fireworks.ai/inference/v1", "AI API endpoint")
+		aiKey      = flag.String("ai-key", "", "AI API key (or FIREWORKS_API_KEY env)")
+		aiModel    = flag.String("ai-model", "accounts/fireworks/models/qwen2-vl-72b-instruct", "AI model for classification")
+		embedModel = flag.String("embed-model", "nomic-ai/nomic-embed-text-v1.5", "Embedding model")
+		watchOnly  = flag.Bool("watch", false, "Only watch inbox, don't start web server")
+	)
+	flag.Parse()
+
+	// Resolve data directory
+	if *dataDir == "" {
+		home, err := os.UserHomeDir()
+		if err != nil {
+			// Without a home directory the default would silently become the
+			// relative path "documents"; refuse to guess.
+			log.Fatalf("Failed to resolve home directory (use -data): %v", err)
+		}
+		*dataDir = filepath.Join(home, "documents")
+	}
+
+	// Resolve AI key
+	if *aiKey == "" {
+		*aiKey = os.Getenv("FIREWORKS_API_KEY")
+	}
+	if *aiKey == "" {
+		log.Println("Warning: No AI API key provided. Classification will fail.")
+	}
+
+	// Create directories
+	dirs := map[string]string{
+		"inbox":   filepath.Join(*dataDir, "inbox"),
+		"store":   filepath.Join(*dataDir, "store"),
+		"records": filepath.Join(*dataDir, "records"),
+		"index":   filepath.Join(*dataDir, "index"),
+	}
+	for _, dir := range dirs {
+		if err := os.MkdirAll(dir, 0755); err != nil {
+			log.Fatalf("Failed to create directory %s: %v", dir, err)
+		}
+	}
+
+	// Open database
+	dbPath := filepath.Join(dirs["index"], "docman.db")
+	database, err := db.Open(dbPath)
+	if err != nil {
+		log.Fatalf("Failed to open database: %v", err)
+	}
+	defer database.Close()
+
+	if err := database.Init(); err != nil {
+		log.Fatalf("Failed to initialize database: %v", err)
+	}
+
+	// Create processor
+	proc := processor.New(processor.Config{
+		InboxDir:   dirs["inbox"],
+		StoreDir:   dirs["store"],
+		RecordsDir: dirs["records"],
+		AIEndpoint: *aiEndpoint,
+		AIKey:      *aiKey,
+		AIModel:    *aiModel,
+		EmbedModel: *embedModel,
+	}, database)
+
+	// Context for graceful shutdown
+	ctx, cancel := context.WithCancel(context.Background())
+	defer cancel()
+
+	// Start inbox watcher
+	go func() {
+		if err := proc.Watch(ctx); err != nil && err != context.Canceled {
+			log.Printf("Watcher error: %v", err)
+		}
+	}()
+
+	if *watchOnly {
+		log.Printf("Watching inbox: %s", dirs["inbox"])
+		// Wait for signal
+		sigCh := make(chan os.Signal, 1)
+		signal.Notify(sigCh, syscall.SIGINT, syscall.SIGTERM)
+		<-sigCh
+		cancel()
+		return
+	}
+
+	// Create Echo server
+	e := echo.New()
+	e.HideBanner = true
+
+	// Middleware
+	e.Use(middleware.Logger())
+	e.Use(middleware.Recover())
+	e.Use(middleware.CORS())
+
+	// Template renderer
+	e.Renderer = &templateRenderer{}
+
+	// Handlers
+	h := handlers.New(database, proc, dirs["store"], dirs["records"])
+
+	// Page routes
+	e.GET("/", h.Dashboard)
+	e.GET("/browse", h.Browse)
+	e.GET("/doc/:id", h.Document)
+	e.GET("/search", h.Search)
+	e.GET("/expenses", h.Expenses)
+	e.GET("/upload", h.Upload)
+
+	// API routes
+	e.POST("/api/upload", h.APIUpload)
+	e.GET("/api/documents/:id", h.APIDocument)
+	e.PATCH("/api/documents/:id", h.APIUpdateDocument)
+	e.DELETE("/api/documents/:id", h.APIDeleteDocument)
+	e.GET("/api/search", h.APISearch)
+	e.GET("/api/expenses/export", h.APIExportExpenses)
+	e.GET("/api/stats", h.APIStats)
+
+	// Static file serving
+	e.GET("/pdf/:filename", h.ServePDF)
+	e.GET("/markdown", h.ServeMarkdown)
+
+	// Graceful shutdown: on SIGINT/SIGTERM stop the watcher, then let
+	// in-flight requests finish (bounded by shutdownTimeout).
+	shutdownDone := make(chan struct{})
+	go func() {
+		defer close(shutdownDone)
+		sigCh := make(chan os.Signal, 1)
+		signal.Notify(sigCh, syscall.SIGINT, syscall.SIGTERM)
+		<-sigCh
+		log.Println("Shutting down...")
+		cancel()
+		shutdownCtx, stop := context.WithTimeout(context.Background(), shutdownTimeout)
+		defer stop()
+		if err := e.Shutdown(shutdownCtx); err != nil {
+			log.Printf("Shutdown error: %v", err)
+		}
+	}()
+
+	// Start server
+	log.Printf("DocMan starting on http://localhost:%d", *port)
+	log.Printf("Data directory: %s", *dataDir)
+	log.Printf("Inbox: %s", dirs["inbox"])
+	err = e.Start(fmt.Sprintf(":%d", *port))
+	if errors.Is(err, http.ErrServerClosed) {
+		// Start returns as soon as Shutdown begins; wait for it to finish so
+		// the deferred database close does not race with draining requests.
+		<-shutdownDone
+		return
+	}
+	if err != nil {
+		log.Printf("Server stopped: %v", err)
+	}
+}
+
+// templateRenderer adapts the handlers package's template renderer to
+// echo.Renderer.
+type templateRenderer struct{}
+
+// Render builds a fresh renderer via handlers.NewTemplateRenderer on every
+// call and delegates to it.
+func (t *templateRenderer) Render(w io.Writer, name string, data interface{}, c echo.Context) error {
+	return handlers.NewTemplateRenderer().Render(w, name, data, c)
+}
diff --git a/deploy/docman.service b/deploy/docman.service
new file mode 100644
index 0000000..b0316bd
--- /dev/null
+++ b/deploy/docman.service
@@ -0,0 +1,15 @@
+[Unit]
+Description=DocMan - Document Management System
+After=network.target
+
+[Service]
+Type=simple
+ExecStart=${HOME}/bin/docman -port 8200
+Restart=on-failure
+RestartSec=5
+Environment=HOME=${HOME}
+Environment=FIREWORKS_API_KEY=${FIREWORKS_API_KEY}
+WorkingDirectory=${HOME}
+
+[Install]
+WantedBy=default.target
diff --git a/go.mod b/go.mod
new file mode 100644
index 0000000..f90275f
--- /dev/null
+++ b/go.mod
@@ -0,0 +1,25 @@
+module docman
+
+go 1.22
+
+require (
+	github.com/fsnotify/fsnotify v1.7.0
+	github.com/google/uuid v1.6.0
+	github.com/labstack/echo/v4 v4.12.0
+	github.com/mattn/go-sqlite3 v1.14.22
+	github.com/sashabaranov/go-openai v1.29.0
+)
+
+require (
+	
github.com/golang-jwt/jwt v3.2.2+incompatible // indirect + github.com/labstack/gommon v0.4.2 // indirect + github.com/mattn/go-colorable v0.1.13 // indirect + github.com/mattn/go-isatty v0.0.20 // indirect + github.com/valyala/bytebufferpool v1.0.0 // indirect + github.com/valyala/fasttemplate v1.2.2 // indirect + golang.org/x/crypto v0.26.0 // indirect + golang.org/x/net v0.28.0 // indirect + golang.org/x/sys v0.23.0 // indirect + golang.org/x/text v0.17.0 // indirect + golang.org/x/time v0.5.0 // indirect +) diff --git a/go.sum b/go.sum new file mode 100644 index 0000000..1a13149 --- /dev/null +++ b/go.sum @@ -0,0 +1,43 @@ +github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c= +github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= +github.com/fsnotify/fsnotify v1.7.0 h1:8JEhPFa5W2WU7YfeZzPNqzMP6Lwt7L2715Ggo0nosvA= +github.com/fsnotify/fsnotify v1.7.0/go.mod h1:40Bi/Hjc2AVfZrqy+aj+yEI+/bRxZnMJyTJwOpGvigM= +github.com/golang-jwt/jwt v3.2.2+incompatible h1:IfV12K8xAKAnZqdXVzCZ+TOjboZ2keLg81eXfW3O+oY= +github.com/golang-jwt/jwt v3.2.2+incompatible/go.mod h1:8pz2t5EyA70fFQQSrl6XZXzqecmYZeUEB8OUGHkxJ+I= +github.com/google/uuid v1.6.0 h1:NIvaJDMOsjHA8n1jAhLSgzrAzy1Hgr+hNrb57e+94F0= +github.com/google/uuid v1.6.0/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo= +github.com/labstack/echo/v4 v4.12.0 h1:IKpw49IMryVB2p1a4dzwlhP1O2Tf2E0Ir/450lH+kI0= +github.com/labstack/echo/v4 v4.12.0/go.mod h1:UP9Cr2DJXbOK3Kr9ONYzNowSh7HP0aG0ShAyycHSJvM= +github.com/labstack/gommon v0.4.2 h1:F8qTUNXgG1+6WQmqoUWnz8WiEU60mXVVw0P4ht1WRA0= +github.com/labstack/gommon v0.4.2/go.mod h1:QlUFxVM+SNXhDL/Z7YhocGIBYOiwB0mXm1+1bAPHPyU= +github.com/mattn/go-colorable v0.1.13 h1:fFA4WZxdEF4tXPZVKMLwD8oUnCTTo08duU7wxecdEvA= +github.com/mattn/go-colorable v0.1.13/go.mod h1:7S9/ev0klgBDR4GtXTXX8a3vIGJpMovkB8vQcUbaXHg= +github.com/mattn/go-isatty v0.0.16/go.mod h1:kYGgaQfpe5nmfYZH+SKPsOc2e4SrIfOl2e/yFXSvRLM= +github.com/mattn/go-isatty 
v0.0.20 h1:xfD0iDuEKnDkl03q4limB+vH+GxLEtL/jb4xVJSWWEY= +github.com/mattn/go-isatty v0.0.20/go.mod h1:W+V8PltTTMOvKvAeJH7IuucS94S2C6jfK/D7dTCTo3Y= +github.com/mattn/go-sqlite3 v1.14.22 h1:2gZY6PC6kBnID23Tichd1K+Z0oS6nE/XwU+Vz/5o4kU= +github.com/mattn/go-sqlite3 v1.14.22/go.mod h1:Uh1q+B4BYcTPb+yiD3kU8Ct7aC0hY9fxUwlHK0RXw+Y= +github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM= +github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= +github.com/sashabaranov/go-openai v1.29.0 h1:eBH6LSjtX4md5ImDCX8hNhHQvaRf22zujiERoQpsvLo= +github.com/sashabaranov/go-openai v1.29.0/go.mod h1:lj5b/K+zjTSFxVLijLSTDZuP7adOgerWeFyZLUhAKRg= +github.com/stretchr/testify v1.8.4 h1:CcVxjf3Q8PM0mHUKJCdn+eZZtm5yQwehR5yeSVQQcUk= +github.com/stretchr/testify v1.8.4/go.mod h1:sz/lmYIOXD/1dqDmKjjqLyZ2RngseejIcXlSw2iwfAo= +github.com/valyala/bytebufferpool v1.0.0 h1:GqA5TC/0021Y/b9FG4Oi9Mr3q7XYx6KllzawFIhcdPw= +github.com/valyala/bytebufferpool v1.0.0/go.mod h1:6bBcMArwyJ5K/AmCkWv1jt77kVWyCJ6HpOuEn7z0Csc= +github.com/valyala/fasttemplate v1.2.2 h1:lxLXG0uE3Qnshl9QyaK6XJxMXlQZELvChBOCmQD0Loo= +github.com/valyala/fasttemplate v1.2.2/go.mod h1:KHLXt3tVN2HBp8eijSv/kGJopbvo7S+qRAEEKiv+SiQ= +golang.org/x/crypto v0.26.0 h1:RrRspgV4mU+YwB4FYnuBoKsUapNIL5cohGAmSH3azsw= +golang.org/x/crypto v0.26.0/go.mod h1:GY7jblb9wI+FOo5y8/S2oY4zWP07AkOJ4+jxCqdqn54= +golang.org/x/net v0.28.0 h1:a9JDOJc5GMUJ0+UDqmLT86WiEy7iWyIhz8gz8E4e5hE= +golang.org/x/net v0.28.0/go.mod h1:yqtgsTWOOnlGLG9GFRrK3++bGOUEkNBoHZc8MEDWPNg= +golang.org/x/sys v0.0.0-20220811171246-fbc7d0a398ab/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= +golang.org/x/sys v0.6.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= +golang.org/x/sys v0.23.0 h1:YfKFowiIMvtgl1UERQoTPPToxltDeZfbj4H7dVUCwmM= +golang.org/x/sys v0.23.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA= +golang.org/x/text v0.17.0 h1:XtiM5bkSOt+ewxlOE/aE/AKEHibwj/6gvWMl9Rsh0Qc= +golang.org/x/text 
v0.17.0/go.mod h1:BuEKDfySbSR4drPmRPG/7iBdf8hvFMuRexcpahXilzY=
+golang.org/x/time v0.5.0 h1:o7cqy6amK/52YcAKIPlM3a+Fpj35zvRj2TP+e1xFSfk=
+golang.org/x/time v0.5.0/go.mod h1:3BpzKBy/shNhVucY/MWOyx10tF3SFh9QdLuxbVysPQM=
+gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA=
+gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM=
diff --git a/internal/db/db.go b/internal/db/db.go
new file mode 100644
index 0000000..7856282
--- /dev/null
+++ b/internal/db/db.go
@@ -0,0 +1,406 @@
+package db
+
+import (
+	"database/sql"
+	"encoding/json"
+	"fmt"
+	"time"
+
+	_ "github.com/mattn/go-sqlite3"
+)
+
+// DB wraps *sql.DB so the document queries can hang off it as methods.
+type DB struct {
+	*sql.DB
+}
+
+// Document mirrors one row of the documents table. Date and Amount are
+// pointers because the corresponding columns may be NULL.
+type Document struct {
+	ID            string          `json:"id"`
+	Filename      string          `json:"filename"`
+	OriginalName  string          `json:"original_name"`
+	Category      string          `json:"category"`
+	Subcategory   string          `json:"subcategory,omitempty"`
+	Title         string          `json:"title"`
+	Date          *time.Time      `json:"date,omitempty"`
+	Vendor        string          `json:"vendor,omitempty"`
+	Amount        *float64        `json:"amount,omitempty"`
+	Currency      string          `json:"currency,omitempty"`
+	TaxDeductible bool            `json:"tax_deductible"`
+	OCRText       string          `json:"ocr_text"`
+	Metadata      json.RawMessage `json:"metadata,omitempty"`
+	Embedding     []byte          `json:"embedding,omitempty"`
+	StoragePath   string          `json:"storage_path"`
+	MarkdownPath  string          `json:"markdown_path"`
+	PageCount     int             `json:"page_count"`
+	FileSize      int64           `json:"file_size"`
+	MimeType      string          `json:"mime_type"`
+	Checksum      string          `json:"checksum"`
+	ProcessedAt   time.Time       `json:"processed_at"`
+	CreatedAt     time.Time       `json:"created_at"`
+	UpdatedAt     time.Time       `json:"updated_at"`
+}
+
+// SearchResult is a Document plus the ranking score and text snippet
+// produced by a search.
+type SearchResult struct {
+	Document
+	Score   float64 `json:"score"`
+	Snippet string  `json:"snippet"`
+}
+
+// Open opens the SQLite database at path with WAL journaling and a 5s busy
+// timeout, and pings it to verify the connection. The handle is closed
+// again if the ping fails, so a failed Open never leaks it.
+func Open(path string) (*DB, error) {
+	conn, err := sql.Open("sqlite3", path+"?_journal_mode=WAL&_busy_timeout=5000")
+	if err != nil {
+		return nil, fmt.Errorf("opening %q: %w", path, err)
+	}
+
+	if err := conn.Ping(); err != nil {
+		conn.Close() // best effort; the ping error is the one worth reporting
+		return nil, fmt.Errorf("connecting to %q: %w", path, err)
+	}
+
+	return &DB{conn}, nil
+}
+
+// Init creates the documents table, its indexes, the documents_fts FTS5
+// index with the triggers that keep it in sync, and the settings table.
+// Every statement uses IF NOT EXISTS, so Init can run on each start.
+//
+// documents_fts is an external-content table keyed by documents.rowid, so
+// the triggers pass rowid explicitly: FTS5's 'delete' command needs the
+// rowid of the entry to remove, and content lookups go through it.
+//
+// NOTE(review): databases created with the earlier rowid-less triggers keep
+// them because of IF NOT EXISTS; they need the three triggers dropped and
+// the index rebuilt once.
+func (db *DB) Init() error {
+	schema := `
+	CREATE TABLE IF NOT EXISTS documents (
+		id TEXT PRIMARY KEY,
+		filename TEXT NOT NULL,
+		original_name TEXT NOT NULL,
+		category TEXT NOT NULL DEFAULT 'uncategorized',
+		subcategory TEXT,
+		title TEXT NOT NULL,
+		date TEXT,
+		vendor TEXT,
+		amount REAL,
+		currency TEXT DEFAULT 'USD',
+		tax_deductible INTEGER DEFAULT 0,
+		ocr_text TEXT,
+		metadata TEXT,
+		embedding BLOB,
+		storage_path TEXT NOT NULL,
+		markdown_path TEXT,
+		page_count INTEGER DEFAULT 1,
+		file_size INTEGER,
+		mime_type TEXT,
+		checksum TEXT,
+		processed_at TEXT,
+		created_at TEXT DEFAULT CURRENT_TIMESTAMP,
+		updated_at TEXT DEFAULT CURRENT_TIMESTAMP
+	);
+
+	CREATE INDEX IF NOT EXISTS idx_documents_category ON documents(category);
+	CREATE INDEX IF NOT EXISTS idx_documents_date ON documents(date);
+	CREATE INDEX IF NOT EXISTS idx_documents_vendor ON documents(vendor);
+	CREATE INDEX IF NOT EXISTS idx_documents_amount ON documents(amount);
+
+	CREATE VIRTUAL TABLE IF NOT EXISTS documents_fts USING fts5(
+		id,
+		title,
+		ocr_text,
+		vendor,
+		category,
+		content='documents',
+		content_rowid='rowid'
+	);
+
+	CREATE TRIGGER IF NOT EXISTS documents_ai AFTER INSERT ON documents BEGIN
+		INSERT INTO documents_fts(rowid, id, title, ocr_text, vendor, category)
+		VALUES (new.rowid, new.id, new.title, new.ocr_text, new.vendor, new.category);
+	END;
+
+	CREATE TRIGGER IF NOT EXISTS documents_ad AFTER DELETE ON documents BEGIN
+		INSERT INTO documents_fts(documents_fts, rowid, id, title, ocr_text, vendor, category)
+		VALUES ('delete', old.rowid, old.id, old.title, old.ocr_text, old.vendor, old.category);
+	END;
+
+	CREATE TRIGGER IF NOT EXISTS documents_au AFTER UPDATE ON documents BEGIN
+		INSERT INTO documents_fts(documents_fts, rowid, id, title, ocr_text, vendor, category)
+		VALUES ('delete', old.rowid, old.id, old.title, old.ocr_text, old.vendor, old.category);
+		INSERT INTO documents_fts(rowid, id, title, ocr_text, vendor, category)
+		VALUES (new.rowid, new.id, new.title, new.ocr_text, new.vendor, new.category);
+	END;
+
+	CREATE TABLE IF NOT EXISTS settings (
+		key TEXT PRIMARY KEY,
+		value TEXT
+	);
+	`
+	_, err := db.Exec(schema)
+	return err
+}
+
+// InsertDocument stores doc as a new row. Date is written as YYYY-MM-DD
+// (NULL when unset); the three timestamps are written as RFC 3339.
+func (db *DB) InsertDocument(doc *Document) error {
+	var dateStr *string
+	if doc.Date != nil {
+		s := doc.Date.Format("2006-01-02")
+		dateStr = &s
+	}
+
+	_, err := db.Exec(`
+		INSERT INTO documents (
+			id, filename, original_name, category, subcategory, title, date,
+			vendor, amount, currency, tax_deductible, ocr_text, metadata,
+			embedding, storage_path, markdown_path, page_count, file_size,
+			mime_type, checksum, processed_at, created_at, updated_at
+		) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
+	`, doc.ID, doc.Filename, doc.OriginalName, doc.Category, doc.Subcategory,
+		doc.Title, dateStr, doc.Vendor, doc.Amount, doc.Currency, doc.TaxDeductible,
+		doc.OCRText, doc.Metadata, doc.Embedding, doc.StoragePath, doc.MarkdownPath,
+		doc.PageCount, doc.FileSize, doc.MimeType, doc.Checksum,
+		doc.ProcessedAt.Format(time.RFC3339),
+		doc.CreatedAt.Format(time.RFC3339),
+		doc.UpdatedAt.Format(time.RFC3339))
+	return err
+}
+
+// UpdateDocument rewrites the editable fields of the row identified by
+// doc.ID and stamps updated_at with the current time.
+func (db *DB) UpdateDocument(doc *Document) error {
+	var dateStr *string
+	if doc.Date != nil {
+		s := doc.Date.Format("2006-01-02")
+		dateStr = &s
+	}
+
+	_, err := db.Exec(`
+		UPDATE documents SET
+			category = ?, subcategory = ?, title = ?, date = ?,
+			vendor = ?, amount = ?, currency = ?, tax_deductible = ?,
+			metadata = ?, updated_at = ?
+		WHERE id = ?
+	`, doc.Category, doc.Subcategory, doc.Title, dateStr,
+		doc.Vendor, doc.Amount, doc.Currency, doc.TaxDeductible,
+		doc.Metadata, time.Now().Format(time.RFC3339), doc.ID)
+	return err
+}
+
+// GetDocument returns the document with the given id, or the error from
+// scanning the row (sql.ErrNoRows when there is no such document).
+func (db *DB) GetDocument(id string) (*Document, error) {
+	row := db.QueryRow(`SELECT * FROM documents WHERE id = ?`, id)
+	return scanDocument(row)
+}
+
+// DeleteDocument removes the row with the given id. Deleting an id that
+// does not exist is not an error.
+func (db *DB) DeleteDocument(id string) error {
+	_, err := db.Exec(`DELETE FROM documents WHERE id = ?`, id)
+	return err
+}
+
+// ListDocuments returns one page of documents, newest first by document
+// date (falling back to created_at). An empty category lists all of them.
+func (db *DB) ListDocuments(category string, limit, offset int) ([]*Document, error) {
+	var rows *sql.Rows
+	var err error
+
+	if category != "" {
+		rows, err = db.Query(`
+			SELECT * FROM documents
+			WHERE category = ?
+			ORDER BY COALESCE(date, created_at) DESC
+			LIMIT ? OFFSET ?
+		`, category, limit, offset)
+	} else {
+		rows, err = db.Query(`
+			SELECT * FROM documents
+			ORDER BY COALESCE(date, created_at) DESC
+			LIMIT ? OFFSET ?
+		`, limit, offset)
+	}
+	if err != nil {
+		return nil, err
+	}
+	defer rows.Close()
+
+	return scanDocuments(rows)
+}
+
+// RecentDocuments returns the limit most recently created documents.
+func (db *DB) RecentDocuments(limit int) ([]*Document, error) {
+	rows, err := db.Query(`
+		SELECT * FROM documents
+		ORDER BY created_at DESC
+		LIMIT ?
+	`, limit)
+	if err != nil {
+		return nil, err
+	}
+	defer rows.Close()
+
+	return scanDocuments(rows)
+}
+
+// SearchFTS runs query against the FTS5 index and returns up to limit
+// documents ordered by bm25 rank, each with a snippet taken from the OCR
+// text. query is handed to MATCH as-is, so FTS5 query syntax errors come
+// back as the returned error.
+func (db *DB) SearchFTS(query string, limit int) ([]*SearchResult, error) {
+	rows, err := db.Query(`
+		SELECT d.*,
+			bm25(documents_fts) as score,
+			snippet(documents_fts, 2, '', '', '...', 32) as snippet
+		FROM documents_fts f
+		JOIN documents d ON f.id = d.id
+		WHERE documents_fts MATCH ?
+		ORDER BY bm25(documents_fts)
+		LIMIT ?
+	`, query, limit)
+	if err != nil {
+		return nil, err
+	}
+	defer rows.Close()
+
+	var results []*SearchResult
+	for rows.Next() {
+		var doc Document
+		var dateStr, processedStr, createdStr, updatedStr sql.NullString
+		var score float64
+		var snippet string
+
+		err := rows.Scan(
+			&doc.ID, &doc.Filename, &doc.OriginalName, &doc.Category, &doc.Subcategory,
+			&doc.Title, &dateStr, &doc.Vendor, &doc.Amount, &doc.Currency, &doc.TaxDeductible,
+			&doc.OCRText, &doc.Metadata, &doc.Embedding, &doc.StoragePath, &doc.MarkdownPath,
+			&doc.PageCount, &doc.FileSize, &doc.MimeType, &doc.Checksum,
+			&processedStr, &createdStr, &updatedStr, &score, &snippet,
+		)
+		if err != nil {
+			return nil, err
+		}
+
+		// An unparseable value leaves the field unset rather than
+		// presenting the zero time as if it were real data.
+		if dateStr.Valid {
+			if t, err := time.Parse("2006-01-02", dateStr.String); err == nil {
+				doc.Date = &t
+			}
+		}
+		if processedStr.Valid {
+			if t, err := time.Parse(time.RFC3339, processedStr.String); err == nil {
+				doc.ProcessedAt = t
+			}
+		}
+		if createdStr.Valid {
+			if t, err := time.Parse(time.RFC3339, createdStr.String); err == nil {
+				doc.CreatedAt = t
+			}
+		}
+		if updatedStr.Valid {
+			if t, err := time.Parse(time.RFC3339, updatedStr.String); err == nil {
+				doc.UpdatedAt = t
+			}
+		}
+
+		results = append(results, &SearchResult{Document: doc, Score: score, Snippet: snippet})
+	}
+	// Next returns false on error as well as at the end of the result set.
+	if err := rows.Err(); err != nil {
+		return nil, err
+	}
+
+	return results, nil
+}
+
+// GetStats returns dashboard counters: "total", "this_month", "total_size"
+// and "by_category" (a map of category name to document count). It returns
+// a nil map and the first error if any of the queries fails.
+func (db *DB) GetStats() (map[string]interface{}, error) {
+	stats := make(map[string]interface{})
+
+	// Total documents
+	var total int
+	if err := db.QueryRow(`SELECT COUNT(*) FROM documents`).Scan(&total); err != nil {
+		return nil, fmt.Errorf("counting documents: %w", err)
+	}
+	stats["total"] = total
+
+	// This month
+	var thisMonth int
+	if err := db.QueryRow(`SELECT COUNT(*) FROM documents WHERE created_at >= date('now', 'start of month')`).Scan(&thisMonth); err != nil {
+		return nil, fmt.Errorf("counting this month's documents: %w", err)
+	}
+	stats["this_month"] = thisMonth
+
+	// Total size
+	var totalSize int64
+	if err := db.QueryRow(`SELECT COALESCE(SUM(file_size), 0) FROM documents`).Scan(&totalSize); err != nil {
+		return nil, fmt.Errorf("summing file sizes: %w", err)
+	}
+	stats["total_size"] = totalSize
+
+	// By category
+	rows, err := db.Query(`SELECT category, COUNT(*) FROM documents GROUP BY category`)
+	if err != nil {
+		return nil, fmt.Errorf("counting documents by category: %w", err)
+	}
+	defer rows.Close()
+
+	categories := make(map[string]int)
+	for rows.Next() {
+		var cat string
+		var count int
+		if err := rows.Scan(&cat, &count); err != nil {
+			return nil, fmt.Errorf("reading category counts: %w", err)
+		}
+		categories[cat] = count
+	}
+	if err := rows.Err(); err != nil {
+		return nil, fmt.Errorf("reading category counts: %w", err)
+	}
+	stats["by_category"] = categories
+
+	return stats, nil
+}
+
+// GetExpenses lists documents in the 'expenses' category that have an
+// amount, newest first; year and month filter on the document date when
+// they are greater than zero.
+func (db *DB) GetExpenses(year int, month int) ([]*Document,
error) {
+	query := `
+		SELECT * FROM documents
+		WHERE category = 'expenses' AND amount IS NOT NULL
+	`
+	args := []interface{}{}
+
+	if year > 0 {
+		query += ` AND strftime('%Y', date) = ?`
+		args = append(args, fmt.Sprintf("%04d", year))
+	}
+	if month > 0 {
+		query += ` AND strftime('%m', date) = ?`
+		args = append(args, fmt.Sprintf("%02d", month))
+	}
+
+	query += ` ORDER BY date DESC`
+
+	rows, err := db.Query(query, args...)
+	if err != nil {
+		return nil, err
+	}
+	defer rows.Close()
+
+	return scanDocuments(rows)
+}
+
+// rowScanner is the part of *sql.Row and *sql.Rows that decoding a
+// document needs, so both can share decodeDocument.
+type rowScanner interface {
+	Scan(dest ...interface{}) error
+}
+
+// decodeDocument reads one `SELECT * FROM documents` row from s. Columns
+// are consumed in table order. Date and timestamp strings that fail to
+// parse leave the corresponding field unset.
+func decodeDocument(s rowScanner) (*Document, error) {
+	var doc Document
+	var dateStr, processedStr, createdStr, updatedStr sql.NullString
+
+	err := s.Scan(
+		&doc.ID, &doc.Filename, &doc.OriginalName, &doc.Category, &doc.Subcategory,
+		&doc.Title, &dateStr, &doc.Vendor, &doc.Amount, &doc.Currency, &doc.TaxDeductible,
+		&doc.OCRText, &doc.Metadata, &doc.Embedding, &doc.StoragePath, &doc.MarkdownPath,
+		&doc.PageCount, &doc.FileSize, &doc.MimeType, &doc.Checksum,
+		&processedStr, &createdStr, &updatedStr,
+	)
+	if err != nil {
+		return nil, err
+	}
+
+	if dateStr.Valid {
+		if t, err := time.Parse("2006-01-02", dateStr.String); err == nil {
+			doc.Date = &t
+		}
+	}
+	// Pair each timestamp column with the field it fills.
+	stamps := []struct {
+		src sql.NullString
+		dst *time.Time
+	}{
+		{processedStr, &doc.ProcessedAt},
+		{createdStr, &doc.CreatedAt},
+		{updatedStr, &doc.UpdatedAt},
+	}
+	for _, ts := range stamps {
+		if !ts.src.Valid {
+			continue
+		}
+		if t, err := time.Parse(time.RFC3339, ts.src.String); err == nil {
+			*ts.dst = t
+		}
+	}
+
+	return &doc, nil
+}
+
+// scanDocument decodes the single row produced by QueryRow.
+func scanDocument(row *sql.Row) (*Document, error) {
+	return decodeDocument(row)
+}
+
+// scanDocuments decodes every remaining row of rows. It does not close
+// rows; callers do that. Iteration errors are reported, not dropped.
+func scanDocuments(rows *sql.Rows) ([]*Document, error) {
+	var docs []*Document
+	for rows.Next() {
+		doc, err := decodeDocument(rows)
+		if err != nil {
+			return nil, err
+		}
+		docs = append(docs, doc)
+	}
+	// Next returns false on error as well as at the end of the result set.
+	if err := rows.Err(); err != nil {
+		return nil, err
+	}
+	return docs, nil
+}
diff --git a/internal/handlers/handlers.go b/internal/handlers/handlers.go
new file mode 100644
index 0000000..bf1588b
--- /dev/null
+++ b/internal/handlers/handlers.go
@@ -0,0 +1,1173 @@
+package handlers
+
+import (
+	"fmt"
+	"io"
+	"net/http"
+	"os"
+	"path/filepath"
+	"strconv"
+	"strings"
+	"time"
+
+	"docman/internal/db"
+	"docman/internal/processor"
+
+	"github.com/labstack/echo/v4"
+)
+
+// Handlers bundles the dependencies shared by the page and API handlers.
+type Handlers struct {
+	db         *db.DB
+	processor  *processor.Processor
+	storeDir   string
+	recordsDir string
+}
+
+// New returns Handlers backed by the given database, processor and the
+// store and records directories.
+func New(database *db.DB, proc *processor.Processor, storeDir, recordsDir string) *Handlers {
+	return &Handlers{
+		db:         database,
+		processor:  proc,
+		storeDir:   storeDir,
+		recordsDir: recordsDir,
+	}
+}
+
+// Page handlers
+
+// Dashboard renders the "dashboard" template with the statistics and the
+// 12 most recent documents. Query errors are ignored; the template then
+// receives whatever the failed call returned.
+func (h *Handlers) Dashboard(c echo.Context) error {
+	stats, _ := h.db.GetStats()
+	recent, _ := h.db.RecentDocuments(12)
+
+	return c.Render(http.StatusOK, "dashboard", map[string]interface{}{
+		"Stats":  stats,
+		"Recent": recent,
+		"Active": "dashboard",
+	})
+}
+
+// Browse renders one page (24 documents) of the optional "category" query
+// parameter; "page" values below 1 or non-numeric fall back to page 1.
+func (h *Handlers) Browse(c echo.Context) error {
+	category := c.QueryParam("category")
+	page, _ := strconv.Atoi(c.QueryParam("page"))
+	if page < 1 {
+		page = 1
+	}
+	limit := 24
+	offset := (page - 1) * limit
+
+	docs, _ := h.db.ListDocuments(category, limit, offset)
+	stats, _ := h.db.GetStats()
+
+	categories := []string{"taxes", "expenses", "bills", "medical", "contacts", "legal",
"insurance", "banking", "receipts", "correspondence", "uncategorized"}
+
+	return c.Render(http.StatusOK, "browse", map[string]interface{}{
+		"Documents":  docs,
+		"Category":   category,
+		"Categories": categories,
+		"Page":       page,
+		"Stats":      stats,
+		"Active":     "browse",
+	})
+}
+
+// Document renders a single document together with the contents of its
+// markdown record, when that file can be read. Any lookup error is shown
+// as "Document not found".
+func (h *Handlers) Document(c echo.Context) error {
+	id := c.Param("id")
+	doc, err := h.db.GetDocument(id)
+	if err != nil {
+		return c.Render(http.StatusNotFound, "error", map[string]interface{}{
+			"Error": "Document not found",
+		})
+	}
+
+	// Read markdown content if available
+	var markdownContent string
+	if doc.MarkdownPath != "" {
+		if data, err := os.ReadFile(doc.MarkdownPath); err == nil {
+			markdownContent = string(data)
+		}
+	}
+
+	return c.Render(http.StatusOK, "document", map[string]interface{}{
+		"Document": doc,
+		"Markdown": markdownContent,
+		"Active":   "browse",
+	})
+}
+
+// Search renders the search page. A non-empty "q" is tried against the FTS
+// index first; on error or zero hits it falls back to scanning the
+// markdown records.
+func (h *Handlers) Search(c echo.Context) error {
+	query := c.QueryParam("q")
+	var results []*db.SearchResult
+
+	if query != "" {
+		// Try FTS first
+		if ftsResults, err := h.db.SearchFTS(query, 50); err == nil && len(ftsResults) > 0 {
+			results = ftsResults
+		} else {
+			// Fallback to markdown search
+			mdResults, _ := processor.SearchMarkdown(h.recordsDir, query, 50)
+			results = mdResults
+		}
+	}
+
+	return c.Render(http.StatusOK, "search", map[string]interface{}{
+		"Query":   query,
+		"Results": results,
+		"Active":  "search",
+	})
+}
+
+// Expenses renders the expense tracker for the requested "year" (default:
+// the current year) and optional "month", with the overall and the
+// tax-deductible totals. A failed query is reported with the "error"
+// template instead of being shown as an empty list.
+func (h *Handlers) Expenses(c echo.Context) error {
+	year, _ := strconv.Atoi(c.QueryParam("year"))
+	month, _ := strconv.Atoi(c.QueryParam("month"))
+
+	// Read the clock once so the default year and the year list agree even
+	// when the request straddles New Year.
+	currentYear := time.Now().Year()
+	if year == 0 {
+		year = currentYear
+	}
+
+	docs, err := h.db.GetExpenses(year, month)
+	if err != nil {
+		return c.Render(http.StatusInternalServerError, "error", map[string]interface{}{
+			"Error": "Failed to load expenses",
+		})
+	}
+
+	// Calculate totals
+	var total float64
+	var deductible float64
+	for _, doc := range docs {
+		if doc.Amount != nil {
+			total += *doc.Amount
+			if doc.TaxDeductible {
+				deductible += *doc.Amount
+			}
+		}
+	}
+
+	// Offer the current year and the five before it.
+	years := make([]int, 0, 6)
+	for y := currentYear; y >= currentYear-5; y-- {
+		years = append(years, y)
+	}
+
+	
return c.Render(http.StatusOK, "expenses", map[string]interface{}{
+		"Documents":  docs,
+		"Year":       year,
+		"Month":      month,
+		"Total":      total,
+		"Deductible": deductible,
+		"Years":      years,
+		"Active":     "expenses",
+	})
+}
+
+// Upload renders the upload page.
+func (h *Handlers) Upload(c echo.Context) error {
+	return c.Render(http.StatusOK, "upload", map[string]interface{}{
+		"Active": "upload",
+	})
+}
+
+// API handlers
+
+// APIUpload reads the multipart "file" field fully into memory and hands
+// it to the processor. htmx requests get the "document-card" fragment,
+// everything else the document as JSON.
+func (h *Handlers) APIUpload(c echo.Context) error {
+	file, err := c.FormFile("file")
+	if err != nil {
+		return c.JSON(http.StatusBadRequest, map[string]string{"error": "No file uploaded"})
+	}
+
+	src, err := file.Open()
+	if err != nil {
+		return c.JSON(http.StatusInternalServerError, map[string]string{"error": "Cannot open file"})
+	}
+	defer src.Close()
+
+	data, err := io.ReadAll(src)
+	if err != nil {
+		return c.JSON(http.StatusInternalServerError, map[string]string{"error": "Cannot read file"})
+	}
+
+	doc, err := h.processor.ProcessSingle(c.Request().Context(), data, file.Filename)
+	if err != nil {
+		return c.JSON(http.StatusInternalServerError, map[string]string{"error": err.Error()})
+	}
+
+	// For htmx, return the document card
+	if c.Request().Header.Get("HX-Request") == "true" {
+		return c.Render(http.StatusOK, "document-card", doc)
+	}
+
+	return c.JSON(http.StatusOK, doc)
+}
+
+// APIDocument returns the document with the given id as JSON, or 404.
+func (h *Handlers) APIDocument(c echo.Context) error {
+	id := c.Param("id")
+	doc, err := h.db.GetDocument(id)
+	if err != nil {
+		return c.JSON(http.StatusNotFound, map[string]string{"error": "Not found"})
+	}
+	return c.JSON(http.StatusOK, doc)
+}
+
+// APIUpdateDocument applies the posted fields to the stored document and
+// returns the updated document as JSON.
+//
+// NOTE(review): Category, Title and Date are only applied when non-empty,
+// but Subcategory, Vendor, Amount and TaxDeductible are overwritten on
+// every call, so a payload that omits them resets them. Confirm that all
+// clients send the full object.
+func (h *Handlers) APIUpdateDocument(c echo.Context) error {
+	id := c.Param("id")
+	doc, err := h.db.GetDocument(id)
+	if err != nil {
+		return c.JSON(http.StatusNotFound, map[string]string{"error": "Not found"})
+	}
+
+	// Parse update fields
+	var update struct {
+		Category      string   `json:"category"`
+		Subcategory   string   `json:"subcategory"`
+		Title         string   `json:"title"`
+		Date          string   `json:"date"`
+		Vendor        string   `json:"vendor"`
+		Amount        *float64 `json:"amount"`
+		TaxDeductible bool     `json:"tax_deductible"`
+	}
+
+	if err := c.Bind(&update); err != nil {
+		return c.JSON(http.StatusBadRequest, map[string]string{"error": "Invalid request"})
+	}
+
+	if update.Category != "" {
+		doc.Category = update.Category
+	}
+	doc.Subcategory = update.Subcategory
+	if update.Title != "" {
+		doc.Title = update.Title
+	}
+	if update.Date != "" {
+		if t, err := time.Parse("2006-01-02", update.Date); err == nil {
+			doc.Date = &t
+		}
+	}
+	doc.Vendor = update.Vendor
+	doc.Amount = update.Amount
+	doc.TaxDeductible = update.TaxDeductible
+
+	if err := h.db.UpdateDocument(doc); err != nil {
+		return c.JSON(http.StatusInternalServerError, map[string]string{"error": err.Error()})
+	}
+
+	return c.JSON(http.StatusOK, doc)
+}
+
+// APIDeleteDocument deletes the document row and then its stored file and
+// markdown record. The row goes first so that a failed database delete
+// never leaves a listed document whose files are already gone; file
+// removal afterwards is best effort and only logged.
+func (h *Handlers) APIDeleteDocument(c echo.Context) error {
+	id := c.Param("id")
+	doc, err := h.db.GetDocument(id)
+	if err != nil {
+		return c.JSON(http.StatusNotFound, map[string]string{"error": "Not found"})
+	}
+
+	if err := h.db.DeleteDocument(id); err != nil {
+		return c.JSON(http.StatusInternalServerError, map[string]string{"error": err.Error()})
+	}
+
+	// Remove files
+	for _, path := range []string{doc.StoragePath, doc.MarkdownPath} {
+		if path == "" {
+			continue
+		}
+		if err := os.Remove(path); err != nil && !os.IsNotExist(err) {
+			c.Logger().Warnf("removing %s of deleted document %s: %v", path, id, err)
+		}
+	}
+
+	return c.NoContent(http.StatusOK)
+}
+
+// APISearch returns full-text search results for "q" as JSON, falling back
+// to the markdown search when the FTS query fails.
+func (h *Handlers) APISearch(c echo.Context) error {
+	query := c.QueryParam("q")
+	if query == "" {
+		return c.JSON(http.StatusBadRequest, map[string]string{"error": "Query required"})
+	}
+
+	results, err := h.db.SearchFTS(query, 50)
+	if err != nil {
+		// Fallback to markdown search
+		results, _ = processor.SearchMarkdown(h.recordsDir, query, 50)
+	}
+
+	return c.JSON(http.StatusOK, results)
+}
+
+// APIExportExpenses streams the expenses of "year" (default: the current
+// year) as a CSV attachment. The query runs before any header is sent so
+// that a database failure yields a 500 rather than an empty download.
+func (h *Handlers) APIExportExpenses(c echo.Context) error {
+	year, _ := strconv.Atoi(c.QueryParam("year"))
+	if year == 0 {
+		year = time.Now().Year()
+	}
+
+	docs, err := h.db.GetExpenses(year, 0)
+	if err != nil {
+		return c.JSON(http.StatusInternalServerError, map[string]string{"error": err.Error()})
+	}
+
+	c.Response().Header().Set("Content-Type", "text/csv")
+	c.Response().Header().Set("Content-Disposition", fmt.Sprintf("attachment; filename=expenses-%d.csv", year))
+
+	// Write through the echo response (not its raw Writer) so echo knows
+	// the response is committed if a later write fails.
+	w := c.Response()
+	if _, err := w.Write([]byte("Date,Vendor,Title,Amount,Currency,Tax Deductible,Category\n")); err != nil {
+		return err
+	}
+
+	for _, doc := range docs {
+		date := ""
+		if doc.Date != nil {
+			date = doc.Date.Format("2006-01-02")
+		}
+		amount := ""
+		if doc.Amount != nil {
+			amount = fmt.Sprintf("%.2f", *doc.Amount)
+		}
+		taxDed := "No"
+		if doc.TaxDeductible {
+			taxDed = "Yes"
+		}
+		// Currency and category are stored free text too, so they get the
+		// same escaping as the other text columns.
+		line := fmt.Sprintf("%s,%s,%s,%s,%s,%s,%s\n",
+			csvEscape(date),
+			csvEscape(doc.Vendor),
+			csvEscape(doc.Title),
+			amount,
+			csvEscape(doc.Currency),
+			taxDed,
+			csvEscape(doc.Category),
+		)
+		if _, err := w.Write([]byte(line)); err != nil {
+			return err
+		}
+	}
+
+	return nil
+}
+
+// APIStats returns the dashboard statistics as JSON.
+func (h *Handlers) APIStats(c echo.Context) error {
+	stats, err := h.db.GetStats()
+	if err != nil {
+		return c.JSON(http.StatusInternalServerError, map[string]string{"error": err.Error()})
+	}
+	return c.JSON(http.StatusOK, stats)
+}
+
+// ServePDF serves PDF files for viewing
+func (h *Handlers) ServePDF(c echo.Context) error {
+	filename := c.Param("filename")
+	// Sanitize filename to prevent path traversal
+	filename = filepath.Base(filename)
+	path := filepath.Join(h.storeDir, filename)
+
+	if _, err := os.Stat(path); os.IsNotExist(err) {
+		return c.String(http.StatusNotFound, "File not found")
+	}
+
+	return c.File(path)
+}
+
+// ServeMarkdown serves the markdown record named by the "path" query
+// parameter, provided it resolves to a location inside the records
+// directory. Containment is decided on the path relative to that
+// directory, so a sibling such as "<records>-old" is rejected even though
+// it shares the string prefix. Symlinks are not resolved.
+func (h *Handlers) ServeMarkdown(c echo.Context) error {
+	// Get path from query
+	path := c.QueryParam("path")
+	if path == "" {
+		return c.String(http.StatusBadRequest, "Path required")
+	}
+
+	// Ensure path is within records directory
+	absRecords, err := filepath.Abs(h.recordsDir)
+	if err != nil {
+		return c.String(http.StatusInternalServerError, "Records directory unavailable")
+	}
+	absPath, err := filepath.Abs(path)
+	if err != nil {
+		return c.String(http.StatusBadRequest, "Invalid path")
+	}
+	rel, err := filepath.Rel(absRecords, absPath)
+	if err != nil || rel == ".." || strings.HasPrefix(rel, ".."+string(filepath.Separator)) {
+		return c.String(http.StatusForbidden, "Access denied")
+	}
+
+	// Read the path that was checked, not the raw query value.
+	data, err := os.ReadFile(absPath)
+	if err != nil {
+		return c.String(http.StatusNotFound, "File not found")
+	}
+
+	return c.Blob(http.StatusOK, "text/markdown", data)
+}
+
+func
// csvEscape returns s formatted as a single CSV field.
//
// A field containing a comma, a double quote, or a line break (CR or LF) is
// wrapped in double quotes with embedded quotes doubled; any other value is
// returned unchanged.
func csvEscape(s string) string {
	if !strings.ContainsAny(s, ",\"\r\n") {
		return s
	}
	return `"` + strings.ReplaceAll(s, `"`, `""`) + `"`
}
+ +
+ %s +
+
+ +`, + title, content, + activeClass(active, "dashboard"), + activeClass(active, "browse"), + activeClass(active, "search"), + activeClass(active, "expenses"), + activeClass(active, "upload"), + ) +} + +func activeClass(active, name string) string { + if active == name { + return "active" + } + return "" +} + +func dashboardHTML(data map[string]interface{}) string { + stats := data["Stats"].(map[string]interface{}) + recent := data["Recent"].([]*db.Document) + + totalSize := int64(0) + if v, ok := stats["total_size"].(int64); ok { + totalSize = v + } + + content := fmt.Sprintf(` +
+

Dashboard

+
+
+
+
%d
+
Total Documents
+
+
+
%d
+
This Month
+
+
+
%s
+
Storage Used
+
+
+
+

Recent Documents

+
+ %s +
+
+ `, stats["total"], stats["this_month"], formatBytes(totalSize), renderDocCards(recent)) + + return baseHTML("Dashboard", content, data["Active"].(string)) +} + +func browseHTML(data map[string]interface{}) string { + docs := data["Documents"].([]*db.Document) + categories := data["Categories"].([]string) + currentCat := data["Category"].(string) + + catOptions := `` + for _, cat := range categories { + selected := "" + if cat == currentCat { + selected = "selected" + } + catOptions += fmt.Sprintf(``, cat, selected, strings.Title(cat)) + } + + content := fmt.Sprintf(` +
+

Browse Documents

+ + Upload +
+
+ +
+
+ %s +
+ `, catOptions, renderDocCards(docs)) + + return baseHTML("Browse", content, data["Active"].(string)) +} + +func documentHTML(data map[string]interface{}) string { + doc := data["Document"].(*db.Document) + + dateStr := "—" + if doc.Date != nil { + dateStr = doc.Date.Format("January 2, 2006") + } + amountStr := "—" + if doc.Amount != nil { + amountStr = fmt.Sprintf("%s %.2f", doc.Currency, *doc.Amount) + } + taxDed := "No" + if doc.TaxDeductible { + taxDed = "Yes" + } + + content := fmt.Sprintf(` +
+

%s

+
+ ← Back + +
+
+
+
+
+

Preview

+ +
+
+
+
+

Details

+ + + + + + + + + +
Category%s
Date%s
Vendor%s
Amount%s
Tax Deductible%s
Pages%d
File Size%s
Processed%s
+
+
+

OCR Text

+
%s
+
+
+
+ + `, doc.Title, doc.ID, doc.Filename, doc.Category, doc.Category, + dateStr, doc.Vendor, amountStr, taxDed, doc.PageCount, + formatBytes(doc.FileSize), doc.ProcessedAt.Format("Jan 2, 2006 3:04 PM"), + escapeHTML(truncateText(doc.OCRText, 5000))) + + return baseHTML(doc.Title, content, data["Active"].(string)) +} + +func searchHTML(data map[string]interface{}) string { + query := data["Query"].(string) + results, _ := data["Results"].([]*db.SearchResult) + + resultsHTML := "" + if query == "" { + resultsHTML = `

Enter a search query to find documents

` + } else if len(results) == 0 { + resultsHTML = `

No results found

` + } else { + for _, r := range results { + resultsHTML += fmt.Sprintf(` +
+

%s

+ %s +

%s

+
+ `, r.ID, r.Title, r.Category, r.Category, r.Snippet) + } + } + + content := fmt.Sprintf(` +
+

Search Documents

+
+
+
+ +
+
+ %s +
+
+ `, escapeHTML(query), resultsHTML) + + return baseHTML("Search", content, data["Active"].(string)) +} + +func expensesHTML(data map[string]interface{}) string { + docs := data["Documents"].([]*db.Document) + year := data["Year"].(int) + month := data["Month"].(int) + total := data["Total"].(float64) + deductible := data["Deductible"].(float64) + years := data["Years"].([]int) + + yearOptions := "" + for _, y := range years { + selected := "" + if y == year { + selected = "selected" + } + yearOptions += fmt.Sprintf(``, y, selected, y) + } + + monthOptions := `` + months := []string{"", "January", "February", "March", "April", "May", "June", "July", "August", "September", "October", "November", "December"} + for i := 1; i <= 12; i++ { + selected := "" + if i == month { + selected = "selected" + } + monthOptions += fmt.Sprintf(``, i, selected, months[i]) + } + + rows := "" + for _, doc := range docs { + dateStr := "—" + if doc.Date != nil { + dateStr = doc.Date.Format("2006-01-02") + } + amountStr := "—" + if doc.Amount != nil { + amountStr = fmt.Sprintf("%.2f", *doc.Amount) + } + taxDed := "" + if doc.TaxDeductible { + taxDed = "✓" + } + rows += fmt.Sprintf(` + + %s + %s + %s + %s %s + %s + + `, dateStr, doc.Vendor, doc.ID, doc.Title, doc.Currency, amountStr, taxDed) + } + + content := fmt.Sprintf(` +
+

Expenses

+ Export CSV +
+
+
+
$%.2f
+
Total Expenses
+
+
+
$%.2f
+
Tax Deductible
+
+
+
+
+ + +
+ + + + + + + + + + + %s +
DateVendorDescriptionAmountTax Ded.
+
+ + `, year, total, deductible, yearOptions, monthOptions, rows) + + return baseHTML("Expenses", content, data["Active"].(string)) +} + +func uploadHTML(data map[string]interface{}) string { + content := ` +
+

Upload Documents

+
+
+
+ +
+
+
+ + ` + + return baseHTML("Upload", content, data["Active"].(string)) +} + +func documentCardHTML(data map[string]interface{}) string { + doc := data["Data"].(*db.Document) + return renderDocCard(doc) +} + +func renderDocCards(docs []*db.Document) string { + html := "" + for _, doc := range docs { + html += renderDocCard(doc) + } + return html +} + +func renderDocCard(doc *db.Document) string { + dateStr := "" + if doc.Date != nil { + dateStr = doc.Date.Format("Jan 2, 2006") + } + + icon := "📄" + switch doc.Category { + case "taxes": + icon = "🧾" + case "expenses": + icon = "🛒" + case "bills": + icon = "📬" + case "medical": + icon = "🏥" + case "legal": + icon = "⚖️" + case "insurance": + icon = "🛡️" + case "banking": + icon = "🏦" + case "receipts": + icon = "🧾" + } + + return fmt.Sprintf(` +
+ +
%s
+
+
%s
+
+ %s + %s +
+
+
+
+ `, doc.ID, icon, escapeHTML(doc.Title), doc.Category, doc.Category, dateStr) +} + +func errorHTML(data map[string]interface{}) string { + errMsg := data["Error"].(string) + content := fmt.Sprintf(` +
+

Error

+

%s

+ Go Home +
// formatBytes renders a byte count using binary (1024-based) units with one
// decimal place, e.g. 1536 -> "1.5 KB". Values below 1024 are printed as
// whole bytes.
func formatBytes(b int64) string {
	const unit = 1024
	if b < unit {
		return fmt.Sprintf("%d B", b)
	}
	div, exp := int64(unit), 0
	for n := b / unit; n >= unit; n /= unit {
		div *= unit
		exp++
	}
	return fmt.Sprintf("%.1f %cB", float64(b)/float64(div), "KMGTPE"[exp])
}

// htmlEscaper replaces the characters that are significant in HTML text and
// attribute values. A Replacer works in a single pass, so the "&" introduced
// by one replacement is never escaped a second time.
var htmlEscaper = strings.NewReplacer(
	"&", "&amp;",
	"<", "&lt;",
	">", "&gt;",
	`"`, "&quot;",
	"'", "&#39;",
)

// escapeHTML returns s with &, <, >, " and ' replaced by their HTML entities
// so that it can be embedded in element content or a quoted attribute.
func escapeHTML(s string) string {
	return htmlEscaper.Replace(s)
}

// truncateText shortens s to at most max bytes and appends "..." when
// something was cut off. The cut is moved back to the start of a UTF-8
// sequence so the result never ends in a partial character. A negative max
// is treated as zero.
func truncateText(s string, max int) string {
	if max < 0 {
		max = 0
	}
	if len(s) <= max {
		return s
	}
	cut := max
	// 10xxxxxx marks a continuation byte: step back to the lead byte.
	for cut > 0 && s[cut]&0xC0 == 0x80 {
		cut--
	}
	return s[:cut] + "..."
}
KeyFields map[string]string `json:"key_fields,omitempty"` +} + +func New(cfg Config, database *db.DB) *Processor { + config := openai.DefaultConfig(cfg.AIKey) + config.BaseURL = cfg.AIEndpoint + + return &Processor{ + db: database, + inboxDir: cfg.InboxDir, + storeDir: cfg.StoreDir, + recordsDir: cfg.RecordsDir, + aiClient: openai.NewClientWithConfig(config), + aiModel: cfg.AIModel, + embedModel: cfg.EmbedModel, + } +} + +func (p *Processor) Watch(ctx context.Context) error { + // Ensure directories exist + for _, dir := range []string{p.inboxDir, p.storeDir, p.recordsDir} { + if err := os.MkdirAll(dir, 0755); err != nil { + return fmt.Errorf("create directory %s: %w", dir, err) + } + } + + // Process existing files first + entries, _ := os.ReadDir(p.inboxDir) + for _, entry := range entries { + if entry.IsDir() || strings.HasPrefix(entry.Name(), ".") { + continue + } + path := filepath.Join(p.inboxDir, entry.Name()) + if err := p.ProcessFile(ctx, path); err != nil { + log.Printf("Error processing %s: %v", path, err) + } + } + + // Watch for new files + watcher, err := fsnotify.NewWatcher() + if err != nil { + return err + } + defer watcher.Close() + + if err := watcher.Add(p.inboxDir); err != nil { + return err + } + + log.Printf("Watching inbox: %s", p.inboxDir) + + for { + select { + case <-ctx.Done(): + return ctx.Err() + case event, ok := <-watcher.Events: + if !ok { + return nil + } + if event.Op&fsnotify.Create == fsnotify.Create { + // Wait a moment for file to be fully written + time.Sleep(500 * time.Millisecond) + if err := p.ProcessFile(ctx, event.Name); err != nil { + log.Printf("Error processing %s: %v", event.Name, err) + } + } + case err, ok := <-watcher.Errors: + if !ok { + return nil + } + log.Printf("Watcher error: %v", err) + } + } +} + +func (p *Processor) ProcessFile(ctx context.Context, path string) error { + // Skip hidden files and non-PDFs/images + base := filepath.Base(path) + if strings.HasPrefix(base, ".") { + return nil + } + + ext := 
strings.ToLower(filepath.Ext(path)) + if ext != ".pdf" && ext != ".jpg" && ext != ".jpeg" && ext != ".png" { + log.Printf("Skipping non-document file: %s", path) + return nil + } + + log.Printf("Processing: %s", path) + + // Read file + data, err := os.ReadFile(path) + if err != nil { + return err + } + + // Compute checksum + hash := sha256.Sum256(data) + checksum := hex.EncodeToString(hash[:]) + + // Generate ID + id := uuid.New().String() + + // Extract text via OCR + ocrText, pageCount, err := p.extractText(path, ext) + if err != nil { + log.Printf("OCR failed for %s: %v", path, err) + ocrText = "" + } + + // Classify with AI + classification, err := p.classify(ctx, ocrText, base) + if err != nil { + log.Printf("Classification failed for %s: %v", path, err) + classification = &Classification{ + Category: "uncategorized", + Title: base, + } + } + + // Store PDF + storageName := fmt.Sprintf("%s%s", checksum[:16], ext) + storagePath := filepath.Join(p.storeDir, storageName) + if err := os.WriteFile(storagePath, data, 0644); err != nil { + return err + } + + // Parse date + var docDate *time.Time + if classification.Date != "" { + if t, err := parseDate(classification.Date); err == nil { + docDate = &t + } + } + + // Create document record + doc := &db.Document{ + ID: id, + Filename: storageName, + OriginalName: base, + Category: classification.Category, + Subcategory: classification.Subcategory, + Title: classification.Title, + Date: docDate, + Vendor: classification.Vendor, + Amount: classification.Amount, + Currency: classification.Currency, + TaxDeductible: classification.TaxDeductible, + OCRText: ocrText, + StoragePath: storagePath, + PageCount: pageCount, + FileSize: int64(len(data)), + MimeType: getMimeType(ext), + Checksum: checksum, + ProcessedAt: time.Now(), + CreatedAt: time.Now(), + UpdatedAt: time.Now(), + } + + // Generate markdown record + mdPath, err := p.writeMarkdown(doc, classification) + if err != nil { + log.Printf("Failed to write markdown: 
%v", err) + } else { + doc.MarkdownPath = mdPath + } + + // Generate embedding + if embedding, err := p.generateEmbedding(ctx, ocrText); err == nil { + doc.Embedding = embedding + } + + // Store metadata as JSON + if meta, err := json.Marshal(classification.KeyFields); err == nil { + doc.Metadata = meta + } + + // Insert into database + if err := p.db.InsertDocument(doc); err != nil { + return err + } + + // Remove from inbox + if err := os.Remove(path); err != nil { + log.Printf("Failed to remove inbox file: %v", err) + } + + log.Printf("Processed: %s -> %s (%s)", base, classification.Title, classification.Category) + return nil +} + +func (p *Processor) extractText(path, ext string) (string, int, error) { + if ext == ".pdf" { + return p.extractPDFText(path) + } + return p.extractImageText(path) +} + +func (p *Processor) extractPDFText(path string) (string, int, error) { + // Try pdftotext first (poppler-utils) + cmd := exec.Command("pdftotext", "-layout", path, "-") + output, err := cmd.Output() + if err == nil && len(output) > 100 { + // Count pages + pageCmd := exec.Command("pdfinfo", path) + pageOut, _ := pageCmd.Output() + pages := 1 + if match := regexp.MustCompile(`Pages:\s+(\d+)`).FindSubmatch(pageOut); len(match) > 1 { + fmt.Sscanf(string(match[1]), "%d", &pages) + } + return string(output), pages, nil + } + + // Fallback to OCR via tesseract + // Convert PDF to images first + tmpDir, err := os.MkdirTemp("", "docman-ocr-") + if err != nil { + return "", 0, err + } + defer os.RemoveAll(tmpDir) + + // Use pdftoppm to convert to images + cmd = exec.Command("pdftoppm", "-png", "-r", "300", path, filepath.Join(tmpDir, "page")) + if err := cmd.Run(); err != nil { + return "", 0, fmt.Errorf("pdftoppm failed: %w", err) + } + + // OCR each page + var textBuf bytes.Buffer + pages, _ := filepath.Glob(filepath.Join(tmpDir, "page-*.png")) + for _, pagePath := range pages { + text, _, _ := p.extractImageText(pagePath) + textBuf.WriteString(text) + 
textBuf.WriteString("\n\n--- Page Break ---\n\n") + } + + return textBuf.String(), len(pages), nil +} + +func (p *Processor) extractImageText(path string) (string, int, error) { + cmd := exec.Command("tesseract", path, "stdout", "-l", "eng+nld") + output, err := cmd.Output() + if err != nil { + return "", 1, err + } + return string(output), 1, nil +} + +func (p *Processor) classify(ctx context.Context, text, filename string) (*Classification, error) { + prompt := fmt.Sprintf(`Analyze this scanned document and extract structured information. + +Document filename: %s + +OCR Text: +%s + +Classify and extract the following JSON structure: +{ + "category": "taxes|expenses|bills|medical|contacts|legal|insurance|banking|receipts|correspondence|uncategorized", + "subcategory": "more specific category if applicable", + "title": "descriptive title for this document", + "date": "YYYY-MM-DD if found", + "vendor": "company/person name if applicable", + "amount": numeric amount if this is a financial document, + "currency": "USD" or other currency code, + "tax_deductible": true/false if this is a deductible expense, + "summary": "one paragraph summary of the document", + "key_fields": {"field_name": "value"} for any other important extracted data +} + +Categories: +- taxes: W-2, 1099, tax returns, deductions +- expenses: receipts, invoices for purchases +- bills: utility bills, service bills +- medical: medical records, prescriptions, EOBs +- contacts: business cards, contact info +- legal: contracts, agreements, legal documents +- insurance: policies, claims +- banking: statements, checks +- receipts: general purchase receipts +- correspondence: letters, emails + +Return ONLY valid JSON.`, filename, truncate(text, 4000)) + + resp, err := p.aiClient.CreateChatCompletion(ctx, openai.ChatCompletionRequest{ + Model: p.aiModel, + Messages: []openai.ChatCompletionMessage{ + {Role: openai.ChatMessageRoleUser, Content: prompt}, + }, + Temperature: 0.1, + }) + if err != nil { + return 
nil, err + } + + if len(resp.Choices) == 0 { + return nil, fmt.Errorf("no response from AI") + } + + content := resp.Choices[0].Message.Content + // Extract JSON from response + content = extractJSON(content) + + var classification Classification + if err := json.Unmarshal([]byte(content), &classification); err != nil { + return nil, fmt.Errorf("parse classification: %w", err) + } + + return &classification, nil +} + +func (p *Processor) generateEmbedding(ctx context.Context, text string) ([]byte, error) { + if p.embedModel == "" { + return nil, nil + } + + resp, err := p.aiClient.CreateEmbeddings(ctx, openai.EmbeddingRequest{ + Model: openai.EmbeddingModel(p.embedModel), + Input: []string{truncate(text, 8000)}, + }) + if err != nil { + return nil, err + } + + if len(resp.Data) == 0 { + return nil, fmt.Errorf("no embedding returned") + } + + // Serialize embedding to bytes + embData, err := json.Marshal(resp.Data[0].Embedding) + return embData, err +} + +func (p *Processor) writeMarkdown(doc *db.Document, class *Classification) (string, error) { + // Create category subdirectory + catDir := filepath.Join(p.recordsDir, doc.Category) + if err := os.MkdirAll(catDir, 0755); err != nil { + return "", err + } + + // Generate filename + dateStr := "undated" + if doc.Date != nil { + dateStr = doc.Date.Format("2006-01-02") + } + safeName := sanitizeFilename(doc.Title) + mdName := fmt.Sprintf("%s_%s.md", dateStr, safeName) + mdPath := filepath.Join(catDir, mdName) + + // Build markdown content + var buf bytes.Buffer + buf.WriteString(fmt.Sprintf("# %s\n\n", doc.Title)) + buf.WriteString("## Metadata\n\n") + buf.WriteString(fmt.Sprintf("- **ID:** %s\n", doc.ID)) + buf.WriteString(fmt.Sprintf("- **Category:** %s", doc.Category)) + if doc.Subcategory != "" { + buf.WriteString(fmt.Sprintf(" / %s", doc.Subcategory)) + } + buf.WriteString("\n") + if doc.Date != nil { + buf.WriteString(fmt.Sprintf("- **Date:** %s\n", doc.Date.Format("2006-01-02"))) + } + if doc.Vendor != "" { + 
// Helper functions

// truncate returns at most the first max bytes of s. The cut is moved back to
// the start of a UTF-8 sequence so the result is never left with a partial
// character. A negative max is treated as zero.
func truncate(s string, max int) string {
	if max < 0 {
		max = 0
	}
	if len(s) <= max {
		return s
	}
	// 10xxxxxx marks a continuation byte: step back to the lead byte.
	for max > 0 && s[max]&0xC0 == 0x80 {
		max--
	}
	return s[:max]
}

// extractJSON returns the span from the first "{" to the last "}" in s, which
// strips code fences or prose a model may wrap around its JSON answer. When
// no such span exists s is returned unchanged.
func extractJSON(s string) string {
	start := strings.Index(s, "{")
	end := strings.LastIndex(s, "}")
	if start < 0 || end <= start {
		return s
	}
	return s[start : end+1]
}

// dateLayouts lists the layouts parseDate tries, in order.
var dateLayouts = []string{
	"2006-01-02",
	"01/02/2006",
	"1/2/2006",
	"January 2, 2006",
	"Jan 2, 2006",
	"2006/01/02",
}

// parseDate parses s using the first layout in dateLayouts that matches.
// Surrounding whitespace is ignored. An error is returned when no layout
// matches.
func parseDate(s string) (time.Time, error) {
	trimmed := strings.TrimSpace(s)
	for _, layout := range dateLayouts {
		if t, err := time.Parse(layout, trimmed); err == nil {
			return t, nil
		}
	}
	return time.Time{}, fmt.Errorf("cannot parse date: %s", s)
}

// getMimeType maps a file extension (with leading dot, any case) to its MIME
// type. Unknown extensions yield "application/octet-stream".
func getMimeType(ext string) string {
	switch strings.ToLower(ext) {
	case ".pdf":
		return "application/pdf"
	case ".jpg", ".jpeg":
		return "image/jpeg"
	case ".png":
		return "image/png"
	default:
		return "application/octet-stream"
	}
}

// unsafeFilenameChars matches every run of characters that is not a
// lower-case ASCII letter or digit. Compiled once at package level.
var unsafeFilenameChars = regexp.MustCompile(`[^a-z0-9]+`)

// sanitizeFilename turns s into a lower-case slug of at most 50 characters
// consisting of a-z, 0-9 and single dashes, with no leading or trailing dash.
// If nothing usable remains (empty or entirely non-ASCII input) it returns
// "untitled" so callers never build a file name from an empty component.
func sanitizeFilename(s string) string {
	slug := unsafeFilenameChars.ReplaceAllString(strings.ToLower(s), "-")
	slug = strings.Trim(slug, "-")
	if len(slug) > 50 {
		// The slug is pure ASCII here, so a byte cut is safe; trim again
		// because the cut may land right after a dash.
		slug = strings.TrimRight(slug[:50], "-")
	}
	if slug == "" {
		return "untitled"
	}
	return slug
}
ProcessedAt: time.Now(), + CreatedAt: time.Now(), + UpdatedAt: time.Now(), + } + + mdPath, _ := p.writeMarkdown(doc, classification) + doc.MarkdownPath = mdPath + + if embedding, err := p.generateEmbedding(ctx, ocrText); err == nil { + doc.Embedding = embedding + } + + if meta, err := json.Marshal(classification.KeyFields); err == nil { + doc.Metadata = meta + } + + if err := p.db.InsertDocument(doc); err != nil { + return nil, err + } + + return doc, nil +} + +// SearchMarkdown searches markdown files directly (fallback when embeddings unavailable) +func SearchMarkdown(recordsDir, query string, limit int) ([]*db.SearchResult, error) { + var results []*db.SearchResult + query = strings.ToLower(query) + terms := strings.Fields(query) + + err := filepath.Walk(recordsDir, func(path string, info os.FileInfo, err error) error { + if err != nil || info.IsDir() || !strings.HasSuffix(path, ".md") { + return nil + } + + data, err := os.ReadFile(path) + if err != nil { + return nil + } + + content := strings.ToLower(string(data)) + score := 0.0 + for _, term := range terms { + if strings.Contains(content, term) { + score += 1.0 + } + } + + if score > 0 { + // Extract title from first line + lines := strings.Split(string(data), "\n") + title := strings.TrimPrefix(lines[0], "# ") + + // Find snippet around first match + snippet := findSnippet(string(data), terms[0], 100) + + results = append(results, &db.SearchResult{ + Document: db.Document{ + Title: title, + MarkdownPath: path, + }, + Score: score / float64(len(terms)), + Snippet: snippet, + }) + } + return nil + }) + + if err != nil { + return nil, err + } + + // Sort by score descending + for i := 0; i < len(results)-1; i++ { + for j := i + 1; j < len(results); j++ { + if results[j].Score > results[i].Score { + results[i], results[j] = results[j], results[i] + } + } + } + + if len(results) > limit { + results = results[:limit] + } + + return results, nil +} + +func findSnippet(text, term string, radius int) string { + 
// findSnippet returns an excerpt of text around the first case-insensitive
// occurrence of term, extending radius bytes to either side. "..." is added
// on each side where text was cut off. When term does not occur, the first
// 2*radius bytes of text are returned (followed by "..." if text is longer).
//
// Offsets are computed in the original text, not in its lower-cased copy:
// lower-casing can change the byte length of a character, so positions found
// in the copy are not valid indices into text. All cuts are moved to the
// nearest character boundary so the excerpt is always valid UTF-8 when text
// is. A negative radius is treated as zero.
func findSnippet(text, term string, radius int) string {
	if radius < 0 {
		radius = 0
	}

	start, end := snippetMatchSpan(text, term)
	if start < 0 {
		limit := radius * 2
		if len(text) <= limit {
			return text
		}
		for limit > 0 && text[limit]&0xC0 == 0x80 {
			limit--
		}
		return text[:limit] + "..."
	}

	from := start - radius
	if from < 0 {
		from = 0
	}
	// Widen to the start of the character the cut falls into.
	for from > 0 && text[from]&0xC0 == 0x80 {
		from--
	}

	to := end + radius
	if to > len(text) {
		to = len(text)
	}
	// Widen to the end of the character the cut falls into.
	for to < len(text) && text[to]&0xC0 == 0x80 {
		to++
	}

	snippet := text[from:to]
	if from > 0 {
		snippet = "..." + snippet
	}
	if to < len(text) {
		snippet += "..."
	}
	return snippet
}

// snippetMatchSpan locates the first case-insensitive occurrence of term in
// text and returns its byte range [start, end) within text, or (-1, -1) when
// there is none. The match is found in the lower-cased text and then mapped
// back by walking text character by character while counting how many bytes
// each one occupies once lower-cased.
func snippetMatchSpan(text, term string) (int, int) {
	needle := strings.ToLower(term)
	pos := strings.Index(strings.ToLower(text), needle)
	if pos < 0 {
		return -1, -1
	}

	start, end := -1, len(text)
	consumed := 0 // bytes of the lower-cased text covered so far
	for i, r := range text {
		if start < 0 && consumed >= pos {
			start = i
		}
		if consumed >= pos+len(needle) {
			end = i
			break
		}
		if r < 0x80 {
			consumed++ // ASCII keeps its width when lower-cased
		} else {
			consumed += len(strings.ToLower(string(r)))
		}
	}
	if start < 0 {
		// Only reachable for an empty match at the very end of text.
		start = len(text)
	}
	return start, end
}