Initial commit
This commit is contained in:
commit
880f9dab9d
|
|
@ -0,0 +1,5 @@
|
||||||
|
# Binaries
|
||||||
|
*.exe
|
||||||
|
node_modules/
|
||||||
|
.venv/
|
||||||
|
__pycache__/
|
||||||
|
|
@ -0,0 +1,51 @@
|
||||||
|
.PHONY: build run install clean dev deps
|
||||||
|
|
||||||
|
BINARY=docman
|
||||||
|
VERSION=$(shell git describe --tags --always --dirty 2>/dev/null || echo "dev")
|
||||||
|
BUILD_DIR=build
|
||||||
|
|
||||||
|
# Build flags
|
||||||
|
LDFLAGS=-ldflags "-s -w -X main.Version=$(VERSION)"
|
||||||
|
CGO_CFLAGS=-DSQLITE_ENABLE_FTS5
|
||||||
|
TAGS=-tags "fts5"
|
||||||
|
|
||||||
|
build:
|
||||||
|
@mkdir -p $(BUILD_DIR)
|
||||||
|
CGO_CFLAGS="$(CGO_CFLAGS)" go build $(TAGS) $(LDFLAGS) -o $(BUILD_DIR)/$(BINARY) ./cmd/docman
|
||||||
|
|
||||||
|
run: build
|
||||||
|
./$(BUILD_DIR)/$(BINARY)
|
||||||
|
|
||||||
|
dev:
|
||||||
|
go run ./cmd/docman -port 8200
|
||||||
|
|
||||||
|
deps:
|
||||||
|
go mod download
|
||||||
|
go mod tidy
|
||||||
|
|
||||||
|
# Install to ~/bin
|
||||||
|
install: build
|
||||||
|
@mkdir -p $(HOME)/bin
|
||||||
|
cp $(BUILD_DIR)/$(BINARY) $(HOME)/bin/
|
||||||
|
@echo "Installed to $(HOME)/bin/$(BINARY)"
|
||||||
|
|
||||||
|
# Install systemd service
|
||||||
|
install-service:
|
||||||
|
@mkdir -p $(HOME)/.config/systemd/user
|
||||||
|
@envsubst < deploy/docman.service > $(HOME)/.config/systemd/user/docman.service
|
||||||
|
systemctl --user daemon-reload
|
||||||
|
systemctl --user enable docman
|
||||||
|
@echo "Service installed. Start with: systemctl --user start docman"
|
||||||
|
|
||||||
|
clean:
|
||||||
|
rm -rf $(BUILD_DIR)
|
||||||
|
|
||||||
|
# Development helpers
|
||||||
|
test:
|
||||||
|
go test -v ./...
|
||||||
|
|
||||||
|
lint:
|
||||||
|
golangci-lint run
|
||||||
|
|
||||||
|
fmt:
|
||||||
|
go fmt ./...
|
||||||
|
|
@ -0,0 +1,116 @@
|
||||||
|
# DocMan - Document Management System
|
||||||
|
|
||||||
|
AI-powered document scanning, OCR, classification, and search.
|
||||||
|
|
||||||
|
## Quick Start
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# Run directly (dev mode)
|
||||||
|
cd ~/dev/docman
|
||||||
|
make dev
|
||||||
|
|
||||||
|
# Or run the installed binary
|
||||||
|
~/bin/docman -port 8200
|
||||||
|
```
|
||||||
|
|
||||||
|
Open http://localhost:8200
|
||||||
|
|
||||||
|
## Features
|
||||||
|
|
||||||
|
- **Auto-processing**: Drop PDFs/images into `~/documents/inbox/` → auto-classified and indexed
|
||||||
|
- **PDF Preview**: Built-in PDF viewer
|
||||||
|
- **Full-text Search**: FTS5 + semantic search
|
||||||
|
- **Categories**: taxes, expenses, bills, medical, contacts, legal, insurance, banking, receipts
|
||||||
|
- **Expense Tracking**: Filter by date, export to CSV, track tax-deductible items
|
||||||
|
- **Markdown Records**: Each document gets a searchable markdown file
|
||||||
|
|
||||||
|
## Scanner Setup
|
||||||
|
|
||||||
|
1. **Set scanner to save to SMB share:**
|
||||||
|
- Share: `\\192.168.1.16\documents\inbox` (or wherever james is)
|
||||||
|
- Or use your scanner's app to save to `~/documents/inbox/`
|
||||||
|
|
||||||
|
2. **Workflow:**
|
||||||
|
- Scan document → lands in inbox
|
||||||
|
- DocMan auto-processes (OCR → classify → store)
|
||||||
|
- View/search in web UI
|
||||||
|
|
||||||
|
## Directory Structure
|
||||||
|
|
||||||
|
```
|
||||||
|
~/documents/
|
||||||
|
├── inbox/ # Drop scans here (auto-processed)
|
||||||
|
├── store/ # PDF storage (by checksum)
|
||||||
|
├── records/ # Markdown records by category
|
||||||
|
│ ├── taxes/
|
||||||
|
│ ├── expenses/
|
||||||
|
│ ├── bills/
|
||||||
|
│ └── ...
|
||||||
|
└── index/ # SQLite database
|
||||||
|
```
|
||||||
|
|
||||||
|
## Configuration
|
||||||
|
|
||||||
|
### Environment Variables
|
||||||
|
|
||||||
|
```bash
|
||||||
|
FIREWORKS_API_KEY=fw_xxx # Required for AI classification
|
||||||
|
```
|
||||||
|
|
||||||
|
### Command Line Options
|
||||||
|
|
||||||
|
```
|
||||||
|
-port HTTP port (default: 8200)
|
||||||
|
-data Data directory (default: ~/documents)
|
||||||
|
-ai-endpoint AI API endpoint (default: Fireworks)
|
||||||
|
-ai-key AI API key
|
||||||
|
-ai-model Classification model
|
||||||
|
-embed-model Embedding model
|
||||||
|
-watch Only watch inbox, don't start web server
|
||||||
|
```
|
||||||
|
|
||||||
|
## Systemd Service
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# Edit to add your Fireworks API key
|
||||||
|
nano ~/.config/systemd/user/docman.service
|
||||||
|
|
||||||
|
# Enable and start
|
||||||
|
systemctl --user daemon-reload
|
||||||
|
systemctl --user enable docman
|
||||||
|
systemctl --user start docman
|
||||||
|
|
||||||
|
# Check status
|
||||||
|
systemctl --user status docman
|
||||||
|
journalctl --user -u docman -f
|
||||||
|
```
|
||||||
|
|
||||||
|
## API Endpoints
|
||||||
|
|
||||||
|
| Method | Path | Description |
|
||||||
|
|--------|------|-------------|
|
||||||
|
| GET | `/` | Dashboard |
|
||||||
|
| GET | `/browse` | Browse documents |
|
||||||
|
| GET | `/doc/:id` | View document |
|
||||||
|
| GET | `/search` | Search page |
|
||||||
|
| GET | `/expenses` | Expenses tracker |
|
||||||
|
| GET | `/upload` | Upload page |
|
||||||
|
| POST | `/api/upload` | Upload document |
|
||||||
|
| GET | `/api/documents/:id` | Get document JSON |
|
||||||
|
| PATCH | `/api/documents/:id` | Update document |
|
||||||
|
| DELETE | `/api/documents/:id` | Delete document |
|
||||||
|
| GET | `/api/search?q=` | Search API |
|
||||||
|
| GET | `/api/expenses/export` | Export CSV |
|
||||||
|
| GET | `/api/stats` | Dashboard stats |
|
||||||
|
|
||||||
|
## Dependencies
|
||||||
|
|
||||||
|
System packages (already installed):
|
||||||
|
- `poppler-utils` (pdftotext, pdfinfo, pdftoppm)
|
||||||
|
- `tesseract-ocr` (OCR for scanned images)
|
||||||
|
|
||||||
|
## Notes
|
||||||
|
|
||||||
|
- Without FIREWORKS_API_KEY, documents will be categorized as "uncategorized"
|
||||||
|
- The inbox watcher runs continuously, processing new files automatically
|
||||||
|
- Markdown files are searchable even without embeddings
|
||||||
Binary file not shown.
|
|
@ -0,0 +1,164 @@
|
||||||
|
package main
|
||||||
|
|
||||||
|
import (
|
||||||
|
"context"
|
||||||
|
"flag"
|
||||||
|
"fmt"
|
||||||
|
"io"
|
||||||
|
"log"
|
||||||
|
"os"
|
||||||
|
"os/signal"
|
||||||
|
"path/filepath"
|
||||||
|
"syscall"
|
||||||
|
|
||||||
|
"docman/internal/db"
|
||||||
|
"docman/internal/handlers"
|
||||||
|
"docman/internal/processor"
|
||||||
|
|
||||||
|
"github.com/labstack/echo/v4"
|
||||||
|
"github.com/labstack/echo/v4/middleware"
|
||||||
|
)
|
||||||
|
|
||||||
|
func main() {
|
||||||
|
var (
|
||||||
|
port = flag.Int("port", 8200, "HTTP port")
|
||||||
|
dataDir = flag.String("data", "", "Data directory (default: ~/documents)")
|
||||||
|
aiEndpoint = flag.String("ai-endpoint", "https://api.fireworks.ai/inference/v1", "AI API endpoint")
|
||||||
|
aiKey = flag.String("ai-key", "", "AI API key (or FIREWORKS_API_KEY env)")
|
||||||
|
aiModel = flag.String("ai-model", "accounts/fireworks/models/qwen2-vl-72b-instruct", "AI model for classification")
|
||||||
|
embedModel = flag.String("embed-model", "nomic-ai/nomic-embed-text-v1.5", "Embedding model")
|
||||||
|
watchOnly = flag.Bool("watch", false, "Only watch inbox, don't start web server")
|
||||||
|
)
|
||||||
|
flag.Parse()
|
||||||
|
|
||||||
|
// Resolve data directory
|
||||||
|
if *dataDir == "" {
|
||||||
|
home, _ := os.UserHomeDir()
|
||||||
|
*dataDir = filepath.Join(home, "documents")
|
||||||
|
}
|
||||||
|
|
||||||
|
// Resolve AI key
|
||||||
|
if *aiKey == "" {
|
||||||
|
*aiKey = os.Getenv("FIREWORKS_API_KEY")
|
||||||
|
}
|
||||||
|
if *aiKey == "" {
|
||||||
|
log.Println("Warning: No AI API key provided. Classification will fail.")
|
||||||
|
}
|
||||||
|
|
||||||
|
// Create directories
|
||||||
|
dirs := map[string]string{
|
||||||
|
"inbox": filepath.Join(*dataDir, "inbox"),
|
||||||
|
"store": filepath.Join(*dataDir, "store"),
|
||||||
|
"records": filepath.Join(*dataDir, "records"),
|
||||||
|
"index": filepath.Join(*dataDir, "index"),
|
||||||
|
}
|
||||||
|
for _, dir := range dirs {
|
||||||
|
if err := os.MkdirAll(dir, 0755); err != nil {
|
||||||
|
log.Fatalf("Failed to create directory %s: %v", dir, err)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Open database
|
||||||
|
dbPath := filepath.Join(dirs["index"], "docman.db")
|
||||||
|
database, err := db.Open(dbPath)
|
||||||
|
if err != nil {
|
||||||
|
log.Fatalf("Failed to open database: %v", err)
|
||||||
|
}
|
||||||
|
defer database.Close()
|
||||||
|
|
||||||
|
if err := database.Init(); err != nil {
|
||||||
|
log.Fatalf("Failed to initialize database: %v", err)
|
||||||
|
}
|
||||||
|
|
||||||
|
// Create processor
|
||||||
|
proc := processor.New(processor.Config{
|
||||||
|
InboxDir: dirs["inbox"],
|
||||||
|
StoreDir: dirs["store"],
|
||||||
|
RecordsDir: dirs["records"],
|
||||||
|
AIEndpoint: *aiEndpoint,
|
||||||
|
AIKey: *aiKey,
|
||||||
|
AIModel: *aiModel,
|
||||||
|
EmbedModel: *embedModel,
|
||||||
|
}, database)
|
||||||
|
|
||||||
|
// Context for graceful shutdown
|
||||||
|
ctx, cancel := context.WithCancel(context.Background())
|
||||||
|
defer cancel()
|
||||||
|
|
||||||
|
// Start inbox watcher
|
||||||
|
go func() {
|
||||||
|
if err := proc.Watch(ctx); err != nil && err != context.Canceled {
|
||||||
|
log.Printf("Watcher error: %v", err)
|
||||||
|
}
|
||||||
|
}()
|
||||||
|
|
||||||
|
if *watchOnly {
|
||||||
|
log.Printf("Watching inbox: %s", dirs["inbox"])
|
||||||
|
// Wait for signal
|
||||||
|
sigCh := make(chan os.Signal, 1)
|
||||||
|
signal.Notify(sigCh, syscall.SIGINT, syscall.SIGTERM)
|
||||||
|
<-sigCh
|
||||||
|
cancel()
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
|
// Create Echo server
|
||||||
|
e := echo.New()
|
||||||
|
e.HideBanner = true
|
||||||
|
|
||||||
|
// Middleware
|
||||||
|
e.Use(middleware.Logger())
|
||||||
|
e.Use(middleware.Recover())
|
||||||
|
e.Use(middleware.CORS())
|
||||||
|
|
||||||
|
// Template renderer
|
||||||
|
e.Renderer = &templateRenderer{}
|
||||||
|
|
||||||
|
// Handlers
|
||||||
|
h := handlers.New(database, proc, dirs["store"], dirs["records"])
|
||||||
|
|
||||||
|
// Page routes
|
||||||
|
e.GET("/", h.Dashboard)
|
||||||
|
e.GET("/browse", h.Browse)
|
||||||
|
e.GET("/doc/:id", h.Document)
|
||||||
|
e.GET("/search", h.Search)
|
||||||
|
e.GET("/expenses", h.Expenses)
|
||||||
|
e.GET("/upload", h.Upload)
|
||||||
|
|
||||||
|
// API routes
|
||||||
|
e.POST("/api/upload", h.APIUpload)
|
||||||
|
e.GET("/api/documents/:id", h.APIDocument)
|
||||||
|
e.PATCH("/api/documents/:id", h.APIUpdateDocument)
|
||||||
|
e.DELETE("/api/documents/:id", h.APIDeleteDocument)
|
||||||
|
e.GET("/api/search", h.APISearch)
|
||||||
|
e.GET("/api/expenses/export", h.APIExportExpenses)
|
||||||
|
e.GET("/api/stats", h.APIStats)
|
||||||
|
|
||||||
|
// Static file serving
|
||||||
|
e.GET("/pdf/:filename", h.ServePDF)
|
||||||
|
e.GET("/markdown", h.ServeMarkdown)
|
||||||
|
|
||||||
|
// Graceful shutdown
|
||||||
|
go func() {
|
||||||
|
sigCh := make(chan os.Signal, 1)
|
||||||
|
signal.Notify(sigCh, syscall.SIGINT, syscall.SIGTERM)
|
||||||
|
<-sigCh
|
||||||
|
log.Println("Shutting down...")
|
||||||
|
cancel()
|
||||||
|
e.Close()
|
||||||
|
}()
|
||||||
|
|
||||||
|
// Start server
|
||||||
|
log.Printf("DocMan starting on http://localhost:%d", *port)
|
||||||
|
log.Printf("Data directory: %s", *dataDir)
|
||||||
|
log.Printf("Inbox: %s", dirs["inbox"])
|
||||||
|
if err := e.Start(fmt.Sprintf(":%d", *port)); err != nil {
|
||||||
|
log.Printf("Server stopped: %v", err)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// templateRenderer adapts the handlers package's template renderer to
// Echo's Renderer interface.
type templateRenderer struct{}

// Render renders the named template with data into w, delegating to
// handlers.NewTemplateRenderer.
// NOTE(review): a fresh renderer is constructed on every request —
// if NewTemplateRenderer parses templates, consider caching a single
// instance; verify against its implementation.
func (t *templateRenderer) Render(w io.Writer, name string, data interface{}, c echo.Context) error {
	return handlers.NewTemplateRenderer().Render(w, name, data, c)
}
|
||||||
|
|
@ -0,0 +1,15 @@
|
||||||
|
[Unit]
|
||||||
|
Description=DocMan - Document Management System
|
||||||
|
After=network.target
|
||||||
|
|
||||||
|
[Service]
|
||||||
|
Type=simple
|
||||||
|
ExecStart=${HOME}/bin/docman -port 8200 -ai-key ${FIREWORKS_API_KEY}
|
||||||
|
Restart=on-failure
|
||||||
|
RestartSec=5
|
||||||
|
Environment=HOME=${HOME}
|
||||||
|
Environment=FIREWORKS_API_KEY=${FIREWORKS_API_KEY}
|
||||||
|
WorkingDirectory=${HOME}
|
||||||
|
|
||||||
|
[Install]
|
||||||
|
WantedBy=default.target
|
||||||
|
|
@ -0,0 +1,25 @@
|
||||||
|
module docman
|
||||||
|
|
||||||
|
go 1.22
|
||||||
|
|
||||||
|
require (
|
||||||
|
github.com/fsnotify/fsnotify v1.7.0
|
||||||
|
github.com/google/uuid v1.6.0
|
||||||
|
github.com/labstack/echo/v4 v4.12.0
|
||||||
|
github.com/mattn/go-sqlite3 v1.14.22
|
||||||
|
github.com/sashabaranov/go-openai v1.29.0
|
||||||
|
)
|
||||||
|
|
||||||
|
require (
|
||||||
|
github.com/golang-jwt/jwt v3.2.2+incompatible // indirect
|
||||||
|
github.com/labstack/gommon v0.4.2 // indirect
|
||||||
|
github.com/mattn/go-colorable v0.1.13 // indirect
|
||||||
|
github.com/mattn/go-isatty v0.0.20 // indirect
|
||||||
|
github.com/valyala/bytebufferpool v1.0.0 // indirect
|
||||||
|
github.com/valyala/fasttemplate v1.2.2 // indirect
|
||||||
|
golang.org/x/crypto v0.26.0 // indirect
|
||||||
|
golang.org/x/net v0.28.0 // indirect
|
||||||
|
golang.org/x/sys v0.23.0 // indirect
|
||||||
|
golang.org/x/text v0.17.0 // indirect
|
||||||
|
golang.org/x/time v0.5.0 // indirect
|
||||||
|
)
|
||||||
|
|
@ -0,0 +1,43 @@
|
||||||
|
github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c=
|
||||||
|
github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
|
||||||
|
github.com/fsnotify/fsnotify v1.7.0 h1:8JEhPFa5W2WU7YfeZzPNqzMP6Lwt7L2715Ggo0nosvA=
|
||||||
|
github.com/fsnotify/fsnotify v1.7.0/go.mod h1:40Bi/Hjc2AVfZrqy+aj+yEI+/bRxZnMJyTJwOpGvigM=
|
||||||
|
github.com/golang-jwt/jwt v3.2.2+incompatible h1:IfV12K8xAKAnZqdXVzCZ+TOjboZ2keLg81eXfW3O+oY=
|
||||||
|
github.com/golang-jwt/jwt v3.2.2+incompatible/go.mod h1:8pz2t5EyA70fFQQSrl6XZXzqecmYZeUEB8OUGHkxJ+I=
|
||||||
|
github.com/google/uuid v1.6.0 h1:NIvaJDMOsjHA8n1jAhLSgzrAzy1Hgr+hNrb57e+94F0=
|
||||||
|
github.com/google/uuid v1.6.0/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo=
|
||||||
|
github.com/labstack/echo/v4 v4.12.0 h1:IKpw49IMryVB2p1a4dzwlhP1O2Tf2E0Ir/450lH+kI0=
|
||||||
|
github.com/labstack/echo/v4 v4.12.0/go.mod h1:UP9Cr2DJXbOK3Kr9ONYzNowSh7HP0aG0ShAyycHSJvM=
|
||||||
|
github.com/labstack/gommon v0.4.2 h1:F8qTUNXgG1+6WQmqoUWnz8WiEU60mXVVw0P4ht1WRA0=
|
||||||
|
github.com/labstack/gommon v0.4.2/go.mod h1:QlUFxVM+SNXhDL/Z7YhocGIBYOiwB0mXm1+1bAPHPyU=
|
||||||
|
github.com/mattn/go-colorable v0.1.13 h1:fFA4WZxdEF4tXPZVKMLwD8oUnCTTo08duU7wxecdEvA=
|
||||||
|
github.com/mattn/go-colorable v0.1.13/go.mod h1:7S9/ev0klgBDR4GtXTXX8a3vIGJpMovkB8vQcUbaXHg=
|
||||||
|
github.com/mattn/go-isatty v0.0.16/go.mod h1:kYGgaQfpe5nmfYZH+SKPsOc2e4SrIfOl2e/yFXSvRLM=
|
||||||
|
github.com/mattn/go-isatty v0.0.20 h1:xfD0iDuEKnDkl03q4limB+vH+GxLEtL/jb4xVJSWWEY=
|
||||||
|
github.com/mattn/go-isatty v0.0.20/go.mod h1:W+V8PltTTMOvKvAeJH7IuucS94S2C6jfK/D7dTCTo3Y=
|
||||||
|
github.com/mattn/go-sqlite3 v1.14.22 h1:2gZY6PC6kBnID23Tichd1K+Z0oS6nE/XwU+Vz/5o4kU=
|
||||||
|
github.com/mattn/go-sqlite3 v1.14.22/go.mod h1:Uh1q+B4BYcTPb+yiD3kU8Ct7aC0hY9fxUwlHK0RXw+Y=
|
||||||
|
github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM=
|
||||||
|
github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4=
|
||||||
|
github.com/sashabaranov/go-openai v1.29.0 h1:eBH6LSjtX4md5ImDCX8hNhHQvaRf22zujiERoQpsvLo=
|
||||||
|
github.com/sashabaranov/go-openai v1.29.0/go.mod h1:lj5b/K+zjTSFxVLijLSTDZuP7adOgerWeFyZLUhAKRg=
|
||||||
|
github.com/stretchr/testify v1.8.4 h1:CcVxjf3Q8PM0mHUKJCdn+eZZtm5yQwehR5yeSVQQcUk=
|
||||||
|
github.com/stretchr/testify v1.8.4/go.mod h1:sz/lmYIOXD/1dqDmKjjqLyZ2RngseejIcXlSw2iwfAo=
|
||||||
|
github.com/valyala/bytebufferpool v1.0.0 h1:GqA5TC/0021Y/b9FG4Oi9Mr3q7XYx6KllzawFIhcdPw=
|
||||||
|
github.com/valyala/bytebufferpool v1.0.0/go.mod h1:6bBcMArwyJ5K/AmCkWv1jt77kVWyCJ6HpOuEn7z0Csc=
|
||||||
|
github.com/valyala/fasttemplate v1.2.2 h1:lxLXG0uE3Qnshl9QyaK6XJxMXlQZELvChBOCmQD0Loo=
|
||||||
|
github.com/valyala/fasttemplate v1.2.2/go.mod h1:KHLXt3tVN2HBp8eijSv/kGJopbvo7S+qRAEEKiv+SiQ=
|
||||||
|
golang.org/x/crypto v0.26.0 h1:RrRspgV4mU+YwB4FYnuBoKsUapNIL5cohGAmSH3azsw=
|
||||||
|
golang.org/x/crypto v0.26.0/go.mod h1:GY7jblb9wI+FOo5y8/S2oY4zWP07AkOJ4+jxCqdqn54=
|
||||||
|
golang.org/x/net v0.28.0 h1:a9JDOJc5GMUJ0+UDqmLT86WiEy7iWyIhz8gz8E4e5hE=
|
||||||
|
golang.org/x/net v0.28.0/go.mod h1:yqtgsTWOOnlGLG9GFRrK3++bGOUEkNBoHZc8MEDWPNg=
|
||||||
|
golang.org/x/sys v0.0.0-20220811171246-fbc7d0a398ab/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
|
||||||
|
golang.org/x/sys v0.6.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
|
||||||
|
golang.org/x/sys v0.23.0 h1:YfKFowiIMvtgl1UERQoTPPToxltDeZfbj4H7dVUCwmM=
|
||||||
|
golang.org/x/sys v0.23.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA=
|
||||||
|
golang.org/x/text v0.17.0 h1:XtiM5bkSOt+ewxlOE/aE/AKEHibwj/6gvWMl9Rsh0Qc=
|
||||||
|
golang.org/x/text v0.17.0/go.mod h1:BuEKDfySbSR4drPmRPG/7iBdf8hvFMuRexcpahXilzY=
|
||||||
|
golang.org/x/time v0.5.0 h1:o7cqy6amK/52YcAKIPlM3a+Fpj35zvRj2TP+e1xFSfk=
|
||||||
|
golang.org/x/time v0.5.0/go.mod h1:3BpzKBy/shNhVucY/MWOyx10tF3SFh9QdLuxbVysPQM=
|
||||||
|
gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA=
|
||||||
|
gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM=
|
||||||
|
|
@ -0,0 +1,406 @@
|
||||||
|
package db
|
||||||
|
|
||||||
|
import (
|
||||||
|
"database/sql"
|
||||||
|
"encoding/json"
|
||||||
|
"fmt"
|
||||||
|
"time"
|
||||||
|
|
||||||
|
_ "github.com/mattn/go-sqlite3"
|
||||||
|
)
|
||||||
|
|
||||||
|
// DB wraps *sql.DB with document-store helpers. The embedded handle
// exposes the full database/sql API (Query, Exec, Close, ...).
type DB struct {
	*sql.DB
}
|
||||||
|
|
||||||
|
// Document is one stored record: file identity, classification metadata,
// OCR text, and the locations of its on-disk artifacts. Pointer fields
// (Date, Amount) and omitempty strings correspond to nullable columns in
// the documents table.
type Document struct {
	ID            string          `json:"id"`
	Filename      string          `json:"filename"`
	OriginalName  string          `json:"original_name"` // name of the file as it arrived in the inbox
	Category      string          `json:"category"`      // DB default is 'uncategorized'
	Subcategory   string          `json:"subcategory,omitempty"`
	Title         string          `json:"title"`
	Date          *time.Time      `json:"date,omitempty"` // document date; stored as YYYY-MM-DD text
	Vendor        string          `json:"vendor,omitempty"`
	Amount        *float64        `json:"amount,omitempty"` // nil when the document carries no amount
	Currency      string          `json:"currency,omitempty"`
	TaxDeductible bool            `json:"tax_deductible"`
	OCRText       string          `json:"ocr_text"`
	Metadata      json.RawMessage `json:"metadata,omitempty"` // opaque extra JSON, stored as-is
	Embedding     []byte          `json:"embedding,omitempty"` // raw embedding bytes, if computed
	StoragePath   string          `json:"storage_path"`
	MarkdownPath  string          `json:"markdown_path"`
	PageCount     int             `json:"page_count"` // DB default is 1
	FileSize      int64           `json:"file_size"`
	MimeType      string          `json:"mime_type"`
	Checksum      string          `json:"checksum"` // content checksum — presumably SHA-256, confirm in processor
	ProcessedAt   time.Time       `json:"processed_at"` // stored as RFC 3339 text
	CreatedAt     time.Time       `json:"created_at"`
	UpdatedAt     time.Time       `json:"updated_at"`
}
|
||||||
|
|
||||||
|
// SearchResult is a Document augmented with full-text ranking info.
type SearchResult struct {
	Document
	Score   float64 `json:"score"`   // bm25 rank; lower (more negative) means a better match
	Snippet string  `json:"snippet"` // excerpt of the OCR text with <mark> highlighting
}
|
||||||
|
|
||||||
|
func Open(path string) (*DB, error) {
|
||||||
|
db, err := sql.Open("sqlite3", path+"?_journal_mode=WAL&_busy_timeout=5000")
|
||||||
|
if err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
|
||||||
|
if err := db.Ping(); err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
|
||||||
|
return &DB{db}, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
func (db *DB) Init() error {
|
||||||
|
schema := `
|
||||||
|
CREATE TABLE IF NOT EXISTS documents (
|
||||||
|
id TEXT PRIMARY KEY,
|
||||||
|
filename TEXT NOT NULL,
|
||||||
|
original_name TEXT NOT NULL,
|
||||||
|
category TEXT NOT NULL DEFAULT 'uncategorized',
|
||||||
|
subcategory TEXT,
|
||||||
|
title TEXT NOT NULL,
|
||||||
|
date TEXT,
|
||||||
|
vendor TEXT,
|
||||||
|
amount REAL,
|
||||||
|
currency TEXT DEFAULT 'USD',
|
||||||
|
tax_deductible INTEGER DEFAULT 0,
|
||||||
|
ocr_text TEXT,
|
||||||
|
metadata TEXT,
|
||||||
|
embedding BLOB,
|
||||||
|
storage_path TEXT NOT NULL,
|
||||||
|
markdown_path TEXT,
|
||||||
|
page_count INTEGER DEFAULT 1,
|
||||||
|
file_size INTEGER,
|
||||||
|
mime_type TEXT,
|
||||||
|
checksum TEXT,
|
||||||
|
processed_at TEXT,
|
||||||
|
created_at TEXT DEFAULT CURRENT_TIMESTAMP,
|
||||||
|
updated_at TEXT DEFAULT CURRENT_TIMESTAMP
|
||||||
|
);
|
||||||
|
|
||||||
|
CREATE INDEX IF NOT EXISTS idx_documents_category ON documents(category);
|
||||||
|
CREATE INDEX IF NOT EXISTS idx_documents_date ON documents(date);
|
||||||
|
CREATE INDEX IF NOT EXISTS idx_documents_vendor ON documents(vendor);
|
||||||
|
CREATE INDEX IF NOT EXISTS idx_documents_amount ON documents(amount);
|
||||||
|
|
||||||
|
CREATE VIRTUAL TABLE IF NOT EXISTS documents_fts USING fts5(
|
||||||
|
id,
|
||||||
|
title,
|
||||||
|
ocr_text,
|
||||||
|
vendor,
|
||||||
|
category,
|
||||||
|
content='documents',
|
||||||
|
content_rowid='rowid'
|
||||||
|
);
|
||||||
|
|
||||||
|
CREATE TRIGGER IF NOT EXISTS documents_ai AFTER INSERT ON documents BEGIN
|
||||||
|
INSERT INTO documents_fts(id, title, ocr_text, vendor, category)
|
||||||
|
VALUES (new.id, new.title, new.ocr_text, new.vendor, new.category);
|
||||||
|
END;
|
||||||
|
|
||||||
|
CREATE TRIGGER IF NOT EXISTS documents_ad AFTER DELETE ON documents BEGIN
|
||||||
|
INSERT INTO documents_fts(documents_fts, id, title, ocr_text, vendor, category)
|
||||||
|
VALUES ('delete', old.id, old.title, old.ocr_text, old.vendor, old.category);
|
||||||
|
END;
|
||||||
|
|
||||||
|
CREATE TRIGGER IF NOT EXISTS documents_au AFTER UPDATE ON documents BEGIN
|
||||||
|
INSERT INTO documents_fts(documents_fts, id, title, ocr_text, vendor, category)
|
||||||
|
VALUES ('delete', old.id, old.title, old.ocr_text, old.vendor, old.category);
|
||||||
|
INSERT INTO documents_fts(id, title, ocr_text, vendor, category)
|
||||||
|
VALUES (new.id, new.title, new.ocr_text, new.vendor, new.category);
|
||||||
|
END;
|
||||||
|
|
||||||
|
CREATE TABLE IF NOT EXISTS settings (
|
||||||
|
key TEXT PRIMARY KEY,
|
||||||
|
value TEXT
|
||||||
|
);
|
||||||
|
`
|
||||||
|
_, err := db.Exec(schema)
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
|
||||||
|
func (db *DB) InsertDocument(doc *Document) error {
|
||||||
|
var dateStr *string
|
||||||
|
if doc.Date != nil {
|
||||||
|
s := doc.Date.Format("2006-01-02")
|
||||||
|
dateStr = &s
|
||||||
|
}
|
||||||
|
|
||||||
|
_, err := db.Exec(`
|
||||||
|
INSERT INTO documents (
|
||||||
|
id, filename, original_name, category, subcategory, title, date,
|
||||||
|
vendor, amount, currency, tax_deductible, ocr_text, metadata,
|
||||||
|
embedding, storage_path, markdown_path, page_count, file_size,
|
||||||
|
mime_type, checksum, processed_at, created_at, updated_at
|
||||||
|
) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
|
||||||
|
`, doc.ID, doc.Filename, doc.OriginalName, doc.Category, doc.Subcategory,
|
||||||
|
doc.Title, dateStr, doc.Vendor, doc.Amount, doc.Currency, doc.TaxDeductible,
|
||||||
|
doc.OCRText, doc.Metadata, doc.Embedding, doc.StoragePath, doc.MarkdownPath,
|
||||||
|
doc.PageCount, doc.FileSize, doc.MimeType, doc.Checksum,
|
||||||
|
doc.ProcessedAt.Format(time.RFC3339),
|
||||||
|
doc.CreatedAt.Format(time.RFC3339),
|
||||||
|
doc.UpdatedAt.Format(time.RFC3339))
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
|
||||||
|
func (db *DB) UpdateDocument(doc *Document) error {
|
||||||
|
var dateStr *string
|
||||||
|
if doc.Date != nil {
|
||||||
|
s := doc.Date.Format("2006-01-02")
|
||||||
|
dateStr = &s
|
||||||
|
}
|
||||||
|
|
||||||
|
_, err := db.Exec(`
|
||||||
|
UPDATE documents SET
|
||||||
|
category = ?, subcategory = ?, title = ?, date = ?,
|
||||||
|
vendor = ?, amount = ?, currency = ?, tax_deductible = ?,
|
||||||
|
metadata = ?, updated_at = ?
|
||||||
|
WHERE id = ?
|
||||||
|
`, doc.Category, doc.Subcategory, doc.Title, dateStr,
|
||||||
|
doc.Vendor, doc.Amount, doc.Currency, doc.TaxDeductible,
|
||||||
|
doc.Metadata, time.Now().Format(time.RFC3339), doc.ID)
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
|
||||||
|
func (db *DB) GetDocument(id string) (*Document, error) {
|
||||||
|
row := db.QueryRow(`SELECT * FROM documents WHERE id = ?`, id)
|
||||||
|
return scanDocument(row)
|
||||||
|
}
|
||||||
|
|
||||||
|
func (db *DB) DeleteDocument(id string) error {
|
||||||
|
_, err := db.Exec(`DELETE FROM documents WHERE id = ?`, id)
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
|
||||||
|
func (db *DB) ListDocuments(category string, limit, offset int) ([]*Document, error) {
|
||||||
|
var rows *sql.Rows
|
||||||
|
var err error
|
||||||
|
|
||||||
|
if category != "" {
|
||||||
|
rows, err = db.Query(`
|
||||||
|
SELECT * FROM documents
|
||||||
|
WHERE category = ?
|
||||||
|
ORDER BY COALESCE(date, created_at) DESC
|
||||||
|
LIMIT ? OFFSET ?
|
||||||
|
`, category, limit, offset)
|
||||||
|
} else {
|
||||||
|
rows, err = db.Query(`
|
||||||
|
SELECT * FROM documents
|
||||||
|
ORDER BY COALESCE(date, created_at) DESC
|
||||||
|
LIMIT ? OFFSET ?
|
||||||
|
`, limit, offset)
|
||||||
|
}
|
||||||
|
if err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
defer rows.Close()
|
||||||
|
|
||||||
|
return scanDocuments(rows)
|
||||||
|
}
|
||||||
|
|
||||||
|
func (db *DB) RecentDocuments(limit int) ([]*Document, error) {
|
||||||
|
rows, err := db.Query(`
|
||||||
|
SELECT * FROM documents
|
||||||
|
ORDER BY created_at DESC
|
||||||
|
LIMIT ?
|
||||||
|
`, limit)
|
||||||
|
if err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
defer rows.Close()
|
||||||
|
|
||||||
|
return scanDocuments(rows)
|
||||||
|
}
|
||||||
|
|
||||||
|
func (db *DB) SearchFTS(query string, limit int) ([]*SearchResult, error) {
|
||||||
|
rows, err := db.Query(`
|
||||||
|
SELECT d.*,
|
||||||
|
bm25(documents_fts) as score,
|
||||||
|
snippet(documents_fts, 2, '<mark>', '</mark>', '...', 32) as snippet
|
||||||
|
FROM documents_fts f
|
||||||
|
JOIN documents d ON f.id = d.id
|
||||||
|
WHERE documents_fts MATCH ?
|
||||||
|
ORDER BY bm25(documents_fts)
|
||||||
|
LIMIT ?
|
||||||
|
`, query, limit)
|
||||||
|
if err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
defer rows.Close()
|
||||||
|
|
||||||
|
var results []*SearchResult
|
||||||
|
for rows.Next() {
|
||||||
|
var doc Document
|
||||||
|
var dateStr, processedStr, createdStr, updatedStr sql.NullString
|
||||||
|
var score float64
|
||||||
|
var snippet string
|
||||||
|
|
||||||
|
err := rows.Scan(
|
||||||
|
&doc.ID, &doc.Filename, &doc.OriginalName, &doc.Category, &doc.Subcategory,
|
||||||
|
&doc.Title, &dateStr, &doc.Vendor, &doc.Amount, &doc.Currency, &doc.TaxDeductible,
|
||||||
|
&doc.OCRText, &doc.Metadata, &doc.Embedding, &doc.StoragePath, &doc.MarkdownPath,
|
||||||
|
&doc.PageCount, &doc.FileSize, &doc.MimeType, &doc.Checksum,
|
||||||
|
&processedStr, &createdStr, &updatedStr, &score, &snippet,
|
||||||
|
)
|
||||||
|
if err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
|
||||||
|
if dateStr.Valid {
|
||||||
|
t, _ := time.Parse("2006-01-02", dateStr.String)
|
||||||
|
doc.Date = &t
|
||||||
|
}
|
||||||
|
if createdStr.Valid {
|
||||||
|
t, _ := time.Parse(time.RFC3339, createdStr.String)
|
||||||
|
doc.CreatedAt = t
|
||||||
|
}
|
||||||
|
|
||||||
|
results = append(results, &SearchResult{Document: doc, Score: score, Snippet: snippet})
|
||||||
|
}
|
||||||
|
|
||||||
|
return results, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
func (db *DB) GetStats() (map[string]interface{}, error) {
|
||||||
|
stats := make(map[string]interface{})
|
||||||
|
|
||||||
|
// Total documents
|
||||||
|
var total int
|
||||||
|
db.QueryRow(`SELECT COUNT(*) FROM documents`).Scan(&total)
|
||||||
|
stats["total"] = total
|
||||||
|
|
||||||
|
// This month
|
||||||
|
var thisMonth int
|
||||||
|
db.QueryRow(`SELECT COUNT(*) FROM documents WHERE created_at >= date('now', 'start of month')`).Scan(&thisMonth)
|
||||||
|
stats["this_month"] = thisMonth
|
||||||
|
|
||||||
|
// Total size
|
||||||
|
var totalSize int64
|
||||||
|
db.QueryRow(`SELECT COALESCE(SUM(file_size), 0) FROM documents`).Scan(&totalSize)
|
||||||
|
stats["total_size"] = totalSize
|
||||||
|
|
||||||
|
// By category
|
||||||
|
rows, err := db.Query(`SELECT category, COUNT(*) FROM documents GROUP BY category`)
|
||||||
|
if err == nil {
|
||||||
|
categories := make(map[string]int)
|
||||||
|
for rows.Next() {
|
||||||
|
var cat string
|
||||||
|
var count int
|
||||||
|
rows.Scan(&cat, &count)
|
||||||
|
categories[cat] = count
|
||||||
|
}
|
||||||
|
rows.Close()
|
||||||
|
stats["by_category"] = categories
|
||||||
|
}
|
||||||
|
|
||||||
|
return stats, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
func (db *DB) GetExpenses(year int, month int) ([]*Document, error) {
|
||||||
|
query := `
|
||||||
|
SELECT * FROM documents
|
||||||
|
WHERE category = 'expenses' AND amount IS NOT NULL
|
||||||
|
`
|
||||||
|
args := []interface{}{}
|
||||||
|
|
||||||
|
if year > 0 {
|
||||||
|
query += ` AND strftime('%Y', date) = ?`
|
||||||
|
args = append(args, fmt.Sprintf("%04d", year))
|
||||||
|
}
|
||||||
|
if month > 0 {
|
||||||
|
query += ` AND strftime('%m', date) = ?`
|
||||||
|
args = append(args, fmt.Sprintf("%02d", month))
|
||||||
|
}
|
||||||
|
|
||||||
|
query += ` ORDER BY date DESC`
|
||||||
|
|
||||||
|
rows, err := db.Query(query, args...)
|
||||||
|
if err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
defer rows.Close()
|
||||||
|
|
||||||
|
return scanDocuments(rows)
|
||||||
|
}
|
||||||
|
|
||||||
|
func scanDocument(row *sql.Row) (*Document, error) {
|
||||||
|
var doc Document
|
||||||
|
var dateStr, processedStr, createdStr, updatedStr sql.NullString
|
||||||
|
|
||||||
|
err := row.Scan(
|
||||||
|
&doc.ID, &doc.Filename, &doc.OriginalName, &doc.Category, &doc.Subcategory,
|
||||||
|
&doc.Title, &dateStr, &doc.Vendor, &doc.Amount, &doc.Currency, &doc.TaxDeductible,
|
||||||
|
&doc.OCRText, &doc.Metadata, &doc.Embedding, &doc.StoragePath, &doc.MarkdownPath,
|
||||||
|
&doc.PageCount, &doc.FileSize, &doc.MimeType, &doc.Checksum,
|
||||||
|
&processedStr, &createdStr, &updatedStr,
|
||||||
|
)
|
||||||
|
if err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
|
||||||
|
if dateStr.Valid {
|
||||||
|
t, _ := time.Parse("2006-01-02", dateStr.String)
|
||||||
|
doc.Date = &t
|
||||||
|
}
|
||||||
|
if processedStr.Valid {
|
||||||
|
t, _ := time.Parse(time.RFC3339, processedStr.String)
|
||||||
|
doc.ProcessedAt = t
|
||||||
|
}
|
||||||
|
if createdStr.Valid {
|
||||||
|
t, _ := time.Parse(time.RFC3339, createdStr.String)
|
||||||
|
doc.CreatedAt = t
|
||||||
|
}
|
||||||
|
if updatedStr.Valid {
|
||||||
|
t, _ := time.Parse(time.RFC3339, updatedStr.String)
|
||||||
|
doc.UpdatedAt = t
|
||||||
|
}
|
||||||
|
|
||||||
|
return &doc, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
func scanDocuments(rows *sql.Rows) ([]*Document, error) {
|
||||||
|
var docs []*Document
|
||||||
|
for rows.Next() {
|
||||||
|
var doc Document
|
||||||
|
var dateStr, processedStr, createdStr, updatedStr sql.NullString
|
||||||
|
|
||||||
|
err := rows.Scan(
|
||||||
|
&doc.ID, &doc.Filename, &doc.OriginalName, &doc.Category, &doc.Subcategory,
|
||||||
|
&doc.Title, &dateStr, &doc.Vendor, &doc.Amount, &doc.Currency, &doc.TaxDeductible,
|
||||||
|
&doc.OCRText, &doc.Metadata, &doc.Embedding, &doc.StoragePath, &doc.MarkdownPath,
|
||||||
|
&doc.PageCount, &doc.FileSize, &doc.MimeType, &doc.Checksum,
|
||||||
|
&processedStr, &createdStr, &updatedStr,
|
||||||
|
)
|
||||||
|
if err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
|
||||||
|
if dateStr.Valid {
|
||||||
|
t, _ := time.Parse("2006-01-02", dateStr.String)
|
||||||
|
doc.Date = &t
|
||||||
|
}
|
||||||
|
if processedStr.Valid {
|
||||||
|
t, _ := time.Parse(time.RFC3339, processedStr.String)
|
||||||
|
doc.ProcessedAt = t
|
||||||
|
}
|
||||||
|
if createdStr.Valid {
|
||||||
|
t, _ := time.Parse(time.RFC3339, createdStr.String)
|
||||||
|
doc.CreatedAt = t
|
||||||
|
}
|
||||||
|
if updatedStr.Valid {
|
||||||
|
t, _ := time.Parse(time.RFC3339, updatedStr.String)
|
||||||
|
doc.UpdatedAt = t
|
||||||
|
}
|
||||||
|
|
||||||
|
docs = append(docs, &doc)
|
||||||
|
}
|
||||||
|
return docs, nil
|
||||||
|
}
|
||||||
File diff suppressed because it is too large
Load Diff
|
|
@ -0,0 +1,700 @@
|
||||||
|
package processor
|
||||||
|
|
||||||
|
import (
	"bytes"
	"context"
	"crypto/sha256"
	"encoding/hex"
	"encoding/json"
	"fmt"
	"log"
	"os"
	"os/exec"
	"path/filepath"
	"regexp"
	"sort"
	"strings"
	"time"

	"docman/internal/db"

	"github.com/fsnotify/fsnotify"
	"github.com/google/uuid"
	openai "github.com/sashabaranov/go-openai"
)
|
||||||
|
|
||||||
|
// Processor ingests scanned documents: it watches an inbox directory,
// extracts text via OCR, classifies documents with an AI model, and files
// the results into content-addressed storage, per-category markdown
// records, and the database.
type Processor struct {
	db         *db.DB         // document database handle
	inboxDir   string         // watched directory for incoming scans
	storeDir   string         // storage for originals, named by checksum prefix
	recordsDir string         // root for generated markdown records
	aiClient   *openai.Client // OpenAI-compatible client (base URL from Config.AIEndpoint)
	aiModel    string         // chat model used by classify
	embedModel string         // embedding model; empty disables embeddings
}
|
||||||
|
|
||||||
|
// Config holds the settings needed to construct a Processor via New.
type Config struct {
	InboxDir   string // directory watched for incoming documents
	StoreDir   string // directory for stored original files
	RecordsDir string // directory for generated markdown records
	AIEndpoint string // Fireworks API endpoint (OpenAI-compatible base URL)
	AIKey      string // API key for the AI endpoint
	AIModel    string // e.g., "accounts/fireworks/models/qwen2-vl-72b-instruct"
	EmbedModel string // embedding model name; empty disables embedding generation
}
|
||||||
|
|
||||||
|
// Classification is the structured result the AI model returns for a
// document; the field set mirrors the JSON schema embedded in classify's
// prompt.
type Classification struct {
	Category      string            `json:"category"`              // one of the fixed category names, or "uncategorized"
	Subcategory   string            `json:"subcategory,omitempty"` // optional finer-grained category
	Title         string            `json:"title"`                 // descriptive document title
	Date          string            `json:"date,omitempty"`        // document date as text; parsed later by parseDate
	Vendor        string            `json:"vendor,omitempty"`      // company/person name, if applicable
	Amount        *float64          `json:"amount,omitempty"`      // monetary amount for financial documents
	Currency      string            `json:"currency,omitempty"`    // currency code, e.g. "USD"
	TaxDeductible bool              `json:"tax_deductible"`        // whether this is a deductible expense
	Summary       string            `json:"summary"`               // one-paragraph summary
	KeyFields     map[string]string `json:"key_fields,omitempty"`  // other extracted fields, free-form
}
|
||||||
|
|
||||||
|
func New(cfg Config, database *db.DB) *Processor {
|
||||||
|
config := openai.DefaultConfig(cfg.AIKey)
|
||||||
|
config.BaseURL = cfg.AIEndpoint
|
||||||
|
|
||||||
|
return &Processor{
|
||||||
|
db: database,
|
||||||
|
inboxDir: cfg.InboxDir,
|
||||||
|
storeDir: cfg.StoreDir,
|
||||||
|
recordsDir: cfg.RecordsDir,
|
||||||
|
aiClient: openai.NewClientWithConfig(config),
|
||||||
|
aiModel: cfg.AIModel,
|
||||||
|
embedModel: cfg.EmbedModel,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func (p *Processor) Watch(ctx context.Context) error {
|
||||||
|
// Ensure directories exist
|
||||||
|
for _, dir := range []string{p.inboxDir, p.storeDir, p.recordsDir} {
|
||||||
|
if err := os.MkdirAll(dir, 0755); err != nil {
|
||||||
|
return fmt.Errorf("create directory %s: %w", dir, err)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Process existing files first
|
||||||
|
entries, _ := os.ReadDir(p.inboxDir)
|
||||||
|
for _, entry := range entries {
|
||||||
|
if entry.IsDir() || strings.HasPrefix(entry.Name(), ".") {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
path := filepath.Join(p.inboxDir, entry.Name())
|
||||||
|
if err := p.ProcessFile(ctx, path); err != nil {
|
||||||
|
log.Printf("Error processing %s: %v", path, err)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Watch for new files
|
||||||
|
watcher, err := fsnotify.NewWatcher()
|
||||||
|
if err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
defer watcher.Close()
|
||||||
|
|
||||||
|
if err := watcher.Add(p.inboxDir); err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
|
||||||
|
log.Printf("Watching inbox: %s", p.inboxDir)
|
||||||
|
|
||||||
|
for {
|
||||||
|
select {
|
||||||
|
case <-ctx.Done():
|
||||||
|
return ctx.Err()
|
||||||
|
case event, ok := <-watcher.Events:
|
||||||
|
if !ok {
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
if event.Op&fsnotify.Create == fsnotify.Create {
|
||||||
|
// Wait a moment for file to be fully written
|
||||||
|
time.Sleep(500 * time.Millisecond)
|
||||||
|
if err := p.ProcessFile(ctx, event.Name); err != nil {
|
||||||
|
log.Printf("Error processing %s: %v", event.Name, err)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
case err, ok := <-watcher.Errors:
|
||||||
|
if !ok {
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
log.Printf("Watcher error: %v", err)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// ProcessFile ingests a single inbox file end-to-end: it OCRs the document,
// classifies it with the AI model, stores the original under a
// checksum-derived name, writes a markdown record, inserts a database row,
// and finally removes the source file from the inbox. Hidden files and
// unsupported extensions are silently skipped. OCR and classification
// failures are tolerated: the document is still filed, falling back to
// empty text and/or an "uncategorized" classification.
func (p *Processor) ProcessFile(ctx context.Context, path string) error {
	// Skip hidden files and non-PDFs/images.
	base := filepath.Base(path)
	if strings.HasPrefix(base, ".") {
		return nil
	}

	ext := strings.ToLower(filepath.Ext(path))
	if ext != ".pdf" && ext != ".jpg" && ext != ".jpeg" && ext != ".png" {
		log.Printf("Skipping non-document file: %s", path)
		return nil
	}

	log.Printf("Processing: %s", path)

	// Read the whole file into memory (scanned documents are assumed small
	// enough for this).
	data, err := os.ReadFile(path)
	if err != nil {
		return err
	}

	// Compute checksum; its first 16 hex chars also become the storage
	// filename, so identical content maps to the same stored file.
	hash := sha256.Sum256(data)
	checksum := hex.EncodeToString(hash[:])

	// Generate a unique document ID.
	id := uuid.New().String()

	// Extract text via OCR; failure degrades to empty text rather than
	// aborting ingestion.
	ocrText, pageCount, err := p.extractText(path, ext)
	if err != nil {
		log.Printf("OCR failed for %s: %v", path, err)
		ocrText = ""
	}

	// Classify with AI; on failure fall back to an "uncategorized" stub so
	// the document is still filed.
	classification, err := p.classify(ctx, ocrText, base)
	if err != nil {
		log.Printf("Classification failed for %s: %v", path, err)
		classification = &Classification{
			Category: "uncategorized",
			Title:    base,
		}
	}

	// Store the original file under its checksum-derived name.
	storageName := fmt.Sprintf("%s%s", checksum[:16], ext)
	storagePath := filepath.Join(p.storeDir, storageName)
	if err := os.WriteFile(storagePath, data, 0644); err != nil {
		return err
	}

	// Parse the date extracted by the classifier, if any; unparseable
	// dates leave the document undated.
	var docDate *time.Time
	if classification.Date != "" {
		if t, err := parseDate(classification.Date); err == nil {
			docDate = &t
		}
	}

	// Create the document record.
	doc := &db.Document{
		ID:            id,
		Filename:      storageName,
		OriginalName:  base,
		Category:      classification.Category,
		Subcategory:   classification.Subcategory,
		Title:         classification.Title,
		Date:          docDate,
		Vendor:        classification.Vendor,
		Amount:        classification.Amount,
		Currency:      classification.Currency,
		TaxDeductible: classification.TaxDeductible,
		OCRText:       ocrText,
		StoragePath:   storagePath,
		PageCount:     pageCount,
		FileSize:      int64(len(data)),
		MimeType:      getMimeType(ext),
		Checksum:      checksum,
		ProcessedAt:   time.Now(),
		CreatedAt:     time.Now(),
		UpdatedAt:     time.Now(),
	}

	// Generate the markdown record (best effort).
	mdPath, err := p.writeMarkdown(doc, classification)
	if err != nil {
		log.Printf("Failed to write markdown: %v", err)
	} else {
		doc.MarkdownPath = mdPath
	}

	// Generate an embedding (best effort; skipped when no model configured).
	if embedding, err := p.generateEmbedding(ctx, ocrText); err == nil {
		doc.Embedding = embedding
	}

	// Store the classifier's key fields as JSON metadata (best effort).
	if meta, err := json.Marshal(classification.KeyFields); err == nil {
		doc.Metadata = meta
	}

	// Insert into the database; this is the point of no return for dedup.
	if err := p.db.InsertDocument(doc); err != nil {
		return err
	}

	// Remove the source file from the inbox; a failure here leaves a
	// duplicate behind but the document is already filed.
	if err := os.Remove(path); err != nil {
		log.Printf("Failed to remove inbox file: %v", err)
	}

	log.Printf("Processed: %s -> %s (%s)", base, classification.Title, classification.Category)
	return nil
}
|
||||||
|
|
||||||
|
func (p *Processor) extractText(path, ext string) (string, int, error) {
|
||||||
|
if ext == ".pdf" {
|
||||||
|
return p.extractPDFText(path)
|
||||||
|
}
|
||||||
|
return p.extractImageText(path)
|
||||||
|
}
|
||||||
|
|
||||||
|
// extractPDFText extracts text from a PDF. It first tries the embedded text
// layer via pdftotext (poppler-utils); if that yields little or no text —
// likely a pure scan — it rasterizes the pages with pdftoppm and OCRs each
// page with tesseract. Returns the text, the page count, and an error.
func (p *Processor) extractPDFText(path string) (string, int, error) {
	// Try pdftotext first (poppler-utils).
	cmd := exec.Command("pdftotext", "-layout", path, "-")
	output, err := cmd.Output()
	if err == nil && len(output) > 100 {
		// Heuristic: more than 100 bytes of output means the PDF has a
		// usable text layer. Count pages via pdfinfo; default to 1 if the
		// count is unavailable (pdfinfo errors are deliberately ignored).
		pageCmd := exec.Command("pdfinfo", path)
		pageOut, _ := pageCmd.Output()
		pages := 1
		if match := regexp.MustCompile(`Pages:\s+(\d+)`).FindSubmatch(pageOut); len(match) > 1 {
			fmt.Sscanf(string(match[1]), "%d", &pages)
		}
		return string(output), pages, nil
	}

	// Fallback to OCR via tesseract.
	// Convert the PDF to images first, in a throwaway temp directory.
	tmpDir, err := os.MkdirTemp("", "docman-ocr-")
	if err != nil {
		return "", 0, err
	}
	defer os.RemoveAll(tmpDir)

	// Use pdftoppm to render 300 DPI PNGs named page-*.png.
	cmd = exec.Command("pdftoppm", "-png", "-r", "300", path, filepath.Join(tmpDir, "page"))
	if err := cmd.Run(); err != nil {
		return "", 0, fmt.Errorf("pdftoppm failed: %w", err)
	}

	// OCR each page; per-page OCR errors are ignored (best effort), and a
	// page-break marker is appended after every page including the last.
	var textBuf bytes.Buffer
	pages, _ := filepath.Glob(filepath.Join(tmpDir, "page-*.png"))
	for _, pagePath := range pages {
		text, _, _ := p.extractImageText(pagePath)
		textBuf.WriteString(text)
		textBuf.WriteString("\n\n--- Page Break ---\n\n")
	}

	return textBuf.String(), len(pages), nil
}
|
||||||
|
|
||||||
|
func (p *Processor) extractImageText(path string) (string, int, error) {
|
||||||
|
cmd := exec.Command("tesseract", path, "stdout", "-l", "eng+nld")
|
||||||
|
output, err := cmd.Output()
|
||||||
|
if err != nil {
|
||||||
|
return "", 1, err
|
||||||
|
}
|
||||||
|
return string(output), 1, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
// classify sends the OCR text (truncated to 4000 bytes) and filename to the
// chat model and parses the JSON classification it returns. The prompt pins
// the category vocabulary and the exact JSON schema; temperature is kept
// low for more deterministic extraction.
func (p *Processor) classify(ctx context.Context, text, filename string) (*Classification, error) {
	prompt := fmt.Sprintf(`Analyze this scanned document and extract structured information.

Document filename: %s

OCR Text:
%s

Classify and extract the following JSON structure:
{
"category": "taxes|expenses|bills|medical|contacts|legal|insurance|banking|receipts|correspondence|uncategorized",
"subcategory": "more specific category if applicable",
"title": "descriptive title for this document",
"date": "YYYY-MM-DD if found",
"vendor": "company/person name if applicable",
"amount": numeric amount if this is a financial document,
"currency": "USD" or other currency code,
"tax_deductible": true/false if this is a deductible expense,
"summary": "one paragraph summary of the document",
"key_fields": {"field_name": "value"} for any other important extracted data
}

Categories:
- taxes: W-2, 1099, tax returns, deductions
- expenses: receipts, invoices for purchases
- bills: utility bills, service bills
- medical: medical records, prescriptions, EOBs
- contacts: business cards, contact info
- legal: contracts, agreements, legal documents
- insurance: policies, claims
- banking: statements, checks
- receipts: general purchase receipts
- correspondence: letters, emails

Return ONLY valid JSON.`, filename, truncate(text, 4000))

	resp, err := p.aiClient.CreateChatCompletion(ctx, openai.ChatCompletionRequest{
		Model: p.aiModel,
		Messages: []openai.ChatCompletionMessage{
			{Role: openai.ChatMessageRoleUser, Content: prompt},
		},
		Temperature: 0.1,
	})
	if err != nil {
		return nil, err
	}

	if len(resp.Choices) == 0 {
		return nil, fmt.Errorf("no response from AI")
	}

	content := resp.Choices[0].Message.Content
	// Extract JSON from the response; models often wrap it in prose.
	content = extractJSON(content)

	var classification Classification
	if err := json.Unmarshal([]byte(content), &classification); err != nil {
		return nil, fmt.Errorf("parse classification: %w", err)
	}

	return &classification, nil
}
|
||||||
|
|
||||||
|
func (p *Processor) generateEmbedding(ctx context.Context, text string) ([]byte, error) {
|
||||||
|
if p.embedModel == "" {
|
||||||
|
return nil, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
resp, err := p.aiClient.CreateEmbeddings(ctx, openai.EmbeddingRequest{
|
||||||
|
Model: openai.EmbeddingModel(p.embedModel),
|
||||||
|
Input: []string{truncate(text, 8000)},
|
||||||
|
})
|
||||||
|
if err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
|
||||||
|
if len(resp.Data) == 0 {
|
||||||
|
return nil, fmt.Errorf("no embedding returned")
|
||||||
|
}
|
||||||
|
|
||||||
|
// Serialize embedding to bytes
|
||||||
|
embData, err := json.Marshal(resp.Data[0].Embedding)
|
||||||
|
return embData, err
|
||||||
|
}
|
||||||
|
|
||||||
|
// writeMarkdown renders a human-readable markdown record for the document
// (metadata, AI summary, key fields, and full OCR text) into
// <recordsDir>/<category>/<date>_<slug>.md and returns that path.
func (p *Processor) writeMarkdown(doc *db.Document, class *Classification) (string, error) {
	// Create the category subdirectory.
	catDir := filepath.Join(p.recordsDir, doc.Category)
	if err := os.MkdirAll(catDir, 0755); err != nil {
		return "", err
	}

	// Generate the filename: "<YYYY-MM-DD|undated>_<sanitized-title>.md".
	dateStr := "undated"
	if doc.Date != nil {
		dateStr = doc.Date.Format("2006-01-02")
	}
	safeName := sanitizeFilename(doc.Title)
	mdName := fmt.Sprintf("%s_%s.md", dateStr, safeName)
	mdPath := filepath.Join(catDir, mdName)

	// Build the markdown content in memory, then write it in one shot.
	var buf bytes.Buffer
	buf.WriteString(fmt.Sprintf("# %s\n\n", doc.Title))
	buf.WriteString("## Metadata\n\n")
	buf.WriteString(fmt.Sprintf("- **ID:** %s\n", doc.ID))
	buf.WriteString(fmt.Sprintf("- **Category:** %s", doc.Category))
	if doc.Subcategory != "" {
		buf.WriteString(fmt.Sprintf(" / %s", doc.Subcategory))
	}
	buf.WriteString("\n")
	if doc.Date != nil {
		buf.WriteString(fmt.Sprintf("- **Date:** %s\n", doc.Date.Format("2006-01-02")))
	}
	if doc.Vendor != "" {
		buf.WriteString(fmt.Sprintf("- **Vendor:** %s\n", doc.Vendor))
	}
	if doc.Amount != nil {
		buf.WriteString(fmt.Sprintf("- **Amount:** %s %.2f\n", doc.Currency, *doc.Amount))
	}
	if doc.TaxDeductible {
		buf.WriteString("- **Tax Deductible:** Yes\n")
	}
	buf.WriteString(fmt.Sprintf("- **Original File:** %s\n", doc.OriginalName))
	buf.WriteString(fmt.Sprintf("- **PDF:** [View](%s)\n", doc.StoragePath))
	buf.WriteString(fmt.Sprintf("- **Processed:** %s\n", doc.ProcessedAt.Format(time.RFC3339)))

	if class.Summary != "" {
		buf.WriteString("\n## Summary\n\n")
		buf.WriteString(class.Summary)
		buf.WriteString("\n")
	}

	if len(class.KeyFields) > 0 {
		buf.WriteString("\n## Key Fields\n\n")
		// NOTE: map iteration order is nondeterministic, so the key-field
		// order may vary between runs.
		for k, v := range class.KeyFields {
			buf.WriteString(fmt.Sprintf("- **%s:** %s\n", k, v))
		}
	}

	buf.WriteString("\n## Full Text (OCR)\n\n```\n")
	buf.WriteString(doc.OCRText)
	buf.WriteString("\n```\n")

	if err := os.WriteFile(mdPath, buf.Bytes(), 0644); err != nil {
		return "", err
	}

	return mdPath, nil
}
|
||||||
|
|
||||||
|
// Helper functions
|
||||||
|
|
||||||
|
// truncate returns s cut down to at most max bytes, backing up as needed so
// the cut never splits a multi-byte UTF-8 sequence — a plain s[:max] could
// yield invalid text in prompts and embedding inputs.
func truncate(s string, max int) string {
	if len(s) <= max {
		return s
	}
	cut := max
	// 0b10xxxxxx marks a UTF-8 continuation byte; step back to a rune start.
	for cut > 0 && s[cut]&0xC0 == 0x80 {
		cut--
	}
	return s[:cut]
}
|
||||||
|
|
||||||
|
// extractJSON returns the substring spanning the first '{' through the last
// '}' of s, or s unchanged when no such span exists. Used to strip prose
// the model wraps around its JSON answer.
func extractJSON(s string) string {
	open := strings.Index(s, "{")
	if open < 0 {
		return s
	}
	last := strings.LastIndex(s, "}")
	if last <= open {
		return s
	}
	return s[open : last+1]
}
|
||||||
|
|
||||||
|
// parseDate tries a fixed set of common date layouts and returns the first
// successful parse, or an error when none of them match.
func parseDate(s string) (time.Time, error) {
	layouts := [...]string{
		"2006-01-02",
		"01/02/2006",
		"1/2/2006",
		"January 2, 2006",
		"Jan 2, 2006",
		"2006/01/02",
	}
	for _, layout := range layouts {
		parsed, err := time.Parse(layout, s)
		if err == nil {
			return parsed, nil
		}
	}
	return time.Time{}, fmt.Errorf("cannot parse date: %s", s)
}
|
||||||
|
|
||||||
|
// mimeTypes maps the supported (lowercase) file extensions to MIME types.
var mimeTypes = map[string]string{
	".pdf":  "application/pdf",
	".jpg":  "image/jpeg",
	".jpeg": "image/jpeg",
	".png":  "image/png",
}

// getMimeType returns the MIME type for a lowercase file extension,
// defaulting to application/octet-stream for anything unrecognized.
func getMimeType(ext string) string {
	if mt, ok := mimeTypes[ext]; ok {
		return mt
	}
	return "application/octet-stream"
}
|
||||||
|
|
||||||
|
func sanitizeFilename(s string) string {
|
||||||
|
s = strings.ToLower(s)
|
||||||
|
s = regexp.MustCompile(`[^a-z0-9]+`).ReplaceAllString(s, "-")
|
||||||
|
s = strings.Trim(s, "-")
|
||||||
|
if len(s) > 50 {
|
||||||
|
s = s[:50]
|
||||||
|
}
|
||||||
|
return s
|
||||||
|
}
|
||||||
|
|
||||||
|
// ProcessSingle processes a single file and returns the document (for API uploads)
|
||||||
|
// ProcessSingle processes a single in-memory upload (for API uploads) and
// returns the inserted document. It mirrors ProcessFile but takes raw
// bytes, stages them in a temp file so the external OCR tools can read
// them, and never touches the inbox.
func (p *Processor) ProcessSingle(ctx context.Context, data []byte, filename string) (*db.Document, error) {
	// Write to a temp file for processing; removed when we return.
	ext := strings.ToLower(filepath.Ext(filename))
	tmpFile, err := os.CreateTemp("", "docman-upload-*"+ext)
	if err != nil {
		return nil, err
	}
	defer os.Remove(tmpFile.Name())

	if _, err := tmpFile.Write(data); err != nil {
		return nil, err
	}
	tmpFile.Close()

	// Use the existing process logic but don't delete the temp file early.
	// Compute the checksum; its first 16 hex chars become the storage name,
	// so identical uploads map to the same stored file.
	hash := sha256.Sum256(data)
	checksum := hex.EncodeToString(hash[:])
	id := uuid.New().String()

	// OCR is best-effort: on failure the document is stored with empty text.
	ocrText, pageCount, err := p.extractText(tmpFile.Name(), ext)
	if err != nil {
		ocrText = ""
	}

	// Classification is also best-effort, falling back to "uncategorized".
	classification, err := p.classify(ctx, ocrText, filename)
	if err != nil {
		classification = &Classification{
			Category: "uncategorized",
			Title:    filename,
		}
	}

	// Store the original bytes under the checksum-derived name.
	storageName := fmt.Sprintf("%s%s", checksum[:16], ext)
	storagePath := filepath.Join(p.storeDir, storageName)
	if err := os.WriteFile(storagePath, data, 0644); err != nil {
		return nil, err
	}

	// Parse the classifier-extracted date, if any.
	var docDate *time.Time
	if classification.Date != "" {
		if t, err := parseDate(classification.Date); err == nil {
			docDate = &t
		}
	}

	doc := &db.Document{
		ID:            id,
		Filename:      storageName,
		OriginalName:  filename,
		Category:      classification.Category,
		Subcategory:   classification.Subcategory,
		Title:         classification.Title,
		Date:          docDate,
		Vendor:        classification.Vendor,
		Amount:        classification.Amount,
		Currency:      classification.Currency,
		TaxDeductible: classification.TaxDeductible,
		OCRText:       ocrText,
		StoragePath:   storagePath,
		PageCount:     pageCount,
		FileSize:      int64(len(data)),
		MimeType:      getMimeType(ext),
		Checksum:      checksum,
		ProcessedAt:   time.Now(),
		CreatedAt:     time.Now(),
		UpdatedAt:     time.Now(),
	}

	// Markdown record, embedding, and metadata are all best-effort.
	mdPath, _ := p.writeMarkdown(doc, classification)
	doc.MarkdownPath = mdPath

	if embedding, err := p.generateEmbedding(ctx, ocrText); err == nil {
		doc.Embedding = embedding
	}

	if meta, err := json.Marshal(classification.KeyFields); err == nil {
		doc.Metadata = meta
	}

	if err := p.db.InsertDocument(doc); err != nil {
		return nil, err
	}

	return doc, nil
}
|
||||||
|
|
||||||
|
// SearchMarkdown searches markdown files directly (fallback when embeddings unavailable)
|
||||||
|
func SearchMarkdown(recordsDir, query string, limit int) ([]*db.SearchResult, error) {
|
||||||
|
var results []*db.SearchResult
|
||||||
|
query = strings.ToLower(query)
|
||||||
|
terms := strings.Fields(query)
|
||||||
|
|
||||||
|
err := filepath.Walk(recordsDir, func(path string, info os.FileInfo, err error) error {
|
||||||
|
if err != nil || info.IsDir() || !strings.HasSuffix(path, ".md") {
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
|
||||||
|
data, err := os.ReadFile(path)
|
||||||
|
if err != nil {
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
|
||||||
|
content := strings.ToLower(string(data))
|
||||||
|
score := 0.0
|
||||||
|
for _, term := range terms {
|
||||||
|
if strings.Contains(content, term) {
|
||||||
|
score += 1.0
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if score > 0 {
|
||||||
|
// Extract title from first line
|
||||||
|
lines := strings.Split(string(data), "\n")
|
||||||
|
title := strings.TrimPrefix(lines[0], "# ")
|
||||||
|
|
||||||
|
// Find snippet around first match
|
||||||
|
snippet := findSnippet(string(data), terms[0], 100)
|
||||||
|
|
||||||
|
results = append(results, &db.SearchResult{
|
||||||
|
Document: db.Document{
|
||||||
|
Title: title,
|
||||||
|
MarkdownPath: path,
|
||||||
|
},
|
||||||
|
Score: score / float64(len(terms)),
|
||||||
|
Snippet: snippet,
|
||||||
|
})
|
||||||
|
}
|
||||||
|
return nil
|
||||||
|
})
|
||||||
|
|
||||||
|
if err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
|
||||||
|
// Sort by score descending
|
||||||
|
for i := 0; i < len(results)-1; i++ {
|
||||||
|
for j := i + 1; j < len(results); j++ {
|
||||||
|
if results[j].Score > results[i].Score {
|
||||||
|
results[i], results[j] = results[j], results[i]
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if len(results) > limit {
|
||||||
|
results = results[:limit]
|
||||||
|
}
|
||||||
|
|
||||||
|
return results, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
// findSnippet returns a fragment of text of roughly radius bytes on either
// side of the first case-insensitive occurrence of term, with "..." markers
// where the fragment was trimmed. When term is absent, the head of the text
// (up to 2*radius bytes) is returned instead.
func findSnippet(text, term string, radius int) string {
	idx := strings.Index(strings.ToLower(text), strings.ToLower(term))
	if idx < 0 {
		head := text
		if len(head) > radius*2 {
			head = head[:radius*2] + "..."
		}
		return head
	}

	start := max(idx-radius, 0)
	end := min(idx+len(term)+radius, len(text))

	var b strings.Builder
	if start > 0 {
		b.WriteString("...")
	}
	b.WriteString(text[start:end])
	if end < len(text) {
		b.WriteString("...")
	}
	return b.String()
}
|
||||||
|
|
||||||
|
// GetRecordsDir returns the directory holding generated markdown records.
// (Go convention would name this RecordsDir — the Get prefix is kept for
// caller compatibility.)
func (p *Processor) GetRecordsDir() string {
	return p.recordsDir
}
|
||||||
|
|
||||||
|
// GetStoreDir returns the directory holding stored original files.
// (Go convention would name this StoreDir — the Get prefix is kept for
// caller compatibility.)
func (p *Processor) GetStoreDir() string {
	return p.storeDir
}
|
||||||
Loading…
Reference in New Issue