docman/internal/db/db.go

407 lines
11 KiB
Go

package db
import (
"database/sql"
"encoding/json"
"fmt"
"time"
_ "github.com/mattn/go-sqlite3"
)
// DB is the document store handle. It embeds *sql.DB, so every database/sql
// method is available on it alongside the document-specific methods declared
// in this package.
type DB struct {
// Embedded connection pool; its methods are promoted onto DB.
*sql.DB
}
// Document mirrors one row of the documents table and doubles as its JSON
// representation. Field order follows the column order of the table, which
// the scan helpers in this file rely on when reading "SELECT *" results.
type Document struct {
// ID is the primary key of the documents table.
ID string `json:"id"`
Filename string `json:"filename"`
OriginalName string `json:"original_name"`
Category string `json:"category"`
Subcategory string `json:"subcategory,omitempty"`
Title string `json:"title"`
// Date is optional. InsertDocument and UpdateDocument persist it as a
// "2006-01-02" string and leave the column NULL when the pointer is nil.
Date *time.Time `json:"date,omitempty"`
Vendor string `json:"vendor,omitempty"`
// Amount is optional; the amount column is nullable.
Amount *float64 `json:"amount,omitempty"`
Currency string `json:"currency,omitempty"`
TaxDeductible bool `json:"tax_deductible"`
// OCRText is one of the columns indexed by the documents_fts table.
OCRText string `json:"ocr_text"`
// Metadata is raw JSON that this package stores and returns without
// decoding it.
Metadata json.RawMessage `json:"metadata,omitempty"`
// Embedding is stored in a BLOB column; its encoding is not defined in
// this file.
Embedding []byte `json:"embedding,omitempty"`
StoragePath string `json:"storage_path"`
MarkdownPath string `json:"markdown_path"`
PageCount int `json:"page_count"`
FileSize int64 `json:"file_size"`
MimeType string `json:"mime_type"`
Checksum string `json:"checksum"`
// ProcessedAt, CreatedAt and UpdatedAt are written by InsertDocument as
// RFC 3339 strings.
ProcessedAt time.Time `json:"processed_at"`
CreatedAt time.Time `json:"created_at"`
UpdatedAt time.Time `json:"updated_at"`
}
// SearchResult is a Document returned by SearchFTS together with the ranking
// information produced by the full-text query.
type SearchResult struct {
Document
// Score is the bm25() value of the match; SearchFTS orders results by
// this value ascending.
Score float64 `json:"score"`
// Snippet is the snippet() excerpt of the matched text, with hits wrapped
// in <mark>...</mark>.
Snippet string `json:"snippet"`
}
// Open opens the SQLite database at path and verifies that it can be reached.
//
// The DSN is built by appending "?_journal_mode=WAL&_busy_timeout=5000" to
// path, so path is expected not to carry a query string of its own.
//
// If the initial Ping fails the handle is closed before returning, so a failed
// Open does not leave an open connection pool behind.
func Open(path string) (*DB, error) {
	conn, err := sql.Open("sqlite3", path+"?_journal_mode=WAL&_busy_timeout=5000")
	if err != nil {
		return nil, fmt.Errorf("opening database %q: %w", path, err)
	}
	if err := conn.Ping(); err != nil {
		// The ping failure is the error worth reporting; a secondary close
		// error would only obscure it.
		_ = conn.Close()
		return nil, fmt.Errorf("connecting to database %q: %w", path, err)
	}
	return &DB{conn}, nil
}
// Init creates the schema if it does not exist yet: the documents table and
// its indexes, the documents_fts full-text index (an FTS5 external-content
// table backed by documents), the triggers that keep the index in sync, and
// the settings key/value table.
//
// The triggers pass the documents rowid explicitly. An external-content FTS5
// table looks its content up by rowid, and its 'delete' command needs the
// rowid of the entry being removed; without it, deleted or updated rows leave
// stale entries behind and new index entries can point at the wrong row.
//
// NOTE(review): every statement uses IF NOT EXISTS, so a database that was
// initialized with an earlier trigger definition keeps that definition. Such
// a database needs its triggers dropped and the index rebuilt separately.
func (db *DB) Init() error {
	schema := `
CREATE TABLE IF NOT EXISTS documents (
id TEXT PRIMARY KEY,
filename TEXT NOT NULL,
original_name TEXT NOT NULL,
category TEXT NOT NULL DEFAULT 'uncategorized',
subcategory TEXT,
title TEXT NOT NULL,
date TEXT,
vendor TEXT,
amount REAL,
currency TEXT DEFAULT 'USD',
tax_deductible INTEGER DEFAULT 0,
ocr_text TEXT,
metadata TEXT,
embedding BLOB,
storage_path TEXT NOT NULL,
markdown_path TEXT,
page_count INTEGER DEFAULT 1,
file_size INTEGER,
mime_type TEXT,
checksum TEXT,
processed_at TEXT,
created_at TEXT DEFAULT CURRENT_TIMESTAMP,
updated_at TEXT DEFAULT CURRENT_TIMESTAMP
);
CREATE INDEX IF NOT EXISTS idx_documents_category ON documents(category);
CREATE INDEX IF NOT EXISTS idx_documents_date ON documents(date);
CREATE INDEX IF NOT EXISTS idx_documents_vendor ON documents(vendor);
CREATE INDEX IF NOT EXISTS idx_documents_amount ON documents(amount);
CREATE VIRTUAL TABLE IF NOT EXISTS documents_fts USING fts5(
id,
title,
ocr_text,
vendor,
category,
content='documents',
content_rowid='rowid'
);
CREATE TRIGGER IF NOT EXISTS documents_ai AFTER INSERT ON documents BEGIN
INSERT INTO documents_fts(rowid, id, title, ocr_text, vendor, category)
VALUES (new.rowid, new.id, new.title, new.ocr_text, new.vendor, new.category);
END;
CREATE TRIGGER IF NOT EXISTS documents_ad AFTER DELETE ON documents BEGIN
INSERT INTO documents_fts(documents_fts, rowid, id, title, ocr_text, vendor, category)
VALUES ('delete', old.rowid, old.id, old.title, old.ocr_text, old.vendor, old.category);
END;
CREATE TRIGGER IF NOT EXISTS documents_au AFTER UPDATE ON documents BEGIN
INSERT INTO documents_fts(documents_fts, rowid, id, title, ocr_text, vendor, category)
VALUES ('delete', old.rowid, old.id, old.title, old.ocr_text, old.vendor, old.category);
INSERT INTO documents_fts(rowid, id, title, ocr_text, vendor, category)
VALUES (new.rowid, new.id, new.title, new.ocr_text, new.vendor, new.category);
END;
CREATE TABLE IF NOT EXISTS settings (
key TEXT PRIMARY KEY,
value TEXT
);
`
	if _, err := db.Exec(schema); err != nil {
		return fmt.Errorf("initializing schema: %w", err)
	}
	return nil
}
// InsertDocument stores doc as a new row in the documents table.
//
// Date is written as a "2006-01-02" string, or NULL when doc.Date is nil.
// ProcessedAt, CreatedAt and UpdatedAt are written as RFC 3339 strings.
// The error returned by the driver is passed through unchanged.
func (db *DB) InsertDocument(doc *Document) error {
	// A nil pointer is kept for a missing date so the column ends up NULL.
	var day *string
	if d := doc.Date; d != nil {
		formatted := d.Format("2006-01-02")
		day = &formatted
	}

	const stmt = `
INSERT INTO documents (
id, filename, original_name, category, subcategory, title, date,
vendor, amount, currency, tax_deductible, ocr_text, metadata,
embedding, storage_path, markdown_path, page_count, file_size,
mime_type, checksum, processed_at, created_at, updated_at
) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
`
	// Values are listed in the same order as the column list above.
	params := []interface{}{
		doc.ID,
		doc.Filename,
		doc.OriginalName,
		doc.Category,
		doc.Subcategory,
		doc.Title,
		day,
		doc.Vendor,
		doc.Amount,
		doc.Currency,
		doc.TaxDeductible,
		doc.OCRText,
		doc.Metadata,
		doc.Embedding,
		doc.StoragePath,
		doc.MarkdownPath,
		doc.PageCount,
		doc.FileSize,
		doc.MimeType,
		doc.Checksum,
		doc.ProcessedAt.Format(time.RFC3339),
		doc.CreatedAt.Format(time.RFC3339),
		doc.UpdatedAt.Format(time.RFC3339),
	}

	_, err := db.Exec(stmt, params...)
	return err
}
// UpdateDocument overwrites the editable columns of the row whose id equals
// doc.ID: category, subcategory, title, date, vendor, amount, currency,
// tax_deductible and metadata. updated_at is set to the current time; the
// doc.UpdatedAt field itself is neither read nor modified.
//
// No error is reported when no row matches doc.ID.
func (db *DB) UpdateDocument(doc *Document) error {
	// A nil pointer is kept for a missing date so the column ends up NULL.
	var day *string
	if d := doc.Date; d != nil {
		formatted := d.Format("2006-01-02")
		day = &formatted
	}

	const stmt = `
UPDATE documents SET
category = ?, subcategory = ?, title = ?, date = ?,
vendor = ?, amount = ?, currency = ?, tax_deductible = ?,
metadata = ?, updated_at = ?
WHERE id = ?
`
	params := []interface{}{
		doc.Category,
		doc.Subcategory,
		doc.Title,
		day,
		doc.Vendor,
		doc.Amount,
		doc.Currency,
		doc.TaxDeductible,
		doc.Metadata,
		time.Now().Format(time.RFC3339),
		doc.ID,
	}

	_, err := db.Exec(stmt, params...)
	return err
}
// GetDocument loads the document with the given id. The result of the
// single-row query is handed to scanDocument, whose error (including the
// no-rows case) is returned as is.
func (db *DB) GetDocument(id string) (*Document, error) {
	const stmt = `SELECT * FROM documents WHERE id = ?`
	return scanDocument(db.QueryRow(stmt, id))
}
// DeleteDocument removes the row with the given id from the documents table.
// Deleting an id that does not exist is not reported as an error.
func (db *DB) DeleteDocument(id string) error {
	if _, err := db.Exec(`DELETE FROM documents WHERE id = ?`, id); err != nil {
		return err
	}
	return nil
}
// ListDocuments returns one page of documents, newest first, where "newest"
// is decided by the document date and falls back to created_at when the date
// is NULL.
//
// An empty category lists documents of every category; otherwise only rows
// with exactly that category are returned. limit and offset are passed to the
// query's LIMIT and OFFSET clauses.
func (db *DB) ListDocuments(category string, limit, offset int) ([]*Document, error) {
	// Start from the unfiltered query and switch to the filtered one only
	// when a category was requested.
	stmt := `
SELECT * FROM documents
ORDER BY COALESCE(date, created_at) DESC
LIMIT ? OFFSET ?
`
	params := []interface{}{limit, offset}

	if category != "" {
		stmt = `
SELECT * FROM documents
WHERE category = ?
ORDER BY COALESCE(date, created_at) DESC
LIMIT ? OFFSET ?
`
		params = []interface{}{category, limit, offset}
	}

	rows, err := db.Query(stmt, params...)
	if err != nil {
		return nil, err
	}
	defer rows.Close()

	return scanDocuments(rows)
}
// RecentDocuments returns up to limit documents ordered by created_at,
// most recently created first.
func (db *DB) RecentDocuments(limit int) ([]*Document, error) {
	const stmt = `
SELECT * FROM documents
ORDER BY created_at DESC
LIMIT ?
`
	rows, err := db.Query(stmt, limit)
	if err != nil {
		return nil, err
	}
	defer rows.Close()

	return scanDocuments(rows)
}
// SearchFTS runs a full-text query against documents_fts and returns the
// matching documents ordered by ascending bm25 score.
//
// query is passed unchanged as the FTS5 MATCH expression and limit caps the
// number of rows. Each result carries the bm25 score used for ordering and a
// snippet taken from FTS column 2 (ocr_text) with hits wrapped in
// <mark>...</mark>.
//
// Nullable columns that are NULL are returned as the zero value of the
// corresponding field. Date stays nil when the stored value is NULL or not a
// "2006-01-02" date. Errors from the driver are returned unchanged.
func (db *DB) SearchFTS(query string, limit int) ([]*SearchResult, error) {
	rows, err := db.Query(`
SELECT d.*,
bm25(documents_fts) as score,
snippet(documents_fts, 2, '<mark>', '</mark>', '...', 32) as snippet
FROM documents_fts f
JOIN documents d ON f.id = d.id
WHERE documents_fts MATCH ?
ORDER BY bm25(documents_fts)
LIMIT ?
`, query, limit)
	if err != nil {
		return nil, err
	}
	defer rows.Close()

	// parseTimestamp accepts RFC 3339 (what InsertDocument writes) and the
	// "YYYY-MM-DD HH:MM:SS" form produced by the schema's CURRENT_TIMESTAMP
	// defaults. NULL or unparseable input yields the zero time.
	parseTimestamp := func(s sql.NullString) time.Time {
		if !s.Valid {
			return time.Time{}
		}
		for _, layout := range []string{time.RFC3339, "2006-01-02 15:04:05"} {
			if t, err := time.Parse(layout, s.String); err == nil {
				return t
			}
		}
		return time.Time{}
	}

	var results []*SearchResult
	for rows.Next() {
		var (
			doc Document

			// Nullable columns are scanned through Null* holders so that a
			// NULL does not abort the whole search.
			subcategory, date, vendor, currency, ocrText sql.NullString
			markdownPath, mimeType, checksum             sql.NullString
			processedAt, createdAt, updatedAt            sql.NullString
			taxDeductible                                sql.NullBool
			pageCount, fileSize                          sql.NullInt64
			metadata                                     []byte

			score   float64
			snippet sql.NullString
		)
		if err := rows.Scan(
			&doc.ID, &doc.Filename, &doc.OriginalName, &doc.Category, &subcategory,
			&doc.Title, &date, &vendor, &doc.Amount, &currency, &taxDeductible,
			&ocrText, &metadata, &doc.Embedding, &doc.StoragePath, &markdownPath,
			&pageCount, &fileSize, &mimeType, &checksum,
			&processedAt, &createdAt, &updatedAt, &score, &snippet,
		); err != nil {
			return nil, err
		}

		doc.Subcategory = subcategory.String
		doc.Vendor = vendor.String
		doc.Currency = currency.String
		doc.TaxDeductible = taxDeductible.Bool
		doc.OCRText = ocrText.String
		doc.Metadata = metadata
		doc.MarkdownPath = markdownPath.String
		doc.PageCount = int(pageCount.Int64)
		doc.FileSize = fileSize.Int64
		doc.MimeType = mimeType.String
		doc.Checksum = checksum.String

		if date.Valid {
			// Leave Date nil rather than pointing at the zero time when the
			// stored value cannot be parsed.
			if t, err := time.Parse("2006-01-02", date.String); err == nil {
				doc.Date = &t
			}
		}
		doc.ProcessedAt = parseTimestamp(processedAt)
		doc.CreatedAt = parseTimestamp(createdAt)
		doc.UpdatedAt = parseTimestamp(updatedAt)

		results = append(results, &SearchResult{
			Document: doc,
			Score:    score,
			Snippet:  snippet.String,
		})
	}
	// rows.Next returns false both at the end of the result set and on
	// failure; Err tells the two apart.
	if err := rows.Err(); err != nil {
		return nil, err
	}
	return results, nil
}
// GetStats returns aggregate figures about the stored documents:
//
//	"total"       int            number of documents
//	"this_month"  int            documents whose created_at is on or after the
//	                             first day of the current month (as computed
//	                             by SQLite's date('now', 'start of month'))
//	"total_size"  int64          sum of file_size, 0 when there are no rows
//	"by_category" map[string]int document count per category
//
// Any query or scan failure is returned as an error instead of being dropped,
// so callers never receive zeroed statistics that look valid.
func (db *DB) GetStats() (map[string]interface{}, error) {
	var total int
	if err := db.QueryRow(`SELECT COUNT(*) FROM documents`).Scan(&total); err != nil {
		return nil, fmt.Errorf("counting documents: %w", err)
	}

	var thisMonth int
	if err := db.QueryRow(`SELECT COUNT(*) FROM documents WHERE created_at >= date('now', 'start of month')`).Scan(&thisMonth); err != nil {
		return nil, fmt.Errorf("counting documents created this month: %w", err)
	}

	var totalSize int64
	if err := db.QueryRow(`SELECT COALESCE(SUM(file_size), 0) FROM documents`).Scan(&totalSize); err != nil {
		return nil, fmt.Errorf("summing document sizes: %w", err)
	}

	rows, err := db.Query(`SELECT category, COUNT(*) FROM documents GROUP BY category`)
	if err != nil {
		return nil, fmt.Errorf("counting documents by category: %w", err)
	}
	defer rows.Close()

	categories := make(map[string]int)
	for rows.Next() {
		var (
			cat   string
			count int
		)
		if err := rows.Scan(&cat, &count); err != nil {
			return nil, fmt.Errorf("reading category count: %w", err)
		}
		categories[cat] = count
	}
	// Distinguish a clean end of the result set from an iteration failure.
	if err := rows.Err(); err != nil {
		return nil, fmt.Errorf("reading category counts: %w", err)
	}

	return map[string]interface{}{
		"total":       total,
		"this_month":  thisMonth,
		"total_size":  totalSize,
		"by_category": categories,
	}, nil
}
// GetExpenses returns the documents in the 'expenses' category that have an
// amount, ordered by date with the latest first.
//
// A year greater than zero restricts the result to that calendar year and a
// month greater than zero restricts it to that month number; each filter is
// skipped when its argument is zero or negative, and the two are independent
// of each other.
func (db *DB) GetExpenses(year int, month int) ([]*Document, error) {
	var (
		filters string
		params  []interface{}
	)
	if year > 0 {
		filters += ` AND strftime('%Y', date) = ?`
		params = append(params, fmt.Sprintf("%04d", year))
	}
	if month > 0 {
		filters += ` AND strftime('%m', date) = ?`
		params = append(params, fmt.Sprintf("%02d", month))
	}

	// Base query, then the optional filters, then the ordering.
	stmt := `
SELECT * FROM documents
WHERE category = 'expenses' AND amount IS NOT NULL
` + filters + ` ORDER BY date DESC`

	rows, err := db.Query(stmt, params...)
	if err != nil {
		return nil, err
	}
	defer rows.Close()

	return scanDocuments(rows)
}
// documentRowScanner is the Scan method shared by *sql.Row and *sql.Rows, so
// both can be read through the same code path.
type documentRowScanner interface {
	Scan(dest ...interface{}) error
}

// parseStoredTimestamp converts a stored timestamp column to a time.Time.
// It accepts RFC 3339 (what InsertDocument writes) and the
// "YYYY-MM-DD HH:MM:SS" form produced by the schema's CURRENT_TIMESTAMP
// defaults. NULL or unparseable input yields the zero time.
func parseStoredTimestamp(s sql.NullString) time.Time {
	if !s.Valid {
		return time.Time{}
	}
	for _, layout := range []string{time.RFC3339, "2006-01-02 15:04:05"} {
		if t, err := time.Parse(layout, s.String); err == nil {
			return t
		}
	}
	return time.Time{}
}

// scanDocumentFields reads one documents row, with columns in table order,
// from s. Columns the schema allows to be NULL are scanned through Null*
// holders and come back as the zero value of the field when they are NULL.
// The Scan error is returned unchanged.
func scanDocumentFields(s documentRowScanner) (*Document, error) {
	var (
		doc Document

		subcategory, date, vendor, currency, ocrText sql.NullString
		markdownPath, mimeType, checksum             sql.NullString
		processedAt, createdAt, updatedAt            sql.NullString
		taxDeductible                                sql.NullBool
		pageCount, fileSize                          sql.NullInt64
		metadata                                     []byte
	)
	if err := s.Scan(
		&doc.ID, &doc.Filename, &doc.OriginalName, &doc.Category, &subcategory,
		&doc.Title, &date, &vendor, &doc.Amount, &currency, &taxDeductible,
		&ocrText, &metadata, &doc.Embedding, &doc.StoragePath, &markdownPath,
		&pageCount, &fileSize, &mimeType, &checksum,
		&processedAt, &createdAt, &updatedAt,
	); err != nil {
		return nil, err
	}

	doc.Subcategory = subcategory.String
	doc.Vendor = vendor.String
	doc.Currency = currency.String
	doc.TaxDeductible = taxDeductible.Bool
	doc.OCRText = ocrText.String
	doc.Metadata = metadata
	doc.MarkdownPath = markdownPath.String
	doc.PageCount = int(pageCount.Int64)
	doc.FileSize = fileSize.Int64
	doc.MimeType = mimeType.String
	doc.Checksum = checksum.String

	if date.Valid {
		// Leave Date nil rather than pointing at the zero time when the
		// stored value cannot be parsed.
		if t, err := time.Parse("2006-01-02", date.String); err == nil {
			doc.Date = &t
		}
	}
	doc.ProcessedAt = parseStoredTimestamp(processedAt)
	doc.CreatedAt = parseStoredTimestamp(createdAt)
	doc.UpdatedAt = parseStoredTimestamp(updatedAt)

	return &doc, nil
}

// scanDocument reads the single documents row held by row. When the query
// matched nothing, the error from row.Scan (sql.ErrNoRows) is returned
// unchanged so callers can compare against it directly.
func scanDocument(row *sql.Row) (*Document, error) {
	return scanDocumentFields(row)
}

// scanDocuments reads every remaining row of rows into a slice of documents.
// It returns a nil slice when there are no rows. It does not close rows; that
// stays with the caller.
func scanDocuments(rows *sql.Rows) ([]*Document, error) {
	var docs []*Document
	for rows.Next() {
		doc, err := scanDocumentFields(rows)
		if err != nil {
			return nil, err
		}
		docs = append(docs, doc)
	}
	// rows.Next returns false both at the end of the result set and on
	// failure; Err tells the two apart.
	if err := rows.Err(); err != nil {
		return nil, err
	}
	return docs, nil
}