407 lines
11 KiB
Go
407 lines
11 KiB
Go
package db
|
|
|
|
import (
|
|
"database/sql"
|
|
"encoding/json"
|
|
"fmt"
|
|
"time"
|
|
|
|
_ "github.com/mattn/go-sqlite3"
|
|
)
|
|
|
|
// DB is this package's database handle.
//
// It embeds *sql.DB, so the standard database/sql methods (Exec, Query,
// QueryRow, Close, ...) are promoted and can be called on a DB directly.
type DB struct {
	*sql.DB
}
|
|
|
|
// Document mirrors one row of the documents table and doubles as the JSON
// representation of a document.
//
// Fields tagged omitempty are left out of the JSON output when they are empty
// (nil pointer, empty string, or empty byte slice).
type Document struct {
	// Identity and naming.
	ID           string `json:"id"`
	Filename     string `json:"filename"`
	OriginalName string `json:"original_name"`

	// Classification.
	Category    string `json:"category"`
	Subcategory string `json:"subcategory,omitempty"`
	Title       string `json:"title"`

	// Optional details; Date and Amount are pointers so that "not set" can be
	// told apart from a zero value.
	Date          *time.Time `json:"date,omitempty"`
	Vendor        string     `json:"vendor,omitempty"`
	Amount        *float64   `json:"amount,omitempty"`
	Currency      string     `json:"currency,omitempty"`
	TaxDeductible bool       `json:"tax_deductible"`

	// Content. Metadata is kept as raw JSON and is not decoded by this package.
	OCRText   string          `json:"ocr_text"`
	Metadata  json.RawMessage `json:"metadata,omitempty"`
	Embedding []byte          `json:"embedding,omitempty"`

	// File details.
	StoragePath  string `json:"storage_path"`
	MarkdownPath string `json:"markdown_path"`
	PageCount    int    `json:"page_count"`
	FileSize     int64  `json:"file_size"`
	MimeType     string `json:"mime_type"`
	Checksum     string `json:"checksum"`

	// Timestamps.
	ProcessedAt time.Time `json:"processed_at"`
	CreatedAt   time.Time `json:"created_at"`
	UpdatedAt   time.Time `json:"updated_at"`
}
|
|
|
|
type SearchResult struct {
|
|
Document
|
|
Score float64 `json:"score"`
|
|
Snippet string `json:"snippet"`
|
|
}
|
|
|
|
func Open(path string) (*DB, error) {
|
|
db, err := sql.Open("sqlite3", path+"?_journal_mode=WAL&_busy_timeout=5000")
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
|
|
if err := db.Ping(); err != nil {
|
|
return nil, err
|
|
}
|
|
|
|
return &DB{db}, nil
|
|
}
|
|
|
|
func (db *DB) Init() error {
|
|
schema := `
|
|
CREATE TABLE IF NOT EXISTS documents (
|
|
id TEXT PRIMARY KEY,
|
|
filename TEXT NOT NULL,
|
|
original_name TEXT NOT NULL,
|
|
category TEXT NOT NULL DEFAULT 'uncategorized',
|
|
subcategory TEXT,
|
|
title TEXT NOT NULL,
|
|
date TEXT,
|
|
vendor TEXT,
|
|
amount REAL,
|
|
currency TEXT DEFAULT 'USD',
|
|
tax_deductible INTEGER DEFAULT 0,
|
|
ocr_text TEXT,
|
|
metadata TEXT,
|
|
embedding BLOB,
|
|
storage_path TEXT NOT NULL,
|
|
markdown_path TEXT,
|
|
page_count INTEGER DEFAULT 1,
|
|
file_size INTEGER,
|
|
mime_type TEXT,
|
|
checksum TEXT,
|
|
processed_at TEXT,
|
|
created_at TEXT DEFAULT CURRENT_TIMESTAMP,
|
|
updated_at TEXT DEFAULT CURRENT_TIMESTAMP
|
|
);
|
|
|
|
CREATE INDEX IF NOT EXISTS idx_documents_category ON documents(category);
|
|
CREATE INDEX IF NOT EXISTS idx_documents_date ON documents(date);
|
|
CREATE INDEX IF NOT EXISTS idx_documents_vendor ON documents(vendor);
|
|
CREATE INDEX IF NOT EXISTS idx_documents_amount ON documents(amount);
|
|
|
|
CREATE VIRTUAL TABLE IF NOT EXISTS documents_fts USING fts5(
|
|
id,
|
|
title,
|
|
ocr_text,
|
|
vendor,
|
|
category,
|
|
content='documents',
|
|
content_rowid='rowid'
|
|
);
|
|
|
|
CREATE TRIGGER IF NOT EXISTS documents_ai AFTER INSERT ON documents BEGIN
|
|
INSERT INTO documents_fts(id, title, ocr_text, vendor, category)
|
|
VALUES (new.id, new.title, new.ocr_text, new.vendor, new.category);
|
|
END;
|
|
|
|
CREATE TRIGGER IF NOT EXISTS documents_ad AFTER DELETE ON documents BEGIN
|
|
INSERT INTO documents_fts(documents_fts, id, title, ocr_text, vendor, category)
|
|
VALUES ('delete', old.id, old.title, old.ocr_text, old.vendor, old.category);
|
|
END;
|
|
|
|
CREATE TRIGGER IF NOT EXISTS documents_au AFTER UPDATE ON documents BEGIN
|
|
INSERT INTO documents_fts(documents_fts, id, title, ocr_text, vendor, category)
|
|
VALUES ('delete', old.id, old.title, old.ocr_text, old.vendor, old.category);
|
|
INSERT INTO documents_fts(id, title, ocr_text, vendor, category)
|
|
VALUES (new.id, new.title, new.ocr_text, new.vendor, new.category);
|
|
END;
|
|
|
|
CREATE TABLE IF NOT EXISTS settings (
|
|
key TEXT PRIMARY KEY,
|
|
value TEXT
|
|
);
|
|
`
|
|
_, err := db.Exec(schema)
|
|
return err
|
|
}
|
|
|
|
func (db *DB) InsertDocument(doc *Document) error {
|
|
var dateStr *string
|
|
if doc.Date != nil {
|
|
s := doc.Date.Format("2006-01-02")
|
|
dateStr = &s
|
|
}
|
|
|
|
_, err := db.Exec(`
|
|
INSERT INTO documents (
|
|
id, filename, original_name, category, subcategory, title, date,
|
|
vendor, amount, currency, tax_deductible, ocr_text, metadata,
|
|
embedding, storage_path, markdown_path, page_count, file_size,
|
|
mime_type, checksum, processed_at, created_at, updated_at
|
|
) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
|
|
`, doc.ID, doc.Filename, doc.OriginalName, doc.Category, doc.Subcategory,
|
|
doc.Title, dateStr, doc.Vendor, doc.Amount, doc.Currency, doc.TaxDeductible,
|
|
doc.OCRText, doc.Metadata, doc.Embedding, doc.StoragePath, doc.MarkdownPath,
|
|
doc.PageCount, doc.FileSize, doc.MimeType, doc.Checksum,
|
|
doc.ProcessedAt.Format(time.RFC3339),
|
|
doc.CreatedAt.Format(time.RFC3339),
|
|
doc.UpdatedAt.Format(time.RFC3339))
|
|
return err
|
|
}
|
|
|
|
func (db *DB) UpdateDocument(doc *Document) error {
|
|
var dateStr *string
|
|
if doc.Date != nil {
|
|
s := doc.Date.Format("2006-01-02")
|
|
dateStr = &s
|
|
}
|
|
|
|
_, err := db.Exec(`
|
|
UPDATE documents SET
|
|
category = ?, subcategory = ?, title = ?, date = ?,
|
|
vendor = ?, amount = ?, currency = ?, tax_deductible = ?,
|
|
metadata = ?, updated_at = ?
|
|
WHERE id = ?
|
|
`, doc.Category, doc.Subcategory, doc.Title, dateStr,
|
|
doc.Vendor, doc.Amount, doc.Currency, doc.TaxDeductible,
|
|
doc.Metadata, time.Now().Format(time.RFC3339), doc.ID)
|
|
return err
|
|
}
|
|
|
|
func (db *DB) GetDocument(id string) (*Document, error) {
|
|
row := db.QueryRow(`SELECT * FROM documents WHERE id = ?`, id)
|
|
return scanDocument(row)
|
|
}
|
|
|
|
func (db *DB) DeleteDocument(id string) error {
|
|
_, err := db.Exec(`DELETE FROM documents WHERE id = ?`, id)
|
|
return err
|
|
}
|
|
|
|
func (db *DB) ListDocuments(category string, limit, offset int) ([]*Document, error) {
|
|
var rows *sql.Rows
|
|
var err error
|
|
|
|
if category != "" {
|
|
rows, err = db.Query(`
|
|
SELECT * FROM documents
|
|
WHERE category = ?
|
|
ORDER BY COALESCE(date, created_at) DESC
|
|
LIMIT ? OFFSET ?
|
|
`, category, limit, offset)
|
|
} else {
|
|
rows, err = db.Query(`
|
|
SELECT * FROM documents
|
|
ORDER BY COALESCE(date, created_at) DESC
|
|
LIMIT ? OFFSET ?
|
|
`, limit, offset)
|
|
}
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
defer rows.Close()
|
|
|
|
return scanDocuments(rows)
|
|
}
|
|
|
|
func (db *DB) RecentDocuments(limit int) ([]*Document, error) {
|
|
rows, err := db.Query(`
|
|
SELECT * FROM documents
|
|
ORDER BY created_at DESC
|
|
LIMIT ?
|
|
`, limit)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
defer rows.Close()
|
|
|
|
return scanDocuments(rows)
|
|
}
|
|
|
|
func (db *DB) SearchFTS(query string, limit int) ([]*SearchResult, error) {
|
|
rows, err := db.Query(`
|
|
SELECT d.*,
|
|
bm25(documents_fts) as score,
|
|
snippet(documents_fts, 2, '<mark>', '</mark>', '...', 32) as snippet
|
|
FROM documents_fts f
|
|
JOIN documents d ON f.id = d.id
|
|
WHERE documents_fts MATCH ?
|
|
ORDER BY bm25(documents_fts)
|
|
LIMIT ?
|
|
`, query, limit)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
defer rows.Close()
|
|
|
|
var results []*SearchResult
|
|
for rows.Next() {
|
|
var doc Document
|
|
var dateStr, processedStr, createdStr, updatedStr sql.NullString
|
|
var score float64
|
|
var snippet string
|
|
|
|
err := rows.Scan(
|
|
&doc.ID, &doc.Filename, &doc.OriginalName, &doc.Category, &doc.Subcategory,
|
|
&doc.Title, &dateStr, &doc.Vendor, &doc.Amount, &doc.Currency, &doc.TaxDeductible,
|
|
&doc.OCRText, &doc.Metadata, &doc.Embedding, &doc.StoragePath, &doc.MarkdownPath,
|
|
&doc.PageCount, &doc.FileSize, &doc.MimeType, &doc.Checksum,
|
|
&processedStr, &createdStr, &updatedStr, &score, &snippet,
|
|
)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
|
|
if dateStr.Valid {
|
|
t, _ := time.Parse("2006-01-02", dateStr.String)
|
|
doc.Date = &t
|
|
}
|
|
if createdStr.Valid {
|
|
t, _ := time.Parse(time.RFC3339, createdStr.String)
|
|
doc.CreatedAt = t
|
|
}
|
|
|
|
results = append(results, &SearchResult{Document: doc, Score: score, Snippet: snippet})
|
|
}
|
|
|
|
return results, nil
|
|
}
|
|
|
|
func (db *DB) GetStats() (map[string]interface{}, error) {
|
|
stats := make(map[string]interface{})
|
|
|
|
// Total documents
|
|
var total int
|
|
db.QueryRow(`SELECT COUNT(*) FROM documents`).Scan(&total)
|
|
stats["total"] = total
|
|
|
|
// This month
|
|
var thisMonth int
|
|
db.QueryRow(`SELECT COUNT(*) FROM documents WHERE created_at >= date('now', 'start of month')`).Scan(&thisMonth)
|
|
stats["this_month"] = thisMonth
|
|
|
|
// Total size
|
|
var totalSize int64
|
|
db.QueryRow(`SELECT COALESCE(SUM(file_size), 0) FROM documents`).Scan(&totalSize)
|
|
stats["total_size"] = totalSize
|
|
|
|
// By category
|
|
rows, err := db.Query(`SELECT category, COUNT(*) FROM documents GROUP BY category`)
|
|
if err == nil {
|
|
categories := make(map[string]int)
|
|
for rows.Next() {
|
|
var cat string
|
|
var count int
|
|
rows.Scan(&cat, &count)
|
|
categories[cat] = count
|
|
}
|
|
rows.Close()
|
|
stats["by_category"] = categories
|
|
}
|
|
|
|
return stats, nil
|
|
}
|
|
|
|
func (db *DB) GetExpenses(year int, month int) ([]*Document, error) {
|
|
query := `
|
|
SELECT * FROM documents
|
|
WHERE category = 'expenses' AND amount IS NOT NULL
|
|
`
|
|
args := []interface{}{}
|
|
|
|
if year > 0 {
|
|
query += ` AND strftime('%Y', date) = ?`
|
|
args = append(args, fmt.Sprintf("%04d", year))
|
|
}
|
|
if month > 0 {
|
|
query += ` AND strftime('%m', date) = ?`
|
|
args = append(args, fmt.Sprintf("%02d", month))
|
|
}
|
|
|
|
query += ` ORDER BY date DESC`
|
|
|
|
rows, err := db.Query(query, args...)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
defer rows.Close()
|
|
|
|
return scanDocuments(rows)
|
|
}
|
|
|
|
func scanDocument(row *sql.Row) (*Document, error) {
|
|
var doc Document
|
|
var dateStr, processedStr, createdStr, updatedStr sql.NullString
|
|
|
|
err := row.Scan(
|
|
&doc.ID, &doc.Filename, &doc.OriginalName, &doc.Category, &doc.Subcategory,
|
|
&doc.Title, &dateStr, &doc.Vendor, &doc.Amount, &doc.Currency, &doc.TaxDeductible,
|
|
&doc.OCRText, &doc.Metadata, &doc.Embedding, &doc.StoragePath, &doc.MarkdownPath,
|
|
&doc.PageCount, &doc.FileSize, &doc.MimeType, &doc.Checksum,
|
|
&processedStr, &createdStr, &updatedStr,
|
|
)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
|
|
if dateStr.Valid {
|
|
t, _ := time.Parse("2006-01-02", dateStr.String)
|
|
doc.Date = &t
|
|
}
|
|
if processedStr.Valid {
|
|
t, _ := time.Parse(time.RFC3339, processedStr.String)
|
|
doc.ProcessedAt = t
|
|
}
|
|
if createdStr.Valid {
|
|
t, _ := time.Parse(time.RFC3339, createdStr.String)
|
|
doc.CreatedAt = t
|
|
}
|
|
if updatedStr.Valid {
|
|
t, _ := time.Parse(time.RFC3339, updatedStr.String)
|
|
doc.UpdatedAt = t
|
|
}
|
|
|
|
return &doc, nil
|
|
}
|
|
|
|
func scanDocuments(rows *sql.Rows) ([]*Document, error) {
|
|
var docs []*Document
|
|
for rows.Next() {
|
|
var doc Document
|
|
var dateStr, processedStr, createdStr, updatedStr sql.NullString
|
|
|
|
err := rows.Scan(
|
|
&doc.ID, &doc.Filename, &doc.OriginalName, &doc.Category, &doc.Subcategory,
|
|
&doc.Title, &dateStr, &doc.Vendor, &doc.Amount, &doc.Currency, &doc.TaxDeductible,
|
|
&doc.OCRText, &doc.Metadata, &doc.Embedding, &doc.StoragePath, &doc.MarkdownPath,
|
|
&doc.PageCount, &doc.FileSize, &doc.MimeType, &doc.Checksum,
|
|
&processedStr, &createdStr, &updatedStr,
|
|
)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
|
|
if dateStr.Valid {
|
|
t, _ := time.Parse("2006-01-02", dateStr.String)
|
|
doc.Date = &t
|
|
}
|
|
if processedStr.Valid {
|
|
t, _ := time.Parse(time.RFC3339, processedStr.String)
|
|
doc.ProcessedAt = t
|
|
}
|
|
if createdStr.Valid {
|
|
t, _ := time.Parse(time.RFC3339, createdStr.String)
|
|
doc.CreatedAt = t
|
|
}
|
|
if updatedStr.Valid {
|
|
t, _ := time.Parse(time.RFC3339, updatedStr.String)
|
|
doc.UpdatedAt = t
|
|
}
|
|
|
|
docs = append(docs, &doc)
|
|
}
|
|
return docs, nil
|
|
}
|