// docsys/db.go — SQLite persistence layer for the document system.
package main
import (
"crypto/rand"
"database/sql"
"encoding/hex"
"encoding/json"
"fmt"
"math"
"os"
"path/filepath"
"regexp"
"sort"
"strings"
_ "github.com/mattn/go-sqlite3"
)
// db is the package-wide SQLite handle; set by InitDB, released by CloseDB.
var db *sql.DB
// Document represents a single stored document record.
type Document struct {
	ID           string            // unique document identifier (primary key)
	Title        string
	Category     string
	Type         string
	Date         string // document date as stored; format is not enforced here
	Amount       string
	Vendor       string
	Summary      string
	FullText     string // extracted text; feeds the FTS index and LIKE fallback search
	PDFPath      string // path to the stored PDF file
	RecordPath   string // path to the markdown record file this row was parsed from
	ProcessedAt  string
	OriginalFile string
	Notes        string
	Metadata     map[string]string // free-form key/value pairs, persisted as a JSON object
	Status       string            // "processing", "ready", "error"
	Score        float64           `json:",omitempty"` // semantic search relevance 0-1
}
// Share represents a document share link.
type Share struct {
	Token     string // random hex token; stored as the shares.id column
	DocID     string
	CreatedAt string
	ExpiresAt string // empty string when the share never expires
}
// DocumentUpdate contains the user-editable fields accepted by
// UpdateDocument. All three fields are written unconditionally.
type DocumentUpdate struct {
	Title    string
	Category string
	Notes    string
}
// Stats contains dashboard statistics.
type Stats struct {
	TotalDocs     int            // total number of documents
	RecentDocs    int            // documents processed within the last 7 days
	ByCategory    map[string]int // document count per category
	RecentUploads []Document     // most recently processed documents (up to 5)
}
// InitDB opens (or creates) the SQLite database at dbPath, verifies the
// connection, and ensures the schema exists. The _fk=1 option enables
// foreign-key enforcement so share rows cascade when a document is deleted.
func InitDB(dbPath string) error {
	var err error
	db, err = sql.Open("sqlite3", dbPath+"?_fk=1")
	if err != nil {
		return fmt.Errorf("failed to open database: %w", err)
	}
	// sql.Open is lazy and does not touch the file; ping so a bad path or
	// permission problem surfaces here instead of on the first query.
	if err := db.Ping(); err != nil {
		return fmt.Errorf("failed to connect to database: %w", err)
	}
	return initSchema()
}
// CloseDB releases the database handle if one was opened; it is a no-op
// when InitDB was never called.
func CloseDB() error {
	if db == nil {
		return nil
	}
	return db.Close()
}
// initSchema creates the tables and indexes used by the application, then
// rebuilds the full-text index. Safe to run on every startup: the base
// tables use IF NOT EXISTS, while the FTS table is dropped and recreated so
// that changes to its column list take effect, then repopulated below.
func initSchema() error {
	schema := `
CREATE TABLE IF NOT EXISTS documents (
id TEXT PRIMARY KEY,
title TEXT,
category TEXT,
type TEXT,
date TEXT,
amount TEXT,
vendor TEXT,
summary TEXT,
full_text TEXT,
pdf_path TEXT,
record_path TEXT,
processed_at TEXT,
original_file TEXT,
notes TEXT,
metadata TEXT,
status TEXT DEFAULT 'ready',
created_at DATETIME DEFAULT CURRENT_TIMESTAMP,
updated_at DATETIME DEFAULT CURRENT_TIMESTAMP
);
CREATE INDEX IF NOT EXISTS idx_category ON documents(category);
CREATE INDEX IF NOT EXISTS idx_date ON documents(date);
CREATE INDEX IF NOT EXISTS idx_type ON documents(type);
CREATE INDEX IF NOT EXISTS idx_processed_at ON documents(processed_at);
CREATE TABLE IF NOT EXISTS embeddings (
doc_id TEXT PRIMARY KEY,
embedding BLOB,
created_at DATETIME DEFAULT CURRENT_TIMESTAMP
);
CREATE TABLE IF NOT EXISTS shares (
id TEXT PRIMARY KEY,
doc_id TEXT NOT NULL,
created_at DATETIME DEFAULT CURRENT_TIMESTAMP,
expires_at DATETIME,
FOREIGN KEY (doc_id) REFERENCES documents(id) ON DELETE CASCADE
);
CREATE INDEX IF NOT EXISTS idx_shares_doc_id ON shares(doc_id);
DROP TABLE IF EXISTS documents_fts;
CREATE VIRTUAL TABLE documents_fts USING fts5(
id UNINDEXED, title, summary, full_text, vendor
);
`
	if _, err := db.Exec(schema); err != nil {
		return err
	}
	// Rebuild FTS index from existing documents (the virtual table above
	// starts empty after the DROP/CREATE).
	return rebuildFTS()
}
// rebuildFTS repopulates the full-text index from every document whose
// status is 'ready'. The index is cleared first so stale rows do not linger.
func rebuildFTS() error {
	// Fix: the DELETE error was previously discarded; a failed clear could
	// leave duplicate FTS rows behind.
	if _, err := db.Exec(`DELETE FROM documents_fts`); err != nil {
		return fmt.Errorf("clearing FTS index: %w", err)
	}
	_, err := db.Exec(`
INSERT INTO documents_fts(id, title, summary, full_text, vendor)
SELECT id, COALESCE(title,''), COALESCE(summary,''), COALESCE(full_text,''), COALESCE(vendor,'')
FROM documents WHERE status = 'ready'
`)
	if err != nil {
		return fmt.Errorf("rebuilding FTS index: %w", err)
	}
	return nil
}
// syncFTS refreshes the full-text index entry for one document by deleting
// any existing row and inserting the current field values. Errors are
// deliberately ignored: the FTS index is secondary data and a failed sync
// must not block the primary write.
func syncFTS(doc *Document) {
	_, _ = db.Exec(`DELETE FROM documents_fts WHERE id = ?`, doc.ID)
	_, _ = db.Exec(`INSERT INTO documents_fts(id, title, summary, full_text, vendor) VALUES (?, ?, ?, ?, ?)`,
		doc.ID, doc.Title, doc.Summary, doc.FullText, doc.Vendor)
}
// deleteFTS removes a document's row from the full-text index (best effort;
// the error is intentionally discarded).
func deleteFTS(id string) {
	_, _ = db.Exec(`DELETE FROM documents_fts WHERE id = ?`, id)
}
// InsertDocument writes a document row, replacing any existing row with the
// same ID, and keeps the full-text index in sync. An empty Status defaults
// to "ready".
func InsertDocument(doc *Document) error {
	// Marshalling a map[string]string cannot fail.
	metaJSON, _ := json.Marshal(doc.Metadata)
	status := doc.Status
	if status == "" {
		status = "ready"
	}
	_, err := db.Exec(`
INSERT OR REPLACE INTO documents
(id, title, category, type, date, amount, vendor, summary, full_text,
pdf_path, record_path, processed_at, original_file, notes, metadata, status)
VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
`, doc.ID, doc.Title, doc.Category, doc.Type, doc.Date, doc.Amount,
		doc.Vendor, doc.Summary, doc.FullText, doc.PDFPath, doc.RecordPath,
		doc.ProcessedAt, doc.OriginalFile, doc.Notes, string(metaJSON), status)
	if err != nil {
		return err
	}
	syncFTS(doc)
	return nil
}
// InsertPendingDocument creates a placeholder row while a document is being
// processed. INSERT OR IGNORE means a duplicate upload (existing ID) is a
// silent no-op rather than an error.
func InsertPendingDocument(id, originalFile string) error {
	placeholder := "Processing: " + originalFile
	_, err := db.Exec(`
INSERT OR IGNORE INTO documents (id, title, original_file, status, processed_at)
VALUES (?, ?, ?, 'processing', datetime('now'))
`, id, placeholder, originalFile)
	return err
}
// UpdateDocumentStatus sets the status column ("processing", "ready",
// "error") for the given document ID.
func UpdateDocumentStatus(id, status string) error {
	const q = `UPDATE documents SET status = ? WHERE id = ?`
	_, err := db.Exec(q, status, id)
	return err
}
// StoreEmbedding saves an embedding vector for a document, overwriting any
// previous vector. Each float32 is serialized as 4 little-endian bytes,
// matching the decoder in SemanticSearch.
func StoreEmbedding(docID string, embedding []float32) error {
	buf := make([]byte, 0, len(embedding)*4)
	for _, v := range embedding {
		bits := math.Float32bits(v)
		buf = append(buf, byte(bits), byte(bits>>8), byte(bits>>16), byte(bits>>24))
	}
	_, err := db.Exec(`INSERT OR REPLACE INTO embeddings (doc_id, embedding) VALUES (?, ?)`, docID, buf)
	return err
}
// SemanticSearch ranks every stored embedding by cosine similarity to
// queryEmb and returns up to limit documents scoring at least 0.3, best
// match first, with Document.Score populated.
func SemanticSearch(queryEmb []float32, limit int) ([]Document, error) {
	rows, err := db.Query(`SELECT doc_id, embedding FROM embeddings`)
	if err != nil {
		return nil, err
	}
	defer rows.Close()
	type scored struct {
		id    string
		score float64
	}
	var results []scored
	for rows.Next() {
		var docID string
		var blob []byte
		if err := rows.Scan(&docID, &blob); err != nil {
			continue // skip unreadable rows, keep the rest
		}
		// Skip embeddings stored with a different dimensionality.
		if len(blob) != len(queryEmb)*4 {
			continue
		}
		// Decode 4 little-endian bytes per float32 (inverse of StoreEmbedding).
		docEmb := make([]float32, len(queryEmb))
		for i := range docEmb {
			bits := uint32(blob[i*4]) | uint32(blob[i*4+1])<<8 | uint32(blob[i*4+2])<<16 | uint32(blob[i*4+3])<<24
			docEmb[i] = math.Float32frombits(bits)
		}
		results = append(results, scored{id: docID, score: cosineSim(queryEmb, docEmb)})
	}
	// Fix: iteration errors were previously dropped, silently returning a
	// partial result set.
	if err := rows.Err(); err != nil {
		return nil, err
	}
	// Highest similarity first.
	sort.Slice(results, func(i, j int) bool { return results[i].score > results[j].score })
	if len(results) > limit {
		results = results[:limit]
	}
	var docs []Document
	for _, r := range results {
		if r.score < 0.3 { // minimum relevance threshold
			continue
		}
		if doc, err := GetDocument(r.id); err == nil {
			doc.Score = r.score
			docs = append(docs, *doc)
		}
	}
	return docs, nil
}
// cosineSim returns the cosine similarity of two equal-length vectors in
// [-1, 1]. A zero vector yields 0 to avoid division by zero. The caller is
// responsible for ensuring len(b) >= len(a).
func cosineSim(a, b []float32) float64 {
	var dot, magA, magB float64
	for i, av := range a {
		x, y := float64(av), float64(b[i])
		dot += x * y
		magA += x * x
		magB += y * y
	}
	if magA == 0 || magB == 0 {
		return 0
	}
	return dot / (math.Sqrt(magA) * math.Sqrt(magB))
}
// GetDocument retrieves a single document by ID, with NULL columns
// normalized to empty strings. Returns sql.ErrNoRows when no document has
// that ID. Scan targets must stay in the same order as the SELECT columns.
func GetDocument(id string) (*Document, error) {
	doc := &Document{Metadata: make(map[string]string)}
	var metaJSON sql.NullString
	var status sql.NullString
	err := db.QueryRow(`
SELECT id, COALESCE(title,''), COALESCE(category,''), COALESCE(type,''),
COALESCE(date,''), COALESCE(amount,''), COALESCE(vendor,''),
COALESCE(summary,''), COALESCE(full_text,''),
COALESCE(pdf_path,''), COALESCE(record_path,''), COALESCE(processed_at,''),
COALESCE(original_file,''),
COALESCE(notes, ''), COALESCE(metadata, '{}'), COALESCE(status, 'ready')
FROM documents WHERE id = ?
`, id).Scan(
		&doc.ID, &doc.Title, &doc.Category, &doc.Type, &doc.Date,
		&doc.Amount, &doc.Vendor, &doc.Summary, &doc.FullText,
		&doc.PDFPath, &doc.RecordPath, &doc.ProcessedAt, &doc.OriginalFile,
		&doc.Notes, &metaJSON, &status,
	)
	if err != nil {
		return nil, err
	}
	// Metadata is stored as a JSON object; a parse failure (unexpected in
	// practice) simply leaves the map empty.
	if metaJSON.Valid {
		json.Unmarshal([]byte(metaJSON.String), &doc.Metadata)
	}
	doc.Status = status.String
	return doc, nil
}
// GetDocumentsByCategory retrieves all documents in a category, most
// recently processed first.
func GetDocumentsByCategory(category string) ([]Document, error) {
	const clause = "WHERE category = ? ORDER BY processed_at DESC"
	return queryDocuments(clause, category)
}
// GetRecentDocuments retrieves the limit most recently processed documents.
func GetRecentDocuments(limit int) ([]Document, error) {
	// Fix: bind the limit as a SQL parameter instead of fmt.Sprintf-ing it
	// into the query string.
	return queryDocuments("ORDER BY processed_at DESC LIMIT ?", limit)
}
// GetAllDocuments retrieves every document, most recently processed first.
func GetAllDocuments() ([]Document, error) {
	const clause = "ORDER BY processed_at DESC"
	return queryDocuments(clause)
}
// SearchDocuments performs full-text search against the FTS5 index.
// query is interpreted as FTS5 MATCH syntax, so malformed user input
// (unbalanced quotes, bare operators) can make the query itself error —
// callers may fall back to SearchDocumentsFallback in that case. Results
// are ordered by FTS rank (most relevant first). full_text is indexed and
// searched but not returned in the result rows.
func SearchDocuments(query string, limit int) ([]Document, error) {
	if limit <= 0 {
		limit = 50 // default page size
	}
	rows, err := db.Query(`
SELECT d.id, COALESCE(d.title,''), COALESCE(d.category,''), COALESCE(d.type,''),
COALESCE(d.date,''), COALESCE(d.amount,''), COALESCE(d.vendor,''),
COALESCE(d.summary,''), COALESCE(d.pdf_path,''), COALESCE(d.processed_at,''),
COALESCE(d.original_file,''), COALESCE(d.status,'ready')
FROM documents d
JOIN documents_fts fts ON d.id = fts.id
WHERE documents_fts MATCH ?
ORDER BY rank
LIMIT ?
`, query, limit)
	if err != nil {
		return nil, err
	}
	defer rows.Close()
	return scanDocumentRows(rows)
}
// SearchDocumentsFallback performs a simple LIKE-based substring search over
// title, summary, vendor, and full text — used when the FTS query fails
// (e.g. on FTS5 syntax errors in user input).
func SearchDocumentsFallback(query string, limit int) ([]Document, error) {
	if limit <= 0 {
		limit = 50
	}
	// Fix: escape LIKE wildcards so a query containing % or _ matches those
	// characters literally instead of acting as a pattern.
	escaped := strings.NewReplacer(`\`, `\\`, `%`, `\%`, `_`, `\_`).Replace(query)
	pattern := "%" + escaped + "%"
	rows, err := db.Query(`
SELECT id, COALESCE(title,''), COALESCE(category,''), COALESCE(type,''),
COALESCE(date,''), COALESCE(amount,''), COALESCE(vendor,''),
COALESCE(summary,''), COALESCE(pdf_path,''), COALESCE(processed_at,''),
COALESCE(original_file,''), COALESCE(status,'ready')
FROM documents
WHERE title LIKE ? ESCAPE '\' OR summary LIKE ? ESCAPE '\' OR vendor LIKE ? ESCAPE '\' OR full_text LIKE ? ESCAPE '\'
ORDER BY processed_at DESC
LIMIT ?
`, pattern, pattern, pattern, pattern, limit)
	if err != nil {
		return nil, err
	}
	defer rows.Close()
	return scanDocumentRows(rows)
}
// UpdateDocument overwrites the user-editable fields (title, category,
// notes) of a document and bumps updated_at.
func UpdateDocument(id string, update DocumentUpdate) error {
	args := []interface{}{update.Title, update.Category, update.Notes, id}
	_, err := db.Exec(`
UPDATE documents
SET title = ?, category = ?, notes = ?, updated_at = CURRENT_TIMESTAMP
WHERE id = ?
`, args...)
	return err
}
// UpdateDocumentRecordPath records the new markdown-record location after
// the file has been moved on disk.
func UpdateDocumentRecordPath(id, newPath string) error {
	const q = `UPDATE documents SET record_path = ? WHERE id = ?`
	_, err := db.Exec(q, newPath, id)
	return err
}
// UpdateDocumentMetadata replaces the stored metadata JSON for a document.
func UpdateDocumentMetadata(id string, metadata map[string]string) error {
	// Marshalling a map[string]string cannot fail.
	metaJSON, _ := json.Marshal(metadata)
	const q = `UPDATE documents SET metadata = ? WHERE id = ?`
	_, err := db.Exec(q, string(metaJSON), id)
	return err
}
// DeleteDocument removes a document along with its full-text index entry
// and embedding. Share rows are removed by the shares table's
// ON DELETE CASCADE foreign key.
func DeleteDocument(id string) error {
	deleteFTS(id)
	// Best effort: an orphaned embedding is harmless if this fails.
	_, _ = db.Exec(`DELETE FROM embeddings WHERE doc_id = ?`, id)
	_, err := db.Exec(`DELETE FROM documents WHERE id = ?`, id)
	return err
}
// CreateShare creates a share link for a document, returns the token
func CreateShare(docID string, expiryDays int) (string, error) {
b := make([]byte, 6)
if _, err := rand.Read(b); err != nil {
return "", err
}
token := hex.EncodeToString(b)
var expiresAt sql.NullString
if expiryDays > 0 {
expiresAt = sql.NullString{
String: fmt.Sprintf("datetime('now', '+%d days')", expiryDays),
Valid: true,
}
_, err := db.Exec(`INSERT INTO shares (id, doc_id, expires_at) VALUES (?, ?, datetime('now', ?))`,
token, docID, fmt.Sprintf("+%d days", expiryDays))
return token, err
}
_ = expiresAt
_, err := db.Exec(`INSERT INTO shares (id, doc_id) VALUES (?, ?)`, token, docID)
return token, err
}
// GetShare resolves a share token to its document. Returns (nil, nil) when
// the share exists but has expired, and an error for unknown tokens
// (sql.ErrNoRows) or database failures.
func GetShare(token string) (*Document, error) {
	var docID string
	var expiresAt sql.NullString
	err := db.QueryRow(`SELECT doc_id, expires_at FROM shares WHERE id = ?`, token).Scan(&docID, &expiresAt)
	if err != nil {
		return nil, err
	}
	if expiresAt.Valid {
		// Let SQLite compare the timestamps so the comparison matches the
		// format it stored.
		var expired bool
		// Fix: this scan error was previously ignored, so a failed expiry
		// check would serve an expired share as if it were valid.
		if err := db.QueryRow(`SELECT datetime(?) < datetime('now')`, expiresAt.String).Scan(&expired); err != nil {
			return nil, err
		}
		if expired {
			return nil, nil
		}
	}
	return GetDocument(docID)
}
// DeleteShare removes a share link by its token.
func DeleteShare(token string) error {
	const q = `DELETE FROM shares WHERE id = ?`
	_, err := db.Exec(q, token)
	return err
}
// GetSharesByDocument returns the unexpired shares for a document. A NULL
// expires_at (never expires) is reported as an empty string.
func GetSharesByDocument(docID string) ([]Share, error) {
	rows, err := db.Query(`SELECT id, doc_id, created_at, COALESCE(expires_at, '') FROM shares WHERE doc_id = ? AND (expires_at IS NULL OR datetime(expires_at) > datetime('now'))`, docID)
	if err != nil {
		return nil, err
	}
	defer rows.Close()
	var shares []Share
	for rows.Next() {
		var s Share
		if err := rows.Scan(&s.Token, &s.DocID, &s.CreatedAt, &s.ExpiresAt); err != nil {
			continue // skip malformed rows, keep the rest
		}
		shares = append(shares, s)
	}
	// Fix: rows.Err() was previously dropped, hiding iteration failures
	// behind a silently truncated list.
	return shares, rows.Err()
}
// UpsertDocument inserts or updates a document by ID and refreshes its
// full-text index entry. On conflict, notes and status are intentionally
// NOT overwritten so user edits and processing state survive a reindex.
func UpsertDocument(doc *Document) error {
	// Marshalling a map[string]string cannot fail.
	metaJSON, _ := json.Marshal(doc.Metadata)
	_, err := db.Exec(`
INSERT INTO documents (
id, title, category, type, date, amount, vendor, summary, full_text,
pdf_path, record_path, processed_at, original_file, notes, metadata, updated_at
) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, CURRENT_TIMESTAMP)
ON CONFLICT(id) DO UPDATE SET
title = excluded.title,
category = excluded.category,
type = excluded.type,
date = excluded.date,
amount = excluded.amount,
vendor = excluded.vendor,
summary = excluded.summary,
full_text = excluded.full_text,
pdf_path = excluded.pdf_path,
record_path = excluded.record_path,
processed_at = excluded.processed_at,
original_file = excluded.original_file,
metadata = excluded.metadata,
updated_at = CURRENT_TIMESTAMP
`, doc.ID, doc.Title, doc.Category, doc.Type, doc.Date, doc.Amount,
		doc.Vendor, doc.Summary, doc.FullText, doc.PDFPath, doc.RecordPath,
		doc.ProcessedAt, doc.OriginalFile, doc.Notes, string(metaJSON))
	if err != nil {
		return err
	}
	// Fix: keep the FTS index in sync (matching InsertDocument). Without
	// this, documents written during a reindex were not findable via
	// full-text search until the next restart rebuilt the index.
	syncFTS(doc)
	return nil
}
// GetStats returns dashboard statistics. Each sub-query is best-effort: a
// failing query simply leaves its field at the zero value rather than
// aborting the whole dashboard, which is why Scan errors are ignored.
func GetStats() (*Stats, error) {
	stats := &Stats{
		ByCategory: make(map[string]int),
	}
	// Total count
	db.QueryRow("SELECT COUNT(*) FROM documents").Scan(&stats.TotalDocs)
	// Recent (last 7 days)
	db.QueryRow(`
SELECT COUNT(*) FROM documents
WHERE datetime(processed_at) > datetime('now', '-7 days')
`).Scan(&stats.RecentDocs)
	// By category
	rows, err := db.Query("SELECT category, COUNT(*) FROM documents GROUP BY category")
	if err == nil {
		defer rows.Close()
		for rows.Next() {
			var cat string
			var count int
			if rows.Scan(&cat, &count) == nil {
				stats.ByCategory[cat] = count
			}
		}
	}
	// Recent uploads (error ignored; an empty list is acceptable here)
	stats.RecentUploads, _ = GetRecentDocuments(5)
	return stats, nil
}
// GetCategoryStats returns the document count for each requested category;
// categories with no documents (or a failed count) map to 0.
func GetCategoryStats(categories []string) map[string]int {
	stats := make(map[string]int, len(categories))
	for _, cat := range categories {
		var n int
		// Best effort: a failed count simply reports 0 for that category.
		_ = db.QueryRow("SELECT COUNT(*) FROM documents WHERE category = ?", cat).Scan(&n)
		stats[cat] = n
	}
	return stats
}
// ClearAllDocuments removes every document plus its derived data so a
// reindex starts from a clean slate. Shares are removed automatically by
// the documents foreign-key cascade.
func ClearAllDocuments() error {
	// Fix: documents_fts and embeddings have no foreign key to documents,
	// so deleting only from documents left stale FTS rows (ghost search
	// hits) and orphaned embeddings behind.
	if _, err := db.Exec("DELETE FROM documents_fts"); err != nil {
		return err
	}
	if _, err := db.Exec("DELETE FROM embeddings"); err != nil {
		return err
	}
	_, err := db.Exec("DELETE FROM documents")
	return err
}
// IndexDocumentsFromDirectory scans each category folder under recordsDir
// for markdown record files and upserts each one into the database. It is a
// best-effort bulk import: unreadable patterns and malformed files are
// skipped, and the function always returns nil.
func IndexDocumentsFromDirectory(recordsDir, storeDir string, categories []string) error {
	for _, cat := range categories {
		pattern := filepath.Join(recordsDir, cat, "*.md")
		files, err := filepath.Glob(pattern)
		if err != nil {
			continue // malformed glob pattern; skip this category
		}
		for _, path := range files {
			if doc, perr := parseMarkdownRecord(path, cat, storeDir); perr == nil {
				UpsertDocument(doc)
			}
		}
	}
	return nil
}
// Metadata-extraction patterns for markdown record files. Fix: these were
// previously compiled on every call, inside the bulk-reindex loop; compile
// once at package init instead.
var (
	recTitleRe  = regexp.MustCompile(`^#\s+(.+)`)
	recIDRe     = regexp.MustCompile(`\*\*ID:\*\*\s*(.+)`)
	recFileRe   = regexp.MustCompile(`\*\*Original File:\*\*\s*(.+)`)
	recProcRe   = regexp.MustCompile(`\*\*Processed:\*\*\s*(.+)`)
	recTypeRe   = regexp.MustCompile(`\*\*Type:\*\*\s*(.+)`)
	recDateRe   = regexp.MustCompile(`\|\s*Date\s*\|\s*(.+?)\s*\|`)
	recVendorRe = regexp.MustCompile(`\|\s*Vendor\s*\|\s*(.+?)\s*\|`)
	recAmountRe = regexp.MustCompile(`\|\s*Amount\s*\|\s*(.+?)\s*\|`)
	recPDFRe    = regexp.MustCompile(`\*\*PDF:\*\*\s*\[.+?\]\((.+?)\)`)
)

// parseMarkdownRecord parses a markdown document record file into a Document.
//
// It extracts the H1 title (first line only), "**Key:** value" metadata
// lines, "| Key | value |" table rows, the PDF link target, and the bodies
// of the "## Summary" and "## Full Text" sections. The document ID defaults
// to the last underscore-separated token of the filename and is overridden
// by an explicit **ID:** line. Returns an error only if the file cannot be
// read.
func parseMarkdownRecord(path, category, storeDir string) (*Document, error) {
	content, err := os.ReadFile(path)
	if err != nil {
		return nil, err
	}
	doc := &Document{
		Category:   category,
		RecordPath: path,
		Metadata:   make(map[string]string),
	}
	lines := strings.Split(string(content), "\n")
	// Fallback ID from the filename: "<prefix>_<id>.md" -> "<id>".
	base := strings.TrimSuffix(filepath.Base(path), ".md")
	parts := strings.Split(base, "_")
	if len(parts) >= 2 {
		doc.ID = parts[len(parts)-1]
	} else {
		doc.ID = base
	}
	var inFullText, inSummary bool
	var fullTextLines, summaryLines []string
	for i, line := range lines {
		// The H1 title only counts when it is the very first line.
		if m := recTitleRe.FindStringSubmatch(line); m != nil && i == 0 {
			doc.Title = strings.TrimSpace(m[1])
		}
		if m := recIDRe.FindStringSubmatch(line); m != nil {
			doc.ID = strings.TrimSpace(m[1])
		}
		if m := recFileRe.FindStringSubmatch(line); m != nil {
			doc.OriginalFile = strings.TrimSpace(m[1])
		}
		if m := recProcRe.FindStringSubmatch(line); m != nil {
			doc.ProcessedAt = strings.TrimSpace(m[1])
		}
		if m := recTypeRe.FindStringSubmatch(line); m != nil {
			doc.Type = strings.TrimSpace(m[1])
		}
		if m := recDateRe.FindStringSubmatch(line); m != nil {
			doc.Date = strings.TrimSpace(m[1])
		}
		if m := recVendorRe.FindStringSubmatch(line); m != nil {
			doc.Vendor = strings.TrimSpace(m[1])
		}
		if m := recAmountRe.FindStringSubmatch(line); m != nil {
			doc.Amount = strings.TrimSpace(m[1])
		}
		if m := recPDFRe.FindStringSubmatch(line); m != nil {
			pdfPath := strings.TrimSpace(m[1])
			// Links pointing into the store are re-rooted onto storeDir.
			if strings.Contains(pdfPath, "store/") {
				doc.PDFPath = filepath.Join(storeDir, filepath.Base(pdfPath))
			} else {
				doc.PDFPath = pdfPath
			}
		}
		// Section detection: entering a section header switches modes; any
		// other "## " header ends both sections.
		if strings.HasPrefix(line, "## Full Text") {
			inFullText, inSummary = true, false
			continue
		}
		if strings.HasPrefix(line, "## Summary") {
			inSummary, inFullText = true, false
			continue
		}
		if strings.HasPrefix(line, "## ") {
			inFullText, inSummary = false, false
		}
		// Code-fence markers are not part of the full text.
		if inFullText && !strings.HasPrefix(line, "```") {
			fullTextLines = append(fullTextLines, line)
		}
		if inSummary {
			summaryLines = append(summaryLines, line)
		}
	}
	doc.FullText = strings.TrimSpace(strings.Join(fullTextLines, "\n"))
	doc.Summary = strings.TrimSpace(strings.Join(summaryLines, "\n"))
	// Title fallbacks: original filename, then ID.
	if doc.Title == "" {
		doc.Title = doc.OriginalFile
	}
	if doc.Title == "" {
		doc.Title = doc.ID
	}
	return doc, nil
}
// queryDocuments runs the standard document list SELECT with the given
// WHERE/ORDER/LIMIT clause appended, binding args as SQL parameters.
// full_text, notes, and metadata are deliberately not fetched — list views
// do not need them.
func queryDocuments(whereClause string, args ...interface{}) ([]Document, error) {
	const baseQuery = `
SELECT id, COALESCE(title,''), COALESCE(category,''), COALESCE(type,''),
COALESCE(date,''), COALESCE(amount,''), COALESCE(vendor,''),
COALESCE(summary,''), COALESCE(pdf_path,''), COALESCE(processed_at,''),
COALESCE(original_file,''), COALESCE(status, 'ready')
FROM documents `
	rows, err := db.Query(baseQuery+whereClause, args...)
	if err != nil {
		return nil, err
	}
	defer rows.Close()
	return scanDocumentRows(rows)
}
// scanDocumentRows collects Document values from rows produced by the
// standard document list SELECT (see queryDocuments for the column order).
// Rows that fail to scan are skipped; iteration errors are returned.
func scanDocumentRows(rows *sql.Rows) ([]Document, error) {
	var docs []Document
	for rows.Next() {
		var d Document
		if err := rows.Scan(
			&d.ID, &d.Title, &d.Category, &d.Type, &d.Date,
			&d.Amount, &d.Vendor, &d.Summary, &d.PDFPath,
			&d.ProcessedAt, &d.OriginalFile, &d.Status,
		); err != nil {
			continue
		}
		docs = append(docs, d)
	}
	return docs, rows.Err()
}