package main

import (
	"crypto/rand"
	"database/sql"
	"encoding/binary"
	"encoding/hex"
	"encoding/json"
	"fmt"
	"math"
	"os"
	"path/filepath"
	"regexp"
	"sort"
	"strings"

	_ "github.com/mattn/go-sqlite3"
)

// db is the shared package-level database handle, set by InitDB.
var db *sql.DB

// Document represents a document record.
type Document struct {
	ID           string
	Title        string
	Category     string
	Type         string
	Date         string
	Amount       string
	Vendor       string
	Summary      string
	FullText     string
	PDFPath      string
	RecordPath   string
	ProcessedAt  string
	OriginalFile string
	Notes        string
	Metadata     map[string]string
	Status       string  // "processing", "ready", "error"
	Score        float64 `json:",omitempty"` // semantic search relevance 0-1
}

// Share represents a document share link.
type Share struct {
	Token     string
	DocID     string
	CreatedAt string
	ExpiresAt string
}

// DocumentUpdate contains fields that can be updated.
type DocumentUpdate struct {
	Title    string
	Category string
	Notes    string
}

// Stats contains dashboard statistics.
type Stats struct {
	TotalDocs     int
	RecentDocs    int
	ByCategory    map[string]int
	RecentUploads []Document
}

// Metadata extraction patterns for parseMarkdownRecord, compiled once at
// package init rather than on every call.
var (
	titleRe  = regexp.MustCompile(`^#\s+(.+)`)
	idRe     = regexp.MustCompile(`\*\*ID:\*\*\s*(.+)`)
	fileRe   = regexp.MustCompile(`\*\*Original File:\*\*\s*(.+)`)
	procRe   = regexp.MustCompile(`\*\*Processed:\*\*\s*(.+)`)
	typeRe   = regexp.MustCompile(`\*\*Type:\*\*\s*(.+)`)
	dateRe   = regexp.MustCompile(`\|\s*Date\s*\|\s*(.+?)\s*\|`)
	vendorRe = regexp.MustCompile(`\|\s*Vendor\s*\|\s*(.+?)\s*\|`)
	amountRe = regexp.MustCompile(`\|\s*Amount\s*\|\s*(.+?)\s*\|`)
	pdfRe    = regexp.MustCompile(`\*\*PDF:\*\*\s*\[.+?\]\((.+?)\)`)
)

// InitDB initializes the database connection and schema.
// The "_fk=1" option enables SQLite foreign-key enforcement (required for
// the shares table's ON DELETE CASCADE).
func InitDB(dbPath string) error {
	var err error
	db, err = sql.Open("sqlite3", dbPath+"?_fk=1")
	if err != nil {
		return fmt.Errorf("open database: %w", err)
	}
	return initSchema()
}

// CloseDB closes the database connection.
func CloseDB() error {
	if db != nil {
		return db.Close()
	}
	return nil
}

// initSchema creates tables and indexes if missing. The FTS5 table is
// deliberately dropped and rebuilt from the documents table on every
// startup so the search index can never drift out of sync.
func initSchema() error {
	schema := `
	CREATE TABLE IF NOT EXISTS documents (
		id TEXT PRIMARY KEY,
		title TEXT,
		category TEXT,
		type TEXT,
		date TEXT,
		amount TEXT,
		vendor TEXT,
		summary TEXT,
		full_text TEXT,
		pdf_path TEXT,
		record_path TEXT,
		processed_at TEXT,
		original_file TEXT,
		notes TEXT,
		metadata TEXT,
		status TEXT DEFAULT 'ready',
		created_at DATETIME DEFAULT CURRENT_TIMESTAMP,
		updated_at DATETIME DEFAULT CURRENT_TIMESTAMP
	);
	CREATE INDEX IF NOT EXISTS idx_category ON documents(category);
	CREATE INDEX IF NOT EXISTS idx_date ON documents(date);
	CREATE INDEX IF NOT EXISTS idx_type ON documents(type);
	CREATE INDEX IF NOT EXISTS idx_processed_at ON documents(processed_at);
	CREATE TABLE IF NOT EXISTS embeddings (
		doc_id TEXT PRIMARY KEY,
		embedding BLOB,
		created_at DATETIME DEFAULT CURRENT_TIMESTAMP
	);
	CREATE TABLE IF NOT EXISTS shares (
		id TEXT PRIMARY KEY,
		doc_id TEXT NOT NULL,
		created_at DATETIME DEFAULT CURRENT_TIMESTAMP,
		expires_at DATETIME,
		FOREIGN KEY (doc_id) REFERENCES documents(id) ON DELETE CASCADE
	);
	CREATE INDEX IF NOT EXISTS idx_shares_doc_id ON shares(doc_id);
	DROP TABLE IF EXISTS documents_fts;
	CREATE VIRTUAL TABLE documents_fts USING fts5(
		id UNINDEXED,
		title,
		summary,
		full_text,
		vendor
	);
	`
	if _, err := db.Exec(schema); err != nil {
		return err
	}
	// Rebuild FTS index from existing documents
	return rebuildFTS()
}

// rebuildFTS repopulates the FTS index from all 'ready' documents.
func rebuildFTS() error {
	// Best effort: documents_fts was just recreated, so DELETE normally
	// clears nothing; errors here are non-fatal.
	db.Exec(`DELETE FROM documents_fts`)
	_, err := db.Exec(`
		INSERT INTO documents_fts(id, title, summary, full_text, vendor)
		SELECT id, COALESCE(title,''), COALESCE(summary,''), COALESCE(full_text,''), COALESCE(vendor,'')
		FROM documents WHERE status = 'ready'
	`)
	return err
}

// syncFTS replaces the FTS row for one document. Best effort: a search
// index miss is recoverable (rebuildFTS runs at startup), so errors are
// deliberately ignored.
func syncFTS(doc *Document) {
	db.Exec(`DELETE FROM documents_fts WHERE id = ?`, doc.ID)
	db.Exec(`INSERT INTO documents_fts(id, title, summary, full_text, vendor) VALUES (?, ?, ?, ?, ?)`,
		doc.ID, doc.Title, doc.Summary, doc.FullText, doc.Vendor)
}

// deleteFTS removes a document from the FTS index (best effort).
func deleteFTS(id string) {
	db.Exec(`DELETE FROM documents_fts WHERE id = ?`, id)
}

// InsertDocument adds a new document to the database, replacing any
// existing row with the same ID, and updates the FTS index.
func InsertDocument(doc *Document) error {
	// Marshal of map[string]string cannot fail.
	metaJSON, _ := json.Marshal(doc.Metadata)
	status := doc.Status
	if status == "" {
		status = "ready"
	}
	_, err := db.Exec(`
		INSERT OR REPLACE INTO documents
		(id, title, category, type, date, amount, vendor, summary, full_text,
		 pdf_path, record_path, processed_at, original_file, notes, metadata, status)
		VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
	`, doc.ID, doc.Title, doc.Category, doc.Type, doc.Date, doc.Amount, doc.Vendor,
		doc.Summary, doc.FullText, doc.PDFPath, doc.RecordPath, doc.ProcessedAt,
		doc.OriginalFile, doc.Notes, string(metaJSON), status)
	if err == nil {
		syncFTS(doc)
	}
	return err
}

// InsertPendingDocument creates a placeholder document while processing.
func InsertPendingDocument(id, originalFile string) error {
	// Use INSERT OR IGNORE to avoid conflicts with existing docs.
	// If doc already exists (duplicate upload), this silently succeeds.
	_, err := db.Exec(`
		INSERT OR IGNORE INTO documents (id, title, original_file, status, processed_at)
		VALUES (?, ?, ?, 'processing', datetime('now'))
	`, id, "Processing: "+originalFile, originalFile)
	return err
}

// UpdateDocumentStatus updates the status of a document.
func UpdateDocumentStatus(id, status string) error {
	_, err := db.Exec(`UPDATE documents SET status = ? WHERE id = ?`, status, id)
	return err
}

// StoreEmbedding saves an embedding vector for a document as a
// little-endian float32 blob (4 bytes per component).
func StoreEmbedding(docID string, embedding []float32) error {
	buf := make([]byte, len(embedding)*4)
	for i, v := range embedding {
		binary.LittleEndian.PutUint32(buf[i*4:], math.Float32bits(v))
	}
	_, err := db.Exec(`INSERT OR REPLACE INTO embeddings (doc_id, embedding) VALUES (?, ?)`, docID, buf)
	return err
}

// SemanticSearch finds documents by cosine similarity to a query embedding.
// It scores every stored embedding, keeps the top `limit`, and then drops
// results below a minimum relevance threshold.
func SemanticSearch(queryEmb []float32, limit int) ([]Document, error) {
	rows, err := db.Query(`SELECT doc_id, embedding FROM embeddings`)
	if err != nil {
		return nil, err
	}
	defer rows.Close()

	type scored struct {
		id    string
		score float64
	}
	var results []scored
	for rows.Next() {
		var docID string
		var blob []byte
		if err := rows.Scan(&docID, &blob); err != nil {
			continue
		}
		// Skip embeddings whose dimensionality doesn't match the query
		// (e.g. stored by an older model).
		if len(blob) != len(queryEmb)*4 {
			continue
		}
		docEmb := make([]float32, len(queryEmb))
		for i := range docEmb {
			docEmb[i] = math.Float32frombits(binary.LittleEndian.Uint32(blob[i*4:]))
		}
		results = append(results, scored{id: docID, score: cosineSim(queryEmb, docEmb)})
	}

	// Sort by score descending
	sort.Slice(results, func(i, j int) bool { return results[i].score > results[j].score })
	if len(results) > limit {
		results = results[:limit]
	}

	var docs []Document
	for _, r := range results {
		if r.score < 0.3 { // minimum relevance threshold
			continue
		}
		if doc, err := GetDocument(r.id); err == nil {
			doc.Score = r.score
			docs = append(docs, *doc)
		}
	}
	return docs, nil
}

// cosineSim returns the cosine similarity of two equal-length vectors,
// or 0 when either vector has zero magnitude.
func cosineSim(a, b []float32) float64 {
	var dot, normA, normB float64
	for i := range a {
		dot += float64(a[i]) * float64(b[i])
		normA += float64(a[i]) * float64(a[i])
		normB += float64(b[i]) * float64(b[i])
	}
	if normA == 0 || normB == 0 {
		return 0
	}
	return dot / (math.Sqrt(normA) * math.Sqrt(normB))
}

// GetDocument retrieves a single document by ID, including metadata and
// full text. Returns sql.ErrNoRows if the document does not exist.
func GetDocument(id string) (*Document, error) {
	doc := &Document{Metadata: make(map[string]string)}
	var metaJSON sql.NullString
	var status sql.NullString
	err := db.QueryRow(`
		SELECT id, COALESCE(title,''), COALESCE(category,''), COALESCE(type,''),
		       COALESCE(date,''), COALESCE(amount,''), COALESCE(vendor,''),
		       COALESCE(summary,''), COALESCE(full_text,''), COALESCE(pdf_path,''),
		       COALESCE(record_path,''), COALESCE(processed_at,''),
		       COALESCE(original_file,''), COALESCE(notes, ''),
		       COALESCE(metadata, '{}'), COALESCE(status, 'ready')
		FROM documents WHERE id = ?
	`, id).Scan(
		&doc.ID, &doc.Title, &doc.Category, &doc.Type, &doc.Date, &doc.Amount,
		&doc.Vendor, &doc.Summary, &doc.FullText, &doc.PDFPath, &doc.RecordPath,
		&doc.ProcessedAt, &doc.OriginalFile, &doc.Notes, &metaJSON, &status,
	)
	if err != nil {
		return nil, err
	}
	if metaJSON.Valid {
		// Malformed metadata JSON leaves the map empty; non-fatal.
		json.Unmarshal([]byte(metaJSON.String), &doc.Metadata)
	}
	doc.Status = status.String
	return doc, nil
}

// GetDocumentsByCategory retrieves all documents in a category.
func GetDocumentsByCategory(category string) ([]Document, error) {
	return queryDocuments("WHERE category = ? ORDER BY processed_at DESC", category)
}

// GetRecentDocuments retrieves the most recent documents.
func GetRecentDocuments(limit int) ([]Document, error) {
	return queryDocuments("ORDER BY processed_at DESC LIMIT ?", limit)
}

// GetAllDocuments retrieves all documents.
func GetAllDocuments() ([]Document, error) {
	return queryDocuments("ORDER BY processed_at DESC")
}

// SearchDocuments performs full-text search using the FTS5 index.
// The query is passed straight to MATCH, so FTS5 query syntax applies;
// a malformed query returns an error (callers may fall back to
// SearchDocumentsFallback).
func SearchDocuments(query string, limit int) ([]Document, error) {
	if limit <= 0 {
		limit = 50
	}
	rows, err := db.Query(`
		SELECT d.id, COALESCE(d.title,''), COALESCE(d.category,''), COALESCE(d.type,''),
		       COALESCE(d.date,''), COALESCE(d.amount,''), COALESCE(d.vendor,''),
		       COALESCE(d.summary,''), COALESCE(d.pdf_path,''), COALESCE(d.processed_at,''),
		       COALESCE(d.original_file,''), COALESCE(d.status,'ready')
		FROM documents d
		JOIN documents_fts fts ON d.id = fts.id
		WHERE documents_fts MATCH ?
		ORDER BY rank
		LIMIT ?
	`, query, limit)
	if err != nil {
		return nil, err
	}
	defer rows.Close()
	return scanDocumentRows(rows)
}

// SearchDocumentsFallback performs simple LIKE-based search (fallback).
func SearchDocumentsFallback(query string, limit int) ([]Document, error) {
	if limit <= 0 {
		limit = 50
	}
	pattern := "%" + query + "%"
	rows, err := db.Query(`
		SELECT id, COALESCE(title,''), COALESCE(category,''), COALESCE(type,''),
		       COALESCE(date,''), COALESCE(amount,''), COALESCE(vendor,''),
		       COALESCE(summary,''), COALESCE(pdf_path,''), COALESCE(processed_at,''),
		       COALESCE(original_file,''), COALESCE(status,'ready')
		FROM documents
		WHERE title LIKE ? OR summary LIKE ? OR vendor LIKE ? OR full_text LIKE ?
		ORDER BY processed_at DESC
		LIMIT ?
	`, pattern, pattern, pattern, pattern, limit)
	if err != nil {
		return nil, err
	}
	defer rows.Close()
	return scanDocumentRows(rows)
}

// UpdateDocument updates user-editable document metadata.
func UpdateDocument(id string, update DocumentUpdate) error {
	_, err := db.Exec(`
		UPDATE documents
		SET title = ?, category = ?, notes = ?, updated_at = CURRENT_TIMESTAMP
		WHERE id = ?
	`, update.Title, update.Category, update.Notes, id)
	return err
}

// UpdateDocumentRecordPath updates the record path after moving.
func UpdateDocumentRecordPath(id, newPath string) error {
	_, err := db.Exec(`UPDATE documents SET record_path = ? WHERE id = ?`, newPath, id)
	return err
}

// UpdateDocumentMetadata updates the metadata JSON for a document.
func UpdateDocumentMetadata(id string, metadata map[string]string) error {
	// Marshal of map[string]string cannot fail.
	metaJSON, _ := json.Marshal(metadata)
	_, err := db.Exec(`UPDATE documents SET metadata = ? WHERE id = ?`, string(metaJSON), id)
	return err
}

// DeleteDocument removes a document along with its FTS entry and
// embedding. Shares are removed by the FOREIGN KEY ... ON DELETE CASCADE.
func DeleteDocument(id string) error {
	deleteFTS(id)
	// Best effort; a stale embedding row is harmless.
	db.Exec(`DELETE FROM embeddings WHERE doc_id = ?`, id)
	_, err := db.Exec(`DELETE FROM documents WHERE id = ?`, id)
	return err
}

// CreateShare creates a share link for a document and returns the token.
// expiryDays <= 0 means the share never expires.
func CreateShare(docID string, expiryDays int) (string, error) {
	// 6 random bytes -> 12 hex chars; short enough for a URL.
	b := make([]byte, 6)
	if _, err := rand.Read(b); err != nil {
		return "", err
	}
	token := hex.EncodeToString(b)

	if expiryDays > 0 {
		// Let SQLite compute the expiry timestamp; the modifier string is
		// a bound parameter, not interpolated SQL.
		_, err := db.Exec(`INSERT INTO shares (id, doc_id, expires_at) VALUES (?, ?, datetime('now', ?))`,
			token, docID, fmt.Sprintf("+%d days", expiryDays))
		return token, err
	}
	_, err := db.Exec(`INSERT INTO shares (id, doc_id) VALUES (?, ?)`, token, docID)
	return token, err
}

// GetShare retrieves the document for a share token.
// Returns (nil, sql.ErrNoRows) for an unknown token and (nil, nil) for an
// expired one.
func GetShare(token string) (*Document, error) {
	var docID string
	var expired bool
	// Evaluate expiry in the same query; SQLite yields 0/1 which scans
	// into bool.
	err := db.QueryRow(`
		SELECT doc_id,
		       expires_at IS NOT NULL AND datetime(expires_at) < datetime('now')
		FROM shares WHERE id = ?`, token).Scan(&docID, &expired)
	if err != nil {
		return nil, err
	}
	if expired {
		return nil, nil
	}
	return GetDocument(docID)
}

// DeleteShare removes a share link.
func DeleteShare(token string) error {
	_, err := db.Exec(`DELETE FROM shares WHERE id = ?`, token)
	return err
}

// GetSharesByDocument returns active (non-expired) shares for a document.
func GetSharesByDocument(docID string) ([]Share, error) {
	rows, err := db.Query(`SELECT id, doc_id, created_at, COALESCE(expires_at, '') FROM shares WHERE doc_id = ?
		AND (expires_at IS NULL OR datetime(expires_at) > datetime('now'))`, docID)
	if err != nil {
		return nil, err
	}
	defer rows.Close()
	var shares []Share
	for rows.Next() {
		var s Share
		if err := rows.Scan(&s.Token, &s.DocID, &s.CreatedAt, &s.ExpiresAt); err != nil {
			continue
		}
		shares = append(shares, s)
	}
	return shares, nil
}

// UpsertDocument inserts or updates a document and keeps the FTS index in
// sync. Note: notes are preserved on update (only set on first insert) so
// reindexing does not clobber user edits.
func UpsertDocument(doc *Document) error {
	// Marshal of map[string]string cannot fail.
	metaJSON, _ := json.Marshal(doc.Metadata)
	_, err := db.Exec(`
		INSERT INTO documents (
			id, title, category, type, date, amount, vendor, summary, full_text,
			pdf_path, record_path, processed_at, original_file, notes, metadata, updated_at
		) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, CURRENT_TIMESTAMP)
		ON CONFLICT(id) DO UPDATE SET
			title = excluded.title,
			category = excluded.category,
			type = excluded.type,
			date = excluded.date,
			amount = excluded.amount,
			vendor = excluded.vendor,
			summary = excluded.summary,
			full_text = excluded.full_text,
			pdf_path = excluded.pdf_path,
			record_path = excluded.record_path,
			processed_at = excluded.processed_at,
			original_file = excluded.original_file,
			metadata = excluded.metadata,
			updated_at = CURRENT_TIMESTAMP
	`, doc.ID, doc.Title, doc.Category, doc.Type, doc.Date, doc.Amount, doc.Vendor,
		doc.Summary, doc.FullText, doc.PDFPath, doc.RecordPath, doc.ProcessedAt,
		doc.OriginalFile, doc.Notes, string(metaJSON))
	if err == nil {
		// Keep FTS in sync immediately (mirrors InsertDocument) instead of
		// waiting for the startup rebuildFTS.
		syncFTS(doc)
	}
	return err
}

// GetStats returns dashboard statistics. Each sub-query is best effort:
// a failure leaves that statistic at its zero value.
func GetStats() (*Stats, error) {
	stats := &Stats{
		ByCategory: make(map[string]int),
	}

	// Total count
	db.QueryRow("SELECT COUNT(*) FROM documents").Scan(&stats.TotalDocs)

	// Recent (last 7 days)
	db.QueryRow(`
		SELECT COUNT(*) FROM documents
		WHERE datetime(processed_at) > datetime('now', '-7 days')
	`).Scan(&stats.RecentDocs)

	// By category
	rows, err := db.Query("SELECT category, COUNT(*) FROM documents GROUP BY category")
	if err == nil {
		defer rows.Close()
		for rows.Next() {
			var cat string
			var count int
			if rows.Scan(&cat, &count) == nil {
				stats.ByCategory[cat] = count
			}
		}
	}

	// Recent uploads
	stats.RecentUploads, _ = GetRecentDocuments(5)
	return stats, nil
}

// GetCategoryStats returns document count per category.
func GetCategoryStats(categories []string) map[string]int {
	stats := make(map[string]int, len(categories))
	for _, cat := range categories {
		var count int
		db.QueryRow("SELECT COUNT(*) FROM documents WHERE category = ?", cat).Scan(&count)
		stats[cat] = count
	}
	return stats
}

// ClearAllDocuments removes all documents (for reindexing).
func ClearAllDocuments() error {
	_, err := db.Exec("DELETE FROM documents")
	return err
}

// IndexDocumentsFromDirectory scans markdown record files under
// recordsDir/<category>/*.md and upserts them. Unparseable files are
// skipped (best effort).
func IndexDocumentsFromDirectory(recordsDir, storeDir string, categories []string) error {
	for _, cat := range categories {
		catDir := filepath.Join(recordsDir, cat)
		files, err := filepath.Glob(filepath.Join(catDir, "*.md"))
		if err != nil {
			continue
		}
		for _, f := range files {
			doc, err := parseMarkdownRecord(f, cat, storeDir)
			if err != nil {
				continue
			}
			UpsertDocument(doc)
		}
	}
	return nil
}

// parseMarkdownRecord parses a markdown document record file into a
// Document. Metadata is pulled from **Key:** lines and markdown-table
// rows; "## Full Text" and "## Summary" sections are captured verbatim
// (code-fence markers are excluded from full text).
func parseMarkdownRecord(path, category, storeDir string) (*Document, error) {
	content, err := os.ReadFile(path)
	if err != nil {
		return nil, err
	}
	doc := &Document{
		Category:   category,
		RecordPath: path,
		Metadata:   make(map[string]string),
	}
	lines := strings.Split(string(content), "\n")

	// Fallback ID from filename: the segment after the last underscore,
	// e.g. "invoice_abc123.md" -> "abc123". May be overridden by an
	// explicit **ID:** line below.
	base := strings.TrimSuffix(filepath.Base(path), ".md")
	parts := strings.Split(base, "_")
	if len(parts) >= 2 {
		doc.ID = parts[len(parts)-1]
	} else {
		doc.ID = base
	}

	var inFullText, inSummary bool
	var fullTextLines, summaryLines []string
	for i, line := range lines {
		// Title only counts on the first line (document heading).
		if m := titleRe.FindStringSubmatch(line); m != nil && i == 0 {
			doc.Title = strings.TrimSpace(m[1])
		}
		if m := idRe.FindStringSubmatch(line); m != nil {
			doc.ID = strings.TrimSpace(m[1])
		}
		if m := fileRe.FindStringSubmatch(line); m != nil {
			doc.OriginalFile = strings.TrimSpace(m[1])
		}
		if m := procRe.FindStringSubmatch(line); m != nil {
			doc.ProcessedAt = strings.TrimSpace(m[1])
		}
		if m := typeRe.FindStringSubmatch(line); m != nil {
			doc.Type = strings.TrimSpace(m[1])
		}
		if m := dateRe.FindStringSubmatch(line); m != nil {
			doc.Date = strings.TrimSpace(m[1])
		}
		if m := vendorRe.FindStringSubmatch(line); m != nil {
			doc.Vendor = strings.TrimSpace(m[1])
		}
		if m := amountRe.FindStringSubmatch(line); m != nil {
			pdfOrAmount := strings.TrimSpace(m[1])
			doc.Amount = pdfOrAmount
		}
		if m := pdfRe.FindStringSubmatch(line); m != nil {
			pdfPath := strings.TrimSpace(m[1])
			if strings.Contains(pdfPath, "store/") {
				// Re-root store-relative links onto the configured store dir.
				doc.PDFPath = filepath.Join(storeDir, filepath.Base(pdfPath))
			} else {
				doc.PDFPath = pdfPath
			}
		}

		// Section detection: a specific header starts its section; any
		// other "## " header ends both.
		if strings.HasPrefix(line, "## Full Text") {
			inFullText, inSummary = true, false
			continue
		}
		if strings.HasPrefix(line, "## Summary") {
			inSummary, inFullText = true, false
			continue
		}
		if strings.HasPrefix(line, "## ") {
			inFullText, inSummary = false, false
		}
		if inFullText && !strings.HasPrefix(line, "```") {
			fullTextLines = append(fullTextLines, line)
		}
		if inSummary {
			summaryLines = append(summaryLines, line)
		}
	}
	doc.FullText = strings.TrimSpace(strings.Join(fullTextLines, "\n"))
	doc.Summary = strings.TrimSpace(strings.Join(summaryLines, "\n"))

	// Title fallbacks: original filename, then ID.
	if doc.Title == "" {
		doc.Title = doc.OriginalFile
	}
	if doc.Title == "" {
		doc.Title = doc.ID
	}
	return doc, nil
}

// queryDocuments runs the standard document SELECT with an appended
// WHERE/ORDER clause. The clause must be code-supplied (never user
// input); all values go through bound args.
func queryDocuments(whereClause string, args ...interface{}) ([]Document, error) {
	query := `
		SELECT id, COALESCE(title,''), COALESCE(category,''), COALESCE(type,''),
		       COALESCE(date,''), COALESCE(amount,''), COALESCE(vendor,''),
		       COALESCE(summary,''), COALESCE(pdf_path,''), COALESCE(processed_at,''),
		       COALESCE(original_file,''), COALESCE(status, 'ready')
		FROM documents ` + whereClause
	rows, err := db.Query(query, args...)
	if err != nil {
		return nil, err
	}
	defer rows.Close()
	return scanDocumentRows(rows)
}

// scanDocumentRows scans the standard 12-column document result set.
// Rows that fail to scan are skipped; rows.Err reports iteration errors.
func scanDocumentRows(rows *sql.Rows) ([]Document, error) {
	var docs []Document
	for rows.Next() {
		var doc Document
		err := rows.Scan(
			&doc.ID, &doc.Title, &doc.Category, &doc.Type, &doc.Date, &doc.Amount,
			&doc.Vendor, &doc.Summary, &doc.PDFPath, &doc.ProcessedAt,
			&doc.OriginalFile, &doc.Status,
		)
		if err != nil {
			continue
		}
		docs = append(docs, doc)
	}
	return docs, rows.Err()
}