718 lines
20 KiB
Go
718 lines
20 KiB
Go
package main
|
|
|
|
import (
	"crypto/rand"
	"database/sql"
	"encoding/binary"
	"encoding/hex"
	"encoding/json"
	"fmt"
	"math"
	"os"
	"path/filepath"
	"regexp"
	"sort"
	"strings"

	_ "github.com/mattn/go-sqlite3"
)
|
|
|
|
// db is the shared SQLite connection handle; set by InitDB and released
// by CloseDB. All functions in this file assume InitDB has been called.
var db *sql.DB
|
|
|
|
// Document represents a document record. Fields mirror the columns of the
// `documents` table (see initSchema); all column values are stored as TEXT.
type Document struct {
	ID           string            // primary key (documents.id)
	Title        string            // display title; indexed for full-text search
	Category     string            // category/folder name used for browsing
	Type         string            // document type label
	Date         string            // document date as text — format set by the ingester, TODO confirm
	Amount       string            // monetary amount as text, if any
	Vendor       string            // vendor name; indexed for full-text search
	Summary      string            // short summary; indexed for full-text search
	FullText     string            // extracted text body; indexed for full-text search
	PDFPath      string            // filesystem path to the stored PDF
	RecordPath   string            // filesystem path to the markdown record file
	ProcessedAt  string            // ingestion timestamp as text
	OriginalFile string            // name of the originally uploaded file
	Notes        string            // user-editable notes
	Metadata     map[string]string // persisted as a JSON object in documents.metadata
	Status       string            // "processing", "ready", "error"
	Score        float64           `json:",omitempty"` // semantic search relevance 0-1
}
|
|
|
|
// Share represents a document share link (a row in the `shares` table).
type Share struct {
	Token     string // random hex token; primary key (shares.id)
	DocID     string // id of the shared document
	CreatedAt string // creation timestamp as stored by SQLite
	ExpiresAt string // expiry timestamp; empty when the share never expires
}
|
|
|
|
// DocumentUpdate contains the user-editable fields that UpdateDocument
// is allowed to modify.
type DocumentUpdate struct {
	Title    string
	Category string
	Notes    string
}
|
|
|
|
// Stats contains dashboard statistics as produced by GetStats.
type Stats struct {
	TotalDocs     int            // total number of documents
	RecentDocs    int            // documents processed within the last 7 days
	ByCategory    map[string]int // document count per category
	RecentUploads []Document     // most recently processed documents (GetStats loads 5)
}
|
|
|
|
// InitDB initializes the database connection and schema
|
|
func InitDB(dbPath string) error {
|
|
var err error
|
|
db, err = sql.Open("sqlite3", dbPath+"?_fk=1")
|
|
if err != nil {
|
|
return fmt.Errorf("failed to open database: %w", err)
|
|
}
|
|
|
|
return initSchema()
|
|
}
|
|
|
|
// CloseDB closes the database connection
|
|
func CloseDB() error {
|
|
if db != nil {
|
|
return db.Close()
|
|
}
|
|
return nil
|
|
}
|
|
|
|
// initSchema creates the documents, embeddings, and shares tables (plus
// their indexes) if missing, then rebuilds the full-text search index.
// Note: documents_fts is DROPped and recreated on every call, so search
// only works after rebuildFTS repopulates it below. The shares table
// cascades deletes from documents via its foreign key (enforcement is
// enabled through the ?_fk=1 DSN option in InitDB).
func initSchema() error {
	schema := `
	CREATE TABLE IF NOT EXISTS documents (
		id TEXT PRIMARY KEY,
		title TEXT,
		category TEXT,
		type TEXT,
		date TEXT,
		amount TEXT,
		vendor TEXT,
		summary TEXT,
		full_text TEXT,
		pdf_path TEXT,
		record_path TEXT,
		processed_at TEXT,
		original_file TEXT,
		notes TEXT,
		metadata TEXT,
		status TEXT DEFAULT 'ready',
		created_at DATETIME DEFAULT CURRENT_TIMESTAMP,
		updated_at DATETIME DEFAULT CURRENT_TIMESTAMP
	);

	CREATE INDEX IF NOT EXISTS idx_category ON documents(category);
	CREATE INDEX IF NOT EXISTS idx_date ON documents(date);
	CREATE INDEX IF NOT EXISTS idx_type ON documents(type);
	CREATE INDEX IF NOT EXISTS idx_processed_at ON documents(processed_at);

	CREATE TABLE IF NOT EXISTS embeddings (
		doc_id TEXT PRIMARY KEY,
		embedding BLOB,
		created_at DATETIME DEFAULT CURRENT_TIMESTAMP
	);

	CREATE TABLE IF NOT EXISTS shares (
		id TEXT PRIMARY KEY,
		doc_id TEXT NOT NULL,
		created_at DATETIME DEFAULT CURRENT_TIMESTAMP,
		expires_at DATETIME,
		FOREIGN KEY (doc_id) REFERENCES documents(id) ON DELETE CASCADE
	);

	CREATE INDEX IF NOT EXISTS idx_shares_doc_id ON shares(doc_id);

	DROP TABLE IF EXISTS documents_fts;
	CREATE VIRTUAL TABLE documents_fts USING fts5(
		id UNINDEXED, title, summary, full_text, vendor
	);
	`
	if _, err := db.Exec(schema); err != nil {
		return err
	}

	// Rebuild FTS index from existing documents
	return rebuildFTS()
}
|
|
|
|
func rebuildFTS() error {
|
|
db.Exec(`DELETE FROM documents_fts`)
|
|
_, err := db.Exec(`
|
|
INSERT INTO documents_fts(id, title, summary, full_text, vendor)
|
|
SELECT id, COALESCE(title,''), COALESCE(summary,''), COALESCE(full_text,''), COALESCE(vendor,'')
|
|
FROM documents WHERE status = 'ready'
|
|
`)
|
|
return err
|
|
}
|
|
|
|
// syncFTS replaces the FTS index entry for doc with its current field
// values (delete-then-insert).
// Best-effort: errors are deliberately ignored so a search-index hiccup
// never fails the primary document write.
func syncFTS(doc *Document) {
	db.Exec(`DELETE FROM documents_fts WHERE id = ?`, doc.ID)
	db.Exec(`INSERT INTO documents_fts(id, title, summary, full_text, vendor) VALUES (?, ?, ?, ?, ?)`,
		doc.ID, doc.Title, doc.Summary, doc.FullText, doc.Vendor)
}
|
|
|
|
// deleteFTS removes a document's entry from the FTS index.
// Best-effort: the error is ignored; a leftover FTS row is harmless for
// search because SearchDocuments joins back to the documents table.
func deleteFTS(id string) {
	db.Exec(`DELETE FROM documents_fts WHERE id = ?`, id)
}
|
|
|
|
// InsertDocument adds a new document to the database
|
|
func InsertDocument(doc *Document) error {
|
|
metaJSON, _ := json.Marshal(doc.Metadata)
|
|
status := doc.Status
|
|
if status == "" {
|
|
status = "ready"
|
|
}
|
|
_, err := db.Exec(`
|
|
INSERT OR REPLACE INTO documents
|
|
(id, title, category, type, date, amount, vendor, summary, full_text,
|
|
pdf_path, record_path, processed_at, original_file, notes, metadata, status)
|
|
VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
|
|
`, doc.ID, doc.Title, doc.Category, doc.Type, doc.Date, doc.Amount,
|
|
doc.Vendor, doc.Summary, doc.FullText, doc.PDFPath, doc.RecordPath,
|
|
doc.ProcessedAt, doc.OriginalFile, doc.Notes, string(metaJSON), status)
|
|
if err == nil {
|
|
syncFTS(doc)
|
|
}
|
|
return err
|
|
}
|
|
|
|
// InsertPendingDocument creates a placeholder document while processing
|
|
func InsertPendingDocument(id, originalFile string) error {
|
|
// Use INSERT OR IGNORE to avoid conflicts with existing docs
|
|
// If doc already exists (duplicate upload), this silently succeeds
|
|
_, err := db.Exec(`
|
|
INSERT OR IGNORE INTO documents (id, title, original_file, status, processed_at)
|
|
VALUES (?, ?, ?, 'processing', datetime('now'))
|
|
`, id, "Processing: "+originalFile, originalFile)
|
|
return err
|
|
}
|
|
|
|
// UpdateDocumentStatus updates the status of a document
|
|
func UpdateDocumentStatus(id, status string) error {
|
|
_, err := db.Exec(`UPDATE documents SET status = ? WHERE id = ?`, status, id)
|
|
return err
|
|
}
|
|
|
|
// StoreEmbedding saves an embedding vector for a document
|
|
func StoreEmbedding(docID string, embedding []float32) error {
|
|
// Convert to bytes (4 bytes per float32)
|
|
buf := make([]byte, len(embedding)*4)
|
|
for i, v := range embedding {
|
|
bits := math.Float32bits(v)
|
|
buf[i*4] = byte(bits)
|
|
buf[i*4+1] = byte(bits >> 8)
|
|
buf[i*4+2] = byte(bits >> 16)
|
|
buf[i*4+3] = byte(bits >> 24)
|
|
}
|
|
_, err := db.Exec(`INSERT OR REPLACE INTO embeddings (doc_id, embedding) VALUES (?, ?)`, docID, buf)
|
|
return err
|
|
}
|
|
|
|
// SemanticSearch finds documents by cosine similarity to a query embedding
|
|
func SemanticSearch(queryEmb []float32, limit int) ([]Document, error) {
|
|
rows, err := db.Query(`SELECT doc_id, embedding FROM embeddings`)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
defer rows.Close()
|
|
|
|
type scored struct {
|
|
id string
|
|
score float64
|
|
}
|
|
var results []scored
|
|
|
|
for rows.Next() {
|
|
var docID string
|
|
var blob []byte
|
|
if err := rows.Scan(&docID, &blob); err != nil {
|
|
continue
|
|
}
|
|
// Decode embedding
|
|
if len(blob) != len(queryEmb)*4 {
|
|
continue
|
|
}
|
|
docEmb := make([]float32, len(queryEmb))
|
|
for i := range docEmb {
|
|
bits := uint32(blob[i*4]) | uint32(blob[i*4+1])<<8 | uint32(blob[i*4+2])<<16 | uint32(blob[i*4+3])<<24
|
|
docEmb[i] = math.Float32frombits(bits)
|
|
}
|
|
results = append(results, scored{id: docID, score: cosineSim(queryEmb, docEmb)})
|
|
}
|
|
|
|
// Sort by score descending
|
|
sort.Slice(results, func(i, j int) bool { return results[i].score > results[j].score })
|
|
|
|
if len(results) > limit {
|
|
results = results[:limit]
|
|
}
|
|
|
|
var docs []Document
|
|
for _, r := range results {
|
|
if r.score < 0.3 { // minimum relevance threshold
|
|
continue
|
|
}
|
|
if doc, err := GetDocument(r.id); err == nil {
|
|
doc.Score = r.score
|
|
docs = append(docs, *doc)
|
|
}
|
|
}
|
|
return docs, nil
|
|
}
|
|
|
|
// cosineSim returns the cosine similarity of two vectors in [-1, 1],
// accumulating in float64 for numerical stability.
// It returns 0 when the vectors differ in length or when either has zero
// norm, so malformed or empty embeddings rank last instead of panicking.
func cosineSim(a, b []float32) float64 {
	// Guard: the original loop indexes b by a's range and would panic on
	// a shorter b; treat mismatched dimensions as "no similarity".
	if len(a) != len(b) {
		return 0
	}
	var dot, normA, normB float64
	for i := range a {
		dot += float64(a[i]) * float64(b[i])
		normA += float64(a[i]) * float64(a[i])
		normB += float64(b[i]) * float64(b[i])
	}
	if normA == 0 || normB == 0 {
		return 0
	}
	return dot / (math.Sqrt(normA) * math.Sqrt(normB))
}
|
|
|
|
// GetDocument retrieves a single document by ID, decoding the metadata
// JSON into doc.Metadata. Every nullable column is COALESCEd (to '',
// '{}', or 'ready') so scanning into plain string fields never fails on
// NULL. Returns the sql error unchanged — sql.ErrNoRows when the id is
// unknown.
func GetDocument(id string) (*Document, error) {
	doc := &Document{Metadata: make(map[string]string)}
	var metaJSON sql.NullString
	var status sql.NullString

	err := db.QueryRow(`
		SELECT id, COALESCE(title,''), COALESCE(category,''), COALESCE(type,''),
		       COALESCE(date,''), COALESCE(amount,''), COALESCE(vendor,''),
		       COALESCE(summary,''), COALESCE(full_text,''),
		       COALESCE(pdf_path,''), COALESCE(record_path,''), COALESCE(processed_at,''),
		       COALESCE(original_file,''),
		       COALESCE(notes, ''), COALESCE(metadata, '{}'), COALESCE(status, 'ready')
		FROM documents WHERE id = ?
	`, id).Scan(
		&doc.ID, &doc.Title, &doc.Category, &doc.Type, &doc.Date,
		&doc.Amount, &doc.Vendor, &doc.Summary, &doc.FullText,
		&doc.PDFPath, &doc.RecordPath, &doc.ProcessedAt, &doc.OriginalFile,
		&doc.Notes, &metaJSON, &status,
	)

	if err != nil {
		return nil, err
	}

	// Best-effort decode: malformed metadata JSON leaves the map empty.
	if metaJSON.Valid {
		json.Unmarshal([]byte(metaJSON.String), &doc.Metadata)
	}
	doc.Status = status.String
	return doc, nil
}
|
|
|
|
// GetDocumentsByCategory retrieves all documents in a category, most
// recently processed first.
func GetDocumentsByCategory(category string) ([]Document, error) {
	return queryDocuments("WHERE category = ? ORDER BY processed_at DESC", category)
}
|
|
|
|
// GetRecentDocuments retrieves the most recent documents
|
|
func GetRecentDocuments(limit int) ([]Document, error) {
|
|
return queryDocuments(fmt.Sprintf("ORDER BY processed_at DESC LIMIT %d", limit))
|
|
}
|
|
|
|
// GetAllDocuments retrieves every document, most recently processed first.
func GetAllDocuments() ([]Document, error) {
	return queryDocuments("ORDER BY processed_at DESC")
}
|
|
|
|
// SearchDocuments performs full-text search through the FTS5 index,
// returning up to `limit` matches ordered by relevance (FTS5 rank).
// The query string is interpreted as FTS5 query syntax, so characters
// such as '"' have meaning and malformed input makes the query itself
// error — SearchDocumentsFallback provides a LIKE-based alternative.
// full_text is searched via the index but not returned in the results.
func SearchDocuments(query string, limit int) ([]Document, error) {
	if limit <= 0 {
		limit = 50 // default page size
	}

	rows, err := db.Query(`
		SELECT d.id, COALESCE(d.title,''), COALESCE(d.category,''), COALESCE(d.type,''),
		       COALESCE(d.date,''), COALESCE(d.amount,''), COALESCE(d.vendor,''),
		       COALESCE(d.summary,''), COALESCE(d.pdf_path,''), COALESCE(d.processed_at,''),
		       COALESCE(d.original_file,''), COALESCE(d.status,'ready')
		FROM documents d
		JOIN documents_fts fts ON d.id = fts.id
		WHERE documents_fts MATCH ?
		ORDER BY rank
		LIMIT ?
	`, query, limit)

	if err != nil {
		return nil, err
	}
	defer rows.Close()

	return scanDocumentRows(rows)
}
|
|
|
|
// SearchDocumentsFallback performs simple LIKE-based search (fallback)
|
|
func SearchDocumentsFallback(query string, limit int) ([]Document, error) {
|
|
if limit <= 0 {
|
|
limit = 50
|
|
}
|
|
pattern := "%" + query + "%"
|
|
|
|
rows, err := db.Query(`
|
|
SELECT id, COALESCE(title,''), COALESCE(category,''), COALESCE(type,''),
|
|
COALESCE(date,''), COALESCE(amount,''), COALESCE(vendor,''),
|
|
COALESCE(summary,''), COALESCE(pdf_path,''), COALESCE(processed_at,''),
|
|
COALESCE(original_file,''), COALESCE(status,'ready')
|
|
FROM documents
|
|
WHERE title LIKE ? OR summary LIKE ? OR vendor LIKE ? OR full_text LIKE ?
|
|
ORDER BY processed_at DESC
|
|
LIMIT ?
|
|
`, pattern, pattern, pattern, pattern, limit)
|
|
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
defer rows.Close()
|
|
|
|
return scanDocumentRows(rows)
|
|
}
|
|
|
|
// UpdateDocument updates document metadata
|
|
func UpdateDocument(id string, update DocumentUpdate) error {
|
|
_, err := db.Exec(`
|
|
UPDATE documents
|
|
SET title = ?, category = ?, notes = ?, updated_at = CURRENT_TIMESTAMP
|
|
WHERE id = ?
|
|
`, update.Title, update.Category, update.Notes, id)
|
|
return err
|
|
}
|
|
|
|
// UpdateDocumentRecordPath updates the record path after moving
|
|
func UpdateDocumentRecordPath(id, newPath string) error {
|
|
_, err := db.Exec(`UPDATE documents SET record_path = ? WHERE id = ?`, newPath, id)
|
|
return err
|
|
}
|
|
|
|
// UpdateDocumentMetadata updates the metadata JSON for a document
|
|
func UpdateDocumentMetadata(id string, metadata map[string]string) error {
|
|
metaJSON, _ := json.Marshal(metadata)
|
|
_, err := db.Exec(`UPDATE documents SET metadata = ? WHERE id = ?`, string(metaJSON), id)
|
|
return err
|
|
}
|
|
|
|
// DeleteDocument removes a document along with its search-index entry
// and embedding. The FTS and embedding deletes are best-effort (errors
// ignored); shares are removed automatically by the ON DELETE CASCADE
// foreign key on the shares table.
func DeleteDocument(id string) error {
	deleteFTS(id)
	db.Exec(`DELETE FROM embeddings WHERE doc_id = ?`, id)
	_, err := db.Exec(`DELETE FROM documents WHERE id = ?`, id)
	return err
}
|
|
|
|
// CreateShare creates a share link for a document, returns the token
|
|
func CreateShare(docID string, expiryDays int) (string, error) {
|
|
b := make([]byte, 6)
|
|
if _, err := rand.Read(b); err != nil {
|
|
return "", err
|
|
}
|
|
token := hex.EncodeToString(b)
|
|
|
|
var expiresAt sql.NullString
|
|
if expiryDays > 0 {
|
|
expiresAt = sql.NullString{
|
|
String: fmt.Sprintf("datetime('now', '+%d days')", expiryDays),
|
|
Valid: true,
|
|
}
|
|
_, err := db.Exec(`INSERT INTO shares (id, doc_id, expires_at) VALUES (?, ?, datetime('now', ?))`,
|
|
token, docID, fmt.Sprintf("+%d days", expiryDays))
|
|
return token, err
|
|
}
|
|
_ = expiresAt
|
|
_, err := db.Exec(`INSERT INTO shares (id, doc_id) VALUES (?, ?)`, token, docID)
|
|
return token, err
|
|
}
|
|
|
|
// GetShare retrieves the document for a share token.
// Returns (nil, nil) when the share exists but has expired, and the sql
// error (sql.ErrNoRows) when the token is unknown.
func GetShare(token string) (*Document, error) {
	var docID string
	var expiresAt sql.NullString
	err := db.QueryRow(`SELECT doc_id, expires_at FROM shares WHERE id = ?`, token).Scan(&docID, &expiresAt)
	if err != nil {
		return nil, err
	}
	if expiresAt.Valid {
		// Compare inside SQLite so the stored datetime format and 'now'
		// are interpreted consistently.
		// NOTE(review): the Scan error is ignored; on failure `expired`
		// stays false and the share is treated as still valid — confirm
		// fail-open is intended here.
		var expired bool
		db.QueryRow(`SELECT datetime(?) < datetime('now')`, expiresAt.String).Scan(&expired)
		if expired {
			return nil, nil
		}
	}
	return GetDocument(docID)
}
|
|
|
|
// DeleteShare removes a share link
|
|
func DeleteShare(token string) error {
|
|
_, err := db.Exec(`DELETE FROM shares WHERE id = ?`, token)
|
|
return err
|
|
}
|
|
|
|
// GetSharesByDocument returns active shares for a document
|
|
func GetSharesByDocument(docID string) ([]Share, error) {
|
|
rows, err := db.Query(`SELECT id, doc_id, created_at, COALESCE(expires_at, '') FROM shares WHERE doc_id = ? AND (expires_at IS NULL OR datetime(expires_at) > datetime('now'))`, docID)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
defer rows.Close()
|
|
var shares []Share
|
|
for rows.Next() {
|
|
var s Share
|
|
if err := rows.Scan(&s.Token, &s.DocID, &s.CreatedAt, &s.ExpiresAt); err != nil {
|
|
continue
|
|
}
|
|
shares = append(shares, s)
|
|
}
|
|
return shares, nil
|
|
}
|
|
|
|
// UpsertDocument inserts or updates a document
|
|
func UpsertDocument(doc *Document) error {
|
|
metaJSON, _ := json.Marshal(doc.Metadata)
|
|
|
|
_, err := db.Exec(`
|
|
INSERT INTO documents (
|
|
id, title, category, type, date, amount, vendor, summary, full_text,
|
|
pdf_path, record_path, processed_at, original_file, notes, metadata, updated_at
|
|
) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, CURRENT_TIMESTAMP)
|
|
ON CONFLICT(id) DO UPDATE SET
|
|
title = excluded.title,
|
|
category = excluded.category,
|
|
type = excluded.type,
|
|
date = excluded.date,
|
|
amount = excluded.amount,
|
|
vendor = excluded.vendor,
|
|
summary = excluded.summary,
|
|
full_text = excluded.full_text,
|
|
pdf_path = excluded.pdf_path,
|
|
record_path = excluded.record_path,
|
|
processed_at = excluded.processed_at,
|
|
original_file = excluded.original_file,
|
|
metadata = excluded.metadata,
|
|
updated_at = CURRENT_TIMESTAMP
|
|
`, doc.ID, doc.Title, doc.Category, doc.Type, doc.Date, doc.Amount,
|
|
doc.Vendor, doc.Summary, doc.FullText, doc.PDFPath, doc.RecordPath,
|
|
doc.ProcessedAt, doc.OriginalFile, doc.Notes, string(metaJSON))
|
|
|
|
return err
|
|
}
|
|
|
|
// GetStats returns dashboard statistics. All queries are best-effort:
// query/scan errors are ignored and leave the corresponding field at its
// zero value, so the dashboard still renders on a partly broken database.
// It always returns a nil error.
func GetStats() (*Stats, error) {
	stats := &Stats{
		ByCategory: make(map[string]int),
	}

	// Total count
	db.QueryRow("SELECT COUNT(*) FROM documents").Scan(&stats.TotalDocs)

	// Recent (last 7 days)
	db.QueryRow(`
		SELECT COUNT(*) FROM documents
		WHERE datetime(processed_at) > datetime('now', '-7 days')
	`).Scan(&stats.RecentDocs)

	// By category
	rows, err := db.Query("SELECT category, COUNT(*) FROM documents GROUP BY category")
	if err == nil {
		defer rows.Close()
		for rows.Next() {
			var cat string
			var count int
			if rows.Scan(&cat, &count) == nil {
				stats.ByCategory[cat] = count
			}
		}
	}

	// Recent uploads
	stats.RecentUploads, _ = GetRecentDocuments(5)

	return stats, nil
}
|
|
|
|
// GetCategoryStats returns document count per category
|
|
func GetCategoryStats(categories []string) map[string]int {
|
|
stats := make(map[string]int)
|
|
for _, cat := range categories {
|
|
var count int
|
|
db.QueryRow("SELECT COUNT(*) FROM documents WHERE category = ?", cat).Scan(&count)
|
|
stats[cat] = count
|
|
}
|
|
return stats
|
|
}
|
|
|
|
// ClearAllDocuments removes all documents (for reindexing)
|
|
func ClearAllDocuments() error {
|
|
_, err := db.Exec("DELETE FROM documents")
|
|
return err
|
|
}
|
|
|
|
// IndexDocumentsFromDirectory scans markdown files and indexes them
|
|
func IndexDocumentsFromDirectory(recordsDir, storeDir string, categories []string) error {
|
|
for _, cat := range categories {
|
|
catDir := filepath.Join(recordsDir, cat)
|
|
files, err := filepath.Glob(filepath.Join(catDir, "*.md"))
|
|
if err != nil {
|
|
continue
|
|
}
|
|
for _, f := range files {
|
|
doc, err := parseMarkdownRecord(f, cat, storeDir)
|
|
if err != nil {
|
|
continue
|
|
}
|
|
UpsertDocument(doc)
|
|
}
|
|
}
|
|
return nil
|
|
}
|
|
|
|
// parseMarkdownRecord parses a markdown document record file
|
|
func parseMarkdownRecord(path, category, storeDir string) (*Document, error) {
|
|
content, err := os.ReadFile(path)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
|
|
doc := &Document{
|
|
Category: category,
|
|
RecordPath: path,
|
|
Metadata: make(map[string]string),
|
|
}
|
|
|
|
text := string(content)
|
|
lines := strings.Split(text, "\n")
|
|
|
|
// Extract ID from filename
|
|
base := filepath.Base(path)
|
|
base = strings.TrimSuffix(base, ".md")
|
|
parts := strings.Split(base, "_")
|
|
if len(parts) >= 2 {
|
|
doc.ID = parts[len(parts)-1]
|
|
} else {
|
|
doc.ID = base
|
|
}
|
|
|
|
// Regex patterns for metadata extraction
|
|
idRe := regexp.MustCompile(`\*\*ID:\*\*\s*(.+)`)
|
|
titleRe := regexp.MustCompile(`^#\s+(.+)`)
|
|
fileRe := regexp.MustCompile(`\*\*Original File:\*\*\s*(.+)`)
|
|
procRe := regexp.MustCompile(`\*\*Processed:\*\*\s*(.+)`)
|
|
typeRe := regexp.MustCompile(`\*\*Type:\*\*\s*(.+)`)
|
|
dateRe := regexp.MustCompile(`\|\s*Date\s*\|\s*(.+?)\s*\|`)
|
|
vendorRe := regexp.MustCompile(`\|\s*Vendor\s*\|\s*(.+?)\s*\|`)
|
|
amountRe := regexp.MustCompile(`\|\s*Amount\s*\|\s*(.+?)\s*\|`)
|
|
pdfRe := regexp.MustCompile(`\*\*PDF:\*\*\s*\[.+?\]\((.+?)\)`)
|
|
|
|
var inFullText, inSummary bool
|
|
var fullTextLines, summaryLines []string
|
|
|
|
for i, line := range lines {
|
|
if m := titleRe.FindStringSubmatch(line); m != nil && i == 0 {
|
|
doc.Title = strings.TrimSpace(m[1])
|
|
}
|
|
if m := idRe.FindStringSubmatch(line); m != nil {
|
|
doc.ID = strings.TrimSpace(m[1])
|
|
}
|
|
if m := fileRe.FindStringSubmatch(line); m != nil {
|
|
doc.OriginalFile = strings.TrimSpace(m[1])
|
|
}
|
|
if m := procRe.FindStringSubmatch(line); m != nil {
|
|
doc.ProcessedAt = strings.TrimSpace(m[1])
|
|
}
|
|
if m := typeRe.FindStringSubmatch(line); m != nil {
|
|
doc.Type = strings.TrimSpace(m[1])
|
|
}
|
|
if m := dateRe.FindStringSubmatch(line); m != nil {
|
|
doc.Date = strings.TrimSpace(m[1])
|
|
}
|
|
if m := vendorRe.FindStringSubmatch(line); m != nil {
|
|
doc.Vendor = strings.TrimSpace(m[1])
|
|
}
|
|
if m := amountRe.FindStringSubmatch(line); m != nil {
|
|
doc.Amount = strings.TrimSpace(m[1])
|
|
}
|
|
if m := pdfRe.FindStringSubmatch(line); m != nil {
|
|
pdfPath := strings.TrimSpace(m[1])
|
|
if strings.Contains(pdfPath, "store/") {
|
|
doc.PDFPath = filepath.Join(storeDir, filepath.Base(pdfPath))
|
|
} else {
|
|
doc.PDFPath = pdfPath
|
|
}
|
|
}
|
|
|
|
// Section detection
|
|
if strings.HasPrefix(line, "## Full Text") {
|
|
inFullText, inSummary = true, false
|
|
continue
|
|
}
|
|
if strings.HasPrefix(line, "## Summary") {
|
|
inSummary, inFullText = true, false
|
|
continue
|
|
}
|
|
if strings.HasPrefix(line, "## ") {
|
|
inFullText, inSummary = false, false
|
|
}
|
|
|
|
if inFullText && !strings.HasPrefix(line, "```") {
|
|
fullTextLines = append(fullTextLines, line)
|
|
}
|
|
if inSummary {
|
|
summaryLines = append(summaryLines, line)
|
|
}
|
|
}
|
|
|
|
doc.FullText = strings.TrimSpace(strings.Join(fullTextLines, "\n"))
|
|
doc.Summary = strings.TrimSpace(strings.Join(summaryLines, "\n"))
|
|
|
|
if doc.Title == "" {
|
|
doc.Title = doc.OriginalFile
|
|
}
|
|
if doc.Title == "" {
|
|
doc.Title = doc.ID
|
|
}
|
|
|
|
return doc, nil
|
|
}
|
|
|
|
// Helper function to query documents with a WHERE/ORDER clause
|
|
func queryDocuments(whereClause string, args ...interface{}) ([]Document, error) {
|
|
query := `
|
|
SELECT id, COALESCE(title,''), COALESCE(category,''), COALESCE(type,''),
|
|
COALESCE(date,''), COALESCE(amount,''), COALESCE(vendor,''),
|
|
COALESCE(summary,''), COALESCE(pdf_path,''), COALESCE(processed_at,''),
|
|
COALESCE(original_file,''), COALESCE(status, 'ready')
|
|
FROM documents ` + whereClause
|
|
|
|
rows, err := db.Query(query, args...)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
defer rows.Close()
|
|
|
|
return scanDocumentRows(rows)
|
|
}
|
|
|
|
// Helper function to scan document rows
|
|
func scanDocumentRows(rows *sql.Rows) ([]Document, error) {
|
|
var docs []Document
|
|
for rows.Next() {
|
|
var doc Document
|
|
err := rows.Scan(
|
|
&doc.ID, &doc.Title, &doc.Category, &doc.Type, &doc.Date,
|
|
&doc.Amount, &doc.Vendor, &doc.Summary, &doc.PDFPath,
|
|
&doc.ProcessedAt, &doc.OriginalFile, &doc.Status,
|
|
)
|
|
if err != nil {
|
|
continue
|
|
}
|
|
docs = append(docs, doc)
|
|
}
|
|
return docs, rows.Err()
|
|
}
|