fix: format=md endpoint now returns full OCR text (full_text field)

SearchDocuments excludes full_text for performance. The MD endpoint
needs the actual OCR content, not just the summary.

Added SearchDocumentsWithFullText() and SearchDocumentsWithFullTextFallback()
that select full_text explicitly. apiSearchMDHandler now uses these,
so format=md returns the complete OCR/markdown text for each document.
This commit is contained in:
James 2026-03-23 14:07:20 -04:00
parent 405a6f697f
commit 9622ab9390
2 changed files with 77 additions and 2 deletions

75
db.go
View File

@ -715,3 +715,78 @@ func scanDocumentRows(rows *sql.Rows) ([]Document, error) {
} }
return docs, rows.Err() return docs, rows.Err()
} }
// SearchDocumentsWithFullText is the variant of SearchDocuments that also
// selects the full_text column. It exists for the format=md endpoint, which
// needs the OCR content of each document.
//
// query is bound as the MATCH expression against documents_fts, and results
// are ordered by rank. A limit of zero or less is replaced by 200. NULL
// columns are read as empty strings, except status, which defaults to
// 'ready'.
//
// Rows that fail to scan are skipped rather than reported; the returned error
// is either the query error or the row iteration error.
func SearchDocumentsWithFullText(query string, limit int) ([]Document, error) {
	const defaultLimit = 200
	if limit < 1 {
		limit = defaultLimit
	}

	const stmt = `
SELECT d.id, COALESCE(d.title,''), COALESCE(d.category,''), COALESCE(d.type,''),
COALESCE(d.date,''), COALESCE(d.amount,''), COALESCE(d.vendor,''),
COALESCE(d.summary,''), COALESCE(d.full_text,''), COALESCE(d.pdf_path,''),
COALESCE(d.processed_at,''), COALESCE(d.original_file,''), COALESCE(d.status,'ready')
FROM documents d
JOIN documents_fts fts ON d.id = fts.id
WHERE documents_fts MATCH ?
ORDER BY rank
LIMIT ?
`
	rs, err := db.Query(stmt, query, limit)
	if err != nil {
		return nil, err
	}
	defer rs.Close()

	var found []Document
	for rs.Next() {
		var item Document
		scanErr := rs.Scan(
			&item.ID, &item.Title, &item.Category, &item.Type, &item.Date,
			&item.Amount, &item.Vendor, &item.Summary, &item.FullText, &item.PDFPath,
			&item.ProcessedAt, &item.OriginalFile, &item.Status,
		)
		// Best effort: a row that cannot be scanned is dropped, not fatal.
		if scanErr == nil {
			found = append(found, item)
		}
	}
	return found, rs.Err()
}
// SearchDocumentsWithFullTextFallback is the LIKE-based search that, like
// SearchDocumentsWithFullText, also selects the full_text column.
//
// query is wrapped in '%' on both sides and compared with LIKE against the
// title, summary, vendor and full_text columns; a document matches if any of
// them does. Results are ordered by processed_at, newest first. A limit of
// zero or less is replaced by 200. NULL columns are read as empty strings,
// except status, which defaults to 'ready'.
//
// Rows that fail to scan are skipped rather than reported; the returned error
// is either the query error or the row iteration error.
func SearchDocumentsWithFullTextFallback(query string, limit int) ([]Document, error) {
	const defaultLimit = 200
	if limit < 1 {
		limit = defaultLimit
	}

	const stmt = `
SELECT id, COALESCE(title,''), COALESCE(category,''), COALESCE(type,''),
COALESCE(date,''), COALESCE(amount,''), COALESCE(vendor,''),
COALESCE(summary,''), COALESCE(full_text,''), COALESCE(pdf_path,''),
COALESCE(processed_at,''), COALESCE(original_file,''), COALESCE(status,'ready')
FROM documents
WHERE title LIKE ? OR summary LIKE ? OR vendor LIKE ? OR full_text LIKE ?
ORDER BY processed_at DESC
LIMIT ?
`
	// The same substring pattern is bound once per searched column.
	like := "%" + query + "%"
	rs, err := db.Query(stmt, like, like, like, like, limit)
	if err != nil {
		return nil, err
	}
	defer rs.Close()

	var found []Document
	for rs.Next() {
		var item Document
		scanErr := rs.Scan(
			&item.ID, &item.Title, &item.Category, &item.Type, &item.Date,
			&item.Amount, &item.Vendor, &item.Summary, &item.FullText, &item.PDFPath,
			&item.ProcessedAt, &item.OriginalFile, &item.Status,
		)
		// Best effort: a row that cannot be scanned is dropped, not fatal.
		if scanErr == nil {
			found = append(found, item)
		}
	}
	return found, rs.Err()
}

View File

@ -390,9 +390,9 @@ func apiSearchMDHandler(w http.ResponseWriter, r *http.Request) {
return return
} }
docs, err := SearchDocuments(query, 200) docs, err := SearchDocumentsWithFullText(query, 200)
if err != nil { if err != nil {
docs, _ = SearchDocumentsFallback(query, 200) docs, _ = SearchDocumentsWithFullTextFallback(query, 200)
} }
w.Header().Set("Content-Type", "text/markdown; charset=utf-8") w.Header().Set("Content-Type", "text/markdown; charset=utf-8")