fix: format=md endpoint now returns full OCR text (full_text field)
SearchDocuments excludes full_text for performance. The MD endpoint needs the actual OCR content, not just the summary. Added SearchDocumentsWithFullText() and SearchDocumentsWithFullTextFallback() that select full_text explicitly. apiSearchMDHandler now uses these, so format=md returns the complete OCR/markdown text for each document.
This commit is contained in:
parent
405a6f697f
commit
9622ab9390
75
db.go
75
db.go
|
|
@ -715,3 +715,78 @@ func scanDocumentRows(rows *sql.Rows) ([]Document, error) {
|
|||
}
|
||||
return docs, rows.Err()
|
||||
}
|
||||
|
||||
// SearchDocumentsWithFullText is like SearchDocuments but includes the full_text
|
||||
// column. Used by the format=md endpoint where OCR content is needed.
|
||||
func SearchDocumentsWithFullText(query string, limit int) ([]Document, error) {
|
||||
if limit <= 0 {
|
||||
limit = 200
|
||||
}
|
||||
|
||||
rows, err := db.Query(`
|
||||
SELECT d.id, COALESCE(d.title,''), COALESCE(d.category,''), COALESCE(d.type,''),
|
||||
COALESCE(d.date,''), COALESCE(d.amount,''), COALESCE(d.vendor,''),
|
||||
COALESCE(d.summary,''), COALESCE(d.full_text,''), COALESCE(d.pdf_path,''),
|
||||
COALESCE(d.processed_at,''), COALESCE(d.original_file,''), COALESCE(d.status,'ready')
|
||||
FROM documents d
|
||||
JOIN documents_fts fts ON d.id = fts.id
|
||||
WHERE documents_fts MATCH ?
|
||||
ORDER BY rank
|
||||
LIMIT ?
|
||||
`, query, limit)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
defer rows.Close()
|
||||
|
||||
var docs []Document
|
||||
for rows.Next() {
|
||||
var doc Document
|
||||
if err := rows.Scan(
|
||||
&doc.ID, &doc.Title, &doc.Category, &doc.Type, &doc.Date,
|
||||
&doc.Amount, &doc.Vendor, &doc.Summary, &doc.FullText, &doc.PDFPath,
|
||||
&doc.ProcessedAt, &doc.OriginalFile, &doc.Status,
|
||||
); err != nil {
|
||||
continue
|
||||
}
|
||||
docs = append(docs, doc)
|
||||
}
|
||||
return docs, rows.Err()
|
||||
}
|
||||
|
||||
// SearchDocumentsWithFullTextFallback is the LIKE-based fallback that also includes full_text.
|
||||
func SearchDocumentsWithFullTextFallback(query string, limit int) ([]Document, error) {
|
||||
if limit <= 0 {
|
||||
limit = 200
|
||||
}
|
||||
pattern := "%" + query + "%"
|
||||
|
||||
rows, err := db.Query(`
|
||||
SELECT id, COALESCE(title,''), COALESCE(category,''), COALESCE(type,''),
|
||||
COALESCE(date,''), COALESCE(amount,''), COALESCE(vendor,''),
|
||||
COALESCE(summary,''), COALESCE(full_text,''), COALESCE(pdf_path,''),
|
||||
COALESCE(processed_at,''), COALESCE(original_file,''), COALESCE(status,'ready')
|
||||
FROM documents
|
||||
WHERE title LIKE ? OR summary LIKE ? OR vendor LIKE ? OR full_text LIKE ?
|
||||
ORDER BY processed_at DESC
|
||||
LIMIT ?
|
||||
`, pattern, pattern, pattern, pattern, limit)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
defer rows.Close()
|
||||
|
||||
var docs []Document
|
||||
for rows.Next() {
|
||||
var doc Document
|
||||
if err := rows.Scan(
|
||||
&doc.ID, &doc.Title, &doc.Category, &doc.Type, &doc.Date,
|
||||
&doc.Amount, &doc.Vendor, &doc.Summary, &doc.FullText, &doc.PDFPath,
|
||||
&doc.ProcessedAt, &doc.OriginalFile, &doc.Status,
|
||||
); err != nil {
|
||||
continue
|
||||
}
|
||||
docs = append(docs, doc)
|
||||
}
|
||||
return docs, rows.Err()
|
||||
}
|
||||
|
|
|
|||
4
main.go
4
main.go
|
|
@ -390,9 +390,9 @@ func apiSearchMDHandler(w http.ResponseWriter, r *http.Request) {
|
|||
return
|
||||
}
|
||||
|
||||
docs, err := SearchDocuments(query, 200)
|
||||
docs, err := SearchDocumentsWithFullText(query, 200)
|
||||
if err != nil {
|
||||
docs, _ = SearchDocumentsFallback(query, 200)
|
||||
docs, _ = SearchDocumentsWithFullTextFallback(query, 200)
|
||||
}
|
||||
|
||||
w.Header().Set("Content-Type", "text/markdown; charset=utf-8")
|
||||
|
|
|
|||
Loading…
Reference in New Issue