Compare commits

...

10 Commits

Author SHA1 Message Date
James a1d2546117 Search: sort by timestamp desc, display dates in results
- FTS search results now ORDER BY processed_at DESC (was rank)
- Full search page shows processed timestamp with formatDateTime
- Quick search dropdown shows formatted date next to category
- Fixed formatDate to handle timezone-aware timestamps
2026-03-31 13:34:54 -04:00
James 31d6cb6f86 fix: skip store copy if file already exists (reprocess safety)
When reprocessing a document whose PDF is already in the store,
copyFile() would fail with 'open /srv/docsys/inbox/...: no such file
or directory' because the upload wrote to a temp inbox path that was
already cleaned up by the time async OCR completed.

The store is keyed by content hash so if the file is already there,
the copy is a no-op — skip it rather than error out.
2026-03-23 14:27:38 -04:00
James 883f118d66 fix: pdftoppm output filename glob instead of hardcoded page-1.png
pdftoppm zero-pads the page number based on total page count:
- <10 pages: page-1.png
- <100 pages: page-01.png
- <1000 pages: page-001.png

The code hardcoded 'page-1.png' and 'page-N.png', which fails for any
multi-page document. Use filepath.Glob('page-*.png') to find the actual
output regardless of padding width.

Fixed in both ConvertToImage() (first-page preview) and the multi-page
OCR loop in ProcessDocument().
2026-03-23 14:14:28 -04:00
James 9622ab9390 fix: format=md endpoint now returns full OCR text (full_text field)
SearchDocuments excludes full_text for performance. The MD endpoint
needs the actual OCR content, not just the summary.

Added SearchDocumentsWithFullText() and SearchDocumentsWithFullTextFallback()
that select full_text explicitly. apiSearchMDHandler now uses these,
so format=md returns the complete OCR/markdown text for each document.
2026-03-23 14:07:20 -04:00
James 405a6f697f feat: add GET /api/search?q=...&format=md for AI/LLM consumption
New endpoint returns all matching documents as concatenated plain-text
markdown, one section per document separated by ---.

Format:
  # Document: {title}
  ID: {id} | Category: {category} | Date: {date} | Vendor: {vendor}

  {full_text or summary}

  ---

Parameters:
  q      - search query (required)
  format - must be 'md' (required; distinguishes from HTML search)

Uses same FTS5 search as existing endpoints, limit raised to 200.
Falls back to LIKE search if FTS5 fails. Returns text/markdown content type.
POST /api/search (HTML partial) unchanged.
2026-03-23 13:58:47 -04:00
James 63d4e5e5ca chore: auto-commit uncommitted changes 2026-02-28 06:01:28 -05:00
James 2c91d5649e chore: auto-commit uncommitted changes 2026-02-28 00:01:21 -05:00
James bbc029196a chore: auto-commit uncommitted changes 2026-02-25 18:01:27 -05:00
James 83373885d4 Add vocabulary hints for handwriting: Jongsma, Johan, Tatyana, St. Petersburg FL 2026-02-25 14:24:04 -05:00
James 1b4c82ab83 Improve title prompt: require specific, identifying titles with sender+topic+date 2026-02-25 14:21:43 -05:00
9 changed files with 1504 additions and 47 deletions

290
ai.go
View File

@ -1,6 +1,7 @@
package main
import (
"archive/zip"
"bytes"
"crypto/sha256"
"encoding/base64"
@ -89,7 +90,7 @@ func ConvertToImage(filePath string) ([]byte, error) {
}
defer os.RemoveAll(tmpDir)
cmd := exec.Command("libreoffice", "--headless", "--convert-to", "pdf", "--outdir", tmpDir, filePath)
cmd := exec.Command("soffice", "--headless", "--convert-to", "pdf", "--outdir", tmpDir, filePath)
if err := cmd.Run(); err != nil {
return nil, fmt.Errorf("libreoffice conversion failed: %w", err)
}
@ -115,14 +116,21 @@ func ConvertToImage(filePath string) ([]byte, error) {
return nil, fmt.Errorf("pdftoppm failed: %w", err)
}
pngPath := filepath.Join(tmpDir, "page-1.png")
return os.ReadFile(pngPath)
// pdftoppm uses variable-width zero-padding depending on page count
// (e.g. page-01.png for <100 pages, page-001.png for <1000 pages).
// Glob for the first match instead of hardcoding "page-1.png".
matches, err := filepath.Glob(filepath.Join(tmpDir, "page-*.png"))
if err != nil || len(matches) == 0 {
return nil, fmt.Errorf("pdftoppm output not found in %s", tmpDir)
}
return os.ReadFile(matches[0])
}
// Image files — read directly
return os.ReadFile(filePath)
}
// IsTextFile returns true for plain text files
func IsTextFile(ext string) bool {
textExts := map[string]bool{
@ -159,12 +167,18 @@ func AnalyzeWithVision(imageData []byte) (*DocumentAnalysis, error) {
- vendor: Company/organization name
- amount: Dollar/currency amount if present (e.g., "$123.45" or "809,400 BYN")
5. **Title**: SHORT English title (max 6-8 words), e.g. "Apple Store Mac Mini Receipt" or "Electric Bill March 2025"
5. **Title**: Specific English title (max 8 words) that identifies THIS document uniquely. Ask yourself: "If I had to find this document again, what would I search for?" Prioritize the most unique/recoverable detail: an account number, confirmation code, ID, or specific value that can't be inferred from context. If the document's main value is a number or ID, put it in the title. Bad: "FedEx Receipt Jan 2026" (generic). Good: "FedEx Account 203634010" or "Duke Energy Electric Bill Oct 2024" or "IRS Form 1099-INT Chase Bank 2024". Never use generic words like "Document", "Letter", "Report" alone always qualify them.
6. **Summary**: 1-2 sentence English description with key details.
IMPORTANT: The full_text field MUST contain the verbatim transcription in the document's original language. This is non-negotiable.
**Known proper nouns** (use these exact spellings when you see similar handwriting):
- Jongsma (surname may look like "Jongoma", "Jongsoma", "Jongma")
- Johan (first name)
- Tatyana / Tanya (first name)
- St. Petersburg, Florida (city not Russia)
Respond in JSON ONLY:
{"category": "...", "doc_type": "...", "date": "...", "vendor": "...", "amount": "...", "title": "...", "summary": "...", "full_text": "..."}`
@ -227,12 +241,20 @@ func AnalyzeText(text, filename string) (*DocumentAnalysis, error) {
Categorize into ONE of: taxes, bills, medical, insurance, legal, financial, expenses, vehicles, home, personal, contacts, uncategorized
For the title: specific and concise (max 8 words). Ask yourself: "What would I search for to find this again?" Prioritize unique/recoverable details account numbers, IDs, confirmation codes over generic document type names. Examples:
- "FedEx Account 203634010" (if the key value is an account number)
- "Allegiant Confirmation IB2EJA AVL-PIE Feb 2026"
- "IRS Form 1099-INT Chase Bank 2024"
Bad: "FedEx Receipt Jan 2026" or "Flight Confirmation" too generic.
Also produce a "full_text" field: reformat the raw content as clean Markdown. Use ## headers for sections, | tables | for tabular data, **bold** for field labels, and bullet lists where appropriate. Preserve all values exactly. Do not summarize include everything.
Respond in JSON ONLY:
{"category": "...", "doc_type": "...", "date": "...", "vendor": "...", "amount": "...", "summary": "..."}`, filename, text)
{"category": "...", "doc_type": "...", "date": "...", "vendor": "...", "amount": "...", "title": "...", "summary": "...", "full_text": "..."}`, filename, text)
reqBody := map[string]interface{}{
"model": "accounts/fireworks/models/kimi-k2-instruct-0905",
"max_tokens": 1024,
"max_tokens": 2048,
"messages": []map[string]interface{}{
{"role": "user", "content": prompt},
},
@ -242,7 +264,10 @@ Respond in JSON ONLY:
if err != nil {
return nil, err
}
analysis.FullText = text
// If model didn't produce full_text, fall back to raw extracted text
if analysis.FullText == "" {
analysis.FullText = text
}
return analysis, nil
}
@ -464,8 +489,13 @@ func ProcessPDFPageByPage(filePath string, jobID string) (string, error) {
continue
}
pngPath := filepath.Join(tmpDir, fmt.Sprintf("page-%d.png", page))
imageData, err := os.ReadFile(pngPath)
// Glob for the output — pdftoppm zero-pads based on total page count
pageMatches, _ := filepath.Glob(filepath.Join(tmpDir, "page-*.png"))
if len(pageMatches) == 0 {
os.RemoveAll(tmpDir)
continue
}
imageData, err := os.ReadFile(pageMatches[0])
os.RemoveAll(tmpDir)
if err != nil {
continue
@ -611,6 +641,22 @@ func ProcessDocument(filePath string) (*Document, error) {
UpdateJob(hash, "error", err.Error())
return nil, fmt.Errorf("text analysis failed: %w", err)
}
} else if IsOfficeFile(ext) {
// Office formats — extract text natively from ZIP/XML, no LibreOffice needed
UpdateJob(hash, "converting", "Extracting text...")
log.Printf(" Extracting text from %s...", ext)
text, err := ExtractOfficeText(filePath)
if err != nil {
UpdateJob(hash, "error", err.Error())
return nil, fmt.Errorf("office text extraction failed: %w", err)
}
log.Printf(" Extracted %d chars, classifying...", len(text))
UpdateJob(hash, "classifying", "Classifying...")
analysis, err = AnalyzeText(text, filepath.Base(filePath))
if err != nil {
UpdateJob(hash, "error", err.Error())
return nil, fmt.Errorf("text analysis failed: %w", err)
}
} else {
// Vision — convert to image and analyze
UpdateJob(hash, "converting", "Converting to image...")
@ -676,10 +722,18 @@ func ProcessDocument(filePath string) (*Document, error) {
log.Printf(" Category: %s, Type: %s", analysis.Category, analysis.DocType)
// Copy to store
// Copy to store (skip if already there — reprocessing from store-backed upload)
storePath := filepath.Join(storeDir, hash+ext)
if err := copyFile(filePath, storePath); err != nil {
return nil, fmt.Errorf("store copy failed: %w", err)
if _, statErr := os.Stat(storePath); os.IsNotExist(statErr) {
if err := copyFile(filePath, storePath); err != nil {
return nil, fmt.Errorf("store copy failed: %w", err)
}
}
// Only set PDFPath for actual PDFs — office/text files have no previewable PDF
pdfStorePath := ""
if ext == ".pdf" {
pdfStorePath = storePath
}
// Create document record
@ -688,7 +742,7 @@ func ProcessDocument(filePath string) (*Document, error) {
if title == "" {
title = analysis.Summary
}
doc := &Document{
ID: hash,
Title: title,
@ -699,7 +753,7 @@ func ProcessDocument(filePath string) (*Document, error) {
Vendor: analysis.Vendor,
Summary: analysis.Summary,
FullText: analysis.FullText,
PDFPath: storePath,
PDFPath: pdfStorePath,
OriginalFile: filepath.Base(filePath),
ProcessedAt: time.Now().Format(time.RFC3339),
Status: "ready",
@ -745,3 +799,211 @@ func copyFile(src, dst string) error {
_, err = io.Copy(out, in)
return err
}
// ExtractOfficeText extracts plain text from OOXML (DOCX/XLSX/PPTX) and
// OpenDocument (ODT/ODS/ODP) files natively. These formats are ZIP archives
// containing XML, so no LibreOffice round-trip is needed.
//
// Returns an error for non-ZIP inputs (legacy .doc/.xls/.ppt binaries, .rtf),
// unknown extensions, and archives that yield no text.
func ExtractOfficeText(filePath string) (string, error) {
	ext := strings.ToLower(filepath.Ext(filePath))
	r, err := zip.OpenReader(filePath)
	if err != nil {
		return "", fmt.Errorf("not a valid Office file: %w", err)
	}
	defer r.Close()

	// Which XML paths inside the archive to extract, per format.
	var targets []string
	switch ext {
	case ".docx", ".doc", ".rtf":
		// Word text body. (.doc/.rtf are not ZIP archives and will already
		// have failed zip.OpenReader above; kept here for historical reasons.)
		targets = []string{"word/document.xml", "word/body.xml"}
	case ".odt", ".ods", ".odp":
		// OpenDocument formats keep all document content in content.xml.
		// BUG FIX: .odt previously looked for word/document.xml (an OOXML
		// path) and always yielded "no text extracted"; .ods/.odp fell
		// through to "unsupported" even though IsOfficeFile accepts them.
		targets = []string{"content.xml"}
	case ".xlsx", ".xls":
		// All worksheets...
		for _, f := range r.File {
			if strings.HasPrefix(f.Name, "xl/worksheets/sheet") && strings.HasSuffix(f.Name, ".xml") {
				targets = append(targets, f.Name)
			}
		}
		// ...plus shared strings (xlsx stores most cell text there).
		targets = append(targets, "xl/sharedStrings.xml")
	case ".pptx", ".ppt":
		for _, f := range r.File {
			if strings.HasPrefix(f.Name, "ppt/slides/slide") && strings.HasSuffix(f.Name, ".xml") {
				targets = append(targets, f.Name)
			}
		}
	default:
		return "", fmt.Errorf("unsupported office format: %s", ext)
	}

	// Set for O(1) membership tests while walking the archive entries.
	wanted := make(map[string]bool, len(targets))
	for _, t := range targets {
		wanted[t] = true
	}

	var sb strings.Builder
	for _, f := range r.File {
		if !wanted[f.Name] {
			continue
		}
		rc, err := f.Open()
		if err != nil {
			continue // best-effort: skip unreadable entries
		}
		data, err := io.ReadAll(rc)
		rc.Close()
		if err != nil {
			continue
		}
		sb.WriteString(xmlToText(data))
		sb.WriteString("\n")
	}

	text := strings.TrimSpace(sb.String())
	if text == "" {
		return "", fmt.Errorf("no text extracted from %s", filepath.Base(filePath))
	}
	return text, nil
}
// xmlToText flattens DOCX/Office XML into structured plain text.
// It understands paragraph breaks (p), table rows/cells (tr/tc) and explicit
// line breaks (br); namespace prefixes are stripped (w:p -> p, a:p -> p) so
// the same scanner works for Word, PowerPoint and ODF markup.
//
// Note: end tags such as </w:p> reduce to an empty local name in the scanner
// (the "/" cuts the name at index 0) and are deliberately ignored — only
// opening/self-closing tags drive the layout. This matches the original
// behavior and keeps cell tab-separation correct.
func xmlToText(data []byte) string {
	s := string(data)

	// Pass 1: scan the raw bytes into a stream of (tag, text) tokens.
	type token struct {
		tag  string // lowercased local name, or "" for text
		text string
	}
	var tokens []token
	var sb strings.Builder     // accumulates text between tags
	var tagBuf strings.Builder // accumulates bytes inside <...>
	inTag := false
	for i := 0; i < len(s); i++ {
		c := s[i]
		if c == '<' {
			// Flush any pending text before the tag starts.
			if sb.Len() > 0 {
				tokens = append(tokens, token{text: sb.String()})
				sb.Reset()
			}
			inTag = true
			tagBuf.Reset()
			continue
		}
		if c == '>' {
			inTag = false
			raw := strings.TrimSpace(tagBuf.String())
			// Local name: cut at the first whitespace or '/', then drop the
			// namespace prefix. End tags ("/w:p") yield "" — see doc comment.
			name := raw
			if idx := strings.IndexAny(raw, " \t\n\r/"); idx >= 0 {
				name = raw[:idx]
			}
			if idx := strings.Index(name, ":"); idx >= 0 {
				name = name[idx+1:]
			}
			tokens = append(tokens, token{tag: strings.ToLower(name)})
			tagBuf.Reset()
			continue
		}
		if inTag {
			tagBuf.WriteByte(c)
		} else {
			sb.WriteByte(c)
		}
	}
	if sb.Len() > 0 {
		tokens = append(tokens, token{text: sb.String()})
	}

	// Pass 2: render tokens into text. Break requests collapse: pendingNewlines
	// holds the maximum requested and is emitted lazily before the next
	// non-empty text node, so output never starts or ends with breaks.
	var out strings.Builder
	pendingNewlines := 0
	emitNewlines := func(n int) {
		if n > pendingNewlines {
			pendingNewlines = n
		}
	}
	flushNewlines := func() {
		for i := 0; i < pendingNewlines; i++ {
			out.WriteByte('\n')
		}
		pendingNewlines = 0
	}
	inCell := false
	for _, tok := range tokens {
		if tok.tag == "" {
			// Text node: decode the XML entities we expect in Office files.
			// BUG FIX: "&amp;" must be decoded LAST — decoding it first made
			// already-escaped sequences double-decode (e.g. "&amp;lt;" became
			// "<" instead of the literal "&lt;").
			txt := tok.text
			txt = strings.ReplaceAll(txt, "&lt;", "<")
			txt = strings.ReplaceAll(txt, "&gt;", ">")
			txt = strings.ReplaceAll(txt, "&quot;", "\"")
			txt = strings.ReplaceAll(txt, "&apos;", "'")
			txt = strings.ReplaceAll(txt, "&#xA;", "\n")
			txt = strings.ReplaceAll(txt, "&#x9;", "\t")
			txt = strings.ReplaceAll(txt, "&amp;", "&")
			if strings.TrimSpace(txt) != "" {
				flushNewlines()
				out.WriteString(txt)
			}
			continue
		}
		switch tok.tag {
		case "p", "br": // paragraph / explicit line break
			emitNewlines(1)
		case "tr": // table row: new line; first cell in the row gets no tab
			inCell = false
			emitNewlines(1)
		case "tc": // table cell: tab-separate cells after the first
			if inCell {
				out.WriteString("\t")
			}
			inCell = true
		}
	}
	return strings.TrimSpace(out.String())
}
// IsOfficeFile reports whether ext (a lowercase extension including the dot,
// e.g. ".docx") is a ZIP-based Office format with extractable XML text.
// Legacy binary formats (.doc, .xls, .ppt) are intentionally excluded.
func IsOfficeFile(ext string) bool {
	// A switch avoids rebuilding a throwaway map on every call.
	switch ext {
	case ".docx", ".xlsx", ".pptx", ".odt", ".ods", ".odp":
		return true
	}
	return false
}

891
ai.go.bak-20260228-005328 Normal file
View File

@ -0,0 +1,891 @@
package main
import (
"archive/zip"
"bytes"
"crypto/sha256"
"encoding/base64"
"encoding/json"
"fmt"
"io"
"log"
"net/http"
"os"
"os/exec"
"path/filepath"
"strings"
"time"
)
// Fireworks API configuration. The key is resolved once at startup by init.
var (
	fireworksAPIKey  string
	fireworksBaseURL = "https://api.fireworks.ai/inference/v1"
)

// init resolves the Fireworks API key: first from the FIREWORKS_API_KEY
// environment variable, then from a KEY=VALUE line in ~/dev/docsys/.env
// (values may be wrapped in single or double quotes).
func init() {
	fireworksAPIKey = os.Getenv("FIREWORKS_API_KEY")
	if fireworksAPIKey != "" {
		return
	}
	// Fall back to the .env file in the docsys directory.
	envPath := filepath.Join(os.Getenv("HOME"), "dev/docsys/.env")
	data, err := os.ReadFile(envPath)
	if err != nil {
		return
	}
	const prefix = "FIREWORKS_API_KEY="
	for _, line := range strings.Split(string(data), "\n") {
		if !strings.HasPrefix(line, prefix) {
			continue
		}
		// Trim whitespace, then any surrounding quotes.
		val := strings.TrimSpace(strings.TrimPrefix(line, prefix))
		fireworksAPIKey = strings.Trim(val, `"'`)
		return
	}
}
// DocumentAnalysis contains the AI-extracted information returned by the
// Fireworks models for a single document.
type DocumentAnalysis struct {
	Category string      `json:"category"`
	DocType  string      `json:"doc_type"`
	Date     string      `json:"date"`
	Vendor   string      `json:"vendor"`
	Amount   interface{} `json:"amount"` // Can be string or number
	Title    string      `json:"title"`
	Summary  string      `json:"summary"`
	FullText string      `json:"full_text"`
}

// AmountString renders the Amount field as display text: strings pass
// through unchanged, JSON numbers (float64) are formatted as dollar amounts,
// and anything else (including nil) yields the empty string.
func (d *DocumentAnalysis) AmountString() string {
	if s, ok := d.Amount.(string); ok {
		return s
	}
	if f, ok := d.Amount.(float64); ok {
		return fmt.Sprintf("$%.2f", f)
	}
	return ""
}
// FileHash returns first 16 chars of SHA256 hash
func FileHash(filepath string) (string, error) {
f, err := os.Open(filepath)
if err != nil {
return "", err
}
defer f.Close()
h := sha256.New()
if _, err := io.Copy(h, f); err != nil {
return "", err
}
return fmt.Sprintf("%x", h.Sum(nil))[:16], nil
}
// ConvertToImage converts PDF/Office docs to PNG for vision API
func ConvertToImage(filePath string) ([]byte, error) {
ext := strings.ToLower(filepath.Ext(filePath))
// Office documents → PDF first
officeExts := map[string]bool{".doc": true, ".docx": true, ".odt": true, ".rtf": true, ".xls": true, ".xlsx": true, ".ppt": true, ".pptx": true}
if officeExts[ext] {
tmpDir, err := os.MkdirTemp("", "docsys")
if err != nil {
return nil, err
}
defer os.RemoveAll(tmpDir)
cmd := exec.Command("libreoffice", "--headless", "--convert-to", "pdf", "--outdir", tmpDir, filePath)
if err := cmd.Run(); err != nil {
return nil, fmt.Errorf("libreoffice conversion failed: %w", err)
}
base := strings.TrimSuffix(filepath.Base(filePath), ext)
pdfPath := filepath.Join(tmpDir, base+".pdf")
filePath = pdfPath
ext = ".pdf"
}
// PDF → PNG (first page only for preview, full processing done separately)
if ext == ".pdf" {
tmpDir, err := os.MkdirTemp("", "docsys")
if err != nil {
return nil, err
}
defer os.RemoveAll(tmpDir)
// Convert first page for initial analysis
outPrefix := filepath.Join(tmpDir, "page")
cmd := exec.Command("pdftoppm", "-png", "-f", "1", "-l", "1", "-r", "150", filePath, outPrefix)
if err := cmd.Run(); err != nil {
return nil, fmt.Errorf("pdftoppm failed: %w", err)
}
pngPath := filepath.Join(tmpDir, "page-1.png")
return os.ReadFile(pngPath)
}
// Image files — read directly
return os.ReadFile(filePath)
}
// IsTextFile reports whether ext (lowercase, with dot) is a plain-text
// format that can be read and analyzed directly, without OCR or conversion.
func IsTextFile(ext string) bool {
	// A switch avoids allocating a fresh map on every call.
	switch ext {
	case ".txt", ".md", ".markdown", ".text", ".log",
		".json", ".xml", ".csv", ".yaml", ".yml":
		return true
	}
	return false
}
// AnalyzeWithVision sends a document image to the Fireworks vision model and
// parses the structured JSON analysis it returns (category, key fields,
// title, summary, verbatim full-text transcription). On a first-attempt
// failure it retries once with a stripped-down prompt before returning.
// NOTE(review): the original comment said "K2.5" but the model id used below
// is qwen3-vl-30b-a3b-instruct — confirm which is intended.
func AnalyzeWithVision(imageData []byte) (*DocumentAnalysis, error) {
if fireworksAPIKey == "" {
return nil, fmt.Errorf("FIREWORKS_API_KEY not set")
}
// The image is inlined into the request as a base64 data: URL.
b64 := base64.StdEncoding.EncodeToString(imageData)
prompt := `Analyze this document image and extract:
1. **Full Text**: Transcribe ALL visible text EXACTLY as it appears — in the ORIGINAL language of the document. DO NOT translate. DO NOT summarize. Transcribe word-for-word in the source language (Russian, Dutch, German, French, etc.) formatted as clean Markdown:
- Use headers (##) for sections
- Use **bold** for labels/field names
- Use tables for tabular data (items, prices, etc.)
- Use bullet lists where appropriate
- Preserve ALL numbers, dates, amounts, and codes exactly as shown
2. **Classification**: Categorize into exactly ONE of:
taxes, bills, medical, insurance, legal, financial, expenses, vehicles, home, personal, contacts, uncategorized
3. **Document Type**: Specific type (e.g., "utility_bill", "receipt", "tax_form_w2")
4. **Key Fields** (these may be in English for searchability):
- date: Document date (YYYY-MM-DD if possible)
- vendor: Company/organization name
- amount: Dollar/currency amount if present (e.g., "$123.45" or "809,400 BYN")
5. **Title**: Specific English title (max 8 words) that identifies THIS document uniquely. Include the key distinguishing details: who sent it, what it's about, and when. Bad: "Financial Report" or "Invoice". Good: "N-able Technology Exchange Rate Loss Explanation Feb 2025" or "Duke Energy Electric Bill Oct 2024" or "IRS Form 1099-INT Chase Bank 2024" or "BayCare HomeCare Invoice $340 Nov 2025". Never use generic words like "Document", "Letter", "Report" alone — always qualify them.
6. **Summary**: 1-2 sentence English description with key details.
IMPORTANT: The full_text field MUST contain the verbatim transcription in the document's original language. This is non-negotiable.
**Known proper nouns** (use these exact spellings when you see similar handwriting):
- Jongsma (surname — may look like "Jongoma", "Jongsoma", "Jongma")
- Johan (first name)
- Tatyana / Tanya (first name)
- St. Petersburg, Florida (city — not Russia)
Respond in JSON ONLY:
{"category": "...", "doc_type": "...", "date": "...", "vendor": "...", "amount": "...", "title": "...", "summary": "...", "full_text": "..."}`
// Primary request: the system message pins the model to raw-JSON output.
reqBody := map[string]interface{}{
"model": "accounts/fireworks/models/qwen3-vl-30b-a3b-instruct",
"max_tokens": 4096,
"messages": []map[string]interface{}{
{"role": "system", "content": "You are a document analysis API. Output ONLY raw JSON. No thinking, no commentary, no code fences. First character must be {, last character must be }."},
{
"role": "user",
"content": []map[string]interface{}{
{"type": "image_url", "image_url": map[string]string{"url": "data:image/png;base64," + b64}},
{"type": "text", "text": prompt},
},
},
},
}
analysis, err := callFireworks(reqBody)
if err != nil {
// Retry once with minimal prompt to avoid triggering extended reasoning
log.Printf(" [AI] First attempt failed, retrying with simplified prompt...")
retryBody := map[string]interface{}{
"model": "accounts/fireworks/models/qwen3-vl-30b-a3b-instruct",
"max_tokens": 4096,
"messages": []map[string]interface{}{
{"role": "system", "content": "Output valid JSON only. No other text."},
{
"role": "user",
"content": []map[string]interface{}{
{"type": "image_url", "image_url": map[string]string{"url": "data:image/png;base64," + b64}},
{"type": "text", "text": `Look at this document image. Transcribe ALL text verbatim (original language, do NOT translate). Return ONLY this JSON with no placeholders:
{"category":"","doc_type":"","date":"","vendor":"","amount":"","title":"","summary":"","full_text":""}`},
},
},
},
}
return callFireworks(retryBody)
}
return analysis, nil
}
// AnalyzeText classifies a plain-text document with the Fireworks K2 text
// model (kimi-k2-instruct-0905) and returns the parsed analysis with the
// original text attached as FullText.
// NOTE(review): the JSON template below requests no "title"/"full_text"
// keys, and FullText is unconditionally overwritten with the raw input after
// the call — any full_text the model produced is discarded. Confirm intended.
func AnalyzeText(text, filename string) (*DocumentAnalysis, error) {
if fireworksAPIKey == "" {
return nil, fmt.Errorf("FIREWORKS_API_KEY not set")
}
// Truncate long text
// (byte-based cut; may split a multi-byte UTF-8 rune at the boundary)
if len(text) > 50000 {
text = text[:50000]
}
prompt := fmt.Sprintf(`Analyze this document:
**Filename:** %s
**Content:**
%s
Categorize into ONE of: taxes, bills, medical, insurance, legal, financial, expenses, vehicles, home, personal, contacts, uncategorized
Respond in JSON ONLY:
{"category": "...", "doc_type": "...", "date": "...", "vendor": "...", "amount": "...", "summary": "..."}`, filename, text)
reqBody := map[string]interface{}{
"model": "accounts/fireworks/models/kimi-k2-instruct-0905",
"max_tokens": 1024,
"messages": []map[string]interface{}{
{"role": "user", "content": prompt},
},
}
analysis, err := callFireworks(reqBody)
if err != nil {
return nil, err
}
// Keep the raw extracted text as the document body.
analysis.FullText = text
return analysis, nil
}
// callFireworks POSTs a chat-completion request to the Fireworks API and
// parses the model's reply into a DocumentAnalysis. It tolerates several
// model quirks: JSON placed in reasoning_content instead of content,
// markdown code fences around the JSON, and prose before/after the object.
// Unknown categories are coerced to "uncategorized".
func callFireworks(reqBody map[string]interface{}) (*DocumentAnalysis, error) {
// Marshal/NewRequest errors ignored: reqBody is built from in-process literals.
jsonBody, _ := json.Marshal(reqBody)
req, _ := http.NewRequest("POST", fireworksBaseURL+"/chat/completions", bytes.NewReader(jsonBody))
req.Header.Set("Authorization", "Bearer "+fireworksAPIKey)
req.Header.Set("Content-Type", "application/json")
client := &http.Client{Timeout: 120 * time.Second}
resp, err := client.Do(req)
if err != nil {
return nil, err
}
defer resp.Body.Close()
if resp.StatusCode != 200 {
body, _ := io.ReadAll(resp.Body)
return nil, fmt.Errorf("API error %d: %s", resp.StatusCode, string(body))
}
respBody, _ := io.ReadAll(resp.Body)
var result struct {
Choices []struct {
Message struct {
Content string `json:"content"`
ReasoningContent string `json:"reasoning_content"`
} `json:"message"`
} `json:"choices"`
}
if err := json.Unmarshal(respBody, &result); err != nil {
return nil, err
}
if len(result.Choices) == 0 {
return nil, fmt.Errorf("no response from API")
}
content := result.Choices[0].Message.Content
reasoning := result.Choices[0].Message.ReasoningContent
// K2.5 reasoning mode: actual JSON may be in content or reasoning_content
// Try content first, if it doesn't look like JSON, try reasoning_content
if !strings.Contains(content, "{") && reasoning != "" && strings.Contains(reasoning, "{") {
log.Printf(" [AI] Using reasoning_content (content had no JSON)")
content = reasoning
}
// Strip markdown code fences (```json ... ``` or ``` ... ```)
content = strings.TrimSpace(content)
if strings.HasPrefix(content, "```") {
// Remove opening fence (```json or ```)
if idx := strings.Index(content, "\n"); idx >= 0 {
content = content[idx+1:]
}
// Remove closing fence
if idx := strings.LastIndex(content, "```"); idx >= 0 {
content = content[:idx]
}
content = strings.TrimSpace(content)
}
// Extract JSON from response
// (trim any prose before the first '{' and after the last '}')
if idx := strings.Index(content, "{"); idx >= 0 {
if end := strings.LastIndex(content, "}"); end > idx {
content = content[idx : end+1]
}
}
var analysis DocumentAnalysis
if err := json.Unmarshal([]byte(content), &analysis); err != nil {
// Last resort: try to find a JSON object with braces matching
cleaned := extractJSONObject(content)
if cleaned != "" {
if err2 := json.Unmarshal([]byte(cleaned), &analysis); err2 != nil {
log.Printf(" [AI debug] Failed to parse even after cleanup. Content starts: %.200s", content)
return nil, fmt.Errorf("failed to parse response: %w", err)
}
} else {
log.Printf(" [AI debug] No JSON object found in response. Content starts: %.200s", content)
return nil, fmt.Errorf("failed to parse response: %w", err)
}
}
// Validate category
// (anything outside the known set is coerced to "uncategorized")
validCats := map[string]bool{"taxes": true, "bills": true, "medical": true, "insurance": true, "legal": true, "financial": true, "expenses": true, "vehicles": true, "home": true, "personal": true, "contacts": true, "uncategorized": true}
if !validCats[analysis.Category] {
analysis.Category = "uncategorized"
}
return &analysis, nil
}
// extractJSONObject returns the first balanced top-level JSON object found
// in s, honoring string literals and backslash escapes, or "" when no
// complete object exists.
func extractJSONObject(s string) string {
	start := strings.Index(s, "{")
	if start < 0 {
		return ""
	}
	var (
		depth    int
		inString bool
		escaped  bool
	)
	for i := start; i < len(s); i++ {
		switch c := s[i]; {
		case escaped:
			// This byte was escaped — consume it without interpretation.
			escaped = false
		case c == '\\' && inString:
			escaped = true
		case c == '"':
			inString = !inString
		case inString:
			// Structural characters inside string literals don't count.
		case c == '{':
			depth++
		case c == '}':
			depth--
			if depth == 0 {
				return s[start : i+1]
			}
		}
	}
	// Braces never balanced — no complete object.
	return ""
}
// GenerateEmbedding creates a vector embedding for text via the Fireworks
// embeddings endpoint (qwen3-embedding-8b) and returns the first embedding
// vector from the response.
func GenerateEmbedding(text string) ([]float32, error) {
if fireworksAPIKey == "" {
return nil, fmt.Errorf("FIREWORKS_API_KEY not set")
}
// Truncate
// (byte-based cut; may split a multi-byte UTF-8 rune at the boundary)
if len(text) > 32000 {
text = text[:32000]
}
reqBody := map[string]interface{}{
"model": "fireworks/qwen3-embedding-8b",
"input": text,
}
// Marshal/NewRequest errors ignored: the body is built from literals above.
jsonBody, _ := json.Marshal(reqBody)
req, _ := http.NewRequest("POST", fireworksBaseURL+"/embeddings", bytes.NewReader(jsonBody))
req.Header.Set("Authorization", "Bearer "+fireworksAPIKey)
req.Header.Set("Content-Type", "application/json")
client := &http.Client{Timeout: 30 * time.Second}
resp, err := client.Do(req)
if err != nil {
return nil, err
}
defer resp.Body.Close()
if resp.StatusCode != 200 {
body, _ := io.ReadAll(resp.Body)
return nil, fmt.Errorf("embedding API error %d: %s", resp.StatusCode, string(body))
}
var result struct {
Data []struct {
Embedding []float32 `json:"embedding"`
} `json:"data"`
}
if err := json.NewDecoder(resp.Body).Decode(&result); err != nil {
return nil, err
}
if len(result.Data) == 0 {
return nil, fmt.Errorf("no embedding returned")
}
return result.Data[0].Embedding, nil
}
// GetPDFPageCount returns the page count reported by pdfinfo for the given
// PDF. It falls back to 1 when pdfinfo fails or its output lacks a
// "Pages:" line (so callers always get a usable positive count... unless
// the Pages line itself fails to parse, in which case 0 is returned —
// matching the original behavior).
func GetPDFPageCount(filePath string) int {
	out, err := exec.Command("pdfinfo", filePath).Output()
	if err != nil {
		return 1
	}
	for _, line := range strings.Split(string(out), "\n") {
		if !strings.HasPrefix(line, "Pages:") {
			continue
		}
		var count int
		fmt.Sscanf(line, "Pages: %d", &count)
		return count
	}
	return 1
}
// ProcessPDFPageByPage OCRs a PDF one page at a time, concatenating the
// per-page markdown with "## Page N" separators. Pages that fail to convert
// or OCR are skipped (best-effort); jobID is used for progress reporting.
func ProcessPDFPageByPage(filePath string, jobID string) (string, error) {
	pageCount := GetPDFPageCount(filePath)
	log.Printf(" Processing %d pages separately...", pageCount)
	var allText strings.Builder
	for page := 1; page <= pageCount; page++ {
		UpdateJob(jobID, "ocr", fmt.Sprintf("Page %d/%d", page, pageCount))
		tmpDir, err := os.MkdirTemp("", "docsys-page")
		if err != nil {
			continue
		}
		// Convert this single page to PNG at 150 dpi.
		outPrefix := filepath.Join(tmpDir, "page")
		cmd := exec.Command("pdftoppm", "-png", "-f", fmt.Sprintf("%d", page), "-l", fmt.Sprintf("%d", page), "-r", "150", filePath, outPrefix)
		if err := cmd.Run(); err != nil {
			os.RemoveAll(tmpDir)
			continue
		}
		// BUG FIX: pdftoppm zero-pads the page number based on the document's
		// total page count (page-1.png / page-01.png / page-001.png), so the
		// previous hardcoded fmt.Sprintf("page-%d.png", page) missed the file
		// for any multi-page document. Glob for the single produced file.
		pageMatches, _ := filepath.Glob(filepath.Join(tmpDir, "page-*.png"))
		if len(pageMatches) == 0 {
			os.RemoveAll(tmpDir)
			continue
		}
		imageData, err := os.ReadFile(pageMatches[0])
		os.RemoveAll(tmpDir)
		if err != nil {
			continue
		}
		// OCR this page with the vision model.
		log.Printf(" Page %d/%d...", page, pageCount)
		pageAnalysis, err := AnalyzePageOnly(imageData, page)
		if err != nil {
			log.Printf(" Page %d failed: %v", page, err)
			continue
		}
		if pageAnalysis != "" {
			allText.WriteString(fmt.Sprintf("\n\n---\n## Page %d\n\n", page))
			allText.WriteString(pageAnalysis)
		}
	}
	return allText.String(), nil
}
// AnalyzePageOnly OCRs a single page image with the Fireworks vision model
// and returns only the transcribed text (no classification). pageNum is
// accepted for symmetry with the caller but is not used in the request.
func AnalyzePageOnly(imageData []byte, pageNum int) (string, error) {
if fireworksAPIKey == "" {
return "", fmt.Errorf("FIREWORKS_API_KEY not set")
}
// The image is inlined into the request as a base64 data: URL.
b64 := base64.StdEncoding.EncodeToString(imageData)
prompt := `Transcribe ALL visible text on this page EXACTLY as it appears, in the ORIGINAL language of the document. DO NOT translate. DO NOT summarize. Output ONLY the transcribed text — no commentary, no analysis, no preamble, no "The document is..." sentences. Start directly with the content.
FORMAT: Use ### for sections, **bold** for labels, markdown tables for tabular data, - bullets for lists. Preserve ALL numbers, dates, amounts, and values exactly as shown. If the document is in Russian, Dutch, German, French, or any other language — keep it in that language.`
reqBody := map[string]interface{}{
"model": "accounts/fireworks/models/qwen3-vl-30b-a3b-instruct",
"max_tokens": 4096,
"messages": []map[string]interface{}{
{
"role": "user",
"content": []map[string]interface{}{
{"type": "image_url", "image_url": map[string]string{"url": "data:image/png;base64," + b64}},
{"type": "text", "text": prompt},
},
},
},
}
// Marshal/NewRequest errors ignored: the body is built from literals above.
jsonBody, _ := json.Marshal(reqBody)
req, _ := http.NewRequest("POST", fireworksBaseURL+"/chat/completions", bytes.NewReader(jsonBody))
req.Header.Set("Authorization", "Bearer "+fireworksAPIKey)
req.Header.Set("Content-Type", "application/json")
client := &http.Client{Timeout: 120 * time.Second}
resp, err := client.Do(req)
if err != nil {
return "", err
}
defer resp.Body.Close()
if resp.StatusCode != 200 {
body, _ := io.ReadAll(resp.Body)
return "", fmt.Errorf("API error %d: %s", resp.StatusCode, string(body))
}
// Read raw response to debug content vs reasoning_content
rawBody, err := io.ReadAll(resp.Body)
if err != nil {
return "", err
}
var result struct {
Choices []struct {
Message struct {
Content string `json:"content"`
ReasoningContent string `json:"reasoning_content"`
} `json:"message"`
} `json:"choices"`
}
if err := json.Unmarshal(rawBody, &result); err != nil {
return "", err
}
if len(result.Choices) == 0 {
return "", fmt.Errorf("no response")
}
content := result.Choices[0].Message.Content
reasoning := result.Choices[0].Message.ReasoningContent
if reasoning != "" {
log.Printf(" [OCR debug] reasoning_content length: %d, content length: %d", len(reasoning), len(content))
if len(content) > 100 {
log.Printf(" [OCR debug] content starts: %.100s", content)
}
}
// If content is empty but reasoning has text, model put everything in wrong field
if strings.TrimSpace(content) == "" && reasoning != "" {
log.Printf(" [OCR debug] WARNING: content empty, using reasoning_content")
content = reasoning
}
return strings.TrimSpace(content), nil
}
// ProcessDocument handles the full document processing pipeline.
//
// Steps: hash the file (the hash doubles as document ID and store filename),
// skip if a "ready" record already exists, extract text (plain text, Office
// XML, or vision OCR depending on extension), classify, copy the original
// into the content-addressed store, insert the DB record, generate a search
// embedding (best-effort), and finally remove the inbox file.
func ProcessDocument(filePath string) (*Document, error) {
	log.Printf("Processing: %s", filepath.Base(filePath))
	ext := strings.ToLower(filepath.Ext(filePath))
	// Get file hash
	hash, err := FileHash(filePath)
	if err != nil {
		return nil, fmt.Errorf("hash failed: %w", err)
	}
	log.Printf(" Hash: %s", hash)
	// Start progress tracking
	StartJob(hash, filepath.Base(filePath))
	defer FinishJob(hash)
	// Check if already fully processed (not pending)
	if existing, _ := GetDocument(hash); existing != nil && existing.Status == "ready" {
		log.Printf(" Already exists, skipping")
		os.Remove(filePath)
		return existing, nil
	}
	var analysis *DocumentAnalysis
	if IsTextFile(ext) {
		// Plain text — read and analyze
		data, err := os.ReadFile(filePath)
		if err != nil {
			return nil, err
		}
		UpdateJob(hash, "classifying", "Analyzing text...")
		log.Printf(" Analyzing text with K2...")
		analysis, err = AnalyzeText(string(data), filepath.Base(filePath))
		if err != nil {
			UpdateJob(hash, "error", err.Error())
			return nil, fmt.Errorf("text analysis failed: %w", err)
		}
	} else if IsOfficeFile(ext) {
		// Office formats — extract text natively from ZIP/XML, no LibreOffice needed
		UpdateJob(hash, "converting", "Extracting text...")
		log.Printf(" Extracting text from %s...", ext)
		text, err := ExtractOfficeText(filePath)
		if err != nil {
			UpdateJob(hash, "error", err.Error())
			return nil, fmt.Errorf("office text extraction failed: %w", err)
		}
		log.Printf(" Extracted %d chars, classifying...", len(text))
		UpdateJob(hash, "classifying", "Classifying...")
		analysis, err = AnalyzeText(text, filepath.Base(filePath))
		if err != nil {
			UpdateJob(hash, "error", err.Error())
			return nil, fmt.Errorf("text analysis failed: %w", err)
		}
	} else {
		// Vision — convert to image and analyze
		UpdateJob(hash, "converting", "Converting to image...")
		log.Printf(" Converting to image...")
		imageData, err := ConvertToImage(filePath)
		if err != nil {
			UpdateJob(hash, "error", err.Error())
			return nil, fmt.Errorf("image conversion failed: %w", err)
		}
		UpdateJob(hash, "ocr", "Analyzing first page...")
		log.Printf(" Analyzing with K2.5 vision...")
		analysis, err = AnalyzeWithVision(imageData)
		if err != nil {
			// Vision JSON extraction failed — fall back to two-step: plain-text OCR + text classifier
			log.Printf(" AnalyzeWithVision failed (%v), falling back to OCR+classify...", err)
			UpdateJob(hash, "ocr", "Falling back to OCR + classify...")
			pageText, ocrErr := AnalyzePageOnly(imageData, 1)
			if ocrErr != nil {
				UpdateJob(hash, "error", ocrErr.Error())
				return nil, fmt.Errorf("vision analysis failed (primary: %v, fallback: %w)", err, ocrErr)
			}
			// Classify the extracted text
			log.Printf(" OCR succeeded (%d chars), classifying...", len(pageText))
			UpdateJob(hash, "classifying", "Classifying extracted text...")
			analysis, err = AnalyzeText(pageText, filepath.Base(filePath))
			if err != nil {
				// Use minimal stub so at least the doc is stored with its text
				log.Printf(" Classification failed too: %v — storing with minimal metadata", err)
				analysis = &DocumentAnalysis{
					Category: "uncategorized",
					DocType:  "unknown",
					Title:    strings.TrimSuffix(filepath.Base(filePath), filepath.Ext(filePath)),
					Summary:  "Extraction failed — stored raw text only",
					FullText: pageText,
				}
			} else {
				analysis.FullText = pageText
			}
		}
		// For PDFs, process pages for accurate OCR
		if ext == ".pdf" {
			pageCount := GetPDFPageCount(filePath)
			if pageCount > 1 {
				log.Printf(" Multi-page PDF detected (%d pages)", pageCount)
				UpdateJob(hash, "ocr", fmt.Sprintf("Multi-page PDF: %d pages", pageCount))
				fullText, err := ProcessPDFPageByPage(filePath, hash)
				if err == nil && fullText != "" {
					analysis.FullText = fullText
				}
			} else if analysis.FullText == "" || len(analysis.FullText) < 50 || strings.HasPrefix(analysis.FullText, "[") {
				// Single-page but full_text is empty/placeholder — retry with AnalyzePageOnly
				log.Printf(" Single-page PDF with bad full_text (%q) — retrying with AnalyzePageOnly...", analysis.FullText)
				UpdateJob(hash, "ocr", "Retrying text extraction...")
				if pageText, err := AnalyzePageOnly(imageData, 1); err == nil && pageText != "" {
					analysis.FullText = pageText
				} else if err != nil {
					log.Printf(" AnalyzePageOnly fallback failed: %v", err)
				}
			}
		}
	}
	log.Printf(" Category: %s, Type: %s", analysis.Category, analysis.DocType)
	// Copy to store. The store is keyed by content hash, so if the file is
	// already there (e.g. reprocessing a document whose inbox copy was
	// cleaned up after async OCR), the copy is a no-op — skip it rather than
	// fail on a missing source file.
	storePath := filepath.Join(storeDir, hash+ext)
	if _, statErr := os.Stat(storePath); os.IsNotExist(statErr) {
		if err := copyFile(filePath, storePath); err != nil {
			return nil, fmt.Errorf("store copy failed: %w", err)
		}
	}
	// Create document record
	// Use title if provided, fall back to summary
	title := analysis.Title
	if title == "" {
		title = analysis.Summary
	}
	doc := &Document{
		ID:           hash,
		Title:        title,
		Category:     analysis.Category,
		Type:         analysis.DocType,
		Date:         analysis.Date,
		Amount:       analysis.AmountString(),
		Vendor:       analysis.Vendor,
		Summary:      analysis.Summary,
		FullText:     analysis.FullText,
		PDFPath:      storePath,
		OriginalFile: filepath.Base(filePath),
		ProcessedAt:  time.Now().Format(time.RFC3339),
		Status:       "ready",
	}
	// Save to database
	if err := InsertDocument(doc); err != nil {
		return nil, fmt.Errorf("db insert failed: %w", err)
	}
	// Generate embedding (best-effort: failure is logged, not fatal)
	if analysis.FullText != "" {
		UpdateJob(hash, "embedding", "Generating search index...")
		log.Printf(" Generating embedding...")
		if emb, err := GenerateEmbedding(analysis.FullText); err == nil {
			log.Printf(" Embedding: %d dimensions", len(emb))
			StoreEmbedding(hash, emb)
		} else {
			log.Printf(" Embedding failed: %v", err)
		}
	}
	// Remove from inbox
	os.Remove(filePath)
	log.Printf(" ✓ Done: %s/%s", analysis.Category, hash)
	return doc, nil
}
// copyFile copies src to dst, creating or truncating dst.
// The destination's Close error is checked explicitly: on writes, buffered
// data may only be flushed at close time, so a deferred (ignored) Close
// could silently lose data.
func copyFile(src, dst string) error {
	in, err := os.Open(src)
	if err != nil {
		return err
	}
	defer in.Close()
	out, err := os.Create(dst)
	if err != nil {
		return err
	}
	if _, err := io.Copy(out, in); err != nil {
		out.Close()
		return err
	}
	return out.Close()
}
// ExtractOfficeText extracts plain text from Office documents natively.
// OOXML (DOCX/XLSX/PPTX) and OpenDocument (ODT/ODS/ODP) files are ZIP
// archives containing XML — no LibreOffice needed.
//
// Returns an error if the file is not a readable ZIP, the extension is
// unsupported, or no text could be extracted.
func ExtractOfficeText(filePath string) (string, error) {
	ext := strings.ToLower(filepath.Ext(filePath))
	r, err := zip.OpenReader(filePath)
	if err != nil {
		return "", fmt.Errorf("not a valid Office file: %w", err)
	}
	defer r.Close()
	// Which XML paths to extract per format
	var targets []string
	switch ext {
	case ".docx", ".doc", ".rtf":
		targets = []string{"word/document.xml", "word/body.xml"}
	case ".odt", ".ods", ".odp":
		// OpenDocument formats keep all document content in content.xml.
		// (Previously .odt wrongly looked for word/document.xml and
		// .ods/.odp hit the default error, even though IsOfficeFile
		// accepts all three.)
		targets = []string{"content.xml"}
	case ".xlsx", ".xls":
		// All sheets
		for _, f := range r.File {
			if strings.HasPrefix(f.Name, "xl/worksheets/sheet") && strings.HasSuffix(f.Name, ".xml") {
				targets = append(targets, f.Name)
			}
		}
		// Shared strings (cell values are stored here for xlsx)
		targets = append(targets, "xl/sharedStrings.xml")
	case ".pptx", ".ppt":
		for _, f := range r.File {
			if strings.HasPrefix(f.Name, "ppt/slides/slide") && strings.HasSuffix(f.Name, ".xml") {
				targets = append(targets, f.Name)
			}
		}
	default:
		return "", fmt.Errorf("unsupported office format: %s", ext)
	}
	// Set lookup instead of a nested loop over targets per archive entry.
	want := make(map[string]bool, len(targets))
	for _, t := range targets {
		want[t] = true
	}
	var sb strings.Builder
	for _, f := range r.File {
		if !want[f.Name] {
			continue
		}
		rc, err := f.Open()
		if err != nil {
			continue // best-effort: skip unreadable entries
		}
		data, err := io.ReadAll(rc)
		rc.Close()
		if err != nil {
			continue
		}
		sb.WriteString(xmlToText(data))
		sb.WriteString("\n")
	}
	text := strings.TrimSpace(sb.String())
	if text == "" {
		return "", fmt.Errorf("no text extracted from %s", filepath.Base(filePath))
	}
	return text, nil
}
// xmlToText strips XML tags and decodes common entities, returning plain
// text with runs of whitespace collapsed to single spaces.
func xmlToText(data []byte) string {
	var sb strings.Builder
	inTag := false
	lastWasSpace := false
	for i := 0; i < len(data); i++ {
		c := data[i]
		switch {
		case c == '<':
			inTag = true
			// Emit space between elements so words don't run together
			if !lastWasSpace {
				sb.WriteByte(' ')
				lastWasSpace = true
			}
		case c == '>':
			inTag = false
		case !inTag:
			if c == ' ' || c == '\t' || c == '\n' || c == '\r' {
				if !lastWasSpace {
					sb.WriteByte(' ')
					lastWasSpace = true
				}
			} else {
				sb.WriteByte(c)
				lastWasSpace = false
			}
		}
	}
	// Decode common XML entities. &amp; MUST be decoded last: decoding it
	// first double-decodes escaped entities (e.g. "&amp;lt;" would become
	// "<" instead of the correct "&lt;").
	rep := strings.NewReplacer(
		"&lt;", "<",
		"&gt;", ">",
		"&quot;", "\"",
		"&apos;", "'",
		"&#xA;", "\n",
		"&#x9;", "\t",
	)
	result := rep.Replace(sb.String())
	result = strings.ReplaceAll(result, "&amp;", "&")
	return strings.TrimSpace(result)
}
// IsOfficeFile reports whether ext (lowercase, with leading dot) is an
// Office format with extractable XML text. Keep the accepted extensions in
// sync with the switch in ExtractOfficeText.
func IsOfficeFile(ext string) bool {
	// A switch avoids allocating a map literal on every call.
	switch ext {
	case ".docx", ".xlsx", ".pptx", ".odt", ".ods", ".odp":
		return true
	}
	return false
}

77
db.go
View File

@ -337,7 +337,7 @@ func SearchDocuments(query string, limit int) ([]Document, error) {
FROM documents d
JOIN documents_fts fts ON d.id = fts.id
WHERE documents_fts MATCH ?
ORDER BY rank
ORDER BY d.processed_at DESC
LIMIT ?
`, query, limit)
@ -715,3 +715,78 @@ func scanDocumentRows(rows *sql.Rows) ([]Document, error) {
}
return docs, rows.Err()
}
// SearchDocumentsWithFullText is like SearchDocuments but includes the full_text
// column. Used by the format=md endpoint where OCR content is needed.
func SearchDocumentsWithFullText(query string, limit int) ([]Document, error) {
if limit <= 0 {
limit = 200
}
rows, err := db.Query(`
SELECT d.id, COALESCE(d.title,''), COALESCE(d.category,''), COALESCE(d.type,''),
COALESCE(d.date,''), COALESCE(d.amount,''), COALESCE(d.vendor,''),
COALESCE(d.summary,''), COALESCE(d.full_text,''), COALESCE(d.pdf_path,''),
COALESCE(d.processed_at,''), COALESCE(d.original_file,''), COALESCE(d.status,'ready')
FROM documents d
JOIN documents_fts fts ON d.id = fts.id
WHERE documents_fts MATCH ?
ORDER BY d.processed_at DESC
LIMIT ?
`, query, limit)
if err != nil {
return nil, err
}
defer rows.Close()
var docs []Document
for rows.Next() {
var doc Document
if err := rows.Scan(
&doc.ID, &doc.Title, &doc.Category, &doc.Type, &doc.Date,
&doc.Amount, &doc.Vendor, &doc.Summary, &doc.FullText, &doc.PDFPath,
&doc.ProcessedAt, &doc.OriginalFile, &doc.Status,
); err != nil {
continue
}
docs = append(docs, doc)
}
return docs, rows.Err()
}
// SearchDocumentsWithFullTextFallback is the LIKE-based fallback that also includes full_text.
func SearchDocumentsWithFullTextFallback(query string, limit int) ([]Document, error) {
if limit <= 0 {
limit = 200
}
pattern := "%" + query + "%"
rows, err := db.Query(`
SELECT id, COALESCE(title,''), COALESCE(category,''), COALESCE(type,''),
COALESCE(date,''), COALESCE(amount,''), COALESCE(vendor,''),
COALESCE(summary,''), COALESCE(full_text,''), COALESCE(pdf_path,''),
COALESCE(processed_at,''), COALESCE(original_file,''), COALESCE(status,'ready')
FROM documents
WHERE title LIKE ? OR summary LIKE ? OR vendor LIKE ? OR full_text LIKE ?
ORDER BY processed_at DESC
LIMIT ?
`, pattern, pattern, pattern, pattern, limit)
if err != nil {
return nil, err
}
defer rows.Close()
var docs []Document
for rows.Next() {
var doc Document
if err := rows.Scan(
&doc.ID, &doc.Title, &doc.Category, &doc.Type, &doc.Date,
&doc.Amount, &doc.Vendor, &doc.Summary, &doc.FullText, &doc.PDFPath,
&doc.ProcessedAt, &doc.OriginalFile, &doc.Status,
); err != nil {
continue
}
docs = append(docs, doc)
}
return docs, rows.Err()
}

BIN
docsys.bak-20260225-134659 Executable file

Binary file not shown.

175
main.go
View File

@ -72,6 +72,10 @@ func main() {
"title": strings.Title,
"safe": func(s string) template.HTML { return template.HTML(s) },
"multiply": func(a float64, b float64) float64 { return a * b },
"isImage": func(filename string) bool {
ext := strings.ToLower(filepath.Ext(filename))
return ext == ".jpg" || ext == ".jpeg" || ext == ".png" || ext == ".gif" || ext == ".webp" || ext == ".tiff"
},
}
r := chi.NewRouter()
@ -84,6 +88,7 @@ func main() {
// PDF serving
r.Get("/pdf/{hash}", servePDF)
r.Get("/img/{hash}", serveImage)
// Pages
r.Get("/", dashboardHandler)
@ -93,6 +98,7 @@ func main() {
r.Get("/search", searchHandler)
// API endpoints
r.Get("/api/search", apiSearchMDHandler)
r.Post("/api/search", apiSearchHandler)
r.Get("/api/documents", apiDocumentsHandler)
r.Get("/api/processing", apiProcessingHandler)
@ -151,6 +157,8 @@ func categoryIcon(cat string) string {
func formatDate(s string) string {
formats := []string{
"2006-01-02T15:04:05-07:00",
"2006-01-02T15:04:05.999999-07:00",
"2006-01-02T15:04:05.999999",
"2006-01-02T15:04:05",
"2006-01-02",
@ -323,6 +331,26 @@ func servePDF(w http.ResponseWriter, r *http.Request) {
http.Error(w, "File not found", http.StatusNotFound)
}
func serveImage(w http.ResponseWriter, r *http.Request) {
hash := chi.URLParam(r, "hash")
mimeTypes := map[string]string{
".jpg": "image/jpeg", ".jpeg": "image/jpeg",
".png": "image/png", ".gif": "image/gif",
".webp": "image/webp", ".tiff": "image/tiff",
}
for ext, mime := range mimeTypes {
path := filepath.Join(storeDir, hash+ext)
if _, err := os.Stat(path); err == nil {
w.Header().Set("Content-Type", mime)
w.Header().Set("Cache-Control", "private, max-age=3600")
http.ServeFile(w, r, path)
return
}
}
http.NotFound(w, r)
}
// sanitizeFilename removes characters unsafe for use in Content-Disposition filenames.
func sanitizeFilename(name string) string {
replacer := strings.NewReplacer(`"`, "'", "/", "-", "\\", "-", "\n", " ", "\r", "")
@ -347,6 +375,77 @@ func apiSearchHandler(w http.ResponseWriter, r *http.Request) {
renderPartial(w, "document-list", docs)
}
// apiSearchMDHandler handles GET /api/search?q={query}&format=md
// Returns all matching documents as concatenated plain-text markdown,
// one document per section separated by ---. Intended for AI/LLM consumption.
func apiSearchMDHandler(w http.ResponseWriter, r *http.Request) {
query := r.URL.Query().Get("q")
format := r.URL.Query().Get("format")
// Only serve markdown format; anything else falls back to a 400
if format != "md" {
http.Error(w, "format=md required", http.StatusBadRequest)
return
}
if query == "" {
http.Error(w, "q parameter required", http.StatusBadRequest)
return
}
docs, err := SearchDocumentsWithFullText(query, 200)
if err != nil {
docs, _ = SearchDocumentsWithFullTextFallback(query, 200)
}
w.Header().Set("Content-Type", "text/markdown; charset=utf-8")
if len(docs) == 0 {
w.Write([]byte("No documents found matching: " + query + "\n"))
return
}
var sb strings.Builder
for i, doc := range docs {
sb.WriteString("# Document: ")
if doc.Title != "" {
sb.WriteString(doc.Title)
} else {
sb.WriteString("(untitled)")
}
sb.WriteString("\n")
sb.WriteString("ID: ")
sb.WriteString(doc.ID)
if doc.Category != "" {
sb.WriteString(" | Category: ")
sb.WriteString(doc.Category)
}
if doc.Date != "" {
sb.WriteString(" | Date: ")
sb.WriteString(doc.Date)
}
if doc.Vendor != "" {
sb.WriteString(" | Vendor: ")
sb.WriteString(doc.Vendor)
}
sb.WriteString("\n\n")
text := doc.FullText
if text == "" {
text = doc.Summary
}
if text != "" {
sb.WriteString(text)
sb.WriteString("\n")
}
if i < len(docs)-1 {
sb.WriteString("\n---\n\n")
}
}
w.Write([]byte(sb.String()))
}
func apiProcessingHandler(w http.ResponseWriter, r *http.Request) {
w.Header().Set("Content-Type", "application/json")
json.NewEncoder(w).Encode(GetActiveJobs())
@ -444,6 +543,7 @@ func ingestHandler(w http.ResponseWriter, r *http.Request) {
var req struct {
Filename string `json:"filename"`
Content string `json:"content"`
URL string `json:"url"`
Source string `json:"source"`
Subject string `json:"subject"`
From string `json:"from"`
@ -454,16 +554,73 @@ func ingestHandler(w http.ResponseWriter, r *http.Request) {
return
}
if req.Filename == "" || req.Content == "" {
http.Error(w, "filename and content are required", http.StatusBadRequest)
return
}
var data []byte
// Decode base64 content
data, err := base64.StdEncoding.DecodeString(req.Content)
if err != nil {
http.Error(w, "Invalid base64 content", http.StatusBadRequest)
return
if req.URL != "" {
// Fetch from URL
resp, err := http.Get(req.URL)
if err != nil {
http.Error(w, "Failed to fetch URL: "+err.Error(), http.StatusBadGateway)
return
}
defer resp.Body.Close()
if resp.StatusCode >= 400 {
http.Error(w, fmt.Sprintf("URL returned %d", resp.StatusCode), http.StatusBadGateway)
return
}
data, err = io.ReadAll(resp.Body)
if err != nil {
http.Error(w, "Failed to read URL content", http.StatusInternalServerError)
return
}
// Derive filename from URL or content-type if not provided
if req.Filename == "" {
ext := ""
ct := resp.Header.Get("Content-Type")
switch {
case strings.Contains(ct, "jpeg") || strings.Contains(ct, "jpg"):
ext = ".jpg"
case strings.Contains(ct, "png"):
ext = ".png"
case strings.Contains(ct, "gif"):
ext = ".gif"
case strings.Contains(ct, "webp"):
ext = ".webp"
case strings.Contains(ct, "pdf"):
ext = ".pdf"
case strings.Contains(ct, "tiff"):
ext = ".tiff"
default:
// Try to get extension from URL path
urlPath := resp.Request.URL.Path
if e := filepath.Ext(urlPath); e != "" {
ext = e
} else {
ext = ".bin"
}
}
// Use last path segment of URL as base name
base := filepath.Base(resp.Request.URL.Path)
if base == "." || base == "/" {
base = "url-import"
}
if filepath.Ext(base) == "" {
base += ext
}
req.Filename = base
}
} else {
if req.Filename == "" || req.Content == "" {
http.Error(w, "filename and content required, or provide url", http.StatusBadRequest)
return
}
// Decode base64 content
var err error
data, err = base64.StdEncoding.DecodeString(req.Content)
if err != nil {
http.Error(w, "Invalid base64 content", http.StatusBadRequest)
return
}
}
// Sanitize filename

View File

@ -120,7 +120,7 @@
{{if .Stats.RecentUploads}}
<div class="divide-y divide-gray-100 dark:divide-gray-700">
{{range .Stats.RecentUploads}}
<a href="/document/{{.ID}}" class="flex items-center px-5 py-4 hover:bg-gray-50 dark:hover:bg-gray-700/50 transition-colors group">
<div class="flex items-center px-5 py-4 hover:bg-gray-50 dark:hover:bg-gray-700/50 transition-colors group cursor-pointer" onclick="location.href='/document/{{.ID}}'">
<div class="flex-shrink-0 p-2 bg-gray-100 dark:bg-gray-700 rounded-lg mr-4">
<svg class="w-6 h-6 text-gray-500 dark:text-gray-400" fill="none" stroke="currentColor" viewBox="0 0 24 24">
<path stroke-linecap="round" stroke-linejoin="round" stroke-width="2" d="M9 12h6m-6 4h6m2 5H7a2 2 0 01-2-2V5a2 2 0 012-2h5.586a1 1 0 01.707.293l5.414 5.414a1 1 0 01.293.707V19a2 2 0 01-2 2z"></path>
@ -130,27 +130,34 @@
<p class="font-medium text-gray-900 dark:text-white truncate group-hover:text-brand-600 dark:group-hover:text-brand-400 transition-colors">{{.Title}}</p>
<p class="text-sm text-gray-500 dark:text-gray-400 truncate">{{truncate .Summary 100}}</p>
</div>
<div class="flex-shrink-0 ml-4 text-right">
{{if eq .Status "processing"}}
<span class="inline-flex items-center px-2.5 py-0.5 rounded-full text-xs font-medium bg-amber-100 dark:bg-amber-900/30 text-amber-700 dark:text-amber-300" data-processing="{{.ID}}">
<svg class="animate-spin -ml-0.5 mr-1.5 h-3 w-3" fill="none" viewBox="0 0 24 24">
<circle class="opacity-25" cx="12" cy="12" r="10" stroke="currentColor" stroke-width="4"></circle>
<path class="opacity-75" fill="currentColor" d="M4 12a8 8 0 018-8V0C5.373 0 0 5.373 0 12h4zm2 5.291A7.962 7.962 0 014 12H0c0 3.042 1.135 5.824 3 7.938l3-2.647z"></path>
<div class="flex-shrink-0 ml-4 flex items-center gap-3">
<div class="text-right">
{{if eq .Status "processing"}}
<span class="inline-flex items-center px-2.5 py-0.5 rounded-full text-xs font-medium bg-amber-100 dark:bg-amber-900/30 text-amber-700 dark:text-amber-300" data-processing="{{.ID}}">
<svg class="animate-spin -ml-0.5 mr-1.5 h-3 w-3" fill="none" viewBox="0 0 24 24">
<circle class="opacity-25" cx="12" cy="12" r="10" stroke="currentColor" stroke-width="4"></circle>
<path class="opacity-75" fill="currentColor" d="M4 12a8 8 0 018-8V0C5.373 0 0 5.373 0 12h4zm2 5.291A7.962 7.962 0 014 12H0c0 3.042 1.135 5.824 3 7.938l3-2.647z"></path>
</svg>
Processing
</span>
{{else if eq .Status "error"}}
<span class="inline-flex items-center px-2.5 py-0.5 rounded-full text-xs font-medium bg-red-100 dark:bg-red-900/30 text-red-700 dark:text-red-300">
Error
</span>
{{else}}
<span class="inline-flex items-center px-2.5 py-0.5 rounded-full text-xs font-medium bg-brand-100 dark:bg-brand-900/30 text-brand-700 dark:text-brand-300">
{{title .Category}}
</span>
{{end}}
<p class="text-xs text-gray-400 dark:text-gray-500 mt-1">{{formatDateTime .ProcessedAt}}</p>
</div>
<button onclick="event.stopPropagation(); deleteDoc('{{.ID}}', '{{.Title}}')" class="opacity-0 group-hover:opacity-100 p-1.5 text-gray-400 hover:text-red-600 hover:bg-red-50 dark:hover:bg-red-900/20 rounded-lg transition-all" title="Delete">
<svg class="w-4 h-4" fill="none" stroke="currentColor" viewBox="0 0 24 24">
<path stroke-linecap="round" stroke-linejoin="round" stroke-width="2" d="M19 7l-.867 12.142A2 2 0 0116.138 21H7.862a2 2 0 01-1.995-1.858L5 7m5 4v6m4-6v6m1-10V4a1 1 0 00-1-1h-4a1 1 0 00-1 1v3M4 7h16"></path>
</svg>
Processing
</span>
{{else if eq .Status "error"}}
<span class="inline-flex items-center px-2.5 py-0.5 rounded-full text-xs font-medium bg-red-100 dark:bg-red-900/30 text-red-700 dark:text-red-300">
Error
</span>
{{else}}
<span class="inline-flex items-center px-2.5 py-0.5 rounded-full text-xs font-medium bg-brand-100 dark:bg-brand-900/30 text-brand-700 dark:text-brand-300">
{{title .Category}}
</span>
{{end}}
<p class="text-xs text-gray-400 dark:text-gray-500 mt-1">{{formatDateTime .ProcessedAt}}</p>
</button>
</div>
</a>
</div>
{{end}}
</div>
{{else}}
@ -292,6 +299,51 @@
}
});
// Clipboard paste — handles Ctrl+V of images/screenshots
document.addEventListener('paste', async (e) => {
const items = Array.from(e.clipboardData.items).filter(i => i.type.startsWith('image/'));
if (!items.length) return;
e.preventDefault();
progress.classList.remove('hidden');
const pendingIds = [];
for (const item of items) {
const file = item.getAsFile();
if (!file) continue;
const ext = item.type.split('/')[1] || 'png';
const name = `paste-${Date.now()}.${ext}`;
try {
const formData = new FormData();
formData.append('file', new File([file], name, { type: item.type }));
const res = await fetch('/api/upload', { method: 'POST', body: formData });
const data = await res.json();
if (data.status === 'success') {
showToast('✓ Pasted image', 'success');
pendingIds.push(data.id);
} else {
showToast('Paste failed', 'error');
}
} catch (err) {
showToast('Paste failed: ' + err.message, 'error');
}
}
if (pendingIds.length > 0) {
sessionStorage.setItem('pendingDocs', JSON.stringify(pendingIds));
window.location.reload();
} else {
progress.classList.add('hidden');
}
});
async function deleteDoc(id, title) {
if (!confirm('Delete "' + title + '"?')) return;
const res = await fetch('/api/document/' + id, { method: 'DELETE' });
if (res.ok) {
window.location.reload();
} else {
alert('Failed to delete document');
}
}
// Check for pending docs on page load (from web upload)
const pendingDocsJson = sessionStorage.getItem('pendingDocs');
if (pendingDocsJson) {

View File

@ -26,6 +26,12 @@
</div>
</div>
<div class="flex gap-2">
<button onclick="deleteDocument()" class="inline-flex items-center px-3 py-2 bg-white dark:bg-gray-800 text-red-600 dark:text-red-400 text-sm font-medium rounded-lg border border-red-200 dark:border-red-800 hover:bg-red-50 dark:hover:bg-red-900/20 transition-all">
<svg class="w-4 h-4 mr-1.5" fill="none" stroke="currentColor" viewBox="0 0 24 24">
<path stroke-linecap="round" stroke-linejoin="round" stroke-width="2" d="M19 7l-.867 12.142A2 2 0 0116.138 21H7.862a2 2 0 01-1.995-1.858L5 7m5 4v6m4-6v6m1-10V4a1 1 0 00-1-1h-4a1 1 0 00-1 1v3M4 7h16"></path>
</svg>
Delete
</button>
<button onclick="toggleEdit()" class="inline-flex items-center px-3 py-2 bg-white dark:bg-gray-800 text-gray-700 dark:text-gray-200 text-sm font-medium rounded-lg border border-gray-300 dark:border-gray-600 hover:bg-gray-50 dark:hover:bg-gray-700 transition-all">
<svg class="w-4 h-4 mr-1.5" fill="none" stroke="currentColor" viewBox="0 0 24 24">
<path stroke-linecap="round" stroke-linejoin="round" stroke-width="2" d="M11 5H6a2 2 0 00-2 2v11a2 2 0 002 2h11a2 2 0 002-2v-5m-1.414-9.414a2 2 0 112.828 2.828L11.828 15H9v-2.828l8.586-8.586z"></path>
@ -163,7 +169,16 @@
{{end}}
<div>
{{if .Document.PDFPath}}
{{if isImage .Document.OriginalFile}}
<div class="bg-white dark:bg-gray-800 rounded-2xl shadow-sm border border-gray-100 dark:border-gray-700 overflow-hidden">
<div class="px-6 py-4 border-b border-gray-100 dark:border-gray-700">
<h2 class="font-semibold text-gray-900 dark:text-white">Preview</h2>
</div>
<div class="p-4 flex justify-center">
<img src="/img/{{.Document.ID}}" alt="{{.Document.Title}}" class="max-w-full rounded-lg shadow-sm" style="max-height:600px;object-fit:contain;">
</div>
</div>
{{else if .Document.PDFPath}}
<div class="bg-white dark:bg-gray-800 rounded-2xl shadow-sm border border-gray-100 dark:border-gray-700 overflow-hidden">
<div class="px-6 py-4 border-b border-gray-100 dark:border-gray-700 flex items-center justify-between">
<h2 class="font-semibold text-gray-900 dark:text-white">Document Preview</h2>

View File

@ -8,7 +8,10 @@
<p class="font-medium text-gray-900 dark:text-white truncate text-sm">{{.Title}}</p>
<p class="text-xs text-gray-500 dark:text-gray-400 truncate">{{truncate .Summary 50}}</p>
</div>
<span class="ml-2 text-xs text-gray-400">{{title .Category}}</span>
<div class="ml-2 text-right flex-shrink-0">
<span class="text-xs text-gray-400 block">{{title .Category}}</span>
{{if .ProcessedAt}}<span class="text-xs text-gray-400 block">{{formatDate .ProcessedAt}}</span>{{end}}
</div>
</a>
{{end}}
</div>

View File

@ -56,7 +56,9 @@
{{title .Type}}
</span>
{{end}}
{{if .Date}}
{{if .ProcessedAt}}
<span class="text-xs text-gray-500 dark:text-gray-400">📅 {{formatDateTime .ProcessedAt}}</span>
{{else if .Date}}
<span class="text-xs text-gray-500 dark:text-gray-400">{{formatDate .Date}}</span>
{{end}}
{{if .Score}}