// docsys/ai.go

package main

import (
	"archive/zip"
	"bytes"
	"crypto/sha256"
	"encoding/base64"
	"encoding/json"
	"fmt"
	"io"
	"log"
	"net/http"
	"os"
	"os/exec"
	"path/filepath"
	"strings"
	"time"
)

// fireworksAPIKey is the bearer token sent with every Fireworks request.
// It starts empty and is filled in by init.
var fireworksAPIKey string

// fireworksBaseURL is the root URL of the Fireworks inference REST API.
var fireworksBaseURL = "https://api.fireworks.ai/inference/v1"

// init resolves the Fireworks API key. The FIREWORKS_API_KEY environment
// variable takes precedence; when it is empty, the first line starting with
// "FIREWORKS_API_KEY=" in $HOME/dev/docsys/.env is used, with surrounding
// whitespace and quote characters removed. A missing or unreadable .env file
// leaves the key empty.
func init() {
	const keyPrefix = "FIREWORKS_API_KEY="

	fireworksAPIKey = os.Getenv("FIREWORKS_API_KEY")
	if fireworksAPIKey != "" {
		return
	}

	// Fall back to the .env file in the docsys directory.
	raw, err := os.ReadFile(filepath.Join(os.Getenv("HOME"), "dev/docsys/.env"))
	if err != nil {
		return
	}
	for _, entry := range strings.Split(string(raw), "\n") {
		if !strings.HasPrefix(entry, keyPrefix) {
			continue
		}
		value := strings.TrimSpace(strings.TrimPrefix(entry, keyPrefix))
		fireworksAPIKey = strings.Trim(value, `"'`)
		return
	}
}

// DocumentAnalysis holds the fields the AI model extracts from a document.
// The JSON tags match the keys requested from the model in the prompts.
type DocumentAnalysis struct {
	Category string      `json:"category"`
	DocType  string      `json:"doc_type"`
	Date     string      `json:"date"`
	Vendor   string      `json:"vendor"`
	Amount   interface{} `json:"amount"` // decoded as string or float64, depending on what the model emits
	Title    string      `json:"title"`
	Summary  string      `json:"summary"`
	FullText string      `json:"full_text"`
}

// AmountString renders Amount as text. A string value is returned untouched,
// a float64 is formatted as "$" followed by two decimals, and anything else
// (including a missing amount) yields the empty string.
func (d *DocumentAnalysis) AmountString() string {
	if text, isText := d.Amount.(string); isText {
		return text
	}
	if number, isNumber := d.Amount.(float64); isNumber {
		return fmt.Sprintf("$%.2f", number)
	}
	return ""
}

// FileHash streams the file at the given path through SHA-256 and returns the
// first 16 hexadecimal characters of the digest. It returns an error when the
// file cannot be opened or read.
func FileHash(filepath string) (string, error) {
	src, err := os.Open(filepath)
	if err != nil {
		return "", err
	}
	defer src.Close()

	hasher := sha256.New()
	_, err = io.Copy(hasher, src)
	if err != nil {
		return "", err
	}

	digest := fmt.Sprintf("%x", hasher.Sum(nil))
	return digest[:16], nil
}

// ConvertToImage produces image bytes for the vision API from a document.
//
// The extension is matched case-insensitively. Office documents are first
// converted to PDF with LibreOffice ("soffice"); PDFs are then rasterised with
// "pdftoppm" at 150 DPI, first page only. Any other file is read and returned
// unchanged, so the result is not necessarily PNG. Temporary directories
// created along the way are removed before the function returns.
//
// It returns an error when a temp directory cannot be created, an external
// tool fails, no rasterised page is found, or the file cannot be read.
func ConvertToImage(filePath string) ([]byte, error) {
	origExt := filepath.Ext(filePath)
	ext := strings.ToLower(origExt)

	// Office documents → PDF first
	officeExts := map[string]bool{".doc": true, ".docx": true, ".odt": true, ".rtf": true, ".xls": true, ".xlsx": true, ".ppt": true, ".pptx": true}
	if officeExts[ext] {
		tmpDir, err := os.MkdirTemp("", "docsys")
		if err != nil {
			return nil, err
		}
		// Deferred to function exit: the PDF written here is still needed below.
		defer os.RemoveAll(tmpDir)

		cmd := exec.Command("soffice", "--headless", "--convert-to", "pdf", "--outdir", tmpDir, filePath)
		if err := cmd.Run(); err != nil {
			return nil, fmt.Errorf("libreoffice conversion failed: %w", err)
		}

		// The converted file is named after the input's base name, so strip the
		// extension exactly as it appears on disk. Trimming the lower-cased
		// form would leave "Report.DOC" untouched and point at a PDF that was
		// never written.
		base := strings.TrimSuffix(filepath.Base(filePath), origExt)
		filePath = filepath.Join(tmpDir, base+".pdf")
		ext = ".pdf"
	}

	// PDF → PNG (first page only for preview, full processing done separately)
	if ext == ".pdf" {
		tmpDir, err := os.MkdirTemp("", "docsys")
		if err != nil {
			return nil, err
		}
		defer os.RemoveAll(tmpDir)

		// Convert first page for initial analysis
		outPrefix := filepath.Join(tmpDir, "page")
		cmd := exec.Command("pdftoppm", "-png", "-f", "1", "-l", "1", "-r", "150", filePath, outPrefix)
		if err := cmd.Run(); err != nil {
			return nil, fmt.Errorf("pdftoppm failed: %w", err)
		}

		// pdftoppm uses variable-width zero-padding depending on page count
		// (e.g. page-01.png for <100 pages, page-001.png for <1000 pages).
		// Glob for the first match instead of hardcoding "page-1.png".
		matches, err := filepath.Glob(filepath.Join(tmpDir, "page-*.png"))
		if err != nil || len(matches) == 0 {
			return nil, fmt.Errorf("pdftoppm output not found in %s", tmpDir)
		}
		return os.ReadFile(matches[0])
	}

	// Image files — read directly
	return os.ReadFile(filePath)
}


// IsTextFile reports whether ext (including the leading dot, compared
// exactly as given) is one of the plain-text extensions that are analyzed
// directly as text.
func IsTextFile(ext string) bool {
	switch ext {
	case ".txt", ".md", ".markdown", ".text", ".log",
		".json", ".xml", ".csv", ".yaml", ".yml":
		return true
	default:
		return false
	}
}

// AnalyzeWithVision sends a document image to the Fireworks vision model
// (qwen3-vl-30b-a3b-instruct) and returns the structured analysis.
//
// The image is embedded as a base64 data URL. Its media type is sniffed from
// the bytes because callers may pass JPEG or other image formats, not only
// PNG; content that is not recognised as an image is labelled image/png.
//
// If the first request fails for any reason it is retried once with a much
// shorter prompt, and the result of that second attempt is returned. An error
// is returned immediately when no API key is configured.
func AnalyzeWithVision(imageData []byte) (*DocumentAnalysis, error) {
	if fireworksAPIKey == "" {
		return nil, fmt.Errorf("FIREWORKS_API_KEY not set")
	}

	mediaType := http.DetectContentType(imageData)
	if !strings.HasPrefix(mediaType, "image/") {
		mediaType = "image/png"
	}
	dataURL := "data:" + mediaType + ";base64," + base64.StdEncoding.EncodeToString(imageData)

	// buildRequest assembles the chat-completions payload; the two attempts
	// differ only in their system and user prompts.
	buildRequest := func(systemPrompt, userPrompt string) map[string]interface{} {
		return map[string]interface{}{
			"model":      "accounts/fireworks/models/qwen3-vl-30b-a3b-instruct",
			"max_tokens": 4096,
			"messages": []map[string]interface{}{
				{"role": "system", "content": systemPrompt},
				{
					"role": "user",
					"content": []map[string]interface{}{
						{"type": "image_url", "image_url": map[string]string{"url": dataURL}},
						{"type": "text", "text": userPrompt},
					},
				},
			},
		}
	}

	prompt := `Analyze this document image and extract:

1. **Full Text**: Transcribe ALL visible text EXACTLY as it appears — in the ORIGINAL language of the document. DO NOT translate. DO NOT summarize. Transcribe word-for-word in the source language (Russian, Dutch, German, French, etc.) formatted as clean Markdown:
   - Use headers (##) for sections
   - Use **bold** for labels/field names
   - Use tables for tabular data (items, prices, etc.)
   - Use bullet lists where appropriate
   - Preserve ALL numbers, dates, amounts, and codes exactly as shown

2. **Classification**: Categorize into exactly ONE of:
   taxes, bills, medical, insurance, legal, financial, expenses, vehicles, home, personal, contacts, uncategorized

3. **Document Type**: Specific type (e.g., "utility_bill", "receipt", "tax_form_w2")

4. **Key Fields** (these may be in English for searchability):
   - date: Document date (YYYY-MM-DD if possible)
   - vendor: Company/organization name
   - amount: Dollar/currency amount if present (e.g., "$123.45" or "809,400 BYN")

5. **Title**: Specific English title (max 8 words) that identifies THIS document uniquely. Ask yourself: "If I had to find this document again, what would I search for?" Prioritize the most unique/recoverable detail: an account number, confirmation code, ID, or specific value that can't be inferred from context. If the document's main value is a number or ID, put it in the title. Bad: "FedEx Receipt Jan 2026" (generic). Good: "FedEx Account 203634010" or "Duke Energy Electric Bill Oct 2024" or "IRS Form 1099-INT Chase Bank 2024". Never use generic words like "Document", "Letter", "Report" alone — always qualify them.

6. **Summary**: 1-2 sentence English description with key details.

IMPORTANT: The full_text field MUST contain the verbatim transcription in the document's original language. This is non-negotiable.

**Known proper nouns** (use these exact spellings when you see similar handwriting):
- Jongsma (surname — may look like "Jongoma", "Jongsoma", "Jongma")
- Johan (first name)
- Tatyana / Tanya (first name)
- St. Petersburg, Florida (city — not Russia)

Respond in JSON ONLY:
{"category": "...", "doc_type": "...", "date": "...", "vendor": "...", "amount": "...", "title": "...", "summary": "...", "full_text": "..."}`

	analysis, err := callFireworks(buildRequest(
		"You are a document analysis API. Output ONLY raw JSON. No thinking, no commentary, no code fences. First character must be {, last character must be }.",
		prompt,
	))
	if err == nil {
		return analysis, nil
	}

	// Retry once with minimal prompt to avoid triggering extended reasoning
	log.Printf("  [AI] First attempt failed, retrying with simplified prompt...")
	return callFireworks(buildRequest(
		"Output valid JSON only. No other text.",
		`Look at this document image. Transcribe ALL text verbatim (original language, do NOT translate). Return ONLY this JSON with no placeholders:
{"category":"","doc_type":"","date":"","vendor":"","amount":"","title":"","summary":"","full_text":""}`,
	))
}

// AnalyzeText classifies already-extracted text with the Fireworks text model
// (kimi-k2-instruct-0905) and returns the structured analysis.
//
// text is cut to at most 50000 bytes before being embedded in the prompt; the
// cut is moved back to a UTF-8 rune boundary so a multi-byte character is
// never split. filename is passed to the model as a hint. When the model
// returns no full_text, FullText falls back to the (possibly truncated) input.
// An error is returned when no API key is configured or the API call fails.
func AnalyzeText(text, filename string) (*DocumentAnalysis, error) {
	if fireworksAPIKey == "" {
		return nil, fmt.Errorf("FIREWORKS_API_KEY not set")
	}

	// Truncate long text
	const maxTextBytes = 50000
	if len(text) > maxTextBytes {
		cut := maxTextBytes
		// Step back over UTF-8 continuation bytes (10xxxxxx) so the cut lands
		// on the first byte of a rune. A valid rune has at most three
		// continuation bytes, which bounds the loop even for malformed input.
		for cut > maxTextBytes-3 && text[cut]&0xC0 == 0x80 {
			cut--
		}
		text = text[:cut]
	}

	prompt := fmt.Sprintf(`Analyze this document:

**Filename:** %s

**Content:**
%s

Categorize into ONE of: taxes, bills, medical, insurance, legal, financial, expenses, vehicles, home, personal, contacts, uncategorized

For the title: specific and concise (max 8 words). Ask yourself: "What would I search for to find this again?" Prioritize unique/recoverable details — account numbers, IDs, confirmation codes — over generic document type names. Examples:
- "FedEx Account 203634010" (if the key value is an account number)
- "Allegiant Confirmation IB2EJA AVL-PIE Feb 2026"
- "IRS Form 1099-INT Chase Bank 2024"
Bad: "FedEx Receipt Jan 2026" or "Flight Confirmation" — too generic.

Also produce a "full_text" field: reformat the raw content as clean Markdown. Use ## headers for sections, | tables | for tabular data, **bold** for field labels, and bullet lists where appropriate. Preserve all values exactly. Do not summarize — include everything.

Respond in JSON ONLY:
{"category": "...", "doc_type": "...", "date": "...", "vendor": "...", "amount": "...", "title": "...", "summary": "...", "full_text": "..."}`, filename, text)

	reqBody := map[string]interface{}{
		"model":      "accounts/fireworks/models/kimi-k2-instruct-0905",
		"max_tokens": 2048,
		"messages": []map[string]interface{}{
			{"role": "user", "content": prompt},
		},
	}

	analysis, err := callFireworks(reqBody)
	if err != nil {
		return nil, err
	}
	// If model didn't produce full_text, fall back to raw extracted text
	if analysis.FullText == "" {
		analysis.FullText = text
	}
	return analysis, nil
}

// callFireworks posts reqBody to the Fireworks chat-completions endpoint and
// decodes the model's reply into a DocumentAnalysis.
//
// The reply is cleaned up before parsing: reasoning_content is used when
// content holds no JSON, markdown code fences are stripped, and the text is
// narrowed to the outermost {...} span, with extractJSONObject as a last
// resort. A category outside the known set is replaced by "uncategorized".
//
// It returns an error when the request cannot be encoded, built or sent, when
// the status is not 200, when the response cannot be read, when it contains no
// choices, or when no parsable JSON object is found.
func callFireworks(reqBody map[string]interface{}) (*DocumentAnalysis, error) {
	jsonBody, err := json.Marshal(reqBody)
	if err != nil {
		return nil, fmt.Errorf("encoding request: %w", err)
	}

	req, err := http.NewRequest("POST", fireworksBaseURL+"/chat/completions", bytes.NewReader(jsonBody))
	if err != nil {
		return nil, fmt.Errorf("building request: %w", err)
	}
	req.Header.Set("Authorization", "Bearer "+fireworksAPIKey)
	req.Header.Set("Content-Type", "application/json")

	client := &http.Client{Timeout: 120 * time.Second}
	resp, err := client.Do(req)
	if err != nil {
		return nil, err
	}
	defer resp.Body.Close()

	if resp.StatusCode != 200 {
		// Best effort: the body only enriches the error message.
		body, _ := io.ReadAll(resp.Body)
		return nil, fmt.Errorf("API error %d: %s", resp.StatusCode, string(body))
	}

	respBody, err := io.ReadAll(resp.Body)
	if err != nil {
		return nil, fmt.Errorf("reading response: %w", err)
	}

	var result struct {
		Choices []struct {
			Message struct {
				Content          string `json:"content"`
				ReasoningContent string `json:"reasoning_content"`
			} `json:"message"`
		} `json:"choices"`
	}
	if err := json.Unmarshal(respBody, &result); err != nil {
		return nil, err
	}

	if len(result.Choices) == 0 {
		return nil, fmt.Errorf("no response from API")
	}

	content := result.Choices[0].Message.Content
	reasoning := result.Choices[0].Message.ReasoningContent

	// Reasoning mode: the actual JSON may be in content or reasoning_content.
	// Try content first; if it doesn't look like JSON, try reasoning_content.
	if !strings.Contains(content, "{") && reasoning != "" && strings.Contains(reasoning, "{") {
		log.Printf("  [AI] Using reasoning_content (content had no JSON)")
		content = reasoning
	}

	// Strip markdown code fences (```json ... ``` or ``` ... ```)
	content = strings.TrimSpace(content)
	if strings.HasPrefix(content, "```") {
		// Remove opening fence (```json or ```)
		if idx := strings.Index(content, "\n"); idx >= 0 {
			content = content[idx+1:]
		}
		// Remove closing fence
		if idx := strings.LastIndex(content, "```"); idx >= 0 {
			content = content[:idx]
		}
		content = strings.TrimSpace(content)
	}

	// Narrow to the span from the first "{" to the last "}".
	if idx := strings.Index(content, "{"); idx >= 0 {
		if end := strings.LastIndex(content, "}"); end > idx {
			content = content[idx : end+1]
		}
	}

	var analysis DocumentAnalysis
	if err := json.Unmarshal([]byte(content), &analysis); err != nil {
		// Last resort: try to find a JSON object with braces matching
		cleaned := extractJSONObject(content)
		if cleaned != "" {
			if err2 := json.Unmarshal([]byte(cleaned), &analysis); err2 != nil {
				log.Printf("  [AI debug] Failed to parse even after cleanup. Content starts: %.200s", content)
				return nil, fmt.Errorf("failed to parse response: %w", err)
			}
		} else {
			log.Printf("  [AI debug] No JSON object found in response. Content starts: %.200s", content)
			return nil, fmt.Errorf("failed to parse response: %w", err)
		}
	}

	// Validate category
	validCats := map[string]bool{"taxes": true, "bills": true, "medical": true, "insurance": true, "legal": true, "financial": true, "expenses": true, "vehicles": true, "home": true, "personal": true, "contacts": true, "uncategorized": true}
	if !validCats[analysis.Category] {
		analysis.Category = "uncategorized"
	}

	return &analysis, nil
}

// extractJSONObject returns the first brace-balanced {...} span in s, starting
// at the first "{". Braces inside double-quoted strings are ignored and
// backslash escapes inside strings are honoured. It returns "" when s has no
// "{" or the braces never balance.
func extractJSONObject(s string) string {
	open := strings.Index(s, "{")
	if open < 0 {
		return ""
	}

	var (
		nesting  int
		quoted   bool
		skipNext bool
	)
	for pos := open; pos < len(s); pos++ {
		ch := s[pos]
		switch {
		case skipNext:
			// Character following a backslash inside a string: take it literally.
			skipNext = false
		case ch == '\\' && quoted:
			skipNext = true
		case ch == '"':
			quoted = !quoted
		case quoted:
			// Inside a string literal braces carry no structure.
		case ch == '{':
			nesting++
		case ch == '}':
			nesting--
			if nesting == 0 {
				return s[open : pos+1]
			}
		}
	}
	return ""
}

// GenerateEmbedding requests a vector embedding for text from the Fireworks
// embeddings endpoint (model fireworks/qwen3-embedding-8b) and returns the
// first embedding of the response.
//
// text is cut to at most 32000 bytes; the cut is moved back to a UTF-8 rune
// boundary so the encoded request never carries half a character.
//
// It returns an error when no API key is configured, the request cannot be
// encoded, built or sent, the status is not 200, the response cannot be
// decoded, or it contains no embedding.
func GenerateEmbedding(text string) ([]float32, error) {
	if fireworksAPIKey == "" {
		return nil, fmt.Errorf("FIREWORKS_API_KEY not set")
	}

	// Truncate
	const maxInputBytes = 32000
	if len(text) > maxInputBytes {
		cut := maxInputBytes
		// Step back over UTF-8 continuation bytes (10xxxxxx) so the cut lands
		// on the first byte of a rune. A valid rune has at most three
		// continuation bytes, which bounds the loop even for malformed input.
		for cut > maxInputBytes-3 && text[cut]&0xC0 == 0x80 {
			cut--
		}
		text = text[:cut]
	}

	reqBody := map[string]interface{}{
		"model": "fireworks/qwen3-embedding-8b",
		"input": text,
	}
	jsonBody, err := json.Marshal(reqBody)
	if err != nil {
		return nil, fmt.Errorf("encoding request: %w", err)
	}

	req, err := http.NewRequest("POST", fireworksBaseURL+"/embeddings", bytes.NewReader(jsonBody))
	if err != nil {
		return nil, fmt.Errorf("building request: %w", err)
	}
	req.Header.Set("Authorization", "Bearer "+fireworksAPIKey)
	req.Header.Set("Content-Type", "application/json")

	client := &http.Client{Timeout: 30 * time.Second}
	resp, err := client.Do(req)
	if err != nil {
		return nil, err
	}
	defer resp.Body.Close()

	if resp.StatusCode != 200 {
		// Best effort: the body only enriches the error message.
		body, _ := io.ReadAll(resp.Body)
		return nil, fmt.Errorf("embedding API error %d: %s", resp.StatusCode, string(body))
	}

	var result struct {
		Data []struct {
			Embedding []float32 `json:"embedding"`
		} `json:"data"`
	}
	if err := json.NewDecoder(resp.Body).Decode(&result); err != nil {
		return nil, err
	}

	if len(result.Data) == 0 {
		return nil, fmt.Errorf("no embedding returned")
	}

	return result.Data[0].Embedding, nil
}

// GetPDFPageCount returns the number of pages reported by the "pdfinfo" tool
// for filePath. It returns 1 when pdfinfo fails, when its output has no
// "Pages:" line, or when that line does not hold a positive integer, so a
// caller looping over pages always attempts at least the first one.
func GetPDFPageCount(filePath string) int {
	out, err := exec.Command("pdfinfo", filePath).Output()
	if err != nil {
		return 1
	}
	for _, line := range strings.Split(string(out), "\n") {
		if !strings.HasPrefix(line, "Pages:") {
			continue
		}
		var count int
		// An unparsable value used to surface as 0 pages; treat it like any
		// other failure instead.
		if _, err := fmt.Sscanf(line, "Pages: %d", &count); err != nil || count < 1 {
			return 1
		}
		return count
	}
	return 1
}

// ProcessPDFPageByPage transcribes a PDF one page at a time. Each page is
// rasterised to PNG with "pdftoppm" at 150 DPI in its own temporary directory
// and handed to AnalyzePageOnly; non-empty transcriptions are concatenated,
// each preceded by a "## Page N" heading. Progress is reported through
// UpdateJob under jobID. Pages that fail at any step are skipped, and the
// returned error is always nil.
func ProcessPDFPageByPage(filePath string, jobID string) (string, error) {
	total := GetPDFPageCount(filePath)
	log.Printf("  Processing %d pages separately...", total)

	// rasterize renders a single page to PNG bytes. ok is false when the temp
	// directory, the conversion, or reading the output fails. The temp
	// directory is removed before it returns.
	rasterize := func(pageNo int) (png []byte, ok bool) {
		workDir, err := os.MkdirTemp("", "docsys-page")
		if err != nil {
			return nil, false
		}
		defer os.RemoveAll(workDir)

		pageArg := fmt.Sprintf("%d", pageNo)
		render := exec.Command("pdftoppm", "-png", "-f", pageArg, "-l", pageArg, "-r", "150", filePath, filepath.Join(workDir, "page"))
		if runErr := render.Run(); runErr != nil {
			return nil, false
		}

		// pdftoppm zero-pads the page number in the file name, so glob for it.
		found, _ := filepath.Glob(filepath.Join(workDir, "page-*.png"))
		if len(found) == 0 {
			return nil, false
		}
		png, err = os.ReadFile(found[0])
		return png, err == nil
	}

	var combined strings.Builder
	for pageNo := 1; pageNo <= total; pageNo++ {
		UpdateJob(jobID, "ocr", fmt.Sprintf("Page %d/%d", pageNo, total))

		png, ok := rasterize(pageNo)
		if !ok {
			continue
		}

		// OCR this page
		log.Printf("  Page %d/%d...", pageNo, total)
		pageText, err := AnalyzePageOnly(png, pageNo)
		if err != nil {
			log.Printf("  Page %d failed: %v", pageNo, err)
			continue
		}
		if pageText == "" {
			continue
		}

		combined.WriteString(fmt.Sprintf("\n\n---\n## Page %d\n\n", pageNo))
		combined.WriteString(pageText)
	}

	return combined.String(), nil
}

// AnalyzePageOnly asks the Fireworks vision model
// (qwen3-vl-30b-a3b-instruct) for a verbatim Markdown transcription of a
// single page image and returns the trimmed text.
//
// The image is embedded as a base64 data URL. Its media type is sniffed from
// the bytes because callers may pass JPEG or other image formats, not only
// PNG; content that is not recognised as an image is labelled image/png.
// pageNum is currently unused.
//
// When the reply's content is blank but reasoning_content is not, the latter
// is returned instead. It returns an error when no API key is configured, the
// request cannot be encoded, built or sent, the status is not 200, or the
// response cannot be read, decoded, or contains no choices.
func AnalyzePageOnly(imageData []byte, pageNum int) (string, error) {
	if fireworksAPIKey == "" {
		return "", fmt.Errorf("FIREWORKS_API_KEY not set")
	}

	mediaType := http.DetectContentType(imageData)
	if !strings.HasPrefix(mediaType, "image/") {
		mediaType = "image/png"
	}
	dataURL := "data:" + mediaType + ";base64," + base64.StdEncoding.EncodeToString(imageData)

	prompt := `Transcribe ALL visible text on this page EXACTLY as it appears, in the ORIGINAL language of the document. DO NOT translate. DO NOT summarize. Output ONLY the transcribed text — no commentary, no analysis, no preamble, no "The document is..." sentences. Start directly with the content.

FORMAT: Use ### for sections, **bold** for labels, markdown tables for tabular data, - bullets for lists. Preserve ALL numbers, dates, amounts, and values exactly as shown. If the document is in Russian, Dutch, German, French, or any other language — keep it in that language.`

	reqBody := map[string]interface{}{
		"model":      "accounts/fireworks/models/qwen3-vl-30b-a3b-instruct",
		"max_tokens": 4096,
		"messages": []map[string]interface{}{
			{
				"role": "user",
				"content": []map[string]interface{}{
					{"type": "image_url", "image_url": map[string]string{"url": dataURL}},
					{"type": "text", "text": prompt},
				},
			},
		},
	}

	jsonBody, err := json.Marshal(reqBody)
	if err != nil {
		return "", fmt.Errorf("encoding request: %w", err)
	}
	req, err := http.NewRequest("POST", fireworksBaseURL+"/chat/completions", bytes.NewReader(jsonBody))
	if err != nil {
		return "", fmt.Errorf("building request: %w", err)
	}
	req.Header.Set("Authorization", "Bearer "+fireworksAPIKey)
	req.Header.Set("Content-Type", "application/json")

	client := &http.Client{Timeout: 120 * time.Second}
	resp, err := client.Do(req)
	if err != nil {
		return "", err
	}
	defer resp.Body.Close()

	if resp.StatusCode != 200 {
		// Best effort: the body only enriches the error message.
		body, _ := io.ReadAll(resp.Body)
		return "", fmt.Errorf("API error %d: %s", resp.StatusCode, string(body))
	}

	// Read raw response to debug content vs reasoning_content
	rawBody, err := io.ReadAll(resp.Body)
	if err != nil {
		return "", err
	}

	var result struct {
		Choices []struct {
			Message struct {
				Content          string `json:"content"`
				ReasoningContent string `json:"reasoning_content"`
			} `json:"message"`
		} `json:"choices"`
	}
	if err := json.Unmarshal(rawBody, &result); err != nil {
		return "", err
	}

	if len(result.Choices) == 0 {
		return "", fmt.Errorf("no response")
	}

	content := result.Choices[0].Message.Content
	reasoning := result.Choices[0].Message.ReasoningContent

	if reasoning != "" {
		log.Printf("  [OCR debug] reasoning_content length: %d, content length: %d", len(reasoning), len(content))
		if len(content) > 100 {
			log.Printf("  [OCR debug] content starts: %.100s", content)
		}
	}

	// If content is empty but reasoning has text, model put everything in wrong field
	if strings.TrimSpace(content) == "" && reasoning != "" {
		log.Printf("  [OCR debug] WARNING: content empty, using reasoning_content")
		content = reasoning
	}

	return strings.TrimSpace(content), nil
}

// ProcessDocument handles the full document processing pipeline for one file:
// hash it, analyze it (text, office, or vision path depending on the
// lower-cased extension), copy it into storeDir, insert a Document record,
// generate an embedding, and finally remove the source file.
//
// The first 16 hex characters of the file's SHA-256 (see FileHash) serve as
// both the job ID for progress tracking and the document ID. If a document
// with that ID already exists with status "ready", the source file is removed
// and the existing record is returned without reprocessing.
//
// It returns an error when hashing, reading, analysis, the store copy, or the
// database insert fails. Embedding failures are only logged.
func ProcessDocument(filePath string) (*Document, error) {
	log.Printf("Processing: %s", filepath.Base(filePath))

	ext := strings.ToLower(filepath.Ext(filePath))

	// Get file hash
	hash, err := FileHash(filePath)
	if err != nil {
		return nil, fmt.Errorf("hash failed: %w", err)
	}
	log.Printf("  Hash: %s", hash)

	// Start progress tracking
	StartJob(hash, filepath.Base(filePath))
	defer FinishJob(hash)

	// Check if already fully processed (not pending)
	// NOTE(review): the lookup error is discarded, so a failed lookup is
	// treated the same as "not found" and the file is processed again.
	if existing, _ := GetDocument(hash); existing != nil && existing.Status == "ready" {
		log.Printf("  Already exists, skipping")
		os.Remove(filePath)
		return existing, nil
	}

	var analysis *DocumentAnalysis

	if IsTextFile(ext) {
		// Plain text — read and analyze
		data, err := os.ReadFile(filePath)
		if err != nil {
			return nil, err
		}
		UpdateJob(hash, "classifying", "Analyzing text...")
		log.Printf("  Analyzing text with K2...")
		analysis, err = AnalyzeText(string(data), filepath.Base(filePath))
		if err != nil {
			UpdateJob(hash, "error", err.Error())
			return nil, fmt.Errorf("text analysis failed: %w", err)
		}
	} else if IsOfficeFile(ext) {
		// Office formats — extract text natively from ZIP/XML, no LibreOffice needed
		UpdateJob(hash, "converting", "Extracting text...")
		log.Printf("  Extracting text from %s...", ext)
		text, err := ExtractOfficeText(filePath)
		if err != nil {
			UpdateJob(hash, "error", err.Error())
			return nil, fmt.Errorf("office text extraction failed: %w", err)
		}
		log.Printf("  Extracted %d chars, classifying...", len(text))
		UpdateJob(hash, "classifying", "Classifying...")
		analysis, err = AnalyzeText(text, filepath.Base(filePath))
		if err != nil {
			UpdateJob(hash, "error", err.Error())
			return nil, fmt.Errorf("text analysis failed: %w", err)
		}
	} else {
		// Vision — convert to image and analyze
		UpdateJob(hash, "converting", "Converting to image...")
		log.Printf("  Converting to image...")
		imageData, err := ConvertToImage(filePath)
		if err != nil {
			UpdateJob(hash, "error", err.Error())
			return nil, fmt.Errorf("image conversion failed: %w", err)
		}
		UpdateJob(hash, "ocr", "Analyzing first page...")
		log.Printf("  Analyzing with K2.5 vision...")
		analysis, err = AnalyzeWithVision(imageData)
		if err != nil {
			// Vision JSON extraction failed — fall back to two-step: plain-text OCR + text classifier
			log.Printf("  AnalyzeWithVision failed (%v), falling back to OCR+classify...", err)
			UpdateJob(hash, "ocr", "Falling back to OCR + classify...")
			pageText, ocrErr := AnalyzePageOnly(imageData, 1)
			if ocrErr != nil {
				UpdateJob(hash, "error", ocrErr.Error())
				return nil, fmt.Errorf("vision analysis failed (primary: %v, fallback: %w)", err, ocrErr)
			}
			// Classify the extracted text
			log.Printf("  OCR succeeded (%d chars), classifying...", len(pageText))
			UpdateJob(hash, "classifying", "Classifying extracted text...")
			analysis, err = AnalyzeText(pageText, filepath.Base(filePath))
			if err != nil {
				// Use minimal stub so at least the doc is stored with its text
				log.Printf("  Classification failed too: %v — storing with minimal metadata", err)
				analysis = &DocumentAnalysis{
					Category: "uncategorized",
					DocType:  "unknown",
					Title:    strings.TrimSuffix(filepath.Base(filePath), filepath.Ext(filePath)),
					Summary:  "Extraction failed — stored raw text only",
					FullText: pageText,
				}
			} else {
				// Keep the verbatim OCR text rather than the classifier's reformatted copy.
				analysis.FullText = pageText
			}
		}

		// For PDFs, process pages for accurate OCR
		if ext == ".pdf" {
			pageCount := GetPDFPageCount(filePath)
			if pageCount > 1 {
				log.Printf("  Multi-page PDF detected (%d pages)", pageCount)
				UpdateJob(hash, "ocr", fmt.Sprintf("Multi-page PDF: %d pages", pageCount))
				// The first-page analysis supplies the metadata; the full text is
				// replaced by the per-page transcription when that yields anything.
				fullText, err := ProcessPDFPageByPage(filePath, hash)
				if err == nil && fullText != "" {
					analysis.FullText = fullText
				}
			} else if analysis.FullText == "" || len(analysis.FullText) < 50 || strings.HasPrefix(analysis.FullText, "[") {
				// Single-page but full_text is empty/placeholder — retry with AnalyzePageOnly
				log.Printf("  Single-page PDF with bad full_text (%q) — retrying with AnalyzePageOnly...", analysis.FullText)
				UpdateJob(hash, "ocr", "Retrying text extraction...")
				if pageText, err := AnalyzePageOnly(imageData, 1); err == nil && pageText != "" {
					analysis.FullText = pageText
				} else if err != nil {
					log.Printf("  AnalyzePageOnly fallback failed: %v", err)
				}
			}
		}
	}

	log.Printf("  Category: %s, Type: %s", analysis.Category, analysis.DocType)

	// Copy to store (skip if already there — reprocessing from store-backed upload)
	storePath := filepath.Join(storeDir, hash+ext)
	if _, statErr := os.Stat(storePath); os.IsNotExist(statErr) {
		if err := copyFile(filePath, storePath); err != nil {
			return nil, fmt.Errorf("store copy failed: %w", err)
		}
	}

	// Only set PDFPath for actual PDFs — office/text files have no previewable PDF
	pdfStorePath := ""
	if ext == ".pdf" {
		pdfStorePath = storePath
	}

	// Create document record
	// Use title if provided, fall back to summary
	title := analysis.Title
	if title == "" {
		title = analysis.Summary
	}

	doc := &Document{
		ID:           hash,
		Title:        title,
		Category:     analysis.Category,
		Type:         analysis.DocType,
		Date:         analysis.Date,
		Amount:       analysis.AmountString(),
		Vendor:       analysis.Vendor,
		Summary:      analysis.Summary,
		FullText:     analysis.FullText,
		PDFPath:      pdfStorePath,
		OriginalFile: filepath.Base(filePath),
		ProcessedAt:  time.Now().Format(time.RFC3339),
		Status:       "ready",
	}

	// Save to database
	if err := InsertDocument(doc); err != nil {
		return nil, fmt.Errorf("db insert failed: %w", err)
	}

	// Generate embedding
	// Best effort: the record is already saved, so a failure here is only logged.
	if analysis.FullText != "" {
		UpdateJob(hash, "embedding", "Generating search index...")
		log.Printf("  Generating embedding...")
		if emb, err := GenerateEmbedding(analysis.FullText); err == nil {
			log.Printf("  Embedding: %d dimensions", len(emb))
			StoreEmbedding(hash, emb)
		} else {
			log.Printf("  Embedding failed: %v", err)
		}
	}

	// Remove from inbox
	// NOTE(review): the removal error is ignored — confirm that leaving the
	// source file behind on failure is acceptable.
	os.Remove(filePath)

	log.Printf("  ✓ Done: %s/%s", analysis.Category, hash)
	return doc, nil
}

// copyFile copies the contents of src to dst, creating dst or truncating it
// if it already exists. It returns the first error from opening, creating,
// copying, or closing the destination; the close error matters because
// buffered data may only fail to reach disk at that point.
func copyFile(src, dst string) error {
	in, err := os.Open(src)
	if err != nil {
		return err
	}
	defer in.Close()

	out, err := os.Create(dst)
	if err != nil {
		return err
	}

	if _, err := io.Copy(out, in); err != nil {
		out.Close() // the copy error is the one worth reporting
		return err
	}
	return out.Close()
}

// ExtractOfficeText extracts plain text from ZIP-based office documents
// without LibreOffice. The lower-cased file extension selects which archive
// members are read:
//
//   - .docx (also .doc/.rtf when they are really ZIP archives): word/document.xml, word/body.xml
//   - .xlsx/.xls: every xl/worksheets/sheet*.xml plus xl/sharedStrings.xml
//   - .pptx/.ppt: every ppt/slides/slide*.xml
//   - .odt/.ods/.odp: content.xml
//
// Selected members are converted with xmlToText in archive order and joined
// with newlines. Members that cannot be opened or read are skipped.
//
// It returns an error when the file is not a ZIP archive, the extension is
// unsupported, or no text could be extracted.
func ExtractOfficeText(filePath string) (string, error) {
	ext := strings.ToLower(filepath.Ext(filePath))

	r, err := zip.OpenReader(filePath)
	if err != nil {
		return "", fmt.Errorf("not a valid Office file: %w", err)
	}
	defer r.Close()

	// wanted is the set of archive member names to extract for this format.
	wanted := make(map[string]bool)
	addNumbered := func(prefix string) {
		for _, f := range r.File {
			if strings.HasPrefix(f.Name, prefix) && strings.HasSuffix(f.Name, ".xml") {
				wanted[f.Name] = true
			}
		}
	}

	switch ext {
	case ".docx", ".doc", ".rtf":
		wanted["word/document.xml"] = true
		wanted["word/body.xml"] = true
	case ".xlsx", ".xls":
		// All sheets
		addNumbered("xl/worksheets/sheet")
		// Shared strings (cell values are stored here for xlsx)
		wanted["xl/sharedStrings.xml"] = true
	case ".pptx", ".ppt":
		addNumbered("ppt/slides/slide")
	case ".odt", ".ods", ".odp":
		// OpenDocument keeps the document body in a single content.xml.
		wanted["content.xml"] = true
	default:
		return "", fmt.Errorf("unsupported office format: %s", ext)
	}

	var sb strings.Builder
	for _, f := range r.File {
		if !wanted[f.Name] {
			continue
		}

		// Best effort: an unreadable member is skipped so the remaining ones
		// can still contribute text.
		rc, err := f.Open()
		if err != nil {
			continue
		}
		data, err := io.ReadAll(rc)
		rc.Close()
		if err != nil {
			continue
		}

		sb.WriteString(xmlToText(data))
		sb.WriteString("\n")
	}

	text := strings.TrimSpace(sb.String())
	if text == "" {
		return "", fmt.Errorf("no text extracted from %s", filepath.Base(filePath))
	}
	return text, nil
}

// xmlToText converts Office XML into structured plain text using a simple
// tag scanner rather than a full XML parser.
//
// Only opening (or self-closing) tags influence layout, matched by their
// lower-cased local name with any namespace prefix removed:
//
//   - p, br, si, row start a new line (si and row keep spreadsheet entries apart)
//   - tr starts a new line for a table row
//   - tc starts a table cell; cells after the first in a row are preceded by a tab
//
// The first paragraph inside a cell does not start a new line, so the cells of
// one row stay on one line. All other tags are ignored. Text nodes that are
// only whitespace are dropped; the predefined XML entities plus &#xA; and
// &#x9; are decoded in a single pass. The result is trimmed of surrounding
// whitespace.
func xmlToText(data []byte) string {
	s := string(data)

	// First pass: split the input into tag and text tokens.
	type token struct {
		tag  string // lowercased local name, or "" for text
		text string
	}
	var (
		tokens  []token
		textBuf strings.Builder
		tagBuf  strings.Builder
		inTag   bool
	)
	for i := 0; i < len(s); i++ {
		c := s[i]
		switch {
		case c == '<':
			// A tag begins: emit whatever text came before it.
			if textBuf.Len() > 0 {
				tokens = append(tokens, token{text: textBuf.String()})
				textBuf.Reset()
			}
			inTag = true
			tagBuf.Reset()
		case c == '>' && inTag:
			inTag = false
			raw := strings.TrimSpace(tagBuf.String())
			tagBuf.Reset()
			if strings.HasPrefix(raw, "/") {
				// Closing tags carry no layout information.
				continue
			}
			// Extract local name (strip attributes, "/" of self-closing tags, namespace)
			name := raw
			if idx := strings.IndexAny(raw, " \t\n\r/"); idx >= 0 {
				name = raw[:idx]
			}
			if idx := strings.Index(name, ":"); idx >= 0 {
				name = name[idx+1:]
			}
			if name = strings.ToLower(name); name != "" {
				tokens = append(tokens, token{tag: name})
			}
		case inTag:
			tagBuf.WriteByte(c)
		default:
			// Includes a literal '>' in character data, which XML permits.
			textBuf.WriteByte(c)
		}
	}
	if textBuf.Len() > 0 {
		tokens = append(tokens, token{text: textBuf.String()})
	}

	// A Replacer scans once, so "&amp;lt;" decodes to "&lt;" rather than being
	// decoded twice into "<".
	entities := strings.NewReplacer(
		"&amp;", "&",
		"&lt;", "<",
		"&gt;", ">",
		"&quot;", "\"",
		"&apos;", "'",
		"&#xA;", "\n",
		"&#x9;", "\t",
	)

	// Second pass: render tokens. Line breaks are deferred until the next
	// piece of text so consecutive structural tags collapse into one newline.
	var out strings.Builder
	pendingNewlines := 0
	requestBreak := func() {
		if pendingNewlines < 1 {
			pendingNewlines = 1
		}
	}
	flushNewlines := func() {
		for ; pendingNewlines > 0; pendingNewlines-- {
			out.WriteByte('\n')
		}
	}

	rowHasCell := false // a cell has already started on the current table row
	cellFresh := false  // the current cell has not seen its first paragraph yet

	for _, tok := range tokens {
		switch tok.tag {
		case "": // text node
			txt := entities.Replace(tok.text)
			if strings.TrimSpace(txt) != "" {
				flushNewlines()
				out.WriteString(txt)
			}
		case "p": // paragraph
			if cellFresh {
				// Every cell wraps its content in a paragraph; breaking here
				// would put each cell on its own line.
				cellFresh = false
			} else {
				requestBreak()
			}
		case "br", "si", "row": // line break, shared-string item, spreadsheet row
			requestBreak()
		case "tr": // table row start
			rowHasCell = false
			cellFresh = false
			requestBreak()
		case "tc": // table cell
			if rowHasCell {
				// Emit the row's pending newline before the separator so the
				// tab never ends up on the previous line.
				flushNewlines()
				out.WriteString("\t")
			}
			rowHasCell = true
			cellFresh = true
		}
	}

	return strings.TrimSpace(out.String())
}

// IsOfficeFile reports whether ext (including the leading dot, compared
// exactly as given) is one of the ZIP/XML office formats handled by native
// text extraction: OOXML (.docx, .xlsx, .pptx) or OpenDocument (.odt, .ods,
// .odp).
func IsOfficeFile(ext string) bool {
	switch ext {
	case ".docx", ".xlsx", ".pptx",
		".odt", ".ods", ".odp":
		return true
	default:
		return false
	}
}