// docman/internal/processor/processor.go
package processor
import (
"bytes"
"context"
"crypto/sha256"
"encoding/hex"
"encoding/json"
"fmt"
"log"
"os"
"os/exec"
"path/filepath"
"regexp"
"strings"
"time"
"docman/internal/db"
"github.com/fsnotify/fsnotify"
"github.com/google/uuid"
openai "github.com/sashabaranov/go-openai"
)
// Processor watches an inbox directory, OCRs and classifies incoming
// documents with an AI model, archives the originals, and writes markdown
// records plus database rows for each document.
type Processor struct {
	db         *db.DB         // document metadata store
	inboxDir   string         // directory scanned/watched for new files
	storeDir   string         // where originals are archived under checksum-derived names
	recordsDir string         // root of the per-category markdown records tree
	aiClient   *openai.Client // OpenAI-compatible client (pointed at the configured endpoint)
	aiModel    string         // chat model used for classification
	embedModel string         // embedding model; empty string disables embeddings
}
// Config carries the directory layout and AI endpoint settings used by New
// to construct a Processor.
type Config struct {
	InboxDir   string // incoming scans to process
	StoreDir   string // archived original files
	RecordsDir string // generated markdown records
	AIEndpoint string // Fireworks API endpoint
	AIKey      string // API key for the AI endpoint
	AIModel    string // e.g., "accounts/fireworks/models/qwen2-vl-72b-instruct"
	EmbedModel string // embedding model name; empty disables embedding generation
}
// Classification is the structured result returned by the AI model for a
// document; the field names mirror the JSON schema spelled out in the
// classify prompt.
type Classification struct {
	Category      string            `json:"category"`
	Subcategory   string            `json:"subcategory,omitempty"`
	Title         string            `json:"title"`
	Date          string            `json:"date,omitempty"` // raw date string; parsed later by parseDate
	Vendor        string            `json:"vendor,omitempty"`
	Amount        *float64          `json:"amount,omitempty"` // pointer so "absent" is distinct from 0
	Currency      string            `json:"currency,omitempty"`
	TaxDeductible bool              `json:"tax_deductible"`
	Summary       string            `json:"summary"`
	KeyFields     map[string]string `json:"key_fields,omitempty"` // free-form extra extracted fields
}
// New builds a Processor wired to the given database and to an
// OpenAI-compatible API at cfg.AIEndpoint.
func New(cfg Config, database *db.DB) *Processor {
	clientCfg := openai.DefaultConfig(cfg.AIKey)
	clientCfg.BaseURL = cfg.AIEndpoint
	proc := &Processor{
		db:         database,
		inboxDir:   cfg.InboxDir,
		storeDir:   cfg.StoreDir,
		recordsDir: cfg.RecordsDir,
		aiClient:   openai.NewClientWithConfig(clientCfg),
		aiModel:    cfg.AIModel,
		embedModel: cfg.EmbedModel,
	}
	return proc
}
// Watch ensures the inbox/store/records directories exist, processes any
// files already sitting in the inbox, then blocks watching the inbox for
// newly created files until ctx is cancelled. Per-file processing errors
// are logged, not fatal.
func (p *Processor) Watch(ctx context.Context) error {
	// Ensure directories exist
	for _, dir := range []string{p.inboxDir, p.storeDir, p.recordsDir} {
		if err := os.MkdirAll(dir, 0755); err != nil {
			return fmt.Errorf("create directory %s: %w", dir, err)
		}
	}
	// Process existing files first. A listing failure is logged rather than
	// fatal so the watcher can still start (previously it was silently
	// discarded).
	entries, err := os.ReadDir(p.inboxDir)
	if err != nil {
		log.Printf("Failed to list inbox %s: %v", p.inboxDir, err)
	}
	for _, entry := range entries {
		if entry.IsDir() || strings.HasPrefix(entry.Name(), ".") {
			continue
		}
		path := filepath.Join(p.inboxDir, entry.Name())
		if err := p.ProcessFile(ctx, path); err != nil {
			log.Printf("Error processing %s: %v", path, err)
		}
	}
	// Watch for new files
	watcher, err := fsnotify.NewWatcher()
	if err != nil {
		return err
	}
	defer watcher.Close()
	if err := watcher.Add(p.inboxDir); err != nil {
		return err
	}
	log.Printf("Watching inbox: %s", p.inboxDir)
	for {
		select {
		case <-ctx.Done():
			return ctx.Err()
		case event, ok := <-watcher.Events:
			if !ok {
				return nil
			}
			if event.Op&fsnotify.Create == fsnotify.Create {
				// Give the writer a moment to finish the file, but stay
				// responsive to cancellation while waiting (the original
				// time.Sleep ignored ctx).
				select {
				case <-ctx.Done():
					return ctx.Err()
				case <-time.After(500 * time.Millisecond):
				}
				if err := p.ProcessFile(ctx, event.Name); err != nil {
					log.Printf("Error processing %s: %v", event.Name, err)
				}
			}
		case err, ok := <-watcher.Errors:
			if !ok {
				return nil
			}
			log.Printf("Watcher error: %v", err)
		}
	}
}
// ProcessFile ingests one file from the inbox: it OCRs the content, asks
// the AI to classify it, archives the original under a checksum-derived
// name, writes a markdown record, inserts a database row, and finally
// removes the file from the inbox. OCR, classification, markdown and
// embedding failures are logged and degraded gracefully; only storage
// write and database insert errors abort processing.
func (p *Processor) ProcessFile(ctx context.Context, path string) error {
	// Skip hidden files and non-PDFs/images
	base := filepath.Base(path)
	if strings.HasPrefix(base, ".") {
		return nil
	}
	ext := strings.ToLower(filepath.Ext(path))
	if ext != ".pdf" && ext != ".jpg" && ext != ".jpeg" && ext != ".png" {
		log.Printf("Skipping non-document file: %s", path)
		return nil
	}
	log.Printf("Processing: %s", path)
	// Read file
	data, err := os.ReadFile(path)
	if err != nil {
		return err
	}
	// Compute checksum (also used below to derive the archived filename,
	// so byte-identical uploads map to the same storage file)
	hash := sha256.Sum256(data)
	checksum := hex.EncodeToString(hash[:])
	// Generate ID
	id := uuid.New().String()
	// Extract text via OCR; a failure downgrades to empty text so the
	// document is still archived and recorded.
	ocrText, pageCount, err := p.extractText(path, ext)
	if err != nil {
		log.Printf("OCR failed for %s: %v", path, err)
		ocrText = ""
	}
	// Classify with AI; on failure fall back to an "uncategorized" record
	// titled with the original filename.
	classification, err := p.classify(ctx, ocrText, base)
	if err != nil {
		log.Printf("Classification failed for %s: %v", path, err)
		classification = &Classification{
			Category: "uncategorized",
			Title:    base,
		}
	}
	// Store PDF (or image) under the first 16 hex chars of its checksum
	storageName := fmt.Sprintf("%s%s", checksum[:16], ext)
	storagePath := filepath.Join(p.storeDir, storageName)
	if err := os.WriteFile(storagePath, data, 0644); err != nil {
		return err
	}
	// Parse the date string the classifier extracted, if any
	var docDate *time.Time
	if classification.Date != "" {
		if t, err := parseDate(classification.Date); err == nil {
			docDate = &t
		}
	}
	// Create document record
	doc := &db.Document{
		ID:            id,
		Filename:      storageName,
		OriginalName:  base,
		Category:      classification.Category,
		Subcategory:   classification.Subcategory,
		Title:         classification.Title,
		Date:          docDate,
		Vendor:        classification.Vendor,
		Amount:        classification.Amount,
		Currency:      classification.Currency,
		TaxDeductible: classification.TaxDeductible,
		OCRText:       ocrText,
		StoragePath:   storagePath,
		PageCount:     pageCount,
		FileSize:      int64(len(data)),
		MimeType:      getMimeType(ext),
		Checksum:      checksum,
		ProcessedAt:   time.Now(),
		CreatedAt:     time.Now(),
		UpdatedAt:     time.Now(),
	}
	// Generate markdown record (best effort)
	mdPath, err := p.writeMarkdown(doc, classification)
	if err != nil {
		log.Printf("Failed to write markdown: %v", err)
	} else {
		doc.MarkdownPath = mdPath
	}
	// Generate embedding (best effort; errors leave Embedding nil)
	if embedding, err := p.generateEmbedding(ctx, ocrText); err == nil {
		doc.Embedding = embedding
	}
	// Store metadata as JSON (best effort)
	if meta, err := json.Marshal(classification.KeyFields); err == nil {
		doc.Metadata = meta
	}
	// Insert into database
	if err := p.db.InsertDocument(doc); err != nil {
		return err
	}
	// Remove from inbox; a failure here is logged only — the document is
	// already archived and recorded.
	if err := os.Remove(path); err != nil {
		log.Printf("Failed to remove inbox file: %v", err)
	}
	log.Printf("Processed: %s -> %s (%s)", base, classification.Title, classification.Category)
	return nil
}
// extractText dispatches text extraction on file extension: PDFs go
// through the pdftotext/OCR pipeline, everything else is treated as a
// single image. Returns (text, pageCount, error).
func (p *Processor) extractText(path, ext string) (string, int, error) {
	switch ext {
	case ".pdf":
		return p.extractPDFText(path)
	default:
		return p.extractImageText(path)
	}
}
// extractPDFText extracts text from a PDF. It first tries pdftotext
// (poppler-utils); if that yields little or no text — likely a scanned,
// image-only PDF — it rasterizes the pages with pdftoppm and OCRs each
// one with tesseract. Returns (text, pageCount, error).
func (p *Processor) extractPDFText(path string) (string, int, error) {
	// Try pdftotext first (poppler-utils)
	cmd := exec.Command("pdftotext", "-layout", path, "-")
	output, err := cmd.Output()
	// >100 bytes of output is the heuristic for "this PDF has a real text layer"
	if err == nil && len(output) > 100 {
		// Count pages via pdfinfo; default to 1 if the field is missing
		pageCmd := exec.Command("pdfinfo", path)
		pageOut, _ := pageCmd.Output()
		pages := 1
		if match := regexp.MustCompile(`Pages:\s+(\d+)`).FindSubmatch(pageOut); len(match) > 1 {
			fmt.Sscanf(string(match[1]), "%d", &pages)
		}
		return string(output), pages, nil
	}
	// Fallback to OCR via tesseract
	// Convert PDF to images first
	tmpDir, err := os.MkdirTemp("", "docman-ocr-")
	if err != nil {
		return "", 0, err
	}
	defer os.RemoveAll(tmpDir)
	// Use pdftoppm to convert to images (300 DPI for OCR quality)
	cmd = exec.Command("pdftoppm", "-png", "-r", "300", path, filepath.Join(tmpDir, "page"))
	if err := cmd.Run(); err != nil {
		return "", 0, fmt.Errorf("pdftoppm failed: %w", err)
	}
	// OCR each page; per-page OCR errors are deliberately ignored (best effort)
	var textBuf bytes.Buffer
	pages, _ := filepath.Glob(filepath.Join(tmpDir, "page-*.png"))
	for _, pagePath := range pages {
		text, _, _ := p.extractImageText(pagePath)
		textBuf.WriteString(text)
		textBuf.WriteString("\n\n--- Page Break ---\n\n")
	}
	return textBuf.String(), len(pages), nil
}
// extractImageText runs tesseract (English + Dutch language packs) on a
// single image file and returns the recognized text. Page count is always
// 1 for images.
func (p *Processor) extractImageText(path string) (string, int, error) {
	out, err := exec.Command("tesseract", path, "stdout", "-l", "eng+nld").Output()
	if err != nil {
		return "", 1, err
	}
	return string(out), 1, nil
}
// classify sends the OCR text (truncated to 4000 bytes) plus the original
// filename to the chat model and parses the JSON classification out of the
// reply. Temperature is kept low for deterministic extraction.
func (p *Processor) classify(ctx context.Context, text, filename string) (*Classification, error) {
	prompt := fmt.Sprintf(`Analyze this scanned document and extract structured information.
Document filename: %s
OCR Text:
%s
Classify and extract the following JSON structure:
{
"category": "taxes|expenses|bills|medical|contacts|legal|insurance|banking|receipts|correspondence|uncategorized",
"subcategory": "more specific category if applicable",
"title": "descriptive title for this document",
"date": "YYYY-MM-DD if found",
"vendor": "company/person name if applicable",
"amount": numeric amount if this is a financial document,
"currency": "USD" or other currency code,
"tax_deductible": true/false if this is a deductible expense,
"summary": "one paragraph summary of the document",
"key_fields": {"field_name": "value"} for any other important extracted data
}
Categories:
- taxes: W-2, 1099, tax returns, deductions
- expenses: receipts, invoices for purchases
- bills: utility bills, service bills
- medical: medical records, prescriptions, EOBs
- contacts: business cards, contact info
- legal: contracts, agreements, legal documents
- insurance: policies, claims
- banking: statements, checks
- receipts: general purchase receipts
- correspondence: letters, emails
Return ONLY valid JSON.`, filename, truncate(text, 4000))
	resp, err := p.aiClient.CreateChatCompletion(ctx, openai.ChatCompletionRequest{
		Model: p.aiModel,
		Messages: []openai.ChatCompletionMessage{
			{Role: openai.ChatMessageRoleUser, Content: prompt},
		},
		Temperature: 0.1,
	})
	if err != nil {
		return nil, err
	}
	if len(resp.Choices) == 0 {
		return nil, fmt.Errorf("no response from AI")
	}
	content := resp.Choices[0].Message.Content
	// Models often wrap JSON in prose or code fences; pull out the
	// outermost {...} span before unmarshalling.
	content = extractJSON(content)
	var classification Classification
	if err := json.Unmarshal([]byte(content), &classification); err != nil {
		return nil, fmt.Errorf("parse classification: %w", err)
	}
	return &classification, nil
}
// generateEmbedding asks the configured embedding model for a vector
// representation of text (truncated to 8000 bytes) and returns it
// serialized as JSON bytes. Returns (nil, nil) when no embedding model is
// configured.
func (p *Processor) generateEmbedding(ctx context.Context, text string) ([]byte, error) {
	if p.embedModel == "" {
		return nil, nil
	}
	req := openai.EmbeddingRequest{
		Model: openai.EmbeddingModel(p.embedModel),
		Input: []string{truncate(text, 8000)},
	}
	resp, err := p.aiClient.CreateEmbeddings(ctx, req)
	if err != nil {
		return nil, err
	}
	if len(resp.Data) == 0 {
		return nil, fmt.Errorf("no embedding returned")
	}
	// Serialize the embedding vector to JSON bytes for storage
	return json.Marshal(resp.Data[0].Embedding)
}
// writeMarkdown renders a human-readable markdown record for doc under
// recordsDir/<category>/ and returns the path written. The filename is
// "<date>_<sanitized-title>.md"; if that path already exists (another
// document with the same date and title), a short document-ID suffix is
// appended instead of silently overwriting the earlier record. Key fields
// are emitted in sorted key order so regenerated records are byte-stable
// (Go map iteration order is randomized).
func (p *Processor) writeMarkdown(doc *db.Document, class *Classification) (string, error) {
	// Create category subdirectory
	catDir := filepath.Join(p.recordsDir, doc.Category)
	if err := os.MkdirAll(catDir, 0755); err != nil {
		return "", err
	}
	// Generate filename
	dateStr := "undated"
	if doc.Date != nil {
		dateStr = doc.Date.Format("2006-01-02")
	}
	safeName := sanitizeFilename(doc.Title)
	mdName := fmt.Sprintf("%s_%s.md", dateStr, safeName)
	mdPath := filepath.Join(catDir, mdName)
	// Collision guard: don't clobber an existing record for a different
	// document that shares the same date and title.
	if _, err := os.Stat(mdPath); err == nil {
		suffix := doc.ID
		if len(suffix) > 8 {
			suffix = suffix[:8]
		}
		mdName = fmt.Sprintf("%s_%s_%s.md", dateStr, safeName, suffix)
		mdPath = filepath.Join(catDir, mdName)
	}
	// Build markdown content
	var buf bytes.Buffer
	buf.WriteString(fmt.Sprintf("# %s\n\n", doc.Title))
	buf.WriteString("## Metadata\n\n")
	buf.WriteString(fmt.Sprintf("- **ID:** %s\n", doc.ID))
	buf.WriteString(fmt.Sprintf("- **Category:** %s", doc.Category))
	if doc.Subcategory != "" {
		buf.WriteString(fmt.Sprintf(" / %s", doc.Subcategory))
	}
	buf.WriteString("\n")
	if doc.Date != nil {
		buf.WriteString(fmt.Sprintf("- **Date:** %s\n", doc.Date.Format("2006-01-02")))
	}
	if doc.Vendor != "" {
		buf.WriteString(fmt.Sprintf("- **Vendor:** %s\n", doc.Vendor))
	}
	if doc.Amount != nil {
		buf.WriteString(fmt.Sprintf("- **Amount:** %s %.2f\n", doc.Currency, *doc.Amount))
	}
	if doc.TaxDeductible {
		buf.WriteString("- **Tax Deductible:** Yes\n")
	}
	buf.WriteString(fmt.Sprintf("- **Original File:** %s\n", doc.OriginalName))
	buf.WriteString(fmt.Sprintf("- **PDF:** [View](%s)\n", doc.StoragePath))
	buf.WriteString(fmt.Sprintf("- **Processed:** %s\n", doc.ProcessedAt.Format(time.RFC3339)))
	if class.Summary != "" {
		buf.WriteString("\n## Summary\n\n")
		buf.WriteString(class.Summary)
		buf.WriteString("\n")
	}
	if len(class.KeyFields) > 0 {
		buf.WriteString("\n## Key Fields\n\n")
		// Sort keys for deterministic output (small n; simple in-place sort
		// matches the file's existing style and avoids a new import).
		keys := make([]string, 0, len(class.KeyFields))
		for k := range class.KeyFields {
			keys = append(keys, k)
		}
		for i := 0; i < len(keys)-1; i++ {
			for j := i + 1; j < len(keys); j++ {
				if keys[j] < keys[i] {
					keys[i], keys[j] = keys[j], keys[i]
				}
			}
		}
		for _, k := range keys {
			buf.WriteString(fmt.Sprintf("- **%s:** %s\n", k, class.KeyFields[k]))
		}
	}
	buf.WriteString("\n## Full Text (OCR)\n\n```\n")
	buf.WriteString(doc.OCRText)
	buf.WriteString("\n```\n")
	if err := os.WriteFile(mdPath, buf.Bytes(), 0644); err != nil {
		return "", err
	}
	return mdPath, nil
}
// Helper functions
// truncate limits s to at most max bytes. Unlike a plain s[:max], it backs
// up to the nearest UTF-8 rune boundary so the result is never left with a
// split multibyte sequence (the truncated text is sent to AI APIs, which
// may reject or mangle invalid UTF-8).
func truncate(s string, max int) string {
	if len(s) <= max {
		return s
	}
	// 0b10xxxxxx bytes are UTF-8 continuation bytes; step back past them.
	for max > 0 && s[max]&0xC0 == 0x80 {
		max--
	}
	return s[:max]
}
// extractJSON trims everything outside the outermost {...} pair, if one
// exists; otherwise the input is returned unchanged. Used to strip prose
// and code fences that models wrap around their JSON replies.
func extractJSON(s string) string {
	open := strings.Index(s, "{")
	if open >= 0 {
		if last := strings.LastIndex(s, "}"); last > open {
			return s[open : last+1]
		}
	}
	return s
}
// parseDate tries a list of common date layouts and returns the first
// successful parse. The original layouts (ISO and US forms) are tried
// first for backward compatibility; European day-first layouts were added
// because OCR runs with eng+nld and Dutch documents use day-first dates.
func parseDate(s string) (time.Time, error) {
	formats := []string{
		"2006-01-02",
		"01/02/2006",
		"1/2/2006",
		"January 2, 2006",
		"Jan 2, 2006",
		"2006/01/02",
		"02-01-2006", // day-first, common in Dutch documents
		"2-1-2006",
		"02.01.2006",
		"2 January 2006",
	}
	for _, f := range formats {
		if t, err := time.Parse(f, s); err == nil {
			return t, nil
		}
	}
	return time.Time{}, fmt.Errorf("cannot parse date: %s", s)
}
// mimeTypes maps the supported (lowercase, dotted) extensions to their
// MIME types.
var mimeTypes = map[string]string{
	".pdf":  "application/pdf",
	".jpg":  "image/jpeg",
	".jpeg": "image/jpeg",
	".png":  "image/png",
}

// getMimeType resolves a file extension to its MIME type, falling back to
// application/octet-stream for anything unrecognized.
func getMimeType(ext string) string {
	if mt, ok := mimeTypes[ext]; ok {
		return mt
	}
	return "application/octet-stream"
}
// nonAlnumRE collapses runs of characters outside [a-z0-9] into "-".
// Compiled once at package scope instead of on every call.
var nonAlnumRE = regexp.MustCompile(`[^a-z0-9]+`)

// sanitizeFilename converts an arbitrary title into a safe, lowercase,
// hyphen-separated filename stem of at most 50 bytes. Truncation no longer
// leaves a trailing "-", and a title with no usable characters yields
// "untitled" instead of an empty stem (which produced names like
// "2024-01-01_.md").
func sanitizeFilename(s string) string {
	s = strings.ToLower(s)
	s = nonAlnumRE.ReplaceAllString(s, "-")
	s = strings.Trim(s, "-")
	if len(s) > 50 {
		s = strings.TrimRight(s[:50], "-")
	}
	if s == "" {
		return "untitled"
	}
	return s
}
// ProcessSingle processes a single file and returns the document (for API uploads)
// It mirrors ProcessFile but works from an in-memory payload: the bytes are
// spooled to a temp file so the external OCR tools can read them, and there
// is no inbox file to delete. OCR/classification/markdown/embedding
// failures degrade gracefully; storage and database errors are returned.
func (p *Processor) ProcessSingle(ctx context.Context, data []byte, filename string) (*db.Document, error) {
	// Write to temp file for processing
	ext := strings.ToLower(filepath.Ext(filename))
	tmpFile, err := os.CreateTemp("", "docman-upload-*"+ext)
	if err != nil {
		return nil, err
	}
	defer os.Remove(tmpFile.Name())
	if _, err := tmpFile.Write(data); err != nil {
		return nil, err
	}
	tmpFile.Close()
	// Use existing process logic but don't delete the temp file
	// Compute checksum
	hash := sha256.Sum256(data)
	checksum := hex.EncodeToString(hash[:])
	id := uuid.New().String()
	// OCR failure downgrades to empty text rather than aborting the upload
	ocrText, pageCount, err := p.extractText(tmpFile.Name(), ext)
	if err != nil {
		ocrText = ""
	}
	// Classification failure falls back to an uncategorized record
	classification, err := p.classify(ctx, ocrText, filename)
	if err != nil {
		classification = &Classification{
			Category: "uncategorized",
			Title:    filename,
		}
	}
	// Archive the original under a checksum-derived name
	storageName := fmt.Sprintf("%s%s", checksum[:16], ext)
	storagePath := filepath.Join(p.storeDir, storageName)
	if err := os.WriteFile(storagePath, data, 0644); err != nil {
		return nil, err
	}
	var docDate *time.Time
	if classification.Date != "" {
		if t, err := parseDate(classification.Date); err == nil {
			docDate = &t
		}
	}
	doc := &db.Document{
		ID:            id,
		Filename:      storageName,
		OriginalName:  filename,
		Category:      classification.Category,
		Subcategory:   classification.Subcategory,
		Title:         classification.Title,
		Date:          docDate,
		Vendor:        classification.Vendor,
		Amount:        classification.Amount,
		Currency:      classification.Currency,
		TaxDeductible: classification.TaxDeductible,
		OCRText:       ocrText,
		StoragePath:   storagePath,
		PageCount:     pageCount,
		FileSize:      int64(len(data)),
		MimeType:      getMimeType(ext),
		Checksum:      checksum,
		ProcessedAt:   time.Now(),
		CreatedAt:     time.Now(),
		UpdatedAt:     time.Now(),
	}
	// Markdown, embedding and metadata are best-effort; errors ignored here
	mdPath, _ := p.writeMarkdown(doc, classification)
	doc.MarkdownPath = mdPath
	if embedding, err := p.generateEmbedding(ctx, ocrText); err == nil {
		doc.Embedding = embedding
	}
	if meta, err := json.Marshal(classification.KeyFields); err == nil {
		doc.Metadata = meta
	}
	if err := p.db.InsertDocument(doc); err != nil {
		return nil, err
	}
	return doc, nil
}
// SearchMarkdown searches markdown files directly (fallback when embeddings unavailable)
// It walks recordsDir, scores each .md file by the fraction of query terms
// it contains, and returns up to limit results sorted by score descending.
// An empty/whitespace query returns no results without walking the tree.
func SearchMarkdown(recordsDir, query string, limit int) ([]*db.SearchResult, error) {
	var results []*db.SearchResult
	terms := strings.Fields(strings.ToLower(query))
	if len(terms) == 0 {
		// Nothing to match; skip the filesystem walk entirely.
		return results, nil
	}
	err := filepath.Walk(recordsDir, func(path string, info os.FileInfo, err error) error {
		if err != nil || info.IsDir() || !strings.HasSuffix(path, ".md") {
			return nil
		}
		data, err := os.ReadFile(path)
		if err != nil {
			// Unreadable file: skip rather than abort the whole search.
			return nil
		}
		text := string(data)
		content := strings.ToLower(text)
		score := 0.0
		firstHit := ""
		for _, term := range terms {
			if strings.Contains(content, term) {
				score += 1.0
				if firstHit == "" {
					firstHit = term
				}
			}
		}
		if score > 0 {
			// Extract title from first line
			lines := strings.SplitN(text, "\n", 2)
			title := strings.TrimPrefix(lines[0], "# ")
			// Snippet around the first term that actually matched (the
			// original always used terms[0], even when it was absent).
			snippet := findSnippet(text, firstHit, 100)
			results = append(results, &db.SearchResult{
				Document: db.Document{
					Title:        title,
					MarkdownPath: path,
				},
				Score:   score / float64(len(terms)),
				Snippet: snippet,
			})
		}
		return nil
	})
	if err != nil {
		return nil, err
	}
	// Sort by score descending (result sets are small; simple in-place sort)
	for i := 0; i < len(results)-1; i++ {
		for j := i + 1; j < len(results); j++ {
			if results[j].Score > results[i].Score {
				results[i], results[j] = results[j], results[i]
			}
		}
	}
	if len(results) > limit {
		results = results[:limit]
	}
	return results, nil
}
// findSnippet returns a window of text around the first case-insensitive
// occurrence of term, padded by radius bytes on each side and fenced with
// "..." where the window is clipped. When term is absent, the document's
// opening 2*radius bytes are returned instead.
func findSnippet(text, term string, radius int) string {
	idx := strings.Index(strings.ToLower(text), strings.ToLower(term))
	if idx < 0 {
		// No match: fall back to the start of the document.
		if len(text) > 2*radius {
			return text[:2*radius] + "..."
		}
		return text
	}
	lo, hi := idx-radius, idx+len(term)+radius
	if lo < 0 {
		lo = 0
	}
	if hi > len(text) {
		hi = len(text)
	}
	out := text[lo:hi]
	if lo > 0 {
		out = "..." + out
	}
	if hi < len(text) {
		out += "..."
	}
	return out
}
// GetRecordsDir returns the root directory of the markdown records tree.
func (p *Processor) GetRecordsDir() string {
	return p.recordsDir
}
// GetStoreDir returns the directory where original files are archived.
func (p *Processor) GetStoreDir() string {
	return p.storeDir
}