dealroom/internal/extract/pdf.go

135 lines
3.1 KiB
Go

package extract
import (
"bytes"
"encoding/base64"
"fmt"
"os"
"os/exec"
"path/filepath"
"sort"
"strings"
"github.com/xuri/excelize/v2"
)
// FileToImages returns the base64-encoded image(s) that represent the file at
// path, chosen by its lower-cased extension:
//
//   - .jpg/.jpeg/.png/.gif/.bmp/.webp: the file's own bytes, base64-encoded
//     as a single element (no conversion is performed).
//   - .xlsx/.xls: nil with a nil error; the caller is expected to use
//     XLSXToText instead.
//   - .pdf: one image per page, rasterised by pdfToImages.
//   - anything else: rasterisation is attempted anyway, and an
//     "unsupported file type" error is returned if it yields nothing.
func FileToImages(path string) ([]string, error) {
	kind := strings.ToLower(filepath.Ext(path))

	if kind == ".xlsx" || kind == ".xls" {
		// Spreadsheets are extracted as text, not as images.
		return nil, nil
	}

	switch kind {
	case ".jpg", ".jpeg", ".png", ".gif", ".bmp", ".webp":
		return imageToBase64(path)
	case ".pdf":
		return pdfToImages(path)
	}

	// Unknown extension: let the rasteriser have a go; its own error is
	// deliberately replaced by the generic "unsupported" one.
	pages, convErr := pdfToImages(path)
	if convErr == nil && len(pages) > 0 {
		return pages, nil
	}
	return nil, fmt.Errorf("unsupported file type %s", kind)
}
// XLSXToText renders every non-empty sheet of the workbook at path as a
// markdown section: a "## <sheet name>" heading followed by a table whose
// header is the sheet's first row.
//
// Rows returned by GetRows can differ in length, so every row, including the
// header, is padded to the widest row of the sheet to keep the table
// rectangular. Pipes and line breaks inside a cell are escaped or flattened
// because they would otherwise split the cell or end the table row.
//
// Sheets whose rows cannot be read, or that contain no cells at all, are
// skipped. An error is returned only when the workbook cannot be opened.
func XLSXToText(path string) (string, error) {
	f, err := excelize.OpenFile(path)
	if err != nil {
		return "", fmt.Errorf("open xlsx: %w", err)
	}
	// The workbook is only read, so a close failure is not actionable here.
	defer f.Close()

	// "\r\n" is listed before "\n" and "\r" so a CRLF becomes a single space.
	cellEscaper := strings.NewReplacer("|", `\|`, "\r\n", " ", "\n", " ", "\r", " ")

	var buf bytes.Buffer
	for _, sheetName := range f.GetSheetList() {
		rows, err := f.GetRows(sheetName)
		if err != nil {
			// Best effort: one unreadable sheet must not hide the others.
			continue
		}

		// The table is as wide as the widest row, not merely the first one.
		width := 0
		for _, row := range rows {
			if len(row) > width {
				width = len(row)
			}
		}
		if width == 0 {
			continue
		}

		fmt.Fprintf(&buf, "## %s\n\n", sheetName)

		// cells is reused for every row; each slot is overwritten per row.
		cells := make([]string, width)
		for i, row := range rows {
			for j := range cells {
				if j < len(row) {
					cells[j] = cellEscaper.Replace(row[j])
				} else {
					cells[j] = ""
				}
			}
			buf.WriteString("| " + strings.Join(cells, " | ") + " |\n")
			if i == 0 {
				// Separator line that turns the first row into the header.
				buf.WriteString("|" + strings.Repeat(" --- |", width) + "\n")
			}
		}
		buf.WriteString("\n")
	}
	return buf.String(), nil
}
// imageToBase64 loads the file at path and returns a single-element slice
// holding the standard base64 encoding of its bytes. The bytes are encoded
// as read, without any image decoding. A read failure is returned unchanged.
func imageToBase64(path string) ([]string, error) {
	raw, readErr := os.ReadFile(path)
	if readErr != nil {
		return nil, readErr
	}
	encoded := base64.StdEncoding.EncodeToString(raw)
	return []string{encoded}, nil
}
// pdfToImages rasterises the document at path with the external pdftoppm
// tool (JPEG output, 150 DPI) and returns one base64-encoded JPEG per
// generated file, ordered by file name.
//
// NOTE(review): file-name order equals page order only as long as pdftoppm
// zero-pads the page number in the output names — confirm for the deployed
// version.
//
// Pages are rendered into a temporary directory that is removed before the
// function returns. An error is returned if the temporary directory cannot be
// created, if pdftoppm cannot be run or exits non-zero (its combined output is
// included in the message), if any rendered page cannot be read back, or if
// no page was produced.
func pdfToImages(path string) ([]string, error) {
	tmpDir, err := os.MkdirTemp("", "pdf2img-")
	if err != nil {
		return nil, fmt.Errorf("create temp dir: %w", err)
	}
	defer os.RemoveAll(tmpDir)

	prefix := filepath.Join(tmpDir, "page")
	cmd := exec.Command("pdftoppm", "-jpeg", "-r", "150", path, prefix)
	if out, err := cmd.CombinedOutput(); err != nil {
		return nil, fmt.Errorf("pdftoppm failed: %w: %s", err, string(out))
	}

	entries, err := os.ReadDir(tmpDir)
	if err != nil {
		return nil, fmt.Errorf("list rendered pages: %w", err)
	}
	names := make([]string, 0, len(entries))
	for _, e := range entries {
		if !e.IsDir() && strings.HasSuffix(e.Name(), ".jpg") {
			names = append(names, e.Name())
		}
	}
	sort.Strings(names)

	images := make([]string, 0, len(names))
	for _, name := range names {
		data, err := os.ReadFile(filepath.Join(tmpDir, name))
		if err != nil {
			// Skipping the page would silently shift the index of every
			// later page, so fail the whole conversion instead.
			return nil, fmt.Errorf("read rendered page %s: %w", name, err)
		}
		images = append(images, base64.StdEncoding.EncodeToString(data))
	}
	if len(images) == 0 {
		return nil, fmt.Errorf("pdftoppm produced no images")
	}
	return images, nil
}
// IsXLSX reports whether path carries an Excel extension (.xlsx or .xls),
// compared case-insensitively. Only the name is inspected; the file itself is
// never opened.
func IsXLSX(path string) bool {
	switch strings.ToLower(filepath.Ext(path)) {
	case ".xlsx", ".xls":
		return true
	}
	return false
}