135 lines
3.1 KiB
Go
135 lines
3.1 KiB
Go
package extract
|
|
|
|
import (
|
|
"bytes"
|
|
"encoding/base64"
|
|
"fmt"
|
|
"os"
|
|
"os/exec"
|
|
"path/filepath"
|
|
"sort"
|
|
"strings"
|
|
|
|
"github.com/xuri/excelize/v2"
|
|
)
|
|
|
|
// FileToImages converts a file to base64-encoded JPEG images for vision extraction.
|
|
// For images (jpg/png), returns the base64 directly.
|
|
// For XLSX, returns nil (caller should use XLSXToText instead).
|
|
// For PDF, uses pdftoppm to rasterise pages.
|
|
func FileToImages(path string) ([]string, error) {
|
|
ext := strings.ToLower(filepath.Ext(path))
|
|
|
|
switch ext {
|
|
case ".jpg", ".jpeg", ".png", ".gif", ".bmp", ".webp":
|
|
return imageToBase64(path)
|
|
case ".xlsx", ".xls":
|
|
return nil, nil // caller uses XLSXToText
|
|
case ".pdf":
|
|
return pdfToImages(path)
|
|
default:
|
|
// Try pdftoppm anyway; if it fails, return empty
|
|
imgs, err := pdfToImages(path)
|
|
if err != nil || len(imgs) == 0 {
|
|
return nil, fmt.Errorf("unsupported file type %s", ext)
|
|
}
|
|
return imgs, nil
|
|
}
|
|
}
|
|
|
|
// XLSXToText extracts all sheets from an XLSX file as markdown tables.
|
|
func XLSXToText(path string) (string, error) {
|
|
f, err := excelize.OpenFile(path)
|
|
if err != nil {
|
|
return "", fmt.Errorf("open xlsx: %w", err)
|
|
}
|
|
defer f.Close()
|
|
|
|
var buf bytes.Buffer
|
|
for _, sheetName := range f.GetSheetList() {
|
|
rows, err := f.GetRows(sheetName)
|
|
if err != nil {
|
|
continue
|
|
}
|
|
if len(rows) == 0 {
|
|
continue
|
|
}
|
|
|
|
buf.WriteString(fmt.Sprintf("## %s\n\n", sheetName))
|
|
|
|
// Write as markdown table
|
|
if len(rows) > 0 {
|
|
// Header row
|
|
buf.WriteString("| " + strings.Join(rows[0], " | ") + " |\n")
|
|
buf.WriteString("|" + strings.Repeat(" --- |", len(rows[0])) + "\n")
|
|
// Data rows
|
|
for _, row := range rows[1:] {
|
|
// Pad row if shorter than header
|
|
for len(row) < len(rows[0]) {
|
|
row = append(row, "")
|
|
}
|
|
buf.WriteString("| " + strings.Join(row, " | ") + " |\n")
|
|
}
|
|
}
|
|
buf.WriteString("\n")
|
|
}
|
|
|
|
return buf.String(), nil
|
|
}
|
|
|
|
// imageToBase64 reads the file at path and returns its bytes, encoded with
// standard base64, as a single-element slice. The content is not decoded or
// converted in any way. A read failure is returned unchanged.
func imageToBase64(path string) ([]string, error) {
	raw, readErr := os.ReadFile(path)
	if readErr != nil {
		return nil, readErr
	}

	encoded := base64.StdEncoding.EncodeToString(raw)
	return []string{encoded}, nil
}
|
|
|
|
// pdfToImages rasterises the document at path with the external pdftoppm
// tool (JPEG output, 150 DPI) and returns the generated ".jpg" files as
// base64 strings, ordered by file name.
//
// pdftoppm writes into a temporary directory that is removed before the
// function returns. An error is returned when the temporary directory cannot
// be created or listed, when pdftoppm fails (its combined output is included
// in the error), when a generated image cannot be read back, or when no
// ".jpg" file was produced.
func pdfToImages(path string) ([]string, error) {
	tmpDir, err := os.MkdirTemp("", "pdf2img-")
	if err != nil {
		return nil, err
	}
	defer os.RemoveAll(tmpDir)

	prefix := filepath.Join(tmpDir, "page")
	cmd := exec.Command("pdftoppm", "-jpeg", "-r", "150", path, prefix)
	if out, err := cmd.CombinedOutput(); err != nil {
		return nil, fmt.Errorf("pdftoppm failed: %w: %s", err, string(out))
	}

	entries, err := os.ReadDir(tmpDir)
	if err != nil {
		return nil, err
	}

	names := make([]string, 0, len(entries))
	for _, e := range entries {
		if strings.HasSuffix(e.Name(), ".jpg") {
			names = append(names, e.Name())
		}
	}
	// NOTE(review): presumably pdftoppm zero-pads page numbers so that file
	// name order equals page order — confirm against the pdftoppm docs.
	sort.Strings(names)

	images := make([]string, 0, len(names))
	for _, name := range names {
		data, err := os.ReadFile(filepath.Join(tmpDir, name))
		if err != nil {
			// Skipping an unreadable page would hand the caller an
			// incomplete document with no indication that anything is
			// missing, so fail instead.
			return nil, fmt.Errorf("read page image %s: %w", name, err)
		}
		images = append(images, base64.StdEncoding.EncodeToString(data))
	}

	if len(images) == 0 {
		return nil, fmt.Errorf("pdftoppm produced no images")
	}
	return images, nil
}
|
|
|
|
// IsXLSX reports whether path ends in an Excel extension (".xlsx" or ".xls").
// The extension is lower-cased before it is compared.
func IsXLSX(path string) bool {
	switch strings.ToLower(filepath.Ext(path)) {
	case ".xlsx", ".xls":
		return true
	default:
		return false
	}
}
|