// Package extract converts files into base64-encoded images or markdown text
// for downstream extraction.
package extract

import (
	"bytes"
	"encoding/base64"
	"fmt"
	"os"
	"os/exec"
	"path/filepath"
	"sort"
	"strings"

	"github.com/xuri/excelize/v2"
)

// markdownCellReplacer makes a cell value safe to embed in a markdown table
// row: line breaks would end the row and a bare pipe would start a new column.
var markdownCellReplacer = strings.NewReplacer(
	"\r\n", " ",
	"\n", " ",
	"\r", " ",
	"|", `\|`,
)

// FileToImages converts a file to base64-encoded images for vision extraction.
//
//   - Image files (jpg/jpeg/png/gif/bmp/webp) are returned as a single
//     base64 string of the file's bytes, in their original format.
//   - Excel files (see IsXLSX) yield (nil, nil); the caller should use
//     XLSXToText instead.
//   - PDFs are rasterised page by page with pdftoppm into JPEGs.
//   - Any other extension is handed to pdftoppm as a last resort; if that
//     fails, an "unsupported file type" error wrapping the cause is returned.
func FileToImages(path string) ([]string, error) {
	if IsXLSX(path) {
		return nil, nil // caller uses XLSXToText
	}

	ext := strings.ToLower(filepath.Ext(path))
	switch ext {
	case ".jpg", ".jpeg", ".png", ".gif", ".bmp", ".webp":
		return imageToBase64(path)
	case ".pdf":
		return pdfToImages(path)
	default:
		// Unknown extension: the file may still be a PDF, so let pdftoppm try.
		imgs, err := pdfToImages(path)
		if err != nil {
			return nil, fmt.Errorf("unsupported file type %q: %w", ext, err)
		}
		return imgs, nil
	}
}

// XLSXToText extracts all sheets from an XLSX file as markdown tables.
//
// Each non-empty sheet becomes a "## <sheet name>" heading followed by a
// table whose header is the sheet's first row. Sheets that cannot be read are
// skipped so that one bad sheet does not discard the rest; if nothing at all
// could be extracted and at least one sheet failed, the first such error is
// returned instead of an empty string.
func XLSXToText(path string) (string, error) {
	f, err := excelize.OpenFile(path)
	if err != nil {
		return "", fmt.Errorf("open xlsx: %w", err)
	}
	defer f.Close()

	var (
		buf      bytes.Buffer
		firstErr error
	)
	for _, sheetName := range f.GetSheetList() {
		rows, err := f.GetRows(sheetName)
		if err != nil {
			if firstErr == nil {
				firstErr = fmt.Errorf("read sheet %q: %w", sheetName, err)
			}
			continue
		}
		writeMarkdownTable(&buf, sheetName, rows)
	}

	if buf.Len() == 0 && firstErr != nil {
		return "", firstErr
	}
	return buf.String(), nil
}

// writeMarkdownTable appends one sheet to buf as a heading plus a markdown
// table. Sheets without any cells produce no output.
func writeMarkdownTable(buf *bytes.Buffer, title string, rows [][]string) {
	// Rows may have different lengths, so size the table by the widest row
	// rather than by the header alone; otherwise wider data rows would spill
	// past the header and separator.
	width := 0
	for _, row := range rows {
		if len(row) > width {
			width = len(row)
		}
	}
	if width == 0 {
		return
	}

	fmt.Fprintf(buf, "## %s\n\n", title)
	writeMarkdownRow(buf, rows[0], width)
	buf.WriteString("|" + strings.Repeat(" --- |", width) + "\n")
	for _, row := range rows[1:] {
		writeMarkdownRow(buf, row, width)
	}
	buf.WriteString("\n")
}

// writeMarkdownRow appends a single table row, escaping each cell and padding
// the row with empty cells up to width columns.
func writeMarkdownRow(buf *bytes.Buffer, row []string, width int) {
	cells := make([]string, width)
	for i, cell := range row {
		cells[i] = markdownCellReplacer.Replace(cell)
	}
	buf.WriteString("| " + strings.Join(cells, " | ") + " |\n")
}

// imageToBase64 reads the file at path and returns its bytes as a
// single-element slice of standard base64.
func imageToBase64(path string) ([]string, error) {
	data, err := os.ReadFile(path)
	if err != nil {
		return nil, fmt.Errorf("read image: %w", err)
	}
	return []string{base64.StdEncoding.EncodeToString(data)}, nil
}

// pdfToImages rasterises the file at path with pdftoppm (JPEG, 150 DPI) into
// a temporary directory and returns the pages as base64 strings in filename
// order. The temporary directory is removed before returning.
//
// It returns an error if pdftoppm fails, if any rendered page cannot be read
// back, or if no pages were produced.
func pdfToImages(path string) ([]string, error) {
	tmpDir, err := os.MkdirTemp("", "pdf2img-")
	if err != nil {
		return nil, fmt.Errorf("create temp dir: %w", err)
	}
	defer os.RemoveAll(tmpDir)

	prefix := filepath.Join(tmpDir, "page")
	cmd := exec.Command("pdftoppm", "-jpeg", "-r", "150", path, prefix)
	if out, err := cmd.CombinedOutput(); err != nil {
		return nil, fmt.Errorf("pdftoppm failed: %w: %s", err, string(out))
	}

	entries, err := os.ReadDir(tmpDir)
	if err != nil {
		return nil, fmt.Errorf("list rendered pages: %w", err)
	}

	names := make([]string, 0, len(entries))
	for _, e := range entries {
		if strings.HasSuffix(e.Name(), ".jpg") {
			names = append(names, e.Name())
		}
	}
	// Lexical order is used as page order.
	// NOTE(review): this assumes pdftoppm zero-pads the page number in the
	// output filenames — confirm for the installed poppler version.
	sort.Strings(names)

	images := make([]string, 0, len(names))
	for _, name := range names {
		data, err := os.ReadFile(filepath.Join(tmpDir, name))
		if err != nil {
			// Dropping a page silently would hand the caller an incomplete
			// document with no indication that anything is missing.
			return nil, fmt.Errorf("read rendered page %s: %w", name, err)
		}
		images = append(images, base64.StdEncoding.EncodeToString(data))
	}

	if len(images) == 0 {
		return nil, fmt.Errorf("pdftoppm produced no images")
	}
	return images, nil
}

// IsXLSX returns true if the file is an Excel file, judged only by a
// case-insensitive ".xlsx" or ".xls" extension.
//
// NOTE(review): legacy ".xls" is accepted here, but XLSXToText opens files
// with excelize — confirm that library can actually read ".xls" workbooks.
func IsXLSX(path string) bool {
	ext := strings.ToLower(filepath.Ext(path))
	return ext == ".xlsx" || ext == ".xls"
}