fix: glob for pdftoppm output filename instead of hardcoding page-1.png

pdftoppm zero-pads the page number based on total page count:
- <10 pages: page-1.png
- <100 pages: page-01.png
- <1000 pages: page-001.png

The code hardcoded 'page-1.png' and 'page-N.png', which fails for any
document with 10 or more pages, because pdftoppm then zero-pads the page
number. Use filepath.Glob('page-*.png') to find the actual output
regardless of padding width.

Fixed in both ConvertToImage() (first-page preview) and the multi-page
OCR loop in ProcessPDFPageByPage().
This commit is contained in:
James 2026-03-23 14:14:28 -04:00
parent 9622ab9390
commit 883f118d66
1 changed files with 15 additions and 4 deletions

19
ai.go
View File

@ -116,8 +116,14 @@ func ConvertToImage(filePath string) ([]byte, error) {
return nil, fmt.Errorf("pdftoppm failed: %w", err) return nil, fmt.Errorf("pdftoppm failed: %w", err)
} }
pngPath := filepath.Join(tmpDir, "page-1.png") // pdftoppm uses variable-width zero-padding depending on page count
return os.ReadFile(pngPath) // (e.g. page-01.png for <100 pages, page-001.png for <1000 pages).
// Glob for the first match instead of hardcoding "page-1.png".
matches, err := filepath.Glob(filepath.Join(tmpDir, "page-*.png"))
if err != nil || len(matches) == 0 {
return nil, fmt.Errorf("pdftoppm output not found in %s", tmpDir)
}
return os.ReadFile(matches[0])
} }
// Image files — read directly // Image files — read directly
@ -483,8 +489,13 @@ func ProcessPDFPageByPage(filePath string, jobID string) (string, error) {
continue continue
} }
pngPath := filepath.Join(tmpDir, fmt.Sprintf("page-%d.png", page)) // Glob for the output — pdftoppm zero-pads based on total page count
imageData, err := os.ReadFile(pngPath) pageMatches, _ := filepath.Glob(filepath.Join(tmpDir, "page-*.png"))
if len(pageMatches) == 0 {
os.RemoveAll(tmpDir)
continue
}
imageData, err := os.ReadFile(pageMatches[0])
os.RemoveAll(tmpDir) os.RemoveAll(tmpDir)
if err != nil { if err != nil {
continue continue