fix: glob for pdftoppm output filename instead of hardcoding page-1.png

pdftoppm zero-pads the page number in its output filenames based on the total page count:

- <10 pages: page-1.png
- <100 pages: page-01.png
- <1000 pages: page-001.png

The code hardcoded "page-1.png" and "page-N.png", which fails for any document with 10 or more pages. Use filepath.Glob with the pattern "page-*.png" to find the actual output regardless of padding width. Fixed in both ConvertToImage() (first-page preview) and the multi-page OCR loop in ProcessPDFPageByPage().
2026-03-23 14:14:28 -04:00 · 2026-03-23 14:14:28 -04:00 · 883f118d66
parent 9622ab9390
commit 883f118d66
1 changed files with 15 additions and 4 deletions
--- a/ai.go
+++ b/ai.go
@@ -116,8 +116,14 @@ func ConvertToImage(filePath string) ([]byte, error) {
 			return nil, fmt.Errorf("pdftoppm failed: %w", err)
 		}
-		pngPath := filepath.Join(tmpDir, "page-1.png")
-		return os.ReadFile(pngPath)
+		// pdftoppm uses variable-width zero-padding depending on page count
+		// (e.g. page-01.png for <100 pages, page-001.png for <1000 pages).
+		// Glob for the first match instead of hardcoding "page-1.png".
+		matches, err := filepath.Glob(filepath.Join(tmpDir, "page-*.png"))
+		if err != nil || len(matches) == 0 {
+			return nil, fmt.Errorf("pdftoppm output not found in %s", tmpDir)
+		}
+		return os.ReadFile(matches[0])
 	}
 	// Image files — read directly
@@ -483,8 +489,13 @@ func ProcessPDFPageByPage(filePath string, jobID string) (string, error) {
 			continue
 		}
-		pngPath := filepath.Join(tmpDir, fmt.Sprintf("page-%d.png", page))
-		imageData, err := os.ReadFile(pngPath)
+		// Glob for the output — pdftoppm zero-pads based on total page count
+		pageMatches, _ := filepath.Glob(filepath.Join(tmpDir, "page-*.png"))
+		if len(pageMatches) == 0 {
+			os.RemoveAll(tmpDir)
+			continue
+		}
+		imageData, err := os.ReadFile(pageMatches[0])
 		os.RemoveAll(tmpDir)
 		if err != nil {
 			continue