fix: glob for pdftoppm output filename instead of hardcoding page-1.png
pdftoppm zero-pads the page number based on total page count:
- <10 pages: page-1.png
- <100 pages: page-01.png
- <1000 pages: page-001.png
The code hardcoded 'page-1.png' and 'page-N.png', which fails for any
document with 10 or more pages, because pdftoppm then zero-pads the page
number in the output filename. Use filepath.Glob('page-*.png') to find the
actual output regardless of padding width.
Fixed in both ConvertToImage() (first-page preview) and the multi-page
OCR loop in ProcessPDFPageByPage().
This commit is contained in:
parent
9622ab9390
commit
883f118d66
19
ai.go
19
ai.go
|
|
@ -116,8 +116,14 @@ func ConvertToImage(filePath string) ([]byte, error) {
|
|||
return nil, fmt.Errorf("pdftoppm failed: %w", err)
|
||||
}
|
||||
|
||||
pngPath := filepath.Join(tmpDir, "page-1.png")
|
||||
return os.ReadFile(pngPath)
|
||||
// pdftoppm uses variable-width zero-padding depending on page count
|
||||
// (e.g. page-01.png for <100 pages, page-001.png for <1000 pages).
|
||||
// Glob for the first match instead of hardcoding "page-1.png".
|
||||
matches, err := filepath.Glob(filepath.Join(tmpDir, "page-*.png"))
|
||||
if err != nil || len(matches) == 0 {
|
||||
return nil, fmt.Errorf("pdftoppm output not found in %s", tmpDir)
|
||||
}
|
||||
return os.ReadFile(matches[0])
|
||||
}
|
||||
|
||||
// Image files — read directly
|
||||
|
|
@ -483,8 +489,13 @@ func ProcessPDFPageByPage(filePath string, jobID string) (string, error) {
|
|||
continue
|
||||
}
|
||||
|
||||
pngPath := filepath.Join(tmpDir, fmt.Sprintf("page-%d.png", page))
|
||||
imageData, err := os.ReadFile(pngPath)
|
||||
// Glob for the output — pdftoppm zero-pads based on total page count
|
||||
pageMatches, _ := filepath.Glob(filepath.Join(tmpDir, "page-*.png"))
|
||||
if len(pageMatches) == 0 {
|
||||
os.RemoveAll(tmpDir)
|
||||
continue
|
||||
}
|
||||
imageData, err := os.ReadFile(pageMatches[0])
|
||||
os.RemoveAll(tmpDir)
|
||||
if err != nil {
|
||||
continue
|
||||
|
|
|
|||
Loading…
Reference in New Issue