diff --git a/ocr/google_docai_provider.go b/ocr/google_docai_provider.go
index 1e7413e..1c50625 100644
--- a/ocr/google_docai_provider.go
+++ b/ocr/google_docai_provider.go
@@ -165,6 +165,10 @@ func generateHOCR(doc *documentaipb.Document) string {
for pageNum, page := range doc.GetPages() {
pageWidth := page.GetDimension().GetWidth()
pageHeight := page.GetDimension().GetHeight()
+ // Validate dimensions
+ if pageWidth <= 0 || pageHeight <= 0 {
+ continue
+ }
hocr.WriteString(fmt.Sprintf(`
`,
@@ -178,10 +182,18 @@ func generateHOCR(doc *documentaipb.Document) string {
}
// Convert normalized coordinates to absolute
- x1 := int(paraBox[0].GetX() * pageWidth)
- y1 := int(paraBox[0].GetY() * pageHeight)
- x2 := int(paraBox[2].GetX() * pageWidth)
- y2 := int(paraBox[2].GetY() * pageHeight)
+ // Multiply in float64 to limit rounding error; the result is range-checked below
+ x1 := int(float64(paraBox[0].GetX()) * float64(pageWidth))
+ y1 := int(float64(paraBox[0].GetY()) * float64(pageHeight))
+ x2 := int(float64(paraBox[2].GetX()) * float64(pageWidth))
+ y2 := int(float64(paraBox[2].GetY()) * float64(pageHeight))
+
+ // Validate coordinates
+ if x1 < 0 || y1 < 0 || x2 < 0 || y2 < 0 ||
+ x1 > int(pageWidth) || y1 > int(pageHeight) ||
+ x2 > int(pageWidth) || y2 > int(pageHeight) {
+ continue
+ }
hocr.WriteString(fmt.Sprintf(`
`,
@@ -194,6 +206,9 @@ func generateHOCR(doc *documentaipb.Document) string {
continue
}
+ // Escape HTML special characters
+ text = html.EscapeString(text)
+
hocr.WriteString(fmt.Sprintf(`
%s`, text))
}