Skip to content

Commit

Permalink
Update ocr/google_docai_provider.go
Browse files (browse the repository at this point in the history)
Co-authored-by: coderabbitai[bot] <136622811+coderabbitai[bot]@users.noreply.github.com>
  • Loading branch information
icereed and coderabbitai[bot] authored Feb 12, 2025
1 parent f0a73ed commit eb37f27
Showing 1 changed file with 19 additions and 4 deletions.
23 changes: 19 additions & 4 deletions ocr/google_docai_provider.go
Original file line number Diff line number Diff line change
Expand Up @@ -165,6 +165,10 @@ func generateHOCR(doc *documentaipb.Document) string {
for pageNum, page := range doc.GetPages() {
pageWidth := page.GetDimension().GetWidth()
pageHeight := page.GetDimension().GetHeight()
// Validate dimensions
if pageWidth <= 0 || pageHeight <= 0 {
continue
}

hocr.WriteString(fmt.Sprintf(`
<div class='ocr_page' id='page_%d' title='image;bbox 0 0 %d %d'>`,
Expand All @@ -178,10 +182,18 @@ func generateHOCR(doc *documentaipb.Document) string {
}

// Convert normalized coordinates to absolute
x1 := int(paraBox[0].GetX() * pageWidth)
y1 := int(paraBox[0].GetY() * pageHeight)
x2 := int(paraBox[2].GetX() * pageWidth)
y2 := int(paraBox[2].GetY() * pageHeight)
// Use float64 for intermediate calculations to prevent overflow
x1 := int(float64(paraBox[0].GetX()) * float64(pageWidth))
y1 := int(float64(paraBox[0].GetY()) * float64(pageHeight))
x2 := int(float64(paraBox[2].GetX()) * float64(pageWidth))
y2 := int(float64(paraBox[2].GetY()) * float64(pageHeight))

// Validate coordinates
if x1 < 0 || y1 < 0 || x2 < 0 || y2 < 0 ||
x1 > int(pageWidth) || y1 > int(pageHeight) ||
x2 > int(pageWidth) || y2 > int(pageHeight) {
continue
}

hocr.WriteString(fmt.Sprintf(`
<p class='ocr_par' id='par_%d_%d' title='bbox %d %d %d %d'>`,
Expand All @@ -194,6 +206,9 @@ func generateHOCR(doc *documentaipb.Document) string {
continue
}

// Escape HTML special characters
text = html.EscapeString(text)

Check failure on line 210 in ocr/google_docai_provider.go

View workflow job for this annotation

GitHub Actions / test

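undefined: html (this commit calls html.EscapeString but does not add the standard-library "html" package to the file's imports)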

hocr.WriteString(fmt.Sprintf(`
<span class='ocrx_word'>%s</span>`, text))
}
Expand Down

0 comments on commit eb37f27

Please sign in to comment.