Skip to content

Commit eb37f27

Browse files
Update ocr/google_docai_provider.go
Co-authored-by: coderabbitai[bot] <136622811+coderabbitai[bot]@users.noreply.github.com>
1 parent f0a73ed commit eb37f27

File tree

1 file changed

+19
-4
lines changed

1 file changed

+19
-4
lines changed

ocr/google_docai_provider.go

+19 -4
Original file line number | Diff line number | Diff line change
@@ -165,6 +165,10 @@ func generateHOCR(doc *documentaipb.Document) string {
165165
for pageNum, page := range doc.GetPages() {
166166
pageWidth := page.GetDimension().GetWidth()
167167
pageHeight := page.GetDimension().GetHeight()
168+
// Validate dimensions
169+
if pageWidth <= 0 || pageHeight <= 0 {
170+
continue
171+
}
168172

169173
hocr.WriteString(fmt.Sprintf(`
170174
<div class='ocr_page' id='page_%d' title='image;bbox 0 0 %d %d'>`,
@@ -178,10 +182,18 @@ func generateHOCR(doc *documentaipb.Document) string {
178182
}
179183

180184
// Convert normalized coordinates to absolute
181-
x1 := int(paraBox[0].GetX() * pageWidth)
182-
y1 := int(paraBox[0].GetY() * pageHeight)
183-
x2 := int(paraBox[2].GetX() * pageWidth)
184-
y2 := int(paraBox[2].GetY() * pageHeight)
185+
// Use float64 for intermediate calculations to prevent overflow
186+
x1 := int(float64(paraBox[0].GetX()) * float64(pageWidth))
187+
y1 := int(float64(paraBox[0].GetY()) * float64(pageHeight))
188+
x2 := int(float64(paraBox[2].GetX()) * float64(pageWidth))
189+
y2 := int(float64(paraBox[2].GetY()) * float64(pageHeight))
190+
191+
// Validate coordinates
192+
if x1 < 0 || y1 < 0 || x2 < 0 || y2 < 0 ||
193+
x1 > int(pageWidth) || y1 > int(pageHeight) ||
194+
x2 > int(pageWidth) || y2 > int(pageHeight) {
195+
continue
196+
}
185197

186198
hocr.WriteString(fmt.Sprintf(`
187199
<p class='ocr_par' id='par_%d_%d' title='bbox %d %d %d %d'>`,
@@ -194,6 +206,9 @@ func generateHOCR(doc *documentaipb.Document) string {
194206
continue
195207
}
196208

209+
// Escape HTML special characters
210+
text = html.EscapeString(text)
211+
197212
hocr.WriteString(fmt.Sprintf(`
198213
<span class='ocrx_word'>%s</span>`, text))
199214
}

0 commit comments

Comments
 (0)