@@ -165,6 +165,10 @@ func generateHOCR(doc *documentaipb.Document) string {
165
165
for pageNum , page := range doc .GetPages () {
166
166
pageWidth := page .GetDimension ().GetWidth ()
167
167
pageHeight := page .GetDimension ().GetHeight ()
168
+ // Skip pages whose reported width or height is not positive
169
+ if pageWidth <= 0 || pageHeight <= 0 {
170
+ continue
171
+ }
168
172
169
173
hocr .WriteString (fmt .Sprintf (`
170
174
<div class='ocr_page' id='page_%d' title='image;bbox 0 0 %d %d'>` ,
@@ -178,10 +182,18 @@ func generateHOCR(doc *documentaipb.Document) string {
178
182
}
179
183
180
184
// Convert normalized coordinates to absolute
181
- x1 := int (paraBox [0 ].GetX () * pageWidth )
182
- y1 := int (paraBox [0 ].GetY () * pageHeight )
183
- x2 := int (paraBox [2 ].GetX () * pageWidth )
184
- y2 := int (paraBox [2 ].GetY () * pageHeight )
185
+ // Multiply in float64 for extra precision; the result is still truncated by int()
186
+ x1 := int (float64 (paraBox [0 ].GetX ()) * float64 (pageWidth ))
187
+ y1 := int (float64 (paraBox [0 ].GetY ()) * float64 (pageHeight ))
188
+ x2 := int (float64 (paraBox [2 ].GetX ()) * float64 (pageWidth ))
189
+ y2 := int (float64 (paraBox [2 ].GetY ()) * float64 (pageHeight ))
190
+
191
+ // Validate coordinates
192
+ if x1 < 0 || y1 < 0 || x2 < 0 || y2 < 0 ||
193
+ x1 > int (pageWidth ) || y1 > int (pageHeight ) ||
194
+ x2 > int (pageWidth ) || y2 > int (pageHeight ) {
195
+ continue
196
+ }
185
197
186
198
hocr .WriteString (fmt .Sprintf (`
187
199
<p class='ocr_par' id='par_%d_%d' title='bbox %d %d %d %d'>` ,
@@ -194,6 +206,9 @@ func generateHOCR(doc *documentaipb.Document) string {
194
206
continue
195
207
}
196
208
209
+ // Escape HTML special characters
210
+ text = html .EscapeString (text )
211
+
197
212
hocr .WriteString (fmt .Sprintf (`
198
213
<span class='ocrx_word'>%s</span>` , text ))
199
214
}
0 commit comments