Skip to content

Commit

Permalink
[feature] Add word crop general orientation to output (mindee#1546)
Browse files: browse the repository at this point in the history
  • Loading branch information
felixdittrich92 authored Apr 12, 2024
1 parent d932175 commit dcaae42
Show file tree
Hide file tree
Showing 21 changed files with 323 additions and 69 deletions.
6 changes: 4 additions & 2 deletions api/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -164,7 +164,8 @@ should yield
0.8272978149561669,
0.20703125
],
"confidence": 1.0
"confidence": 1.0,
"crop_orientation": {"value": 0, "confidence": null}
},
{
"value": "world!",
Expand All @@ -174,7 +175,8 @@ should yield
0.9101580212741838,
0.2080078125
],
"confidence": 1.0
"confidence": 1.0,
"crop_orientation": {"value": 0, "confidence": null}
}
]
}
Expand Down
1 change: 1 addition & 0 deletions api/app/routes/kie.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,7 @@ async def perform_kie(request: KIEIn = Depends(), files: List[UploadFile] = [Fil
value=prediction.value,
geometry=resolve_geometry(prediction.geometry),
confidence=round(prediction.confidence, 2),
crop_orientation=prediction.crop_orientation,
)
for prediction in page.predictions[class_name]
],
Expand Down
1 change: 1 addition & 0 deletions api/app/routes/ocr.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,7 @@ async def perform_ocr(request: OCRIn = Depends(), files: List[UploadFile] = [Fil
value=word.value,
geometry=resolve_geometry(word.geometry),
confidence=round(word.confidence, 2),
crop_orientation=word.crop_orientation,
)
for word in line.words
],
Expand Down
52 changes: 45 additions & 7 deletions api/app/schemas.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
# This program is licensed under the Apache License 2.0.
# See LICENSE or go to <https://opensource.org/licenses/Apache-2.0> for full license details.

from typing import Dict, List, Tuple, Union
from typing import Any, Dict, List, Tuple, Union

from pydantic import BaseModel, Field

Expand Down Expand Up @@ -59,12 +59,21 @@ class OCRWord(BaseModel):
value: str = Field(..., examples=["example"])
geometry: List[float] = Field(..., examples=[[0.0, 0.0, 0.0, 0.0]])
confidence: float = Field(..., examples=[0.99])
crop_orientation: Dict[str, Any] = Field(..., examples=[{"value": 0, "confidence": None}])


class OCRLine(BaseModel):
geometry: List[float] = Field(..., examples=[[0.0, 0.0, 0.0, 0.0]])
words: List[OCRWord] = Field(
..., examples=[{"value": "example", "geometry": [0.0, 0.0, 0.0, 0.0], "confidence": 0.99}]
...,
examples=[
{
"value": "example",
"geometry": [0.0, 0.0, 0.0, 0.0],
"confidence": 0.99,
"crop_orientation": {"value": 0, "confidence": None},
}
],
)


Expand All @@ -75,7 +84,14 @@ class OCRBlock(BaseModel):
examples=[
{
"geometry": [0.0, 0.0, 0.0, 0.0],
"words": [{"value": "example", "geometry": [0.0, 0.0, 0.0, 0.0], "confidence": 0.99}],
"words": [
{
"value": "example",
"geometry": [0.0, 0.0, 0.0, 0.0],
"confidence": 0.99,
"crop_orientation": {"value": 0, "confidence": None},
}
],
}
],
)
Expand All @@ -90,7 +106,14 @@ class OCRPage(BaseModel):
"lines": [
{
"geometry": [0.0, 0.0, 0.0, 0.0],
"words": [{"value": "example", "geometry": [0.0, 0.0, 0.0, 0.0], "confidence": 0.99}],
"words": [
{
"value": "example",
"geometry": [0.0, 0.0, 0.0, 0.0],
"confidence": 0.99,
"crop_orientation": {"value": 0, "confidence": None},
}
],
}
],
}
Expand All @@ -111,7 +134,14 @@ class OCROut(BaseModel):
"lines": [
{
"geometry": [0.0, 0.0, 0.0, 0.0],
"words": [{"value": "example", "geometry": [0.0, 0.0, 0.0, 0.0], "confidence": 0.99}],
"words": [
{
"value": "example",
"geometry": [0.0, 0.0, 0.0, 0.0],
"confidence": 0.99,
"crop_orientation": {"value": 0, "confidence": None},
}
],
}
],
}
Expand All @@ -121,8 +151,16 @@ class OCROut(BaseModel):

class KIEElement(BaseModel):
class_name: str = Field(..., examples=["example"])
items: List[Dict[str, Union[str, List[float], float]]] = Field(
..., examples=[{"value": "example", "geometry": [0.0, 0.0, 0.0, 0.0], "confidence": 0.99}]
items: List[Dict[str, Union[str, List[float], float, Dict[str, Any]]]] = Field(
...,
examples=[
{
"value": "example",
"geometry": [0.0, 0.0, 0.0, 0.0],
"confidence": 0.99,
"crop_orientation": {"value": 0, "confidence": None},
}
],
)


Expand Down
8 changes: 8 additions & 0 deletions api/tests/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -85,11 +85,13 @@ def mock_kie_response():
"value": "Hello",
"geometry": [0.7471996155154171, 0.1796875, 0.8272978149561669, 0.20703125],
"confidence": 1,
"crop_orientation": {"value": 0, "confidence": None},
},
{
"value": "world!",
"geometry": [0.8176307908857315, 0.1787109375, 0.9101580212741838, 0.2080078125],
"confidence": 1,
"crop_orientation": {"value": 0, "confidence": None},
},
],
}
Expand Down Expand Up @@ -117,6 +119,7 @@ def mock_kie_response():
0.20540954172611237,
],
"confidence": 0.99,
"crop_orientation": {"value": 0, "confidence": 1},
},
{
"value": "world!",
Expand All @@ -131,6 +134,7 @@ def mock_kie_response():
0.20735852420330048,
],
"confidence": 1,
"crop_orientation": {"value": 0, "confidence": 1},
},
],
}
Expand Down Expand Up @@ -160,6 +164,7 @@ def mock_ocr_response():
"value": "Hello",
"geometry": [0.7471996155154171, 0.1796875, 0.8272978149561669, 0.20703125],
"confidence": 1,
"crop_orientation": {"value": 0, "confidence": None},
},
{
"value": "world!",
Expand All @@ -170,6 +175,7 @@ def mock_ocr_response():
0.2080078125,
],
"confidence": 1,
"crop_orientation": {"value": 0, "confidence": None},
},
],
}
Expand Down Expand Up @@ -224,6 +230,7 @@ def mock_ocr_response():
0.20540954172611237,
],
"confidence": 0.99,
"crop_orientation": {"value": 0, "confidence": 1},
},
{
"value": "world!",
Expand All @@ -238,6 +245,7 @@ def mock_ocr_response():
0.20735852420330048,
],
"confidence": 1,
"crop_orientation": {"value": 0, "confidence": 1},
},
],
}
Expand Down
4 changes: 4 additions & 0 deletions api/tests/routes/test_kie.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,10 @@ def common_test(json_response, expected_response):
assert isinstance(pred_item["value"], str) and pred_item["value"] == expected_pred_item["value"]
assert isinstance(pred_item["confidence"], (int, float))
np.testing.assert_allclose(pred_item["geometry"], expected_pred_item["geometry"], rtol=1e-2)
assert isinstance(pred_item["crop_orientation"], dict)
assert isinstance(pred_item["crop_orientation"]["value"], int) and isinstance(
pred_item["crop_orientation"]["confidence"], (float, int, type(None))
)


@pytest.mark.asyncio
Expand Down
4 changes: 4 additions & 0 deletions api/tests/routes/test_ocr.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,10 @@ def common_test(json_response, expected_response):
np.testing.assert_allclose(word["geometry"], expected_word["geometry"], rtol=1e-2)
assert isinstance(word["value"], str) and word["value"] == expected_word["value"]
assert isinstance(word["confidence"], (int, float))
assert isinstance(word["crop_orientation"], dict)
assert isinstance(word["crop_orientation"]["value"], int) and isinstance(
word["crop_orientation"]["confidence"], (float, int, type(None))
)


@pytest.mark.asyncio
Expand Down
9 changes: 6 additions & 3 deletions docs/source/using_doctr/using_models.rst
Original file line number Diff line number Diff line change
Expand Up @@ -377,17 +377,20 @@ For reference, here is the export for the same `Document` as above::
{
'value': 'No.',
'confidence': 0.914085328578949,
'geometry': ((0.5478515625, 0.06640625), (0.5810546875, 0.0966796875))
'geometry': ((0.5478515625, 0.06640625), (0.5810546875, 0.0966796875)),
'crop_orientation': {'value': 0, 'confidence': None},
},
{
'value': 'RECEIPT',
'confidence': 0.9949972033500671,
'geometry': ((0.1357421875, 0.0361328125), (0.51171875, 0.1630859375))
'geometry': ((0.1357421875, 0.0361328125), (0.51171875, 0.1630859375)),
'crop_orientation': {'value': 0, 'confidence': None},
},
{
'value': 'DATE',
'confidence': 0.9578408598899841,
'geometry': ((0.1396484375, 0.3232421875), (0.185546875, 0.3515625))
'geometry': ((0.1396484375, 0.3232421875), (0.185546875, 0.3515625)),
'crop_orientation': {'value': 0, 'confidence': None},
}
]
}
Expand Down
12 changes: 10 additions & 2 deletions doctr/io/elements.py
Original file line number Diff line number Diff line change
Expand Up @@ -67,16 +67,24 @@ class Word(Element):
confidence: the confidence associated with the text prediction
geometry: bounding box of the word in format ((xmin, ymin), (xmax, ymax)) where coordinates are relative to
the page's size
crop_orientation: a dict holding the general orientation of the crop in degrees (key `value`) and the confidence of that orientation (key `confidence`)
"""

_exported_keys: List[str] = ["value", "confidence", "geometry"]
_exported_keys: List[str] = ["value", "confidence", "geometry", "crop_orientation"]
_children_names: List[str] = []

def __init__(self, value: str, confidence: float, geometry: Union[BoundingBox, np.ndarray]) -> None:
def __init__(
self,
value: str,
confidence: float,
geometry: Union[BoundingBox, np.ndarray],
crop_orientation: Dict[str, Any],
) -> None:
super().__init__()
self.value = value
self.confidence = confidence
self.geometry = geometry
self.crop_orientation = crop_orientation

def render(self) -> str:
"""Renders the full text of the element"""
Expand Down
Loading

0 comments on commit dcaae42

Please sign in to comment.