[feature] Add word crop general orientation to output (mindee#1546)

odulcy-mindee · Apr 12, 2024 · dcaae42
1 parent d932175
commit dcaae42

Showing 21 changed files with 323 additions and 69 deletions.
diff --git a/api/README.md b/api/README.md
@@ -164,7 +164,8 @@ should yield
                       0.8272978149561669,
                       0.20703125
                     ],
-                    "confidence": 1.0
+                    "confidence": 1.0,
+                    "crop_orientation": {"value": 0, "confidence": null}
                   },
                   {
                     "value": "world!",
@@ -174,7 +175,8 @@ should yield
                       0.9101580212741838,
                       0.2080078125
                     ],
-                    "confidence": 1.0
+                    "confidence": 1.0,
+                    "crop_orientation": {"value": 0, "confidence": null}
                   }
                 ]
               }

diff --git a/api/app/routes/kie.py b/api/app/routes/kie.py
@@ -39,6 +39,7 @@ async def perform_kie(request: KIEIn = Depends(), files: List[UploadFile] = [Fil
                             value=prediction.value,
                             geometry=resolve_geometry(prediction.geometry),
                             confidence=round(prediction.confidence, 2),
+                            crop_orientation=prediction.crop_orientation,
                         )
                         for prediction in page.predictions[class_name]
                     ],

diff --git a/api/app/routes/ocr.py b/api/app/routes/ocr.py
@@ -45,6 +45,7 @@ async def perform_ocr(request: OCRIn = Depends(), files: List[UploadFile] = [Fil
                                             value=word.value,
                                             geometry=resolve_geometry(word.geometry),
                                             confidence=round(word.confidence, 2),
+                                            crop_orientation=word.crop_orientation,
                                         )
                                         for word in line.words
                                     ],

diff --git a/api/app/schemas.py b/api/app/schemas.py
@@ -3,7 +3,7 @@
 # This program is licensed under the Apache License 2.0.
 # See LICENSE or go to <https://opensource.org/licenses/Apache-2.0> for full license details.
 
-from typing import Dict, List, Tuple, Union
+from typing import Any, Dict, List, Tuple, Union
 
 from pydantic import BaseModel, Field
 
@@ -59,12 +59,21 @@ class OCRWord(BaseModel):
     value: str = Field(..., examples=["example"])
     geometry: List[float] = Field(..., examples=[[0.0, 0.0, 0.0, 0.0]])
     confidence: float = Field(..., examples=[0.99])
+    crop_orientation: Dict[str, Any] = Field(..., examples=[{"value": 0, "confidence": None}])
 
 
 class OCRLine(BaseModel):
     geometry: List[float] = Field(..., examples=[[0.0, 0.0, 0.0, 0.0]])
     words: List[OCRWord] = Field(
-        ..., examples=[{"value": "example", "geometry": [0.0, 0.0, 0.0, 0.0], "confidence": 0.99}]
+        ...,
+        examples=[
+            {
+                "value": "example",
+                "geometry": [0.0, 0.0, 0.0, 0.0],
+                "confidence": 0.99,
+                "crop_orientation": {"value": 0, "confidence": None},
+            }
+        ],
     )
 
 
@@ -75,7 +84,14 @@ class OCRBlock(BaseModel):
         examples=[
             {
                 "geometry": [0.0, 0.0, 0.0, 0.0],
-                "words": [{"value": "example", "geometry": [0.0, 0.0, 0.0, 0.0], "confidence": 0.99}],
+                "words": [
+                    {
+                        "value": "example",
+                        "geometry": [0.0, 0.0, 0.0, 0.0],
+                        "confidence": 0.99,
+                        "crop_orientation": {"value": 0, "confidence": None},
+                    }
+                ],
             }
         ],
     )
@@ -90,7 +106,14 @@ class OCRPage(BaseModel):
                 "lines": [
                     {
                         "geometry": [0.0, 0.0, 0.0, 0.0],
-                        "words": [{"value": "example", "geometry": [0.0, 0.0, 0.0, 0.0], "confidence": 0.99}],
+                        "words": [
+                            {
+                                "value": "example",
+                                "geometry": [0.0, 0.0, 0.0, 0.0],
+                                "confidence": 0.99,
+                                "crop_orientation": {"value": 0, "confidence": None},
+                            }
+                        ],
                     }
                 ],
             }
@@ -111,7 +134,14 @@ class OCROut(BaseModel):
                 "lines": [
                     {
                         "geometry": [0.0, 0.0, 0.0, 0.0],
-                        "words": [{"value": "example", "geometry": [0.0, 0.0, 0.0, 0.0], "confidence": 0.99}],
+                        "words": [
+                            {
+                                "value": "example",
+                                "geometry": [0.0, 0.0, 0.0, 0.0],
+                                "confidence": 0.99,
+                                "crop_orientation": {"value": 0, "confidence": None},
+                            }
+                        ],
                     }
                 ],
             }
@@ -121,8 +151,16 @@ class OCROut(BaseModel):
 
 class KIEElement(BaseModel):
     class_name: str = Field(..., examples=["example"])
-    items: List[Dict[str, Union[str, List[float], float]]] = Field(
-        ..., examples=[{"value": "example", "geometry": [0.0, 0.0, 0.0, 0.0], "confidence": 0.99}]
+    items: List[Dict[str, Union[str, List[float], float, Dict[str, Any]]]] = Field(
+        ...,
+        examples=[
+            {
+                "value": "example",
+                "geometry": [0.0, 0.0, 0.0, 0.0],
+                "confidence": 0.99,
+                "crop_orientation": {"value": 0, "confidence": None},
+            }
+        ],
     )
 
 

diff --git a/api/tests/conftest.py b/api/tests/conftest.py
@@ -85,11 +85,13 @@ def mock_kie_response():
                             "value": "Hello",
                             "geometry": [0.7471996155154171, 0.1796875, 0.8272978149561669, 0.20703125],
                             "confidence": 1,
+                            "crop_orientation": {"value": 0, "confidence": None},
                         },
                         {
                             "value": "world!",
                             "geometry": [0.8176307908857315, 0.1787109375, 0.9101580212741838, 0.2080078125],
                             "confidence": 1,
+                            "crop_orientation": {"value": 0, "confidence": None},
                         },
                     ],
                 }
@@ -117,6 +119,7 @@ def mock_kie_response():
                                 0.20540954172611237,
                             ],
                             "confidence": 0.99,
+                            "crop_orientation": {"value": 0, "confidence": 1},
                         },
                         {
                             "value": "world!",
@@ -131,6 +134,7 @@ def mock_kie_response():
                                 0.20735852420330048,
                             ],
                             "confidence": 1,
+                            "crop_orientation": {"value": 0, "confidence": 1},
                         },
                     ],
                 }
@@ -160,6 +164,7 @@ def mock_ocr_response():
                                             "value": "Hello",
                                             "geometry": [0.7471996155154171, 0.1796875, 0.8272978149561669, 0.20703125],
                                             "confidence": 1,
+                                            "crop_orientation": {"value": 0, "confidence": None},
                                         },
                                         {
                                             "value": "world!",
@@ -170,6 +175,7 @@ def mock_ocr_response():
                                                 0.2080078125,
                                             ],
                                             "confidence": 1,
+                                            "crop_orientation": {"value": 0, "confidence": None},
                                         },
                                     ],
                                 }
@@ -224,6 +230,7 @@ def mock_ocr_response():
                                                 0.20540954172611237,
                                             ],
                                             "confidence": 0.99,
+                                            "crop_orientation": {"value": 0, "confidence": 1},
                                         },
                                         {
                                             "value": "world!",
@@ -238,6 +245,7 @@ def mock_ocr_response():
                                                 0.20735852420330048,
                                             ],
                                             "confidence": 1,
+                                            "crop_orientation": {"value": 0, "confidence": 1},
                                         },
                                     ],
                                 }

diff --git a/api/tests/routes/test_kie.py b/api/tests/routes/test_kie.py
@@ -22,6 +22,10 @@ def common_test(json_response, expected_response):
             assert isinstance(pred_item["value"], str) and pred_item["value"] == expected_pred_item["value"]
             assert isinstance(pred_item["confidence"], (int, float))
             np.testing.assert_allclose(pred_item["geometry"], expected_pred_item["geometry"], rtol=1e-2)
+            assert isinstance(pred_item["crop_orientation"], dict)
+            assert isinstance(pred_item["crop_orientation"]["value"], int) and isinstance(
+                pred_item["crop_orientation"]["confidence"], (float, int, type(None))
+            )
 
 
 @pytest.mark.asyncio

diff --git a/api/tests/routes/test_ocr.py b/api/tests/routes/test_ocr.py
@@ -20,6 +20,10 @@ def common_test(json_response, expected_response):
                     np.testing.assert_allclose(word["geometry"], expected_word["geometry"], rtol=1e-2)
                     assert isinstance(word["value"], str) and word["value"] == expected_word["value"]
                     assert isinstance(word["confidence"], (int, float))
+                    assert isinstance(word["crop_orientation"], dict)
+                    assert isinstance(word["crop_orientation"]["value"], int) and isinstance(
+                        word["crop_orientation"]["confidence"], (float, int, type(None))
+                    )
 
 
 @pytest.mark.asyncio

diff --git a/docs/source/using_doctr/using_models.rst b/docs/source/using_doctr/using_models.rst
@@ -377,17 +377,20 @@ For reference, here is the export for the same `Document` as above::
                                 {
                                     'value': 'No.',
                                     'confidence': 0.914085328578949,
-                                    'geometry': ((0.5478515625, 0.06640625), (0.5810546875, 0.0966796875))
+                                    'geometry': ((0.5478515625, 0.06640625), (0.5810546875, 0.0966796875)),
+                                    'crop_orientation': {'value': 0, 'confidence': None},
                                 },
                                 {
                                     'value': 'RECEIPT',
                                     'confidence': 0.9949972033500671,
-                                    'geometry': ((0.1357421875, 0.0361328125), (0.51171875, 0.1630859375))
+                                    'geometry': ((0.1357421875, 0.0361328125), (0.51171875, 0.1630859375)),
+                                    'crop_orientation': {'value': 0, 'confidence': None},
                                 },
                                 {
                                     'value': 'DATE',
                                     'confidence': 0.9578408598899841,
-                                    'geometry': ((0.1396484375, 0.3232421875), (0.185546875, 0.3515625))
+                                    'geometry': ((0.1396484375, 0.3232421875), (0.185546875, 0.3515625)),
+                                    'crop_orientation': {'value': 0, 'confidence': None},
                                 }
                             ]
                         }

diff --git a/doctr/io/elements.py b/doctr/io/elements.py
@@ -67,16 +67,24 @@ class Word(Element):
         confidence: the confidence associated with the text prediction
         geometry: bounding box of the word in format ((xmin, ymin), (xmax, ymax)) where coordinates are relative to
         the page's size
+        crop_orientation: the general orientation of the crop in degrees and its confidence
     """
 
-    _exported_keys: List[str] = ["value", "confidence", "geometry"]
+    _exported_keys: List[str] = ["value", "confidence", "geometry", "crop_orientation"]
     _children_names: List[str] = []
 
-    def __init__(self, value: str, confidence: float, geometry: Union[BoundingBox, np.ndarray]) -> None:
+    def __init__(
+        self,
+        value: str,
+        confidence: float,
+        geometry: Union[BoundingBox, np.ndarray],
+        crop_orientation: Dict[str, Any],
+    ) -> None:
         super().__init__()
         self.value = value
         self.confidence = confidence
         self.geometry = geometry
+        self.crop_orientation = crop_orientation
 
     def render(self) -> str:
         """Renders the full text of the element"""