ENH: add get_pages_from_field (#2494)

pubpub-zz · stefan6419846 · web-flow · commit cb146e81eabe · 2024-03-02T17:16:41.000+01:00
* DEV: add _get_page_number_from_indirect in writer

create similar function to have same API as in reader
used in future dev


---------

Co-authored-by: Stefan &lt;96178532+stefan6419846@users.noreply.github.com&gt;
diff --git a/docs/user/forms.md b/docs/user/forms.md
@@ -50,7 +50,7 @@ PDF forms have a dual-nature approach about the fields:
   Inside it you could find (optional):
 
   - some global elements (Fonts, Resources,...)
-  - some global flags (like `/NeedAppearances` (set/cleared with `auto_regenerate` parameter in `update_form_field_values()`) that indicates if the reading program should re-render the visual fields upon document launch)
+  - some global flags (like `/NeedAppearances` (set/cleared with `auto_regenerate` parameter in `update_page_form_field_values()`) that indicates if the reading program should re-render the visual fields upon document launch)
   - `/XFA` that houses a form in XDP format (very specific XML that describes the form rendered by some viewers); the `/XFA` form overrides the page content
   - `/Fields` that houses an array of indirect references that reference the upper _Field_ Objects (roots)
 
@@ -99,3 +99,9 @@ However, it's also important to note that the two lists do not *always* refer to
 __Caution: Remember that fields are not stored in pages: If you use  `add_page()` the field structure is not copied. It is recommended to use `.append()` with the proper parameters instead.__
 
 In case of missing _field_ objects in `/Fields`, `writer.reattach_fields()` will parse page(s) annotations and will reattach them. This fix can not guess intermediate fields and will not report fields using the same _name_.
+
+## Identify pages where fields are used
+
+In order to ease locating the pages on which a field is shown, you can use `reader.get_pages_showing_field` or `writer.get_pages_showing_field`. This method accepts a field object, i.e. a *PdfObject* that represents a field (as extracted from `_root_object["/AcroForm"]["/Fields"]`). The method returns a list of pages, because a field can have multiple widgets as mentioned previously (e.g. radio buttons or text displayed on multiple pages).
+
+The page numbers can then be retrieved as usual by using `page.page_number`.
diff --git a/pypdf/_reader.py b/pypdf/_reader.py
@@ -667,6 +667,76 @@ def indexed_key(k: str, fields: Dict[Any, Any]) -> str:
                     ff[indexed_key(cast(str, value["/T"]), ff)] = value.get("/V")
         return ff
 
+    def get_pages_showing_field(
+        self, field: Union[Field, PdfObject, IndirectObject]
+    ) -> List[PageObject]:
+        """
+        Provide the list of pages on which the field is displayed.
+
+        Args:
+            field: Field Object, PdfObject or IndirectObject referencing a Field
+
+        Returns:
+            List of pages:
+                - Empty list: the field has no widgets attached
+                  (either hidden field or ancestor field).
+                - Single page list: page where the widget is present
+                  (most common).
+                - Multi-page list: field with multiple kids widgets
+                  (example: radio buttons, field repeated on multiple pages).
+
+        Raises:
+            ValueError: The object cannot be resolved or has no (inherited) /FT.
+        """
+
+        def _get_inherited(obj: DictionaryObject, key: str) -> Any:
+            if key in obj:
+                return obj[key]
+            elif "/Parent" in obj:  # the key may be inherited from an ancestor
+                return _get_inherited(
+                    cast(DictionaryObject, obj["/Parent"].get_object()), key
+                )
+            else:
+                return None
+
+        try:
+            # normalize every accepted type to the referenced dictionary
+            field = cast(DictionaryObject, field.indirect_reference.get_object())  # type: ignore
+        except Exception as exc:
+            raise ValueError("field type is invalid") from exc
+        if _get_inherited(field, "/FT") is None:
+            raise ValueError("field is not valid")
+        ret = []
+        if field.get("/Subtype", "") == "/Widget":
+            if "/P" in field:
+                ret = [field["/P"].get_object()]
+            else:  # no "/P": scan the pages; () default skips pages without "/Annots"
+                ret = [
+                    p
+                    for p in self.pages
+                    if field.indirect_reference in p.get("/Annots", ())
+                ]
+        else:
+            kids = field.get("/Kids", ())
+            for k in kids:
+                k = k.get_object()
+                if (k.get("/Subtype", "") == "/Widget") and ("/T" not in k):
+                    # Kid that is just a widget, not a field:
+                    if "/P" in k:
+                        ret += [k["/P"].get_object()]
+                    else:  # same page scan as above for a kid without "/P"
+                        ret += [
+                            p
+                            for p in self.pages
+                            if k.indirect_reference in p.get("/Annots", ())
+                        ]
+        return [
+            x
+            if isinstance(x, PageObject)
+            else (self.pages[self._get_page_number_by_indirect(x.indirect_reference)])  # type: ignore
+            for x in ret
+        ]
+
     def _get_named_destinations(
         self,
         tree: Union[TreeObject, None] = None,
@@ -1813,7 +1883,9 @@ def decrypt(self, password: Union[str, bytes]) -> PasswordType:
     def decode_permissions(self, permissions_code: int) -> Dict[str, bool]:
         """Take the permissions as an integer, return the allowed access."""
         deprecate_with_replacement(
-            old_name="decode_permissions", new_name="user_access_permissions", removed_in="5.0.0"
+            old_name="decode_permissions",
+            new_name="user_access_permissions",
+            removed_in="5.0.0",
         )
 
         permissions_mapping = {
diff --git a/pypdf/_writer.py b/pypdf/_writer.py
@@ -96,6 +96,7 @@
     DecodedStreamObject,
     Destination,
     DictionaryObject,
+    Field,
     Fit,
     FloatObject,
     IndirectObject,
@@ -1003,6 +1004,76 @@ def reattach_fields(
                 lst.append(ano)
         return lst
 
+    def get_pages_showing_field(
+        self, field: Union[Field, PdfObject, IndirectObject]
+    ) -> List[PageObject]:
+        """
+        Provide the list of pages on which the field is displayed.
+
+        Args:
+            field: Field Object, PdfObject or IndirectObject referencing a Field
+
+        Returns:
+            List of pages:
+                - Empty list: the field has no widgets attached
+                  (either hidden field or ancestor field).
+                - Single page list: page where the widget is present
+                  (most common).
+                - Multi-page list: field with multiple kids widgets
+                  (example: radio buttons, field repeated on multiple pages).
+
+        Raises:
+            ValueError: The object cannot be resolved or has no (inherited) /FT.
+        """
+
+        def _get_inherited(obj: DictionaryObject, key: str) -> Any:
+            if key in obj:
+                return obj[key]
+            elif "/Parent" in obj:  # the key may be inherited from an ancestor
+                return _get_inherited(
+                    cast(DictionaryObject, obj["/Parent"].get_object()), key
+                )
+            else:
+                return None
+
+        try:
+            # normalize every accepted type to the referenced dictionary
+            field = cast(DictionaryObject, field.indirect_reference.get_object())  # type: ignore
+        except Exception as exc:
+            raise ValueError("field type is invalid") from exc
+        if _get_inherited(field, "/FT") is None:
+            raise ValueError("field is not valid")
+        ret = []
+        if field.get("/Subtype", "") == "/Widget":
+            if "/P" in field:
+                ret = [field["/P"].get_object()]
+            else:  # no "/P": scan the pages; () default skips pages without "/Annots"
+                ret = [
+                    p
+                    for p in self.pages
+                    if field.indirect_reference in p.get("/Annots", ())
+                ]
+        else:
+            kids = field.get("/Kids", ())
+            for k in kids:
+                k = k.get_object()
+                if (k.get("/Subtype", "") == "/Widget") and ("/T" not in k):
+                    # Kid that is just a widget, not a field:
+                    if "/P" in k:
+                        ret += [k["/P"].get_object()]
+                    else:  # same page scan as above for a kid without "/P"
+                        ret += [
+                            p
+                            for p in self.pages
+                            if k.indirect_reference in p.get("/Annots", ())
+                        ]
+        return [
+            x
+            if isinstance(x, PageObject)
+            else (self.pages[self._get_page_number_by_indirect(x.indirect_reference)])  # type: ignore
+            for x in ret
+        ]
+
     def clone_reader_document_root(self, reader: PdfReader) -> None:
         """
         Copy the reader document root to the writer and all sub elements,
diff --git a/tests/test_workflows.py b/tests/test_workflows.py
@@ -17,7 +17,14 @@
 from pypdf import PdfMerger, PdfReader, PdfWriter
 from pypdf.constants import PageAttributes as PG
 from pypdf.errors import PdfReadError, PdfReadWarning
-from pypdf.generic import ContentStream, NameObject, read_object
+from pypdf.generic import (
+    ArrayObject,
+    ContentStream,
+    DictionaryObject,
+    NameObject,
+    TextStringObject,
+    read_object,
+)
 
 from . import get_data_from_url, normalize_warnings
 
@@ -1108,3 +1115,160 @@ def test_text_extraction_invalid_mode():
     reader = PdfReader(pdf_path)
     with pytest.raises(ValueError, match="Invalid text extraction mode"):
         reader.pages[0].extract_text(extraction_mode="foo")  # type: ignore
+
+
+@pytest.mark.enable_socket()
+def test_get_page_showing_field():
+    """
+    Check get_pages_showing_field on reader and writer, using the test file
+    from #2452 (fields on multiple pages, choice boxes, ...).
+    """
+    url = "https://github.com/py-pdf/pypdf/files/14031491/Form_Structure_v50.pdf"
+    name = "iss2452.pdf"
+    reader = PdfReader(BytesIO(get_data_from_url(url, name)))
+    writer = PdfWriter(clone_from=reader)
+
+    # validate with Field: only works on Reader (no get_fields on writer yet)
+    fld = reader.get_fields()
+    assert [
+        p.page_number for p in reader.get_pages_showing_field(fld["FormVersion"])
+    ] == [0]
+
+    # validate with dictionary object
+    # NRCategory field is a radio box
+    assert [
+        p.page_number
+        for p in reader.get_pages_showing_field(
+            reader.trailer["/Root"]["/AcroForm"]["/Fields"][8].get_object()
+        )
+    ] == [0, 0, 0, 0, 0]
+    assert [
+        p.page_number
+        for p in writer.get_pages_showing_field(
+            writer._root_object["/AcroForm"]["/Fields"][8].get_object()
+        )
+    ] == [0, 0, 0, 0, 0]
+
+    # validate with IndirectObject
+    # SiteID field is a textbox on multiple pages
+    assert [
+        p.page_number
+        for p in reader.get_pages_showing_field(
+            reader.trailer["/Root"]["/AcroForm"]["/Fields"][99]
+        )
+    ] == [0, 1]
+    assert [
+        p.page_number
+        for p in writer.get_pages_showing_field(
+            writer._root_object["/AcroForm"]["/Fields"][99]
+        )
+    ] == [0, 1]
+    # test directly on the widget:
+    assert [
+        p.page_number
+        for p in reader.get_pages_showing_field(
+            reader.trailer["/Root"]["/AcroForm"]["/Fields"][99]["/Kids"][1]
+        )
+    ] == [1]
+    assert [
+        p.page_number
+        for p in writer.get_pages_showing_field(
+            writer._root_object["/AcroForm"]["/Fields"][99]["/Kids"][1]
+        )
+    ] == [1]
+
+    # Exceptions:
+    # Invalid object: None cannot be resolved to a field dictionary
+    with pytest.raises(ValueError) as exc:
+        reader.get_pages_showing_field(None)
+    with pytest.raises(ValueError) as exc:
+        writer.get_pages_showing_field(None)
+    assert "field type is invalid" in exc.value.args[0]
+
+    # Damaged field: without /FT the field is rejected as not valid
+    del reader.trailer["/Root"]["/AcroForm"]["/Fields"][1].get_object()["/FT"]
+    del writer._root_object["/AcroForm"]["/Fields"][1].get_object()["/FT"]
+    with pytest.raises(ValueError) as exc:
+        reader.get_pages_showing_field(
+            reader.trailer["/Root"]["/AcroForm"]["/Fields"][1]
+        )
+    with pytest.raises(ValueError) as exc:
+        writer.get_pages_showing_field(writer._root_object["/AcroForm"]["/Fields"][1])
+    assert "field is not valid" in exc.value.args[0]
+
+    # missing Parent in field (NOTE(review): calls below pass Fields[1], not the modified kid - confirm)
+    del reader.trailer["/Root"]["/AcroForm"]["/Fields"][99]["/Kids"][1].get_object()[
+        "/Parent"
+    ]
+    del writer._root_object["/AcroForm"]["/Fields"][99]["/Kids"][1].get_object()[
+        "/Parent"
+    ]
+    with pytest.raises(ValueError) as exc:
+        reader.get_pages_showing_field(
+            reader.trailer["/Root"]["/AcroForm"]["/Fields"][1]
+        )
+    with pytest.raises(ValueError) as exc:
+        writer.get_pages_showing_field(writer._root_object["/AcroForm"]["/Fields"][1])
+
+    # remove "/P" (optional): the page must still be reported for the widget
+    del reader.trailer["/Root"]["/AcroForm"]["/Fields"][8]["/Kids"][1].get_object()[
+        "/P"
+    ]
+    del writer._root_object["/AcroForm"]["/Fields"][8]["/Kids"][1].get_object()["/P"]
+    assert [
+        p.page_number
+        for p in reader.get_pages_showing_field(
+            reader.trailer["/Root"]["/AcroForm"]["/Fields"][8]["/Kids"][1]
+        )
+    ] == [0]
+    assert [
+        p.page_number
+        for p in writer.get_pages_showing_field(
+            writer._root_object["/AcroForm"]["/Fields"][8]["/Kids"][1]
+        )
+    ] == [0]
+    assert [
+        p.page_number
+        for p in reader.get_pages_showing_field(
+            reader.trailer["/Root"]["/AcroForm"]["/Fields"][8].get_object()
+        )
+    ] == [0, 0, 0, 0, 0]
+    assert [
+        p.page_number
+        for p in writer.get_pages_showing_field(
+            writer._root_object["/AcroForm"]["/Fields"][8].get_object()
+        )
+    ] == [0, 0, 0, 0, 0]
+
+    # Grouping fields: a field whose only kid is another field; expects no page
+    reader.trailer["/Root"]["/AcroForm"]["/Fields"][-1].get_object()[
+        NameObject("/Kids")
+    ] = ArrayObject([reader.trailer["/Root"]["/AcroForm"]["/Fields"][0]])
+    del reader.trailer["/Root"]["/AcroForm"]["/Fields"][-1].get_object()["/T"]
+    del reader.trailer["/Root"]["/AcroForm"]["/Fields"][-1].get_object()["/P"]
+    del reader.trailer["/Root"]["/AcroForm"]["/Fields"][-1].get_object()["/Subtype"]
+    writer._root_object["/AcroForm"]["/Fields"].append(
+        writer._add_object(
+            DictionaryObject(
+                {
+                    NameObject("/T"): TextStringObject("grouping"),
+                    NameObject("/FT"): NameObject("/Tx"),
+                    NameObject("/Kids"): ArrayObject(
+                        [reader.trailer["/Root"]["/AcroForm"]["/Fields"][0]]
+                    ),
+                }
+            )
+        )
+    )
+    assert [
+        p.page_number
+        for p in reader.get_pages_showing_field(
+            reader.trailer["/Root"]["/AcroForm"]["/Fields"][-1]
+        )
+    ] == []
+    assert [
+        p.page_number
+        for p in writer.get_pages_showing_field(
+            writer._root_object["/AcroForm"]["/Fields"][-1]
+        )
+    ] == []