Skip to content

Commit cb146e8

Browse files
ENH: add get_pages_from_field (#2494)
* DEV: add _get_page_number_from_indirect in writer create similar function to have same API as in reader used in future dev --------- Co-authored-by: Stefan <96178532+stefan6419846@users.noreply.github.com>
1 parent f32a964 commit cb146e8

File tree

4 files changed

+316
-3
lines changed

4 files changed

+316
-3
lines changed

docs/user/forms.md

+7-1
Original file line numberDiff line numberDiff line change
@@ -50,7 +50,7 @@ PDF forms have a dual-nature approach about the fields:
5050
Inside it you could find (optional):
5151

5252
- some global elements (Fonts, Resources,...)
53-
- some global flags (like `/NeedAppearances` (set/cleared with `auto_regenerate` parameter in `update_form_field_values()`) that indicates if the reading program should re-render the visual fields upon document launch)
53+
- some global flags (like `/NeedAppearances` (set/cleared with `auto_regenerate` parameter in `update_page_form_field_values()`) that indicates if the reading program should re-render the visual fields upon document launch)
5454
- `/XFA` that houses a form in XDP format (very specific XML that describes the form rendered by some viewers); the `/XFA` form overrides the page content
5555
- `/Fields` that houses an array of indirect references that reference the upper _Field_ Objects (roots)
5656

@@ -99,3 +99,9 @@ However, it's also important to note that the two lists do not *always* refer to
9999
__Caution: Remember that fields are not stored in pages: If you use `add_page()` the field structure is not copied. It is recommended to use `.append()` with the proper parameters instead.__
100100

101101
In case of missing _field_ objects in `/Fields`, `writer.reattach_fields()` will parse page(s) annotations and will reattach them. This fix can not guess intermediate fields and will not report fields using the same _name_.
102+
103+
## Identify pages where fields are used
104+
105+
In order to ease locating page fields you can use `get_pages_showing_field` (available on both `PdfReader` and `PdfWriter`). This method accepts a field object, id est a *PdfObject* that represents a field (as extracted from `_root_object["/AcroForm"]["/Fields"]`). The method returns a list of pages, because a field can have multiple widgets as mentioned previously (e.g. radio buttons or text displayed on multiple pages).
106+
107+
The page numbers can then be retrieved as usual by using `page.page_number`.

pypdf/_reader.py

+73-1
Original file line numberDiff line numberDiff line change
@@ -667,6 +667,76 @@ def indexed_key(k: str, fields: Dict[Any, Any]) -> str:
667667
ff[indexed_key(cast(str, value["/T"]), ff)] = value.get("/V")
668668
return ff
669669

670+
def get_pages_showing_field(
    self, field: Union[Field, PdfObject, IndirectObject]
) -> List[PageObject]:
    """
    Provide the list of pages on which the field is displayed.

    Args:
        field: Field Object, PdfObject or IndirectObject referencing a Field

    Returns:
        List of pages:
            - Empty list:
                The field has no widgets attached
                (either hidden field or ancestor field).
            - Single page list:
                Page where the widget is present
                (most common).
            - Multi-page list:
                Field with multiple kids widgets
                (example: radio buttons, field repeated on multiple pages).
                A page appears once per widget, so it may be repeated.

    Raises:
        ValueError: ``field`` cannot be resolved through its
            ``indirect_reference`` ("field type is invalid"), or no ``/FT``
            entry is found on the field or any of its ``/Parent`` ancestors
            ("field is not valid").
    """

    def _get_inherited(obj: DictionaryObject, key: str) -> Any:
        # Look the key up on the object itself, then up the /Parent chain.
        if key in obj:
            return obj[key]
        elif "/Parent" in obj:
            return _get_inherited(
                cast(DictionaryObject, obj["/Parent"].get_object()), key
            )
        else:
            return None

    def _pages_of_widget(widget: DictionaryObject) -> List[Any]:
        # /P is optional on a widget: when present it points to the page
        # directly, otherwise search every page's annotations.
        if "/P" in widget:
            return [widget["/P"].get_object()]
        # The default must be an empty sequence, not "": testing
        # ``<IndirectObject> in ""`` raises TypeError on pages without /Annots.
        return [
            p
            for p in self.pages
            if widget.indirect_reference in p.get("/Annots", ())
        ]

    try:
        # to cope with all types
        field = cast(DictionaryObject, field.indirect_reference.get_object())  # type: ignore
    except Exception as exc:
        raise ValueError("field type is invalid") from exc
    if _get_inherited(field, "/FT") is None:
        raise ValueError("field is not valid")

    ret: List[Any] = []
    if field.get("/Subtype", "") == "/Widget":
        # The field dictionary is itself the widget annotation.
        ret = _pages_of_widget(field)
    else:
        for kid in field.get("/Kids", ()):
            kid = kid.get_object()
            # Kid that is just a widget, not a field (a sub-field has a /T):
            if kid.get("/Subtype", "") == "/Widget" and "/T" not in kid:
                ret += _pages_of_widget(kid)

    # Entries obtained through /P may be plain dictionaries: map them back
    # to the matching PageObject of this document.
    return [
        x
        if isinstance(x, PageObject)
        else (self.pages[self._get_page_number_by_indirect(x.indirect_reference)])  # type: ignore
        for x in ret
    ]
739+
670740
def _get_named_destinations(
671741
self,
672742
tree: Union[TreeObject, None] = None,
@@ -1813,7 +1883,9 @@ def decrypt(self, password: Union[str, bytes]) -> PasswordType:
18131883
def decode_permissions(self, permissions_code: int) -> Dict[str, bool]:
18141884
"""Take the permissions as an integer, return the allowed access."""
18151885
deprecate_with_replacement(
1816-
old_name="decode_permissions", new_name="user_access_permissions", removed_in="5.0.0"
1886+
old_name="decode_permissions",
1887+
new_name="user_access_permissions",
1888+
removed_in="5.0.0",
18171889
)
18181890

18191891
permissions_mapping = {

pypdf/_writer.py

+71
Original file line numberDiff line numberDiff line change
@@ -96,6 +96,7 @@
9696
DecodedStreamObject,
9797
Destination,
9898
DictionaryObject,
99+
Field,
99100
Fit,
100101
FloatObject,
101102
IndirectObject,
@@ -1003,6 +1004,76 @@ def reattach_fields(
10031004
lst.append(ano)
10041005
return lst
10051006

1007+
def get_pages_showing_field(
    self, field: Union[Field, PdfObject, IndirectObject]
) -> List[PageObject]:
    """
    Provide the list of pages on which the field is displayed.

    Args:
        field: Field Object, PdfObject or IndirectObject referencing a Field

    Returns:
        List of pages:
            - Empty list:
                The field has no widgets attached
                (either hidden field or ancestor field).
            - Single page list:
                Page where the widget is present
                (most common).
            - Multi-page list:
                Field with multiple kids widgets
                (example: radio buttons, field repeated on multiple pages).
                A page appears once per widget, so it may be repeated.

    Raises:
        ValueError: ``field`` cannot be resolved through its
            ``indirect_reference`` ("field type is invalid"), or no ``/FT``
            entry is found on the field or any of its ``/Parent`` ancestors
            ("field is not valid").
    """

    def _get_inherited(obj: DictionaryObject, key: str) -> Any:
        # Look the key up on the object itself, then up the /Parent chain.
        if key in obj:
            return obj[key]
        elif "/Parent" in obj:
            return _get_inherited(
                cast(DictionaryObject, obj["/Parent"].get_object()), key
            )
        else:
            return None

    def _pages_of_widget(widget: DictionaryObject) -> List[Any]:
        # /P is optional on a widget: when present it points to the page
        # directly, otherwise search every page's annotations.
        if "/P" in widget:
            return [widget["/P"].get_object()]
        # The default must be an empty sequence, not "": testing
        # ``<IndirectObject> in ""`` raises TypeError on pages without /Annots.
        return [
            p
            for p in self.pages
            if widget.indirect_reference in p.get("/Annots", ())
        ]

    try:
        # to cope with all types
        field = cast(DictionaryObject, field.indirect_reference.get_object())  # type: ignore
    except Exception as exc:
        raise ValueError("field type is invalid") from exc
    if _get_inherited(field, "/FT") is None:
        raise ValueError("field is not valid")

    ret: List[Any] = []
    if field.get("/Subtype", "") == "/Widget":
        # The field dictionary is itself the widget annotation.
        ret = _pages_of_widget(field)
    else:
        for kid in field.get("/Kids", ()):
            kid = kid.get_object()
            # Kid that is just a widget, not a field (a sub-field has a /T):
            if kid.get("/Subtype", "") == "/Widget" and "/T" not in kid:
                ret += _pages_of_widget(kid)

    # Entries obtained through /P may be plain dictionaries: map them back
    # to the matching PageObject of this document.
    return [
        x
        if isinstance(x, PageObject)
        else (self.pages[self._get_page_number_by_indirect(x.indirect_reference)])  # type: ignore
        for x in ret
    ]
1076+
10061077
def clone_reader_document_root(self, reader: PdfReader) -> None:
10071078
"""
10081079
Copy the reader document root to the writer and all sub elements,

tests/test_workflows.py

+165-1
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,14 @@
1717
from pypdf import PdfMerger, PdfReader, PdfWriter
1818
from pypdf.constants import PageAttributes as PG
1919
from pypdf.errors import PdfReadError, PdfReadWarning
20-
from pypdf.generic import ContentStream, NameObject, read_object
20+
from pypdf.generic import (
21+
ArrayObject,
22+
ContentStream,
23+
DictionaryObject,
24+
NameObject,
25+
TextStringObject,
26+
read_object,
27+
)
2128

2229
from . import get_data_from_url, normalize_warnings
2330

@@ -1108,3 +1115,160 @@ def test_text_extraction_invalid_mode():
11081115
reader = PdfReader(pdf_path)
11091116
with pytest.raises(ValueError, match="Invalid text extraction mode"):
11101117
reader.pages[0].extract_text(extraction_mode="foo") # type: ignore
1118+
1119+
1120+
@pytest.mark.enable_socket()
def test_get_page_showing_field():
    """
    Uses testfile from #2452 in order to get fields on multiple pages,
    choices boxes,...
    """
    url = "https://github.com/py-pdf/pypdf/files/14031491/Form_Structure_v50.pdf"
    name = "iss2452.pdf"
    reader = PdfReader(BytesIO(get_data_from_url(url, name)))
    writer = PdfWriter(clone_from=reader)

    # Root field arrays of both documents; the objects are mutated in place
    # further down, so the same arrays are reused throughout.
    reader_fields = reader.trailer["/Root"]["/AcroForm"]["/Fields"]
    writer_fields = writer._root_object["/AcroForm"]["/Fields"]

    def page_numbers(document, field):
        # Page numbers of the pages reported for ``field`` by ``document``.
        return [p.page_number for p in document.get_pages_showing_field(field)]

    # validate with Field: only works on Reader (no get_fields on writer yet)
    fld = reader.get_fields()
    assert page_numbers(reader, fld["FormVersion"]) == [0]

    # validate with dictionary object
    # NRCategory field is a radio box
    assert page_numbers(reader, reader_fields[8].get_object()) == [0, 0, 0, 0, 0]
    assert page_numbers(writer, writer_fields[8].get_object()) == [0, 0, 0, 0, 0]

    # validate with IndirectObject
    # SiteID field is a textbox on multiple pages
    assert page_numbers(reader, reader_fields[99]) == [0, 1]
    assert page_numbers(writer, writer_fields[99]) == [0, 1]
    # test directly on the widget:
    assert page_numbers(reader, reader_fields[99]["/Kids"][1]) == [1]
    assert page_numbers(writer, writer_fields[99]["/Kids"][1]) == [1]

    # Exceptions:
    # Invalid Object
    # ``match`` checks the message for the reader and the writer alike;
    # reusing a single ``as exc`` name only ever checked the last one.
    with pytest.raises(ValueError, match="field type is invalid"):
        reader.get_pages_showing_field(None)
    with pytest.raises(ValueError, match="field type is invalid"):
        writer.get_pages_showing_field(None)

    # Damage Field
    del reader_fields[1].get_object()["/FT"]
    del writer_fields[1].get_object()["/FT"]
    with pytest.raises(ValueError, match="field is not valid"):
        reader.get_pages_showing_field(reader_fields[1])
    with pytest.raises(ValueError, match="field is not valid"):
        writer.get_pages_showing_field(writer_fields[1])

    # missing Parent in field
    del reader_fields[99]["/Kids"][1].get_object()["/Parent"]
    del writer_fields[99]["/Kids"][1].get_object()["/Parent"]
    # NOTE(review): the calls below target field [1], whose /FT was removed
    # above, so they raise regardless of the /Parent deletion - confirm
    # whether the kid widget of field [99] was the intended argument.
    with pytest.raises(ValueError):
        reader.get_pages_showing_field(reader_fields[1])
    with pytest.raises(ValueError):
        writer.get_pages_showing_field(writer_fields[1])

    # remove "/P" (optional)
    del reader_fields[8]["/Kids"][1].get_object()["/P"]
    del writer_fields[8]["/Kids"][1].get_object()["/P"]
    assert page_numbers(reader, reader_fields[8]["/Kids"][1]) == [0]
    assert page_numbers(writer, writer_fields[8]["/Kids"][1]) == [0]
    assert page_numbers(reader, reader_fields[8].get_object()) == [0, 0, 0, 0, 0]
    assert page_numbers(writer, writer_fields[8].get_object()) == [0, 0, 0, 0, 0]

    # Grouping fields
    # Reader: turn the last field into a pure grouping node with one kid.
    reader_group = reader_fields[-1].get_object()
    reader_group[NameObject("/Kids")] = ArrayObject([reader_fields[0]])
    del reader_group["/T"]
    del reader_group["/P"]
    del reader_group["/Subtype"]
    # Writer: append a new grouping field holding the same kid reference.
    writer_fields.append(
        writer._add_object(
            DictionaryObject(
                {
                    NameObject("/T"): TextStringObject("grouping"),
                    NameObject("/FT"): NameObject("/Tx"),
                    NameObject("/Kids"): ArrayObject([reader_fields[0]]),
                }
            )
        )
    )
    assert page_numbers(reader, reader_fields[-1]) == []
    assert page_numbers(writer, writer_fields[-1]) == []

0 commit comments

Comments
 (0)