|
17 | 17 | from pypdf import PdfMerger, PdfReader, PdfWriter
|
18 | 18 | from pypdf.constants import PageAttributes as PG
|
19 | 19 | from pypdf.errors import PdfReadError, PdfReadWarning
|
20 |
| -from pypdf.generic import ContentStream, NameObject, read_object |
| 20 | +from pypdf.generic import ( |
| 21 | + ArrayObject, |
| 22 | + ContentStream, |
| 23 | + DictionaryObject, |
| 24 | + NameObject, |
| 25 | + TextStringObject, |
| 26 | + read_object, |
| 27 | +) |
21 | 28 |
|
22 | 29 | from . import get_data_from_url, normalize_warnings
|
23 | 30 |
|
@@ -1108,3 +1115,160 @@ def test_text_extraction_invalid_mode():
|
1108 | 1115 | reader = PdfReader(pdf_path)
|
1109 | 1116 | with pytest.raises(ValueError, match="Invalid text extraction mode"):
|
1110 | 1117 | reader.pages[0].extract_text(extraction_mode="foo") # type: ignore
|
| 1118 | + |
| 1119 | + |
@pytest.mark.enable_socket()
def test_get_page_showing_field():
    """
    Check ``get_pages_showing_field`` on both a PdfReader and a PdfWriter.

    Uses testfile from #2452 in order to get fields on multiple pages,
    choices boxes,...

    Every scenario is run against the reader and against a writer cloned
    from it, so both must report the same pages / raise the same errors.
    """

    def page_numbers(doc, field):
        """Return the page numbers of the pages of ``doc`` showing ``field``."""
        return [p.page_number for p in doc.get_pages_showing_field(field)]

    url = "https://github.com/py-pdf/pypdf/files/14031491/Form_Structure_v50.pdf"
    name = "iss2452.pdf"
    reader = PdfReader(BytesIO(get_data_from_url(url, name)))
    writer = PdfWriter(clone_from=reader)

    # The /Fields arrays are looked up once; the test mutates the objects
    # they reference in place, exactly as the repeated lookups did before.
    reader_fields = reader.trailer["/Root"]["/AcroForm"]["/Fields"]
    writer_fields = writer._root_object["/AcroForm"]["/Fields"]
    documents = ((reader, reader_fields), (writer, writer_fields))

    # validate with Field: only works on Reader (no get_fields on writer yet)
    fld = reader.get_fields()
    assert page_numbers(reader, fld["FormVersion"]) == [0]

    for doc, fields in documents:
        # validate with dictionary object
        # NRCategory field is a radio box
        assert page_numbers(doc, fields[8].get_object()) == [0, 0, 0, 0, 0]

        # validate with IndirectObject
        # SiteID field is a textbox on multiple pages
        assert page_numbers(doc, fields[99]) == [0, 1]
        # test directly on the widget:
        assert page_numbers(doc, fields[99]["/Kids"][1]) == [1]

    # Exceptions:
    # Invalid Object
    # ``match`` is used on every call so that the reader's message is
    # checked too (a shared ``exc`` variable only kept the last exception).
    for doc, _fields in documents:
        with pytest.raises(ValueError, match="field type is invalid"):
            doc.get_pages_showing_field(None)

    # Damage Field
    for doc, fields in documents:
        del fields[1].get_object()["/FT"]
        with pytest.raises(ValueError, match="field is not valid"):
            doc.get_pages_showing_field(fields[1])

    # missing Parent in field
    # NOTE(review): /Parent is removed from Fields[99]/Kids[1], yet the call
    # below targets Fields[1] (already damaged above). The intended target
    # is presumably the kid widget - confirm against the test PDF before
    # changing it.
    for doc, fields in documents:
        del fields[99]["/Kids"][1].get_object()["/Parent"]
        with pytest.raises(ValueError):
            doc.get_pages_showing_field(fields[1])

    # remove "/P" (optional)
    for doc, fields in documents:
        del fields[8]["/Kids"][1].get_object()["/P"]
        assert page_numbers(doc, fields[8]["/Kids"][1]) == [0]
        assert page_numbers(doc, fields[8].get_object()) == [0, 0, 0, 0, 0]

    # Grouping fields
    # Reader: turn the last field into a pure grouping node whose only kid
    # is the first field.
    reader_group = reader_fields[-1].get_object()
    reader_group[NameObject("/Kids")] = ArrayObject([reader_fields[0]])
    del reader_group["/T"]
    del reader_group["/P"]
    del reader_group["/Subtype"]

    # Writer: append a new grouping field. Its kid is taken from the
    # writer's own field array so that no reference into the reader is
    # stored inside the writer.
    writer_fields.append(
        writer._add_object(
            DictionaryObject(
                {
                    NameObject("/T"): TextStringObject("grouping"),
                    NameObject("/FT"): NameObject("/Tx"),
                    NameObject("/Kids"): ArrayObject([writer_fields[0]]),
                }
            )
        )
    )

    for doc, fields in documents:
        assert page_numbers(doc, fields[-1]) == []
0 commit comments