Skip to content

Commit e6531a2

Browse files
authored
ROB: Fix infinite loop due to Invalid object (#1331)
Fixes #1329
* Prevent infinite loops within dictionaries caused by objects that do not respect the PDF standard
* Fix cmap warnings caused by "numbered" characters (e.g. #2d instead of -)
* Apply unnumbering to NameObject
* Add _get_indirect_object for debugging and development
* Add some missing seeks (no issue reported yet)
1 parent 2f77698 commit e6531a2

File tree

5 files changed

+58
-17
lines changed

5 files changed

+58
-17
lines changed

PyPDF2/_cmap.py

+2-1
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@
55
from ._codecs import adobe_glyphs, charset_encoding
66
from ._utils import logger_warning
77
from .errors import PdfReadWarning
8-
from .generic import DecodedStreamObject, DictionaryObject
8+
from .generic import DecodedStreamObject, DictionaryObject, NameObject
99

1010

1111
# code freely inspired from @twiggy ; see #711
@@ -124,6 +124,7 @@ def parse_encoding(
124124
enc: Union(str, DictionaryObject) = ft["/Encoding"].get_object() # type: ignore
125125
if isinstance(enc, str):
126126
try:
127+
enc = NameObject.unnumber(enc) # for #xx decoding
127128
if enc in charset_encoding:
128129
encoding = charset_encoding[enc].copy()
129130
elif enc in _predefined_cmap:

PyPDF2/_reader.py

+9
Original file line numberDiff line numberDiff line change
@@ -1139,6 +1139,7 @@ def get_object(self, indirect_reference: IndirectObject) -> Optional[PdfObject]:
11391139
buf = bytes(self.stream.getbuffer()) # type: ignore
11401140
else:
11411141
p = self.stream.tell()
1142+
self.stream.seek(0, 0)
11421143
buf = self.stream.read(-1)
11431144
self.stream.seek(p, 0)
11441145
m = re.search(
@@ -1192,6 +1193,7 @@ def get_object(self, indirect_reference: IndirectObject) -> Optional[PdfObject]:
11921193
buf = bytes(self.stream.getbuffer()) # type: ignore
11931194
else:
11941195
p = self.stream.tell()
1196+
self.stream.seek(0, 0)
11951197
buf = self.stream.read(-1)
11961198
self.stream.seek(p, 0)
11971199
m = re.search(
@@ -1883,6 +1885,13 @@ def xfa(self) -> Optional[Dict[str, Any]]:
18831885
retval[tag] = es
18841886
return retval
18851887

1888+
def _get_indirect_object(self, num: int, gen: int) -> Optional[PdfObject]:
    """
    Resolve an object from its object number and generation number.

    Development/debugging shortcut, equivalent to
    ``generic.IndirectObject(num, gen, self).get_object()``.

    :param num: first argument passed to ``IndirectObject``.
    :param gen: second argument passed to ``IndirectObject``.
    :return: whatever ``IndirectObject.get_object()`` returns for this reader.
    """
    reference = IndirectObject(num, gen, self)
    return reference.get_object()
1894+
18861895

18871896
class PdfFileReader(PdfReader): # pragma: no cover
18881897
def __init__(self, *args: Any, **kwargs: Any) -> None:

PyPDF2/generic/_base.py

+11-2
Original file line numberDiff line numberDiff line change
@@ -420,6 +420,14 @@ def writeToStream(
420420
deprecate_with_replacement("writeToStream", "write_to_stream")
421421
self.write_to_stream(stream, encryption_key)
422422

423+
@staticmethod
def unnumber(sin: str) -> str:
    """
    Decode ``#xx`` hexadecimal escapes in a PDF name.

    Every ``#`` followed by exactly two hexadecimal digits is replaced by
    the character with that code (e.g. ``"/A#42"`` -> ``"/AB"``).

    The string is scanned once from left to right, so a character produced
    by decoding is never decoded a second time (``"#2342"`` -> ``"#42"``,
    not ``"B"``).  A ``#`` that is not followed by two hexadecimal digits
    (truncated or malformed escape) is kept unchanged instead of raising
    ``ValueError``.

    :param sin: the name, possibly containing ``#xx`` escapes.
    :return: the name with all well-formed escapes decoded.
    """
    hex_chars = "0123456789abcdefABCDEF"
    pieces = []
    copied_upto = 0  # everything before this index is already in ``pieces``
    i = sin.find("#")
    while i >= 0:
        pair = sin[i + 1 : i + 3]
        if len(pair) == 2 and pair[0] in hex_chars and pair[1] in hex_chars:
            pieces.append(sin[copied_upto:i])
            pieces.append(chr(int(pair, 16)))
            copied_upto = i + 3
            # Resume after the escape so decoded output is never rescanned.
            i = sin.find("#", copied_upto)
        else:
            # Malformed escape: keep the '#' literally and move on.
            i = sin.find("#", i + 1)
    pieces.append(sin[copied_upto:])
    return "".join(pieces)
430+
423431
@staticmethod
424432
def read_from_stream(stream: StreamType, pdf: Any) -> "NameObject": # PdfReader
425433
name = stream.read(1)
@@ -431,10 +439,11 @@ def read_from_stream(stream: StreamType, pdf: Any) -> "NameObject": # PdfReader
431439
ret = name.decode("utf-8")
432440
except (UnicodeEncodeError, UnicodeDecodeError):
433441
ret = name.decode("gbk")
434-
return NameObject(ret)
435-
except (UnicodeEncodeError, UnicodeDecodeError) as e:
436442
# Name objects should represent irregular characters
437443
# with a '#' followed by the symbol's hex number
444+
ret = NameObject.unnumber(ret)
445+
return NameObject(ret)
446+
except (UnicodeEncodeError, UnicodeDecodeError) as e:
438447
if not pdf.strict:
439448
logger_warning("Illegal character in Name Object", __name__)
440449
return NameObject(name)

PyPDF2/generic/_data_structures.py

+25-14
Original file line numberDiff line numberDiff line change
@@ -67,7 +67,6 @@
6767
from ._utils import read_hex_string_from_stream, read_string_from_stream
6868

6969
logger = logging.getLogger(__name__)
70-
ObjectPrefix = b"/<[tf(n%"
7170
NumberSigns = b"+-"
7271
IndirectPattern = re.compile(rb"[+-]?(\d+)\s+(\d+)\s+R[^a-zA-Z]")
7372

@@ -263,10 +262,19 @@ def read_unsized_from_steam(stream: StreamType, pdf: Any) -> bytes: # PdfReader
263262
stream.read(1)
264263
break
265264
stream.seek(-1, 1)
266-
key = read_object(stream, pdf)
267-
tok = read_non_whitespace(stream)
268-
stream.seek(-1, 1)
269-
value = read_object(stream, pdf, forced_encoding)
265+
try:
266+
key = read_object(stream, pdf)
267+
tok = read_non_whitespace(stream)
268+
stream.seek(-1, 1)
269+
value = read_object(stream, pdf, forced_encoding)
270+
except Exception as exc:
271+
if pdf is not None and pdf.strict:
272+
raise PdfReadError(exc.__repr__())
273+
logger_warning(exc.__repr__(), __name__)
274+
retval = DictionaryObject()
275+
retval.update(data)
276+
return retval # return partial data
277+
270278
if not data.get(key):
271279
data[key] = value
272280
else:
@@ -812,10 +820,9 @@ def read_object(
812820
) -> Union[PdfObject, int, str, ContentStream]:
813821
tok = stream.read(1)
814822
stream.seek(-1, 1) # reset to start
815-
idx = ObjectPrefix.find(tok)
816-
if idx == 0:
823+
if tok == b"/":
817824
return NameObject.read_from_stream(stream, pdf)
818-
elif idx == 1:
825+
elif tok == b"<":
819826
# hexadecimal string OR dictionary
820827
peek = stream.read(2)
821828
stream.seek(-2, 1) # reset to start
@@ -824,15 +831,15 @@ def read_object(
824831
return DictionaryObject.read_from_stream(stream, pdf, forced_encoding)
825832
else:
826833
return read_hex_string_from_stream(stream, forced_encoding)
827-
elif idx == 2:
834+
elif tok == b"[":
828835
return ArrayObject.read_from_stream(stream, pdf, forced_encoding)
829-
elif idx == 3 or idx == 4:
836+
elif tok == b"t" or tok == b"f":
830837
return BooleanObject.read_from_stream(stream)
831-
elif idx == 5:
838+
elif tok == b"(":
832839
return read_string_from_stream(stream, forced_encoding)
833-
elif idx == 6:
840+
elif tok == b"n":
834841
return NullObject.read_from_stream(stream)
835-
elif idx == 7:
842+
elif tok == b"%":
836843
# comment
837844
while tok not in (b"\r", b"\n"):
838845
tok = stream.read(1)
@@ -843,14 +850,18 @@ def read_object(
843850
tok = read_non_whitespace(stream)
844851
stream.seek(-1, 1)
845852
return read_object(stream, pdf, forced_encoding)
846-
else:
853+
elif tok in b"0123456789+-.":
847854
# number object OR indirect reference
848855
peek = stream.read(20)
849856
stream.seek(-len(peek), 1) # reset to start
850857
if IndirectPattern.match(peek) is not None:
851858
return IndirectObject.read_from_stream(stream, pdf)
852859
else:
853860
return NumberObject.read_from_stream(stream)
861+
else:
862+
raise PdfReadError(
863+
f"Invalid Elementary Object starting with {tok} @{stream.tell()}" # type: ignore
864+
)
854865

855866

856867
class Field(TreeObject):

tests/test_generic.py

+11
Original file line numberDiff line numberDiff line change
@@ -175,6 +175,17 @@ def test_NameObject():
175175
with pytest.raises(PdfReadError) as exc:
176176
NameObject.read_from_stream(stream, None)
177177
assert exc.value.args[0] == "name read error"
178+
assert (
179+
NameObject.read_from_stream(
180+
BytesIO(b"/A;Name_With-Various***Characters?"), None
181+
)
182+
== "/A;Name_With-Various***Characters?"
183+
)
184+
assert (
185+
NameObject.read_from_stream(BytesIO(b"/paired#28#29parentheses"), None)
186+
== "/paired()parentheses"
187+
)
188+
assert NameObject.read_from_stream(BytesIO(b"/A#42"), None) == "/AB"
178189

179190

180191
def test_destination_fit_r():

0 commit comments

Comments
 (0)