From ae8ca3f4665088015eb790aaf25c64970c5f5dcf Mon Sep 17 00:00:00 2001 From: stefan6419846 <96178532+stefan6419846@users.noreply.github.com> Date: Wed, 26 Feb 2025 11:39:51 +0100 Subject: [PATCH 1/2] BUG: Use the correct name StandardEncoding for the predefined cmap --- pypdf/_cmap.py | 4 ++-- pypdf/_codecs/__init__.py | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/pypdf/_cmap.py b/pypdf/_cmap.py index de21b3429..f4b5e4b4c 100644 --- a/pypdf/_cmap.py +++ b/pypdf/_cmap.py @@ -177,9 +177,9 @@ def _parse_encoding( f"Advanced encoding {encoding} not implemented yet", __name__, ) - encoding = charset_encoding["/StandardCoding"].copy() + encoding = charset_encoding["/StandardEncoding"].copy() else: - encoding = charset_encoding["/StandardCoding"].copy() + encoding = charset_encoding["/StandardEncoding"].copy() if "/Differences" in enc: x: int = 0 o: Union[int, str] diff --git a/pypdf/_codecs/__init__.py b/pypdf/_codecs/__init__.py index 734c9ece1..540bcc4c3 100644 --- a/pypdf/_codecs/__init__.py +++ b/pypdf/_codecs/__init__.py @@ -40,7 +40,7 @@ def rev_encoding(enc: List[str]) -> Dict[str, int]: charset_encoding: Dict[str, List[str]] = { - "/StandardCoding": _std_encoding, + "/StandardEncoding": _std_encoding, "/WinAnsiEncoding": _win_encoding, "/MacRomanEncoding": _mac_encoding, "/PDFDocEncoding": _pdfdoc_encoding, From 28844a347a891faa2579f75e44786444f9b03e9a Mon Sep 17 00:00:00 2001 From: stefan6419846 <96178532+stefan6419846@users.noreply.github.com> Date: Wed, 26 Feb 2025 13:08:45 +0100 Subject: [PATCH 2/2] add test --- tests/test_cmap.py | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/tests/test_cmap.py b/tests/test_cmap.py index 3a2f39d26..94f4c5a8b 100644 --- a/tests/test_cmap.py +++ b/tests/test_cmap.py @@ -293,3 +293,15 @@ def test_binascii_odd_length_string(caplog): page = reader.pages[0] assert "\n(Many other theorems may\n" in page.extract_text() assert "Skipping broken line b'143f 143f 10300': Odd-length string\n" in 
caplog.text + + +@pytest.mark.enable_socket +def test_standard_encoding(caplog): + """Regression test for #3156: extract text without the 'Advanced encoding' warning.""" + url = "https://github.com/user-attachments/files/18983503/standard-encoding.pdf" + name = "issue3156.pdf" + reader = PdfReader(BytesIO(get_data_from_url(url, name=name))) + + page = reader.pages[0] + assert page.extract_text() == "Lorem ipsum" + assert "Advanced encoding" not in caplog.text