diff --git a/.github/workflows/github-ci.yaml b/.github/workflows/github-ci.yaml index 820ccdcaa..1eb3d9bd0 100644 --- a/.github/workflows/github-ci.yaml +++ b/.github/workflows/github-ci.yaml @@ -57,7 +57,7 @@ jobs: runs-on: ubuntu-20.04 strategy: matrix: - python-version: ["3.7", "3.8", "3.9", "3.10", "3.11", "3.12"] + python-version: ["3.7", "3.8", "3.9", "3.10", "3.11", "3.12", "3.13-dev"] use-crypto-lib: ["cryptography"] include: - python-version: "3.7" @@ -90,7 +90,7 @@ jobs: cache-dependency-path: '**/requirements/ci.txt' - name: Setup Python (3.11+) uses: actions/setup-python@v5 - if: matrix.python-version == '3.11' || matrix.python-version == '3.12' + if: matrix.python-version == '3.11' || matrix.python-version == '3.12' || matrix.python-version == '3.13-dev' with: python-version: ${{ matrix.python-version }} allow-prereleases: true @@ -106,7 +106,7 @@ jobs: - name: Install requirements (Python 3.11+) run: | pip install -r requirements/ci-3.11.txt - if: matrix.python-version == '3.11' || matrix.python-version == '3.12' + if: matrix.python-version == '3.11' || matrix.python-version == '3.12' || matrix.python-version == '3.13-dev' - name: Remove pycryptodome and cryptography run: | pip uninstall pycryptodome cryptography -y @@ -215,8 +215,8 @@ jobs: - name: Check Number of Downloaded Files run: | downloaded_files_count=$(find \.coverage* -type f | wc -l) - if [ $downloaded_files_count -eq 8 ]; then - echo "The expected number of files (8) were downloaded." + if [ $downloaded_files_count -eq 9 ]; then + echo "The expected number of files (9) were downloaded." else echo "ERROR: Expected 8 files, but found $downloaded_files_count files." exit 1 diff --git a/.github/workflows/release.yaml b/.github/workflows/release.yaml index 9f782ec08..b1a4fb27f 100644 --- a/.github/workflows/release.yaml +++ b/.github/workflows/release.yaml @@ -12,6 +12,9 @@ on: permissions: contents: write +env: + HEAD_COMMIT_MESSAGE: ${{ github.event.head_commit.message }} + jobs: build_and_publish: name: Publish a new version @@ -24,7 +27,7 @@ jobs: - name: Extract version from commit message id: extract_version run: | - VERSION=$(echo "${{ github.event.head_commit.message }}" | grep -oP '(?<=REL: )\d+\.\d+\.\d+') + VERSION=$(echo "$HEAD_COMMIT_MESSAGE" | grep -oP '(?<=REL: )\d+\.\d+\.\d+') echo "version=$VERSION" >> $GITHUB_OUTPUT - name: Extract tag message from commit message @@ -32,7 +35,7 @@ jobs: run: | VERSION="${{ steps.extract_version.outputs.version }}" delimiter="$(openssl rand -hex 8)" - MESSAGE=$(echo "${{ github.event.head_commit.message }}" | sed "0,/REL: $VERSION/s///" ) + MESSAGE=$(echo "$HEAD_COMMIT_MESSAGE" | sed "0,/REL: $VERSION/s///" ) echo "message<<${delimiter}" >> $GITHUB_OUTPUT echo "$MESSAGE" >> $GITHUB_OUTPUT echo "${delimiter}" >> $GITHUB_OUTPUT diff --git a/CONTRIBUTORS.md b/CONTRIBUTORS.md index 84f0b6ee4..89fec3b14 100644 --- a/CONTRIBUTORS.md +++ b/CONTRIBUTORS.md @@ -19,6 +19,7 @@ history and [GitHub's 'Contributors' feature](https://github.com/py-pdf/pypdf/gr * [ediamondscience](https://github.com/ediamondscience) * [Ermeson, Felipe](https://github.com/FelipeErmeson) * [Freitag, François](https://github.com/francoisfreitag) +* [Gagnon, William G.](https://github.com/williamgagnon) * [Górny, Michał](https://github.com/mgorny) * [Grillo, Miguel](https://github.com/Ineffable22) * [Gutteridge, David H.](https://github.com/dhgutteridge) diff --git a/pypdf/_cmap.py b/pypdf/_cmap.py index 9a2d10a61..a82e5cdb4 100644 --- a/pypdf/_cmap.py +++ b/pypdf/_cmap.py @@ -3,7 +3,7 @@ from typing import Any, Dict, List, Tuple, Union, cast from ._codecs import adobe_glyphs, charset_encoding -from ._utils import b_, logger_error, logger_warning +from ._utils import bytes_, logger_error, logger_warning from .generic import ( DecodedStreamObject, DictionaryObject, @@ -258,7 +258,7 @@ def prepare_cm(ft: DictionaryObject) -> bytes: tu = ft["/ToUnicode"] cm: bytes if isinstance(tu, StreamObject): - cm = b_(cast(DecodedStreamObject, ft["/ToUnicode"]).get_data()) + cm = bytes_(cast(DecodedStreamObject, ft["/ToUnicode"]).get_data()) elif isinstance(tu, str) and tu.startswith("/Identity"): # the full range 0000-FFFF will be processed cm = b"beginbfrange\n<0000> <0001> <0000>\nendbfrange" diff --git a/pypdf/_doc_common.py b/pypdf/_doc_common.py index d4c5c43c3..b3c2a4a41 100644 --- a/pypdf/_doc_common.py +++ b/pypdf/_doc_common.py @@ -49,7 +49,7 @@ from ._page import PageObject, _VirtualList from ._page_labels import index2label as page_index2page_label from ._utils import ( - b_, + bytes_, deprecate_with_replacement, logger_warning, parse_iso8824_date, @@ -1258,7 +1258,7 @@ def xfa(self) -> Optional[Dict[str, Any]]: if isinstance(f, IndirectObject): field = cast(Optional[EncodedStreamObject], f.get_object()) if field: - es = zlib.decompress(b_(field._data)) + es = zlib.decompress(bytes_(field._data)) retval[tag] = es return retval diff --git a/pypdf/_encryption.py b/pypdf/_encryption.py index 5ddd8d0ef..8ca658588 100644 --- a/pypdf/_encryption.py +++ b/pypdf/_encryption.py @@ -43,7 +43,7 @@ rc4_encrypt, ) -from ._utils import b_, logger_warning +from ._utils import bytes_, logger_warning from .generic import ( ArrayObject, ByteStringObject, @@ -78,7 +78,7 @@ def encrypt_object(self, obj: PdfObject) -> PdfObject: elif isinstance(obj, StreamObject): obj2 = StreamObject() obj2.update(obj) - obj2.set_data(self.stm_crypt.encrypt(b_(obj._data))) + obj2.set_data(self.stm_crypt.encrypt(bytes_(obj._data))) for key, value in obj.items(): # Dont forget the Stream dict. obj2[key] = self.encrypt_object(value) obj = obj2 @@ -96,7 +96,7 @@ def decrypt_object(self, obj: PdfObject) -> PdfObject: data = self.str_crypt.decrypt(obj.original_bytes) obj = create_string_object(data) elif isinstance(obj, StreamObject): - obj._data = self.stm_crypt.decrypt(b_(obj._data)) + obj._data = self.stm_crypt.decrypt(bytes_(obj._data)) for key, value in obj.items(): # Dont forget the Stream dict. obj[key] = self.decrypt_object(value) elif isinstance(obj, DictionaryObject): diff --git a/pypdf/_reader.py b/pypdf/_reader.py index aeababa7b..281bb0c30 100644 --- a/pypdf/_reader.py +++ b/pypdf/_reader.py @@ -51,7 +51,7 @@ from ._utils import ( StrByteType, StreamType, - b_, + bytes_, logger_warning, read_non_whitespace, read_previous_line, @@ -328,7 +328,7 @@ def _get_object_from_stream( assert cast(str, obj_stm["/Type"]) == "/ObjStm" # /N is the number of indirect objects in the stream assert idx < obj_stm["/N"] - stream_data = BytesIO(b_(obj_stm.get_data())) + stream_data = BytesIO(bytes_(obj_stm.get_data())) for i in range(obj_stm["/N"]): # type: ignore read_non_whitespace(stream_data) stream_data.seek(-1, 1) @@ -932,7 +932,7 @@ def _read_pdf15_xref_stream( xrefstream = cast(ContentStream, read_object(stream, self)) assert cast(str, xrefstream["/Type"]) == "/XRef" self.cache_indirect_object(generation, idnum, xrefstream) - stream_data = BytesIO(b_(xrefstream.get_data())) + stream_data = BytesIO(bytes_(xrefstream.get_data())) # Index pairs specify the subsections in the dictionary. If # none create one subsection that spans everything. idx_pairs = xrefstream.get("/Index", [0, xrefstream.get("/Size")]) diff --git a/pypdf/_text_extraction/_layout_mode/_font.py b/pypdf/_text_extraction/_layout_mode/_font.py index a912fddb2..40655b1b2 100644 --- a/pypdf/_text_extraction/_layout_mode/_font.py +++ b/pypdf/_text_extraction/_layout_mode/_font.py @@ -1,8 +1,9 @@ """Font constants and classes for "layout" mode text operations""" from dataclasses import dataclass, field -from typing import Any, Dict, Sequence, Union +from typing import Any, Dict, Sequence, Union, cast +from ...errors import ParseError from ...generic import IndirectObject from ._font_widths import STANDARD_WIDTHS @@ -58,6 +59,7 @@ def __post_init__(self) -> None: skip_count = 0 _w = d_font.get("/W", []) for idx, w_entry in enumerate(_w): + w_entry = w_entry.get_object() if skip_count: skip_count -= 1 continue @@ -66,13 +68,14 @@ def __post_init__(self) -> None: # warning and or use reader's "strict" to force an ex??? continue # check for format (1): `int [int int int int ...]` - if isinstance(_w[idx + 1], Sequence): - start_idx, width_list = _w[idx : idx + 2] + w_next_entry = _w[idx + 1].get_object() + if isinstance(w_next_entry, Sequence): + start_idx, width_list = w_entry, w_next_entry self.width_map.update( { ord_map[_cidx]: _width for _cidx, _width in zip( - range(start_idx, start_idx + len(width_list), 1), + range(cast(int, start_idx), cast(int, start_idx) + len(width_list), 1), width_list, ) if _cidx in ord_map @@ -80,18 +83,23 @@ def __post_init__(self) -> None: ) skip_count = 1 # check for format (2): `int int int` - if not isinstance(_w[idx + 1], Sequence) and not isinstance( - _w[idx + 2], Sequence - ): - start_idx, stop_idx, const_width = _w[idx : idx + 3] + elif isinstance(w_next_entry, (int, float)) and isinstance(_w[idx + 2].get_object(), (int, float)): + start_idx, stop_idx, const_width = w_entry, w_next_entry, _w[idx + 2].get_object() self.width_map.update( { ord_map[_cidx]: const_width - for _cidx in range(start_idx, stop_idx + 1, 1) + for _cidx in range(cast(int, start_idx), cast(int, stop_idx + 1), 1) if _cidx in ord_map } ) skip_count = 2 + else: + # Note: this doesn't handle the case of out of bounds (reaching the end of the width definitions + # while expecting more elements). This raises an IndexError which is sufficient. + raise ParseError( + f"Invalid font width definition. Next elements: {w_entry}, {w_next_entry}, {_w[idx + 2]}" + ) # pragma: no cover + if not self.width_map and "/BaseFont" in self.font_dictionary: for key in STANDARD_WIDTHS: if self.font_dictionary["/BaseFont"].startswith(f"/{key}"): diff --git a/pypdf/_utils.py b/pypdf/_utils.py index 38c0d67d7..689004dcd 100644 --- a/pypdf/_utils.py +++ b/pypdf/_utils.py @@ -336,25 +336,22 @@ def mark_location(stream: StreamType) -> None: stream.seek(-radius, 1) -B_CACHE: Dict[str, bytes] = {} +BYTE_CACHE: Dict[str, bytes] = {} -def b_(s: Union[str, bytes]) -> bytes: +def bytes_(s: Union[str, bytes]) -> bytes: if isinstance(s, bytes): return s - bc = B_CACHE + bc = BYTE_CACHE if s in bc: return bc[s] try: r = s.encode("latin-1") - if len(s) < 2: - bc[s] = r - return r - except Exception: + except UnicodeEncodeError: r = s.encode("utf-8") - if len(s) < 2: - bc[s] = r - return r + if len(s) < 2: + bc[s] = r + return r def str_(b: Any) -> str: @@ -390,20 +387,6 @@ def ord_(b: Union[int, str, bytes]) -> Union[int, bytes]: WHITESPACES_AS_REGEXP = b"[" + WHITESPACES_AS_BYTES + b"]" -def paeth_predictor(left: int, up: int, up_left: int) -> int: - p = left + up - up_left - dist_left = abs(p - left) - dist_up = abs(p - up) - dist_up_left = abs(p - up_left) - - if dist_left <= dist_up and dist_left <= dist_up_left: - return left - elif dist_up <= dist_up_left: - return up - else: - return up_left - - def deprecate(msg: str, stacklevel: int = 3) -> None: warnings.warn(msg, DeprecationWarning, stacklevel=stacklevel) diff --git a/pypdf/_writer.py b/pypdf/_writer.py index 00b9d498c..7ea868602 100644 --- a/pypdf/_writer.py +++ b/pypdf/_writer.py @@ -62,7 +62,7 @@ StrByteType, StreamType, _get_max_pdf_version_header, - b_, + bytes_, deprecate_with_replacement, logger_warning, ) @@ -680,7 +680,7 @@ def add_attachment(self, filename: str, data: Union[str, bytes]) -> None: # endobj file_entry = DecodedStreamObject() - file_entry.set_data(b_(data)) + file_entry.set_data(bytes_(data)) file_entry.update({NameObject(PA.TYPE): NameObject("/EmbeddedFile")}) # The Filespec entry diff --git a/pypdf/_xobj_image_helpers.py b/pypdf/_xobj_image_helpers.py index 45b0c145b..5ae8894fa 100644 --- a/pypdf/_xobj_image_helpers.py +++ b/pypdf/_xobj_image_helpers.py @@ -6,7 +6,7 @@ from ._utils import check_if_whitespace_only, logger_warning from .constants import ColorSpaces -from .errors import PdfReadError +from .errors import EmptyImageDataError, PdfReadError from .generic import ( ArrayObject, DecodedStreamObject, @@ -148,9 +148,12 @@ def _extended_image_frombytes( img = Image.frombytes(mode, size, data) except ValueError as exc: nb_pix = size[0] * size[1] - if len(data) % nb_pix != 0: + data_length = len(data) + if data_length == 0: + raise EmptyImageDataError("Data is 0 bytes, cannot process an image from empty data.") from exc + if data_length % nb_pix != 0: raise exc - k = nb_pix * len(mode) / len(data) + k = nb_pix * len(mode) / data_length data = b"".join([bytes((x,) * int(k)) for x in data]) img = Image.frombytes(mode, size, data) return img diff --git a/pypdf/annotations/_markup_annotations.py b/pypdf/annotations/_markup_annotations.py index 4db8dfdbf..98a222483 100644 --- a/pypdf/annotations/_markup_annotations.py +++ b/pypdf/annotations/_markup_annotations.py @@ -104,9 +104,9 @@ def __init__( self[NameObject("/Rect")] = RectangleObject(rect) font_str = "font: " - if bold is True: + if bold: font_str = f"{font_str}bold " - if italic is True: + if italic: font_str = f"{font_str}italic " font_str = f"{font_str}{font} {font_size}" font_str = f"{font_str};text-align:left;color:#{font_color}" diff --git a/pypdf/annotations/_non_markup_annotations.py b/pypdf/annotations/_non_markup_annotations.py index dcdb3b0ff..6272cceee 100644 --- a/pypdf/annotations/_non_markup_annotations.py +++ b/pypdf/annotations/_non_markup_annotations.py @@ -1,6 +1,5 @@ from typing import TYPE_CHECKING, Any, Optional, Tuple, Union -from ..constants import AnnotationFlag from ..generic._base import ( BooleanObject, NameObject, @@ -12,8 +11,6 @@ from ..generic._rectangle import RectangleObject from ._base import AnnotationDictionary -DEFAULT_ANNOTATION_FLAG = AnnotationFlag(0) - class Link(AnnotationDictionary): def __init__( diff --git a/pypdf/errors.py b/pypdf/errors.py index c962dec66..ad197ffc1 100644 --- a/pypdf/errors.py +++ b/pypdf/errors.py @@ -59,4 +59,8 @@ class EmptyFileError(PdfReadError): """Raised when a PDF file is empty or has no content.""" +class EmptyImageDataError(PyPdfError): + """Raised when trying to process an image that has no data.""" + + STREAM_TRUNCATED_PREMATURELY = "Stream has ended unexpectedly" diff --git a/pypdf/filters.py b/pypdf/filters.py index 137e3603a..a278c297d 100644 --- a/pypdf/filters.py +++ b/pypdf/filters.py @@ -43,7 +43,7 @@ from ._utils import ( WHITESPACES_AS_BYTES, - b_, + bytes_, deprecate_with_replacement, deprecation_no_replacement, logger_warning, @@ -678,7 +678,7 @@ def decode_stream_data(stream: Any) -> Union[bytes, str]: # utils.StreamObject decodparms = stream.get(SA.DECODE_PARMS, ({},) * len(filters)) if not isinstance(decodparms, (list, tuple)): decodparms = (decodparms,) - data: bytes = b_(stream._data) + data: bytes = bytes_(stream._data) # If there is not data to decode we should not try to decode the data. if data: for filter_type, params in zip(filters, decodparms): diff --git a/pypdf/generic/_base.py b/pypdf/generic/_base.py index 2d606b418..724d63f5c 100644 --- a/pypdf/generic/_base.py +++ b/pypdf/generic/_base.py @@ -36,7 +36,7 @@ from .._protocols import PdfObjectProtocol, PdfWriterProtocol from .._utils import ( StreamType, - b_, + bytes_, deprecate_no_replacement, logger_warning, read_non_whitespace, @@ -607,7 +607,7 @@ def write_to_stream( # https://github.com/davidhalter/parso/issues/207 stream.write(("\\%03o" % c).encode()) else: - stream.write(b_(chr(c))) + stream.write(bytes_(chr(c))) stream.write(b")") @@ -713,7 +713,7 @@ def encode_pdfdocencoding(unicode_string: str) -> bytes: retval = bytearray() for c in unicode_string: try: - retval += b_(chr(_pdfdoc_encoding_rev[c])) + retval += bytes_(chr(_pdfdoc_encoding_rev[c])) except KeyError: raise UnicodeEncodeError( "pdfdocencoding", c, -1, -1, "does not exist in translation table" diff --git a/pypdf/generic/_data_structures.py b/pypdf/generic/_data_structures.py index 87d688674..27d7918c6 100644 --- a/pypdf/generic/_data_structures.py +++ b/pypdf/generic/_data_structures.py @@ -52,7 +52,7 @@ from .._utils import ( WHITESPACES, StreamType, - b_, + bytes_, deprecate_no_replacement, deprecate_with_replacement, logger_warning, @@ -885,7 +885,7 @@ def set_data(self, data: bytes) -> None: def hash_value_data(self) -> bytes: data = super().hash_value_data() - data += b_(self._data) + data += bytes_(self._data) return data def write_to_stream( @@ -955,7 +955,7 @@ def flate_encode(self, level: int = -1) -> "EncodedStreamObject": retval[NameObject(SA.FILTER)] = f if params is not None: retval[NameObject(SA.DECODE_PARMS)] = params - retval._data = FlateDecode.encode(b_(self._data), level) + retval._data = FlateDecode.encode(bytes_(self._data), level) return retval def decode_as_image(self) -> Any: @@ -1003,7 +1003,7 @@ def get_data(self) -> Union[bytes, str]: # create decoded object decoded = DecodedStreamObject() - decoded.set_data(b_(decode_stream_data(self))) + decoded.set_data(bytes_(decode_stream_data(self))) for key, value in list(self.items()): if key not in (SA.LENGTH, SA.FILTER, SA.DECODE_PARMS): decoded[key] = value @@ -1069,14 +1069,14 @@ def __init__( if isinstance(stream, ArrayObject): data = b"" for s in stream: - data += b_(s.get_object().get_data()) + data += bytes_(s.get_object().get_data()) if len(data) == 0 or data[-1] != b"\n": data += b"\n" super().set_data(bytes(data)) else: stream_data = stream.get_data() assert stream_data is not None - super().set_data(b_(stream_data)) + super().set_data(bytes_(stream_data)) self.forced_encoding = forced_encoding def clone( @@ -1132,7 +1132,7 @@ def _clone( ignore_fields: """ src_cs = cast("ContentStream", src) - super().set_data(b_(src_cs._data)) + super().set_data(bytes_(src_cs._data)) self.pdf = pdf_dest self._operations = list(src_cs._operations) self.forced_encoding = src_cs.forced_encoding @@ -1249,10 +1249,10 @@ def get_data(self) -> bytes: for op in operands: op.write_to_stream(new_data) new_data.write(b" ") - new_data.write(b_(operator)) + new_data.write(bytes_(operator)) new_data.write(b"\n") self._data = new_data.getvalue() - return b_(self._data) + return bytes_(self._data) # This overrides the parent method: def set_data(self, data: bytes) -> None: @@ -1262,7 +1262,7 @@ def set_data(self, data: bytes) -> None: @property def operations(self) -> List[Tuple[Any, Any]]: if not self._operations and self._data: - self._parse_content_stream(BytesIO(b_(self._data))) + self._parse_content_stream(BytesIO(bytes_(self._data))) self._data = b"" return self._operations @@ -1276,7 +1276,7 @@ def isolate_graphics_state(self) -> None: self._operations.insert(0, ([], "q")) self._operations.append(([], "Q")) elif self._data: - self._data = b"q\n" + b_(self._data) + b"\nQ\n" + self._data = b"q\n" + bytes_(self._data) + b"\nQ\n" # This overrides the parent method: def write_to_stream( diff --git a/pypdf/generic/_utils.py b/pypdf/generic/_utils.py index fdcdc3339..950c5f046 100644 --- a/pypdf/generic/_utils.py +++ b/pypdf/generic/_utils.py @@ -2,7 +2,7 @@ from typing import Dict, List, Tuple, Union from .._codecs import _pdfdoc_encoding -from .._utils import StreamType, b_, logger_warning, read_non_whitespace +from .._utils import StreamType, bytes_, logger_warning, read_non_whitespace from ..errors import STREAM_TRUNCATED_PREMATURELY, PdfStreamError from ._base import ByteStringObject, TextStringObject @@ -32,7 +32,7 @@ def read_hex_string_from_stream( x += b"0" if len(x) == 2: txt += chr(int(x, base=16)) - return create_string_object(b_(txt), forced_encoding) + return create_string_object(bytes_(txt), forced_encoding) def read_string_from_stream( @@ -92,7 +92,7 @@ def read_string_from_stream( else: stream.seek(-1, 1) # ntok has to be analyzed break - tok = b_(chr(int(tok, base=8))) + tok = bytes_(chr(int(tok, base=8))) elif tok in b"\n\r": # This case is hit when a backslash followed by a line # break occurs. If it's a multi-char EOL, consume the diff --git a/requirements/ci-3.11.txt b/requirements/ci-3.11.txt index f382fe2b9..210177118 100644 --- a/requirements/ci-3.11.txt +++ b/requirements/ci-3.11.txt @@ -6,7 +6,7 @@ # attrs==23.1.0 # via flake8-bugbear -coverage[toml]==7.3.0 +coverage[toml]==7.6.0 # via # -r requirements/ci.in # pytest-cov @@ -35,7 +35,7 @@ mypy-extensions==1.0.0 # via mypy packaging==23.1 # via pytest -pillow==10.0.1 +pillow==10.4.0 # via # -r requirements/ci.in # fpdf2 diff --git a/tests/test_text_extraction.py b/tests/test_text_extraction.py index 1ffa68a3e..dcd4e6cae 100644 --- a/tests/test_text_extraction.py +++ b/tests/test_text_extraction.py @@ -10,6 +10,7 @@ from pypdf import PdfReader, mult from pypdf._text_extraction import set_custom_rtl +from pypdf.errors import ParseError from . import get_data_from_url @@ -156,3 +157,19 @@ def test_layout_mode_type0_font_widths(): encoding="utf-8" ) assert expected == reader.pages[0].extract_text(extraction_mode="layout") + + +@pytest.mark.enable_socket() +def test_layout_mode_indirect_sequence_font_widths(): + # Cover the situation where the sequence for font widths is an IndirectObject + # ref https://github.com/py-pdf/pypdf/pull/2788 + url = "https://github.com/user-attachments/files/16491621/2788_example.pdf" + name ="2788_example.pdf" + reader = PdfReader(BytesIO(get_data_from_url(url, name=name))) + assert reader.pages[0].extract_text(extraction_mode="layout") == "" + url = "https://github.com/user-attachments/files/16491619/2788_example_malformed.pdf" + name = "2788_example_malformed.pdf" + reader = PdfReader(BytesIO(get_data_from_url(url, name=name))) + with pytest.raises(ParseError) as exc: + reader.pages[0].extract_text(extraction_mode="layout") + assert str(exc.value).startswith("Invalid font width definition") diff --git a/tests/test_utils.py b/tests/test_utils.py index 81fcf9fb4..e83ca92b1 100644 --- a/tests/test_utils.py +++ b/tests/test_utils.py @@ -121,8 +121,8 @@ def test_mark_location(): ("😀😃", "😀😃".encode()), ], ) -def test_b(input_str: str, expected: bytes): - assert pypdf._utils.b_(input_str) == expected +def test_bytes_(input_str: str, expected: bytes): + assert pypdf._utils.bytes_(input_str) == expected def test_deprecate_no_replacement(): @@ -132,24 +132,6 @@ def test_deprecate_no_replacement(): assert warn[0].message.args[0] == error_msg -@pytest.mark.parametrize( - ("left", "up", "upleft", "expected"), - [ - (0, 0, 0, 0), - (1, 0, 0, 1), - (0, 1, 0, 1), - (0, 0, 1, 0), - (1, 2, 3, 1), - (2, 1, 3, 1), - (1, 3, 2, 2), - (3, 1, 2, 2), - (3, 2, 1, 3), - ], -) -def test_paeth_predictor(left, up, upleft, expected): - assert pypdf._utils.paeth_predictor(left, up, upleft) == expected - - @pytest.mark.parametrize( ("dat", "pos", "to_read", "expected", "expected_pos"), [ diff --git a/tests/test_xobject_image_helpers.py b/tests/test_xobject_image_helpers.py index 63ecebd9b..39b7131fc 100644 --- a/tests/test_xobject_image_helpers.py +++ b/tests/test_xobject_image_helpers.py @@ -4,8 +4,8 @@ import pytest from pypdf import PdfReader -from pypdf._xobj_image_helpers import _handle_flate -from pypdf.errors import PdfReadError +from pypdf._xobj_image_helpers import _extended_image_frombytes, _handle_flate +from pypdf.errors import EmptyImageDataError, PdfReadError from pypdf.generic import ArrayObject, DecodedStreamObject, NameObject, NumberObject from . import get_data_from_url @@ -113,3 +113,12 @@ def test_handle_flate__image_mode_1(): colors=2, obj_as_text="dummy", ) + + +def test_extended_image_frombytes_zero_data(): + mode = "RGB" + size = (1, 1) + data = b"" + + with pytest.raises(EmptyImageDataError, match="Data is 0 bytes, cannot process an image from empty data."): + _extended_image_frombytes(mode, size, data)