Skip to content

Commit

Permalink
ROB: Improve handling of LZW decoder table overflow (#3159)
Browse files Browse the repository at this point in the history
Closes #3032.
  • Loading branch information
stefan6419846 authored Feb 27, 2025
1 parent 6003a1e commit f15ddca
Show file tree
Hide file tree
Showing 3 changed files with 22 additions and 1 deletion.
8 changes: 7 additions & 1 deletion pypdf/_codecs/_codecs.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,8 @@
from abc import ABC, abstractmethod
from typing import Dict, List

from pypdf._utils import logger_warning


class Codec(ABC):
"""Abstract base class for all codecs."""
Expand Down Expand Up @@ -142,9 +144,10 @@ def _pack_codes_into_bytes(self, codes: List[int]) -> bytes:
return bytes(output)

def _initialize_decoding_table(self) -> None:
self.max_code_value = (1 << self.MAX_BITS_PER_CODE) - 1
self.decoding_table = [bytes([i]) for i in range(self.CLEAR_TABLE_MARKER)] + [
b""
] * (4096 - self.CLEAR_TABLE_MARKER)
] * (self.max_code_value - self.CLEAR_TABLE_MARKER + 1)
self._table_index = self.EOD_MARKER + 1
self._bits_to_get = 9

Expand Down Expand Up @@ -250,6 +253,9 @@ def decode(self, data: bytes) -> bytes:

def _add_entry_decode(self, old_string: bytes, new_char: int) -> None:
    """Store ``old_string`` extended by ``new_char`` in the next free table slot.

    If the next index is already beyond ``self.max_code_value``, the entry
    is dropped and a warning is logged instead of writing past the table.
    """
    entry = old_string + bytes([new_char])
    slot = self._table_index
    if slot <= self.max_code_value:
        self.decoding_table[slot] = entry
        self._table_index = slot + 1
        return
    # Table is full: keep decoding, but do not grow any further.
    logger_warning("Ignoring too large LZW table index.", __name__)

Expand Down
Binary file added resources/lzw_decoder_table_overflow.bin
Binary file not shown.
15 changes: 15 additions & 0 deletions tests/test_codecs.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,14 @@
"""Test LZW-related code."""
from pathlib import Path

import pytest

from pypdf._codecs._codecs import LzwCodec

TESTS_ROOT = Path(__file__).parent.resolve()
PROJECT_ROOT = TESTS_ROOT.parent
RESOURCE_ROOT = PROJECT_ROOT / "resources"

test_cases = [
pytest.param(b"", id="Empty input"),
pytest.param(b"A", id="Single character"),
Expand Down Expand Up @@ -56,3 +61,13 @@ def test_decode_lzw(encoded, expected_decoded):
codec = LzwCodec()
actual_decoded = codec.decode(encoded)
assert actual_decoded == expected_decoded


def test_lzw_decoder_table_overflow(caplog):
    """Decode the overflow sample and check output prefix, table size and warning."""
    encoded = (RESOURCE_ROOT / "lzw_decoder_table_overflow.bin").read_bytes()
    codec = LzwCodec()
    decoded = codec.decode(encoded)

    expected_prefix = (
        b'0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ!"#$%&\'()*+,-./:;<=>?@'
    )
    assert decoded.startswith(expected_prefix)
    # The table must not have grown beyond its fixed size.
    assert len(codec.decoding_table) == 4096
    assert "Ignoring too large LZW table index." in caplog.text

0 comments on commit f15ddca

Please sign in to comment.