Skip to content

Commit d67e807

Browse files
authored
MAINT: Increase readability of text extraction code (#3091)
1 parent 8b3a786 commit d67e807

File tree

1 file changed

+22
-31
lines changed

1 file changed

+22
-31
lines changed

pypdf/_page.py

+22 -31
Original file line numberDiff line numberDiff line change
@@ -1840,6 +1840,7 @@ def _extract_text(
18401840
str, float, Union[str, Dict[int, str]], Dict[str, str], DictionaryObject
18411841
],
18421842
] = {}
1843+
18431844
try:
18441845
objr = obj
18451846
while NameObject(PG.RESOURCES) not in objr:
@@ -1852,6 +1853,7 @@ def _extract_text(
18521853
# No resources means no text is possible (no font); we consider the
18531854
# file as not damaged, no need to check for TJ or Tj
18541855
return ""
1856+
18551857
if "/Font" in resources_dict:
18561858
for f in cast(DictionaryObject, resources_dict["/Font"]):
18571859
cmaps[f] = build_char_map(f, space_width, obj)
@@ -1863,6 +1865,7 @@ def _extract_text(
18631865
"NotInitialized",
18641866
None,
18651867
) # (encoding, CMAP, font resource name, font)
1868+
18661869
try:
18671870
content = (
18681871
obj[content_key].get_object() if isinstance(content_key, str) else obj
@@ -1891,11 +1894,12 @@ def _extract_text(
18911894
space_scale = 1.0
18921895
_space_width: float = 500.0 # will be set correctly at first Tf
18931896
_actual_str_size: Dict[str, float] = {
1894-
"str_widths": 0.0, "space_width": 0.0, "str_height": 0.0} # will be set to string length calculation result
1897+
"str_widths": 0.0, "space_width": 0.0, "str_height": 0.0
1898+
} # will be set to string length calculation result
18951899
TL = 0.0
18961900
font_size = 12.0 # init just in case
18971901

1898-
def compute_strwidths(str_widths: float) -> float:
1902+
def compute_str_widths(str_widths: float) -> float:
18991903
return str_widths / 1000
19001904

19011905
def process_operation(operator: bytes, operands: List[Any]) -> None:
@@ -1955,15 +1959,8 @@ def process_operation(operator: bytes, operands: List[Any]) -> None:
19551959
visitor_text(text, memo_cm, memo_tm, cmap[3], font_size)
19561960
text = ""
19571961
cm_matrix = mult(
1958-
[
1959-
float(operands[0]),
1960-
float(operands[1]),
1961-
float(operands[2]),
1962-
float(operands[3]),
1963-
float(operands[4]),
1964-
float(operands[5]),
1965-
],
1966-
cm_matrix,
1962+
[float(operand) for operand in operands[:6]],
1963+
cm_matrix
19671964
)
19681965
memo_cm = cm_matrix.copy()
19691966
memo_tm = tm_matrix.copy()
@@ -2023,19 +2020,12 @@ def process_operation(operator: bytes, operands: List[Any]) -> None:
20232020
ty = float(operands[1])
20242021
tm_matrix[4] += tx * tm_matrix[0] + ty * tm_matrix[2]
20252022
tm_matrix[5] += tx * tm_matrix[1] + ty * tm_matrix[3]
2026-
str_widths = compute_strwidths(_actual_str_size["str_widths"])
2023+
str_widths = compute_str_widths(_actual_str_size["str_widths"])
20272024
_actual_str_size["str_widths"] = 0.0
20282025
elif operator == b"Tm":
20292026
check_crlf_space = True
2030-
tm_matrix = [
2031-
float(operands[0]),
2032-
float(operands[1]),
2033-
float(operands[2]),
2034-
float(operands[3]),
2035-
float(operands[4]),
2036-
float(operands[5]),
2037-
]
2038-
str_widths = compute_strwidths(_actual_str_size["str_widths"])
2027+
tm_matrix = [float(operand) for operand in operands[:6]]
2028+
str_widths = compute_str_widths(_actual_str_size["str_widths"])
20392029
_actual_str_size["str_widths"] = 0.0
20402030
elif operator == b"T*":
20412031
check_crlf_space = True
@@ -2046,7 +2036,7 @@ def process_operation(operator: bytes, operands: List[Any]) -> None:
20462036
text,
20472037
operands,
20482038
cm_matrix,
2049-
tm_matrix, # text matrix
2039+
tm_matrix,
20502040
cmap,
20512041
orientations,
20522042
font_size,
@@ -2057,6 +2047,7 @@ def process_operation(operator: bytes, operands: List[Any]) -> None:
20572047
)
20582048
else:
20592049
return None
2050+
20602051
if check_crlf_space:
20612052
try:
20622053
text, output, cm_prev, tm_prev = crlf_space_check(
@@ -2070,7 +2061,7 @@ def process_operation(operator: bytes, operands: List[Any]) -> None:
20702061
font_size,
20712062
visitor_text,
20722063
str_widths,
2073-
compute_strwidths(_actual_str_size["space_width"]),
2064+
compute_str_widths(_actual_str_size["space_width"]),
20742065
_actual_str_size["str_height"]
20752066
)
20762067
if text == "":
@@ -2082,7 +2073,7 @@ def process_operation(operator: bytes, operands: List[Any]) -> None:
20822073
for operands, operator in content.operations:
20832074
if visitor_operand_before is not None:
20842075
visitor_operand_before(operator, operands, cm_matrix, tm_matrix)
2085-
# Multiple operators are defined in here
2076+
# Multiple operators are handled here
20862077
if operator == b"'":
20872078
process_operation(b"T*", [])
20882079
process_operation(b"Tj", operands)
@@ -2091,9 +2082,6 @@ def process_operation(operator: bytes, operands: List[Any]) -> None:
20912082
process_operation(b"Tc", [operands[1]])
20922083
process_operation(b"T*", [])
20932084
process_operation(b"Tj", operands[2:])
2094-
elif operator == b"TD":
2095-
process_operation(b"TL", [-operands[1]])
2096-
process_operation(b"Td", operands)
20972085
elif operator == b"TJ":
20982086
# The space width may be smaller than the font width, so the width should be 95%.
20992087
_confirm_space_width = _space_width * 0.95
@@ -2102,11 +2090,14 @@ def process_operation(operator: bytes, operands: List[Any]) -> None:
21022090
if isinstance(op, (str, bytes)):
21032091
process_operation(b"Tj", [op])
21042092
if isinstance(op, (int, float, NumberObject, FloatObject)) and (
2105-
(abs(float(op)) >= _confirm_space_width)
2106-
and (len(text) > 0)
2107-
and (text[-1] != " ")
2093+
abs(float(op)) >= _confirm_space_width
2094+
and text
2095+
and text[-1] != " "
21082096
):
21092097
process_operation(b"Tj", [" "])
2098+
elif operator == b"TD":
2099+
process_operation(b"TL", [-operands[1]])
2100+
process_operation(b"Td", operands)
21102101
elif operator == b"Do":
21112102
output += text
21122103
if visitor_text is not None:
@@ -2157,7 +2148,7 @@ def process_operation(operator: bytes, operands: List[Any]) -> None:
21572148
process_operation(operator, operands)
21582149
if visitor_operand_after is not None:
21592150
visitor_operand_after(operator, operands, cm_matrix, tm_matrix)
2160-
output += text # just in case of
2151+
output += text # just in case
21612152
if text != "" and visitor_text is not None:
21622153
visitor_text(text, memo_cm, memo_tm, cmap[3], font_size)
21632154
return output

0 commit comments

Comments
 (0)