@@ -1840,6 +1840,7 @@ def _extract_text(
1840
1840
str , float , Union [str , Dict [int , str ]], Dict [str , str ], DictionaryObject
1841
1841
],
1842
1842
] = {}
1843
+
1843
1844
try :
1844
1845
objr = obj
1845
1846
while NameObject (PG .RESOURCES ) not in objr :
@@ -1852,6 +1853,7 @@ def _extract_text(
1852
1853
# No resources means no text is possible (no font); we consider the
1853
1854
# file as not damaged, no need to check for TJ or Tj
1854
1855
return ""
1856
+
1855
1857
if "/Font" in resources_dict :
1856
1858
for f in cast (DictionaryObject , resources_dict ["/Font" ]):
1857
1859
cmaps [f ] = build_char_map (f , space_width , obj )
@@ -1863,6 +1865,7 @@ def _extract_text(
1863
1865
"NotInitialized" ,
1864
1866
None ,
1865
1867
) # (encoding, CMAP, font resource name, font)
1868
+
1866
1869
try :
1867
1870
content = (
1868
1871
obj [content_key ].get_object () if isinstance (content_key , str ) else obj
@@ -1891,11 +1894,12 @@ def _extract_text(
1891
1894
space_scale = 1.0
1892
1895
_space_width : float = 500.0 # will be set correctly at first Tf
1893
1896
_actual_str_size : Dict [str , float ] = {
1894
- "str_widths" : 0.0 , "space_width" : 0.0 , "str_height" : 0.0 } # will be set to string length calculation result
1897
+ "str_widths" : 0.0 , "space_width" : 0.0 , "str_height" : 0.0
1898
+ } # will be set to string length calculation result
1895
1899
TL = 0.0
1896
1900
font_size = 12.0 # init just in case of
1897
1901
1898
- def compute_strwidths (str_widths : float ) -> float :
1902
+ def compute_str_widths (str_widths : float ) -> float :
1899
1903
return str_widths / 1000
1900
1904
1901
1905
def process_operation (operator : bytes , operands : List [Any ]) -> None :
@@ -1955,15 +1959,8 @@ def process_operation(operator: bytes, operands: List[Any]) -> None:
1955
1959
visitor_text (text , memo_cm , memo_tm , cmap [3 ], font_size )
1956
1960
text = ""
1957
1961
cm_matrix = mult (
1958
- [
1959
- float (operands [0 ]),
1960
- float (operands [1 ]),
1961
- float (operands [2 ]),
1962
- float (operands [3 ]),
1963
- float (operands [4 ]),
1964
- float (operands [5 ]),
1965
- ],
1966
- cm_matrix ,
1962
+ [float (operand ) for operand in operands [:6 ]],
1963
+ cm_matrix
1967
1964
)
1968
1965
memo_cm = cm_matrix .copy ()
1969
1966
memo_tm = tm_matrix .copy ()
@@ -2023,19 +2020,12 @@ def process_operation(operator: bytes, operands: List[Any]) -> None:
2023
2020
ty = float (operands [1 ])
2024
2021
tm_matrix [4 ] += tx * tm_matrix [0 ] + ty * tm_matrix [2 ]
2025
2022
tm_matrix [5 ] += tx * tm_matrix [1 ] + ty * tm_matrix [3 ]
2026
- str_widths = compute_strwidths (_actual_str_size ["str_widths" ])
2023
+ str_widths = compute_str_widths (_actual_str_size ["str_widths" ])
2027
2024
_actual_str_size ["str_widths" ] = 0.0
2028
2025
elif operator == b"Tm" :
2029
2026
check_crlf_space = True
2030
- tm_matrix = [
2031
- float (operands [0 ]),
2032
- float (operands [1 ]),
2033
- float (operands [2 ]),
2034
- float (operands [3 ]),
2035
- float (operands [4 ]),
2036
- float (operands [5 ]),
2037
- ]
2038
- str_widths = compute_strwidths (_actual_str_size ["str_widths" ])
2027
+ tm_matrix = [float (operand ) for operand in operands [:6 ]]
2028
+ str_widths = compute_str_widths (_actual_str_size ["str_widths" ])
2039
2029
_actual_str_size ["str_widths" ] = 0.0
2040
2030
elif operator == b"T*" :
2041
2031
check_crlf_space = True
@@ -2046,7 +2036,7 @@ def process_operation(operator: bytes, operands: List[Any]) -> None:
2046
2036
text ,
2047
2037
operands ,
2048
2038
cm_matrix ,
2049
- tm_matrix , # text matrix
2039
+ tm_matrix ,
2050
2040
cmap ,
2051
2041
orientations ,
2052
2042
font_size ,
@@ -2057,6 +2047,7 @@ def process_operation(operator: bytes, operands: List[Any]) -> None:
2057
2047
)
2058
2048
else :
2059
2049
return None
2050
+
2060
2051
if check_crlf_space :
2061
2052
try :
2062
2053
text , output , cm_prev , tm_prev = crlf_space_check (
@@ -2070,7 +2061,7 @@ def process_operation(operator: bytes, operands: List[Any]) -> None:
2070
2061
font_size ,
2071
2062
visitor_text ,
2072
2063
str_widths ,
2073
- compute_strwidths (_actual_str_size ["space_width" ]),
2064
+ compute_str_widths (_actual_str_size ["space_width" ]),
2074
2065
_actual_str_size ["str_height" ]
2075
2066
)
2076
2067
if text == "" :
@@ -2082,7 +2073,7 @@ def process_operation(operator: bytes, operands: List[Any]) -> None:
2082
2073
for operands , operator in content .operations :
2083
2074
if visitor_operand_before is not None :
2084
2075
visitor_operand_before (operator , operands , cm_matrix , tm_matrix )
2085
- # Multiple operators are defined in here
2076
+ # Multiple operators are handled here
2086
2077
if operator == b"'" :
2087
2078
process_operation (b"T*" , [])
2088
2079
process_operation (b"Tj" , operands )
@@ -2091,9 +2082,6 @@ def process_operation(operator: bytes, operands: List[Any]) -> None:
2091
2082
process_operation (b"Tc" , [operands [1 ]])
2092
2083
process_operation (b"T*" , [])
2093
2084
process_operation (b"Tj" , operands [2 :])
2094
- elif operator == b"TD" :
2095
- process_operation (b"TL" , [- operands [1 ]])
2096
- process_operation (b"Td" , operands )
2097
2085
elif operator == b"TJ" :
2098
2086
# The space width may be smaller than the font width, so the width should be 95%.
2099
2087
_confirm_space_width = _space_width * 0.95
@@ -2102,11 +2090,14 @@ def process_operation(operator: bytes, operands: List[Any]) -> None:
2102
2090
if isinstance (op , (str , bytes )):
2103
2091
process_operation (b"Tj" , [op ])
2104
2092
if isinstance (op , (int , float , NumberObject , FloatObject )) and (
2105
- ( abs (float (op )) >= _confirm_space_width )
2106
- and ( len ( text ) > 0 )
2107
- and ( text [- 1 ] != " " )
2093
+ abs (float (op )) >= _confirm_space_width
2094
+ and text
2095
+ and text [- 1 ] != " "
2108
2096
):
2109
2097
process_operation (b"Tj" , [" " ])
2098
+ elif operator == b"TD" :
2099
+ process_operation (b"TL" , [- operands [1 ]])
2100
+ process_operation (b"Td" , operands )
2110
2101
elif operator == b"Do" :
2111
2102
output += text
2112
2103
if visitor_text is not None :
@@ -2157,7 +2148,7 @@ def process_operation(operator: bytes, operands: List[Any]) -> None:
2157
2148
process_operation (operator , operands )
2158
2149
if visitor_operand_after is not None :
2159
2150
visitor_operand_after (operator , operands , cm_matrix , tm_matrix )
2160
- output += text # just in case of
2151
+ output += text # just in case
2161
2152
if text != "" and visitor_text is not None :
2162
2153
visitor_text (text , memo_cm , memo_tm , cmap [3 ], font_size )
2163
2154
return output
0 commit comments