Skip to content

Commit

Permalink
Add thread safety to default generators
Browse files Browse the repository at this point in the history
  • Loading branch information
Belval authored Feb 10, 2020
1 parent ba073af commit ed9375b
Show file tree
Hide file tree
Showing 5 changed files with 66 additions and 23 deletions.
14 changes: 10 additions & 4 deletions pdf2image/exceptions.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,20 +2,26 @@
Define exceptions specific to pdf2image
"""


class PopplerNotInstalledError(Exception):
    """Happens when poppler is not installed"""


class PDFInfoNotInstalledError(PopplerNotInstalledError):
    """Happens when pdfinfo is not installed"""

    # Subclasses PopplerNotInstalledError, so handlers catching the parent
    # exception also catch this one.


class PDFPageCountError(Exception):
    """Happens when the pdfinfo was unable to retrieve the page count"""


class PDFSyntaxError(Exception):
    """Syntax error was thrown during rendering"""
28 changes: 26 additions & 2 deletions pdf2image/generators.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,16 +3,40 @@
"""

import uuid
import threading


class ThreadSafeGenerator(object):
    """Wrapper around generator that protects concurrent access

    Each ``next()`` call on the wrapper takes a lock before advancing the
    wrapped generator, so only one thread at a time is ever inside it.

    The wrapper implements the full iterator protocol (``__iter__`` and
    ``__next__``), so it can be used anywhere the bare generator could:
    ``for`` loops, ``list()``, ``itertools`` helpers, and plain ``next()``.
    """

    def __init__(self, gen):
        """Wrap ``gen``, an iterator/generator to be advanced under a lock."""
        self.gen = gen
        self.lock = threading.Lock()

    def __iter__(self):
        """Return the wrapper itself so it can be iterated directly."""
        return self

    def __next__(self):
        """Return the next value of the wrapped generator.

        ``StopIteration`` (or any other exception) raised by the wrapped
        generator propagates to the caller; the lock is released either way.
        """
        with self.lock:
            return next(self.gen)


def threadsafe(f):
    """Decorator to make generator threadsafe. Fix #125

    ``f`` is a generator function. The returned callable accepts the same
    arguments, calls ``f`` with them, and returns the resulting generator
    wrapped in a ``ThreadSafeGenerator``.
    """
    # Imported locally so this block does not rely on a module-level import.
    import functools

    # Keep the decorated function's __name__, __doc__ and signature metadata.
    @functools.wraps(f)
    def g(*a, **kw):
        return ThreadSafeGenerator(f(*a, **kw))

    return g


@threadsafe
def uuid_generator():
    """Returns a UUID4

    Generator that never terminates: every ``next()`` yields a freshly
    generated random UUID (version 4) rendered as a string.
    """
    while True:
        yield str(uuid.uuid4())


@threadsafe
def counter_generator(prefix="", suffix="", padding_goal=4):
"Returns a joined prefix, iteration number, and suffix"
"""Returns a joined prefix, iteration number, and suffix"""
i = 0
while True:
i += 1
Expand Down
12 changes: 4 additions & 8 deletions pdf2image/parsers.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,8 +8,7 @@


def parse_buffer_to_ppm(data):
"""Parse PPM file bytes to Pillow Image
"""
"""Parse PPM file bytes to Pillow Image"""

images = []

Expand All @@ -26,8 +25,7 @@ def parse_buffer_to_ppm(data):


def parse_buffer_to_pgm(data):
"""Parse PGM file bytes to Pillow Image
"""
"""Parse PGM file bytes to Pillow Image"""

images = []

Expand All @@ -44,8 +42,7 @@ def parse_buffer_to_pgm(data):


def parse_buffer_to_jpeg(data):
"""Parse JPEG file bytes to Pillow Image
"""
"""Parse JPEG file bytes to Pillow Image"""

return [
Image.open(BytesIO(image_data + b"\xff\xd9"))
Expand All @@ -56,8 +53,7 @@ def parse_buffer_to_jpeg(data):


def parse_buffer_to_png(data):
"""Parse PNG file bytes to Pillow Image
"""
"""Parse PNG file bytes to Pillow Image"""

images = []

Expand Down
21 changes: 13 additions & 8 deletions pdf2image/pdf2image.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@
PopplerNotInstalledError,
PDFInfoNotInstalledError,
PDFPageCountError,
PDFSyntaxError
PDFSyntaxError,
)

TRANSPARENT_FILE_TYPES = ["png", "tiff"]
Expand Down Expand Up @@ -99,8 +99,7 @@ def convert_from_path(
)

poppler_version = _get_poppler_version(
"pdftocairo" if use_pdfcairo else "pdftoppm",
poppler_path=poppler_path
"pdftocairo" if use_pdfcairo else "pdftoppm", poppler_path=poppler_path
)

if poppler_version <= 57:
Expand Down Expand Up @@ -387,7 +386,9 @@ def _get_poppler_version(command, poppler_path=None):

try:
# TODO: Make this more robust
return int(err.decode("utf8", "ignore").split('\n')[0].split(' ')[-1].split('.')[1])
return int(
err.decode("utf8", "ignore").split("\n")[0].split(" ")[-1].split(".")[1]
)
except:
# Lowest version that includes pdftocairo (2011)
return 17
Expand All @@ -409,11 +410,15 @@ def pdfinfo_from_path(pdf_path, userpw=None, poppler_path=None):
out, err = proc.communicate()

d = {}
for field in out.decode("utf8", "ignore").split('\n'):
sf = field.split(':')
key, value = sf[0], ':'.join(sf[1:])
for field in out.decode("utf8", "ignore").split("\n"):
sf = field.split(":")
key, value = sf[0], ":".join(sf[1:])
if key != "":
d[key] = int(value.strip()) if key in PDFINFO_CONVERT_TO_INT else value.strip()
d[key] = (
int(value.strip())
if key in PDFINFO_CONVERT_TO_INT
else value.strip()
)

if "Pages" not in d:
raise ValueError
Expand Down
14 changes: 13 additions & 1 deletion tests.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@
import subprocess
from subprocess import Popen, PIPE
from tempfile import TemporaryDirectory

from multiprocessing.dummy import Pool
from memory_profiler import profile as profile_memory

sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), "..")))
Expand Down Expand Up @@ -1482,5 +1482,17 @@ def test_conversion_from_path_using_dir_paths_only(self):
)
)

# Test for issue #125
@profile
@unittest.skipIf(not POPPLER_INSTALLED, "Poppler is not installed")
def test_multithread_conversion(self):
    """Run 50 conversions of the same PDF across a pool of 10 threads.

    Regression test for issue #125: checks that every job handed to the
    pool returns a result when conversions run concurrently.
    """
    start_time = time.time()
    files = ["./tests/test.pdf"] * 50
    # Use the pool as a context manager so its worker threads are released
    # even when a conversion raises; map() has already collected all
    # results by the time the block exits.
    with Pool(10) as p:
        res = p.map(convert_from_path, files)
    # assertEqual reports both values on failure, unlike assertTrue(a == b).
    self.assertEqual(len(res), 50)
    print("test_multithread_conversion: {} sec".format(time.time() - start_time))


if __name__ == "__main__":
unittest.main()

0 comments on commit ed9375b

Please sign in to comment.