-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathocr
executable file
·419 lines (359 loc) · 14.7 KB
/
ocr
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
#!/usr/bin/env python3
"""
This script can handle three main scenarios, intelligently determining what to do based on the input:
1. A single PDF file → Flatten and re-OCR into a single searchable PDF
2. A folder of numbered image files (e.g., JPG/JPEG) → Combine and OCR into one searchable PDF
3. A folder of PDF files → Batch process each PDF file, flattening and re-OCRing each into its own searchable PDF
Usage:
./multi_mode_ocr.py <input_path> [--output <file_or_folder>] [--replace]
[--lang <language>] [--threads <num>] [--quiet]
Arguments:
<input_path> Path to either: (a) a single PDF file, (b) a folder of images,
or (c) a folder of PDF files. (Required)
--output, -o Desired output path. Interpreted differently depending on input:
- Single PDF or folder of images: Output is one PDF file.
- Folder of PDFs: Output is a folder containing new OCRed PDFs.
When --output is omitted:
- For a single PDF: the input file is overwritten in place.
- For a folder of images: writes "<folder>_searchable.pdf" inside that folder.
- For a folder of multiple PDFs (without --replace): each PDF is written as
"<name>_searchable.pdf" into an "OCRed_PDFs" subfolder of the input folder.
--replace, -r Overwrite the original PDF(s) instead of creating a new file
(this is only valid if the input is a PDF file or a folder of PDFs).
In Single PDF mode, replacement is **default** unless --output is provided.
--lang, -l OCR language (default: "eng").
--threads, -t Number of threads to use for OCR (default: auto-detect CPU cores).
--quiet, -q Suppress output messages (only errors are printed).
Dependencies:
- Python 3
- PIL (Pillow)
- pytesseract (Tesseract OCR)
- PyPDF2
- pdf2image (for flattening PDFs)
- concurrent.futures (built-in)
"""
import os
import sys
import time
import io
import argparse
import multiprocessing
from PIL import Image
import pytesseract
from PyPDF2 import PdfMerger
import concurrent.futures
try:
from pdf2image import convert_from_path
except ImportError:
convert_from_path = None
def process_image_ocr(args):
    """
    OCR one page image and return the resulting single-page PDF as bytes.

    ``args`` is a 5-tuple ``(image, lang, page_num, total_pages, verbose)``
    packed into a single argument so the function can be passed straight to
    ``Executor.map``. Any exception raised while reporting progress or running
    Tesseract is printed and converted into a ``None`` result.
    """
    image, ocr_lang, current, page_count, show_progress = args
    try:
        if show_progress:
            # "\r" rewrites the same console line for every page.
            sys.stdout.write(f"\rProcessing page {current}/{page_count}...")
            sys.stdout.flush()
        return pytesseract.image_to_pdf_or_hocr(image, extension='pdf', lang=ocr_lang)
    except Exception as err:
        print(f"\nError processing page {current}: {err}")
    return None
def create_searchable_pdf_from_images(image_list, output_pdf, language='eng',
                                      threads=None, verbose=True):
    """
    Create a searchable PDF from a list of PIL Images using OCR and write to output_pdf.

    Every image is OCRed by ``process_image_ocr`` on a thread pool; the
    resulting single-page PDFs are merged in input order.

    Args:
        image_list: Page images, in page order.
        output_pdf: Path of the PDF file to write.
        language: OCR language passed through to ``process_image_ocr``.
        threads: Number of worker threads. ``None`` or a value below 1 falls
            back to the CPU count.
        verbose: Print progress and summary messages when true. Errors and
            warnings are printed regardless.

    Returns:
        True if the PDF was written, False if there was nothing to write
        (no images, or OCR failed for every page) or writing failed.
    """
    start_time = time.time()
    total_pages = len(image_list)
    if total_pages == 0:
        if verbose:
            print("No images found.")
        return False

    # ThreadPoolExecutor raises ValueError for max_workers <= 0.
    if threads is None or threads < 1:
        threads = multiprocessing.cpu_count()

    tasks = [
        (image, language, page_num, total_pages, verbose)
        for page_num, image in enumerate(image_list, start=1)
    ]
    with concurrent.futures.ThreadPoolExecutor(max_workers=threads) as executor:
        ocr_pdfs = list(executor.map(process_image_ocr, tasks))

    # process_image_ocr returns None for a page it could not OCR.
    good_pages = [page for page in ocr_pdfs if page]
    if not good_pages:
        # Never write an empty PDF: output_pdf may be the original document.
        print("OCR failed for every page; no output PDF was written.")
        return False

    if verbose:
        print("\nMerging OCRed pages...")
    failed_count = total_pages - len(good_pages)
    if failed_count:
        print(f"Warning: {failed_count} of {total_pages} pages failed OCR "
              "and are missing from the output.")

    merger = PdfMerger()
    try:
        for pdf_page in good_pages:
            merger.append(io.BytesIO(pdf_page))
        try:
            # Merge in memory first so a merge failure cannot leave a
            # truncated file at output_pdf.
            buffer = io.BytesIO()
            merger.write(buffer)
            with open(output_pdf, "wb") as f:
                f.write(buffer.getvalue())
        except Exception as e:
            print(f"Failed to write output PDF: {e}")
            return False
    finally:
        merger.close()

    elapsed_time = time.time() - start_time
    if verbose:
        print(f"OCR completed in {elapsed_time:.2f} seconds.")
        print(f"Searchable PDF created: {output_pdf}")
    return True
def create_searchable_pdf_from_directory_of_images(
    input_dir, output_pdf, language='eng',
    threads=None, verbose=True
):
    """
    Collect all JPG/JPEG images in input_dir, sort them, and create a single searchable PDF.

    File names are ordered naturally, i.e. embedded numbers compare by value,
    so "page2.jpg" comes before "page10.jpg".

    Args:
        input_dir: Folder to scan (not recursive).
        output_pdf: Path of the PDF file to write.
        language: OCR language.
        threads: Worker-thread count forwarded to the OCR step.
        verbose: Print progress messages when true. Skipped images are
            reported regardless.

    Returns:
        The result of ``create_searchable_pdf_from_images`` for the images
        that could be opened.
    """
    import re  # function-scope import: only this block needs it

    def natural_key(name):
        # re.split with a capturing group alternates text / digit chunks, so
        # equal positions always hold the same type and compare safely.
        chunks = re.split(r'([0-9]+)', name)
        key = [
            int(chunk) if index % 2 else chunk.lower()
            for index, chunk in enumerate(chunks)
        ]
        return (key, name)  # raw name breaks case-only ties deterministically

    if verbose:
        print(f"Processing images in folder '{input_dir}'...")

    # Collect all .jpg or .jpeg
    image_names = sorted(
        (f for f in os.listdir(input_dir) if f.lower().endswith(('.jpg', '.jpeg'))),
        key=natural_key,
    )

    pil_images = []
    for name in image_names:
        img_path = os.path.join(input_dir, name)
        try:
            img = Image.open(img_path)
            # Image.open is lazy and keeps the file open; decoding now lets
            # Pillow close it instead of holding one descriptor per page.
            img.load()
            pil_images.append(img)
        except Exception as e:
            # Always reported: a skipped image is a page missing from the PDF.
            print(f"Skipping {img_path} due to error: {e}")

    if verbose:
        print(f"Found {len(pil_images)} images to process.")
    return create_searchable_pdf_from_images(
        pil_images, output_pdf, language, threads, verbose
    )
def flatten_pdf_to_images(input_pdf, dpi=300):
    """
    Rasterize every page of ``input_pdf`` with pdf2image.

    Returns the result of ``convert_from_path`` at the requested ``dpi``
    (a list of PIL Images). Raises RuntimeError when the optional pdf2image
    import at the top of the file failed.
    """
    if convert_from_path is not None:
        return convert_from_path(input_pdf, dpi=dpi)
    raise RuntimeError("pdf2image is not installed. Cannot flatten PDFs.")
def flatten_and_ocr_pdf(input_pdf, output_pdf, language='eng',
                        threads=None, verbose=True, dpi=300):
    """
    Flatten an existing PDF to images, then re-OCR into a new searchable PDF.

    Args:
        input_pdf: Path of the PDF to rasterize.
        output_pdf: Path of the searchable PDF to write (may equal input_pdf).
        language: OCR language.
        threads: Worker-thread count forwarded to the OCR step.
        verbose: Print progress messages when true.
        dpi: Rasterization resolution; the default of 300 matches the
            previously hard-coded value.

    Returns:
        True on success; False if the PDF could not be converted to images
        or the OCR/merge step failed.
    """
    if verbose:
        print(f"Flattening PDF '{input_pdf}' at {dpi} dpi...")
    try:
        pil_images = flatten_pdf_to_images(input_pdf, dpi=dpi)
    except Exception as e:
        # Covers a missing pdf2image install as well as unreadable input.
        print(f"Failed to convert PDF to images: {e}")
        return False

    if verbose:
        print(f"PDF has {len(pil_images)} pages. Starting OCR...")
    return create_searchable_pdf_from_images(
        pil_images, output_pdf,
        language=language,
        threads=threads,
        verbose=verbose
    )
def batch_flatten_and_ocr_pdfs(pdf_files, output_folder, language='eng',
                               threads=None, replace=False, verbose=True):
    """
    Batch-process a list of PDFs: flatten and re-OCR each.

    - If replace=True, overwrites each original PDF (output_folder is ignored)
    - Otherwise, outputs to output_folder each with "_searchable" appended;
      when output_folder is None the file is written next to its source PDF

    Args:
        pdf_files: Paths of the PDFs to process.
        output_folder: Destination folder for non-replace mode; created if
            it does not exist.
        language: OCR language.
        threads: Worker-thread count forwarded to the OCR step.
        replace: Overwrite each input PDF in place.
        verbose: Print progress messages when true.

    Returns:
        True only if every PDF was processed successfully; False if the list
        is empty, the output folder cannot be created, or any PDF failed.
    """
    if not pdf_files:
        if verbose:
            print("No PDF files found in the folder.")
        return False
    if verbose:
        print(f"Found {len(pdf_files)} PDFs to process.")

    if not replace and output_folder:
        try:
            os.makedirs(output_folder, exist_ok=True)
        except OSError as e:
            print(f"ERROR: Could not create output folder: {e}")
            return False

    success = True
    for pdf_path in pdf_files:
        if replace:
            # Overwrite the original
            out_path = pdf_path
        else:
            base_name = os.path.splitext(os.path.basename(pdf_path))[0]
            # os.path.join(None, ...) raises TypeError, so fall back to the
            # source PDF's own directory when no folder was given.
            target_dir = (
                output_folder if output_folder is not None
                else os.path.dirname(pdf_path)
            )
            out_path = os.path.join(target_dir, f"{base_name}_searchable.pdf")
        # Keep going after a failure so one bad PDF does not block the rest.
        if not flatten_and_ocr_pdf(pdf_path, out_path, language, threads, verbose):
            success = False
    return success
def determine_input_mode(input_path, verbose=True):
    """
    Determine which of the three modes we're in:
    1) Single PDF file
    2) Folder of images
    3) Folder of PDFs

    Args:
        input_path: Path to a file or folder.
        verbose: Kept for interface compatibility. Error messages are always
            printed, because the caller exits on a ``None`` mode without
            printing anything itself.

    Returns a tuple: (mode, items) where "mode" is one of
    "single_pdf", "folder_images", "folder_pdfs"
    or None if it can't be determined properly. "items" is the PDF path for
    "single_pdf", a name-sorted list of file paths for the folder modes, and
    None when mode is None. A folder holding exactly one PDF (and no images)
    is reported as "single_pdf".
    """
    if os.path.isfile(input_path):
        if input_path.lower().endswith('.pdf'):
            return ('single_pdf', input_path)
        # A single image is deliberately unsupported; only folders of images are.
        print("ERROR: Single file is not a PDF. Exiting.")
        return (None, None)

    if not os.path.isdir(input_path):
        print("ERROR: Input path is neither a file nor a folder.")
        return (None, None)

    pdf_files = []
    image_files = []
    # Sorted so the result does not depend on os.listdir()'s arbitrary order.
    for name in sorted(os.listdir(input_path)):
        full_path = os.path.join(input_path, name)
        if not os.path.isfile(full_path):
            continue  # ignore sub-folders, even ones named like "x.pdf"
        lowered = name.lower()
        if lowered.endswith('.pdf'):
            pdf_files.append(full_path)
        elif lowered.endswith(('.jpg', '.jpeg')):
            image_files.append(full_path)

    if pdf_files and image_files:
        print("ERROR: The folder contains both images and PDFs. "
              "Please separate them or specify a single PDF file.")
        return (None, None)
    if not pdf_files and not image_files:
        print("ERROR: The folder is empty or doesn't contain PDFs or JPGs.")
        return (None, None)

    if image_files:
        return ('folder_images', image_files)
    if len(pdf_files) == 1:
        # Edge case: exactly one PDF in the folder. Treat as single_pdf.
        return ('single_pdf', pdf_files[0])
    return ('folder_pdfs', pdf_files)
def _positive_int(text):
    """argparse ``type=`` callable: parse ``text`` as an integer that is at least 1."""
    try:
        value = int(text)
    except ValueError:
        raise argparse.ArgumentTypeError(f"invalid int value: {text!r}") from None
    if value < 1:
        raise argparse.ArgumentTypeError(f"must be at least 1, got {value}")
    return value


def main():
    """
    Command-line entry point.

    Parses the arguments, detects the input mode with
    ``determine_input_mode`` and dispatches to the matching processing
    function. Exits with status 1 when the mode cannot be determined or
    processing fails; argparse exits with status 2 on invalid arguments
    (including a --threads value below 1).
    """
    parser = argparse.ArgumentParser(
        description="Create a searchable PDF from either: "
                    "a PDF file (flatten, re-OCR), "
                    "a folder of numbered image files, "
                    "or a folder of PDF files (batch)."
    )
    parser.add_argument('input_path',
                        help='Path to file/folder input (PDF file, folder of images, or folder of PDFs).')
    parser.add_argument('--output', '-o',
                        help='Output path. Interpretation depends on input: '
                             'single file/folder-of-images => single PDF file, '
                             'folder-of-pdfs => output folder for new PDFs. '
                             'Default: appends "_searchable" to new PDFs if not using --replace.')
    parser.add_argument('--replace', '-r', action='store_true',
                        help='Overwrite the original PDF(s). '
                             'Only valid if input is PDF(s).')
    parser.add_argument('--lang', '-l', default='eng',
                        help='OCR language (default: eng)')
    # Rejected at parse time: a thread count below 1 would otherwise crash
    # the thread pool with a traceback after the input was already flattened.
    parser.add_argument('--threads', '-t', type=_positive_int,
                        help='Number of OCR threads (default: # of CPU cores).')
    parser.add_argument('--quiet', '-q', action='store_true',
                        help='Minimize output messages.')
    args = parser.parse_args()

    verbose = not args.quiet
    input_path = os.path.normpath(args.input_path)
    mode, items = determine_input_mode(input_path, verbose=verbose)
    if mode is None:
        sys.exit(1)  # determine_input_mode reports the reason

    if mode == 'single_pdf':
        # items is the path to that single PDF. An explicit --output wins;
        # otherwise the source PDF is overwritten in place.
        pdf_path = items
        output_pdf = args.output if args.output else pdf_path
        success = flatten_and_ocr_pdf(
            pdf_path, output_pdf,
            language=args.lang,
            threads=args.threads,
            verbose=verbose
        )
    elif mode == 'folder_images':
        input_dir = input_path
        # There's no concept of replace for images → ignore if user set --replace
        if args.replace and verbose:
            print("Warning: --replace has no effect for folder-of-images input.")
        if args.output:
            output_pdf = args.output
        else:
            # By default, produce "<folder>_searchable.pdf" inside the folder
            folder_name = os.path.basename(os.path.normpath(input_dir))
            output_pdf = os.path.join(input_dir, f"{folder_name}_searchable.pdf")
        success = create_searchable_pdf_from_directory_of_images(
            input_dir, output_pdf, language=args.lang,
            threads=args.threads, verbose=verbose
        )
    elif mode == 'folder_pdfs':
        # items is the list of PDF files (two or more; a lone PDF is single_pdf)
        pdf_files = items
        if args.replace:
            # Overwrite each PDF in place, ignore --output
            if args.output and verbose:
                print("Warning: --output is ignored when --replace is used "
                      "for a folder of PDFs.")
            success = batch_flatten_and_ocr_pdfs(
                pdf_files, output_folder=None,
                language=args.lang,
                threads=args.threads,
                replace=True,
                verbose=verbose
            )
        else:
            if args.output:
                output_folder = os.path.normpath(args.output)
            else:
                # By default, use an "OCRed_PDFs" subfolder of the input folder
                output_folder = os.path.join(input_path, "OCRed_PDFs")
                if verbose:
                    print(f"No output folder specified; using '{output_folder}'.")
            # exist_ok tolerates an existing folder but still fails fast when
            # the path exists as a regular file, before any OCR work is done.
            try:
                os.makedirs(output_folder, exist_ok=True)
            except OSError as e:
                print(f"ERROR: Could not create output folder: {e}")
                sys.exit(1)
            success = batch_flatten_and_ocr_pdfs(
                pdf_files, output_folder=output_folder,
                language=args.lang,
                threads=args.threads,
                replace=False,
                verbose=verbose
            )
    else:
        # Shouldn't get here
        success = False

    if not success:
        sys.exit(1)


if __name__ == "__main__":
    main()