-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathocr.py
169 lines (138 loc) · 6.69 KB
/
ocr.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
from pathlib import Path
import subprocess
import glob
from io import StringIO
import re
import os
from werkzeug.utils import secure_filename
def ensure_dir(path):
    """Create a directory if needed, make it world-accessible, and return it.

    Args:
        path: A ``pathlib.Path`` naming the directory. Missing parent
            directories are created as well; an already existing directory
            is not an error.

    Returns:
        The same ``path`` object that was passed in.
    """
    directory = path
    directory.mkdir(exist_ok=True, parents=True)
    # Permissions are set in a separate step so they are applied even when
    # the directory already existed before this call.
    directory.chmod(0o777)
    return directory
def check_tesseract_setup():
    """Verify that the ``tesseract`` CLI runs and print diagnostic details.

    Runs ``tesseract --version`` and ``tesseract --list-langs`` and prints
    their output, together with the ``TESSDATA_PREFIX`` environment variable
    and, when it names a directory, that directory's contents.

    Returns:
        bool: ``True`` when both commands exit with status 0. ``False`` when
        the executable cannot be started, when either command exits with a
        non-zero status, or when gathering the diagnostics fails.
    """
    try:
        # check=True: a tesseract binary that starts but exits with an error
        # must not be reported as a working installation.
        version = subprocess.run(
            ['tesseract', '--version'],
            capture_output=True, text=True, check=True,
        )
        print("Tesseract version:")
        # Fall back to stderr in case this build reports there instead of
        # on stdout; otherwise the listing would be printed empty.
        print(version.stdout or version.stderr)

        tessdata = os.getenv('TESSDATA_PREFIX')
        print(f"TESSDATA_PREFIX: {tessdata}")
        if tessdata:
            # isdir (not exists): a prefix that points at a regular file
            # would make listdir raise and fail the whole check even though
            # this listing is purely informational.
            contents = os.listdir(tessdata) if os.path.isdir(tessdata) else 'Directory not found'
            print(f"TESSDATA contents: {contents}")

        langs = subprocess.run(
            ['tesseract', '--list-langs'],
            capture_output=True, text=True, check=True,
        )
        print("Available languages:")
        print(langs.stdout or langs.stderr)
        return True
    except (OSError, subprocess.SubprocessError, ValueError) as e:
        # OSError: executable missing / not runnable, or listdir failure.
        # SubprocessError: non-zero exit status (CalledProcessError).
        # ValueError: command output could not be decoded as text.
        print(f"Tesseract setup check failed: {e}")
        stderr = getattr(e, 'stderr', None)
        if stderr:
            print(stderr)
        return False
def _page_number(png_path):
    """Return the page index encoded at the end of a page image's name, or 0."""
    match = re.search(r'-(\d+)\.png$', png_path.name)
    return int(match.group(1)) if match else 0


def _pdf_to_page_images(pdf_path, temp_dir, file_stem):
    """Render a PDF to one PNG per page with ``pdftoppm``; return them in page order.

    Raises:
        Exception: if ``pdftoppm`` exits with a non-zero status.
    """
    # pdftoppm names its output "<root>-<page>.png"; only those files belong
    # to this PDF.
    page_pattern = f"{glob.escape(file_stem)}-*.png"

    # temp_dir is keyed by file name only, so it may still hold pages from an
    # earlier upload with the same name (possibly with more pages or a
    # different zero-padding width). Remove them so they are not OCR'd into
    # this result.
    for stale_page in temp_dir.glob(page_pattern):
        stale_page.unlink()

    try:
        print("Converting PDF to PNG...")
        result = subprocess.run([
            'pdftoppm', '-r', '180',
            str(pdf_path),
            str(temp_dir / file_stem),
            '-png'
        ], capture_output=True, text=True, check=True)
        print("PDF conversion output:", result.stdout)
    except subprocess.CalledProcessError as e:
        print(f"PDF conversion error output: {e.stderr}")
        raise Exception(f"PDF conversion failed: {e}") from e

    # Sort on the page number in the file name, not on every digit that
    # happens to appear in the full path.
    return sorted(temp_dir.glob(page_pattern), key=_page_number)


def _run_tesseract(model, image_path, output_base, label):
    """Run Tesseract on one image and return the text it recognized.

    Args:
        model: Value passed to Tesseract's ``-l`` option.
        image_path: Path of the image to read.
        output_base: Output path without extension; the result is read from
            ``<output_base>.txt``.
        label: Name used for the input in the error message.

    Raises:
        Exception: if Tesseract exits with a non-zero status.
        FileNotFoundError: if the expected ``.txt`` file was not produced.
    """
    try:
        result = subprocess.run([
            'tesseract',
            '-l', model,
            str(image_path),
            str(output_base)
        ], capture_output=True, text=True, check=True)
    except subprocess.CalledProcessError as e:
        print(f"Tesseract error output: {e.stderr}")
        raise Exception(f"OCR failed for {label}: {e}") from e
    print("Tesseract output:", result.stdout)

    # Append ".txt" instead of using with_suffix('.txt'): with_suffix would
    # replace the last dotted part of names such as "scan.v2" and point at
    # "scan.txt", which is not the file this call asked Tesseract to write.
    txt_path = output_base.parent / f"{output_base.name}.txt"
    if not txt_path.exists():
        raise FileNotFoundError(f"Tesseract output not found: {txt_path}")
    return txt_path.read_text(encoding='utf-8')


def tesseract_to_txt(uploaded_files, model, model_bis, rand_name, ROOT_FOLDER, UPLOAD_FOLDER):
    """OCR the uploaded images/PDFs with Tesseract and return the combined text.

    Each upload is saved to disk and recognized with the ``tesseract`` CLI.
    PDFs are first rendered to PNG pages with ``pdftoppm`` inside
    ``<ROOT_FOLDER>/<UPLOAD_FOLDER>/<stem>_temp``; images are saved to and
    processed in ``<ROOT_FOLDER>/<UPLOAD_FOLDER>/<rand_name>``. Intermediate
    files are left on disk.

    Args:
        uploaded_files: Iterable of upload objects exposing ``.filename`` and
            ``.save(path)`` (presumably werkzeug ``FileStorage`` - confirm
            against the caller).
        model: Tesseract language/model name passed to ``-l``.
        model_bis: Optional second model; when truthy it is combined with
            ``model`` as ``"<model>+<model_bis>"``.
        rand_name: Name of the per-call result directory.
        ROOT_FOLDER: Base directory of the application.
        UPLOAD_FOLDER: Upload directory, relative to ``ROOT_FOLDER``.

    Returns:
        str: The recognized text of every page/image in input order, each
        followed by a blank line.

    Raises:
        Exception: if the Tesseract check fails, a file has an unsupported
            extension, or PDF conversion / OCR exits with an error.
        FileNotFoundError: if Tesseract did not produce its text output.
    """
    try:
        # Debug information
        print("\n=== Starting OCR process ===")
        print(f"Model: {model}")
        print(f"Model bis: {model_bis}")

        # Verify Tesseract setup
        if not check_tesseract_setup():
            raise Exception("Tesseract verification failed")

        # Create base paths
        root_path = Path(ROOT_FOLDER)
        upload_path = root_path / UPLOAD_FOLDER
        result_path = ensure_dir(upload_path / rand_name)

        # Image extensions handled directly by Tesseract
        extensions = {'.jpg', '.jpeg', '.png', '.tiff', '.tif'}
        text_parts = []
        print(f"Result path: {result_path}")

        if model_bis:
            model = f"{model}+{model_bis}"

        for f in uploaded_files:
            filename = secure_filename(f.filename)
            file_path = Path(filename)
            file_stem = file_path.stem
            file_extension = file_path.suffix.lower()
            print(f"\n=== Processing file: {filename} ===")
            print(f"Extension: {file_extension}")

            if file_extension == '.pdf':
                # NOTE(review): this directory is shared by every upload with
                # the same file name and is never removed - consider moving
                # it under result_path and cleaning it up.
                temp_dir = ensure_dir(upload_path / f"{file_stem}_temp")
                pdf_path = temp_dir / filename
                f.save(str(pdf_path))
                print(f"Saved PDF to: {pdf_path}")

                png_files = _pdf_to_page_images(pdf_path, temp_dir, file_stem)
                print(f"Found {len(png_files)} PNG files to process")

                for png_path in png_files:
                    print(f"\nProcessing PNG: {png_path}")
                    page_text = _run_tesseract(
                        model, png_path, png_path.with_suffix(''), png_path
                    )
                    text_parts.append(page_text)
                    text_parts.append('\n\n')

            elif file_extension in extensions:
                # Process single image
                image_path = result_path / filename
                output_base = result_path / file_stem
                f.save(str(image_path))
                print(f"Saved image to: {image_path}")

                print("Running Tesseract OCR...")
                image_text = _run_tesseract(model, image_path, output_base, filename)
                text_parts.append(image_text)
                text_parts.append('\n\n')

            else:
                raise Exception(f"Unsupported file extension: {file_extension}")

        return ''.join(text_parts)
    except Exception as e:
        # Log at this boundary, then let the caller decide how to respond.
        print(f"Error in tesseract_to_txt: {e}")
        raise