-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathTesseract.R
56 lines (37 loc) · 1.53 KB
/
Tesseract.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
## OCR is finicky for tables because of their sparse text distribution
## tutorial https://cran.r-project.org/web/packages/tesseract/vignettes/intro.html
library(tesseract)
library(dplyr)
library(magick)
# Image to recognise: a remote PNG whose URL is handed straight to ocr().
img <- "https://gallery.mailchimp.com/89e5755d2cca4840b1af93176/images/62e6af37-9999-49f3-b6ef-30b56ede0459.png"

# First pass: OCR the untouched image with the French ("fra") engine and
# print the recognised text.
# NOTE(review): assumes the "fra" training data is already installed -- confirm.
EbolaText <- img %>%
  ocr(engine = tesseract("fra"))
cat(EbolaText)
# Check out all of the parameters the Tesseract engine exposes.
# View() opens a spreadsheet-style viewer, so only call it from an interactive
# session; in a batch run (e.g. Rscript) it is skipped so the script can
# continue to the OCR examples below.
if (interactive()) {
  tesseract_params() %>% View()
}
# Preprocess your image, then OCR it with table detection enabled.
# Both examples below (full image, then a manual crop) run the exact same
# pipeline, so it lives in two small helpers instead of being pasted twice.

# Read an image from `path` (file path or URL) and clean it up for OCR:
# resize with geometry "2000x", convert to grayscale, and trim the border
# with a fuzz of 40. Returns the processed magick image.
preprocess_for_ocr <- function(path) {
  path %>%
    image_read() %>%
    image_resize("2000x") %>%
    image_convert(type = "Grayscale") %>%
    image_trim(fuzz = 40)
}

# OCR `image` with the French ("fra") engine, setting the engine parameters
# that ask Tesseract to look for tables. Returns the recognised text.
# NOTE(review): tessedit_pageseg_mode is usually given as a numeric mode;
# confirm that the string "auto" is actually honoured by the engine.
ocr_french_tables <- function(image) {
  table_engine <- tesseract(
    "fra",
    options = list(
      tessedit_pageseg_mode = "auto",
      textord_tabfind_find_tables = "1",
      textord_tablefind_recognize_tables = "1"
    )
  )
  ocr(image, engine = table_engine)
}

# Full image: reuses the `img` defined earlier in the script.
imgp <- preprocess_for_ocr(img)
EbolaText <- ocr_french_tables(imgp)
cat(EbolaText)

## with crop
library(magick)
library(tesseract)
# Same pipeline on a locally stored, pre-cropped version of the image.
img <- "./examplePDFs/EbolaCropHZ.png"
imgp <- preprocess_for_ocr(img)
EbolaText <- ocr_french_tables(imgp)
cat(EbolaText)