Skip to content

Commit

Permalink
Add ability to get HOCR data
Browse files Browse the repository at this point in the history
Will do the same in leptess this afternoon
  • Loading branch information
ccouzens committed Jan 31, 2021
1 parent 34e574e commit f81bbc9
Show file tree
Hide file tree
Showing 5 changed files with 113 additions and 3 deletions.
2 changes: 1 addition & 1 deletion Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[package]
name = "tesseract"
version = "0.6.2"
version = "0.7.0"
authors = ["Kevin Kwok <antimatter15@gmail.com>", "Chris Couzens <ccouzens@gmail.com>"]
documentation = "https://docs.rs/tesseract"
repository = "https://github.com/antimatter15/tesseract-rs"
Expand Down
61 changes: 61 additions & 0 deletions img.html
Original file line number Diff line number Diff line change
@@ -0,0 +1,61 @@
<div class='ocr_page' id='page_1' title='image ""; bbox 0 0 2256 324; ppageno 0'>
<div class='ocr_carea' id='block_1_1' title="bbox 0 17 2206 314">
<p class='ocr_par' id='par_1_1' lang='eng' title="bbox 0 17 2206 314">
<span class='ocr_line' id='line_1_1' title="bbox 5 17 2202 70; baseline 0 -10; x_size 53; x_descenders 10; x_ascenders 13">
<span class='ocrx_word' id='word_1_1' title='bbox 5 19 237 60; x_wconf 96'>Hundreds</span>
<span class='ocrx_word' id='word_1_2' title='bbox 256 19 306 60; x_wconf 96'>of</span>
<span class='ocrx_word' id='word_1_3' title='bbox 320 17 581 70; x_wconf 96'>companies</span>
<span class='ocrx_word' id='word_1_4' title='bbox 599 19 768 60; x_wconf 96'>around</span>
<span class='ocrx_word' id='word_1_5' title='bbox 787 19 866 60; x_wconf 96'>the</span>
<span class='ocrx_word' id='word_1_6' title='bbox 884 19 1016 60; x_wconf 95'>world</span>
<span class='ocrx_word' id='word_1_7' title='bbox 1037 30 1111 60; x_wconf 96'>are</span>
<span class='ocrx_word' id='word_1_8' title='bbox 1133 17 1259 70; x_wconf 96'>using</span>
<span class='ocrx_word' id='word_1_9' title='bbox 1278 22 1381 60; x_wconf 96'>Rust</span>
<span class='ocrx_word' id='word_1_10' title='bbox 1400 17 1438 59; x_wconf 96'>in</span>
<span class='ocrx_word' id='word_1_11' title='bbox 1462 17 1724 70; x_wconf 96'>production</span>
<span class='ocrx_word' id='word_1_12' title='bbox 1743 19 1880 70; x_wconf 96'>today</span>
<span class='ocrx_word' id='word_1_13' title='bbox 1896 19 1964 60; x_wconf 96'>for</span>
<span class='ocrx_word' id='word_1_14' title='bbox 1980 19 2081 68; x_wconf 93'>fast,</span>
<span class='ocrx_word' id='word_1_15' title='bbox 2103 19 2202 60; x_wconf 90'>low-</span>
</span>
<span class='ocr_line' id='line_1_2' title="bbox 5 98 2206 151; baseline -0 -10; x_size 51; x_descenders 9; x_ascenders 12">
<span class='ocrx_word' id='word_1_16' title='bbox 5 111 223 149; x_wconf 96'>resource,</span>
<span class='ocrx_word' id='word_1_17' title='bbox 243 100 598 151; x_wconf 95'>cross-platform</span>
<span class='ocrx_word' id='word_1_18' title='bbox 619 98 855 141; x_wconf 96'>solutions.</span>
<span class='ocrx_word' id='word_1_19' title='bbox 874 100 1086 141; x_wconf 96'>Software</span>
<span class='ocrx_word' id='word_1_20' title='bbox 1104 111 1187 151; x_wconf 96'>you</span>
<span class='ocrx_word' id='word_1_21' title='bbox 1211 100 1335 141; x_wconf 96'>know</span>
<span class='ocrx_word' id='word_1_22' title='bbox 1352 100 1438 141; x_wconf 94'>and</span>
<span class='ocrx_word' id='word_1_23' title='bbox 1462 100 1569 149; x_wconf 96'>love,</span>
<span class='ocrx_word' id='word_1_24' title='bbox 1591 98 1671 141; x_wconf 96'>like</span>
<span class='ocrx_word' id='word_1_25' title='bbox 1694 98 1866 149; x_wconf 96'>Firefox,</span>
<span class='ocrx_word' id='word_1_26' title='bbox 1889 100 2101 151; x_wconf 96'>Dropbox,</span>
<span class='ocrx_word' id='word_1_27' title='bbox 2120 100 2206 141; x_wconf 96'>and</span>
</span>
<span class='ocr_line' id='line_1_3' title="bbox 3 176 2095 233; baseline 0 -12; x_size 53; x_descenders 11; x_ascenders 11">
<span class='ocrx_word' id='word_1_28' title='bbox 3 181 261 230; x_wconf 92'>Cloudflare,</span>
<span class='ocrx_word' id='word_1_29' title='bbox 283 192 387 222; x_wconf 96'>uses</span>
<span class='ocrx_word' id='word_1_30' title='bbox 408 184 522 222; x_wconf 96'>Rust.</span>
<span class='ocrx_word' id='word_1_31' title='bbox 542 184 660 222; x_wconf 96'>From</span>
<span class='ocrx_word' id='word_1_32' title='bbox 674 185 884 233; x_wconf 95'>startups</span>
<span class='ocrx_word' id='word_1_33' title='bbox 896 185 946 222; x_wconf 95'>to</span>
<span class='ocrx_word' id='word_1_34' title='bbox 962 180 1086 233; x_wconf 95'>large</span>
<span class='ocrx_word' id='word_1_35' title='bbox 1100 176 1430 233; x_wconf 96'>corporations,</span>
<span class='ocrx_word' id='word_1_36' title='bbox 1443 180 1559 222; x_wconf 96'>from</span>
<span class='ocrx_word' id='word_1_37' title='bbox 1574 180 1834 222; x_wconf 96'>embedded</span>
<span class='ocrx_word' id='word_1_38' title='bbox 1850 176 2033 222; x_wconf 96'>devices</span>
<span class='ocrx_word' id='word_1_39' title='bbox 2045 185 2095 222; x_wconf 96'>to</span>
</span>
<span class='ocr_line' id='line_1_4' title="bbox 0 257 987 314; baseline 0 -11; x_size 57; x_descenders 11; x_ascenders 15">
<span class='ocrx_word' id='word_1_40' title='bbox 0 261 204 303; x_wconf 96'>scalable</span>
<span class='ocrx_word' id='word_1_41' title='bbox 217 261 318 303; x_wconf 96'>web</span>
<span class='ocrx_word' id='word_1_42' title='bbox 331 257 546 312; x_wconf 95'>services,</span>
<span class='ocrx_word' id='word_1_43' title='bbox 562 264 671 303; x_wconf 94'>Rust</span>
<span class='ocrx_word' id='word_1_44' title='bbox 684 257 723 303; x_wconf 94'>is</span>
<span class='ocrx_word' id='word_1_45' title='bbox 736 272 763 303; x_wconf 96'>a</span>
<span class='ocrx_word' id='word_1_46' title='bbox 777 266 910 314; x_wconf 88'>great</span>
<span class='ocrx_word' id='word_1_47' title='bbox 921 261 987 303; x_wconf 95'>fit.</span>
</span>
</p>
</div>
</div>
25 changes: 25 additions & 0 deletions src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@ extern crate thiserror;
use self::thiserror::Error;
use std::ffi::CString;
use std::ffi::NulError;
use std::os::raw::c_int;
use std::str;

pub mod plumbing;
Expand Down Expand Up @@ -41,6 +42,8 @@ pub enum TesseractError {
RecognizeError(#[from] plumbing::TessBaseAPIRecogniseError),
#[error("Errored whilst getting text")]
GetTextError(#[from] plumbing::TessBaseAPIGetUTF8TextError),
#[error("Errored whilst getting HOCR text")]
GetHOCRTextError(#[from] plumbing::TessBaseAPIGetHOCRTextError),
#[error("Errored whilst setting frame")]
SetFrameError(#[from] plumbing::TessBaseAPISetImageSafetyError),
#[error("Errored whilst setting image from mem")]
Expand Down Expand Up @@ -111,6 +114,21 @@ impl Tesseract {
.to_string_lossy()
.into_owned())
}

/// Return the OCR result as hOCR: HTML markup whose elements carry
/// bounding-box annotations.
///
/// See [img.html](../img.html) for an example.
///
/// `page` is handed unchanged to the plumbing layer's `get_hocr_text`.
///
/// # Errors
///
/// Propagates [`plumbing::TessBaseAPIGetHOCRTextError`] from the plumbing call.
pub fn get_hocr_text(
    &mut self,
    page: c_int,
) -> Result<String, plumbing::TessBaseAPIGetHOCRTextError> {
    let hocr = self.0.get_hocr_text(page)?;
    // Lossy conversion never fails; presumably invalid UTF-8 becomes U+FFFD
    // (assuming `as_ref` yields a `CStr` -- confirm against `TesseractText`).
    let owned = hocr.as_ref().to_string_lossy().into_owned();
    Ok(owned)
}
}

pub fn ocr(filename: &str, language: &str) -> Result<String, TesseractError> {
Expand Down Expand Up @@ -170,3 +188,10 @@ fn expanded_test() -> Result<(), TesseractError> {
assert_eq!(&cube.get_text()?, include_str!("../img.txt"));
Ok(())
}

/// Checks that hOCR output for `img.png` matches the stored `img.html` fixture.
#[test]
fn hocr_test() -> Result<(), TesseractError> {
    // Fixture is embedded at compile time, relative to this source file.
    let expected = include_str!("../img.html");
    let mut cube = Tesseract::new(None, Some("eng"))?.set_image("img.png")?;
    let actual = cube.get_hocr_text(0)?;
    assert_eq!(&actual, expected);
    Ok(())
}
1 change: 1 addition & 0 deletions src/plumbing/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@ pub use self::pix::Pix;
pub use self::pix::PixReadError;
pub use self::pix::PixReadMemError;
pub use self::tess_base_api::TessBaseAPI;
pub use self::tess_base_api::TessBaseAPIGetHOCRTextError;
pub use self::tess_base_api::TessBaseAPIGetUTF8TextError;
pub use self::tess_base_api::TessBaseAPIInitError;
pub use self::tess_base_api::TessBaseAPIRecogniseError;
Expand Down
27 changes: 25 additions & 2 deletions src/plumbing/tess_base_api.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,8 @@ extern crate tesseract_sys;
extern crate thiserror;

use self::tesseract_sys::{
TessBaseAPICreate, TessBaseAPIDelete, TessBaseAPIGetUTF8Text, TessBaseAPIInit3,
TessBaseAPIRecognize, TessBaseAPISetImage, TessBaseAPISetImage2,
TessBaseAPICreate, TessBaseAPIDelete, TessBaseAPIGetHOCRText, TessBaseAPIGetUTF8Text,
TessBaseAPIInit3, TessBaseAPIRecognize, TessBaseAPISetImage, TessBaseAPISetImage2,
TessBaseAPISetSourceResolution, TessBaseAPISetVariable,
};
use self::thiserror::Error;
Expand Down Expand Up @@ -40,6 +40,10 @@ pub struct TessBaseAPISetVariableError();
#[error("TessBaseApi failed to recognize")]
pub struct TessBaseAPIRecogniseError();

/// Error returned by `TessBaseAPI::get_hocr_text` when the underlying
/// `TessBaseAPIGetHOCRText` call hands back a null pointer.
#[derive(Debug, Error)]
#[error("TessBaseApi get_hocr_text returned null")]
pub struct TessBaseAPIGetHOCRTextError();

/// Error signalling that Tesseract's UTF-8 text getter returned a null
/// pointer (see the error message below).
#[derive(Debug, Error)]
#[error("TessBaseApi get_utf8_text returned null")]
pub struct TessBaseAPIGetUTF8TextError();
Expand Down Expand Up @@ -175,6 +179,25 @@ impl TessBaseAPI {
Ok(unsafe { TesseractText::new(ptr) })
}
}

/// Wrapper for [`GetHOCRText`](https://tesseract-ocr.github.io/tessapi/5.x/a02438.html)
///
/// Get the recognised text as hOCR, i.e. HTML annotated with bounding boxes.
///
/// `page` is forwarded unchanged to `TessBaseAPIGetHOCRText`.
///
/// This will implicitly call `recognize` if required.
///
/// # Errors
///
/// Returns [`TessBaseAPIGetHOCRTextError`] when Tesseract hands back a null
/// pointer. It is not clear what would cause this.
pub fn get_hocr_text(
    &mut self,
    page: c_int,
) -> Result<TesseractText, TessBaseAPIGetHOCRTextError> {
    // SAFETY: assumes `self.0` is a live handle from `TessBaseAPICreate`;
    // the constructor is not visible in this hunk -- confirm.
    let ptr = unsafe { TessBaseAPIGetHOCRText(self.0, page) };
    if ptr.is_null() {
        return Err(TessBaseAPIGetHOCRTextError {});
    }
    // SAFETY: `ptr` was checked to be non-null above. Presumably
    // `TesseractText` takes ownership of the buffer -- confirm.
    Ok(unsafe { TesseractText::new(ptr) })
}
}

#[test]
Expand Down

0 comments on commit f81bbc9

Please sign in to comment.