From 28748a38f7f2e27fd04d9bbe68a2b3b03ce2287c Mon Sep 17 00:00:00 2001 From: Benjamin Minixhofer Date: Sat, 24 Apr 2021 14:44:32 +0200 Subject: [PATCH 01/15] make tags, chunks optional, preliminary separation of tagger --- nlprule/src/lib.rs | 2 + nlprule/src/rule/disambiguation.rs | 42 +++--- nlprule/src/rule/engine/composition.rs | 142 ++++++++++-------- nlprule/src/rule/engine/mod.rs | 47 +++--- nlprule/src/rule/mod.rs | 194 ++++++++++++++----------- nlprule/src/rules.rs | 33 +++-- nlprule/src/tokenizer.rs | 63 ++++---- nlprule/src/tokenizer/chunk.rs | 18 ++- nlprule/src/tokenizer/multiword.rs | 6 +- nlprule/src/tokenizer/tag.rs | 43 ++++++ nlprule/src/types.rs | 98 +++++++++---- nlprule/tests/tests.rs | 32 ++-- python/src/lib.rs | 14 +- 13 files changed, 448 insertions(+), 286 deletions(-) diff --git a/nlprule/src/lib.rs b/nlprule/src/lib.rs index 585c591..cdb88ba 100644 --- a/nlprule/src/lib.rs +++ b/nlprule/src/lib.rs @@ -93,6 +93,8 @@ pub enum Error { Serialization(#[from] bincode::Error), #[error(transparent)] IdError(#[from] rule::id::Error), + #[error("unset token property: {0}")] + Unset(&'static str), } /// Gets the canonical filename for the tokenizer binary for a language code in ISO 639-1 (two-letter) format. diff --git a/nlprule/src/rule/disambiguation.rs b/nlprule/src/rule/disambiguation.rs index 207c289..ace067a 100644 --- a/nlprule/src/rule/disambiguation.rs +++ b/nlprule/src/rule/disambiguation.rs @@ -52,21 +52,21 @@ pub enum Disambiguation { } impl Disambiguation { - pub fn apply<'t>(&'t self, groups: Vec>>) { + pub fn apply<'t>(&'t self, groups: Vec>>) -> Result<(), crate::Error> { match self { Disambiguation::Remove(data_or_filters) => { for (group, data_or_filter) in groups.into_iter().zip(data_or_filters) { for token in group.into_iter() { match data_or_filter { either::Left(data) => { - token.tags_mut().retain(|x| { + token.tags_mut()?.retain(|x| { !(x.pos() == data.pos() && (data.lemma().as_str().is_empty() || x.lemma() == data.lemma())) }); } either::Right(filter) => { - filter.remove(token.tags_mut()); + filter.remove(token.tags_mut()?); } } } @@ -79,7 +79,7 @@ impl Disambiguation { either::Left(limit) => { for token in group.into_iter() { let last = token - .tags() + .tags()? .iter() .next() .and_then(|x| { @@ -91,18 +91,18 @@ impl Disambiguation { }) .unwrap_or_else(|| token.text().clone()); - token.tags_mut().retain(|x| x.pos() == limit.pos()); + token.tags_mut()?.retain(|x| x.pos() == limit.pos()); - if token.tags().is_empty() { + if token.tags()?.is_empty() { if *retain_last { token - .tags_mut() + .tags_mut()? .push(WordData::new(last, limit.pos().clone())); } else { let lemma = token.text().clone(); token - .tags_mut() + .tags_mut()? 
.push(WordData::new(lemma, limit.pos().clone())); } } @@ -110,7 +110,7 @@ impl Disambiguation { } either::Right(filter) => { for token in group.into_iter() { - filter.keep(token.tags_mut()); + filter.keep(token.tags_mut()?); } } } @@ -129,8 +129,8 @@ impl Disambiguation { data.pos().clone(), ); - token.tags_mut().push(data); - token.tags_mut().retain(|x| !x.pos().as_str().is_empty()); + token.tags_mut()?.push(data); + token.tags_mut()?.retain(|x| !x.pos().as_str().is_empty()); } } } @@ -146,8 +146,8 @@ impl Disambiguation { data.pos().clone(), ); - token.tags_mut().clear(); - token.tags_mut().push(data); + token.tags_mut()?.clear(); + token.tags_mut()?.push(data); } } } @@ -160,14 +160,14 @@ impl Disambiguation { for token in group.iter() { if *use_mask_val { for (mask_val, filter) in filter_mask.iter_mut().zip(filters.iter()) { - *mask_val = *mask_val && PosFilter::and(filter, token.tags()); + *mask_val = *mask_val && PosFilter::and(filter, token.tags()?); } } } } if !filter_mask.iter().any(|x| *x) { - return; + return Ok(()); } let to_apply: Vec<_> = filter_mask @@ -188,16 +188,16 @@ impl Disambiguation { { if *use_mask_val { for token in group.into_iter() { - let before = token.tags().clone(); + let before = token.tags()?.clone(); - PosFilter::apply(&to_apply, token.tags_mut()); + PosFilter::apply(&to_apply, token.tags_mut()?); if let Some(disambig) = disambig { - disambig.keep(token.tags_mut()); + disambig.keep(token.tags_mut()?); } - if token.tags().is_empty() { - *token.tags_mut() = before; + if token.tags()?.is_empty() { + *token.tags_mut()? = before; } } } @@ -205,6 +205,8 @@ impl Disambiguation { } Disambiguation::Nop => {} } + + Ok(()) } } diff --git a/nlprule/src/rule/engine/composition.rs b/nlprule/src/rule/engine/composition.rs index 6e042de..11f70df 100644 --- a/nlprule/src/rule/engine/composition.rs +++ b/nlprule/src/rule/engine/composition.rs @@ -161,7 +161,7 @@ pub struct Quantifier { #[enum_dispatch] pub trait Atomable: Send + Sync { - fn is_match(&self, context: Context, position: usize) -> bool; + fn is_match(&self, context: Context, position: usize) -> Result; } #[enum_dispatch(Atomable)] @@ -189,11 +189,12 @@ pub mod concrete { } impl Atomable for TextAtom { - fn is_match(&self, context: Context, position: usize) -> bool { + fn is_match(&self, context: Context, position: usize) -> Result { let (sentence, _) = context; - self.matcher - .is_match(&sentence.index(position).text(), Some(context), None) + Ok(self + .matcher + .is_match(&sentence.index(position).text(), Some(context), None)) } } @@ -203,11 +204,14 @@ pub mod concrete { } impl Atomable for ChunkAtom { - fn is_match(&self, context: Context, position: usize) -> bool { + fn is_match(&self, context: Context, position: usize) -> Result { let (sentence, _) = context; - self.matcher - .is_slice_match(&sentence.index(position).chunks(), Some(context), None) + Ok(self.matcher.is_slice_match( + &sentence.index(position).chunks()?, + Some(context), + None, + )) } } @@ -217,10 +221,10 @@ pub mod concrete { } impl Atomable for SpaceBeforeAtom { - fn is_match(&self, context: Context, position: usize) -> bool { + fn is_match(&self, context: Context, position: usize) -> Result { let (sentence, _) = context; - sentence.index(position).has_space_before() == self.value + Ok(sentence.index(position).has_space_before() == self.value) } } @@ -231,12 +235,13 @@ pub mod concrete { } impl Atomable for WordDataAtom { - fn is_match(&self, context: Context, position: usize) -> bool { + fn is_match(&self, context: Context, position: 
usize) -> Result { let (sentence, _) = context; - let tags = sentence.index(position).tags().iter(); + let tags = sentence.index(position).tags()?.iter(); - self.matcher - .is_match(tags, Some(context), Some(self.case_sensitive)) + Ok(self + .matcher + .is_match(tags, Some(context), Some(self.case_sensitive))) } } } @@ -245,8 +250,8 @@ pub mod concrete { pub struct TrueAtom {} impl Atomable for TrueAtom { - fn is_match(&self, _context: Context, _position: usize) -> bool { - true + fn is_match(&self, _context: Context, _position: usize) -> Result { + Ok(true) } } @@ -254,8 +259,8 @@ impl Atomable for TrueAtom { pub struct FalseAtom {} impl Atomable for FalseAtom { - fn is_match(&self, _context: Context, _position: usize) -> bool { - false + fn is_match(&self, _context: Context, _position: usize) -> Result { + Ok(false) } } @@ -265,8 +270,14 @@ pub struct AndAtom { } impl Atomable for AndAtom { - fn is_match(&self, context: Context, position: usize) -> bool { - self.atoms.iter().all(|x| x.is_match(context, position)) + fn is_match(&self, context: Context, position: usize) -> Result { + for atom in &self.atoms { + if !atom.is_match(context, position)? { + return Ok(false); + } + } + + Ok(true) } } @@ -276,8 +287,14 @@ pub struct OrAtom { } impl Atomable for OrAtom { - fn is_match(&self, context: Context, position: usize) -> bool { - self.atoms.iter().any(|x| x.is_match(context, position)) + fn is_match(&self, context: Context, position: usize) -> Result { + for atom in &self.atoms { + if atom.is_match(context, position)? { + return Ok(true); + } + } + + Ok(false) } } @@ -287,8 +304,8 @@ pub struct NotAtom { } impl Atomable for NotAtom { - fn is_match(&self, context: Context, position: usize) -> bool { - !self.atom.is_match(context, position) + fn is_match(&self, context: Context, position: usize) -> Result { + Ok(!self.atom.is_match(context, position)?) } } @@ -299,15 +316,17 @@ pub struct OffsetAtom { } impl Atomable for OffsetAtom { - fn is_match(&self, context: Context, position: usize) -> bool { + fn is_match(&self, context: Context, position: usize) -> Result { let (sentence, _) = context; let new_position = position as isize + self.offset; - if new_position < 0 || (new_position as usize) >= sentence.len() { - false - } else { - self.atom.is_match(context, new_position as usize) - } + Ok( + if new_position < 0 || (new_position as usize) >= sentence.len() { + false + } else { + self.atom.is_match(context, new_position as usize)? + }, + ) } } @@ -357,20 +376,6 @@ impl GraphId { } } -lazy_static! { - static ref SENT_START: Token<'static> = Token::new( - WordId::empty(), - Tags::new(vec![WordData::new( - WordId::empty(), - PosId::special(SpecialPos::SentStart), - )],), - Span::default(), - false, - false, - Vec::new(), - ); -} - #[derive(Debug, Clone, PartialEq)] pub struct MatchSentence<'t> { sentence: &'t Sentence<'t>, @@ -383,7 +388,7 @@ impl<'t> MatchSentence<'t> { pub fn index(&self, index: usize) -> &Token { match index { - 0 => &*SENT_START, + 0 => &*crate::types::SENT_START, i => &self.sentence.tokens()[i - 1], } } @@ -518,7 +523,12 @@ pub struct Composition { } impl Composition { - fn next_can_match(&self, context: Context, position: usize, index: usize) -> bool { + fn next_can_match( + &self, + context: Context, + position: usize, + index: usize, + ) -> Result { let next_required_pos = match self.parts[index + 1..] 
.iter() .position(|x| x.quantifier.min > 0) @@ -527,9 +537,13 @@ impl Composition { None => self.parts.len(), }; - self.parts[index + 1..next_required_pos] - .iter() - .any(|x| x.atom.is_match(context, position)) + for part in &self.parts[index + 1..next_required_pos] { + if part.atom.is_match(context, position)? { + return Ok(true); + } + } + + Ok(false) } fn apply_recursive<'t>( @@ -538,7 +552,7 @@ impl Composition { mut position: usize, mut cur_atom_idx: usize, mut graph: MatchGraph<'t>, - ) -> Option> { + ) -> Result>, crate::Error> { let mut cur_count = 0; let is_match = loop { if cur_atom_idx >= self.parts.len() { @@ -561,21 +575,23 @@ impl Composition { } if cur_count >= part.quantifier.min && cur_atom_idx + 1 < self.parts.len() { - if !part.greedy && self.next_can_match((sentence, &graph), position, cur_atom_idx) { + if !part.greedy + && self.next_can_match((sentence, &graph), position, cur_atom_idx)? + { cur_atom_idx += 1; cur_count = 0; continue; } if part.greedy { if let Some(graph) = - self.apply_recursive(sentence, position, cur_atom_idx + 1, graph.clone()) + self.apply_recursive(sentence, position, cur_atom_idx + 1, graph.clone())? { - return Some(graph); + return Ok(Some(graph)); } } } - if part.atom.is_match((sentence, &graph), position) { + if part.atom.is_match((sentence, &graph), position)? { let group = &mut graph.groups[cur_atom_idx + 1]; // set the group beginning if the char end was zero (i. e. the group was empty) @@ -599,19 +615,21 @@ impl Composition { cur_atom_idx += 1; } - if is_match || cur_atom_idx == self.parts.len() || self.can_stop_mask[cur_atom_idx] { - graph.fill_empty(sentence); - Some(graph) - } else { - None - } + Ok( + if is_match || cur_atom_idx == self.parts.len() || self.can_stop_mask[cur_atom_idx] { + graph.fill_empty(sentence); + Some(graph) + } else { + None + }, + ) } pub fn apply<'t>( &'t self, sentence: &'t MatchSentence, start: usize, - ) -> Option> { + ) -> Result>, crate::Error> { // this path is extremely hot so more optimizations are done // the first matcher can never rely on the match graph, so we use an empty default graph for the first match @@ -623,9 +641,9 @@ impl Composition { if self.parts[0].quantifier.min > 0 && !self.parts[0] .atom - .is_match((sentence, &DEFAULT_GRAPH), start) + .is_match((sentence, &DEFAULT_GRAPH), start)? { - return None; + return Ok(None); } let position = start; diff --git a/nlprule/src/rule/engine/mod.rs b/nlprule/src/rule/engine/mod.rs index 22ec069..6a9a98d 100644 --- a/nlprule/src/rule/engine/mod.rs +++ b/nlprule/src/rule/engine/mod.rs @@ -14,14 +14,18 @@ pub struct TokenEngine { } impl TokenEngine { - fn get_match<'t>(&'t self, sentence: &'t MatchSentence, i: usize) -> Option> { - if let Some(graph) = self.composition.apply(sentence, i) { + fn get_match<'t>( + &'t self, + sentence: &'t MatchSentence, + i: usize, + ) -> Result>, crate::Error> { + if let Some(graph) = self.composition.apply(sentence, i)? { let mut blocked = false; // TODO: cache / move to outer loop for i in 0..sentence.len() { for antipattern in &self.antipatterns { - if let Some(anti_graph) = antipattern.apply(sentence, i) { + if let Some(anti_graph) = antipattern.apply(sentence, i)? 
{ let anti_start = anti_graph.by_index(0).span.char().start; let anti_end = anti_graph .by_index(anti_graph.groups().len() - 1) @@ -44,11 +48,11 @@ impl TokenEngine { } if !blocked { - return Some(graph); + return Ok(Some(graph)); } } - None + Ok(None) } } @@ -84,7 +88,7 @@ pub struct EngineMatches<'a, 't> { } impl<'a, 't> Iterator for EngineMatches<'a, 't> { - type Item = MatchGraph<'t>; + type Item = Result, crate::Error>; fn next(&mut self) -> Option { let sentence = self.sentence; @@ -93,22 +97,25 @@ impl<'a, 't> Iterator for EngineMatches<'a, 't> { match &mut self.inner { InnerMatches::Token(inner) => (inner.index..sentence.len()).find_map(|i| { - inner.engine.get_match(sentence, i).and_then(|graph| { - let start_group = graph.by_id(start_id); - let end_group = graph.by_id(end_id); + match inner.engine.get_match(sentence, i) { + Ok(graph) => graph.and_then(|graph| { + let start_group = graph.by_id(start_id); + let end_group = graph.by_id(end_id); - let start = start_group.span.char().start - sentence.span().char().start; - let end = end_group.span.char().end - sentence.span().char().start; + let start = start_group.span.char().start - sentence.span().char().start; + let end = end_group.span.char().end - sentence.span().char().start; - if inner.mask[start..end].iter().all(|x| !x) { - inner.mask[start..end].iter_mut().for_each(|x| *x = true); + if inner.mask[start..end].iter().all(|x| !x) { + inner.mask[start..end].iter_mut().for_each(|x| *x = true); - inner.index += 1; - Some(graph) - } else { - None - } - }) + inner.index += 1; + Some(Ok(graph)) + } else { + None + } + }), + Err(err) => Some(Err(err)), + } }), InnerMatches::Text(inner) => inner.captures.next().map(|captures| { let bi_to_ci = &inner.byte_idx_to_char_idx; @@ -134,7 +141,7 @@ impl<'a, 't> Iterator for EngineMatches<'a, 't> { } } - MatchGraph::new(groups, inner.id_to_idx) + Ok(MatchGraph::new(groups, inner.id_to_idx)) }), } } diff --git a/nlprule/src/rule/mod.rs b/nlprule/src/rule/mod.rs index 5be2342..cec5a1f 100644 --- a/nlprule/src/rule/mod.rs +++ b/nlprule/src/rule/mod.rs @@ -38,7 +38,7 @@ pub(crate) struct Unification { } impl Unification { - pub fn keep(&self, graph: &MatchGraph, sentence: &MatchSentence) -> bool { + pub fn keep(&self, graph: &MatchGraph, sentence: &MatchSentence) -> Result { let filters: Vec<_> = self.filters.iter().multi_cartesian_product().collect(); let mut filter_mask: Vec<_> = filters.iter().map(|_| true).collect(); @@ -48,18 +48,14 @@ impl Unification { if maybe_mask_val.is_some() { for token in group.tokens(sentence) { for (mask_val, filter) in filter_mask.iter_mut().zip(filters.iter()) { - *mask_val = *mask_val && PosFilter::and(filter, token.tags()); + *mask_val = *mask_val && PosFilter::and(filter, token.tags()?); } } } } let result = filter_mask.iter().any(|x| *x); - if negate { - !result - } else { - result - } + Ok(if negate { !result } else { result }) } } @@ -131,16 +127,21 @@ impl DisambiguationRule { &self.id } - pub(crate) fn apply<'t>(&'t self, sentence: &MatchSentence<'t>) -> Changes { + pub(crate) fn apply<'t>( + &'t self, + sentence: &MatchSentence<'t>, + ) -> Result { if matches!(self.disambiguations, disambiguation::Disambiguation::Nop) { - return Changes::default(); + return Ok(Changes::default()); } let mut all_spans = Vec::new(); for graph in self.engine.get_matches(sentence, self.start, self.end) { + let graph = graph?; + if let Some(unification) = &self.unification { - if !unification.keep(&graph, sentence) { + if !unification.keep(&graph, sentence)? 
{ continue; } } @@ -165,10 +166,14 @@ impl DisambiguationRule { all_spans.push(spans); } - Changes(all_spans) + Ok(Changes(all_spans)) } - pub(crate) fn change<'t>(&'t self, sentence: &mut Sentence<'t>, changes: Changes) { + pub(crate) fn change<'t>( + &'t self, + sentence: &mut Sentence<'t>, + changes: Changes, + ) -> Result<(), crate::Error> { log::info!("applying {}", self.id); for spans in changes.0 { @@ -185,8 +190,10 @@ impl DisambiguationRule { groups.push(group); } - self.disambiguations.apply(groups); + self.disambiguations.apply(groups)?; } + + Ok(()) } /// Often there are examples associated with a rule. @@ -201,12 +208,14 @@ impl DisambiguationRule { }; // by convention examples are always considered as one sentence even if the sentencizer would split - let sentence_before = tokenizer.disambiguate_up_to_id( - tokenizer - .tokenize(text) - .expect("test text must not be empty"), - Some(&self.id), - ); + let sentence_before = tokenizer + .disambiguate_up_to_id( + tokenizer + .tokenize(text) + .expect("test text must not be empty"), + Some(&self.id), + ) + .unwrap(); // shift the sentence to the right before matching to make sure // nothing assumes the sentene starts from absolute index zero @@ -214,11 +223,12 @@ impl DisambiguationRule { let sentence_before_complete = sentence_before.clone().rshift(shift_delta); let changes = self .apply(&MatchSentence::new(&sentence_before_complete)) + .unwrap() .lshift(shift_delta); let mut sentence_after = sentence_before.clone(); if !changes.is_empty() { - self.change(&mut sentence_after, changes); + self.change(&mut sentence_after, changes).unwrap(); } info!("Tokens: {:#?}", sentence_before); @@ -238,7 +248,8 @@ impl DisambiguationRule { .find(|x| *x.span().char() == change.char_span) .unwrap(); - let unordered_tags = after.tags().iter().collect::>(); + let unordered_tags = + after.tags().unwrap().iter().collect::>(); let unordered_tags_change = change.after.iter().collect::>(); let pass = unordered_tags == unordered_tags_change; @@ -280,82 +291,92 @@ pub struct Suggestions<'a, 't> { sentence: &'t MatchSentence<'t>, } -impl<'a, 't> Iterator for Suggestions<'a, 't> { - type Item = Suggestion; - - fn next(&mut self) -> Option { - let rule = self.rule; - let sentence = self.sentence; - let (start, end) = (self.rule.start, self.rule.end); - - self.matches.find_map(|graph| { - if let Some(unification) = &rule.unification { - if !unification.keep(&graph, sentence) { - return None; - } +impl<'a, 't> Suggestions<'a, 't> { + fn suggest_from_graph( + graph: Result, + rule: &'a Rule, + sentence: &'t MatchSentence<'t>, + ) -> Result, crate::Error> { + let graph = graph?; + + if let Some(unification) = &rule.unification { + if !unification.keep(&graph, sentence)? { + return Ok(None); } + } + + let start_group = graph.by_id(rule.start); + let end_group = graph.by_id(rule.end); - let start_group = graph.by_id(start); - let end_group = graph.by_id(end); + let replacements: Vec = rule + .suggesters + .iter() + .filter_map(|x| x.apply(sentence, &graph, rule.start, rule.end)) + .collect(); - let replacements: Vec = rule - .suggesters + let start = if replacements + .iter() + .all(|x| utils::no_space_chars().chars().any(|c| x.starts_with(c))) + { + let first_token = graph.groups()[graph.get_index(rule.start)..] 
.iter() - .filter_map(|x| x.apply(sentence, &graph, start, end)) - .collect(); + .find_map(|x| x.tokens(sentence).next()) + .unwrap(); - let start = if replacements + let idx = sentence .iter() - .all(|x| utils::no_space_chars().chars().any(|c| x.starts_with(c))) - { - let first_token = graph.groups()[graph.get_index(start)..] - .iter() - .find_map(|x| x.tokens(sentence).next()) - .unwrap(); - - let idx = sentence - .iter() - .position(|x| std::ptr::eq(x, first_token)) - .unwrap_or(0); - - if idx > 0 { - sentence.index(idx - 1).span().end() - } else { - start_group.span.start() - } + .position(|x| std::ptr::eq(x, first_token)) + .unwrap_or(0); + + if idx > 0 { + sentence.index(idx - 1).span().end() } else { start_group.span.start() - }; - let end = end_group.span.end(); - - // this should never happen, but just return None instead of raising an Error - // `end` COULD be equal to `start` if the suggestion is to insert text at this position - if end < start { - return None; } + } else { + start_group.span.start() + }; + let end = end_group.span.end(); + + // this should never happen, but just return None instead of raising an Error + // `end` COULD be equal to `start` if the suggestion is to insert text at this position + if end < start { + return Ok(None); + } - let text_before = sentence.slice(Span::from_positions(start, end)); + let text_before = sentence.slice(Span::from_positions(start, end)); + + // fix e. g. "Super , dass" + let replacements: Vec = replacements + .into_iter() + .filter(|suggestion| *suggestion != text_before) + .map(|x| utils::fix_nospace_chars(&x)) + .collect(); + + Ok(if !replacements.is_empty() { + Some(Suggestion::new( + rule.id.to_string(), + rule.message + .apply(sentence, &graph, rule.start, rule.end) + .expect("Rules must have a message."), + Span::from_positions(start, end), + replacements, + )) + } else { + None + }) + } +} - // fix e. g. "Super , dass" - let replacements: Vec = replacements - .into_iter() - .filter(|suggestion| *suggestion != text_before) - .map(|x| utils::fix_nospace_chars(&x)) - .collect(); +impl<'a, 't> Iterator for Suggestions<'a, 't> { + type Item = Result; - if !replacements.is_empty() { - Some(Suggestion::new( - rule.id.to_string(), - rule.message - .apply(sentence, &graph, rule.start, rule.end) - .expect("Rules must have a message."), - Span::from_positions(start, end), - replacements, - )) - } else { - None - } - }) + fn next(&mut self) -> Option { + let rule = self.rule; + let sentence = self.sentence; + + self.matches + .find_map(|graph| Suggestions::suggest_from_graph(graph, rule, sentence).transpose()) } } @@ -475,12 +496,13 @@ impl Rule { .tokenize(&test.text()) .expect("test text must not be empty."), ) + .unwrap() .rshift(shift_delta); info!("Sentence: {:#?}", sentence); let suggestions: Vec<_> = self .apply(&MatchSentence::new(&sentence)) - .map(|s| s.lshift(shift_delta)) + .map(|s| s.unwrap().lshift(shift_delta)) .collect(); let pass = if suggestions.len() > 1 { diff --git a/nlprule/src/rules.rs b/nlprule/src/rules.rs index dc924fc..684b8a9 100644 --- a/nlprule/src/rules.rs +++ b/nlprule/src/rules.rs @@ -89,7 +89,7 @@ impl Rules { } /// Compute the suggestions for the given sentence by checking all rules. 
- pub fn apply(&self, sentence: &Sentence) -> Vec { + pub fn apply(&self, sentence: &Sentence) -> Result, crate::Error> { let sentence = MatchSentence::new(sentence); let mut output: Vec<(usize, Suggestion)> = self @@ -101,11 +101,16 @@ impl Rules { let mut output = Vec::new(); for suggestion in rule.apply(&sentence) { - output.push((i, suggestion)); + match suggestion { + Ok(suggestion) => output.push((i, suggestion)), + Err(err) => return Err(err), + } } - output + Ok(output) }) + .collect::>, crate::Error>>()? + .into_iter() .flatten() .collect(); @@ -119,7 +124,7 @@ impl Rules { let mut mask = vec![false; sentence.text().chars().count()]; - output + Ok(output .into_iter() .filter_map(|(_, suggestion)| { let span = suggestion.span().clone().lshift(sentence.span().start()); @@ -131,29 +136,33 @@ impl Rules { None } }) - .collect() + .collect()) } /// Compute the suggestions for a text by checking all rules. - pub fn suggest(&self, text: &str, tokenizer: &Tokenizer) -> Vec { + pub fn suggest( + &self, + text: &str, + tokenizer: &Tokenizer, + ) -> Result, crate::Error> { if text.is_empty() { - return Vec::new(); + return Ok(Vec::new()); } let mut suggestions = Vec::new(); // get suggestions sentence by sentence for sentence in tokenizer.pipe(text) { - suggestions.extend(self.apply(&sentence)); + suggestions.extend(self.apply(&sentence?)?); } - suggestions + Ok(suggestions) } /// Correct a text by first tokenizing, then finding all suggestions and choosing the first replacement of each suggestion. - pub fn correct(&self, text: &str, tokenizer: &Tokenizer) -> String { - let suggestions = self.suggest(text, tokenizer); - apply_suggestions(text, &suggestions) + pub fn correct(&self, text: &str, tokenizer: &Tokenizer) -> Result { + let suggestions = self.suggest(text, tokenizer)?; + Ok(apply_suggestions(text, &suggestions)) } } diff --git a/nlprule/src/tokenizer.rs b/nlprule/src/tokenizer.rs index 6076195..8d61af8 100644 --- a/nlprule/src/tokenizer.rs +++ b/nlprule/src/tokenizer.rs @@ -139,7 +139,7 @@ pub struct SentenceIter<'t> { } impl<'t> Iterator for SentenceIter<'t> { - type Item = Sentence<'t>; + type Item = Result, crate::Error>; fn next(&mut self) -> Option { self.inner @@ -203,7 +203,7 @@ impl Tokenizer { &'t self, mut sentence: Sentence<'t>, id: Option<&Index>, - ) -> Sentence<'t> { + ) -> Result, crate::Error> { let n = id.map_or(self.rules.len(), |id| { self.rules.iter().position(|x| x.id == *id).unwrap() }); @@ -217,28 +217,38 @@ impl Tokenizer { .enumerate() .filter_map(|(j, rule)| { let changes = rule.apply(&match_sentence); - if changes.is_empty() { - None - } else { - Some((j + i, changes)) + + match changes { + Ok(changes) => { + if changes.is_empty() { + None + } else { + Some(Ok((j + i, changes))) + } + } + Err(err) => Some(Err(err)), } }) - .find_first(|_| true); + .find_first(|_| true) + .transpose()?; if let Some((index, changes)) = result { - self.rules[index].change(&mut sentence, changes); + self.rules[index].change(&mut sentence, changes)?; i = index + 1; } else { i = n; } } - sentence + Ok(sentence) } /// Apply rule-based disambiguation to the tokens. /// This does not change the number of tokens, but can change the content arbitrarily. 
- pub fn disambiguate<'t>(&'t self, sentence: Sentence<'t>) -> Sentence<'t> { + pub fn disambiguate<'t>( + &'t self, + sentence: Sentence<'t>, + ) -> Result, crate::Error> { self.disambiguate_up_to_id(sentence, None) } @@ -324,52 +334,29 @@ impl Tokenizer { let id = self.tagger.id_word(token_text.into()); - let mut tag_vec: Vec<_> = self - .tagger - .get_tags_with_options( - token_text, - if is_sentence_start { Some(true) } else { None }, - None, - ) - .collect(); - - tag_vec.push( - WordData::new( - self.tagger().id_word(token_text.into()), - PosId::special(SpecialPos::None), - ) - .freeze(), - ); - - if is_sentence_end { - tag_vec.push( - WordData::new(WordId::empty(), PosId::special(SpecialPos::SentEnd)) - .freeze(), - ); - } - Token::new( id, - Tags::new(tag_vec), Span::new( byte_start..byte_start + token_text.len(), char_start..char_start + token_text.chars().count(), ), + is_sentence_start, is_sentence_end, sentence[..byte_start].ends_with(char::is_whitespace), - Vec::new(), ) }) .collect(); let mut sentence = Sentence::new(tokens, sentence, &self.tagger); + self.tagger.apply(&mut sentence).unwrap(); + if let Some(chunker) = &self.chunker { - chunker.apply(&mut sentence); + chunker.apply(&mut sentence).unwrap(); } if let Some(multiword_tagger) = &self.multiword_tagger { - multiword_tagger.apply(&mut sentence); + multiword_tagger.apply(&mut sentence).unwrap(); } Some(sentence) diff --git a/nlprule/src/tokenizer/chunk.rs b/nlprule/src/tokenizer/chunk.rs index 40ae936..d4c818b 100644 --- a/nlprule/src/tokenizer/chunk.rs +++ b/nlprule/src/tokenizer/chunk.rs @@ -701,7 +701,9 @@ pub struct Chunker { impl Chunker { /// Populates the `.chunks` field of the passed tokens by predicting with the maximum entropy model. - pub fn apply(&self, sentence: &mut Sentence) { + pub fn apply(&self, sentence: &mut Sentence) -> Result<(), crate::Error> { + sentence.init_chunks(); + let text = sentence.text().replace('’', "\'"); let mut bi_to_ci: DefaultHashMap = text @@ -757,8 +759,12 @@ impl Chunker { let contains_nns = sentence .iter() .find(|token| *token.span().char() == char_span) - .map(|token| token.tags().iter().any(|tag| tag.pos().as_str() == "NNS")) - .unwrap_or(false); + .map(|token| { + token + .tags() + .map(|tags| tags.iter().any(|tag| tag.pos().as_str() == "NNS")) + }) + .unwrap_or(Ok(false))?; if contains_nns { number = "plural"; @@ -791,9 +797,13 @@ impl Chunker { for token in sentence.iter_mut() { for (chunk, (_, char_span)) in chunks.iter().zip(internal_chunks.iter()) { if char_span == token.span().char() { - *token.chunks_mut() = (*chunk).clone(); + *token + .chunks_mut() + .expect("chunks are initialized in chunker") = (*chunk).clone(); } } } + + Ok(()) } } diff --git a/nlprule/src/tokenizer/multiword.rs b/nlprule/src/tokenizer/multiword.rs index 9af2ca7..071bb7d 100644 --- a/nlprule/src/tokenizer/multiword.rs +++ b/nlprule/src/tokenizer/multiword.rs @@ -38,7 +38,7 @@ pub struct MultiwordTagger { impl MultiwordTagger { /// Populates the `.multiword_data` field of the passed tokens by checking if any known phrases are contained. 
- pub fn apply<'t>(&'t self, sentence: &mut Sentence<'t>) { + pub fn apply<'t>(&'t self, sentence: &mut Sentence<'t>) -> Result<(), crate::Error> { let tagger = sentence.tagger(); let mut start_indices = DefaultHashMap::new(); @@ -66,11 +66,13 @@ impl MultiwordTagger { let (word, pos) = &self.multiwords[m.pattern()]; // end index is inclusive for token in sentence.iter_mut().skip(*start).take((end + 1) - start) { - token.tags_mut().push( + token.tags_mut()?.push( WordData::new(tagger.id_word(word.as_str().into()), pos.clone()).freeze(), ); } } } + + Ok(()) } } diff --git a/nlprule/src/tokenizer/tag.rs b/nlprule/src/tokenizer/tag.rs index c1ad8fc..5b19227 100644 --- a/nlprule/src/tokenizer/tag.rs +++ b/nlprule/src/tokenizer/tag.rs @@ -72,6 +72,13 @@ impl<'t> WordId<'t> { self.0.as_ref() } + pub fn as_ref_str(&self) -> &'t str { + match &self.0 { + Cow::Borrowed(x) => *x, + Cow::Owned(_) => panic!("can not get `&'t str` reference from owned Cow!"), + } + } + /// Converts this struct to a struct with `'static` lifetime by cloning borrowed data. pub fn into_static(self) -> WordId<'static> { WordId(self.0.into_owned().into(), self.1) @@ -725,4 +732,40 @@ impl Tagger { pub fn get_tags<'a>(&'a self, word: &'a str) -> TagIter<'a> { self.get_tags_with_options(word, None, None) } + + pub fn apply<'t>(&'t self, sentence: &mut Sentence<'t>) -> Result<(), crate::Error> { + sentence.init_tags(); + + for token in sentence.iter_mut() { + let mut tag_vec: Vec<_> = self + .get_tags_with_options( + token.as_str(), + if token.is_sentence_start() { + Some(true) + } else { + None + }, + None, + ) + .collect(); + + tag_vec.push( + WordData::new( + self.id_word(token.as_str().into()), + PosId::special(SpecialPos::None), + ) + .freeze(), + ); + + if token.is_sentence_end() { + tag_vec.push( + WordData::new(WordId::empty(), PosId::special(SpecialPos::SentEnd)).freeze(), + ); + } + + *token.tags_mut().expect("tags are initialized in tagger") = Tags::new(tag_vec); + } + + Ok(()) + } } diff --git a/nlprule/src/types.rs b/nlprule/src/types.rs index d89809e..87d78f5 100644 --- a/nlprule/src/types.rs +++ b/nlprule/src/types.rs @@ -226,33 +226,49 @@ impl<'t> Tags<'t> { } } +lazy_static! { + pub(crate) static ref SENT_START: Token<'static> = Token { + text: WordId::empty(), + span: Span::default(), + is_sentence_start: false, // `is_sentence_start` marks the first *real* token in the sentence. + is_sentence_end: false, + has_space_before: false, + tags: Some(Tags::new(vec![WordData::new( + WordId::empty(), + PosId::special(SpecialPos::SentStart), + )],)), + chunks: Some(Vec::new()), + }; +} + /// A token where varying levels of information are set. #[derive(Debug, Clone, PartialEq)] pub struct Token<'t> { text: WordId<'t>, - tags: Tags<'t>, span: Span, + is_sentence_start: bool, is_sentence_end: bool, has_space_before: bool, - chunks: Vec, + tags: Option>, + chunks: Option>, } impl<'t> Token<'t> { pub(crate) fn new( text: WordId<'t>, - tags: Tags<'t>, span: Span, + is_sentence_start: bool, is_sentence_end: bool, has_space_before: bool, - chunks: Vec, ) -> Self { Token { text, - tags, span, + is_sentence_start, is_sentence_end, has_space_before, - chunks, + tags: None, + chunks: None, } } @@ -262,18 +278,11 @@ impl<'t> Token<'t> { } /// Gets the token as string. - pub fn as_str(&self) -> &str { - self.text.as_str() - } - - /// The tags of this token. Contain information about the part-of-speech tags and lemmas. 
- pub fn tags(&self) -> &Tags<'t> { - &self.tags - } - - #[allow(missing_docs)] - pub fn tags_mut(&mut self) -> &mut Tags<'t> { - &mut self.tags + pub fn as_str(&self) -> &'t str { + // we know that the token text can never be changed, and it is created + // from a slice of the input text, so the `WordId` will always contain + // a borrowed Cow. + self.text.as_ref_str() } /// The span of this sentence. @@ -281,7 +290,12 @@ impl<'t> Token<'t> { &self.span } - /// Whether this token is the last token in the sentence- + /// Whether this token is the first token in the sentence. + pub fn is_sentence_start(&self) -> bool { + self.is_sentence_start + } + + /// Whether this token is the last token in the sentence. pub fn is_sentence_end(&self) -> bool { self.is_sentence_end } @@ -291,16 +305,6 @@ impl<'t> Token<'t> { self.has_space_before } - /// Chunks associated with this token. - pub fn chunks(&self) -> &[String] { - &self.chunks - } - - #[allow(missing_docs)] - pub fn chunks_mut(&mut self) -> &mut Vec { - &mut self.chunks - } - /// Shift the span of this token right by the specified amount. pub fn rshift(mut self, position: Position) -> Self { self.span = self.span.rshift(position); @@ -311,8 +315,9 @@ impl<'t> Token<'t> { pub fn into_static(self) -> Token<'static> { Token { text: self.text.into_static(), - tags: self.tags.into_static(), + tags: self.tags.map(Tags::into_static), span: self.span, + is_sentence_start: self.is_sentence_start, is_sentence_end: self.is_sentence_end, has_space_before: self.has_space_before, chunks: self.chunks, @@ -320,6 +325,39 @@ impl<'t> Token<'t> { } } +impl<'t> Sentence<'t> { + pub fn init_tags(&mut self) { + for token in self.iter_mut() { + token.tags = Some(Tags::new(Vec::new())); + } + } + + pub fn init_chunks(&mut self) { + for token in self.iter_mut() { + token.chunks = Some(Vec::new()); + } + } +} + +impl<'t> Token<'t> { + /// The tags of this token. Contain information about the part-of-speech tags and lemmas. + pub fn tags(&self) -> Result<&Tags<'t>, crate::Error> { + self.tags.as_ref().ok_or(crate::Error::Unset("tags")) + } + + pub fn tags_mut(&mut self) -> Result<&mut Tags<'t>, crate::Error> { + self.tags.as_mut().ok_or(crate::Error::Unset("tags")) + } + + pub fn chunks(&self) -> Result<&[String], crate::Error> { + self.chunks.as_deref().ok_or(crate::Error::Unset("chunks")) + } + + pub fn chunks_mut(&mut self) -> Result<&mut Vec, crate::Error> { + self.chunks.as_mut().ok_or(crate::Error::Unset("chunks")) + } +} + /// A position in a text. Determined by a byte and char index. /// Can be an absolute position (offset relative to zero) or a position delta (offset relative to some other position). 
#[derive(Debug, Clone, Copy, PartialEq, Default, Serialize, Deserialize)] diff --git a/nlprule/tests/tests.rs b/nlprule/tests/tests.rs index 7d08956..7ea7f86 100644 --- a/nlprule/tests/tests.rs +++ b/nlprule/tests/tests.rs @@ -25,14 +25,14 @@ fn handles_whitespace_correctly() { let mut sentences = TOKENIZER.pipe(text); assert_eq!( - &text[sentences.next().unwrap().span().byte().clone()], + &text[sentences.next().unwrap().unwrap().span().byte().clone()], " hello.\t" ); assert_eq!( - &text[sentences.next().unwrap().span().byte().clone()], + &text[sentences.next().unwrap().unwrap().span().byte().clone()], "test.\t" ); - assert_eq!(sentences.next(), None); + assert!(sentences.next().is_none()); } #[quickcheck] @@ -43,10 +43,12 @@ fn can_tokenize_anything(text: String) -> bool { #[test] fn suggest_indices_are_relative_to_input_text() { - let suggestions = RULES.suggest( - "I can due his homework for 10€. I can due his homework.", - &*TOKENIZER, - ); + let suggestions = RULES + .suggest( + "I can due his homework for 10€. I can due his homework.", + &*TOKENIZER, + ) + .unwrap(); assert_eq!(*suggestions[0].span().char(), 6..9); assert_eq!(*suggestions[0].span().byte(), 6..9); @@ -62,7 +64,7 @@ fn suggest_indices_are_relative_to_input_text() { fn sentence_spans_correct() { let text = "A short test. A test with emoji 😊."; - let sentences: Vec<_> = TOKENIZER.pipe(text).collect(); + let sentences: Vec<_> = TOKENIZER.pipe(text).collect::>().unwrap(); assert_eq!(sentences.len(), 2); assert_eq!(*sentences[0].span().char(), 0..14); @@ -100,6 +102,8 @@ fn no_gaps_between_sentences(text: String) { let mut contains_sentence = false; for sentence in TOKENIZER.pipe(&text) { + let sentence = sentence.unwrap(); + assert_eq!(sentence.span().start(), prev_pos); prev_pos += sentence.span().len(); @@ -116,6 +120,7 @@ fn rules_can_be_disabled_enabled() { // enabled by default assert!(!rules .suggest("I can due his homework", &*TOKENIZER) + .unwrap() .is_empty()); rules @@ -129,15 +134,22 @@ fn rules_can_be_disabled_enabled() { // disabled now assert!(rules .suggest("I can due his homework", &*TOKENIZER) + .unwrap() .is_empty()); // disabled by default - assert!(rules.suggest("I can not go", &*TOKENIZER).is_empty()); + assert!(rules + .suggest("I can not go", &*TOKENIZER) + .unwrap() + .is_empty()); rules .select_mut(&"typos/can_not".try_into().unwrap()) .for_each(|x| x.enable()); // enabled now - assert!(!rules.suggest("I can not go", &*TOKENIZER).is_empty()); + assert!(!rules + .suggest("I can not go", &*TOKENIZER) + .unwrap() + .is_empty()); } diff --git a/python/src/lib.rs b/python/src/lib.rs index 49d1e28..ccb310e 100644 --- a/python/src/lib.rs +++ b/python/src/lib.rs @@ -182,6 +182,7 @@ impl PyToken { fn data(&self) -> Vec<(&str, &str)> { self.token .tags() + .unwrap() .iter() .map(|x| (x.lemma().as_str(), x.pos().as_str())) .collect() @@ -192,6 +193,7 @@ impl PyToken { let mut lemmas: Vec<_> = self .token .tags() + .unwrap() .iter() .filter_map(|x| { if x.lemma().as_str().is_empty() { @@ -211,6 +213,7 @@ impl PyToken { let mut tags: Vec<_> = self .token .tags() + .unwrap() .iter() .filter_map(|x| { if x.pos().as_str().is_empty() { @@ -227,7 +230,12 @@ impl PyToken { #[getter] fn chunks(&self) -> Vec<&str> { - self.token.chunks().iter().map(|x| x.as_str()).collect() + self.token + .chunks() + .unwrap() + .iter() + .map(|x| x.as_str()) + .collect() } } @@ -355,6 +363,7 @@ impl PyTokenizer { .pipe(&text) .map(|sentence| { sentence + .unwrap() .into_iter() .map(|token| PyCell::new(py, 
PyToken::from(token.into_static()))) .collect::>>() @@ -619,6 +628,7 @@ impl PyRules { self.rules .read() .suggest(&sentence, &tokenizer) + .unwrap() .into_iter() .map(|x| PyCell::new(py, PySuggestion::from(x))) .collect::>>() @@ -639,7 +649,7 @@ impl PyRules { let tokenizer = self.tokenizer.borrow(py); let tokenizer = tokenizer.tokenizer(); - Ok(self.rules.read().correct(&text, tokenizer)) + Ok(self.rules.read().correct(&text, tokenizer).unwrap()) }) } From 0d968676aad4d26c619f4046e34787f8231cf300 Mon Sep 17 00:00:00 2001 From: Benjamin Minixhofer Date: Sat, 1 May 2021 14:05:40 +0200 Subject: [PATCH 02/15] add property guards --- nlprule/src/bin/test.rs | 4 +- nlprule/src/bin/test_disambiguation.rs | 2 +- nlprule/src/compile/impls.rs | 6 +- nlprule/src/compile/parse_structure.rs | 2 + nlprule/src/lib.rs | 3 +- nlprule/src/properties.rs | 328 +++++++++++++++++++++++++ nlprule/src/rule/disambiguation.rs | 55 +++-- nlprule/src/rule/engine/composition.rs | 134 ++++++++-- nlprule/src/rule/engine/mod.rs | 21 +- nlprule/src/rule/mod.rs | 86 +++++-- nlprule/src/rules.rs | 31 ++- nlprule/src/tokenizer.rs | 22 +- nlprule/src/tokenizer/chunk.rs | 25 +- nlprule/src/tokenizer/multiword.rs | 20 +- nlprule/src/tokenizer/tag.rs | 21 +- nlprule/src/types.rs | 26 +- 16 files changed, 678 insertions(+), 108 deletions(-) create mode 100644 nlprule/src/properties.rs diff --git a/nlprule/src/bin/test.rs b/nlprule/src/bin/test.rs index 3669a8e..743fc81 100644 --- a/nlprule/src/bin/test.rs +++ b/nlprule/src/bin/test.rs @@ -28,7 +28,9 @@ fn main() { let mut passes = 0; for rule in rules { if opts.ids.is_empty() || opts.ids.contains(&rule.id().to_string()) { - passes += rule.test(&tokenizer) as usize; + if let Ok(true) = rule.test(&tokenizer) { + passes += 1; + } } } diff --git a/nlprule/src/bin/test_disambiguation.rs b/nlprule/src/bin/test_disambiguation.rs index 30321a3..4912508 100644 --- a/nlprule/src/bin/test_disambiguation.rs +++ b/nlprule/src/bin/test_disambiguation.rs @@ -26,7 +26,7 @@ fn main() { let mut passes = 0; for rule in rules { - if rule.test(&tokenizer) { + if let Ok(true) = rule.test(&tokenizer) { passes += 1; } else if opts.stop_at_error { break; diff --git a/nlprule/src/compile/impls.rs b/nlprule/src/compile/impls.rs index ec7fd7d..9541a22 100644 --- a/nlprule/src/compile/impls.rs +++ b/nlprule/src/compile/impls.rs @@ -352,7 +352,10 @@ impl Rules { ); } - Rules { rules } + Rules { + rules, + properties: Default::default(), + } } } @@ -435,6 +438,7 @@ impl Tokenizer { multiword_tagger, rules, lang_options, + properties: Default::default(), }) } } diff --git a/nlprule/src/compile/parse_structure.rs b/nlprule/src/compile/parse_structure.rs index 0be9924..4afdbb6 100644 --- a/nlprule/src/compile/parse_structure.rs +++ b/nlprule/src/compile/parse_structure.rs @@ -902,6 +902,7 @@ impl Rule { category_name: String::new(), category_type: None, enabled: true, + properties: Default::default(), }) } } @@ -1327,6 +1328,7 @@ impl DisambiguationRule { disambiguations, examples, id: Index::default(), + properties: Default::default(), }) } } diff --git a/nlprule/src/lib.rs b/nlprule/src/lib.rs index cdb88ba..8617b31 100644 --- a/nlprule/src/lib.rs +++ b/nlprule/src/lib.rs @@ -66,7 +66,7 @@ //! --- //! Binaries are distributed with [Github releases](https://github.com/bminixhofer/nlprule/releases). 
-#![warn(missing_docs)] +// #![warn(missing_docs)] use std::io; use thiserror::Error; @@ -74,6 +74,7 @@ use thiserror::Error; #[cfg(feature = "compile")] pub mod compile; mod filter; +pub mod properties; pub mod rule; pub mod rules; pub mod tokenizer; diff --git a/nlprule/src/properties.rs b/nlprule/src/properties.rs new file mode 100644 index 0000000..f52a216 --- /dev/null +++ b/nlprule/src/properties.rs @@ -0,0 +1,328 @@ +use serde::{Deserialize, Serialize}; + +use crate::types::*; +use thiserror::Error; + +pub trait ReadProperties { + fn properties(&self) -> Properties { + Properties::default() + } + + fn property_guard(&self, sentence: &Sentence) -> Result { + self.properties().build(sentence) + } +} + +pub trait WriteProperties { + fn properties(&self) -> PropertiesMut { + PropertiesMut::default() + } + + fn property_guard(&self, sentence: &mut Sentence) -> Result { + self.properties().build(sentence) + } +} + +#[derive(Error, Debug)] +#[allow(missing_docs)] +pub enum Error { + #[error("unset token property: {0:?}")] + Unset(Property), +} + +#[derive(Debug, Clone, Copy)] +pub enum Property { + Tags = 0, + Chunks = 1, +} + +impl Property { + pub fn properties() -> &'static [Property] { + &[Property::Tags, Property::Chunks] + } +} + +#[derive(Debug, Copy, Clone, Serialize, Deserialize, Default)] +struct Bitset(u16); + +impl Bitset { + pub fn insert(&mut self, value: Property) { + self.0 |= 1 << (value as u16); + } + + pub fn contains(&self, value: &Property) -> bool { + self.0 & (1 << (*value as u16)) != 0 + } + + pub fn union(mut self, other: Bitset) -> Self { + self.0 |= other.0; + self + } +} + +#[derive(Debug, Clone, Copy, Serialize, Deserialize, Default)] +pub struct Properties { + read_mask: Bitset, +} + +#[derive(Debug, Clone, Copy, Serialize, Deserialize, Default)] +pub struct PropertiesMut { + read_mask: Bitset, + write_mask: Bitset, +} + +impl std::iter::FromIterator for Properties { + fn from_iter>(iter: T) -> Self { + let mut out = Properties::default(); + + for properties in iter { + out = out.union(properties) + } + + out + } +} + +impl std::iter::FromIterator for PropertiesMut { + fn from_iter>(iter: T) -> Self { + let mut out = PropertiesMut::default(); + + for properties in iter { + out = out.union(properties) + } + + out + } +} + +impl Properties { + pub fn read(mut self, properties: &[Property]) -> Self { + for property in properties { + self.read_mask.insert(*property); + } + + self + } + + pub fn write(self, properties: &[Property]) -> PropertiesMut { + let mut write_mask = Bitset::default(); + let mut read_mask = self.read_mask; + + for property in properties { + // write implies read + read_mask.insert(*property); + write_mask.insert(*property); + } + + PropertiesMut { + read_mask, + write_mask, + } + } + + pub fn union(mut self, properties: Properties) -> Self { + self.read_mask = self.read_mask.union(properties.read_mask); + + self + } + + pub fn build(&self, sentence: &Sentence) -> Result { + for property in Property::properties() { + if self.read_mask.contains(property) { + match *property { + Property::Tags => { + if sentence.first().tags.is_none() { + return Err(Error::Unset(Property::Tags)); + } + } + Property::Chunks => { + if sentence.first().chunks.is_none() { + return Err(Error::Unset(Property::Chunks)); + } + } + } + } + } + + Ok(PropertyGuard { + read_mask: self.read_mask, + }) + } +} + +impl PropertiesMut { + pub fn union(mut self, properties: PropertiesMut) -> Self { + self.read_mask = self.read_mask.union(properties.read_mask); + self.write_mask = 
self.write_mask.union(properties.read_mask); + + self + } + + pub fn build(&self, sentence: &mut Sentence) -> Result { + for property in Property::properties() { + if self.write_mask.contains(property) { + match property { + Property::Tags => { + if sentence.first().tags.is_none() { + sentence + .iter_mut() + .for_each(|token| token.tags = Some(Tags::default())); + } + } + Property::Chunks => { + if sentence.first().chunks.is_none() { + sentence + .iter_mut() + .for_each(|token| token.chunks = Some(Vec::default())); + } + } + } + } + } + + for property in Property::properties() { + if self.read_mask.contains(property) { + match *property { + Property::Tags => { + if sentence.first().tags.is_none() { + return Err(Error::Unset(Property::Tags)); + } + } + Property::Chunks => { + if sentence.first().chunks.is_none() { + return Err(Error::Unset(Property::Chunks)); + } + } + } + } + } + + Ok(PropertyGuardMut { + read_mask: self.read_mask, + write_mask: self.write_mask, + }) + } +} + +#[derive(Debug, Copy, Clone)] +pub struct PropertyGuard { + read_mask: Bitset, +} + +#[derive(Debug, Copy, Clone)] +pub struct PropertyGuardMut { + read_mask: Bitset, + write_mask: Bitset, +} + +impl PropertyGuard { + pub fn chunks<'a>(&self, token: &'a Token) -> Result<&'a [String], Error> { + match ( + token.chunks.as_deref(), + self.read_mask.contains(&Property::Chunks), + ) { + (Some(chunks), true) => Ok(chunks), + _ => Err(Error::Unset(Property::Chunks)), + } + } + + pub fn tags<'a, 't>(&self, token: &'a Token<'t>) -> Result<&'a Tags<'t>, Error> { + match ( + token.tags.as_ref(), + self.read_mask.contains(&Property::Tags), + ) { + (Some(tags), true) => Ok(tags), + _ => Err(Error::Unset(Property::Tags)), + } + } +} + +impl PropertyGuardMut { + pub fn chunks<'a>(&self, token: &'a Token) -> Result<&'a [String], Error> { + match ( + token.chunks.as_deref(), + self.read_mask.contains(&Property::Chunks), + ) { + (Some(chunks), true) => Ok(chunks), + _ => Err(Error::Unset(Property::Chunks)), + } + } + + pub fn tags<'a, 't>(&self, token: &'a Token<'t>) -> Result<&'a Tags<'t>, Error> { + match ( + token.tags.as_ref(), + self.read_mask.contains(&Property::Tags), + ) { + (Some(tags), true) => Ok(tags), + _ => Err(Error::Unset(Property::Tags)), + } + } + + pub fn chunks_mut<'a, 't>( + &self, + token: &'a mut Token<'t>, + ) -> Result<&'a mut Vec, Error> { + match ( + token.chunks.as_mut(), + self.write_mask.contains(&Property::Chunks), + ) { + (Some(chunks), true) => Ok(chunks), + _ => Err(Error::Unset(Property::Chunks)), + } + } + + pub fn tags_mut<'a, 't>(&self, token: &'a mut Token<'t>) -> Result<&'a mut Tags<'t>, Error> { + match ( + token.tags.as_mut(), + self.write_mask.contains(&Property::Tags), + ) { + (Some(tags), true) => Ok(tags), + _ => Err(Error::Unset(Property::Tags)), + } + } + + pub fn downgrade(self) -> PropertyGuard { + PropertyGuard { + read_mask: self.read_mask, + } + } +} + +// pub trait Transform { +// fn transform<'t>(&'t self, sentences: SentenceIter<'t>) -> SentenceIter<'t>; + +// fn in_properties(&self) -> +// } + +// pub struct Pipeline(T, P); + +// type SentenceIter<'t> = Box>>; + +// macro_rules! 
impl_pipeline { +// ( $first:ident, $($name:ident),+) => { +// impl<$first: Tokenize, $($name: Transform,)+> Tokenize for Pipeline<($first, $($name,)+)> { +// #[allow(non_snake_case)] +// fn tokenize<'t>(&'t self, text: &'t str) -> SentenceIter<'t> { +// let (ref $first, $(ref $name),+) = self.0; +// let sentences = $first.tokenize(text); +// $(let sentences = $name.transform(sentences);)+ +// sentences +// } +// } + +// impl<$first: Transform, $($name: Transform,)+> Transform for Pipeline<($first, $($name,)+)> { +// #[allow(non_snake_case)] +// fn transform<'t>(&'t self, sentences: SentenceIter<'t>) -> SentenceIter<'t> { +// let (ref $first, $(ref $name),+) = self.0; +// let sentences = $first.transform(sentences); +// $(let sentences = $name.transform(sentences);)+ +// sentences +// } +// } +// }; +// } + +// impl_pipeline! { A, B } +// impl_pipeline! { A, B, C } +// impl_pipeline! { A, B, C, D } +// impl_pipeline! { A, B, C, D, E } diff --git a/nlprule/src/rule/disambiguation.rs b/nlprule/src/rule/disambiguation.rs index ace067a..fac4fe3 100644 --- a/nlprule/src/rule/disambiguation.rs +++ b/nlprule/src/rule/disambiguation.rs @@ -1,6 +1,6 @@ use std::ops::Range; -use crate::types::*; +use crate::{properties::PropertyGuardMut, types::*}; use itertools::Itertools; use serde::{Deserialize, Serialize}; @@ -52,21 +52,25 @@ pub enum Disambiguation { } impl Disambiguation { - pub fn apply<'t>(&'t self, groups: Vec>>) -> Result<(), crate::Error> { + pub fn apply<'t>( + &'t self, + groups: Vec>>, + guard: PropertyGuardMut, + ) -> Result<(), crate::properties::Error> { match self { Disambiguation::Remove(data_or_filters) => { for (group, data_or_filter) in groups.into_iter().zip(data_or_filters) { for token in group.into_iter() { match data_or_filter { either::Left(data) => { - token.tags_mut()?.retain(|x| { + guard.tags_mut(token)?.retain(|x| { !(x.pos() == data.pos() && (data.lemma().as_str().is_empty() || x.lemma() == data.lemma())) }); } either::Right(filter) => { - filter.remove(token.tags_mut()?); + filter.remove(guard.tags_mut(token)?); } } } @@ -78,8 +82,8 @@ impl Disambiguation { match data_or_filter { either::Left(limit) => { for token in group.into_iter() { - let last = token - .tags()? + let last = guard + .tags(token)? .iter() .next() .and_then(|x| { @@ -91,18 +95,18 @@ impl Disambiguation { }) .unwrap_or_else(|| token.text().clone()); - token.tags_mut()?.retain(|x| x.pos() == limit.pos()); + guard.tags_mut(token)?.retain(|x| x.pos() == limit.pos()); - if token.tags()?.is_empty() { + if guard.tags(token)?.is_empty() { if *retain_last { - token - .tags_mut()? + guard + .tags_mut(token)? .push(WordData::new(last, limit.pos().clone())); } else { let lemma = token.text().clone(); - token - .tags_mut()? + guard + .tags_mut(token)? 
.push(WordData::new(lemma, limit.pos().clone())); } } @@ -110,7 +114,7 @@ impl Disambiguation { } either::Right(filter) => { for token in group.into_iter() { - filter.keep(token.tags_mut()?); + filter.keep(guard.tags_mut(token)?); } } } @@ -129,8 +133,10 @@ impl Disambiguation { data.pos().clone(), ); - token.tags_mut()?.push(data); - token.tags_mut()?.retain(|x| !x.pos().as_str().is_empty()); + let tags = guard.tags_mut(token)?; + + tags.push(data); + tags.retain(|x| !x.pos().as_str().is_empty()); } } } @@ -146,8 +152,10 @@ impl Disambiguation { data.pos().clone(), ); - token.tags_mut()?.clear(); - token.tags_mut()?.push(data); + let tags = guard.tags_mut(token)?; + + tags.clear(); + tags.push(data); } } } @@ -160,7 +168,7 @@ impl Disambiguation { for token in group.iter() { if *use_mask_val { for (mask_val, filter) in filter_mask.iter_mut().zip(filters.iter()) { - *mask_val = *mask_val && PosFilter::and(filter, token.tags()?); + *mask_val = *mask_val && PosFilter::and(filter, guard.tags(token)?); } } } @@ -188,16 +196,17 @@ impl Disambiguation { { if *use_mask_val { for token in group.into_iter() { - let before = token.tags()?.clone(); + let tags = guard.tags_mut(token)?; + let before = tags.clone(); - PosFilter::apply(&to_apply, token.tags_mut()?); + PosFilter::apply(&to_apply, tags); if let Some(disambig) = disambig { - disambig.keep(token.tags_mut()?); + disambig.keep(tags); } - if token.tags()?.is_empty() { - *token.tags_mut()? = before; + if tags.is_empty() { + *tags = before; } } } diff --git a/nlprule/src/rule/engine/composition.rs b/nlprule/src/rule/engine/composition.rs index 11f70df..227b8f0 100644 --- a/nlprule/src/rule/engine/composition.rs +++ b/nlprule/src/rule/engine/composition.rs @@ -1,6 +1,6 @@ use std::iter; -use crate::{tokenizer::tag::Tagger, types::*, utils::regex::Regex}; +use crate::{properties::*, tokenizer::tag::Tagger, types::*, utils::regex::Regex}; use enum_dispatch::enum_dispatch; use lazy_static::lazy_static; use serde::{Deserialize, Serialize}; @@ -161,10 +161,15 @@ pub struct Quantifier { #[enum_dispatch] pub trait Atomable: Send + Sync { - fn is_match(&self, context: Context, position: usize) -> Result; + fn is_match(&self, context: Context, position: usize) + -> Result; + + fn properties(&self) -> Properties { + Properties::default() + } } -#[enum_dispatch(Atomable)] +#[enum_dispatch(Atomable, ReadProperties)] #[derive(Debug, Serialize, Deserialize, Clone)] pub enum Atom { ChunkAtom(concrete::ChunkAtom), @@ -180,7 +185,8 @@ pub enum Atom { } pub mod concrete { - use super::{Atomable, Context, Matcher, TextMatcher, WordDataMatcher}; + use super::{Atomable, Context, Matcher, Properties, Property, TextMatcher, WordDataMatcher}; + use lazy_static::lazy_static; use serde::{Deserialize, Serialize}; #[derive(Debug, Serialize, Deserialize, Clone)] @@ -189,7 +195,11 @@ pub mod concrete { } impl Atomable for TextAtom { - fn is_match(&self, context: Context, position: usize) -> Result { + fn is_match( + &self, + context: Context, + position: usize, + ) -> Result { let (sentence, _) = context; Ok(self @@ -204,15 +214,26 @@ pub mod concrete { } impl Atomable for ChunkAtom { - fn is_match(&self, context: Context, position: usize) -> Result { + fn is_match( + &self, + context: Context, + position: usize, + ) -> Result { let (sentence, _) = context; Ok(self.matcher.is_slice_match( - &sentence.index(position).chunks()?, + sentence.guard().chunks(sentence.index(position))?, Some(context), None, )) } + + fn properties(&self) -> Properties { + lazy_static! 
{ + static ref PROPERTIES: Properties = Properties::default().read(&[Property::Chunks]); + } + *PROPERTIES + } } #[derive(Debug, Serialize, Deserialize, Clone)] @@ -221,7 +242,11 @@ pub mod concrete { } impl Atomable for SpaceBeforeAtom { - fn is_match(&self, context: Context, position: usize) -> Result { + fn is_match( + &self, + context: Context, + position: usize, + ) -> Result { let (sentence, _) = context; Ok(sentence.index(position).has_space_before() == self.value) @@ -235,14 +260,25 @@ pub mod concrete { } impl Atomable for WordDataAtom { - fn is_match(&self, context: Context, position: usize) -> Result { + fn is_match( + &self, + context: Context, + position: usize, + ) -> Result { let (sentence, _) = context; - let tags = sentence.index(position).tags()?.iter(); + let tags = sentence.guard().tags(sentence.index(position))?.iter(); Ok(self .matcher .is_match(tags, Some(context), Some(self.case_sensitive))) } + + fn properties(&self) -> Properties { + lazy_static! { + static ref PROPERTIES: Properties = Properties::default().read(&[Property::Tags]); + } + *PROPERTIES + } } } @@ -250,7 +286,11 @@ pub mod concrete { pub struct TrueAtom {} impl Atomable for TrueAtom { - fn is_match(&self, _context: Context, _position: usize) -> Result { + fn is_match( + &self, + _context: Context, + _position: usize, + ) -> Result { Ok(true) } } @@ -259,7 +299,11 @@ impl Atomable for TrueAtom { pub struct FalseAtom {} impl Atomable for FalseAtom { - fn is_match(&self, _context: Context, _position: usize) -> Result { + fn is_match( + &self, + _context: Context, + _position: usize, + ) -> Result { Ok(false) } } @@ -270,7 +314,11 @@ pub struct AndAtom { } impl Atomable for AndAtom { - fn is_match(&self, context: Context, position: usize) -> Result { + fn is_match( + &self, + context: Context, + position: usize, + ) -> Result { for atom in &self.atoms { if !atom.is_match(context, position)? { return Ok(false); @@ -279,6 +327,10 @@ impl Atomable for AndAtom { Ok(true) } + + fn properties(&self) -> Properties { + self.atoms.iter().map(|x| x.properties()).collect() + } } #[derive(Debug, Serialize, Deserialize, Clone)] @@ -287,7 +339,11 @@ pub struct OrAtom { } impl Atomable for OrAtom { - fn is_match(&self, context: Context, position: usize) -> Result { + fn is_match( + &self, + context: Context, + position: usize, + ) -> Result { for atom in &self.atoms { if atom.is_match(context, position)? { return Ok(true); @@ -296,6 +352,10 @@ impl Atomable for OrAtom { Ok(false) } + + fn properties(&self) -> Properties { + self.atoms.iter().map(|x| x.properties()).collect() + } } #[derive(Debug, Serialize, Deserialize, Clone)] @@ -304,9 +364,17 @@ pub struct NotAtom { } impl Atomable for NotAtom { - fn is_match(&self, context: Context, position: usize) -> Result { + fn is_match( + &self, + context: Context, + position: usize, + ) -> Result { Ok(!self.atom.is_match(context, position)?) 
} + + fn properties(&self) -> Properties { + self.atom.properties() + } } #[derive(Debug, Serialize, Deserialize, Clone)] @@ -316,7 +384,11 @@ pub struct OffsetAtom { } impl Atomable for OffsetAtom { - fn is_match(&self, context: Context, position: usize) -> Result { + fn is_match( + &self, + context: Context, + position: usize, + ) -> Result { let (sentence, _) = context; let new_position = position as isize + self.offset; @@ -328,6 +400,10 @@ impl Atomable for OffsetAtom { }, ) } + + fn properties(&self) -> Properties { + self.atom.properties() + } } #[derive(Debug, Default, Clone)] @@ -376,14 +452,15 @@ impl GraphId { } } -#[derive(Debug, Clone, PartialEq)] +#[derive(Debug, Clone)] pub struct MatchSentence<'t> { sentence: &'t Sentence<'t>, + guard: PropertyGuard, } impl<'t> MatchSentence<'t> { - pub fn new(sentence: &'t Sentence<'t>) -> Self { - MatchSentence { sentence } + pub fn new(sentence: &'t Sentence<'t>, guard: PropertyGuard) -> Self { + MatchSentence { sentence, guard } } pub fn index(&self, index: usize) -> &Token { @@ -414,6 +491,10 @@ impl<'t> MatchSentence<'t> { self.sentence.tagger() } + pub fn guard(&self) -> &PropertyGuard { + &self.guard + } + pub fn span(&self) -> &Span { self.sentence.span() } @@ -522,13 +603,22 @@ pub struct Composition { pub(crate) can_stop_mask: Vec, } +impl ReadProperties for Composition { + fn properties(&self) -> Properties { + self.parts + .iter() + .map(|part| part.atom.properties()) + .collect() + } +} + impl Composition { fn next_can_match( &self, context: Context, position: usize, index: usize, - ) -> Result { + ) -> Result { let next_required_pos = match self.parts[index + 1..] .iter() .position(|x| x.quantifier.min > 0) @@ -552,7 +642,7 @@ impl Composition { mut position: usize, mut cur_atom_idx: usize, mut graph: MatchGraph<'t>, - ) -> Result>, crate::Error> { + ) -> Result>, crate::properties::Error> { let mut cur_count = 0; let is_match = loop { if cur_atom_idx >= self.parts.len() { @@ -629,7 +719,7 @@ impl Composition { &'t self, sentence: &'t MatchSentence, start: usize, - ) -> Result>, crate::Error> { + ) -> Result>, crate::properties::Error> { // this path is extremely hot so more optimizations are done // the first matcher can never rely on the match graph, so we use an empty default graph for the first match diff --git a/nlprule/src/rule/engine/mod.rs b/nlprule/src/rule/engine/mod.rs index 6a9a98d..5e4af22 100644 --- a/nlprule/src/rule/engine/mod.rs +++ b/nlprule/src/rule/engine/mod.rs @@ -1,4 +1,7 @@ +use std::iter; + use crate::{ + properties::*, types::*, utils::regex::{CaptureMatches, Regex}, }; @@ -18,7 +21,7 @@ impl TokenEngine { &'t self, sentence: &'t MatchSentence, i: usize, - ) -> Result>, crate::Error> { + ) -> Result>, crate::properties::Error> { if let Some(graph) = self.composition.apply(sentence, i)? 
{ let mut blocked = false; @@ -63,6 +66,20 @@ pub enum Engine { Text(Box, DefaultHashMap), } +impl ReadProperties for Engine { + fn properties(&self) -> Properties { + match &self { + Engine::Token(engine) => engine + .antipatterns + .iter() + .map(|x| x.properties()) + .chain(iter::once(engine.composition.properties())) + .collect(), + Engine::Text(_, _) => Properties::default(), + } + } +} + struct TokenMatches<'a> { engine: &'a TokenEngine, index: usize, @@ -88,7 +105,7 @@ pub struct EngineMatches<'a, 't> { } impl<'a, 't> Iterator for EngineMatches<'a, 't> { - type Item = Result, crate::Error>; + type Item = Result, crate::properties::Error>; fn next(&mut self) -> Option { let sentence = self.sentence; diff --git a/nlprule/src/rule/mod.rs b/nlprule/src/rule/mod.rs index cec5a1f..1779113 100644 --- a/nlprule/src/rule/mod.rs +++ b/nlprule/src/rule/mod.rs @@ -1,16 +1,19 @@ //! Implementations related to single rules. -use crate::types::*; use crate::{ filter::{Filter, Filterable}, + properties::*, tokenizer::Tokenizer, + types::*, utils, }; use itertools::Itertools; +use lazy_static::lazy_static; use log::{error, info, warn}; +use once_cell::sync::OnceCell; use serde::{Deserialize, Serialize}; -use std::collections::HashSet; use std::fmt; +use std::{collections::HashSet, iter}; pub(crate) mod disambiguation; pub(crate) mod engine; @@ -37,8 +40,21 @@ pub(crate) struct Unification { pub(crate) filters: Vec>, } +impl ReadProperties for Unification { + fn properties(&self) -> Properties { + lazy_static! { + static ref PROPERTIES: Properties = Properties::default().read(&[Property::Tags]); + } + *PROPERTIES + } +} + impl Unification { - pub fn keep(&self, graph: &MatchGraph, sentence: &MatchSentence) -> Result { + pub fn keep( + &self, + graph: &MatchGraph, + sentence: &MatchSentence, + ) -> Result { let filters: Vec<_> = self.filters.iter().multi_cartesian_product().collect(); let mut filter_mask: Vec<_> = filters.iter().map(|_| true).collect(); @@ -48,7 +64,8 @@ impl Unification { if maybe_mask_val.is_some() { for token in group.tokens(sentence) { for (mask_val, filter) in filter_mask.iter_mut().zip(filters.iter()) { - *mask_val = *mask_val && PosFilter::and(filter, token.tags()?); + *mask_val = + *mask_val && PosFilter::and(filter, sentence.guard().tags(token)?); } } } @@ -87,6 +104,19 @@ pub struct DisambiguationRule { pub(crate) end: GraphId, pub(crate) examples: Vec, pub(crate) unification: Option, + #[serde(skip)] + pub(crate) properties: OnceCell, +} + +impl WriteProperties for DisambiguationRule { + fn properties(&self) -> PropertiesMut { + *self.properties.get_or_init(|| { + iter::once(self.engine.properties()) + .chain(self.unification.iter().map(|x| x.properties())) + .collect::() + .write(&[Property::Tags]) + }) + } } #[derive(Default, Debug)] @@ -130,7 +160,7 @@ impl DisambiguationRule { pub(crate) fn apply<'t>( &'t self, sentence: &MatchSentence<'t>, - ) -> Result { + ) -> Result { if matches!(self.disambiguations, disambiguation::Disambiguation::Nop) { return Ok(Changes::default()); } @@ -173,9 +203,11 @@ impl DisambiguationRule { &'t self, sentence: &mut Sentence<'t>, changes: Changes, - ) -> Result<(), crate::Error> { + ) -> Result<(), crate::properties::Error> { log::info!("applying {}", self.id); + let guard = self.property_guard(sentence)?; + for spans in changes.0 { let mut groups = Vec::new(); let mut refs = sentence.iter_mut().collect::>(); @@ -190,7 +222,7 @@ impl DisambiguationRule { groups.push(group); } - self.disambiguations.apply(groups)?; + 
self.disambiguations.apply(groups, guard)?; } Ok(()) @@ -198,7 +230,7 @@ impl DisambiguationRule { /// Often there are examples associated with a rule. /// This method checks whether the correct action is taken in the examples. - pub fn test(&self, tokenizer: &Tokenizer) -> bool { + pub fn test(&self, tokenizer: &Tokenizer) -> Result { let mut passes = Vec::new(); for (i, test) in self.examples.iter().enumerate() { @@ -220,9 +252,14 @@ impl DisambiguationRule { // shift the sentence to the right before matching to make sure // nothing assumes the sentene starts from absolute index zero let shift_delta = Position { byte: 1, char: 1 }; - let sentence_before_complete = sentence_before.clone().rshift(shift_delta); + let mut sentence_before_complete = sentence_before.clone().rshift(shift_delta); + + let guard = self.property_guard(&mut sentence_before_complete)?; let changes = self - .apply(&MatchSentence::new(&sentence_before_complete)) + .apply(&MatchSentence::new( + &sentence_before_complete, + guard.downgrade(), + )) .unwrap() .lshift(shift_delta); let mut sentence_after = sentence_before.clone(); @@ -280,7 +317,7 @@ impl DisambiguationRule { passes.push(pass); } - passes.iter().all(|x| *x) + Ok(passes.iter().all(|x| *x)) } } @@ -293,10 +330,10 @@ pub struct Suggestions<'a, 't> { impl<'a, 't> Suggestions<'a, 't> { fn suggest_from_graph( - graph: Result, + graph: Result, rule: &'a Rule, sentence: &'t MatchSentence<'t>, - ) -> Result, crate::Error> { + ) -> Result, crate::properties::Error> { let graph = graph?; if let Some(unification) = &rule.unification { @@ -369,7 +406,7 @@ impl<'a, 't> Suggestions<'a, 't> { } impl<'a, 't> Iterator for Suggestions<'a, 't> { - type Item = Result; + type Item = Result; fn next(&mut self) -> Option { let rule = self.rule; @@ -411,6 +448,18 @@ pub struct Rule { pub(crate) category_type: Option, pub(crate) unification: Option, pub(crate) enabled: bool, + #[serde(skip)] + pub(crate) properties: OnceCell, +} + +impl ReadProperties for Rule { + fn properties(&self) -> Properties { + *self.properties.get_or_init(|| { + iter::once(self.engine.properties()) + .chain(self.unification.iter().map(|x| x.properties())) + .collect() + }) + } } impl fmt::Display for Rule { @@ -480,7 +529,7 @@ impl Rule { /// Grammar rules always have at least one example associated with them. /// This method checks whether the correct action is taken in the examples. - pub fn test(&self, tokenizer: &Tokenizer) -> bool { + pub fn test(&self, tokenizer: &Tokenizer) -> Result { let mut passes = Vec::new(); // make sure relative position is handled correctly @@ -501,7 +550,10 @@ impl Rule { info!("Sentence: {:#?}", sentence); let suggestions: Vec<_> = self - .apply(&MatchSentence::new(&sentence)) + .apply(&MatchSentence::new( + &sentence, + self.property_guard(&sentence)?, + )) .map(|s| s.unwrap().lshift(shift_delta)) .collect(); @@ -529,6 +581,6 @@ impl Rule { passes.push(pass); } - passes.iter().all(|x| *x) + Ok(passes.iter().all(|x| *x)) } } diff --git a/nlprule/src/rules.rs b/nlprule/src/rules.rs index 684b8a9..a8108fa 100644 --- a/nlprule/src/rules.rs +++ b/nlprule/src/rules.rs @@ -1,9 +1,11 @@ //! Sets of grammatical error correction rules. 
+use crate::properties::*; use crate::types::*; use crate::utils::parallelism::MaybeParallelRefIterator; use crate::{rule::id::Selector, rule::MatchSentence, rule::Rule, tokenizer::Tokenizer, Error}; use fs_err::File; +use once_cell::sync::OnceCell; use serde::{Deserialize, Serialize}; use std::{ io::{BufReader, Read, Write}, @@ -38,6 +40,16 @@ impl Default for RulesLangOptions { #[derive(Serialize, Deserialize, Default)] pub struct Rules { pub(crate) rules: Vec, + #[serde(skip)] + pub(crate) properties: OnceCell, +} + +impl ReadProperties for Rules { + fn properties(&self) -> Properties { + *self + .properties + .get_or_init(|| self.rules.iter().map(ReadProperties::properties).collect()) + } } impl Rules { @@ -89,8 +101,8 @@ impl Rules { } /// Compute the suggestions for the given sentence by checking all rules. - pub fn apply(&self, sentence: &Sentence) -> Result, crate::Error> { - let sentence = MatchSentence::new(sentence); + pub fn apply(&self, sentence: &Sentence) -> Result, crate::properties::Error> { + let sentence = MatchSentence::new(sentence, self.property_guard(sentence)?); let mut output: Vec<(usize, Suggestion)> = self .rules @@ -109,7 +121,7 @@ impl Rules { Ok(output) }) - .collect::>, crate::Error>>()? + .collect::>, crate::properties::Error>>()? .into_iter() .flatten() .collect(); @@ -144,7 +156,7 @@ impl Rules { &self, text: &str, tokenizer: &Tokenizer, - ) -> Result, crate::Error> { + ) -> Result, crate::properties::Error> { if text.is_empty() { return Ok(Vec::new()); } @@ -160,7 +172,11 @@ impl Rules { } /// Correct a text by first tokenizing, then finding all suggestions and choosing the first replacement of each suggestion. - pub fn correct(&self, text: &str, tokenizer: &Tokenizer) -> Result { + pub fn correct( + &self, + text: &str, + tokenizer: &Tokenizer, + ) -> Result { let suggestions = self.suggest(text, tokenizer)?; Ok(apply_suggestions(text, &suggestions)) } @@ -231,6 +247,9 @@ where { fn from_iter>(iter: I) -> Self { let rules: Vec = iter.into_iter().map(|x| x.into()).collect(); - Self { rules } + Self { + rules, + properties: OnceCell::default(), + } } } diff --git a/nlprule/src/tokenizer.rs b/nlprule/src/tokenizer.rs index 8d61af8..ccfd449 100644 --- a/nlprule/src/tokenizer.rs +++ b/nlprule/src/tokenizer.rs @@ -5,6 +5,7 @@ //! [DisambiguationRule][crate::rule::DisambiguationRule]s. 
use crate::{ + properties::*, rule::id::{Index, Selector}, rule::MatchSentence, types::*, @@ -12,6 +13,7 @@ use crate::{ Error, }; use fs_err::File; +use once_cell::sync::OnceCell; use serde::{Deserialize, Serialize}; use std::{ io::{BufReader, Read, Write}, @@ -139,7 +141,7 @@ pub struct SentenceIter<'t> { } impl<'t> Iterator for SentenceIter<'t> { - type Item = Result, crate::Error>; + type Item = Result, crate::properties::Error>; fn next(&mut self) -> Option { self.inner @@ -157,6 +159,16 @@ pub struct Tokenizer { pub(crate) multiword_tagger: Option, pub(crate) tagger: Arc, pub(crate) lang_options: TokenizerLangOptions, + #[serde(skip)] + pub(crate) properties: OnceCell, +} + +impl WriteProperties for Tokenizer { + fn properties(&self) -> PropertiesMut { + *self + .properties + .get_or_init(|| self.rules.iter().map(WriteProperties::properties).collect()) + } } impl Tokenizer { @@ -203,14 +215,16 @@ impl Tokenizer { &'t self, mut sentence: Sentence<'t>, id: Option<&Index>, - ) -> Result, crate::Error> { + ) -> Result, crate::properties::Error> { let n = id.map_or(self.rules.len(), |id| { self.rules.iter().position(|x| x.id == *id).unwrap() }); let mut i = 0; + let guard = self.property_guard(&mut sentence)?; + while i < n { - let match_sentence = MatchSentence::new(&sentence); + let match_sentence = MatchSentence::new(&sentence, guard.downgrade()); let result = self.rules[i..n] .maybe_par_iter() @@ -248,7 +262,7 @@ impl Tokenizer { pub fn disambiguate<'t>( &'t self, sentence: Sentence<'t>, - ) -> Result, crate::Error> { + ) -> Result, crate::properties::Error> { self.disambiguate_up_to_id(sentence, None) } diff --git a/nlprule/src/tokenizer/chunk.rs b/nlprule/src/tokenizer/chunk.rs index d4c818b..18bf05e 100644 --- a/nlprule/src/tokenizer/chunk.rs +++ b/nlprule/src/tokenizer/chunk.rs @@ -1,10 +1,12 @@ //! A Chunker ported from [OpenNLP](https://opennlp.apache.org/). use half::bf16; +use lazy_static::lazy_static; use serde::{Deserialize, Serialize}; use std::hash::{Hash, Hasher}; use std::{cmp::Ordering, collections::BinaryHeap}; +use crate::properties::*; use crate::types::{DefaultHashMap, DefaultHasher, Sentence}; fn softmax(vec: &mut Vec) { @@ -699,10 +701,21 @@ pub struct Chunker { pub(crate) chunk_model: MaxentChunker, } +impl WriteProperties for Chunker { + fn properties(&self) -> PropertiesMut { + lazy_static! { + static ref PROPERTIES: PropertiesMut = Properties::default() + .read(&[Property::Tags]) + .write(&[Property::Chunks]); + } + *PROPERTIES + } +} + impl Chunker { /// Populates the `.chunks` field of the passed tokens by predicting with the maximum entropy model. - pub fn apply(&self, sentence: &mut Sentence) -> Result<(), crate::Error> { - sentence.init_chunks(); + pub fn apply(&self, sentence: &mut Sentence) -> Result<(), crate::properties::Error> { + let props = self.property_guard(sentence)?; let text = sentence.text().replace('’', "\'"); @@ -760,8 +773,8 @@ impl Chunker { .iter() .find(|token| *token.span().char() == char_span) .map(|token| { - token - .tags() + props + .tags(token) .map(|tags| tags.iter().any(|tag| tag.pos().as_str() == "NNS")) }) .unwrap_or(Ok(false))?; @@ -797,9 +810,7 @@ impl Chunker { for token in sentence.iter_mut() { for (chunk, (_, char_span)) in chunks.iter().zip(internal_chunks.iter()) { if char_span == token.span().char() { - *token - .chunks_mut() - .expect("chunks are initialized in chunker") = (*chunk).clone(); + *props.chunks_mut(token)? 
= (*chunk).clone(); } } } diff --git a/nlprule/src/tokenizer/multiword.rs b/nlprule/src/tokenizer/multiword.rs index 071bb7d..13c4e7e 100644 --- a/nlprule/src/tokenizer/multiword.rs +++ b/nlprule/src/tokenizer/multiword.rs @@ -1,7 +1,9 @@ //! Checks if the input text contains multi-token phrases from a finite list (might contain e. g. city names) and assigns lemmas and part-of-speech tags accordingly. +use crate::properties::*; use crate::types::*; use aho_corasick::AhoCorasick; +use lazy_static::lazy_static; use serde::{Deserialize, Serialize}; #[derive(Serialize, Deserialize)] @@ -36,9 +38,23 @@ pub struct MultiwordTagger { multiwords: Vec<(String, PosId<'static>)>, } +impl WriteProperties for MultiwordTagger { + fn properties(&self) -> PropertiesMut { + lazy_static! { + static ref PROPERTIES: PropertiesMut = Properties::default().write(&[Property::Tags]); + } + *PROPERTIES + } +} + impl MultiwordTagger { /// Populates the `.multiword_data` field of the passed tokens by checking if any known phrases are contained. - pub fn apply<'t>(&'t self, sentence: &mut Sentence<'t>) -> Result<(), crate::Error> { + pub fn apply<'t>( + &'t self, + sentence: &mut Sentence<'t>, + ) -> Result<(), crate::properties::Error> { + let props = self.property_guard(sentence)?; + let tagger = sentence.tagger(); let mut start_indices = DefaultHashMap::new(); @@ -66,7 +82,7 @@ impl MultiwordTagger { let (word, pos) = &self.multiwords[m.pattern()]; // end index is inclusive for token in sentence.iter_mut().skip(*start).take((end + 1) - start) { - token.tags_mut()?.push( + props.tags_mut(token)?.push( WordData::new(tagger.id_word(word.as_str().into()), pos.clone()).freeze(), ); } diff --git a/nlprule/src/tokenizer/tag.rs b/nlprule/src/tokenizer/tag.rs index 5b19227..8ff09ea 100644 --- a/nlprule/src/tokenizer/tag.rs +++ b/nlprule/src/tokenizer/tag.rs @@ -1,8 +1,9 @@ //! A dictionary-based tagger. -use crate::{types::*, utils::parallelism::MaybeParallelRefIterator}; +use crate::{properties::*, types::*, utils::parallelism::MaybeParallelRefIterator}; use bimap::BiMap; use fst::{IntoStreamer, Map, Streamer}; +use lazy_static::lazy_static; use log::error; use serde::{Deserialize, Serialize}; use std::{ @@ -547,6 +548,15 @@ pub struct Tagger { pub(crate) lang_options: TaggerLangOptions, } +impl WriteProperties for Tagger { + fn properties(&self) -> PropertiesMut { + lazy_static! { + static ref PROPERTIES: PropertiesMut = Properties::default().write(&[Property::Tags]); + } + *PROPERTIES + } +} + impl Tagger { /// Directly looks up the given word in the `tags` map and returns /// corresponding [WordData]. @@ -733,8 +743,11 @@ impl Tagger { self.get_tags_with_options(word, None, None) } - pub fn apply<'t>(&'t self, sentence: &mut Sentence<'t>) -> Result<(), crate::Error> { - sentence.init_tags(); + pub fn apply<'t>( + &'t self, + sentence: &mut Sentence<'t>, + ) -> Result<(), crate::properties::Error> { + let props = self.property_guard(sentence)?; for token in sentence.iter_mut() { let mut tag_vec: Vec<_> = self @@ -763,7 +776,7 @@ impl Tagger { ); } - *token.tags_mut().expect("tags are initialized in tagger") = Tags::new(tag_vec); + *props.tags_mut(token)? = Tags::new(tag_vec); } Ok(()) diff --git a/nlprule/src/types.rs b/nlprule/src/types.rs index 87d78f5..d8cfd8f 100644 --- a/nlprule/src/types.rs +++ b/nlprule/src/types.rs @@ -69,6 +69,12 @@ impl<'t> Sentence<'t> { &self.tokens } + /// Gets the first token in this sentence. There is always at least one token in the sentence + /// so this will never panic. 
+ pub fn first(&self) -> &Token<'t> { + &self.tokens[0] + } + /// Gets the amount of tokens in this sentence. pub fn len(&self) -> usize { self.tokens.len() @@ -177,7 +183,7 @@ impl<'a, 't> Iterator for TagIter<'a, 't> { /// Contains all the local information about a token i. e. /// the text itself and the [WordData]s associated with the word. -#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)] +#[derive(Debug, Clone, PartialEq, Serialize, Deserialize, Default)] pub struct Tags<'t> { tags: Vec>, } @@ -249,8 +255,8 @@ pub struct Token<'t> { is_sentence_start: bool, is_sentence_end: bool, has_space_before: bool, - tags: Option>, - chunks: Option>, + pub tags: Option>, + pub chunks: Option>, } impl<'t> Token<'t> { @@ -325,20 +331,6 @@ impl<'t> Token<'t> { } } -impl<'t> Sentence<'t> { - pub fn init_tags(&mut self) { - for token in self.iter_mut() { - token.tags = Some(Tags::new(Vec::new())); - } - } - - pub fn init_chunks(&mut self) { - for token in self.iter_mut() { - token.chunks = Some(Vec::new()); - } - } -} - impl<'t> Token<'t> { /// The tags of this token. Contain information about the part-of-speech tags and lemmas. pub fn tags(&self) -> Result<&Tags<'t>, crate::Error> { From 4ab8e331aa890d3a9e401fd47862ff475974bc97 Mon Sep 17 00:00:00 2001 From: Benjamin Minixhofer Date: Sat, 1 May 2021 14:36:35 +0200 Subject: [PATCH 03/15] add Transform, Suggest and Tokenize traits --- nlprule/src/compile/parse_structure.rs | 2 - nlprule/src/properties.rs | 18 +++--- nlprule/src/rule/engine/composition.rs | 30 +++++----- nlprule/src/rule/engine/mod.rs | 26 ++++----- nlprule/src/rule/mod.rs | 67 +++++++++------------ nlprule/src/rules.rs | 11 ++-- nlprule/src/tokenizer.rs | 26 ++++++--- nlprule/src/tokenizer/chunk.rs | 14 ++--- nlprule/src/tokenizer/multiword.rs | 15 ++--- nlprule/src/tokenizer/tag.rs | 80 +++++++++++++------------- 10 files changed, 140 insertions(+), 149 deletions(-) diff --git a/nlprule/src/compile/parse_structure.rs b/nlprule/src/compile/parse_structure.rs index 4afdbb6..0be9924 100644 --- a/nlprule/src/compile/parse_structure.rs +++ b/nlprule/src/compile/parse_structure.rs @@ -902,7 +902,6 @@ impl Rule { category_name: String::new(), category_type: None, enabled: true, - properties: Default::default(), }) } } @@ -1328,7 +1327,6 @@ impl DisambiguationRule { disambiguations, examples, id: Index::default(), - properties: Default::default(), }) } } diff --git a/nlprule/src/properties.rs b/nlprule/src/properties.rs index f52a216..7d6cd21 100644 --- a/nlprule/src/properties.rs +++ b/nlprule/src/properties.rs @@ -3,7 +3,7 @@ use serde::{Deserialize, Serialize}; use crate::types::*; use thiserror::Error; -pub trait ReadProperties { +pub trait Suggest { fn properties(&self) -> Properties { Properties::default() } @@ -13,7 +13,7 @@ pub trait ReadProperties { } } -pub trait WriteProperties { +pub trait Transform { fn properties(&self) -> PropertiesMut { PropertiesMut::default() } @@ -21,6 +21,12 @@ pub trait WriteProperties { fn property_guard(&self, sentence: &mut Sentence) -> Result { self.properties().build(sentence) } + + fn transform<'t>(&'t self, sentence: Sentence<'t>) -> Result, Error>; +} + +pub trait Tokenize { + fn tokenize<'t>(&'t self, text: &'t str) -> Box>>; } #[derive(Error, Debug)] @@ -288,13 +294,7 @@ impl PropertyGuardMut { } } -// pub trait Transform { -// fn transform<'t>(&'t self, sentences: SentenceIter<'t>) -> SentenceIter<'t>; - -// fn in_properties(&self) -> -// } - -// pub struct Pipeline(T, P); +pub struct Pipeline(T); // type SentenceIter<'t> = Box>>; 
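Illustrative sketch (not part of the patch series): how the new Tokenize, Transform and Suggest traits are meant to compose once the `Pipeline` wiring introduced above is completed in a later commit of this series. The binary paths are the ones used by the crate's test suite; the error handling via `Box<dyn std::error::Error>` and the exact printed fields are assumptions for the example only.

use nlprule::{properties::Pipeline, Rules, Tokenizer};

fn main() -> Result<(), Box<dyn std::error::Error>> {
    // A `Tokenize` implementation splits text into sentences, `Transform`
    // implementations (tagger, chunker, multiword tagger, disambiguation rules)
    // enrich those sentences, and a `Suggest` implementation turns finished
    // sentences into suggestions.
    let tokenizer = Tokenizer::new("../storage/en_tokenizer.bin")?;
    let rules = Rules::new("../storage/en_rules.bin")?;

    // Chaining a tokenizer and a rule set, mirroring the test setup later in
    // this series (`Pipeline::<(&Tokenizer, &Rules)>::new(...)`).
    let pipeline = Pipeline::new((&tokenizer, &rules));

    // `suggest` yields one batch of suggestions per sentence.
    for suggestions in pipeline.suggest("I can due his homework") {
        for suggestion in suggestions {
            println!("{:?} -> {:?}", suggestion.span().char(), suggestion.replacements());
        }
    }

    Ok(())
}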
diff --git a/nlprule/src/rule/engine/composition.rs b/nlprule/src/rule/engine/composition.rs index 227b8f0..47a3ffa 100644 --- a/nlprule/src/rule/engine/composition.rs +++ b/nlprule/src/rule/engine/composition.rs @@ -164,7 +164,7 @@ pub trait Atomable: Send + Sync { fn is_match(&self, context: Context, position: usize) -> Result; - fn properties(&self) -> Properties { + fn compute_properties(&self) -> Properties { Properties::default() } } @@ -228,7 +228,7 @@ pub mod concrete { )) } - fn properties(&self) -> Properties { + fn compute_properties(&self) -> Properties { lazy_static! { static ref PROPERTIES: Properties = Properties::default().read(&[Property::Chunks]); } @@ -273,7 +273,7 @@ pub mod concrete { .is_match(tags, Some(context), Some(self.case_sensitive))) } - fn properties(&self) -> Properties { + fn compute_properties(&self) -> Properties { lazy_static! { static ref PROPERTIES: Properties = Properties::default().read(&[Property::Tags]); } @@ -328,8 +328,8 @@ impl Atomable for AndAtom { Ok(true) } - fn properties(&self) -> Properties { - self.atoms.iter().map(|x| x.properties()).collect() + fn compute_properties(&self) -> Properties { + self.atoms.iter().map(Atom::compute_properties).collect() } } @@ -353,8 +353,8 @@ impl Atomable for OrAtom { Ok(false) } - fn properties(&self) -> Properties { - self.atoms.iter().map(|x| x.properties()).collect() + fn compute_properties(&self) -> Properties { + self.atoms.iter().map(Atom::compute_properties).collect() } } @@ -372,8 +372,8 @@ impl Atomable for NotAtom { Ok(!self.atom.is_match(context, position)?) } - fn properties(&self) -> Properties { - self.atom.properties() + fn compute_properties(&self) -> Properties { + self.atom.compute_properties() } } @@ -401,8 +401,8 @@ impl Atomable for OffsetAtom { ) } - fn properties(&self) -> Properties { - self.atom.properties() + fn compute_properties(&self) -> Properties { + self.atom.compute_properties() } } @@ -603,16 +603,14 @@ pub struct Composition { pub(crate) can_stop_mask: Vec, } -impl ReadProperties for Composition { - fn properties(&self) -> Properties { +impl Composition { + pub fn compute_properties(&self) -> Properties { self.parts .iter() - .map(|part| part.atom.properties()) + .map(|part| part.atom.compute_properties()) .collect() } -} -impl Composition { fn next_can_match( &self, context: Context, diff --git a/nlprule/src/rule/engine/mod.rs b/nlprule/src/rule/engine/mod.rs index 5e4af22..ceda355 100644 --- a/nlprule/src/rule/engine/mod.rs +++ b/nlprule/src/rule/engine/mod.rs @@ -66,20 +66,6 @@ pub enum Engine { Text(Box, DefaultHashMap), } -impl ReadProperties for Engine { - fn properties(&self) -> Properties { - match &self { - Engine::Token(engine) => engine - .antipatterns - .iter() - .map(|x| x.properties()) - .chain(iter::once(engine.composition.properties())) - .collect(), - Engine::Text(_, _) => Properties::default(), - } - } -} - struct TokenMatches<'a> { engine: &'a TokenEngine, index: usize, @@ -165,6 +151,18 @@ impl<'a, 't> Iterator for EngineMatches<'a, 't> { } impl Engine { + pub fn compute_properties(&self) -> Properties { + match &self { + Engine::Token(engine) => engine + .antipatterns + .iter() + .map(|x| x.compute_properties()) + .chain(iter::once(engine.composition.compute_properties())) + .collect(), + Engine::Text(_, _) => Properties::default(), + } + } + pub fn get_matches<'a, 't>( &'a self, sentence: &'t MatchSentence, diff --git a/nlprule/src/rule/mod.rs b/nlprule/src/rule/mod.rs index 1779113..0bb8d77 100644 --- a/nlprule/src/rule/mod.rs +++ 
b/nlprule/src/rule/mod.rs @@ -10,7 +10,6 @@ use crate::{ use itertools::Itertools; use lazy_static::lazy_static; use log::{error, info, warn}; -use once_cell::sync::OnceCell; use serde::{Deserialize, Serialize}; use std::fmt; use std::{collections::HashSet, iter}; @@ -40,15 +39,6 @@ pub(crate) struct Unification { pub(crate) filters: Vec>, } -impl ReadProperties for Unification { - fn properties(&self) -> Properties { - lazy_static! { - static ref PROPERTIES: Properties = Properties::default().read(&[Property::Tags]); - } - *PROPERTIES - } -} - impl Unification { pub fn keep( &self, @@ -74,6 +64,13 @@ impl Unification { let result = filter_mask.iter().any(|x| *x); Ok(if negate { !result } else { result }) } + + pub fn compute_properties(&self) -> Properties { + lazy_static! { + static ref PROPERTIES: Properties = Properties::default().read(&[Property::Tags]); + } + *PROPERTIES + } } /// A disambiguation rule. @@ -104,19 +101,6 @@ pub struct DisambiguationRule { pub(crate) end: GraphId, pub(crate) examples: Vec, pub(crate) unification: Option, - #[serde(skip)] - pub(crate) properties: OnceCell, -} - -impl WriteProperties for DisambiguationRule { - fn properties(&self) -> PropertiesMut { - *self.properties.get_or_init(|| { - iter::once(self.engine.properties()) - .chain(self.unification.iter().map(|x| x.properties())) - .collect::() - .write(&[Property::Tags]) - }) - } } #[derive(Default, Debug)] @@ -152,6 +136,13 @@ impl Changes { } impl DisambiguationRule { + pub fn compute_properties(&self) -> PropertiesMut { + iter::once(self.engine.compute_properties()) + .chain(self.unification.iter().map(|x| x.compute_properties())) + .collect::() + .write(&[Property::Tags]) + } + /// Get a unique identifier of this rule. pub fn id(&self) -> &Index { &self.id @@ -203,11 +194,10 @@ impl DisambiguationRule { &'t self, sentence: &mut Sentence<'t>, changes: Changes, + guard: PropertyGuardMut, ) -> Result<(), crate::properties::Error> { log::info!("applying {}", self.id); - let guard = self.property_guard(sentence)?; - for spans in changes.0 { let mut groups = Vec::new(); let mut refs = sentence.iter_mut().collect::>(); @@ -254,7 +244,10 @@ impl DisambiguationRule { let shift_delta = Position { byte: 1, char: 1 }; let mut sentence_before_complete = sentence_before.clone().rshift(shift_delta); - let guard = self.property_guard(&mut sentence_before_complete)?; + let guard = self + .compute_properties() + .build(&mut sentence_before_complete)?; + let changes = self .apply(&MatchSentence::new( &sentence_before_complete, @@ -265,7 +258,7 @@ impl DisambiguationRule { let mut sentence_after = sentence_before.clone(); if !changes.is_empty() { - self.change(&mut sentence_after, changes).unwrap(); + self.change(&mut sentence_after, changes, guard).unwrap(); } info!("Tokens: {:#?}", sentence_before); @@ -448,18 +441,6 @@ pub struct Rule { pub(crate) category_type: Option, pub(crate) unification: Option, pub(crate) enabled: bool, - #[serde(skip)] - pub(crate) properties: OnceCell, -} - -impl ReadProperties for Rule { - fn properties(&self) -> Properties { - *self.properties.get_or_init(|| { - iter::once(self.engine.properties()) - .chain(self.unification.iter().map(|x| x.properties())) - .collect() - }) - } } impl fmt::Display for Rule { @@ -484,6 +465,12 @@ impl Rule { self.enabled } + pub fn compute_properties(&self) -> Properties { + iter::once(self.engine.compute_properties()) + .chain(self.unification.iter().map(|x| x.compute_properties())) + .collect() + } + /// Get a unique identifier of this rule. 
pub fn id(&self) -> &Index { &self.id @@ -552,7 +539,7 @@ impl Rule { let suggestions: Vec<_> = self .apply(&MatchSentence::new( &sentence, - self.property_guard(&sentence)?, + self.compute_properties().build(&sentence)?, )) .map(|s| s.unwrap().lshift(shift_delta)) .collect(); diff --git a/nlprule/src/rules.rs b/nlprule/src/rules.rs index a8108fa..59dd195 100644 --- a/nlprule/src/rules.rs +++ b/nlprule/src/rules.rs @@ -44,11 +44,14 @@ pub struct Rules { pub(crate) properties: OnceCell, } -impl ReadProperties for Rules { +impl Suggest for Rules { fn properties(&self) -> Properties { - *self - .properties - .get_or_init(|| self.rules.iter().map(ReadProperties::properties).collect()) + *self.properties.get_or_init(|| { + self.rules + .iter() + .map(|rule| rule.compute_properties()) + .collect() + }) } } diff --git a/nlprule/src/tokenizer.rs b/nlprule/src/tokenizer.rs index ccfd449..d7f55f2 100644 --- a/nlprule/src/tokenizer.rs +++ b/nlprule/src/tokenizer.rs @@ -163,11 +163,21 @@ pub struct Tokenizer { pub(crate) properties: OnceCell, } -impl WriteProperties for Tokenizer { +impl Transform for Tokenizer { fn properties(&self) -> PropertiesMut { - *self - .properties - .get_or_init(|| self.rules.iter().map(WriteProperties::properties).collect()) + *self.properties.get_or_init(|| { + self.rules + .iter() + .map(|rule| rule.compute_properties()) + .collect() + }) + } + + fn transform<'t>( + &'t self, + _sentence: Sentence<'t>, + ) -> Result, crate::properties::Error> { + unimplemented!() } } @@ -247,7 +257,7 @@ impl Tokenizer { .transpose()?; if let Some((index, changes)) = result { - self.rules[index].change(&mut sentence, changes)?; + self.rules[index].change(&mut sentence, changes, guard)?; i = index + 1; } else { i = n; @@ -363,14 +373,14 @@ impl Tokenizer { let mut sentence = Sentence::new(tokens, sentence, &self.tagger); - self.tagger.apply(&mut sentence).unwrap(); + sentence = self.tagger.transform(sentence).unwrap(); if let Some(chunker) = &self.chunker { - chunker.apply(&mut sentence).unwrap(); + sentence = chunker.transform(sentence).unwrap(); } if let Some(multiword_tagger) = &self.multiword_tagger { - multiword_tagger.apply(&mut sentence).unwrap(); + sentence = multiword_tagger.transform(sentence).unwrap(); } Some(sentence) diff --git a/nlprule/src/tokenizer/chunk.rs b/nlprule/src/tokenizer/chunk.rs index 18bf05e..35cd953 100644 --- a/nlprule/src/tokenizer/chunk.rs +++ b/nlprule/src/tokenizer/chunk.rs @@ -701,7 +701,7 @@ pub struct Chunker { pub(crate) chunk_model: MaxentChunker, } -impl WriteProperties for Chunker { +impl Transform for Chunker { fn properties(&self) -> PropertiesMut { lazy_static! { static ref PROPERTIES: PropertiesMut = Properties::default() @@ -710,12 +710,12 @@ impl WriteProperties for Chunker { } *PROPERTIES } -} -impl Chunker { - /// Populates the `.chunks` field of the passed tokens by predicting with the maximum entropy model. 
- pub fn apply(&self, sentence: &mut Sentence) -> Result<(), crate::properties::Error> { - let props = self.property_guard(sentence)?; + fn transform<'t>( + &'t self, + mut sentence: Sentence<'t>, + ) -> Result, crate::properties::Error> { + let props = self.property_guard(&mut sentence)?; let text = sentence.text().replace('’', "\'"); @@ -815,6 +815,6 @@ impl Chunker { } } - Ok(()) + Ok(sentence) } } diff --git a/nlprule/src/tokenizer/multiword.rs b/nlprule/src/tokenizer/multiword.rs index 13c4e7e..5168aed 100644 --- a/nlprule/src/tokenizer/multiword.rs +++ b/nlprule/src/tokenizer/multiword.rs @@ -38,22 +38,19 @@ pub struct MultiwordTagger { multiwords: Vec<(String, PosId<'static>)>, } -impl WriteProperties for MultiwordTagger { +impl Transform for MultiwordTagger { fn properties(&self) -> PropertiesMut { lazy_static! { static ref PROPERTIES: PropertiesMut = Properties::default().write(&[Property::Tags]); } *PROPERTIES } -} -impl MultiwordTagger { - /// Populates the `.multiword_data` field of the passed tokens by checking if any known phrases are contained. - pub fn apply<'t>( + fn transform<'t>( &'t self, - sentence: &mut Sentence<'t>, - ) -> Result<(), crate::properties::Error> { - let props = self.property_guard(sentence)?; + mut sentence: Sentence<'t>, + ) -> Result, crate::properties::Error> { + let props = self.property_guard(&mut sentence)?; let tagger = sentence.tagger(); @@ -89,6 +86,6 @@ impl MultiwordTagger { } } - Ok(()) + Ok(sentence) } } diff --git a/nlprule/src/tokenizer/tag.rs b/nlprule/src/tokenizer/tag.rs index 8ff09ea..fbdacf7 100644 --- a/nlprule/src/tokenizer/tag.rs +++ b/nlprule/src/tokenizer/tag.rs @@ -548,13 +548,52 @@ pub struct Tagger { pub(crate) lang_options: TaggerLangOptions, } -impl WriteProperties for Tagger { +impl Transform for Tagger { fn properties(&self) -> PropertiesMut { lazy_static! { static ref PROPERTIES: PropertiesMut = Properties::default().write(&[Property::Tags]); } *PROPERTIES } + + fn transform<'t>( + &'t self, + mut sentence: Sentence<'t>, + ) -> Result, crate::properties::Error> { + let props = self.property_guard(&mut sentence)?; + + for token in sentence.iter_mut() { + let mut tag_vec: Vec<_> = self + .get_tags_with_options( + token.as_str(), + if token.is_sentence_start() { + Some(true) + } else { + None + }, + None, + ) + .collect(); + + tag_vec.push( + WordData::new( + self.id_word(token.as_str().into()), + PosId::special(SpecialPos::None), + ) + .freeze(), + ); + + if token.is_sentence_end() { + tag_vec.push( + WordData::new(WordId::empty(), PosId::special(SpecialPos::SentEnd)).freeze(), + ); + } + + *props.tags_mut(token)? = Tags::new(tag_vec); + } + + Ok(sentence) + } } impl Tagger { @@ -742,43 +781,4 @@ impl Tagger { pub fn get_tags<'a>(&'a self, word: &'a str) -> TagIter<'a> { self.get_tags_with_options(word, None, None) } - - pub fn apply<'t>( - &'t self, - sentence: &mut Sentence<'t>, - ) -> Result<(), crate::properties::Error> { - let props = self.property_guard(sentence)?; - - for token in sentence.iter_mut() { - let mut tag_vec: Vec<_> = self - .get_tags_with_options( - token.as_str(), - if token.is_sentence_start() { - Some(true) - } else { - None - }, - None, - ) - .collect(); - - tag_vec.push( - WordData::new( - self.id_word(token.as_str().into()), - PosId::special(SpecialPos::None), - ) - .freeze(), - ); - - if token.is_sentence_end() { - tag_vec.push( - WordData::new(WordId::empty(), PosId::special(SpecialPos::SentEnd)).freeze(), - ); - } - - *props.tags_mut(token)? 
= Tags::new(tag_vec); - } - - Ok(()) - } } From 1dd7587dca3a61bf50220e62240cd0d79699657f Mon Sep 17 00:00:00 2001 From: Benjamin Minixhofer Date: Tue, 4 May 2021 10:57:25 +0200 Subject: [PATCH 04/15] implement Tokenize --- nlprule/src/compile/impls.rs | 9 + nlprule/src/compile/parse_structure.rs | 10 +- nlprule/src/properties.rs | 183 +++++++++++++--- nlprule/src/rule/disambiguation.rs | 35 ++-- nlprule/src/rule/engine/composition.rs | 24 ++- nlprule/src/rule/mod.rs | 4 +- nlprule/src/rules.rs | 134 ++++-------- nlprule/src/tokenizer.rs | 53 ++--- nlprule/src/tokenizer/multiword.rs | 2 +- nlprule/src/tokenizer/tag.rs | 8 +- nlprule/src/types.rs | 66 +++--- nlprule/tests/tests.rs | 276 +++++++++++++------------ 12 files changed, 442 insertions(+), 362 deletions(-) diff --git a/nlprule/src/compile/impls.rs b/nlprule/src/compile/impls.rs index 9541a22..3e2ef9a 100644 --- a/nlprule/src/compile/impls.rs +++ b/nlprule/src/compile/impls.rs @@ -371,6 +371,14 @@ impl Tokenizer { let rules = super::parse_structure::read_disambiguation_rules(path); let mut error = None; + let mut whitelist = DefaultHashSet::new(); + + for (word, _) in build_info.tagger().word_store() { + if word.contains(|c| lang_options.extra_split_chars.contains(&c)) { + whitelist.insert(word.to_owned()); + } + } + let rules: Vec<_> = rules .into_iter() .filter_map(|x| match x { @@ -438,6 +446,7 @@ impl Tokenizer { multiword_tagger, rules, lang_options, + whitelist, properties: Default::default(), }) } diff --git a/nlprule/src/compile/parse_structure.rs b/nlprule/src/compile/parse_structure.rs index 0be9924..aea79cf 100644 --- a/nlprule/src/compile/parse_structure.rs +++ b/nlprule/src/compile/parse_structure.rs @@ -910,7 +910,7 @@ fn parse_tag_form( form: &str, is_sentence_end: bool, info: &mut BuildInfo, -) -> Result, Error> { +) -> Result>, Error> { lazy_static! { static ref REGEX: Regex = Regex::new(r"(.+?)\[(.+?)\]".into()); } @@ -922,7 +922,7 @@ fn parse_tag_form( let text = captures.get(1).expect("1st regex group exists").as_str(); let tags = captures.get(2).expect("2nd regex group exists").as_str(); - let mut tag_vec: Vec<_> = tags + let mut tags: DefaultHashSet<_> = tags .split(',') .filter_map(|x| { if x == "" { @@ -942,7 +942,7 @@ fn parse_tag_form( }) .collect(); - tag_vec.push( + tags.insert( WordData::new( info.tagger.id_word(text.to_owned().into()), PosId::special(SpecialPos::None), @@ -951,11 +951,9 @@ fn parse_tag_form( ); if is_sentence_end { - tag_vec.push(WordData::new(WordId::empty(), PosId::special(SpecialPos::SentEnd)).freeze()); + tags.insert(WordData::new(WordId::empty(), PosId::special(SpecialPos::SentEnd)).freeze()); } - let tags = Tags::new(tag_vec); - Ok(tags) } diff --git a/nlprule/src/properties.rs b/nlprule/src/properties.rs index 7d6cd21..6089aba 100644 --- a/nlprule/src/properties.rs +++ b/nlprule/src/properties.rs @@ -3,6 +3,25 @@ use serde::{Deserialize, Serialize}; use crate::types::*; use thiserror::Error; +/// Correct a text by applying suggestions to it. +/// In the case of multiple possible replacements, always chooses the first one. 
+pub fn apply_suggestions(text: &str, suggestions: &[Suggestion]) -> String { + let mut offset: isize = 0; + let mut chars: Vec<_> = text.chars().collect(); + + for suggestion in suggestions { + let replacement: Vec<_> = suggestion.replacements()[0].chars().collect(); + chars.splice( + (suggestion.span().char().start as isize + offset) as usize + ..(suggestion.span().char().end as isize + offset) as usize, + replacement.iter().cloned(), + ); + offset = offset + replacement.len() as isize - suggestion.span().char().len() as isize; + } + + chars.into_iter().collect() +} + pub trait Suggest { fn properties(&self) -> Properties { Properties::default() @@ -11,6 +30,34 @@ pub trait Suggest { fn property_guard(&self, sentence: &Sentence) -> Result { self.properties().build(sentence) } + + fn suggest(&self, sentence: &Sentence) -> Result, Error>; + + fn correct(&self, sentence: &Sentence) -> Result { + let suggestions = self.suggest(sentence)?; + Ok(apply_suggestions(sentence.text(), &suggestions)) + } +} + +impl<'a, T> Suggest for &'a T +where + T: Suggest, +{ + fn properties(&self) -> Properties { + (*self).properties() + } + + fn property_guard(&self, sentence: &Sentence) -> Result { + (*self).property_guard(sentence) + } + + fn suggest(&self, sentence: &Sentence) -> Result, Error> { + (*self).suggest(sentence) + } + + fn correct(&self, sentence: &Sentence) -> Result { + (*self).correct(sentence) + } } pub trait Transform { @@ -25,8 +72,34 @@ pub trait Transform { fn transform<'t>(&'t self, sentence: Sentence<'t>) -> Result, Error>; } +impl<'a, T> Transform for &'a T +where + T: Transform, +{ + fn properties(&self) -> PropertiesMut { + (*self).properties() + } + + fn property_guard(&self, sentence: &mut Sentence) -> Result { + (*self).property_guard(sentence) + } + + fn transform<'t>(&'t self, sentence: Sentence<'t>) -> Result, Error> { + (*self).transform(sentence) + } +} + pub trait Tokenize { - fn tokenize<'t>(&'t self, text: &'t str) -> Box>>; + fn tokenize<'t>(&'t self, text: &'t str) -> Box> + 't>; +} + +impl<'a, T> Tokenize for &'a T +where + T: Tokenize, +{ + fn tokenize<'t>(&'t self, text: &'t str) -> Box> + 't> { + (*self).tokenize(text) + } } #[derive(Error, Debug)] @@ -64,6 +137,20 @@ impl Bitset { self.0 |= other.0; self } + + pub fn intersection(mut self, other: Bitset) -> Self { + self.0 &= other.0; + self + } + + pub fn inverse(mut self) -> Self { + self.0 = !self.0; + self + } + + pub fn is_empty(&self) -> bool { + self.0 == 0 + } } #[derive(Debug, Clone, Copy, Serialize, Deserialize, Default)] @@ -164,6 +251,19 @@ impl PropertiesMut { self } + pub fn chain(mut self, next: PropertiesMut) -> Result { + if !next + .read_mask + .intersection(self.write_mask.inverse()) + .is_empty() + { + unimplemented!() + } + + self.write_mask = self.write_mask.union(next.write_mask); + Ok(self) + } + pub fn build(&self, sentence: &mut Sentence) -> Result { for property in Property::properties() { if self.write_mask.contains(property) { @@ -296,33 +396,54 @@ impl PropertyGuardMut { pub struct Pipeline(T); -// type SentenceIter<'t> = Box>>; - -// macro_rules! 
impl_pipeline { -// ( $first:ident, $($name:ident),+) => { -// impl<$first: Tokenize, $($name: Transform,)+> Tokenize for Pipeline<($first, $($name,)+)> { -// #[allow(non_snake_case)] -// fn tokenize<'t>(&'t self, text: &'t str) -> SentenceIter<'t> { -// let (ref $first, $(ref $name),+) = self.0; -// let sentences = $first.tokenize(text); -// $(let sentences = $name.transform(sentences);)+ -// sentences -// } -// } - -// impl<$first: Transform, $($name: Transform,)+> Transform for Pipeline<($first, $($name,)+)> { -// #[allow(non_snake_case)] -// fn transform<'t>(&'t self, sentences: SentenceIter<'t>) -> SentenceIter<'t> { -// let (ref $first, $(ref $name),+) = self.0; -// let sentences = $first.transform(sentences); -// $(let sentences = $name.transform(sentences);)+ -// sentences -// } -// } -// }; -// } - -// impl_pipeline! { A, B } -// impl_pipeline! { A, B, C } -// impl_pipeline! { A, B, C, D } -// impl_pipeline! { A, B, C, D, E } +macro_rules! impl_pipeline { + ( $first:ident, $last:ident, $($name:ident),*) => { + impl<$first: Tokenize, $($name: Transform,)* $last: Transform> Tokenize for Pipeline<($first, $($name,)* $last)> { + #[allow(non_snake_case)] + fn tokenize<'t>(&'t self, text: &'t str) -> Box> + 't> { + let (ref $first, $(ref $name,)* ref $last) = self.0; + let sentences = $first.tokenize(text).map(move |mut sentence| { + $(sentence = $name.transform(sentence).unwrap();)* + sentence = $last.transform(sentence).unwrap(); + sentence + }); + + Box::new(sentences) + } + } + + impl<$first: Transform, $($name: Transform,)* $last: Transform> Transform for Pipeline<($first, $($name,)* $last)> { + #[allow(non_snake_case)] + fn transform<'t>(&'t self, mut sentence: Sentence<'t>) -> Result, crate::properties::Error> { + let (ref $first, $(ref $name,)* ref $last) = self.0; + sentence = $first.transform(sentence)?; + $(sentence = $name.transform(sentence)?;)* + sentence = $last.transform(sentence)?; + Ok(sentence) + } + } + + impl<$first: Tokenize, $($name: Transform,)* $last: Suggest> Pipeline<($first, $($name,)* $last)> { + pub fn new(components: ($first, $($name,)* $last)) -> Self { + Pipeline(components) + } + + #[allow(non_snake_case)] + pub fn suggest<'t>(&'t self, text: &'t str) -> impl Iterator> + 't { + let (ref $first, $(ref $name,)* ref $last) = self.0; + #[allow(unused_mut)] + let sentences = $first.tokenize(text).map(move |mut sentence| { + $(sentence = $name.transform(sentence).unwrap();)* + $last.suggest(&sentence).unwrap() + }); + + sentences + } + } + }; +} + +impl_pipeline! { A, B, } +impl_pipeline! { A, C, B } +impl_pipeline! { A, D, B, C } +impl_pipeline! { A, E, B, C, D } diff --git a/nlprule/src/rule/disambiguation.rs b/nlprule/src/rule/disambiguation.rs index fac4fe3..fcb4767 100644 --- a/nlprule/src/rule/disambiguation.rs +++ b/nlprule/src/rule/disambiguation.rs @@ -82,18 +82,19 @@ impl Disambiguation { match data_or_filter { either::Left(limit) => { for token in group.into_iter() { - let last = guard - .tags(token)? - .iter() - .next() - .and_then(|x| { - if *x.lemma() != WordId::empty() { - Some(x.lemma().clone()) - } else { - None - } - }) - .unwrap_or_else(|| token.text().clone()); + let last = { + let tags = guard.tags(token)?; + tags.iter() + .next() + .and_then(|x| { + if *x.lemma() != WordId::empty() { + Some(x.lemma().clone()) + } else { + None + } + }) + .unwrap_or_else(|| tags.id().clone()) + }; guard.tags_mut(token)?.retain(|x| x.pos() == limit.pos()); @@ -103,7 +104,7 @@ impl Disambiguation { .tags_mut(token)? 
.push(WordData::new(last, limit.pos().clone())); } else { - let lemma = token.text().clone(); + let lemma = guard.tags(token)?.id().clone(); guard .tags_mut(token)? @@ -126,7 +127,7 @@ impl Disambiguation { for token in group.into_iter() { let data = WordData::new( if data.lemma().as_str().is_empty() { - token.text().clone() + guard.tags(token)?.id().clone() } else { data.lemma().clone() }, @@ -145,7 +146,7 @@ impl Disambiguation { for token in group.into_iter() { let data = WordData::new( if data.lemma().as_str().is_empty() { - token.text().clone() + guard.tags(token)?.id().clone() } else { data.lemma().clone() }, @@ -223,8 +224,8 @@ impl Disambiguation { pub struct DisambiguationChange { pub text: String, pub char_span: Range, - pub before: Tags<'static>, - pub after: Tags<'static>, + pub before: DefaultHashSet>, + pub after: DefaultHashSet>, } #[derive(Debug, Serialize, Deserialize, Clone)] diff --git a/nlprule/src/rule/engine/composition.rs b/nlprule/src/rule/engine/composition.rs index 47a3ffa..a444fbd 100644 --- a/nlprule/src/rule/engine/composition.rs +++ b/nlprule/src/rule/engine/composition.rs @@ -202,9 +202,18 @@ pub mod concrete { ) -> Result { let (sentence, _) = context; - Ok(self - .matcher - .is_match(&sentence.index(position).text(), Some(context), None)) + Ok(self.matcher.is_match( + sentence.guard().tags(sentence.index(position))?.id(), + Some(context), + None, + )) + } + + fn compute_properties(&self) -> Properties { + lazy_static! { + static ref PROPERTIES: Properties = Properties::default().read(&[Property::Tags]); + } + *PROPERTIES } } @@ -455,17 +464,22 @@ impl GraphId { #[derive(Debug, Clone)] pub struct MatchSentence<'t> { sentence: &'t Sentence<'t>, + sent_start: Token<'t>, guard: PropertyGuard, } impl<'t> MatchSentence<'t> { pub fn new(sentence: &'t Sentence<'t>, guard: PropertyGuard) -> Self { - MatchSentence { sentence, guard } + MatchSentence { + sentence, + sent_start: Token::sent_start(), + guard, + } } pub fn index(&self, index: usize) -> &Token { match index { - 0 => &*crate::types::SENT_START, + 0 => &self.sent_start, i => &self.sentence.tokens()[i - 1], } } diff --git a/nlprule/src/rule/mod.rs b/nlprule/src/rule/mod.rs index 0bb8d77..e14903e 100644 --- a/nlprule/src/rule/mod.rs +++ b/nlprule/src/rule/mod.rs @@ -233,7 +233,7 @@ impl DisambiguationRule { let sentence_before = tokenizer .disambiguate_up_to_id( tokenizer - .tokenize(text) + .tokenize_sentence(text) .expect("test text must not be empty"), Some(&self.id), ) @@ -529,7 +529,7 @@ impl Rule { let sentence = tokenizer .disambiguate( tokenizer - .tokenize(&test.text()) + .tokenize_sentence(&test.text()) .expect("test text must not be empty."), ) .unwrap() diff --git a/nlprule/src/rules.rs b/nlprule/src/rules.rs index 59dd195..bd90f96 100644 --- a/nlprule/src/rules.rs +++ b/nlprule/src/rules.rs @@ -3,7 +3,7 @@ use crate::properties::*; use crate::types::*; use crate::utils::parallelism::MaybeParallelRefIterator; -use crate::{rule::id::Selector, rule::MatchSentence, rule::Rule, tokenizer::Tokenizer, Error}; +use crate::{rule::id::Selector, rule::MatchSentence, rule::Rule, Error}; use fs_err::File; use once_cell::sync::OnceCell; use serde::{Deserialize, Serialize}; @@ -37,7 +37,7 @@ impl Default for RulesLangOptions { } /// A set of grammatical error correction rules. 
-#[derive(Serialize, Deserialize, Default)] +#[derive(Serialize, Deserialize, Default, Clone)] pub struct Rules { pub(crate) rules: Vec, #[serde(skip)] @@ -53,58 +53,8 @@ impl Suggest for Rules { .collect() }) } -} - -impl Rules { - /// Creates a new rule set from a path to a binary. - /// - /// # Errors - /// - If the file can not be opened. - /// - If the file content can not be deserialized to a rules set. - pub fn new>(p: P) -> Result { - let reader = BufReader::new(File::open(p.as_ref())?); - let rules: Rules = bincode::deserialize_from(reader)?; - Ok(rules) - } - - /// Creates a new rules set from a reader. - pub fn from_reader(reader: R) -> Result { - Ok(bincode::deserialize_from(reader)?) - } - - /// Serializes this rules set to a writer. - pub fn to_writer(&self, writer: W) -> Result<(), Error> { - Ok(bincode::serialize_into(writer, &self)?) - } - - /// All rules ordered by priority. - pub fn rules(&self) -> &[Rule] { - &self.rules - } - - /// All rules ordered by priority (mutable). - pub fn rules_mut(&mut self) -> &mut [Rule] { - &mut self.rules - } - - /// Returns an iterator over all rules matching the selector. - pub fn select<'a>(&'a self, selector: &'a Selector) -> RulesIter<'a> { - RulesIter { - inner: self.rules.iter(), - selector: Some(selector), - } - } - - /// Returns an iterator over all rules matching the selector (mutable). - pub fn select_mut<'a>(&'a mut self, selector: &'a Selector) -> RulesIterMut<'a> { - RulesIterMut { - inner: self.rules.iter_mut(), - selector: Some(selector), - } - } - /// Compute the suggestions for the given sentence by checking all rules. - pub fn apply(&self, sentence: &Sentence) -> Result, crate::properties::Error> { + fn suggest(&self, sentence: &Sentence) -> Result, crate::properties::Error> { let sentence = MatchSentence::new(sentence, self.property_guard(sentence)?); let mut output: Vec<(usize, Suggestion)> = self @@ -153,55 +103,55 @@ impl Rules { }) .collect()) } +} - /// Compute the suggestions for a text by checking all rules. - pub fn suggest( - &self, - text: &str, - tokenizer: &Tokenizer, - ) -> Result, crate::properties::Error> { - if text.is_empty() { - return Ok(Vec::new()); - } - - let mut suggestions = Vec::new(); +impl Rules { + /// Creates a new rule set from a path to a binary. + /// + /// # Errors + /// - If the file can not be opened. + /// - If the file content can not be deserialized to a rules set. + pub fn new>(p: P) -> Result { + let reader = BufReader::new(File::open(p.as_ref())?); + let rules: Rules = bincode::deserialize_from(reader)?; + Ok(rules) + } - // get suggestions sentence by sentence - for sentence in tokenizer.pipe(text) { - suggestions.extend(self.apply(&sentence?)?); - } + /// Creates a new rules set from a reader. + pub fn from_reader(reader: R) -> Result { + Ok(bincode::deserialize_from(reader)?) + } - Ok(suggestions) + /// Serializes this rules set to a writer. + pub fn to_writer(&self, writer: W) -> Result<(), Error> { + Ok(bincode::serialize_into(writer, &self)?) } - /// Correct a text by first tokenizing, then finding all suggestions and choosing the first replacement of each suggestion. - pub fn correct( - &self, - text: &str, - tokenizer: &Tokenizer, - ) -> Result { - let suggestions = self.suggest(text, tokenizer)?; - Ok(apply_suggestions(text, &suggestions)) + /// All rules ordered by priority. + pub fn rules(&self) -> &[Rule] { + &self.rules } -} -/// Correct a text by applying suggestions to it. -/// In the case of multiple possible replacements, always chooses the first one. 
-pub fn apply_suggestions(text: &str, suggestions: &[Suggestion]) -> String { - let mut offset: isize = 0; - let mut chars: Vec<_> = text.chars().collect(); + /// All rules ordered by priority (mutable). + pub fn rules_mut(&mut self) -> &mut [Rule] { + &mut self.rules + } - for suggestion in suggestions { - let replacement: Vec<_> = suggestion.replacements()[0].chars().collect(); - chars.splice( - (suggestion.span().char().start as isize + offset) as usize - ..(suggestion.span().char().end as isize + offset) as usize, - replacement.iter().cloned(), - ); - offset = offset + replacement.len() as isize - suggestion.span().char().len() as isize; + /// Returns an iterator over all rules matching the selector. + pub fn select<'a>(&'a self, selector: &'a Selector) -> RulesIter<'a> { + RulesIter { + inner: self.rules.iter(), + selector: Some(selector), + } } - chars.into_iter().collect() + /// Returns an iterator over all rules matching the selector (mutable). + pub fn select_mut<'a>(&'a mut self, selector: &'a Selector) -> RulesIterMut<'a> { + RulesIterMut { + inner: self.rules.iter_mut(), + selector: Some(selector), + } + } } /// An iterator over references to rules. diff --git a/nlprule/src/tokenizer.rs b/nlprule/src/tokenizer.rs index d7f55f2..9e37864 100644 --- a/nlprule/src/tokenizer.rs +++ b/nlprule/src/tokenizer.rs @@ -90,7 +90,7 @@ impl Default for TokenizerLangOptions { } /// An iterator over [IncompleteSentence]s. Has the same properties as [SentenceIter]. -pub struct IncompleteSentenceIter<'t> { +pub struct SentenceIter<'t> { text: &'t str, splits: Vec>, tokenizer: &'t Tokenizer, @@ -98,7 +98,7 @@ pub struct IncompleteSentenceIter<'t> { position: Position, } -impl<'t> Iterator for IncompleteSentenceIter<'t> { +impl<'t> Iterator for SentenceIter<'t> { type Item = Sentence<'t>; fn next(&mut self) -> Option { @@ -119,7 +119,7 @@ impl<'t> Iterator for IncompleteSentenceIter<'t> { let sentence = self .tokenizer - .tokenize(&self.text[range.clone()]) + .tokenize_sentence(&self.text[range.clone()]) .map(|x| x.rshift(self.position)); self.position += Position { @@ -135,12 +135,12 @@ impl<'t> Iterator for IncompleteSentenceIter<'t> { /// - Preceding whitespace is always included so the first sentence always starts at byte and char index zero. /// - There are no gaps between sentences i.e. `sentence[i - 1].span().end() == sentence[i].span().start()`. /// - Behavior for trailing whitespace is not defined. Can be included in the last sentence or not be part of any sentence. -pub struct SentenceIter<'t> { - inner: IncompleteSentenceIter<'t>, +pub struct AnalyzedSentenceIter<'t> { + inner: SentenceIter<'t>, tokenizer: &'t Tokenizer, } -impl<'t> Iterator for SentenceIter<'t> { +impl<'t> Iterator for AnalyzedSentenceIter<'t> { type Item = Result, crate::properties::Error>; fn next(&mut self) -> Option { @@ -154,6 +154,7 @@ impl<'t> Iterator for SentenceIter<'t> { #[derive(Serialize, Deserialize, Default, Clone)] pub struct Tokenizer { pub(crate) rules: Vec, + pub(crate) whitelist: DefaultHashSet, pub(crate) chunker: Option, pub(crate) sentencizer: srx::Rules, pub(crate) multiword_tagger: Option, @@ -181,6 +182,18 @@ impl Transform for Tokenizer { } } +impl Tokenize for Tokenizer { + fn tokenize<'t>(&'t self, text: &'t str) -> Box> + 't> { + Box::new(SentenceIter { + text, + splits: self.sentencizer.split_ranges(text), + tokenizer: &self, + index: 0, + position: Position::default(), + }) + } +} + impl Tokenizer { /// Creates a new tokenizer from a path to a binary. 
/// @@ -287,7 +300,7 @@ impl Tokenizer { let mut tokens = Vec::new(); for pretoken in split(text, split_char) { // if the token is in the dictionary, we add it right away - if self.tagger.id_word(pretoken.into()).1.is_some() { + if self.whitelist.contains(pretoken) { tokens.push(pretoken); } else { // otherwise, potentially split it again with `extra_split_chars` e. g. "-" @@ -333,8 +346,7 @@ impl Tokenizer { } /// Tokenize the given sentence. This applies chunking and tagging, but does not do disambiguation. - // NB: this is not public because it could be easily misused by passing a text instead of one sentence. - pub(crate) fn tokenize<'t>(&'t self, sentence: &'t str) -> Option> { + pub(crate) fn tokenize_sentence<'t>(&'t self, sentence: &'t str) -> Option> { if sentence.trim().is_empty() { return None; } @@ -356,10 +368,8 @@ impl Tokenizer { let is_sentence_start = i == 0; let is_sentence_end = i == n_token_strs - 1; - let id = self.tagger.id_word(token_text.into()); - Token::new( - id, + token_text, Span::new( byte_start..byte_start + token_text.len(), char_start..char_start + token_text.chars().count(), @@ -385,23 +395,4 @@ impl Tokenizer { Some(sentence) } - - /// Splits the text into sentences and tokenizes each sentence. - pub fn sentencize<'t>(&'t self, text: &'t str) -> IncompleteSentenceIter<'t> { - IncompleteSentenceIter { - text, - splits: self.sentencizer.split_ranges(text), - tokenizer: &self, - index: 0, - position: Position::default(), - } - } - - /// Applies the entire tokenization pipeline including sentencization, tagging, chunking and disambiguation. - pub fn pipe<'t>(&'t self, text: &'t str) -> SentenceIter<'t> { - SentenceIter { - inner: self.sentencize(text), - tokenizer: &self, - } - } } diff --git a/nlprule/src/tokenizer/multiword.rs b/nlprule/src/tokenizer/multiword.rs index 5168aed..f56ff9c 100644 --- a/nlprule/src/tokenizer/multiword.rs +++ b/nlprule/src/tokenizer/multiword.rs @@ -63,7 +63,7 @@ impl Transform for MultiwordTagger { .enumerate() .map(|(i, x)| { start_indices.insert(byte_index, i); - byte_index += x.text().0.len(); + byte_index += x.as_str().len(); end_indices.insert(byte_index, i); byte_index += " ".len(); diff --git a/nlprule/src/tokenizer/tag.rs b/nlprule/src/tokenizer/tag.rs index fbdacf7..fee120d 100644 --- a/nlprule/src/tokenizer/tag.rs +++ b/nlprule/src/tokenizer/tag.rs @@ -59,6 +59,12 @@ impl<'t> fmt::Debug for WordId<'t> { } } +impl<'t> Default for WordId<'t> { + fn default() -> Self { + WordId::empty() + } +} + impl<'t> WordId<'t> { pub(crate) fn id(&self) -> &Option { &self.1 @@ -589,7 +595,7 @@ impl Transform for Tagger { ); } - *props.tags_mut(token)? = Tags::new(tag_vec); + *props.tags_mut(token)? = Tags::new(self.id_word(token.as_str().into()), tag_vec); } Ok(sentence) diff --git a/nlprule/src/types.rs b/nlprule/src/types.rs index d8cfd8f..360d1c2 100644 --- a/nlprule/src/types.rs +++ b/nlprule/src/types.rs @@ -185,13 +185,18 @@ impl<'a, 't> Iterator for TagIter<'a, 't> { /// the text itself and the [WordData]s associated with the word. #[derive(Debug, Clone, PartialEq, Serialize, Deserialize, Default)] pub struct Tags<'t> { + id: WordId<'t>, tags: Vec>, } impl<'t> Tags<'t> { /// Creates new [Tags]. - pub fn new(tags: Vec>) -> Self { - Tags { tags } + pub fn new(id: WordId<'t>, tags: Vec>) -> Self { + Tags { id, tags } + } + + pub fn id(&self) -> &WordId<'t> { + &self.id } /// Multiple pairs of (lemma, part-of-speech) associated with this token. 
@@ -227,30 +232,16 @@ impl<'t> Tags<'t> { /// Converts this struct to a struct with `'static` lifetime by cloning borrowed data. pub fn into_static(self) -> Tags<'static> { Tags { + id: self.id.into_static(), tags: self.tags.into_iter().map(|x| x.into_static()).collect(), } } } -lazy_static! { - pub(crate) static ref SENT_START: Token<'static> = Token { - text: WordId::empty(), - span: Span::default(), - is_sentence_start: false, // `is_sentence_start` marks the first *real* token in the sentence. - is_sentence_end: false, - has_space_before: false, - tags: Some(Tags::new(vec![WordData::new( - WordId::empty(), - PosId::special(SpecialPos::SentStart), - )],)), - chunks: Some(Vec::new()), - }; -} - /// A token where varying levels of information are set. #[derive(Debug, Clone, PartialEq)] pub struct Token<'t> { - text: WordId<'t>, + text: &'t str, span: Span, is_sentence_start: bool, is_sentence_end: bool, @@ -261,7 +252,7 @@ pub struct Token<'t> { impl<'t> Token<'t> { pub(crate) fn new( - text: WordId<'t>, + text: &'t str, span: Span, is_sentence_start: bool, is_sentence_end: bool, @@ -278,17 +269,27 @@ impl<'t> Token<'t> { } } - /// Gets the word id for this token. - pub fn text(&self) -> &WordId<'t> { - &self.text + pub(crate) fn sent_start<'a>() -> Token<'a> { + Token { + text: "", + span: Span::default(), + is_sentence_start: false, // `is_sentence_start` marks the first *real* token in the sentence. + is_sentence_end: false, + has_space_before: false, + tags: Some(Tags::new( + WordId::empty(), + vec![WordData::new( + WordId::empty(), + PosId::special(SpecialPos::SentStart), + )], + )), + chunks: Some(Vec::new()), + } } /// Gets the token as string. pub fn as_str(&self) -> &'t str { - // we know that the token text can never be changed, and it is created - // from a slice of the input text, so the `WordId` will always contain - // a borrowed Cow. - self.text.as_ref_str() + self.text } /// The span of this sentence. @@ -316,19 +317,6 @@ impl<'t> Token<'t> { self.span = self.span.rshift(position); self } - - /// Converts this struct to a struct with `'static` lifetime by cloning borrowed data. - pub fn into_static(self) -> Token<'static> { - Token { - text: self.text.into_static(), - tags: self.tags.map(Tags::into_static), - span: self.span, - is_sentence_start: self.is_sentence_start, - is_sentence_end: self.is_sentence_end, - has_space_before: self.has_space_before, - chunks: self.chunks, - } - } } impl<'t> Token<'t> { diff --git a/nlprule/tests/tests.rs b/nlprule/tests/tests.rs index 7ea7f86..7de0559 100644 --- a/nlprule/tests/tests.rs +++ b/nlprule/tests/tests.rs @@ -1,7 +1,7 @@ use std::convert::TryInto; use lazy_static::lazy_static; -use nlprule::{rule::id::Category, types::Position, Rules, Tokenizer}; +use nlprule::{properties::*, rule::id::Category, types::Position, Rules, Tokenizer}; use quickcheck_macros::quickcheck; const TOKENIZER_PATH: &str = "../storage/en_tokenizer.bin"; @@ -14,142 +14,144 @@ lazy_static! 
{ #[test] fn can_tokenize_empty_text() { - let sentences: Vec<_> = TOKENIZER.pipe("").collect(); - assert!(sentences.is_empty()); -} - -#[test] -fn handles_whitespace_correctly() { - // preceding whitespace has to be included, trailing whitespace behavior is unspecified - let text = " hello.\ttest.\t\t"; - - let mut sentences = TOKENIZER.pipe(text); - assert_eq!( - &text[sentences.next().unwrap().unwrap().span().byte().clone()], - " hello.\t" - ); - assert_eq!( - &text[sentences.next().unwrap().unwrap().span().byte().clone()], - "test.\t" - ); - assert!(sentences.next().is_none()); -} - -#[quickcheck] -fn can_tokenize_anything(text: String) -> bool { - let _: Vec<_> = TOKENIZER.pipe(&text).collect(); - true -} - -#[test] -fn suggest_indices_are_relative_to_input_text() { - let suggestions = RULES - .suggest( - "I can due his homework for 10€. I can due his homework.", - &*TOKENIZER, - ) - .unwrap(); - - assert_eq!(*suggestions[0].span().char(), 6..9); - assert_eq!(*suggestions[0].span().byte(), 6..9); - - assert_eq!(*suggestions[1].span().char(), 38..41); - assert_eq!( - *suggestions[1].span().byte(), - 38 + '€'.len_utf8() - 1..41 + '€'.len_utf8() - 1 - ); -} - -#[test] -fn sentence_spans_correct() { - let text = "A short test. A test with emoji 😊."; - - let sentences: Vec<_> = TOKENIZER.pipe(text).collect::>().unwrap(); - assert_eq!(sentences.len(), 2); - - assert_eq!(*sentences[0].span().char(), 0..14); - assert_eq!(*sentences[0].span().byte(), 0..14); - - assert_eq!(*sentences[1].span().char(), 14..34); - assert_eq!(*sentences[1].span().byte(), 14..37); -} - -#[test] -fn token_spans_correct() { - let text = "A short test. A test with emoji 😊."; - - let tokens: Vec<_> = TOKENIZER - .pipe(text) - .map(|x| x.into_iter()) - .flatten() - .collect(); - assert_eq!(*tokens[0].span().byte(), 0..1); - assert_eq!(*tokens[0].span().char(), 0..1); - - assert_eq!(*tokens[2].span().char(), 8..12); - assert_eq!(*tokens[2].span().byte(), 8..12); - - assert_eq!(*tokens[tokens.len() - 2].span().char(), 32..33); - assert_eq!(*tokens[tokens.len() - 2].span().byte(), 32..36); - - assert_eq!(*tokens[tokens.len() - 1].span().char(), 33..34); - assert_eq!(*tokens[tokens.len() - 1].span().byte(), 36..37); -} - -#[quickcheck] -fn no_gaps_between_sentences(text: String) { - let mut prev_pos = Position::default(); - let mut contains_sentence = false; - - for sentence in TOKENIZER.pipe(&text) { - let sentence = sentence.unwrap(); - - assert_eq!(sentence.span().start(), prev_pos); - prev_pos += sentence.span().len(); + let tokenizer = Pipeline::<(&Tokenizer, &Rules)>::new((&TOKENIZER, &RULES)); - contains_sentence = true; - } - - assert_eq!(contains_sentence, !text.trim().is_empty()); + // let sentences: Vec<_> = TOKENIZER.pipe("").collect(); + // assert!(sentences.is_empty()); } -#[test] -fn rules_can_be_disabled_enabled() { - let mut rules = Rules::new(RULES_PATH).unwrap(); - - // enabled by default - assert!(!rules - .suggest("I can due his homework", &*TOKENIZER) - .unwrap() - .is_empty()); - - rules - .select_mut( - &Category::new("confused_words") - .join("confusion_due_do") - .into(), - ) - .for_each(|x| x.disable()); - - // disabled now - assert!(rules - .suggest("I can due his homework", &*TOKENIZER) - .unwrap() - .is_empty()); - - // disabled by default - assert!(rules - .suggest("I can not go", &*TOKENIZER) - .unwrap() - .is_empty()); - - rules - .select_mut(&"typos/can_not".try_into().unwrap()) - .for_each(|x| x.enable()); - - // enabled now - assert!(!rules - .suggest("I can not go", &*TOKENIZER) - 
.unwrap() - .is_empty()); -} +// #[test] +// fn handles_whitespace_correctly() { +// // preceding whitespace has to be included, trailing whitespace behavior is unspecified +// let text = " hello.\ttest.\t\t"; + +// let mut sentences = TOKENIZER.pipe(text); +// assert_eq!( +// &text[sentences.next().unwrap().unwrap().span().byte().clone()], +// " hello.\t" +// ); +// assert_eq!( +// &text[sentences.next().unwrap().unwrap().span().byte().clone()], +// "test.\t" +// ); +// assert!(sentences.next().is_none()); +// } + +// #[quickcheck] +// fn can_tokenize_anything(text: String) -> bool { +// let _: Vec<_> = TOKENIZER.pipe(&text).collect(); +// true +// } + +// #[test] +// fn suggest_indices_are_relative_to_input_text() { +// let suggestions = RULES +// .suggest( +// "I can due his homework for 10€. I can due his homework.", +// &*TOKENIZER, +// ) +// .unwrap(); + +// assert_eq!(*suggestions[0].span().char(), 6..9); +// assert_eq!(*suggestions[0].span().byte(), 6..9); + +// assert_eq!(*suggestions[1].span().char(), 38..41); +// assert_eq!( +// *suggestions[1].span().byte(), +// 38 + '€'.len_utf8() - 1..41 + '€'.len_utf8() - 1 +// ); +// } + +// #[test] +// fn sentence_spans_correct() { +// let text = "A short test. A test with emoji 😊."; + +// let sentences: Vec<_> = TOKENIZER.pipe(text).collect::>().unwrap(); +// assert_eq!(sentences.len(), 2); + +// assert_eq!(*sentences[0].span().char(), 0..14); +// assert_eq!(*sentences[0].span().byte(), 0..14); + +// assert_eq!(*sentences[1].span().char(), 14..34); +// assert_eq!(*sentences[1].span().byte(), 14..37); +// } + +// #[test] +// fn token_spans_correct() { +// let text = "A short test. A test with emoji 😊."; + +// let tokens: Vec<_> = TOKENIZER +// .pipe(text) +// .map(|x| x.into_iter()) +// .flatten() +// .collect(); +// assert_eq!(*tokens[0].span().byte(), 0..1); +// assert_eq!(*tokens[0].span().char(), 0..1); + +// assert_eq!(*tokens[2].span().char(), 8..12); +// assert_eq!(*tokens[2].span().byte(), 8..12); + +// assert_eq!(*tokens[tokens.len() - 2].span().char(), 32..33); +// assert_eq!(*tokens[tokens.len() - 2].span().byte(), 32..36); + +// assert_eq!(*tokens[tokens.len() - 1].span().char(), 33..34); +// assert_eq!(*tokens[tokens.len() - 1].span().byte(), 36..37); +// } + +// #[quickcheck] +// fn no_gaps_between_sentences(text: String) { +// let mut prev_pos = Position::default(); +// let mut contains_sentence = false; + +// for sentence in TOKENIZER.pipe(&text) { +// let sentence = sentence.unwrap(); + +// assert_eq!(sentence.span().start(), prev_pos); +// prev_pos += sentence.span().len(); + +// contains_sentence = true; +// } + +// assert_eq!(contains_sentence, !text.trim().is_empty()); +// } + +// #[test] +// fn rules_can_be_disabled_enabled() { +// let mut rules = Rules::new(RULES_PATH).unwrap(); + +// // enabled by default +// assert!(!rules +// .suggest("I can due his homework", &*TOKENIZER) +// .unwrap() +// .is_empty()); + +// rules +// .select_mut( +// &Category::new("confused_words") +// .join("confusion_due_do") +// .into(), +// ) +// .for_each(|x| x.disable()); + +// // disabled now +// assert!(rules +// .suggest("I can due his homework", &*TOKENIZER) +// .unwrap() +// .is_empty()); + +// // disabled by default +// assert!(rules +// .suggest("I can not go", &*TOKENIZER) +// .unwrap() +// .is_empty()); + +// rules +// .select_mut(&"typos/can_not".try_into().unwrap()) +// .for_each(|x| x.enable()); + +// // enabled now +// assert!(!rules +// .suggest("I can not go", &*TOKENIZER) +// .unwrap() +// .is_empty()); +// } From 
9ff92e6e086fec279d39f71835296d851e2cb1d9 Mon Sep 17 00:00:00 2001 From: Benjamin Minixhofer Date: Tue, 4 May 2021 13:28:33 +0200 Subject: [PATCH 05/15] add Pipeline::new --- nlprule/src/compile/parse_structure.rs | 10 ++-- nlprule/src/properties.rs | 65 +++++++++++++++++++++----- nlprule/src/rule/disambiguation.rs | 4 +- nlprule/src/rule/engine/composition.rs | 9 +--- nlprule/src/rule/mod.rs | 6 +-- nlprule/src/tokenizer.rs | 48 +++++++++---------- nlprule/src/types.rs | 36 +++++++------- nlprule/tests/tests.rs | 5 +- 8 files changed, 108 insertions(+), 75 deletions(-) diff --git a/nlprule/src/compile/parse_structure.rs b/nlprule/src/compile/parse_structure.rs index aea79cf..497001d 100644 --- a/nlprule/src/compile/parse_structure.rs +++ b/nlprule/src/compile/parse_structure.rs @@ -910,7 +910,7 @@ fn parse_tag_form( form: &str, is_sentence_end: bool, info: &mut BuildInfo, -) -> Result>, Error> { +) -> Result, Error> { lazy_static! { static ref REGEX: Regex = Regex::new(r"(.+?)\[(.+?)\]".into()); } @@ -922,7 +922,7 @@ fn parse_tag_form( let text = captures.get(1).expect("1st regex group exists").as_str(); let tags = captures.get(2).expect("2nd regex group exists").as_str(); - let mut tags: DefaultHashSet<_> = tags + let mut tags: Vec<_> = tags .split(',') .filter_map(|x| { if x == "" { @@ -942,7 +942,7 @@ fn parse_tag_form( }) .collect(); - tags.insert( + tags.push( WordData::new( info.tagger.id_word(text.to_owned().into()), PosId::special(SpecialPos::None), @@ -951,10 +951,10 @@ fn parse_tag_form( ); if is_sentence_end { - tags.insert(WordData::new(WordId::empty(), PosId::special(SpecialPos::SentEnd)).freeze()); + tags.push(WordData::new(WordId::empty(), PosId::special(SpecialPos::SentEnd)).freeze()); } - Ok(tags) + Ok(Tags::new(WordId::empty(), tags)) } impl WordData<'static> { diff --git a/nlprule/src/properties.rs b/nlprule/src/properties.rs index 6089aba..fb8b323 100644 --- a/nlprule/src/properties.rs +++ b/nlprule/src/properties.rs @@ -90,6 +90,14 @@ where } pub trait Tokenize { + fn properties(&self) -> PropertiesMut { + PropertiesMut::default() + } + + fn property_guard(&self, sentence: &mut Sentence) -> Result { + self.properties().build(sentence) + } + fn tokenize<'t>(&'t self, text: &'t str) -> Box> + 't>; } @@ -97,6 +105,14 @@ impl<'a, T> Tokenize for &'a T where T: Tokenize, { + fn properties(&self) -> PropertiesMut { + (*self).properties() + } + + fn property_guard(&self, sentence: &mut Sentence) -> Result { + (*self).property_guard(sentence) + } + fn tokenize<'t>(&'t self, text: &'t str) -> Box> + 't> { (*self).tokenize(text) } @@ -105,8 +121,10 @@ where #[derive(Error, Debug)] #[allow(missing_docs)] pub enum Error { - #[error("unset token property: {0:?}")] + #[error("unset token property: {0:?}.")] Unset(Property), + #[error("invalid pipeline: properties {0:?} are read without being written.")] + InvalidPipeline(Vec), } #[derive(Debug, Clone, Copy)] @@ -151,6 +169,16 @@ impl Bitset { pub fn is_empty(&self) -> bool { self.0 == 0 } + + pub fn iter<'a>(&'a self) -> impl Iterator + 'a { + Property::properties().iter().filter_map(move |property| { + if self.contains(property) { + Some(*property) + } else { + None + } + }) + } } #[derive(Debug, Clone, Copy, Serialize, Deserialize, Default)] @@ -252,12 +280,11 @@ impl PropertiesMut { } pub fn chain(mut self, next: PropertiesMut) -> Result { - if !next - .read_mask - .intersection(self.write_mask.inverse()) - .is_empty() - { - unimplemented!() + let next_reads = next.read_mask.intersection(next.write_mask.inverse()); + let 
invalid_reads = next_reads.intersection(self.write_mask.inverse()); + + if !invalid_reads.is_empty() { + return Err(Error::InvalidPipeline(invalid_reads.iter().collect())); } self.write_mask = self.write_mask.union(next.write_mask); @@ -396,6 +423,11 @@ impl PropertyGuardMut { pub struct Pipeline(T); +#[allow(clippy::new_ret_no_self)] +pub trait CreatePipe { + fn new(components: T) -> Result, Error>; +} + macro_rules! impl_pipeline { ( $first:ident, $last:ident, $($name:ident),*) => { impl<$first: Tokenize, $($name: Transform,)* $last: Transform> Tokenize for Pipeline<($first, $($name,)* $last)> { @@ -423,15 +455,24 @@ macro_rules! impl_pipeline { } } - impl<$first: Tokenize, $($name: Transform,)* $last: Suggest> Pipeline<($first, $($name,)* $last)> { - pub fn new(components: ($first, $($name,)* $last)) -> Self { - Pipeline(components) + impl<$first: Tokenize, $($name: Transform,)* $last: Suggest> CreatePipe<($first, $($name,)* $last)> for Pipeline<($first, $($name,)* $last)> { + #[allow(non_snake_case, unused_mut)] + fn new(components: ($first, $($name,)* $last)) -> Result { + let (ref $first, $(ref $name,)* ref $last) = components; + + let mut properties = PropertiesMut::default().chain($first.properties())?; + $(properties = properties.chain($name.properties())?;)* + properties.chain($last.properties().write(&[]))?; + + Ok(Pipeline(components)) } + } - #[allow(non_snake_case)] + impl<$first: Tokenize, $($name: Transform,)* $last: Suggest> Pipeline<($first, $($name,)* $last)> { + #[allow(non_snake_case, unused_mut)] pub fn suggest<'t>(&'t self, text: &'t str) -> impl Iterator> + 't { let (ref $first, $(ref $name,)* ref $last) = self.0; - #[allow(unused_mut)] + let sentences = $first.tokenize(text).map(move |mut sentence| { $(sentence = $name.transform(sentence).unwrap();)* $last.suggest(&sentence).unwrap() diff --git a/nlprule/src/rule/disambiguation.rs b/nlprule/src/rule/disambiguation.rs index fcb4767..c684a53 100644 --- a/nlprule/src/rule/disambiguation.rs +++ b/nlprule/src/rule/disambiguation.rs @@ -224,8 +224,8 @@ impl Disambiguation { pub struct DisambiguationChange { pub text: String, pub char_span: Range, - pub before: DefaultHashSet>, - pub after: DefaultHashSet>, + pub before: Tags<'static>, + pub after: Tags<'static>, } #[derive(Debug, Serialize, Deserialize, Clone)] diff --git a/nlprule/src/rule/engine/composition.rs b/nlprule/src/rule/engine/composition.rs index a444fbd..6543955 100644 --- a/nlprule/src/rule/engine/composition.rs +++ b/nlprule/src/rule/engine/composition.rs @@ -464,22 +464,17 @@ impl GraphId { #[derive(Debug, Clone)] pub struct MatchSentence<'t> { sentence: &'t Sentence<'t>, - sent_start: Token<'t>, guard: PropertyGuard, } impl<'t> MatchSentence<'t> { pub fn new(sentence: &'t Sentence<'t>, guard: PropertyGuard) -> Self { - MatchSentence { - sentence, - sent_start: Token::sent_start(), - guard, - } + MatchSentence { sentence, guard } } pub fn index(&self, index: usize) -> &Token { match index { - 0 => &self.sent_start, + 0 => &crate::types::SENT_START, i => &self.sentence.tokens()[i - 1], } } diff --git a/nlprule/src/rule/mod.rs b/nlprule/src/rule/mod.rs index e14903e..b83ea14 100644 --- a/nlprule/src/rule/mod.rs +++ b/nlprule/src/rule/mod.rs @@ -282,11 +282,7 @@ impl DisambiguationRule { after.tags().unwrap().iter().collect::>(); let unordered_tags_change = change.after.iter().collect::>(); - let pass = unordered_tags == unordered_tags_change; - if !pass { - println!("{:#?} ---- {:#?}", unordered_tags, unordered_tags_change); - } - pass + unordered_tags == 
unordered_tags_change } }; diff --git a/nlprule/src/tokenizer.rs b/nlprule/src/tokenizer.rs index 9e37864..c165cd4 100644 --- a/nlprule/src/tokenizer.rs +++ b/nlprule/src/tokenizer.rs @@ -13,6 +13,7 @@ use crate::{ Error, }; use fs_err::File; +use lazy_static::lazy_static; use once_cell::sync::OnceCell; use serde::{Deserialize, Serialize}; use std::{ @@ -164,25 +165,32 @@ pub struct Tokenizer { pub(crate) properties: OnceCell, } -impl Transform for Tokenizer { - fn properties(&self) -> PropertiesMut { - *self.properties.get_or_init(|| { - self.rules - .iter() - .map(|rule| rule.compute_properties()) - .collect() - }) - } +// impl Transform for Tokenizer { +// fn properties(&self) -> PropertiesMut { +// *self.properties.get_or_init(|| { +// self.rules +// .iter() +// .map(|rule| rule.compute_properties()) +// .collect() +// }) +// } + +// fn transform<'t>( +// &'t self, +// _sentence: Sentence<'t>, +// ) -> Result, crate::properties::Error> { +// unimplemented!() +// } +// } - fn transform<'t>( - &'t self, - _sentence: Sentence<'t>, - ) -> Result, crate::properties::Error> { - unimplemented!() +impl Tokenize for Tokenizer { + fn properties(&self) -> PropertiesMut { + lazy_static! { + static ref PROPERTIES: PropertiesMut = Properties::default().write(&[Property::Tags]); + } + *PROPERTIES } -} -impl Tokenize for Tokenizer { fn tokenize<'t>(&'t self, text: &'t str) -> Box> + 't> { Box::new(SentenceIter { text, @@ -385,14 +393,6 @@ impl Tokenizer { sentence = self.tagger.transform(sentence).unwrap(); - if let Some(chunker) = &self.chunker { - sentence = chunker.transform(sentence).unwrap(); - } - - if let Some(multiword_tagger) = &self.multiword_tagger { - sentence = multiword_tagger.transform(sentence).unwrap(); - } - Some(sentence) } } diff --git a/nlprule/src/types.rs b/nlprule/src/types.rs index 360d1c2..0645d43 100644 --- a/nlprule/src/types.rs +++ b/nlprule/src/types.rs @@ -238,6 +238,24 @@ impl<'t> Tags<'t> { } } +lazy_static! { + pub(crate) static ref SENT_START: Token<'static> = Token { + text: "", + span: Span::default(), + is_sentence_start: false, // `is_sentence_start` marks the first *real* token in the sentence. + is_sentence_end: false, + has_space_before: false, + tags: Some(Tags::new( + WordId::empty(), + vec![WordData::new( + WordId::empty(), + PosId::special(SpecialPos::SentStart), + )], + )), + chunks: Some(Vec::new()), + }; +} + /// A token where varying levels of information are set. #[derive(Debug, Clone, PartialEq)] pub struct Token<'t> { @@ -269,24 +287,6 @@ impl<'t> Token<'t> { } } - pub(crate) fn sent_start<'a>() -> Token<'a> { - Token { - text: "", - span: Span::default(), - is_sentence_start: false, // `is_sentence_start` marks the first *real* token in the sentence. - is_sentence_end: false, - has_space_before: false, - tags: Some(Tags::new( - WordId::empty(), - vec![WordData::new( - WordId::empty(), - PosId::special(SpecialPos::SentStart), - )], - )), - chunks: Some(Vec::new()), - } - } - /// Gets the token as string. pub fn as_str(&self) -> &'t str { self.text diff --git a/nlprule/tests/tests.rs b/nlprule/tests/tests.rs index 7de0559..2ee1caa 100644 --- a/nlprule/tests/tests.rs +++ b/nlprule/tests/tests.rs @@ -14,9 +14,10 @@ lazy_static! 
{ #[test] fn can_tokenize_empty_text() { - let tokenizer = Pipeline::<(&Tokenizer, &Rules)>::new((&TOKENIZER, &RULES)); + let tokenizer = + Pipeline::new((&*TOKENIZER, TOKENIZER.chunker().as_ref().unwrap(), &*RULES)).unwrap(); - // let sentences: Vec<_> = TOKENIZER.pipe("").collect(); + let sentences: Vec<_> = tokenizer.suggest("His homework is due tomorrow.").collect(); // assert!(sentences.is_empty()); } From b60750dbf811d18c7fe7cc02b053c4d4de7e3582 Mon Sep 17 00:00:00 2001 From: Benjamin Minixhofer Date: Sat, 15 May 2021 19:22:17 +0200 Subject: [PATCH 06/15] restructure into components/ --- nlprule/build.rs | 62 -- nlprule/configs/en/rules.json | 7 - nlprule/configs/en/tagger.json | 11 - nlprule/configs/en/tokenizer.json | 9 - nlprule/src/bin/compile.rs | 99 ++- nlprule/src/bin/run.rs | 47 +- nlprule/src/bin/test.rs | 76 +- nlprule/src/bin/test_disambiguation.rs | 86 +- nlprule/src/compile/impls.rs | 835 ------------------ nlprule/src/compile/mod.rs | 248 ++---- nlprule/src/compile/utils.rs | 52 -- nlprule/src/components/chunker/compile.rs | 84 ++ .../chunk.rs => components/chunker/mod.rs} | 10 + nlprule/src/components/mod.rs | 30 + .../components/multiword_tagger/compile.rs | 46 + .../multiword_tagger/mod.rs} | 14 +- nlprule/src/components/rules/compile/mod.rs | 234 +++++ .../rules/compile/structure/impls.rs | 375 ++++++++ .../rules/compile/structure/mod.rs} | 157 +--- .../rules/compile/structure/parse.rs} | 523 +++++++---- .../src/{rules.rs => components/rules/mod.rs} | 141 ++- nlprule/src/components/tagger/compile.rs | 165 ++++ .../tag.rs => components/tagger/mod.rs} | 97 +- nlprule/src/components/tokenizer/compile.rs | 45 + .../tokenizer/mod.rs} | 247 +----- nlprule/src/lib.rs | 7 +- nlprule/src/rule/engine/composition.rs | 2 +- nlprule/src/rule/mod.rs | 293 +++--- nlprule/src/types.rs | 5 +- 29 files changed, 1956 insertions(+), 2051 deletions(-) delete mode 100644 nlprule/build.rs delete mode 100644 nlprule/configs/en/rules.json delete mode 100644 nlprule/configs/en/tagger.json delete mode 100644 nlprule/configs/en/tokenizer.json delete mode 100644 nlprule/src/compile/impls.rs create mode 100644 nlprule/src/components/chunker/compile.rs rename nlprule/src/{tokenizer/chunk.rs => components/chunker/mod.rs} (99%) create mode 100644 nlprule/src/components/mod.rs create mode 100644 nlprule/src/components/multiword_tagger/compile.rs rename nlprule/src/{tokenizer/multiword.rs => components/multiword_tagger/mod.rs} (92%) create mode 100644 nlprule/src/components/rules/compile/mod.rs create mode 100644 nlprule/src/components/rules/compile/structure/impls.rs rename nlprule/src/{compile/structure.rs => components/rules/compile/structure/mod.rs} (79%) rename nlprule/src/{compile/parse_structure.rs => components/rules/compile/structure/parse.rs} (73%) rename nlprule/src/{rules.rs => components/rules/mod.rs} (64%) create mode 100644 nlprule/src/components/tagger/compile.rs rename nlprule/src/{tokenizer/tag.rs => components/tagger/mod.rs} (97%) create mode 100644 nlprule/src/components/tokenizer/compile.rs rename nlprule/src/{tokenizer.rs => components/tokenizer/mod.rs} (56%) diff --git a/nlprule/build.rs b/nlprule/build.rs deleted file mode 100644 index 8eb2ad1..0000000 --- a/nlprule/build.rs +++ /dev/null @@ -1,62 +0,0 @@ -//! Compiles the language build configurations in configs/ into two files (one for the tokenizer, one for the rules) -//! so they can be inlined. These configs are included at compile time because they define the neccessary parameters to -//! 
run the rules for a language correctly. They are NOT user configuration. - -use fs::File; -use fs_err as fs; -use std::{collections::HashMap, io::BufWriter, path::Path}; - -fn main() { - let path = env!("CARGO_MANIFEST_DIR"); - let path = Path::new(path).join("configs"); - - let out_dir = - std::env::var("OUT_DIR").expect("OUT_DIR env var must be set when build.rs is run"); - let out_dir = Path::new(&out_dir); - - println!("cargo:rerun-if-changed={}", path.display()); - - for (filename, joined_filename) in &[ - ("tokenizer.json", "tokenizer_configs.json"), - ("rules.json", "rules_configs.json"), - ("tagger.json", "tagger_configs.json"), - ] { - let mut config_map: HashMap = HashMap::new(); - - for entry in fs::read_dir(&path).expect("must be able to read config dir") { - let entry = entry.expect("must be able to read config dir entry"); - - println!("cargo:rerun-if-changed={}", entry.path().display()); - - if entry.path().is_dir() { - let lang_code = entry - .path() - .file_name() - .expect("directory must have name") - .to_str() - .expect("directory name must be unicode") - .to_string(); - - let path = entry.path().join(filename); - - println!("cargo:rerun-if-changed={}", path.display()); - - let json_str = fs::read_to_string(path) - .unwrap_or_else(|_| panic!("{} for 'lang_code' must exist", filename)); - - config_map.insert( - lang_code, - serde_json::from_str(&json_str) - .unwrap_or_else(|_| panic!("{} for language must be valid json", filename)), - ); - } - } - - let config_writer = BufWriter::new( - File::create(out_dir.join(joined_filename)) - .expect("must be able to create file in out dir"), - ); - serde_json::to_writer_pretty(config_writer, &config_map) - .expect("must be able to write JSON to file"); - } -} diff --git a/nlprule/configs/en/rules.json b/nlprule/configs/en/rules.json deleted file mode 100644 index a1a27a0..0000000 --- a/nlprule/configs/en/rules.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "allow_errors": false, - "ignore_ids": [ - "GRAMMAR/PRP_MD_NN/2", - "TYPOS/VERB_APOSTROPHE_S/3" - ] -} \ No newline at end of file diff --git a/nlprule/configs/en/tagger.json b/nlprule/configs/en/tagger.json deleted file mode 100644 index 8d2a8a6..0000000 --- a/nlprule/configs/en/tagger.json +++ /dev/null @@ -1,11 +0,0 @@ -{ - "use_compound_split_heuristic": false, - "always_add_lower_tags": true, - "extra_tags": [ - "PCT", - "ORD", - "SYM", - "RB_SENT" - ], - "retain_last": true -} \ No newline at end of file diff --git a/nlprule/configs/en/tokenizer.json b/nlprule/configs/en/tokenizer.json deleted file mode 100644 index 11eae18..0000000 --- a/nlprule/configs/en/tokenizer.json +++ /dev/null @@ -1,9 +0,0 @@ -{ - "allow_errors": false, - "ignore_ids": [ - "DISAMBIGUATION/BEST_JJS/0" - ], - "extra_join_regexes": [ - "(https?:\\/\\/(?:www\\.|(?!www))[a-zA-Z0-9][a-zA-Z0-9-]+[a-zA-Z0-9]\\.[^\\s]{2,}|www\\.[a-zA-Z0-9][a-zA-Z0-9-]+[a-zA-Z0-9]\\.[^\\s]{2,}|https?:\\/\\/(?:www\\.|(?!www))[a-zA-Z0-9]+\\.[^\\s]{2,}|www\\.[a-zA-Z0-9]+\\.[^\\s]{2,})" - ] -} \ No newline at end of file diff --git a/nlprule/src/bin/compile.rs b/nlprule/src/bin/compile.rs index 6f63412..9b883c9 100644 --- a/nlprule/src/bin/compile.rs +++ b/nlprule/src/bin/compile.rs @@ -1,7 +1,19 @@ use clap::Clap; +use fs::{File, OpenOptions}; use fs_err as fs; -use nlprule::compile::{compile, Error}; -use std::io::BufWriter; + +use log::info; +use nlprule::compile::{BuildComponent, BuildInfo, Error}; +use nlprule::components::{ + chunker::Chunker, + multiword_tagger::MultiwordTagger, + rules::{Disambiguator, Rules}, + 
tagger::Tagger, + tokenizer::Tokenizer, + Component, +}; +use serde::{Deserialize, Serialize}; +use std::path::Path; use std::path::PathBuf; #[derive(clap::Clap)] @@ -13,17 +25,88 @@ pub struct BuildOptions { #[clap(long, parse(from_os_str))] pub build_dir: PathBuf, #[clap(long, parse(from_os_str))] - pub tokenizer_out: PathBuf, - #[clap(long, parse(from_os_str))] - pub rules_out: PathBuf, + pub out_dir: PathBuf, +} + +#[derive(Serialize, Deserialize)] +struct BuildFilePaths { + lang_code: PathBuf, + tag_dict: Vec, + tag_remove_dict: Vec, + chunker: PathBuf, + disambiguator_xml: PathBuf, + rules_xml: PathBuf, + multiword_tags: PathBuf, + common_words: PathBuf, + regex_cache: PathBuf, + srx: PathBuf, + tagger_options: PathBuf, + rules_options: PathBuf, + tokenizer_options: PathBuf, + disambiguator_options: PathBuf, +} + +impl BuildFilePaths { + fn new>(build_dir: P) -> Self { + let p = build_dir.as_ref(); + BuildFilePaths { + lang_code: p.join("lang_code.txt"), + tag_dict: vec![p.join("tags/output.dump"), p.join("tags/added.txt")], + tag_remove_dict: vec![p.join("tags/removed.txt")], + chunker: p.join("chunker.json"), + disambiguator_xml: p.join("disambiguation.xml"), + rules_xml: p.join("grammar.xml"), + multiword_tags: p.join("tags/multiwords.txt"), + common_words: p.join("common.txt"), + regex_cache: p.join("regex_cache.bin"), + srx: p.join("segment.srx"), + tagger_options: p.join("tagger_options.json"), + rules_options: p.join("rules_options.json"), + tokenizer_options: p.join("tokenizer_options.json"), + disambiguator_options: p.join("disambiguator_options.json"), + } + } } fn main() -> Result<(), Error> { env_logger::init(); let opts = BuildOptions::parse(); + let paths = BuildFilePaths::new(opts.build_dir); + + fs::create_dir_all(&opts.out_dir)?; + + let paths_value = serde_json::to_value(&paths)?; + + let tagger = Tagger::build(serde_json::from_value(paths_value.clone())?, None)?; + let mut build_info = BuildInfo::new(&tagger, &paths.regex_cache)?; + + macro_rules! 
build { + ($component:ty) => { + info!("Creating component \"{}\".", <$component>::name()); + let instance = <$component>::build( + serde_json::from_value(paths_value.clone())?, + Some(&mut build_info), + )?; + instance.to_writer( + &OpenOptions::new() + .write(true) + .create(true) + .open(opts.out_dir.join(format!("{}.bin", <$component>::name())))?, + )?; + }; + } + + build!(Tokenizer); + build!(Disambiguator); + build!(MultiwordTagger); + build!(Chunker); + build!(Rules); - let tokenizer_sink = BufWriter::new(fs::File::create(&opts.tokenizer_out)?); - let rules_sink = BufWriter::new(fs::File::create(&opts.rules_out)?); + // write the regex cache at the end, otherwise it isn't fully populated + bincode::serialize_into( + &File::create(&paths.regex_cache)?, + build_info.mut_regex_cache(), + )?; - compile(opts.build_dir, rules_sink, tokenizer_sink) + Ok(()) } diff --git a/nlprule/src/bin/run.rs b/nlprule/src/bin/run.rs index 1b4258c..d8c419d 100644 --- a/nlprule/src/bin/run.rs +++ b/nlprule/src/bin/run.rs @@ -1,28 +1,29 @@ -use clap::Clap; -use nlprule::{rules::Rules, tokenizer::Tokenizer}; +// use clap::Clap; +// use nlprule::{rules::Rules, tokenizer::Tokenizer}; -#[derive(Clap)] -#[clap( - version = "1.0", - author = "Benjamin Minixhofer " -)] -struct Opts { - text: String, - #[clap(long, short)] - tokenizer: String, - #[clap(long, short)] - rules: String, -} +// #[derive(Clap)] +// #[clap( +// version = "1.0", +// author = "Benjamin Minixhofer " +// )] +// struct Opts { +// text: String, +// #[clap(long, short)] +// tokenizer: String, +// #[clap(long, short)] +// rules: String, +// } -fn main() { - env_logger::init(); - let opts = Opts::parse(); +fn main() {} +// fn main() { +// env_logger::init(); +// let opts = Opts::parse(); - let tokenizer = Tokenizer::new(opts.tokenizer).unwrap(); - let rules = Rules::new(opts.rules).unwrap(); +// let tokenizer = Tokenizer::new(opts.tokenizer).unwrap(); +// let rules = Rules::new(opts.rules).unwrap(); - let tokens = tokenizer.pipe(&opts.text); +// let tokens = tokenizer.pipe(&opts.text); - println!("Tokens: {:#?}", tokens.collect::>()); - println!("Suggestions: {:#?}", rules.suggest(&opts.text, &tokenizer)); -} +// println!("Tokens: {:#?}", tokens.collect::>()); +// println!("Suggestions: {:#?}", rules.suggest(&opts.text, &tokenizer)); +// } diff --git a/nlprule/src/bin/test.rs b/nlprule/src/bin/test.rs index 743fc81..0988861 100644 --- a/nlprule/src/bin/test.rs +++ b/nlprule/src/bin/test.rs @@ -1,43 +1,45 @@ -use clap::Clap; -use nlprule::{rules::Rules, tokenizer::Tokenizer}; +// use clap::Clap; +// use nlprule::{rules::Rules, tokenizer::Tokenizer}; -#[derive(Clap)] -#[clap( - version = "1.0", - author = "Benjamin Minixhofer " -)] -struct Opts { - #[clap(long, short)] - tokenizer: String, - #[clap(long, short)] - rules: String, - #[clap(long, short)] - ids: Vec, -} +// #[derive(Clap)] +// #[clap( +// version = "1.0", +// author = "Benjamin Minixhofer " +// )] +// struct Opts { +// #[clap(long, short)] +// tokenizer: String, +// #[clap(long, short)] +// rules: String, +// #[clap(long, short)] +// ids: Vec, +// } -fn main() { - env_logger::init(); - let opts = Opts::parse(); +// fn main() { +// env_logger::init(); +// let opts = Opts::parse(); - let tokenizer = Tokenizer::new(opts.tokenizer).unwrap(); - let rules_container = Rules::new(opts.rules).unwrap(); - let rules = rules_container.rules(); +// let tokenizer = Tokenizer::new(opts.tokenizer).unwrap(); +// let rules_container = Rules::new(opts.rules).unwrap(); +// let rules = 
rules_container.rules(); - println!("Runnable rules: {}", rules.len()); +// println!("Runnable rules: {}", rules.len()); - let mut passes = 0; - for rule in rules { - if opts.ids.is_empty() || opts.ids.contains(&rule.id().to_string()) { - if let Ok(true) = rule.test(&tokenizer) { - passes += 1; - } - } - } +// let mut passes = 0; +// for rule in rules { +// if opts.ids.is_empty() || opts.ids.contains(&rule.id().to_string()) { +// if let Ok(true) = rule.test(&tokenizer) { +// passes += 1; +// } +// } +// } - println!("Rules passing tests: {}", passes); - if passes == rules.len() { - std::process::exit(0); - } else { - std::process::exit(1); - } -} +// println!("Rules passing tests: {}", passes); +// if passes == rules.len() { +// std::process::exit(0); +// } else { +// std::process::exit(1); +// } +// } + +fn main() {} diff --git a/nlprule/src/bin/test_disambiguation.rs b/nlprule/src/bin/test_disambiguation.rs index 4912508..1ab8ff6 100644 --- a/nlprule/src/bin/test_disambiguation.rs +++ b/nlprule/src/bin/test_disambiguation.rs @@ -1,42 +1,44 @@ -use clap::Clap; -use nlprule::tokenizer::Tokenizer; - -#[derive(Clap)] -#[clap( - version = "1.0", - author = "Benjamin Minixhofer " -)] -struct Opts { - #[clap(long)] - stop_at_error: bool, - #[clap(long, short)] - tokenizer: String, -} - -fn main() { - env_logger::init(); - let opts = Opts::parse(); - - let tokenizer = Tokenizer::new(opts.tokenizer).unwrap(); - let rules = tokenizer.rules(); - - println!("Last ID: {}", rules[rules.len() - 1].id()); - println!("Runnable rules: {}", rules.len()); - - let mut passes = 0; - - for rule in rules { - if let Ok(true) = rule.test(&tokenizer) { - passes += 1; - } else if opts.stop_at_error { - break; - } - } - - println!("Rules passing tests: {}", passes); - if passes == rules.len() { - std::process::exit(0); - } else { - std::process::exit(1); - } -} +// use clap::Clap; +// use nlprule::tokenizer::Tokenizer; + +// #[derive(Clap)] +// #[clap( +// version = "1.0", +// author = "Benjamin Minixhofer " +// )] +// struct Opts { +// #[clap(long)] +// stop_at_error: bool, +// #[clap(long, short)] +// tokenizer: String, +// } + +// fn main() { +// env_logger::init(); +// let opts = Opts::parse(); + +// let tokenizer = Tokenizer::new(opts.tokenizer).unwrap(); +// let rules = tokenizer.rules(); + +// println!("Last ID: {}", rules[rules.len() - 1].id()); +// println!("Runnable rules: {}", rules.len()); + +// let mut passes = 0; + +// for rule in rules { +// if let Ok(true) = rule.test(&tokenizer) { +// passes += 1; +// } else if opts.stop_at_error { +// break; +// } +// } + +// println!("Rules passing tests: {}", passes); +// if passes == rules.len() { +// std::process::exit(0); +// } else { +// std::process::exit(1); +// } +// } + +fn main() {} diff --git a/nlprule/src/compile/impls.rs b/nlprule/src/compile/impls.rs deleted file mode 100644 index 3e2ef9a..0000000 --- a/nlprule/src/compile/impls.rs +++ /dev/null @@ -1,835 +0,0 @@ -use bimap::BiMap; -use fs_err::File; -use log::warn; -use serde::{Deserialize, Serialize}; -use std::{ - collections::{HashMap, HashSet}, - hash::{Hash, Hasher}, - io::{self, BufRead, BufReader}, - path::Path, -}; - -use crate::{ - rule::{ - disambiguation::PosFilter, - engine::{ - composition::{GraphId, Matcher, PosMatcher, TextMatcher}, - Engine, - }, - id::Category, - DisambiguationRule, Rule, - }, - rules::{Rules, RulesLangOptions}, - tokenizer::{ - chunk, - multiword::{MultiwordTagger, MultiwordTaggerFields}, - tag::{Tagger, TaggerLangOptions, WordIdMap}, - Tokenizer, 
TokenizerLangOptions, - }, - types::*, - utils::{parallelism::MaybeParallelIterator, regex::Regex}, -}; - -use super::{parse_structure::BuildInfo, Error}; - -impl Tagger { - fn get_lines, S2: AsRef>( - paths: &[S1], - remove_paths: &[S2], - ) -> std::io::Result> { - let mut output = Vec::new(); - let mut disallowed: Vec = Vec::new(); - - for path in remove_paths { - let file = File::open(path.as_ref())?; - let reader = std::io::BufReader::new(file); - - for line in reader.lines() { - let line = line?; - if line.starts_with('#') { - continue; - } - - disallowed.push(line.to_string()); - } - } - - for path in paths { - let file = File::open(path.as_ref())?; - let reader = std::io::BufReader::new(file); - - for line in reader.lines() { - let line = line?; - if line.starts_with('#') { - continue; - } - - if disallowed.contains(&line) { - continue; - } - - let parts: Vec<_> = line.split('\t').collect(); - - let word = parts[0].to_string(); - let inflection = parts[1].to_string(); - let tag = parts[2].to_string(); - - output.push((word, inflection, tag)) - } - } - - Ok(output) - } - - /// Creates a tagger from raw files. - /// - /// # Arguments - /// * `paths`: Paths to files where each line contains the word, lemma and tag, respectively, - /// separated by tabs, to be added to the tagger. - /// * `remove_paths`: Paths to files where each line contains the word, lemma and tag, respectively, - /// separated by tabs, to be removed from the tagger if present in the files from `paths`. - pub(in crate::compile) fn from_dumps, S2: AsRef>( - paths: &[S1], - remove_paths: &[S2], - common_words: &HashSet, - lang_options: TaggerLangOptions, - ) -> std::io::Result { - let mut tag_store = HashSet::new(); - let mut word_store = HashSet::new(); - - // add language specific special tags - tag_store.extend(lang_options.extra_tags.iter().map(|x| x.as_str())); - - let lines = Tagger::get_lines(paths, remove_paths)?; - - let punct = "!\"#$%&\\'()*+,-./:;<=>?@[\\]^_`{|}~"; - for i in 0..punct.len() { - word_store.insert(&punct[i..(i + 1)]); - } - - word_store.extend(common_words.iter().map(|x| x.as_str())); - - for (word, inflection, tag) in lines.iter() { - word_store.insert(word); - word_store.insert(inflection); - tag_store.insert(tag); - } - - // the empty string must not be part of any wordlist - assert!(!word_store.contains("")); - - // word store ids should be consistent across runs - let mut word_store: Vec<_> = word_store.into_iter().collect(); - word_store.sort_unstable(); - - // add special empty string to wordlist, must be the first element to have id 0 - word_store.insert(0, ""); - - // tag store ids should be consistent across runs - let mut tag_store: Vec<_> = tag_store.into_iter().collect(); - tag_store.sort_unstable(); - - // add special part of speech tags, they must have ids starting from zero - for (i, special_pos) in SpecialPos::iter().enumerate() { - tag_store.insert(i, special_pos); - } - - let word_store: BiMap<_, _> = word_store - .iter() - .enumerate() - .map(|(i, x)| (x.to_string(), WordIdInt::from_value_unchecked(i as u32))) - .collect(); - let tag_store: BiMap<_, _> = tag_store - .iter() - .enumerate() - .map(|(i, x)| (x.to_string(), PosIdInt::from_value_unchecked(i as u16))) - .collect(); - - let mut tags: Vec>> = vec![None; word_store.len()]; - - for (word, inflection, tag) in lines.iter() { - let word_id = word_store.get_by_left(word).unwrap(); - let lemma_id = word_store.get_by_left(inflection).unwrap(); - let pos_id = tag_store.get_by_left(tag).unwrap(); - - match &mut 
tags[word_id.value() as usize] { - Some(vec) => { - vec.push((*lemma_id, *pos_id)); - } - None => { - tags[word_id.value() as usize] = Some(vec![(*lemma_id, *pos_id)]); - } - } - } - - Ok(Tagger { - tags: WordIdMap(tags), - word_store, - tag_store, - lang_options, - }) - } -} - -impl MultiwordTagger { - pub(in crate::compile) fn from_dump>( - dump: P, - info: &BuildInfo, - ) -> Result { - let reader = BufReader::new(File::open(dump.as_ref())?); - let mut multiwords = Vec::new(); - - for line in reader.lines() { - let line = line?; - - // strip comments - let line = &line[..line.find('#').unwrap_or_else(|| line.len())].trim(); - if line.is_empty() { - continue; - } - let tab_split: Vec<_> = line.split('\t').collect(); - - let word: String = tab_split[0] - .split_whitespace() - .collect::>() - .join(" "); - let pos = info.tagger().id_tag(tab_split[1]).into_static(); - multiwords.push((word, pos)); - } - - Ok((MultiwordTaggerFields { multiwords }).into()) - } -} - -impl TextMatcher { - pub(in crate::compile) fn new(matcher: Matcher, info: &mut BuildInfo) -> Self { - // can not cache a matcher that depends on the graph - let set = if matcher.graph_id().is_some() { - None - } else if let either::Right(regex) = &matcher.matcher { - let mut hasher = DefaultHasher::default(); - regex.hash(&mut hasher); - matcher.negate.hash(&mut hasher); - matcher.empty_always_false.hash(&mut hasher); - let matcher_hash = hasher.finish(); - - if let Some(set) = info.mut_regex_cache().get(&matcher_hash) { - set.clone() - } else { - let data: Vec<_> = info.tagger().word_store().iter().collect(); - - let set: DefaultHashSet<_> = data - .into_maybe_par_iter() - .filter_map(|(word, id)| { - if matcher.is_match(word.as_str(), None, None) { - Some(*id) - } else { - None - } - }) - .collect(); - - // there are some regexes which match lots of strings - // this cutoff is pretty arbitrary but without any threshold the size of some sets blows up - // the vast majority of regexes matches less than 100 strings from manual inspection - let set = if set.len() > 100 { None } else { Some(set) }; - info.mut_regex_cache().insert(matcher_hash, set.clone()); - set - } - } else { - None - }; - - TextMatcher { matcher, set } - } -} - -impl PosMatcher { - pub(in crate::compile) fn new(matcher: Matcher, info: &mut BuildInfo) -> Self { - let mut mask = vec![false; info.tagger().tag_store().len()]; - - for (word, id) in info.tagger().tag_store().iter() { - mask[id.value() as usize] = matcher.is_match(word.as_str(), None, None); - } - - PosMatcher { mask } - } -} - -impl Rules { - pub(in crate::compile) fn from_xml>( - path: P, - build_info: &mut BuildInfo, - options: RulesLangOptions, - ) -> Self { - let rules = super::parse_structure::read_rules(path); - let mut errors: HashMap = HashMap::new(); - - let rules: Vec<_> = rules - .into_iter() - .filter_map(|x| match x { - Ok((rule_structure, group, category)) => { - let category = category.expect("grammar rules must have category"); - let id = Category::new(category.id.as_str()); - - let id = if let Some(group) = &group { - id.join(group.id.as_str()).join(group.n) - } else { - id.join( - rule_structure - .id - .as_ref() - .expect("ID must be set if not in group."), - ) - .join(0) - }; - - let rule_on = match rule_structure.default.as_deref() { - Some("off") | Some("temp_off") => false, - Some("on") | None => true, - Some(x) => panic!("unknown `default` value: {}", x), - }; - - let group_on = match group.as_ref().and_then(|x| x.default.as_deref()) { - Some("off") | Some("temp_off") => false, - 
Some("on") | None => true, - Some(x) => panic!("unknown `default` value: {}", x), - }; - - let category_on = match category.default.as_deref() { - Some("off") | Some("temp_off") => false, - Some("on") | None => true, - Some(x) => panic!("unknown `default` value: {}", x), - }; - - let name = rule_structure.name.as_ref().map_or_else( - || { - let group = group.as_ref().expect("must have group if name not set"); - group.name.clone() - }, - |x| x.clone(), - ); - - match Rule::from_rule_structure(rule_structure, build_info) { - Ok(mut rule) => { - if (options.ids.is_empty() - || options.ids.iter().any(|x| x.is_match(&id))) - && !options.ignore_ids.iter().any(|x| x.is_match(&id)) - { - rule.id = id; - rule.name = name; - rule.category_name = category.name; - rule.category_type = category.kind; - rule.enabled = category_on && group_on && rule_on; - Some(rule) - } else { - None - } - } - Err(x) => { - *errors.entry(format!("[Rule] {}", x)).or_insert(0) += 1; - None - } - } - } - Err(x) => { - *errors.entry(format!("[Structure] {}", x)).or_insert(0) += 1; - None - } - }) - .collect(); - - if !errors.is_empty() { - let mut errors: Vec<(String, usize)> = errors.into_iter().collect(); - errors.sort_by_key(|x| -(x.1 as i32)); - - warn!( - "Errors constructing Rules: {:#?}", - &errors - .iter() - .map(|(message, number)| format!("{} (n={})", message, number)) - .collect::>() - ); - } - - Rules { - rules, - properties: Default::default(), - } - } -} - -impl Tokenizer { - pub(in crate::compile) fn from_xml>( - path: P, - build_info: &mut BuildInfo, - chunker: Option, - multiword_tagger: Option, - sentencizer: srx::Rules, - lang_options: TokenizerLangOptions, - ) -> Result { - let rules = super::parse_structure::read_disambiguation_rules(path); - let mut error = None; - - let mut whitelist = DefaultHashSet::new(); - - for (word, _) in build_info.tagger().word_store() { - if word.contains(|c| lang_options.extra_split_chars.contains(&c)) { - whitelist.insert(word.to_owned()); - } - } - - let rules: Vec<_> = rules - .into_iter() - .filter_map(|x| match x { - Ok((rule_structure, group, _)) => { - let id = Category::new("DISAMBIGUATION"); - - let id = if let Some(group) = &group { - id.join(group.id.as_str()).join(group.n) - } else { - id.join( - rule_structure - .id - .as_ref() - .expect("ID must be set if not in group."), - ) - .join(0) - }; - - match DisambiguationRule::from_rule_structure(rule_structure, build_info) { - Ok(mut rule) => { - if error.is_none() - && (lang_options.ids.is_empty() - || lang_options.ids.iter().any(|x| x.is_match(&id))) - && !lang_options.ignore_ids.iter().any(|x| x.is_match(&id)) - { - rule.id = id; - - Some(rule) - } else { - None - } - } - Err(x) => { - if error.is_none() { - error = Some(format!("[Rule] {}", x)); - } - None - } - } - } - Err(x) => { - if error.is_none() { - error = Some(format!("[Structure] {}", x)); - } - None - } - }) - .collect(); - - if let Some(x) = error { - if lang_options.allow_errors { - warn!("Error constructing Disambiguator: {}", x) - } else { - return Err(Error::Unexpected(format!( - "Error constructing Disambiguator: {}", - x - ))); - } - } - - Ok(Tokenizer { - tagger: build_info.tagger().clone(), - sentencizer, - chunker, - multiword_tagger, - rules, - lang_options, - whitelist, - properties: Default::default(), - }) - } -} - -#[derive(Deserialize)] -struct ModelData { - outcome_labels: Vec, - pmap: DefaultHashMap, -} - -#[derive(Serialize, Deserialize)] -pub(in crate::compile) struct ContextData { - parameters: Vec, - outcomes: Vec, -} - -impl 
From for chunk::Model { - fn from(data: ModelData) -> Self { - let mut outcomes: Vec = Vec::new(); - let mut parameters: Vec = Vec::new(); - - let pmap = data - .pmap - .into_iter() - .map(|(key, value)| { - assert_eq!(value.outcomes.len(), value.parameters.len()); - - let offset = outcomes.len(); - let length = value.outcomes.len(); - - outcomes.extend(value.outcomes); - parameters.extend(value.parameters); - - (chunk::hash::hash_str(&key), (offset, length)) - }) - .collect::>(); - - chunk::Model { - outcome_labels: data.outcome_labels, - outcomes, - parameters, - pmap, - } - } -} - -impl chunk::Chunker { - pub(in crate::compile) fn from_json( - reader: R, - ) -> Result { - #[derive(Deserialize)] - struct ChunkData { - token_model: ModelData, - pos_model: ModelData, - pos_tagdict: DefaultHashMap>, - chunk_model: ModelData, - } - - let chunk_data: ChunkData = serde_json::from_reader(reader)?; - Ok(chunk::Chunker { - token_model: chunk::MaxentTokenizer { - model: chunk_data.token_model.into(), - }, - pos_model: chunk::MaxentPosTagger { - model: chunk_data.pos_model.into(), - tagdict: chunk_data.pos_tagdict, - }, - chunk_model: chunk::MaxentChunker { - model: chunk_data.chunk_model.into(), - }, - }) - } -} - -impl PosFilter { - pub(in crate::compile) fn new(matcher: PosMatcher) -> Self { - PosFilter { matcher } - } -} - -impl Regex { - pub(in crate::compile) fn from_java_regex( - java_regex_str: &str, - full_match: bool, - case_sensitive: bool, - ) -> Result { - let regex_string = - super::utils::from_java_regex(java_regex_str, case_sensitive, full_match)?; - - let regex = Regex::new(regex_string); - if let Err(error) = regex.try_compile() { - return Err(Error::Regex(error)); - } - - Ok(regex) - } -} - -impl Engine { - pub(in crate::compile) fn to_graph_id(&self, id: usize) -> Result { - let mut id = GraphId(id); - - let map = match &self { - Engine::Token(engine) => &engine.composition.id_to_idx, - Engine::Text(_, id_to_idx) => &id_to_idx, - }; - - let max_id = *map - .keys() - .max() - .ok_or_else(|| Error::Unexpected("graph is empty".into()))?; - - // ideally this should throw an error but LT is more lenient than nlprule - if !map.contains_key(&id) { - id = max_id; - } - - Ok(id) - } -} - -mod composition { - use super::*; - use crate::{ - rule::engine::composition::{ - AndAtom, Atom, Composition, FalseAtom, GraphId, NotAtom, OffsetAtom, OrAtom, Part, - Quantifier, TrueAtom, - }, - utils::regex::Regex, - }; - - impl Atom { - fn iter_mut<'a>(&'a mut self) -> Box + 'a> { - match self { - Atom::ChunkAtom(_) - | Atom::SpaceBeforeAtom(_) - | Atom::TextAtom(_) - | Atom::WordDataAtom(_) - | Atom::FalseAtom(_) - | Atom::TrueAtom(_) => Box::new(std::iter::once(self)), - Atom::AndAtom(x) => Box::new(x.atoms.iter_mut()), - Atom::OrAtom(x) => Box::new(x.atoms.iter_mut()), - Atom::NotAtom(x) => x.atom.iter_mut(), - Atom::OffsetAtom(x) => x.atom.iter_mut(), - } - } - - pub(in crate::compile) fn mut_graph_ids(&mut self) -> Vec<&mut GraphId> { - let mut ids = Vec::new(); - - for atom in self.iter_mut() { - let id = match atom { - Atom::ChunkAtom(atom) => atom.matcher.mut_graph_id(), - Atom::TextAtom(atom) => atom.matcher.matcher.mut_graph_id(), - Atom::WordDataAtom(atom) => atom - .matcher - .inflect_matcher - .as_mut() - .and_then(|x| x.matcher.mut_graph_id()), - _ => { - continue; - } - }; - - if let Some(id) = id { - ids.push(id); - } - } - - ids - } - } - - impl Matcher { - pub(in crate::compile) fn new_regex( - regex: Regex, - negate: bool, - empty_always_false: bool, - ) -> Self { - Matcher { - 
matcher: either::Right(regex), - negate, - case_sensitive: true, // handled by regex, should maybe be an option - empty_always_false, - } - } - - pub(in crate::compile) fn new_string( - string_or_idx: either::Either, - negate: bool, - case_sensitive: bool, - empty_always_false: bool, - ) -> Self { - Matcher { - matcher: either::Left(string_or_idx), - negate, - case_sensitive, - empty_always_false, - } - } - - pub(in crate::compile) fn graph_id(&self) -> Option { - if let either::Left(either::Right(id)) = &self.matcher { - Some(*id) - } else { - None - } - } - - pub(in crate::compile) fn mut_graph_id(&mut self) -> Option<&mut GraphId> { - if let either::Left(either::Right(id)) = &mut self.matcher { - Some(id) - } else { - None - } - } - } - - impl Quantifier { - pub(in crate::compile) fn new(min: usize, max: usize) -> Self { - assert!(max >= min); - Quantifier { min, max } - } - } - - impl AndAtom { - pub(in crate::compile) fn and(atoms: Vec) -> Atom { - let mut atoms: Vec<_> = atoms - .into_iter() - .filter(|x| !matches!(x, Atom::TrueAtom { .. })) - .collect(); - - if atoms.is_empty() { - (TrueAtom {}).into() - } else if atoms.len() == 1 { - atoms.remove(0) - } else { - (AndAtom { atoms }).into() - } - } - } - - impl OrAtom { - pub(in crate::compile) fn or(atoms: Vec) -> Atom { - let mut atoms: Vec<_> = atoms - .into_iter() - .filter(|x| !matches!(x, Atom::FalseAtom { .. })) - .collect(); - - if atoms.is_empty() { - (FalseAtom {}).into() - } else if atoms.len() == 1 { - atoms.remove(0) - } else { - (OrAtom { atoms }).into() - } - } - } - - impl NotAtom { - pub(in crate::compile) fn not(atom: Atom) -> Atom { - match atom { - Atom::TrueAtom { .. } => FalseAtom::default().into(), - Atom::FalseAtom { .. } => TrueAtom::default().into(), - x => (NotAtom { atom: Box::new(x) }).into(), - } - } - } - - impl OffsetAtom { - pub(in crate::compile) fn new(atom: Atom, offset: isize) -> Self { - OffsetAtom { - atom: Box::new(atom), - offset, - } - } - } - - impl Composition { - pub(in crate::compile) fn new(mut parts: Vec) -> Result { - let mut id_to_idx = DefaultHashMap::default(); - id_to_idx.insert(GraphId(0), 0); - let mut current_id = 1; - - for (i, part) in parts.iter().enumerate() { - if part.visible { - id_to_idx.insert(GraphId(current_id), i + 1); - current_id += 1; - } - } - - let can_stop_mask = (0..parts.len()) - .map(|i| parts[i..].iter().all(|x| x.quantifier.min == 0)) - .collect(); - - for (i, part) in parts.iter_mut().enumerate() { - for id in part.atom.mut_graph_ids() { - loop { - let index = *id_to_idx.get(&id).ok_or_else(|| { - Error::Unexpected(format!("id must exist in graph: {:?}", id)) - })?; - - // ideally this should throw an error but LT is more lenient than nlprule - if index > i { - *id = GraphId(id.0 - 1); - } else { - break; - } - } - } - } - - Ok(Composition { - parts, - id_to_idx, - can_stop_mask, - }) - } - } -} - -pub(in crate::compile) mod filters { - use super::Error; - use std::collections::HashMap; - - use crate::{filter::*, rule::engine::Engine, utils::regex::Regex}; - - trait FromArgs: Sized { - fn from_args(args: HashMap, engine: &Engine) -> Result; - } - - impl FromArgs for NoDisambiguationEnglishPartialPosTagFilter { - fn from_args(args: HashMap, engine: &Engine) -> Result { - if args.contains_key("negate_postag") { - panic!("negate_postag not supported in NoDisambiguationEnglishPartialPosTagFilter"); - } - - Ok(NoDisambiguationEnglishPartialPosTagFilter { - id: engine.to_graph_id(args - .get("no") - .ok_or_else(|| { - Error::Unexpected( - 
"NoDisambiguationEnglishPartialPosTagFilter must have `no` argument" - .into(), - ) - })? - .parse::()?)?, - regexp: Regex::from_java_regex( - &args.get("regexp").ok_or_else(|| { - Error::Unexpected( - "NoDisambiguationEnglishPartialPosTagFilter must have `regexp` argument" - .into(), - ) - })?, - true, - true, - )?, - postag_regexp: Regex::from_java_regex( - &args.get("postag_regexp").ok_or_else(|| { - Error::Unexpected( - "NoDisambiguationEnglishPartialPosTagFilter must have `postag_regexp` argument" - .into(), - ) - })?, - true, - true, - )?, - negate_postag: args.get("negate_postag").map_or(false, |x| x == "yes"), - }) - } - } - - pub(in crate::compile) fn get_filter( - name: &str, - args: HashMap, - engine: &Engine, - ) -> Result { - match name { - "NoDisambiguationEnglishPartialPosTagFilter" => { - Ok(NoDisambiguationEnglishPartialPosTagFilter::from_args(args, engine)?.into()) - } - _ => Err(Error::Unexpected(format!("unsupported filter {}", name))), - } - } -} diff --git a/nlprule/src/compile/mod.rs b/nlprule/src/compile/mod.rs index c1258e1..965978c 100644 --- a/nlprule/src/compile/mod.rs +++ b/nlprule/src/compile/mod.rs @@ -1,62 +1,19 @@ -//! Creates the nlprule binaries from a *build directory*. Usage information in /build/README.md. - -use fs::File; -use fs_err as fs; - use std::{ hash::{Hash, Hasher}, - io::{self, BufReader, BufWriter}, + io::BufReader, num::ParseIntError, - path::{Path, PathBuf}, - str::FromStr, - sync::Arc, + path::Path, }; -use crate::{ - rules::Rules, - tokenizer::{chunk::Chunker, multiword::MultiwordTagger, tag::Tagger, Tokenizer}, - types::DefaultHasher, -}; -use log::info; - -use self::parse_structure::{BuildInfo, RegexCache}; -use thiserror::Error; +pub mod utils; -mod impls; -mod parse_structure; -mod structure; -mod utils; - -struct BuildFilePaths { - lang_code_path: PathBuf, - tag_paths: Vec, - tag_remove_paths: Vec, - chunker_path: PathBuf, - disambiguation_path: PathBuf, - grammar_path: PathBuf, - multiword_tag_path: PathBuf, - common_words_path: PathBuf, - regex_cache_path: PathBuf, - srx_path: PathBuf, -} +use crate::components::tagger::Tagger; -impl BuildFilePaths { - fn new>(build_dir: P) -> Self { - let p = build_dir.as_ref(); - BuildFilePaths { - lang_code_path: p.join("lang_code.txt"), - tag_paths: vec![p.join("tags/output.dump"), p.join("tags/added.txt")], - tag_remove_paths: vec![p.join("tags/removed.txt")], - chunker_path: p.join("chunker.json"), - disambiguation_path: p.join("disambiguation.xml"), - grammar_path: p.join("grammar.xml"), - multiword_tag_path: p.join("tags/multiwords.txt"), - common_words_path: p.join("common.txt"), - regex_cache_path: p.join("regex_cache.bin"), - srx_path: p.join("segment.srx"), - } - } -} +use crate::types::*; +use fs_err::File; +use log::info; +use serde::{de::DeserializeOwned, Deserialize, Serialize}; +use thiserror::Error; #[derive(Error, Debug)] #[allow(missing_docs)] @@ -71,8 +28,6 @@ pub enum Error { Json(#[from] serde_json::Error), #[error(transparent)] Srx(#[from] srx::Error), - #[error("language options do not exist for '{lang_code}'")] - LanguageOptionsDoNotExist { lang_code: String }, #[error(transparent)] RegexSyntax(#[from] regex_syntax::ast::Error), #[error("regex compilation error: {0}")] @@ -83,119 +38,90 @@ pub enum Error { Unimplemented(String), #[error(transparent)] ParseError(#[from] ParseIntError), + #[error("`BuildInfo` is required to build this component, but is unset.")] + BuildInfoUnset, #[error("unknown error: {0}")] Other(#[from] Box), } -/// Compiles the binaries from a build 
directory. -pub fn compile( - build_dir: impl AsRef, - rules_dest: impl io::Write, - tokenizer_dest: impl io::Write, -) -> Result<(), Error> { - let paths = BuildFilePaths::new(&build_dir); - - let lang_code = fs::read_to_string(paths.lang_code_path)?; - - info!( - "Reading common words from {}.", - paths.common_words_path.display() - ); - let common_words = fs::read_to_string(paths.common_words_path)? - .lines() - .map(|x| x.to_string()) - .collect(); - - let tokenizer_lang_options = utils::tokenizer_lang_options(&lang_code).ok_or_else(|| { - Error::LanguageOptionsDoNotExist { - lang_code: lang_code.clone(), +pub trait BuildComponent: Sized { + type Paths: DeserializeOwned; + + fn build(paths: Self::Paths, build_info: Option<&mut BuildInfo>) -> Result; +} + +#[derive(Serialize, Deserialize, Debug)] +pub struct RegexCache { + cache: DefaultHashMap>>, + // this is compared with the hash of the word store of the tagger + word_hash: u64, +} + +impl RegexCache { + pub fn new(word_hash: u64) -> Self { + RegexCache { + cache: DefaultHashMap::default(), + word_hash, } - })?; - - let rules_lang_options = - utils::rules_lang_options(&lang_code).ok_or_else(|| Error::LanguageOptionsDoNotExist { - lang_code: lang_code.clone(), - })?; - - let tagger_lang_options = - utils::tagger_lang_options(&lang_code).ok_or_else(|| Error::LanguageOptionsDoNotExist { - lang_code: lang_code.clone(), - })?; - - info!("Creating tagger."); - let tagger = Tagger::from_dumps( - &paths.tag_paths, - &paths.tag_remove_paths, - &common_words, - tagger_lang_options, - )?; - - let mut hasher = DefaultHasher::default(); - let mut word_store = tagger.word_store().iter().collect::>(); - word_store.sort_by(|a, b| a.1.cmp(b.1)); - word_store.hash(&mut hasher); - let word_store_hash = hasher.finish(); - - let regex_cache = if let Ok(file) = File::open(&paths.regex_cache_path) { - let cache: RegexCache = bincode::deserialize_from(BufReader::new(file))?; - if *cache.word_hash() == word_store_hash { + } + + pub fn word_hash(&self) -> &u64 { + &self.word_hash + } + + pub(crate) fn get(&self, key: &u64) -> Option<&Option>> { + self.cache.get(key) + } + + pub(crate) fn insert(&mut self, key: u64, value: Option>) { + self.cache.insert(key, value); + } +} + +pub struct BuildInfo<'a> { + tagger: &'a Tagger, + regex_cache: RegexCache, +} + +impl<'a> BuildInfo<'a> { + pub fn new>(tagger: &'a Tagger, regex_cache_path: P) -> Result { + let mut hasher = DefaultHasher::default(); + let mut word_store = tagger.word_store().iter().collect::>(); + word_store.sort_by(|a, b| a.1.cmp(b.1)); + word_store.hash(&mut hasher); + let word_store_hash = hasher.finish(); + + let regex_cache = if let Ok(file) = File::open(regex_cache_path.as_ref()) { + let cache: RegexCache = bincode::deserialize_from(BufReader::new(file))?; + if *cache.word_hash() == word_store_hash { + info!( + "Regex cache at {} is valid.", + regex_cache_path.as_ref().display() + ); + cache + } else { + info!("Regex cache was provided but is not valid. Rebuilding."); + RegexCache::new(word_store_hash) + } + } else { info!( - "Regex cache at {} is valid.", - paths.regex_cache_path.display() + "No regex cache provided. Building and writing to {}.", + regex_cache_path.as_ref().display() ); - cache - } else { - info!("Regex cache was provided but is not valid. Rebuilding."); RegexCache::new(word_store_hash) - } - } else { - info!( - "No regex cache provided. 
Building and writing to {}.", - paths.regex_cache_path.display() - ); - RegexCache::new(word_store_hash) - }; - - let mut build_info = BuildInfo::new(Arc::new(tagger), regex_cache); - let chunker = if paths.chunker_path.exists() { - info!("{} exists. Building chunker.", paths.chunker_path.display()); - let reader = BufReader::new(File::open(paths.chunker_path)?); - let chunker = Chunker::from_json(reader)?; - Some(chunker) - } else { - None - }; - let multiword_tagger = if paths.multiword_tag_path.exists() { - info!( - "{} exists. Building multiword tagger.", - paths.multiword_tag_path.display() - ); - Some(MultiwordTagger::from_dump( - paths.multiword_tag_path, - &build_info, - )?) - } else { - None - }; - - info!("Creating tokenizer."); - let tokenizer = Tokenizer::from_xml( - &paths.disambiguation_path, - &mut build_info, - chunker, - multiword_tagger, - srx::SRX::from_str(&fs::read_to_string(&paths.srx_path)?)?.language_rules(lang_code), - tokenizer_lang_options, - )?; - tokenizer.to_writer(tokenizer_dest)?; - - info!("Creating grammar rules."); - let rules = Rules::from_xml(&paths.grammar_path, &mut build_info, rules_lang_options); - rules.to_writer(rules_dest)?; - - // we need to write the regex cache after building the rules, otherwise it isn't fully populated - let f = BufWriter::new(File::create(&paths.regex_cache_path)?); - bincode::serialize_into(f, build_info.mut_regex_cache())?; - - Ok(()) + }; + + Ok(BuildInfo { + tagger, + regex_cache, + }) + } + + pub fn tagger(&self) -> &'a Tagger { + self.tagger + } + + pub fn mut_regex_cache(&mut self) -> &mut RegexCache { + &mut self.regex_cache + } } diff --git a/nlprule/src/compile/utils.rs b/nlprule/src/compile/utils.rs index 73b5322..53dab59 100644 --- a/nlprule/src/compile/utils.rs +++ b/nlprule/src/compile/utils.rs @@ -1,55 +1,3 @@ -use crate::{rules::RulesLangOptions, tokenizer::TokenizerLangOptions}; -use crate::{tokenizer::tag::TaggerLangOptions, types::*}; -use lazy_static::lazy_static; - -lazy_static! { - static ref TOKENIZER_LANG_OPTIONS: DefaultHashMap = { - serde_json::from_slice(include_bytes!(concat!( - env!("OUT_DIR"), - "/", - "tokenizer_configs.json" - ))) - .expect("tokenizer configs must be valid JSON") - }; -} - -lazy_static! { - static ref RULES_LANG_OPTIONS: DefaultHashMap = { - serde_json::from_slice(include_bytes!(concat!( - env!("OUT_DIR"), - "/", - "rules_configs.json" - ))) - .expect("rules configs must be valid JSON") - }; -} - -lazy_static! 
{ - static ref TAGGER_LANG_OPTIONS: DefaultHashMap = { - serde_json::from_slice(include_bytes!(concat!( - env!("OUT_DIR"), - "/", - "tagger_configs.json" - ))) - .expect("tagger configs must be valid JSON") - }; -} - -/// Gets the tokenizer language options for the language code -pub(crate) fn tokenizer_lang_options(lang_code: &str) -> Option { - TOKENIZER_LANG_OPTIONS.get(lang_code).cloned() -} - -/// Gets the rules language options for the language code -pub(crate) fn rules_lang_options(lang_code: &str) -> Option { - RULES_LANG_OPTIONS.get(lang_code).cloned() -} - -/// Gets the tagger language options for the language code -pub(crate) fn tagger_lang_options(lang_code: &str) -> Option { - TAGGER_LANG_OPTIONS.get(lang_code).cloned() -} - pub(crate) use regex::from_java_regex; mod regex { diff --git a/nlprule/src/components/chunker/compile.rs b/nlprule/src/components/chunker/compile.rs new file mode 100644 index 0000000..73083fa --- /dev/null +++ b/nlprule/src/components/chunker/compile.rs @@ -0,0 +1,84 @@ +use std::{io::BufReader, path::PathBuf}; + +use fs_err::File; +use serde::Deserialize; + +use crate::compile::{BuildComponent, BuildInfo, Error}; + +use super::*; + +#[derive(Serialize, Deserialize)] +struct ContextData { + parameters: Vec, + outcomes: Vec, +} + +#[derive(Deserialize)] +struct ModelData { + outcome_labels: Vec, + pmap: DefaultHashMap, +} + +impl From for Model { + fn from(data: ModelData) -> Self { + let mut outcomes: Vec = Vec::new(); + let mut parameters: Vec = Vec::new(); + + let pmap = data + .pmap + .into_iter() + .map(|(key, value)| { + assert_eq!(value.outcomes.len(), value.parameters.len()); + + let offset = outcomes.len(); + let length = value.outcomes.len(); + + outcomes.extend(value.outcomes); + parameters.extend(value.parameters); + + (hash::hash_str(&key), (offset, length)) + }) + .collect::>(); + + Model { + outcome_labels: data.outcome_labels, + outcomes, + parameters, + pmap, + } + } +} + +#[derive(Deserialize)] +pub struct Paths { + chunker: PathBuf, +} + +impl BuildComponent for Chunker { + type Paths = Paths; + + fn build(paths: Paths, _build_info: Option<&mut BuildInfo>) -> Result { + #[derive(Deserialize)] + struct ChunkData { + token_model: ModelData, + pos_model: ModelData, + pos_tagdict: DefaultHashMap>, + chunk_model: ModelData, + } + + let chunk_data: ChunkData = + serde_json::from_reader(BufReader::new(File::open(paths.chunker)?))?; + Ok(Chunker { + token_model: MaxentTokenizer { + model: chunk_data.token_model.into(), + }, + pos_model: MaxentPosTagger { + model: chunk_data.pos_model.into(), + tagdict: chunk_data.pos_tagdict, + }, + chunk_model: MaxentChunker { + model: chunk_data.chunk_model.into(), + }, + }) + } +} diff --git a/nlprule/src/tokenizer/chunk.rs b/nlprule/src/components/chunker/mod.rs similarity index 99% rename from nlprule/src/tokenizer/chunk.rs rename to nlprule/src/components/chunker/mod.rs index 35cd953..e9825b5 100644 --- a/nlprule/src/tokenizer/chunk.rs +++ b/nlprule/src/components/chunker/mod.rs @@ -1,5 +1,7 @@ //! A Chunker ported from [OpenNLP](https://opennlp.apache.org/). 
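// (Editor's sketch, not part of the patch.) The per-component `compile` modules
// above implement the new `BuildComponent` trait: a `Paths` struct describing the
// required build files is deserialized (JSON is assumed here), and `build` turns it
// into the component, optionally using a shared `BuildInfo`. A generic driver could
// look roughly like this; the exact error plumbing is an assumption.
fn build_from_json<C: crate::compile::BuildComponent>(
    json: &str,
    build_info: Option<&mut crate::compile::BuildInfo>,
) -> Result<C, crate::compile::Error> {
    // `C::Paths: DeserializeOwned`, so any self-describing serde format works.
    let paths: C::Paths = serde_json::from_str(json)?;
    C::build(paths, build_info)
}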
+mod compile; + use half::bf16; use lazy_static::lazy_static; use serde::{Deserialize, Serialize}; @@ -9,6 +11,8 @@ use std::{cmp::Ordering, collections::BinaryHeap}; use crate::properties::*; use crate::types::{DefaultHashMap, DefaultHasher, Sentence}; +use super::Component; + fn softmax(vec: &mut Vec) { for x in vec.iter_mut() { *x = x.exp(); @@ -818,3 +822,9 @@ impl Transform for Chunker { Ok(sentence) } } + +impl Component for Chunker { + fn name() -> &'static str { + "chunker" + } +} diff --git a/nlprule/src/components/mod.rs b/nlprule/src/components/mod.rs new file mode 100644 index 0000000..fb157e1 --- /dev/null +++ b/nlprule/src/components/mod.rs @@ -0,0 +1,30 @@ +use std::{ + io::{BufReader, Read, Write}, + path::Path, +}; + +use fs_err::File; +use serde::{de::DeserializeOwned, Serialize}; + +pub mod chunker; +pub mod multiword_tagger; +pub mod rules; +pub mod tagger; +pub mod tokenizer; + +pub trait Component: Serialize + DeserializeOwned { + fn name() -> &'static str; + + fn new>(p: P) -> Result { + let reader = BufReader::new(File::open(p.as_ref())?); + Ok(Self::from_reader(reader)?) + } + + fn from_reader(reader: R) -> Result { + Ok(bincode::deserialize_from(reader)?) + } + + fn to_writer(&self, writer: W) -> Result<(), crate::Error> { + Ok(bincode::serialize_into(writer, self)?) + } +} diff --git a/nlprule/src/components/multiword_tagger/compile.rs b/nlprule/src/components/multiword_tagger/compile.rs new file mode 100644 index 0000000..c02b959 --- /dev/null +++ b/nlprule/src/components/multiword_tagger/compile.rs @@ -0,0 +1,46 @@ +use std::{ + io::{BufRead, BufReader}, + path::PathBuf, +}; + +use fs_err::File; + +use crate::compile::{BuildComponent, BuildInfo, Error}; + +use super::*; + +#[derive(Deserialize)] +pub struct Paths { + multiword_tags: PathBuf, +} + +impl BuildComponent for MultiwordTagger { + type Paths = Paths; + + fn build(paths: Paths, info: Option<&mut BuildInfo>) -> Result { + let tagger = info.ok_or(Error::BuildInfoUnset)?.tagger(); + + let reader = BufReader::new(File::open(paths.multiword_tags)?); + let mut multiwords = Vec::new(); + + for line in reader.lines() { + let line = line?; + + // strip comments + let line = &line[..line.find('#').unwrap_or_else(|| line.len())].trim(); + if line.is_empty() { + continue; + } + let tab_split: Vec<_> = line.split('\t').collect(); + + let word: String = tab_split[0] + .split_whitespace() + .collect::>() + .join(" "); + let pos = tagger.id_tag(tab_split[1]).into_static(); + multiwords.push((word, pos)); + } + + Ok((MultiwordTaggerFields { multiwords }).into()) + } +} diff --git a/nlprule/src/tokenizer/multiword.rs b/nlprule/src/components/multiword_tagger/mod.rs similarity index 92% rename from nlprule/src/tokenizer/multiword.rs rename to nlprule/src/components/multiword_tagger/mod.rs index f56ff9c..b27e468 100644 --- a/nlprule/src/tokenizer/multiword.rs +++ b/nlprule/src/components/multiword_tagger/mod.rs @@ -6,9 +6,13 @@ use aho_corasick::AhoCorasick; use lazy_static::lazy_static; use serde::{Deserialize, Serialize}; +use super::Component; + +mod compile; + #[derive(Serialize, Deserialize)] -pub(crate) struct MultiwordTaggerFields { - pub(crate) multiwords: Vec<(String, PosId<'static>)>, +struct MultiwordTaggerFields { + multiwords: Vec<(String, PosId<'static>)>, } impl From for MultiwordTagger { @@ -89,3 +93,9 @@ impl Transform for MultiwordTagger { Ok(sentence) } } + +impl Component for MultiwordTagger { + fn name() -> &'static str { + "multiword_tagger" + } +} diff --git 
a/nlprule/src/components/rules/compile/mod.rs b/nlprule/src/components/rules/compile/mod.rs new file mode 100644 index 0000000..c3a9e17 --- /dev/null +++ b/nlprule/src/components/rules/compile/mod.rs @@ -0,0 +1,234 @@ +mod structure; + +use std::{io::BufReader, path::PathBuf}; + +use log::warn; + +use crate::{ + compile::{BuildComponent, BuildInfo, Error}, + rule::id::Category, +}; + +use super::*; + +/// Options for a disambiguator. +#[derive(Debug, Clone, Serialize, Deserialize)] +pub(crate) struct DisambiguatorLangOptions { + /// Whether to allow errors while constructing the tokenizer. + pub allow_errors: bool, + /// Disambiguation Rule selectors to use in this tokenizer. + #[serde(default)] + pub ids: Vec, + /// Disambiguation Rule selectors to ignore in this tokenizer. + #[serde(default)] + pub ignore_ids: Vec, + /// Specific examples in the notation `{id}:{example_index}` which are known to fail. + #[serde(default)] + pub known_failures: Vec, +} + +#[derive(Deserialize)] +pub struct DisambiguatorPaths { + disambiguator_xml: PathBuf, + disambiguator_options: PathBuf, +} + +impl BuildComponent for Disambiguator { + type Paths = DisambiguatorPaths; + + fn build(paths: DisambiguatorPaths, build_info: Option<&mut BuildInfo>) -> Result { + let build_info = build_info.ok_or(Error::BuildInfoUnset)?; + + let options: DisambiguatorLangOptions = + serde_json::from_reader(BufReader::new(File::open(&paths.disambiguator_options)?))?; + let rules = structure::parse::read_disambiguation_rules(paths.disambiguator_xml); + + let mut error = None; + + let rules: Vec<_> = rules + .into_iter() + .filter_map(|x| match x { + Ok((rule_structure, group, _)) => { + let id = Category::new("DISAMBIGUATION"); + + let id = if let Some(group) = &group { + id.join(group.id.as_str()).join(group.n) + } else { + id.join( + rule_structure + .id + .as_ref() + .expect("ID must be set if not in group."), + ) + .join(0) + }; + + match DisambiguationRule::from_rule_structure(rule_structure, build_info) { + Ok(mut rule) => { + if error.is_none() + && (options.ids.is_empty() + || options.ids.iter().any(|x| x.is_match(&id))) + && !options.ignore_ids.iter().any(|x| x.is_match(&id)) + { + rule.id = id; + + Some(rule) + } else { + None + } + } + Err(x) => { + if error.is_none() { + error = Some(format!("[Rule] {}", x)); + } + None + } + } + } + Err(x) => { + if error.is_none() { + error = Some(format!("[Structure] {}", x)); + } + None + } + }) + .collect(); + + if let Some(x) = error { + if options.allow_errors { + warn!("Error constructing Disambiguator: {}", x) + } else { + return Err(Error::Unexpected(format!( + "Error constructing Disambiguator: {}", + x + ))); + } + } + + Ok(Disambiguator { rules }) + } +} + +/// Language-dependent options for a rule set. +#[derive(Debug, Clone, Serialize, Deserialize)] +pub(crate) struct RulesLangOptions { + /// Whether to allow errors while constructing the rules. + pub allow_errors: bool, + /// Grammar Rule selectors to use in this set. + #[serde(default)] + pub ids: Vec, + /// Grammar Rule selectors to ignore in this set. 
+ #[serde(default)] + pub ignore_ids: Vec, +} + +#[derive(Deserialize)] +pub struct RulesPaths { + rules_xml: PathBuf, + rules_options: PathBuf, +} + +impl BuildComponent for Rules { + type Paths = RulesPaths; + + fn build(paths: RulesPaths, build_info: Option<&mut BuildInfo>) -> Result { + let build_info = build_info.ok_or(Error::BuildInfoUnset)?; + + let options: RulesLangOptions = + serde_json::from_reader(BufReader::new(File::open(&paths.rules_options)?))?; + let rules = structure::parse::read_rules(paths.rules_xml); + let mut errors: DefaultHashMap = DefaultHashMap::new(); + + let rules: Vec<_> = rules + .into_iter() + .filter_map(|x| match x { + Ok((rule_structure, group, category)) => { + let category = category.expect("grammar rules must have category"); + let id = Category::new(category.id.as_str()); + + let id = if let Some(group) = &group { + id.join(group.id.as_str()).join(group.n) + } else { + id.join( + rule_structure + .id + .as_ref() + .expect("ID must be set if not in group."), + ) + .join(0) + }; + + let rule_on = match rule_structure.default.as_deref() { + Some("off") | Some("temp_off") => false, + Some("on") | None => true, + Some(x) => panic!("unknown `default` value: {}", x), + }; + + let group_on = match group.as_ref().and_then(|x| x.default.as_deref()) { + Some("off") | Some("temp_off") => false, + Some("on") | None => true, + Some(x) => panic!("unknown `default` value: {}", x), + }; + + let category_on = match category.default.as_deref() { + Some("off") | Some("temp_off") => false, + Some("on") | None => true, + Some(x) => panic!("unknown `default` value: {}", x), + }; + + let name = rule_structure.name.as_ref().map_or_else( + || { + let group = group.as_ref().expect("must have group if name not set"); + group.name.clone() + }, + |x| x.clone(), + ); + + match Rule::from_rule_structure(rule_structure, build_info) { + Ok(mut rule) => { + if (options.ids.is_empty() + || options.ids.iter().any(|x| x.is_match(&id))) + && !options.ignore_ids.iter().any(|x| x.is_match(&id)) + { + rule.id = id; + rule.name = name; + rule.category_name = category.name; + rule.category_type = category.kind; + rule.enabled = category_on && group_on && rule_on; + Some(rule) + } else { + None + } + } + Err(x) => { + *errors.entry(format!("[Rule] {}", x)).or_insert(0) += 1; + None + } + } + } + Err(x) => { + *errors.entry(format!("[Structure] {}", x)).or_insert(0) += 1; + None + } + }) + .collect(); + + if !errors.is_empty() { + let mut errors: Vec<(String, usize)> = errors.into_iter().collect(); + errors.sort_by_key(|x| -(x.1 as i32)); + + warn!( + "Errors constructing Rules: {:#?}", + &errors + .iter() + .map(|(message, number)| format!("{} (n={})", message, number)) + .collect::>() + ); + } + + Ok(Rules { + rules, + properties: Default::default(), + }) + } +} diff --git a/nlprule/src/components/rules/compile/structure/impls.rs b/nlprule/src/components/rules/compile/structure/impls.rs new file mode 100644 index 0000000..27920df --- /dev/null +++ b/nlprule/src/components/rules/compile/structure/impls.rs @@ -0,0 +1,375 @@ +use std::{ + collections::hash_map::DefaultHasher, + hash::{Hash, Hasher}, +}; + +use crate::utils::parallelism::MaybeParallelIterator; +use crate::{ + compile::{BuildInfo, Error}, + rule::engine::{composition::*, Engine}, + utils::regex::Regex, +}; +use crate::{rule::disambiguation::PosFilter, types::*}; + +impl TextMatcher { + pub fn new(matcher: Matcher, info: &mut BuildInfo) -> Result { + // can not cache a matcher that depends on the graph + let set = if 
matcher.graph_id().is_some() { + None + } else if let either::Right(regex) = &matcher.matcher { + let mut hasher = DefaultHasher::default(); + regex.hash(&mut hasher); + matcher.negate.hash(&mut hasher); + matcher.empty_always_false.hash(&mut hasher); + let matcher_hash = hasher.finish(); + + if let Some(set) = info.mut_regex_cache().get(&matcher_hash) { + set.clone() + } else { + let data: Vec<_> = info.tagger().word_store().iter().collect(); + + let set: DefaultHashSet<_> = data + .into_maybe_par_iter() + .filter_map(|(word, id)| { + if matcher.is_match(word.as_str(), None, None) { + Some(*id) + } else { + None + } + }) + .collect(); + + // there are some regexes which match lots of strings + // this cutoff is pretty arbitrary but without any threshold the size of some sets blows up + // the vast majority of regexes matches less than 100 strings from manual inspection + let set = if set.len() > 100 { None } else { Some(set) }; + info.mut_regex_cache().insert(matcher_hash, set.clone()); + set + } + } else { + None + }; + + Ok(TextMatcher { matcher, set }) + } +} + +impl PosMatcher { + pub fn new(matcher: Matcher, info: &mut BuildInfo) -> Result { + let mut mask = vec![false; info.tagger().tag_store().len()]; + + for (word, id) in info.tagger().tag_store().iter() { + mask[id.value() as usize] = matcher.is_match(word.as_str(), None, None); + } + + Ok(PosMatcher { mask }) + } +} + +impl PosFilter { + pub fn new(matcher: PosMatcher) -> Self { + PosFilter { matcher } + } +} + +impl Regex { + pub fn from_java_regex( + java_regex_str: &str, + full_match: bool, + case_sensitive: bool, + ) -> Result { + let regex_string = + crate::compile::utils::from_java_regex(java_regex_str, case_sensitive, full_match)?; + + let regex = Regex::new(regex_string); + if let Err(error) = regex.try_compile() { + return Err(Error::Regex(error)); + } + + Ok(regex) + } +} + +impl Engine { + pub fn to_graph_id(&self, id: usize) -> Result { + let mut id = GraphId(id); + + let map = match &self { + Engine::Token(engine) => &engine.composition.id_to_idx, + Engine::Text(_, id_to_idx) => &id_to_idx, + }; + + let max_id = *map + .keys() + .max() + .ok_or_else(|| Error::Unexpected("graph is empty".into()))?; + + // ideally this should throw an error but LT is more lenient than nlprule + if !map.contains_key(&id) { + id = max_id; + } + + Ok(id) + } +} + +mod composition { + use super::*; + use crate::{ + rule::engine::composition::{ + AndAtom, Atom, Composition, FalseAtom, GraphId, NotAtom, OffsetAtom, OrAtom, Part, + Quantifier, TrueAtom, + }, + utils::regex::Regex, + }; + + impl Atom { + fn iter_mut<'a>(&'a mut self) -> Box + 'a> { + match self { + Atom::ChunkAtom(_) + | Atom::SpaceBeforeAtom(_) + | Atom::TextAtom(_) + | Atom::WordDataAtom(_) + | Atom::FalseAtom(_) + | Atom::TrueAtom(_) => Box::new(std::iter::once(self)), + Atom::AndAtom(x) => Box::new(x.atoms.iter_mut()), + Atom::OrAtom(x) => Box::new(x.atoms.iter_mut()), + Atom::NotAtom(x) => x.atom.iter_mut(), + Atom::OffsetAtom(x) => x.atom.iter_mut(), + } + } + + pub fn mut_graph_ids(&mut self) -> Vec<&mut GraphId> { + let mut ids = Vec::new(); + + for atom in self.iter_mut() { + let id = match atom { + Atom::ChunkAtom(atom) => atom.matcher.mut_graph_id(), + Atom::TextAtom(atom) => atom.matcher.matcher.mut_graph_id(), + Atom::WordDataAtom(atom) => atom + .matcher + .inflect_matcher + .as_mut() + .and_then(|x| x.matcher.mut_graph_id()), + _ => { + continue; + } + }; + + if let Some(id) = id { + ids.push(id); + } + } + + ids + } + } + + impl Matcher { + pub fn 
new_regex(regex: Regex, negate: bool, empty_always_false: bool) -> Self { + Matcher { + matcher: either::Right(regex), + negate, + case_sensitive: true, // handled by regex, should maybe be an option + empty_always_false, + } + } + + pub fn new_string( + string_or_idx: either::Either, + negate: bool, + case_sensitive: bool, + empty_always_false: bool, + ) -> Self { + Matcher { + matcher: either::Left(string_or_idx), + negate, + case_sensitive, + empty_always_false, + } + } + + pub fn graph_id(&self) -> Option { + if let either::Left(either::Right(id)) = &self.matcher { + Some(*id) + } else { + None + } + } + + pub fn mut_graph_id(&mut self) -> Option<&mut GraphId> { + if let either::Left(either::Right(id)) = &mut self.matcher { + Some(id) + } else { + None + } + } + } + + impl Quantifier { + pub fn new(min: usize, max: usize) -> Self { + assert!(max >= min); + Quantifier { min, max } + } + } + + impl AndAtom { + pub fn and(atoms: Vec) -> Atom { + let mut atoms: Vec<_> = atoms + .into_iter() + .filter(|x| !matches!(x, Atom::TrueAtom { .. })) + .collect(); + + if atoms.is_empty() { + (TrueAtom {}).into() + } else if atoms.len() == 1 { + atoms.remove(0) + } else { + (AndAtom { atoms }).into() + } + } + } + + impl OrAtom { + pub fn or(atoms: Vec) -> Atom { + let mut atoms: Vec<_> = atoms + .into_iter() + .filter(|x| !matches!(x, Atom::FalseAtom { .. })) + .collect(); + + if atoms.is_empty() { + (FalseAtom {}).into() + } else if atoms.len() == 1 { + atoms.remove(0) + } else { + (OrAtom { atoms }).into() + } + } + } + + impl NotAtom { + pub fn not(atom: Atom) -> Atom { + match atom { + Atom::TrueAtom { .. } => FalseAtom::default().into(), + Atom::FalseAtom { .. } => TrueAtom::default().into(), + x => (NotAtom { atom: Box::new(x) }).into(), + } + } + } + + impl OffsetAtom { + pub fn new(atom: Atom, offset: isize) -> Self { + OffsetAtom { + atom: Box::new(atom), + offset, + } + } + } + + impl Composition { + pub fn new(mut parts: Vec) -> Result { + let mut id_to_idx = DefaultHashMap::default(); + id_to_idx.insert(GraphId(0), 0); + let mut current_id = 1; + + for (i, part) in parts.iter().enumerate() { + if part.visible { + id_to_idx.insert(GraphId(current_id), i + 1); + current_id += 1; + } + } + + let can_stop_mask = (0..parts.len()) + .map(|i| parts[i..].iter().all(|x| x.quantifier.min == 0)) + .collect(); + + for (i, part) in parts.iter_mut().enumerate() { + for id in part.atom.mut_graph_ids() { + loop { + let index = *id_to_idx.get(&id).ok_or_else(|| { + Error::Unexpected(format!("id must exist in graph: {:?}", id)) + })?; + + // ideally this should throw an error but LT is more lenient than nlprule + if index > i { + *id = GraphId(id.0 - 1); + } else { + break; + } + } + } + } + + Ok(Composition { + parts, + id_to_idx, + can_stop_mask, + }) + } + } +} + +pub mod filters { + use super::Error; + use std::collections::HashMap; + + use crate::{filter::*, rule::engine::Engine, utils::regex::Regex}; + + trait FromArgs: Sized { + fn from_args(args: HashMap, engine: &Engine) -> Result; + } + + impl FromArgs for NoDisambiguationEnglishPartialPosTagFilter { + fn from_args(args: HashMap, engine: &Engine) -> Result { + if args.contains_key("negate_postag") { + panic!("negate_postag not supported in NoDisambiguationEnglishPartialPosTagFilter"); + } + + Ok(NoDisambiguationEnglishPartialPosTagFilter { + id: engine.to_graph_id(args + .get("no") + .ok_or_else(|| { + Error::Unexpected( + "NoDisambiguationEnglishPartialPosTagFilter must have `no` argument" + .into(), + ) + })? 
+ .parse::()?)?, + regexp: Regex::from_java_regex( + &args.get("regexp").ok_or_else(|| { + Error::Unexpected( + "NoDisambiguationEnglishPartialPosTagFilter must have `regexp` argument" + .into(), + ) + })?, + true, + true, + )?, + postag_regexp: Regex::from_java_regex( + &args.get("postag_regexp").ok_or_else(|| { + Error::Unexpected( + "NoDisambiguationEnglishPartialPosTagFilter must have `postag_regexp` argument" + .into(), + ) + })?, + true, + true, + )?, + negate_postag: args.get("negate_postag").map_or(false, |x| x == "yes"), + }) + } + } + + pub fn get_filter( + name: &str, + args: HashMap, + engine: &Engine, + ) -> Result { + match name { + "NoDisambiguationEnglishPartialPosTagFilter" => { + Ok(NoDisambiguationEnglishPartialPosTagFilter::from_args(args, engine)?.into()) + } + _ => Err(Error::Unexpected(format!("unsupported filter {}", name))), + } + } +} diff --git a/nlprule/src/compile/structure.rs b/nlprule/src/components/rules/compile/structure/mod.rs similarity index 79% rename from nlprule/src/compile/structure.rs rename to nlprule/src/components/rules/compile/structure/mod.rs index eb38b43..883b9fc 100644 --- a/nlprule/src/compile/structure.rs +++ b/nlprule/src/components/rules/compile/structure/mod.rs @@ -1,7 +1,7 @@ -use fs_err::File; use serde::Deserialize; -use std::io::BufReader; -use xml::reader::EventReader; + +pub mod impls; +pub mod parse; mod preprocess { use std::{borrow::Cow, str::FromStr}; @@ -639,154 +639,3 @@ pub enum DisambiguationRuleContainer { RuleGroup(DisambiguationRuleGroup), Unification(Unification), } - -macro_rules! flatten_group { - ($rulegroup:expr, $category:expr) => {{ - let group_antipatterns = if let Some(antipatterns) = $rulegroup.antipatterns { - antipatterns - } else { - Vec::new() - }; - - let group = Group { - id: $rulegroup.id, - default: $rulegroup.default, - name: $rulegroup.name, - n: 0, - }; - - $rulegroup - .rules - .into_iter() - .enumerate() - .map(|(i, mut rule)| { - if let Some(antipatterns) = &mut rule.antipatterns { - antipatterns.extend(group_antipatterns.clone()); - } else { - rule.antipatterns = Some(group_antipatterns.clone()); - } - - let mut group = group.clone(); - group.n = i; - (rule, Some(group), $category.clone()) - }) - .collect::>() - }}; -} - -type GrammarRuleReading = (Rule, Option, Option); -type DisambiguationRuleReading = (DisambiguationRule, Option, Option); - -pub fn read_rules>( - path: P, -) -> Vec> { - let file = File::open(path.as_ref()).unwrap(); - let file = BufReader::new(file); - - let sanitized = preprocess::sanitize(file, &["suggestion"]); - let rules = preprocess::extract_rules(sanitized.as_bytes()); - - let mut unifications = Vec::new(); - - let rules: Vec<_> = rules - .into_iter() - .map(|(xml, category)| { - let mut out = Vec::new(); - - let deseralized = RuleContainer::deserialize(&mut serde_xml_rs::Deserializer::new( - EventReader::new(xml.as_bytes()), - )); - - out.extend(match deseralized { - Ok(rule_container) => match rule_container { - RuleContainer::Rule(rule) => { - vec![Ok((rule, None, category))] - } - RuleContainer::RuleGroup(rule_group) => flatten_group!(rule_group, category) - .into_iter() - .map(Ok) - .collect(), - RuleContainer::Unification(unification) => { - unifications.push(unification); - - vec![] - } - }, - Err(err) => vec![Err(err)], - }); - out - }) - .flatten() - .collect(); - - rules - .into_iter() - .map(|result| match result { - Ok(mut x) => { - x.0.unifications = Some(unifications.clone()); - - Ok(x) - } - Err(x) => Err(x), - }) - .collect() -} - -pub fn 
read_disambiguation_rules>( - path: P, -) -> Vec> { - let file = File::open(path.as_ref()).unwrap(); - let file = BufReader::new(file); - - let sanitized = preprocess::sanitize(file, &[]); - let rules = preprocess::extract_rules(sanitized.as_bytes()); - - let mut unifications = Vec::new(); - - let rules: Vec<_> = rules - .into_iter() - .map(|(xml, _)| { - let mut out = Vec::new(); - - let deseralized = DisambiguationRuleContainer::deserialize( - &mut serde_xml_rs::Deserializer::new(EventReader::new(xml.as_bytes())), - ); - - let category: Option = None; - - out.extend(match deseralized { - Ok(rule_container) => match rule_container { - DisambiguationRuleContainer::Rule(rule) => { - vec![Ok((rule, None, category))] - } - DisambiguationRuleContainer::RuleGroup(rule_group) => { - flatten_group!(rule_group, category) - .into_iter() - .map(Ok) - .collect() - } - DisambiguationRuleContainer::Unification(unification) => { - unifications.push(unification); - - vec![] - } - }, - Err(err) => vec![Err(err)], - }); - out - }) - .flatten() - .collect(); - - rules - .into_iter() - .map(|result| match result { - Ok(mut x) => { - x.0.unifications = Some(unifications.clone()); - - Ok(x) - } - Err(x) => Err(x), - }) - .collect() -} diff --git a/nlprule/src/compile/parse_structure.rs b/nlprule/src/components/rules/compile/structure/parse.rs similarity index 73% rename from nlprule/src/compile/parse_structure.rs rename to nlprule/src/components/rules/compile/structure/parse.rs index 497001d..d05c5e0 100644 --- a/nlprule/src/compile/parse_structure.rs +++ b/nlprule/src/components/rules/compile/structure/parse.rs @@ -1,12 +1,12 @@ -use std::{ops::Range, sync::Arc}; +use std::{io::BufReader, ops::Range}; -use super::{structure, Error}; -use crate::{tokenizer::tag::Tagger, types::*}; +use crate::compile::{BuildInfo, Error}; +use crate::types::*; use crate::{utils, utils::regex::Regex}; +use fs_err::File; use lazy_static::lazy_static; -use serde::{Deserialize, Serialize}; - -pub use structure::{read_disambiguation_rules, read_rules}; +use serde::Deserialize; +use serde_xml_rs::EventReader; use crate::rule::disambiguation::*; use crate::rule::engine::composition::concrete::*; @@ -15,64 +15,16 @@ use crate::rule::engine::*; use crate::rule::grammar::*; use crate::rule::{id::Index, DisambiguationRule, Rule, Unification}; +use super::Category; + // this is set arbitrarily at the moment, could be an option #[inline] fn max_matches() -> usize { 20 } -#[derive(Serialize, Deserialize, Debug)] -pub(crate) struct RegexCache { - cache: DefaultHashMap>>, - // this is compared with the hash of the word store of the tagger - word_hash: u64, -} - -impl RegexCache { - pub fn new(word_hash: u64) -> Self { - RegexCache { - cache: DefaultHashMap::default(), - word_hash, - } - } - - pub fn word_hash(&self) -> &u64 { - &self.word_hash - } - - pub(crate) fn get(&self, key: &u64) -> Option<&Option>> { - self.cache.get(key) - } - - pub(crate) fn insert(&mut self, key: u64, value: Option>) { - self.cache.insert(key, value); - } -} - -pub(crate) struct BuildInfo { - tagger: Arc, - regex_cache: RegexCache, -} - -impl BuildInfo { - pub fn new(tagger: Arc, regex_cache: RegexCache) -> Self { - BuildInfo { - tagger, - regex_cache, - } - } - - pub fn tagger(&self) -> &Arc { - &self.tagger - } - - pub fn mut_regex_cache(&mut self) -> &mut RegexCache { - &mut self.regex_cache - } -} - fn parse_match_attribs( - attribs: impl structure::MatchAttributes, + attribs: impl super::MatchAttributes, text: Option<&str>, case_sensitive: bool, text_match_idx: 
Option, @@ -149,11 +101,11 @@ fn parse_match_attribs( }; if inflected { - inflect_matcher = Some(matcher); + inflect_matcher = Some(TextMatcher::new(matcher, info)?); } else { atoms.push( (TextAtom { - matcher: TextMatcher::new(matcher, info), + matcher: TextMatcher::new(matcher, info)?, }) .into(), ); @@ -172,13 +124,13 @@ fn parse_match_attribs( true, ) }; - pos_matcher = Some(PosMatcher::new(raw_matcher, info)); + pos_matcher = Some(PosMatcher::new(raw_matcher, info)?); } if pos_matcher.is_some() || inflect_matcher.is_some() { let matcher = WordDataMatcher { pos_matcher, - inflect_matcher: inflect_matcher.map(|x| TextMatcher::new(x, info)), + inflect_matcher, }; atoms.push( (WordDataAtom { @@ -234,7 +186,7 @@ fn parse_match_attribs( } fn get_exceptions( - token: &structure::Token, + token: &super::Token, case_sensitive: bool, only_shifted: bool, info: &mut BuildInfo, @@ -243,7 +195,7 @@ fn get_exceptions( let exceptions: Vec = parts .iter() .filter_map(|x| match x { - structure::TokenPart::Exception(x) => Some(x), + super::TokenPart::Exception(x) => Some(x), _ => None, }) .filter_map(|x| { @@ -287,14 +239,14 @@ fn get_exceptions( } fn parse_token( - token: &structure::Token, + token: &super::Token, case_sensitive: bool, info: &mut BuildInfo, ) -> Result, Error> { let mut parts = Vec::new(); let text = if let Some(parts) = &token.parts { parts.iter().find_map(|x| match x { - structure::TokenPart::Text(text) => Some(text.as_str()), + super::TokenPart::Text(text) => Some(text.as_str()), _ => None, }) } else { @@ -303,7 +255,7 @@ fn parse_token( let text_match_idx = if let Some(parts) = &token.parts { match parts.iter().find_map(|x| match x { - structure::TokenPart::Sub(sub) => Some(sub.no.parse::().map(|x| x + 1)), + super::TokenPart::Sub(sub) => Some(sub.no.parse::().map(|x| x + 1)), _ => None, }) { None => None, @@ -374,7 +326,7 @@ fn parse_token( Ok(parts) } -fn parse_match(m: structure::Match, engine: &Engine, info: &mut BuildInfo) -> Result { +fn parse_match(m: super::Match, engine: &Engine, info: &mut BuildInfo) -> Result { if m.postag.is_some() || m.postag_regex.is_some() || m.postag_replace.is_some() @@ -418,7 +370,7 @@ fn parse_match(m: structure::Match, engine: &Engine, info: &mut BuildInfo) -> Re x => panic!("unknown postag_regex value {:?}", x), }; Some(PosReplacer { - matcher: PosMatcher::new(matcher, info), + matcher: PosMatcher::new(matcher, info)?, }) } else { None @@ -495,17 +447,17 @@ fn parse_synthesizer_text(text: &str, engine: &Engine) -> Result Result { let mut parts = Vec::new(); for part in data.parts { match part { - structure::SuggestionPart::Text(text) => { + super::SuggestionPart::Text(text) => { parts.extend(parse_synthesizer_text(text.as_str(), engine)?); } - structure::SuggestionPart::Match(m) => { + super::SuggestionPart::Match(m) => { parts.push(SynthesizerPart::Match(parse_match(m, engine, info)?.into())); } } @@ -523,7 +475,7 @@ fn get_last_id(parts: &[Part]) -> isize { } fn parse_parallel_tokens( - tokens: &[structure::Token], + tokens: &[super::Token], case_sensitive: bool, info: &mut BuildInfo, ) -> Result, Error> { @@ -544,7 +496,7 @@ fn parse_parallel_tokens( } fn parse_tokens( - tokens: &[structure::TokenCombination], + tokens: &[super::TokenCombination], case_sensitive: bool, info: &mut BuildInfo, ) -> Result, Error> { @@ -552,8 +504,8 @@ fn parse_tokens( for token_combination in tokens { out.extend(match token_combination { - structure::TokenCombination::Token(token) => parse_token(token, case_sensitive, info)?, - 
structure::TokenCombination::And(tokens) => { + super::TokenCombination::Token(token) => parse_token(token, case_sensitive, info)?, + super::TokenCombination::And(tokens) => { let atom = AndAtom::and(parse_parallel_tokens(&tokens.tokens, case_sensitive, info)?); vec![Part { @@ -564,7 +516,7 @@ fn parse_tokens( unify: tokens.tokens[0].unify.as_ref().map(|x| x == "yes"), }] } - structure::TokenCombination::Or(tokens) => { + super::TokenCombination::Or(tokens) => { let atom = OrAtom::or(parse_parallel_tokens(&tokens.tokens, case_sensitive, info)?); vec![Part { atom, @@ -574,7 +526,7 @@ fn parse_tokens( unify: tokens.tokens[0].unify.as_ref().map(|x| x == "yes"), }] } - structure::TokenCombination::Feature(_) => Vec::new(), + super::TokenCombination::Feature(_) => Vec::new(), }); } @@ -582,7 +534,7 @@ fn parse_tokens( } fn parse_pattern( - pattern: structure::Pattern, + pattern: super::Pattern, info: &mut BuildInfo, ) -> Result<(Composition, usize, usize), Error> { let mut start = None; @@ -596,17 +548,17 @@ fn parse_pattern( for part in &pattern.parts { match part { - structure::PatternPart::Token(token) => { + super::PatternPart::Token(token) => { composition_parts.extend(parse_token(token, case_sensitive, info)?) } - structure::PatternPart::Marker(marker) => { + super::PatternPart::Marker(marker) => { start = Some(get_last_id(&composition_parts)); composition_parts.extend(parse_tokens(&marker.tokens, case_sensitive, info)?); end = Some(get_last_id(&composition_parts)); } - structure::PatternPart::And(tokens) => { + super::PatternPart::And(tokens) => { let atom = AndAtom::and(parse_parallel_tokens(&tokens.tokens, case_sensitive, info)?); @@ -618,7 +570,7 @@ fn parse_pattern( unify: tokens.tokens[0].unify.as_ref().map(|x| x == "yes"), }); } - structure::PatternPart::Or(tokens) => { + super::PatternPart::Or(tokens) => { let atom = OrAtom::or(parse_parallel_tokens(&tokens.tokens, case_sensitive, info)?); composition_parts.push(Part { @@ -629,7 +581,7 @@ fn parse_pattern( unify: tokens.tokens[0].unify.as_ref().map(|x| x == "yes"), }); } - structure::PatternPart::Feature(_) => {} + super::PatternPart::Feature(_) => {} } } @@ -642,12 +594,12 @@ fn parse_pattern( } fn parse_features( - pattern: &structure::Pattern, - unifications: &Option>, + pattern: &super::Pattern, + unifications: &Option>, info: &mut BuildInfo, -) -> Vec> { +) -> Result>, Error> { let mut filters = Vec::new(); - let mut parse_feature = |id: &str| -> Vec { + let mut parse_feature = |id: &str| -> Result, Error> { let unification = unifications .as_ref() .unwrap() @@ -670,11 +622,11 @@ fn parse_features( for part in &pattern.parts { match part { - structure::PatternPart::Feature(feature) => filters.push(parse_feature(&feature.id)), - structure::PatternPart::Marker(marker) => { + super::PatternPart::Feature(feature) => filters.push(parse_feature(&feature.id)?), + super::PatternPart::Marker(marker) => { for token_combination in &marker.tokens { - if let structure::TokenCombination::Feature(feature) = token_combination { - filters.push(parse_feature(&feature.id)); + if let super::TokenCombination::Feature(feature) = token_combination { + filters.push(parse_feature(&feature.id)?); } } } @@ -682,14 +634,11 @@ fn parse_features( } } - filters + Ok(filters) } impl Rule { - pub(crate) fn from_rule_structure( - data: structure::Rule, - info: &mut BuildInfo, - ) -> Result { + pub fn from_rule_structure(data: super::Rule, info: &mut BuildInfo) -> Result { if data.filter.is_some() { return Err(Error::Unimplemented( "rules with filter are not 
implemented.".into(), @@ -756,7 +705,7 @@ impl Rule { }; let unify_data = if let Some(pattern) = &data.pattern { - let unify_filters = parse_features(&pattern, &data.unifications, info); + let unify_filters = parse_features(&pattern, &data.unifications, info)?; let unify_mask: Vec<_> = maybe_composition .unwrap() .parts @@ -773,16 +722,16 @@ impl Rule { for part in data.message.parts { match part { - structure::MessagePart::Suggestion(suggestion) => { + super::MessagePart::Suggestion(suggestion) => { let suggester = parse_suggestion(suggestion.clone(), &engine, info)?; // simpler to just parse a second time than cloning the result message_parts.extend(parse_suggestion(suggestion, &engine, info)?.parts); suggesters.push(suggester); } - structure::MessagePart::Text(text) => { + super::MessagePart::Text(text) => { message_parts.extend(parse_synthesizer_text(text.as_str(), &engine)?); } - structure::MessagePart::Match(m) => { + super::MessagePart::Match(m) => { message_parts.push(SynthesizerPart::Match( parse_match(m, &engine, info)?.into(), )); @@ -817,10 +766,10 @@ impl Rule { for part in &example.parts { match part { - structure::ExamplePart::Text(text) => { + super::ExamplePart::Text(text) => { texts.push(text.as_str()); } - structure::ExamplePart::Marker(marker) => { + super::ExamplePart::Marker(marker) => { let (bytes_before, chars_before) = texts.iter().fold((0, 0), |acc, text| { (acc.0 + text.len(), acc.1 + text.chars().count()) @@ -911,6 +860,8 @@ fn parse_tag_form( is_sentence_end: bool, info: &mut BuildInfo, ) -> Result, Error> { + let tagger = info.tagger(); + lazy_static! { static ref REGEX: Regex = Regex::new(r"(.+?)\[(.+?)\]".into()); } @@ -935,8 +886,8 @@ fn parse_tag_form( None } else { Some(WordData::new( - info.tagger.id_word(parts[0].to_owned().into()), - info.tagger.id_tag(parts[1]).into_static(), + tagger.id_word(parts[0].to_owned().into()), + tagger.id_tag(parts[1]).into_static(), )) } }) @@ -944,7 +895,7 @@ fn parse_tag_form( tags.push( WordData::new( - info.tagger.id_word(text.to_owned().into()), + tagger.id_word(text.to_owned().into()), PosId::special(SpecialPos::None), ) .freeze(), @@ -958,19 +909,23 @@ fn parse_tag_form( } impl WordData<'static> { - fn from_structure(data: structure::WordData, info: &mut BuildInfo) -> Self { - WordData::new( - info.tagger + fn from_structure(data: super::WordData, info: &mut BuildInfo) -> Result { + Ok(WordData::new( + info.tagger() .id_word(data.lemma.unwrap_or_else(String::new).into()), - info.tagger + info.tagger() .id_tag(data.pos.as_deref().unwrap_or("").trim()) .into_static(), - ) + )) } } -fn parse_pos_filter(postag: &str, postag_regexp: Option<&str>, info: &mut BuildInfo) -> PosFilter { - match postag_regexp.as_deref() { +fn parse_pos_filter( + postag: &str, + postag_regexp: Option<&str>, + info: &mut BuildInfo, +) -> Result { + Ok(match postag_regexp.as_deref() { Some("yes") => PosFilter::new(PosMatcher::new( Matcher::new_regex( Regex::from_java_regex(&postag, true, true).unwrap(), @@ -978,23 +933,23 @@ fn parse_pos_filter(postag: &str, postag_regexp: Option<&str>, info: &mut BuildI true, ), info, - )), + )?), Some(_) | None => PosFilter::new(PosMatcher::new( Matcher::new_string(either::Left(postag.into()), false, false, true), info, - )), - } + )?), + }) } impl DisambiguationRule { - pub(crate) fn from_rule_structure( - data: structure::DisambiguationRule, + pub fn from_rule_structure( + data: super::DisambiguationRule, info: &mut BuildInfo, ) -> Result { // might need the pattern later so clone it here let (composition, 
start, end) = parse_pattern(data.pattern.clone(), info)?; - let unify_filters = parse_features(&data.pattern, &data.unifications, info); + let unify_filters = parse_features(&data.pattern, &data.unifications, info)?; let unify_mask: Vec<_> = composition.parts.iter().map(|part| part.unify).collect(); let antipatterns = if let Some(antipatterns) = data.antipatterns { @@ -1023,25 +978,25 @@ impl DisambiguationRule { let word_datas: Vec<_> = if let Some(wds) = data.disambig.word_datas { wds.into_iter() .map(|part| match part { - structure::DisambiguationPart::WordData(x) => { - either::Left(WordData::from_structure(x, info)) + super::DisambiguationPart::WordData(x) => { + WordData::from_structure(x, info).map(either::Left) + } + super::DisambiguationPart::Match(x) => { + parse_pos_filter(&x.postag.unwrap(), x.postag_regexp.as_deref(), info) + .map(either::Right) } - structure::DisambiguationPart::Match(x) => either::Right(parse_pos_filter( - &x.postag.unwrap(), - x.postag_regexp.as_deref(), - info, - )), }) - .collect() + .collect::>()? } else { Vec::new() }; + let tagger = info.tagger(); let disambiguations = match data.disambig.action.as_deref() { Some("remove") => { if let Some(postag) = data.disambig.postag.as_ref() { Ok(Disambiguation::Remove(vec![either::Right( - parse_pos_filter(postag, Some("yes"), info), + parse_pos_filter(postag, Some("yes"), info)?, )])) } else { Ok(Disambiguation::Remove(word_datas.into_iter().collect())) @@ -1079,45 +1034,59 @@ impl DisambiguationRule { for part in &data.pattern.parts { match part { - structure::PatternPart::Marker(marker) => { + super::PatternPart::Marker(marker) => { has_marker = true; for token in &marker.tokens { let token = match token { - structure::TokenCombination::Token(token) => token, - structure::TokenCombination::And(tokens) - | structure::TokenCombination::Or(tokens) => &tokens.tokens[0], - structure::TokenCombination::Feature(_) => continue, + super::TokenCombination::Token(token) => token, + super::TokenCombination::And(tokens) + | super::TokenCombination::Or(tokens) => &tokens.tokens[0], + super::TokenCombination::Feature(_) => continue, }; - marker_disambig.push(token.postag.as_ref().map(|x| { - either::Right(parse_pos_filter( - x, - token.postag_regexp.as_deref(), - info, - )) - })); + marker_disambig.push( + token + .postag + .as_ref() + .map(|x| { + parse_pos_filter( + x, + token.postag_regexp.as_deref(), + info, + ) + .map(either::Right) + }) + .transpose()?, + ); } } - structure::PatternPart::Token(token) => { - disambig.push(token.postag.as_ref().map(|x| { - either::Right(parse_pos_filter( - x, - token.postag_regexp.as_deref(), - info, - )) - })) - } - structure::PatternPart::And(tokens) - | structure::PatternPart::Or(tokens) => { - disambig.push(tokens.tokens[0].postag.as_ref().map(|x| { - either::Right(parse_pos_filter( - x, - tokens.tokens[0].postag_regexp.as_deref(), - info, - )) - })) + super::PatternPart::Token(token) => disambig.push( + token + .postag + .as_ref() + .map(|x| { + parse_pos_filter(x, token.postag_regexp.as_deref(), info) + .map(either::Right) + }) + .transpose()?, + ), + super::PatternPart::And(tokens) | super::PatternPart::Or(tokens) => { + disambig.push( + tokens.tokens[0] + .postag + .as_ref() + .map(|x| { + parse_pos_filter( + x, + tokens.tokens[0].postag_regexp.as_deref(), + info, + ) + .map(either::Right) + }) + .transpose()?, + ) } - structure::PatternPart::Feature(_) => {} + super::PatternPart::Feature(_) => {} } } @@ -1129,7 +1098,7 @@ impl DisambiguationRule { Ok(Disambiguation::Filter( 
disambiguations.into_iter().collect(), - info.tagger().lang_options().retain_last, + tagger.lang_options().retain_last, )) } Some("filter") => { @@ -1139,13 +1108,13 @@ impl DisambiguationRule { postag, Some("yes"), info, - )))], - info.tagger().lang_options().retain_last, + )?))], + tagger.lang_options().retain_last, )) } else { Ok(Disambiguation::Filter( word_datas.into_iter().map(Some).collect(), - info.tagger().lang_options().retain_last, + tagger.lang_options().retain_last, )) } } @@ -1159,36 +1128,61 @@ impl DisambiguationRule { for part in &data.pattern.parts { match part { - structure::PatternPart::Marker(marker) => { + super::PatternPart::Marker(marker) => { has_marker = true; for token in &marker.tokens { let token = match token { - structure::TokenCombination::Token(token) => token, - structure::TokenCombination::And(tokens) - | structure::TokenCombination::Or(tokens) => &tokens.tokens[0], - structure::TokenCombination::Feature(_) => continue, + super::TokenCombination::Token(token) => token, + super::TokenCombination::And(tokens) + | super::TokenCombination::Or(tokens) => &tokens.tokens[0], + super::TokenCombination::Feature(_) => continue, }; - marker_disambig.push(token.postag.as_ref().map(|x| { - parse_pos_filter(x, token.postag_regexp.as_deref(), info) - })); + marker_disambig.push( + token + .postag + .as_ref() + .map(|x| { + parse_pos_filter( + x, + token.postag_regexp.as_deref(), + info, + ) + }) + .transpose()?, + ); marker_mask.push(token.unify.is_some()) } } - structure::PatternPart::Token(token) => { - disambig.push(token.postag.as_ref().map(|x| { - parse_pos_filter(x, token.postag_regexp.as_deref(), info) - })); + super::PatternPart::Token(token) => { + disambig.push( + token + .postag + .as_ref() + .map(|x| { + parse_pos_filter(x, token.postag_regexp.as_deref(), info) + }) + .transpose()?, + ); mask.push(token.unify.is_some()); } - structure::PatternPart::And(tokens) - | structure::PatternPart::Or(tokens) => { - disambig.push(tokens.tokens[0].postag.as_ref().map(|x| { - parse_pos_filter(x, tokens.tokens[0].postag_regexp.as_deref(), info) - })); + super::PatternPart::And(tokens) | super::PatternPart::Or(tokens) => { + disambig.push( + tokens.tokens[0] + .postag + .as_ref() + .map(|x| { + parse_pos_filter( + x, + tokens.tokens[0].postag_regexp.as_deref(), + info, + ) + }) + .transpose()?, + ); mask.push(tokens.tokens[0].unify.is_some()); } - structure::PatternPart::Feature(_) => {} + super::PatternPart::Feature(_) => {} } } @@ -1204,15 +1198,15 @@ impl DisambiguationRule { if let Some(postag) = data.disambig.postag.as_ref() { Ok(Disambiguation::Filter( vec![Some(either::Left(WordData::new( - info.tagger.id_word("".into()), - info.tagger.id_tag(postag).into_static(), + tagger.id_word("".into()), + tagger.id_tag(postag).into_static(), )))], - info.tagger().lang_options().retain_last, + tagger.lang_options().retain_last, )) } else { Ok(Disambiguation::Filter( word_datas.into_iter().map(Some).collect(), - info.tagger().lang_options().retain_last, + tagger.lang_options().retain_last, )) } } @@ -1251,11 +1245,11 @@ impl DisambiguationRule { for part in &example.parts { match part { - structure::ExamplePart::Text(text) => { + super::ExamplePart::Text(text) => { texts.push(text.as_str()); char_length += text.chars().count(); } - structure::ExamplePart::Marker(marker) => { + super::ExamplePart::Marker(marker) => { if char_span.is_some() { return Err(Error::Unexpected( "example must have one or zero markers".into(), @@ -1328,3 +1322,160 @@ impl DisambiguationRule { }) } } + 
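// (Editor's annotation, not part of the patch.) `flatten_group!` below expands a
// rule group into its member rules: the group's antipatterns are appended to each
// rule's own antipatterns, and every rule is paired with a copy of the group whose
// `n` field records its index, so the builder can later derive ids of the form
// `{category}.{group id}.{n}`.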
+macro_rules! flatten_group { + ($rulegroup:expr, $category:expr) => {{ + let group_antipatterns = if let Some(antipatterns) = $rulegroup.antipatterns { + antipatterns + } else { + Vec::new() + }; + + let group = super::Group { + id: $rulegroup.id, + default: $rulegroup.default, + name: $rulegroup.name, + n: 0, + }; + + $rulegroup + .rules + .into_iter() + .enumerate() + .map(|(i, mut rule)| { + if let Some(antipatterns) = &mut rule.antipatterns { + antipatterns.extend(group_antipatterns.clone()); + } else { + rule.antipatterns = Some(group_antipatterns.clone()); + } + + let mut group = group.clone(); + group.n = i; + (rule, Some(group), $category.clone()) + }) + .collect::>() + }}; +} + +type GrammarRuleReading = (super::Rule, Option, Option); +type DisambiguationRuleReading = ( + super::DisambiguationRule, + Option, + Option, +); + +pub fn read_rules>( + path: P, +) -> Vec> { + let file = File::open(path.as_ref()).unwrap(); + let file = BufReader::new(file); + + let sanitized = super::preprocess::sanitize(file, &["suggestion"]); + let rules = super::preprocess::extract_rules(sanitized.as_bytes()); + + let mut unifications = Vec::new(); + + let rules: Vec<_> = rules + .into_iter() + .map(|(xml, category)| { + let mut out = Vec::new(); + + let deseralized = super::RuleContainer::deserialize( + &mut serde_xml_rs::Deserializer::new(EventReader::new(xml.as_bytes())), + ); + + out.extend(match deseralized { + Ok(rule_container) => match rule_container { + super::RuleContainer::Rule(rule) => { + vec![Ok((rule, None, category))] + } + super::RuleContainer::RuleGroup(rule_group) => { + flatten_group!(rule_group, category) + .into_iter() + .map(Ok) + .collect() + } + super::RuleContainer::Unification(unification) => { + unifications.push(unification); + + vec![] + } + }, + Err(err) => vec![Err(err)], + }); + out + }) + .flatten() + .collect(); + + rules + .into_iter() + .map(|result| match result { + Ok(mut x) => { + x.0.unifications = Some(unifications.clone()); + + Ok(x) + } + Err(x) => Err(x), + }) + .collect() +} + +pub fn read_disambiguation_rules>( + path: P, +) -> Vec> { + let file = File::open(path.as_ref()).unwrap(); + let file = BufReader::new(file); + + let sanitized = super::preprocess::sanitize(file, &[]); + let rules = super::preprocess::extract_rules(sanitized.as_bytes()); + + let mut unifications = Vec::new(); + + let rules: Vec<_> = rules + .into_iter() + .map(|(xml, _)| { + let mut out = Vec::new(); + + let deseralized = super::DisambiguationRuleContainer::deserialize( + &mut serde_xml_rs::Deserializer::new(EventReader::new(xml.as_bytes())), + ); + + let category: Option = None; + + out.extend(match deseralized { + Ok(rule_container) => match rule_container { + super::DisambiguationRuleContainer::Rule(rule) => { + vec![Ok((rule, None, category))] + } + super::DisambiguationRuleContainer::RuleGroup(rule_group) => { + flatten_group!(rule_group, category) + .into_iter() + .map(Ok) + .collect() + } + super::DisambiguationRuleContainer::Unification(unification) => { + unifications.push(unification); + + vec![] + } + }, + Err(err) => vec![Err(err)], + }); + out + }) + .flatten() + .collect(); + + rules + .into_iter() + .map(|result| match result { + Ok(mut x) => { + x.0.unifications = Some(unifications.clone()); + + Ok(x) + } + Err(x) => Err(x), + }) + .collect() +} diff --git a/nlprule/src/rules.rs b/nlprule/src/components/rules/mod.rs similarity index 64% rename from nlprule/src/rules.rs rename to nlprule/src/components/rules/mod.rs index bd90f96..462a059 100644 --- 
a/nlprule/src/rules.rs +++ b/nlprule/src/components/rules/mod.rs @@ -1,47 +1,111 @@ -//! Sets of grammatical error correction rules. +use serde::{Deserialize, Serialize}; +use std::iter::FromIterator; + +use fs_err::File; use crate::properties::*; +use crate::rule::Rule; use crate::types::*; use crate::utils::parallelism::MaybeParallelRefIterator; -use crate::{rule::id::Selector, rule::MatchSentence, rule::Rule, Error}; -use fs_err::File; -use once_cell::sync::OnceCell; -use serde::{Deserialize, Serialize}; -use std::{ - io::{BufReader, Read, Write}, - iter::FromIterator, - path::Path, +use crate::{ + properties::Transform, + rule::{ + id::{Index, Selector}, + DisambiguationRule, MatchSentence, + }, + types::Sentence, }; +use once_cell::sync::OnceCell; + +use super::Component; -/// Language-dependent options for a rule set. -#[derive(Debug, Clone, Serialize, Deserialize)] -pub(crate) struct RulesLangOptions { - /// Whether to allow errors while constructing the rules. - pub allow_errors: bool, - /// Grammar Rule selectors to use in this set. - #[serde(default)] - pub ids: Vec, - /// Grammar Rule selectors to ignore in this set. - #[serde(default)] - pub ignore_ids: Vec, +mod compile; + +#[derive(Serialize, Deserialize, Clone, Debug)] +pub struct Disambiguator { + rules: Vec, +} + +impl Transform for Disambiguator { + fn transform<'t>( + &'t self, + sentence: Sentence<'t>, + ) -> Result, crate::properties::Error> { + self.disambiguate_up_to_id(sentence, None) + } +} + +impl Component for Disambiguator { + fn name() -> &'static str { + "disambiguator" + } } -impl Default for RulesLangOptions { - fn default() -> Self { - RulesLangOptions { - allow_errors: true, - ids: Vec::new(), - ignore_ids: Vec::new(), +impl Disambiguator { + /// Gets all disambigation rules in the order they are applied. + pub fn rules(&self) -> &[DisambiguationRule] { + &self.rules + } + + pub(crate) fn disambiguate_up_to_id<'t>( + &'t self, + mut sentence: Sentence<'t>, + id: Option<&Index>, + ) -> Result, crate::properties::Error> { + let n = id.map_or(self.rules.len(), |id| { + self.rules.iter().position(|x| x.id == *id).unwrap() + }); + let mut i = 0; + + let guard = self.property_guard(&mut sentence)?; + + while i < n { + let match_sentence = MatchSentence::new(&sentence, guard.downgrade()); + + let result = self.rules[i..n] + .maybe_par_iter() + .enumerate() + .filter_map(|(j, rule)| { + let changes = rule.apply(&match_sentence); + + match changes { + Ok(changes) => { + if changes.is_empty() { + None + } else { + Some(Ok((j + i, changes))) + } + } + Err(err) => Some(Err(err)), + } + }) + .find_first(|_| true) + .transpose()?; + + if let Some((index, changes)) = result { + self.rules[index].change(&mut sentence, changes, guard)?; + i = index + 1; + } else { + i = n; + } } + + Ok(sentence) } } /// A set of grammatical error correction rules. #[derive(Serialize, Deserialize, Default, Clone)] pub struct Rules { - pub(crate) rules: Vec, + rules: Vec, #[serde(skip)] - pub(crate) properties: OnceCell, + properties: OnceCell, +} + +impl Component for Rules { + fn name() -> &'static str { + "rules" + } } impl Suggest for Rules { @@ -106,27 +170,6 @@ impl Suggest for Rules { } impl Rules { - /// Creates a new rule set from a path to a binary. - /// - /// # Errors - /// - If the file can not be opened. - /// - If the file content can not be deserialized to a rules set. 
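// (Editor's sketch, not part of the patch.) The inherent `new` / `from_reader` /
// `to_writer` constructors removed below are superseded by the blanket defaults on
// the new `Component` trait, which `Rules` implements with `name() == "rules"`.
// Loading a serialized rule set then presumably reduces to the trait method; the
// module path and file name are illustrative assumptions.
fn load_rules() -> Result<crate::components::rules::Rules, crate::Error> {
    use crate::components::{rules::Rules, Component};
    Rules::new("en_rules.bin")
}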
- pub fn new>(p: P) -> Result { - let reader = BufReader::new(File::open(p.as_ref())?); - let rules: Rules = bincode::deserialize_from(reader)?; - Ok(rules) - } - - /// Creates a new rules set from a reader. - pub fn from_reader(reader: R) -> Result { - Ok(bincode::deserialize_from(reader)?) - } - - /// Serializes this rules set to a writer. - pub fn to_writer(&self, writer: W) -> Result<(), Error> { - Ok(bincode::serialize_into(writer, &self)?) - } - /// All rules ordered by priority. pub fn rules(&self) -> &[Rule] { &self.rules diff --git a/nlprule/src/components/tagger/compile.rs b/nlprule/src/components/tagger/compile.rs new file mode 100644 index 0000000..0e4dd94 --- /dev/null +++ b/nlprule/src/components/tagger/compile.rs @@ -0,0 +1,165 @@ +use fs_err as fs; +use fs_err::File; + +use crate::compile::{BuildComponent, BuildInfo, Error}; +use crate::components::tagger::TaggerLangOptions; + +use super::*; +use serde::Deserialize; +use std::{ + collections::HashSet, + io::{BufRead, BufReader}, + path::{Path, PathBuf}, +}; + +fn get_lines, S2: AsRef>( + paths: &[S1], + remove_paths: &[S2], +) -> std::io::Result> { + let mut output = Vec::new(); + let mut disallowed: Vec = Vec::new(); + + for path in remove_paths { + let file = File::open(path.as_ref())?; + let reader = std::io::BufReader::new(file); + + for line in reader.lines() { + let line = line?; + if line.starts_with('#') { + continue; + } + + disallowed.push(line.to_string()); + } + } + + for path in paths { + let file = File::open(path.as_ref())?; + let reader = std::io::BufReader::new(file); + + for line in reader.lines() { + let line = line?; + if line.starts_with('#') { + continue; + } + + if disallowed.contains(&line) { + continue; + } + + let parts: Vec<_> = line.split('\t').collect(); + + let word = parts[0].to_string(); + let inflection = parts[1].to_string(); + let tag = parts[2].to_string(); + + output.push((word, inflection, tag)) + } + } + + Ok(output) +} + +#[derive(Deserialize)] +pub struct Paths { + tag_dict: Vec, + tag_remove_dict: Vec, + common_words: PathBuf, + tagger_options: PathBuf, +} + +impl BuildComponent for Tagger { + type Paths = Paths; + + /// TODO: move and update + /// Creates a tagger from raw files. + /// + /// # Arguments + /// * `paths`: Paths to files where each line contains the word, lemma and tag, respectively, + /// separated by tabs, to be added to the tagger. + /// * `remove_paths`: Paths to files where each line contains the word, lemma and tag, respectively, + /// separated by tabs, to be removed from the tagger if present in the files from `paths`. + fn build(paths: Paths, _build_info: Option<&mut BuildInfo>) -> Result { + let options: TaggerLangOptions = + serde_json::from_reader(BufReader::new(File::open(&paths.tagger_options)?))?; + let common_words: HashSet = fs::read_to_string(paths.common_words)? 
+ .lines() + .map(ToOwned::to_owned) + .collect(); + + let mut tag_store = HashSet::new(); + let mut word_store = HashSet::new(); + + // add language specific special tags + tag_store.extend(options.extra_tags.iter().map(|x| x.as_str())); + + let lines = get_lines(&paths.tag_dict, &paths.tag_remove_dict)?; + + let punct = "!\"#$%&\\'()*+,-./:;<=>?@[\\]^_`{|}~"; + for i in 0..punct.len() { + word_store.insert(&punct[i..(i + 1)]); + } + + word_store.extend(common_words.iter().map(|x| x.as_str())); + + for (word, inflection, tag) in lines.iter() { + word_store.insert(word); + word_store.insert(inflection); + tag_store.insert(tag); + } + + // the empty string must not be part of any wordlist + assert!(!word_store.contains("")); + + // word store ids should be consistent across runs + let mut word_store: Vec<_> = word_store.into_iter().collect(); + word_store.sort_unstable(); + + // add special empty string to wordlist, must be the first element to have id 0 + word_store.insert(0, ""); + + // tag store ids should be consistent across runs + let mut tag_store: Vec<_> = tag_store.into_iter().collect(); + tag_store.sort_unstable(); + + // add special part of speech tags, they must have ids starting from zero + for (i, special_pos) in SpecialPos::iter().enumerate() { + tag_store.insert(i, special_pos); + } + + let word_store: BiMap<_, _> = word_store + .iter() + .enumerate() + .map(|(i, x)| (x.to_string(), WordIdInt::from_value_unchecked(i as u32))) + .collect(); + let tag_store: BiMap<_, _> = tag_store + .iter() + .enumerate() + .map(|(i, x)| (x.to_string(), PosIdInt::from_value_unchecked(i as u16))) + .collect(); + + let mut tags: Vec>> = vec![None; word_store.len()]; + + for (word, inflection, tag) in lines.iter() { + let word_id = word_store.get_by_left(word).unwrap(); + let lemma_id = word_store.get_by_left(inflection).unwrap(); + let pos_id = tag_store.get_by_left(tag).unwrap(); + + match &mut tags[word_id.value() as usize] { + Some(vec) => { + vec.push((*lemma_id, *pos_id)); + } + None => { + tags[word_id.value() as usize] = Some(vec![(*lemma_id, *pos_id)]); + } + } + } + + Ok(Tagger { + tags: WordIdMap(tags), + word_store, + tag_store, + lang_options: options, + }) + } +} diff --git a/nlprule/src/tokenizer/tag.rs b/nlprule/src/components/tagger/mod.rs similarity index 97% rename from nlprule/src/tokenizer/tag.rs rename to nlprule/src/components/tagger/mod.rs index fee120d..78d9139 100644 --- a/nlprule/src/tokenizer/tag.rs +++ b/nlprule/src/components/tagger/mod.rs @@ -3,7 +3,6 @@ use crate::{properties::*, types::*, utils::parallelism::MaybeParallelRefIterator}; use bimap::BiMap; use fst::{IntoStreamer, Map, Streamer}; -use lazy_static::lazy_static; use log::error; use serde::{Deserialize, Serialize}; use std::{ @@ -13,6 +12,8 @@ use std::{ iter::{once, FusedIterator}, }; +mod compile; + #[derive(Debug, Copy, Clone, Serialize, Deserialize, Hash, Eq, PartialEq, Ord, PartialOrd)] #[serde(transparent)] pub(crate) struct WordIdInt(u32); @@ -548,58 +549,10 @@ impl<'a> ExactSizeIterator for TagIter<'a> { #[derive(Default, Serialize, Deserialize, Clone)] #[serde(from = "TaggerFields", into = "TaggerFields")] pub struct Tagger { - pub(crate) tags: WordIdMap>, - pub(crate) tag_store: BiMap, - pub(crate) word_store: BiMap, - pub(crate) lang_options: TaggerLangOptions, -} - -impl Transform for Tagger { - fn properties(&self) -> PropertiesMut { - lazy_static! 
{ - static ref PROPERTIES: PropertiesMut = Properties::default().write(&[Property::Tags]); - } - *PROPERTIES - } - - fn transform<'t>( - &'t self, - mut sentence: Sentence<'t>, - ) -> Result, crate::properties::Error> { - let props = self.property_guard(&mut sentence)?; - - for token in sentence.iter_mut() { - let mut tag_vec: Vec<_> = self - .get_tags_with_options( - token.as_str(), - if token.is_sentence_start() { - Some(true) - } else { - None - }, - None, - ) - .collect(); - - tag_vec.push( - WordData::new( - self.id_word(token.as_str().into()), - PosId::special(SpecialPos::None), - ) - .freeze(), - ); - - if token.is_sentence_end() { - tag_vec.push( - WordData::new(WordId::empty(), PosId::special(SpecialPos::SentEnd)).freeze(), - ); - } - - *props.tags_mut(token)? = Tags::new(self.id_word(token.as_str().into()), tag_vec); - } - - Ok(sentence) - } + tags: WordIdMap>, + tag_store: BiMap, + word_store: BiMap, + lang_options: TaggerLangOptions, } impl Tagger { @@ -787,4 +740,42 @@ impl Tagger { pub fn get_tags<'a>(&'a self, word: &'a str) -> TagIter<'a> { self.get_tags_with_options(word, None, None) } + + pub fn transform<'t>( + &'t self, + mut sentence: Sentence<'t>, + guard: PropertyGuardMut, + ) -> Result, crate::properties::Error> { + for token in sentence.iter_mut() { + let mut tag_vec: Vec<_> = self + .get_tags_with_options( + token.as_str(), + if token.is_sentence_start() { + Some(true) + } else { + None + }, + None, + ) + .collect(); + + tag_vec.push( + WordData::new( + self.id_word(token.as_str().into()), + PosId::special(SpecialPos::None), + ) + .freeze(), + ); + + if token.is_sentence_end() { + tag_vec.push( + WordData::new(WordId::empty(), PosId::special(SpecialPos::SentEnd)).freeze(), + ); + } + + *guard.tags_mut(token)? = Tags::new(self.id_word(token.as_str().into()), tag_vec); + } + + Ok(sentence) + } } diff --git a/nlprule/src/components/tokenizer/compile.rs b/nlprule/src/components/tokenizer/compile.rs new file mode 100644 index 0000000..03ed99e --- /dev/null +++ b/nlprule/src/components/tokenizer/compile.rs @@ -0,0 +1,45 @@ +use fs_err as fs; + +use std::{io::BufReader, path::PathBuf, str::FromStr}; + +use crate::compile::{BuildComponent, BuildInfo, Error}; + +use super::*; + +#[derive(Deserialize)] +pub struct Paths { + tokenizer_options: PathBuf, + srx: PathBuf, + lang_code: PathBuf, +} + +impl BuildComponent for Tokenizer { + type Paths = Paths; + + fn build(paths: Paths, build_info: Option<&mut BuildInfo>) -> Result { + let build_info = build_info.ok_or(Error::BuildInfoUnset)?; + + let options: TokenizerLangOptions = + serde_json::from_reader(BufReader::new(File::open(&paths.tokenizer_options)?))?; + let lang_code = fs::read_to_string(paths.lang_code)?; + + let sentencizer = + srx::SRX::from_str(&fs::read_to_string(&paths.srx)?)?.language_rules(lang_code); + + let mut whitelist = DefaultHashSet::new(); + + for (word, _) in build_info.tagger().word_store() { + if word.contains(|c| options.extra_split_chars.contains(&c)) { + whitelist.insert(word.to_owned()); + } + } + + Ok(Tokenizer { + tagger: build_info.tagger().clone(), + sentencizer, + lang_options: options, + whitelist, + properties: Default::default(), + }) + } +} diff --git a/nlprule/src/tokenizer.rs b/nlprule/src/components/tokenizer/mod.rs similarity index 56% rename from nlprule/src/tokenizer.rs rename to nlprule/src/components/tokenizer/mod.rs index c165cd4..7982ee2 100644 --- a/nlprule/src/tokenizer.rs +++ b/nlprule/src/components/tokenizer/mod.rs @@ -4,34 +4,18 @@ //! Tokens are *disambiguated* (i. e. 
information from the initial assignment is changed) in a rule-based way by //! [DisambiguationRule][crate::rule::DisambiguationRule]s. -use crate::{ - properties::*, - rule::id::{Index, Selector}, - rule::MatchSentence, - types::*, - utils::{parallelism::MaybeParallelRefIterator, regex::Regex}, - Error, -}; +mod compile; + use fs_err::File; +use std::ops::Range; + +use crate::types::*; +use crate::{properties::*, utils::regex::Regex}; use lazy_static::lazy_static; use once_cell::sync::OnceCell; use serde::{Deserialize, Serialize}; -use std::{ - io::{BufReader, Read, Write}, - ops::Range, - path::Path, - sync::Arc, -}; - -pub mod chunk; -pub mod multiword; -pub mod tag; -use chunk::Chunker; -use multiword::MultiwordTagger; -use tag::Tagger; - -use crate::rule::DisambiguationRule; +use super::{tagger::Tagger, Component}; /// Split a text at the points where the given function is true. /// Keeps the separators. See https://stackoverflow.com/a/40296745. @@ -56,19 +40,8 @@ where } /// Options for a tokenizer. -#[derive(Debug, Clone, Serialize, Deserialize)] +#[derive(Debug, Clone, Serialize, Deserialize, Default)] pub(crate) struct TokenizerLangOptions { - /// Whether to allow errors while constructing the tokenizer. - pub allow_errors: bool, - /// Disambiguation Rule selectors to use in this tokenizer. - #[serde(default)] - pub ids: Vec, - /// Disambiguation Rule selectors to ignore in this tokenizer. - #[serde(default)] - pub ignore_ids: Vec, - /// Specific examples in the notation `{id}:{example_index}` which are known to fail. - #[serde(default)] - pub known_failures: Vec, /// Extra language-specific characters to split text on. #[serde(default)] pub extra_split_chars: Vec, @@ -77,20 +50,40 @@ pub(crate) struct TokenizerLangOptions { pub extra_join_regexes: Vec, } -impl Default for TokenizerLangOptions { - fn default() -> Self { - TokenizerLangOptions { - allow_errors: false, - ids: Vec::new(), - ignore_ids: Vec::new(), - known_failures: Vec::new(), - extra_split_chars: Vec::new(), - extra_join_regexes: Vec::new(), +/// The complete Tokenizer doing tagging, chunking and disambiguation. +#[derive(Serialize, Deserialize, Default, Clone)] +pub struct Tokenizer { + whitelist: DefaultHashSet, + sentencizer: srx::Rules, + tagger: Tagger, + lang_options: TokenizerLangOptions, + #[serde(skip)] + properties: OnceCell, +} + +impl Tokenize for Tokenizer { + fn properties(&self) -> PropertiesMut { + lazy_static! { + static ref PROPERTIES: PropertiesMut = Properties::default().write(&[Property::Tags]); } + *PROPERTIES + } + + fn tokenize<'t>(&'t self, text: &'t str) -> Box> + 't> { + Box::new(SentenceIter { + text, + splits: self.sentencizer.split_ranges(text), + tokenizer: &self, + index: 0, + position: Position::default(), + }) } } -/// An iterator over [IncompleteSentence]s. Has the same properties as [SentenceIter]. +/// An iterator over sentences. Has some key properties: +/// - Preceding whitespace is always included so the first sentence always starts at byte and char index zero. +/// - There are no gaps between sentences i.e. `sentence[i - 1].span().end() == sentence[i].span().start()`. +/// - Behavior for trailing whitespace is not defined. Can be included in the last sentence or not be part of any sentence. pub struct SentenceIter<'t> { text: &'t str, splits: Vec>, @@ -132,171 +125,18 @@ impl<'t> Iterator for SentenceIter<'t> { } } -/// An iterator over [Sentence]s. 
Has some key properties: -/// - Preceding whitespace is always included so the first sentence always starts at byte and char index zero. -/// - There are no gaps between sentences i.e. `sentence[i - 1].span().end() == sentence[i].span().start()`. -/// - Behavior for trailing whitespace is not defined. Can be included in the last sentence or not be part of any sentence. -pub struct AnalyzedSentenceIter<'t> { - inner: SentenceIter<'t>, - tokenizer: &'t Tokenizer, -} - -impl<'t> Iterator for AnalyzedSentenceIter<'t> { - type Item = Result, crate::properties::Error>; - - fn next(&mut self) -> Option { - self.inner - .next() - .map(|sentence| self.tokenizer.disambiguate(sentence)) - } -} - -/// The complete Tokenizer doing tagging, chunking and disambiguation. -#[derive(Serialize, Deserialize, Default, Clone)] -pub struct Tokenizer { - pub(crate) rules: Vec, - pub(crate) whitelist: DefaultHashSet, - pub(crate) chunker: Option, - pub(crate) sentencizer: srx::Rules, - pub(crate) multiword_tagger: Option, - pub(crate) tagger: Arc, - pub(crate) lang_options: TokenizerLangOptions, - #[serde(skip)] - pub(crate) properties: OnceCell, -} - -// impl Transform for Tokenizer { -// fn properties(&self) -> PropertiesMut { -// *self.properties.get_or_init(|| { -// self.rules -// .iter() -// .map(|rule| rule.compute_properties()) -// .collect() -// }) -// } - -// fn transform<'t>( -// &'t self, -// _sentence: Sentence<'t>, -// ) -> Result, crate::properties::Error> { -// unimplemented!() -// } -// } - -impl Tokenize for Tokenizer { - fn properties(&self) -> PropertiesMut { - lazy_static! { - static ref PROPERTIES: PropertiesMut = Properties::default().write(&[Property::Tags]); - } - *PROPERTIES - } - - fn tokenize<'t>(&'t self, text: &'t str) -> Box> + 't> { - Box::new(SentenceIter { - text, - splits: self.sentencizer.split_ranges(text), - tokenizer: &self, - index: 0, - position: Position::default(), - }) +impl Component for Tokenizer { + fn name() -> &'static str { + "tokenizer" } } impl Tokenizer { - /// Creates a new tokenizer from a path to a binary. - /// - /// # Errors - /// - If the file can not be opened. - /// - If the file content can not be deserialized to a rules set. - pub fn new>(p: P) -> Result { - let reader = BufReader::new(File::open(p.as_ref())?); - Ok(bincode::deserialize_from(reader)?) - } - - /// Creates a new tokenizer from a reader. - pub fn from_reader(reader: R) -> Result { - Ok(bincode::deserialize_from(reader)?) - } - - /// Serializes this rules set to a writer. - pub fn to_writer(&self, writer: W) -> Result<(), Error> { - Ok(bincode::serialize_into(writer, &self)?) - } - - /// Gets all disambigation rules in the order they are applied. - pub fn rules(&self) -> &[DisambiguationRule] { - &self.rules - } - /// Gets the lexical tagger. - pub fn tagger(&self) -> &Arc { + pub fn tagger(&self) -> &Tagger { &self.tagger } - /// Gets the chunker if one exists. 
- pub fn chunker(&self) -> &Option { - &self.chunker - } - - pub(crate) fn lang_options(&self) -> &TokenizerLangOptions { - &self.lang_options - } - - pub(crate) fn disambiguate_up_to_id<'t>( - &'t self, - mut sentence: Sentence<'t>, - id: Option<&Index>, - ) -> Result, crate::properties::Error> { - let n = id.map_or(self.rules.len(), |id| { - self.rules.iter().position(|x| x.id == *id).unwrap() - }); - let mut i = 0; - - let guard = self.property_guard(&mut sentence)?; - - while i < n { - let match_sentence = MatchSentence::new(&sentence, guard.downgrade()); - - let result = self.rules[i..n] - .maybe_par_iter() - .enumerate() - .filter_map(|(j, rule)| { - let changes = rule.apply(&match_sentence); - - match changes { - Ok(changes) => { - if changes.is_empty() { - None - } else { - Some(Ok((j + i, changes))) - } - } - Err(err) => Some(Err(err)), - } - }) - .find_first(|_| true) - .transpose()?; - - if let Some((index, changes)) = result { - self.rules[index].change(&mut sentence, changes, guard)?; - i = index + 1; - } else { - i = n; - } - } - - Ok(sentence) - } - - /// Apply rule-based disambiguation to the tokens. - /// This does not change the number of tokens, but can change the content arbitrarily. - pub fn disambiguate<'t>( - &'t self, - sentence: Sentence<'t>, - ) -> Result, crate::properties::Error> { - self.disambiguate_up_to_id(sentence, None) - } - fn get_token_ranges<'t>( &self, text: &'t str, @@ -390,8 +230,9 @@ impl Tokenizer { .collect(); let mut sentence = Sentence::new(tokens, sentence, &self.tagger); + let guard = self.property_guard(&mut sentence).expect("TODO"); - sentence = self.tagger.transform(sentence).unwrap(); + sentence = self.tagger.transform(sentence, guard).expect("TOOD"); Some(sentence) } diff --git a/nlprule/src/lib.rs b/nlprule/src/lib.rs index 8617b31..b1297e3 100644 --- a/nlprule/src/lib.rs +++ b/nlprule/src/lib.rs @@ -71,19 +71,14 @@ use std::io; use thiserror::Error; -#[cfg(feature = "compile")] pub mod compile; +pub mod components; mod filter; pub mod properties; pub mod rule; -pub mod rules; -pub mod tokenizer; pub mod types; pub(crate) mod utils; -pub use rules::Rules; -pub use tokenizer::Tokenizer; - #[derive(Error, Debug)] #[allow(missing_docs)] pub enum Error { diff --git a/nlprule/src/rule/engine/composition.rs b/nlprule/src/rule/engine/composition.rs index 6543955..d0fdef5 100644 --- a/nlprule/src/rule/engine/composition.rs +++ b/nlprule/src/rule/engine/composition.rs @@ -1,6 +1,6 @@ use std::iter; -use crate::{properties::*, tokenizer::tag::Tagger, types::*, utils::regex::Regex}; +use crate::{components::tagger::Tagger, properties::*, types::*, utils::regex::Regex}; use enum_dispatch::enum_dispatch; use lazy_static::lazy_static; use serde::{Deserialize, Serialize}; diff --git a/nlprule/src/rule/mod.rs b/nlprule/src/rule/mod.rs index b83ea14..e5db139 100644 --- a/nlprule/src/rule/mod.rs +++ b/nlprule/src/rule/mod.rs @@ -3,7 +3,6 @@ use crate::{ filter::{Filter, Filterable}, properties::*, - tokenizer::Tokenizer, types::*, utils, }; @@ -218,96 +217,96 @@ impl DisambiguationRule { Ok(()) } - /// Often there are examples associated with a rule. - /// This method checks whether the correct action is taken in the examples. 
- pub fn test(&self, tokenizer: &Tokenizer) -> Result { - let mut passes = Vec::new(); - - for (i, test) in self.examples.iter().enumerate() { - let text = match test { - disambiguation::DisambiguationExample::Unchanged(x) => x.as_str(), - disambiguation::DisambiguationExample::Changed(x) => x.text.as_str(), - }; - - // by convention examples are always considered as one sentence even if the sentencizer would split - let sentence_before = tokenizer - .disambiguate_up_to_id( - tokenizer - .tokenize_sentence(text) - .expect("test text must not be empty"), - Some(&self.id), - ) - .unwrap(); - - // shift the sentence to the right before matching to make sure - // nothing assumes the sentene starts from absolute index zero - let shift_delta = Position { byte: 1, char: 1 }; - let mut sentence_before_complete = sentence_before.clone().rshift(shift_delta); - - let guard = self - .compute_properties() - .build(&mut sentence_before_complete)?; - - let changes = self - .apply(&MatchSentence::new( - &sentence_before_complete, - guard.downgrade(), - )) - .unwrap() - .lshift(shift_delta); - let mut sentence_after = sentence_before.clone(); - - if !changes.is_empty() { - self.change(&mut sentence_after, changes, guard).unwrap(); - } - - info!("Tokens: {:#?}", sentence_before); - - let pass = match test { - disambiguation::DisambiguationExample::Unchanged(_) => { - sentence_before == sentence_after - } - disambiguation::DisambiguationExample::Changed(change) => { - let _before = sentence_before - .iter() - .find(|x| *x.span().char() == change.char_span) - .unwrap(); - - let after = sentence_after - .iter() - .find(|x| *x.span().char() == change.char_span) - .unwrap(); - - let unordered_tags = - after.tags().unwrap().iter().collect::>(); - let unordered_tags_change = change.after.iter().collect::>(); - - unordered_tags == unordered_tags_change - } - }; - - if !pass { - let error_str = format!( - "Rule {}: Test \"{:#?}\" failed. Before: {:#?}. After: {:#?}.", - self.id, test, sentence_before, sentence_after, - ); - - if tokenizer - .lang_options() - .known_failures - .contains(&format!("{}:{}", self.id, i)) - { - warn!("{}", error_str) - } else { - error!("{}", error_str) - } - } - - passes.push(pass); - } - - Ok(passes.iter().all(|x| *x)) - } + // /// Often there are examples associated with a rule. + // /// This method checks whether the correct action is taken in the examples. 
+ // pub fn test(&self, tokenizer: &Tokenizer) -> Result { + // let mut passes = Vec::new(); + + // for (i, test) in self.examples.iter().enumerate() { + // let text = match test { + // disambiguation::DisambiguationExample::Unchanged(x) => x.as_str(), + // disambiguation::DisambiguationExample::Changed(x) => x.text.as_str(), + // }; + + // // by convention examples are always considered as one sentence even if the sentencizer would split + // let sentence_before = tokenizer + // .disambiguate_up_to_id( + // tokenizer + // .tokenize_sentence(text) + // .expect("test text must not be empty"), + // Some(&self.id), + // ) + // .unwrap(); + + // // shift the sentence to the right before matching to make sure + // // nothing assumes the sentene starts from absolute index zero + // let shift_delta = Position { byte: 1, char: 1 }; + // let mut sentence_before_complete = sentence_before.clone().rshift(shift_delta); + + // let guard = self + // .compute_properties() + // .build(&mut sentence_before_complete)?; + + // let changes = self + // .apply(&MatchSentence::new( + // &sentence_before_complete, + // guard.downgrade(), + // )) + // .unwrap() + // .lshift(shift_delta); + // let mut sentence_after = sentence_before.clone(); + + // if !changes.is_empty() { + // self.change(&mut sentence_after, changes, guard).unwrap(); + // } + + // info!("Tokens: {:#?}", sentence_before); + + // let pass = match test { + // disambiguation::DisambiguationExample::Unchanged(_) => { + // sentence_before == sentence_after + // } + // disambiguation::DisambiguationExample::Changed(change) => { + // let _before = sentence_before + // .iter() + // .find(|x| *x.span().char() == change.char_span) + // .unwrap(); + + // let after = sentence_after + // .iter() + // .find(|x| *x.span().char() == change.char_span) + // .unwrap(); + + // let unordered_tags = + // after.tags().unwrap().iter().collect::>(); + // let unordered_tags_change = change.after.iter().collect::>(); + + // unordered_tags == unordered_tags_change + // } + // }; + + // if !pass { + // let error_str = format!( + // "Rule {}: Test \"{:#?}\" failed. Before: {:#?}. After: {:#?}.", + // self.id, test, sentence_before, sentence_after, + // ); + + // if tokenizer + // .lang_options() + // .known_failures + // .contains(&format!("{}:{}", self.id, i)) + // { + // warn!("{}", error_str) + // } else { + // error!("{}", error_str) + // } + // } + + // passes.push(pass); + // } + + // Ok(passes.iter().all(|x| *x)) + // } } /// An iterator over [Suggestion][crate::types::Suggestion]s. @@ -510,60 +509,60 @@ impl Rule { } } - /// Grammar rules always have at least one example associated with them. - /// This method checks whether the correct action is taken in the examples. 
- pub fn test(&self, tokenizer: &Tokenizer) -> Result { - let mut passes = Vec::new(); - - // make sure relative position is handled correctly - // shifting the entire sentence must be a no-op as far as the matcher is concerned - // if the suggestions are shifted back - let shift_delta = Position { byte: 1, char: 1 }; - - for test in self.examples.iter() { - // by convention examples are always considered as one sentence even if the sentencizer would split - let sentence = tokenizer - .disambiguate( - tokenizer - .tokenize_sentence(&test.text()) - .expect("test text must not be empty."), - ) - .unwrap() - .rshift(shift_delta); - - info!("Sentence: {:#?}", sentence); - let suggestions: Vec<_> = self - .apply(&MatchSentence::new( - &sentence, - self.compute_properties().build(&sentence)?, - )) - .map(|s| s.unwrap().lshift(shift_delta)) - .collect(); - - let pass = if suggestions.len() > 1 { - false - } else { - match test.suggestion() { - Some(correct_suggestion) => { - suggestions.len() == 1 && correct_suggestion == &suggestions[0] - } - None => suggestions.is_empty(), - } - }; - - if !pass { - warn!( - "Rule {}: test \"{}\" failed. Expected: {:#?}. Found: {:#?}.", - self.id, - test.text(), - test.suggestion(), - suggestions - ); - } - - passes.push(pass); - } - - Ok(passes.iter().all(|x| *x)) - } + // /// Grammar rules always have at least one example associated with them. + // /// This method checks whether the correct action is taken in the examples. + // pub fn test(&self, tokenizer: &Tokenizer) -> Result { + // let mut passes = Vec::new(); + + // // make sure relative position is handled correctly + // // shifting the entire sentence must be a no-op as far as the matcher is concerned + // // if the suggestions are shifted back + // let shift_delta = Position { byte: 1, char: 1 }; + + // for test in self.examples.iter() { + // // by convention examples are always considered as one sentence even if the sentencizer would split + // let sentence = tokenizer + // .disambiguate( + // tokenizer + // .tokenize_sentence(&test.text()) + // .expect("test text must not be empty."), + // ) + // .unwrap() + // .rshift(shift_delta); + + // info!("Sentence: {:#?}", sentence); + // let suggestions: Vec<_> = self + // .apply(&MatchSentence::new( + // &sentence, + // self.compute_properties().build(&sentence)?, + // )) + // .map(|s| s.unwrap().lshift(shift_delta)) + // .collect(); + + // let pass = if suggestions.len() > 1 { + // false + // } else { + // match test.suggestion() { + // Some(correct_suggestion) => { + // suggestions.len() == 1 && correct_suggestion == &suggestions[0] + // } + // None => suggestions.is_empty(), + // } + // }; + + // if !pass { + // warn!( + // "Rule {}: test \"{}\" failed. Expected: {:#?}. Found: {:#?}.", + // self.id, + // test.text(), + // test.suggestion(), + // suggestions + // ); + // } + + // passes.push(pass); + // } + + // Ok(passes.iter().all(|x| *x)) + // } } diff --git a/nlprule/src/types.rs b/nlprule/src/types.rs index 0645d43..47fbafd 100644 --- a/nlprule/src/types.rs +++ b/nlprule/src/types.rs @@ -1,8 +1,7 @@ //! Fundamental types used by this crate. 
-use crate::tokenizer::tag::Tagger; -pub use crate::tokenizer::tag::{PosId, WordId}; -pub(crate) use crate::tokenizer::tag::{PosIdInt, SpecialPos, WordIdInt}; +use crate::components::tagger::Tagger; +pub(crate) use crate::components::tagger::{PosId, PosIdInt, SpecialPos, WordId, WordIdInt}; use derivative::Derivative; use lazy_static::lazy_static; use serde::{Deserialize, Serialize}; From 147b17c74e7a7f44311459e5a540d8e0cdbfb008 Mon Sep 17 00:00:00 2001 From: Benjamin Minixhofer Date: Fri, 28 May 2021 10:58:30 +0200 Subject: [PATCH 07/15] fix disambiguator properties, unify tests --- nlprule/Cargo.toml | 4 - nlprule/configs/de/rules.json | 8 - nlprule/configs/de/tagger.json | 9 - nlprule/configs/de/tokenizer.json | 10 - nlprule/src/bin/compile.rs | 26 +- nlprule/src/bin/test.rs | 35 ++- nlprule/src/bin/test_disambiguation.rs | 44 --- nlprule/src/components/rules/compile/mod.rs | 8 +- nlprule/src/components/rules/mod.rs | 88 ++++++ nlprule/src/components/tokenizer/mod.rs | 87 +++--- nlprule/src/lib.rs | 5 + nlprule/src/properties.rs | 304 ++++++++++++++------ nlprule/src/rule/mod.rs | 285 +++++++++--------- nlprule/src/types.rs | 2 +- 14 files changed, 543 insertions(+), 372 deletions(-) delete mode 100644 nlprule/configs/de/rules.json delete mode 100644 nlprule/configs/de/tagger.json delete mode 100644 nlprule/configs/de/tokenizer.json delete mode 100644 nlprule/src/bin/test_disambiguation.rs diff --git a/nlprule/Cargo.toml b/nlprule/Cargo.toml index ff6628c..d0fab56 100644 --- a/nlprule/Cargo.toml +++ b/nlprule/Cargo.toml @@ -95,7 +95,3 @@ required-features = ["bin"] [[bin]] name = "run" required-features = ["bin"] - -[[bin]] -name = "test_disambiguation" -required-features = ["bin"] diff --git a/nlprule/configs/de/rules.json b/nlprule/configs/de/rules.json deleted file mode 100644 index 4191a24..0000000 --- a/nlprule/configs/de/rules.json +++ /dev/null @@ -1,8 +0,0 @@ -{ - "allow_errors": false, - "ignore_ids": [ - "CASING/DAS_BESTE_AM/1", - "TYPOS/PLANT_PLANET/0", - "COMPOUNDING/SUB_-S_BEDINGT_BASIERT/1" - ] -} \ No newline at end of file diff --git a/nlprule/configs/de/tagger.json b/nlprule/configs/de/tagger.json deleted file mode 100644 index 5372535..0000000 --- a/nlprule/configs/de/tagger.json +++ /dev/null @@ -1,9 +0,0 @@ -{ - "use_compound_split_heuristic": true, - "always_add_lower_tags": false, - "extra_tags": [ - "PKT", - "PRO:IND:DAT:SIN:NEU" - ], - "retain_last": false -} \ No newline at end of file diff --git a/nlprule/configs/de/tokenizer.json b/nlprule/configs/de/tokenizer.json deleted file mode 100644 index df21ad8..0000000 --- a/nlprule/configs/de/tokenizer.json +++ /dev/null @@ -1,10 +0,0 @@ -{ - "allow_errors": false, - "ignore_ids": [ - "DISAMBIGUATION/SUB_BEAMTE/1", - "DISAMBIGUATION/SUB_BEAMTE/2" - ], - "extra_join_regexes": [ - "(https?:\\/\\/(?:www\\.|(?!www))[a-zA-Z0-9][a-zA-Z0-9-]+[a-zA-Z0-9]\\.[^\\s]{2,}|www\\.[a-zA-Z0-9][a-zA-Z0-9-]+[a-zA-Z0-9]\\.[^\\s]{2,}|https?:\\/\\/(?:www\\.|(?!www))[a-zA-Z0-9]+\\.[^\\s]{2,}|www\\.[a-zA-Z0-9]+\\.[^\\s]{2,})" - ] -} \ No newline at end of file diff --git a/nlprule/src/bin/compile.rs b/nlprule/src/bin/compile.rs index 9b883c9..f17cea4 100644 --- a/nlprule/src/bin/compile.rs +++ b/nlprule/src/bin/compile.rs @@ -2,7 +2,7 @@ use clap::Clap; use fs::{File, OpenOptions}; use fs_err as fs; -use log::info; +use log::{info, warn}; use nlprule::compile::{BuildComponent, BuildInfo, Error}; use nlprule::components::{ chunker::Chunker, @@ -83,16 +83,24 @@ fn main() -> Result<(), Error> { macro_rules! 
build { ($component:ty) => { info!("Creating component \"{}\".", <$component>::name()); - let instance = <$component>::build( + let instance_result = <$component>::build( serde_json::from_value(paths_value.clone())?, Some(&mut build_info), - )?; - instance.to_writer( - &OpenOptions::new() - .write(true) - .create(true) - .open(opts.out_dir.join(format!("{}.bin", <$component>::name())))?, - )?; + ); + + match instance_result { + Ok(instance) => { + instance.to_writer( + &OpenOptions::new() + .write(true) + .create(true) + .open(opts.out_dir.join(format!("{}.bin", <$component>::name())))?, + )?; + } + Err(error) => { + warn!("Error creating \"{0}\": {1}. This is expected if the component does not exist for this language.", <$component>::name(), error); + } + } }; } diff --git a/nlprule/src/bin/test.rs b/nlprule/src/bin/test.rs index 0988861..28a6aa0 100644 --- a/nlprule/src/bin/test.rs +++ b/nlprule/src/bin/test.rs @@ -1,3 +1,14 @@ +use nlprule::{ + components::{ + chunker::Chunker, + multiword_tagger::MultiwordTagger, + rules::{Disambiguator, Rules}, + tokenizer::Tokenizer, + Component, + }, + properties::{tokenize, CreatePipe, Pipeline, Tokenize}, +}; + // use clap::Clap; // use nlprule::{rules::Rules, tokenizer::Tokenizer}; @@ -42,4 +53,26 @@ // } // } -fn main() {} +fn main() -> Result<(), nlprule::Error> { + env_logger::init(); + + // let tokenizer = Pipeline::new(( + // Tokenizer::new("new_storage/en/tokenizer.bin")?, + // MultiwordTagger::new("new_storage/en/multiword_tagger.bin")?, + // Chunker::new("new_storage/en/chunker.bin")?, + // Disambiguator::new("new_storage/en/disambiguator.bin")?, + // Rules::new("new_storage/en/rules.bin")?, + // ))?; + + let tokenizer = Pipeline::new(( + Tokenizer::new("new_storage/de/tokenizer.bin")?, + // MultiwordTagger::new("new_storage/de/multiword_tagger.bin")?, + // Chunker::new("new_storage/en/chunker.bin")?, + Disambiguator::new("new_storage/de/disambiguator.bin")?, + Rules::new("new_storage/de/rules.bin")?, + ))?; + + tokenizer.test()?; + + Ok(()) +} diff --git a/nlprule/src/bin/test_disambiguation.rs b/nlprule/src/bin/test_disambiguation.rs deleted file mode 100644 index 1ab8ff6..0000000 --- a/nlprule/src/bin/test_disambiguation.rs +++ /dev/null @@ -1,44 +0,0 @@ -// use clap::Clap; -// use nlprule::tokenizer::Tokenizer; - -// #[derive(Clap)] -// #[clap( -// version = "1.0", -// author = "Benjamin Minixhofer " -// )] -// struct Opts { -// #[clap(long)] -// stop_at_error: bool, -// #[clap(long, short)] -// tokenizer: String, -// } - -// fn main() { -// env_logger::init(); -// let opts = Opts::parse(); - -// let tokenizer = Tokenizer::new(opts.tokenizer).unwrap(); -// let rules = tokenizer.rules(); - -// println!("Last ID: {}", rules[rules.len() - 1].id()); -// println!("Runnable rules: {}", rules.len()); - -// let mut passes = 0; - -// for rule in rules { -// if let Ok(true) = rule.test(&tokenizer) { -// passes += 1; -// } else if opts.stop_at_error { -// break; -// } -// } - -// println!("Rules passing tests: {}", passes); -// if passes == rules.len() { -// std::process::exit(0); -// } else { -// std::process::exit(1); -// } -// } - -fn main() {} diff --git a/nlprule/src/components/rules/compile/mod.rs b/nlprule/src/components/rules/compile/mod.rs index c3a9e17..95adc50 100644 --- a/nlprule/src/components/rules/compile/mod.rs +++ b/nlprule/src/components/rules/compile/mod.rs @@ -22,9 +22,6 @@ pub(crate) struct DisambiguatorLangOptions { /// Disambiguation Rule selectors to ignore in this tokenizer. 
#[serde(default)] pub ignore_ids: Vec, - /// Specific examples in the notation `{id}:{example_index}` which are known to fail. - #[serde(default)] - pub known_failures: Vec, } #[derive(Deserialize)] @@ -105,7 +102,10 @@ impl BuildComponent for Disambiguator { } } - Ok(Disambiguator { rules }) + Ok(Disambiguator { + rules, + properties: Default::default(), + }) } } diff --git a/nlprule/src/components/rules/mod.rs b/nlprule/src/components/rules/mod.rs index 462a059..2fdcca3 100644 --- a/nlprule/src/components/rules/mod.rs +++ b/nlprule/src/components/rules/mod.rs @@ -1,3 +1,4 @@ +use log::info; use serde::{Deserialize, Serialize}; use std::iter::FromIterator; @@ -24,15 +25,59 @@ mod compile; #[derive(Serialize, Deserialize, Clone, Debug)] pub struct Disambiguator { rules: Vec, + #[serde(skip)] + properties: OnceCell, } impl Transform for Disambiguator { + fn properties(&self) -> PropertiesMut { + *self.properties.get_or_init(|| { + self.rules + .iter() + .map(|rule| rule.compute_properties()) + .collect() + }) + } + fn transform<'t>( &'t self, sentence: Sentence<'t>, ) -> Result, crate::properties::Error> { self.disambiguate_up_to_id(sentence, None) } + + fn test(&self, tokenizer: TOK) -> Result<(), crate::Error> { + let mut current_rules: Vec<&DisambiguationRule> = Vec::new(); + let mut passes = 0; + + for rule in self.rules() { + let pipeline = tokenize::Pipeline::new(( + &tokenizer, + current_rules + .iter() + .map(|x| (*x).clone()) + .collect::(), + ))?; + + if rule.test(&pipeline).is_ok() { + passes += 1; + } + + current_rules.push(rule); + } + + info!( + "{0} out of {1} Disambiguation Rule tests passed.", + passes, + self.rules.len() + ); + + if passes == self.rules().len() { + Ok(()) + } else { + Err(crate::Error::TestFailed) + } + } } impl Component for Disambiguator { @@ -167,6 +212,28 @@ impl Suggest for Rules { }) .collect()) } + + fn test(&self, tokenizer: TOK) -> Result<(), crate::Error> { + let mut passes = 0; + + for rule in self.rules() { + if rule.test(&tokenizer).is_ok() { + passes += 1; + }; + } + + info!( + "{0} out of {1} Grammar Rule tests passed.", + passes, + self.rules.len() + ); + + if passes == self.rules().len() { + Ok(()) + } else { + Err(crate::Error::TestFailed) + } + } } impl Rules { @@ -249,3 +316,24 @@ where } } } + +impl IntoIterator for Disambiguator { + type Item = DisambiguationRule; + type IntoIter = std::vec::IntoIter; + fn into_iter(self) -> Self::IntoIter { + self.rules.into_iter() + } +} + +impl FromIterator for Disambiguator +where + R: Into, +{ + fn from_iter>(iter: I) -> Self { + let rules: Vec = iter.into_iter().map(|x| x.into()).collect(); + Self { + rules, + properties: OnceCell::default(), + } + } +} diff --git a/nlprule/src/components/tokenizer/mod.rs b/nlprule/src/components/tokenizer/mod.rs index 7982ee2..d9a2700 100644 --- a/nlprule/src/components/tokenizer/mod.rs +++ b/nlprule/src/components/tokenizer/mod.rs @@ -78,6 +78,49 @@ impl Tokenize for Tokenizer { position: Position::default(), }) } + + fn tokenize_sentence<'t>(&'t self, sentence: &'t str) -> Option> { + if sentence.trim().is_empty() { + return None; + } + + let token_strs = self + .get_token_ranges(sentence) + .filter(|range| !sentence[range.clone()].trim().is_empty()); + + let n_token_strs = token_strs.clone().count(); + + let tokens: Vec<_> = token_strs + .enumerate() + .map(|(i, range)| { + let byte_start = range.start; + let char_start = sentence[..byte_start].chars().count(); + + let token_text = sentence[range].trim(); + + let is_sentence_start = i == 0; + let 
is_sentence_end = i == n_token_strs - 1; + + Token::new( + token_text, + Span::new( + byte_start..byte_start + token_text.len(), + char_start..char_start + token_text.chars().count(), + ), + is_sentence_start, + is_sentence_end, + sentence[..byte_start].ends_with(char::is_whitespace), + ) + }) + .collect(); + + let mut sentence = Sentence::new(tokens, sentence, &self.tagger); + let guard = self.property_guard(&mut sentence).expect("TODO"); + + sentence = self.tagger.transform(sentence, guard).expect("TODO"); + + Some(sentence) + } } /// An iterator over sentences. Has some key properties: @@ -192,48 +235,4 @@ impl Tokenizer { byte_start..byte_start + token.len() }) } - - /// Tokenize the given sentence. This applies chunking and tagging, but does not do disambiguation. - pub(crate) fn tokenize_sentence<'t>(&'t self, sentence: &'t str) -> Option> { - if sentence.trim().is_empty() { - return None; - } - - let token_strs = self - .get_token_ranges(sentence) - .filter(|range| !sentence[range.clone()].trim().is_empty()); - - let n_token_strs = token_strs.clone().count(); - - let tokens: Vec<_> = token_strs - .enumerate() - .map(|(i, range)| { - let byte_start = range.start; - let char_start = sentence[..byte_start].chars().count(); - - let token_text = sentence[range].trim(); - - let is_sentence_start = i == 0; - let is_sentence_end = i == n_token_strs - 1; - - Token::new( - token_text, - Span::new( - byte_start..byte_start + token_text.len(), - char_start..char_start + token_text.chars().count(), - ), - is_sentence_start, - is_sentence_end, - sentence[..byte_start].ends_with(char::is_whitespace), - ) - }) - .collect(); - - let mut sentence = Sentence::new(tokens, sentence, &self.tagger); - let guard = self.property_guard(&mut sentence).expect("TODO"); - - sentence = self.tagger.transform(sentence, guard).expect("TOOD"); - - Some(sentence) - } } diff --git a/nlprule/src/lib.rs b/nlprule/src/lib.rs index b1297e3..21042a3 100644 --- a/nlprule/src/lib.rs +++ b/nlprule/src/lib.rs @@ -89,8 +89,13 @@ pub enum Error { Serialization(#[from] bincode::Error), #[error(transparent)] IdError(#[from] rule::id::Error), + #[error(transparent)] + Property(#[from] properties::Error), + // TODO: combine with `Property` (probably) #[error("unset token property: {0}")] Unset(&'static str), + #[error("Test failed. See logs for details.")] + TestFailed, } /// Gets the canonical filename for the tokenizer binary for a language code in ISO 639-1 (two-letter) format. diff --git a/nlprule/src/properties.rs b/nlprule/src/properties.rs index fb8b323..0721837 100644 --- a/nlprule/src/properties.rs +++ b/nlprule/src/properties.rs @@ -3,121 +3,178 @@ use serde::{Deserialize, Serialize}; use crate::types::*; use thiserror::Error; -/// Correct a text by applying suggestions to it. -/// In the case of multiple possible replacements, always chooses the first one. 
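The offset bookkeeping in apply_suggestions works like this: replacements are applied left to right against char spans of the original text, and a signed offset accumulates the length difference of earlier replacements so later spans still line up. A minimal standalone sketch of the same splice-and-offset logic, using plain char ranges and replacement strings instead of the crate's Suggestion type (the inputs are made up for illustration):

fn apply_ranges(text: &str, edits: &[(std::ops::Range<usize>, &str)]) -> String {
    // `edits` holds char (not byte) ranges into the *original* text, sorted left to right.
    let mut offset: isize = 0;
    let mut chars: Vec<char> = text.chars().collect();

    for (span, replacement) in edits {
        let repl: Vec<char> = replacement.chars().collect();
        let start = (span.start as isize + offset) as usize;
        let end = (span.end as isize + offset) as usize;
        chars.splice(start..end, repl.iter().cloned());
        // Later spans are shifted by how much this replacement grew or shrank the text.
        offset += repl.len() as isize - span.len() as isize;
    }

    chars.into_iter().collect()
}

fn main() {
    // "Ths" -> "This" grows the text by one char, so the second span (9..12 in the
    // original) is shifted to 10..13 before it is spliced.
    let out = apply_ranges("Ths is a tst.", &[(0..3, "This"), (9..12, "test")]);
    assert_eq!(out, "This is a test.");
}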
-pub fn apply_suggestions(text: &str, suggestions: &[Suggestion]) -> String { - let mut offset: isize = 0; - let mut chars: Vec<_> = text.chars().collect(); - - for suggestion in suggestions { - let replacement: Vec<_> = suggestion.replacements()[0].chars().collect(); - chars.splice( - (suggestion.span().char().start as isize + offset) as usize - ..(suggestion.span().char().end as isize + offset) as usize, - replacement.iter().cloned(), - ); - offset = offset + replacement.len() as isize - suggestion.span().char().len() as isize; - } - - chars.into_iter().collect() -} +pub use suggest::Suggest; +pub use tokenize::Tokenize; +pub use transform::Transform; + +pub mod suggest { + use super::*; + + /// Correct a text by applying suggestions to it. + /// In the case of multiple possible replacements, always chooses the first one. + pub fn apply_suggestions(text: &str, suggestions: &[Suggestion]) -> String { + let mut offset: isize = 0; + let mut chars: Vec<_> = text.chars().collect(); + + for suggestion in suggestions { + let replacement: Vec<_> = suggestion.replacements()[0].chars().collect(); + chars.splice( + (suggestion.span().char().start as isize + offset) as usize + ..(suggestion.span().char().end as isize + offset) as usize, + replacement.iter().cloned(), + ); + offset = offset + replacement.len() as isize - suggestion.span().char().len() as isize; + } -pub trait Suggest { - fn properties(&self) -> Properties { - Properties::default() + chars.into_iter().collect() } - fn property_guard(&self, sentence: &Sentence) -> Result { - self.properties().build(sentence) - } + pub trait Suggest { + fn properties(&self) -> Properties { + Properties::default() + } - fn suggest(&self, sentence: &Sentence) -> Result, Error>; + fn property_guard(&self, sentence: &Sentence) -> Result { + self.properties().build(sentence) + } - fn correct(&self, sentence: &Sentence) -> Result { - let suggestions = self.suggest(sentence)?; - Ok(apply_suggestions(sentence.text(), &suggestions)) - } -} + fn suggest(&self, sentence: &Sentence) -> Result, Error>; -impl<'a, T> Suggest for &'a T -where - T: Suggest, -{ - fn properties(&self) -> Properties { - (*self).properties() - } + fn correct(&self, sentence: &Sentence) -> Result { + let suggestions = self.suggest(sentence)?; + Ok(apply_suggestions(sentence.text(), &suggestions)) + } - fn property_guard(&self, sentence: &Sentence) -> Result { - (*self).property_guard(sentence) + #[allow(unused_variables)] + fn test(&self, tokenizer: TOK) -> Result<(), crate::Error> { + Ok(()) + } } - fn suggest(&self, sentence: &Sentence) -> Result, Error> { - (*self).suggest(sentence) - } + impl<'a, T> Suggest for &'a T + where + T: Suggest, + { + fn properties(&self) -> Properties { + (*self).properties() + } - fn correct(&self, sentence: &Sentence) -> Result { - (*self).correct(sentence) + fn property_guard(&self, sentence: &Sentence) -> Result { + (*self).property_guard(sentence) + } + + fn suggest(&self, sentence: &Sentence) -> Result, Error> { + (*self).suggest(sentence) + } + + fn correct(&self, sentence: &Sentence) -> Result { + (*self).correct(sentence) + } + + fn test(&self, tokenizer: TOK) -> Result<(), crate::Error> { + (*self).test(tokenizer) + } } } -pub trait Transform { - fn properties(&self) -> PropertiesMut { - PropertiesMut::default() - } +pub mod transform { + use super::*; - fn property_guard(&self, sentence: &mut Sentence) -> Result { - self.properties().build(sentence) - } + pub trait Transform { + fn properties(&self) -> PropertiesMut { + PropertiesMut::default() + } 
- fn transform<'t>(&'t self, sentence: Sentence<'t>) -> Result, Error>; -} + fn property_guard(&self, sentence: &mut Sentence) -> Result { + self.properties().build(sentence) + } -impl<'a, T> Transform for &'a T -where - T: Transform, -{ - fn properties(&self) -> PropertiesMut { - (*self).properties() - } + fn transform<'t>(&'t self, sentence: Sentence<'t>) -> Result, Error>; - fn property_guard(&self, sentence: &mut Sentence) -> Result { - (*self).property_guard(sentence) + #[allow(unused_variables)] + fn test(&self, tokenizer: TOK) -> Result<(), crate::Error> { + Ok(()) + } } - fn transform<'t>(&'t self, sentence: Sentence<'t>) -> Result, Error> { - (*self).transform(sentence) - } -} + impl<'a, T> Transform for &'a T + where + T: Transform, + { + fn properties(&self) -> PropertiesMut { + (*self).properties() + } -pub trait Tokenize { - fn properties(&self) -> PropertiesMut { - PropertiesMut::default() - } + fn property_guard(&self, sentence: &mut Sentence) -> Result { + (*self).property_guard(sentence) + } + + fn transform<'t>(&'t self, sentence: Sentence<'t>) -> Result, Error> { + (*self).transform(sentence) + } - fn property_guard(&self, sentence: &mut Sentence) -> Result { - self.properties().build(sentence) + fn test(&self, tokenizer: TOK) -> Result<(), crate::Error> { + (*self).test(tokenizer) + } } - fn tokenize<'t>(&'t self, text: &'t str) -> Box> + 't>; + #[derive(Serialize, Deserialize)] + pub struct Pipeline(pub(super) T); } -impl<'a, T> Tokenize for &'a T -where - T: Tokenize, -{ - fn properties(&self) -> PropertiesMut { - (*self).properties() - } +pub mod tokenize { + use super::*; - fn property_guard(&self, sentence: &mut Sentence) -> Result { - (*self).property_guard(sentence) + pub trait Tokenize { + fn properties(&self) -> PropertiesMut { + PropertiesMut::default() + } + + fn property_guard(&self, sentence: &mut Sentence) -> Result { + self.properties().build(sentence) + } + + fn tokenize<'t>(&'t self, text: &'t str) -> Box> + 't>; + + fn tokenize_sentence<'t>(&'t self, sentence: &'t str) -> Option>; + + fn test(&self) -> Result<(), crate::Error> { + Ok(()) + } } - fn tokenize<'t>(&'t self, text: &'t str) -> Box> + 't> { - (*self).tokenize(text) + impl<'a, T> Tokenize for &'a T + where + T: Tokenize, + { + fn properties(&self) -> PropertiesMut { + (*self).properties() + } + + fn property_guard(&self, sentence: &mut Sentence) -> Result { + (*self).property_guard(sentence) + } + + fn tokenize<'t>(&'t self, text: &'t str) -> Box> + 't> { + (*self).tokenize(text) + } + + fn tokenize_sentence<'t>(&'t self, sentence: &'t str) -> Option> { + (*self).tokenize_sentence(sentence) + } + + fn test(&self) -> Result<(), crate::Error> { + (*self).test() + } } + + #[derive(Serialize, Deserialize)] + pub struct Pipeline(pub(super) T); } +#[derive(Serialize, Deserialize)] +pub struct Pipeline(T); + #[derive(Error, Debug)] #[allow(missing_docs)] pub enum Error { @@ -421,16 +478,30 @@ impl PropertyGuardMut { } } -pub struct Pipeline(T); +pub trait CreatePipe: Sized { + fn new(components: T) -> Result; +} -#[allow(clippy::new_ret_no_self)] -pub trait CreatePipe { - fn new(components: T) -> Result, Error>; +macro_rules! make_subpipe { + ($pipe:ty, $first:expr) => { + Ok::<_, crate::Error>($first) + }; + ($pipe:ty, $first:expr, $($name:expr),+) => { + <$pipe>::new(($first, $($name,)+)) + } } macro_rules! 
impl_pipeline { ( $first:ident, $last:ident, $($name:ident),*) => { - impl<$first: Tokenize, $($name: Transform,)* $last: Transform> Tokenize for Pipeline<($first, $($name,)* $last)> { + // Case 1: Tokenize -> Transform -> ... -> Transform + impl<$first: Tokenize, $($name: Transform,)* $last: Transform> CreatePipe<($first, $($name,)* $last)> for tokenize::Pipeline<($first, $($name,)* $last)> { + #[allow(non_snake_case, unused_mut)] + fn new(components: ($first, $($name,)* $last)) -> Result { + Ok(tokenize::Pipeline(components)) + } + } + + impl<$first: Tokenize, $($name: Transform,)* $last: Transform> Tokenize for tokenize::Pipeline<($first, $($name,)* $last)> { #[allow(non_snake_case)] fn tokenize<'t>(&'t self, text: &'t str) -> Box> + 't> { let (ref $first, $(ref $name,)* ref $last) = self.0; @@ -442,9 +513,37 @@ macro_rules! impl_pipeline { Box::new(sentences) } + + #[allow(non_snake_case, unused_mut)] + fn tokenize_sentence<'t>(&'t self, sentence: &'t str) -> Option { + let (ref $first, $(ref $name,)* ref $last) = self.0; + let mut sentence = $first.tokenize_sentence(sentence)?; + $(sentence = $name.transform(sentence).unwrap();)* + Some($last.transform(sentence).unwrap()) + } + + #[allow(non_snake_case)] + fn test(&self) -> Result<(), crate::Error> { + let (ref $first, $(ref $name,)* ref $last) = self.0; + + let subpipe = make_subpipe!(tokenize::Pipeline<_>, $first $(,$name)*)?; + subpipe.test()?; + + $last.test(subpipe)?; + + Ok(()) + } + } + + // Case 2: Transform -> ... -> Transform + impl<$first: Transform, $($name: Transform,)* $last: Transform> CreatePipe<($first, $($name,)* $last)> for transform::Pipeline<($first, $($name,)* $last)> { + #[allow(non_snake_case, unused_mut)] + fn new(components: ($first, $($name,)* $last)) -> Result { + Ok(transform::Pipeline(components)) + } } - impl<$first: Transform, $($name: Transform,)* $last: Transform> Transform for Pipeline<($first, $($name,)* $last)> { + impl<$first: Transform, $($name: Transform,)* $last: Transform> Transform for transform::Pipeline<($first, $($name,)* $last)> { #[allow(non_snake_case)] fn transform<'t>(&'t self, mut sentence: Sentence<'t>) -> Result, crate::properties::Error> { let (ref $first, $(ref $name,)* ref $last) = self.0; @@ -453,8 +552,21 @@ macro_rules! impl_pipeline { sentence = $last.transform(sentence)?; Ok(sentence) } + + #[allow(non_snake_case)] + fn test(&self, tokenizer: TOK) -> Result<(), crate::Error> { + let (ref $first, $(ref $name,)* ref $last) = self.0; + + $first.test(&tokenizer)?; + let tokenizer_pipe = tokenize::Pipeline::new((&tokenizer, $first))?; + let subpipe = make_subpipe!(transform::Pipeline<_>, $($name,)* $last)?; + + subpipe.test(tokenizer_pipe)?; + Ok(()) + } } + // Case 3: Tokenize -> Transform -> ... -> Transform -> Suggest impl<$first: Tokenize, $($name: Transform,)* $last: Suggest> CreatePipe<($first, $($name,)* $last)> for Pipeline<($first, $($name,)* $last)> { #[allow(non_snake_case, unused_mut)] fn new(components: ($first, $($name,)* $last)) -> Result { @@ -480,6 +592,18 @@ macro_rules! 
impl_pipeline { sentences } + + #[allow(non_snake_case)] + pub fn test(&self) -> Result<(), crate::Error> { + let (ref $first, $(ref $name,)* ref $last) = self.0; + + let subpipe = make_subpipe!(tokenize::Pipeline<_>, $first $(,$name)*)?; + subpipe.test()?; + + $last.test(subpipe)?; + + Ok(()) + } } }; } diff --git a/nlprule/src/rule/mod.rs b/nlprule/src/rule/mod.rs index e5db139..ce10292 100644 --- a/nlprule/src/rule/mod.rs +++ b/nlprule/src/rule/mod.rs @@ -8,7 +8,7 @@ use crate::{ }; use itertools::Itertools; use lazy_static::lazy_static; -use log::{error, info, warn}; +use log::{debug, error}; use serde::{Deserialize, Serialize}; use std::fmt; use std::{collections::HashSet, iter}; @@ -195,7 +195,7 @@ impl DisambiguationRule { changes: Changes, guard: PropertyGuardMut, ) -> Result<(), crate::properties::Error> { - log::info!("applying {}", self.id); + debug!("applying {}", self.id); for spans in changes.0 { let mut groups = Vec::new(); @@ -217,96 +217,85 @@ impl DisambiguationRule { Ok(()) } - // /// Often there are examples associated with a rule. - // /// This method checks whether the correct action is taken in the examples. - // pub fn test(&self, tokenizer: &Tokenizer) -> Result { - // let mut passes = Vec::new(); - - // for (i, test) in self.examples.iter().enumerate() { - // let text = match test { - // disambiguation::DisambiguationExample::Unchanged(x) => x.as_str(), - // disambiguation::DisambiguationExample::Changed(x) => x.text.as_str(), - // }; - - // // by convention examples are always considered as one sentence even if the sentencizer would split - // let sentence_before = tokenizer - // .disambiguate_up_to_id( - // tokenizer - // .tokenize_sentence(text) - // .expect("test text must not be empty"), - // Some(&self.id), - // ) - // .unwrap(); - - // // shift the sentence to the right before matching to make sure - // // nothing assumes the sentene starts from absolute index zero - // let shift_delta = Position { byte: 1, char: 1 }; - // let mut sentence_before_complete = sentence_before.clone().rshift(shift_delta); - - // let guard = self - // .compute_properties() - // .build(&mut sentence_before_complete)?; - - // let changes = self - // .apply(&MatchSentence::new( - // &sentence_before_complete, - // guard.downgrade(), - // )) - // .unwrap() - // .lshift(shift_delta); - // let mut sentence_after = sentence_before.clone(); - - // if !changes.is_empty() { - // self.change(&mut sentence_after, changes, guard).unwrap(); - // } - - // info!("Tokens: {:#?}", sentence_before); - - // let pass = match test { - // disambiguation::DisambiguationExample::Unchanged(_) => { - // sentence_before == sentence_after - // } - // disambiguation::DisambiguationExample::Changed(change) => { - // let _before = sentence_before - // .iter() - // .find(|x| *x.span().char() == change.char_span) - // .unwrap(); - - // let after = sentence_after - // .iter() - // .find(|x| *x.span().char() == change.char_span) - // .unwrap(); - - // let unordered_tags = - // after.tags().unwrap().iter().collect::>(); - // let unordered_tags_change = change.after.iter().collect::>(); - - // unordered_tags == unordered_tags_change - // } - // }; - - // if !pass { - // let error_str = format!( - // "Rule {}: Test \"{:#?}\" failed. Before: {:#?}. 
After: {:#?}.", - // self.id, test, sentence_before, sentence_after, - // ); - - // if tokenizer - // .lang_options() - // .known_failures - // .contains(&format!("{}:{}", self.id, i)) - // { - // warn!("{}", error_str) - // } else { - // error!("{}", error_str) - // } - // } - - // passes.push(pass); - // } - - // Ok(passes.iter().all(|x| *x)) - // } + /// Often there are examples associated with a rule. + /// This method checks whether the correct action is taken in the examples. + pub(crate) fn test(&self, tokenizer: TOK) -> Result<(), crate::Error> { + let mut passes = Vec::new(); + + for test in self.examples.iter() { + let text = match test { + disambiguation::DisambiguationExample::Unchanged(x) => x.as_str(), + disambiguation::DisambiguationExample::Changed(x) => x.text.as_str(), + }; + + // by convention examples are always considered as one sentence even if the sentencizer would split + let sentence_before = tokenizer + .tokenize_sentence(text) + .expect("test text must not be empty"); + + // shift the sentence to the right before matching to make sure + // nothing assumes the sentene starts from absolute index zero + let shift_delta = Position { byte: 1, char: 1 }; + let mut sentence_before_complete = sentence_before.clone().rshift(shift_delta); + + let guard = self + .compute_properties() + .build(&mut sentence_before_complete)?; + + let changes = self + .apply(&MatchSentence::new( + &sentence_before_complete, + guard.downgrade(), + )) + .unwrap() + .lshift(shift_delta); + let mut sentence_after = sentence_before.clone(); + + if !changes.is_empty() { + self.change(&mut sentence_after, changes, guard).unwrap(); + } + + debug!("Tokens: {:#?}", sentence_before); + + let pass = match test { + disambiguation::DisambiguationExample::Unchanged(_) => { + sentence_before == sentence_after + } + disambiguation::DisambiguationExample::Changed(change) => { + let _before = sentence_before + .iter() + .find(|x| *x.span().char() == change.char_span) + .unwrap(); + + let after = sentence_after + .iter() + .find(|x| *x.span().char() == change.char_span) + .unwrap(); + + let unordered_tags = + after.tags().unwrap().iter().collect::>(); + let unordered_tags_change = change.after.iter().collect::>(); + + unordered_tags == unordered_tags_change + } + }; + + if !pass { + error!( + "Rule {}: Test \"{:#?}\" failed. Before: {:#?}. After: {:#?}.", + self.id, test, sentence_before, sentence_after + ) + } + + passes.push(pass); + } + + if passes.iter().all(|x| *x) { + Ok(()) + } else { + Err(crate::Error::TestFailed) + } + } } /// An iterator over [Suggestion][crate::types::Suggestion]s. @@ -509,60 +498,60 @@ impl Rule { } } - // /// Grammar rules always have at least one example associated with them. - // /// This method checks whether the correct action is taken in the examples. 
- // pub fn test(&self, tokenizer: &Tokenizer) -> Result { - // let mut passes = Vec::new(); - - // // make sure relative position is handled correctly - // // shifting the entire sentence must be a no-op as far as the matcher is concerned - // // if the suggestions are shifted back - // let shift_delta = Position { byte: 1, char: 1 }; - - // for test in self.examples.iter() { - // // by convention examples are always considered as one sentence even if the sentencizer would split - // let sentence = tokenizer - // .disambiguate( - // tokenizer - // .tokenize_sentence(&test.text()) - // .expect("test text must not be empty."), - // ) - // .unwrap() - // .rshift(shift_delta); - - // info!("Sentence: {:#?}", sentence); - // let suggestions: Vec<_> = self - // .apply(&MatchSentence::new( - // &sentence, - // self.compute_properties().build(&sentence)?, - // )) - // .map(|s| s.unwrap().lshift(shift_delta)) - // .collect(); - - // let pass = if suggestions.len() > 1 { - // false - // } else { - // match test.suggestion() { - // Some(correct_suggestion) => { - // suggestions.len() == 1 && correct_suggestion == &suggestions[0] - // } - // None => suggestions.is_empty(), - // } - // }; - - // if !pass { - // warn!( - // "Rule {}: test \"{}\" failed. Expected: {:#?}. Found: {:#?}.", - // self.id, - // test.text(), - // test.suggestion(), - // suggestions - // ); - // } - - // passes.push(pass); - // } - - // Ok(passes.iter().all(|x| *x)) - // } + /// Grammar rules always have at least one example associated with them. + /// This method checks whether the correct action is taken in the examples. + pub(crate) fn test(&self, tokenizer: TOK) -> Result<(), crate::Error> { + let mut passes = Vec::new(); + + // make sure relative position is handled correctly + // shifting the entire sentence must be a no-op as far as the matcher is concerned + // if the suggestions are shifted back + let shift_delta = Position { byte: 1, char: 1 }; + + for test in self.examples.iter() { + // by convention examples are always considered as one sentence even if the sentencizer would split + let sentence = tokenizer + .tokenize_sentence(&test.text()) + .expect("test text must not be empty.") + .rshift(shift_delta); + + debug!("Sentence: {:#?}", sentence); + let suggestions: Vec<_> = self + .apply(&MatchSentence::new( + &sentence, + self.compute_properties().build(&sentence)?, + )) + .map(|s| s.unwrap().lshift(shift_delta)) + .collect(); + + let pass = if suggestions.len() > 1 { + false + } else { + match test.suggestion() { + Some(correct_suggestion) => { + suggestions.len() == 1 && correct_suggestion == &suggestions[0] + } + None => suggestions.is_empty(), + } + }; + + if !pass { + error!( + "Rule {}: test \"{}\" failed. Expected: {:#?}. Found: {:#?}.", + self.id, + test.text(), + test.suggestion(), + suggestions + ); + } + + passes.push(pass); + } + + if passes.iter().all(|x| *x) { + Ok(()) + } else { + Err(crate::Error::TestFailed) + } + } } diff --git a/nlprule/src/types.rs b/nlprule/src/types.rs index 47fbafd..bb08102 100644 --- a/nlprule/src/types.rs +++ b/nlprule/src/types.rs @@ -1,7 +1,7 @@ //! Fundamental types used by this crate. 
use crate::components::tagger::Tagger; -pub(crate) use crate::components::tagger::{PosId, PosIdInt, SpecialPos, WordId, WordIdInt}; +pub(crate) use crate::components::tagger::{PosId, SpecialPos, WordId, WordIdInt}; use derivative::Derivative; use lazy_static::lazy_static; use serde::{Deserialize, Serialize}; From 2bc7842cb6ef6023e7cb78350135988091e0aa3e Mon Sep 17 00:00:00 2001 From: Benjamin Minixhofer Date: Fri, 28 May 2021 17:08:11 +0200 Subject: [PATCH 08/15] fix pipeline properties, add lang module --- nlprule/Cargo.toml | 3 + nlprule/benches/load.rs | 12 +-- nlprule/configs/es/rules.json | 12 --- nlprule/configs/es/tagger.json | 65 ------------- nlprule/configs/es/tokenizer.json | 15 --- nlprule/src/bin/test.rs | 93 +++++-------------- nlprule/src/components/chunker/mod.rs | 1 + .../src/components/multiword_tagger/mod.rs | 1 + nlprule/src/components/rules/mod.rs | 1 + nlprule/src/components/tagger/mod.rs | 1 + nlprule/src/components/tokenizer/mod.rs | 1 + nlprule/src/lib.rs | 4 + nlprule/src/properties.rs | 73 ++++++++++----- scripts/build_and_test.sh | 23 +++-- 14 files changed, 100 insertions(+), 205 deletions(-) delete mode 100644 nlprule/configs/es/rules.json delete mode 100644 nlprule/configs/es/tagger.json delete mode 100644 nlprule/configs/es/tokenizer.json diff --git a/nlprule/Cargo.toml b/nlprule/Cargo.toml index d0fab56..89f3d41 100644 --- a/nlprule/Cargo.toml +++ b/nlprule/Cargo.toml @@ -60,10 +60,13 @@ fs-err = "2.5" [[bench]] name = "load" harness = false +required-features = ["binaries"] [features] default = ["regex-onig"] +binaries = [] + regex-onig = ["onig"] # to switch to the fancy-regex engine, disable default features and add this feature regex-fancy = ["fancy-regex"] diff --git a/nlprule/benches/load.rs b/nlprule/benches/load.rs index 0e0c010..9dab5a8 100644 --- a/nlprule/benches/load.rs +++ b/nlprule/benches/load.rs @@ -1,17 +1,13 @@ -use criterion::{black_box, criterion_group, criterion_main, Criterion}; -use nlprule::{Rules, Tokenizer}; +use criterion::{criterion_group, criterion_main, Criterion}; +use nlprule::lang::en; use std::time::Duration; fn parse_tokenizer(c: &mut Criterion) { - c.bench_function("load tokenizer", |b| { - b.iter(|| Tokenizer::new(black_box("../storage/en_tokenizer.bin")).unwrap()) - }); + c.bench_function("load tokenizer", |b| b.iter(en::analyzer)); } fn parse_rules(c: &mut Criterion) { - c.bench_function("load rules", |b| { - b.iter(|| Rules::new(black_box("../storage/en_rules.bin")).unwrap()) - }); + c.bench_function("load rules", |b| b.iter(en::rules)); } fn no_warmup_criterion() -> Criterion { diff --git a/nlprule/configs/es/rules.json b/nlprule/configs/es/rules.json deleted file mode 100644 index 938013a..0000000 --- a/nlprule/configs/es/rules.json +++ /dev/null @@ -1,12 +0,0 @@ -{ - "allow_errors": false, - "ignore_ids": [ - "TYPOGRAPHY/ESPACIO_DESPUES_DE_PUNTO/0", - "TYPOGRAPHY/ESPACIO_DESPUES_DE_PUNTO/1", - "DIACRITICS/DIACRITICS_OTHERS/26", - "DIACRITICS/PRONOM_TILDE/0", - "MISSPELLING/R_RR/0", - "MISSPELLING/NO_SEPARADO/4", - "MISSPELLING/NO_SEPARADO/56" - ] -} \ No newline at end of file diff --git a/nlprule/configs/es/tagger.json b/nlprule/configs/es/tagger.json deleted file mode 100644 index 51a55c3..0000000 --- a/nlprule/configs/es/tagger.json +++ /dev/null @@ -1,65 +0,0 @@ -{ - "use_compound_split_heuristic": false, - "always_add_lower_tags": true, - "extra_tags": [ - "_PUNCT", - "_PUNCT_CONT", - "_QM_OPEN", - "_QM_CLOSE", - "_ellipsis", - "_abrev_biblia", - "LOC_PREP", - "LOC_ADV", - "LOC_CS", - "LOC_CONJ", - "AQ0MN0", - 
"RG_before", - "DN0MP0", - "DN0FP0", - "DN0CP0", - "DN0CS0", - "AO0CN0", - "_GN_FS", - "Y", - "_enumeration", - "LOC_ADC", - "_GV_", - "_GN_MS", - "_GN_MP", - "_GN_FP", - "_allow_tanto", - "_allow_como", - "_allow_repeat", - "_possible_NP", - "_complement_cada", - "url", - "_DD_after_noun", - "complement", - "complement_a", - "complement_cada", - "ignore_concordance", - "_reflexive", - "Z", - "NPCNO000", - "NPMSSP00", - "NPFSSP00", - "LOC_ADJ", - "NPCS", - "NPMSP00", - "NPFSO00", - "NPCS000", - "NPCP000", - "NCFSO00", - "NPMN000", - "NPMS000", - "NPMP000", - "NCCSO00", - "NCCPO00", - "NCMSO00", - "NPCN000", - "NCCN00", - "LOC_CC", - "LOC_I" - ], - "retain_last": true -} \ No newline at end of file diff --git a/nlprule/configs/es/tokenizer.json b/nlprule/configs/es/tokenizer.json deleted file mode 100644 index 402aa9b..0000000 --- a/nlprule/configs/es/tokenizer.json +++ /dev/null @@ -1,15 +0,0 @@ -{ - "allow_errors": true, - "ignore_ids": [], - "extra_split_chars": [ - "-", - "─", - "‒", - "–", - "ㅡ" - ], - "extra_join_regexes": [ - "(https?:\\/\\/(?:www\\.|(?!www))[a-zA-Z0-9][a-zA-Z0-9-]+[a-zA-Z0-9]\\.[^\\s]{2,}|www\\.[a-zA-Z0-9][a-zA-Z0-9-]+[a-zA-Z0-9]\\.[^\\s]{2,}|https?:\\/\\/(?:www\\.|(?!www))[a-zA-Z0-9]+\\.[^\\s]{2,}|www\\.[a-zA-Z0-9]+\\.[^\\s]{2,})", - "(\\d+[\\.,])+\\d+\\w*" - ] -} \ No newline at end of file diff --git a/nlprule/src/bin/test.rs b/nlprule/src/bin/test.rs index 28a6aa0..f41ce06 100644 --- a/nlprule/src/bin/test.rs +++ b/nlprule/src/bin/test.rs @@ -1,78 +1,27 @@ -use nlprule::{ - components::{ - chunker::Chunker, - multiword_tagger::MultiwordTagger, - rules::{Disambiguator, Rules}, - tokenizer::Tokenizer, - Component, - }, - properties::{tokenize, CreatePipe, Pipeline, Tokenize}, -}; - -// use clap::Clap; -// use nlprule::{rules::Rules, tokenizer::Tokenizer}; - -// #[derive(Clap)] -// #[clap( -// version = "1.0", -// author = "Benjamin Minixhofer " -// )] -// struct Opts { -// #[clap(long, short)] -// tokenizer: String, -// #[clap(long, short)] -// rules: String, -// #[clap(long, short)] -// ids: Vec, -// } - -// fn main() { -// env_logger::init(); -// let opts = Opts::parse(); - -// let tokenizer = Tokenizer::new(opts.tokenizer).unwrap(); -// let rules_container = Rules::new(opts.rules).unwrap(); -// let rules = rules_container.rules(); - -// println!("Runnable rules: {}", rules.len()); - -// let mut passes = 0; -// for rule in rules { -// if opts.ids.is_empty() || opts.ids.contains(&rule.id().to_string()) { -// if let Ok(true) = rule.test(&tokenizer) { -// passes += 1; -// } -// } -// } - -// println!("Rules passing tests: {}", passes); -// if passes == rules.len() { -// std::process::exit(0); -// } else { -// std::process::exit(1); -// } -// } +use nlprule::lang::{de, en, es}; + +use clap::Clap; + +#[derive(Clap)] +#[clap( + version = "1.0", + author = "Benjamin Minixhofer " +)] +struct Opts { + #[clap(long, short)] + lang: String, +} fn main() -> Result<(), nlprule::Error> { env_logger::init(); - - // let tokenizer = Pipeline::new(( - // Tokenizer::new("new_storage/en/tokenizer.bin")?, - // MultiwordTagger::new("new_storage/en/multiword_tagger.bin")?, - // Chunker::new("new_storage/en/chunker.bin")?, - // Disambiguator::new("new_storage/en/disambiguator.bin")?, - // Rules::new("new_storage/en/rules.bin")?, - // ))?; - - let tokenizer = Pipeline::new(( - Tokenizer::new("new_storage/de/tokenizer.bin")?, - // MultiwordTagger::new("new_storage/de/multiword_tagger.bin")?, - // Chunker::new("new_storage/en/chunker.bin")?, - 
Disambiguator::new("new_storage/de/disambiguator.bin")?, - Rules::new("new_storage/de/rules.bin")?, - ))?; - - tokenizer.test()?; + let opts = Opts::parse(); + + match opts.lang.as_ref() { + "de" => de::correcter().test()?, + "en" => en::correcter().test()?, + "es" => es::correcter().test()?, + x => panic!("language code '{}' does not exist!", x), + } Ok(()) } diff --git a/nlprule/src/components/chunker/mod.rs b/nlprule/src/components/chunker/mod.rs index e9825b5..8b6aad2 100644 --- a/nlprule/src/components/chunker/mod.rs +++ b/nlprule/src/components/chunker/mod.rs @@ -1,5 +1,6 @@ //! A Chunker ported from [OpenNLP](https://opennlp.apache.org/). +#[cfg(feature = "compile")] mod compile; use half::bf16; diff --git a/nlprule/src/components/multiword_tagger/mod.rs b/nlprule/src/components/multiword_tagger/mod.rs index b27e468..03518fe 100644 --- a/nlprule/src/components/multiword_tagger/mod.rs +++ b/nlprule/src/components/multiword_tagger/mod.rs @@ -8,6 +8,7 @@ use serde::{Deserialize, Serialize}; use super::Component; +#[cfg(feature = "compile")] mod compile; #[derive(Serialize, Deserialize)] diff --git a/nlprule/src/components/rules/mod.rs b/nlprule/src/components/rules/mod.rs index 2fdcca3..1cd9233 100644 --- a/nlprule/src/components/rules/mod.rs +++ b/nlprule/src/components/rules/mod.rs @@ -20,6 +20,7 @@ use once_cell::sync::OnceCell; use super::Component; +#[cfg(feature = "compile")] mod compile; #[derive(Serialize, Deserialize, Clone, Debug)] diff --git a/nlprule/src/components/tagger/mod.rs b/nlprule/src/components/tagger/mod.rs index 78d9139..0a5d1fb 100644 --- a/nlprule/src/components/tagger/mod.rs +++ b/nlprule/src/components/tagger/mod.rs @@ -12,6 +12,7 @@ use std::{ iter::{once, FusedIterator}, }; +#[cfg(feature = "compile")] mod compile; #[derive(Debug, Copy, Clone, Serialize, Deserialize, Hash, Eq, PartialEq, Ord, PartialOrd)] diff --git a/nlprule/src/components/tokenizer/mod.rs b/nlprule/src/components/tokenizer/mod.rs index d9a2700..564fa62 100644 --- a/nlprule/src/components/tokenizer/mod.rs +++ b/nlprule/src/components/tokenizer/mod.rs @@ -4,6 +4,7 @@ //! Tokens are *disambiguated* (i. e. information from the initial assignment is changed) in a rule-based way by //! [DisambiguationRule][crate::rule::DisambiguationRule]s. 
+#[cfg(feature = "compile")] mod compile; use fs_err::File; diff --git a/nlprule/src/lib.rs b/nlprule/src/lib.rs index 21042a3..18a1dd5 100644 --- a/nlprule/src/lib.rs +++ b/nlprule/src/lib.rs @@ -71,9 +71,13 @@ use std::io; use thiserror::Error; +#[cfg(feature = "compile")] pub mod compile; pub mod components; mod filter; +#[cfg(feature = "binaries")] +#[macro_use] +pub mod lang; pub mod properties; pub mod rule; pub mod types; diff --git a/nlprule/src/properties.rs b/nlprule/src/properties.rs index 0721837..19af572 100644 --- a/nlprule/src/properties.rs +++ b/nlprule/src/properties.rs @@ -119,7 +119,7 @@ pub mod transform { } #[derive(Serialize, Deserialize)] - pub struct Pipeline(pub(super) T); + pub struct Pipeline(pub(super) T, pub(super) PropertiesMut); } pub mod tokenize { @@ -169,11 +169,11 @@ pub mod tokenize { } #[derive(Serialize, Deserialize)] - pub struct Pipeline(pub(super) T); + pub struct Pipeline(pub(super) T, pub(super) PropertiesMut); } #[derive(Serialize, Deserialize)] -pub struct Pipeline(T); +pub struct Pipeline(T, PropertiesMut); #[derive(Error, Debug)] #[allow(missing_docs)] @@ -223,11 +223,7 @@ impl Bitset { self } - pub fn is_empty(&self) -> bool { - self.0 == 0 - } - - pub fn iter<'a>(&'a self) -> impl Iterator + 'a { + pub fn into_iter<'a>(self) -> impl Iterator + 'a { Property::properties().iter().filter_map(move |property| { if self.contains(property) { Some(*property) @@ -329,6 +325,12 @@ impl Properties { } impl PropertiesMut { + pub(crate) fn reads_without_write<'a>(&'a self) -> impl Iterator + 'a { + self.read_mask + .intersection(self.write_mask.inverse()) + .into_iter() + } + pub fn union(mut self, properties: PropertiesMut) -> Self { self.read_mask = self.read_mask.union(properties.read_mask); self.write_mask = self.write_mask.union(properties.read_mask); @@ -336,16 +338,13 @@ impl PropertiesMut { self } - pub fn chain(mut self, next: PropertiesMut) -> Result { + pub fn chain(mut self, next: PropertiesMut) -> Self { let next_reads = next.read_mask.intersection(next.write_mask.inverse()); - let invalid_reads = next_reads.intersection(self.write_mask.inverse()); - - if !invalid_reads.is_empty() { - return Err(Error::InvalidPipeline(invalid_reads.iter().collect())); - } + let new_reads = next_reads.intersection(self.write_mask.inverse()); + self.read_mask = self.read_mask.union(new_reads); self.write_mask = self.write_mask.union(next.write_mask); - Ok(self) + self } pub fn build(&self, sentence: &mut Sentence) -> Result { @@ -497,11 +496,25 @@ macro_rules! 
impl_pipeline { impl<$first: Tokenize, $($name: Transform,)* $last: Transform> CreatePipe<($first, $($name,)* $last)> for tokenize::Pipeline<($first, $($name,)* $last)> { #[allow(non_snake_case, unused_mut)] fn new(components: ($first, $($name,)* $last)) -> Result { - Ok(tokenize::Pipeline(components)) + let (ref $first, $(ref $name,)* ref $last) = components; + + let mut properties = $first.properties(); + $(properties = properties.chain($name.properties());)* + properties.chain($last.properties()); + + if !properties.reads_without_write().next().is_none() { + return Err(Error::InvalidPipeline(properties.reads_without_write().collect())); + } + + Ok(tokenize::Pipeline(components, properties)) } } impl<$first: Tokenize, $($name: Transform,)* $last: Transform> Tokenize for tokenize::Pipeline<($first, $($name,)* $last)> { + fn properties(&self) -> PropertiesMut { + self.1 + } + #[allow(non_snake_case)] fn tokenize<'t>(&'t self, text: &'t str) -> Box> + 't> { let (ref $first, $(ref $name,)* ref $last) = self.0; @@ -539,11 +552,21 @@ macro_rules! impl_pipeline { impl<$first: Transform, $($name: Transform,)* $last: Transform> CreatePipe<($first, $($name,)* $last)> for transform::Pipeline<($first, $($name,)* $last)> { #[allow(non_snake_case, unused_mut)] fn new(components: ($first, $($name,)* $last)) -> Result { - Ok(transform::Pipeline(components)) + let (ref $first, $(ref $name,)* ref $last) = components; + + let mut properties = $first.properties(); + $(properties = properties.chain($name.properties());)* + properties.chain($last.properties()); + + Ok(transform::Pipeline(components, properties)) } } impl<$first: Transform, $($name: Transform,)* $last: Transform> Transform for transform::Pipeline<($first, $($name,)* $last)> { + fn properties(&self) -> PropertiesMut { + self.1 + } + #[allow(non_snake_case)] fn transform<'t>(&'t self, mut sentence: Sentence<'t>) -> Result, crate::properties::Error> { let (ref $first, $(ref $name,)* ref $last) = self.0; @@ -572,15 +595,23 @@ macro_rules! 
impl_pipeline { fn new(components: ($first, $($name,)* $last)) -> Result { let (ref $first, $(ref $name,)* ref $last) = components; - let mut properties = PropertiesMut::default().chain($first.properties())?; - $(properties = properties.chain($name.properties())?;)* - properties.chain($last.properties().write(&[]))?; + let mut properties = $first.properties(); + $(properties = properties.chain($name.properties());)* + properties.chain($last.properties().write(&[])); + + if !properties.reads_without_write().next().is_none() { + return Err(Error::InvalidPipeline(properties.reads_without_write().collect())); + } - Ok(Pipeline(components)) + Ok(Pipeline(components, properties)) } } impl<$first: Tokenize, $($name: Transform,)* $last: Suggest> Pipeline<($first, $($name,)* $last)> { + pub fn properties(&self) -> PropertiesMut { + self.1 + } + #[allow(non_snake_case, unused_mut)] pub fn suggest<'t>(&'t self, text: &'t str) -> impl Iterator> + 't { let (ref $first, $(ref $name,)* ref $last) = self.0; diff --git a/scripts/build_and_test.sh b/scripts/build_and_test.sh index bb18ed8..ee68740 100755 --- a/scripts/build_and_test.sh +++ b/scripts/build_and_test.sh @@ -6,24 +6,23 @@ fi # this script assumes the build directories are in data/ # only for convenience -mkdir -p storage +mkdir -p nlprule/src/storage -# x-- => only compile -# -xx => test_disambiguation and test -# xxx or flags not set => everything -flags=${2:-"xxx"} +# x- => only compile +# -x => only test +# xx or flags not set => everything +flags=${2:-"xx"} if [ "${flags:0:1}" == "x" ] then - RUST_LOG=INFO cargo run --all-features --bin compile -- --build-dir data/$1 --tokenizer-out storage/$1_tokenizer.bin --rules-out storage/$1_rules.bin + cd nlprule + RUST_LOG=INFO cargo run --features "compile bin" --bin compile -- --build-dir ../data/$1 --out-dir storage/$1 + cd .. fi if [ "${flags:1:1}" == "x" ] then - RUST_LOG=WARN cargo run --all-features --bin test_disambiguation -- --tokenizer storage/$1_tokenizer.bin -fi - -if [ "${flags:2:1}" == "x" ] -then - RUST_LOG=WARN cargo run --all-features --bin test -- --tokenizer storage/$1_tokenizer.bin --rules storage/$1_rules.bin + cd nlprule + RUST_LOG=INFO cargo run --all-features --bin test -- --lang $1 + cd .. 
fi \ No newline at end of file From 5927d698eb51fdd0565aa43d8fb6783383f96229 Mon Sep 17 00:00:00 2001 From: Benjamin Minixhofer Date: Sat, 29 May 2021 11:56:15 +0200 Subject: [PATCH 09/15] add lang module, add setup.sh --- data/de/disambiguator_options.json | 7 ++++ data/de/rules_options.json | 8 ++++ data/de/tagger_options.json | 9 +++++ data/de/tokenizer_options.json | 5 +++ data/en/disambiguator_options.json | 6 +++ data/en/rules_options.json | 7 ++++ data/en/tagger_options.json | 11 +++++ data/en/tokenizer_options.json | 5 +++ data/es/disambiguator_options.json | 4 ++ data/es/rules_options.json | 12 ++++++ data/es/tagger_options.json | 65 ++++++++++++++++++++++++++++++ data/es/tokenizer_options.json | 13 ++++++ nlprule/src/lang.rs | 22 ++++++++++ nlprule/src/lang/de.rs | 31 ++++++++++++++ nlprule/src/lang/en.rs | 42 +++++++++++++++++++ nlprule/src/lang/es.rs | 36 +++++++++++++++++ scripts/setup.sh | 11 +++++ 17 files changed, 294 insertions(+) create mode 100644 data/de/disambiguator_options.json create mode 100644 data/de/rules_options.json create mode 100644 data/de/tagger_options.json create mode 100644 data/de/tokenizer_options.json create mode 100644 data/en/disambiguator_options.json create mode 100644 data/en/rules_options.json create mode 100644 data/en/tagger_options.json create mode 100644 data/en/tokenizer_options.json create mode 100644 data/es/disambiguator_options.json create mode 100644 data/es/rules_options.json create mode 100644 data/es/tagger_options.json create mode 100644 data/es/tokenizer_options.json create mode 100644 nlprule/src/lang.rs create mode 100644 nlprule/src/lang/de.rs create mode 100644 nlprule/src/lang/en.rs create mode 100644 nlprule/src/lang/es.rs create mode 100755 scripts/setup.sh diff --git a/data/de/disambiguator_options.json b/data/de/disambiguator_options.json new file mode 100644 index 0000000..92bc9fc --- /dev/null +++ b/data/de/disambiguator_options.json @@ -0,0 +1,7 @@ +{ + "allow_errors": false, + "ignore_ids": [ + "DISAMBIGUATION/SUB_BEAMTE/1", + "DISAMBIGUATION/SUB_BEAMTE/2" + ] +} \ No newline at end of file diff --git a/data/de/rules_options.json b/data/de/rules_options.json new file mode 100644 index 0000000..4191a24 --- /dev/null +++ b/data/de/rules_options.json @@ -0,0 +1,8 @@ +{ + "allow_errors": false, + "ignore_ids": [ + "CASING/DAS_BESTE_AM/1", + "TYPOS/PLANT_PLANET/0", + "COMPOUNDING/SUB_-S_BEDINGT_BASIERT/1" + ] +} \ No newline at end of file diff --git a/data/de/tagger_options.json b/data/de/tagger_options.json new file mode 100644 index 0000000..5372535 --- /dev/null +++ b/data/de/tagger_options.json @@ -0,0 +1,9 @@ +{ + "use_compound_split_heuristic": true, + "always_add_lower_tags": false, + "extra_tags": [ + "PKT", + "PRO:IND:DAT:SIN:NEU" + ], + "retain_last": false +} \ No newline at end of file diff --git a/data/de/tokenizer_options.json b/data/de/tokenizer_options.json new file mode 100644 index 0000000..0e8eb37 --- /dev/null +++ b/data/de/tokenizer_options.json @@ -0,0 +1,5 @@ +{ + "extra_join_regexes": [ + "(https?:\\/\\/(?:www\\.|(?!www))[a-zA-Z0-9][a-zA-Z0-9-]+[a-zA-Z0-9]\\.[^\\s]{2,}|www\\.[a-zA-Z0-9][a-zA-Z0-9-]+[a-zA-Z0-9]\\.[^\\s]{2,}|https?:\\/\\/(?:www\\.|(?!www))[a-zA-Z0-9]+\\.[^\\s]{2,}|www\\.[a-zA-Z0-9]+\\.[^\\s]{2,})" + ] +} \ No newline at end of file diff --git a/data/en/disambiguator_options.json b/data/en/disambiguator_options.json new file mode 100644 index 0000000..e76a2cc --- /dev/null +++ b/data/en/disambiguator_options.json @@ -0,0 +1,6 @@ +{ + "allow_errors": false, + "ignore_ids": [ + 
"DISAMBIGUATION/BEST_JJS/0" + ] +} \ No newline at end of file diff --git a/data/en/rules_options.json b/data/en/rules_options.json new file mode 100644 index 0000000..a1a27a0 --- /dev/null +++ b/data/en/rules_options.json @@ -0,0 +1,7 @@ +{ + "allow_errors": false, + "ignore_ids": [ + "GRAMMAR/PRP_MD_NN/2", + "TYPOS/VERB_APOSTROPHE_S/3" + ] +} \ No newline at end of file diff --git a/data/en/tagger_options.json b/data/en/tagger_options.json new file mode 100644 index 0000000..8d2a8a6 --- /dev/null +++ b/data/en/tagger_options.json @@ -0,0 +1,11 @@ +{ + "use_compound_split_heuristic": false, + "always_add_lower_tags": true, + "extra_tags": [ + "PCT", + "ORD", + "SYM", + "RB_SENT" + ], + "retain_last": true +} \ No newline at end of file diff --git a/data/en/tokenizer_options.json b/data/en/tokenizer_options.json new file mode 100644 index 0000000..0e8eb37 --- /dev/null +++ b/data/en/tokenizer_options.json @@ -0,0 +1,5 @@ +{ + "extra_join_regexes": [ + "(https?:\\/\\/(?:www\\.|(?!www))[a-zA-Z0-9][a-zA-Z0-9-]+[a-zA-Z0-9]\\.[^\\s]{2,}|www\\.[a-zA-Z0-9][a-zA-Z0-9-]+[a-zA-Z0-9]\\.[^\\s]{2,}|https?:\\/\\/(?:www\\.|(?!www))[a-zA-Z0-9]+\\.[^\\s]{2,}|www\\.[a-zA-Z0-9]+\\.[^\\s]{2,})" + ] +} \ No newline at end of file diff --git a/data/es/disambiguator_options.json b/data/es/disambiguator_options.json new file mode 100644 index 0000000..8305874 --- /dev/null +++ b/data/es/disambiguator_options.json @@ -0,0 +1,4 @@ +{ + "allow_errors": false, + "ignore_ids": [] +} \ No newline at end of file diff --git a/data/es/rules_options.json b/data/es/rules_options.json new file mode 100644 index 0000000..938013a --- /dev/null +++ b/data/es/rules_options.json @@ -0,0 +1,12 @@ +{ + "allow_errors": false, + "ignore_ids": [ + "TYPOGRAPHY/ESPACIO_DESPUES_DE_PUNTO/0", + "TYPOGRAPHY/ESPACIO_DESPUES_DE_PUNTO/1", + "DIACRITICS/DIACRITICS_OTHERS/26", + "DIACRITICS/PRONOM_TILDE/0", + "MISSPELLING/R_RR/0", + "MISSPELLING/NO_SEPARADO/4", + "MISSPELLING/NO_SEPARADO/56" + ] +} \ No newline at end of file diff --git a/data/es/tagger_options.json b/data/es/tagger_options.json new file mode 100644 index 0000000..51a55c3 --- /dev/null +++ b/data/es/tagger_options.json @@ -0,0 +1,65 @@ +{ + "use_compound_split_heuristic": false, + "always_add_lower_tags": true, + "extra_tags": [ + "_PUNCT", + "_PUNCT_CONT", + "_QM_OPEN", + "_QM_CLOSE", + "_ellipsis", + "_abrev_biblia", + "LOC_PREP", + "LOC_ADV", + "LOC_CS", + "LOC_CONJ", + "AQ0MN0", + "RG_before", + "DN0MP0", + "DN0FP0", + "DN0CP0", + "DN0CS0", + "AO0CN0", + "_GN_FS", + "Y", + "_enumeration", + "LOC_ADC", + "_GV_", + "_GN_MS", + "_GN_MP", + "_GN_FP", + "_allow_tanto", + "_allow_como", + "_allow_repeat", + "_possible_NP", + "_complement_cada", + "url", + "_DD_after_noun", + "complement", + "complement_a", + "complement_cada", + "ignore_concordance", + "_reflexive", + "Z", + "NPCNO000", + "NPMSSP00", + "NPFSSP00", + "LOC_ADJ", + "NPCS", + "NPMSP00", + "NPFSO00", + "NPCS000", + "NPCP000", + "NCFSO00", + "NPMN000", + "NPMS000", + "NPMP000", + "NCCSO00", + "NCCPO00", + "NCMSO00", + "NPCN000", + "NCCN00", + "LOC_CC", + "LOC_I" + ], + "retain_last": true +} \ No newline at end of file diff --git a/data/es/tokenizer_options.json b/data/es/tokenizer_options.json new file mode 100644 index 0000000..10cac1f --- /dev/null +++ b/data/es/tokenizer_options.json @@ -0,0 +1,13 @@ +{ + "extra_split_chars": [ + "-", + "─", + "‒", + "–", + "ㅡ" + ], + "extra_join_regexes": [ + 
"(https?:\\/\\/(?:www\\.|(?!www))[a-zA-Z0-9][a-zA-Z0-9-]+[a-zA-Z0-9]\\.[^\\s]{2,}|www\\.[a-zA-Z0-9][a-zA-Z0-9-]+[a-zA-Z0-9]\\.[^\\s]{2,}|https?:\\/\\/(?:www\\.|(?!www))[a-zA-Z0-9]+\\.[^\\s]{2,}|www\\.[a-zA-Z0-9]+\\.[^\\s]{2,})", + "(\\d+[\\.,])+\\d+\\w*" + ] +} \ No newline at end of file diff --git a/nlprule/src/lang.rs b/nlprule/src/lang.rs new file mode 100644 index 0000000..a3ea738 --- /dev/null +++ b/nlprule/src/lang.rs @@ -0,0 +1,22 @@ +macro_rules! binary { + ($component: ty, $lang_code:literal, $binary_name:literal) => {{ + use crate::components::Component; + + let mut bytes: &'static [u8] = include_bytes!(concat!( + env!("CARGO_MANIFEST_DIR"), + "/storage/", + $lang_code, + "/", + $binary_name, + ".bin" + )); + + <$component>::from_reader(&mut bytes) + }}; +} + +const ERROR_MSG: &str = "binaries are pre-tested."; + +pub mod de; +pub mod en; +pub mod es; diff --git a/nlprule/src/lang/de.rs b/nlprule/src/lang/de.rs new file mode 100644 index 0000000..6e3d337 --- /dev/null +++ b/nlprule/src/lang/de.rs @@ -0,0 +1,31 @@ +use super::ERROR_MSG; +use crate::{ + components::{ + rules::{Disambiguator, Rules}, + tokenizer::Tokenizer, + }, + properties::{tokenize, CreatePipe, Pipeline}, +}; + +pub type Analyzer = tokenize::Pipeline<(Tokenizer, Disambiguator)>; +pub type Correcter = Pipeline<(Analyzer, Rules)>; + +pub fn tokenizer() -> Tokenizer { + binary!(Tokenizer, "de", "tokenizer").expect(ERROR_MSG) +} + +pub fn disambiguator() -> Disambiguator { + binary!(Disambiguator, "de", "disambiguator").expect(ERROR_MSG) +} + +pub fn rules() -> Rules { + binary!(Rules, "de", "rules").expect(ERROR_MSG) +} + +pub fn analyzer() -> Analyzer { + tokenize::Pipeline::new((tokenizer(), disambiguator())).expect(ERROR_MSG) +} + +pub fn correcter() -> Correcter { + Pipeline::new((analyzer(), rules())).expect(ERROR_MSG) +} diff --git a/nlprule/src/lang/en.rs b/nlprule/src/lang/en.rs new file mode 100644 index 0000000..5dfd743 --- /dev/null +++ b/nlprule/src/lang/en.rs @@ -0,0 +1,42 @@ +use super::ERROR_MSG; +use crate::{ + components::{ + chunker::Chunker, + multiword_tagger::MultiwordTagger, + rules::{Disambiguator, Rules}, + tokenizer::Tokenizer, + }, + properties::{tokenize, CreatePipe, Pipeline}, +}; + +pub type Analyzer = tokenize::Pipeline<(Tokenizer, MultiwordTagger, Chunker, Disambiguator)>; +pub type Correcter = Pipeline<(Analyzer, Rules)>; + +pub fn tokenizer() -> Tokenizer { + binary!(Tokenizer, "en", "tokenizer").expect(ERROR_MSG) +} + +pub fn multiword_tagger() -> MultiwordTagger { + binary!(MultiwordTagger, "en", "tokenizer").expect(ERROR_MSG) +} + +pub fn chunker() -> Chunker { + binary!(Chunker, "en", "chunker").expect(ERROR_MSG) +} + +pub fn disambiguator() -> Disambiguator { + binary!(Disambiguator, "en", "disambiguator").expect(ERROR_MSG) +} + +pub fn rules() -> Rules { + binary!(Rules, "en", "rules").expect(ERROR_MSG) +} + +pub fn analyzer() -> Analyzer { + tokenize::Pipeline::new((tokenizer(), multiword_tagger(), chunker(), disambiguator())) + .expect(ERROR_MSG) +} + +pub fn correcter() -> Correcter { + Pipeline::new((analyzer(), rules())).expect(ERROR_MSG) +} diff --git a/nlprule/src/lang/es.rs b/nlprule/src/lang/es.rs new file mode 100644 index 0000000..5f4194a --- /dev/null +++ b/nlprule/src/lang/es.rs @@ -0,0 +1,36 @@ +use super::ERROR_MSG; +use crate::{ + components::{ + multiword_tagger::MultiwordTagger, + rules::{Disambiguator, Rules}, + tokenizer::Tokenizer, + }, + properties::{tokenize, CreatePipe, Pipeline}, +}; + +pub type Analyzer = tokenize::Pipeline<(Tokenizer, 
MultiwordTagger, Disambiguator)>; +pub type Correcter = Pipeline<(Analyzer, Rules)>; + +pub fn tokenizer() -> Tokenizer { + binary!(Tokenizer, "es", "tokenizer").expect(ERROR_MSG) +} + +pub fn multiword_tagger() -> MultiwordTagger { + binary!(MultiwordTagger, "es", "multiword_tagger").expect(ERROR_MSG) +} + +pub fn disambiguator() -> Disambiguator { + binary!(Disambiguator, "es", "disambiguator").expect(ERROR_MSG) +} + +pub fn rules() -> Rules { + binary!(Rules, "es", "rules").expect(ERROR_MSG) +} + +pub fn analyzer() -> Analyzer { + tokenize::Pipeline::new((tokenizer(), multiword_tagger(), disambiguator())).expect(ERROR_MSG) +} + +pub fn correcter() -> Correcter { + Pipeline::new((analyzer(), rules())).expect(ERROR_MSG) +} diff --git a/scripts/setup.sh b/scripts/setup.sh new file mode 100755 index 0000000..3d26ebb --- /dev/null +++ b/scripts/setup.sh @@ -0,0 +1,11 @@ +mkdir -p data + +cd data + +for lang in "en" "de" "es" +do + if [ ! -f $lang.zip ]; then + wget https://f000.backblazeb2.com/file/nlprule/$lang.zip + unzip -o $lang.zip + fi +done \ No newline at end of file From 967cd95e5dfb1385dbd7ac18b4ad467aff0e983e Mon Sep 17 00:00:00 2001 From: Benjamin Minixhofer Date: Sun, 30 May 2021 11:46:18 +0200 Subject: [PATCH 10/15] update ci, feature flag for each language --- .github/workflows/ci.yml | 315 ++++++++---------- nlprule/Cargo.toml | 22 +- nlprule/src/bin/test.rs | 27 -- nlprule/src/components/mod.rs | 2 +- nlprule/src/components/rules/compile/mod.rs | 2 +- .../rules/compile/structure/parse.rs | 12 +- nlprule/src/components/rules/mod.rs | 2 - nlprule/src/components/tagger/mod.rs | 8 +- nlprule/src/components/tokenizer/compile.rs | 2 +- nlprule/src/components/tokenizer/mod.rs | 1 - nlprule/src/lang.rs | 5 + nlprule/src/lib.rs | 62 +--- nlprule/src/properties.rs | 22 +- nlprule/src/rule/id.rs | 5 +- nlprule/src/types.rs | 18 +- nlprule/tests/tests.rs | 51 ++- scripts/build_and_test.sh | 14 +- scripts/setup.sh | 11 - 18 files changed, 243 insertions(+), 338 deletions(-) delete mode 100644 nlprule/src/bin/test.rs delete mode 100755 scripts/setup.sh diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 7800425..462359e 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -13,7 +13,7 @@ jobs: runs-on: ubuntu-18.04 strategy: matrix: - lang: ["en", "de", "es"] # TODO: load this from build/languages.txt + lang: ["en", "de", "es"] steps: - uses: actions/checkout@v2 - uses: actions-rs/toolchain@v1 @@ -23,12 +23,7 @@ jobs: target: wasm32-unknown-unknown - uses: Swatinem/rust-cache@v1 - run: | - mkdir data - mkdir storage - - cd data - wget https://f000.backblazeb2.com/file/nlprule/${{ matrix.lang }}.zip - unzip ${{ matrix.lang }}.zip + bash scripts/setup.sh ${{ matrix.lang }} - name: Build source uses: actions-rs/cargo@v1 with: @@ -45,54 +40,20 @@ jobs: with: token: ${{ secrets.GITHUB_TOKEN }} args: --all-features - - name: Build binaries - uses: actions-rs/cargo@v1 - env: - RUST_LOG: INFO - with: - command: run - args: --all-features --bin compile -- --build-dir data/${{ matrix.lang }} --tokenizer-out storage/${{ matrix.lang }}_tokenizer.bin --rules-out storage/${{ matrix.lang }}_rules.bin + - name: Build and test + run: | + bash scripts/build_and_test.sh ${{ matrix.lang }} xx - name: Run nlprule tests uses: actions-rs/cargo@v1 if: matrix.lang == 'en' with: command: test args: --verbose --all-features --release - - name: Run disambiguation tests - uses: actions-rs/cargo@v1 - env: - RUST_LOG: WARN - with: - command: run - args: --all-features --bin 
test_disambiguation -- --tokenizer storage/${{ matrix.lang }}_tokenizer.bin - - name: Run disambiguation tests (with regex-fancy backend) - uses: actions-rs/cargo@v1 - if: matrix.lang == 'en' - env: - RUST_LOG: WARN - with: - command: run - args: --manifest-path nlprule/Cargo.toml --features "bin regex-onig" --no-default-features --bin test_disambiguation -- --tokenizer storage/${{ matrix.lang }}_tokenizer.bin - - name: Run disambiguation tests (with regex-onig backend) - uses: actions-rs/cargo@v1 - if: matrix.lang == 'en' - env: - RUST_LOG: WARN - with: - command: run - args: --manifest-path nlprule/Cargo.toml --features "bin regex-fancy" --no-default-features --bin test_disambiguation -- --tokenizer storage/${{ matrix.lang }}_tokenizer.bin - - name: Run grammar rule tests - uses: actions-rs/cargo@v1 - env: - RUST_LOG: WARN - with: - command: run - args: --all-features --bin test -- --tokenizer storage/${{ matrix.lang }}_tokenizer.bin --rules storage/${{ matrix.lang }}_rules.bin - name: Upload binaries as artifact uses: actions/upload-artifact@v2 with: name: binaries - path: storage/* + path: nlprule/storage/* matrix_prep: runs-on: ubuntu-latest @@ -107,145 +68,145 @@ jobs: # inputFile: '.github/workflows/matrix_includes.json' # Default input file path filter: '[?runOnEvent==`${{ github.event_name }}` || runOnEvent==`always`]' - python: - needs: [matrix_prep, rust] - strategy: - matrix: ${{fromJson(needs.matrix_prep.outputs.matrix)}} - runs-on: ${{ matrix.os }} - container: ${{ matrix.container }} - env: - working-directory: python - steps: - - uses: actions/checkout@v2 - - uses: actions-rs/toolchain@v1 # maturin needs Rust (obviously) - with: - profile: minimal - toolchain: stable - - uses: Swatinem/rust-cache@v1 - - name: Set up Python ${{ matrix.python-version }} - uses: actions/setup-python@v1 - with: - python-version: ${{ matrix.python-version }} - - uses: actions/download-artifact@v2 - with: - name: binaries - path: storage - - name: Install GSED (if needed) # needed by set_version.sh - if: matrix.os == 'macos-latest' - run: | - brew install gnu-sed - - name: Update version (if release) - if: github.event_name == 'release' - run: | - bash scripts/set_version.sh ${{ github.event.release.tag_name }} - - name: Build and Test - run: | - # pybin is the directory with python binaries - PYBIN=${{ matrix.pybin }} + # python: + # needs: [matrix_prep, rust] + # strategy: + # matrix: ${{fromJson(needs.matrix_prep.outputs.matrix)}} + # runs-on: ${{ matrix.os }} + # container: ${{ matrix.container }} + # env: + # working-directory: python + # steps: + # - uses: actions/checkout@v2 + # - uses: actions-rs/toolchain@v1 # maturin needs Rust (obviously) + # with: + # profile: minimal + # toolchain: stable + # - uses: Swatinem/rust-cache@v1 + # - name: Set up Python ${{ matrix.python-version }} + # uses: actions/setup-python@v1 + # with: + # python-version: ${{ matrix.python-version }} + # - uses: actions/download-artifact@v2 + # with: + # name: binaries + # path: storage + # - name: Install GSED (if needed) # needed by set_version.sh + # if: matrix.os == 'macos-latest' + # run: | + # brew install gnu-sed + # - name: Update version (if release) + # if: github.event_name == 'release' + # run: | + # bash scripts/set_version.sh ${{ github.event.release.tag_name }} + # - name: Build and Test + # run: | + # # pybin is the directory with python binaries + # PYBIN=${{ matrix.pybin }} - if [ -z "${PYBIN}" ]; then - PIP_CMD="python -m pip" - PYTHON_CMD="python" - PYTEST_CMD="python -m pytest" - export 
MATURIN_CMD="maturin" - else - PIP_CMD="${PYBIN}/pip" - PYTHON_CMD="${PYBIN}/python" - PYTEST_CMD="${PYBIN}/pytest" - export MATURIN_CMD="${PYBIN}/maturin" - fi + # if [ -z "${PYBIN}" ]; then + # PIP_CMD="python -m pip" + # PYTHON_CMD="python" + # PYTEST_CMD="python -m pytest" + # export MATURIN_CMD="maturin" + # else + # PIP_CMD="${PYBIN}/pip" + # PYTHON_CMD="${PYBIN}/python" + # PYTEST_CMD="${PYBIN}/pytest" + # export MATURIN_CMD="${PYBIN}/maturin" + # fi - # if pybin is set, the venv will not be used - # still create it here for convenience since we need it on windows - ${PYTHON_CMD} -m venv venv - . venv/bin/activate || . venv/Scripts/activate # 'Scripts' on windows, 'bin' on Linux / macOS - ${PIP_CMD} install --upgrade pip - ${PIP_CMD} install maturin==0.9.4 pytest==6.1.2 + # # if pybin is set, the venv will not be used + # # still create it here for convenience since we need it on windows + # ${PYTHON_CMD} -m venv venv + # . venv/bin/activate || . venv/Scripts/activate # 'Scripts' on windows, 'bin' on Linux / macOS + # ${PIP_CMD} install --upgrade pip + # ${PIP_CMD} install maturin==0.9.4 pytest==6.1.2 - # remove potentially cached wheels - rm target/wheels/* || true - bash scripts/maturin.sh build --interpreter ${PYTHON_CMD} --release --manylinux 2014 + # # remove potentially cached wheels + # rm target/wheels/* || true + # bash scripts/maturin.sh build --interpreter ${PYTHON_CMD} --release --manylinux 2014 - # install the wheel in two different ways: - # 1. via pip: needed on manylinux - # 2. via maturin develop: needed on windows in venv - ${PIP_CMD} install $(ls target/wheels/* | head -n1) - bash scripts/maturin.sh develop --release + # # install the wheel in two different ways: + # # 1. via pip: needed on manylinux + # # 2. via maturin develop: needed on windows in venv + # ${PIP_CMD} install $(ls target/wheels/* | head -n1) + # bash scripts/maturin.sh develop --release - ${PYTEST_CMD} python/test.py -s - shell: bash - - name: Upload wheel as artifact - uses: actions/upload-artifact@v2 - with: - name: python-wheel - path: target/wheels/* + # ${PYTEST_CMD} python/test.py -s + # shell: bash + # - name: Upload wheel as artifact + # uses: actions/upload-artifact@v2 + # with: + # name: python-wheel + # path: target/wheels/* - publish: - runs-on: ubuntu-latest - needs: [rust, python] - if: github.event_name == 'release' + # publish: + # runs-on: ubuntu-latest + # needs: [rust, python] + # if: github.event_name == 'release' - steps: - - uses: actions/checkout@v2 - with: - ref: ${{ github.head_ref }} - - name: Set up Python 3.8 - uses: actions/setup-python@v1 - with: - python-version: 3.8 - - uses: actions/download-artifact@v2 - with: - name: python-wheel - path: python-wheel - - uses: actions/download-artifact@v2 - with: - name: binaries - path: storage - - run: | - gzip storage/en_tokenizer.bin - gzip storage/en_rules.bin - gzip storage/de_tokenizer.bin - gzip storage/de_rules.bin - gzip storage/es_tokenizer.bin - gzip storage/es_rules.bin - - name: Update version - run: | - bash scripts/set_version.sh ${{ github.event.release.tag_name }} - - name: Publish on crates.io - run: | # --allow-dirty is only needed b/c of the README.md, we can be sure it is clean otherwise anyway because it is freshly checked out - cargo login $CARGO_KEY + # steps: + # - uses: actions/checkout@v2 + # with: + # ref: ${{ github.head_ref }} + # - name: Set up Python 3.8 + # uses: actions/setup-python@v1 + # with: + # python-version: 3.8 + # - uses: actions/download-artifact@v2 + # with: + # name: python-wheel + 
# path: python-wheel + # - uses: actions/download-artifact@v2 + # with: + # name: binaries + # path: storage + # - run: | + # gzip storage/en_tokenizer.bin + # gzip storage/en_rules.bin + # gzip storage/de_tokenizer.bin + # gzip storage/de_rules.bin + # gzip storage/es_tokenizer.bin + # gzip storage/es_rules.bin + # - name: Update version + # run: | + # bash scripts/set_version.sh ${{ github.event.release.tag_name }} + # - name: Publish on crates.io + # run: | # --allow-dirty is only needed b/c of the README.md, we can be sure it is clean otherwise anyway because it is freshly checked out + # cargo login $CARGO_KEY - cd nlprule - cp ../README.md README.md - cargo publish --allow-dirty - rm README.md - cd .. + # cd nlprule + # cp ../README.md README.md + # cargo publish --allow-dirty + # rm README.md + # cd .. - # allow crates.io index to update s. t. nlprule-build can depend on nlprule - sleep 1m + # # allow crates.io index to update s. t. nlprule-build can depend on nlprule + # sleep 1m - cd build - cargo publish --allow-dirty - cd .. - env: - CARGO_KEY: ${{ secrets.CARGO_KEY }} - - name: Publish on PyPI - run: | - pip install twine==3.3 - twine upload python-wheel/* - env: - TWINE_PASSWORD: ${{ secrets.TWINE_PASSWORD }} - TWINE_USERNAME: ${{ secrets.TWINE_USERNAME }} - - name: Upload release binaries - uses: alexellis/upload-assets@0.2.2 - env: - GITHUB_TOKEN: ${{ github.token }} - with: - asset_paths: '["./storage/*"]' - - run: | - rm -r python-wheel - rm -r storage - - uses: stefanzweifel/git-auto-commit-action@v4 - with: - commit_message: v${{ github.event.release.tag_name }} - branch: main + # cd build + # cargo publish --allow-dirty + # cd .. + # env: + # CARGO_KEY: ${{ secrets.CARGO_KEY }} + # - name: Publish on PyPI + # run: | + # pip install twine==3.3 + # twine upload python-wheel/* + # env: + # TWINE_PASSWORD: ${{ secrets.TWINE_PASSWORD }} + # TWINE_USERNAME: ${{ secrets.TWINE_USERNAME }} + # - name: Upload release binaries + # uses: alexellis/upload-assets@0.2.2 + # env: + # GITHUB_TOKEN: ${{ github.token }} + # with: + # asset_paths: '["./storage/*"]' + # - run: | + # rm -r python-wheel + # rm -r storage + # - uses: stefanzweifel/git-auto-commit-action@v4 + # with: + # commit_message: v${{ github.event.release.tag_name }} + # branch: main diff --git a/nlprule/Cargo.toml b/nlprule/Cargo.toml index 89f3d41..36bce79 100644 --- a/nlprule/Cargo.toml +++ b/nlprule/Cargo.toml @@ -53,10 +53,6 @@ quickcheck = "1.0" quickcheck_macros = "1.0" criterion = "0.3" -[build-dependencies] -serde_json = "1" -fs-err = "2.5" - [[bench]] name = "load" harness = false @@ -65,12 +61,14 @@ required-features = ["binaries"] [features] default = ["regex-onig"] -binaries = [] +binaries-de = [] +binaries-en = [] +binaries-es = [] +binaries-all = ["binaries-de", "binaries-en", "binaries-es"] regex-onig = ["onig"] # to switch to the fancy-regex engine, disable default features and add this feature regex-fancy = ["fancy-regex"] - # this enables both regex backends at the same time and makes sure they are equivalent # used only for compilation and tests regex-all-test = ["regex-onig", "regex-fancy"] @@ -92,8 +90,16 @@ name = "compile" required-features = ["compile", "bin"] [[bin]] -name = "test" -required-features = ["bin"] +name = "test_en" +required-features = ["bin", "binaries-en"] + +[[bin]] +name = "test_es" +required-features = ["bin", "binaries-es"] + +[[bin]] +name = "test_de" +required-features = ["bin", "binaries-de"] [[bin]] name = "run" diff --git a/nlprule/src/bin/test.rs 
b/nlprule/src/bin/test.rs deleted file mode 100644 index f41ce06..0000000 --- a/nlprule/src/bin/test.rs +++ /dev/null @@ -1,27 +0,0 @@ -use nlprule::lang::{de, en, es}; - -use clap::Clap; - -#[derive(Clap)] -#[clap( - version = "1.0", - author = "Benjamin Minixhofer " -)] -struct Opts { - #[clap(long, short)] - lang: String, -} - -fn main() -> Result<(), nlprule::Error> { - env_logger::init(); - let opts = Opts::parse(); - - match opts.lang.as_ref() { - "de" => de::correcter().test()?, - "en" => en::correcter().test()?, - "es" => es::correcter().test()?, - x => panic!("language code '{}' does not exist!", x), - } - - Ok(()) -} diff --git a/nlprule/src/components/mod.rs b/nlprule/src/components/mod.rs index fb157e1..b1cda1d 100644 --- a/nlprule/src/components/mod.rs +++ b/nlprule/src/components/mod.rs @@ -17,7 +17,7 @@ pub trait Component: Serialize + DeserializeOwned { fn new>(p: P) -> Result { let reader = BufReader::new(File::open(p.as_ref())?); - Ok(Self::from_reader(reader)?) + Self::from_reader(reader) } fn from_reader(reader: R) -> Result { diff --git a/nlprule/src/components/rules/compile/mod.rs b/nlprule/src/components/rules/compile/mod.rs index 95adc50..e2d6cf2 100644 --- a/nlprule/src/components/rules/compile/mod.rs +++ b/nlprule/src/components/rules/compile/mod.rs @@ -1,7 +1,7 @@ mod structure; +use fs_err::File; use std::{io::BufReader, path::PathBuf}; - use log::warn; use crate::{ diff --git a/nlprule/src/components/rules/compile/structure/parse.rs b/nlprule/src/components/rules/compile/structure/parse.rs index d05c5e0..f6449a6 100644 --- a/nlprule/src/components/rules/compile/structure/parse.rs +++ b/nlprule/src/components/rules/compile/structure/parse.rs @@ -199,11 +199,7 @@ fn get_exceptions( _ => None, }) .filter_map(|x| { - let exception_text = if let Some(exception_text) = &x.text { - Some(exception_text.as_str()) - } else { - None - }; + let exception_text = x.text.as_ref().map(|x| x.as_str()); let mut atom = match parse_match_attribs(x, exception_text, case_sensitive, None, info) { Ok(atom) => atom, @@ -348,11 +344,7 @@ fn parse_match(m: super::Match, engine: &Engine, info: &mut BuildInfo) -> Result m.no.parse::() .expect("no must be parsable as usize."); - let case_conversion = if let Some(conversion) = &m.case_conversion { - Some(conversion.as_str()) - } else { - None - }; + let case_conversion = m.case_conversion.as_deref(); let pos_replacer = if let Some(postag) = m.postag { if postag.contains("+DT") || postag.contains("+INDT") { diff --git a/nlprule/src/components/rules/mod.rs b/nlprule/src/components/rules/mod.rs index 1cd9233..78be9ea 100644 --- a/nlprule/src/components/rules/mod.rs +++ b/nlprule/src/components/rules/mod.rs @@ -2,8 +2,6 @@ use log::info; use serde::{Deserialize, Serialize}; use std::iter::FromIterator; -use fs_err::File; - use crate::properties::*; use crate::rule::Rule; use crate::types::*; diff --git a/nlprule/src/components/tagger/mod.rs b/nlprule/src/components/tagger/mod.rs index 0a5d1fb..4a88c0c 100644 --- a/nlprule/src/components/tagger/mod.rs +++ b/nlprule/src/components/tagger/mod.rs @@ -413,11 +413,9 @@ impl WordIdMap { .iter() .enumerate() .filter_map(|(index, maybe_value)| { - if let Some(value) = maybe_value { - Some((WordIdInt(index as u32), value)) - } else { - None - } + maybe_value + .as_ref() + .map(|value| (WordIdInt(index as u32), value)) }) } } diff --git a/nlprule/src/components/tokenizer/compile.rs b/nlprule/src/components/tokenizer/compile.rs index 03ed99e..63bb36c 100644 --- a/nlprule/src/components/tokenizer/compile.rs 
+++ b/nlprule/src/components/tokenizer/compile.rs @@ -1,5 +1,5 @@ use fs_err as fs; - +use fs_err::File; use std::{io::BufReader, path::PathBuf, str::FromStr}; use crate::compile::{BuildComponent, BuildInfo, Error}; diff --git a/nlprule/src/components/tokenizer/mod.rs b/nlprule/src/components/tokenizer/mod.rs index 564fa62..3ce20ff 100644 --- a/nlprule/src/components/tokenizer/mod.rs +++ b/nlprule/src/components/tokenizer/mod.rs @@ -7,7 +7,6 @@ #[cfg(feature = "compile")] mod compile; -use fs_err::File; use std::ops::Range; use crate::types::*; diff --git a/nlprule/src/lang.rs b/nlprule/src/lang.rs index a3ea738..3f30acc 100644 --- a/nlprule/src/lang.rs +++ b/nlprule/src/lang.rs @@ -1,3 +1,4 @@ +#[allow(unused)] macro_rules! binary { ($component: ty, $lang_code:literal, $binary_name:literal) => {{ use crate::components::Component; @@ -15,8 +16,12 @@ macro_rules! binary { }}; } +#[allow(unused)] const ERROR_MSG: &str = "binaries are pre-tested."; +#[cfg(feature = "binaries-de")] pub mod de; +#[cfg(feature = "binaries-en")] pub mod en; +#[cfg(feature = "binaries-es")] pub mod es; diff --git a/nlprule/src/lib.rs b/nlprule/src/lib.rs index 18a1dd5..4e4edf4 100644 --- a/nlprule/src/lib.rs +++ b/nlprule/src/lib.rs @@ -10,13 +10,12 @@ //! Correct a text: //! //! ```no_run -//! use nlprule::{Tokenizer, Rules}; +//! use nlprule::lang::en; //! -//! let tokenizer = Tokenizer::new("path/to/en_tokenizer.bin")?; -//! let rules = Rules::new("path/to/en_rules.bin")?; +//! let correcter = en::correcter(); //! //! assert_eq!( -//! rules.correct("She was not been here since Monday.", &tokenizer), +//! correcter.correct("She was not been here since Monday.").collect::>().join(""), //! String::from("She was not here since Monday.") //! ); //! # Ok::<(), nlprule::Error>(()) @@ -25,46 +24,41 @@ //! Get suggestions and correct a text: //! //! ```no_run -//! use nlprule::{Tokenizer, Rules, types::Suggestion, rules::apply_suggestions}; +//! use nlprule::lang::en; //! -//! let tokenizer = Tokenizer::new("path/to/en_tokenizer.bin")?; -//! let rules = Rules::new("path/to/en_rules.bin")?; +//! let correcter = en::correcter(); //! //! let text = "She was not been here since Monday."; //! -//! let suggestions = rules.suggest(text, &tokenizer); +//! let suggestions = correcter.suggest(text).next().expect("`text` contains one sentence."); //! assert_eq!(*suggestions[0].span().char(), 4usize..16); //! assert_eq!(suggestions[0].replacements(), vec!["was not", "has not been"]); //! assert_eq!(suggestions[0].source(), "GRAMMAR/WAS_BEEN/1"); //! assert_eq!(suggestions[0].message(), "Did you mean was not or has not been?"); //! -//! let corrected = apply_suggestions(text, &suggestions); -//! -//! assert_eq!(corrected, "She was not here since Monday."); //! # Ok::<(), nlprule::Error>(()) //! ``` //! //! Tokenize & analyze a text: //! //! ```no_run -//! use nlprule::Tokenizer; +//! use nlprule::lang::en; +//! use nlprule::properties::Tokenize; //! -//! let tokenizer = Tokenizer::new("path/to/en_tokenizer.bin")?; +//! let analyzer = en::analyzer(); //! //! let text = "A brief example is shown."; //! //! // returns an iterator over sentences -//! let sentence = tokenizer.pipe(text).next().expect("`text` contains one sentence."); +//! let sentence = analyzer.tokenize(text).next().expect("`text` contains one sentence."); //! //! println!("{:#?}", sentence); -//! assert_eq!(sentence.tokens()[1].word().text().as_str(), "brief"); -//! assert_eq!(sentence.tokens()[1].word().tags()[0].pos().as_str(), "JJ"); -//! 
assert_eq!(sentence.tokens()[1].chunks(), vec!["I-NP-singular"]); +//! assert_eq!(sentence.tokens()[1].as_str(), "brief"); +//! assert_eq!(sentence.tokens()[1].tags()?.iter().next().unwrap().pos().as_str(), "JJ"); +//! assert_eq!(sentence.tokens()[1].chunks()?, &["I-NP-singular"]); //! // some other information like char / byte span, lemmas etc. is also set! //! # Ok::<(), nlprule::Error>(()) //! ``` -//! --- -//! Binaries are distributed with [Github releases](https://github.com/bminixhofer/nlprule/releases). // #![warn(missing_docs)] use std::io; @@ -75,7 +69,6 @@ use thiserror::Error; pub mod compile; pub mod components; mod filter; -#[cfg(feature = "binaries")] #[macro_use] pub mod lang; pub mod properties; @@ -95,35 +88,6 @@ pub enum Error { IdError(#[from] rule::id::Error), #[error(transparent)] Property(#[from] properties::Error), - // TODO: combine with `Property` (probably) - #[error("unset token property: {0}")] - Unset(&'static str), #[error("Test failed. See logs for details.")] TestFailed, } - -/// Gets the canonical filename for the tokenizer binary for a language code in ISO 639-1 (two-letter) format. -pub fn tokenizer_filename(lang_code: &str) -> String { - format!("{}_tokenizer.bin", lang_code) -} - -/// Gets the canonical filename for the rules binary for a language code in ISO 639-1 (two-letter) format. -pub fn rules_filename(lang_code: &str) -> String { - format!("{}_rules.bin", lang_code) -} - -/// Gets the canonical filename for the tokenizer binary for a language code in ISO 639-1 (two-letter) format. -#[macro_export] -macro_rules! tokenizer_filename { - ($lang_code:literal) => { - concat!($lang_code, "_tokenizer.bin") - }; -} - -/// Gets the canonical filename for the rules binary for a language code in ISO 639-1 (two-letter) format. -#[macro_export] -macro_rules! rules_filename { - ($lang_code:literal) => { - concat!($lang_code, "_rules.bin") - }; -} diff --git a/nlprule/src/properties.rs b/nlprule/src/properties.rs index 19af572..bb1f9b3 100644 --- a/nlprule/src/properties.rs +++ b/nlprule/src/properties.rs @@ -12,9 +12,9 @@ pub mod suggest { /// Correct a text by applying suggestions to it. /// In the case of multiple possible replacements, always chooses the first one. - pub fn apply_suggestions(text: &str, suggestions: &[Suggestion]) -> String { - let mut offset: isize = 0; - let mut chars: Vec<_> = text.chars().collect(); + pub fn apply_suggestions(sentence: &Sentence, suggestions: &[Suggestion]) -> String { + let mut offset: isize = -(sentence.span().char().start as isize); + let mut chars: Vec<_> = sentence.text().chars().collect(); for suggestion in suggestions { let replacement: Vec<_> = suggestion.replacements()[0].chars().collect(); @@ -42,7 +42,7 @@ pub mod suggest { fn correct(&self, sentence: &Sentence) -> Result { let suggestions = self.suggest(sentence)?; - Ok(apply_suggestions(sentence.text(), &suggestions)) + Ok(apply_suggestions(&sentence, &suggestions)) } #[allow(unused_variables)] @@ -325,7 +325,7 @@ impl Properties { } impl PropertiesMut { - pub(crate) fn reads_without_write<'a>(&'a self) -> impl Iterator + 'a { + pub(crate) fn reads_without_write(&self) -> impl Iterator { self.read_mask .intersection(self.write_mask.inverse()) .into_iter() @@ -624,6 +624,18 @@ macro_rules! 
impl_pipeline { sentences } + #[allow(non_snake_case, unused_mut)] + pub fn correct<'t>(&'t self, text: &'t str) -> impl Iterator + 't { + let (ref $first, $(ref $name,)* ref $last) = self.0; + + let sentences = $first.tokenize(text).map(move |mut sentence| { + $(sentence = $name.transform(sentence).unwrap();)* + $last.correct(&sentence).unwrap() + }); + + sentences + } + #[allow(non_snake_case)] pub fn test(&self) -> Result<(), crate::Error> { let (ref $first, $(ref $name,)* ref $last) = self.0; diff --git a/nlprule/src/rule/id.rs b/nlprule/src/rule/id.rs index 7c0c21b..850843f 100644 --- a/nlprule/src/rule/id.rs +++ b/nlprule/src/rule/id.rs @@ -17,11 +17,10 @@ //! Select individal rules: //! //! ```no_run -//! use nlprule::{Tokenizer, Rules, rule::id::Category}; +//! use nlprule::{lang::en, rule::id::Category}; //! use std::convert::TryInto; //! -//! let tokenizer = Tokenizer::new("path/to/en_tokenizer.bin")?; -//! let mut rules = Rules::new("path/to/en_rules.bin")?; +//! let mut rules = en::rules(); //! //! // disable rules named "confusion_due_do" in category "confused_words" //! rules diff --git a/nlprule/src/types.rs b/nlprule/src/types.rs index bb08102..7f66716 100644 --- a/nlprule/src/types.rs +++ b/nlprule/src/types.rs @@ -1,7 +1,7 @@ //! Fundamental types used by this crate. -use crate::components::tagger::Tagger; pub(crate) use crate::components::tagger::{PosId, SpecialPos, WordId, WordIdInt}; +use crate::{components::tagger::Tagger, properties::Property}; use derivative::Derivative; use lazy_static::lazy_static; use serde::{Deserialize, Serialize}; @@ -321,19 +321,27 @@ impl<'t> Token<'t> { impl<'t> Token<'t> { /// The tags of this token. Contain information about the part-of-speech tags and lemmas. pub fn tags(&self) -> Result<&Tags<'t>, crate::Error> { - self.tags.as_ref().ok_or(crate::Error::Unset("tags")) + self.tags + .as_ref() + .ok_or_else(|| crate::properties::Error::Unset(Property::Tags).into()) } pub fn tags_mut(&mut self) -> Result<&mut Tags<'t>, crate::Error> { - self.tags.as_mut().ok_or(crate::Error::Unset("tags")) + self.tags + .as_mut() + .ok_or_else(|| crate::properties::Error::Unset(Property::Tags).into()) } pub fn chunks(&self) -> Result<&[String], crate::Error> { - self.chunks.as_deref().ok_or(crate::Error::Unset("chunks")) + self.chunks + .as_deref() + .ok_or_else(|| crate::properties::Error::Unset(Property::Chunks).into()) } pub fn chunks_mut(&mut self) -> Result<&mut Vec, crate::Error> { - self.chunks.as_mut().ok_or(crate::Error::Unset("chunks")) + self.chunks + .as_mut() + .ok_or_else(|| crate::properties::Error::Unset(Property::Chunks).into()) } } diff --git a/nlprule/tests/tests.rs b/nlprule/tests/tests.rs index 2ee1caa..73d7b23 100644 --- a/nlprule/tests/tests.rs +++ b/nlprule/tests/tests.rs @@ -1,43 +1,36 @@ use std::convert::TryInto; use lazy_static::lazy_static; -use nlprule::{properties::*, rule::id::Category, types::Position, Rules, Tokenizer}; +use nlprule::{lang::en, properties::*}; use quickcheck_macros::quickcheck; -const TOKENIZER_PATH: &str = "../storage/en_tokenizer.bin"; -const RULES_PATH: &str = "../storage/en_rules.bin"; +#[test] +fn can_analyze_empty_text() { + let analyzer = en::analyzer(); -lazy_static! 
{ - static ref TOKENIZER: Tokenizer = Tokenizer::new(TOKENIZER_PATH).unwrap(); - static ref RULES: Rules = Rules::new(RULES_PATH).unwrap(); + let sentences: Vec<_> = analyzer.tokenize("").collect(); + assert!(sentences.is_empty()); } #[test] -fn can_tokenize_empty_text() { - let tokenizer = - Pipeline::new((&*TOKENIZER, TOKENIZER.chunker().as_ref().unwrap(), &*RULES)).unwrap(); - - let sentences: Vec<_> = tokenizer.suggest("His homework is due tomorrow.").collect(); - // assert!(sentences.is_empty()); +fn handles_whitespace_correctly() { + let analyzer = en::analyzer(); + + // preceding whitespace has to be included, trailing whitespace behavior is unspecified + let text = " hello.\ttest.\t\t"; + + let mut sentences = analyzer.tokenize(text); + assert_eq!( + &text[sentences.next().unwrap().span().byte().clone()], + " hello.\t" + ); + assert_eq!( + &text[sentences.next().unwrap().span().byte().clone()], + "test.\t" + ); + assert!(sentences.next().is_none()); } -// #[test] -// fn handles_whitespace_correctly() { -// // preceding whitespace has to be included, trailing whitespace behavior is unspecified -// let text = " hello.\ttest.\t\t"; - -// let mut sentences = TOKENIZER.pipe(text); -// assert_eq!( -// &text[sentences.next().unwrap().unwrap().span().byte().clone()], -// " hello.\t" -// ); -// assert_eq!( -// &text[sentences.next().unwrap().unwrap().span().byte().clone()], -// "test.\t" -// ); -// assert!(sentences.next().is_none()); -// } - // #[quickcheck] // fn can_tokenize_anything(text: String) -> bool { // let _: Vec<_> = TOKENIZER.pipe(&text).collect(); diff --git a/scripts/build_and_test.sh b/scripts/build_and_test.sh index ee68740..b9c16b9 100755 --- a/scripts/build_and_test.sh +++ b/scripts/build_and_test.sh @@ -4,10 +4,18 @@ then exit fi -# this script assumes the build directories are in data/ -# only for convenience mkdir -p nlprule/src/storage +cd data + +# download + extract the build directory from backblaze if we don't have it yet +if [ ! -f $1.zip ]; then + wget https://f000.backblazeb2.com/file/nlprule/$$1.zip + unzip -o $1.zip +fi + +cd .. + # x- => only compile # -x => only test # xx or flags not set => everything @@ -23,6 +31,6 @@ fi if [ "${flags:1:1}" == "x" ] then cd nlprule - RUST_LOG=INFO cargo run --all-features --bin test -- --lang $1 + RUST_LOG=INFO cargo run --features "bin binaries-$1" --bin test_$1 cd .. fi \ No newline at end of file diff --git a/scripts/setup.sh b/scripts/setup.sh deleted file mode 100755 index 3d26ebb..0000000 --- a/scripts/setup.sh +++ /dev/null @@ -1,11 +0,0 @@ -mkdir -p data - -cd data - -for lang in "en" "de" "es" -do - if [ ! 
-f $lang.zip ]; then - wget https://f000.backblazeb2.com/file/nlprule/$lang.zip - unzip -o $lang.zip - fi -done \ No newline at end of file From 6ddffdba15520a0fd935ed5f5f783d1cc96e0135 Mon Sep 17 00:00:00 2001 From: Benjamin Minixhofer Date: Sun, 30 May 2021 11:49:11 +0200 Subject: [PATCH 11/15] update ci --- .github/workflows/ci.yml | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 462359e..825ba24 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -22,8 +22,9 @@ jobs: toolchain: stable target: wasm32-unknown-unknown - uses: Swatinem/rust-cache@v1 - - run: | - bash scripts/setup.sh ${{ matrix.lang }} + - name: Build and test language + run: | + bash scripts/build_and_test.sh ${{ matrix.lang }} xx - name: Build source uses: actions-rs/cargo@v1 with: @@ -40,9 +41,6 @@ jobs: with: token: ${{ secrets.GITHUB_TOKEN }} args: --all-features - - name: Build and test - run: | - bash scripts/build_and_test.sh ${{ matrix.lang }} xx - name: Run nlprule tests uses: actions-rs/cargo@v1 if: matrix.lang == 'en' From 9846e6eab1198fd43502220cb902dce7ce541f7a Mon Sep 17 00:00:00 2001 From: Benjamin Minixhofer Date: Sun, 30 May 2021 11:50:34 +0200 Subject: [PATCH 12/15] add language specific test binaries --- nlprule/src/bin/test_de.rs | 6 ++++++ nlprule/src/bin/test_en.rs | 6 ++++++ nlprule/src/bin/test_es.rs | 6 ++++++ 3 files changed, 18 insertions(+) create mode 100644 nlprule/src/bin/test_de.rs create mode 100644 nlprule/src/bin/test_en.rs create mode 100644 nlprule/src/bin/test_es.rs diff --git a/nlprule/src/bin/test_de.rs b/nlprule/src/bin/test_de.rs new file mode 100644 index 0000000..af75f27 --- /dev/null +++ b/nlprule/src/bin/test_de.rs @@ -0,0 +1,6 @@ +use nlprule::lang::de; + +fn main() -> Result<(), nlprule::Error> { + env_logger::init(); + de::correcter().test() +} diff --git a/nlprule/src/bin/test_en.rs b/nlprule/src/bin/test_en.rs new file mode 100644 index 0000000..f7268fc --- /dev/null +++ b/nlprule/src/bin/test_en.rs @@ -0,0 +1,6 @@ +use nlprule::lang::en; + +fn main() -> Result<(), nlprule::Error> { + env_logger::init(); + en::correcter().test() +} diff --git a/nlprule/src/bin/test_es.rs b/nlprule/src/bin/test_es.rs new file mode 100644 index 0000000..c6c3ace --- /dev/null +++ b/nlprule/src/bin/test_es.rs @@ -0,0 +1,6 @@ +use nlprule::lang::es; + +fn main() -> Result<(), nlprule::Error> { + env_logger::init(); + es::correcter().test() +} From 4dde97e9eab668b6f3d9fbbbe87c7160acc16672 Mon Sep 17 00:00:00 2001 From: Benjamin Minixhofer Date: Sun, 30 May 2021 12:03:50 +0200 Subject: [PATCH 13/15] fix build_and_test script --- scripts/build_and_test.sh | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/scripts/build_and_test.sh b/scripts/build_and_test.sh index b9c16b9..d030daf 100755 --- a/scripts/build_and_test.sh +++ b/scripts/build_and_test.sh @@ -4,13 +4,15 @@ then exit fi +set -e + mkdir -p nlprule/src/storage cd data # download + extract the build directory from backblaze if we don't have it yet if [ ! 
-f $1.zip ]; then - wget https://f000.backblazeb2.com/file/nlprule/$$1.zip + wget https://f000.backblazeb2.com/file/nlprule/$1.zip unzip -o $1.zip fi From 5bf39bc264308628ec06b7fdf1b6e8803d800fc8 Mon Sep 17 00:00:00 2001 From: Benjamin Minixhofer Date: Sun, 30 May 2021 12:21:38 +0200 Subject: [PATCH 14/15] update ci --- .github/workflows/ci.yml | 5 ----- scripts/build_and_test.sh | 2 +- 2 files changed, 1 insertion(+), 6 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 825ba24..2179017 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -25,11 +25,6 @@ jobs: - name: Build and test language run: | bash scripts/build_and_test.sh ${{ matrix.lang }} xx - - name: Build source - uses: actions-rs/cargo@v1 - with: - command: build - args: --all-features - name: Build source (WebAssembly) uses: actions-rs/cargo@v1 with: diff --git a/scripts/build_and_test.sh b/scripts/build_and_test.sh index d030daf..5b28e3b 100755 --- a/scripts/build_and_test.sh +++ b/scripts/build_and_test.sh @@ -33,6 +33,6 @@ fi if [ "${flags:1:1}" == "x" ] then cd nlprule - RUST_LOG=INFO cargo run --features "bin binaries-$1" --bin test_$1 + RUST_LOG=INFO cargo run --no-default-features --features "bin binaries-$1 regex-all-test" --bin test_$1 cd .. fi \ No newline at end of file From aa9418486a0e31a376a73a73cbe5ee7af1002c4e Mon Sep 17 00:00:00 2001 From: Benjamin Minixhofer Date: Sun, 30 May 2021 14:50:50 +0200 Subject: [PATCH 15/15] remove nlprule-build, update tests --- .github/workflows/ci.yml | 4 +- Cargo.toml | 1 - build/Cargo.toml | 26 -- build/README.md | 30 +- build/languages.txt | 3 - build/src/lib.rs | 756 ---------------------------------- nlprule/src/components/mod.rs | 2 +- nlprule/src/lang.rs | 14 +- nlprule/src/properties.rs | 36 +- nlprule/tests/tests.rs | 254 ++++++------ 10 files changed, 178 insertions(+), 948 deletions(-) delete mode 100644 build/Cargo.toml delete mode 100644 build/languages.txt delete mode 100644 build/src/lib.rs diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 2179017..e3ac562 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -35,13 +35,13 @@ jobs: if: matrix.lang == 'en' with: token: ${{ secrets.GITHUB_TOKEN }} - args: --all-features + args: --features "binaries-en compile bin regex-all-test" - name: Run nlprule tests uses: actions-rs/cargo@v1 if: matrix.lang == 'en' with: command: test - args: --verbose --all-features --release + args: --verbose --features "binaries-en" --release - name: Upload binaries as artifact uses: actions/upload-artifact@v2 with: diff --git a/Cargo.toml b/Cargo.toml index a8efc09..d636835 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,7 +1,6 @@ [workspace] members = [ "nlprule", - "build", "python" ] diff --git a/build/Cargo.toml b/build/Cargo.toml deleted file mode 100644 index 726af20..0000000 --- a/build/Cargo.toml +++ /dev/null @@ -1,26 +0,0 @@ -[package] -name = "nlprule-build" -version = "0.6.3" -authors = ["Benjamin Minixhofer ", "Bernhard Schuster "] -edition = "2018" -license = "MIT OR Apache-2.0" -description = "Build tools for a fast, low-resource Natural Language Processing and Error Correction library." 
-repository = "https://github.com/bminixhofer/nlprule" -keywords = ["text", "spelling", "language-processing", "nlp", "grammar"] -categories = ["science", "text-processing"] - -[dependencies] -flate2 = "1" -thiserror = "1" -zip = "0.5.9" -directories = "3" -reqwest = { version = "0.11", default_features = false, features = ["blocking", "rustls-tls"] } -nlprule = { path = "../nlprule", features = ["compile"], version = "0.6.3" } # BUILD_BINDINGS_COMMENT -# nlprule = { package = "nlprule-core", path = "../nlprule", features = ["compile"] } # BUILD_BINDINGS_UNCOMMENT -fs-err = "2.5" - -[dev-dependencies] -tempdir = "0.3" -smush = "0.1.5" -env_logger = "0.8" -nlprule_030 = { package = "nlprule", version = "0.3.0" } diff --git a/build/README.md b/build/README.md index c826acc..db2c7ec 100644 --- a/build/README.md +++ b/build/README.md @@ -1,34 +1,6 @@ # nlprule-build -This crate provides a builder to make it easier to use the correct binaries for [nlprule](https://github.com/bminixhofer/nlprule). It also provides: -1. Utility functions to download the binaries from their distribution source. -2. Scripts to create the nlprule build directories. - -## Development - -If you are using a development version of nlprule, the builder can build the binaries itself (instead of just fetching them): - -```rust -let nlprule_builder = nlprule_build::BinaryBuilder::new( - &["en"], - std::env::var("OUT_DIR").expect("OUT_DIR is set when build.rs is running"), -) -// this specifies that the binaries should be built if they are not found -.fallback_to_build_dir(true) -.build() -.validate(); -``` - -In that case, you should set - -```toml -[profile.dev] -build-override = { opt-level = 2 } -``` - -in your `Cargo.toml`. Building can be slow otherwise. - -The following has information how to acquire the nlpruile build directories and how to build and test the nlprule binaries. As a user you will typically not need to do this. +Utilities for creating build resources. ### Building and testing the nlprule binaries diff --git a/build/languages.txt b/build/languages.txt deleted file mode 100644 index f1723af..0000000 --- a/build/languages.txt +++ /dev/null @@ -1,3 +0,0 @@ -de -en -es \ No newline at end of file diff --git a/build/src/lib.rs b/build/src/lib.rs deleted file mode 100644 index 5dd85f8..0000000 --- a/build/src/lib.rs +++ /dev/null @@ -1,756 +0,0 @@ -//! This crate provides a builder to make it easier to use the correct binaries for [nlprule](https://github.com/bminixhofer/nlprule). -//! See `README.md` for details. 
- -use flate2::bufread::GzDecoder; -use fs::File; -use fs_err as fs; -use nlprule::{compile, rules_filename, tokenizer_filename}; -use std::fs::Permissions; -use std::{ - io::{self, BufReader, BufWriter, Cursor, Read}, - path::{Path, PathBuf}, - result, -}; -use zip::result::ZipError; - -pub type OtherError = Box; - -#[derive(Debug, thiserror::Error)] -pub enum Error { - #[error(transparent)] - RequestError(#[from] reqwest::Error), - #[error("Binaries were not found on the remote")] - BinariesNotFound, - #[error("Failed to validate {1:?} binary for lang {0}")] - ValidationFailed(String, Binary, #[source] nlprule::Error), - #[error(transparent)] - IoError(#[from] io::Error), - #[error(transparent)] - ZipError(#[from] ZipError), - #[error("error postprocessing binaries: {0}")] - PostprocessingError(#[source] OtherError), - #[error("error transforming binaries: {0}")] - TransformError(#[source] OtherError), - #[error("Collation failed")] - CollationFailed(#[source] nlprule::compile::Error), -} - -pub type Result = result::Result; - -/// Definition of the data transformation for the network retrieved, binencoded rules and tokenizer binaries. -pub type TransformDataFn = Box) -> result::Result<(), OtherError>>; - -/// Definition of the path transformation for the network retrieved, binencoded rules and tokenizer binaries. -pub type TransformPathFn = Box result::Result>; - -#[derive(Debug, Clone, Copy, Hash, Eq, PartialEq)] -pub enum Binary { - Tokenizer, - Rules, -} - -impl Binary { - fn filename(&self, lang_code: &str) -> String { - match &self { - Binary::Tokenizer => tokenizer_filename(lang_code), - Binary::Rules => rules_filename(lang_code), - } - } -} - -/// Tries downloading the binaries from their distribution source. -/// -/// This implicitly unpacks the originally gzip'd sources and returns -/// an in-memory buffer. -fn obtain_binary_from_github_release( - version: &str, - lang_code: &str, - binary: Binary, -) -> Result> { - let filename = binary.filename(lang_code); - - let bytes = reqwest::blocking::get(&format!( - "https://github.com/bminixhofer/nlprule/releases/download/{}/{}.gz", - version, filename - ))? - .error_for_status() - .map_err(|e| { - if let Some(404) = e.status().map(|x| x.as_u16()) { - Error::BinariesNotFound - } else { - e.into() - } - })? - .bytes()?; - - let mut gz = GzDecoder::new(&bytes[..]); - let mut buffer = Vec::new(); - gz.read_to_end(&mut buffer)?; - - Ok(buffer) -} - -fn construct_cache_path( - version: &str, - lang_code: &str, - binary: Binary, - cache_dir: Option<&PathBuf>, - transform_path_fn: Option<&TransformPathFn>, -) -> Result> { - let filename = binary.filename(lang_code); - - cache_dir - .map(move |dir| { - let path = dir.join(version).join(lang_code).join(&filename); - Ok(if let Some(transform_path_fn) = transform_path_fn { - transform_path_fn(path).map_err(Error::TransformError)? - } else { - path - }) - }) - .transpose() -} - -/// Returns the bytes for a binary which are either obtained -/// from the on-disk cache or from the distribution source. -/// If the on-disk cache is disabled or is not present, -/// it will attempt to download it via [`obtain_binary_from_github_release`]. -/// Also updates the cache. -/// -/// If `transform_data_fn` is set, the bytes returned from this function are the output -/// of `transform_data_fn` applied to the binencoded binaries. 
-fn obtain_binary_cache_or_github( - version: &str, - lang_code: &str, - binary: Binary, - cache_dir: Option<&PathBuf>, - transform_path_fn: Option<&TransformPathFn>, - transform_data_fn: Option<&TransformDataFn>, -) -> Result> { - let cache_path = - construct_cache_path(version, lang_code, binary, cache_dir, transform_path_fn)?; - - // if the file can be read, the data is already cached and the transform was applied before - if let Some(ref cache_path) = cache_path { - if let Ok(bytes) = fs::read(cache_path) { - return Ok(bytes); - } - } - - // the binencoded data from github - let bytes_binenc = obtain_binary_from_github_release(version, lang_code, binary)?; - - // apply the transform if any to an intermediate buffer - let bytes_transformed = if let Some(transform_data_fn) = transform_data_fn { - let mut intermediate = Vec::::new(); - transform_data_fn(bytes_binenc.as_slice(), &mut intermediate) - .map_err(Error::TransformError)?; - intermediate - } else { - bytes_binenc - }; - - // update the cache entry - if let Some(ref cache_path) = cache_path { - fs::create_dir_all(cache_path.parent().expect("path must have parent"))?; - let mut cache_file = fs::OpenOptions::new() - .truncate(true) - .create(true) - .write(true) - .open(cache_path)?; - io::copy(&mut bytes_transformed.as_slice(), &mut cache_file)?; - } - - Ok(bytes_transformed) -} - -fn assure_binary_availability( - version: &str, - lang_code: &str, - binary: Binary, - cache_dir: Option<&PathBuf>, - transform_path_fn: Option<&TransformPathFn>, - transform_data_fn: Option<&TransformDataFn>, - out: PathBuf, -) -> Result<()> { - let source = obtain_binary_cache_or_github( - version, - lang_code, - binary, - cache_dir, - transform_path_fn, - transform_data_fn, - )?; - - let mut out_file = fs::OpenOptions::new() - .truncate(true) - .create(true) - .write(true) - .open(out)?; - io::copy(&mut source.as_slice(), &mut out_file)?; - Ok(()) -} - -pub fn get_build_dir>(lang_code: &str, out_dir: P) -> Result<()> { - let bytes = reqwest::blocking::get(&format!( - "https://f000.backblazeb2.com/file/nlprule/{}.zip", - lang_code - ))? - .error_for_status()? - .bytes()?; - - // extract the zip file and write to directory, a bit annoying that this is so verbose - // adapted from https://github.com/zip-rs/zip/blob/master/examples/extract.rs - let mut archive = zip::ZipArchive::new(Cursor::new(bytes))?; - - for i in 0..archive.len() { - let mut file = archive.by_index(i)?; - let outpath = match file.enclosed_name() { - Some(path) => out_dir - .as_ref() - // the first component of the path is the zip file name e. g. "en" so we skip it - .join(path.iter().skip(1).collect::()), - None => continue, - }; - - if (&*file.name()).ends_with('/') { - fs::create_dir_all(&outpath)?; - } else { - if let Some(p) = outpath.parent() { - if !p.exists() { - fs::create_dir_all(&p)?; - } - } - let mut outfile = fs::File::create(&outpath)?; - io::copy(&mut file, &mut outfile)?; - } - - // Get and Set permissions - #[cfg(unix)] - { - use std::os::unix::fs::PermissionsExt; - - if let Some(mode) = file.unix_mode() { - fs::set_permissions(&outpath, Permissions::from_mode(mode))?; - } - } - } - - Ok(()) -} - -/// Gets the language codes for the currently supported languages in ISO 639-1 (two-letter) format e. g. "en". -pub fn supported_language_codes() -> Vec<&'static str> { - include_str!(concat!(env!("CARGO_MANIFEST_DIR"), "/", "languages.txt")) - .lines() - .collect() -} - -/// Places all nlprule binaries for the given languages in some directory. 
-pub struct BinaryBuilder { - language_codes: Vec, - out_dir: PathBuf, - version: String, - cache_dir: Option, - fallback_to_build_dir: bool, - build_dir: Option, - outputs: Vec, - transform_path_fn: Option, - transform_data_fn: Option, -} - -impl BinaryBuilder { - /// ```plain - /// github release resource --[fn transform]--> $cache_dir --[fn postprocess]--> $OUT_DIR/ - /// ``` - /// - /// Acquires the rule and tokenizer binaries for one language by: - /// - Trying to download them from their distribution source (or load them local cache). - /// - If they are not found (i. e. a dev version of nlprule is used) and `fallback_to_build_dir` is true - /// downloads the latest build directory and builds the binaries from it. - /// This can still fail if the dev version is sufficiently outdated for the latest build dir. - /// In that case, the user is encouraged to update to a release or a newer git sha. - fn build_language(&mut self, lang_code: &str) -> Result<()> { - // adjust the destination path - let path_transform = |out: PathBuf| -> Result { - Ok( - if let Some(ref transform_path_fn) = self.transform_path_fn { - transform_path_fn(out).map_err(Error::TransformError)? - } else { - out - }, - ) - }; - - let tokenizer_out = path_transform(self.out_dir.join(tokenizer_filename(lang_code)))?; - let rules_out = path_transform(self.out_dir.join(rules_filename(lang_code)))?; - - let mut did_not_find_binaries = false; - - for (binary, out) in &[ - (Binary::Tokenizer, &tokenizer_out), - (Binary::Rules, &rules_out), - ] { - let out = out.to_owned().to_owned(); - match assure_binary_availability( - &self.version, - lang_code, - *binary, - self.cache_dir.as_ref(), - self.transform_path_fn.as_ref(), - self.transform_data_fn.as_ref(), - out, - ) { - Err(Error::BinariesNotFound) => { - did_not_find_binaries = true; - break; - } - res => res?, - } - } - - if did_not_find_binaries && self.fallback_to_build_dir { - // it is possible that the build dirs are cached too long i. e. 
not downloaded again although a new version is available - // this could lead to problems but is not easy to fix so it will stay this way unless problems are reported - let build_dir = self - .build_dir.as_ref() - .unwrap_or_else( - || self.cache_dir.as_ref().expect("need somewhere to store build dirs: either `cache_dir` or `build_dir_path` must be set if `fallback_to_build_dir` is true."), - ) - .join(lang_code); - - if !build_dir.exists() { - get_build_dir(lang_code, &build_dir).expect("error loading build directory"); - } - - let mut rules_sink = BufWriter::new( - fs::OpenOptions::new() - .truncate(true) - .create(true) - .write(true) - .open(&rules_out)?, - ); - let mut tokenizer_sink = BufWriter::new( - fs::OpenOptions::new() - .truncate(true) - .create(true) - .write(true) - .open(&tokenizer_out)?, - ); - if let Some(ref transform_data_fn) = self.transform_data_fn { - let mut transfer_buffer_rules = Vec::new(); - let mut transfer_buffer_tokenizer = Vec::new(); - - compile::compile( - build_dir, - &mut transfer_buffer_rules, - &mut transfer_buffer_tokenizer, - ) - .map_err(Error::CollationFailed)?; - - assert_ne!(transfer_buffer_rules.len(), 0); - assert_ne!(transfer_buffer_tokenizer.len(), 0); - - let mut transformed_buffer_rules = Vec::new(); - let mut transformed_buffer_tokenizer = Vec::new(); - - transform_data_fn( - transfer_buffer_rules.as_slice(), - &mut transformed_buffer_rules, - ) - .map_err(Error::TransformError)?; - transform_data_fn( - transfer_buffer_tokenizer.as_slice(), - &mut transformed_buffer_tokenizer, - ) - .map_err(Error::TransformError)?; - } else { - compile::compile(build_dir, &mut rules_sink, &mut tokenizer_sink) - .map_err(Error::CollationFailed)?; - }; - } else if did_not_find_binaries { - panic!( - "Did not find binaries for version {}. \ - If this is a development version, try setting `fallback_to_build_dir` to build the binaries yourself. \ - If this is a release, this should NOT happen.", - self.version - ); - } - - self.outputs.push(tokenizer_out); - self.outputs.push(rules_out); - Ok(()) - } - - /// Creates a new binary builder. `language_codes` must be in ISO 639-1 (two-letter) format. - /// If `language_codes` is `&[]`, uses all supported languages. - /// If this is used in a `build.rs`, `out_dir` should probably be the OUT_DIR environment variable. - pub fn new>(language_codes: &[&str], out_dir: P) -> Self { - let language_codes: Vec<_> = if language_codes.is_empty() { - supported_language_codes() - .into_iter() - .map(ToOwned::to_owned) - .collect() - } else { - language_codes - .iter() - .map(ToOwned::to_owned) - .map(ToOwned::to_owned) - .collect::>() - }; - - let project_dir = directories::ProjectDirs::from("", "", "nlprule"); - // this should be CARGO_ARTIFACT_DIR once it is merged: https://github.com/rust-lang/rfcs/pull/3035 - let cache_dir = project_dir.as_ref().map(|x| x.cache_dir().to_owned()); - let build_dir = cache_dir.as_ref().map(|x| x.join("build_dirs")); - - let version = env!("CARGO_PKG_VERSION").to_owned(); - - BinaryBuilder { - language_codes, - out_dir: out_dir.as_ref().to_owned(), - version, - cache_dir, - fallback_to_build_dir: false, - build_dir, - outputs: Vec::new(), - transform_data_fn: None, - transform_path_fn: None, - } - } - - /// Sets the version for which to fetch binaries. - /// The version of `nlprule-build` (kept in sync with `nlprule` version) by default. - /// Typically does not need to be modified. 
- pub fn version>(mut self, version: S) -> Self { - self.version = version.into(); - self - } - - /// Sets the out directory. - pub fn out_dir(mut self, out_dir: PathBuf) -> Self { - self.out_dir = out_dir; - self - } - - /// Sets the cache directory. The user cache directory at e. g. `~/.cache/nlprule` by default. - pub fn cache_dir(mut self, cache_dir: Option) -> Self { - self.cache_dir = cache_dir; - self - } - - /// Sets whether to fallback to building from the build directory if no distributed binaries are found - /// (i. e. a development version of nlprule is used). - pub fn fallback_to_build_dir(mut self, fallback_to_build_dir: bool) -> Self { - self.fallback_to_build_dir = fallback_to_build_dir; - self - } - - /// Sets the path the build directories should be stored at. - /// Only relevant if `fallback_to_build_dir` is true. - /// `cache_dir.join("build_dirs")` by default. - pub fn build_dir(mut self, build_dir: Option) -> Self { - self.build_dir = build_dir; - self - } - - /// Builds by {downloading, copying, building} the binaries to the out directory. - pub fn build(mut self) -> Result { - self.language_codes - .clone() - .into_iter() - .try_for_each(|lang_code| self.build_language(&lang_code))?; - Ok(self) - } - - /// Validates the binaries by checking if they can be loaded by nlprule. - pub fn validate(&self) -> Result<()> { - for lang_code in &self.language_codes { - let tokenizer_out = self.out_dir.join(tokenizer_filename(lang_code)); - let rules_out = self.out_dir.join(rules_filename(lang_code)); - - nlprule::Rules::new(rules_out) - .map_err(|e| Error::ValidationFailed(lang_code.to_owned(), Binary::Rules, e))?; - nlprule::Tokenizer::new(tokenizer_out) - .map_err(|e| Error::ValidationFailed(lang_code.to_owned(), Binary::Tokenizer, e))?; - } - - Ok(()) - } - - /// Gets the paths to all files this builder created. - pub fn outputs(&self) -> &[PathBuf] { - &self.outputs - } - - /// Applies the given transformation function to the binary immediately after obtaining it. - /// This happens before placing the file in the cache (if any) so by using a compression - /// function the size of the cache directory can be reduced. - /// Modifies the path of the cached binaries by the given `path_fn`. - /// If no cache directory is set or the binaries are built from the build dir, the `path_fn` does nothing. - /// - /// The resulting files will then reside in the given cache dir if any. - /// - /// Attention: Any compression applied here, must be undone in the - /// `fn postprocess` provided closure to retain the original binenc file - /// to be consumed by the application code. - pub fn transform(mut self, proc_fn: D, path_fn: P) -> Self - where - // these signatures have to match the `TransformDataFn` and `TransformPathFn` types - D: Fn(&[u8], &mut Vec) -> result::Result<(), OtherError> + 'static, - P: Fn(PathBuf) -> result::Result + 'static, - { - self.transform_data_fn = Some(Box::new(proc_fn)); - self.transform_path_fn = Some(Box::new(path_fn)); - self - } - - /// Applies the given postprocessing function to the binaries e. g. for compression. - /// Modifies the output path by the given path function. - /// - /// # Example - /// - /// ```rust - /// # use nlprule_build::BinaryBuilder; - /// # use std::io::Write; - /// # let tempdir = tempdir::TempDir::new("builder_test")?; - /// # let tempdir = tempdir.path(); - /// # - /// # let mut builder = BinaryBuilder::new(&["en"], tempdir).version("0.3.0"); - /// builder - /// .build()? 
- /// .postprocess( - /// |reader, mut writer| { - /// let mut encoder = flate2::read::GzEncoder::new(reader, flate2::Compression::default()); - /// std::io::copy(&mut encoder, &mut writer)?; - /// Ok(()) - /// }, - /// |p| { - /// let mut path = p.as_os_str().to_os_string(); - /// path.push(".gz"); - /// path - /// }, - /// )?; - /// # Ok::<(), nlprule_build::Error>(()) - /// ``` - pub fn postprocess(mut self, proc_fn: C, path_fn: F) -> Result - where - C: Fn(BufReader, BufWriter) -> result::Result<(), OtherError>, - F: Fn(PathBuf) -> P, - P: AsRef, - { - for (i, path) in self.outputs.clone().into_iter().enumerate() { - let reader = BufReader::new(fs::File::open(&path)?); - - let new_path = path_fn(path.clone()); - let new_path = new_path.as_ref(); - - let writer = BufWriter::new(File::create(new_path)?); - - proc_fn(reader, writer).map_err(Error::PostprocessingError)?; - - if new_path != path { - self.outputs[i] = new_path.to_path_buf(); - fs::remove_file(path)?; - } - } - - Ok(self) - } -} - -#[cfg(test)] -mod tests { - use io::Write; - - use super::*; - - #[test] - fn getting_binary_works() -> Result<()> { - // this is nice to keep roughly in sync with the latest released version but it is not necessary - let tempdir = tempdir::TempDir::new("build_dir")?; - let tempdir = tempdir.path().join("foo.bin"); - assure_binary_availability("0.3.0", "en", Binary::Rules, None, None, None, tempdir)?; - - Ok(()) - } - - #[test] - fn getting_build_dir_works() -> Result<()> { - let _ = env_logger::builder().is_test(true).try_init(); - - let tempdir = tempdir::TempDir::new("build_dir_test")?; - let tempdir = tempdir.path(); - - get_build_dir("en", &tempdir)?; - - assert_eq!(fs::read_to_string(tempdir.join("lang_code.txt"))?, "en"); - - Ok(()) - } - - // TODO: causes problems in CI, maybe remove `fallback_to_build_dir` altogether? - // #[test] - // fn binary_builder_works() -> Result<()> { - // let tempdir = tempdir::TempDir::new("builder_test")?; - // let tempdir = tempdir.path(); - - // BinaryBuilder::new(&["en"], tempdir) - // .cache_dir(Some(tempdir.to_path_buf())) - // .fallback_to_build_dir(true) - // .build()? - // .validate()?; - - // Ok(()) - // } - - #[test] - fn binary_builder_works_with_released_version() -> Result<()> { - let tempdir = tempdir::TempDir::new("builder_test")?; - let tempdir = tempdir.path(); - - BinaryBuilder::new(&["en"], tempdir) - .version("0.3.0") - .build()?; - - Ok(()) - } - - #[test] - fn binary_builder_works_with_smush() -> Result<()> { - let tempdir = tempdir::TempDir::new("builder_test")?; - let tempdir = tempdir.path(); - - BinaryBuilder::new(&["en"], tempdir) - .version("0.3.0") - .build()? - .postprocess( - |mut buffer, mut writer| { - let mut tmp = Vec::new(); - buffer.read_to_end(&mut tmp)?; - Ok(writer.write_all(&smush::encode( - &tmp, - smush::Codec::Gzip, - smush::Quality::Default, - )?)?) - }, - |p| { - let mut path = p.as_os_str().to_os_string(); - path.push(".gz"); - path - }, - )?; - - let tokenizer_path = tempdir - .join(Path::new(&tokenizer_filename("en"))) - .with_extension("bin.gz"); - assert!(tokenizer_path.exists()); - let decoded = smush::decode(&fs::read(tokenizer_path)?, smush::Codec::Gzip).unwrap(); - - let _ = nlprule_030::Tokenizer::new_from(&mut decoded.as_slice()).unwrap(); - - Ok(()) - } - - #[test] - fn binary_builder_works_with_flate2() -> Result<()> { - let tempdir = tempdir::TempDir::new("builder_test")?; - let tempdir = tempdir.path(); - - let builder = BinaryBuilder::new(&["en"], tempdir) - .version("0.3.0") - .build()? 
- .postprocess( - |mut buffer, writer| { - let mut tmp = Vec::new(); - buffer.read_to_end(&mut tmp)?; - Ok( - flate2::write::GzEncoder::new(writer, flate2::Compression::default()) - .write_all(&tmp)?, - ) - }, - |p| { - let mut path = p.as_os_str().to_os_string(); - path.push(".gz"); - path - }, - )?; - - assert_eq!( - builder.outputs(), - &[ - tempdir.join("en_tokenizer.bin.gz"), - tempdir.join("en_rules.bin.gz") - ] - ); - - let rules_path = tempdir - .join(Path::new(&rules_filename("en"))) - .with_extension("bin.gz"); - assert!(rules_path.exists()); - - let encoded = fs::read(rules_path)?; - let mut decoder = flate2::read::GzDecoder::new(&encoded[..]); - - let mut decoded = Vec::new(); - decoder.read_to_end(&mut decoded).unwrap(); - - let _ = nlprule_030::Rules::new_from(&mut decoded.as_slice()).unwrap(); - - Ok(()) - } - - #[test] - fn build_with_zstd_transform() -> Result<()> { - let tempdir = tempdir::TempDir::new("builder_test")?; - let tempdir = tempdir.path(); - - let builder = BinaryBuilder::new(&["en"], tempdir) - .version("0.3.0") - .transform( - |buffer, writer| { - let data = smush::encode(buffer, smush::Codec::Zstd, smush::Quality::Maximum)?; - writer.write_all(&data)?; - Ok(()) - }, - |p: PathBuf| { - let mut s = p.to_string_lossy().to_string(); - s.push_str(".zstd"); - Ok(PathBuf::from(s)) - }, - ) - .build()? - .postprocess( - |mut buffer, mut writer| { - let mut tmp = Vec::new(); - buffer.read_to_end(&mut tmp)?; - let data = smush::decode(tmp.as_slice(), smush::Codec::Zstd)?; - writer.write_all(data.as_slice())?; - Ok(()) - }, - |p| { - let path = p.to_string_lossy(); - assert!(path.ends_with(".zstd")); - let end = path.len().saturating_sub(".zstd".len()); - assert_ne!(end, 0); - path[..end].to_owned() - }, - )?; - - assert_eq!( - builder.outputs(), - &[ - tempdir.join("en_tokenizer.bin"), - tempdir.join("en_rules.bin") - ] - ); - - let rules_path = tempdir - .join(Path::new(&rules_filename("en"))) - .with_extension("bin"); - assert!(rules_path.is_file()); - - let _ = nlprule_030::Rules::new(rules_path).unwrap(); - Ok(()) - } -} diff --git a/nlprule/src/components/mod.rs b/nlprule/src/components/mod.rs index b1cda1d..8cdb152 100644 --- a/nlprule/src/components/mod.rs +++ b/nlprule/src/components/mod.rs @@ -12,7 +12,7 @@ pub mod rules; pub mod tagger; pub mod tokenizer; -pub trait Component: Serialize + DeserializeOwned { +pub trait Component: Serialize + DeserializeOwned + Clone { fn name() -> &'static str; fn new>(p: P) -> Result { diff --git a/nlprule/src/lang.rs b/nlprule/src/lang.rs index 3f30acc..f627fef 100644 --- a/nlprule/src/lang.rs +++ b/nlprule/src/lang.rs @@ -1,6 +1,16 @@ +use std::path::{Path, PathBuf}; + +const MANIFEST_DIR: &str = env!("CARGO_MANIFEST_DIR"); + +pub fn binary_path(lang_code: &str, name: &str) -> PathBuf { + Path::new(MANIFEST_DIR) + .join(lang_code) + .join(format!("{}.bin", name)) +} + #[allow(unused)] macro_rules! binary { - ($component: ty, $lang_code:literal, $binary_name:literal) => {{ + ($component: ty, $lang_code:literal, $name:literal) => {{ use crate::components::Component; let mut bytes: &'static [u8] = include_bytes!(concat!( @@ -8,7 +18,7 @@ macro_rules! 
binary { "/storage/", $lang_code, "/", - $binary_name, + $name, ".bin" )); diff --git a/nlprule/src/properties.rs b/nlprule/src/properties.rs index bb1f9b3..de151e0 100644 --- a/nlprule/src/properties.rs +++ b/nlprule/src/properties.rs @@ -118,7 +118,7 @@ pub mod transform { } } - #[derive(Serialize, Deserialize)] + #[derive(Serialize, Deserialize, Clone)] pub struct Pipeline(pub(super) T, pub(super) PropertiesMut); } @@ -168,13 +168,43 @@ pub mod tokenize { } } - #[derive(Serialize, Deserialize)] + #[derive(Serialize, Deserialize, Clone)] pub struct Pipeline(pub(super) T, pub(super) PropertiesMut); } -#[derive(Serialize, Deserialize)] +#[derive(Serialize, Deserialize, Clone)] pub struct Pipeline(T, PropertiesMut); +impl transform::Pipeline { + pub fn components(&self) -> &T { + &self.0 + } + + pub fn components_mut(&mut self) -> &mut T { + &mut self.0 + } +} + +impl tokenize::Pipeline { + pub fn components(&self) -> &T { + &self.0 + } + + pub fn components_mut(&mut self) -> &mut T { + &mut self.0 + } +} + +impl Pipeline { + pub fn components(&self) -> &T { + &self.0 + } + + pub fn components_mut(&mut self) -> &mut T { + &mut self.0 + } +} + #[derive(Error, Debug)] #[allow(missing_docs)] pub enum Error { diff --git a/nlprule/tests/tests.rs b/nlprule/tests/tests.rs index 73d7b23..6408c96 100644 --- a/nlprule/tests/tests.rs +++ b/nlprule/tests/tests.rs @@ -1,25 +1,26 @@ use std::convert::TryInto; use lazy_static::lazy_static; -use nlprule::{lang::en, properties::*}; +use nlprule::{lang::en, properties::*, rule::id::Category, types::Position}; use quickcheck_macros::quickcheck; +lazy_static! { + static ref ANALYZER: en::Analyzer = en::analyzer(); + static ref CORRECTER: en::Correcter = en::correcter(); +} + #[test] fn can_analyze_empty_text() { - let analyzer = en::analyzer(); - - let sentences: Vec<_> = analyzer.tokenize("").collect(); + let sentences: Vec<_> = ANALYZER.tokenize("").collect(); assert!(sentences.is_empty()); } #[test] fn handles_whitespace_correctly() { - let analyzer = en::analyzer(); - // preceding whitespace has to be included, trailing whitespace behavior is unspecified let text = " hello.\ttest.\t\t"; - let mut sentences = analyzer.tokenize(text); + let mut sentences = ANALYZER.tokenize(text); assert_eq!( &text[sentences.next().unwrap().span().byte().clone()], " hello.\t" @@ -31,121 +32,124 @@ fn handles_whitespace_correctly() { assert!(sentences.next().is_none()); } -// #[quickcheck] -// fn can_tokenize_anything(text: String) -> bool { -// let _: Vec<_> = TOKENIZER.pipe(&text).collect(); -// true -// } - -// #[test] -// fn suggest_indices_are_relative_to_input_text() { -// let suggestions = RULES -// .suggest( -// "I can due his homework for 10€. I can due his homework.", -// &*TOKENIZER, -// ) -// .unwrap(); - -// assert_eq!(*suggestions[0].span().char(), 6..9); -// assert_eq!(*suggestions[0].span().byte(), 6..9); - -// assert_eq!(*suggestions[1].span().char(), 38..41); -// assert_eq!( -// *suggestions[1].span().byte(), -// 38 + '€'.len_utf8() - 1..41 + '€'.len_utf8() - 1 -// ); -// } - -// #[test] -// fn sentence_spans_correct() { -// let text = "A short test. 
A test with emoji 😊."; - -// let sentences: Vec<_> = TOKENIZER.pipe(text).collect::>().unwrap(); -// assert_eq!(sentences.len(), 2); - -// assert_eq!(*sentences[0].span().char(), 0..14); -// assert_eq!(*sentences[0].span().byte(), 0..14); - -// assert_eq!(*sentences[1].span().char(), 14..34); -// assert_eq!(*sentences[1].span().byte(), 14..37); -// } - -// #[test] -// fn token_spans_correct() { -// let text = "A short test. A test with emoji 😊."; - -// let tokens: Vec<_> = TOKENIZER -// .pipe(text) -// .map(|x| x.into_iter()) -// .flatten() -// .collect(); -// assert_eq!(*tokens[0].span().byte(), 0..1); -// assert_eq!(*tokens[0].span().char(), 0..1); - -// assert_eq!(*tokens[2].span().char(), 8..12); -// assert_eq!(*tokens[2].span().byte(), 8..12); - -// assert_eq!(*tokens[tokens.len() - 2].span().char(), 32..33); -// assert_eq!(*tokens[tokens.len() - 2].span().byte(), 32..36); - -// assert_eq!(*tokens[tokens.len() - 1].span().char(), 33..34); -// assert_eq!(*tokens[tokens.len() - 1].span().byte(), 36..37); -// } - -// #[quickcheck] -// fn no_gaps_between_sentences(text: String) { -// let mut prev_pos = Position::default(); -// let mut contains_sentence = false; - -// for sentence in TOKENIZER.pipe(&text) { -// let sentence = sentence.unwrap(); - -// assert_eq!(sentence.span().start(), prev_pos); -// prev_pos += sentence.span().len(); - -// contains_sentence = true; -// } - -// assert_eq!(contains_sentence, !text.trim().is_empty()); -// } - -// #[test] -// fn rules_can_be_disabled_enabled() { -// let mut rules = Rules::new(RULES_PATH).unwrap(); - -// // enabled by default -// assert!(!rules -// .suggest("I can due his homework", &*TOKENIZER) -// .unwrap() -// .is_empty()); - -// rules -// .select_mut( -// &Category::new("confused_words") -// .join("confusion_due_do") -// .into(), -// ) -// .for_each(|x| x.disable()); - -// // disabled now -// assert!(rules -// .suggest("I can due his homework", &*TOKENIZER) -// .unwrap() -// .is_empty()); - -// // disabled by default -// assert!(rules -// .suggest("I can not go", &*TOKENIZER) -// .unwrap() -// .is_empty()); - -// rules -// .select_mut(&"typos/can_not".try_into().unwrap()) -// .for_each(|x| x.enable()); - -// // enabled now -// assert!(!rules -// .suggest("I can not go", &*TOKENIZER) -// .unwrap() -// .is_empty()); -// } +#[quickcheck] +fn can_analyze_anything(text: String) -> bool { + let _: Vec<_> = ANALYZER.tokenize(&text).collect(); + true +} + +#[test] +fn suggest_indices_are_relative_to_input_text() { + let suggestions: Vec<_> = CORRECTER + .suggest("I can due his homework for 10€. I can due his homework.") + .flatten() + .collect(); + + assert_eq!(*suggestions[0].span().char(), 6..9); + assert_eq!(*suggestions[0].span().byte(), 6..9); + + assert_eq!(*suggestions[1].span().char(), 38..41); + assert_eq!( + *suggestions[1].span().byte(), + 38 + '€'.len_utf8() - 1..41 + '€'.len_utf8() - 1 + ); +} + +#[test] +fn sentence_spans_correct() { + let text = "A short test. A test with emoji 😊."; + + let sentences: Vec<_> = ANALYZER.tokenize(text).collect(); + assert_eq!(sentences.len(), 2); + + assert_eq!(*sentences[0].span().char(), 0..14); + assert_eq!(*sentences[0].span().byte(), 0..14); + + assert_eq!(*sentences[1].span().char(), 14..34); + assert_eq!(*sentences[1].span().byte(), 14..37); +} + +#[test] +fn token_spans_correct() { + let text = "A short test. 
A test with emoji 😊."; + + let tokens: Vec<_> = ANALYZER + .tokenize(text) + .map(|x| x.into_iter()) + .flatten() + .collect(); + assert_eq!(*tokens[0].span().byte(), 0..1); + assert_eq!(*tokens[0].span().char(), 0..1); + + assert_eq!(*tokens[2].span().char(), 8..12); + assert_eq!(*tokens[2].span().byte(), 8..12); + + assert_eq!(*tokens[tokens.len() - 2].span().char(), 32..33); + assert_eq!(*tokens[tokens.len() - 2].span().byte(), 32..36); + + assert_eq!(*tokens[tokens.len() - 1].span().char(), 33..34); + assert_eq!(*tokens[tokens.len() - 1].span().byte(), 36..37); +} + +#[quickcheck] +fn no_gaps_between_sentences(text: String) { + let mut prev_pos = Position::default(); + let mut contains_sentence = false; + + for sentence in ANALYZER.tokenize(&text) { + assert_eq!(sentence.span().start(), prev_pos); + prev_pos += sentence.span().len(); + + contains_sentence = true; + } + + assert_eq!(contains_sentence, !text.trim().is_empty()); +} + +#[test] +fn rules_can_be_disabled_enabled() { + let mut correcter = CORRECTER.clone(); + + // enabled by default + assert!(correcter + .suggest("I can due his homework") + .flatten() + .next() + .is_some()); + + correcter + .components_mut() + .1 + .select_mut( + &Category::new("confused_words") + .join("confusion_due_do") + .into(), + ) + .for_each(|x| x.disable()); + + // disabled now + assert!(correcter + .suggest("I can due his homework") + .flatten() + .next() + .is_none()); + + // disabled by default + assert!(correcter.suggest("I can not go").flatten().next().is_none()); + + correcter + .components_mut() + .1 + .select_mut(&"typos/can_not".try_into().unwrap()) + .for_each(|x| x.enable()); + + // enabled now + assert!(correcter.suggest("I can not go").flatten().next().is_some()); +} + +#[test] +fn pipelines_work_with_references() -> Result<(), crate::Error> { + let _pipeline = Pipeline::new((&*ANALYZER, &CORRECTER.components().1))?; + + Ok(()) +}
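Taken together, this series replaces the `nlprule-build` fetch-and-compile workflow with per-language modules (`nlprule::lang`) that embed the binaries behind `binaries-<lang>` features. Below is a minimal usage sketch, assuming a dependency on `nlprule` built with the `binaries-en` feature; it relies only on calls exercised by the new tests and test binaries above (`en::analyzer()`, `en::correcter()`, `tokenize`, `suggest`, `span`, `replacements`), and exact signatures may differ from the final API.

use nlprule::lang::en;

fn main() {
    // Analysis: sentence segmentation, tagging and chunking with the embedded English binaries.
    let analyzer = en::analyzer();
    for sentence in analyzer.tokenize("A short test. A test with emoji 😊.") {
        // Char spans are relative to the input text, as checked by `sentence_spans_correct`.
        println!("sentence span: {:?}", sentence.span().char());
    }

    // Correction: suggestions are yielded per sentence, so flatten over the whole text,
    // mirroring `suggest_indices_are_relative_to_input_text`.
    let correcter = en::correcter();
    for suggestion in correcter.suggest("I can due his homework").flatten() {
        println!("{:?}: {:?}", suggestion.span().char(), suggestion.replacements());
    }
}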