Skip to content

Commit

Permalink
Replace the fine grained Chinese analyzer with a new one (#1370)
Browse files Browse the repository at this point in the history
### What problem does this PR solve?

Use a modified jieba query segmentation for the fine-grained Chinese analyzer.

Issue: #1308

### Type of change

- [x] New Feature (non-breaking change which adds functionality)
  • Loading branch information
yingfeng authored Jun 21, 2024
1 parent c4c149a commit 5b1e692
Show file tree
Hide file tree
Showing 2 changed files with 24 additions and 5 deletions.
2 changes: 1 addition & 1 deletion src/common/analyzer/chinese_analyzer.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -111,7 +111,7 @@ void ChineseAnalyzer::Parse(const String &input) {
if (cut_grain_ == CutGrain::kCoarse)
jieba_->Cut(input, cut_words_, true);
else
jieba_->CutHMM(input, cut_words_);
jieba_->CutForSearch(input, cut_words_);
local_offset_ = -1;
cursor_ = -1;
cut_size_ = cut_words_.size();
Expand Down
27 changes: 23 additions & 4 deletions third_party/cppjieba/include/cppjieba/QuerySegment.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -39,26 +39,45 @@ class QuerySegment: public SegmentBase {
wrs.reserve(sentence.size()/2);
while (pre_filter.HasNext()) {
range = pre_filter.Next();
Cut(range.begin, range.end, wrs, hmm);
Cut(sentence, range.begin, range.end, wrs, hmm);
}
words.clear();
words.reserve(wrs.size());
GetWordsFromWordRanges(sentence, wrs, words);
}
void Cut(RuneStrArray::const_iterator begin, RuneStrArray::const_iterator end, vector<WordRange>& res, bool hmm) const {
//use mix Cut first

bool IsAlphaDigit(const string& str) const {
bool ret = true;
for(unsigned i = 0; i < str.size(); i++) {
ret &= (std::isalpha(str[i])||std::isdigit(str[i]));
}
return ret;
}

void Cut(const string &sentence, RuneStrArray::const_iterator begin, RuneStrArray::const_iterator end, vector<WordRange> &res, bool hmm) const {
// use mix Cut first
vector<WordRange> mixRes;
mixSeg_.Cut(begin, end, mixRes, hmm);

vector<WordRange> fullRes;
for (vector<WordRange>::const_iterator mixResItr = mixRes.begin(); mixResItr != mixRes.end(); mixResItr++) {
if (mixResItr->Length() > 2) {
RuneStrArray::const_iterator left = mixResItr->left;
RuneStrArray::const_iterator right = mixResItr->right;
uint32_t len = right->offset - left->offset + right->len;
string str = sentence.substr(left->offset, len);
if(IsAlphaDigit(str)) {
res.push_back(*mixResItr);
continue;
}
for (size_t i = 0; i + 1 < mixResItr->Length(); i++) {
WordRange wr(mixResItr->left + i, mixResItr->left + i + 1);
if (trie_->Find(wr.left, wr.right + 1) != NULL) {
res.push_back(wr);
}
}
} else {
res.push_back(*mixResItr);
}
if (mixResItr->Length() > 3) {
for (size_t i = 0; i + 2 < mixResItr->Length(); i++) {
Expand All @@ -68,9 +87,9 @@ class QuerySegment: public SegmentBase {
}
}
}
res.push_back(*mixResItr);
}
}

private:
bool IsAllAscii(const Unicode& s) const {
for(size_t i = 0; i < s.size(); i++) {
Expand Down

0 comments on commit 5b1e692

Please sign in to comment.