Skip to content

Commit

Permalink
Merge remote-tracking branch 'upstream/main' into fix472
Browse files Browse the repository at this point in the history
  • Loading branch information
JinHai-CN committed Jan 22, 2025
2 parents e65ab96 + bb2bc8d commit 793aae7
Show file tree
Hide file tree
Showing 45 changed files with 1,423 additions and 538 deletions.
20 changes: 16 additions & 4 deletions src/common/analyzer/analyzer.cppm
Original file line number Diff line number Diff line change
Expand Up @@ -49,11 +49,23 @@ public:
}

protected:
typedef void (*HookType)(void *data, const char *text, const u32 len, const u32 offset, const u32 end_offset, const bool is_special_char);
typedef void (*HookType)(void *data,
const char *text,
const u32 len,
const u32 offset,
const u32 end_offset,
const bool is_special_char,
const u16 payload);

virtual int AnalyzeImpl(const Term &input, void *data, HookType func) { return -1; }

static void AppendTermList(void *data, const char *text, const u32 len, const u32 offset, const u32 end_offset, const bool is_special_char) {
static void AppendTermList(void *data,
const char *text,
const u32 len,
const u32 offset,
const u32 end_offset,
const bool is_special_char,
const u16 payload) {
void **parameters = (void **)data;
TermList *output = (TermList *)parameters[0];
Analyzer *analyzer = (Analyzer *)parameters[1];
Expand All @@ -62,9 +74,9 @@ protected:
return;
if (is_special_char && analyzer->convert_to_placeholder_) {
if (output->empty() == true || output->back().text_.compare(PLACE_HOLDER) != 0)
output->Add(PLACE_HOLDER.c_str(), PLACE_HOLDER.length(), offset, end_offset);
output->Add(PLACE_HOLDER.c_str(), PLACE_HOLDER.length(), offset, end_offset, payload);
} else {
output->Add(text, len, offset, end_offset);
output->Add(text, len, offset, end_offset, payload);
}
}

Expand Down
4 changes: 4 additions & 0 deletions src/common/analyzer/analyzer_pool.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,7 @@ import ngram_analyzer;
import rag_analyzer;
import whitespace_analyzer;
import ik_analyzer;
import rank_features_analyzer;
import logger;

namespace infinity {
Expand Down Expand Up @@ -330,6 +331,9 @@ Tuple<UniquePtr<Analyzer>, Status> AnalyzerPool::GetAnalyzer(const std::string_v
}
return {MakeUnique<WhitespaceAnalyzer>(name.substr(suffix_pos + 1)), Status::OK()};
}
case Str2Int(RANKFEATURES.data()): {
return {MakeUnique<RankFeaturesAnalyzer>(), Status::OK()};
}
default: {
if (std::filesystem::is_regular_file(name)) {
// Suppose it is a customized Python script analyzer
Expand Down
1 change: 1 addition & 0 deletions src/common/analyzer/analyzer_pool.cppm
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,7 @@ public:
static constexpr std::string_view IK = "ik";
static constexpr std::string_view KEYWORD = "keyword";
static constexpr std::string_view WHITESPACE = "whitespace";
static constexpr std::string_view RANKFEATURES = "rankfeatures";

private:
CacheType cache_{};
Expand Down
20 changes: 10 additions & 10 deletions src/common/analyzer/common_analyzer.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -52,12 +52,12 @@ int CommonLanguageAnalyzer::AnalyzeImpl(const Term &input, void *data, HookType

if (is_index_) {
if (IsSpecialChar()) {
func(data, token_, len_, offset_, end_offset_, true);
func(data, token_, len_, offset_, end_offset_, true, 0);
temp_offset = offset_;
continue;
}
if (is_raw_) {
func(data, token_, len_, offset_, end_offset_, false);
func(data, token_, len_, offset_, end_offset_, false, 0);
temp_offset = offset_;
continue;
}
Expand All @@ -77,37 +77,37 @@ int CommonLanguageAnalyzer::AnalyzeImpl(const Term &input, void *data, HookType
bool lowercase_is_different = memcmp(token_, lowercase_term, len_) != 0;

if (stemming_term_str_size && stem_only_) {
func(data, stem_term.c_str(), stemming_term_str_size, offset_, end_offset_, false);
func(data, stem_term.c_str(), stemming_term_str_size, offset_, end_offset_, false, 0);
temp_offset = offset_;
} else if (stemming_term_str_size || (case_sensitive_ && contain_lower_ && lowercase_is_different)) {
/// have more than one output
if (case_sensitive_) {
func(data, token_, len_, offset_, end_offset_, false);
func(data, token_, len_, offset_, end_offset_, false, 0);
temp_offset = offset_;
} else {
func(data, lowercase_term, len_, offset_, end_offset_, false);
func(data, lowercase_term, len_, offset_, end_offset_, false, 0);
temp_offset = offset_;
}
if (stemming_term_str_size) {
func(data, stem_term.c_str(), stemming_term_str_size, offset_, end_offset_, false);
func(data, stem_term.c_str(), stemming_term_str_size, offset_, end_offset_, false, 0);
temp_offset = offset_;
}
if (case_sensitive_ && contain_lower_ && lowercase_is_different) {
func(data, lowercase_term, len_, offset_, end_offset_, false);
func(data, lowercase_term, len_, offset_, end_offset_, false, 0);
temp_offset = offset_;
}
} else {
/// have only one output
if (case_sensitive_) {
func(data, token_, len_, offset_, end_offset_, false);
func(data, token_, len_, offset_, end_offset_, false, 0);
temp_offset = offset_;
} else {
func(data, lowercase_term, len_, offset_, end_offset_, false);
func(data, lowercase_term, len_, offset_, end_offset_, false, 0);
temp_offset = offset_;
}
}
} else {
func(data, token_, len_, offset_, end_offset_, false);
func(data, token_, len_, offset_, end_offset_, false, 0);
temp_offset = offset_;
}
}
Expand Down
2 changes: 1 addition & 1 deletion src/common/analyzer/ik/ik_analyzer.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -90,7 +90,7 @@ int IKAnalyzer::AnalyzeImpl(const Term &input, void *data, HookType func) {
while ((lexeme = context_->GetNextLexeme()) != nullptr) {
std::wstring text = lexeme->GetLexemeText();
String token = CharacterUtil::UTF16ToUTF8(text);
func(data, token.c_str(), token.size(), offset++, 0, false);
func(data, token.c_str(), token.size(), offset++, 0, false, 0);
delete lexeme;
};
return 0;
Expand Down
2 changes: 1 addition & 1 deletion src/common/analyzer/ngram_analyzer.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -59,7 +59,7 @@ int NGramAnalyzer::AnalyzeImpl(const Term &input, void *data, HookType func) {
while (cur < len && NextInString(input.text_.c_str(), len, &cur, &token_start, &token_length)) {
if (token_length == 0)
continue;
func(data, input.text_.c_str() + token_start, token_length, offset, offset + token_length, false);
func(data, input.text_.c_str() + token_start, token_length, offset, offset + token_length, false, 0);
offset++;
}

Expand Down
2 changes: 1 addition & 1 deletion src/common/analyzer/rag_analyzer.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1438,7 +1438,7 @@ int RAGAnalyzer::AnalyzeImpl(const Term &input, void *data, HookType func) {
Split(output, blank_pattern_, tokens);
unsigned offset = 0;
for (auto &t : tokens) {
func(data, t.c_str(), t.size(), offset++, 0, false);
func(data, t.c_str(), t.size(), offset++, 0, false, 0);
}
return 0;
}
Expand Down
52 changes: 52 additions & 0 deletions src/common/analyzer/rank_features_analyzer.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,52 @@
// Copyright(C) 2025 InfiniFlow, Inc. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// https://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

module;

#include <string>
module rank_features_analyzer;
import stl;
import term;
import analyzer;
import third_party;

namespace infinity {

/// Converts a floating-point weight into an unsigned 16-bit payload.
///
/// The value is clamped to the range [0, 65535] and its fractional part is
/// truncated toward zero. NaN is mapped to 0.
///
/// @param value weight to convert; may be negative, infinite or NaN.
/// @return the clamped, truncated weight.
u16 FloatToU16(float value) {
    // Negated comparison on purpose: every ordered comparison with NaN is
    // false, so `!(value > 0)` catches NaN together with zero and negatives.
    // Letting NaN reach the static_cast below would be undefined behaviour.
    if (!(value > 0.0f)) {
        return 0;
    }
    if (value >= 65535.0f) {
        return 65535;
    }
    // Here 0 < value < 65535, so the truncated result always fits in u16.
    return static_cast<u16>(value);
}

int RankFeaturesAnalyzer::AnalyzeImpl(const Term &input, void *data, HookType func) {
nlohmann::json line_json = nlohmann::json::parse(input.text_);
u32 offset = 0;
for (auto iter = line_json.begin(); iter != line_json.end(); ++iter) {
String key = iter.key();
String value = iter.value();
float target = 0;
try {
target = std::stof(value);
} catch (const std::exception &e) {
}
u16 weight = FloatToU16(target);
func(data, key.data(), key.size(), offset++, 0, false, weight);
}

return 0;
}

} // namespace infinity
35 changes: 35 additions & 0 deletions src/common/analyzer/rank_features_analyzer.cppm
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
// Copyright(C) 2025 InfiniFlow, Inc. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// https://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

module;

export module rank_features_analyzer;
import stl;
import term;
import analyzer;

namespace infinity {

/// Analyzer for rank-feature text.
///
/// The input text is treated as a JSON document; AnalyzeImpl emits one term
/// per key and passes the value associated with that key to the hook as a
/// 16-bit payload. The class holds no state of its own.
export class RankFeaturesAnalyzer : public Analyzer {
public:
    RankFeaturesAnalyzer() = default;
    ~RankFeaturesAnalyzer() override = default;

protected:
    /// Parses `input` and reports each produced term through `func`.
    ///
    /// @param input term whose text is analyzed.
    /// @param data  opaque pointer handed back to `func` on every call.
    /// @param func  hook invoked once per produced term.
    /// @return status code of the analysis (0 on success).
    int AnalyzeImpl(const Term &input, void *data, HookType func) override;
};

} // namespace infinity
9 changes: 6 additions & 3 deletions src/common/analyzer/term.cppm
Original file line number Diff line number Diff line change
Expand Up @@ -41,11 +41,12 @@ public:

export class TermList : public Deque<Term> {
public:
void Add(const char *text, const u32 len, const u32 offset, const u32 end_offset) {
void Add(const char *text, const u32 len, const u32 offset, const u32 end_offset, const u16 payload = 0) {
push_back(global_temporary_);
back().text_.assign(text, len);
back().word_offset_ = offset;
back().end_offset_ = end_offset;
back().payload_ = payload;
}

void Add(cppjieba::Word &cut_word) {
Expand All @@ -54,18 +55,20 @@ public:
back().word_offset_ = cut_word.offset;
}

void Add(const String &token, const u32 offset, const u32 end_offset) {
void Add(const String &token, const u32 offset, const u32 end_offset, const u16 payload = 0) {
push_back(global_temporary_);
back().text_ = token;
back().word_offset_ = offset;
back().end_offset_ = end_offset;
back().payload_ = payload;
}

void Add(String &token, const u32 offset, const u32 end_offset) {
void Add(String &token, const u32 offset, const u32 end_offset, const u16 payload = 0) {
push_back(global_temporary_);
std::swap(back().text_, token);
back().word_offset_ = offset;
back().end_offset_ = end_offset;
back().payload_ = payload;
}

private:
Expand Down
6 changes: 3 additions & 3 deletions src/common/analyzer/whitespace_analyzer.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,7 @@ int WhitespaceAnalyzer::AnalyzeImpl(const Term &input, void *data, HookType func
std::string t;
u32 offset = 0;
while (is >> t) {
func(data, t.data(), t.size(), offset++, 0, false);
func(data, t.data(), t.size(), offset++, 0, false, 0);
}
return 0;
} else {
Expand All @@ -49,11 +49,11 @@ int WhitespaceAnalyzer::AnalyzeImpl(const Term &input, void *data, HookType func
while (search_start < input_text.size()) {
const auto found = input_text.find_first_of(delimiters, search_start);
if (found == std::string_view::npos) {
func(data, input_text.data() + search_start, input_text.size() - search_start, offset++, 0, false);
func(data, input_text.data() + search_start, input_text.size() - search_start, offset++, 0, false, 0);
break;
}
if (found > search_start) {
func(data, input_text.data() + search_start, found - search_start, offset++, 0, false);
func(data, input_text.data() + search_start, found - search_start, offset++, 0, false, 0);
}
search_start = found + 1;
}
Expand Down
Loading

0 comments on commit 793aae7

Please sign in to comment.