Skip to content

Commit

Permalink
Support traditional Chinese for full text search (#1378)
Browse files Browse the repository at this point in the history
### What problem does this PR solve?

Uses OpenCC to convert traditional Chinese to simplified
Chinese.
(An older version of OpenCC is vendored because it has far fewer dependencies.)

Issue:#1376

### Type of change

- [x] New Feature (non-breaking change which adds functionality)
  • Loading branch information
yingfeng authored Jun 24, 2024
1 parent 13b0774 commit 9fded40
Show file tree
Hide file tree
Showing 33 changed files with 3,068 additions and 1 deletion.
6 changes: 6 additions & 0 deletions benchmark/local_infinity/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@ target_link_libraries(
lz4.a
atomic.a
jma
opencc
)

# ########################################
Expand All @@ -38,6 +39,7 @@ target_link_libraries(
lz4.a
atomic.a
jma
opencc
)

# query benchmark
Expand All @@ -58,6 +60,7 @@ target_link_libraries(
lz4.a
atomic.a
jma
opencc
)

# ########################################
Expand All @@ -80,6 +83,7 @@ target_link_libraries(
lz4.a
atomic.a
jma
opencc
)

# ########################################
Expand All @@ -100,6 +104,7 @@ target_link_libraries(
lz4.a
atomic.a
jma
opencc
)

add_executable(bmp_benchmark
Expand All @@ -119,6 +124,7 @@ target_link_libraries(
lz4.a
atomic.a
jma
opencc
)

if(ENABLE_JEMALLOC)
Expand Down
1 change: 1 addition & 0 deletions benchmark/remote_infinity/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@ target_link_libraries(
atomic.a
thrift.a
jma
opencc
)

if(ENABLE_JEMALLOC)
Expand Down
5 changes: 5 additions & 0 deletions src/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -234,6 +234,7 @@ target_include_directories(infinity_core PUBLIC "${CMAKE_SOURCE_DIR}/third_party
target_include_directories(infinity_core PUBLIC "${CMAKE_SOURCE_DIR}/third_party/base64/include")
target_include_directories(infinity_core PUBLIC "${CMAKE_SOURCE_DIR}/third_party/oatpp/src")
target_include_directories(infinity_core PUBLIC "${CMAKE_SOURCE_DIR}/third_party/eigen-3.4.0")
target_include_directories(infinity_core PUBLIC "${CMAKE_SOURCE_DIR}/third_party/opencc")

if (NOT SUPPORT_FMA EQUAL 0)
message(FATAL_ERROR "This project requires the processor support fused multiply-add (FMA) instructions.")
Expand Down Expand Up @@ -288,6 +289,7 @@ target_link_libraries(infinity
event.a
oatpp.a
jma
opencc
)

if (ENABLE_JEMALLOC)
Expand Down Expand Up @@ -336,6 +338,7 @@ target_link_libraries(embedded_infinity_ext PRIVATE
event.a
oatpp.a
jma
opencc
)

#if ("${CMAKE_BUILD_TYPE}" STREQUAL "Debug")
Expand Down Expand Up @@ -460,6 +463,7 @@ target_link_libraries(unit_test
atomic.a
thrift.a
jma
opencc
)

target_link_directories(unit_test PUBLIC "${CMAKE_BINARY_DIR}/lib")
Expand All @@ -484,6 +488,7 @@ target_include_directories(unit_test PUBLIC "${CMAKE_SOURCE_DIR}/third_party/thr
target_include_directories(unit_test PUBLIC "${CMAKE_BINARY_DIR}/third_party/thrift/")
target_include_directories(unit_test PUBLIC "${CMAKE_SOURCE_DIR}/third_party/pgm/include")
target_include_directories(unit_test PUBLIC "${CMAKE_SOURCE_DIR}/third_party/googletest/googletest/include")
target_include_directories(unit_test PUBLIC "${CMAKE_SOURCE_DIR}/third_party/opencc")

# target_compile_options(unit_test PRIVATE $<$<COMPILE_LANGUAGE:CXX>:-mavx2 -mfma -mf16c -mpopcnt>)
if (SUPPORT_AVX2 EQUAL 0 OR SUPPORT_AVX512 EQUAL 0)
Expand Down
34 changes: 34 additions & 0 deletions src/common/analyzer/analyzer_pool.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@ import config;
import infinity_context;
import analyzer;
import chinese_analyzer;
import traditional_chinese_analyzer;
import japanese_analyzer;
import standard_analyzer;
import ngram_analyzer;
Expand Down Expand Up @@ -72,6 +73,39 @@ Tuple<UniquePtr<Analyzer>, Status> AnalyzerPool::GetAnalyzer(const std::string_v
analyzer->SetCutGrain(cut_grain);
return {std::move(analyzer), Status::OK()};
}
case Str2Int(TRADITIONALCHINESE.data()): {
    // tradition-{coarse|fine}
    Analyzer *prototype = cache_[TRADITIONALCHINESE].get();
    if (prototype == nullptr) {
        // No prototype cached yet: build one, load its resources and store it in cache_.
        String path;
        Config *config = InfinityContext::instance().config();
        if (config == nullptr) {
            // InfinityContext has not been initialized.
            path = "/var/infinity/resource";
        } else {
            path = config->ResourcePath();
        }
        UniquePtr<TraditionalChineseAnalyzer> analyzer = MakeUnique<TraditionalChineseAnalyzer>(std::move(path));
        Status load_status = analyzer->Load();
        if (!load_status.ok()) {
            return {nullptr, load_status};
        }
        prototype = analyzer.get();
        cache_[TRADITIONALCHINESE] = std::move(analyzer);
    }
    // Everything from the first '-' onwards selects the cut grain; only the exact suffix "-fine"
    // switches to fine grain. The search is bounded by the view's length, so it does not rely on
    // name.data() being NUL-terminated.
    CutGrain cut_grain = CutGrain::kCoarse;
    const auto dash_pos = name.find('-');
    if (dash_pos != std::string_view::npos && name.substr(dash_pos) == "-fine") {
        cut_grain = CutGrain::kFine;
    }
    // The prototype stored under TRADITIONALCHINESE is always created above as a
    // TraditionalChineseAnalyzer, so a static downcast is sufficient. Callers get a copy of the
    // prototype rather than the cached instance itself.
    UniquePtr<TraditionalChineseAnalyzer> analyzer =
        MakeUnique<TraditionalChineseAnalyzer>(*static_cast<TraditionalChineseAnalyzer *>(prototype));
    analyzer->SetCutGrain(cut_grain);
    return {std::move(analyzer), Status::OK()};
}
case Str2Int(JAPANESE.data()): {
Analyzer *prototype = cache_[JAPANESE].get();
if (prototype == nullptr) {
Expand Down
1 change: 1 addition & 0 deletions src/common/analyzer/analyzer_pool.cppm
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,7 @@ public:

public:
static constexpr std::string_view CHINESE = "chinese";
static constexpr std::string_view TRADITIONALCHINESE = "tradition";
static constexpr std::string_view JAPANESE = "japanese";
static constexpr std::string_view STANDARD = "standard";
static constexpr std::string_view NGRAM = "ngram";
Expand Down
2 changes: 1 addition & 1 deletion src/common/analyzer/chinese_analyzer.cppm
Original file line number Diff line number Diff line change
Expand Up @@ -54,7 +54,7 @@ private:

bool DoNext();

private:
protected:
cppjieba::Jieba *jieba_{nullptr};
String dict_path_;
bool own_jieba_{};
Expand Down
73 changes: 73 additions & 0 deletions src/common/analyzer/traditional_chinese_analyzer.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,73 @@
// Copyright(C) 2023 InfiniFlow, Inc. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// https://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

module;

#include <cstring>
#include <filesystem>

#include <openccxx.h>

import stl;
import logger;
import third_party;
import chinese_analyzer;

module traditional_chinese_analyzer;

namespace fs = std::filesystem;

namespace infinity {
// Path component appended to dict_path_ in Load() to locate the OpenCC resources
// that are handed to the OpenCC constructor.
static const String OPENCC_PATH = "opencc";

/// Forwards `path` to the ChineseAnalyzer base. No OpenCC converter is created here;
/// opencc_ stays null until Load() succeeds.
TraditionalChineseAnalyzer::TraditionalChineseAnalyzer(const String &path)
    : ChineseAnalyzer(path) {
}

/// Copy constructor: creates an analyzer that borrows `other`'s OpenCC converter.
///
/// The copy shares other.opencc_ without taking ownership (own_opencc_ is false), so the
/// destructor of the copy never deletes the converter. The analyzer that actually owns the
/// converter must therefore outlive every copy made from it.
///
/// Previously opencc_ was left at its default nullptr in the copy, which made Parse()
/// dereference a null pointer on any copied analyzer.
TraditionalChineseAnalyzer::TraditionalChineseAnalyzer(const TraditionalChineseAnalyzer &other)
    : ChineseAnalyzer(other), opencc_(other.opencc_), own_opencc_(false) {}

/// Releases the OpenCC converter, but only when this instance is the one that owns it.
TraditionalChineseAnalyzer::~TraditionalChineseAnalyzer() {
    if (!own_opencc_) {
        // Converter (if any) belongs to another analyzer; leave it alone.
        return;
    }
    delete opencc_;
}

/// Loads the base ChineseAnalyzer resources and then creates the OpenCC converter from the
/// "opencc" entry under dict_path_.
///
/// @return Status::OK() on success; Status::InvalidAnalyzerFile if the base load throws, the
///         OpenCC path does not exist, or constructing the converter throws.
Status TraditionalChineseAnalyzer::Load() {
    // NOTE(review): whatever ChineseAnalyzer::Load() returns is discarded here; only a thrown
    // std::exception is treated as failure. Confirm whether the base reports errors by return value.
    try {
        ChineseAnalyzer::Load();
    } catch (const std::exception &) {
        return Status::InvalidAnalyzerFile("Failed to load jieba analyzer");
    }

    fs::path opencc_dir = fs::path(dict_path_) / OPENCC_PATH;
    if (!fs::exists(opencc_dir)) {
        return Status::InvalidAnalyzerFile(opencc_dir);
    }

    try {
        opencc_ = new ::OpenCC(opencc_dir.string());
    } catch (const std::exception &) {
        return Status::InvalidAnalyzerFile("Failed to load OpenCC");
    }

    // This instance created the converter, so it is responsible for deleting it.
    own_opencc_ = true;
    return Status::OK();
}

void TraditionalChineseAnalyzer::Parse(const String &input) {
String out;
opencc_->convert(input, out);
ChineseAnalyzer::Parse(out);
}

} // namespace infinity
44 changes: 44 additions & 0 deletions src/common/analyzer/traditional_chinese_analyzer.cppm
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
// Copyright(C) 2023 InfiniFlow, Inc. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// https://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

module;

export module traditional_chinese_analyzer;

import stl;
import status;
import chinese_analyzer;

class OpenCC;

namespace infinity {

/// Chinese analyzer variant that first passes its input through an OpenCC converter
/// (traditional -> simplified Chinese) and then delegates to ChineseAnalyzer.
///
/// Ownership of the converter is tracked by own_opencc_: only the instance on which Load()
/// succeeded deletes it; copy-constructed instances never own it.
export class TraditionalChineseAnalyzer : public ChineseAnalyzer {
public:
    /// @param path forwarded unchanged to the ChineseAnalyzer constructor.
    TraditionalChineseAnalyzer(const String &path);

    /// Copy-constructs from `other`; the new instance does not own an OpenCC converter.
    TraditionalChineseAnalyzer(const TraditionalChineseAnalyzer &other);

    // Copy assignment is deleted: the implicitly generated operator would copy both the raw
    // opencc_ pointer and own_opencc_, so two instances could end up deleting the same converter.
    TraditionalChineseAnalyzer &operator=(const TraditionalChineseAnalyzer &) = delete;

    /// Deletes the converter only if this instance owns it.
    ~TraditionalChineseAnalyzer();

    /// Loads the base analyzer and creates the OpenCC converter; returns a non-OK Status on failure.
    Status Load();

protected:
    /// Converts `input` with OpenCC, then calls ChineseAnalyzer::Parse on the result.
    void Parse(const String &input) override;

private:
    // Converter instance; null until Load() succeeds.
    OpenCC *opencc_{nullptr};
    // True only when this instance allocated opencc_ and must delete it.
    bool own_opencc_{false};
};
} // namespace infinity
5 changes: 5 additions & 0 deletions third_party/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -70,3 +70,8 @@ add_subdirectory(fastpfor)
################################################################################
add_subdirectory(ijma)

################################################################################
### opencc
################################################################################
add_subdirectory(opencc)

12 changes: 12 additions & 0 deletions third_party/opencc/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
project(OpenCC CXX C)

# Collect every vendored OpenCC C/C++ source below this directory.
file(GLOB_RECURSE OPENCC_SRC
    "${CMAKE_CURRENT_SOURCE_DIR}/*.cpp"
    "${CMAKE_CURRENT_SOURCE_DIR}/*.c"
)

# library target
add_library(opencc STATIC ${OPENCC_SRC})

# -march=native is a code-generation option, not a preprocessor definition, so it is set with
# target_compile_options (scoped to this target) instead of add_definitions.
target_compile_options(opencc PRIVATE -march=native)

# The static archive must be position independent so it can be linked into shared objects.
set_target_properties(opencc PROPERTIES POSITION_INDEPENDENT_CODE TRUE)
Loading

0 comments on commit 9fded40

Please sign in to comment.