Commit b79a770

Added Chinese support (infiniflow#1108)

Added Chinese support

### Type of change

- [x] New Feature (non-breaking change which adds functionality)
- [x] Refactoring

1 parent 3823c87 · commit b79a770

17 files changed, +348 −271 lines

python/benchmark/legacy_benchmark/test_remote_benchmark_basic.py (+52, −77)

@@ -12,7 +12,6 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 import functools
-import multiprocessing
 import threading
 import time
 import traceback
@@ -37,37 +36,22 @@ def wrapped_func(*args, **kwargs):
     return wrapped_func


-@trace_unhandled_exceptions
-def go():
-    print(1)
-    raise Exception()
-    print(2)
+def worker_thread(thread_id, num_iterations, some_function, ip='127.0.0.1', port=9090):
+    infinity_obj = infinity.connect(NetworkAddress(ip, port))
+    try:
+        for j in range(num_iterations):
+            some_function(infinity_obj, port, thread_id, j)
+    except Exception as e:
+        print(f"Exception: {e}")
+    finally:
+        infinity_obj.disconnect()


-def test():
-    p = multiprocessing.Pool(1)
-
-    p.apply_async(go)
-    p.close()
-    p.join()
-
-
-def worker_thread(process_id, thread_id, num_iterations, some_function, ip='0.0.0.0', port=9090):
-    for j in range(num_iterations):
-        infinity_obj = infinity.connect(NetworkAddress(ip, port))
-        try:
-            some_function(infinity_obj, port, process_id, thread_id, j)
-        except Exception as e:
-            print(f"Exception: {e}")
-        finally:
-            infinity_obj.disconnect()
-
-
-def worker_internal_connection(process_id, num_threads, num_iterations, some_function, ip=None, port=None):
+def worker_internal_connection(num_threads, num_iterations, some_function, ip=None, port=None):
     threads = []
     for j in range(num_threads):
         thread = threading.Thread(target=worker_thread, args=(
-            process_id, j, num_iterations, some_function, ip, port))
+            j, num_iterations, some_function, ip, port))
         threads.append(thread)
         thread.start()

@@ -76,38 +60,29 @@ def worker_internal_connection(process_id, num_threads, num_iterations, some_fun
         thread.join()


-def measure_time_internal(num_processes, num_threads, num_times, some_function, ip=None, port=None):
-    # Calculate how many iterations each process should do
-    num_iterations = num_times // num_processes // num_threads
+def measure_time_internal(num_threads, num_times, some_function, ip=None, port=None):
+    # Calculate how many iterations each thread should do
+    num_iterations = num_times // num_threads

     start_time = time.perf_counter()
-    processes = []
-    for i in range(num_processes):
-        process = multiprocessing.Process(target=worker_internal_connection,
-                                          args=(i, num_threads, num_iterations, some_function, ip, port))
-        processes.append(process)
-        process.start()
-
-    # Wait for all threads to finish
-    for process in processes:
-        process.join()
-
+    worker_internal_connection(
+        num_threads, num_iterations, some_function, ip, port)
     end_time = time.perf_counter()

     elapsed_time = end_time - start_time
     return elapsed_time


-def execute(some_functions: list, protocols: list, num_processes, num_threads, num_times) -> pd.DataFrame:
+def execute(some_functions: list, protocols: list, num_threads, num_times) -> pd.DataFrame:
     results = pd.DataFrame(
-        columns=['rpc name', 'function', 'qps', 'elapsed_time', 'average_latency', 'num_processes', 'num_threads',
+        columns=['rpc name', 'function', 'qps', 'elapsed_time', 'average_latency', 'num_threads',
                  'num_times'])
     print(f"\n")

     for (protocol, ip, port) in protocols:
         for some_function in some_functions:
             elapsed_time = measure_time_internal(
-                num_processes, num_threads, num_times, some_function, ip, port)
+                num_threads, num_times, some_function, ip, port)
             qps = num_times / elapsed_time  # queries per second
             avg_latency = (elapsed_time / num_times) * 1000  # in ms

@@ -116,7 +91,6 @@ def execute(some_functions: list, protocols: list, num_processes, num_threads, n
                 qps,
                 elapsed_time,
                 avg_latency,
-                num_processes,
                 num_threads,
                 num_times]

@@ -129,117 +103,118 @@ class TestBenchmark:

     def test_measure_time(self):
         @trace_unhandled_exceptions
-        def create_database(infinity_obj, port, process_id, thread_id, num_iteration):
+        def create_database(infinity_obj, port, thread_id, num_iteration):
             res = infinity_obj.create_database(
-                f"my_database_{port}_{process_id}_{thread_id}_{num_iteration}")
+                f"my_database_{port}_{thread_id}_{num_iteration}")
             if res.error_code != ErrorCode.OK:
                 raise Exception(f"create_database failed: {res.error_msg}")

         @trace_unhandled_exceptions
-        def get_database(infinity_obj, port, process_id, thread_id, num_iteration):
+        def get_database(infinity_obj, port, thread_id, num_iteration):
             db_obj = infinity_obj.get_database(f"default_db")
             if db_obj is None:
                 raise Exception(f"get_database failed")

         @trace_unhandled_exceptions
-        def list_databases(infinity_obj, port, process_id, thread_id, num_iteration):
+        def list_databases(infinity_obj, port, thread_id, num_iteration):
             res = infinity_obj.list_databases()
             if res.error_code != ErrorCode.OK:
                 raise Exception(f"list_databases failed: {res.error_msg}")

         @trace_unhandled_exceptions
-        def drop_database(infinity_obj, port, process_id, thread_id, num_iteration):
+        def drop_database(infinity_obj, port, thread_id, num_iteration):
             res = infinity_obj.drop_database(
-                f"my_database_{port}_{process_id}_{thread_id}_{num_iteration}")
+                f"my_database_{port}_{thread_id}_{num_iteration}")
             if res.error_code != ErrorCode.OK:
                 raise Exception(f"drop_database failed: {res.error_msg}")

         @trace_unhandled_exceptions
-        def create_table(infinity_obj, port, process_id, thread_id, num_iteration):
+        def create_table(infinity_obj, port, thread_id, num_iteration):
             res = infinity_obj.get_database(f"default_db").create_table(
-                f"table_{port}_{process_id}_{thread_id}_{num_iteration}",
+                f"table_{port}_{thread_id}_{num_iteration}",
                 {"c1": {"type": "int", "constraints": ["primary key"]}, "c2": {"type": "float"}})
             if res.error_code != ErrorCode.OK:
                 raise Exception(f"create_table failed: {res.error_msg}")

         @trace_unhandled_exceptions
-        def insert_table(infinity_obj, port, process_id, thread_id, num_iteration):
+        def insert_table(infinity_obj, port, thread_id, num_iteration):
             res = (infinity_obj
                    .get_database(f"default_db")
-                   .get_table(f"table_{port}_{process_id}_{thread_id}_{num_iteration}")
+                   .get_table(f"table_{port}_{thread_id}_{num_iteration}")
                    .insert([{"c1": 1, "c2": 1.1}, {"c1": 2, "c2": 2.2}]))
             if res.error_code != ErrorCode.OK:
                 raise Exception(f"insert_table failed: {res.error_msg}")

         @trace_unhandled_exceptions
-        def list_tables(infinity_obj, port, process_id, thread_id, num_iteration):
+        def list_tables(infinity_obj, port, thread_id, num_iteration):
             (infinity_obj
              .get_database(f"default_db")
              .list_tables())

         @trace_unhandled_exceptions
-        def select_table(infinity_obj, port, process_id, thread_id, num_iteration):
+        def select_table(infinity_obj, port, thread_id, num_iteration):
             res = (infinity_obj
                    .get_database(f"default_db")
-                   .get_table(f"table_{port}_{process_id}_{thread_id}_{num_iteration}")
+                   .get_table(f"table_{port}_{thread_id}_{num_iteration}")
                    .query_builder()
                    .output(["*"])
                    .filter("c1 > 1").to_df())
             if res is None:
                 raise Exception(f"select_table failed: {res}")

         @trace_unhandled_exceptions
-        def drop_table(infinity_obj, port, process_id, thread_id, num_iteration):
+        def drop_table(infinity_obj, port, thread_id, num_iteration):
             res = (infinity_obj
                    .get_database(f"default_db")
-                   .drop_table(f"table_{port}_{process_id}_{thread_id}_{num_iteration}"))
+                   .drop_table(f"table_{port}_{thread_id}_{num_iteration}"))
             if res.error_code != ErrorCode.OK:
                 raise Exception(f"drop_table failed: {res.error_msg}")

         @trace_unhandled_exceptions
-        def create_index(infinity_obj, port, process_id, thread_id, num_iteration):
+        def create_index(infinity_obj, port, thread_id, num_iteration):
             res = (infinity_obj
                    .get_database(f"default_db")
-                   .get_table(f"table_{port}_{process_id}_{thread_id}_{num_iteration}")
+                   .get_table(f"table_{port}_{thread_id}_{num_iteration}")
                    .create_index("my_index", ["c1"], "IVF_FLAT", None))
             if res.error_code != ErrorCode.OK:
                 raise Exception(f"create_index failed: {res.error_msg}")

         @trace_unhandled_exceptions
-        def drop_index(infinity_obj, port, process_id, thread_id, num_iteration):
+        def drop_index(infinity_obj, port, thread_id, num_iteration):
             res = (infinity_obj
                    .get_database(f"default_db")
-                   .get_table(f"table_{port}_{process_id}_{thread_id}_{num_iteration}")
+                   .get_table(f"table_{port}_{thread_id}_{num_iteration}")
                    .drop_index("my_index"))
             if res.error_code != ErrorCode.OK:
                 raise Exception(f"drop_index failed: {res.error_msg}")

         ############################################
         # Using the tune

-        ip: str = '0.0.0.0'
-        thrift = ("Thrift", ip, 9090)
+        ip: str = '127.0.0.1'
+        thrift = ("Thrift", ip, 23817)
         thread_pool_thrift = ("Thread Pool Thrift", ip, 23817)
-        async_thrift = ("AsyncThrift", ip, 9070)
-        num_processes = 16
-        num_threads = 16
-        num_times = 16 * 16 * 10
+        async_thrift = ("AsyncThrift", ip, 23817)
+        num_threads = 1
+        num_times = 10
         protocols = [thread_pool_thrift]

-        database_functions = [create_database,
-                              get_database, list_databases, drop_database]
+        database_functions = [create_database]

         db_df = execute(database_functions, protocols,
-                        num_processes, num_threads, num_times)
+                        num_threads, num_times)

-        table_functions = [create_table, insert_table,
-                           select_table, list_tables, drop_table]
+        table_functions = []
         tbl_df = execute(table_functions, protocols,
-                         num_processes, num_threads, num_times)
+                         num_threads, num_times)

         # index_functions = []
-        # idx_df = execute(index_functions, protocols, num_processes, num_threads, num_times)
+        # idx_df = execute(index_functions, protocols, num_threads, num_times)

         df = pd.concat([db_df, tbl_df])
         print(df)
         df.to_excel(f"{datetime.now()}_benchmark.xlsx")
+
+
+if __name__ == "__main__":
+    TestBenchmark().test_measure_time()

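The refactor above removes multiprocessing entirely: each thread opens one connection, runs num_times // num_threads iterations, and QPS and average latency come from a single wall-clock measurement. Below is a minimal, self-contained sketch of that harness pattern; fake_call is a hypothetical stand-in for the Infinity RPCs, so it runs without a live server.

import threading
import time


def fake_call(thread_id, iteration):
    """Hypothetical stand-in for one Infinity RPC (e.g. create_database)."""
    pass


def run_benchmark(num_threads, num_times):
    # Iterations per thread, as in measure_time_internal()
    num_iterations = num_times // num_threads

    def worker(thread_id):
        for j in range(num_iterations):
            fake_call(thread_id, j)

    start = time.perf_counter()
    threads = [threading.Thread(target=worker, args=(i,)) for i in range(num_threads)]
    for t in threads:
        t.start()
    for t in threads:
        t.join()
    elapsed = time.perf_counter() - start

    qps = num_times / elapsed                   # queries per second, as in execute()
    avg_latency_ms = (elapsed / num_times) * 1000
    return qps, avg_latency_ms


if __name__ == "__main__":
    print(run_benchmark(num_threads=1, num_times=10))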
python/hello_infinity.py (+60, −2)

@@ -43,7 +43,7 @@ def main():
     print(res)


-def test():
+def test_english():
     try:
         infinity_obj = infinity.connect(REMOTE_HOST)
         db = infinity_obj.get_database("default_db")
@@ -105,6 +105,64 @@ def test():
         # print(qb_result)


+def test_chinese():
+    """
+    Checkout https://github.com/infiniflow/resource.git under /var/infinity. The jieba dict is
+    /var/infinity/resource/jieba/dict/jieba.dict.utf8
+    """
+    try:
+        infinity_obj = infinity.connect(REMOTE_HOST)
+        infinity_obj.create_database("default_db", ConflictType.Ignore)
+        db = infinity_obj.get_database("default_db")
+        # Drop my_table if it already exists
+        db.drop_table("my_table", ConflictType.Ignore)
+        # Create a table named "my_table"
+        table = db.create_table(
+            "my_table", {"num": "integer", "body": "varchar", "vec": "vector, 4, float"})
+        table.insert(
+            [{"num": 1, "body": "据Wccftech报道,苹果正在开发一种定制芯片,旨在为人工智能(AI)服务器提供动力。暂时还不清楚这款芯片的具体规格,以及具体的实现目标。传闻苹果已选择台积电(TSMC)的3nm制程节点来制造这款芯片,预计2025年下半年量产。如果按照量产时间和台积电的半导体工艺进度,那么对应的很可能是N3E工艺。", "vec": [1.0, 1.2, 0.8, 0.9]}])
+        table.insert(
+            [{"num": 2, "body": "两个月前有报道称,苹果已正式放弃了努力超过十年、投下海量资金的“泰坦计划(Project Titan)”电动车项目。苹果随后解散了大约2000人的开发团队,各人会被分配到其他地方,其中一个很重要的去处就是人工智能部门。有传言称,苹果已经将注意力转向生成式AI,希望能够为业务找到新的增长动力。", "vec": [4.0, 4.2, 4.3, 4.5]}])
+
+        # `create_index()` is required before match() or fusion()
+        res = table.create_index("my_index",
+                                 [index.IndexInfo("body",
+                                                  index.IndexType.FullText,
+                                                  [index.InitParameter("ANALYZER", "chinese")]),
+                                  ], ConflictType.Error)
+        # assert res.success
+
+        res = table.output(["num", "body"]).knn(
+            "vec", [3.0, 2.8, 2.7, 3.1], "float", "ip", 2).to_pl()
+
+        pds_df = pds.DataFrame(res)
+        json_data = pds_df.to_json()
+        print("------json-------")
+        print(json_data)
+
+        table_obj = db.get_table("my_table")
+        qb_result = table_obj.output(["num", "body"]).knn(
+            "vec", [3.0, 2.8, 2.7, 3.1], "float", "ip", 3).to_pl()
+        print("------tabular -------")
+        print("------knn-------")
+        print(qb_result)
+
+        qb_result1 = table_obj.match(
+            "body", "芯片", "topn=1").output(["num", "body"]).to_pl()
+        print("------match-------")
+        print(qb_result1)
+
+        qb_result2 = table_obj.output(["num", "body"]).knn(
+            "vec", [3.0, 2.8, 2.7, 3.1], "float", "ip", 3).match(
+            "body", "芯片", "topn=1").fusion('rrf').to_pl()
+        print("------knn+match-------")
+        print(qb_result2)
+
+    except Exception as e:
+        print(str(e))
+
+
 if __name__ == '__main__':
     main()
-    test()
+    test_english()
+    test_chinese()

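The last query in test_chinese() combines the KNN ranking and the full-text ranking with fusion('rrf'). Reciprocal rank fusion scores each row as the sum of 1 / (k + rank) across the input rankings, with k = 60 by convention. The sketch below shows only that standard formula; the commit does not show Infinity's exact constant or tie-breaking.

def rrf(rankings, k=60):
    """Standard reciprocal rank fusion: score(d) = sum over rankings of 1 / (k + rank(d))."""
    scores = {}
    for ranking in rankings:
        for rank, doc_id in enumerate(ranking, start=1):
            scores[doc_id] = scores.get(doc_id, 0.0) + 1.0 / (k + rank)
    # Highest combined score first
    return sorted(scores, key=scores.get, reverse=True)


# "a" is top-ranked by both lists, so it wins; "b" beats "c" on combined rank.
print(rrf([["a", "b"], ["a", "c", "b"]]))  # -> ['a', 'b', 'c']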
src/common/analyzer/analyzer_pool.cpp (+8, −1)

@@ -43,7 +43,14 @@ UniquePtr<Analyzer> AnalyzerPool::Get(const std::string_view &name) {
         case Str2Int(CHINESE.data()): {
             Analyzer *prototype = cache_[CHINESE].get();
             if (prototype == nullptr) {
-                String path = InfinityContext::instance().config()->resource_dict_path();
+                String path;
+                Config *config = InfinityContext::instance().config();
+                if (config == nullptr) {
+                    // InfinityContext has not been initialized.
+                    path = "/var/infinity/resource";
+                } else {
+                    path = config->resource_dict_path();
+                }
                 UniquePtr<ChineseAnalyzer> analyzer = MakeUnique<ChineseAnalyzer>(std::move(path));
                 if (!analyzer->Load()) {
                     return nullptr;

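The guard above lets AnalyzerPool::Get resolve a dictionary path even when InfinityContext has no Config yet (for example, in standalone tests), falling back to /var/infinity/resource. A short Python sketch of the same fallback pattern; Config here is an illustrative stand-in, not the Infinity class.

from dataclasses import dataclass
from typing import Optional

DEFAULT_RESOURCE_PATH = "/var/infinity/resource"  # fallback used by the C++ change


@dataclass
class Config:
    resource_dict_path: str


def resolve_dict_root(config: Optional[Config]) -> str:
    if config is None:
        # Context not initialized yet: fall back to the built-in default.
        return DEFAULT_RESOURCE_PATH
    return config.resource_dict_path


assert resolve_dict_root(None) == "/var/infinity/resource"
assert resolve_dict_root(Config("/opt/infinity/resource")) == "/opt/infinity/resource"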
src/common/analyzer/chinese_analyzer.cpp (+5, −5)

@@ -65,23 +65,23 @@ bool ChineseAnalyzer::Load() {
     fs::path stopwords_path(root / STOP_WORD_PATH);

     if (!fs::exists(dict_path)) {
-        LOG_INFO(fmt::format("Invalid jieba config {} dict for jieba_analyzer does not exist", dict_path.string()));
+        // LOG_INFO(fmt::format("Invalid jieba config {} dict for jieba_analyzer does not exist", dict_path.string()));
         return false;
     }
     if (!fs::exists(hmm_path)) {
-        LOG_INFO(fmt::format("Invalid jieba config {} hmm for jieba_analyzer does not exist", hmm_path.string()));
+        // LOG_INFO(fmt::format("Invalid jieba config {} hmm for jieba_analyzer does not exist", hmm_path.string()));
         return false;
     }
     if (!fs::exists(userdict_path)) {
-        LOG_INFO(fmt::format("Invalid jieba config {} user_dict for jieba_analyzer does not exist", userdict_path.string()));
+        // LOG_INFO(fmt::format("Invalid jieba config {} user_dict for jieba_analyzer does not exist", userdict_path.string()));
         return false;
     }
     if (!fs::exists(idf_path)) {
-        LOG_INFO(fmt::format("Invalid jieba config {} idf for jieba_analyzer does not exist", idf_path.string()));
+        // LOG_INFO(fmt::format("Invalid jieba config {} idf for jieba_analyzer does not exist", idf_path.string()));
         return false;
     }
     if (!fs::exists(stopwords_path)) {
-        LOG_INFO(fmt::format("Invalid jieba config {} stopword for jieba_analyzer does not exist", stopwords_path.string()));
+        // LOG_INFO(fmt::format("Invalid jieba config {} stopword for jieba_analyzer does not exist", stopwords_path.string()));
         return false;
     }

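With the LOG_INFO calls commented out, ChineseAnalyzer::Load() now fails silently when a jieba file is missing. Below is a hedged pre-flight check to run before test_chinese(); only jieba.dict.utf8 is named in this commit, so the other four file names are assumptions based on the stock cppjieba dictionary set that the five path checks above suggest.

from pathlib import Path

JIEBA_DICT_DIR = Path("/var/infinity/resource/jieba/dict")
EXPECTED_FILES = [
    "jieba.dict.utf8",  # main dictionary, named in the test_chinese() docstring
    "hmm_model.utf8",   # assumed: HMM model (hmm_path)
    "user.dict.utf8",   # assumed: user dictionary (userdict_path)
    "idf.utf8",         # assumed: IDF weights (idf_path)
    "stop_words.utf8",  # assumed: stopword list (stopwords_path)
]

missing = [f for f in EXPECTED_FILES if not (JIEBA_DICT_DIR / f).exists()]
if missing:
    print(f"Missing jieba resources under {JIEBA_DICT_DIR}: {missing}")
    print("Clone https://github.com/infiniflow/resource.git under /var/infinity first.")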