Skip to content

Commit f1512a4

Browse files
committed
Make databases use english analyzer by default
1 parent 1af7d41 commit f1512a4

File tree

3 files changed

+12
-7
lines changed

3 files changed

+12
-7
lines changed

src/scholarag/document_stores/elastic.py

+5 -1
Original file line number | Diff line number | Diff line change
@@ -12,7 +12,11 @@
1212

1313
logger = logging.getLogger(__name__)
1414

15-
SETTINGS: dict[str, Any] = {"number_of_shards": 2, "number_of_replicas": 1}
15+
SETTINGS: dict[str, Any] = {
16+
"number_of_shards": 2,
17+
"number_of_replicas": 1,
18+
"analysis": {"analyzer": {"default": {"type": "english"}}},
19+
}
1620

1721
MAPPINGS_PARAGRAPHS: dict[str, Any] = {
1822
"dynamic": "strict",

src/scholarag/document_stores/open.py

+1
Original file line number | Diff line number | Diff line change
@@ -16,6 +16,7 @@
1616
SETTINGS: dict[str, Any] = {
1717
"number_of_shards": 2,
1818
"number_of_replicas": 1,
19+
"analysis": {"analyzer": {"default": {"type": "english"}}},
1920
}
2021

2122
MAPPINGS_PARAGRAPHS: dict[str, Any] = {

tests/test_document_stores.py

+6 -6
Original file line number | Diff line number | Diff line change
@@ -305,7 +305,7 @@ def test_search(get_testing_ds_client, query):
305305
res_hits[0]["_source"]["text"]
306306
== "This document is a bad test, I don't want to retrieve it"
307307
)
308-
assert res_hits[0]["_score"] == pytest.approx(0.7782316)
308+
assert res_hits[0]["_score"] == pytest.approx(0.81427324)
309309
# test query + aggs.
310310
results = ds_client.search(index_doc, query, aggs=aggs)
311311
assert results["aggregations"]["unique_ids"]["buckets"][0]["doc_count"] == 1
@@ -376,13 +376,13 @@ def test_bm25_search(get_testing_ds_client, filter_db):
376376
if filter_db is None:
377377
assert len(res) == 2
378378
assert res[0]["text"] == "test of an amazing function"
379-
assert res[0]["score"] == pytest.approx(0.5403367)
379+
assert res[0]["score"] == pytest.approx(0.5504225)
380380
else:
381381
assert len(res) == 1
382382
assert (
383383
res[0]["text"] == "This document is a bad test, I don't want to retrieve it"
384384
)
385-
assert res[0]["score"] == pytest.approx(0.37292093)
385+
assert res[0]["score"] == pytest.approx(0.39019167)
386386

387387
# Errors
388388
with pytest.raises(RuntimeError):
@@ -647,7 +647,7 @@ async def test_asearch(get_testing_async_ds_client, query):
647647
res_hits[0]["_source"]["text"]
648648
== "This document is a bad test, I don't want to retrieve it"
649649
)
650-
assert res_hits[0]["_score"] == pytest.approx(0.7782316)
650+
assert res_hits[0]["_score"] == pytest.approx(0.81427324)
651651
# test query + aggs.
652652
results = await ds_client.search(index_doc, query, aggs=aggs)
653653
assert results["aggregations"]["unique_ids"]["buckets"][0]["doc_count"] == 1
@@ -722,13 +722,13 @@ async def test_abm25_search(get_testing_async_ds_client, filter_db):
722722
if filter_db is None:
723723
assert len(res) == 2
724724
assert res[0]["text"] == "test of an amazing function"
725-
assert res[0]["score"] == pytest.approx(0.5403367)
725+
assert res[0]["score"] == pytest.approx(0.5504225)
726726
else:
727727
assert len(res) == 1
728728
assert (
729729
res[0]["text"] == "This document is a bad test, I don't want to retrieve it"
730730
)
731-
assert res[0]["score"] == pytest.approx(0.37292093)
731+
assert res[0]["score"] == pytest.approx(0.39019167)
732732

733733

734734
@pytest.mark.parametrize(

0 commit comments

Comments
 (0)