Merge branch 'main' of https://github.com/embeddings-benchmark/mteb

embeddings-benchmark · Feb 4, 2025 · 3036c05
2 parents f43b661 + de8f384
commit 3036c05
Show file tree

Hide file tree

Showing 52 changed files with 1,707 additions and 24 deletions.
diff --git a/mteb/benchmarks/benchmarks.py b/mteb/benchmarks/benchmarks.py
@@ -1400,3 +1400,39 @@ def load_results(
     year={2024}
 }""",
 )
+
+BEIR_NL = Benchmark(
+    name="BEIR-NL",
+    tasks=get_tasks(
+        tasks=[
+            "ArguAna-NL",
+            "CQADupstack-NL",
+            "FEVER-NL",
+            "NQ-NL",
+            "Touche2020-NL",
+            "FiQA2018-NL",
+            "Quora-NL",
+            "HotpotQA-NL",
+            "SCIDOCS-NL",
+            "ClimateFEVER-NL",
+            "mMARCO-NL",
+            "SciFact-NL",
+            "DBPedia-NL",
+            "NFCorpus-NL",
+            "TRECCOVID-NL",
+        ],
+    ),
+    description="BEIR-NL is a Dutch adaptation of the publicly available BEIR benchmark, created through automated "
+    "translation.",
+    reference="https://arxiv.org/abs/2412.08329",
+    contacts=["nikolay-banar"],
+    citation="""@misc{banar2024beirnlzeroshotinformationretrieval,
+    title={BEIR-NL: Zero-shot Information Retrieval Benchmark for the Dutch Language}, 
+     author={Nikolay Banar and Ehsan Lotfi and Walter Daelemans},
+     year={2024},
+     eprint={2412.08329},
+     archivePrefix={arXiv},
+     primaryClass={cs.CL},
+     url={https://arxiv.org/abs/2412.08329}, 
+}""",
+)
diff --git a/mteb/models/arctic_models.py b/mteb/models/arctic_models.py
@@ -109,12 +109,15 @@
         # splits not specified to assuming everything
         # in MTEB
         "NQ": ["test"],
+        "NQ-NL": ["test"],  # translated from NQ (not trained on)
         "NQHardNegatives": ["test"],
         "NQ-PL": ["test"],
         "HotPotQA": ["test"],  # translated, not trained on
         "HotPotQAHardNegatives": ["test"],
         "HotPotQA-PL": ["test"],  # translated from hotpotQA (not trained on)
+        "HotpotQA-NL": ["test"],  # translated from hotpotQA (not trained on)
         "FEVER": ["test"],
+        "FEVER-NL": ["test"],  # translated from FEVER (not trained on)
         "FEVERHardNegatives": ["test"],
         # not in MTEB
         # trained on stack exchange (title-body)
@@ -160,11 +163,14 @@
         # splits not specified to assuming everything
         # in MTEB
         "NQ": ["test"],
+        "NQ-NL": ["test"],  # translated from NQ (not trained on)
         "NQHardNegatives": ["test"],
         "HotPotQA": ["test"],
         "HotPotQAHardNegatives": ["test"],
         "HotPotQA-PL": ["test"],  # translated from hotpotQA (not trained on)
+        "HotpotQA-NL": ["test"],  # translated from hotpotQA (not trained on)
         "FEVER": ["test"],
+        "FEVER-NL": ["test"],  # translated from FEVER (not trained on)
         "FEVERHardNegatives": ["test"],
         # not in MTEB
         # trained on stack exchange (title-body)
@@ -210,11 +216,14 @@
         # splits not specified to assuming everything
         # in MTEB
         "NQ": ["test"],
+        "NQ-NL": ["test"],  # translated from NQ (not trained on)
         "NQHardNegatives": ["test"],
         "HotPotQA": ["test"],
         "HotPotQAHardNegatives": ["test"],
         "HotPotQA-PL": ["test"],  # translated from hotpotQA (not trained on)
+        "HotpotQA-NL": ["test"],  # translated from hotpotQA (not trained on)
         "FEVER": ["test"],
+        "FEVER-NL": ["test"],  # translated from FEVER (not trained on)
         "FEVERHardNegatives": ["test"],
         # not in MTEB
         # trained on stack exchange (title-body)
@@ -260,11 +269,14 @@
         # splits not specified to assuming everything
         # in MTEB
         "NQ": ["test"],
+        "NQ-NL": ["test"],  # translated from NQ (not trained on)
         "NQHardNegatives": ["test"],
         "HotPotQA": ["test"],
         "HotPotQAHardNegatives": ["test"],
         "HotPotQA-PL": ["test"],  # translated from hotpotQA (not trained on)
+        "HotpotQA-NL": ["test"],  # translated from hotpotQA (not trained on)
         "FEVER": ["test"],
+        "FEVER-NL": ["test"],  # translated from FEVER (not trained on)
         "FEVERHardNegatives": ["test"],
         # trained on stack exchange, unsure if sources match
         # not in MTEB
@@ -310,11 +322,14 @@
         # splits not specified to assuming everything
         # in MTEB
         "NQ": ["test"],
+        "NQ-NL": ["test"],  # translated from NQ (not trained on)
         "NQHardNegatives": ["test"],
         "HotPotQA": ["test"],
         "HotPotQAHardNegatives": ["test"],
         "HotPotQA-PL": ["test"],  # translated from hotpotQA (not trained on)
+        "HotpotQA-NL": ["test"],  # translated from hotpotQA (not trained on)
         "FEVER": ["test"],
+        "FEVER-NL": ["test"],  # translated from FEVER (not trained on)
         "FEVERHardNegatives": ["test"],
         # not in MTEB
         # trained on stack exchange (title-body)

diff --git a/mteb/models/bge_models.py b/mteb/models/bge_models.py
@@ -23,12 +23,15 @@
     "MSMARCOHardNegatives": ["train"],
     "NanoMSMARCORetrieval": ["train"],
     "MSMARCO-PL": ["train"],  # translation not trained on
+    "mMARCO-NL": ["train"],  # translation not trained on
     "NQ": ["train"],
+    "NQ-NL": ["train"],  # translation not trained on
     "NQHardNegatives": ["train"],
     "NanoNQRetrieval": ["train"],
     "NQ-PL": ["train"],  # translation not trained on
     "HotpotQA": ["train"],
     "HotpotQA-PL": ["train"],  # translation not trained on
+    "HotpotQA-NL": ["train"],  # translation not trained on
     "HotpotQAHardNegatives": ["train"],
     "T2Retrieval": ["train"],
     "DuReader": ["train"],
@@ -58,6 +61,7 @@
 bge_training_data = {
     # source: https://data.baai.ac.cn/details/BAAI-MTP
     "NQ": ["test"],
+    "NQ-NL": ["test"],  # translation not trained on
     "NQHardNegatives": ["test"],
     "AmazonReviewsClassification": [
         "validation",
@@ -309,7 +313,6 @@
     "zho_Hans",  # zh
 ]
 
-
 bge_small_en_v1_5 = ModelMeta(
     loader=partial(  # type: ignore
         sentence_transformers_loader,
@@ -641,7 +644,6 @@
     training_datasets=bge_m3_training_data,
 )
 
-
 bge_multilingual_gemma2 = ModelMeta(
     loader=partial(  # type: ignore
         sentence_transformers_loader,
@@ -686,11 +688,17 @@
     # TriviaQA
     # QuoraDuplicateQuestions
     "HotpotQA": ["train"],
+    "HotpotQA-NL": ["train"],  # translation not trained on
     "FEVER": ["train"],
+    "FEVER-NL": ["train"],  # translation not trained on
     "MSMARCO": ["train"],
+    "mMARCO-NL": ["train"],  # translation not trained on
     "NQ": ["train"],
+    "NQ-NL": ["train"],  # translation not trained on
     "ArguAna": ["train"],
+    "ArguAna-NL": ["train"],  # translation not trained on
     "FiQA2018": ["train"],
+    "FiQA2018-NL": ["train"],  # translation not trained on
     # |Reranking|
     "SciDocsReranking": ["train"],
     "StackOverflowDupQuestions": ["train"],

diff --git a/mteb/models/colbert_models.py b/mteb/models/colbert_models.py
@@ -167,10 +167,10 @@ def similarity(self, a: np.ndarray, b: np.ndarray) -> np.ndarray:
     superseded_by=None,
     training_datasets={
         "MSMARCO": ["train"],  # dev?
+        "mMARCO-NL": ["train"],  # translation not trained on
     },
 )
 
-
 jina_colbert_v2 = ModelMeta(
     loader=partial(
         ColBERTWrapper,
@@ -222,6 +222,7 @@ def similarity(self, a: np.ndarray, b: np.ndarray) -> np.ndarray:
     superseded_by=None,
     training_datasets={
         "MSMARCO": ["train"],
+        "mMARCO-NL": ["train"],  # translation not trained on
         "DuRetrieval": [],
         "MIRACL": ["train"],
     },

diff --git a/mteb/models/e5_instruct.py b/mteb/models/e5_instruct.py
@@ -11,17 +11,17 @@
 
 MISTRAL_LANGUAGES = ["eng_Latn", "fra_Latn", "deu_Latn", "ita_Latn", "spa_Latn"]
 
-
 E5_INSTRUCTION = "Instruct: {instruction}\nQuery: "
 
-
 E5_MISTRAL_TRAINING_DATA = {
     **E5_TRAINING_DATA,
     "FEVER": ["train"],
     "FEVERHardNegatives": ["train"],
+    "FEVER-NL": ["train"],  # translation not trained on
     "HotpotQA": ["train"],
     "HotpotQAHardNegatives": ["train"],
     "HotpotQA-PL": ["train"],  # translation not trained on
+    "HotpotQA-NL": ["train"],  # translation not trained on
 }
 
 e5_instruct = ModelMeta(
@@ -116,8 +116,10 @@
         # copied from e5
         # source: https://arxiv.org/pdf/2212.03533
         "NQ": ["test"],
+        "NQ-NL": ["test"],  # translation not trained on
         "NQHardNegatives": ["test"],
         "MSMARCO": ["train"],  # dev?
+        "mMARCO-NL": ["train"],  # translation not trained on
         # source: https://www.zeta-alpha.com/post/fine-tuning-an-llm-for-state-of-the-art-retrieval-zeta-alpha-s-top-10-submission-to-the-the-mteb-be
         # "Arguana",
         # "FEVER",

diff --git a/mteb/models/e5_models.py b/mteb/models/e5_models.py
@@ -120,19 +120,23 @@
     "MSMARCOHardNegatives": ["train"],
     "NanoMSMARCORetrieval": ["train"],
     "MSMARCO-PL": ["train"],  # translation not trained on
+    "mMARCO-NL": ["train"],  # translation not trained on
     "NQ": ["train"],
     "NQHardNegatives": ["train"],
     "NanoNQRetrieval": ["train"],
     "NQ-PL": ["train"],  # translation not trained on
+    "NQ-NL": ["train"],  # translation not trained on
 }
 
 ME5_TRAINING_DATA = {
     **E5_TRAINING_DATA,
     "FEVER": ["train"],
     "FEVERHardNegatives": ["train"],
+    "FEVER-NL": ["train"],  # translation not trained on
     "HotpotQA": ["train"],
     "HotpotQAHardNegatives": ["train"],
     "HotpotQA-PL": ["train"],  # translation not trained on
+    "HotpotQA-NL": ["train"],  # translation not trained on
 }
 
 e5_mult_small = ModelMeta(

diff --git a/mteb/models/gme_models.py b/mteb/models/gme_models.py
@@ -6,7 +6,6 @@
 
 logger = logging.getLogger(__name__)
 
-
 gme_qwen2_vl_2b_instruct = ModelMeta(
     loader=None,
     name="Alibaba-NLP/gme-Qwen2-VL-2B-Instruct",
@@ -29,6 +28,7 @@
         # source: https://arxiv.org/pdf/2412.16855
         "MSMARCO": ["train"],
         "MSMARCO.v2": ["train"],
+        "mMARCO-NL": ["train"],  # translation not trained on
     },
     public_training_code=None,
     public_training_data=None,
@@ -56,6 +56,7 @@
         # source: https://arxiv.org/pdf/2412.16855
         "MSMARCO": ["train"],
         "MSMARCO.v2": ["train"],
+        "mMARCO-NL": ["train"],  # translation not trained on
     },
     public_training_code=None,
     public_training_data=None,

diff --git a/mteb/models/gritlm_models.py b/mteb/models/gritlm_models.py
@@ -10,15 +10,16 @@
 
 logger = logging.getLogger(__name__)
 
-
 GRIT_LM_TRAINING_DATA = {
     **E5_TRAINING_DATA,  # source https://arxiv.org/pdf/2402.09906
     # also uses medi2 which contains fever and hotpotqa:
     "FEVER": ["train"],
     "FEVERHardNegatives": ["train"],
+    "FEVER-NL": ["train"],  # translation not trained on
     "HotpotQA": ["train"],
     "HotpotQAHardNegatives": ["train"],
     "HotpotQA-PL": ["train"],  # translation not trained on
+    "HotpotQA-NL": ["train"],  # translation not trained on
 }
 
 

diff --git a/mteb/models/gte_models.py b/mteb/models/gte_models.py
@@ -51,7 +51,6 @@ def instruction_template(
     max_tokens=131072,
 )
 
-
 gte_Qwen1_5_7B_instruct = ModelMeta(
     loader=partial(  # type: ignore
         instruct_wrapper,
@@ -82,7 +81,6 @@ def instruction_template(
     training_datasets=None,
 )
 
-
 gte_Qwen2_1_5B_instruct = ModelMeta(
     loader=partial(  # type: ignore
         instruct_wrapper,
@@ -264,10 +262,14 @@ def instruction_template(
     "DuReader": ["train"],
     "MMarcoReranking": ["train"],
     "CMedQAv2-reranking": ["train"],
+    "NQ-NL": ["train"],  # translation not trained on
     "NQ": ["train"],
     "MSMARCO": ["train"],
+    "mMARCO-NL": ["train"],  # translation not trained on
     "HotpotQA": ["train"],
+    "HotpotQA-NL": ["train"],  # translation not trained on
     "FEVER": ["train"],
+    "FEVER-NL": ["train"],  # translation not trained on
     "MIRACLReranking": ["train"],
     "MrTidyRetrieval": ["train"],
     "MultiLongDocRetrieval": ["train"],

diff --git a/mteb/models/ibm_granite_models.py b/mteb/models/ibm_granite_models.py
@@ -53,15 +53,18 @@
     # Machine Translations of SPECTER citation triplets
     # Natural Questions (NQ)
     "NQ": ["test"],
+    "NQ-NL": ["test"],  # translation not trained on
     "NQHardNegatives": ["test"],
     # SQuAD2.0
     # HotpotQA
     "HotPotQA": ["test"],
     "HotPotQAHardNegatives": ["test"],
     "HotPotQA-PL": ["test"],  # translated from hotpotQA (not trained on)
+    "HotpotQA-NL": ["test"],  # translated from hotpotQA (not trained on)
     # Fever
     "FEVER": ["test"],
     "FEVERHardNegatives": ["test"],
+    "FEVER-NL": ["test"],  # translated from FEVER (not trained on)
     # PubMed
     # Multilingual Miracl Triples
     "MIRACLRetrieval": ["train"],
@@ -72,6 +75,7 @@
     # Sadeeem Question Asnwering
     # DBPedia Title-Body Pairs
     "DBPedia": ["train"],
+    "DBPedia-NL": ["train"],  # translated from DBPedia (not trained on)
     # Synthetic: English Query-Wikipedia Passage
     # Synthetic: English Fact Verification
     # Synthetic: Multilingual Query-Wikipedia Passage

diff --git a/mteb/models/jina_models.py b/mteb/models/jina_models.py
@@ -234,16 +234,17 @@ def encode(
         "MSMARCO": ["train"],
         "MSMARCOHardNegatives": ["train"],
         "NanoMSMARCORetrieval": ["train"],
+        "mMARCO-NL": ["train"],  # translation not trained on
         "NQ": ["train"],
         "NQHardNegatives": ["train"],
         "NanoNQRetrieval": ["train"],
         "NQ-PL": ["train"],  # translation not trained on
+        "NQ-NL": ["train"],  # translation not trained on
         # oasst1, oasst2
     },
     adapted_from="XLM-RoBERTa",
 )
 
-
 jina_embeddings_v2_base_en = ModelMeta(
     loader=partial(
         SentenceTransformerWrapper,