Merge branches 'update_mteb_meta_cli' and 'main' of https://github.com/embeddings-benchmark/mteb
embeddings-benchmark · Jun 15, 2024 · 7570f9c · 7570f9c
2 parents f19d227 + a9f0eca
commit 7570f9c
Show file tree

Hide file tree

Showing 26 changed files with 1,828 additions and 89 deletions.
diff --git a/docs/mmteb/points.md b/docs/mmteb/points.md
@@ -92,3 +92,4 @@ Please also add your first name and last name are as you want them to appear in
 | ManuelFay         | Manuel        | Faysse     | manuel.faysse@centralesupelec.fr |              ~Manuel_Faysse1        | CentraleSupélec & Illuin Technology                  |
 | hgissbkh          | Hippolyte     | Gisserot-Boukhlef    | hippolyte.gisserot-boukhlef@centralesupelec.fr        |   ~Hippolyte_Gisserot-Boukhlef1                   | CentraleSupélec & Artefact Research Center   |
 | sted97          | Simone     | Tedeschi    | tedeschi@diag.uniroma1.it        |   ~Simone_Tedeschi1                   | Sapienza University of Rome   |
+| gentaiscool          | Genta Indra     | Winata    | genta.winata@capitalone.com        |   ~Genta_Indra_Winata1                   | Capital One   |
diff --git a/docs/mmteb/points/914.jsonl b/docs/mmteb/points/914.jsonl
@@ -0,0 +1,2 @@
+{"GitHub": "gentaiscool", "New dataset": 26}
+{"GitHub": "KennethEnevoldsen", "Review PR": 2}
diff --git a/docs/mmteb/points/917.jsonl b/docs/mmteb/points/917.jsonl
@@ -0,0 +1,2 @@
+{"GitHub": "akshita-sukhlecha", "New dataset": 34}
+{"GitHub": "KennethEnevoldsen", "Review PR": 2}
diff --git a/docs/mmteb/points/922.jsonl b/docs/mmteb/points/922.jsonl
@@ -0,0 +1,2 @@
+{"GitHub": "gentaiscool", "New dataset": 38}
+{"GitHub": "KennethEnevoldsen", "Review PR": 2}
diff --git a/docs/mmteb/points/923.jsonl b/docs/mmteb/points/923.jsonl
@@ -0,0 +1,5 @@
+{"GitHub": "MariyaTikhonova", "Dataset annotations": 1}
+{"GitHub": "anpalmak2003", "Dataset annotations": 1}
+{"GitHub": "ab1992ao", "Dataset annotations": 1}
+{"GitHub": "Alenush", "Dataset annotations": 1}
+{"GitHub": "KennethEnevoldsen", "Review PR": 2}
diff --git a/docs/mmteb/points_table.md b/docs/mmteb/points_table.md
diff --git a/docs/tasks.md b/docs/tasks.md
diff --git a/mteb/tasks/BitextMining/__init__.py b/mteb/tasks/BitextMining/__init__.py
@@ -13,6 +13,8 @@
 from .multilingual.IWSLT2017BitextMinig import *
 from .multilingual.NorwegianCourtsBitextMining import *
 from .multilingual.NTREXBitextMining import *
+from .multilingual.NusaTranslationBitextMining import *
+from .multilingual.NusaXBitextMining import *
 from .multilingual.RomaTalesBitextMining import *
 from .multilingual.TatoebaBitextMining import *
 from .srn.SRNCorpusBitextMining import *

diff --git a/mteb/tasks/BitextMining/multilingual/NusaTranslationBitextMining.py b/mteb/tasks/BitextMining/multilingual/NusaTranslationBitextMining.py
@@ -0,0 +1,57 @@
+from __future__ import annotations
+
+from mteb.abstasks.TaskMetadata import TaskMetadata
+
+from ....abstasks import AbsTaskBitextMining, CrosslingualTask
+
+_LANGUAGES = {
+    "ind-abs": ["ind-Latn", "abs-Latn"],
+    "ind-btk": ["ind-Latn", "bbc-Latn"],
+    "ind-bew": ["ind-Latn", "bew-Latn"],
+    "ind-bhp": ["ind-Latn", "bhp-Latn"],
+    "ind-jav": ["ind-Latn", "jav-Latn"],
+    "ind-mad": ["ind-Latn", "mad-Latn"],
+    "ind-mak": ["ind-Latn", "mak-Latn"],
+    "ind-min": ["ind-Latn", "min-Latn"],
+    "ind-mui": ["ind-Latn", "mui-Latn"],
+    "ind-rej": ["ind-Latn", "rej-Latn"],
+    "ind-sun": ["ind-Latn", "sun-Latn"],
+}
+
+
+class NusaTranslationBitextMining(AbsTaskBitextMining, CrosslingualTask):
+    metadata = TaskMetadata(
+        name="NusaTranslationBitextMining",
+        dataset={
+            "path": "gentaiscool/bitext_nusatranslation_miners",
+            "revision": "ba52e9d114a4a145d79b4293afab31304a999a4c",
+        },
+        description="NusaTranslation is a parallel dataset for machine translation on 11 Indonesian languages and English.",
+        reference="https://huggingface.co/datasets/indonlp/nusatranslation_mt",
+        type="BitextMining",
+        category="s2s",
+        eval_splits=["train"],
+        eval_langs=_LANGUAGES,
+        main_score="f1",
+        date=("2021-08-01", "2022-07-01"),
+        form=["written"],
+        domains=["Social"],
+        task_subtypes=[],
+        license="CC BY-SA 4.0",
+        socioeconomic_status="mixed",
+        annotations_creators="human-annotated",
+        dialect=[],
+        text_creation="created",
+        bibtex_citation="""
+        @inproceedings{cahyawijaya2023nusawrites,
+            title={NusaWrites: Constructing High-Quality Corpora for Underrepresented and Extremely Low-Resource Languages},
+            author={Cahyawijaya, Samuel and Lovenia, Holy and Koto, Fajri and Adhista, Dea and Dave, Emmanuel and Oktavianti, Sarah and Akbar, Salsabil and Lee, Jhonson and Shadieq, Nuur and Cenggoro, Tjeng Wawan and others},
+            booktitle={Proceedings of the 13th International Joint Conference on Natural Language Processing and the 3rd Conference of the Asia-Pacific Chapter of the Association for Computational Linguistics (Volume 1: Long Papers)},
+            pages={921--945},
+            year={2023}
+        }
+
+        """,
+        n_samples={"train": 50200},
+        avg_character_length={"train": 147.01},
+    )
diff --git a/mteb/tasks/BitextMining/multilingual/NusaXBitextMining.py b/mteb/tasks/BitextMining/multilingual/NusaXBitextMining.py
@@ -0,0 +1,64 @@
+from __future__ import annotations
+
+from mteb.abstasks.TaskMetadata import TaskMetadata
+
+from ....abstasks import AbsTaskBitextMining, CrosslingualTask
+
+_LANGUAGES = {
+    "eng-ace": ["eng-Latn", "ace-Latn"],
+    "eng-ban": ["eng-Latn", "ban-Latn"],
+    "eng-bbc": ["eng-Latn", "bbc-Latn"],
+    "eng-bjn": ["eng-Latn", "bjn-Latn"],
+    "eng-bug": ["eng-Latn", "bug-Latn"],
+    "eng-ind": ["eng-Latn", "ind-Latn"],
+    "eng-jav": ["eng-Latn", "jav-Latn"],
+    "eng-mad": ["eng-Latn", "mad-Latn"],
+    "eng-min": ["eng-Latn", "min-Latn"],
+    "eng-nij": ["eng-Latn", "nij-Latn"],
+    "eng-sun": ["eng-Latn", "sun-Latn"],
+}
+
+
+class NusaXBitextMining(AbsTaskBitextMining, CrosslingualTask):
+    metadata = TaskMetadata(
+        name="NusaXBitextMining",
+        dataset={
+            "path": "gentaiscool/bitext_nusax_miners",
+            "revision": "fba4f2cfe2592641056f7a274c9aa8453b27a4a8",
+        },
+        description="NusaX is a parallel dataset for machine translation and sentiment analysis on 11 Indonesian languages and English.",
+        reference="https://huggingface.co/datasets/indonlp/NusaX-senti/",
+        type="BitextMining",
+        category="s2s",
+        eval_splits=["train"],
+        eval_langs=_LANGUAGES,
+        main_score="f1",
+        date=("2021-08-01", "2022-07-01"),
+        form=["written"],
+        domains=["Reviews"],
+        task_subtypes=[],
+        license="CC BY-SA 4.0",
+        socioeconomic_status="mixed",
+        annotations_creators="human-annotated",
+        dialect=[],
+        text_creation="created",
+        bibtex_citation="""
+        @inproceedings{winata2023nusax,
+        title={NusaX: Multilingual Parallel Sentiment Dataset for 10 Indonesian Local Languages},
+        author={Winata, Genta Indra and Aji, Alham Fikri and Cahyawijaya, Samuel and Mahendra, Rahmad and Koto, Fajri and Romadhony, Ade and Kurniawan, Kemal and Moeljadi, David and Prasojo, Radityo Eko and Fung, Pascale and others},
+        booktitle={Proceedings of the 17th Conference of the European Chapter of the Association for Computational Linguistics},
+        pages={815--834},
+        year={2023}
+        }
+        @misc{winata2024miners,
+            title={MINERS: Multilingual Language Models as Semantic Retrievers}, 
+            author={Genta Indra Winata and Ruochen Zhang and David Ifeoluwa Adelani},
+            year={2024},
+            eprint={2406.07424},
+            archivePrefix={arXiv},
+            primaryClass={cs.CL}
+        }
+        """,
+        n_samples={"train": 5500},
+        avg_character_length={"train": 157.15},
+    )
diff --git a/mteb/tasks/Classification/multilingual/MTOPDomainClassification.py b/mteb/tasks/Classification/multilingual/MTOPDomainClassification.py
@@ -28,15 +28,15 @@ class MTOPDomainClassification(MultilingualTask, AbsTaskClassification):
         eval_splits=["validation", "test"],
         eval_langs=_LANGUAGES,
         main_score="accuracy",
-        date=None,
-        form=None,
-        domains=None,
-        task_subtypes=None,
-        license=None,
-        socioeconomic_status=None,
-        annotations_creators=None,
-        dialect=None,
-        text_creation=None,
+        date=("2020-01-01", "2020-12-31"),
+        form=["spoken"],
+        domains=["Spoken"],
+        task_subtypes=[],
+        license="Not specified",
+        socioeconomic_status="mixed",
+        annotations_creators="human-annotated",
+        dialect=[],
+        text_creation="created",
         bibtex_citation="""@inproceedings{li-etal-2021-mtop,
     title = "{MTOP}: A Comprehensive Multilingual Task-Oriented Semantic Parsing Benchmark",
     author = "Li, Haoran  and

diff --git a/mteb/tasks/Classification/multilingual/MTOPIntentClassification.py b/mteb/tasks/Classification/multilingual/MTOPIntentClassification.py
@@ -28,15 +28,15 @@ class MTOPIntentClassification(MultilingualTask, AbsTaskClassification):
         eval_splits=["validation", "test"],
         eval_langs=_LANGUAGES,
         main_score="accuracy",
-        date=None,
-        form=None,
-        domains=None,
-        task_subtypes=None,
-        license=None,
-        socioeconomic_status=None,
-        annotations_creators=None,
-        dialect=None,
-        text_creation=None,
+        date=("2020-01-01", "2020-12-31"),
+        form=["spoken"],
+        domains=["Spoken"],
+        task_subtypes=[],
+        license="Not specified",
+        socioeconomic_status="mixed",
+        annotations_creators="human-annotated",
+        dialect=[],
+        text_creation="created",
         bibtex_citation="""@inproceedings{li-etal-2021-mtop,
     title = "{MTOP}: A Comprehensive Multilingual Task-Oriented Semantic Parsing Benchmark",
     author = "Li, Haoran  and

diff --git a/mteb/tasks/Classification/multilingual/MassiveIntentClassification.py b/mteb/tasks/Classification/multilingual/MassiveIntentClassification.py
@@ -74,15 +74,15 @@ class MassiveIntentClassification(MultilingualTask, AbsTaskClassification):
         eval_splits=["validation", "test"],
         eval_langs=_LANGUAGES,
         main_score="accuracy",
-        date=None,
-        form=None,
-        domains=None,
-        task_subtypes=None,
-        license=None,
-        socioeconomic_status=None,
-        annotations_creators=None,
-        dialect=None,
-        text_creation=None,
+        date=("2022-01-01", "2022-04-22"),
+        form=["spoken"],
+        domains=["Spoken"],
+        task_subtypes=[],
+        license="Apache 2.0",
+        socioeconomic_status="mixed",
+        annotations_creators="human-annotated",
+        dialect=[],
+        text_creation="created",
         bibtex_citation="""@misc{fitzgerald2022massive,
       title={MASSIVE: A 1M-Example Multilingual Natural Language Understanding Dataset with 51 Typologically-Diverse Languages}, 
       author={Jack FitzGerald and Christopher Hench and Charith Peris and Scott Mackie and Kay Rottmann and Ana Sanchez and Aaron Nash and Liam Urbach and Vishesh Kakarala and Richa Singh and Swetha Ranganath and Laurie Crist and Misha Britan and Wouter Leeuwis and Gokhan Tur and Prem Natarajan},

diff --git a/mteb/tasks/Classification/multilingual/MassiveScenarioClassification.py b/mteb/tasks/Classification/multilingual/MassiveScenarioClassification.py
@@ -74,15 +74,15 @@ class MassiveScenarioClassification(MultilingualTask, AbsTaskClassification):
         eval_splits=["validation", "test"],
         eval_langs=_LANGUAGES,
         main_score="accuracy",
-        date=None,
-        form=None,
-        domains=None,
-        task_subtypes=None,
-        license=None,
-        socioeconomic_status=None,
-        annotations_creators=None,
-        dialect=None,
-        text_creation=None,
+        date=("2022-01-01", "2022-04-22"),
+        form=["spoken"],
+        domains=["Spoken"],
+        task_subtypes=[],
+        license="Apache 2.0",
+        socioeconomic_status="mixed",
+        annotations_creators="human-annotated",
+        dialect=[],
+        text_creation="created",
         bibtex_citation="""@misc{fitzgerald2022massive,
       title={MASSIVE: A 1M-Example Multilingual Natural Language Understanding Dataset with 51 Typologically-Diverse Languages}, 
       author={Jack FitzGerald and Christopher Hench and Charith Peris and Scott Mackie and Kay Rottmann and Ana Sanchez and Aaron Nash and Liam Urbach and Vishesh Kakarala and Richa Singh and Swetha Ranganath and Laurie Crist and Misha Britan and Wouter Leeuwis and Gokhan Tur and Prem Natarajan},

diff --git a/mteb/tasks/STS/__init__.py b/mteb/tasks/STS/__init__.py
@@ -17,6 +17,7 @@
 from .kor.KlueSTS import *
 from .kor.KorSTS import *
 from .multilingual.IndicCrosslingualSTS import *
+from .multilingual.SemRel24STS import *
 from .multilingual.STS17CrosslingualSTS import *
 from .multilingual.STS22CrosslingualSTS import *
 from .multilingual.STSBenchmarkMultilingualSTS import *

diff --git a/mteb/tasks/STS/multilingual/SemRel24STS.py b/mteb/tasks/STS/multilingual/SemRel24STS.py
@@ -0,0 +1,78 @@
+from __future__ import annotations
+
+from mteb.abstasks import AbsTaskSTS, MultilingualTask, TaskMetadata
+
+_LANGUAGES = {
+    "afr": ["afr-Latn"],
+    "amh": ["amh-Ethi"],
+    "arb": ["arb-Arab"],
+    "arq": ["arq-Arab"],
+    "ary": ["ary-Arab"],
+    "eng": ["eng-Latn"],
+    "hau": ["hau-Latn"],
+    "hin": ["hin-Deva"],
+    "ind": ["ind-Latn"],
+    "kin": ["kin-Latn"],
+    "mar": ["mar-Deva"],
+    "tel": ["tel-Telu"],
+}
+
+_SPLITS = ["test"]
+
+
+class SemRel24STS(AbsTaskSTS, MultilingualTask):
+    metadata = TaskMetadata(
+        name="SemRel24STS",
+        dataset={
+            "path": "SemRel/SemRel2024",
+            "revision": "ef5c383d1b87eb8feccde3dfb7f95e42b1b050dd",
+        },
+        description=(
+            "SemRel2024 is a collection of Semantic Textual Relatedness (STR) datasets for 14 languages, "
+            "including African and Asian languages. The datasets are composed of sentence pairs, each assigned a "
+            "relatedness score between 0 (completely unrelated) and 1 (maximally related) with a large range of "
+            "expected relatedness values."
+        ),
+        reference="https://huggingface.co/datasets/SemRel/SemRel2024",
+        type="STS",
+        category="s2s",
+        eval_splits=_SPLITS,
+        eval_langs=_LANGUAGES,
+        main_score="cosine_spearman",
+        date=("2023-01-01", "2023-12-31"),
+        form=["spoken", "written"],
+        domains=[],
+        task_subtypes=[],
+        license="Not specified",
+        socioeconomic_status="mixed",
+        annotations_creators="human-annotated",
+        dialect=[],
+        text_creation="created",
+        bibtex_citation="""@misc{ousidhoum2024semrel2024,
+        title={SemRel2024: A Collection of Semantic Textual Relatedness Datasets for 14 Languages}, 
+        author={Nedjma Ousidhoum and Shamsuddeen Hassan Muhammad and Mohamed Abdalla and Idris Abdulmumin and Ibrahim Said Ahmad and
+        Sanchit Ahuja and Alham Fikri Aji and Vladimir Araujo and Abinew Ali Ayele and Pavan Baswani and Meriem Beloucif and
+        Chris Biemann and Sofia Bourhim and Christine De Kock and Genet Shanko Dekebo and
+        Oumaima Hourrane and Gopichand Kanumolu and Lokesh Madasu and Samuel Rutunda and Manish Shrivastava and
+        Thamar Solorio and Nirmal Surange and Hailegnaw Getaneh Tilaye and Krishnapriya Vishnubhotla and Genta Winata and
+        Seid Muhie Yimam and Saif M. Mohammad},
+              year={2024},
+              eprint={2402.08638},
+              archivePrefix={arXiv},
+              primaryClass={cs.CL}
+        }
+        """,
+        n_samples={"dev": 2089, "test": 7498},
+        avg_character_length={"dev": 163.1, "test": 145.9},
+    )
+
+    @property
+    def metadata_dict(self) -> dict[str, str]:
+        metadata_dict = super().metadata_dict
+        metadata_dict["min_score"] = 0
+        metadata_dict["max_score"] = 1
+        return metadata_dict
+
+    def dataset_transform(self) -> None:
+        for lang, subset in self.dataset.items():
+            self.dataset[lang] = subset.rename_column("label", "score")
diff --git a/pyproject.toml b/pyproject.toml
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
 
 [project]
 name = "mteb"
-version = "1.12.29"
+version = "1.12.32"
 description = "Massive Text Embedding Benchmark"
 readme = "README.md"
 authors = [
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1,2 @@
		{"GitHub": "gentaiscool", "New dataset": 26}
		{"GitHub": "KennethEnevoldsen", "Review PR": 2}
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1,2 @@
		{"GitHub": "akshita-sukhlecha", "New dataset": 34}
		{"GitHub": "KennethEnevoldsen", "Review PR": 2}