Skip to content

Commit

Permalink
Merge branches 'update_mteb_meta_cli' and 'main' of https://github.co…
Browse files Browse the repository at this point in the history
  • Loading branch information
KennethEnevoldsen committed Jun 15, 2024
2 parents f19d227 + a9f0eca commit 7570f9c
Show file tree
Hide file tree
Showing 26 changed files with 1,828 additions and 89 deletions.
1 change: 1 addition & 0 deletions docs/mmteb/points.md
Original file line number Diff line number Diff line change
Expand Up @@ -92,3 +92,4 @@ Please also add your first name and last name are as you want them to appear in
| ManuelFay | Manuel | Faysse | manuel.faysse@centralesupelec.fr | ~Manuel_Faysse1 | CentraleSupélec & Illuin Technology |
| hgissbkh | Hippolyte | Gisserot-Boukhlef | hippolyte.gisserot-boukhlef@centralesupelec.fr | ~Hippolyte_Gisserot-Boukhlef1 | CentraleSupélec & Artefact Research Center |
| sted97 | Simone | Tedeschi | tedeschi@diag.uniroma1.it | ~Simone_Tedeschi1 | Sapienza University of Rome |
| gentaiscool | Genta Indra | Winata | genta.winata@capitalone.com | ~Genta_Indra_Winata1 | Capital One |
2 changes: 2 additions & 0 deletions docs/mmteb/points/914.jsonl
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
{"GitHub": "gentaiscool", "New dataset": 26}
{"GitHub": "KennethEnevoldsen", "Review PR": 2}
2 changes: 2 additions & 0 deletions docs/mmteb/points/917.jsonl
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
{"GitHub": "akshita-sukhlecha", "New dataset": 34}
{"GitHub": "KennethEnevoldsen", "Review PR": 2}
2 changes: 2 additions & 0 deletions docs/mmteb/points/922.jsonl
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
{"GitHub": "gentaiscool", "New dataset": 38}
{"GitHub": "KennethEnevoldsen", "Review PR": 2}
5 changes: 5 additions & 0 deletions docs/mmteb/points/923.jsonl
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
{"GitHub": "MariyaTikhonova", "Dataset annotations": 1}
{"GitHub": "anpalmak2003", "Dataset annotations": 1}
{"GitHub": "ab1992ao", "Dataset annotations": 1}
{"GitHub": "Alenush", "Dataset annotations": 1}
{"GitHub": "KennethEnevoldsen", "Review PR": 2}
50 changes: 26 additions & 24 deletions docs/mmteb/points_table.md

Large diffs are not rendered by default.

63 changes: 36 additions & 27 deletions docs/tasks.md

Large diffs are not rendered by default.

2 changes: 2 additions & 0 deletions mteb/tasks/BitextMining/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,8 @@
from .multilingual.IWSLT2017BitextMinig import *
from .multilingual.NorwegianCourtsBitextMining import *
from .multilingual.NTREXBitextMining import *
from .multilingual.NusaTranslationBitextMining import *
from .multilingual.NusaXBitextMining import *
from .multilingual.RomaTalesBitextMining import *
from .multilingual.TatoebaBitextMining import *
from .srn.SRNCorpusBitextMining import *
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,57 @@
from __future__ import annotations

from mteb.abstasks.TaskMetadata import TaskMetadata

from ....abstasks import AbsTaskBitextMining, CrosslingualTask

# Language pairs evaluated by the task: Indonesian ("ind-Latn") paired with one
# other language per dataset subset. Keys are "ind-<subset>"; values are the two
# "<code>-Latn" tags for that pair.
_LANGUAGES = {
    f"ind-{subset}": ["ind-Latn", f"{code}-Latn"]
    for subset, code in (
        ("abs", "abs"),
        ("btk", "bbc"),  # the only pair whose subset name differs from its language code
        ("bew", "bew"),
        ("bhp", "bhp"),
        ("jav", "jav"),
        ("mad", "mad"),
        ("mak", "mak"),
        ("min", "min"),
        ("mui", "mui"),
        ("rej", "rej"),
        ("sun", "sun"),
    )
}


class NusaTranslationBitextMining(AbsTaskBitextMining, CrosslingualTask):
    """Bitext mining task on the NusaTranslation parallel corpus.

    The class only declares ``metadata``; loading and evaluation come from the
    base classes. Every evaluated pair in ``_LANGUAGES`` has Indonesian
    ("ind-Latn") on one side.
    """

    metadata = TaskMetadata(
        name="NusaTranslationBitextMining",
        dataset={
            "path": "gentaiscool/bitext_nusatranslation_miners",
            "revision": "ba52e9d114a4a145d79b4293afab31304a999a4c",
        },
        # The configured pairs are Indonesian <-> 11 other languages; none of them
        # involves English, so the description must not advertise it.
        description=(
            "NusaTranslation is a parallel dataset for machine translation between "
            "Indonesian and 11 local languages of Indonesia."
        ),
        reference="https://huggingface.co/datasets/indonlp/nusatranslation_mt",
        type="BitextMining",
        category="s2s",
        # Only a "train" split is evaluated; the statistics below are keyed by it.
        eval_splits=["train"],
        eval_langs=_LANGUAGES,
        main_score="f1",
        date=("2021-08-01", "2022-07-01"),
        form=["written"],
        domains=["Social"],
        task_subtypes=[],
        license="CC BY-SA 4.0",
        socioeconomic_status="mixed",
        annotations_creators="human-annotated",
        dialect=[],
        text_creation="created",
        bibtex_citation="""
@inproceedings{cahyawijaya2023nusawrites,
title={NusaWrites: Constructing High-Quality Corpora for Underrepresented and Extremely Low-Resource Languages},
author={Cahyawijaya, Samuel and Lovenia, Holy and Koto, Fajri and Adhista, Dea and Dave, Emmanuel and Oktavianti, Sarah and Akbar, Salsabil and Lee, Jhonson and Shadieq, Nuur and Cenggoro, Tjeng Wawan and others},
booktitle={Proceedings of the 13th International Joint Conference on Natural Language Processing and the 3rd Conference of the Asia-Pacific Chapter of the Association for Computational Linguistics (Volume 1: Long Papers)},
pages={921--945},
year={2023}
}
""",
        n_samples={"train": 50200},
        avg_character_length={"train": 147.01},
    )
64 changes: 64 additions & 0 deletions mteb/tasks/BitextMining/multilingual/NusaXBitextMining.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,64 @@
from __future__ import annotations

from mteb.abstasks.TaskMetadata import TaskMetadata

from ....abstasks import AbsTaskBitextMining, CrosslingualTask

# Language pairs evaluated by the task: English ("eng-Latn") paired with one
# other language per dataset subset. Keys are "eng-<code>"; values are the two
# "<code>-Latn" tags for that pair.
_LANGUAGES = {
    f"eng-{code}": ["eng-Latn", f"{code}-Latn"]
    for code in (
        "ace",
        "ban",
        "bbc",
        "bjn",
        "bug",
        "ind",
        "jav",
        "mad",
        "min",
        "nij",
        "sun",
    )
}


class NusaXBitextMining(AbsTaskBitextMining, CrosslingualTask):
    """Bitext mining task on the NusaX parallel corpus.

    The class only declares ``metadata``; loading and evaluation come from the
    base classes. Every evaluated pair in ``_LANGUAGES`` has English
    ("eng-Latn") on one side.
    """

    metadata = TaskMetadata(
        # --- identity and data source ---
        name="NusaXBitextMining",
        description=(
            "NusaX is a parallel dataset for machine translation and sentiment analysis "
            "on 11 Indonesia languages and English."
        ),
        reference="https://huggingface.co/datasets/indonlp/NusaX-senti/",
        dataset=dict(
            path="gentaiscool/bitext_nusax_miners",
            revision="fba4f2cfe2592641056f7a274c9aa8453b27a4a8",
        ),
        # --- evaluation setup ---
        type="BitextMining",
        category="s2s",
        main_score="f1",
        eval_langs=_LANGUAGES,
        eval_splits=["train"],
        # --- descriptive annotations ---
        date=("2021-08-01", "2022-07-01"),
        domains=["Reviews"],
        form=["written"],
        dialect=[],
        task_subtypes=[],
        text_creation="created",
        annotations_creators="human-annotated",
        socioeconomic_status="mixed",
        license="CC BY-SA 4.0",
        # --- statistics, keyed by the evaluated split ---
        n_samples={"train": 5500},
        avg_character_length={"train": 157.15},
        # Two entries: the NusaX dataset paper and the MINERS paper.
        bibtex_citation="""
@inproceedings{winata2023nusax,
title={NusaX: Multilingual Parallel Sentiment Dataset for 10 Indonesian Local Languages},
author={Winata, Genta Indra and Aji, Alham Fikri and Cahyawijaya, Samuel and Mahendra, Rahmad and Koto, Fajri and Romadhony, Ade and Kurniawan, Kemal and Moeljadi, David and Prasojo, Radityo Eko and Fung, Pascale and others},
booktitle={Proceedings of the 17th Conference of the European Chapter of the Association for Computational Linguistics},
pages={815--834},
year={2023}
}
@misc{winata2024miners,
title={MINERS: Multilingual Language Models as Semantic Retrievers},
author={Genta Indra Winata and Ruochen Zhang and David Ifeoluwa Adelani},
year={2024},
eprint={2406.07424},
archivePrefix={arXiv},
primaryClass={cs.CL}
}
""",
    )
Original file line number Diff line number Diff line change
Expand Up @@ -28,15 +28,15 @@ class MTOPDomainClassification(MultilingualTask, AbsTaskClassification):
eval_splits=["validation", "test"],
eval_langs=_LANGUAGES,
main_score="accuracy",
date=None,
form=None,
domains=None,
task_subtypes=None,
license=None,
socioeconomic_status=None,
annotations_creators=None,
dialect=None,
text_creation=None,
date=("2020-01-01", "2020-12-31"),
form=["spoken"],
domains=["Spoken"],
task_subtypes=[],
license="Not specified",
socioeconomic_status="mixed",
annotations_creators="human-annotated",
dialect=[],
text_creation="created",
bibtex_citation="""@inproceedings{li-etal-2021-mtop,
title = "{MTOP}: A Comprehensive Multilingual Task-Oriented Semantic Parsing Benchmark",
author = "Li, Haoran and
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -28,15 +28,15 @@ class MTOPIntentClassification(MultilingualTask, AbsTaskClassification):
eval_splits=["validation", "test"],
eval_langs=_LANGUAGES,
main_score="accuracy",
date=None,
form=None,
domains=None,
task_subtypes=None,
license=None,
socioeconomic_status=None,
annotations_creators=None,
dialect=None,
text_creation=None,
date=("2020-01-01", "2020-12-31"),
form=["spoken"],
domains=["Spoken"],
task_subtypes=[],
license="Not specified",
socioeconomic_status="mixed",
annotations_creators="human-annotated",
dialect=[],
text_creation="created",
bibtex_citation="""@inproceedings{li-etal-2021-mtop,
title = "{MTOP}: A Comprehensive Multilingual Task-Oriented Semantic Parsing Benchmark",
author = "Li, Haoran and
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -74,15 +74,15 @@ class MassiveIntentClassification(MultilingualTask, AbsTaskClassification):
eval_splits=["validation", "test"],
eval_langs=_LANGUAGES,
main_score="accuracy",
date=None,
form=None,
domains=None,
task_subtypes=None,
license=None,
socioeconomic_status=None,
annotations_creators=None,
dialect=None,
text_creation=None,
date=("2022-01-01", "2022-04-22"),
form=["spoken"],
domains=["Spoken"],
task_subtypes=[],
license="Apache 2.0",
socioeconomic_status="mixed",
annotations_creators="human-annotated",
dialect=[],
text_creation="created",
bibtex_citation="""@misc{fitzgerald2022massive,
title={MASSIVE: A 1M-Example Multilingual Natural Language Understanding Dataset with 51 Typologically-Diverse Languages},
author={Jack FitzGerald and Christopher Hench and Charith Peris and Scott Mackie and Kay Rottmann and Ana Sanchez and Aaron Nash and Liam Urbach and Vishesh Kakarala and Richa Singh and Swetha Ranganath and Laurie Crist and Misha Britan and Wouter Leeuwis and Gokhan Tur and Prem Natarajan},
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -74,15 +74,15 @@ class MassiveScenarioClassification(MultilingualTask, AbsTaskClassification):
eval_splits=["validation", "test"],
eval_langs=_LANGUAGES,
main_score="accuracy",
date=None,
form=None,
domains=None,
task_subtypes=None,
license=None,
socioeconomic_status=None,
annotations_creators=None,
dialect=None,
text_creation=None,
date=("2022-01-01", "2022-04-22"),
form=["spoken"],
domains=["Spoken"],
task_subtypes=[],
license="Apache 2.0",
socioeconomic_status="mixed",
annotations_creators="human-annotated",
dialect=[],
text_creation="created",
bibtex_citation="""@misc{fitzgerald2022massive,
title={MASSIVE: A 1M-Example Multilingual Natural Language Understanding Dataset with 51 Typologically-Diverse Languages},
author={Jack FitzGerald and Christopher Hench and Charith Peris and Scott Mackie and Kay Rottmann and Ana Sanchez and Aaron Nash and Liam Urbach and Vishesh Kakarala and Richa Singh and Swetha Ranganath and Laurie Crist and Misha Britan and Wouter Leeuwis and Gokhan Tur and Prem Natarajan},
Expand Down
1 change: 1 addition & 0 deletions mteb/tasks/STS/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@
from .kor.KlueSTS import *
from .kor.KorSTS import *
from .multilingual.IndicCrosslingualSTS import *
from .multilingual.SemRel24STS import *
from .multilingual.STS17CrosslingualSTS import *
from .multilingual.STS22CrosslingualSTS import *
from .multilingual.STSBenchmarkMultilingualSTS import *
Expand Down
78 changes: 78 additions & 0 deletions mteb/tasks/STS/multilingual/SemRel24STS.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,78 @@
from __future__ import annotations

from mteb.abstasks import AbsTaskSTS, MultilingualTask, TaskMetadata

# Language subsets evaluated by the task. Each key is a three-letter subset name
# and each value is a single-element list holding its "<code>-<script>" tag.
_LANGUAGES = {
    code: [f"{code}-{script}"]
    for code, script in (
        ("afr", "Latn"),
        ("amh", "Ethi"),
        ("arb", "Arab"),
        ("arq", "Arab"),
        ("ary", "Arab"),
        ("eng", "Latn"),
        ("hau", "Latn"),
        ("hin", "Deva"),
        ("ind", "Latn"),
        ("kin", "Latn"),
        ("mar", "Deva"),
        ("tel", "Telu"),
    )
}

# Only the test split is evaluated.
_SPLITS = ["test"]


class SemRel24STS(AbsTaskSTS, MultilingualTask):
    """STS task over the SemRel2024 semantic textual relatedness datasets.

    ``metadata_dict`` declares the score range as 0 to 1, and
    ``dataset_transform`` renames each subset's ``label`` column to ``score``.
    """

    metadata = TaskMetadata(
        name="SemRel24STS",
        dataset={
            "path": "SemRel/SemRel2024",
            "revision": "ef5c383d1b87eb8feccde3dfb7f95e42b1b050dd",
        },
        # NOTE(review): the description speaks of 14 languages (the paper's title),
        # while _LANGUAGES configures 12 subsets for evaluation here.
        description=(
            "SemRel2024 is a collection of Semantic Textual Relatedness (STR) datasets for 14 languages, "
            "including African and Asian languages. The datasets are composed of sentence pairs, each assigned a "
            "relatedness score between 0 (completely unrelated) and 1 (maximally related) with a large range of "
            "expected relatedness values."
        ),
        reference="https://huggingface.co/datasets/SemRel/SemRel2024",
        type="STS",
        category="s2s",
        eval_splits=_SPLITS,
        eval_langs=_LANGUAGES,
        main_score="cosine_spearman",
        date=("2023-01-01", "2023-12-31"),
        form=["spoken", "written"],
        domains=[],
        task_subtypes=[],
        license="Not specified",
        socioeconomic_status="mixed",
        annotations_creators="human-annotated",
        dialect=[],
        text_creation="created",
        bibtex_citation="""@misc{ousidhoum2024semrel2024,
title={SemRel2024: A Collection of Semantic Textual Relatedness Datasets for 14 Languages},
author={Nedjma Ousidhoum and Shamsuddeen Hassan Muhammad and Mohamed Abdalla and Idris Abdulmumin and Ibrahim Said Ahmad and
Sanchit Ahuja and Alham Fikri Aji and Vladimir Araujo and Abinew Ali Ayele and Pavan Baswani and Meriem Beloucif and
Chris Biemann and Sofia Bourhim and Christine De Kock and Genet Shanko Dekebo and
Oumaima Hourrane and Gopichand Kanumolu and Lokesh Madasu and Samuel Rutunda and Manish Shrivastava and
Thamar Solorio and Nirmal Surange and Hailegnaw Getaneh Tilaye and Krishnapriya Vishnubhotla and Genta Winata and
Seid Muhie Yimam and Saif M. Mohammad},
year={2024},
eprint={2402.08638},
archivePrefix={arXiv},
primaryClass={cs.CL}
}
""",
        # NOTE(review): statistics are also given for "dev", although only "test"
        # is listed in eval_splits.
        n_samples={"dev": 2089, "test": 7498},
        avg_character_length={"dev": 163.1, "test": 145.9},
    )

    @property
    def metadata_dict(self) -> dict[str, object]:
        """Return the parent's metadata dict with the score bounds added.

        Returns:
            The dict obtained from ``super().metadata_dict`` with ``min_score``
            set to 0 and ``max_score`` set to 1. The values are integers, so the
            mapping is not annotated as ``dict[str, str]``.
        """
        metadata_dict = super().metadata_dict
        metadata_dict["min_score"] = 0
        metadata_dict["max_score"] = 1
        return metadata_dict

    def dataset_transform(self) -> None:
        """Rename the ``label`` column to ``score`` in every entry of ``self.dataset``."""
        # Keys are left untouched; only the value stored under each key is replaced.
        for lang, subset in self.dataset.items():
            self.dataset[lang] = subset.rename_column("label", "score")
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"

[project]
name = "mteb"
version = "1.12.29"
version = "1.12.32"
description = "Massive Text Embedding Benchmark"
readme = "README.md"
authors = [
Expand Down
Loading

0 comments on commit 7570f9c

Please sign in to comment.