-
Notifications
You must be signed in to change notification settings - Fork 307
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge branches 'update_mteb_meta_cli' and 'main' of https://github.co…
- Loading branch information
Showing
26 changed files
with
1,828 additions
and
89 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,2 @@ | ||
{"GitHub": "gentaiscool", "New dataset": 26} | ||
{"GitHub": "KennethEnevoldsen", "Review PR": 2} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,2 @@ | ||
{"GitHub": "akshita-sukhlecha", "New dataset": 34} | ||
{"GitHub": "KennethEnevoldsen", "Review PR": 2} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,2 @@ | ||
{"GitHub": "gentaiscool", "New dataset": 38} | ||
{"GitHub": "KennethEnevoldsen", "Review PR": 2} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,5 @@ | ||
{"GitHub": "MariyaTikhonova", "Dataset annotations": 1} | ||
{"GitHub": "anpalmak2003", "Dataset annotations": 1} | ||
{"GitHub": "ab1992ao", "Dataset annotations": 1} | ||
{"GitHub": "Alenush", "Dataset annotations": 1} | ||
{"GitHub": "KennethEnevoldsen", "Review PR": 2} |
Large diffs are not rendered by default.
Oops, something went wrong.
Large diffs are not rendered by default.
Oops, something went wrong.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
57 changes: 57 additions & 0 deletions
57
mteb/tasks/BitextMining/multilingual/NusaTranslationBitextMining.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,57 @@ | ||
from __future__ import annotations | ||
|
||
from mteb.abstasks.TaskMetadata import TaskMetadata | ||
|
||
from ....abstasks import AbsTaskBitextMining, CrosslingualTask | ||
|
||
# Maps each language-pair key to its two language-script codes. The first
# entry of every pair is Indonesian ("ind-Latn"); the second is the paired
# language.
# NOTE(review): the "ind-btk" key is paired with "bbc-Latn", not "btk-Latn" --
# presumably the dataset names this subset "btk" while the language code used
# is "bbc"; confirm against the dataset card.
_LANGUAGES = {
    "ind-abs": ["ind-Latn", "abs-Latn"],
    "ind-btk": ["ind-Latn", "bbc-Latn"],
    "ind-bew": ["ind-Latn", "bew-Latn"],
    "ind-bhp": ["ind-Latn", "bhp-Latn"],
    "ind-jav": ["ind-Latn", "jav-Latn"],
    "ind-mad": ["ind-Latn", "mad-Latn"],
    "ind-mak": ["ind-Latn", "mak-Latn"],
    "ind-min": ["ind-Latn", "min-Latn"],
    "ind-mui": ["ind-Latn", "mui-Latn"],
    "ind-rej": ["ind-Latn", "rej-Latn"],
    "ind-sun": ["ind-Latn", "sun-Latn"],
}
|
||
|
||
class NusaTranslationBitextMining(AbsTaskBitextMining, CrosslingualTask):
    """Bitext-mining task over the NusaTranslation language pairs.

    The class only declares task metadata; the language pairs come from the
    module-level ``_LANGUAGES`` mapping and evaluation runs on the ``train``
    split with ``f1`` as the main score.
    """

    metadata = TaskMetadata(
        name="NusaTranslationBitextMining",
        dataset={
            "path": "gentaiscool/bitext_nusatranslation_miners",
            "revision": "ba52e9d114a4a145d79b4293afab31304a999a4c",
        },
        # NOTE(review): the description mentions English, but no pair in
        # _LANGUAGES involves English (all are Indonesian paired with another
        # language) -- confirm the wording against the dataset card.
        description="NusaTranslation is a parallel dataset for machine translation on 11 Indonesia languages and English.",
        reference="https://huggingface.co/datasets/indonlp/nusatranslation_mt",
        type="BitextMining",
        category="s2s",
        eval_splits=["train"],
        eval_langs=_LANGUAGES,
        main_score="f1",
        date=("2021-08-01", "2022-07-01"),
        form=["written"],
        domains=["Social"],
        task_subtypes=[],
        license="CC BY-SA 4.0",
        socioeconomic_status="mixed",
        annotations_creators="human-annotated",
        dialect=[],
        text_creation="created",
        bibtex_citation="""
        @inproceedings{cahyawijaya2023nusawrites,
            title={NusaWrites: Constructing High-Quality Corpora for Underrepresented and Extremely Low-Resource Languages},
            author={Cahyawijaya, Samuel and Lovenia, Holy and Koto, Fajri and Adhista, Dea and Dave, Emmanuel and Oktavianti, Sarah and Akbar, Salsabil and Lee, Jhonson and Shadieq, Nuur and Cenggoro, Tjeng Wawan and others},
            booktitle={Proceedings of the 13th International Joint Conference on Natural Language Processing and the 3rd Conference of the Asia-Pacific Chapter of the Association for Computational Linguistics (Volume 1: Long Papers)},
            pages={921--945},
            year={2023}
        }
        """,
        n_samples={"train": 50200},
        avg_character_length={"train": 147.01},
    )
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,64 @@ | ||
from __future__ import annotations | ||
|
||
from mteb.abstasks.TaskMetadata import TaskMetadata | ||
|
||
from ....abstasks import AbsTaskBitextMining, CrosslingualTask | ||
|
||
# Maps each language-pair key to its two language-script codes. The first
# entry of every pair is English ("eng-Latn"); the second is the paired
# language, whose code matches the suffix of the key.
_LANGUAGES = {
    "eng-ace": ["eng-Latn", "ace-Latn"],
    "eng-ban": ["eng-Latn", "ban-Latn"],
    "eng-bbc": ["eng-Latn", "bbc-Latn"],
    "eng-bjn": ["eng-Latn", "bjn-Latn"],
    "eng-bug": ["eng-Latn", "bug-Latn"],
    "eng-ind": ["eng-Latn", "ind-Latn"],
    "eng-jav": ["eng-Latn", "jav-Latn"],
    "eng-mad": ["eng-Latn", "mad-Latn"],
    "eng-min": ["eng-Latn", "min-Latn"],
    "eng-nij": ["eng-Latn", "nij-Latn"],
    "eng-sun": ["eng-Latn", "sun-Latn"],
}
|
||
|
||
class NusaXBitextMining(AbsTaskBitextMining, CrosslingualTask):
    """Bitext-mining task over the NusaX English-to-X language pairs.

    The class only declares task metadata; the language pairs come from the
    module-level ``_LANGUAGES`` mapping and evaluation runs on the ``train``
    split with ``f1`` as the main score.
    """

    metadata = TaskMetadata(
        name="NusaXBitextMining",
        dataset={
            "path": "gentaiscool/bitext_nusax_miners",
            "revision": "fba4f2cfe2592641056f7a274c9aa8453b27a4a8",
        },
        description="NusaX is a parallel dataset for machine translation and sentiment analysis on 11 Indonesia languages and English.",
        reference="https://huggingface.co/datasets/indonlp/NusaX-senti/",
        type="BitextMining",
        category="s2s",
        eval_splits=["train"],
        eval_langs=_LANGUAGES,
        main_score="f1",
        date=("2021-08-01", "2022-07-01"),
        form=["written"],
        domains=["Reviews"],
        task_subtypes=[],
        license="CC BY-SA 4.0",
        socioeconomic_status="mixed",
        annotations_creators="human-annotated",
        dialect=[],
        text_creation="created",
        # Two entries: the NusaX dataset paper and the MINERS paper.
        bibtex_citation="""
        @inproceedings{winata2023nusax,
            title={NusaX: Multilingual Parallel Sentiment Dataset for 10 Indonesian Local Languages},
            author={Winata, Genta Indra and Aji, Alham Fikri and Cahyawijaya, Samuel and Mahendra, Rahmad and Koto, Fajri and Romadhony, Ade and Kurniawan, Kemal and Moeljadi, David and Prasojo, Radityo Eko and Fung, Pascale and others},
            booktitle={Proceedings of the 17th Conference of the European Chapter of the Association for Computational Linguistics},
            pages={815--834},
            year={2023}
        }
        @misc{winata2024miners,
            title={MINERS: Multilingual Language Models as Semantic Retrievers},
            author={Genta Indra Winata and Ruochen Zhang and David Ifeoluwa Adelani},
            year={2024},
            eprint={2406.07424},
            archivePrefix={arXiv},
            primaryClass={cs.CL}
        }
        """,
        n_samples={"train": 5500},
        avg_character_length={"train": 157.15},
    )
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,78 @@ | ||
from __future__ import annotations | ||
|
||
from mteb.abstasks import AbsTaskSTS, MultilingualTask, TaskMetadata | ||
|
||
# Maps each language key to a single-element list holding its
# language-script code.
_LANGUAGES = {
    "afr": ["afr-Latn"],
    "amh": ["amh-Ethi"],
    "arb": ["arb-Arab"],
    "arq": ["arq-Arab"],
    "ary": ["ary-Arab"],
    "eng": ["eng-Latn"],
    "hau": ["hau-Latn"],
    "hin": ["hin-Deva"],
    "ind": ["ind-Latn"],
    "kin": ["kin-Latn"],
    "mar": ["mar-Deva"],
    "tel": ["tel-Telu"],
}

# Dataset splits used for evaluation.
_SPLITS = ["test"]
|
||
|
||
class SemRel24STS(AbsTaskSTS, MultilingualTask):
    """Semantic textual relatedness task built on the SemRel2024 dataset.

    Declares the task metadata, reports a 0-1 score range through
    ``metadata_dict``, and renames the dataset's ``label`` column to
    ``score`` in ``dataset_transform``.
    """

    metadata = TaskMetadata(
        name="SemRel24STS",
        dataset={
            "path": "SemRel/SemRel2024",
            "revision": "ef5c383d1b87eb8feccde3dfb7f95e42b1b050dd",
        },
        description=(
            "SemRel2024 is a collection of Semantic Textual Relatedness (STR) datasets for 14 languages, "
            "including African and Asian languages. The datasets are composed of sentence pairs, each assigned a "
            "relatedness score between 0 (completely unrelated) and 1 (maximally related) with a large range of "
            "expected relatedness values."
        ),
        reference="https://huggingface.co/datasets/SemRel/SemRel2024",
        type="STS",
        category="s2s",
        eval_splits=_SPLITS,
        eval_langs=_LANGUAGES,
        main_score="cosine_spearman",
        date=("2023-01-01", "2023-12-31"),
        form=["spoken", "written"],
        domains=[],
        task_subtypes=[],
        license="Not specified",
        socioeconomic_status="mixed",
        annotations_creators="human-annotated",
        dialect=[],
        text_creation="created",
        bibtex_citation="""@misc{ousidhoum2024semrel2024,
        title={SemRel2024: A Collection of Semantic Textual Relatedness Datasets for 14 Languages},
        author={Nedjma Ousidhoum and Shamsuddeen Hassan Muhammad and Mohamed Abdalla and Idris Abdulmumin and Ibrahim Said Ahmad and
        Sanchit Ahuja and Alham Fikri Aji and Vladimir Araujo and Abinew Ali Ayele and Pavan Baswani and Meriem Beloucif and
        Chris Biemann and Sofia Bourhim and Christine De Kock and Genet Shanko Dekebo and
        Oumaima Hourrane and Gopichand Kanumolu and Lokesh Madasu and Samuel Rutunda and Manish Shrivastava and
        Thamar Solorio and Nirmal Surange and Hailegnaw Getaneh Tilaye and Krishnapriya Vishnubhotla and Genta Winata and
        Seid Muhie Yimam and Saif M. Mohammad},
        year={2024},
        eprint={2402.08638},
        archivePrefix={arXiv},
        primaryClass={cs.CL}
        }
        """,
        # NOTE(review): statistics are listed for a "dev" split as well, but
        # only the splits in _SPLITS are evaluated -- confirm this is intended.
        n_samples={"dev": 2089, "test": 7498},
        avg_character_length={"dev": 163.1, "test": 145.9},
    )

    @property
    def metadata_dict(self) -> dict[str, str]:
        """Return the parent's metadata dict extended with the score range.

        Adds ``min_score`` = 0 and ``max_score`` = 1 to the dict obtained
        from ``super().metadata_dict`` and returns that same dict.
        """
        # NOTE(review): the annotation says dict[str, str], yet the two values
        # stored here are ints; left unchanged to match the declared signature.
        metadata_dict = super().metadata_dict
        metadata_dict["min_score"] = 0
        metadata_dict["max_score"] = 1
        return metadata_dict

    def dataset_transform(self) -> None:
        """Rename the ``label`` column to ``score`` in every language subset.

        Each entry of ``self.dataset`` is replaced by the result of its
        ``rename_column("label", "score")`` call.
        """
        # Only values of existing keys are reassigned, so the mapping's key
        # set does not change while it is being iterated.
        for lang, subset in self.dataset.items():
            self.dataset[lang] = subset.rename_column("label", "score")
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.