-
Notifications
You must be signed in to change notification settings - Fork 307
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
fix: Add NusaParagraph Emotion Classification (#928)
* add NusaTranslationEmotionClassification * update name * add new task * add new task * Update mteb/tasks/Classification/multilingual/NusaParagraphEmotionClassification.py Co-authored-by: Kenneth Enevoldsen <kennethcenevoldsen@gmail.com> * add sizes * add point * add results * update desc --------- Co-authored-by: Kenneth Enevoldsen <kennethcenevoldsen@gmail.com>
- Loading branch information
1 parent
0f51819
commit 5e4ad44
Showing
6 changed files
with
1,326 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,2 @@ | ||
{"GitHub": "gentaiscool", "New dataset": 2} | ||
{"GitHub": "KennethEnevoldsen", "Review PR": 2} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
60 changes: 60 additions & 0 deletions
60
mteb/tasks/Classification/multilingual/NusaParagraphEmotionClassification.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,60 @@ | ||
from __future__ import annotations | ||
|
||
from mteb.abstasks.TaskMetadata import TaskMetadata | ||
|
||
from ....abstasks import AbsTaskClassification, MultilingualTask | ||
|
||
# Per-language evaluation configuration, passed to the task metadata as
# ``eval_langs``. Each key maps to a one-element list holding a
# "<language>-<script>" code; every entry here uses the Latin script.
# NOTE(review): keys are presumably the dataset's per-language subset names --
# confirm against the dataset card. "btk" is the only key whose code differs
# ("bbc-Latn"); this looks intentional (subset name vs. language code) -- verify.
_LANGUAGES = {
    "btk": ["bbc-Latn"],
    "bew": ["bew-Latn"],
    "bug": ["bug-Latn"],
    "jav": ["jav-Latn"],
    "mad": ["mad-Latn"],
    "mak": ["mak-Latn"],
    "min": ["min-Latn"],
    "mui": ["mui-Latn"],
    "rej": ["rej-Latn"],
    "sun": ["sun-Latn"],
}
|
||
|
||
class NusaParagraphEmotionClassification(MultilingualTask, AbsTaskClassification):
    """Multilingual emotion-classification task over the NusaParagraph dataset.

    As shown here, the class only declares ``metadata``; it defines no methods
    of its own, so loading and evaluation come from the two base classes.
    The languages evaluated are those listed in the module-level
    ``_LANGUAGES`` mapping, and scoring uses the ``test`` split with ``f1`` as
    the main score.
    """

    metadata = TaskMetadata(
        name="NusaParagraphEmotionClassification",
        # The dataset is pinned to an exact revision hash.
        dataset={
            "path": "gentaiscool/nusaparagraph_emot",
            "revision": "c61e8c3ee47d2dce296e9601195916b54c21d575",
        },
        description="NusaParagraphEmotionClassification is a multi-class emotion classification on 10 Indonesian languages from the NusaParagraph dataset.",
        reference="https://github.com/IndoNLP/nusa-writes",
        category="s2s",
        type="Classification",
        eval_splits=["test"],
        eval_langs=_LANGUAGES,
        main_score="f1",
        date=("2021-08-01", "2022-07-01"),
        form=["written"],
        domains=["Non-fiction", "Fiction"],
        task_subtypes=["Emotion classification"],
        license="Apache 2.0",
        socioeconomic_status="mixed",
        annotations_creators="human-annotated",
        dialect=[],
        text_creation="found",
        bibtex_citation="""
        @inproceedings{cahyawijaya-etal-2023-nusawrites,
            title = "NusaWrites: Constructing High-Quality Corpora for Underrepresented and Extremely Low-Resource Languages",
            author = "Cahyawijaya, Samuel and Lovenia, Holy and Koto, Fajri and Adhista, Dea and Dave, Emmanuel and Oktavianti, Sarah and Akbar, Salsabil and Lee, Jhonson and Shadieq, Nuur and Cenggoro, Tjeng Wawan and Linuwih, Hanung and Wilie, Bryan and Muridan, Galih and Winata, Genta and Moeljadi, David and Aji, Alham Fikri and Purwarianti, Ayu and Fung, Pascale",
            editor = "Park, Jong C. and Arase, Yuki and Hu, Baotian and Lu, Wei and Wijaya, Derry and Purwarianti, Ayu and Krisnadhi, Adila Alfa",
            booktitle = "Proceedings of the 13th International Joint Conference on Natural Language Processing and the 3rd Conference of the Asia-Pacific Chapter of the Association for Computational Linguistics (Volume 1: Long Papers)",
            month = nov,
            year = "2023",
            address = "Nusa Dua, Bali",
            publisher = "Association for Computational Linguistics",
            url = "https://aclanthology.org/2023.ijcnlp-main.60",
            pages = "921--945",
        }
        """,
        # Per-split statistics as recorded by the task author.
        n_samples={"train": 15516, "validation": 2948, "test": 6250},
        avg_character_length={"train": 740.24, "validation": 740.66, "test": 740.71},
    )
Oops, something went wrong.