-
Notifications
You must be signed in to change notification settings - Fork 312
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
fix: Add NusaParagraph Topic Classification (#927)
* add data
* update lang code
* update metadata
* update metadata
* update metadata
* update files
* Update mteb/tasks/Classification/multilingual/NusaParagraphTopicClassification.py

  Co-authored-by: Kenneth Enevoldsen <kennethcenevoldsen@gmail.com>
* update avg char
* add points
* add results and fix lint

---------

Co-authored-by: Kenneth Enevoldsen <kennethcenevoldsen@gmail.com>
- Loading branch information
1 parent
2a31c8c
commit e13f037
Showing
5 changed files
with
1,325 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,2 @@ | ||
{"GitHub": "gentaiscool", "New dataset": 18} | ||
{"GitHub": "KennethEnevoldsen", "Review PR": 2} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
60 changes: 60 additions & 0 deletions
60
mteb/tasks/Classification/multilingual/NusaParagraphTopicClassification.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,60 @@ | ||
from __future__ import annotations | ||
|
||
from mteb.abstasks.TaskMetadata import TaskMetadata | ||
|
||
from ....abstasks import AbsTaskClassification, MultilingualTask | ||
|
||
# Maps each subset key to the list of "<language>-<script>" codes evaluated for it. | ||
# This mapping is passed as `eval_langs` in the task metadata below. | ||
# All ten entries use the Latin script ("Latn"). | ||
# NOTE(review): "btk" is the only key that differs from its code ("bbc"); | ||
# presumably the dataset names the subset "btk" while "bbc" is the ISO 639-3 | ||
# code for the same language -- confirm against the dataset card. | ||
_LANGUAGES = { | ||
"btk": ["bbc-Latn"], | ||
"bew": ["bew-Latn"], | ||
"bug": ["bug-Latn"], | ||
"jav": ["jav-Latn"], | ||
"mad": ["mad-Latn"], | ||
"mak": ["mak-Latn"], | ||
"min": ["min-Latn"], | ||
"mui": ["mui-Latn"], | ||
"rej": ["rej-Latn"], | ||
"sun": ["sun-Latn"], | ||
} | ||
|
||
|
||
# Multilingual topic-classification task over the NusaParagraph topic dataset. | ||
# The class only declares `metadata`; it defines no methods of its own, so all | ||
# behaviour comes from the MultilingualTask and AbsTaskClassification bases. | ||
class NusaParagraphTopicClassification(MultilingualTask, AbsTaskClassification): | ||
metadata = TaskMetadata( | ||
name="NusaParagraphTopicClassification", | ||
# Dataset identifier plus an explicit revision hash. | ||
# NOTE(review): presumably a Hugging Face Hub dataset id -- confirm. | ||
dataset={ | ||
"path": "gentaiscool/nusaparagraph_topic", | ||
"revision": "abb43f8d5b9510b8724b48283aca26c4733eac5d", | ||
}, | ||
description="NusaParagraphTopicClassification is a multi-class topic classification on 10 Indonesian languages.", | ||
reference="https://github.com/IndoNLP/nusa-writes", | ||
category="s2s", | ||
type="Classification", | ||
# Only the "test" split is evaluated, although `n_samples` below also | ||
# records sizes for "train" and "validation". | ||
eval_splits=["test"], | ||
# Per-subset language codes come from the module-level _LANGUAGES mapping. | ||
eval_langs=_LANGUAGES, | ||
main_score="f1", | ||
date=("2021-08-01", "2022-07-01"), | ||
form=["written"], | ||
domains=["Non-fiction", "Fiction"], | ||
task_subtypes=["Topic classification"], | ||
license="Apache 2.0", | ||
socioeconomic_status="mixed", | ||
annotations_creators="human-annotated", | ||
dialect=[], | ||
text_creation="found", | ||
# BibTeX entry for the NusaWrites paper (2023) that the citation points to. | ||
bibtex_citation=""" | ||
@inproceedings{cahyawijaya-etal-2023-nusawrites, | ||
title = "NusaWrites: Constructing High-Quality Corpora for Underrepresented and Extremely Low-Resource Languages", | ||
author = "Cahyawijaya, Samuel and Lovenia, Holy and Koto, Fajri and Adhista, Dea and Dave, Emmanuel and Oktavianti, Sarah and Akbar, Salsabil and Lee, Jhonson and Shadieq, Nuur and Cenggoro, Tjeng Wawan and Linuwih, Hanung and Wilie, Bryan and Muridan, Galih and Winata, Genta and Moeljadi, David and Aji, Alham Fikri and Purwarianti, Ayu and Fung, Pascale", | ||
editor = "Park, Jong C. and Arase, Yuki and Hu, Baotian and Lu, Wei and Wijaya, Derry and Purwarianti, Ayu and Krisnadhi, Adila Alfa", | ||
booktitle = "Proceedings of the 13th International Joint Conference on Natural Language Processing and the 3rd Conference of the Asia-Pacific Chapter of the Association for Computational Linguistics (Volume 1: Long Papers)", | ||
month = nov, | ||
year = "2023", | ||
address = "Nusa Dua, Bali", | ||
publisher = "Association for Computational Linguistics", | ||
url = "https://aclanthology.org/2023.ijcnlp-main.60", | ||
pages = "921--945", | ||
} | ||
""", | ||
# Per-split statistics keyed by split name. | ||
# NOTE(review): presumably totals/averages across all ten language subsets -- confirm. | ||
n_samples={"train": 15516, "validation": 2948, "test": 6250}, | ||
avg_character_length={"train": 740.24, "validation": 740.66, "test": 740.71}, | ||
) |
Oops, something went wrong.