Skip to content

Commit

Permalink
fix: Add NusaParagraph Topic Classification (#927)
Browse files Browse the repository at this point in the history
* add data

* update lang code

* update metadata

* update metadata

* update metadata

* update files

* Update mteb/tasks/Classification/multilingual/NusaParagraphTopicClassification.py

Co-authored-by: Kenneth Enevoldsen <kennethcenevoldsen@gmail.com>

* update avg char

* add points

* add results and fix lint

---------

Co-authored-by: Kenneth Enevoldsen <kennethcenevoldsen@gmail.com>
  • Loading branch information
gentaiscool and KennethEnevoldsen authored Jun 15, 2024
1 parent 2a31c8c commit e13f037
Show file tree
Hide file tree
Showing 5 changed files with 1,325 additions and 0 deletions.
2 changes: 2 additions & 0 deletions docs/mmteb/points/927.jsonl
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
{"GitHub": "gentaiscool", "New dataset": 18}
{"GitHub": "KennethEnevoldsen", "Review PR": 2}
1 change: 1 addition & 0 deletions mteb/tasks/Classification/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -86,6 +86,7 @@
from .multilingual.MultilingualSentimentClassification import *
from .multilingual.NaijaSenti import *
from .multilingual.NordicLangClassification import *
from .multilingual.NusaParagraphTopicClassification import *
from .multilingual.NusaXSenti import *
from .multilingual.ScalaClassification import *
from .multilingual.SIB200Classification import *
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,60 @@
from __future__ import annotations

from mteb.abstasks.TaskMetadata import TaskMetadata

from ....abstasks import AbsTaskClassification, MultilingualTask

# Mapping passed to ``TaskMetadata(eval_langs=...)`` below: each key maps to a
# one-element list holding a "<language code>-<script>" tag. Every tag uses
# the "Latn" script suffix, and the key equals the tag's language code for all
# entries except "btk", which is tagged "bbc-Latn".
# NOTE(review): keys presumably mirror the dataset's subset names — confirm
# against the dataset before renaming any of them.
_LANGUAGES = {
    subset: [tag]
    for subset, tag in (
        ("btk", "bbc-Latn"),
        ("bew", "bew-Latn"),
        ("bug", "bug-Latn"),
        ("jav", "jav-Latn"),
        ("mad", "mad-Latn"),
        ("mak", "mak-Latn"),
        ("min", "min-Latn"),
        ("mui", "mui-Latn"),
        ("rej", "rej-Latn"),
        ("sun", "sun-Latn"),
    )
}


class NusaParagraphTopicClassification(MultilingualTask, AbsTaskClassification):
    """Multilingual topic-classification task declared purely through metadata.

    The class adds no methods of its own; everything it contributes is the
    ``metadata`` record below, while behavior comes from the
    ``MultilingualTask`` and ``AbsTaskClassification`` bases.
    """

    metadata = TaskMetadata(
        # --- Identity and data source -------------------------------------
        name="NusaParagraphTopicClassification",
        description="NusaParagraphTopicClassification is a multi-class topic classification on 10 Indonesian languages.",
        reference="https://github.com/IndoNLP/nusa-writes",
        dataset={
            "path": "gentaiscool/nusaparagraph_topic",
            # Pinned revision so the referenced data cannot silently change.
            "revision": "abb43f8d5b9510b8724b48283aca26c4733eac5d",
        },
        # --- Evaluation setup ---------------------------------------------
        type="Classification",
        category="s2s",
        task_subtypes=["Topic classification"],
        eval_splits=["test"],
        eval_langs=_LANGUAGES,
        main_score="f1",
        # --- Provenance and annotation ------------------------------------
        date=("2021-08-01", "2022-07-01"),
        form=["written"],
        domains=["Non-fiction", "Fiction"],
        license="Apache 2.0",
        socioeconomic_status="mixed",
        annotations_creators="human-annotated",
        dialect=[],
        text_creation="found",
        # --- Dataset statistics (per split) -------------------------------
        n_samples={"train": 15516, "validation": 2948, "test": 6250},
        avg_character_length={"train": 740.24, "validation": 740.66, "test": 740.71},
        # --- Citation -----------------------------------------------------
        # The BibTeX body is kept flush-left so the string value stays
        # exactly as originally written.
        bibtex_citation="""
@inproceedings{cahyawijaya-etal-2023-nusawrites,
title = "NusaWrites: Constructing High-Quality Corpora for Underrepresented and Extremely Low-Resource Languages",
author = "Cahyawijaya, Samuel and Lovenia, Holy and Koto, Fajri and Adhista, Dea and Dave, Emmanuel and Oktavianti, Sarah and Akbar, Salsabil and Lee, Jhonson and Shadieq, Nuur and Cenggoro, Tjeng Wawan and Linuwih, Hanung and Wilie, Bryan and Muridan, Galih and Winata, Genta and Moeljadi, David and Aji, Alham Fikri and Purwarianti, Ayu and Fung, Pascale",
editor = "Park, Jong C. and Arase, Yuki and Hu, Baotian and Lu, Wei and Wijaya, Derry and Purwarianti, Ayu and Krisnadhi, Adila Alfa",
booktitle = "Proceedings of the 13th International Joint Conference on Natural Language Processing and the 3rd Conference of the Asia-Pacific Chapter of the Association for Computational Linguistics (Volume 1: Long Papers)",
month = nov,
year = "2023",
address = "Nusa Dua, Bali",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2023.ijcnlp-main.60",
pages = "921--945",
}
""",
    )
Loading

0 comments on commit e13f037

Please sign in to comment.