Skip to content

Commit

Permalink
UkrFormalityClassification added (#574)
Browse files Browse the repository at this point in the history
* UkrFormalityClassification added

* cleanup UkrFormalityClassification

* points added

* Update 574.jsonl

* UkrFormalityClassification transform fix

* Update 574.jsonl
  • Loading branch information
dokato authored Apr 26, 2024
1 parent ca2f340 commit bc833f5
Show file tree
Hide file tree
Showing 6 changed files with 109 additions and 0 deletions.
3 changes: 3 additions & 0 deletions docs/mmteb/points/574.jsonl
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
{"GitHub": "dokato", "New dataset": 6}
{"GitHub": "imenelydiaker", "Review PR": 2}
{"GitHub": "isaac-chung", "Review PR": 2}
1 change: 1 addition & 0 deletions mteb/tasks/Classification/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -81,6 +81,7 @@
from .tur.TurkishMovieSentimentClassification import *
from .tur.TurkishProductSentimentClassification import *
from .uig.UyghurSentimentClassification import *
from .ukr.UkrFormalityClassification import *
from .urd.UrduRomanSentimentClassification import *
from .vie.VieStudentFeedbackClassification import *
from .zho.CMTEBClassification import *
Expand Down
55 changes: 55 additions & 0 deletions mteb/tasks/Classification/ukr/UkrFormalityClassification.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,55 @@
from __future__ import annotations

from mteb.abstasks import AbsTaskClassification
from mteb.abstasks.TaskMetadata import TaskMetadata


class UkrFormalityClassification(AbsTaskClassification):
    """Binary formality classification task for Ukrainian text.

    The task uses the ``ukr-detect/ukr-formality-dataset-translated-gyafc``
    dataset, pinned to a fixed revision. According to the task description,
    the data is a machine translation of the English GYAFC corpus and the
    labels are ``0`` (informal) and ``1`` (formal).
    """

    metadata = TaskMetadata(
        name="UkrFormalityClassification",
        description="""
        This dataset contains Ukrainian Formality Classification dataset obtained by
        translating English GYAFC data.
        English data source: https://aclanthology.org/N18-1012/
        Translation into Ukrainian language using model: https://huggingface.co/facebook/nllb-200-distilled-600M
        Additionally, the dataset was balanced, with labels: 0 - informal, 1 - formal.
        """,
        dataset={
            "path": "ukr-detect/ukr-formality-dataset-translated-gyafc",
            # Pinned revision so that results stay reproducible.
            "revision": "671d1e6bbf45a74ef21af351fd4ef7b32b7856f8",
        },
        reference="https://huggingface.co/datasets/ukr-detect/ukr-formality-dataset-translated-gyafc",
        type="Classification",
        category="s2s",
        eval_splits=["train", "test"],
        eval_langs=["ukr-Cyrl"],
        main_score="accuracy",
        date=("2018-04-11", "2018-06-20"),
        form=["written"],
        # NOTE(review): "News" / "Topic classification" look like a loose fit
        # for a formality task -- confirm against the values TaskMetadata allows.
        domains=["News"],
        task_subtypes=["Topic classification"],
        license="openrail++",
        socioeconomic_status="mixed",
        annotations_creators="derived",
        dialect=[],
        text_creation="machine-translated",
        bibtex_citation="""@inproceedings{rao-tetreault-2018-dear,
        title = "Dear Sir or Madam, May {I} Introduce the {GYAFC} Dataset: Corpus, Benchmarks and Metrics for Formality Style Transfer",
        author = "Rao, Sudha and
        Tetreault, Joel",
        booktitle = "Proceedings of the 2018 Conference of the North {A}merican Chapter of the Association for Computational Linguistics: Human Language Technologies, Volume 1 (Long Papers)",
        month = jun,
        year = "2018",
        publisher = "Association for Computational Linguistics",
        url = "https://aclanthology.org/N18-1012",
        }""",
        n_samples={"train": 2048, "test": 2048},
        avg_character_length={"train": 52.10, "test": 53.07},
    )

    def dataset_transform(self):
        """Normalize the label column and subsample the evaluated splits.

        Replaces ``self.dataset`` in place; returns nothing.
        """
        # The source dataset names its target column "labels"; rename it to
        # "label" (presumably the name the base task expects -- confirm).
        self.dataset = self.dataset.rename_column("labels", "label")
        # Convert the label column to a class-encoded feature before sampling.
        self.dataset = self.dataset.class_encode_column("label")
        # Stratified subsample of both splits, seeded by the task's seed.
        # Presumably this yields the 2048 samples per split listed in
        # ``n_samples`` -- confirm against the helper's default.
        self.dataset = self.stratified_subsampling(
            self.dataset, seed=self.seed, splits=["train", "test"]
        )
Empty file.
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
{
"dataset_revision": "671d1e6bbf45a74ef21af351fd4ef7b32b7856f8",
"mteb_dataset_name": "UkrFormalityClassification",
"mteb_version": "1.6.37",
"test": {
"accuracy": 0.531884765625,
"accuracy_stderr": 0.042682624460272475,
"ap": 0.4519191811919496,
"ap_stderr": 0.023030098181090915,
"evaluation_time": 6.41,
"f1": 0.5232694350569594,
"f1_stderr": 0.04149209518186783,
"main_score": 0.531884765625
},
"train": {
"accuracy": 0.53310546875,
"accuracy_stderr": 0.03355227727082908,
"ap": 0.5188855107501925,
"ap_stderr": 0.018712669478063233,
"evaluation_time": 11.28,
"f1": 0.5264433232444724,
"f1_stderr": 0.03228714969033335,
"main_score": 0.53310546875
}
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
{
"dataset_revision": "671d1e6bbf45a74ef21af351fd4ef7b32b7856f8",
"mteb_dataset_name": "UkrFormalityClassification",
"mteb_version": "1.6.37",
"test": {
"accuracy": 0.528076171875,
"accuracy_stderr": 0.030777409502741826,
"ap": 0.4423356000929221,
"ap_stderr": 0.010210226392447973,
"evaluation_time": 5.95,
"f1": 0.5084647911438317,
"f1_stderr": 0.022028997054790573,
"main_score": 0.528076171875
},
"train": {
"accuracy": 0.515625,
"accuracy_stderr": 0.017610646998202893,
"ap": 0.5087213333014945,
"ap_stderr": 0.009644894062476045,
"evaluation_time": 10.27,
"f1": 0.5019099055689374,
"f1_stderr": 0.014889799683856533,
"main_score": 0.515625
}
}

0 comments on commit bc833f5

Please sign in to comment.