diff --git a/docs/mmteb/points/574.jsonl b/docs/mmteb/points/574.jsonl
new file mode 100644
index 0000000000..f05a65589d
--- /dev/null
+++ b/docs/mmteb/points/574.jsonl
@@ -0,0 +1,3 @@
+{"GitHub": "dokato", "New dataset": 6}
+{"GitHub": "imenelydiaker", "Review PR": 2}
+{"GitHub": "isaac-chung", "Review PR": 2}
diff --git a/mteb/tasks/Classification/__init__.py b/mteb/tasks/Classification/__init__.py
index 9cb78698dd..351263cf3e 100644
--- a/mteb/tasks/Classification/__init__.py
+++ b/mteb/tasks/Classification/__init__.py
@@ -81,6 +81,7 @@
 from .tur.TurkishMovieSentimentClassification import *
 from .tur.TurkishProductSentimentClassification import *
 from .uig.UyghurSentimentClassification import *
+from .ukr.UkrFormalityClassification import *
 from .urd.UrduRomanSentimentClassification import *
 from .vie.VieStudentFeedbackClassification import *
 from .zho.CMTEBClassification import *
diff --git a/mteb/tasks/Classification/ukr/UkrFormalityClassification.py b/mteb/tasks/Classification/ukr/UkrFormalityClassification.py
new file mode 100644
index 0000000000..a230905be5
--- /dev/null
+++ b/mteb/tasks/Classification/ukr/UkrFormalityClassification.py
@@ -0,0 +1,56 @@
+from __future__ import annotations
+
+from mteb.abstasks import AbsTaskClassification
+from mteb.abstasks.TaskMetadata import TaskMetadata
+
+
+class UkrFormalityClassification(AbsTaskClassification):
+    metadata = TaskMetadata(
+        name="UkrFormalityClassification",
+        description="""
+        This dataset contains Ukrainian Formality Classification dataset obtained by
+        translating English GYAFC data.
+        English data source: https://aclanthology.org/N18-1012/
+        Translation into Ukrainian language using model: https://huggingface.co/facebook/nllb-200-distilled-600M
+        Additionally, the dataset was balanced, with labels: 0 - informal, 1 - formal.
+        """,
+        dataset={
+            "path": "ukr-detect/ukr-formality-dataset-translated-gyafc",
+            "revision": "671d1e6bbf45a74ef21af351fd4ef7b32b7856f8",
+        },
+        reference="https://huggingface.co/datasets/ukr-detect/ukr-formality-dataset-translated-gyafc",
+        type="Classification",
+        category="s2s",
+        eval_splits=["train", "test"],
+        eval_langs=["ukr-Cyrl"],
+        main_score="accuracy",
+        date=("2018-04-11", "2018-06-20"),
+        form=["written"],
+        domains=["News"],
+        task_subtypes=["Topic classification"],
+        license="openrail++",
+        socioeconomic_status="mixed",
+        annotations_creators="derived",
+        dialect=[],
+        text_creation="machine-translated",
+        bibtex_citation="""@inproceedings{rao-tetreault-2018-dear,
+        title = "Dear Sir or Madam, May {I} Introduce the {GYAFC} Dataset: Corpus, Benchmarks and Metrics for Formality Style Transfer",
+        author = "Rao, Sudha and
+        Tetreault, Joel",
+        booktitle = "Proceedings of the 2018 Conference of the North {A}merican Chapter of the Association for Computational Linguistics: Human Language Technologies, Volume 1 (Long Papers)",
+        month = jun,
+        year = "2018",
+        publisher = "Association for Computational Linguistics",
+        url = "https://aclanthology.org/N18-1012",
+        }""",
+        n_samples={"train": 2048, "test": 2048},
+        avg_character_length={"train": 52.10, "test": 53.07},
+    )
+
+    def dataset_transform(self):
+        """Rename ``labels`` to ``label``, class-encode it, subsample both splits."""
+        self.dataset = self.dataset.rename_column("labels", "label")
+        self.dataset = self.dataset.class_encode_column("label")
+        self.dataset = self.stratified_subsampling(
+            self.dataset, seed=self.seed, splits=["train", "test"]
+        )
diff --git a/mteb/tasks/Classification/ukr/__init__.py b/mteb/tasks/Classification/ukr/__init__.py
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/results/intfloat__multilingual-e5-small/UkrFormalityClassification.json b/results/intfloat__multilingual-e5-small/UkrFormalityClassification.json
new file mode 100644
index 0000000000..0ed3243139
--- /dev/null
+++ b/results/intfloat__multilingual-e5-small/UkrFormalityClassification.json
@@ -0,0 +1,25 @@
+{
+  "dataset_revision": "671d1e6bbf45a74ef21af351fd4ef7b32b7856f8",
+  "mteb_dataset_name": "UkrFormalityClassification",
+  "mteb_version": "1.6.37",
+  "test": {
+    "accuracy": 0.531884765625,
+    "accuracy_stderr": 0.042682624460272475,
+    "ap": 0.4519191811919496,
+    "ap_stderr": 0.023030098181090915,
+    "evaluation_time": 6.41,
+    "f1": 0.5232694350569594,
+    "f1_stderr": 0.04149209518186783,
+    "main_score": 0.531884765625
+  },
+  "train": {
+    "accuracy": 0.53310546875,
+    "accuracy_stderr": 0.03355227727082908,
+    "ap": 0.5188855107501925,
+    "ap_stderr": 0.018712669478063233,
+    "evaluation_time": 11.28,
+    "f1": 0.5264433232444724,
+    "f1_stderr": 0.03228714969033335,
+    "main_score": 0.53310546875
+  }
+}
\ No newline at end of file
diff --git a/results/sentence-transformers__paraphrase-multilingual-MiniLM-L12-v2/UkrFormalityClassification.json b/results/sentence-transformers__paraphrase-multilingual-MiniLM-L12-v2/UkrFormalityClassification.json
new file mode 100644
index 0000000000..e1b137e0d0
--- /dev/null
+++ b/results/sentence-transformers__paraphrase-multilingual-MiniLM-L12-v2/UkrFormalityClassification.json
@@ -0,0 +1,25 @@
+{
+  "dataset_revision": "671d1e6bbf45a74ef21af351fd4ef7b32b7856f8",
+  "mteb_dataset_name": "UkrFormalityClassification",
+  "mteb_version": "1.6.37",
+  "test": {
+    "accuracy": 0.528076171875,
+    "accuracy_stderr": 0.030777409502741826,
+    "ap": 0.4423356000929221,
+    "ap_stderr": 0.010210226392447973,
+    "evaluation_time": 5.95,
+    "f1": 0.5084647911438317,
+    "f1_stderr": 0.022028997054790573,
+    "main_score": 0.528076171875
+  },
+  "train": {
+    "accuracy": 0.515625,
+    "accuracy_stderr": 0.017610646998202893,
+    "ap": 0.5087213333014945,
+    "ap_stderr": 0.009644894062476045,
+    "evaluation_time": 10.27,
+    "f1": 0.5019099055689374,
+    "f1_stderr": 0.014889799683856533,
+    "main_score": 0.515625
+  }
+}
\ No newline at end of file