From 4ed19ce4e48d01ef3f8b4d27f20c8b8a6fed45d9 Mon Sep 17 00:00:00 2001 From: Akash Kundu <112017800+Akash190104@users.noreply.github.com> Date: Wed, 24 Apr 2024 12:41:52 +0530 Subject: [PATCH] fix: Added Hindi sentiment analysis dataset (#491) * Added Hindi sentiment analysis dataset * Made changes based on comments and added points on points table * linted correctly * use stratified_subsampling --------- Co-authored-by: Isaac Chung --- docs/mmteb/points/491.jsonl | 2 + mteb/tasks/Classification/__init__.py | 1 + .../hin/SentimentAnalysisHindi.py | 44 +++++++++++++++++++ .../SentimentAnalysisHindi.json | 13 ++++++ .../SentimentAnalysisHindi.json | 13 ++++++ 5 files changed, 73 insertions(+) create mode 100644 docs/mmteb/points/491.jsonl create mode 100644 mteb/tasks/Classification/hin/SentimentAnalysisHindi.py create mode 100644 results/intfloat__multilingual-e5-small/SentimentAnalysisHindi.json create mode 100644 results/sentence-transformers__paraphrase-multilingual-MiniLM-L12-v2/SentimentAnalysisHindi.json diff --git a/docs/mmteb/points/491.jsonl b/docs/mmteb/points/491.jsonl new file mode 100644 index 0000000000..fb2107aca5 --- /dev/null +++ b/docs/mmteb/points/491.jsonl @@ -0,0 +1,2 @@ +{"GitHub": "Akash190104", "New dataset": 2} +{"GitHub": "isaac-chung", "Review PR": 2} diff --git a/mteb/tasks/Classification/__init__.py b/mteb/tasks/Classification/__init__.py index 55c827de01..ee09c81bce 100644 --- a/mteb/tasks/Classification/__init__.py +++ b/mteb/tasks/Classification/__init__.py @@ -31,6 +31,7 @@ from .fra.MovieReviewSentimentClassification import * from .guj.GujaratiNewsClassification import * from .hin.HindiDiscourseClassification import * +from .hin.SentimentAnalysisHindi import * from .hrv.CroatianSentimentClassification import * from .ind.IndonesianIdClickbaitClassification import * from .ita.ItaHateClassification import * diff --git a/mteb/tasks/Classification/hin/SentimentAnalysisHindi.py b/mteb/tasks/Classification/hin/SentimentAnalysisHindi.py new file mode 100644 index 0000000000..6ee5376e09 --- /dev/null +++ b/mteb/tasks/Classification/hin/SentimentAnalysisHindi.py @@ -0,0 +1,44 @@ +from __future__ import annotations + +from mteb.abstasks.AbsTaskClassification import AbsTaskClassification +from mteb.abstasks.TaskMetadata import TaskMetadata + + +class SentimentAnalysisHindi(AbsTaskClassification): + metadata = TaskMetadata( + name="SentimentAnalysisHindi", + description="Hindi Sentiment Analysis Dataset", + reference="https://huggingface.co/datasets/OdiaGenAI/sentiment_analysis_hindi", + dataset={ + "path": "OdiaGenAI/sentiment_analysis_hindi", + "revision": "1beac1b941da76a9c51e3e5b39d230fde9a80983", + }, + type="Classification", + category="s2s", + eval_splits=["train"], + eval_langs=["hin-Deva"], + main_score="f1", + date=("2023-09-15", "2023-10-16"), + form=["written"], + dialect=[], + domains=["Reviews"], + task_subtypes=["Sentiment/Hate speech"], + license="CC BY-NC-SA 4.0", + socioeconomic_status="mixed", + annotations_creators="derived", + text_creation="found", + bibtex_citation="""@misc{OdiaGenAI, + author = {Shantipriya Parida and Sambit Sekhar and Soumendra Kumar Sahoo and Swateek Jena and Abhijeet Parida and Satya Ranjan Dash and Guneet Singh Kohli}, + title = {OdiaGenAI: Generative AI and LLM Initiative for the Odia Language}, + year = {2023}, + publisher = {Hugging Face}, + journal = {Hugging Face repository}, + howpublished = {{https://huggingface.co/OdiaGenAI}}, } """, + n_samples={"train": 2497}, + avg_character_length={"train": 81.29}, + ) + + def dataset_transform(self): + self.dataset = self.stratified_subsampling( + self.dataset, seed=self.seed, splits=["train"] + ) diff --git a/results/intfloat__multilingual-e5-small/SentimentAnalysisHindi.json b/results/intfloat__multilingual-e5-small/SentimentAnalysisHindi.json new file mode 100644 index 0000000000..bab6fe4bf6 --- /dev/null +++ b/results/intfloat__multilingual-e5-small/SentimentAnalysisHindi.json @@ -0,0 +1,13 @@ +{ + "dataset_revision": "1beac1b941da76a9c51e3e5b39d230fde9a80983", + "mteb_dataset_name": "SentimentAnalysisHindi", + "mteb_version": "1.6.12", + "train": { + "accuracy": 0.632666015625, + "accuracy_stderr": 0.046950749545060705, + "evaluation_time": 6.02, + "f1": 0.6169989829875298, + "f1_stderr": 0.04379471171758545, + "main_score": 0.6169989829875298 + } +} \ No newline at end of file diff --git a/results/sentence-transformers__paraphrase-multilingual-MiniLM-L12-v2/SentimentAnalysisHindi.json b/results/sentence-transformers__paraphrase-multilingual-MiniLM-L12-v2/SentimentAnalysisHindi.json new file mode 100644 index 0000000000..121b445746 --- /dev/null +++ b/results/sentence-transformers__paraphrase-multilingual-MiniLM-L12-v2/SentimentAnalysisHindi.json @@ -0,0 +1,13 @@ +{ + "dataset_revision": "1beac1b941da76a9c51e3e5b39d230fde9a80983", + "mteb_dataset_name": "SentimentAnalysisHindi", + "mteb_version": "1.6.12", + "train": { + "accuracy": 0.59599609375, + "accuracy_stderr": 0.04385635527573272, + "evaluation_time": 5.99, + "f1": 0.5861909658086787, + "f1_stderr": 0.050871526301261356, + "main_score": 0.5861909658086787 + } +} \ No newline at end of file