Skip to content

Commit

Permalink
fix: Added Hindi sentiment analysis dataset (#491)
Browse files Browse the repository at this point in the history
* Added Hindi sentiment analysis dataset
* Made changes based on comments and added points on points table
* linted correctly
* use stratified_subsampling
---------
Co-authored-by: Isaac Chung <chungisaac1217@gmail.com>
  • Loading branch information
Akash190104 authored Apr 24, 2024
1 parent fdedf69 commit 4ed19ce
Show file tree
Hide file tree
Showing 5 changed files with 73 additions and 0 deletions.
2 changes: 2 additions & 0 deletions docs/mmteb/points/491.jsonl
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
{"GitHub": "Akash190104", "New dataset": 2}
{"GitHub": "isaac-chung", "Review PR": 2}
1 change: 1 addition & 0 deletions mteb/tasks/Classification/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@
from .fra.MovieReviewSentimentClassification import *
from .guj.GujaratiNewsClassification import *
from .hin.HindiDiscourseClassification import *
from .hin.SentimentAnalysisHindi import *
from .hrv.CroatianSentimentClassification import *
from .ind.IndonesianIdClickbaitClassification import *
from .ita.ItaHateClassification import *
Expand Down
44 changes: 44 additions & 0 deletions mteb/tasks/Classification/hin/SentimentAnalysisHindi.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
from __future__ import annotations

from mteb.abstasks.AbsTaskClassification import AbsTaskClassification
from mteb.abstasks.TaskMetadata import TaskMetadata


class SentimentAnalysisHindi(AbsTaskClassification):
    """Classification task over the OdiaGenAI Hindi sentiment analysis dataset.

    The task is declared entirely through ``metadata``: it evaluates on the
    "train" split, reports "f1" as its main score, and covers Hindi written
    in Devanagari script ("hin-Deva").
    """

    metadata = TaskMetadata(
        # --- Identity and data source -------------------------------------
        name="SentimentAnalysisHindi",
        description="Hindi Sentiment Analysis Dataset",
        reference="https://huggingface.co/datasets/OdiaGenAI/sentiment_analysis_hindi",
        dataset={
            "path": "OdiaGenAI/sentiment_analysis_hindi",
            # Revision is pinned so that results stay reproducible.
            "revision": "1beac1b941da76a9c51e3e5b39d230fde9a80983",
        },
        # --- Evaluation setup ---------------------------------------------
        type="Classification",
        category="s2s",
        eval_splits=["train"],
        eval_langs=["hin-Deva"],
        main_score="f1",
        # --- Descriptive annotations --------------------------------------
        date=("2023-09-15", "2023-10-16"),
        form=["written"],
        dialect=[],
        domains=["Reviews"],
        task_subtypes=["Sentiment/Hate speech"],
        license="CC BY-NC-SA 4.0",
        socioeconomic_status="mixed",
        annotations_creators="derived",
        text_creation="found",
        # The citation is a runtime string; its continuation lines are kept
        # flush-left so the literal's content is unchanged.
        bibtex_citation="""@misc{OdiaGenAI,
author = {Shantipriya Parida and Sambit Sekhar and Soumendra Kumar Sahoo and Swateek Jena and Abhijeet Parida and Satya Ranjan Dash and Guneet Singh Kohli},
title = {OdiaGenAI: Generative AI and LLM Initiative for the Odia Language},
year = {2023},
publisher = {Hugging Face},
journal = {Hugging Face repository},
howpublished = {{https://huggingface.co/OdiaGenAI}}, } """,
        # --- Dataset statistics -------------------------------------------
        # NOTE(review): dataset_transform subsamples the "train" split, so
        # this count may describe the data before subsampling -- confirm.
        n_samples={"train": 2497},
        avg_character_length={"train": 81.29},
    )

    def dataset_transform(self):
        """Replace ``self.dataset`` with a stratified subsample of "train".

        Delegates to ``self.stratified_subsampling`` (not defined in this
        class; presumably inherited from the base task), passing the task's
        own ``seed`` so the selection is tied to that seed.
        """
        subsampled = self.stratified_subsampling(
            self.dataset,
            seed=self.seed,
            splits=["train"],
        )
        self.dataset = subsampled
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
{
"dataset_revision": "1beac1b941da76a9c51e3e5b39d230fde9a80983",
"mteb_dataset_name": "SentimentAnalysisHindi",
"mteb_version": "1.6.12",
"train": {
"accuracy": 0.632666015625,
"accuracy_stderr": 0.046950749545060705,
"evaluation_time": 6.02,
"f1": 0.6169989829875298,
"f1_stderr": 0.04379471171758545,
"main_score": 0.6169989829875298
}
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
{
"dataset_revision": "1beac1b941da76a9c51e3e5b39d230fde9a80983",
"mteb_dataset_name": "SentimentAnalysisHindi",
"mteb_version": "1.6.12",
"train": {
"accuracy": 0.59599609375,
"accuracy_stderr": 0.04385635527573272,
"evaluation_time": 5.99,
"f1": 0.5861909658086787,
"f1_stderr": 0.050871526301261356,
"main_score": 0.5861909658086787
}
}

0 comments on commit 4ed19ce

Please sign in to comment.