Fix pair classification inconsistency #945

Merged: 6 commits, Jun 18, 2024
3 changes: 3 additions & 0 deletions docs/mmteb/points/945.jsonl
@@ -0,0 +1,3 @@
{"GitHub": "dokato", "Bug fixes": 2}
{"GitHub": "dokato", "New dataset": 10}
{"GitHub": "x-tabdeveloping", "Review PR": 2}
8 changes: 4 additions & 4 deletions mteb/abstasks/AbsTaskPairClassification.py
@@ -19,8 +19,8 @@ class AbsTaskPairClassification(AbsTask):
is computed to measure how well the methods can be used for pairwise pair classification.

self.load_data() must generate a huggingface dataset with a split matching self.metadata_dict["eval_splits"], and assign it to self.dataset. It must contain the following columns:
- sent1: list[str]
- sent2: list[str]
+ sentence1: list[str]
+ sentence2: list[str]
labels: list[int]
"""

@@ -41,8 +41,8 @@ def _evaluate_subset(
"sentence_transformers.evaluation.PairClassificationEvaluator"
).setLevel(logging.WARN)
evaluator = PairClassificationEvaluator(
- data_split["sent1"],
- data_split["sent2"],
+ data_split["sentence1"],
+ data_split["sentence2"],
data_split["labels"],
task_name=self.metadata.name,
**kwargs,
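For reference, the structure described by the updated docstring can be sketched as follows. This is an illustrative hand-built split; the split name and sentences are placeholders, not part of this PR:

    # Illustrative only: one split in the format AbsTaskPairClassification now expects.
    self.dataset = {
        "test": [
            {
                "sentence1": ["A man is playing a guitar.", "The sky is green."],
                "sentence2": ["Someone plays an instrument.", "The sky is blue."],
                "labels": [1, 0],
            }
        ]
    }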
4 changes: 2 additions & 2 deletions mteb/tasks/PairClassification/ara/ArEntail.py
@@ -47,8 +47,8 @@ def dataset_transform(self):
for split in self.metadata.eval_splits:
_dataset[split] = [
{
- "sent1": self.dataset[split]["premise"],
- "sent2": self.dataset[split]["hypothesis"],
+ "sentence1": self.dataset[split]["premise"],
+ "sentence2": self.dataset[split]["hypothesis"],
"labels": self.dataset[split]["label"],
}
]
4 changes: 2 additions & 2 deletions mteb/tasks/PairClassification/ces/CTKFactsNLI.py
@@ -55,8 +55,8 @@ def dataset_transform(self):
for split in self.metadata.eval_splits:
_dataset[split] = [
{
- "sent1": hf_dataset[split]["evidence"],
- "sent2": hf_dataset[split]["claim"],
+ "sentence1": hf_dataset[split]["evidence"],
+ "sentence2": hf_dataset[split]["claim"],
"labels": hf_dataset[split]["label"],
}
]
4 changes: 2 additions & 2 deletions mteb/tasks/PairClassification/deu/FalseFriendsDeEnPC.py
@@ -48,8 +48,8 @@ def dataset_transform(self):

_dataset[split] = [
{
- "sent1": hf_dataset["sent1"],
- "sent2": hf_dataset["sent2"],
+ "sentence1": hf_dataset["sent1"],
+ "sentence2": hf_dataset["sent2"],
"labels": hf_dataset["labels"],
}
]
10 changes: 5 additions & 5 deletions mteb/tasks/PairClassification/eng/LegalBenchPC.py
@@ -137,12 +137,12 @@ def load_data(self, **kwargs: Any) -> None:

_dataset = _dataset.rename_columns(
{
- dataset_col_map["sent1"]: "sent1",
- dataset_col_map["sent2"]: "sent2",
+ dataset_col_map["sent1"]: "sentence1",
+ dataset_col_map["sent2"]: "sentence2",
dataset_col_map["labels"]: "labels",
}
)
- _dataset = _dataset.select_columns(["labels", "sent1", "sent2"])
+ _dataset = _dataset.select_columns(["labels", "sentence1", "sentence2"])
mapping = dataset_col_map["mapping"]
_dataset = _dataset.map(
lambda example: {
@@ -174,8 +174,8 @@ def dataset_transform(self):
hf_dataset = self.dataset[split]
_dataset[split] = [
{
- "sent1": hf_dataset["sent1"],
- "sent2": hf_dataset["sent2"],
+ "sentence1": hf_dataset["sentence1"],
+ "sentence2": hf_dataset["sentence2"],
"labels": hf_dataset["labels"],
}
]
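For context on the rename_columns/select_columns calls above, here is a minimal standalone sketch of the same Hugging Face datasets API, using toy data rather than the LegalBench columns:

    from datasets import Dataset

    ds = Dataset.from_dict({"text_a": ["foo"], "text_b": ["bar"], "answer": [1]})
    ds = ds.rename_columns({"text_a": "sentence1", "text_b": "sentence2", "answer": "labels"})
    ds = ds.select_columns(["labels", "sentence1", "sentence2"])  # keep only the three expected columns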
@@ -52,3 +52,7 @@ class SprintDuplicateQuestionsPC(AbsTaskPairClassification):
n_samples={"validation": 101000, "test": 101000},
avg_character_length={"validation": 65.2, "test": 67.9},
)
+
+ def dataset_transform(self):
+ self.dataset = self.dataset.rename_column("sent1", "sentence1")
+ self.dataset = self.dataset.rename_column("sent2", "sentence2")
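Where the upstream dataset already ships sent1/sent2 columns, the new dataset_transform simply renames them. A one-call variant is sketched below, assuming the loaded object is a datasets.DatasetDict and that the installed datasets version accepts a column mapping; it is not part of this diff:

    # Sketch only; equivalent to the two rename_column calls above under the stated assumptions.
    self.dataset = self.dataset.rename_columns({"sent1": "sentence1", "sent2": "sentence2"})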
38 changes: 21 additions & 17 deletions mteb/tasks/PairClassification/eng/TwitterSemEval2015PC.py
@@ -29,23 +29,27 @@ class TwitterSemEval2015PC(AbsTaskPairClassification):
dialect=None,
text_creation=None,
bibtex_citation="""@inproceedings{xu-etal-2015-semeval,
title = "{S}em{E}val-2015 Task 1: Paraphrase and Semantic Similarity in {T}witter ({PIT})",
author = "Xu, Wei and
Callison-Burch, Chris and
Dolan, Bill",
editor = "Nakov, Preslav and
Zesch, Torsten and
Cer, Daniel and
Jurgens, David",
booktitle = "Proceedings of the 9th International Workshop on Semantic Evaluation ({S}em{E}val 2015)",
month = jun,
year = "2015",
address = "Denver, Colorado",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/S15-2001",
doi = "10.18653/v1/S15-2001",
pages = "1--11",
}""",
n_samples={"test": 16777},
avg_character_length={"test": 38.3},
)
+
+ def dataset_transform(self):
+ self.dataset = self.dataset.rename_column("sent1", "sentence1")
+ self.dataset = self.dataset.rename_column("sent2", "sentence2")
40 changes: 22 additions & 18 deletions mteb/tasks/PairClassification/eng/TwitterURLCorpusPC.py
@@ -29,24 +29,28 @@ class TwitterURLCorpusPC(AbsTaskPairClassification):
dialect=None,
text_creation=None,
bibtex_citation="""@inproceedings{lan-etal-2017-continuously,
title = "A Continuously Growing Dataset of Sentential Paraphrases",
author = "Lan, Wuwei and
Qiu, Siyu and
He, Hua and
Xu, Wei",
editor = "Palmer, Martha and
Hwa, Rebecca and
Riedel, Sebastian",
booktitle = "Proceedings of the 2017 Conference on Empirical Methods in Natural Language Processing",
month = sep,
year = "2017",
address = "Copenhagen, Denmark",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/D17-1126",
doi = "10.18653/v1/D17-1126",
pages = "1224--1234",
abstract = "A major challenge in paraphrase research is the lack of parallel corpora. In this paper, we present a new method to collect large-scale sentential paraphrases from Twitter by linking tweets through shared URLs. The main advantage of our method is its simplicity, as it gets rid of the classifier or human in the loop needed to select data before annotation and subsequent application of paraphrase identification algorithms in the previous work. We present the largest human-labeled paraphrase corpus to date of 51,524 sentence pairs and the first cross-domain benchmarking for automatic paraphrase identification. In addition, we show that more than 30,000 new sentential paraphrases can be easily and continuously captured every month at {\textasciitilde}70{\%} precision, and demonstrate their utility for downstream NLP tasks through phrasal paraphrase extraction. We make our code and data freely available.",
}""",
n_samples={"test": 51534},
avg_character_length={"test": 79.5},
)
+
+ def dataset_transform(self):
+ self.dataset = self.dataset.rename_column("sent1", "sentence1")
+ self.dataset = self.dataset.rename_column("sent2", "sentence2")
4 changes: 2 additions & 2 deletions mteb/tasks/PairClassification/fas/FarsTail.py
@@ -64,8 +64,8 @@ def dataset_transform(self):
for split in self.metadata.eval_splits:
_dataset[split] = [
{
- "sent1": self.dataset[split]["premise"],
- "sent2": self.dataset[split]["hypothesis"],
+ "sentence1": self.dataset[split]["premise"],
+ "sentence2": self.dataset[split]["hypothesis"],
"labels": self.dataset[split]["label"],
}
]
18 changes: 11 additions & 7 deletions mteb/tasks/PairClassification/hye/ArmenianParaphrasePC.py
@@ -29,14 +29,18 @@ class ArmenianParaphrasePC(AbsTaskPairClassification):
text_creation="found",
bibtex_citation="""
@misc{malajyan2020arpa,
title={ARPA: Armenian Paraphrase Detection Corpus and Models},
author={Arthur Malajyan and Karen Avetisyan and Tsolak Ghukasyan},
year={2020},
eprint={2009.12615},
archivePrefix={arXiv},
primaryClass={cs.CL}
}
""",
n_samples={"train": 4023, "test": 1470},
avg_character_length={"train": 243.81, "test": 241.37},
)
+
+ def dataset_transform(self):
+ self.dataset = self.dataset.rename_column("sent1", "sentence1")
+ self.dataset = self.dataset.rename_column("sent2", "sentence2")
4 changes: 2 additions & 2 deletions mteb/tasks/PairClassification/ind/IndoNLI.py
@@ -52,8 +52,8 @@ def dataset_transform(self):
)
_dataset[split] = [
{
- "sent1": hf_dataset["premise"],
- "sent2": hf_dataset["hypothesis"],
+ "sentence1": hf_dataset["premise"],
+ "sentence2": hf_dataset["hypothesis"],
"labels": hf_dataset["label"],
}
]
4 changes: 2 additions & 2 deletions mteb/tasks/PairClassification/kor/KlueNLI.py
@@ -50,8 +50,8 @@ def dataset_transform(self):
)
_dataset[split] = [
{
- "sent1": hf_dataset["premise"],
- "sent2": hf_dataset["hypothesis"],
+ "sentence1": hf_dataset["premise"],
+ "sentence2": hf_dataset["hypothesis"],
"labels": hf_dataset["label"],
}
]
@@ -0,0 +1,87 @@
from __future__ import annotations

from mteb.abstasks import AbsTaskPairClassification, MultilingualTask
from mteb.abstasks.TaskMetadata import TaskMetadata

_LANGUAGES = {
"as": ["asm-Beng"],
"bn": ["ben-Beng"],
"gu": ["guj-Gujr"],
"hi": ["hin-Deva"],
"kn": ["kan-Knda"],
"ml": ["mal-Mlym"],
"mr": ["mar-Deva"],
"or": ["ory-Orya"],
"pa": ["pan-Guru"],
"ta": ["tam-Taml"],
"te": ["tel-Telu"],
}


class IndicXnliPairClassification(AbsTaskPairClassification, MultilingualTask):
metadata = TaskMetadata(
name="IndicXnliPairClassification",
dataset={
"path": "Divyanshu/indicxnli",
"revision": "7092c27872e919f31d0496fb8b9c47bd2cba3f6c",
"split": "test",
"trust_remote_code": True,
},
description="""INDICXNLI is similar to the existing XNLI dataset in shape and form, but
focuses on the Indic language family.
The train (392,702), validation (2,490), and evaluation sets (5,010) of English
XNLI were translated from English into each of the eleven Indic languages. IndicTrans
is a large Transformer-based sequence-to-sequence model. It is trained on the Samanantar
dataset (Ramesh et al., 2021), which is the largest parallel multilingual corpus
covering eleven Indic languages.
reference="https://arxiv.org/abs/2204.08776",
category="s2s",
type="PairClassification",
eval_splits=["test"],
eval_langs=_LANGUAGES,
main_score="ap",
date=("2022-04-22", "2022-10-06"),
form=["written"],
domains=["Non-fiction", "Fiction", "Government"],
task_subtypes=None,
license="cc-by-4.0",
socioeconomic_status="low",
annotations_creators="derived",
dialect=[],
text_creation="machine-translated",
bibtex_citation="""
@misc{aggarwal_gupta_kunch_22,
doi = {10.48550/ARXIV.2204.08776},
url = {https://arxiv.org/abs/2204.08776},
author = {Aggarwal, Divyanshu and Gupta, Vivek and Kunchukuttan, Anoop},
title = {IndicXNLI: Evaluating Multilingual Inference for Indian Languages},
publisher = {arXiv},
year = {2022},
copyright = {Creative Commons Attribution 4.0 International}
}
""",
n_samples={"test": 5010},
avg_character_length={"test": 77.24}, # average of premise and hypothesis
)

def dataset_transform(self) -> None:
# Convert to standard format
_dataset = {}
for lang in self.hf_subsets:
_dataset[lang] = {}
hf_dataset = self.dataset[lang]
# 0=entailment, 2=contradiction. Filter out neutral to match the task.
# Then map entailment as positive (1) and contradiction as negative (0).
hf_dataset = self.dataset[lang].filter(lambda x: x["label"] in [0, 2])
hf_dataset = hf_dataset.map(
lambda example: {"label": 0 if example["label"] == 2 else 1}
)
_dataset[lang]["test"] = [
{
"sentence1": hf_dataset["premise"],
"sentence2": hf_dataset["hypothesis"],
"labels": hf_dataset["label"],
}
]
self.dataset = _dataset
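As a usage sketch (not part of this diff), the new task can be run like any other MTEB task once merged; the model name below is only an example:

    from mteb import MTEB
    from sentence_transformers import SentenceTransformer

    model = SentenceTransformer("sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2")  # example encoder
    evaluation = MTEB(tasks=["IndicXnliPairClassification"])
    evaluation.run(model, output_folder="results")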
4 changes: 2 additions & 2 deletions mteb/tasks/PairClassification/multilingual/OpusparcusPC.py
@@ -83,6 +83,6 @@ def dataset_transform(self, lang):
del sent1[i]
del sent2[i]
new_dict["labels"] = [labels]
- new_dict["sent1"] = [sent1]
- new_dict["sent2"] = [sent2]
+ new_dict["sentence1"] = [sent1]
+ new_dict["sentence2"] = [sent2]
self.dataset[lang][split] = datasets.Dataset.from_dict(new_dict)
4 changes: 2 additions & 2 deletions mteb/tasks/PairClassification/multilingual/PawsX.py
@@ -58,8 +58,8 @@ def dataset_transform(self):

_dataset[lang][split] = [
{
- "sent1": hf_dataset["sentence1"],
- "sent2": hf_dataset["sentence2"],
+ "sentence1": hf_dataset["sentence1"],
+ "sentence2": hf_dataset["sentence2"],
"labels": hf_dataset["label"],
}
]
4 changes: 2 additions & 2 deletions mteb/tasks/PairClassification/multilingual/RTE3.py
@@ -80,8 +80,8 @@ def dataset_transform(self):
)
_dataset[lang][split] = [
{
- "sent1": hf_dataset["premise"],
- "sent2": hf_dataset["hypothesis"],
+ "sentence1": hf_dataset["premise"],
+ "sentence2": hf_dataset["hypothesis"],
"labels": hf_dataset["label"],
}
]
8 changes: 4 additions & 4 deletions mteb/tasks/PairClassification/multilingual/XNLI.py
@@ -84,8 +84,8 @@ def dataset_transform(self):

_dataset[lang][split] = [
{
- "sent1": hf_dataset["premise"],
- "sent2": hf_dataset["hypothesis"],
+ "sentence1": hf_dataset["premise"],
+ "sentence2": hf_dataset["hypothesis"],
"labels": hf_dataset["label"],
}
]
@@ -166,8 +166,8 @@ def dataset_transform(self):
)
_dataset[lang][split] = [
{
- "sent1": hf_dataset["premise"],
- "sent2": hf_dataset["hypothesis"],
+ "sentence1": hf_dataset["premise"],
+ "sentence2": hf_dataset["hypothesis"],
"labels": hf_dataset["label"],
}
]
4 changes: 2 additions & 2 deletions mteb/tasks/PairClassification/multilingual/XStance.py
@@ -63,8 +63,8 @@ def load_data(self, **kwargs):

def convert_example(example):
return {
- "sent1": example["question"],
- "sent2": example["comment"],
+ "sentence1": example["question"],
+ "sentence2": example["comment"],
"labels": 1 if example["label"] == "FAVOR" else 0,
}
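Illustrative behaviour of the convert_example helper on a single made-up record (values are placeholders, not taken from the X-Stance data):

    example = {"question": "Should X be allowed?", "comment": "Yes, absolutely.", "label": "FAVOR"}
    convert_example(example)
    # -> {"sentence1": "Should X be allowed?", "sentence2": "Yes, absolutely.", "labels": 1}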
