fix: RAR-b initial PR (#929)
* RARb initial PR

* RAR-b initial PR

* RAR-b initial PR

* RAR-b initial PR

* fill metadata

* add metadata n samples&char length

* taskdata subtask reasoning as retrieval

* fix formatting errors

* metadata description made descriptive
gowitheflow-1998 authored Jun 16, 2024
1 parent ea62028 commit b75a9c9
Showing 53 changed files with 6,206 additions and 0 deletions.
1 change: 1 addition & 0 deletions mteb/abstasks/TaskMetadata.py
@@ -36,6 +36,7 @@
"Textual Entailment",
"Counterfactual Detection",
"Emotion classification",
"Reasoning as Retrieval",
]

TASK_DOMAIN = Literal[
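Why this one-line change matters: TaskMetadata validates each task's task_subtypes against the TASK_SUBTYPE Literal, so the RAR-b tasks added below would fail metadata validation without the new "Reasoning as Retrieval" entry. A minimal sketch of the check (the check_subtypes helper is hypothetical, for illustration only; mteb itself relies on pydantic validating the Literal):

from __future__ import annotations

from typing import Literal, get_args

TASK_SUBTYPE = Literal[
    "Textual Entailment",
    "Counterfactual Detection",
    "Emotion classification",
    "Reasoning as Retrieval",
]

def check_subtypes(subtypes: list[str]) -> None:
    # Reject any subtype not registered in the Literal above.
    allowed = set(get_args(TASK_SUBTYPE))
    for subtype in subtypes:
        if subtype not in allowed:
            raise ValueError(f"Unknown task subtype: {subtype!r}")

check_subtypes(["Reasoning as Retrieval"])  # passes once this PR lands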
17 changes: 17 additions & 0 deletions mteb/tasks/Retrieval/__init__.py
@@ -14,6 +14,8 @@
from .ell.GreekCivicsQA import *
from .eng.AILACasedocsRetrieval import *
from .eng.AILAStatutesRetrieval import *
from .eng.AlphaNLIRetrieval import *
from .eng.ARCChallengeRetrieval import *
from .eng.ArguAnaRetrieval import *
from .eng.ClimateFEVERRetrieval import *
from .eng.CQADupstackAndroidRetrieval import *
@@ -34,6 +36,7 @@
from .eng.FEVERRetrieval import *
from .eng.FiQA2018Retrieval import *
from .eng.HagridRetrieval import *
from .eng.HellaSwagRetrieval import *
from .eng.HotpotQARetrieval import *
from .eng.LegalBenchConsumerContractsQARetrieval import *
from .eng.LegalBenchCorporateLobbyingRetrieval import *
@@ -51,12 +54,26 @@
from .eng.NarrativeQARetrieval import *
from .eng.NFCorpusRetrieval import *
from .eng.NQRetrieval import *
from .eng.PiqaRetrieval import *
from .eng.QuailRetrieval import *
from .eng.QuoraRetrieval import *
from .eng.RARbCodeRetrieval import *
from .eng.RARbMathRetrieval import *
from .eng.SCIDOCSRetrieval import *
from .eng.SciFactRetrieval import *
from .eng.SiqaRetrieval import *
from .eng.SpartQARetrieval import *
from .eng.TempReasonL1Retrieval import *
from .eng.TempReasonL2ContextRetrieval import *
from .eng.TempReasonL2FactRetrieval import *
from .eng.TempReasonL2PureRetrieval import *
from .eng.TempReasonL3ContextRetrieval import *
from .eng.TempReasonL3FactRetrieval import *
from .eng.TempReasonL3PureRetrieval import *
from .eng.TopiOCQARetrieval import *
from .eng.Touche2020Retrieval import *
from .eng.TRECCOVIDRetrieval import *
from .eng.WinoGrandeRetrieval import *
from .est.estqa import *
from .fra.AlloprofRetrieval import *
from .fra.BSARDRetrieval import *
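With these wildcard imports in place, the new RAR-b tasks are registered under their metadata names like any other retrieval task. A hedged usage sketch (the model choice is illustrative; assumes the standard MTEB entry point of this version of the library):

from mteb import MTEB
from sentence_transformers import SentenceTransformer

# Any embedding model works here; MiniLM is just a small, fast default.
model = SentenceTransformer("all-MiniLM-L6-v2")
evaluation = MTEB(tasks=["AlphaNLI", "ARCChallenge", "HellaSwag"])
evaluation.run(model, output_folder="results")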
46 changes: 46 additions & 0 deletions mteb/tasks/Retrieval/eng/ARCChallengeRetrieval.py
@@ -0,0 +1,46 @@
from __future__ import annotations

from mteb.abstasks.TaskMetadata import TaskMetadata

from ....abstasks.AbsTaskRetrieval import AbsTaskRetrieval


class ARCChallenge(AbsTaskRetrieval):
metadata = TaskMetadata(
name="ARCChallenge",
description="Measuring the ability to retrieve the groundtruth answers to reasoning task queries on ARC-Challenge.",
reference="https://allenai.org/data/arc",
dataset={
"path": "RAR-b/ARC-Challenge",
"revision": "c481e0da3dcbbad8bce7721dea9085b74320a0a3",
},
type="Retrieval",
category="s2s",
eval_splits=["test"],
eval_langs=["eng-Latn"],
main_score="ndcg_at_10",
date=("2018-01-01", "2018-12-31"),
form=["written"],
domains=["Encyclopaedic"],
task_subtypes=["Reasoning as Retrieval"],
license="CC BY-SA 4.0",
socioeconomic_status="medium",
annotations_creators="derived",
dialect=[],
text_creation="found",
bibtex_citation="""@article{xiao2024rar,
title={RAR-b: Reasoning as Retrieval Benchmark},
author={Xiao, Chenghao and Hudson, G Thomas and Moubayed, Noura Al},
journal={arXiv preprint arXiv:2404.06347},
year={2024}
}
@article{clark2018think,
title={Think you have solved question answering? try arc, the ai2 reasoning challenge},
author={Clark, Peter and Cowhey, Isaac and Etzioni, Oren and Khot, Tushar and Sabharwal, Ashish and Schoenick, Carissa and Tafjord, Oyvind},
journal={arXiv preprint arXiv:1803.05457},
year={2018}
}
""",
n_samples={"test": 1172},
avg_character_length={"test": 161.7},
)
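Every task in this PR reports main_score="ndcg_at_10". As a quick reference, a self-contained toy computation of the metric (a standalone sketch, not mteb's implementation, which comes from its retrieval evaluator):

import math

def dcg(relevances: list[float], k: int = 10) -> float:
    # Gains discounted by log2 of (1-indexed rank + 1).
    return sum(rel / math.log2(i + 2) for i, rel in enumerate(relevances[:k]))

def ndcg(relevances: list[float], k: int = 10) -> float:
    ideal = dcg(sorted(relevances, reverse=True), k)
    return dcg(relevances, k) / ideal if ideal > 0 else 0.0

# The single gold answer retrieved at rank 3 among the top 10:
print(ndcg([0, 0, 1, 0, 0, 0, 0, 0, 0, 0]))  # 0.5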
47 changes: 47 additions & 0 deletions mteb/tasks/Retrieval/eng/AlphaNLIRetrieval.py
@@ -0,0 +1,47 @@
from __future__ import annotations

from mteb.abstasks.TaskMetadata import TaskMetadata

from ....abstasks.AbsTaskRetrieval import AbsTaskRetrieval


class AlphaNLI(AbsTaskRetrieval):
metadata = TaskMetadata(
name="AlphaNLI",
description="Measuring the ability to retrieve the groundtruth answers to reasoning task queries on AlphaNLI.",
reference="https://leaderboard.allenai.org/anli/submissions/get-started",
dataset={
"path": "RAR-b/alphanli",
"revision": "303f40ef3d50918d3dc43577d33f2f7344ad72c1",
},
type="Retrieval",
category="s2s",
eval_splits=["test"],
eval_langs=["eng-Latn"],
main_score="ndcg_at_10",
date=("2019-01-01", "2019-12-31"),
form=["written"],
domains=["Encyclopaedic"],
task_subtypes=["Reasoning as Retrieval"],
license="CC BY-NC 4.0",
socioeconomic_status="medium",
annotations_creators="derived",
dialect=[],
text_creation="found",
bibtex_citation="""@article{xiao2024rar,
title={RAR-b: Reasoning as Retrieval Benchmark},
author={Xiao, Chenghao and Hudson, G Thomas and Moubayed, Noura Al},
journal={arXiv preprint arXiv:2404.06347},
year={2024}
}
@article{bhagavatula2019abductive,
title={Abductive commonsense reasoning},
author={Bhagavatula, Chandra and Bras, Ronan Le and Malaviya, Chaitanya and Sakaguchi, Keisuke and Holtzman, Ari and Rashkin, Hannah and Downey, Doug and Yih, Scott Wen-tau and Choi, Yejin},
journal={arXiv preprint arXiv:1908.05739},
year={2019}
}
""",
n_samples={"test": 1532},
avg_character_length={"test": 147.8},
)
46 changes: 46 additions & 0 deletions mteb/tasks/Retrieval/eng/HellaSwagRetrieval.py
@@ -0,0 +1,46 @@
from __future__ import annotations

from mteb.abstasks.TaskMetadata import TaskMetadata

from ....abstasks.AbsTaskRetrieval import AbsTaskRetrieval


class HellaSwag(AbsTaskRetrieval):
metadata = TaskMetadata(
name="HellaSwag",
description="Measuring the ability to retrieve the groundtruth answers to reasoning task queries on HellaSwag.",
reference="https://rowanzellers.com/hellaswag/",
dataset={
"path": "RAR-b/hellaswag",
"revision": "a5c990205e017d10761197ccab3000936689c3ae",
},
type="Retrieval",
category="s2s",
eval_splits=["test"],
eval_langs=["eng-Latn"],
main_score="ndcg_at_10",
date=("2019-01-01", "2019-12-31"),
form=["written"],
domains=["Encyclopaedic"],
task_subtypes=["Reasoning as Retrieval"],
license="MIT",
socioeconomic_status="medium",
annotations_creators="derived",
dialect=[],
text_creation="found",
bibtex_citation="""@article{xiao2024rar,
title={RAR-b: Reasoning as Retrieval Benchmark},
author={Xiao, Chenghao and Hudson, G Thomas and Moubayed, Noura Al},
journal={arXiv preprint arXiv:2404.06347},
year={2024}
}
@article{zellers2019hellaswag,
title={Hellaswag: Can a machine really finish your sentence?},
author={Zellers, Rowan and Holtzman, Ari and Bisk, Yonatan and Farhadi, Ali and Choi, Yejin},
journal={arXiv preprint arXiv:1905.07830},
year={2019}
}
""",
n_samples={"test": 10042},
avg_character_length={"test": 366.1},
)
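The n_samples and avg_character_length fields (filled in the "add metadata n samples&char length" commit above) summarize each eval split. A hedged sketch of how such statistics can be derived once a split's texts are loaded (describe_split is a hypothetical helper and the strings are invented, for illustration only):

def describe_split(texts: list[str]) -> dict:
    # Number of examples and mean character length, as reported in TaskMetadata.
    return {
        "n_samples": len(texts),
        "avg_character_length": round(sum(len(t) for t in texts) / len(texts), 1),
    }

print(describe_split(["A man is sitting on a roof. He", "starts pulling up roofing on a roof."]))
# {'n_samples': 2, 'avg_character_length': 33.0}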
49 changes: 49 additions & 0 deletions mteb/tasks/Retrieval/eng/PiqaRetrieval.py
@@ -0,0 +1,49 @@
from __future__ import annotations

from mteb.abstasks.TaskMetadata import TaskMetadata

from ....abstasks.AbsTaskRetrieval import AbsTaskRetrieval


class PIQA(AbsTaskRetrieval):
metadata = TaskMetadata(
name="PIQA",
description="Measuring the ability to retrieve the groundtruth answers to reasoning task queries on PIQA.",
reference="https://arxiv.org/abs/1911.11641",
dataset={
"path": "RAR-b/piqa",
"revision": "bb30be7e9184e6b6b1d99bbfe1bb90a3a81842e6",
},
type="Retrieval",
category="s2s",
eval_splits=["test"],
eval_langs=["eng-Latn"],
main_score="ndcg_at_10",
date=("2020-01-01", "2020-12-31"),
form=["written"],
domains=["Encyclopaedic"],
task_subtypes=["Reasoning as Retrieval"],
license="AFL-3.0",
socioeconomic_status="medium",
annotations_creators="derived",
dialect=[],
text_creation="found",
bibtex_citation="""@article{xiao2024rar,
title={RAR-b: Reasoning as Retrieval Benchmark},
author={Xiao, Chenghao and Hudson, G Thomas and Moubayed, Noura Al},
journal={arXiv preprint arXiv:2404.06347},
year={2024}
}
@inproceedings{bisk2020piqa,
title={Piqa: Reasoning about physical commonsense in natural language},
author={Bisk, Yonatan and Zellers, Rowan and Gao, Jianfeng and Choi, Yejin and others},
booktitle={Proceedings of the AAAI conference on artificial intelligence},
volume={34},
number={05},
pages={7432--7439},
year={2020}
}
""",
n_samples={"test": 1838},
avg_character_length={"test": 134.3},
)
49 changes: 49 additions & 0 deletions mteb/tasks/Retrieval/eng/QuailRetrieval.py
@@ -0,0 +1,49 @@
from __future__ import annotations

from mteb.abstasks.TaskMetadata import TaskMetadata

from ....abstasks.AbsTaskRetrieval import AbsTaskRetrieval


class Quail(AbsTaskRetrieval):
metadata = TaskMetadata(
name="Quail",
description="Measuring the ability to retrieve the groundtruth answers to reasoning task queries on Quail.",
reference="https://text-machine.cs.uml.edu/lab2/projects/quail/",
dataset={
"path": "RAR-b/quail",
"revision": "1851bc536f8bdab29e03e29191c4586b1d8d7c5a",
},
type="Retrieval",
category="s2s",
eval_splits=["test"],
eval_langs=["eng-Latn"],
main_score="ndcg_at_10",
date=("2020-01-01", "2020-12-31"),
form=["written"],
domains=["Encyclopaedic"],
task_subtypes=["Reasoning as Retrieval"],
license="CC BY-NC-SA 4.0",
socioeconomic_status="medium",
annotations_creators="derived",
dialect=[],
text_creation="found",
bibtex_citation="""@article{xiao2024rar,
title={RAR-b: Reasoning as Retrieval Benchmark},
author={Xiao, Chenghao and Hudson, G Thomas and Moubayed, Noura Al},
journal={arXiv preprint arXiv:2404.06347},
year={2024}
}
@inproceedings{rogers2020getting,
title={Getting closer to AI complete question answering: A set of prerequisite real tasks},
author={Rogers, Anna and Kovaleva, Olga and Downey, Matthew and Rumshisky, Anna},
booktitle={Proceedings of the AAAI conference on artificial intelligence},
volume={34},
number={05},
pages={8722--8731},
year={2020}
}
""",
n_samples={"test": 2720},
avg_character_length={"test": 1983.3},
)
57 changes: 57 additions & 0 deletions mteb/tasks/Retrieval/eng/RARbCodeRetrieval.py
@@ -0,0 +1,57 @@
from __future__ import annotations

from mteb.abstasks.TaskMetadata import TaskMetadata

from ....abstasks.AbsTaskRetrieval import AbsTaskRetrieval


class RARbCode(AbsTaskRetrieval):
metadata = TaskMetadata(
name="RARbCode",
description="Measuring the ability to retrieve the groundtruth answers to reasoning task queries on RAR-b code-pooled dataset.",
reference="https://arxiv.org/abs/2404.06347",
dataset={
"path": "RAR-b/humanevalpack-mbpp-pooled",
"revision": "25f7d11a7ac12dcbb8d3836eb2de682b98c825e4",
},
type="Retrieval",
category="s2p",
eval_splits=["test"],
eval_langs=["eng-Latn"],
main_score="ndcg_at_10",
date=("2019-01-01", "2023-12-31"),
form=["written"],
domains=["Programming"],
task_subtypes=["Reasoning as Retrieval"],
license="CC BY-NC-SA 4.0",
socioeconomic_status="medium",
annotations_creators="derived",
dialect=[],
text_creation="found",
bibtex_citation="""@article{xiao2024rar,
title={RAR-b: Reasoning as Retrieval Benchmark},
author={Xiao, Chenghao and Hudson, G Thomas and Moubayed, Noura Al},
journal={arXiv preprint arXiv:2404.06347},
year={2024}
}
@article{muennighoff2023octopack,
title={Octopack: Instruction tuning code large language models},
author={Muennighoff, Niklas and Liu, Qian and Zebaze, Armel and Zheng, Qinkai and Hui, Binyuan and Zhuo, Terry Yue and Singh, Swayam and Tang, Xiangru and Von Werra, Leandro and Longpre, Shayne},
journal={arXiv preprint arXiv:2308.07124},
year={2023}
}
@article{austin2021program,
title={Program Synthesis with Large Language Models},
author={Austin, Jacob and Odena, Augustus and Nye, Maxwell and Bosma, Maarten and Michalewski, Henryk and Dohan, David and Jiang, Ellen and Cai, Carrie and Terry, Michael and Le, Quoc and others},
journal={arXiv preprint arXiv:2108.07732},
year={2021}
}
@article{husain2019codesearchnet,
title={Codesearchnet challenge: Evaluating the state of semantic code search},
author={Husain, Hamel and Wu, Ho-Hsiang and Gazit, Tiferet and Allamanis, Miltiadis and Brockschmidt, Marc},
journal={arXiv preprint arXiv:1909.09436},
year={2019}
}
""",
n_samples={"test": 1484},
avg_character_length={"test": 621.2},
)