fix: RAR-b initial PR (#929)
* RARb initial PR

* RAR-b initial PR

* RAR-b initial PR

* RAR-b initial PR

* fill metadata

* add metadata n samples&char length

* taskdata subtask reasoning as retrieval

* fix formatting errors

* metadata description made descriptive
gowitheflow-1998 authored Jun 16, 2024
1 parent ea62028 commit b75a9c9
Showing 53 changed files with 6,206 additions and 0 deletions.
1 change: 1 addition & 0 deletions mteb/abstasks/TaskMetadata.py
@@ -36,6 +36,7 @@
"Textual Entailment",
"Counterfactual Detection",
"Emotion classification",
"Reasoning as Retrieval",
]

TASK_DOMAIN = Literal[
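Why this one-line change matters: TaskMetadata validates each task's task_subtypes against the TASK_SUBTYPE Literal, so the RAR-b tasks added below would fail metadata validation without the new "Reasoning as Retrieval" entry. A minimal sketch of the check (the check_subtypes helper is hypothetical, for illustration only; mteb itself relies on pydantic validating the Literal):

from __future__ import annotations

from typing import Literal, get_args

TASK_SUBTYPE = Literal[
    "Textual Entailment",
    "Counterfactual Detection",
    "Emotion classification",
    "Reasoning as Retrieval",
]

def check_subtypes(subtypes: list[str]) -> None:
    # Reject any subtype not registered in the Literal above.
    allowed = set(get_args(TASK_SUBTYPE))
    for subtype in subtypes:
        if subtype not in allowed:
            raise ValueError(f"Unknown task subtype: {subtype!r}")

check_subtypes(["Reasoning as Retrieval"])  # passes once this PR lands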
17 changes: 17 additions & 0 deletions mteb/tasks/Retrieval/__init__.py
@@ -14,6 +14,8 @@
from .ell.GreekCivicsQA import *
from .eng.AILACasedocsRetrieval import *
from .eng.AILAStatutesRetrieval import *
from .eng.AlphaNLIRetrieval import *
from .eng.ARCChallengeRetrieval import *
from .eng.ArguAnaRetrieval import *
from .eng.ClimateFEVERRetrieval import *
from .eng.CQADupstackAndroidRetrieval import *
@@ -34,6 +36,7 @@
from .eng.FEVERRetrieval import *
from .eng.FiQA2018Retrieval import *
from .eng.HagridRetrieval import *
from .eng.HellaSwagRetrieval import *
from .eng.HotpotQARetrieval import *
from .eng.LegalBenchConsumerContractsQARetrieval import *
from .eng.LegalBenchCorporateLobbyingRetrieval import *
@@ -51,12 +54,26 @@
from .eng.NarrativeQARetrieval import *
from .eng.NFCorpusRetrieval import *
from .eng.NQRetrieval import *
from .eng.PiqaRetrieval import *
from .eng.QuailRetrieval import *
from .eng.QuoraRetrieval import *
from .eng.RARbCodeRetrieval import *
from .eng.RARbMathRetrieval import *
from .eng.SCIDOCSRetrieval import *
from .eng.SciFactRetrieval import *
from .eng.SiqaRetrieval import *
from .eng.SpartQARetrieval import *
from .eng.TempReasonL1Retrieval import *
from .eng.TempReasonL2ContextRetrieval import *
from .eng.TempReasonL2FactRetrieval import *
from .eng.TempReasonL2PureRetrieval import *
from .eng.TempReasonL3ContextRetrieval import *
from .eng.TempReasonL3FactRetrieval import *
from .eng.TempReasonL3PureRetrieval import *
from .eng.TopiOCQARetrieval import *
from .eng.Touche2020Retrieval import *
from .eng.TRECCOVIDRetrieval import *
from .eng.WinoGrandeRetrieval import *
from .est.estqa import *
from .fra.AlloprofRetrieval import *
from .fra.BSARDRetrieval import *
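With these wildcard imports in place, the new RAR-b tasks are registered under their metadata names like any other retrieval task. A hedged usage sketch (the model choice is illustrative; assumes the standard MTEB entry point of this version of the library):

from mteb import MTEB
from sentence_transformers import SentenceTransformer

# Any embedding model works here; MiniLM is just a small, fast default.
model = SentenceTransformer("all-MiniLM-L6-v2")
evaluation = MTEB(tasks=["AlphaNLI", "ARCChallenge", "HellaSwag"])
evaluation.run(model, output_folder="results")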
46 changes: 46 additions & 0 deletions mteb/tasks/Retrieval/eng/ARCChallengeRetrieval.py
@@ -0,0 +1,46 @@
from __future__ import annotations

from mteb.abstasks.TaskMetadata import TaskMetadata

from ....abstasks.AbsTaskRetrieval import AbsTaskRetrieval


class ARCChallenge(AbsTaskRetrieval):
metadata = TaskMetadata(
name="ARCChallenge",
description="Measuring the ability to retrieve the groundtruth answers to reasoning task queries on ARC-Challenge.",
reference="https://allenai.org/data/arc",
dataset={
"path": "RAR-b/ARC-Challenge",
"revision": "c481e0da3dcbbad8bce7721dea9085b74320a0a3",
},
type="Retrieval",
category="s2s",
eval_splits=["test"],
eval_langs=["eng-Latn"],
main_score="ndcg_at_10",
date=("2018-01-01", "2018-12-31"),
form=["written"],
domains=["Encyclopaedic"],
task_subtypes=["Reasoning as Retrieval"],
license="CC BY-SA 4.0",
socioeconomic_status="medium",
annotations_creators="derived",
dialect=[],
text_creation="found",
bibtex_citation="""@article{xiao2024rar,
title={RAR-b: Reasoning as Retrieval Benchmark},
author={Xiao, Chenghao and Hudson, G Thomas and Moubayed, Noura Al},
journal={arXiv preprint arXiv:2404.06347},
year={2024}
}
@article{clark2018think,
title={Think you have solved question answering? try arc, the ai2 reasoning challenge},
author={Clark, Peter and Cowhey, Isaac and Etzioni, Oren and Khot, Tushar and Sabharwal, Ashish and Schoenick, Carissa and Tafjord, Oyvind},
journal={arXiv preprint arXiv:1803.05457},
year={2018}
}
""",
n_samples={"test": 1172},
avg_character_length={"test": 161.7},
)
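Every task in this PR reports main_score="ndcg_at_10". As a quick reference, a self-contained toy computation of the metric (a standalone sketch, not mteb's implementation, which comes from its retrieval evaluator):

import math

def dcg(relevances: list[float], k: int = 10) -> float:
    # Gains discounted by log2 of (1-indexed rank + 1).
    return sum(rel / math.log2(i + 2) for i, rel in enumerate(relevances[:k]))

def ndcg(relevances: list[float], k: int = 10) -> float:
    ideal = dcg(sorted(relevances, reverse=True), k)
    return dcg(relevances, k) / ideal if ideal > 0 else 0.0

# The single gold answer retrieved at rank 3 among the top 10:
print(ndcg([0, 0, 1, 0, 0, 0, 0, 0, 0, 0]))  # 0.5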
47 changes: 47 additions & 0 deletions mteb/tasks/Retrieval/eng/AlphaNLIRetrieval.py
@@ -0,0 +1,47 @@
from __future__ import annotations

from mteb.abstasks.TaskMetadata import TaskMetadata

from ....abstasks.AbsTaskRetrieval import AbsTaskRetrieval


class AlphaNLI(AbsTaskRetrieval):
metadata = TaskMetadata(
name="AlphaNLI",
description="Measuring the ability to retrieve the groundtruth answers to reasoning task queries on AlphaNLI.",
reference="https://leaderboard.allenai.org/anli/submissions/get-started",
dataset={
"path": "RAR-b/alphanli",
"revision": "303f40ef3d50918d3dc43577d33f2f7344ad72c1",
},
type="Retrieval",
category="s2s",
eval_splits=["test"],
eval_langs=["eng-Latn"],
main_score="ndcg_at_10",
date=("2019-01-01", "2019-12-31"),
form=["written"],
domains=["Encyclopaedic"],
task_subtypes=["Reasoning as Retrieval"],
license="CC BY-NC 4.0",
socioeconomic_status="medium",
annotations_creators="derived",
dialect=[],
text_creation="found",
bibtex_citation="""@article{xiao2024rar,
title={RAR-b: Reasoning as Retrieval Benchmark},
author={Xiao, Chenghao and Hudson, G Thomas and Moubayed, Noura Al},
journal={arXiv preprint arXiv:2404.06347},
year={2024}
}
@article{bhagavatula2019abductive,
title={Abductive commonsense reasoning},
author={Bhagavatula, Chandra and Bras, Ronan Le and Malaviya, Chaitanya and Sakaguchi, Keisuke and Holtzman, Ari and Rashkin, Hannah and Downey, Doug and Yih, Scott Wen-tau and Choi, Yejin},
journal={arXiv preprint arXiv:1908.05739},
year={2019}
}
""",
n_samples={"test": 1532},
avg_character_length={"test": 147.8},
)
46 changes: 46 additions & 0 deletions mteb/tasks/Retrieval/eng/HellaSwagRetrieval.py
@@ -0,0 +1,46 @@
from __future__ import annotations

from mteb.abstasks.TaskMetadata import TaskMetadata

from ....abstasks.AbsTaskRetrieval import AbsTaskRetrieval


class HellaSwag(AbsTaskRetrieval):
metadata = TaskMetadata(
name="HellaSwag",
description="Measuring the ability to retrieve the groundtruth answers to reasoning task queries on HellaSwag.",
reference="https://rowanzellers.com/hellaswag/",
dataset={
"path": "RAR-b/hellaswag",
"revision": "a5c990205e017d10761197ccab3000936689c3ae",
},
type="Retrieval",
category="s2s",
eval_splits=["test"],
eval_langs=["eng-Latn"],
main_score="ndcg_at_10",
date=("2019-01-01", "2019-12-31"),
form=["written"],
domains=["Encyclopaedic"],
task_subtypes=["Reasoning as Retrieval"],
license="MIT",
socioeconomic_status="medium",
annotations_creators="derived",
dialect=[],
text_creation="found",
bibtex_citation="""@article{xiao2024rar,
title={RAR-b: Reasoning as Retrieval Benchmark},
author={Xiao, Chenghao and Hudson, G Thomas and Moubayed, Noura Al},
journal={arXiv preprint arXiv:2404.06347},
year={2024}
}
@article{zellers2019hellaswag,
title={Hellaswag: Can a machine really finish your sentence?},
author={Zellers, Rowan and Holtzman, Ari and Bisk, Yonatan and Farhadi, Ali and Choi, Yejin},
journal={arXiv preprint arXiv:1905.07830},
year={2019}
}
""",
n_samples={"test": 10042},
avg_character_length={"test": 366.1},
)
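The n_samples and avg_character_length fields (filled in the "add metadata n samples&char length" commit above) summarize each eval split. A hedged sketch of how such statistics can be derived once a split's texts are loaded (describe_split is a hypothetical helper and the strings are invented, for illustration only):

def describe_split(texts: list[str]) -> dict:
    # Number of examples and mean character length, as reported in TaskMetadata.
    return {
        "n_samples": len(texts),
        "avg_character_length": round(sum(len(t) for t in texts) / len(texts), 1),
    }

print(describe_split(["A man is sitting on a roof. He", "starts pulling up roofing on a roof."]))
# {'n_samples': 2, 'avg_character_length': 33.0}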
49 changes: 49 additions & 0 deletions mteb/tasks/Retrieval/eng/PiqaRetrieval.py
@@ -0,0 +1,49 @@
from __future__ import annotations

from mteb.abstasks.TaskMetadata import TaskMetadata

from ....abstasks.AbsTaskRetrieval import AbsTaskRetrieval


class PIQA(AbsTaskRetrieval):
metadata = TaskMetadata(
name="PIQA",
description="Measuring the ability to retrieve the groundtruth answers to reasoning task queries on PIQA.",
reference="https://arxiv.org/abs/1911.11641",
dataset={
"path": "RAR-b/piqa",
"revision": "bb30be7e9184e6b6b1d99bbfe1bb90a3a81842e6",
},
type="Retrieval",
category="s2s",
eval_splits=["test"],
eval_langs=["eng-Latn"],
main_score="ndcg_at_10",
date=("2020-01-01", "2020-12-31"),
form=["written"],
domains=["Encyclopaedic"],
task_subtypes=["Reasoning as Retrieval"],
license="AFL-3.0",
socioeconomic_status="medium",
annotations_creators="derived",
dialect=[],
text_creation="found",
bibtex_citation="""@article{xiao2024rar,
title={RAR-b: Reasoning as Retrieval Benchmark},
author={Xiao, Chenghao and Hudson, G Thomas and Moubayed, Noura Al},
journal={arXiv preprint arXiv:2404.06347},
year={2024}
}
@inproceedings{bisk2020piqa,
title={Piqa: Reasoning about physical commonsense in natural language},
author={Bisk, Yonatan and Zellers, Rowan and Gao, Jianfeng and Choi, Yejin and others},
booktitle={Proceedings of the AAAI conference on artificial intelligence},
volume={34},
number={05},
pages={7432--7439},
year={2020}
}
""",
n_samples={"test": 1838},
avg_character_length={"test": 134.3},
)
49 changes: 49 additions & 0 deletions mteb/tasks/Retrieval/eng/QuailRetrieval.py
@@ -0,0 +1,49 @@
from __future__ import annotations

from mteb.abstasks.TaskMetadata import TaskMetadata

from ....abstasks.AbsTaskRetrieval import AbsTaskRetrieval


class Quail(AbsTaskRetrieval):
metadata = TaskMetadata(
name="Quail",
description="Measuring the ability to retrieve the groundtruth answers to reasoning task queries on Quail.",
reference="https://text-machine.cs.uml.edu/lab2/projects/quail/",
dataset={
"path": "RAR-b/quail",
"revision": "1851bc536f8bdab29e03e29191c4586b1d8d7c5a",
},
type="Retrieval",
category="s2s",
eval_splits=["test"],
eval_langs=["eng-Latn"],
main_score="ndcg_at_10",
date=("2020-01-01", "2020-12-31"),
form=["written"],
domains=["Encyclopaedic"],
task_subtypes=["Reasoning as Retrieval"],
license="CC BY-NC-SA 4.0",
socioeconomic_status="medium",
annotations_creators="derived",
dialect=[],
text_creation="found",
bibtex_citation="""@article{xiao2024rar,
title={RAR-b: Reasoning as Retrieval Benchmark},
author={Xiao, Chenghao and Hudson, G Thomas and Moubayed, Noura Al},
journal={arXiv preprint arXiv:2404.06347},
year={2024}
}
@inproceedings{rogers2020getting,
title={Getting closer to AI complete question answering: A set of prerequisite real tasks},
author={Rogers, Anna and Kovaleva, Olga and Downey, Matthew and Rumshisky, Anna},
booktitle={Proceedings of the AAAI conference on artificial intelligence},
volume={34},
number={05},
pages={8722--8731},
year={2020}
}
""",
n_samples={"test": 2720},
avg_character_length={"test": 1983.3},
)
57 changes: 57 additions & 0 deletions mteb/tasks/Retrieval/eng/RARbCodeRetrieval.py
@@ -0,0 +1,57 @@
from __future__ import annotations

from mteb.abstasks.TaskMetadata import TaskMetadata

from ....abstasks.AbsTaskRetrieval import AbsTaskRetrieval


class RARbCode(AbsTaskRetrieval):
metadata = TaskMetadata(
name="RARbCode",
description="Measuring the ability to retrieve the groundtruth answers to reasoning task queries on RAR-b code-pooled dataset.",
reference="https://arxiv.org/abs/2404.06347",
dataset={
"path": "RAR-b/humanevalpack-mbpp-pooled",
"revision": "25f7d11a7ac12dcbb8d3836eb2de682b98c825e4",
},
type="Retrieval",
category="s2p",
eval_splits=["test"],
eval_langs=["eng-Latn"],
main_score="ndcg_at_10",
date=("2019-01-01", "2023-12-31"),
form=["written"],
domains=["Programming"],
task_subtypes=["Reasoning as Retrieval"],
license="CC BY-NC-SA 4.0",
socioeconomic_status="medium",
annotations_creators="derived",
dialect=[],
text_creation="found",
bibtex_citation="""@article{xiao2024rar,
title={RAR-b: Reasoning as Retrieval Benchmark},
author={Xiao, Chenghao and Hudson, G Thomas and Moubayed, Noura Al},
journal={arXiv preprint arXiv:2404.06347},
year={2024}
}
@article{muennighoff2023octopack,
title={Octopack: Instruction tuning code large language models},
author={Muennighoff, Niklas and Liu, Qian and Zebaze, Armel and Zheng, Qinkai and Hui, Binyuan and Zhuo, Terry Yue and Singh, Swayam and Tang, Xiangru and Von Werra, Leandro and Longpre, Shayne},
journal={arXiv preprint arXiv:2308.07124},
year={2023}
}
@article{austin2021program,
title={Program Synthesis with Large Language Models},
author={Austin, Jacob and Odena, Augustus and Nye, Maxwell and Bosma, Maarten and Michalewski, Henryk and Dohan, David and Jiang, Ellen and Cai, Carrie and Terry, Michael and Le, Quoc and others},
journal={arXiv preprint arXiv:2108.07732},
year={2021}
}
@article{husain2019codesearchnet,
title={Codesearchnet challenge: Evaluating the state of semantic code search},
author={Husain, Hamel and Wu, Ho-Hsiang and Gazit, Tiferet and Allamanis, Miltiadis and Brockschmidt, Marc},
journal={arXiv preprint arXiv:1909.09436},
year={2019}
}
""",
n_samples={"test": 1484},
avg_character_length={"test": 621.2},
)