-
Notifications
You must be signed in to change notification settings - Fork 312
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
* RARb initial PR * RAR-b initial PR * RAR-b initial PR * RAR-b initial PR * fill metadata * add metadata n samples&char length * taskdata subtask reasoning as retrieval * fix formatting errors * metadata description made descriptive
- Loading branch information
1 parent
ea62028
commit b75a9c9
Showing
53 changed files
with
6,206 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,46 @@ | ||
from __future__ import annotations | ||
|
||
from mteb.abstasks.TaskMetadata import TaskMetadata | ||
|
||
from ....abstasks.AbsTaskRetrieval import AbsTaskRetrieval | ||
|
||
|
||
class ARCChallenge(AbsTaskRetrieval):
    """RAR-b retrieval task built on the ARC-Challenge dataset.

    The class only declares ``metadata``; it defines no methods of its own.
    """

    metadata = TaskMetadata(
        # --- Identity ---
        name="ARCChallenge",
        description="Measuring the ability to retrieve the groundtruth answers to reasoning task queries on ARC-Challenge.",
        reference="https://allenai.org/data/arc",
        type="Retrieval",
        category="s2s",
        task_subtypes=["Reasoning as Retrieval"],
        # --- Data source and evaluation setup ---
        dataset={
            "path": "RAR-b/ARC-Challenge",
            "revision": "c481e0da3dcbbad8bce7721dea9085b74320a0a3",
        },
        eval_splits=["test"],
        eval_langs=["eng-Latn"],
        main_score="ndcg_at_10",
        n_samples={"test": 1172},
        avg_character_length={"test": 161.7},
        # --- Provenance ---
        date=("2018-01-01", "2018-12-31"),
        form=["written"],
        domains=["Encyclopaedic"],
        license="CC BY-SA 4.0",
        socioeconomic_status="medium",
        annotations_creators="derived",
        dialect=[],
        text_creation="found",
        # Two entries: the RAR-b benchmark paper, then the ARC paper.
        bibtex_citation="""@article{xiao2024rar,
title={RAR-b: Reasoning as Retrieval Benchmark},
author={Xiao, Chenghao and Hudson, G Thomas and Moubayed, Noura Al},
journal={arXiv preprint arXiv:2404.06347},
year={2024}
}
@article{clark2018think,
title={Think you have solved question answering? try arc, the ai2 reasoning challenge},
author={Clark, Peter and Cowhey, Isaac and Etzioni, Oren and Khot, Tushar and Sabharwal, Ashish and Schoenick, Carissa and Tafjord, Oyvind},
journal={arXiv preprint arXiv:1803.05457},
year={2018}
}
""",
    )
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,47 @@ | ||
from __future__ import annotations | ||
|
||
from mteb.abstasks.TaskMetadata import TaskMetadata | ||
|
||
from ....abstasks.AbsTaskRetrieval import AbsTaskRetrieval | ||
|
||
|
||
class AlphaNLI(AbsTaskRetrieval):
    """RAR-b retrieval task built on the AlphaNLI dataset.

    The class only declares ``metadata``; it defines no methods of its own.
    """

    metadata = TaskMetadata(
        # --- Identity ---
        name="AlphaNLI",
        description="Measuring the ability to retrieve the groundtruth answers to reasoning task queries on AlphaNLI.",
        reference="https://leaderboard.allenai.org/anli/submissions/get-started",
        type="Retrieval",
        category="s2s",
        task_subtypes=["Reasoning as Retrieval"],
        # --- Data source and evaluation setup ---
        dataset={
            "path": "RAR-b/alphanli",
            "revision": "303f40ef3d50918d3dc43577d33f2f7344ad72c1",
        },
        eval_splits=["test"],
        eval_langs=["eng-Latn"],
        main_score="ndcg_at_10",
        n_samples={"test": 1532},
        avg_character_length={"test": 147.8},
        # --- Provenance ---
        date=("2019-01-01", "2019-12-31"),
        form=["written"],
        domains=["Encyclopaedic"],
        license="CC BY-NC 4.0",
        socioeconomic_status="medium",
        annotations_creators="derived",
        dialect=[],
        text_creation="found",
        # Two entries: the RAR-b benchmark paper, then the AlphaNLI paper.
        bibtex_citation="""@article{xiao2024rar,
title={RAR-b: Reasoning as Retrieval Benchmark},
author={Xiao, Chenghao and Hudson, G Thomas and Moubayed, Noura Al},
journal={arXiv preprint arXiv:2404.06347},
year={2024}
}
@article{bhagavatula2019abductive,
title={Abductive commonsense reasoning},
author={Bhagavatula, Chandra and Bras, Ronan Le and Malaviya, Chaitanya and Sakaguchi, Keisuke and Holtzman, Ari and Rashkin, Hannah and Downey, Doug and Yih, Scott Wen-tau and Choi, Yejin},
journal={arXiv preprint arXiv:1908.05739},
year={2019}
}
""",
    )
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,46 @@ | ||
from __future__ import annotations | ||
|
||
from mteb.abstasks.TaskMetadata import TaskMetadata | ||
|
||
from ....abstasks.AbsTaskRetrieval import AbsTaskRetrieval | ||
|
||
|
||
class HellaSwag(AbsTaskRetrieval):
    """RAR-b retrieval task built on the HellaSwag dataset.

    The class only declares ``metadata``; it defines no methods of its own.
    """

    metadata = TaskMetadata(
        # --- Identity ---
        name="HellaSwag",
        description="Measuring the ability to retrieve the groundtruth answers to reasoning task queries on HellaSwag.",
        reference="https://rowanzellers.com/hellaswag/",
        type="Retrieval",
        category="s2s",
        task_subtypes=["Reasoning as Retrieval"],
        # --- Data source and evaluation setup ---
        dataset={
            "path": "RAR-b/hellaswag",
            "revision": "a5c990205e017d10761197ccab3000936689c3ae",
        },
        eval_splits=["test"],
        eval_langs=["eng-Latn"],
        main_score="ndcg_at_10",
        n_samples={"test": 10042},
        avg_character_length={"test": 366.1},
        # --- Provenance ---
        date=("2019-01-01", "2019-12-31"),
        form=["written"],
        domains=["Encyclopaedic"],
        license="MIT",
        socioeconomic_status="medium",
        annotations_creators="derived",
        dialect=[],
        text_creation="found",
        # Two entries: the RAR-b benchmark paper, then the HellaSwag paper.
        bibtex_citation="""@article{xiao2024rar,
title={RAR-b: Reasoning as Retrieval Benchmark},
author={Xiao, Chenghao and Hudson, G Thomas and Moubayed, Noura Al},
journal={arXiv preprint arXiv:2404.06347},
year={2024}
}
@article{zellers2019hellaswag,
title={Hellaswag: Can a machine really finish your sentence?},
author={Zellers, Rowan and Holtzman, Ari and Bisk, Yonatan and Farhadi, Ali and Choi, Yejin},
journal={arXiv preprint arXiv:1905.07830},
year={2019}
}
""",
    )
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,49 @@ | ||
from __future__ import annotations | ||
|
||
from mteb.abstasks.TaskMetadata import TaskMetadata | ||
|
||
from ....abstasks.AbsTaskRetrieval import AbsTaskRetrieval | ||
|
||
|
||
class PIQA(AbsTaskRetrieval):
    """RAR-b retrieval task built on the PIQA dataset.

    The class only declares ``metadata``; it defines no methods of its own.
    """

    metadata = TaskMetadata(
        # --- Identity ---
        name="PIQA",
        description="Measuring the ability to retrieve the groundtruth answers to reasoning task queries on PIQA.",
        reference="https://arxiv.org/abs/1911.11641",
        type="Retrieval",
        category="s2s",
        task_subtypes=["Reasoning as Retrieval"],
        # --- Data source and evaluation setup ---
        dataset={
            "path": "RAR-b/piqa",
            "revision": "bb30be7e9184e6b6b1d99bbfe1bb90a3a81842e6",
        },
        eval_splits=["test"],
        eval_langs=["eng-Latn"],
        main_score="ndcg_at_10",
        n_samples={"test": 1838},
        avg_character_length={"test": 134.3},
        # --- Provenance ---
        date=("2020-01-01", "2020-12-31"),
        form=["written"],
        domains=["Encyclopaedic"],
        license="AFL-3.0",
        socioeconomic_status="medium",
        annotations_creators="derived",
        dialect=[],
        text_creation="found",
        # Two entries: the RAR-b benchmark paper, then the PIQA paper.
        bibtex_citation="""@article{xiao2024rar,
title={RAR-b: Reasoning as Retrieval Benchmark},
author={Xiao, Chenghao and Hudson, G Thomas and Moubayed, Noura Al},
journal={arXiv preprint arXiv:2404.06347},
year={2024}
}
@inproceedings{bisk2020piqa,
title={Piqa: Reasoning about physical commonsense in natural language},
author={Bisk, Yonatan and Zellers, Rowan and Gao, Jianfeng and Choi, Yejin and others},
booktitle={Proceedings of the AAAI conference on artificial intelligence},
volume={34},
number={05},
pages={7432--7439},
year={2020}
}
""",
    )
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,49 @@ | ||
from __future__ import annotations | ||
|
||
from mteb.abstasks.TaskMetadata import TaskMetadata | ||
|
||
from ....abstasks.AbsTaskRetrieval import AbsTaskRetrieval | ||
|
||
|
||
class Quail(AbsTaskRetrieval):
    """RAR-b retrieval task built on the Quail dataset.

    The class only declares ``metadata``; it defines no methods of its own.
    """

    metadata = TaskMetadata(
        # --- Identity ---
        name="Quail",
        description="Measuring the ability to retrieve the groundtruth answers to reasoning task queries on Quail.",
        reference="https://text-machine.cs.uml.edu/lab2/projects/quail/",
        type="Retrieval",
        category="s2s",
        task_subtypes=["Reasoning as Retrieval"],
        # --- Data source and evaluation setup ---
        dataset={
            "path": "RAR-b/quail",
            "revision": "1851bc536f8bdab29e03e29191c4586b1d8d7c5a",
        },
        eval_splits=["test"],
        eval_langs=["eng-Latn"],
        main_score="ndcg_at_10",
        n_samples={"test": 2720},
        avg_character_length={"test": 1983.3},
        # --- Provenance ---
        date=("2020-01-01", "2020-12-31"),
        form=["written"],
        domains=["Encyclopaedic"],
        license="CC BY-NC-SA 4.0",
        socioeconomic_status="medium",
        annotations_creators="derived",
        dialect=[],
        text_creation="found",
        # Two entries: the RAR-b benchmark paper, then the Quail paper.
        bibtex_citation="""@article{xiao2024rar,
title={RAR-b: Reasoning as Retrieval Benchmark},
author={Xiao, Chenghao and Hudson, G Thomas and Moubayed, Noura Al},
journal={arXiv preprint arXiv:2404.06347},
year={2024}
}
@inproceedings{rogers2020getting,
title={Getting closer to AI complete question answering: A set of prerequisite real tasks},
author={Rogers, Anna and Kovaleva, Olga and Downey, Matthew and Rumshisky, Anna},
booktitle={Proceedings of the AAAI conference on artificial intelligence},
volume={34},
number={05},
pages={8722--8731},
year={2020}
}
""",
    )
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,57 @@ | ||
from __future__ import annotations | ||
|
||
from mteb.abstasks.TaskMetadata import TaskMetadata | ||
|
||
from ....abstasks.AbsTaskRetrieval import AbsTaskRetrieval | ||
|
||
|
||
class RARbCode(AbsTaskRetrieval):
    """RAR-b retrieval task built on the pooled HumanEvalPack/MBPP code dataset.

    The class only declares ``metadata``; it defines no methods of its own.
    """

    metadata = TaskMetadata(
        name="RARbCode",
        description="Measuring the ability to retrieve the groundtruth answers to reasoning task queries on RAR-b code-pooled dataset.",
        reference="https://arxiv.org/abs/2404.06347",
        dataset={
            "path": "RAR-b/humanevalpack-mbpp-pooled",
            "revision": "25f7d11a7ac12dcbb8d3836eb2de682b98c825e4",
        },
        type="Retrieval",
        category="s2p",
        eval_splits=["test"],
        eval_langs=["eng-Latn"],
        main_score="ndcg_at_10",
        date=("2019-01-01", "2023-12-31"),
        form=["written"],
        domains=["Programming"],
        task_subtypes=["Reasoning as Retrieval"],
        license="CC BY-NC-SA 4.0",
        socioeconomic_status="medium",
        annotations_creators="derived",
        dialect=[],
        text_creation="found",
        # Four entries: RAR-b, OctoPack, the MBPP program-synthesis paper and
        # CodeSearchNet. Every entry must end with its own closing brace;
        # otherwise the following "@article" is swallowed into the previous
        # entry and the citation block is not valid BibTeX.
        bibtex_citation="""@article{xiao2024rar,
title={RAR-b: Reasoning as Retrieval Benchmark},
author={Xiao, Chenghao and Hudson, G Thomas and Moubayed, Noura Al},
journal={arXiv preprint arXiv:2404.06347},
year={2024}
}
@article{muennighoff2023octopack,
title={Octopack: Instruction tuning code large language models},
author={Muennighoff, Niklas and Liu, Qian and Zebaze, Armel and Zheng, Qinkai and Hui, Binyuan and Zhuo, Terry Yue and Singh, Swayam and Tang, Xiangru and Von Werra, Leandro and Longpre, Shayne},
journal={arXiv preprint arXiv:2308.07124},
year={2023}
}
@article{austin2021program,
title={Program Synthesis with Large Language Models},
author={Austin, Jacob and Odena, Augustus and Nye, Maxwell and Bosma, Maarten and Michalewski, Henryk and Dohan, David and Jiang, Ellen and Cai, Carrie and Terry, Michael and Le, Quoc and others},
journal={arXiv preprint arXiv:2108.07732},
year={2021}
}
@article{husain2019codesearchnet,
title={Codesearchnet challenge: Evaluating the state of semantic code search},
author={Husain, Hamel and Wu, Ho-Hsiang and Gazit, Tiferet and Allamanis, Miltiadis and Brockschmidt, Marc},
journal={arXiv preprint arXiv:1909.09436},
year={2019}
}
""",
        n_samples={"test": 1484},
        avg_character_length={"test": 621.2},
    )
Oops, something went wrong.