Add an `aime25` evaluation task.

* src/open_r1/evaluate.py: define the `aime25` LightevalTaskConfig in the
  "custom" suite (prompt function `aime_prompt_fn`, dataset
  `open-r1/aime_2025_1`, subset `default`, evaluated on the `train` split,
  no few-shot split, generation_size 32768, metric `expr_gold_metric`) and
  append it to TASKS_TABLE.
* src/open_r1/utils/evaluation.py: register `aime25` in LIGHTEVAL_TASKS
  through register_lighteval_task, with the same argument shape as the
  existing `aime24` entry.

NOTE(review): the line structure of this patch was reconstructed from the
hunk headers; blank context lines and 4-space indentation are inferred --
confirm with `git apply --check` against the target tree.

diff --git a/src/open_r1/evaluate.py b/src/open_r1/evaluate.py
index 0447b266..b32389ac 100644
--- a/src/open_r1/evaluate.py
+++ b/src/open_r1/evaluate.py
@@ -106,6 +106,20 @@ def gpqa_prompt_fn(line, task_name: str = None):
     metric=[expr_gold_metric],
     version=1,
 )
+aime25 = LightevalTaskConfig(
+    name="aime25",
+    suite=["custom"],
+    prompt_function=aime_prompt_fn,
+    hf_repo="open-r1/aime_2025_1",
+    hf_subset="default",
+    hf_avail_splits=["train"],
+    evaluation_splits=["train"],
+    few_shots_split=None,
+    few_shots_select=None,
+    generation_size=32768,
+    metric=[expr_gold_metric],
+    version=1,
+)
 math_500 = LightevalTaskConfig(
     name="math_500",
     suite=["custom"],
@@ -141,6 +155,7 @@ def gpqa_prompt_fn(line, task_name: str = None):
 # Add tasks to the table
 TASKS_TABLE = []
 TASKS_TABLE.append(aime24)
+TASKS_TABLE.append(aime25)
 TASKS_TABLE.append(math_500)
 TASKS_TABLE.append(gpqa_diamond)
 
diff --git a/src/open_r1/utils/evaluation.py b/src/open_r1/utils/evaluation.py
index 86de906d..48997183 100644
--- a/src/open_r1/utils/evaluation.py
+++ b/src/open_r1/utils/evaluation.py
@@ -48,6 +48,7 @@ def register_lighteval_task(
 
 register_lighteval_task(LIGHTEVAL_TASKS, "custom", "math_500", "math_500", 0)
 register_lighteval_task(LIGHTEVAL_TASKS, "custom", "aime24", "aime24", 0)
+register_lighteval_task(LIGHTEVAL_TASKS, "custom", "aime25", "aime25", 0)
 register_lighteval_task(LIGHTEVAL_TASKS, "custom", "gpqa", "gpqa:diamond", 0)
 
 