Skip to content

Commit

Permalink
Fix mlvu aggregation utils logic (#555)
Browse files (browse the repository at this point in the history)
* Update utils.py

* Update utils.py

* Update utils.py
  • Loading branch information
shuyansy authored Feb 27, 2025
1 parent 56cf355 commit 13d139a
Showing 1 changed file with 15 additions and 9 deletions.
24 changes: 15 additions & 9 deletions lmms_eval/tasks/mlvu/utils.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -14,7 +14,7 @@

from lmms_eval.tasks._task_utils.file_utils import generate_submission_file

TASK_TYPES = ["TR", "AR", "VS", "NQA", "ER", "PQA", "SSC", "AO", "AC"]
TASK_TYPES = ["TR", "AR", "NQA", "ER", "PQA", "AO", "AC"]


hf_home = os.getenv("HF_HOME", "./~/.cache/huggingface")
Expand Down Expand Up @@ -105,20 +105,26 @@ def mlvu_aggregate_results(results):
category2score[task_type]["answered"] += 1
category2score[task_type]["correct"] += result["pred_answer"] == result["answer"]

task_category_scores = {}

# Calculate and log accuracy for each task category
for task_cate in TASK_TYPES:
total_correct = 0
total_answered = 0
for k, v in category2score.items():
if task_cate in k:
total_correct += v["correct"]
total_answered += v["answered"]
eval_logger.info(f"Evaluation on Task Categories: {task_cate}: {100 * total_correct / total_answered if total_answered > 0 else 0 : .1f}%")
accuracy = 100 * total_correct / total_answered if total_answered > 0 else 0
task_category_scores[task_cate] = accuracy
eval_logger.info(f"Evaluation on Task Categories: {task_cate}: {accuracy:.1f}%")

# Calculate and log average accuracy across all task categories
if TASK_TYPES:
average_accuracy = sum(task_category_scores.values()) / len(TASK_TYPES)
else:
average_accuracy = 0

total_correct = 0
total_answered = 0
for k, v in category2score.items():
total_correct += v["correct"]
total_answered += v["answered"]
eval_logger.info(f"Overall Performance: {100 * total_correct / total_answered if total_answered > 0 else 0 : .1f}%")
eval_logger.info(f"Average Performance Across All Task Categories: {average_accuracy:.1f}%")

return 100 * total_correct / total_answered if total_answered > 0 else 0
return average_accuracy

0 comments on commit 13d139a

Please sign in to comment.