
Commit

Update utils.py
shuyansy authored Feb 25, 2025
1 parent eb2dadc commit 025f016
Showing 1 changed file with 42 additions and 0 deletions.
42 changes: 42 additions & 0 deletions lmms_eval/tasks/mlvu/utils.py
@@ -122,3 +122,45 @@ def mlvu_aggregate_results(results):
eval_logger.info(f"Overall Performance: {100 * total_correct / total_answered if total_answered > 0 else 0 : .1f}%")

return 100 * total_correct / total_answered if total_answered > 0 else 0



def mlvu_aggregate_results(results):
    """
    Args:
        results: a list of dicts returned by process_results, each containing
            "task_type", "pred_answer", and "answer" keys
    Returns:
        The average accuracy (as a percentage) across all task categories
    """
    category2score = {}
    for task_type in TASK_TYPES:
        category2score[task_type] = {"correct": 0, "answered": 0}

    for result in results:
        task_type = result["task_type"]
        category2score[task_type]["answered"] += 1
        category2score[task_type]["correct"] += result["pred_answer"] == result["answer"]

    task_category_scores = {}

    # Calculate and log accuracy for each task category
    for task_cate in TASK_TYPES:
        total_correct = 0
        total_answered = 0
        for k, v in category2score.items():
            if task_cate in k:
                total_correct += v["correct"]
                total_answered += v["answered"]
        accuracy = 100 * total_correct / total_answered if total_answered > 0 else 0
        task_category_scores[task_cate] = accuracy
        eval_logger.info(f"Evaluation on Task Categories: {task_cate}: {accuracy:.1f}%")

    # Calculate and log average accuracy across all task categories
    if TASK_TYPES:
        average_accuracy = sum(task_category_scores.values()) / len(TASK_TYPES)
    else:
        average_accuracy = 0

    eval_logger.info(f"Average Performance Across All Task Categories: {average_accuracy:.1f}%")

    return average_accuracy
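
For reference, a minimal sketch of how the added aggregation could be exercised, assuming each entry in results carries the "task_type", "pred_answer", and "answer" keys the function reads, and that TASK_TYPES and eval_logger are already defined in this module. The task-type strings and answer values below are placeholders and would need to match entries in the module's TASK_TYPES list:

    # Hypothetical sanity check; "needle" and "order" stand in for real MLVU task types.
    sample_results = [
        {"task_type": "needle", "pred_answer": "A", "answer": "A"},
        {"task_type": "needle", "pred_answer": "B", "answer": "C"},
        {"task_type": "order", "pred_answer": "D", "answer": "D"},
    ]
    # Logs per-category accuracy via eval_logger and returns the mean across all TASK_TYPES;
    # categories with no answered examples contribute 0 to the average.
    avg = mlvu_aggregate_results(sample_results)
    print(f"{avg:.1f}")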
