From 025f01680f73d7ce4a70b27a8f02849d1e94d6cd Mon Sep 17 00:00:00 2001
From: Yan Shu <570533048@qq.com>
Date: Tue, 25 Feb 2025 13:30:48 +0100
Subject: [PATCH 1/3] Update utils.py

---
 lmms_eval/tasks/mlvu/utils.py | 42 +++++++++++++++++++++++++++++++++++
 1 file changed, 42 insertions(+)

diff --git a/lmms_eval/tasks/mlvu/utils.py b/lmms_eval/tasks/mlvu/utils.py
index 8ddea3dd7..a464051a3 100644
--- a/lmms_eval/tasks/mlvu/utils.py
+++ b/lmms_eval/tasks/mlvu/utils.py
@@ -122,3 +122,45 @@ def mlvu_aggregate_results(results):
     eval_logger.info(f"Overall Performance: {100 * total_correct / total_answered if total_answered > 0 else 0 : .1f}%")
 
     return 100 * total_correct / total_answered if total_answered > 0 else 0
+
+
+
+def mlvu_aggregate_results(results):
+    """
+    Args:
+        results: a list of values returned by process_results
+    Returns:
+        A score
+    """
+    category2score = {}
+    for task_type in TASK_TYPES:
+        category2score[task_type] = {"correct": 0, "answered": 0}
+
+    for result in results:
+        task_type = result["task_type"]
+        category2score[task_type]["answered"] += 1
+        category2score[task_type]["correct"] += result["pred_answer"] == result["answer"]
+
+    task_category_scores = {}
+
+    # Calculate and log accuracy for each task category
+    for task_cate in TASK_TYPES:
+        total_correct = 0
+        total_answered = 0
+        for k, v in category2score.items():
+            if task_cate in k:
+                total_correct += v["correct"]
+                total_answered += v["answered"]
+        accuracy = 100 * total_correct / total_answered if total_answered > 0 else 0
+        task_category_scores[task_cate] = accuracy
+        eval_logger.info(f"Evaluation on Task Categories: {task_cate}: {accuracy:.1f}%")
+
+    # Calculate and log average accuracy across all task categories
+    if TASK_TYPES:
+        average_accuracy = sum(task_category_scores.values()) / len(TASK_TYPES)
+    else:
+        average_accuracy = 0
+
+    eval_logger.info(f"Average Performance Across All Task Categories: {average_accuracy:.1f}%")
+
+    return average_accuracy

From ccb911bc38671ce2833c17578bfd496b677afa70 Mon Sep 17 00:00:00 2001
From: Yan Shu <570533048@qq.com>
Date: Tue, 25 Feb 2025 13:33:06 +0100
Subject: [PATCH 2/3] Update utils.py

---
 lmms_eval/tasks/mlvu/utils.py | 36 -----------------------------------
 1 file changed, 36 deletions(-)

diff --git a/lmms_eval/tasks/mlvu/utils.py b/lmms_eval/tasks/mlvu/utils.py
index a464051a3..1f7a0d9ae 100644
--- a/lmms_eval/tasks/mlvu/utils.py
+++ b/lmms_eval/tasks/mlvu/utils.py
@@ -89,42 +89,6 @@ def mlvu_process_results(doc, results):
     return {f"mlvu_perception_score": data_dict}
 
 
-def mlvu_aggregate_results(results):
-    """
-    Args:
-        results: a list of values returned by process_results
-    Returns:
-        A score
-    """
-    category2score = {}
-    for task_type in TASK_TYPES:
-        category2score[task_type] = {"correct": 0, "answered": 0}
-
-    for result in results:
-        task_type = result["task_type"]
-        category2score[task_type]["answered"] += 1
-        category2score[task_type]["correct"] += result["pred_answer"] == result["answer"]
-
-    for task_cate in TASK_TYPES:
-        total_correct = 0
-        total_answered = 0
-        for k, v in category2score.items():
-            if task_cate in k:
-                total_correct += v["correct"]
-                total_answered += v["answered"]
-        eval_logger.info(f"Evaluation on Task Categories: {task_cate}: {100 * total_correct / total_answered if total_answered > 0 else 0 : .1f}%")
-
-    total_correct = 0
-    total_answered = 0
-    for k, v in category2score.items():
-        total_correct += v["correct"]
-        total_answered += v["answered"]
-    eval_logger.info(f"Overall Performance: {100 * total_correct / total_answered if total_answered > 0 else 0 : .1f}%")
-
-    return 100 * total_correct / total_answered if total_answered > 0 else 0
-
-
-
 def mlvu_aggregate_results(results):
     """
     Args:

From 765ef79f04d8a3991beecb4dbe3aa711557bacf1 Mon Sep 17 00:00:00 2001
From: Yan Shu <570533048@qq.com>
Date: Thu, 27 Feb 2025 10:51:05 +0100
Subject: [PATCH 3/3] Update utils.py

---
 lmms_eval/tasks/mlvu/utils.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/lmms_eval/tasks/mlvu/utils.py b/lmms_eval/tasks/mlvu/utils.py
index 1f7a0d9ae..77b3e9cfb 100644
--- a/lmms_eval/tasks/mlvu/utils.py
+++ b/lmms_eval/tasks/mlvu/utils.py
@@ -14,7 +14,7 @@ from lmms_eval.tasks._task_utils.file_utils import generate_submission_file
 
 
-TASK_TYPES = ["TR", "AR", "VS", "NQA", "ER", "PQA", "SSC", "AO", "AC"]
+TASK_TYPES = ["TR", "AR", "NQA", "ER", "PQA", "AO", "AC"]
 
 hf_home = os.getenv("HF_HOME", "./~/.cache/huggingface")
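
Note (not part of the patch series): taken together, patches 1 and 2 replace the question-level micro-average previously returned by mlvu_aggregate_results with a macro-average over the categories in TASK_TYPES. A minimal worked sketch of the difference follows, using hypothetical per-category tallies; the numbers are illustrative only, not benchmark results:

    # Hypothetical category2score after tallying all results:
    category2score = {
        "TR":  {"correct": 9, "answered": 10},   # 90.0%
        "AR":  {"correct": 8, "answered": 10},   # 80.0%
        "NQA": {"correct": 5, "answered": 20},   # 25.0%
        "ER":  {"correct": 7, "answered": 10},   # 70.0%
        "PQA": {"correct": 6, "answered": 10},   # 60.0%
        "AO":  {"correct": 4, "answered": 10},   # 40.0%
        "AC":  {"correct": 3, "answered": 10},   # 30.0%
    }
    # Old return value (micro-average over questions):
    #   100 * (9+8+5+7+6+4+3) / (10+10+20+10+10+10+10) = 100 * 42 / 80 = 52.5
    # New return value (macro-average over the 7 categories):
    #   (90 + 80 + 25 + 70 + 60 + 40 + 30) / 7 = 395 / 7 ≈ 56.4

Because the new average divides by len(TASK_TYPES), a category with no answered questions contributes 0% to the reported score rather than being skipped.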