diff --git a/lmms_eval/tasks/mmbench/_default_template_mmbench_cn_yaml b/lmms_eval/tasks/mmbench/_default_template_mmbench_cn_yaml
new file mode 100644
index 000000000..81094620b
--- /dev/null
+++ b/lmms_eval/tasks/mmbench/_default_template_mmbench_cn_yaml
@@ -0,0 +1,22 @@
+dataset_path: lmms-lab/MMBench
+dataset_kwargs:
+  token: True
+doc_to_target: "answer"
+dataset_name: "cn"
+output_type: generate_until
+doc_to_visual: !function cn_utils.mmbench_doc_to_visual
+doc_to_text: !function cn_utils.mmbench_doc_to_text
+generation_kwargs:
+  max_new_tokens: 256
+  temperature: 0
+  top_p: 0
+  num_beams: 1
+  do_sample: false
+process_results: !function cn_utils.mmbench_process_results
+model_specific_prompt_kwargs:
+  default:
+    pre_prompt: ""
+    post_prompt: "\n请直接使用所提供的选项字母作为答案回答。"
+model_specific_generation_kwargs:
+  llava:
+    image_aspect_ratio: original
diff --git a/lmms_eval/tasks/mmbench/_default_template_mmbench_en_yaml b/lmms_eval/tasks/mmbench/_default_template_mmbench_en_yaml
new file mode 100644
index 000000000..ab2b882c8
--- /dev/null
+++ b/lmms_eval/tasks/mmbench/_default_template_mmbench_en_yaml
@@ -0,0 +1,25 @@
+dataset_path: lmms-lab/MMBench
+dataset_kwargs:
+  token: True
+doc_to_target: "answer"
+model_specific_prompt_kwargs:
+  default:
+    pre_prompt: ""
+    post_prompt: "\nAnswer with the option's letter from the given choices directly."
+doc_to_visual: !function en_utils.mmbench_doc_to_visual
+doc_to_text: !function en_utils.mmbench_doc_to_text
+doc_to_target: "answer"
+process_results: !function en_utils.mmbench_process_results
+model_specific_generation_kwargs:
+  llava:
+    image_aspect_ratio: original
+output_type: generate_until
+dataset_name: "en"
+generation_kwargs:
+  until:
+    - "ASSISTANT:"
+  max_new_tokens: 1024
+  temperature: 0
+  top_p: 0
+  num_beams: 1
+  do_sample: false
diff --git a/lmms_eval/tasks/mmbench/cc_utils.py b/lmms_eval/tasks/mmbench/cc_utils.py
index c049613f2..7009e012e 100644
--- a/lmms_eval/tasks/mmbench/cc_utils.py
+++ b/lmms_eval/tasks/mmbench/cc_utils.py
@@ -7,6 +7,7 @@
 eval_logger = logging.getLogger("lmms-eval")
 
 from lmms_eval.tasks.mmbench.mmbench_evals import MMBench_Evaluator
+from lmms_eval.tasks._task_utils.file_utils import generate_submission_file
 
 with open(Path(__file__).parent / "mmbench_cn.yaml", "r") as f:
     raw_data = f.readlines()
@@ -66,9 +67,9 @@
     return data
 
 
-def mmbench_cn_cc_aggregate_results(results):
+def mmbench_cn_cc_aggregate_results(results, args):
     df = pd.DataFrame(results)
-    os.makedirs("./submissions", exist_ok=True)
-    with pd.ExcelWriter("./submissions/mmbench_cn_cc_results.xlsx") as writer:
+    file = generate_submission_file("mmbench_cn_cc_results.xlsx", args)
+    with pd.ExcelWriter(file) as writer:
         df.to_excel(writer, index=False)
-    eval_logger.info(f"Saved results to mmbench_cn_cc_results.xlsx")
+    eval_logger.info(f"Saved results to {file}")
diff --git a/lmms_eval/tasks/mmbench/cn_utils.py b/lmms_eval/tasks/mmbench/cn_utils.py
index 1010eb1b1..812b9aa38 100644
--- a/lmms_eval/tasks/mmbench/cn_utils.py
+++ b/lmms_eval/tasks/mmbench/cn_utils.py
@@ -83,8 +83,7 @@ def mmbench_aggregate_dev_results(results, args):
 
 def mmbench_aggregate_test_results(results, args):
     df = pd.DataFrame(results)
-    Path(args.output_path).joinpath("submissions").mkdir(parents=True, exist_ok=True)
-    excel_write_path = Path(args.output_path) / "submissions" / f"mmbench_cn_test_results.xlsx"
+    excel_write_path = generate_submission_file("mmbench_cn_test_results.xlsx", args)
     with pd.ExcelWriter(excel_write_path) as writer:
         df.to_excel(writer, index=False)
     eval_logger.info(f"Saved results to {excel_write_path}")
diff --git a/lmms_eval/tasks/mmbench/en_utils.py b/lmms_eval/tasks/mmbench/en_utils.py
index 1962dc528..26e260006 100644
--- a/lmms_eval/tasks/mmbench/en_utils.py
+++ b/lmms_eval/tasks/mmbench/en_utils.py
@@ -36,7 +36,7 @@ def mmbench_doc_to_text(doc, model_specific_prompt_kwargs=None):
         "answer": doc.get("answer", None),
         "options": options_prompt,
         "category": doc["category"],
-        "L2-category": doc["l2-category"],
+        "L2-category": doc["L2-category"],
         "options_dict": options_dict,
         "index": doc["index"],
         "hint": doc["hint"],
@@ -44,7 +44,7 @@
         "split": doc["split"],
     }
 
-    query_prompt = f"{data['hint']} {data['question']} {data['options']}" if pd.notna(data["hint"]) else f"{data['question']} {data['options']}"
+    query_prompt = f"{data['hint']} {data['question']} {data['options']}" if pd.notna(data["hint"]) and data["hint"] != "nan" else f"{data['question']} {data['options']}"
 
     if model_specific_prompt_kwargs:
         query_prompt = f"{query_prompt}\n{model_specific_prompt_kwargs['post_prompt']}"
@@ -64,7 +64,7 @@
             "source": doc["source"],
             "split": doc["split"],
             "category": doc["category"],
-            "L2-category": doc["l2-category"],
+            "L2-category": doc["L2-category"],
         }
     }
     option_candidate = ["A", "B", "C", "D", "E"]
@@ -83,8 +83,7 @@ def mmbench_aggregate_dev_results(results, args):
 
 def mmbench_aggregate_test_results(results, args):
     df = pd.DataFrame(results)
-    Path(args.output_path).joinpath("submissions").mkdir(parents=True, exist_ok=True)
-    excel_write_path = Path(args.output_path) / "submissions" / f"mmbench_en_test_results.xlsx"
+    excel_write_path = generate_submission_file("mmbench_en_test_results.xlsx", args)
     with pd.ExcelWriter(excel_write_path) as writer:
         df.to_excel(writer, index=False)
     eval_logger.info(f"Saved results to {excel_write_path}")
diff --git a/lmms_eval/tasks/mmbench/mmbench_cc.yaml b/lmms_eval/tasks/mmbench/mmbench_cc.yaml
index 0ec13674f..238aa10c9 100644
--- a/lmms_eval/tasks/mmbench/mmbench_cc.yaml
+++ b/lmms_eval/tasks/mmbench/mmbench_cc.yaml
@@ -1,9 +1,8 @@
-dataset_path: lmms-lab/MMBench_CN
+dataset_path: lmms-lab/MMBench
+dataset_name: cc
 dataset_kwargs:
   token: True
-group: mmbench_cn
 task: "mmbench_cn_cc"
-dataset_name: "chinese_culture"
 test_split: test
 output_type: generate_until
 doc_to_visual: !function cc_utils.mmbench_doc_to_visual
diff --git a/lmms_eval/tasks/mmbench/mmbench_cn.yaml b/lmms_eval/tasks/mmbench/mmbench_cn.yaml
index 82fddeb0c..6232531c4 100644
--- a/lmms_eval/tasks/mmbench/mmbench_cn.yaml
+++ b/lmms_eval/tasks/mmbench/mmbench_cn.yaml
@@ -5,4 +5,6 @@ task:
   - mmbench_cn_cc
 metadata:
   version: 0.0
-  sys_prompt: "有如下几个选项:"
\ No newline at end of file
+  gpt_eval_model_name: "gpt-3.5-turbo"
+  quick_extract: true
+  sys_prompt: "有如下几个选项:"
diff --git a/lmms_eval/tasks/mmbench/mmbench_cn_dev.yaml b/lmms_eval/tasks/mmbench/mmbench_cn_dev.yaml
index 3b2b4fbb1..3d7b9d98b 100644
--- a/lmms_eval/tasks/mmbench/mmbench_cn_dev.yaml
+++ b/lmms_eval/tasks/mmbench/mmbench_cn_dev.yaml
@@ -1,33 +1,7 @@
-dataset_path: lmms-lab/MMBench_CN
-dataset_kwargs:
-  token: True
-group: mmbench_cn
 task: "mmbench_cn_dev"
-dataset_name: "default"
 test_split: "dev"
-output_type: generate_until
-doc_to_visual: !function cn_utils.mmbench_doc_to_visual
-doc_to_text: !function cn_utils.mmbench_doc_to_text
-doc_to_target: "answer"
-generation_kwargs:
-  max_new_tokens: 256
-  temperature: 0
-  top_p: 0
-  num_beams: 1
-  do_sample: false
-process_results: !function cn_utils.mmbench_process_results
 metric_list:
   - metric: submission
+    higher_is_better: true
     aggregation: !function cn_utils.mmbench_aggregate_dev_results
-metadata:
-  version: 0.0
-  gpt_eval_model_name: "gpt-3.5-turbo"
-  quick_extract: true
-
-model_specific_prompt_kwargs:
-  default:
-    pre_prompt: ""
-    post_prompt: "\n请直接使用所提供的选项字母作为答案回答。"
-model_specific_generation_kwargs:
-  llava:
-    image_aspect_ratio: original
\ No newline at end of file
+include: _default_template_mmbench_cn_yaml
diff --git a/lmms_eval/tasks/mmbench/mmbench_cn_test.yaml b/lmms_eval/tasks/mmbench/mmbench_cn_test.yaml
index b17bb761d..b86f092cb 100644
--- a/lmms_eval/tasks/mmbench/mmbench_cn_test.yaml
+++ b/lmms_eval/tasks/mmbench/mmbench_cn_test.yaml
@@ -1,33 +1,7 @@
-dataset_path: lmms-lab/MMBench_CN
-dataset_kwargs:
-  token: True
-task: "mmbench_cn_test"
-dataset_name: "default"
+task: mmbench_cn_test
 test_split: test
-output_type: generate_until
-doc_to_visual: !function cn_utils.mmbench_doc_to_visual
-doc_to_text: !function cn_utils.mmbench_doc_to_text
-doc_to_target: "answer"
-generation_kwargs:
-  max_new_tokens: 256
-  temperature: 0
-  top_p: 0
-  num_beams: 1
-  do_sample: false
-process_results: !function cn_utils.mmbench_process_results
 metric_list:
   - metric: submission
     aggregation: !function cn_utils.mmbench_aggregate_test_results
     higher_is_better: true
-metadata:
-  version: 0.0
-  gpt_eval_model_name: "gpt-3.5-turbo"
-  quick_extract: true
-
-model_specific_prompt_kwargs:
-  default:
-    pre_prompt: ""
-    post_prompt: "\n请直接使用所提供的选项字母作为答案回答。"
-model_specific_generation_kwargs:
-  llava:
-    image_aspect_ratio: original
+include: _default_template_mmbench_cn_yaml
diff --git a/lmms_eval/tasks/mmbench/mmbench_en.yaml b/lmms_eval/tasks/mmbench/mmbench_en.yaml
index c518f924e..9fa757cc3 100644
--- a/lmms_eval/tasks/mmbench/mmbench_en.yaml
+++ b/lmms_eval/tasks/mmbench/mmbench_en.yaml
@@ -5,11 +5,3 @@ task:
 metadata:
   version: 0.0
   sys_prompt: "There are several options:"
-
-model_specific_prompt_kwargs:
-  default:
-    pre_prompt: ""
-    post_prompt: "\nAnswer with the option's letter from the given choices directly."
-model_specific_generation_kwargs:
-  llava:
-    image_aspect_ratio: original
\ No newline at end of file
diff --git a/lmms_eval/tasks/mmbench/mmbench_en_dev.yaml b/lmms_eval/tasks/mmbench/mmbench_en_dev.yaml
index 89d6ff76d..b4f4a2e9f 100644
--- a/lmms_eval/tasks/mmbench/mmbench_en_dev.yaml
+++ b/lmms_eval/tasks/mmbench/mmbench_en_dev.yaml
@@ -1,23 +1,7 @@
-dataset_path: lmms-lab/MMBench_EN
-dataset_kwargs:
-  token: True
 task: "mmbench_en_dev"
 test_split: dev
-output_type: generate_until
-doc_to_visual: !function en_utils.mmbench_doc_to_visual
-doc_to_text: !function en_utils.mmbench_doc_to_text
-doc_to_target: "answer"
-generation_kwargs:
-  until:
-    - "ASSISTANT:"
-  max_new_tokens: 1024
-  temperature: 0
-  top_p: 0
-  num_beams: 1
-  do_sample: false
-process_results: !function en_utils.mmbench_process_results
+include: _default_template_mmbench_en_yaml
 metric_list:
   - metric: submission
     aggregation: !function en_utils.mmbench_aggregate_dev_results
-metadata:
-  version: 0.0
\ No newline at end of file
+    higher_is_better: true
diff --git a/lmms_eval/tasks/mmbench/mmbench_en_test.yaml b/lmms_eval/tasks/mmbench/mmbench_en_test.yaml
index 92f73ef1f..5acf404af 100644
--- a/lmms_eval/tasks/mmbench/mmbench_en_test.yaml
+++ b/lmms_eval/tasks/mmbench/mmbench_en_test.yaml
@@ -1,22 +1,7 @@
-dataset_path: lmms-lab/MMBench_EN
-dataset_kwargs:
-  token: True
 task: "mmbench_en_test"
 test_split: test
-output_type: generate_until
-doc_to_visual: !function en_utils.mmbench_doc_to_visual
-doc_to_text: !function en_utils.mmbench_doc_to_text
-doc_to_target: "answer"
-generation_kwargs:
-  max_new_tokens: 256
-  temperature: 0
-  top_p: 0
-  num_beams: 1
-  do_sample: false
-process_results: !function en_utils.mmbench_process_results
+include: _default_template_mmbench_en_yaml
 metric_list:
   - metric: submission
     aggregation: !function en_utils.mmbench_aggregate_test_results
     higher_is_better: true
-metadata:
-  version: 0.0
\ No newline at end of file
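
Note: the generate_submission_file helper imported and called above lives in lmms_eval/tasks/_task_utils/file_utils.py, which is not touched by this diff, so its body is not shown. A minimal sketch of its assumed behavior (resolve a "submissions" directory under args.output_path, create it, and return the full file path) follows; the subpath keyword and exact return value are assumptions for illustration, not the verbatim implementation:

    import os

    def generate_submission_file(file_name, args, subpath="submissions"):
        # Assumed behavior: collect submission artifacts under <output_path>/<subpath>/.
        directory = os.path.join(args.output_path, subpath)
        os.makedirs(directory, exist_ok=True)
        # Return the absolute file path so callers (e.g. pd.ExcelWriter) can write to it directly.
        return os.path.abspath(os.path.join(directory, file_name))

Centralizing this logic is what lets the aggregate functions above drop their ad-hoc os.makedirs / Path(args.output_path).joinpath("submissions") handling.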