[Tasks] Fix MMBench #13

Merged · 9 commits · Mar 16, 2024
22 changes: 22 additions & 0 deletions lmms_eval/tasks/mmbench/_default_template_mmbench_cn_yaml
@@ -0,0 +1,22 @@
dataset_path: lmms-lab/MMBench
dataset_kwargs:
  token: True
doc_to_target: "answer"
dataset_name: "cn"
output_type: generate_until
doc_to_visual: !function cn_utils.mmbench_doc_to_visual
doc_to_text: !function cn_utils.mmbench_doc_to_text
generation_kwargs:
  max_new_tokens: 256
  temperature: 0
  top_p: 0
  num_beams: 1
  do_sample: false
process_results: !function cn_utils.mmbench_process_results
model_specific_prompt_kwargs:
  default:
    pre_prompt: ""
    post_prompt: "\n请直接使用所提供的选项字母作为答案回答。" # "Please answer directly with the letter of the provided option."
model_specific_generation_kwargs:
  llava:
    image_aspect_ratio: original
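The !function tags above bind YAML keys to Python callables. lmms-eval does this with a custom PyYAML tag; the snippet below is a minimal sketch of how such a tag can be resolved, with the resolver name and module handling being illustrative only (the project's actual loader also resolves module paths relative to the task directory):

import importlib

import yaml

def _function_constructor(loader, node):
    # "!function cn_utils.mmbench_doc_to_visual" -> the callable itself
    qualified_name = loader.construct_scalar(node)
    module_name, attr = qualified_name.rsplit(".", 1)
    return getattr(importlib.import_module(module_name), attr)

# Register the tag so functions are resolved while the task YAML is parsed.
yaml.SafeLoader.add_constructor("!function", _function_constructor)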
25 changes: 25 additions & 0 deletions lmms_eval/tasks/mmbench/_default_template_mmbench_en_yaml
@@ -0,0 +1,25 @@
dataset_path: lmms-lab/MMBench
dataset_kwargs:
  token: True
doc_to_target: "answer"
model_specific_prompt_kwargs:
  default:
    pre_prompt: ""
    post_prompt: "\nAnswer with the option's letter from the given choices directly."
doc_to_visual: !function en_utils.mmbench_doc_to_visual
doc_to_text: !function en_utils.mmbench_doc_to_text
process_results: !function en_utils.mmbench_process_results
model_specific_generation_kwargs:
  llava:
    image_aspect_ratio: original
output_type: generate_until
dataset_name: "en"
generation_kwargs:
  until:
    - "ASSISTANT:"
  max_new_tokens: 1024
  temperature: 0
  top_p: 0
  num_beams: 1
  do_sample: false
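Unlike the CN template, this EN template also sets an until stop list, so generation is cut off once the model emits "ASSISTANT:". The helper below is a hypothetical illustration of the post-hoc trimming such a setting implies; it is not lmms-eval's implementation:

def truncate_at_stop(text, stop_sequences=("ASSISTANT:",)):
    # Keep only the text before the first stop sequence, mimicking `until`.
    for stop in stop_sequences:
        idx = text.find(stop)
        if idx != -1:
            text = text[:idx]
    return text

print(truncate_at_stop("The answer is B. ASSISTANT: ..."))  # "The answer is B. "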
9 changes: 5 additions & 4 deletions lmms_eval/tasks/mmbench/cc_utils.py
@@ -7,6 +7,7 @@

 eval_logger = logging.getLogger("lmms-eval")
 from lmms_eval.tasks.mmbench.mmbench_evals import MMBench_Evaluator
+from lmms_eval.tasks._task_utils.file_utils import generate_submission_file

 with open(Path(__file__).parent / "mmbench_cn.yaml", "r") as f:
     raw_data = f.readlines()
@@ -66,9 +67,9 @@ def mmbench_cn_cc_process_results(doc, results):
     return data


-def mmbench_cn_cc_aggregate_results(results):
+def mmbench_cn_cc_aggregate_results(results, args):
     df = pd.DataFrame(results)
-    os.makedirs("./submissions", exist_ok=True)
-    with pd.ExcelWriter("./submissions/mmbench_cn_cc_results.xlsx") as writer:
+    file = generate_submission_file("mmbench_cn_cc_results.xlsx", args)
+    with pd.ExcelWriter(file) as writer:
         df.to_excel(writer, index=False)
-    eval_logger.info(f"Saved results to mmbench_cn_cc_results.xlsx")
+    eval_logger.info(f"Saved results to {file}")
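This diff replaces a hard-coded ./submissions directory with the shared generate_submission_file helper, so results land under the run's output_path rather than the current working directory. The real helper lives in lmms_eval/tasks/_task_utils/file_utils.py and may differ; a plausible sketch, assuming it simply builds and creates the path:

import os

def generate_submission_file(file_name, args, subpath="submissions"):
    # Build <args.output_path>/submissions/<file_name>, creating the
    # directory if needed, and return the full path to the caller.
    directory = os.path.join(args.output_path, subpath)
    os.makedirs(directory, exist_ok=True)
    return os.path.join(directory, file_name)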
3 changes: 1 addition & 2 deletions lmms_eval/tasks/mmbench/cn_utils.py
@@ -83,8 +83,7 @@ def mmbench_aggregate_dev_results(results, args):

 def mmbench_aggregate_test_results(results, args):
     df = pd.DataFrame(results)
-    Path(args.output_path).joinpath("submissions").mkdir(parents=True, exist_ok=True)
-    excel_write_path = Path(args.output_path) / "submissions" / f"mmbench_cn_test_results.xlsx"
+    excel_write_path = generate_submission_file("mmbench_cn_test_results.xlsx", args)
     with pd.ExcelWriter(excel_write_path) as writer:
         df.to_excel(writer, index=False)
     eval_logger.info(f"Saved results to {excel_write_path}")
9 changes: 4 additions & 5 deletions lmms_eval/tasks/mmbench/en_utils.py
@@ -36,15 +36,15 @@ def mmbench_doc_to_text(doc, model_specific_prompt_kwargs=None):
         "answer": doc.get("answer", None),
         "options": options_prompt,
         "category": doc["category"],
-        "L2-category": doc["l2-category"],
+        "L2-category": doc["L2-category"],
         "options_dict": options_dict,
         "index": doc["index"],
         "hint": doc["hint"],
         "source": doc["source"],
         "split": doc["split"],
     }

-    query_prompt = f"{data['hint']} {data['question']} {data['options']}" if pd.notna(data["hint"]) else f"{data['question']} {data['options']}"
+    query_prompt = f"{data['hint']} {data['question']} {data['options']}" if pd.notna(data["hint"]) and data["hint"] != "nan" else f"{data['question']} {data['options']}"

     if model_specific_prompt_kwargs:
         query_prompt = f"{query_prompt}\n{model_specific_prompt_kwargs['post_prompt']}"
@@ -64,7 +64,7 @@ def mmbench_process_results(doc, results):
             "source": doc["source"],
             "split": doc["split"],
             "category": doc["category"],
-            "L2-category": doc["l2-category"],
+            "L2-category": doc["L2-category"],
         }
     }
     option_candidate = ["A", "B", "C", "D", "E"]
@@ -83,8 +83,7 @@ def mmbench_aggregate_dev_results(results, args):

 def mmbench_aggregate_test_results(results, args):
     df = pd.DataFrame(results)
-    Path(args.output_path).joinpath("submissions").mkdir(parents=True, exist_ok=True)
-    excel_write_path = Path(args.output_path) / "submissions" / f"mmbench_en_test_results.xlsx"
+    excel_write_path = generate_submission_file("mmbench_en_test_results.xlsx", args)
     with pd.ExcelWriter(excel_write_path) as writer:
         df.to_excel(writer, index=False)
     eval_logger.info(f"Saved results to {excel_write_path}")
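The hint guard added above is needed because pd.notna only detects genuinely missing values (None, NaN); the literal string "nan", which this fix suggests can occur in the dataset, passes it. A quick demonstration:

import pandas as pd

print(pd.notna("nan"))         # True  -- a non-empty string is not NA
print(pd.notna(float("nan")))  # False -- a real NaN is caught
print(pd.notna(None))          # False

# Hence the extra comparison before prepending the hint:
hint = "nan"
print(pd.notna(hint) and hint != "nan")  # False -- "nan" hints are skipped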
5 changes: 2 additions & 3 deletions lmms_eval/tasks/mmbench/mmbench_cc.yaml
@@ -1,9 +1,8 @@
-dataset_path: lmms-lab/MMBench_CN
+dataset_path: lmms-lab/MMBench
+dataset_name: cc
 dataset_kwargs:
   token: True
 group: mmbench_cn
 task: "mmbench_cn_cc"
-dataset_name: "chinese_culture"
 test_split: test
 output_type: generate_until
 doc_to_visual: !function cc_utils.mmbench_doc_to_visual
4 changes: 3 additions & 1 deletion lmms_eval/tasks/mmbench/mmbench_cn.yaml
@@ -5,4 +5,6 @@ task:
   - mmbench_cn_cc
 metadata:
   version: 0.0
-  sys_prompt: "有如下几个选项:"
+  gpt_eval_model_name: "gpt-3.5-turbo"
+  quick_extract: true
+  sys_prompt: "有如下几个选项:" # "There are several options:"
30 changes: 2 additions & 28 deletions lmms_eval/tasks/mmbench/mmbench_cn_dev.yaml
@@ -1,33 +1,7 @@
-dataset_path: lmms-lab/MMBench_CN
-dataset_kwargs:
-  token: True
 group: mmbench_cn
 task: "mmbench_cn_dev"
+dataset_name: "default"
 test_split: "dev"
-output_type: generate_until
-doc_to_visual: !function cn_utils.mmbench_doc_to_visual
-doc_to_text: !function cn_utils.mmbench_doc_to_text
-doc_to_target: "answer"
-generation_kwargs:
-  max_new_tokens: 256
-  temperature: 0
-  top_p: 0
-  num_beams: 1
-  do_sample: false
-process_results: !function cn_utils.mmbench_process_results
 metric_list:
   - metric: submission
     higher_is_better: true
     aggregation: !function cn_utils.mmbench_aggregate_dev_results
-metadata:
-  version: 0.0
-  gpt_eval_model_name: "gpt-3.5-turbo"
-  quick_extract: true
-
-model_specific_prompt_kwargs:
-  default:
-    pre_prompt: ""
-    post_prompt: "\n请直接使用所提供的选项字母作为答案回答。"
-model_specific_generation_kwargs:
-  llava:
-    image_aspect_ratio: original
+include: _default_template_mmbench_cn_yaml
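With this change the per-split configs shrink to a few overrides plus an include: of the shared template, with the including file's keys taking precedence. A simplified sketch of that merge, assuming a plain-YAML loader (the real one must also handle custom tags such as !function):

from pathlib import Path

import yaml

def load_task_config(path):
    # Load a task YAML; if it names a base template via `include:`,
    # load the template first and let the task's own keys override it.
    cfg = yaml.safe_load(Path(path).read_text())
    base_name = cfg.pop("include", None)
    if base_name is not None:
        merged = load_task_config(Path(path).parent / base_name)
        merged.update(cfg)  # task-specific keys win over template keys
        cfg = merged
    return cfg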
30 changes: 2 additions & 28 deletions lmms_eval/tasks/mmbench/mmbench_cn_test.yaml
@@ -1,33 +1,7 @@
-dataset_path: lmms-lab/MMBench_CN
-dataset_kwargs:
-  token: True
-task: "mmbench_cn_test"
-dataset_name: "default"
+task: mmbench_cn_test
 test_split: test
-output_type: generate_until
-doc_to_visual: !function cn_utils.mmbench_doc_to_visual
-doc_to_text: !function cn_utils.mmbench_doc_to_text
-doc_to_target: "answer"
-generation_kwargs:
-  max_new_tokens: 256
-  temperature: 0
-  top_p: 0
-  num_beams: 1
-  do_sample: false
-process_results: !function cn_utils.mmbench_process_results
 metric_list:
   - metric: submission
     aggregation: !function cn_utils.mmbench_aggregate_test_results
     higher_is_better: true
-metadata:
-  version: 0.0
-  gpt_eval_model_name: "gpt-3.5-turbo"
-  quick_extract: true
-
-model_specific_prompt_kwargs:
-  default:
-    pre_prompt: ""
-    post_prompt: "\n请直接使用所提供的选项字母作为答案回答。"
-model_specific_generation_kwargs:
-  llava:
-    image_aspect_ratio: original
+include: _default_template_mmbench_cn_yaml
8 changes: 0 additions & 8 deletions lmms_eval/tasks/mmbench/mmbench_en.yaml
@@ -5,11 +5,3 @@ task:
 metadata:
   version: 0.0
   sys_prompt: "There are several options:"
-
-model_specific_prompt_kwargs:
-  default:
-    pre_prompt: ""
-    post_prompt: "\nAnswer with the option's letter from the given choices directly."
-model_specific_generation_kwargs:
-  llava:
-    image_aspect_ratio: original
20 changes: 2 additions & 18 deletions lmms_eval/tasks/mmbench/mmbench_en_dev.yaml
@@ -1,23 +1,7 @@
-dataset_path: lmms-lab/MMBench_EN
-dataset_kwargs:
-  token: True
 task: "mmbench_en_dev"
 test_split: dev
-output_type: generate_until
-doc_to_visual: !function en_utils.mmbench_doc_to_visual
-doc_to_text: !function en_utils.mmbench_doc_to_text
-doc_to_target: "answer"
-generation_kwargs:
-  until:
-    - "ASSISTANT:"
-  max_new_tokens: 1024
-  temperature: 0
-  top_p: 0
-  num_beams: 1
-  do_sample: false
-process_results: !function en_utils.mmbench_process_results
+include: _default_template_mmbench_en_yaml
 metric_list:
   - metric: submission
     aggregation: !function en_utils.mmbench_aggregate_dev_results
-metadata:
-  version: 0.0
+    higher_is_better: true
17 changes: 1 addition & 16 deletions lmms_eval/tasks/mmbench/mmbench_en_test.yaml
@@ -1,22 +1,7 @@
-dataset_path: lmms-lab/MMBench_EN
-dataset_kwargs:
-  token: True
 task: "mmbench_en_test"
 test_split: test
-output_type: generate_until
-doc_to_visual: !function en_utils.mmbench_doc_to_visual
-doc_to_text: !function en_utils.mmbench_doc_to_text
-doc_to_target: "answer"
-generation_kwargs:
-  max_new_tokens: 256
-  temperature: 0
-  top_p: 0
-  num_beams: 1
-  do_sample: false
-process_results: !function en_utils.mmbench_process_results
+include: _default_template_mmbench_en_yaml
 metric_list:
   - metric: submission
     aggregation: !function en_utils.mmbench_aggregate_test_results
     higher_is_better: true
-metadata:
-  version: 0.0