diff --git a/lmms_eval/__main__.py b/lmms_eval/__main__.py
index d32811391..f5c4b7ba3 100644
--- a/lmms_eval/__main__.py
+++ b/lmms_eval/__main__.py
@@ -123,7 +123,7 @@ def parse_eval_args() -> argparse.Namespace:
     return args
 
 
-def cli_evaluate(args: Union[argparse.Namespace, None], wandb_run) -> None:
+def cli_evaluate(args: Union[argparse.Namespace, None] = None, wandb_run=None) -> None:
     if args is None:
         args = parse_eval_args()
 
@@ -292,10 +292,22 @@ def print_results(args, results):
     # initialize Accelerator
     accelerator = Accelerator()
+    all_args_dict = vars(args)
     if accelerator.is_main_process:
         # initialize a W&B run only on rank 0
         wandb_args_dict = utils.simple_parse_args_string(args.wandb_args)
+        if "name" not in wandb_args_dict:
+            if "config" not in all_args_dict:
+                # use the model name and task names as run name
+                task_names = args.tasks.replace(",", "_")
+                wandb_args_dict["name"] = f"{args.model}_{task_names}_{args.log_samples_suffix}"
+                if args.num_fewshot:
+                    wandb_args_dict["name"] += f"_{args.num_fewshot}shot"
+            else:
+                # use the name of the config file as run name
+                wandb_args_dict["name"] = all_args_dict["config"].split("/")[-1].split(".")[0]
+
         wandb_run = wandb.init(**wandb_args_dict)
         is_main_process = True
     else:
@@ -307,3 +319,6 @@ def print_results(args, results):
     for args in args_list:
         results = cli_evaluate(args, wandb_run)
         results_list.append(results)
+
+    if is_main_process:
+        wandb_run.finish()
diff --git a/lmms_eval/models/llava.py b/lmms_eval/models/llava.py
index 933f2fdb7..4b4bd1e66 100644
--- a/lmms_eval/models/llava.py
+++ b/lmms_eval/models/llava.py
@@ -258,7 +258,7 @@ def _collate(x):
             if "image_aspect_ratio" in gen_kwargs.keys() and "image_aspect_ratio" not in self._config.__dict__:
                 # here we should pop it out of gen_kwargs so that it doesn't get passed to the model for next step of generation
                 self._config.image_aspect_ratio = gen_kwargs.pop("image_aspect_ratio")
-
+                eval_logger.info(f"Setting image aspect ratio: {self._config.image_aspect_ratio}")
             # encode, pad, and truncate contexts for this batch
             if visuals:
                 image_tensor = process_images(visuals, self._image_processor, self._config)
@@ -289,7 +289,7 @@ def _collate(x):
             input_ids = tokenizer_image_token(prompt, self.tokenizer, IMAGE_TOKEN_INDEX, return_tensors="pt").unsqueeze(0).to(self.device)
 
             # preconfigure gen_kwargs with defaults
-            gen_kwargs["image_sizes"] = [visuals[0].size]
+            gen_kwargs["image_sizes"] = [visuals[idx].size for idx in range(len(visuals))]
             if "max_new_tokens" not in gen_kwargs:
                 gen_kwargs["max_new_tokens"] = 1024
             if "temperature" not in gen_kwargs:
@@ -318,9 +318,11 @@ def _collate(x):
                     use_cache=self.use_cache,
                 )
             except Exception as e:
-                print("Error in generating")
+                eval_logger.error(f"Error {e} in generating")
                 cont = ""
-                raise e
+                eval_logger.error(prompt)
+                eval_logger.error(visuals)
+                eval_logger.error(prompts_input)
 
             cont_toks_list = cont.tolist()
             for cont_toks, context in zip(cont_toks_list, contexts):
diff --git a/lmms_eval/tasks/coco/coco2017.yaml b/lmms_eval/tasks/coco/coco2017.yaml
new file mode 100644
index 000000000..e459b8279
--- /dev/null
+++ b/lmms_eval/tasks/coco/coco2017.yaml
@@ -0,0 +1,4 @@
+group : coco2017
+task:
+  - coco_val2017
+  - coco_test2017
\ No newline at end of file
diff --git a/lmms_eval/tasks/coco/coco_test.yaml b/lmms_eval/tasks/coco/coco_test.yaml
index 4605178c9..d99abfa48 100644
--- a/lmms_eval/tasks/coco/coco_test.yaml
+++ b/lmms_eval/tasks/coco/coco_test.yaml
@@ -6,12 +6,10 @@ group : "coco_caption"
 test_split: test
 output_type: generate_until
 doc_to_visual: !function utils.coco_doc_to_visual
-doc_to_text: !function utils.coco_doc_to_text
+doc_to_text: "Provide a one-sentence caption for the provided image."
 doc_to_target: "answer"
 generation_kwargs:
-  until:
-    - "ASSISTANT:"
-  max_new_tokens: 1024
+  max_new_tokens: 128
   temperature: 0
   top_p: 0
   num_beams: 1
diff --git a/lmms_eval/tasks/coco/coco_test2017.yaml b/lmms_eval/tasks/coco/coco_test2017.yaml
new file mode 100644
index 000000000..b5ed4d402
--- /dev/null
+++ b/lmms_eval/tasks/coco/coco_test2017.yaml
@@ -0,0 +1,24 @@
+dataset_path: lmms-lab/COCO-Caption2017
+dataset_kwargs:
+  token: True
+task : "coco_test2017"
+group : "coco_caption2017"
+test_split: test
+output_type: generate_until
+doc_to_visual: !function utils.coco_doc_to_visual
+doc_to_text: !function utils.coco_doc_to_text
+doc_to_target: "answer"
+generation_kwargs:
+  max_new_tokens: 128
+  temperature: 0
+  top_p: 0
+  num_beams: 1
+  do_sample: false
+process_results: !function utils.coco_test_process_result
+# Note that the metric name can be either a registered metric function (such as the case for GQA) or a key name returned by process_results
+metric_list:
+  - metric: coco_passthrough
+    aggregation : !function utils.coco_test_aggregation_result
+    higher_is_better : true
+metadata:
+  - version: 0.0
\ No newline at end of file
diff --git a/lmms_eval/tasks/coco/coco_val.yaml b/lmms_eval/tasks/coco/coco_val.yaml
index 264052baa..38835f58b 100644
--- a/lmms_eval/tasks/coco/coco_val.yaml
+++ b/lmms_eval/tasks/coco/coco_val.yaml
@@ -6,12 +6,10 @@ group : "coco_caption"
 test_split: val
 output_type: generate_until
 doc_to_visual: !function utils.coco_doc_to_visual
-doc_to_text: !function utils.coco_doc_to_text
+doc_to_text: "Provide a one-sentence caption for the provided image."
 doc_to_target: "answer"
 generation_kwargs:
-  until:
-    - "ASSISTANT:"
-  max_new_tokens: 1024
+  max_new_tokens: 64
   temperature: 0
   top_p: 0
   num_beams: 1
diff --git a/lmms_eval/tasks/coco/coco_val2017.yaml b/lmms_eval/tasks/coco/coco_val2017.yaml
new file mode 100644
index 000000000..e08b8b959
--- /dev/null
+++ b/lmms_eval/tasks/coco/coco_val2017.yaml
@@ -0,0 +1,45 @@
+dataset_path: lmms-lab/COCO-Caption2017
+dataset_kwargs:
+  token: True
+task: "coco_val2017"
+group : "coco_caption2017"
+test_split: val
+output_type: generate_until
+doc_to_visual: !function utils.coco_doc_to_visual
+doc_to_text: !function utils.coco_doc_to_text
+doc_to_target: "answer"
+generation_kwargs:
+  max_new_tokens: 64
+  temperature: 0
+  top_p: 0
+  num_beams: 1
+  do_sample: false
+process_results: !function utils.coco_process_result
+# Note that the metric name can be either a registered metric function (such as the case for GQA) or a key name returned by process_results
+metric_list:
+  - metric: coco_Bleu_4
+    aggregation : !function utils.coco_bleu4
+    higher_is_better : true
+  - metric: coco_Bleu_3
+    aggregation : !function utils.coco_bleu3
+    higher_is_better : true
+  - metric: coco_Bleu_2
+    aggregation : !function utils.coco_bleu2
+    higher_is_better : true
+  - metric: coco_Bleu_1
+    aggregation : !function utils.coco_bleu1
+    higher_is_better : true
+  - metric: coco_METEOR
+    aggregation : !function utils.coco_meteor
+    higher_is_better : true
+  - metric: coco_ROUGE_L
+    aggregation : !function utils.coco_rougel
+    higher_is_better : true
+  - metric: coco_CIDEr
+    aggregation : !function utils.coco_cider
+    higher_is_better : true
+  #- metric: coco_SPICE
+  #  aggregation : !function utils.coco_spice
+  #  higher_is_better : true
+metadata:
+  - version: 0.0
\ No newline at end of file
diff --git a/lmms_eval/tasks/coco/utils.py b/lmms_eval/tasks/coco/utils.py
index edc87fdc5..410760859 100644
--- a/lmms_eval/tasks/coco/utils.py
+++ b/lmms_eval/tasks/coco/utils.py
@@ -18,8 +18,7 @@ def coco_doc_to_visual(doc):
 
 
 def coco_doc_to_text(doc):
-    question = doc["question"]
-    return f"{question}\nDescribe this image briefly using a single sentence."
+    return f"Provide a one-sentence caption for the provided image."
 
 
 def coco_process_result(doc, result):
diff --git a/lmms_eval/tasks/flickr30k/flickr30k.yaml b/lmms_eval/tasks/flickr30k/flickr30k.yaml
index f2be49bc2..e7f017b56 100644
--- a/lmms_eval/tasks/flickr30k/flickr30k.yaml
+++ b/lmms_eval/tasks/flickr30k/flickr30k.yaml
@@ -8,9 +8,7 @@ doc_to_visual: !function utils.flickr_doc_to_visual
 doc_to_text: !function utils.flickr_doc_to_text
 doc_to_target: "answer"
 generation_kwargs:
-  until:
-    - "ASSISTANT:"
-  max_new_tokens: 1024
+  max_new_tokens: 64
   temperature: 0
   top_p: 0
   num_beams: 1
diff --git a/lmms_eval/tasks/flickr30k/utils.py b/lmms_eval/tasks/flickr30k/utils.py
index 98dfdb1d1..bd41e1252 100644
--- a/lmms_eval/tasks/flickr30k/utils.py
+++ b/lmms_eval/tasks/flickr30k/utils.py
@@ -18,8 +18,8 @@ def flickr_doc_to_visual(doc):
 
 
 def flickr_doc_to_text(doc):
-    question = "Please carefully observe the image and come up with a caption for the image."
-    return f"{question}\nAnswer the question with a short phrase."
+    # question = "Please carefully observe the image and come up with a caption for the image"
+    return f"Provide a one-sentence caption for the provided image."
 
 
 def flickr_process_result(doc, result):
diff --git a/lmms_eval/tasks/gqa/gqa.yaml b/lmms_eval/tasks/gqa/gqa.yaml
index 5a3987047..a1fa2fe22 100644
--- a/lmms_eval/tasks/gqa/gqa.yaml
+++ b/lmms_eval/tasks/gqa/gqa.yaml
@@ -9,8 +9,11 @@ doc_to_visual: !function utils.gqa_doc_to_visual
 doc_to_text: !function utils.gqa_doc_to_text
 doc_to_target: "answer"
 generation_kwargs:
-  until:
-    - "ASSISTANT:"
+  max_new_tokens: 16
+  temperature: 0
+  top_p: 0
+  num_beams: 1
+  do_sample: false
 metric_list:
   - metric: exact_match
     aggregation: mean
diff --git a/lmms_eval/tasks/infovqa/infovqa_test.yaml b/lmms_eval/tasks/infovqa/infovqa_test.yaml
index 4944c8dd5..2d3a01854 100644
--- a/lmms_eval/tasks/infovqa/infovqa_test.yaml
+++ b/lmms_eval/tasks/infovqa/infovqa_test.yaml
@@ -3,7 +3,7 @@ dataset_name: InfographicVQA
 dataset_kwargs:
   token: True
 task: "infovqa_test"
-test_split: validation
+test_split: test
 output_type: generate_until
 doc_to_visual: !function utils.infovqa_doc_to_visual
 doc_to_text: !function utils.infovqa_doc_to_text
diff --git a/lmms_eval/tasks/mmbench_cn/mmbench_cc.yaml b/lmms_eval/tasks/mmbench_cn/mmbench_cc.yaml
index 36fec0c28..0ec13674f 100644
--- a/lmms_eval/tasks/mmbench_cn/mmbench_cc.yaml
+++ b/lmms_eval/tasks/mmbench_cn/mmbench_cc.yaml
@@ -10,9 +10,7 @@ doc_to_visual: !function cc_utils.mmbench_doc_to_visual
 doc_to_text: !function cc_utils.mmbench_cn_cc_doc_to_text
 doc_to_target: "answer"
 generation_kwargs:
-  until:
-    - "ASSISTANT:"
-  max_new_tokens: 1024
+  max_new_tokens: 256
   temperature: 0
   top_p: 0
   num_beams: 1
diff --git a/lmms_eval/tasks/mmbench_cn/mmbench_cn_dev.yaml b/lmms_eval/tasks/mmbench_cn/mmbench_cn_dev.yaml
index 339fa8db8..5543eb759 100644
--- a/lmms_eval/tasks/mmbench_cn/mmbench_cn_dev.yaml
+++ b/lmms_eval/tasks/mmbench_cn/mmbench_cn_dev.yaml
@@ -10,9 +10,7 @@ doc_to_visual: !function utils.mmbench_doc_to_visual
 doc_to_text: !function utils.mmbench_doc_to_text
 doc_to_target: "answer"
 generation_kwargs:
-  until:
-    - "ASSISTANT:"
-  max_new_tokens: 1024
+  max_new_tokens: 256
   temperature: 0
   top_p: 0
   num_beams: 1
diff --git a/lmms_eval/tasks/mmbench_cn/mmbench_cn_test.yaml b/lmms_eval/tasks/mmbench_cn/mmbench_cn_test.yaml
index d6e31a55c..9e21ec916 100644
--- a/lmms_eval/tasks/mmbench_cn/mmbench_cn_test.yaml
+++ b/lmms_eval/tasks/mmbench_cn/mmbench_cn_test.yaml
@@ -10,9 +10,7 @@ doc_to_visual: !function utils.mmbench_doc_to_visual
 doc_to_text: !function utils.mmbench_doc_to_text
 doc_to_target: "answer"
 generation_kwargs:
-  until:
-    - "ASSISTANT:"
-  max_new_tokens: 1024
+  max_new_tokens: 256
   temperature: 0
   top_p: 0
   num_beams: 1
diff --git a/lmms_eval/tasks/mmbench_en/mmbench_en_test.yaml b/lmms_eval/tasks/mmbench_en/mmbench_en_test.yaml
index dd386ea03..cab882213 100644
--- a/lmms_eval/tasks/mmbench_en/mmbench_en_test.yaml
+++ b/lmms_eval/tasks/mmbench_en/mmbench_en_test.yaml
@@ -9,9 +9,7 @@ doc_to_visual: !function utils.mmbench_doc_to_visual
 doc_to_text: !function utils.mmbench_doc_to_text
 doc_to_target: "answer"
 generation_kwargs:
-  until:
-    - "ASSISTANT:"
-  max_new_tokens: 1024
+  max_new_tokens: 256
   temperature: 0
   top_p: 0
   num_beams: 1
diff --git a/lmms_eval/tasks/mme/mme.yaml b/lmms_eval/tasks/mme/mme.yaml
index 7e649da3e..d320e466b 100644
--- a/lmms_eval/tasks/mme/mme.yaml
+++ b/lmms_eval/tasks/mme/mme.yaml
@@ -8,8 +8,11 @@ doc_to_visual: !function utils.mme_doc_to_visual
 doc_to_text: !function utils.mme_doc_to_text
 doc_to_target: "answer"
 generation_kwargs:
-  until:
-    - "ASSISTANT:"
+  max_new_tokens: 16
+  temperature: 0
+  top_p: 0
+  num_beams: 1
+  do_sample: false
 # The return value of process_results will be used by metrics
 process_results: !function utils.mme_process_results
 # Note that the metric name can be either a registed metric function (such as the case for GQA) or a key name returned by process_results
@@ -20,5 +23,9 @@ metric_list:
   - metric: mme_cognition_score
     aggregation: !function utils.mme_aggregate_results
     higher_is_better: true
+model_specific_prompt_kwargs:
+  default:
+    pre_prompt: ""
+    post_prompt: "\nAnswer the question using a single word or phrase."
 metadata:
-  - version: 0.0
+  - version: 0.0
\ No newline at end of file
diff --git a/lmms_eval/tasks/mme/utils.py b/lmms_eval/tasks/mme/utils.py
index ee480efd9..5a6c513d3 100644
--- a/lmms_eval/tasks/mme/utils.py
+++ b/lmms_eval/tasks/mme/utils.py
@@ -22,18 +22,22 @@
 }
 
 
-replace_prompt = "Please answer yes or no."
+replace_prompt = " Please answer yes or no."
 
 
 def mme_doc_to_visual(doc):
     return [doc["image"].convert("RGB")]
 
 
-def mme_doc_to_text(doc):
-    question = doc["question"]
-    # TODO: This is a hack. We should fix this in the dataset.
-    question = question.replace(replace_prompt, "").strip()
-    return f"{question}\nAnswer the question using a single word or phrase."
+def mme_doc_to_text(doc, model_specific_prompt_kwargs=None):
+    question = doc["question"].strip()
+    if "pre_prompt" in model_specific_prompt_kwargs and model_specific_prompt_kwargs["pre_prompt"] != "":
+        question = question.replace(replace_prompt, "")
+        question = f"{model_specific_prompt_kwargs['pre_prompt']}{question}"
+    if "post_prompt" in model_specific_prompt_kwargs and model_specific_prompt_kwargs["post_prompt"] != "":
+        question = question.replace(replace_prompt, "")
+        question = f"{question}{model_specific_prompt_kwargs['post_prompt']}"
+    return question
 
 
 def parse_pred_ans(pred_ans):
diff --git a/lmms_eval/tasks/nocaps/nocaps_test.yaml b/lmms_eval/tasks/nocaps/nocaps_test.yaml
index 64dde69aa..88394ea18 100644
--- a/lmms_eval/tasks/nocaps/nocaps_test.yaml
+++ b/lmms_eval/tasks/nocaps/nocaps_test.yaml
@@ -9,9 +9,7 @@ doc_to_visual: !function utils.nocaps_doc_to_visual
 doc_to_text: !function utils.nocaps_doc_to_text
 doc_to_target: "annotations_captions"
 generation_kwargs:
-  until:
-    - "ASSISTANT:"
-  max_new_tokens: 1024
+  max_new_tokens: 64
   temperature: 0
   top_p: 0
   num_beams: 1
diff --git a/lmms_eval/tasks/nocaps/nocaps_val.yaml b/lmms_eval/tasks/nocaps/nocaps_val.yaml
index e02fe2c07..de946a907 100644
--- a/lmms_eval/tasks/nocaps/nocaps_val.yaml
+++ b/lmms_eval/tasks/nocaps/nocaps_val.yaml
@@ -9,9 +9,7 @@ doc_to_visual: !function utils.nocaps_doc_to_visual
 doc_to_text: !function utils.nocaps_doc_to_text
 doc_to_target: "annotations_captions"
 generation_kwargs:
-  until:
-    - "ASSISTANT:"
-  max_new_tokens: 1024
+  max_new_tokens: 64
   temperature: 0
   top_p: 0
   num_beams: 1
diff --git a/lmms_eval/tasks/okvqa/okvqa.yaml b/lmms_eval/tasks/okvqa/okvqa.yaml
index e18a50998..03c63c500 100644
--- a/lmms_eval/tasks/okvqa/okvqa.yaml
+++ b/lmms_eval/tasks/okvqa/okvqa.yaml
@@ -6,8 +6,11 @@ doc_to_visual: !function utils.okvqa_doc_to_visual
 doc_to_text: !function utils.okvqa_doc_to_text
 doc_to_target: "answer"
 generation_kwargs:
-  until:
-    - "ASSISTANT:"
+  max_new_tokens: 16
+  temperature: 0
+  top_p: 0
+  num_beams: 1
+  do_sample: false
 metric_list:
   - metric: exact_match
     aggregation: mean
diff --git a/lmms_eval/tasks/okvqa/utils.py b/lmms_eval/tasks/okvqa/utils.py
index af0781b6d..b3238ac2f 100644
--- a/lmms_eval/tasks/okvqa/utils.py
+++ b/lmms_eval/tasks/okvqa/utils.py
@@ -262,7 +262,7 @@ def okvqa_process_results(doc, result):
 
 
 def okvqa_doc_to_text(doc):
-    text = f"{doc['question'].capitalize()}\n Answer the question using a single word or phrase."
+    text = f"{doc['question'].capitalize()}\nAnswer the question using a single word."
     return text
 
 
diff --git a/lmms_eval/tasks/textcaps/textcaps_test.yaml b/lmms_eval/tasks/textcaps/textcaps_test.yaml
index 377eb4421..5dc5817f1 100644
--- a/lmms_eval/tasks/textcaps/textcaps_test.yaml
+++ b/lmms_eval/tasks/textcaps/textcaps_test.yaml
@@ -9,9 +9,7 @@ doc_to_visual: !function utils.textcaps_doc_to_visual
 doc_to_text: !function utils.textcaps_doc_to_text
 doc_to_target: "answer"
 generation_kwargs:
-  until:
-    - "ASSISTANT:"
-  max_new_tokens: 1024
+  max_new_tokens: 64
   temperature: 0
   top_p: 0
   num_beams: 1
diff --git a/lmms_eval/tasks/textcaps/textcaps_val.yaml b/lmms_eval/tasks/textcaps/textcaps_val.yaml
index 9daf613f4..586806435 100644
--- a/lmms_eval/tasks/textcaps/textcaps_val.yaml
+++ b/lmms_eval/tasks/textcaps/textcaps_val.yaml
@@ -9,9 +9,7 @@ doc_to_visual: !function utils.textcaps_doc_to_visual
 doc_to_text: !function utils.textcaps_doc_to_text
 doc_to_target: "answer"
 generation_kwargs:
-  until:
-    - "ASSISTANT:"
-  max_new_tokens: 1024
+  max_new_tokens: 64
   temperature: 0
   top_p: 0
   num_beams: 1
diff --git a/lmms_eval/tasks/vqav2_test/vqav2_test.yaml b/lmms_eval/tasks/vqav2_test/vqav2_test.yaml
index 23414ae9e..f4e14311c 100644
--- a/lmms_eval/tasks/vqav2_test/vqav2_test.yaml
+++ b/lmms_eval/tasks/vqav2_test/vqav2_test.yaml
@@ -8,8 +8,7 @@ doc_to_visual: !function utils.vqav2_doc_to_visual
 doc_to_text: !function utils.vqav2_doc_to_text
 doc_to_target: "answer"
 generation_kwargs:
-  until:
-    - "ASSISTANT:"
+  max_new_tokens: 16
 metric_list:
   - metric: submission
     aggregation: !function utils.vqav2_aggreate_submissions
diff --git a/lmms_eval/tasks/vqav2_val/vqav2_val.yaml b/lmms_eval/tasks/vqav2_val/vqav2_val.yaml
index 6e8ba0c32..42f72c262 100644
--- a/lmms_eval/tasks/vqav2_val/vqav2_val.yaml
+++ b/lmms_eval/tasks/vqav2_val/vqav2_val.yaml
@@ -8,8 +8,7 @@ doc_to_visual: !function utils.vqav2_doc_to_visual
 doc_to_text: !function utils.vqav2_doc_to_text
 doc_to_target: "answer"
 generation_kwargs:
-  until:
-    - "ASSISTANT:"
+  max_new_tokens: 16
 metric_list:
   - metric: exact_match
     aggregation: mean
diff --git a/lmms_eval/utils.py b/lmms_eval/utils.py
index 33511ee33..94a61b011 100644
--- a/lmms_eval/utils.py
+++ b/lmms_eval/utils.py
@@ -52,7 +52,7 @@ def format(self, record):
 # ch.setLevel(logging.INFO)
 
 # Create a formatter and set it to the handler, ONLY MAKING IT SHOW THE LAST 3 FOLDERS of a path
-formatter = PathFormatter("%(asctime)s,%(msecs)03d %(levelname)-8s [%(pathname)s:%(lineno)d] %(message)s", "%Y-%m-%d:%H:%M:%S")
+formatter = PathFormatter("%(asctime)s [%(pathname)s:%(lineno)d] %(message)s", "%m-%d:%H:%M:%S")
 ch.setFormatter(formatter)
 eval_logger.addHandler(ch)