diff --git a/lmms_eval/__main__.py b/lmms_eval/__main__.py
index d32811391..f5c4b7ba3 100644
--- a/lmms_eval/__main__.py
+++ b/lmms_eval/__main__.py
@@ -123,7 +123,7 @@ def parse_eval_args() -> argparse.Namespace:
     return args
 
 
-def cli_evaluate(args: Union[argparse.Namespace, None], wandb_run) -> None:
+def cli_evaluate(args: Union[argparse.Namespace, None] = None, wandb_run=None) -> None:
     if args is None:
         args = parse_eval_args()
 
@@ -292,10 +292,22 @@ def print_results(args, results):
     # initialize Accelerator
     accelerator = Accelerator()
+    all_args_dict = vars(args)
     if accelerator.is_main_process:
         # initialize a W&B run only on rank 0
         wandb_args_dict = utils.simple_parse_args_string(args.wandb_args)
+        if "name" not in wandb_args_dict:
+            if "config" not in all_args_dict:
+                # use the model name and task names as run name
+                task_names = args.tasks.replace(",", "_")
+                wandb_args_dict["name"] = f"{args.model}_{task_names}_{args.log_samples_suffix}"
+                if args.num_fewshot:
+                    wandb_args_dict["name"] += f"_{args.num_fewshot}shot"
+            else:
+                # use the name of the config file as run name
+                wandb_args_dict["name"] = all_args_dict["config"].split("/")[-1].split(".")[0]
+
         wandb_run = wandb.init(**wandb_args_dict)
         is_main_process = True
     else:
@@ -307,3 +319,6 @@ def print_results(args, results):
     for args in args_list:
         results = cli_evaluate(args, wandb_run)
         results_list.append(results)
+
+    if is_main_process:
+        wandb_run.finish()
diff --git a/lmms_eval/models/llava.py b/lmms_eval/models/llava.py
index 933f2fdb7..4b4bd1e66 100644
--- a/lmms_eval/models/llava.py
+++ b/lmms_eval/models/llava.py
@@ -258,7 +258,7 @@ def _collate(x):
             if "image_aspect_ratio" in gen_kwargs.keys() and "image_aspect_ratio" not in self._config.__dict__:
                 # here we should pop it out of gen_kwargs so that it doesn't get passed to the model for next step of generation
                 self._config.image_aspect_ratio = gen_kwargs.pop("image_aspect_ratio")
-
+                eval_logger.info(f"Setting image aspect ratio: {self._config.image_aspect_ratio}")
             # encode, pad, and truncate contexts for this batch
             if visuals:
                 image_tensor = process_images(visuals, self._image_processor, self._config)
@@ -289,7 +289,7 @@ def _collate(x):
             input_ids = tokenizer_image_token(prompt, self.tokenizer, IMAGE_TOKEN_INDEX, return_tensors="pt").unsqueeze(0).to(self.device)
 
             # preconfigure gen_kwargs with defaults
-            gen_kwargs["image_sizes"] = [visuals[0].size]
+            gen_kwargs["image_sizes"] = [visuals[idx].size for idx in range(len(visuals))]
             if "max_new_tokens" not in gen_kwargs:
                 gen_kwargs["max_new_tokens"] = 1024
             if "temperature" not in gen_kwargs:
@@ -318,9 +318,11 @@ def _collate(x):
                     use_cache=self.use_cache,
                 )
             except Exception as e:
-                print("Error in generating")
+                eval_logger.error(f"Error {e} in generating")
                 cont = ""
-                raise e
+                eval_logger.error(prompt)
+                eval_logger.error(visuals)
+                eval_logger.error(prompts_input)
 
             cont_toks_list = cont.tolist()
             for cont_toks, context in zip(cont_toks_list, contexts):
diff --git a/lmms_eval/tasks/coco/coco2017.yaml b/lmms_eval/tasks/coco/coco2017.yaml
new file mode 100644
index 000000000..e459b8279
--- /dev/null
+++ b/lmms_eval/tasks/coco/coco2017.yaml
@@ -0,0 +1,4 @@
+group : coco2017
+task:
+  - coco_val2017
+  - coco_test2017
\ No newline at end of file
diff --git a/lmms_eval/tasks/coco/coco_test.yaml b/lmms_eval/tasks/coco/coco_test.yaml
index 4605178c9..d99abfa48 100644
--- a/lmms_eval/tasks/coco/coco_test.yaml
+++ b/lmms_eval/tasks/coco/coco_test.yaml
@@ -6,12 +6,10 @@ group : "coco_caption"
 test_split: test
 output_type: generate_until
 doc_to_visual: !function utils.coco_doc_to_visual
-doc_to_text: !function utils.coco_doc_to_text
+doc_to_text: "Provide a one-sentence caption for the provided image."
 doc_to_target: "answer"
 generation_kwargs:
-  until:
-    - "ASSISTANT:"
-  max_new_tokens: 1024
+  max_new_tokens: 128
   temperature: 0
   top_p: 0
   num_beams: 1
diff --git a/lmms_eval/tasks/coco/coco_test2017.yaml b/lmms_eval/tasks/coco/coco_test2017.yaml
new file mode 100644
index 000000000..b5ed4d402
--- /dev/null
+++ b/lmms_eval/tasks/coco/coco_test2017.yaml
@@ -0,0 +1,24 @@
+dataset_path: lmms-lab/COCO-Caption2017
+dataset_kwargs:
+  token: True
+task : "coco_test2017"
+group : "coco_caption2017"
+test_split: test
+output_type: generate_until
+doc_to_visual: !function utils.coco_doc_to_visual
+doc_to_text: !function utils.coco_doc_to_text
+doc_to_target: "answer"
+generation_kwargs:
+  max_new_tokens: 128
+  temperature: 0
+  top_p: 0
+  num_beams: 1
+  do_sample: false
+process_results: !function utils.coco_test_process_result
+# Note that the metric name can be either a registered metric function (such as the case for GQA) or a key name returned by process_results
+metric_list:
+  - metric: coco_passthrough
+    aggregation : !function utils.coco_test_aggregation_result
+    higher_is_better : true
+metadata:
+  - version: 0.0
\ No newline at end of file
diff --git a/lmms_eval/tasks/coco/coco_val.yaml b/lmms_eval/tasks/coco/coco_val.yaml
index 264052baa..38835f58b 100644
--- a/lmms_eval/tasks/coco/coco_val.yaml
+++ b/lmms_eval/tasks/coco/coco_val.yaml
@@ -6,12 +6,10 @@ group : "coco_caption"
 test_split: val
 output_type: generate_until
 doc_to_visual: !function utils.coco_doc_to_visual
-doc_to_text: !function utils.coco_doc_to_text
+doc_to_text: "Provide a one-sentence caption for the provided image."
 doc_to_target: "answer"
 generation_kwargs:
-  until:
-    - "ASSISTANT:"
-  max_new_tokens: 1024
+  max_new_tokens: 64
   temperature: 0
   top_p: 0
   num_beams: 1
diff --git a/lmms_eval/tasks/coco/coco_val2017.yaml b/lmms_eval/tasks/coco/coco_val2017.yaml
new file mode 100644
index 000000000..e08b8b959
--- /dev/null
+++ b/lmms_eval/tasks/coco/coco_val2017.yaml
@@ -0,0 +1,45 @@
+dataset_path: lmms-lab/COCO-Caption2017
+dataset_kwargs:
+  token: True
+task: "coco_val2017"
+group : "coco_caption2017"
+test_split: val
+output_type: generate_until
+doc_to_visual: !function utils.coco_doc_to_visual
+doc_to_text: !function utils.coco_doc_to_text
+doc_to_target: "answer"
+generation_kwargs:
+  max_new_tokens: 64
+  temperature: 0
+  top_p: 0
+  num_beams: 1
+  do_sample: false
+process_results: !function utils.coco_process_result
+# Note that the metric name can be either a registered metric function (such as the case for GQA) or a key name returned by process_results
+metric_list:
+  - metric: coco_Bleu_4
+    aggregation : !function utils.coco_bleu4
+    higher_is_better : true
+  - metric: coco_Bleu_3
+    aggregation : !function utils.coco_bleu3
+    higher_is_better : true
+  - metric: coco_Bleu_2
+    aggregation : !function utils.coco_bleu2
+    higher_is_better : true
+  - metric: coco_Bleu_1
+    aggregation : !function utils.coco_bleu1
+    higher_is_better : true
+  - metric: coco_METEOR
+    aggregation : !function utils.coco_meteor
+    higher_is_better : true
+  - metric: coco_ROUGE_L
+    aggregation : !function utils.coco_rougel
+    higher_is_better : true
+  - metric: coco_CIDEr
+    aggregation : !function utils.coco_cider
+    higher_is_better : true
+  #- metric: coco_SPICE
+  #  aggregation : !function utils.coco_spice
+  #  higher_is_better : true
+metadata:
+  - version: 0.0
\ No newline at end of file
diff --git a/lmms_eval/tasks/coco/utils.py b/lmms_eval/tasks/coco/utils.py
index edc87fdc5..410760859 100644
--- a/lmms_eval/tasks/coco/utils.py
+++ b/lmms_eval/tasks/coco/utils.py
@@ -18,8 +18,7 @@ def coco_doc_to_visual(doc):
 
 
 def coco_doc_to_text(doc):
-    question = doc["question"]
-    return f"{question}\nDescribe this image briefly using a single sentence."
+    return f"Provide a one-sentence caption for the provided image."
 
 
 def coco_process_result(doc, result):
diff --git a/lmms_eval/tasks/flickr30k/flickr30k.yaml b/lmms_eval/tasks/flickr30k/flickr30k.yaml
index f2be49bc2..e7f017b56 100644
--- a/lmms_eval/tasks/flickr30k/flickr30k.yaml
+++ b/lmms_eval/tasks/flickr30k/flickr30k.yaml
@@ -8,9 +8,7 @@ doc_to_visual: !function utils.flickr_doc_to_visual
 doc_to_text: !function utils.flickr_doc_to_text
 doc_to_target: "answer"
 generation_kwargs:
-  until:
-    - "ASSISTANT:"
-  max_new_tokens: 1024
+  max_new_tokens: 64
   temperature: 0
   top_p: 0
   num_beams: 1
diff --git a/lmms_eval/tasks/flickr30k/utils.py b/lmms_eval/tasks/flickr30k/utils.py
index 98dfdb1d1..bd41e1252 100644
--- a/lmms_eval/tasks/flickr30k/utils.py
+++ b/lmms_eval/tasks/flickr30k/utils.py
@@ -18,8 +18,8 @@ def flickr_doc_to_visual(doc):
 
 
 def flickr_doc_to_text(doc):
-    question = "Please carefully observe the image and come up with a caption for the image."
-    return f"{question}\nAnswer the question with a short phrase."
+    # question = "Please carefully observe the image and come up with a caption for the image"
+    return f"Provide a one-sentence caption for the provided image."
 
 
 def flickr_process_result(doc, result):
diff --git a/lmms_eval/tasks/gqa/gqa.yaml b/lmms_eval/tasks/gqa/gqa.yaml
index 5a3987047..a1fa2fe22 100644
--- a/lmms_eval/tasks/gqa/gqa.yaml
+++ b/lmms_eval/tasks/gqa/gqa.yaml
@@ -9,8 +9,11 @@ doc_to_visual: !function utils.gqa_doc_to_visual
 doc_to_text: !function utils.gqa_doc_to_text
 doc_to_target: "answer"
 generation_kwargs:
-  until:
-    - "ASSISTANT:"
+  max_new_tokens: 16
+  temperature: 0
+  top_p: 0
+  num_beams: 1
+  do_sample: false
 metric_list:
   - metric: exact_match
     aggregation: mean
diff --git a/lmms_eval/tasks/infovqa/infovqa_test.yaml b/lmms_eval/tasks/infovqa/infovqa_test.yaml
index 4944c8dd5..2d3a01854 100644
--- a/lmms_eval/tasks/infovqa/infovqa_test.yaml
+++ b/lmms_eval/tasks/infovqa/infovqa_test.yaml
@@ -3,7 +3,7 @@ dataset_name: InfographicVQA
 dataset_kwargs:
   token: True
 task: "infovqa_test"
-test_split: validation
+test_split: test
 output_type: generate_until
 doc_to_visual: !function utils.infovqa_doc_to_visual
 doc_to_text: !function utils.infovqa_doc_to_text
diff --git a/lmms_eval/tasks/mmbench_cn/mmbench_cc.yaml b/lmms_eval/tasks/mmbench_cn/mmbench_cc.yaml
index 36fec0c28..0ec13674f 100644
--- a/lmms_eval/tasks/mmbench_cn/mmbench_cc.yaml
+++ b/lmms_eval/tasks/mmbench_cn/mmbench_cc.yaml
@@ -10,9 +10,7 @@ doc_to_visual: !function cc_utils.mmbench_doc_to_visual
 doc_to_text: !function cc_utils.mmbench_cn_cc_doc_to_text
 doc_to_target: "answer"
 generation_kwargs:
-  until:
-    - "ASSISTANT:"
-  max_new_tokens: 1024
+  max_new_tokens: 256
   temperature: 0
   top_p: 0
   num_beams: 1
diff --git a/lmms_eval/tasks/mmbench_cn/mmbench_cn_dev.yaml b/lmms_eval/tasks/mmbench_cn/mmbench_cn_dev.yaml
index 339fa8db8..5543eb759 100644
--- a/lmms_eval/tasks/mmbench_cn/mmbench_cn_dev.yaml
+++ b/lmms_eval/tasks/mmbench_cn/mmbench_cn_dev.yaml
@@ -10,9 +10,7 @@ doc_to_visual: !function utils.mmbench_doc_to_visual
 doc_to_text: !function utils.mmbench_doc_to_text
 doc_to_target: "answer"
 generation_kwargs:
-  until:
-    - "ASSISTANT:"
-  max_new_tokens: 1024
+  max_new_tokens: 256
   temperature: 0
   top_p: 0
   num_beams: 1
diff --git a/lmms_eval/tasks/mmbench_cn/mmbench_cn_test.yaml b/lmms_eval/tasks/mmbench_cn/mmbench_cn_test.yaml
index d6e31a55c..9e21ec916 100644
--- a/lmms_eval/tasks/mmbench_cn/mmbench_cn_test.yaml
+++ b/lmms_eval/tasks/mmbench_cn/mmbench_cn_test.yaml
@@ -10,9 +10,7 @@ doc_to_visual: !function utils.mmbench_doc_to_visual
 doc_to_text: !function utils.mmbench_doc_to_text
 doc_to_target: "answer"
 generation_kwargs:
-  until:
-    - "ASSISTANT:"
-  max_new_tokens: 1024
+  max_new_tokens: 256
   temperature: 0
   top_p: 0
   num_beams: 1
diff --git a/lmms_eval/tasks/mmbench_en/mmbench_en_test.yaml b/lmms_eval/tasks/mmbench_en/mmbench_en_test.yaml
index dd386ea03..cab882213 100644
--- a/lmms_eval/tasks/mmbench_en/mmbench_en_test.yaml
+++ b/lmms_eval/tasks/mmbench_en/mmbench_en_test.yaml
@@ -9,9 +9,7 @@ doc_to_visual: !function utils.mmbench_doc_to_visual
 doc_to_text: !function utils.mmbench_doc_to_text
 doc_to_target: "answer"
 generation_kwargs:
-  until:
-    - "ASSISTANT:"
-  max_new_tokens: 1024
+  max_new_tokens: 256
   temperature: 0
   top_p: 0
   num_beams: 1
diff --git a/lmms_eval/tasks/mme/mme.yaml b/lmms_eval/tasks/mme/mme.yaml
index 7e649da3e..d320e466b 100644
--- a/lmms_eval/tasks/mme/mme.yaml
+++ b/lmms_eval/tasks/mme/mme.yaml
@@ -8,8 +8,11 @@ doc_to_visual: !function utils.mme_doc_to_visual
 doc_to_text: !function utils.mme_doc_to_text
 doc_to_target: "answer"
 generation_kwargs:
-  until:
-    - "ASSISTANT:"
+  max_new_tokens: 16
+  temperature: 0
+  top_p: 0
+  num_beams: 1
+  do_sample: false
 # The return value of process_results will be used by metrics
 process_results: !function utils.mme_process_results
 # Note that the metric name can be either a registed metric function (such as the case for GQA) or a key name returned by process_results
@@ -20,5 +23,9 @@ metric_list:
   - metric: mme_cognition_score
     aggregation: !function utils.mme_aggregate_results
     higher_is_better: true
+model_specific_prompt_kwargs:
+  default:
+    pre_prompt: ""
+    post_prompt: "\nAnswer the question using a single word or phrase."
 metadata:
-  - version: 0.0
+  - version: 0.0
\ No newline at end of file
diff --git a/lmms_eval/tasks/mme/utils.py b/lmms_eval/tasks/mme/utils.py
index ee480efd9..5a6c513d3 100644
--- a/lmms_eval/tasks/mme/utils.py
+++ b/lmms_eval/tasks/mme/utils.py
@@ -22,18 +22,22 @@
 }
 
 
-replace_prompt = "Please answer yes or no."
+replace_prompt = " Please answer yes or no."
 
 
 def mme_doc_to_visual(doc):
     return [doc["image"].convert("RGB")]
 
 
-def mme_doc_to_text(doc):
-    question = doc["question"]
-    # TODO: This is a hack. We should fix this in the dataset.
-    question = question.replace(replace_prompt, "").strip()
-    return f"{question}\nAnswer the question using a single word or phrase."
+def mme_doc_to_text(doc, model_specific_prompt_kwargs=None):
+    question = doc["question"].strip()
+    if "pre_prompt" in model_specific_prompt_kwargs and model_specific_prompt_kwargs["pre_prompt"] != "":
+        question = question.replace(replace_prompt, "")
+        question = f"{model_specific_prompt_kwargs['pre_prompt']}{question}"
+    if "post_prompt" in model_specific_prompt_kwargs and model_specific_prompt_kwargs["post_prompt"] != "":
+        question = question.replace(replace_prompt, "")
+        question = f"{question}{model_specific_prompt_kwargs['post_prompt']}"
+    return question
 
 
 def parse_pred_ans(pred_ans):
diff --git a/lmms_eval/tasks/nocaps/nocaps_test.yaml b/lmms_eval/tasks/nocaps/nocaps_test.yaml
index 64dde69aa..88394ea18 100644
--- a/lmms_eval/tasks/nocaps/nocaps_test.yaml
+++ b/lmms_eval/tasks/nocaps/nocaps_test.yaml
@@ -9,9 +9,7 @@ doc_to_visual: !function utils.nocaps_doc_to_visual
 doc_to_text: !function utils.nocaps_doc_to_text
 doc_to_target: "annotations_captions"
 generation_kwargs:
-  until:
-    - "ASSISTANT:"
-  max_new_tokens: 1024
+  max_new_tokens: 64
   temperature: 0
   top_p: 0
   num_beams: 1
diff --git a/lmms_eval/tasks/nocaps/nocaps_val.yaml b/lmms_eval/tasks/nocaps/nocaps_val.yaml
index e02fe2c07..de946a907 100644
--- a/lmms_eval/tasks/nocaps/nocaps_val.yaml
+++ b/lmms_eval/tasks/nocaps/nocaps_val.yaml
@@ -9,9 +9,7 @@ doc_to_visual: !function utils.nocaps_doc_to_visual
 doc_to_text: !function utils.nocaps_doc_to_text
 doc_to_target: "annotations_captions"
 generation_kwargs:
-  until:
-    - "ASSISTANT:"
-  max_new_tokens: 1024
+  max_new_tokens: 64
   temperature: 0
   top_p: 0
   num_beams: 1
diff --git a/lmms_eval/tasks/okvqa/okvqa.yaml b/lmms_eval/tasks/okvqa/okvqa.yaml
index e18a50998..03c63c500 100644
--- a/lmms_eval/tasks/okvqa/okvqa.yaml
+++ b/lmms_eval/tasks/okvqa/okvqa.yaml
@@ -6,8 +6,11 @@ doc_to_visual: !function utils.okvqa_doc_to_visual
 doc_to_text: !function utils.okvqa_doc_to_text
 doc_to_target: "answer"
 generation_kwargs:
-  until:
-    - "ASSISTANT:"
+  max_new_tokens: 16
+  temperature: 0
+  top_p: 0
+  num_beams: 1
+  do_sample: false
 metric_list:
   - metric: exact_match
     aggregation: mean
diff --git a/lmms_eval/tasks/okvqa/utils.py b/lmms_eval/tasks/okvqa/utils.py
index af0781b6d..b3238ac2f 100644
--- a/lmms_eval/tasks/okvqa/utils.py
+++ b/lmms_eval/tasks/okvqa/utils.py
@@ -262,7 +262,7 @@ def okvqa_process_results(doc, result):
 
 
 def okvqa_doc_to_text(doc):
-    text = f"{doc['question'].capitalize()}\n Answer the question using a single word or phrase."
+    text = f"{doc['question'].capitalize()}\nAnswer the question using a single word."
     return text
 
 
diff --git a/lmms_eval/tasks/textcaps/textcaps_test.yaml b/lmms_eval/tasks/textcaps/textcaps_test.yaml
index 377eb4421..5dc5817f1 100644
--- a/lmms_eval/tasks/textcaps/textcaps_test.yaml
+++ b/lmms_eval/tasks/textcaps/textcaps_test.yaml
@@ -9,9 +9,7 @@ doc_to_visual: !function utils.textcaps_doc_to_visual
 doc_to_text: !function utils.textcaps_doc_to_text
 doc_to_target: "answer"
 generation_kwargs:
-  until:
-    - "ASSISTANT:"
-  max_new_tokens: 1024
+  max_new_tokens: 64
   temperature: 0
   top_p: 0
   num_beams: 1
diff --git a/lmms_eval/tasks/textcaps/textcaps_val.yaml b/lmms_eval/tasks/textcaps/textcaps_val.yaml
index 9daf613f4..586806435 100644
--- a/lmms_eval/tasks/textcaps/textcaps_val.yaml
+++ b/lmms_eval/tasks/textcaps/textcaps_val.yaml
@@ -9,9 +9,7 @@ doc_to_visual: !function utils.textcaps_doc_to_visual
 doc_to_text: !function utils.textcaps_doc_to_text
 doc_to_target: "answer"
 generation_kwargs:
-  until:
-    - "ASSISTANT:"
-  max_new_tokens: 1024
+  max_new_tokens: 64
   temperature: 0
   top_p: 0
   num_beams: 1
diff --git a/lmms_eval/tasks/vqav2_test/vqav2_test.yaml b/lmms_eval/tasks/vqav2_test/vqav2_test.yaml
index 23414ae9e..f4e14311c 100644
--- a/lmms_eval/tasks/vqav2_test/vqav2_test.yaml
+++ b/lmms_eval/tasks/vqav2_test/vqav2_test.yaml
@@ -8,8 +8,7 @@ doc_to_visual: !function utils.vqav2_doc_to_visual
 doc_to_text: !function utils.vqav2_doc_to_text
 doc_to_target: "answer"
 generation_kwargs:
-  until:
-    - "ASSISTANT:"
+  max_new_tokens: 16
 metric_list:
   - metric: submission
     aggregation: !function utils.vqav2_aggreate_submissions
diff --git a/lmms_eval/tasks/vqav2_val/vqav2_val.yaml b/lmms_eval/tasks/vqav2_val/vqav2_val.yaml
index 6e8ba0c32..42f72c262 100644
--- a/lmms_eval/tasks/vqav2_val/vqav2_val.yaml
+++ b/lmms_eval/tasks/vqav2_val/vqav2_val.yaml
@@ -8,8 +8,7 @@ doc_to_visual: !function utils.vqav2_doc_to_visual
 doc_to_text: !function utils.vqav2_doc_to_text
 doc_to_target: "answer"
 generation_kwargs:
-  until:
-    - "ASSISTANT:"
+  max_new_tokens: 16
 metric_list:
   - metric: exact_match
     aggregation: mean
diff --git a/lmms_eval/utils.py b/lmms_eval/utils.py
index 33511ee33..94a61b011 100644
--- a/lmms_eval/utils.py
+++ b/lmms_eval/utils.py
@@ -52,7 +52,7 @@ def format(self, record):
 # ch.setLevel(logging.INFO)
 
 # Create a formatter and set it to the handler, ONLY MAKING IT SHOW THE LAST 3 FOLDERS of a path
-formatter = PathFormatter("%(asctime)s,%(msecs)03d %(levelname)-8s [%(pathname)s:%(lineno)d] %(message)s", "%Y-%m-%d:%H:%M:%S")
+formatter = PathFormatter("%(asctime)s [%(pathname)s:%(lineno)d] %(message)s", "%m-%d:%H:%M:%S")
 ch.setFormatter(formatter)
 eval_logger.addHandler(ch)