From 46c37d006f857b5d7a01e38b37a88658a6a629e2 Mon Sep 17 00:00:00 2001 From: Pu Fanyi Date: Sat, 27 Jan 2024 20:22:21 +0800 Subject: [PATCH] vqav2 (#25) * Update tqdm progress bar position * Merge commit 'bfdf75d7b67680cdc98fdf3f58458633bb492de6' * Squashed commit of the following: commit 19db53b0e4871319e08603bfd0dbf1a7a4f44fa3 Author: Zhang Peiyuan Date: Thu Jan 25 17:08:25 2024 +0800 add model specific prompt and gen kwargs in sqa (#19) * add mmme * black * add model specific prompt and gen kwargs * black * add yaml config to supprot multi-model eval * print table at the end * refactor multi model code commit bfdf75d7b67680cdc98fdf3f58458633bb492de6 Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Thu Jan 25 09:47:31 2024 +0800 [Dataset] Add flickr30k (#18) * Add flickr30k support * Black lint * Align prompt with NoCaps commit f69268b069428a5ef36cafe7575bfd3055126c92 Author: Li Bo Date: Wed Jan 24 22:10:14 2024 +0800 [Datasets] modify NoCaps data path and prompts (#17) * Merge commit '95f3d3e116db32b49631f2005c9b2a608f778cc0' * Update dataset paths and improve user prompts commit 95f3d3e116db32b49631f2005c9b2a608f778cc0 Author: Li Bo Date: Wed Jan 24 19:51:34 2024 +0800 Add output path file naming convention (#16) Update datetime format in get_datetime_str() function * remove useless output file * Update dataset path in vqav2.yaml * Squashed commit of the following: commit 9e827183b527e9a035a6359448c1e692df089ed1 Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Fri Jan 26 09:56:45 2024 +0800 Black lint commit 570500320783a594f218699ea1509ec537591b2e Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Fri Jan 26 09:55:47 2024 +0800 Solve doc_iterator_for_counting crashing issue commit 0e75485613ff06b532403a152974eedf8e117c9c Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Fri Jan 26 09:55:13 2024 +0800 Exclude train in refcoco/+/g config commit 6429b7e69ddc0eee6a6728772ec5eb2114d6e331 Merge: 6a1ae69 697a438 Author: Bo Li Date: Thu Jan 25 17:17:13 2024 +0000 Merge branch 'dev/bli_add_datasets' of https://github.com/EvolvingLMMs-Lab/lmms-eval into dev/bli_add_datasets commit 892bc90979fd6b5b64de0ed68b17ac2944b9e6fa Author: Bo Li Date: Thu Jan 25 17:17:06 2024 +0000 Fix file path and raise error if config file does not exist commit aff94aaf134bb404e48cd59d931cd214197df339 Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Fri Jan 26 00:47:24 2024 +0800 Fix tasks issue for nocaps, refcoco/+/g commit d0dc730cbee420e7121b0520eb40a1f30447930d Author: Bo Li Date: Thu Jan 25 10:09:43 2024 +0000 Remove unused files and update task configuration commit c69ecbfc52492aca3e5ecfc8d425ee9e7af00978 Author: Bo Li Date: Thu Jan 25 09:43:56 2024 +0000 Add submission file for coco, flickr30k, nocaps, and textcaps tasks commit 9053bc9aafb19d654b30927a8fec72347c745886 Author: Bo Li Date: Thu Jan 25 09:32:54 2024 +0000 Refactor get_task_dict function to handle nested groups commit bbf0dbb9e7d05ce6aecd251815a66ac38e9a4169 Author: Bo Li Date: Thu Jan 25 09:13:46 2024 +0000 Fix bug in login functionality Refactor code for better performance Add new feature for user authentication Update UI layout for improved user experience Fix typo in variable name Optimize database queries for faster response time Add error handling for edge cases Update dependencies to latest versions Remove unused code Improve code readability and maintainability commit ee76ebb5bd120708d07477e1462e986ece346975 Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Thu Jan 25 17:07:20 2024 +0800 Fix cli itself can not run with config file commit d252441a31ea5ab29bd32accb5b0b9e1ba73587b Author: Bo Li Date: Thu Jan 25 09:09:04 2024 +0000 Squashed commit of the following: commit 19db53b0e4871319e08603bfd0dbf1a7a4f44fa3 Author: Zhang Peiyuan Date: Thu Jan 25 17:08:25 2024 +0800 add model specific prompt and gen kwargs in sqa (#19) * add mmme * black * add model specific prompt and gen kwargs * black * add yaml config to supprot multi-model eval * print table at the end * refactor multi model code commit 3278cccfcd5454ab972071555918fc8571f94d37 Author: Bo Li Date: Thu Jan 25 09:02:57 2024 +0000 Squashed commit of the following: commit 11795cb69caaaceddf6b284f18a386c7787d476d Author: jzhang38 Date: Thu Jan 25 11:59:12 2024 +0800 refactor multi model code commit fb19895ca28ecf64d2ea5322e5391f7742e540f4 Author: jzhang38 Date: Thu Jan 25 11:51:16 2024 +0800 print table at the end commit e02df3b556a9d34d32d8bfa1f99ea992b763bc6f Author: jzhang38 Date: Thu Jan 25 11:20:59 2024 +0800 add yaml config to supprot multi-model eval commit 388a23ac4bb47644826869562c70c10b470a1817 Author: jzhang38 Date: Thu Jan 25 10:39:42 2024 +0800 black commit bcb7df038402c5ef73db230126fcd76795ee69df Merge: 7e8b57d 1d3fdd4 Author: jzhang38 Date: Thu Jan 25 10:37:57 2024 +0800 resolve conflicts in sqa commit 30056b56be382107f520d5c85b84c3d541d970e9 Author: jzhang38 Date: Thu Jan 25 10:36:46 2024 +0800 add model specific prompt and gen kwargs commit 53ddf3fb2716fd99b2fa454656312d6fc92227b7 Author: jzhang38 Date: Wed Jan 24 13:56:51 2024 +0800 black commit d7bbd3b2cbd78fdc3df2137ac0d625b5f5505acc Author: jzhang38 Date: Wed Jan 24 13:55:43 2024 +0800 add mmme commit 741278f40ef70df04efd52ddd79e3c260c41a53e Merge: 22c3adf 1d3fdd4 Author: Bo Li Date: Thu Jan 25 08:43:15 2024 +0000 Merge commit 'bfdf75d7b67680cdc98fdf3f58458633bb492de6' into dev/bli_add_datasets commit cbdaa28e87913c26dd6d2de6bd7c2b3acb556b0a Author: Bo Li Date: Thu Jan 25 08:38:52 2024 +0000 Squashed commit of the following: commit 11795cb69caaaceddf6b284f18a386c7787d476d Author: jzhang38 Date: Thu Jan 25 11:59:12 2024 +0800 refactor multi model code commit fb19895ca28ecf64d2ea5322e5391f7742e540f4 Author: jzhang38 Date: Thu Jan 25 11:51:16 2024 +0800 print table at the end commit e02df3b556a9d34d32d8bfa1f99ea992b763bc6f Author: jzhang38 Date: Thu Jan 25 11:20:59 2024 +0800 add yaml config to supprot multi-model eval commit 388a23ac4bb47644826869562c70c10b470a1817 Author: jzhang38 Date: Thu Jan 25 10:39:42 2024 +0800 black commit bcb7df038402c5ef73db230126fcd76795ee69df Merge: 7e8b57d 1d3fdd4 Author: jzhang38 Date: Thu Jan 25 10:37:57 2024 +0800 resolve conflicts in sqa commit 30056b56be382107f520d5c85b84c3d541d970e9 Author: jzhang38 Date: Thu Jan 25 10:36:46 2024 +0800 add model specific prompt and gen kwargs commit bfdf75d7b67680cdc98fdf3f58458633bb492de6 Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Thu Jan 25 09:47:31 2024 +0800 [Dataset] Add flickr30k (#18) * Add flickr30k support * Black lint * Align prompt with NoCaps commit f69268b069428a5ef36cafe7575bfd3055126c92 Author: Li Bo Date: Wed Jan 24 22:10:14 2024 +0800 [Datasets] modify NoCaps data path and prompts (#17) * Merge commit '95f3d3e116db32b49631f2005c9b2a608f778cc0' * Update dataset paths and improve user prompts commit 53ddf3fb2716fd99b2fa454656312d6fc92227b7 Author: jzhang38 Date: Wed Jan 24 13:56:51 2024 +0800 black commit d7bbd3b2cbd78fdc3df2137ac0d625b5f5505acc Author: jzhang38 Date: Wed Jan 24 13:55:43 2024 +0800 add mmme commit b8389cf8dac3f22c8d07f9789fdd877d8298d786 Author: Bo Li Date: Thu Jan 25 08:38:11 2024 +0000 Add coco_val and coco_test tasks to coco.yaml commit f399ed85ace060b3e64bd5468b17f2a856d005bd Author: Bo Li Date: Thu Jan 25 04:58:28 2024 +0000 Update dataset_path in flickr30k.yaml commit 4657c9b111bac762f3dc5ff9397ea211b2b62656 Author: Bo Li Date: Thu Jan 25 02:12:25 2024 +0000 Merge commit 'bfdf75d7b67680cdc98fdf3f58458633bb492de6' commit 9b3a02280e05f15e305eb86a3669e76f011c6444 Author: Bo Li Date: Thu Jan 25 02:10:18 2024 +0000 Add submission folder and update file paths for storing prediction results commit bfdf75d7b67680cdc98fdf3f58458633bb492de6 Author: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Thu Jan 25 09:47:31 2024 +0800 [Dataset] Add flickr30k (#18) * Add flickr30k support * Black lint * Align prompt with NoCaps commit f69268b069428a5ef36cafe7575bfd3055126c92 Author: Li Bo Date: Wed Jan 24 22:10:14 2024 +0800 [Datasets] modify NoCaps data path and prompts (#17) * Merge commit '95f3d3e116db32b49631f2005c9b2a608f778cc0' * Update dataset paths and improve user prompts commit ad4a267e810a4653e5d7ad0b5b9000ea0a39028e Merge: c6370bf 51f2eaa Author: Li Bo Date: Wed Jan 24 22:10:07 2024 +0800 Merge branch 'main' into dev/bli_add_datasets commit b441be2447ef78dce4c9c8134ad34cfd20765eef Author: Bo Li Date: Wed Jan 24 14:08:06 2024 +0000 Update dataset paths and improve user prompts commit 9e30e09b429b30cc67389af0ebc94a1149dcc4bb Author: Bo Li Date: Wed Jan 24 11:52:33 2024 +0000 Merge commit '95f3d3e116db32b49631f2005c9b2a608f778cc0' commit 95f3d3e116db32b49631f2005c9b2a608f778cc0 Author: Li Bo Date: Wed Jan 24 19:51:34 2024 +0800 Add output path file naming convention (#16) Update datetime format in get_datetime_str() function * Fix bug in login functionality * create vqav2_val * Update vqav2_test.yaml * Update vqav2_test.yaml * Update vqav2_val.yaml --------- Co-authored-by: Li Bo --- lmms_eval/api/task.py | 2 +- lmms_eval/tasks/docvqa/docvqa.yaml | 22 ++ .../tasks/{vqav2 => vqav2_test}/utils.py | 2 +- .../vqav2.yaml => vqav2_test/vqav2_test.yaml} | 4 +- lmms_eval/tasks/vqav2_val/utils.py | 272 ++++++++++++++++++ lmms_eval/tasks/vqav2_val/vqav2_val.yaml | 24 ++ 6 files changed, 322 insertions(+), 4 deletions(-) create mode 100644 lmms_eval/tasks/docvqa/docvqa.yaml rename lmms_eval/tasks/{vqav2 => vqav2_test}/utils.py (98%) rename lmms_eval/tasks/{vqav2/vqav2.yaml => vqav2_test/vqav2_test.yaml} (94%) create mode 100644 lmms_eval/tasks/vqav2_val/utils.py create mode 100644 lmms_eval/tasks/vqav2_val/vqav2_val.yaml diff --git a/lmms_eval/api/task.py b/lmms_eval/api/task.py index 1e05c084e..bbe728d36 100644 --- a/lmms_eval/api/task.py +++ b/lmms_eval/api/task.py @@ -348,7 +348,7 @@ def build_all_requests(self, limit=None, rank=None, world_size=None) -> None: doc_id_iterator = utils.create_iterator([i for i in range(len(docs))], rank, world_size, limit) doc_id_iterator, doc_id_iterator_counting = itertools.tee(doc_id_iterator) total_docs = sum(1 for _ in doc_id_iterator_counting) - pbar = tqdm(total=total_docs, desc="Building context") + pbar = tqdm(total=total_docs, desc=f"Building context {rank}", position=rank) for doc_id in doc_id_iterator: # sample fewshot context #TODO: need to offset doc_id by rank now! fewshot_ctx = self.fewshot_context(doc_id, 0 if self.config.num_fewshot is None else self.config.num_fewshot, self.config.training_split if self.has_training_docs() else split) diff --git a/lmms_eval/tasks/docvqa/docvqa.yaml b/lmms_eval/tasks/docvqa/docvqa.yaml new file mode 100644 index 000000000..f441a82ea --- /dev/null +++ b/lmms_eval/tasks/docvqa/docvqa.yaml @@ -0,0 +1,22 @@ +task: docvqa +dataset_path: lmms-lab/DocVQA +test_split: test +output_type: generate_until +doc_to_visual: !function utils.vqav2_doc_to_visual +doc_to_text: !function utils.vqav2_doc_to_text +doc_to_target: "answer" +generation_kwargs: + until: + - "ASSISTANT:" +metric_list: + - metric: exact_match + aggregation: mean + higher_is_better: true + ignore_case: true + ignore_punctuation: true + - metric: submission + aggregation: !function utils.vqav2_aggreate_submissions + higher_is_better: true +metadata: + - version: 0.0 +process_results: !function utils.vqav2_process_results diff --git a/lmms_eval/tasks/vqav2/utils.py b/lmms_eval/tasks/vqav2_test/utils.py similarity index 98% rename from lmms_eval/tasks/vqav2/utils.py rename to lmms_eval/tasks/vqav2_test/utils.py index 7e10c7103..695e4cb23 100644 --- a/lmms_eval/tasks/vqav2/utils.py +++ b/lmms_eval/tasks/vqav2_test/utils.py @@ -265,7 +265,7 @@ def vqav2_doc_to_text(doc): def vqav2_aggreate_submissions(results): now_date_time = datetime.datetime.now().strftime("%Y-%m-%d-%H-%M-%S") os.makedirs("./submissions", exist_ok=True) - submission_file_name = f"./submissions/vqav2-submission-{now_date_time}.json" + submission_file_name = f"./submissions/vqav2-test-submission-{now_date_time}.json" path = os.path.abspath(submission_file_name) with open(path, "w") as f: json.dump(results, f) diff --git a/lmms_eval/tasks/vqav2/vqav2.yaml b/lmms_eval/tasks/vqav2_test/vqav2_test.yaml similarity index 94% rename from lmms_eval/tasks/vqav2/vqav2.yaml rename to lmms_eval/tasks/vqav2_test/vqav2_test.yaml index 3f82c84fc..e29fb4303 100644 --- a/lmms_eval/tasks/vqav2/vqav2.yaml +++ b/lmms_eval/tasks/vqav2_test/vqav2_test.yaml @@ -1,8 +1,8 @@ -task: "vqav2" +task: "vqav2_test" dataset_path: lmms-lab/VQAv2 dataset_kwargs: token: True -test_split: testdev +test_split: test output_type: generate_until doc_to_visual: !function utils.vqav2_doc_to_visual doc_to_text: !function utils.vqav2_doc_to_text diff --git a/lmms_eval/tasks/vqav2_val/utils.py b/lmms_eval/tasks/vqav2_val/utils.py new file mode 100644 index 000000000..ec28bf004 --- /dev/null +++ b/lmms_eval/tasks/vqav2_val/utils.py @@ -0,0 +1,272 @@ +import re +import os +import json +import datetime +import statistics + + +def vqav2_doc_to_visual(doc): + return [doc["image"].convert("RGB")] + + +class EvalAIAnswerProcessor: + """ + Processes an answer similar to Eval AI + copied from + https://github.com/facebookresearch/mmf/blob/c46b3b3391275b4181567db80943473a89ab98ab/pythia/tasks/processors.py#L897 + """ + + CONTRACTIONS = { + "aint": "ain't", + "arent": "aren't", + "cant": "can't", + "couldve": "could've", + "couldnt": "couldn't", + "couldn'tve": "couldn't've", + "couldnt've": "couldn't've", + "didnt": "didn't", + "doesnt": "doesn't", + "dont": "don't", + "hadnt": "hadn't", + "hadnt've": "hadn't've", + "hadn'tve": "hadn't've", + "hasnt": "hasn't", + "havent": "haven't", + "hed": "he'd", + "hed've": "he'd've", + "he'dve": "he'd've", + "hes": "he's", + "howd": "how'd", + "howll": "how'll", + "hows": "how's", + "Id've": "I'd've", + "I'dve": "I'd've", + "Im": "I'm", + "Ive": "I've", + "isnt": "isn't", + "itd": "it'd", + "itd've": "it'd've", + "it'dve": "it'd've", + "itll": "it'll", + "let's": "let's", + "maam": "ma'am", + "mightnt": "mightn't", + "mightnt've": "mightn't've", + "mightn'tve": "mightn't've", + "mightve": "might've", + "mustnt": "mustn't", + "mustve": "must've", + "neednt": "needn't", + "notve": "not've", + "oclock": "o'clock", + "oughtnt": "oughtn't", + "ow's'at": "'ow's'at", + "'ows'at": "'ow's'at", + "'ow'sat": "'ow's'at", + "shant": "shan't", + "shed've": "she'd've", + "she'dve": "she'd've", + "she's": "she's", + "shouldve": "should've", + "shouldnt": "shouldn't", + "shouldnt've": "shouldn't've", + "shouldn'tve": "shouldn't've", + "somebody'd": "somebodyd", + "somebodyd've": "somebody'd've", + "somebody'dve": "somebody'd've", + "somebodyll": "somebody'll", + "somebodys": "somebody's", + "someoned": "someone'd", + "someoned've": "someone'd've", + "someone'dve": "someone'd've", + "someonell": "someone'll", + "someones": "someone's", + "somethingd": "something'd", + "somethingd've": "something'd've", + "something'dve": "something'd've", + "somethingll": "something'll", + "thats": "that's", + "thered": "there'd", + "thered've": "there'd've", + "there'dve": "there'd've", + "therere": "there're", + "theres": "there's", + "theyd": "they'd", + "theyd've": "they'd've", + "they'dve": "they'd've", + "theyll": "they'll", + "theyre": "they're", + "theyve": "they've", + "twas": "'twas", + "wasnt": "wasn't", + "wed've": "we'd've", + "we'dve": "we'd've", + "weve": "we've", + "werent": "weren't", + "whatll": "what'll", + "whatre": "what're", + "whats": "what's", + "whatve": "what've", + "whens": "when's", + "whered": "where'd", + "wheres": "where's", + "whereve": "where've", + "whod": "who'd", + "whod've": "who'd've", + "who'dve": "who'd've", + "wholl": "who'll", + "whos": "who's", + "whove": "who've", + "whyll": "why'll", + "whyre": "why're", + "whys": "why's", + "wont": "won't", + "wouldve": "would've", + "wouldnt": "wouldn't", + "wouldnt've": "wouldn't've", + "wouldn'tve": "wouldn't've", + "yall": "y'all", + "yall'll": "y'all'll", + "y'allll": "y'all'll", + "yall'd've": "y'all'd've", + "y'alld've": "y'all'd've", + "y'all'dve": "y'all'd've", + "youd": "you'd", + "youd've": "you'd've", + "you'dve": "you'd've", + "youll": "you'll", + "youre": "you're", + "youve": "you've", + } + + NUMBER_MAP = { + "none": "0", + "zero": "0", + "one": "1", + "two": "2", + "three": "3", + "four": "4", + "five": "5", + "six": "6", + "seven": "7", + "eight": "8", + "nine": "9", + "ten": "10", + } + ARTICLES = ["a", "an", "the"] + PERIOD_STRIP = re.compile(r"(?!<=\d)(\.)(?!\d)") + COMMA_STRIP = re.compile(r"(?<=\d)(\,)+(?=\d)") + PUNCTUATIONS = [ + ";", + r"/", + "[", + "]", + '"', + "{", + "}", + "(", + ")", + "=", + "+", + "\\", + "_", + "-", + ">", + "<", + "@", + "`", + ",", + "?", + "!", + ] + + def __init__(self, *args, **kwargs): + pass + + def word_tokenize(self, word): + word = word.lower() + word = word.replace(",", "").replace("?", "").replace("'s", " 's") + return word.strip() + + def process_punctuation(self, in_text): + out_text = in_text + for p in self.PUNCTUATIONS: + if (p + " " in in_text or " " + p in in_text) or (re.search(self.COMMA_STRIP, in_text) is not None): + out_text = out_text.replace(p, "") + else: + out_text = out_text.replace(p, " ") + out_text = self.PERIOD_STRIP.sub("", out_text, re.UNICODE) + return out_text + + def process_digit_article(self, in_text): + out_text = [] + temp_text = in_text.lower().split() + for word in temp_text: + word = self.NUMBER_MAP.setdefault(word, word) + if word not in self.ARTICLES: + out_text.append(word) + else: + pass + for word_id, word in enumerate(out_text): + if word in self.CONTRACTIONS: + out_text[word_id] = self.CONTRACTIONS[word] + out_text = " ".join(out_text) + return out_text + + def __call__(self, item): + item = self.word_tokenize(item) + item = item.replace("\n", " ").replace("\t", " ").strip() + item = self.process_punctuation(item) + item = self.process_digit_article(item) + return item + + +def vqav2_process_results(doc, result): + eval_ai_processor = EvalAIAnswerProcessor() + assert len(result) == 1, f"The result should be a list of length 1, but got {len(result)}." + resAns = eval_ai_processor(result[0]) + accuracy = 0 + + if "answers" in doc and doc["answers"] is not None: + for ansDic in doc["answers"]: + ansDic["answer"] = ansDic["answer"].replace("\n", " ") + ansDic["answer"] = ansDic["answer"].replace("\t", " ") + ansDic["answer"] = ansDic["answer"].strip() + gtAcc = [] + gtAnswers = [ans["answer"] for ans in doc["answers"]] + + if len(set(gtAnswers)) > 1: + for ansDic in doc["answers"]: + ansDic["answer"] = eval_ai_processor.process_punctuation(ansDic["answer"]) + ansDic["answer"] = eval_ai_processor.process_digit_article(ansDic["answer"]) + resAns = eval_ai_processor.process_punctuation(resAns) + resAns = eval_ai_processor.process_digit_article(resAns) + + for gtAnsDatum in doc["answers"]: + otherGTAns = [item for item in doc["answers"] if item != gtAnsDatum] + matchingAns = [item for item in otherGTAns if item["answer"] == resAns] + acc = min(1, float(len(matchingAns)) / 3) + gtAcc.append(acc) + accuracy = statistics.mean(gtAcc) + + return { + "exact_match": accuracy, + "submission": { + "question_id": doc["question_id"], + "answer": resAns, + }, + } + + +def vqav2_doc_to_text(doc): + return f"{doc['question']}\nAnswer the question using a single word or phrase." + + +def vqav2_aggreate_submissions(results): + now_date_time = datetime.datetime.now().strftime("%Y-%m-%d-%H-%M-%S") + submission_file_name = f"vqav2-val-submission-{now_date_time}.json" + path = os.path.abspath(submission_file_name) + with open(path, "w") as f: + json.dump(results, f) + print(f"Submission file saved to {path}") + return 0 diff --git a/lmms_eval/tasks/vqav2_val/vqav2_val.yaml b/lmms_eval/tasks/vqav2_val/vqav2_val.yaml new file mode 100644 index 000000000..6e8ba0c32 --- /dev/null +++ b/lmms_eval/tasks/vqav2_val/vqav2_val.yaml @@ -0,0 +1,24 @@ +task: "vqav2_val" +dataset_path: lmms-lab/VQAv2 +dataset_kwargs: + token: True +test_split: validation +output_type: generate_until +doc_to_visual: !function utils.vqav2_doc_to_visual +doc_to_text: !function utils.vqav2_doc_to_text +doc_to_target: "answer" +generation_kwargs: + until: + - "ASSISTANT:" +metric_list: + - metric: exact_match + aggregation: mean + higher_is_better: true + ignore_case: true + ignore_punctuation: true + - metric: submission + aggregation: !function utils.vqav2_aggreate_submissions + higher_is_better: true +metadata: + - version: 0.0 +process_results: !function utils.vqav2_process_results