diff --git a/README.md b/README.md
index 5f0997a94..7b845f44f 100644
--- a/README.md
+++ b/README.md
@@ -6,12 +6,12 @@
 > Accelerating the development of large multimodal models (LMMs) with `lmms-eval`
 
-🏠 [Homepage](https://lmms-lab.github.io/) | 🎉 [Blog](https://lmms-lab.github.io/lmms-eval-blog/lmms-eval-0.1/) | 📚 [Documentation](docs/README.md) | 🤗 [Huggingface Datasets](https://huggingface.co/lmms-lab)
+🏠 [Homepage](https://lmms-lab.github.io/) | 🎉 [Blog](https://lmms-lab.github.io/lmms-eval-blog/lmms-eval-0.1/) | 📚 [Documentation](docs/README.md) | 🤗 [Huggingface Datasets](https://huggingface.co/lmms-lab) | Discord_Thread [discord/lmms-eval](https://discord.gg/ebAMGSsS)
 
-In an era where people pursue AGI (Artificial General Intelligence) with the zeal akin to 1960s moon landing mission.
-Evaluating the core of AGI, the large language models (LLMs) and large multimodal models (LMMs) with unprecedented capabilities that can understand, learn, and interact across a broad range of human tasks, has become a pivotal challenge.
-To surmount this, a broad spectrum of evaluation datasets is proposed and used to assess model capabilities across various dimensions, creating a comprehensive capability chart that reveals the true performance of models. However, evaluation of models has become quite hard since there are countless evaluation benchmarks and datasets organized in various ways, scattered across the internet, sleeping in somebody's Google Drive, Dropbox, and other websites hosted by schools or research labs.
+The pursuit of Artificial General Intelligence (AGI) is advancing with a zeal reminiscent of the 1960s moon landing. At the heart of this effort are large language models (LLMs) and large multimodal models (LMMs), models that can understand, learn, and interact across a broad range of human tasks and that mark a significant step toward that goal.
+
+To measure how capable these models really are, the community has produced a wide variety of evaluation benchmarks. Together they chart a detailed map of model abilities and show how far we still are from true AGI. The challenge is that these benchmarks and datasets are scattered and organized inconsistently: tucked away in someone's Google Drive, spread across Dropbox, or hosted on individual school and research lab websites. Evaluating a model thoroughly can feel like a treasure hunt with the maps spread far and wide.
 
 In the field of language models, there has been a valuable precedent set by the work of [lm-evaluation-harness](https://github.com/EleutherAI/lm-evaluation-harness). They offer integrated data and model interfaces, enabling rapid evaluation of language models and serving as the backend support framework for the [open-llm-leaderboard](https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard), and has gradually become the underlying ecosystem of the era of foundation models.
@@ -25,6 +25,10 @@ We believe our effort could provide an efficient interface for the detailed comp
 # Annoucement
 
+## Contribution Guidance
+
+We've added guidance on contributing new datasets and models. Please refer to our [documentation](docs/README.md). If you need assistance, you can contact us via [discord/lmms-eval](https://discord.gg/ebAMGSsS).
+
 ## v0.1.0 Released
 
 The first version of the `lmms-eval` is released.
 We are working on providing an one-command evaluation suite for accelerating the development of LMMs.
diff --git a/llava_repr_requirements.txt b/llava_repr_requirements.txt
index 914b8608b..e3f0f5277 100644
--- a/llava_repr_requirements.txt
+++ b/llava_repr_requirements.txt
@@ -1,22 +1,38 @@
-llava@git+https://github.com/haotian-liu/LLaVA@v1.1.3
-accelerate>=0.21.0
-black==24.1.0
+accelerate==0.21.0
 datasets==2.16.1
-evaluate>=0.4.0
-jsonlines
-numexpr
-peft>=0.2.0
-pybind11>=2.6.2
-pytablewriter
-rouge-score>=0.0.4
-sacrebleu>=1.5.0
-scikit-learn>=0.24.1
-sqlitedict
+evaluate==0.4.1
+hf_transfer==0.1.6
+Jinja2==3.1.3
+numpy==1.26.4
+openai==1.13.3
+packaging==23.2
+pandas==2.2.1
+Pillow==10.2.0
+protobuf==4.25.3
+pycocoevalcap==1.2
+pycocotools==2.0.7
+pytablewriter==1.2.0
+pytest==8.0.2
+python_Levenshtein==0.25.0
+pytz==2024.1
+PyYAML==6.0.1
+PyYAML==6.0.1
+Requests==2.31.0
+sacrebleu==2.4.0
+scikit_learn==1.2.2
+sentencepiece==0.1.99
+setuptools==68.2.2
+sglang==0.1.12
+shortuuid==1.0.12
+sqlitedict==2.1.0
+tenacity==8.2.3
 torch==2.0.1
 openai>=1.0.0
 pycocoevalcap
+tokenizers==0.15.2
+tqdm==4.66.2
 tqdm-multiprocess
-transformers
+transformers==4.37.2
 zstandard
 pillow
 pyyaml
diff --git a/lmms_eval/tasks/_task_utils/file_utils.py b/lmms_eval/tasks/_task_utils/file_utils.py
index 162455e8a..578ec7e51 100644
--- a/lmms_eval/tasks/_task_utils/file_utils.py
+++ b/lmms_eval/tasks/_task_utils/file_utils.py
@@ -1,8 +1,8 @@
 import os
 
 
-def generate_submission_file(file_name, args):
-    path = os.path.join(args.output_path, "submissions")
+def generate_submission_file(file_name, args, subpath="submissions"):
+    path = os.path.join(args.output_path, subpath)
     os.makedirs(path, exist_ok=True)
     path = os.path.join(path, file_name)
     return os.path.abspath(path)
diff --git a/lmms_eval/tasks/mmbench/_default_template_mmbench_cn_yaml b/lmms_eval/tasks/mmbench/_default_template_mmbench_cn_yaml
new file mode 100644
index 000000000..81094620b
--- /dev/null
+++ b/lmms_eval/tasks/mmbench/_default_template_mmbench_cn_yaml
@@ -0,0 +1,22 @@
+dataset_path: lmms-lab/MMBench
+dataset_kwargs:
+  token: True
+doc_to_target: "answer"
+dataset_name: "cn"
+output_type: generate_until
+doc_to_visual: !function cn_utils.mmbench_doc_to_visual
+doc_to_text: !function cn_utils.mmbench_doc_to_text
+generation_kwargs:
+  max_new_tokens: 256
+  temperature: 0
+  top_p: 0
+  num_beams: 1
+  do_sample: false
+process_results: !function cn_utils.mmbench_process_results
+model_specific_prompt_kwargs:
+  default:
+    pre_prompt: ""
+    post_prompt: "\n请直接使用所提供的选项字母作为答案回答。"
+model_specific_generation_kwargs:
+  llava:
+    image_aspect_ratio: original
diff --git a/lmms_eval/tasks/mmbench/_default_template_mmbench_en_yaml b/lmms_eval/tasks/mmbench/_default_template_mmbench_en_yaml
new file mode 100644
index 000000000..ab2b882c8
--- /dev/null
+++ b/lmms_eval/tasks/mmbench/_default_template_mmbench_en_yaml
@@ -0,0 +1,25 @@
+dataset_path: lmms-lab/MMBench
+dataset_kwargs:
+  token: True
+doc_to_target: "answer"
+model_specific_prompt_kwargs:
+  default:
+    pre_prompt: ""
+    post_prompt: "\nAnswer with the option's letter from the given choices directly."
+doc_to_visual: !function en_utils.mmbench_doc_to_visual
+doc_to_text: !function en_utils.mmbench_doc_to_text
+doc_to_target: "answer"
+process_results: !function en_utils.mmbench_process_results
+model_specific_generation_kwargs:
+  llava:
+    image_aspect_ratio: original
+output_type: generate_until
+dataset_name: "en"
+generation_kwargs:
+  until:
+    - "ASSISTANT:"
+  max_new_tokens: 1024
+  temperature: 0
+  top_p: 0
+  num_beams: 1
+  do_sample: false
diff --git a/lmms_eval/tasks/mmbench/cc_utils.py b/lmms_eval/tasks/mmbench/cc_utils.py
index c049613f2..7009e012e 100644
--- a/lmms_eval/tasks/mmbench/cc_utils.py
+++ b/lmms_eval/tasks/mmbench/cc_utils.py
@@ -7,6 +7,7 @@
 eval_logger = logging.getLogger("lmms-eval")
 
 from lmms_eval.tasks.mmbench.mmbench_evals import MMBench_Evaluator
+from lmms_eval.tasks._task_utils.file_utils import generate_submission_file
 
 with open(Path(__file__).parent / "mmbench_cn.yaml", "r") as f:
     raw_data = f.readlines()
@@ -66,9 +67,9 @@
     return data
 
 
-def mmbench_cn_cc_aggregate_results(results):
+def mmbench_cn_cc_aggregate_results(results, args):
     df = pd.DataFrame(results)
-    os.makedirs("./submissions", exist_ok=True)
-    with pd.ExcelWriter("./submissions/mmbench_cn_cc_results.xlsx") as writer:
+    file = generate_submission_file("mmbench_cn_cc_results.xlsx", args)
+    with pd.ExcelWriter(file) as writer:
         df.to_excel(writer, index=False)
-    eval_logger.info(f"Saved results to mmbench_cn_cc_results.xlsx")
+    eval_logger.info(f"Saved results to {file}")
diff --git a/lmms_eval/tasks/mmbench/cn_utils.py b/lmms_eval/tasks/mmbench/cn_utils.py
index 1010eb1b1..812b9aa38 100644
--- a/lmms_eval/tasks/mmbench/cn_utils.py
+++ b/lmms_eval/tasks/mmbench/cn_utils.py
@@ -83,8 +83,7 @@ def mmbench_aggregate_dev_results(results, args):
 
 def mmbench_aggregate_test_results(results, args):
     df = pd.DataFrame(results)
-    Path(args.output_path).joinpath("submissions").mkdir(parents=True, exist_ok=True)
-    excel_write_path = Path(args.output_path) / "submissions" / f"mmbench_cn_test_results.xlsx"
+    excel_write_path = generate_submission_file("mmbench_cn_test_results.xlsx", args)
     with pd.ExcelWriter(excel_write_path) as writer:
         df.to_excel(writer, index=False)
     eval_logger.info(f"Saved results to {excel_write_path}")
diff --git a/lmms_eval/tasks/mmbench/en_utils.py b/lmms_eval/tasks/mmbench/en_utils.py
index 1962dc528..26e260006 100644
--- a/lmms_eval/tasks/mmbench/en_utils.py
+++ b/lmms_eval/tasks/mmbench/en_utils.py
@@ -36,7 +36,7 @@ def mmbench_doc_to_text(doc, model_specific_prompt_kwargs=None):
         "answer": doc.get("answer", None),
         "options": options_prompt,
         "category": doc["category"],
-        "L2-category": doc["l2-category"],
+        "L2-category": doc["L2-category"],
         "options_dict": options_dict,
         "index": doc["index"],
         "hint": doc["hint"],
@@ -44,7 +44,7 @@
         "split": doc["split"],
     }
 
-    query_prompt = f"{data['hint']} {data['question']} {data['options']}" if pd.notna(data["hint"]) else f"{data['question']} {data['options']}"
+    query_prompt = f"{data['hint']} {data['question']} {data['options']}" if pd.notna(data["hint"]) and data["hint"] != "nan" else f"{data['question']} {data['options']}"
 
     if model_specific_prompt_kwargs:
         query_prompt = f"{query_prompt}\n{model_specific_prompt_kwargs['post_prompt']}"
@@ -64,7 +64,7 @@
         "source": doc["source"],
         "split": doc["split"],
         "category": doc["category"],
-        "L2-category": doc["l2-category"],
+        "L2-category": doc["L2-category"],
         }
     }
     option_candidate = ["A", "B", "C", "D", "E"]
@@ -83,8 +83,7 @@ def mmbench_aggregate_dev_results(results, args):
 
 def mmbench_aggregate_test_results(results, args):
     df = pd.DataFrame(results)
-    Path(args.output_path).joinpath("submissions").mkdir(parents=True, exist_ok=True)
-    excel_write_path = Path(args.output_path) / "submissions" / f"mmbench_en_test_results.xlsx"
+    excel_write_path = generate_submission_file("mmbench_en_test_results.xlsx", args)
     with pd.ExcelWriter(excel_write_path) as writer:
         df.to_excel(writer, index=False)
     eval_logger.info(f"Saved results to {excel_write_path}")
diff --git a/lmms_eval/tasks/mmbench/mmbench_cc.yaml b/lmms_eval/tasks/mmbench/mmbench_cc.yaml
index 0ec13674f..238aa10c9 100644
--- a/lmms_eval/tasks/mmbench/mmbench_cc.yaml
+++ b/lmms_eval/tasks/mmbench/mmbench_cc.yaml
@@ -1,9 +1,8 @@
-dataset_path: lmms-lab/MMBench_CN
+dataset_path: lmms-lab/MMBench
+dataset_name: cc
 dataset_kwargs:
   token: True
-group: mmbench_cn
 task: "mmbench_cn_cc"
-dataset_name: "chinese_culture"
 test_split: test
 output_type: generate_until
 doc_to_visual: !function cc_utils.mmbench_doc_to_visual
diff --git a/lmms_eval/tasks/mmbench/mmbench_cn.yaml b/lmms_eval/tasks/mmbench/mmbench_cn.yaml
index 82fddeb0c..6232531c4 100644
--- a/lmms_eval/tasks/mmbench/mmbench_cn.yaml
+++ b/lmms_eval/tasks/mmbench/mmbench_cn.yaml
@@ -5,4 +5,6 @@ task:
   - mmbench_cn_cc
 metadata:
   version: 0.0
-  sys_prompt: "有如下几个选项:"
\ No newline at end of file
+  gpt_eval_model_name: "gpt-3.5-turbo"
+  quick_extract: true
+  sys_prompt: "有如下几个选项:"
diff --git a/lmms_eval/tasks/mmbench/mmbench_cn_dev.yaml b/lmms_eval/tasks/mmbench/mmbench_cn_dev.yaml
index 3b2b4fbb1..3d7b9d98b 100644
--- a/lmms_eval/tasks/mmbench/mmbench_cn_dev.yaml
+++ b/lmms_eval/tasks/mmbench/mmbench_cn_dev.yaml
@@ -1,33 +1,7 @@
-dataset_path: lmms-lab/MMBench_CN
-dataset_kwargs:
-  token: True
-group: mmbench_cn
 task: "mmbench_cn_dev"
-dataset_name: "default"
 test_split: "dev"
-output_type: generate_until
-doc_to_visual: !function cn_utils.mmbench_doc_to_visual
-doc_to_text: !function cn_utils.mmbench_doc_to_text
-doc_to_target: "answer"
-generation_kwargs:
-  max_new_tokens: 256
-  temperature: 0
-  top_p: 0
-  num_beams: 1
-  do_sample: false
-process_results: !function cn_utils.mmbench_process_results
 metric_list:
   - metric: submission
+    higher_is_better: true
     aggregation: !function cn_utils.mmbench_aggregate_dev_results
-metadata:
-  version: 0.0
-  gpt_eval_model_name: "gpt-3.5-turbo"
-  quick_extract: true
-
-model_specific_prompt_kwargs:
-  default:
-    pre_prompt: ""
-    post_prompt: "\n请直接使用所提供的选项字母作为答案回答。"
-model_specific_generation_kwargs:
-  llava:
-    image_aspect_ratio: original
\ No newline at end of file
+include: _default_template_mmbench_cn_yaml
diff --git a/lmms_eval/tasks/mmbench/mmbench_cn_test.yaml b/lmms_eval/tasks/mmbench/mmbench_cn_test.yaml
index b17bb761d..b86f092cb 100644
--- a/lmms_eval/tasks/mmbench/mmbench_cn_test.yaml
+++ b/lmms_eval/tasks/mmbench/mmbench_cn_test.yaml
@@ -1,33 +1,7 @@
-dataset_path: lmms-lab/MMBench_CN
-dataset_kwargs:
-  token: True
-task: "mmbench_cn_test"
-dataset_name: "default"
+task: mmbench_cn_test
 test_split: test
-output_type: generate_until
-doc_to_visual: !function cn_utils.mmbench_doc_to_visual
-doc_to_text: !function cn_utils.mmbench_doc_to_text
-doc_to_target: "answer"
-generation_kwargs:
-  max_new_tokens: 256
-  temperature: 0
-  top_p: 0
-  num_beams: 1
-  do_sample: false
-process_results: !function cn_utils.mmbench_process_results
 metric_list:
   - metric: submission
     aggregation: !function cn_utils.mmbench_aggregate_test_results
     higher_is_better: true
-metadata:
-  version: 0.0
-  gpt_eval_model_name: "gpt-3.5-turbo"
-  quick_extract: true
-
-model_specific_prompt_kwargs:
-  default:
-    pre_prompt: ""
-    post_prompt: "\n请直接使用所提供的选项字母作为答案回答。"
-model_specific_generation_kwargs:
-  llava:
-    image_aspect_ratio: original
+include: _default_template_mmbench_cn_yaml
diff --git a/lmms_eval/tasks/mmbench/mmbench_en.yaml b/lmms_eval/tasks/mmbench/mmbench_en.yaml
index c518f924e..9fa757cc3 100644
--- a/lmms_eval/tasks/mmbench/mmbench_en.yaml
+++ b/lmms_eval/tasks/mmbench/mmbench_en.yaml
@@ -5,11 +5,3 @@ task:
 metadata:
   version: 0.0
   sys_prompt: "There are several options:"
-
-model_specific_prompt_kwargs:
-  default:
-    pre_prompt: ""
-    post_prompt: "\nAnswer with the option's letter from the given choices directly."
-model_specific_generation_kwargs:
-  llava:
-    image_aspect_ratio: original
\ No newline at end of file
diff --git a/lmms_eval/tasks/mmbench/mmbench_en_dev.yaml b/lmms_eval/tasks/mmbench/mmbench_en_dev.yaml
index 89d6ff76d..b4f4a2e9f 100644
--- a/lmms_eval/tasks/mmbench/mmbench_en_dev.yaml
+++ b/lmms_eval/tasks/mmbench/mmbench_en_dev.yaml
@@ -1,23 +1,7 @@
-dataset_path: lmms-lab/MMBench_EN
-dataset_kwargs:
-  token: True
 task: "mmbench_en_dev"
 test_split: dev
-output_type: generate_until
-doc_to_visual: !function en_utils.mmbench_doc_to_visual
-doc_to_text: !function en_utils.mmbench_doc_to_text
-doc_to_target: "answer"
-generation_kwargs:
-  until:
-    - "ASSISTANT:"
-  max_new_tokens: 1024
-  temperature: 0
-  top_p: 0
-  num_beams: 1
-  do_sample: false
-process_results: !function en_utils.mmbench_process_results
+include: _default_template_mmbench_en_yaml
 metric_list:
   - metric: submission
     aggregation: !function en_utils.mmbench_aggregate_dev_results
-metadata:
-  version: 0.0
\ No newline at end of file
+    higher_is_better: true
diff --git a/lmms_eval/tasks/mmbench/mmbench_en_test.yaml b/lmms_eval/tasks/mmbench/mmbench_en_test.yaml
index 92f73ef1f..5acf404af 100644
--- a/lmms_eval/tasks/mmbench/mmbench_en_test.yaml
+++ b/lmms_eval/tasks/mmbench/mmbench_en_test.yaml
@@ -1,22 +1,7 @@
-dataset_path: lmms-lab/MMBench_EN
-dataset_kwargs:
-  token: True
 task: "mmbench_en_test"
 test_split: test
-output_type: generate_until
-doc_to_visual: !function en_utils.mmbench_doc_to_visual
-doc_to_text: !function en_utils.mmbench_doc_to_text
-doc_to_target: "answer"
-generation_kwargs:
-  max_new_tokens: 256
-  temperature: 0
-  top_p: 0
-  num_beams: 1
-  do_sample: false
-process_results: !function en_utils.mmbench_process_results
+include: _default_template_mmbench_en_yaml
 metric_list:
   - metric: submission
     aggregation: !function en_utils.mmbench_aggregate_test_results
     higher_is_better: true
-metadata:
-  version: 0.0
\ No newline at end of file
diff --git a/lmms_eval/tasks/ocrbench/ocrbench.yaml b/lmms_eval/tasks/ocrbench/ocrbench.yaml
new file mode 100644
index 000000000..7957e7bfc
--- /dev/null
+++ b/lmms_eval/tasks/ocrbench/ocrbench.yaml
@@ -0,0 +1,22 @@
+dataset_path: echo840/OCRBench
+dataset_kwargs:
+  token: True
+task: "ocrbench"
+test_split: test
+output_type: generate_until
+doc_to_visual: !function utils.ocrbench_doc_to_visual
+doc_to_text: !function utils.ocrbench_doc_to_text
+doc_to_target: "answer"
+generation_kwargs:
+  max_new_tokens: 128
+  temperature: 0
+  top_p: 0
+  num_beams: 1
+  do_sample: false
+process_results: !function utils.ocrbench_process_results
+metric_list:
+  - metric: ocrbench_accuracy
+    aggregation: !function utils.ocrbench_aggregate_accuracy
+    higher_is_better: true
+metadata:
+  - version: 0.0
\ No newline at end of file
diff --git a/lmms_eval/tasks/ocrbench/upload_ocrbench.py b/lmms_eval/tasks/ocrbench/upload_ocrbench.py
new file mode 100644
index 000000000..528ab15d7
--- /dev/null
+++ b/lmms_eval/tasks/ocrbench/upload_ocrbench.py
@@ -0,0 +1,94 @@
+# Copyright 2020 The HuggingFace Datasets Authors and the current dataset script contributor.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+import json
+
+import datasets
+from PIL import Image as PIL_Image
+import json
+from uuid import uuid4
+from datasets import Dataset, Features
+import pandas as pd
+from tqdm import tqdm
+import io
+
+# Find for instance the citation on arxiv or on the dataset repo/website
+_CITATION = """https://arxiv.org/abs/2305.07895"""
+_DESCRIPTION = "OCRBench is a comprehensive evaluation benchmark designed to assess the OCR capabilities of Large Multimodal Models."
+
+
+def image2byte(image):
+    img_bytes = io.BytesIO()
+    image.save(img_bytes, format="JPEG")
+    image_bytes = img_bytes.getvalue()
+    return image_bytes
+
+
+def get_builder_config(VERSION):
+    builder_config = [
+        datasets.BuilderConfig(
+            name=f"ocrbench",
+            version=VERSION,
+            description=f"ocrbench",
+        )
+    ]
+    return builder_config
+
+
+ocrbench_json = "pathto/OCRBench/OCRBench.json"
+img_dir = "pathto/OCRBench_Images/"
+
+dataset_features = Features(
+    {
+        "dataset": datasets.Value("string"),
+        "question": datasets.Value("string"),
+        "question_type": datasets.Value("string"),
+        "answer": datasets.features.Sequence(datasets.Value("string")),
+        "image": datasets.Image(),
+    }
+)
+
+df_items = {
+    "dataset": [],
+    "question": [],
+    "question_type": [],
+    "answer": [],
+    "image": [],
+}
+# img_feature = datasets.Image(decode=False)
+with open(ocrbench_json, "r") as f:
+    data = json.load(f)
+for i in tqdm(range(len(data))):
+    dataset_name = data[i]["dataset_name"]
+    image_path = img_dir + data[i]["image_path"]
+    question = data[i]["question"]
+    answers = data[i]["answers"]
+    question_type = data[i]["type"]
+    if type(answers) == str:
+        answers = [answers]
+    img = PIL_Image.open(image_path).convert("RGB")
+    byte_data = image2byte(img)
+    image = {"bytes": byte_data, "path": ""}
+    df_items["image"].append(image)
+    df_items["question"].append(str(question))
+    df_items["answer"].append(answers)
+    df_items["question_type"].append(str(question_type))
+    df_items["dataset"].append(str(dataset_name))
+
+df_items = pd.DataFrame(df_items)
+df_items.head()
+dataset = Dataset.from_pandas(df_items, features=dataset_features)
+hub_dataset_path = "echo840/OCRBench"
+dataset.push_to_hub(repo_id=hub_dataset_path, split="test")
diff --git a/lmms_eval/tasks/ocrbench/utils.py b/lmms_eval/tasks/ocrbench/utils.py
new file mode 100644
index 000000000..c8c8c650e
--- /dev/null
+++ b/lmms_eval/tasks/ocrbench/utils.py
@@ -0,0 +1,103 @@
+import logging
+
+from lmms_eval.tasks._task_utils.file_utils import generate_submission_file
+
+logger = logging.getLogger("lmms-eval")
+
+# Add the following functions to your existing utils.py file
+OCRBench_score = {
+    "Regular Text Recognition": 0,
+    "Irregular Text Recognition": 0,
+    "Artistic Text Recognition": 0,
+    "Handwriting Recognition": 0,
+    "Digit String Recognition": 0,
+    "Non-Semantic Text Recognition": 0,
+    "Scene Text-centric VQA": 0,
+    "Doc-oriented VQA": 0,
+    "Key Information Extraction": 0,
+    "Handwritten Mathematical Expression Recognition": 0,
+}
+
+
+def ocrbench_doc_to_visual(doc):
+    # Assuming the 'doc' dictionary has a key 'image' with image data
+    return [doc["image"].convert("RGB")]
+
+
+def ocrbench_doc_to_text(doc):
+    # Assuming the 'doc' dictionary has a key 'question' with the question text
+    question = doc["question"].strip()
+    return f"{question}"
+
+
+def ocrbench_process_results(doc, results):
+    pred = results[0].lower().strip()
+    gt_ans = doc["answer"]
+    dataset_name = doc["dataset"]
+
+    score = 0
+    if dataset_name == "HME100k":
+        if type(gt_ans) == list:
+            for j in range(len(gt_ans)):
+                answer = gt_ans[j].strip().replace("\n", " ").replace(" ", "")
+                predict = pred.strip().replace("\n", " ").replace(" ", "")
+                if answer in predict:
+                    score = 1
+        else:
+            answer = gt_ans.strip().replace("\n", " ").replace(" ", "")
+            predict = pred.strip().replace("\n", " ").replace(" ", "")
+            if answer in predict:
+                score = 1
+    else:
+        if type(gt_ans) == list:
+            for j in range(len(gt_ans)):
+                answer = gt_ans[j].lower().strip().replace("\n", " ")
+                predict = pred.lower().strip().replace("\n", " ")
+                if answer in predict:
+                    score = 1
+        else:
+            answer = gt_ans.lower().strip().replace("\n", " ")
+            predict = pred.lower().strip().replace("\n", " ")
+            if answer in predict:
+                score = 1
+    return {
+        "ocrbench_accuracy": {"question_type": doc["question_type"], "score": score, "prediction": pred, "ground_truth": gt_ans},
+    }
+
+
+def ocrbench_aggregate_accuracy(results, args):
+    for result in results:
+        OCRBench_score[result["question_type"]] += result["score"]
+    recognition_score = (
+        OCRBench_score["Regular Text Recognition"]
+        + OCRBench_score["Irregular Text Recognition"]
+        + OCRBench_score["Artistic Text Recognition"]
+        + OCRBench_score["Handwriting Recognition"]
+        + OCRBench_score["Digit String Recognition"]
+        + OCRBench_score["Non-Semantic Text Recognition"]
+    )
+    Final_score = recognition_score + OCRBench_score["Scene Text-centric VQA"] + OCRBench_score["Doc-oriented VQA"] + OCRBench_score["Key Information Extraction"] + OCRBench_score["Handwritten Mathematical Expression Recognition"]
+    file_name = generate_submission_file("ocrbench_results.txt", args, subpath="results")
+    with open(file_name, "w") as f:
+        print("######################### OCRBench #############################", file=f)
+        print(f"Text Recognition(Total 300): {recognition_score}", file=f)
+        print("---------------- Details of Recognition Score ------------------", file=f)
+        print(f"Regular Text Recognition(Total 50): {OCRBench_score['Regular Text Recognition']}", file=f)
+        print(f"Irregular Text Recognition(Total 50): {OCRBench_score['Irregular Text Recognition']}", file=f)
+        print(f"Artistic Text Recognition(Total 50): {OCRBench_score['Artistic Text Recognition']}", file=f)
+        print(f"Handwriting Recognition(Total 50): {OCRBench_score['Handwriting Recognition']}", file=f)
+        print(f"Digit String Recognition(Total 50): {OCRBench_score['Digit String Recognition']}", file=f)
+        print(f"Non-Semantic Text Recognition(Total 50): {OCRBench_score['Non-Semantic Text Recognition']}", file=f)
+        print("----------------------------------------------------------------", file=f)
+        print(f"Scene Text-centric VQA(Total 200): {OCRBench_score['Scene Text-centric VQA']}", file=f)
+        print("----------------------------------------------------------------", file=f)
+        print(f"Doc-oriented VQA(Total 200): {OCRBench_score['Doc-oriented VQA']}", file=f)
+        print("----------------------------------------------------------------", file=f)
+        print(f"Key Information Extraction(Total 200): {OCRBench_score['Key Information Extraction']}", file=f)
+        print("----------------------------------------------------------------", file=f)
+        print(f"Handwritten Mathematical Expression Recognition(Total 100): {OCRBench_score['Handwritten Mathematical Expression Recognition']}", file=f)
+        print("--------------------- Final Score ------------------------------", file=f)
+        print(f"Final Score(Total 1000): {Final_score}", file=f)
+    logger.info(f"OCR Bench results saved to {file_name}")
+    # return {"Final Score":Final_score,"Text Recognition":recognition_score,'Scene Text-centric VQA':OCRBench_score['Scene Text-centric VQA'],'Doc-oriented VQA':OCRBench_score['Doc-oriented VQA'],'Key Information Extraction':OCRBench_score['Key Information Extraction'],'Handwritten Mathematical Expression Recognition':OCRBench_score['Handwritten Mathematical Expression Recognition']}
+    return Final_score
diff --git a/miscs/repr_scripts.sh b/miscs/repr_scripts.sh
index f5a743099..27fccbafc 100644
--- a/miscs/repr_scripts.sh
+++ b/miscs/repr_scripts.sh
@@ -2,9 +2,13 @@
 cd lmms_eval;
 pip install --no-deps -U -e .
 
+# install LLaVA without building dependencies
+cd LLaVA
+pip install --no-deps -U -e .
+
 # install all the requirements that require for reproduce llava results
 pip install -r llava_repr_requirements.txt
 
 # Run and exactly reproduce llava_v1.5 results!
 # mme as an example
-accelerate launch --num_processes=1 -m lmms_eval --model llava --model_args pretrained="liuhaotian/llava-v1.5-7b" --tasks mme --batch_size 1 --log_samples --log_samples_sufix reproduce --output_path ./logs/
\ No newline at end of file
+accelerate launch --num_processes=1 -m lmms_eval --model llava --model_args pretrained="liuhaotian/llava-v1.5-7b,use_flash_attention_2=False" --tasks mme --batch_size 1 --log_samples --log_samples_suffix reproduce --output_path ./logs/
\ No newline at end of file
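
A note on the pattern that recurs throughout this patch: task-specific aggregation hooks no longer build `./submissions/` paths by hand; they call the shared `generate_submission_file` helper, which now accepts a `subpath` argument. The sketch below shows that flow in isolation. The `generate_submission_file` function mirrors the patched `lmms_eval/tasks/_task_utils/file_utils.py`; the `aggregate_to_excel` wrapper, the `SimpleNamespace` stand-in for the parsed CLI `args`, and the toy results list are illustrative assumptions, not code from this patch.

import os
from types import SimpleNamespace

import pandas as pd


def generate_submission_file(file_name, args, subpath="submissions"):
    # Same behaviour as the patched helper: build <output_path>/<subpath>/,
    # create it if needed, and return the absolute path of the target file.
    path = os.path.join(args.output_path, subpath)
    os.makedirs(path, exist_ok=True)
    return os.path.abspath(os.path.join(path, file_name))


def aggregate_to_excel(results, args):
    # Illustrative stand-in for the mmbench aggregation hooks: dump per-doc
    # result dicts into an .xlsx submission file under the run's output path.
    df = pd.DataFrame(results)
    file = generate_submission_file("example_results.xlsx", args)
    with pd.ExcelWriter(file) as writer:
        df.to_excel(writer, index=False)
    return file


if __name__ == "__main__":
    # Hypothetical parsed CLI arguments; only output_path is needed here.
    args = SimpleNamespace(output_path="./logs")
    fake_results = [{"index": 0, "prediction": "A"}, {"index": 1, "prediction": "C"}]
    print(aggregate_to_excel(fake_results, args))

Because every task writes through this one helper, OCRBench can route its text report to `<output_path>/results/` simply by passing `subpath="results"`, while the mmbench tasks keep the default `submissions/` directory.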