From 09eecf51e0d2116ad7a3a8b0035fae413cb71af8 Mon Sep 17 00:00:00 2001 From: kcz358 <92624596+kcz358@users.noreply.github.com> Date: Sun, 3 Mar 2024 13:01:11 +0800 Subject: [PATCH] [Fix] refcocog dataset path, record gpt prompt in internal eval, build context issue (#59) * Refactor logging in lmms_eval package * Refactor variable names in lmms_eval package * Update README.md with new features and installation instructions * Update supported models and datasets * Delete otter.py file * Fix capitalization in README.md * Update image sizes and add new features * Refactor README.md to improve readability and add new features * Add description for lmms-eval in README.md * Update accelerator support in README.md * Update lmms-eval README with improved description and additional features * Update README.md with improved task grouping description * change `Otter-AI/MME` to `lmms-lab/MME` * Update README.md * Update README.md * Remove unused code in mme.yaml * Squashed commit of the following: commit 90fbf3d5e72936b7b90855caf183b88cf82a2aaa Author: Zhang Peiyuan Date: Thu Feb 29 13:40:02 2024 +0800 Dev/py add models (#57) * add instructblip * minicpm_v * remove from qwen-vl * speed up postprocessing * Optimize build context speed --------- Co-authored-by: Pu Fanyi Co-authored-by: kcz358 commit 0fa3bceb2cc4d36b3f4fd42cdf76d45466fbd03e Author: Pu Fanyi Date: Wed Feb 28 14:49:07 2024 +0800 Pufanyi/flickr30k refractor (#56) * refactor vizwizvqa task * Delete vqav2_test and vqav2_val YAML files * Refactor vqav2_process_results functions * Add a pack for vqav2 * refactor okvqa * roll back vizwiz_vqa * Fix exact_match calculation in ok_vqa_process_results * Update OKVQA dataset name in readme * add model_specific_prompt_kwargs * add model_specific_prompt_kwargs to vizwiz_vqa * add model_specific_prompt_kwargs for vqav2 * lint * fix a small bug for eval_logger * Refactor make_table function to display points as " - " if value is None * Merge commit 'c5e52a785d3cc87a866be9b880deb477d9f73fb7' * Refactor ok_vqa_aggreate_submissions function * Merge commit 'e5aa0a9601d6d8ce727315e4b0a8f13f06f26bff' * Refactor VQA submission file saving * Update file utils * Merge commit '560deca9f72483ca091795d6dc2537d4c54b32b0' * Refactor file path handling and submission generation * OKVQA path * vizwizvqa file * pack cmmmu * fix a small metric bug for cmmmu * Add higher_is_better flag to submission metric * Add CMMMU dataset to README.md * Add logging and refactor submission file generation in docvqa utils.py * pack docvqa * add traceback to print detailed error * Refactor docvqa_test_aggregate_results to accept additional arguments * Add metric check in evaluator.py and update test.yaml and val.yaml * add common `EvalAIAnswerProcessor` for okvqa, textvqa, vizwizvqa and vqav2 * merge textvqa * textvqa * Modify submission file generation for COCO test results * Update test result storage path * update coco cap file name * Update COCO 2017 Caption dataset name * ferret * Add Ferret dataset * Refactor hb_doc_to_text function to include model-specific prompts * Add IconQA and its subtasks * Refactor image list creation in doc_to_visual function * Add process_results function to default template * Update process_results function in iconqa utils.py * refactor flickr30k * change aggregation function * Fix formatting issues and update logging message * Fix llava can not handle only text question (no visuals) * Fix qwen can not handle no image question (no visuals) * Add fuyu prepare accelerator scripts * refactor mme * naming 
consistency * aggregation_submissions consistency * flickr30k naming consistency * remove submissions for mme * remove unused submission function * Refactor infovqa_test.yaml and infovqa_val.yaml * Refactor code for improved readability and maintainability * stvqa * remane sqa * Update lmms_eval textcaps files and utils.py * Update default prompt for text captions * Refactor textcaps_aggregation_result function * Add generate_submission_file function and update mathvista_aggregate_results signature * Update nocaps_test.yaml and nocaps_val.yaml * refractor internal_eval * Add internal evaluation datasets * pack multidocvqa * mmvet * Fix gpt eval timeout issue for hallubench, restore load from gpt to avoid re evaluating * Refractor llava wild * Refractor llava-bench-coco * Add JSON file generation for gpt evaluation details * mmmu * Remove MMBench English and Chinese tasks * Remove unnecessary return statement in mmbench_aggregate_test_results function * Fix distributed process group initialization * Update dataset paths and group names in mmbench test configs * Update import statements in cc_utils.py, cn_utils.py, and en_utils.py * Add torch module import * lint * Remove IconQA dataset from README.md * Add Multi-DocVQA and its submodules * Add new datasets and update task names * Refactor flickr_aggregation_result function to accept additional arguments * Add timeout kwargs in Accelerator constructor * Add encoding to be utf-8 for cmmmu * Fix llava try and catch, remove torch.distributed.init in main * Ds prepare script for llava --------- Co-authored-by: JvThunder Co-authored-by: kcz358 commit 0182d5d2bb45da31cd77f1fd2ffc273d71d8d495 Author: Li Bo Date: Tue Feb 27 22:52:07 2024 +0800 [Wandb Logger] add models, and args to wandb tables. (#55) * Refactor logging in lmms_eval package * Refactor variable names in lmms_eval package * add llava main in pyproject * Update README.md * Remove unnecessary dependencies and add specific version for llava_repr * Add dependencies for llava_repr*** * Update README.md * add some docs on models and command line commands * remove some lines * typo * Update model_guide.md * Update model_guide.md * Update README.md * Update README.md * Update README.md * Fix refcocog dataset path * Record gpt response in eval info * Resolve conflict * Fix hallusionbench gpt json saving path * Rename hallubench gpt output path * Change remove image to check by type instead of check by names * More robust check by type * Remove unnecessary img in data * Forcing an empty commit. 
* Testing * Delete unnecessary things * Fix seedbench2 image issue in doc_to_text * Add conditional exclude for internal eval * Fix small bugs in list_with_num * Revise list_with_num model args --------- Co-authored-by: Bo Li Co-authored-by: Fanyi Pu Co-authored-by: jzhang38 --- README.md | 115 ++++++++++++++---- docs/README.md | 12 ++ docs/commands.md | 24 ++++ docs/model_guide.md | 90 ++++++++++++++ .../models/otter.py => docs/task_guide.md | 0 example_eval.yaml | 13 +- lmms_eval/__main__.py | 4 +- lmms_eval/api/task.py | 31 +++-- lmms_eval/evaluator.py | 17 ++- lmms_eval/models/instructblip.py | 1 - lmms_eval/models/minicpm_v.py | 2 - .../tasks/hallusion_bench/evaluate_hb.py | 22 ++-- .../tasks/internal_eval/d170_cn_utils.py | 2 + .../tasks/internal_eval/d170_en_utils.py | 2 + .../tasks/internal_eval/dc200_cn_utils.py | 1 + lmms_eval/tasks/mmbench/cc_utils.py | 2 +- lmms_eval/tasks/mmbench/cn_utils.py | 2 +- lmms_eval/tasks/mmbench/en_utils.py | 2 +- lmms_eval/tasks/mme/mme.yaml | 4 +- lmms_eval/tasks/mme/mme_test.yaml | 2 +- .../tasks/refcocog/_default_template_seg_yaml | 2 +- lmms_eval/tasks/seedbench_2/utils.py | 2 +- pyproject.toml | 55 ++++++--- 23 files changed, 329 insertions(+), 78 deletions(-) create mode 100644 docs/README.md create mode 100644 docs/commands.md create mode 100644 docs/model_guide.md rename lmms_eval/models/otter.py => docs/task_guide.md (100%) diff --git a/README.md b/README.md index 561cf8a13..fc05233ba 100644 --- a/README.md +++ b/README.md @@ -1,34 +1,103 @@ -# lmms-eval +
-## How to run
+# Large-scale Multi-modality Models Evaluation Suite
+> Accelerating the development of large-scale multi-modality models (LMMs) with `lmms-eval`
+
+📚 [Documentation](docs/README.md) | 🤗 [Huggingface Datasets](https://huggingface.co/lmms-lab)
+
+# Announcement
+
+## v0.1.0 Released
+
+The first version of `lmms-eval` has been released. We are working on providing a one-command evaluation API to accelerate the development of LMMs.
+
+> In [LLaVA Next](https://llava-vl.github.io/blog/2024-01-30-llava-next/) development, we internally used this API to evaluate the model's performance across model versions and datasets. It significantly accelerates the model development cycle thanks to its easy integration and fast evaluation speed. The main features include:
+
+### One-command evaluation, with detailed logs and samples.
+You can evaluate models on multiple datasets with a single command. No model or data preparation is needed: one command line and a few minutes get you the results. You get not just a result number but also detailed logs and samples, including the model args, input question, model response, and ground-truth answer.
+
+### Accelerator support and task grouping.
+We support using `accelerate` to wrap the model for distributed evaluation, covering multi-GPU and tensor parallelism. With **Task Grouping**, all instances from all tasks are grouped and evaluated in parallel, which significantly improves evaluation throughput.
+
+### Efficiency benchmark
+Below are the total runtimes on different datasets using 4 x A100 40G.
+
+|Dataset|LLaVA-v1.5-7b|LLaVA-v1.5-13b|
+|-------|-------------|--------------|
+|mme | 2 mins 43 seconds | 3 mins 27 seconds |
+|gqa | 10 mins 43 seconds | 14 mins 23 seconds |
+|scienceqa_img| 1 min 58 seconds | 2 mins 52 seconds |
+|ai2d | 3 mins 17 seconds | 4 mins 12 seconds |
+|coco2017_cap_val| 14 mins 13 seconds | 19 mins 58 seconds |
+
+### Prepared HF datasets.
+We host more than 40 (and counting) datasets on [huggingface/lmms-lab](https://huggingface.co/lmms-lab). We carefully converted these datasets from their original sources and included all variants, versions, and splits, so they can now be accessed directly without any data-preprocessing burden. They also help with visualizing the data and getting a sense of how the evaluation tasks are distributed.
+
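+For example, any of the hosted datasets can be pulled and inspected directly. Below is a minimal sketch, assuming the `datasets` library is installed and that you have run `huggingface-cli login` for gated datasets; the dataset name is taken from the MME task config, and the `test` split name is an assumption that may differ for other datasets.
+
+```python
+# Minimal sketch: inspect a hosted lmms-lab evaluation dataset straight from the Hub.
+from datasets import load_dataset
+
+mme = load_dataset("lmms-lab/MME", split="test")  # split name is an assumption
+print(mme.features)  # column names and types, including the image column
+print(mme[0])        # one example as a plain Python dict
+```
+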
+
+### Detailed YAML task configuration
+Including prompt pre-processing, output post-processing, answer extraction, model-specific args, and more.
+
+### Reproducible results (for LLaVA series models) and logging utilities.
+We provide a set of pre-defined configurations & environments for llava-1.5, which can be used directly to reproduce the results in the paper.
+
+With `lmms-eval`, all evaluation details are recorded, including logged samples and results, and report tables are generated both in the terminal output and in Weights & Biases Runs/Tables.
+
+> Development continues on the main branch, and we encourage you to give us feedback on desired features and further improvements, or to ask questions, in issues or PRs on GitHub.
+
+# Installation
+
+For regular usage, you can install the package from PyPI by running the following command:
```bash
-pip install -e .
+pip install lmms-eval
```
+For development, you can install the package by cloning the repository and running the following command:
```bash
-accelerate launch --num_processes=8 -m lmms_eval --model llava --model_args pretrained="liuhaotian/llava-v1.5-13b" --tasks mme --batch_size 1 --log_samples --log_samples_suffix debug --output_path ./logs/ # Eactly reproduce llava results
-accelerate launch --num_processes=8 -m lmms_eval --config example_eval.yaml # Eactly reproduce llava results
+git clone https://github.com/EvolvingLMMs-Lab/lmms-eval
+cd lmms-eval
+pip install -e .
+```
+
+If you want to test LLaVA, you will have to clone its repo from [LLaVA](https://github.com/haotian-liu/LLaVA) and install it:
+```
+git clone https://github.com/haotian-liu/LLaVA
+cd LLaVA
+pip install -e .
```
-## Current models
-- GPT4V (API)
-  - generation-based evaluation
+If you want to test on caption datasets such as `coco`, `refcoco`, and `nocaps`, you will need `java==1.8.0` for the pycocoeval API to work. If you don't have it, you can install it with conda:
+```
+conda install openjdk=8
+```
+You can then check your Java version with `java -version`.
-- LLaVA-v1.5/v1.6-7B/13B/34B
-  - generation-based evaluation
-  - perplexity-based evaluation
+
+# Usage
+```bash
+# Evaluating LLaVA on MME
+accelerate launch --num_processes=8 -m lmms_eval --model llava --model_args pretrained="liuhaotian/llava-v1.5-7b" --tasks mme --batch_size 1 --log_samples --log_samples_suffix llava_v1.5_mme --output_path ./logs/
-- Qwen-VL
-- Fuyu/OtterHD
+
+# Evaluating LLaVA on multiple datasets
+accelerate launch --num_processes=8 -m lmms_eval --model llava --model_args pretrained="liuhaotian/llava-v1.5-7b" --tasks mme,mmbench_en --batch_size 1 --log_samples --log_samples_suffix llava_v1.5_mme_mmbenchen --output_path ./logs/
-## Models to be added
+
+# From a predefined configuration, supporting evaluation of multiple models and datasets
+accelerate launch --num_processes=8 -m lmms_eval --config example_eval.yaml
+```
+
+## Supported models
+
+- GPT4V (API, only generation-based evaluation)
+- LLaVA-v1.5/v1.6-7B/13B/34B (ppl-based, generation-based)
+- Qwen-VL series (ppl-based, generation-based)
+- Fuyu series (ppl-based, generation-based)
+- InstructBLIP series (generation-based)
-- InstructBLIP
-- Emu
-- CogVLM
+
+## Supported datasets
+> The name in parentheses is the task name in `lmms_eval`; the task name is also used to specify the dataset in the configuration file.
-## Current datasets
 - AI2D (ai2d)
 - ChartQA (chartqa)
 - CMMMU (cmmmu)
@@ -134,12 +203,16 @@ accelerate launch --num_processes=8 -m lmms_eval --config example_eval.yaml # Ea
 - IconQA (iconqa)
 - VistBench (vistbench)
+# Add Customized Model and Dataset
+
+Please refer to our [documentation](docs/README.md).
-## Acknowledgement
+# Acknowledgement
-The API, togegher with many code blocks of this project come from [lm-eval-harness](https://github.com/EleutherAI/lm-evaluation-harness). **Please read through the [docs of lm-eval-harness](https://github.com/EleutherAI/lm-evaluation-harness/tree/main/docs) before contributing to this project**. Please do not commit to this project directly. Instead, push your changes to another branch and create a pull request.
+The API, together with many code blocks of this project, comes from [lm-eval-harness](https://github.com/EleutherAI/lm-evaluation-harness). We recommend reading through the [docs of lm-eval-harness](https://github.com/EleutherAI/lm-evaluation-harness/tree/main/docs) for relevant information. Below are the changes we made to the original API:
+- Build context now only passes in the doc `idx`; the image and doc are processed during the model responding phase. This is because the datasets now contain lots of images, and we can't store them in the doc like the original lm-eval-harness does, otherwise memory would explode.
 - Instance.args (lmms_eval/api/instance.py) now contains a list of images to be inputted to lmms.
-lm-eval-harness supports all HF LMM as single model class. Currently this is not possible of lmms because the input/output format of lmms in HF are not yet unified. Thererfore, we have to create a new class for each lmms model. This is not ideal and we will try to unify them in the future.
+- lm-eval-harness supports all HF language models as a single model class. This is currently not possible for lmms because the input/output formats of lmms in HF are not yet unified. Therefore, we have to create a new class for each lmms model. This is not ideal and we will try to unify them in the future.
diff --git a/docs/README.md b/docs/README.md new file mode 100644 index 000000000..522407bc7 --- /dev/null +++ b/docs/README.md @@ -0,0 +1,12 @@
+# LMMs Eval Documentation
+
+Welcome to the docs for `lmms-eval`!
+
+The majority of this documentation is adapted from [lm-eval-harness](https://github.com/EleutherAI/lm-evaluation-harness/).
+
+## Table of Contents
+
+* To learn about the command-line flags, see the [commands](https://github.com/EvolvingLMMs-Lab/lmms-eval/tree/main/docs/commands.md).
+* To learn how to add a new model, see the [Model Guide](https://github.com/EvolvingLMMs-Lab/lmms-eval/tree/main/docs/model_guide.md).
+* For a crash course on adding new tasks to the library, see our [New Task Guide](https://github.com/EvolvingLMMs-Lab/lmms-eval/tree/main/docs/new_task_guide.md).
+* To learn more about pushing the limits of task configuration that the Eval Harness supports, see the [Task Configuration Guide](https://github.com/EvolvingLMMs-Lab/lmms-eval/tree/main/docs/task_guide.md).
diff --git a/docs/commands.md b/docs/commands.md new file mode 100644 index 000000000..f5ebf0b61 --- /dev/null +++ b/docs/commands.md @@ -0,0 +1,24 @@
+# User Guide
+This document details the interface exposed by `lmms_eval` and the command-line flags available to users.
+
+## Command-line Interface
+
+The library can be run via the `lmms_eval` entrypoint at the command line.
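+
+For example, a single-process run might look like the following. This is a minimal sketch mirroring the usage examples in the README; for multi-GPU evaluation, wrap the same flags with `accelerate launch` as shown there.
+
+```bash
+# Minimal sketch of the command-line entrypoint; the flags are described below.
+lmms_eval --model llava \
+    --model_args pretrained="liuhaotian/llava-v1.5-7b" \
+    --tasks mme \
+    --batch_size 1 \
+    --log_samples \
+    --log_samples_suffix llava_v1.5_mme \
+    --output_path ./logs/
+```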
+
+This mode supports a number of command-line arguments, the details of which can also be seen by running with `-h` or `--help`:
+
+* `--model` : Selects which model type or provider is evaluated. Must be a model registered under `lmms_eval/models`. For example, `--model qwen_vl` or `--model llava`.
+
+* `--model_args` : Controls parameters passed to the model constructor. Accepts a string containing comma-separated keyword arguments to the model class of the format `"arg1=val1,arg2=val2,..."`, for example `--model_args pretrained=liuhaotian/llava-v1.5-7b,batch_size=1`. For a full list of supported keyword arguments, see the initialization of the corresponding model class in `lmms_eval/models/`.
+
+* `--tasks` : Determines which tasks or task groups are evaluated. Accepts a comma-separated list of task names or task group names. Must be solely comprised of valid tasks/groups.
+
+* `--batch_size` : Sets the batch size used for evaluation. Can be a positive integer or `"auto"` to automatically select the largest batch size that will fit in memory, speeding up evaluation. One can pass `--batch_size auto:N` to re-select the maximum batch size `N` times during evaluation. This can help accelerate evaluation further, since `lmms-eval` sorts documents in descending order of context length.
+
+* `--output_path` : A string of the form `dir/file.jsonl` or `dir/`. Provides a path where high-level results will be saved, either into the file named or into the directory named. If `--log_samples` is passed as well, then per-document outputs and metrics will be saved into that directory as well.
+
+* `--log_samples` : If this flag is passed, then the model's outputs, and the text fed into the model, will be saved at per-document granularity. Must be used with `--output_path`.
+
+* `--limit` : Accepts an integer, or a float between 0.0 and 1.0. If passed, limits the number of documents to evaluate to the first X documents (if an integer) per task or the first X% of documents per task. Useful for debugging, especially on costly API models.
+
diff --git a/docs/model_guide.md b/docs/model_guide.md new file mode 100644 index 000000000..13ae8caf7 --- /dev/null +++ b/docs/model_guide.md @@ -0,0 +1,90 @@
+# New Model Guide
+In order to properly evaluate a given LM, we require the implementation of a wrapper class subclassing the `lmms_eval.api.model.lmms` class, which defines how `lmms_eval` should interface with your model. This guide walks through how to write this `lmms` subclass and add it to the library.
+
+## Setup
+
+To get started contributing, go ahead and fork the main repo, clone it, create a branch with the name of your model, and install the project requirements in your environment:
+
+```sh
+# After forking...
+git clone https://github.com/<your-github-username>/lmms-eval.git
+cd lmms-eval
+git checkout -b <my-new-model>
+pip install -e .
+```
+
+Now, we'll create a new file where we'll be adding our model:
+
+```sh
+touch lmms_eval/models/<my_model_filename>.py
+```
+
+As a rule of thumb, we recommend using `lmms_eval/models/qwen_vl.py` and `lmms_eval/models/instructblip.py` as reference implementations for your model. You can copy and paste the contents of one of these files into your new file to get started.
+
+**Tip: this filename should not shadow package names! For example, naming your file `anthropic.py` is disallowed since the API's name on PyPI is `anthropic`, but naming it `anthropic_llms.py` works with no problems.**
+
+## Interface
+
+All models must subclass the `lmms_eval.api.model.lmms` class.
+
+The `lmms` class enforces a common interface via which we can extract responses from a model:
+
+```python
+class MyCustomLM(lmms):
+    #...
+    def loglikelihood(self, requests: list[Instance]) -> list[tuple[float, bool]]:
+        #...
+
+
+    def loglikelihood_rolling(self, requests: list[Instance]) -> list[tuple[float, bool]]:
+        #...
+
+
+    def generate_until(self, requests: list[Instance]) -> list[str]:
+        #...
+    #...
+```
+Here `Instance` is a dataclass defined in [`lmms_eval.api.instance`](https://github.com/EvolvingLMMs-Lab/lmms-eval/tree/main/lmms_eval/api/instance.py) with a property `args` whose request-dependent type signature is described below.
+
+We support three types of requests, consisting of different interactions / measurements with an autoregressive LM.
+
+All three request types take as input `requests` of type `list[Instance]` whose `Instance.request_type` matches the method name. Overall, you can check [construct_requests](https://github.com/EvolvingLMMs-Lab/lmms-eval/blob/main/lmms_eval/api/task.py#L918) to see how the arguments are constructed for the different output types.
+
+- `generate_until`
+  - Each request contains `Instance.args : Tuple[str, dict]` containing 1. an input string to the LM and 2. a dictionary of keyword arguments used to control generation parameters.
+  - In each `Instance.args` there will be 6 elements: `contexts, all_gen_kwargs, doc_to_visual, doc_id, task, split`. `contexts` refers to the formatted question and is the text input for the LMM. Sometimes it may contain image tokens, which need to be handled differently for different models. `all_gen_kwargs` is the dict that contains all the generation configuration for the model. We use `doc_id`, `task`, and `split` to access the dataset, and you can then use `doc_to_visual`, a function reference, to process the image. When you implement your own model, you should use these to write your own `generate_until` method.
  - Using this input and these generation parameters, text will be sampled from the language model (typically until a maximum output length or specific stopping string sequences--for example, `{"until": ["\n\n", "."], "max_gen_toks": 128}`).
  - The generated input+output text from the model will then be returned.
+
+- `loglikelihood`
+  - Each request contains `Instance.args : Tuple[str, str]` containing 1. an input string to the LM and 2. a target string on which the loglikelihood of the LM producing this target, conditioned on the input, will be returned.
+  - In each `Instance.args` there will be 6 elements: `contexts, doc_to_target, doc_to_visual, doc_id, task, split`. `contexts` refers to the formatted question and is the text input for the LMM. Sometimes it may contain image tokens, which need to be handled differently for different models. `doc_to_target` is a function reference that gets the answer from the doc. The answer is the continuation, and only the tokens belonging to it should be counted toward the loglikelihood.
+  - Each request will have, as result, `(ll, is_greedy): Tuple[float, int]` returned, where `ll` is a floating point number representing the log probability of generating the target string conditioned on the input, and `is_greedy` is either `0` or `1`, with it being `1` if and only if the target string *would be generated by greedy sampling from the LM* (that is, if the target string is the *most likely* N-token string to be output by the LM given the input).
+
+- `loglikelihood_rolling`
+  - Each request contains `Instance.args : Tuple[str]`, which is an input string to the model whose *entire* loglikelihood, conditioned on purely the EOT token, will be calculated.
+  - This is used to evaluate *perplexity* on a data distribution.
+  - It should return `(ll,) : Tuple[float]`, i.e. solely the *loglikelihood* of producing each piece of text given no starting input.
+
+
+## Registration
+
+Congrats on implementing your model! Now it's time to test it out.
+
+To make your model usable via the command-line interface of `lmms_eval`, you'll need to tell `lmms_eval` what your model's name is.
+
+This is done via a *decorator*, `lmms_eval.api.registry.register_model`. Using `register_model()`, one can both tell the package which name(s) the model can be invoked with (e.g. `python -m lmms_eval --model <model-name>`) and alert `lmms_eval` to the model's existence.
+
+```python
+from lmms_eval.api.registry import register_model
+
+@register_model("<name1>", "<name2>")
+class MyCustomLM(lmms):
+```
+
+The final step is to import your model in `lmms_eval/models/__init__.py`:
+```python
+from .my_model_filename import MyCustomLM
+```
diff --git a/lmms_eval/models/otter.py b/docs/task_guide.md similarity index 100% rename from lmms_eval/models/otter.py rename to docs/task_guide.md diff --git a/example_eval.yaml b/example_eval.yaml index fa0a4d02a..40e29a85d 100644 --- a/example_eval.yaml +++ b/example_eval.yaml @@ -1,8 +1,15 @@ - model: llava model_args: pretrained=liuhaotian/llava-v1.5-7b - tasks: vizwiz_vqa + tasks: ai2d batch_size: 1 log_samples: true - log_samples_suffix: debug + log_samples_suffix: eval_vizwiz_vqa + output_path: "./logs/" + +- model: llava + model_args: pretrained=liuhaotian/llava-v1.5-13b + tasks: mme + batch_size: 1 + log_samples: true + log_samples_suffix: mme output_path: "./logs/" - limit: 8 \ No newline at end of file diff --git a/lmms_eval/__main__.py b/lmms_eval/__main__.py index 5f95cf96e..9f005109b 100644 --- a/lmms_eval/__main__.py +++ b/lmms_eval/__main__.py @@ -244,14 +244,14 @@ def cli_evaluate_single(args: Union[argparse.Namespace, None] = None) -> None: "\n" + "=" * 70 + "\n" + "\n\tYou are trying to check all the numbers in each task." + "\n\tThis action will download the complete dataset." + "\n\tIf the results are not clear initially, call this again."
+ "\n\n" + "=" * 70 ) eval_logger.info(log_message) - task_dict = get_task_dict([task for task in sorted(ALL_TASKS)], model_name=args.model) + task_dict = get_task_dict([task for task in sorted(ALL_TASKS)], model_name="llava") for task_name in task_dict.keys(): task_obj = task_dict[task_name] if type(task_obj) == tuple: group, task_obj = task_obj if task_obj is None: continue - eval_logger.info(f"\nTask : {task_obj.config.task}\n - #num : {len(task_obj.test_docs()) if task_obj.has_test_docs() else task_obj.validation_docs()}") + eval_logger.info(f"\nTask : {task_obj.config.task}\n - #num : {len(task_obj.test_docs()) if task_obj.has_test_docs() else len(task_obj.validation_docs())}") sys.exit() else: tasks_list = args.tasks.split(",") diff --git a/lmms_eval/api/task.py b/lmms_eval/api/task.py index 771209246..db3cfd1e9 100644 --- a/lmms_eval/api/task.py +++ b/lmms_eval/api/task.py @@ -10,6 +10,7 @@ from tqdm import tqdm import datasets +from datasets import Image, Sequence import numpy as np from PIL import ImageFile @@ -247,11 +248,16 @@ def download(self, data_dir=None, cache_dir=None, download_mode=None) -> None: download_mode=download_mode, ) for doc_name in self.dataset_no_image: - column_names = self.dataset_no_image[doc_name].column_names - image_column = [col for col in column_names if "image" in col.lower()] - # remove image column from docs - if image_column: - self.dataset_no_image[doc_name] = self.dataset_no_image[doc_name].remove_columns(image_column) + remove_cols = [] + features = self.dataset_no_image[doc_name].features + # If it is an Image instance or a Sequence of Image instance. Remove it + for feature in features: + if isinstance(features[feature], Image): + remove_cols.append(feature) + elif isinstance(features[feature], Sequence) and isinstance(features[feature].feature, Image): + remove_cols.append(feature) + for remove_col in remove_cols: + self.dataset_no_image[doc_name] = self.dataset_no_image[doc_name].remove_columns(remove_col) @property def config(self): @@ -694,11 +700,16 @@ def download(self, dataset_kwargs=None) -> None: **dataset_kwargs if dataset_kwargs is not None else {}, ) for doc_name in self.dataset_no_image: - column_names = self.dataset_no_image[doc_name].column_names - image_column = [col for col in column_names if "image" in col.lower()] - # remove image column from docs - if image_column: - self.dataset_no_image[doc_name] = self.dataset_no_image[doc_name].remove_columns(image_column) + remove_cols = [] + features = self.dataset_no_image[doc_name].features + # If it is an Image instance or a Sequence of Image instance. Remove it + for feature in features: + if isinstance(features[feature], Image): + remove_cols.append(feature) + elif isinstance(features[feature], Sequence) and isinstance(features[feature].feature, Image): + remove_cols.append(feature) + for remove_col in remove_cols: + self.dataset_no_image[doc_name] = self.dataset_no_image[doc_name].remove_columns(remove_col) def has_training_docs(self) -> bool: if self.config.training_split is not None: diff --git a/lmms_eval/evaluator.py b/lmms_eval/evaluator.py index 7d6174dc3..c3100dcae 100644 --- a/lmms_eval/evaluator.py +++ b/lmms_eval/evaluator.py @@ -9,6 +9,7 @@ import torch import logging import numpy as np +from datasets import Image, Sequence import lmms_eval.api import lmms_eval.tasks @@ -317,11 +318,17 @@ def evaluate( # hack: remove image columns to speed avoid loading images and speed up postprocessing # reason: doc_iterator will actually load image if it's in the doc. 
docs = task.test_docs() if task.has_test_docs() else task.validation_docs() - column_names = docs.column_names - image_column = [col for col in column_names if "image" in col.lower()] - # remove image column from docs - if image_column: - docs = docs.remove_columns(image_column) + if "d170" not in task_name or "dc100" not in task_name or "dc200" not in task_name: + remove_cols = [] + features = docs.features + # If it is an Image instance or a Sequence of Image instance. Remove it + for feature in features: + if isinstance(features[feature], Image): + remove_cols.append(feature) + elif isinstance(features[feature], Sequence) and isinstance(features[feature].feature, Image): + remove_cols.append(feature) + if remove_cols: + docs = docs.remove_columns(remove_cols) doc_iterator = itertools.islice(enumerate(docs), lm.rank, limit, lm.world_size) # Instead of converting the iterator to a list, use `itertools.tee` to create a parallel iterator for counting # doc_iterator, doc_iterator_for_counting = itertools.tee(doc_iterator) diff --git a/lmms_eval/models/instructblip.py b/lmms_eval/models/instructblip.py index 1ad562070..7086f346f 100644 --- a/lmms_eval/models/instructblip.py +++ b/lmms_eval/models/instructblip.py @@ -107,7 +107,6 @@ def eot_token_id(self): def max_length(self): return self._max_length - @property def batch_size(self): return self.batch_size_per_gpu diff --git a/lmms_eval/models/minicpm_v.py b/lmms_eval/models/minicpm_v.py index 482ef3016..1838b56f8 100644 --- a/lmms_eval/models/minicpm_v.py +++ b/lmms_eval/models/minicpm_v.py @@ -11,7 +11,6 @@ from transformers import AutoModel, AutoTokenizer - import warnings warnings.filterwarnings("ignore") @@ -105,7 +104,6 @@ def eot_token_id(self): def max_length(self): return self._max_length - @property def batch_size(self): return self.batch_size_per_gpu diff --git a/lmms_eval/tasks/hallusion_bench/evaluate_hb.py b/lmms_eval/tasks/hallusion_bench/evaluate_hb.py index ea6b193ec..87c65519d 100644 --- a/lmms_eval/tasks/hallusion_bench/evaluate_hb.py +++ b/lmms_eval/tasks/hallusion_bench/evaluate_hb.py @@ -6,8 +6,6 @@ from lmms_eval.tasks.hallusion_bench.utils import evaluate_by_chatgpt, check_same_by_chatgpt, assign_correctness, get_eval_all, get_eval_fig, get_eval_pair_all cur_dir = os.path.dirname(os.path.abspath(__file__)) -save_json_path_vd = f"{cur_dir}/hallusion_output_vd_model.json" -save_json_path_vs = f"{cur_dir}/hallusion_output_vs_model.json" output_entry = "model_prediction" correctness_entry = "gpt4v_output_gpt_check" @@ -30,12 +28,12 @@ def hb_doc_to_visual(doc): def hb_process_results(doc, result): sample = doc - doc.pop("image") + # doc.pop("image") sample["model_prediction"] = result[0] return {k: sample for k in metric} -def hb_aggregation_result(results, metric): +def hb_aggregation_result(results, metric, args): data_vd = [] data_vs = [] for data in tqdm(results, desc="Split vd and vs"): @@ -44,6 +42,10 @@ def hb_aggregation_result(results, metric): if data["category"] == "VS": data_vs.append(data) eval_logger.info("Do gpt eval vd ...") + path = os.path.join(args.output_path, "gpt_response") + os.makedirs(path, exist_ok=True) + save_json_path_vd = f"{path}/hallusion_output_vd_model.json" + save_json_path_vs = f"{path}/hallusion_output_vs_model.json" data_vd = evaluate_by_chatgpt(data_vd, output_entry=output_entry, correctness_entry=correctness_entry, load_json=True, save_json_path=save_json_path_vd) # data_vd = check_same_by_chatgpt(data_vd, output_entry=output_entry, load_json=True, save_json_path=save_json_path_vd) 
data_vd = assign_correctness(data_vd, correctness_entry=correctness_entry) @@ -64,16 +66,16 @@ def hb_aggregation_result(results, metric): return round(100 * all_data["correct"] / all_data["total"], 4) -def hb_aggregation_result_qAcc(results): - return hb_aggregation_result(results, "qAcc") +def hb_aggregation_result_qAcc(results, args): + return hb_aggregation_result(results, "qAcc", args) -def hb_aggregation_result_fAcc(results): - return hb_aggregation_result(results, "fAcc") +def hb_aggregation_result_fAcc(results, args): + return hb_aggregation_result(results, "fAcc", args) -def hb_aggregation_result_aAcc(results): - return hb_aggregation_result(results, "aAcc") +def hb_aggregation_result_aAcc(results, args): + return hb_aggregation_result(results, "aAcc", args) def hb_aggregation_result_intern(results, metric): diff --git a/lmms_eval/tasks/internal_eval/d170_cn_utils.py b/lmms_eval/tasks/internal_eval/d170_cn_utils.py index 15a797229..77df3260d 100644 --- a/lmms_eval/tasks/internal_eval/d170_cn_utils.py +++ b/lmms_eval/tasks/internal_eval/d170_cn_utils.py @@ -110,6 +110,8 @@ def process_results(doc, results): "prediction": pred, "ground_truth": answer, "eval_model": model_name, + "prompt" : gpt_query_prompt, + "response" : response }, "gpt_eval_avg_score": { "score": score, diff --git a/lmms_eval/tasks/internal_eval/d170_en_utils.py b/lmms_eval/tasks/internal_eval/d170_en_utils.py index 99aa5d63f..c2d252913 100644 --- a/lmms_eval/tasks/internal_eval/d170_en_utils.py +++ b/lmms_eval/tasks/internal_eval/d170_en_utils.py @@ -110,6 +110,8 @@ def process_results(doc, results): "prediction": pred, "ground_truth": answer, "eval_model": model_name, + "prompt" : gpt_query_prompt, + "response" : response }, "gpt_eval_avg_score": { "score": score, diff --git a/lmms_eval/tasks/internal_eval/dc200_cn_utils.py b/lmms_eval/tasks/internal_eval/dc200_cn_utils.py index 961b81d32..6680fdcfb 100644 --- a/lmms_eval/tasks/internal_eval/dc200_cn_utils.py +++ b/lmms_eval/tasks/internal_eval/dc200_cn_utils.py @@ -110,6 +110,7 @@ def process_results(doc, results): "explanation": response, "eval_model": GPT_EVAL_MODEL_NAME, "score": score, + "prompt" : prompt }, "gpt_eval_avg_score": { "score": score, diff --git a/lmms_eval/tasks/mmbench/cc_utils.py b/lmms_eval/tasks/mmbench/cc_utils.py index bc5761d01..379f103ee 100644 --- a/lmms_eval/tasks/mmbench/cc_utils.py +++ b/lmms_eval/tasks/mmbench/cc_utils.py @@ -30,7 +30,7 @@ def mmbench_cn_cc_doc_to_text(doc, model_specific_prompt_kwargs=None): options_prompt, options_dict = mmbench_evaluator.create_options_prompt(doc, option_candidate) data = { - "img": doc["image"], + # "img": doc["image"], "question": doc["question"], "answer": doc.get("answer", None), "options": options_prompt, diff --git a/lmms_eval/tasks/mmbench/cn_utils.py b/lmms_eval/tasks/mmbench/cn_utils.py index 5cc7b91bb..85cc8bb94 100644 --- a/lmms_eval/tasks/mmbench/cn_utils.py +++ b/lmms_eval/tasks/mmbench/cn_utils.py @@ -31,7 +31,7 @@ def mmbench_doc_to_text(doc, model_specific_prompt_kwargs=None): options_prompt, options_dict = mmbench_evaluator.create_options_prompt(doc, option_candidate) data = { - "img": doc["image"], + # "img": doc["image"], "question": doc["question"], "answer": doc.get("answer", None), "options": options_prompt, diff --git a/lmms_eval/tasks/mmbench/en_utils.py b/lmms_eval/tasks/mmbench/en_utils.py index 1bf9235b6..8453ffa56 100644 --- a/lmms_eval/tasks/mmbench/en_utils.py +++ b/lmms_eval/tasks/mmbench/en_utils.py @@ -31,7 +31,7 @@ def mmbench_doc_to_text(doc, 
model_specific_prompt_kwargs=None): options_prompt, options_dict = mmbench_evaluator.create_options_prompt(doc, option_candidate) data = { - "img": doc["image"], + # "img": doc["image"], "question": doc["question"], "answer": doc.get("answer", None), "options": options_prompt, diff --git a/lmms_eval/tasks/mme/mme.yaml b/lmms_eval/tasks/mme/mme.yaml index cf49702d2..504e6dd02 100644 --- a/lmms_eval/tasks/mme/mme.yaml +++ b/lmms_eval/tasks/mme/mme.yaml @@ -1,4 +1,4 @@ -dataset_path: Otter-AI/MME +dataset_path: lmms-lab/MME dataset_kwargs: token: True task: "mme" @@ -32,6 +32,6 @@ model_specific_prompt_kwargs: post_prompt: " Answer:" otterhd: pre_prompt: "" - post_prompt: "" + post_prompt: " Answer:" metadata: - version: 0.0 diff --git a/lmms_eval/tasks/mme/mme_test.yaml b/lmms_eval/tasks/mme/mme_test.yaml index 335a4f7a6..c529cf83a 100644 --- a/lmms_eval/tasks/mme/mme_test.yaml +++ b/lmms_eval/tasks/mme/mme_test.yaml @@ -1,4 +1,4 @@ -dataset_path: Otter-AI/MME +dataset_path: lmms-lab/MME dataset_kwargs: token: True task: "mme_test" diff --git a/lmms_eval/tasks/refcocog/_default_template_seg_yaml b/lmms_eval/tasks/refcocog/_default_template_seg_yaml index 854565417..a3a291bd6 100644 --- a/lmms_eval/tasks/refcocog/_default_template_seg_yaml +++ b/lmms_eval/tasks/refcocog/_default_template_seg_yaml @@ -1,4 +1,4 @@ -dataset_path: lmms-lab/RefCOCO +dataset_path: lmms-lab/RefCOCOg output_type: generate_until doc_to_visual: !function utils.refcoco_seg_doc_to_visual doc_to_text: !function utils.refcoco_doc_to_text diff --git a/lmms_eval/tasks/seedbench_2/utils.py b/lmms_eval/tasks/seedbench_2/utils.py index c5a6ec8ab..af8d8571f 100644 --- a/lmms_eval/tasks/seedbench_2/utils.py +++ b/lmms_eval/tasks/seedbench_2/utils.py @@ -19,7 +19,7 @@ def seed_doc_to_text(doc, model_specific_kwargs=None): question += f"C. {parse_choice_img(doc['choice_c'], model_specific_kwargs['img_token'])}\n" question += f"D. 
{parse_choice_img(doc['choice_d'], model_specific_kwargs['img_token'])}" if doc["data_type"] == "Image Generation": - num_img_in_question = len(doc["image"]) - 4 + num_img_in_question = len(doc["data_id"]) - 4 prepend_tokens = [model_specific_kwargs["img_token"]] * num_img_in_question question = " ".join(prepend_tokens) + "\n" + question return f"{question}\n{model_specific_kwargs['post_prompt']}" diff --git a/pyproject.toml b/pyproject.toml index 4017a61a4..05043fd1a 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -24,7 +24,6 @@ dependencies = [ "accelerate>=0.21.0", "black==24.1.0", "datasets==2.16.1", - "evaluate", "evaluate>=0.4.0", "jsonlines", "numexpr", @@ -35,7 +34,7 @@ dependencies = [ "sacrebleu>=1.5.0", "scikit-learn>=0.24.1", "sqlitedict", - "torch>=1.8", + "torch>=1.8", # Note the version specification here for torch "openai>=1.0.0", "pycocoevalcap", "tqdm-multiprocess", @@ -54,6 +53,44 @@ dependencies = [ "transformers-stream-generator", "tiktoken", "pre-commit", + "llava@git+https://github.com/haotian-liu/LLaVA", +] + +[project.optional-dependencies] +llava_repr = [ + "accelerate>=0.21.0", + "black==24.1.0", + "datasets==2.16.1", + "evaluate>=0.4.0", + "jsonlines", + "numexpr", + "peft>=0.2.0", + "pybind11>=2.6.2", + "pytablewriter", + "rouge-score>=0.0.4", + "sacrebleu>=1.5.0", + "scikit-learn>=0.24.1", + "sqlitedict", + "openai>=1.0.0", + "pycocoevalcap", + "tqdm-multiprocess", + "transformers>=4.36.2", + "zstandard", + "pillow", + "pyyaml", + "sympy", + "mpmath", + "Jinja2", + "openpyxl", + "Levenshtein", + "hf_transfer", + "tenacity", + "wandb>=0.16.0", + "transformers-stream-generator", + "tiktoken", + "pre-commit", + "torch==2.0.1", # Specific version for llava_repr + "llava@git+https://github.com/haotian-liu/LLaVA", ] [tool.setuptools.packages.find] @@ -70,17 +107,3 @@ lmms_eval = "lmms_eval.__main__:cli_evaluate" [project.urls] Homepage = "https://github.com/EvolvingLMMs-Lab/lmms-eval" Repository = "https://github.com/EvolvingLMMs-Lab/lmms-eval" - -[project.optional-dependencies] -dev = ["black", "flake8", "pre-commit", "pytest", "pytest-cov"] -linting = ["flake8", "pylint", "mypy", "pre-commit"] -testing = ["pytest", "pytest-cov", "pytest-xdist"] -multilingual = ["nagisa>=0.2.7", "jieba>=0.42.1", "pycountry"] -math = ["sympy>=1.12", "antlr4-python3-runtime==4.11"] -sentencepiece = ["sentencepiece>=0.1.98", "protobuf>=4.22.1"] -gptq = ["auto-gptq[triton] @ git+https://github.com/PanQiWei/AutoGPTQ"] -anthropic = ["anthropic"] -openai = ["openai==1.3.9"] -vllm = ["vllm"] -ifeval = ["langdetect", "immutabledict"] -all = ["lmms_eval[dev]", "lmms_eval[testing]", "lmms_eval[linting]"]
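
A usage note on the `llava_repr` extra introduced in the `pyproject.toml` change above: once the repository is checked out, the pinned reproducibility environment can be installed via the extra's name. This is a minimal sketch; the extra name comes from the change above, and whether the published PyPI release exposes the same extra is an assumption.

```bash
# Install the package in editable mode together with the pinned `llava_repr`
# extra defined in pyproject.toml (used to reproduce the LLaVA results).
pip install -e ".[llava_repr]"

# Or, from PyPI, assuming the release includes the same extra:
pip install "lmms-eval[llava_repr]"
```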