diff --git a/README.md b/README.md
index 561cf8a13..fc05233ba 100644
--- a/README.md
+++ b/README.md
@@ -1,34 +1,103 @@
-# lmms-eval
+
+
+
-## How to run
+# Large-scale Multi-modality Models Evaluation Suite
+> Accelerating the development of large-scale multi-modality models (LMMs) with `lmms-eval`
+
+📚 [Documentation](docs/README.md) | 🤗 [Huggingface Datasets](https://huggingface.co/lmms-lab)
+
+# Announcement
+
+## v0.1.0 Released
+
+The first version of `lmms-eval` is released. We are working on providing a one-command evaluation API to accelerate the development of LMMs.
+
+> In [LLaVA Next](https://llava-vl.github.io/blog/2024-01-30-llava-next/) development, we internally used this API to evaluate model performance across model versions and datasets. It significantly accelerated the model development cycle thanks to its easy integration and fast evaluation speed. The main features include:
+
+### One-command evaluation, with detailed logs and samples.
+You can evaluate models on multiple datasets with a single command. No model or data preparation is needed: run one command and get results within minutes. The output is not just a result number, but also detailed logs and samples, including the model args, input question, model response, and ground-truth answer.
+
+### Accelerator support and task grouping.
+We support using `accelerate` to wrap the model for distributed evaluation, with multi-GPU and tensor parallelism. With **Task Grouping**, all instances from all tasks are grouped and evaluated in parallel, which significantly improves evaluation throughput.
+
+### Efficiency benchmark
+Below are the total runtimes on different datasets using 4 x A100 40G GPUs.
+|Dataset|LLaVA-v1.5-7b|LLaVA-v1.5-13b|
+|-------|-------------|--------------|
+|mme | 2 mins 43 seconds | 3 mins 27 seconds |
+|gqa | 10 mins 43 seconds | 14 mins 23 seconds |
+|scienceqa_img| 1 min 58 seconds | 2 mins 52 seconds |
+|ai2d | 3 mins 17 seconds | 4 mins 12 seconds |
+|coco2017_cap_val| 14 mins 13 seconds | 19 mins 58 seconds |
+
+### Prepared HF datasets.
+We host more than 40 datasets (and counting) on [huggingface/lmms-lab](https://huggingface.co/lmms-lab). We carefully converted these datasets from their original sources and included all variants, versions, and splits, so they can now be accessed directly without any data-preprocessing burden. They also make it easy to visualize the data and get a sense of the distribution of evaluation tasks.
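+
+Since every benchmark lives in a regular Hugging Face dataset repo, you can also inspect one directly. Below is a minimal sketch (not part of `lmms-eval` itself); it assumes the `datasets` library is installed, and some repos under `lmms-lab` may require `huggingface-cli login` first:
+
+```python
+# Minimal sketch: peek at one of the hosted evaluation datasets.
+from datasets import load_dataset
+
+mme = load_dataset("lmms-lab/MME")  # the same repo used by the `mme` task
+print(mme)  # inspect the available splits and features before running an evaluation
+```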
+
+
+
+
+
+### Detailed YAML task configuration
+Including prompt pre-processing, output post-processing, answer extraction, model-specific args, and more.
+
+### Reproducible results (for LLaVA series models) and logging utilities.
+We provide a set of pre-defined configurations & environments for LLaVA-1.5, which can be used directly to reproduce the results in the paper.
+
+With `lmms-eval`, all evaluation details will be recorded including log samples and results, generating report tables to terminal output and to Weights & Biases Runs/Tables.
+
+> Development continues on the main branch, and we encourage you to give us feedback on desired features and further improvements, or to ask questions, in GitHub issues or PRs.
+
+# Installation
+
+For formal usage, you can install the package from PyPI by running the following command:
```bash
-pip install -e .
+pip install lmms-eval
```
+For development, you can install the package by cloning the repository and running the following command:
```bash
-accelerate launch --num_processes=8 -m lmms_eval --model llava --model_args pretrained="liuhaotian/llava-v1.5-13b" --tasks mme --batch_size 1 --log_samples --log_samples_suffix debug --output_path ./logs/ # Eactly reproduce llava results
-accelerate launch --num_processes=8 -m lmms_eval --config example_eval.yaml # Eactly reproduce llava results
+git clone https://github.com/EvolvingLMMs-Lab/lmms-eval
+cd lmms-eval
+pip install -e .
+```
+
+If you want to test LLaVA, you will have to clone the [LLaVA](https://github.com/haotian-liu/LLaVA) repo and install it:
+```bash
+git clone https://github.com/haotian-liu/LLaVA
+cd LLaVA
+pip install -e .
```
-## Current models
-- GPT4V (API)
- - generation-based evaluation
+If you want to test on caption datasets such as `coco`, `refcoco`, and `nocaps`, you will need `java==1.8.0` for the pycocoeval API to work. If you don't have it, you can install it with conda:
+```bash
+conda install openjdk=8
+```
+You can then check your Java version with `java -version`.
-- LLaVA-v1.5/v1.6-7B/13B/34B
- - generation-based evaluation
- - perplexity-based evaluation
+# Usage
+```bash
+# Evaluating LLaVA on MME
+accelerate launch --num_processes=8 -m lmms_eval --model llava --model_args pretrained="liuhaotian/llava-v1.5-7b" --tasks mme --batch_size 1 --log_samples --log_samples_suffix llava_v1.5_mme --output_path ./logs/
-- Qwen-VL
-- Fuyu/OtterHD
+# Evaluating LLaVA on multiple datasets
+accelerate launch --num_processes=8 -m lmms_eval --model llava --model_args pretrained="liuhaotian/llava-v1.5-7b" --tasks mme,mmbench_en --batch_size 1 --log_samples --log_samples_suffix llava_v1.5_mme_mmbenchen --output_path ./logs/
-## Models to be added
+# From a predefined configuration, supporting evaluation of multiple models and datasets
+accelerate launch --num_processes=8 -m lmms_eval --config example_eval.yaml
+```
+## Supported models
+
+- GPT4V (API, only generation-based evaluation)
+- LLaVA-v1.5/v1.6-7B/13B/34B (ppl-based, generation-based)
+- Qwen-VL series (ppl-based, generation-based)
+- Fuyu series (ppl-based, generation-based)
+- InstructBLIP series (generation-based)
-- InstructBLIP
-- Emu
-- CogVLM
+## Supported datasets
+> The name in parentheses indicates the task name used in `lmms_eval`. The task name is also used to specify the dataset in the configuration file.
-## Current datasets
- AI2D (ai2d)
- ChartQA (chartqa)
- CMMMU (cmmmu)
@@ -134,12 +203,16 @@ accelerate launch --num_processes=8 -m lmms_eval --config example_eval.yaml # Ea
- IconQA (iconqa)
- VistBench (vistbench)
+# Add Customized Model and Dataset
+
+Please refer to our [documentation](docs/README.md).
-## Acknowledgement
+# Acknowledgement
-The API, togegher with many code blocks of this project come from [lm-eval-harness](https://github.com/EleutherAI/lm-evaluation-harness). **Please read through the [docs of lm-eval-harness](https://github.com/EleutherAI/lm-evaluation-harness/tree/main/docs) before contributing to this project**. Please do not commit to this project directly. Instead, push your changes to another branch and create a pull request.
+The API, together with many code blocks of this project, comes from [lm-eval-harness](https://github.com/EleutherAI/lm-evaluation-harness). We recommend you read through the [docs of lm-eval-harness](https://github.com/EleutherAI/lm-evaluation-harness/tree/main/docs) for relevant information.
Below are the changes we made to the original API:
+- Context building now only passes in the doc `idx`; the image and doc are processed during the model's response phase. This is because our datasets contain many images, and we cannot store them in the doc as the original lm-eval-harness does, otherwise memory would explode.
+- `Instance.args` (lmms_eval/api/instance.py) now contains a list of images to be input to the LMM.
-- lm-eval-harness supports all HF LMM as single model class. Currently this is not possible of lmms because the input/output format of lmms in HF are not yet unified. Thererfore, we have to create a new class for each lmms model. This is not ideal and we will try to unify them in the future.
+- lm-eval-harness supports all HF language models with a single model class. Currently this is not possible for LMMs because the input/output formats of LMMs in HF are not yet unified. Therefore, we have to create a new class for each LMM model. This is not ideal and we will try to unify them in the future.
diff --git a/docs/README.md b/docs/README.md
new file mode 100644
index 000000000..522407bc7
--- /dev/null
+++ b/docs/README.md
@@ -0,0 +1,12 @@
+# LMMs Eval Documentation
+
+Welcome to the docs for `lmms-eval`!
+
+The majority of this documentation is adapted from [lm-eval-harness](https://github.com/EleutherAI/lm-evaluation-harness/).
+
+## Table of Contents
+
+* To learn about the command line flags, see the [commands](https://github.com/EvolvingLMMs-Lab/lmms-eval/tree/main/docs/commands.md)
+* To learn how to add a new model, see the [Model Guide](https://github.com/EvolvingLMMs-Lab/lmms-eval/tree/main/docs/model_guide.md).
+* For a crash course on adding new tasks to the library, see our [New Task Guide](https://github.com/EvolvingLMMs-Lab/lmms-eval/tree/main/docs/new_task_guide.md).
+* To learn more about pushing the limits of task configuration that the Eval Harness supports, see the [Task Configuration Guide](https://github.com/EvolvingLMMs-Lab/lmms-eval/tree/main/docs/task_guide.md).
diff --git a/docs/commands.md b/docs/commands.md
new file mode 100644
index 000000000..f5ebf0b61
--- /dev/null
+++ b/docs/commands.md
@@ -0,0 +1,24 @@
+# User Guide
+This document details the interface exposed by `lmms_eval` and provides details on what flags are available to users.
+
+## Command-line Interface
+
+
+Equivalently, running the library can be done via the `lmms_eval` entrypoint at the command line.
+
+This mode supports a number of command-line arguments, the details of which can also be seen by running with `-h` or `--help`:
+
+* `--model` : Selects which model type or provider is evaluated. Must be a model registered under `lmms_eval/models`. For example, `--model qwen_vl` or `--model llava`.
+
+* `--model_args` : Controls parameters passed to the model constructor. Accepts a string containing comma-separated keyword arguments to the model class of the format `"arg1=val1,arg2=val2,..."`, such as, for example, `--model_args pretrained=liuhaotian/llava-v1.5-7b,batch_size=1`. For a full list of supported keyword arguments, see the initialization of the corresponding model class in `lmms_eval/models/`.
+
+* `--tasks` : Determines which tasks or task groups are evaluated. Accepts a comma-separated list of task names or task group names. Must be solely comprised of valid tasks/groups.
+
+* `--batch_size` : Sets the batch size used for evaluation. Can be a positive integer or `"auto"` to automatically select the largest batch size that will fit in memory, speeding up evaluation. One can pass `--batch_size auto:N` to re-select the maximum batch size `N` times during evaluation. This can help accelerate evaluation further, since `lmms_eval` sorts documents in descending order of context length.
+
+* `--output_path` : A string of the form `dir/file.jsonl` or `dir/`. Provides a path where high-level results will be saved, either into the file named or into the directory named. If `--log_samples` is passed as well, then per-document outputs and metrics will be saved into the directory as well.
+
+* `--log_samples` : If this flag is passed, then the model's outputs, and the text fed into the model, will be saved at per-document granularity. Must be used with `--output_path`.
+
+* `--limit` : Accepts an integer, or a float between 0.0 and 1.0. If passed, will limit the number of documents to evaluate to the first X documents (if an integer) per task or first X% of documents per task. Useful for debugging, especially on costly API models.
+
diff --git a/docs/model_guide.md b/docs/model_guide.md
new file mode 100644
index 000000000..13ae8caf7
--- /dev/null
+++ b/docs/model_guide.md
@@ -0,0 +1,90 @@
+# New Model Guide
+In order to properly evaluate a given LMM, we require an implementation of a wrapper class subclassing the `lmms_eval.api.model.lmms` class, which defines how `lmms_eval` should interface with your model. This guide walks through how to write this `lmms` subclass and add it to the library!
+
+## Setup
+
+To get started contributing, go ahead and fork the main repo, clone it, create a branch with the name of your model, and install the project requirements in your environment:
+
+```sh
+# After forking...
+git clone https://github.com/<YOUR-USERNAME>/lmms-eval.git
+cd lmms-eval
+git checkout -b <model-type>
+pip install -e .
+```
+
+Now, we'll create a new file where we'll be adding our model:
+
+```sh
+touch lmms_eval/models/<my_model_filename>.py
+```
+
+As a rule of thumb, we recommend using `lmms_eval/models/qwen_vl.py` and `lmms_eval/models/instructblip.py` as reference implementations for your model. You can copy and paste the contents of one of these files into your new file to get started.
+
+**Tip: this filename should not shadow package names! For example, naming your file `anthropic.py` is disallowed since the API's name on pypi is `anthropic`, but naming it `anthropic_llms.py` works with no problems.**
+
+## Interface
+
+All models must subclass the `lmms_eval.api.model.lmms` class.
+
+The lmms class enforces a common interface via which we can extract responses from a model:
+
+```python
+class MyCustomLM(lmms):
+ #...
+ def loglikelihood(self, requests: list[Instance]) -> list[tuple[float, bool]]:
+ #...
+
+
+ def loglikelihood_rolling(self, requests: list[Instance]) -> list[tuple[float, bool]]:
+ #...
+
+
+ def generate_until(self, requests: list[Instance]) -> list[str]:
+ #...
+ #...
+```
+Where `Instance` is a dataclass defined in [`lmms_eval.api.instance`](https://github.com/EvolvingLMMs-Lab/lmms-eval/tree/main/lmms_eval/api/instance.py) with property `args` of request-dependent type signature described below.
+
+We support three types of requests, consisting of different interactions / measurements with an autoregressive LM.
+
+All three request types take as input `requests` of type `list[Instance]` that have a matching `Instance.request_type` to the method name. Overall, you can check [construct_requests](https://github.com/EvolvingLMMs-Lab/lmms-eval/blob/main/lmms_eval/api/task.py#L918) to see how the arguments are constructed for the different output types.
+
+- `generate_until`
+ - Each request contains `Instance.args : Tuple[str, dict]` containing 1. an input string to the LM and 2. a dictionary of keyword arguments used to control generation parameters.
+  - In each `Instance.args` there will be 6 elements: `contexts, all_gen_kwargs, doc_to_visual, doc_id, task, split`. `contexts` refers to the formatted question and is the text input for the LMM. Sometimes it might contain an image token and needs to be handled differently for different models. `all_gen_kwargs` refers to the dict that contains all the generation configuration for the model. We use `doc_id`, `task`, and `split` to access the dataset, and you can then use `doc_to_visual`, a function reference, to process the image. When you implement your own model, you should use these to write your own `generate_until` function (see the sketch after this list).
+ - Using this input and these generation parameters, text will be sampled from the language model (typically until a maximum output length or specific stopping string sequences--for example, `{"until": ["\n\n", "."], "max_gen_toks": 128}`).
+ - The generated input+output text from the model will then be returned.
+
+- `loglikelihood`
+ - Each request contains `Instance.args : Tuple[str, str]` containing 1. an input string to the LM and 2. a target string on which the loglikelihood of the LM producing this target, conditioned on the input, will be returned.
+  - In each `Instance.args` there will be 6 elements: `contexts, doc_to_target, doc_to_visual, doc_id, task, split`. `contexts` refers to the formatted question and is the text input for the LMM. Sometimes it might contain an image token and needs to be handled differently for different models. `doc_to_target` is a function reference that gets the answer from the doc. This answer is the continuation, and only the tokens belonging to it should be counted toward the loglikelihood.
+ - Each request will have, as result, `(ll, is_greedy): Tuple[float, int]` returned, where `ll` is a floating point number representing the log probability of generating the target string conditioned on the input, and `is_greedy` being either the value `0` or `1`, with it being `1` if and only if the target string *would be generated by greedy sampling from the LM* (that is, if the target string is the *most likely* N-token string to be output by the LM given the input. )
+
+- `loglikelihood_rolling`
+ - Each request contains `Instance.args : Tuple[str]`, which is an input string to the model whose *entire* loglikelihood, conditioned on purely the EOT token, will be calculated.
+ - This is used to evaluate *perplexity* on a data distribution.
+ - It should return `(ll,) : Tuple[float]` , a.k.a. solely the *loglikelihood* of producing each piece of text given no starting input.
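+
+Putting the pieces together, below is a minimal, illustrative sketch of a `generate_until` implementation. It only shows how the six elements packed into `Instance.args` are unpacked and used; the `self.task_dict[task][split][doc_id]` lookup mirrors the reference models (e.g. `lmms_eval/models/qwen_vl.py`) and is an assumption about your subclass, and the actual preprocessing and generation call are model-specific:
+
+```python
+# Illustrative sketch only: adapt the preprocessing and generation call to your model.
+def generate_until(self, requests: list[Instance]) -> list[str]:
+    responses = []
+    for contexts, all_gen_kwargs, doc_to_visual, doc_id, task, split in [reg.args for reg in requests]:
+        # Lazily fetch the doc and turn it into visual inputs; images are not stored in the doc itself.
+        visuals = doc_to_visual(self.task_dict[task][split][doc_id])
+        # Per-request generation parameters, e.g. {"until": ["\n\n"], ...}
+        gen_kwargs = dict(all_gen_kwargs)
+        until = gen_kwargs.pop("until", None)
+        # ... preprocess `contexts` (which may contain an image token) together with `visuals`,
+        # then call your model's generation routine with `until` and the remaining gen_kwargs ...
+        text = ""  # placeholder for the decoded model output
+        responses.append(text)
+    return responses
+```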
+
+
+
+
+## Registration
+
+Congrats on implementing your model! Now it's time to test it out.
+
+To make your model usable via the command line interface to `lmms_eval`, you'll need to tell `lmms_eval` what your model's name is.
+
+This is done via a *decorator*, `lmms_eval.api.registry.register_model`. Using `register_model()`, one can both tell the package what the model's name(s) are when invoking it with `python -m lmms_eval --model <model_name>`, and alert `lmms_eval` to the model's existence.
+
+```python
+from lmms_eval.api.registry import register_model
+
+@register_model("<name1>", "<name2>")
+class MyCustomLM(lmms):
+```
+
+The final step is to import your model in `lmms_eval/models/__init__.py`:
+```python
+from .my_model_filename import MyCustomLM
+```
diff --git a/lmms_eval/models/otter.py b/docs/task_guide.md
similarity index 100%
rename from lmms_eval/models/otter.py
rename to docs/task_guide.md
diff --git a/example_eval.yaml b/example_eval.yaml
index fa0a4d02a..40e29a85d 100644
--- a/example_eval.yaml
+++ b/example_eval.yaml
@@ -1,8 +1,15 @@
- model: llava
model_args: pretrained=liuhaotian/llava-v1.5-7b
- tasks: vizwiz_vqa
+ tasks: ai2d
batch_size: 1
log_samples: true
- log_samples_suffix: debug
+  log_samples_suffix: ai2d
+ output_path: "./logs/"
+
+- model: llava
+ model_args: pretrained=liuhaotian/llava-v1.5-13b
+ tasks: mme
+ batch_size: 1
+ log_samples: true
+ log_samples_suffix: mme
output_path: "./logs/"
- limit: 8
\ No newline at end of file
diff --git a/lmms_eval/__main__.py b/lmms_eval/__main__.py
index 5f95cf96e..9f005109b 100644
--- a/lmms_eval/__main__.py
+++ b/lmms_eval/__main__.py
@@ -244,14 +244,14 @@ def cli_evaluate_single(args: Union[argparse.Namespace, None] = None) -> None:
"\n" + "=" * 70 + "\n" + "\n\tYou are trying to check all the numbers in each task." + "\n\tThis action will download the complete dataset." + "\n\tIf the results are not clear initially, call this again." + "\n\n" + "=" * 70
)
eval_logger.info(log_message)
- task_dict = get_task_dict([task for task in sorted(ALL_TASKS)], model_name=args.model)
+ task_dict = get_task_dict([task for task in sorted(ALL_TASKS)], model_name="llava")
for task_name in task_dict.keys():
task_obj = task_dict[task_name]
if type(task_obj) == tuple:
group, task_obj = task_obj
if task_obj is None:
continue
- eval_logger.info(f"\nTask : {task_obj.config.task}\n - #num : {len(task_obj.test_docs()) if task_obj.has_test_docs() else task_obj.validation_docs()}")
+ eval_logger.info(f"\nTask : {task_obj.config.task}\n - #num : {len(task_obj.test_docs()) if task_obj.has_test_docs() else len(task_obj.validation_docs())}")
sys.exit()
else:
tasks_list = args.tasks.split(",")
diff --git a/lmms_eval/api/task.py b/lmms_eval/api/task.py
index 771209246..db3cfd1e9 100644
--- a/lmms_eval/api/task.py
+++ b/lmms_eval/api/task.py
@@ -10,6 +10,7 @@
from tqdm import tqdm
import datasets
+from datasets import Image, Sequence
import numpy as np
from PIL import ImageFile
@@ -247,11 +248,16 @@ def download(self, data_dir=None, cache_dir=None, download_mode=None) -> None:
download_mode=download_mode,
)
for doc_name in self.dataset_no_image:
- column_names = self.dataset_no_image[doc_name].column_names
- image_column = [col for col in column_names if "image" in col.lower()]
- # remove image column from docs
- if image_column:
- self.dataset_no_image[doc_name] = self.dataset_no_image[doc_name].remove_columns(image_column)
+ remove_cols = []
+ features = self.dataset_no_image[doc_name].features
+            # If a column's feature is an Image, or a Sequence of Image features, remove it
+ for feature in features:
+ if isinstance(features[feature], Image):
+ remove_cols.append(feature)
+ elif isinstance(features[feature], Sequence) and isinstance(features[feature].feature, Image):
+ remove_cols.append(feature)
+ for remove_col in remove_cols:
+ self.dataset_no_image[doc_name] = self.dataset_no_image[doc_name].remove_columns(remove_col)
@property
def config(self):
@@ -694,11 +700,16 @@ def download(self, dataset_kwargs=None) -> None:
**dataset_kwargs if dataset_kwargs is not None else {},
)
for doc_name in self.dataset_no_image:
- column_names = self.dataset_no_image[doc_name].column_names
- image_column = [col for col in column_names if "image" in col.lower()]
- # remove image column from docs
- if image_column:
- self.dataset_no_image[doc_name] = self.dataset_no_image[doc_name].remove_columns(image_column)
+ remove_cols = []
+ features = self.dataset_no_image[doc_name].features
+            # If a column's feature is an Image, or a Sequence of Image features, remove it
+ for feature in features:
+ if isinstance(features[feature], Image):
+ remove_cols.append(feature)
+ elif isinstance(features[feature], Sequence) and isinstance(features[feature].feature, Image):
+ remove_cols.append(feature)
+ for remove_col in remove_cols:
+ self.dataset_no_image[doc_name] = self.dataset_no_image[doc_name].remove_columns(remove_col)
def has_training_docs(self) -> bool:
if self.config.training_split is not None:
diff --git a/lmms_eval/evaluator.py b/lmms_eval/evaluator.py
index 7d6174dc3..c3100dcae 100644
--- a/lmms_eval/evaluator.py
+++ b/lmms_eval/evaluator.py
@@ -9,6 +9,7 @@
import torch
import logging
import numpy as np
+from datasets import Image, Sequence
import lmms_eval.api
import lmms_eval.tasks
@@ -317,11 +318,17 @@ def evaluate(
# hack: remove image columns to speed avoid loading images and speed up postprocessing
# reason: doc_iterator will actually load image if it's in the doc.
docs = task.test_docs() if task.has_test_docs() else task.validation_docs()
- column_names = docs.column_names
- image_column = [col for col in column_names if "image" in col.lower()]
- # remove image column from docs
- if image_column:
- docs = docs.remove_columns(image_column)
+        if "d170" not in task_name and "dc100" not in task_name and "dc200" not in task_name:
+ remove_cols = []
+ features = docs.features
+            # If a column's feature is an Image, or a Sequence of Image features, remove it
+ for feature in features:
+ if isinstance(features[feature], Image):
+ remove_cols.append(feature)
+ elif isinstance(features[feature], Sequence) and isinstance(features[feature].feature, Image):
+ remove_cols.append(feature)
+ if remove_cols:
+ docs = docs.remove_columns(remove_cols)
doc_iterator = itertools.islice(enumerate(docs), lm.rank, limit, lm.world_size)
# Instead of converting the iterator to a list, use `itertools.tee` to create a parallel iterator for counting
# doc_iterator, doc_iterator_for_counting = itertools.tee(doc_iterator)
diff --git a/lmms_eval/models/instructblip.py b/lmms_eval/models/instructblip.py
index 1ad562070..7086f346f 100644
--- a/lmms_eval/models/instructblip.py
+++ b/lmms_eval/models/instructblip.py
@@ -107,7 +107,6 @@ def eot_token_id(self):
def max_length(self):
return self._max_length
-
@property
def batch_size(self):
return self.batch_size_per_gpu
diff --git a/lmms_eval/models/minicpm_v.py b/lmms_eval/models/minicpm_v.py
index 482ef3016..1838b56f8 100644
--- a/lmms_eval/models/minicpm_v.py
+++ b/lmms_eval/models/minicpm_v.py
@@ -11,7 +11,6 @@
from transformers import AutoModel, AutoTokenizer
-
import warnings
warnings.filterwarnings("ignore")
@@ -105,7 +104,6 @@ def eot_token_id(self):
def max_length(self):
return self._max_length
-
@property
def batch_size(self):
return self.batch_size_per_gpu
diff --git a/lmms_eval/tasks/hallusion_bench/evaluate_hb.py b/lmms_eval/tasks/hallusion_bench/evaluate_hb.py
index ea6b193ec..87c65519d 100644
--- a/lmms_eval/tasks/hallusion_bench/evaluate_hb.py
+++ b/lmms_eval/tasks/hallusion_bench/evaluate_hb.py
@@ -6,8 +6,6 @@
from lmms_eval.tasks.hallusion_bench.utils import evaluate_by_chatgpt, check_same_by_chatgpt, assign_correctness, get_eval_all, get_eval_fig, get_eval_pair_all
cur_dir = os.path.dirname(os.path.abspath(__file__))
-save_json_path_vd = f"{cur_dir}/hallusion_output_vd_model.json"
-save_json_path_vs = f"{cur_dir}/hallusion_output_vs_model.json"
output_entry = "model_prediction"
correctness_entry = "gpt4v_output_gpt_check"
@@ -30,12 +28,12 @@ def hb_doc_to_visual(doc):
def hb_process_results(doc, result):
sample = doc
- doc.pop("image")
+ # doc.pop("image")
sample["model_prediction"] = result[0]
return {k: sample for k in metric}
-def hb_aggregation_result(results, metric):
+def hb_aggregation_result(results, metric, args):
data_vd = []
data_vs = []
for data in tqdm(results, desc="Split vd and vs"):
@@ -44,6 +42,10 @@ def hb_aggregation_result(results, metric):
if data["category"] == "VS":
data_vs.append(data)
eval_logger.info("Do gpt eval vd ...")
+ path = os.path.join(args.output_path, "gpt_response")
+ os.makedirs(path, exist_ok=True)
+ save_json_path_vd = f"{path}/hallusion_output_vd_model.json"
+ save_json_path_vs = f"{path}/hallusion_output_vs_model.json"
data_vd = evaluate_by_chatgpt(data_vd, output_entry=output_entry, correctness_entry=correctness_entry, load_json=True, save_json_path=save_json_path_vd)
# data_vd = check_same_by_chatgpt(data_vd, output_entry=output_entry, load_json=True, save_json_path=save_json_path_vd)
data_vd = assign_correctness(data_vd, correctness_entry=correctness_entry)
@@ -64,16 +66,16 @@ def hb_aggregation_result(results, metric):
return round(100 * all_data["correct"] / all_data["total"], 4)
-def hb_aggregation_result_qAcc(results):
- return hb_aggregation_result(results, "qAcc")
+def hb_aggregation_result_qAcc(results, args):
+ return hb_aggregation_result(results, "qAcc", args)
-def hb_aggregation_result_fAcc(results):
- return hb_aggregation_result(results, "fAcc")
+def hb_aggregation_result_fAcc(results, args):
+ return hb_aggregation_result(results, "fAcc", args)
-def hb_aggregation_result_aAcc(results):
- return hb_aggregation_result(results, "aAcc")
+def hb_aggregation_result_aAcc(results, args):
+ return hb_aggregation_result(results, "aAcc", args)
def hb_aggregation_result_intern(results, metric):
diff --git a/lmms_eval/tasks/mmbench/cc_utils.py b/lmms_eval/tasks/mmbench/cc_utils.py
index bc5761d01..379f103ee 100644
--- a/lmms_eval/tasks/mmbench/cc_utils.py
+++ b/lmms_eval/tasks/mmbench/cc_utils.py
@@ -30,7 +30,7 @@ def mmbench_cn_cc_doc_to_text(doc, model_specific_prompt_kwargs=None):
options_prompt, options_dict = mmbench_evaluator.create_options_prompt(doc, option_candidate)
data = {
- "img": doc["image"],
+ # "img": doc["image"],
"question": doc["question"],
"answer": doc.get("answer", None),
"options": options_prompt,
diff --git a/lmms_eval/tasks/mmbench/cn_utils.py b/lmms_eval/tasks/mmbench/cn_utils.py
index 5cc7b91bb..85cc8bb94 100644
--- a/lmms_eval/tasks/mmbench/cn_utils.py
+++ b/lmms_eval/tasks/mmbench/cn_utils.py
@@ -31,7 +31,7 @@ def mmbench_doc_to_text(doc, model_specific_prompt_kwargs=None):
options_prompt, options_dict = mmbench_evaluator.create_options_prompt(doc, option_candidate)
data = {
- "img": doc["image"],
+ # "img": doc["image"],
"question": doc["question"],
"answer": doc.get("answer", None),
"options": options_prompt,
diff --git a/lmms_eval/tasks/mmbench/en_utils.py b/lmms_eval/tasks/mmbench/en_utils.py
index 1bf9235b6..8453ffa56 100644
--- a/lmms_eval/tasks/mmbench/en_utils.py
+++ b/lmms_eval/tasks/mmbench/en_utils.py
@@ -31,7 +31,7 @@ def mmbench_doc_to_text(doc, model_specific_prompt_kwargs=None):
options_prompt, options_dict = mmbench_evaluator.create_options_prompt(doc, option_candidate)
data = {
- "img": doc["image"],
+ # "img": doc["image"],
"question": doc["question"],
"answer": doc.get("answer", None),
"options": options_prompt,
diff --git a/lmms_eval/tasks/mme/mme.yaml b/lmms_eval/tasks/mme/mme.yaml
index cf49702d2..504e6dd02 100644
--- a/lmms_eval/tasks/mme/mme.yaml
+++ b/lmms_eval/tasks/mme/mme.yaml
@@ -1,4 +1,4 @@
-dataset_path: Otter-AI/MME
+dataset_path: lmms-lab/MME
dataset_kwargs:
token: True
task: "mme"
@@ -32,6 +32,6 @@ model_specific_prompt_kwargs:
post_prompt: " Answer:"
otterhd:
pre_prompt: ""
- post_prompt: ""
+ post_prompt: " Answer:"
metadata:
- version: 0.0
diff --git a/lmms_eval/tasks/mme/mme_test.yaml b/lmms_eval/tasks/mme/mme_test.yaml
index 335a4f7a6..c529cf83a 100644
--- a/lmms_eval/tasks/mme/mme_test.yaml
+++ b/lmms_eval/tasks/mme/mme_test.yaml
@@ -1,4 +1,4 @@
-dataset_path: Otter-AI/MME
+dataset_path: lmms-lab/MME
dataset_kwargs:
token: True
task: "mme_test"
diff --git a/lmms_eval/tasks/refcocog/_default_template_seg_yaml b/lmms_eval/tasks/refcocog/_default_template_seg_yaml
index 854565417..a3a291bd6 100644
--- a/lmms_eval/tasks/refcocog/_default_template_seg_yaml
+++ b/lmms_eval/tasks/refcocog/_default_template_seg_yaml
@@ -1,4 +1,4 @@
-dataset_path: lmms-lab/RefCOCO
+dataset_path: lmms-lab/RefCOCOg
output_type: generate_until
doc_to_visual: !function utils.refcoco_seg_doc_to_visual
doc_to_text: !function utils.refcoco_doc_to_text
diff --git a/lmms_eval/tasks/seedbench_2/utils.py b/lmms_eval/tasks/seedbench_2/utils.py
index c5a6ec8ab..af8d8571f 100644
--- a/lmms_eval/tasks/seedbench_2/utils.py
+++ b/lmms_eval/tasks/seedbench_2/utils.py
@@ -19,7 +19,7 @@ def seed_doc_to_text(doc, model_specific_kwargs=None):
question += f"C. {parse_choice_img(doc['choice_c'], model_specific_kwargs['img_token'])}\n"
question += f"D. {parse_choice_img(doc['choice_d'], model_specific_kwargs['img_token'])}"
if doc["data_type"] == "Image Generation":
- num_img_in_question = len(doc["image"]) - 4
+ num_img_in_question = len(doc["data_id"]) - 4
prepend_tokens = [model_specific_kwargs["img_token"]] * num_img_in_question
question = " ".join(prepend_tokens) + "\n" + question
return f"{question}\n{model_specific_kwargs['post_prompt']}"
diff --git a/pyproject.toml b/pyproject.toml
index 4017a61a4..05043fd1a 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -24,7 +24,6 @@ dependencies = [
"accelerate>=0.21.0",
"black==24.1.0",
"datasets==2.16.1",
- "evaluate",
"evaluate>=0.4.0",
"jsonlines",
"numexpr",
@@ -35,7 +34,7 @@ dependencies = [
"sacrebleu>=1.5.0",
"scikit-learn>=0.24.1",
"sqlitedict",
- "torch>=1.8",
+    "torch>=1.8",  # the llava_repr extra below pins torch==2.0.1 for reproducing LLaVA results
"openai>=1.0.0",
"pycocoevalcap",
"tqdm-multiprocess",
@@ -54,6 +53,44 @@ dependencies = [
"transformers-stream-generator",
"tiktoken",
"pre-commit",
+ "llava@git+https://github.com/haotian-liu/LLaVA",
+]
+
+[project.optional-dependencies]
+llava_repr = [
+ "accelerate>=0.21.0",
+ "black==24.1.0",
+ "datasets==2.16.1",
+ "evaluate>=0.4.0",
+ "jsonlines",
+ "numexpr",
+ "peft>=0.2.0",
+ "pybind11>=2.6.2",
+ "pytablewriter",
+ "rouge-score>=0.0.4",
+ "sacrebleu>=1.5.0",
+ "scikit-learn>=0.24.1",
+ "sqlitedict",
+ "openai>=1.0.0",
+ "pycocoevalcap",
+ "tqdm-multiprocess",
+ "transformers>=4.36.2",
+ "zstandard",
+ "pillow",
+ "pyyaml",
+ "sympy",
+ "mpmath",
+ "Jinja2",
+ "openpyxl",
+ "Levenshtein",
+ "hf_transfer",
+ "tenacity",
+ "wandb>=0.16.0",
+ "transformers-stream-generator",
+ "tiktoken",
+ "pre-commit",
+ "torch==2.0.1", # Specific version for llava_repr
+ "llava@git+https://github.com/haotian-liu/LLaVA",
]
[tool.setuptools.packages.find]
@@ -70,17 +107,3 @@ lmms_eval = "lmms_eval.__main__:cli_evaluate"
[project.urls]
Homepage = "https://github.com/EvolvingLMMs-Lab/lmms-eval"
Repository = "https://github.com/EvolvingLMMs-Lab/lmms-eval"
-
-[project.optional-dependencies]
-dev = ["black", "flake8", "pre-commit", "pytest", "pytest-cov"]
-linting = ["flake8", "pylint", "mypy", "pre-commit"]
-testing = ["pytest", "pytest-cov", "pytest-xdist"]
-multilingual = ["nagisa>=0.2.7", "jieba>=0.42.1", "pycountry"]
-math = ["sympy>=1.12", "antlr4-python3-runtime==4.11"]
-sentencepiece = ["sentencepiece>=0.1.98", "protobuf>=4.22.1"]
-gptq = ["auto-gptq[triton] @ git+https://github.com/PanQiWei/AutoGPTQ"]
-anthropic = ["anthropic"]
-openai = ["openai==1.3.9"]
-vllm = ["vllm"]
-ifeval = ["langdetect", "immutabledict"]
-all = ["lmms_eval[dev]", "lmms_eval[testing]", "lmms_eval[linting]"]