diff --git a/lmms_eval/models/__init__.py b/lmms_eval/models/__init__.py
index 2ae2c8971..f29660ed0 100644
--- a/lmms_eval/models/__init__.py
+++ b/lmms_eval/models/__init__.py
@@ -2,3 +2,4 @@
 from .otterhd import OtterHD
 from .qwen_vl import Qwen_VL
 from .fuyu import Fuyu
+from .gpt4v import GPT4V
diff --git a/lmms_eval/models/fuyu.py b/lmms_eval/models/fuyu.py
index fa67cab9e..07d4f8c4d 100644
--- a/lmms_eval/models/fuyu.py
+++ b/lmms_eval/models/fuyu.py
@@ -8,6 +8,10 @@
 from lmms_eval.api.instance import Instance
 from tqdm import tqdm
+import warnings
+
+warnings.filterwarnings("ignore")
+


 @register_model("fuyu")
 class Fuyu(lmms):
@@ -118,7 +122,40 @@ def _collate(x):

     def loglikelihood(self, requests: List[Instance]) -> List[Tuple[float, bool]]:
         # TODO
-        assert False, "We have not implemented this function for llava yet"
+        res = []
+        pbar = tqdm(total=len(requests), disable=(self.rank != 0), desc="Model Responding")
+
+        for contexts, doc_to_target, doc_to_visual, doc_id, task, split in [reg.args for reg in requests]:
+            # encode, pad, and truncate contexts for this batch
+            continuation = doc_to_target(self.task_dict[task][split][doc_id])
+            visuals = [doc_to_visual(self.task_dict[task][split][doc_id])]
+            visuals = self.flatten(visuals)
+            formatted_contexts = [f"{contexts}\n"]
+            formatted_continuation = [f"{contexts}\n{continuation}"]
+            model_inputs = self.processor(text=formatted_continuation, images=visuals, device=self.device)
+            for k, v in model_inputs.items():
+                model_inputs[k] = v.to(self.device, non_blocking=True) if isinstance(v, torch.Tensor) else [vv.to(self.device, non_blocking=True) for vv in v]
+
+            for index in range(len(model_inputs["image_patches"])):
+                model_inputs["image_patches"][index] = model_inputs["image_patches"][index].to(dtype=next(self.model.parameters()).dtype)
+
+            labels = model_inputs["input_ids"].clone()
+            contxt_id = self.processor(text=formatted_contexts, return_tensors="pt")["input_ids"]
+            labels[:, : contxt_id.shape[1]] = -100
+            with torch.inference_mode():
+                outputs = self.model(**model_inputs, labels=labels)
+            loss = outputs["loss"]
+            # loss = torch.exp(loss)
+            logits = outputs["logits"]
+            greedy_tokens = logits.argmax(dim=-1)
+            cont_toks = model_inputs["input_ids"][:, contxt_id.shape[1] :]  # [1, seq]
+            greedy_tokens = greedy_tokens[:, contxt_id.shape[1] : model_inputs["input_ids"].shape[1]]  # [1, seq]
+            max_equal = (greedy_tokens == cont_toks).all()
+            res.append((float(loss.item()), bool(max_equal)))
+            pbar.update(1)
+
+        pbar.close()
+        return res

     def loglikelihood_rolling(self, requests: List[Instance]) -> List[float]:
         # TODO
diff --git a/lmms_eval/models/gpt4v.py b/lmms_eval/models/gpt4v.py
index e69de29bb..53b422aa1 100644
--- a/lmms_eval/models/gpt4v.py
+++ b/lmms_eval/models/gpt4v.py
@@ -0,0 +1,108 @@
+from io import BytesIO
+import os
+import base64
+from typing import List, Tuple
+from tqdm import tqdm
+import requests as url_requests
+import time
+import logging
+
+from lmms_eval.api.instance import Instance
+from lmms_eval.api.model import lmms
+from lmms_eval.api.registry import register_model
+from lmms_eval import utils
+
+from PIL import Image
+
+API_TYPE = os.getenv("API_TYPE", "openai")
+NUM_SECONDS_TO_SLEEP = 5
+eval_logger = logging.getLogger("lmms-eval")
+
+if API_TYPE == "openai":
+    API_URL = os.getenv("OPENAI_API_URL", "https://api.openai.com/v1/chat/completions")
+    API_KEY = os.getenv("OPENAI_API_KEY", "YOUR_API_KEY")
+    headers = {
+        "Authorization": f"Bearer {API_KEY}",
+        "Content-Type": "application/json",
+    }
+elif API_TYPE == "azure":
+    API_URL = os.getenv("AZURE_ENDPOINT", "https://api.cognitive.microsoft.com/sts/v1.0/issueToken")
+    API_KEY = os.getenv("AZURE_API_KEY", "YOUR_API_KEY")
+    headers = {
+        "api-key": API_KEY,
+        "Content-Type": "application/json",
+    }
+
+
+@register_model("gpt4V")
+class GPT4V(lmms):
+    def __init__(self, **kwargs) -> None:
+        super().__init__()
+
+    # Function to encode the image
+    def encode_image(self, image: Image):
+        output_buffer = BytesIO()
+        image.save(output_buffer, format="JPEG")
+        byte_data = output_buffer.getvalue()
+        base64_str = base64.b64encode(byte_data).decode("utf-8")
+        return base64_str
+
+    def flatten(self, input):
+        new_list = []
+        for i in input:
+            for j in i:
+                new_list.append(j)
+        return new_list
+
+    def generate_until(self, requests) -> List[str]:
+        res = []
+        pbar = tqdm(total=len(requests), disable=(self.rank != 0), desc="Model Responding")
+
+        for contexts, gen_kwargs, doc_to_visual, doc_id, task, split in [reg.args for reg in requests]:
+            # encode, pad, and truncate contexts for this batch
+            visuals = [doc_to_visual(self.task_dict[task][split][doc_id])]
+            visuals = self.flatten(visuals)
+
+            payload = {"model": "gpt-4-vision-preview", "messages": [{"role": "user", "content": []}]}
+            payload["messages"][0]["content"].append({"type": "text", "text": contexts})
+
+            for visual in visuals:
+                img = self.encode_image(visual)
+                payload["messages"][0]["content"].append({"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{img}"}})
+            if "max_new_tokens" not in gen_kwargs:
+                gen_kwargs["max_new_tokens"] = 1024
+            if "temperature" not in gen_kwargs:
+                gen_kwargs["temperature"] = 0
+            if "top_p" not in gen_kwargs:
+                gen_kwargs["top_p"] = None
+            if "num_beams" not in gen_kwargs:
+                gen_kwargs["num_beams"] = 1
+
+            # payload["max_tokens"] = gen_kwargs["max_new_tokens"]
+            # payload["temperature"] = gen_kwargs["temperature"]
+
+            for attempt in range(5):
+                try:
+                    response = url_requests.post(API_URL, headers=headers, json=payload)
+                    response_data = response.json()
+
+                    content = response_data["choices"][0]["message"]["content"].strip()
+                    break  # If successful, break out of the loop
+
+                except Exception as e:
+                    eval_logger.info(f"Attempt {attempt + 1} failed with error: {str(e)}")
+                    if attempt < 5 - 1:  # If we have retries left, sleep and then continue to next attempt
+                        time.sleep(NUM_SECONDS_TO_SLEEP)
+                    else:  # If this was the last attempt, log and return empty
+                        eval_logger.error(f"All 5 attempts failed. Last error message: {str(e)}")
+                        content = ""
+            res.append(content)
+        return res
+
+    def loglikelihood(self, requests: List[Instance]) -> List[Tuple[float, bool]]:
+        # TODO
+        assert False, "GPT4V does not support loglikelihood"
+
+    def loglikelihood_rolling(self, requests: List[Instance]) -> List[float]:
+        # TODO
+        assert False, "GPT4V does not support loglikelihood_rolling"
diff --git a/lmms_eval/tasks/llava-bench-coco/utils.py b/lmms_eval/tasks/llava-bench-coco/utils.py
index 19075868d..03cf1d96c 100644
--- a/lmms_eval/tasks/llava-bench-coco/utils.py
+++ b/lmms_eval/tasks/llava-bench-coco/utils.py
@@ -8,10 +8,13 @@
 import time
 import yaml
 from pathlib import Path
+from copy import deepcopy
 
 eval_logger = logging.getLogger("lmms-eval")
 NUM_SECONDS_TO_SLEEP = 0.5
 
+LLAVA_W_METRICS = ["gpt_eval_llava_conv", "gpt_eval_llava_detail", "gpt_eval_llava_complex"]
+
 rule_dict = json.load(open(os.path.join(os.path.dirname(os.path.abspath(__file__)), "rule.json"), "r"))
 
 with open(Path(__file__).parent / "llava-bench-coco.yaml", "r") as f:
@@ -24,16 +27,30 @@
 config = yaml.safe_load("".join(safe_data))
 
-API_URL = os.getenv("OPENAI_API_URL", "https://api.openai.com/v1/chat/completions")
-API_KEY = os.getenv("OPENAI_API_KEY", "YOUR_API_KEY")
 GPT_EVAL_MODEL_NAME = config["metadata"]["gpt_eval_model_name"]
+GPT_EVAL_MODEL_NAME = config["metadata"]["gpt_eval_model_name"]
+
+API_TYPE = os.getenv("API_TYPE", "openai")
 
 
-def get_eval(content: str, max_tokens: int, retries: int = 3):
+if API_TYPE == "openai":
+    API_URL = os.getenv("OPENAI_API_URL", "https://api.openai.com/v1/chat/completions")
+    API_KEY = os.getenv("OPENAI_API_KEY", "YOUR_API_KEY")
     headers = {
         "Authorization": f"Bearer {API_KEY}",
         "Content-Type": "application/json",
     }
+elif API_TYPE == "azure":
+    API_URL = os.getenv("AZURE_ENDPOINT", "https://api.cognitive.microsoft.com/sts/v1.0/issueToken")
+    API_KEY = os.getenv("AZURE_API_KEY", "YOUR_API_KEY")
+    headers = {
+        "api-key": API_KEY,
+        "Content-Type": "application/json",
+    }
+
+
+def get_eval(content: str, max_tokens: int, retries: int = 3):
+    global headers
 
     messages = [
         {
@@ -125,7 +142,7 @@ def llava_process_results(doc, result):
         scores = [-1, -1]
 
     metric = f"gpt_eval_llava_{doc.get('category', 'unknown')}"
-    review_dict = {
+    category_review_dict = {
         "question": question,
         "ans1": ans1,
         "ans2": ans2,
@@ -136,8 +153,19 @@
         "context": context,
         "content": content,
         "scores": scores,
         "eval_model": model_name,
     }
 
+    non_category_review_dict = deepcopy(category_review_dict)
+    non_category_review_dict["scores"] = [-999, -999]
+
+    data_dict = {}
+    for m in LLAVA_W_METRICS:
+        if m == metric:
+            data_dict[m] = category_review_dict
+        else:
+            data_dict[m] = non_category_review_dict
+    data_dict["gpt_eval_llava_all"] = category_review_dict
+
     # return {"gpt_eval_llava_all": review_dict}
-    return {metric: review_dict, "gpt_eval_llava_all": review_dict}
+    return data_dict
 
 
 def llava_conv_aggregation(results):
@@ -160,6 +188,8 @@
     try:
         scores = []
         for result in results:
+            if -999 in result["scores"]:
+                continue
             scores.append(result["scores"])
 
         stats = np.asarray(scores).mean(0).tolist()
diff --git a/lmms_eval/tasks/llava-in-the-wild/utils.py b/lmms_eval/tasks/llava-in-the-wild/utils.py
index 050b16e81..7591c6743 100644
--- a/lmms_eval/tasks/llava-in-the-wild/utils.py
+++ b/lmms_eval/tasks/llava-in-the-wild/utils.py
@@ -8,10 +8,13 @@
 import time
 import yaml
 from pathlib import Path
+from copy import deepcopy
 
 eval_logger = logging.getLogger("lmms-eval")
 NUM_SECONDS_TO_SLEEP = 0.5
 
+LLAVA_W_METRICS = ["gpt_eval_llava_conv", "gpt_eval_llava_detail", "gpt_eval_llava_complex"]
"gpt_eval_llava_conv"] + rule_dict = json.load(open(os.path.join(os.path.dirname(os.path.abspath(__file__)), "rule.json"), "r")) with open(Path(__file__).parent / "llava-in-the-wild.yaml", "r") as f: @@ -24,16 +27,28 @@ config = yaml.safe_load("".join(safe_data)) -API_URL = os.getenv("OPENAI_API_URL", "https://api.openai.com/v1/chat/completions") -API_KEY = os.getenv("OPENAI_API_KEY", "YOUR_API_KEY") GPT_EVAL_MODEL_NAME = config["metadata"]["gpt_eval_model_name"] +API_TYPE = os.getenv("API_TYPE", "openai") -def get_eval(content: str, max_tokens: int, retries: int = 3): +if API_TYPE == "openai": + API_URL = os.getenv("OPENAI_API_URL", "https://api.openai.com/v1/chat/completions") + API_KEY = os.getenv("OPENAI_API_KEY", "YOUR_API_KEY") headers = { "Authorization": f"Bearer {API_KEY}", "Content-Type": "application/json", } +elif API_TYPE == "azure": + API_URL = os.getenv("AZURE_ENDPOINT", "https://api.cognitive.microsoft.com/sts/v1.0/issueToken") + API_KEY = os.getenv("AZURE_API_KEY", "YOUR_API_KEY") + headers = { + "api-key": API_KEY, + "Content-Type": "application/json", + } + + +def get_eval(content: str, max_tokens: int, retries: int = 3): + global headers messages = [ { @@ -125,7 +140,7 @@ def llava_process_results(doc, result): scores = [-1, -1] metric = f"gpt_eval_llava_{doc.get('category', 'all')}" - review_dict = { + category_review_dict = { "question": question, "ans1": ans1, "ans2": ans2, @@ -136,8 +151,19 @@ def llava_process_results(doc, result): "eval_model": model_name, } + non_category_review_dict = deepcopy(category_review_dict) + non_category_review_dict["scores"] = [-999, -999] + + data_dict = {} + for m in LLAVA_W_METRICS: + if m == metric: + data_dict[m] = category_review_dict + else: + data_dict[m] = non_category_review_dict + data_dict["gpt_eval_llava_all"] = category_review_dict + # return {"gpt_eval_llava_all": review_dict} - return {metric: review_dict, "gpt_eval_llava_all": review_dict} + return data_dict def llava_conv_aggregation(results): @@ -160,6 +186,8 @@ def llava_aggregation(results, category): try: scores = [] for result in results: + if -999 in result["scores"]: + continue scores.append(result["scores"]) stats = np.asarray(scores).mean(0).tolist()