
Commit 17ed6d1

[Feat] GPT4V added interleaved image and text support (EvolvingLMMs-Lab#52)

* Revise GPT4V to allow interleaved image and text
* Use the first char as the answer for seedbench
* Save gpt eval's answer
kcz358 authored Feb 19, 2024
1 parent c83c7cf commit 17ed6d1
Showing 5 changed files with 36 additions and 5 deletions.
35 changes: 30 additions & 5 deletions lmms_eval/models/gpt4v.py
```diff
@@ -1,4 +1,5 @@
 from io import BytesIO
+from copy import deepcopy
 import os
 import base64
 from typing import List, Tuple
```
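The new `deepcopy` import matters below: every user message starts from the same `response_json` template, and a shallow copy would let all messages share one `content` list. A quick self-contained illustration:

```python
from copy import copy, deepcopy

template = {"role": "user", "content": []}

shallow = [copy(template), copy(template)]
shallow[0]["content"].append("chunk 0")
print(shallow[1]["content"])  # ['chunk 0'] -- the inner list is shared

deep = [deepcopy(template), deepcopy(template)]
deep[0]["content"].append("chunk 0")
print(deep[1]["content"])  # [] -- each message owns its own content list
```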
```diff
@@ -38,6 +39,10 @@
 class GPT4V(lmms):
     def __init__(self, **kwargs) -> None:
         super().__init__()
+        # Manually set an image token for GPT4V so that we can search for it
+        # and split the text and images
+        # Here we just use the same token as llava for convenience
+        self.image_token = "<image>"
 
     # Function to encode the image
     def encode_image(self, image: Image):
```
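The body of `encode_image` is collapsed in this view. As a rough sketch (not the file's exact code), a base64 encoder built on the `BytesIO` and `base64` imports at the top of the file might look like this, with JPEG matching the `data:image/jpeg` URLs assembled below:

```python
import base64
from io import BytesIO

from PIL import Image


def encode_image(image: Image.Image) -> str:
    # Serialize the PIL image to JPEG bytes in memory, then base64-encode
    # them for embedding in a data URL.
    buffer = BytesIO()
    image.convert("RGB").save(buffer, format="JPEG")
    return base64.b64encode(buffer.getvalue()).decode("utf-8")
```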
```diff
@@ -62,13 +67,32 @@ def generate_until(self, requests) -> List[str]:
             # encode, pad, and truncate contexts for this batch
             visuals = [doc_to_visual(self.task_dict[task][split][doc_id])]
             visuals = self.flatten(visuals)
 
-            payload = {"model": "gpt-4-vision-preview", "messages": [{"role": "user", "content": []}]}
-            payload["messages"][0]["content"].append({"type": "text", "text": contexts})
-            for visual in visuals:
-                img = self.encode_image(visual)
-                payload["messages"][0]["content"].append({"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{img}"}})
+            imgs = []
+            for visual in visuals:
+                img = self.encode_image(visual)
+                imgs.append(img)
+
+            payload = {"model": "gpt-4-vision-preview", "messages": []}
+            response_json = {"role": "user", "content": []}
+            # When there is no image token in the context, append the images after the text
+            if self.image_token not in contexts:
+                payload["messages"].append(deepcopy(response_json))
+                payload["messages"][0]["content"].append({"type": "text", "text": contexts})
+                for img in imgs:
+                    payload["messages"][0]["content"].append({"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{img}"}})
+            else:
+                contexts = contexts.split(self.image_token)
+                for idx, img in enumerate(imgs):
+                    payload["messages"].append(deepcopy(response_json))
+                    payload["messages"][idx]["content"].append({"type": "text", "text": contexts[idx]})
+                    payload["messages"][idx]["content"].append({"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{img}"}})
+
+                # If n image tokens are in the contexts,
+                # contexts will be split into n + 1 chunks;
+                # manually add the last chunk into the payload
+                payload["messages"].append(deepcopy(response_json))
+                payload["messages"][-1]["content"].append({"type": "text", "text": contexts[-1]})
 
             if "max_new_tokens" not in gen_kwargs:
                 gen_kwargs["max_new_tokens"] = 1024
             if "temperature" not in gen_kwargs:
```
```diff
@@ -97,6 +121,7 @@ def generate_until(self, requests) -> List[str]:
                 eval_logger.error(f"All 5 attempts failed. Last error message: {str(e)}")
                 content = ""
             res.append(content)
+            pbar.update(1)
         return res
 
     def loglikelihood(self, requests: List[Instance]) -> List[Tuple[float, bool]]:
```
1 change: 1 addition & 0 deletions lmms_eval/tasks/hallusion_bench/utils.py
```diff
@@ -81,6 +81,7 @@ def evaluate_by_chatgpt(data, output_entry, correctness_entry, gpt_model="gpt-4"
                 gpt_correctness = "2"
 
             sample[correctness_entry] = gpt_correctness
+            sample["gpt_answer"] = prompt + output_text
 
             output.append(sample)
```
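Storing `prompt + output_text` on each sample keeps the exact judge prompt and GPT's verdict alongside the score. A hypothetical way to audit the saved answers after a run (the dump filename is an assumption, not from the diff):

```python
import json

# Assumed: the `output` list from evaluate_by_chatgpt was dumped to this file.
with open("hallusion_gpt_eval.json") as f:
    samples = json.load(f)

for sample in samples[:3]:
    print(sample["gpt_answer"])  # judge prompt concatenated with GPT's reply
```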
1 change: 1 addition & 0 deletions lmms_eval/tasks/seedbench/utils.py
```diff
@@ -16,6 +16,7 @@ def seed_doc_to_text(doc):
 
 def seed_process_result(doc, result):
     pred = result[0].strip()
+    pred = pred[0]
     answer = doc["answer"]
     data_type = doc["data_type"]
```
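Taking the first character of the stripped output means a verbose completion still scores as its option letter, matching the commit's "use the first char as the answer" change. A small illustration with hypothetical model outputs:

```python
outputs = ["A", "A.", "A. The man is cooking", "B) the kitchen"]
for output in outputs:
    pred = output.strip()
    pred = pred[0]  # keep only the leading option letter
    print(pred)     # A, A, A, B
```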
3 changes: 3 additions & 0 deletions lmms_eval/tasks/seedbench_2/seedbench_2.yaml
```diff
@@ -42,5 +42,8 @@ metadata:
 
 model_specific_prompt_kwargs:
     llava :
         img_token : <image>
         post_prompt : "Answer with the option's letter from the given choices directly."
+    gpt4V :
+        img_token : <image>
+        post_prompt : "Answer with the option's letter from the given choices directly."
```
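The new `gpt4V` block mirrors the existing `llava` entry, so the SEED-Bench-2 prompt builder can substitute the model's image token and answer instruction. A hypothetical helper showing how such kwargs plausibly feed into a prompt (`build_prompt` is illustrative, not the repo's function):

```python
def build_prompt(question: str, n_images: int, kwargs: dict) -> str:
    # One img_token per image, then the question, then the post prompt.
    img_tokens = kwargs["img_token"] * n_images
    return f"{img_tokens}\n{question}\n{kwargs['post_prompt']}"


gpt4v_kwargs = {
    "img_token": "<image>",
    "post_prompt": "Answer with the option's letter from the given choices directly.",
}
print(build_prompt("Which option matches the scene?", 2, gpt4v_kwargs))
```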
1 change: 1 addition & 0 deletions lmms_eval/tasks/seedbench_2/utils.py
```diff
@@ -27,6 +27,7 @@ def seed_doc_to_text(doc, model_specific_kwargs=None):
 
 def seed_process_result(doc, result):
     pred = result[0].strip()
+    pred = pred[0]
     answer = doc["answer"]
     data_type = doc["data_type"].split(" ")
     data_type = "_".join(data_type)
```
