
Commit

lint check
choiszt committed Feb 27, 2025
1 parent 884f31a commit 08b049b
Showing 3 changed files with 134 additions and 131 deletions.
115 changes: 59 additions & 56 deletions lmms_eval/models/egogpt.py
@@ -34,24 +34,30 @@

# Import LLaVA modules
try:
from egogpt.model.builder import load_pretrained_model
from egogpt.mm_utils import get_model_name_from_path, process_images
from egogpt.constants import IMAGE_TOKEN_INDEX, DEFAULT_IMAGE_TOKEN, IGNORE_INDEX,SPEECH_TOKEN_INDEX,DEFAULT_SPEECH_TOKEN
from egogpt.conversation import conv_templates, SeparatorStyle

from PIL import Image
import requests
import copy
import torch
import numpy as np
import os
import re
import sys
import warnings
from decord import VideoReader, cpu

import numpy as np
import requests
import soundfile as sf
from scipy.signal import resample
import torch
import whisper
import re
import os
from decord import VideoReader, cpu
from egogpt.constants import (
DEFAULT_IMAGE_TOKEN,
DEFAULT_SPEECH_TOKEN,
IGNORE_INDEX,
IMAGE_TOKEN_INDEX,
SPEECH_TOKEN_INDEX,
)
from egogpt.conversation import SeparatorStyle, conv_templates
from egogpt.mm_utils import get_model_name_from_path, process_images
from egogpt.model.builder import load_pretrained_model
from PIL import Image
from scipy.signal import resample
except ImportError as e:
eval_logger.debug(f"egogpt is not installed. Please install egogpt to use this model.\nError: {e}")
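The imports are wrapped in a try/except so that lmms_eval can still enumerate its model registry when the optional egogpt package is missing; the failure is only recorded in the debug log and surfaces when the model is actually used. A minimal, standalone sketch of that guard (the logger name "lmms-eval" is an assumption; in the real file `eval_logger` comes from the surrounding module):

```python
import logging

eval_logger = logging.getLogger("lmms-eval")

try:
    # Optional backend dependency; only needed when this model is instantiated.
    from egogpt.model.builder import load_pretrained_model  # noqa: F401
except ImportError as e:
    # Record why the import was skipped instead of failing at registry time.
    eval_logger.debug(f"egogpt is not installed. Please install egogpt to use this model.\nError: {e}")
```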

@@ -109,16 +115,15 @@ def __init__(
if attn_implementation is not None:
egogpt_model_args["attn_implementation"] = attn_implementation


self.pretrained = pretrained
self.token_strategy = token_strategy
self.max_frames_num = max_frames_num
self.mm_spatial_pool_stride = mm_spatial_pool_stride
self.mm_spatial_pool_mode = mm_spatial_pool_mode
self.video_decode_backend = video_decode_backend
# Try to load the model with the multimodal argument
self._tokenizer, self._model, self._max_length = load_pretrained_model(pretrained,device_map=self.device_map, **egogpt_model_args)
self._image_processor=self._model.get_vision_tower().image_processor
self._tokenizer, self._model, self._max_length = load_pretrained_model(pretrained, device_map=self.device_map, **egogpt_model_args)
self._image_processor = self._model.get_vision_tower().image_processor
self._config = self._model.config
self.model.eval()
self.truncation = truncation
@@ -236,14 +241,14 @@ def flatten(self, input):
for j in i:
new_list.append(j)
return new_list
def split_text(self,text, keywords):
pattern = '(' + '|'.join(map(re.escape, keywords)) + ')'

def split_text(self, text, keywords):
pattern = "(" + "|".join(map(re.escape, keywords)) + ")"
parts = re.split(pattern, text)
parts = [part for part in parts if part]
return parts
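`split_text` escapes each keyword, joins them into a single capturing alternation, and relies on `re.split` keeping captured delimiters, so the multimodal placeholder tokens survive the split; empty fragments are dropped. A standalone sketch with a hypothetical prompt:

```python
import re

def split_text(text, keywords):
    # The capturing group makes re.split keep the matched keywords in the output.
    pattern = "(" + "|".join(map(re.escape, keywords)) + ")"
    return [part for part in re.split(pattern, text) if part]

print(split_text("<image>\n<speech>\nWhat is happening?", ["<image>", "<speech>"]))
# ['<image>', '\n', '<speech>', '\nWhat is happening?']
```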

def load_video(self,video_path=None,audio_path=None,max_frames_num=16,fps=1,task_name=None):
def load_video(self, video_path=None, audio_path=None, max_frames_num=16, fps=1, task_name=None):
if audio_path is not None:
speech, sample_rate = sf.read(audio_path)
if sample_rate != 16000:
@@ -256,24 +261,24 @@ def load_video(self,video_path=None,audio_path=None,max_frames_num=16,fps=1,task
speech = whisper.log_mel_spectrogram(speech, n_mels=128).permute(1, 0)
speech_lengths = torch.LongTensor([speech.shape[0]])
else:
speech=torch.zeros(3000,128)
speech_lengths=torch.LongTensor([3000])
speech = torch.zeros(3000, 128)
speech_lengths = torch.LongTensor([3000])

vr=VideoReader(video_path,ctx=cpu(0),num_threads=1)
vr = VideoReader(video_path, ctx=cpu(0), num_threads=1)
total_frame_num = len(vr)
avg_fps = round(vr.get_avg_fps() / fps)
frame_idx = [i for i in range(0, total_frame_num, avg_fps)]
frame_time = [i/avg_fps for i in frame_idx]
frame_time = [i / avg_fps for i in frame_idx]

if max_frames_num > 0:
if len(frame_idx) > max_frames_num:
uniform_sampled_frames = np.linspace(0, total_frame_num - 1, max_frames_num, dtype=int)
frame_idx = uniform_sampled_frames.tolist()
if task_name=="egoplan":
#add current observation frame
frame_idx.append(total_frame_num-1)
if task_name == "egoplan":
# add current observation frame
frame_idx.append(total_frame_num - 1)
video = vr.get_batch(frame_idx).asnumpy()
return video,speech,speech_lengths
return video, speech, speech_lengths
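`load_video` resamples any provided audio to 16 kHz and converts it to a 128-bin log-mel spectrogram (falling back to a zero tensor of shape (3000, 128) when no audio is given), while video frames are taken at roughly `fps` per second and resampled uniformly whenever that exceeds `max_frames_num`; the egoplan task additionally keeps the last frame as the current observation. A self-contained sketch of just the frame-index selection, with made-up numbers:

```python
import numpy as np

def sample_frame_indices(total_frame_num, video_fps, fps=1, max_frames_num=16, append_last=False):
    # One index roughly every 1/fps seconds of video.
    step = round(video_fps / fps)
    frame_idx = list(range(0, total_frame_num, step))
    # Too many candidates: fall back to uniform sampling over the whole clip.
    if max_frames_num > 0 and len(frame_idx) > max_frames_num:
        frame_idx = np.linspace(0, total_frame_num - 1, max_frames_num, dtype=int).tolist()
    if append_last:
        frame_idx.append(total_frame_num - 1)  # e.g. egoplan's current observation frame
    return frame_idx

# 30 s clip at 30 fps -> 16 uniformly spaced indices plus the final frame.
print(sample_frame_indices(total_frame_num=900, video_fps=30, append_last=True))
```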

def generate_until(self, requests: List[Instance]) -> List[str]:
res = []
@@ -340,15 +345,14 @@ def _collate(x):
image_tensor = [_image.to(dtype=torch.float16, device=self.device) for _image in image_tensor]
else:
image_tensor = image_tensor.to(dtype=torch.float16, device=self.device)
image_tensor=[image_tensor]
image_tensor = [image_tensor]
task_type = "video"
placeholder_count = 1

elif type(visual[0]) == PIL.Image.Image: # For image, multi-image tasks

image_tensor = process_images(visual, self._image_processor, self._config)
speech=torch.zeros(3000,128)
speech_lengths=torch.LongTensor([3000])
speech = torch.zeros(3000, 128)
speech_lengths = torch.LongTensor([3000])
if type(image_tensor) is list:
image_tensor = [_image.to(dtype=torch.float16, device=self.device) for _image in image_tensor]
else:
@@ -362,27 +366,27 @@
try:
if self.video_decode_backend == "decord":
if "egoplan" in visual[0]:
task_name="egoplan"
task_name = "egoplan"
else:
task_name=None
frames,speech,speech_lengths = self.load_video(video_path=visual[0], max_frames_num=self.max_frames_num,task_name=task_name)
task_name = None
frames, speech, speech_lengths = self.load_video(video_path=visual[0], max_frames_num=self.max_frames_num, task_name=task_name)
else:
raise NotImplementedError("Only decord backend is supported for video task")
processed_frames = self._image_processor.preprocess(frames, return_tensors="pt")["pixel_values"].half().cuda()
processed_frames=processed_frames.half()
processed_frames = processed_frames.half()
image_tensor.append(processed_frames)
image_sizes=[frames[0].size]
image_sizes = [frames[0].size]
except Exception as e:
eval_logger.error(f"Error {e} in loading video")
image_tensor = None

task_type = "video"
placeholder_count = len(frames) if self.token_strategy == "multiple" else 1
if DEFAULT_IMAGE_TOKEN not in context:
question=DEFAULT_IMAGE_TOKEN+"\n"+context
if DEFAULT_IMAGE_TOKEN not in context:
question = DEFAULT_IMAGE_TOKEN + "\n" + context
else:
question=context
speech=torch.stack([speech]).to(self.device).half()
question = context
speech = torch.stack([speech]).to(self.device).half()
# This is much safer for llama3, as we now have some object type in it
if "llama_3" in self.conv_template:
conv = copy.deepcopy(conv_templates[self.conv_template])
@@ -418,38 +422,37 @@ def _collate(x):
if "num_beams" not in gen_kwargs:
gen_kwargs["num_beams"] = 1


parts=self.split_text(prompt_question,["<image>","<speech>"])
input_ids=[]
parts = self.split_text(prompt_question, ["<image>", "<speech>"])
input_ids = []
for part in parts:
if "<image>"==part:
input_ids+=[IMAGE_TOKEN_INDEX]
elif "<speech>"==part:
input_ids+=[SPEECH_TOKEN_INDEX]
if "<image>" == part:
input_ids += [IMAGE_TOKEN_INDEX]
elif "<speech>" == part:
input_ids += [SPEECH_TOKEN_INDEX]
else:
input_ids+=self.tokenizer(part).input_ids
input_ids = torch.tensor(input_ids,dtype=torch.long).unsqueeze(0).to(self.device)
input_ids += self.tokenizer(part).input_ids

input_ids = torch.tensor(input_ids, dtype=torch.long).unsqueeze(0).to(self.device)
input_ids_list = [input_ids]
pad_token_ids = self.tokenizer.pad_token_id if self.tokenizer.pad_token_id is not None else self.tokenizer.eos_token_id
input_ids = self.pad_sequence(input_ids_list, batch_first=True, padding_value=pad_token_ids).to(self.device)
attention_masks = input_ids.ne(pad_token_ids).to(self.device)
input_ids = torch.tensor(input_ids,dtype=torch.long).squeeze(0).to(self.device)
input_ids = torch.tensor(input_ids, dtype=torch.long).squeeze(0).to(self.device)
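The prompt string is split on the `<image>` / `<speech>` placeholders: placeholder parts become the sentinel ids `IMAGE_TOKEN_INDEX` / `SPEECH_TOKEN_INDEX`, ordinary text goes through the tokenizer, and the interleaved sequence is then batched and padded. A hedged sketch of that step (the sentinel values below are placeholders rather than the real constants from `egogpt.constants`, and any tokenizer with a Hugging Face-style `__call__` is assumed):

```python
import re
import torch

IMAGE_TOKEN_INDEX = -200   # placeholder values; the real ones live in egogpt.constants
SPEECH_TOKEN_INDEX = -300

def build_input_ids(prompt, tokenizer, device="cpu"):
    pattern = "(" + "|".join(map(re.escape, ["<image>", "<speech>"])) + ")"
    input_ids = []
    for part in (p for p in re.split(pattern, prompt) if p):
        if part == "<image>":
            input_ids += [IMAGE_TOKEN_INDEX]   # swapped for visual features inside the model
        elif part == "<speech>":
            input_ids += [SPEECH_TOKEN_INDEX]  # swapped for audio features inside the model
        else:
            input_ids += tokenizer(part).input_ids
    return torch.tensor(input_ids, dtype=torch.long).unsqueeze(0).to(device)
```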
if task_type == "image":
gen_kwargs["image_sizes"] = [batched_visuals[0][idx].size for idx in range(len(batched_visuals[0]))]
elif task_type == "video":
gen_kwargs["modalities"] = ["video"]
self._config.mm_spatial_pool_stride = self.mm_spatial_pool_stride
self._config.mm_spatial_pool_mode = self.mm_spatial_pool_mode
gen_kwargs["eos_token_id"]=self.tokenizer.eos_token_id
gen_kwargs["eos_token_id"] = self.tokenizer.eos_token_id

# These steps are not in LLaVA's original code, but are necessary for generation to work
# TODO: attention to this major generation step...
if "image_aspect_ratio" in gen_kwargs.keys():
gen_kwargs.pop("image_aspect_ratio")
try:
with torch.inference_mode():
cont = self.model.generate(input_ids, images=image_tensor,speech=speech,speech_lengths=speech_lengths, **gen_kwargs)
cont = self.model.generate(input_ids, images=image_tensor, speech=speech, speech_lengths=speech_lengths, **gen_kwargs)

text_outputs = self.tokenizer.batch_decode(cont, skip_special_tokens=True)
except Exception as e:
@@ -466,4 +469,4 @@ def _collate(x):
return res

def generate_until_multi_round(self, requests: List[Instance]) -> List[str]:
raise NotImplementedError("generate_until_multi_round is not implemented for EgoGPT")
raise NotImplementedError("generate_until_multi_round is not implemented for EgoGPT")
70 changes: 36 additions & 34 deletions lmms_eval/tasks/egoplan/utils.py
@@ -1,20 +1,19 @@
from collections import defaultdict
import os
import datetime
import json
from lmms_eval.tasks._task_utils.file_utils import generate_submission_file


from pathlib import Path
import yaml
import sys
from typing import List, Dict, Optional, Union
import os
import re
import sys
from collections import defaultdict
from pathlib import Path
from typing import Dict, List, Optional, Union

import cv2
import numpy as np

import yaml
from loguru import logger as eval_logger

from lmms_eval.tasks._task_utils.file_utils import generate_submission_file

# with open(Path(__file__).parent / "_default_template_yaml", "r") as f:
# raw_data = f.readlines()
# safe_data = []
Expand Down Expand Up @@ -137,32 +136,36 @@ def egoplan_doc_to_visual(doc):


def egoplan_doc_to_text(doc, lmms_eval_specific_kwargs=None):
task_goal=doc["task_goal"]
task_goal = doc["task_goal"]
if "goal" in task_goal:
task_goal = task_goal.split("to", 1)[1].strip()
words=task_goal.split()
words = task_goal.split()
if words[0].endswith("ing"):
question_pattern = "I am tasked with {}. " \
"The task's progress is demonstrated in the provided video. " \
"My current field of view is shown in the provided image. " \
"What should be my next action? " \
"Please output the most reasonable action you think, expressed in a short phrase."
question_pattern = (
"I am tasked with {}. "
"The task's progress is demonstrated in the provided video. "
"My current field of view is shown in the provided image. "
"What should be my next action? "
"Please output the most reasonable action you think, expressed in a short phrase."
)
else:
question_pattern = "My current task is to {}. " \
"The task's progress is demonstrated in the provided video. " \
"My current field of view is shown in the provided image. " \
"What should be my next action? " \
"Please output the most reasonable action you think, expressed in a short phrase."
question_pattern = (
"My current task is to {}. "
"The task's progress is demonstrated in the provided video. "
"My current field of view is shown in the provided image. "
"What should be my next action? "
"Please output the most reasonable action you think, expressed in a short phrase."
)
question = question_pattern.format(task_goal)

candidates = []
for choice_idx in ["A", "B", "C", "D"]:
question+="\n"+f"{choice_idx}. "+(doc[f"choice_{choice_idx.lower()}"])
question += "\n" + f"{choice_idx}. " + (doc[f"choice_{choice_idx.lower()}"])
post_prompt = "\nAnswer with the option's letter from the given choices"


return f"{question}{post_prompt}"


def extract_characters_regex(s):
s = s.strip()
answer_prefixes = [
@@ -184,22 +187,21 @@ def extract_characters_regex(s):
return ""
return matches[0]
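Only the tail of `extract_characters_regex` is visible in this hunk: it strips a list of common answer prefixes and returns the first option letter it finds, or an empty string otherwise. The helper below is an illustrative stand-in for that behaviour, not the exact implementation (the prefix list and regex are assumptions):

```python
import re

def extract_choice_letter(s):
    s = s.strip()
    # Drop common verbal prefixes before looking for a standalone option letter.
    for prefix in ["The correct answer is", "The answer is", "Answer:"]:
        s = s.replace(prefix, "")
    matches = re.findall(r"\b[ABCD]\b", s.upper())
    return matches[0] if matches else ""

print(extract_choice_letter("The answer is (B) wash the pan."))  # -> B
```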

def egoplan_process_results(doc, results):

def egoplan_process_results(doc, results):
pred = results[0]
pred_ans = extract_characters_regex(pred)
# gt_ans = doc["answer"].lower().strip().replace(".", "")
doc['pred_answer'] = pred_ans
data_dict=doc.copy()
doc["pred_answer"] = pred_ans
data_dict = doc.copy()
return {f"egoplan_mcq_accuracy": data_dict}


def egoplan_aggregate_results(results):

correct_num=0
correct_num = 0
for result in results:
if result['pred_answer'] == result['golden_choice_idx']:
correct_num+=1
question_num=len(results)
accuracy=correct_num/question_num
if result["pred_answer"] == result["golden_choice_idx"]:
correct_num += 1
question_num = len(results)
accuracy = correct_num / question_num
return accuracy
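Each call to `egoplan_process_results` simply copies the doc with a `pred_answer` field attached, and the aggregator counts how many predictions match `golden_choice_idx`, so accuracy is #correct / #questions. A toy usage of the same arithmetic:

```python
results = [
    {"pred_answer": "A", "golden_choice_idx": "A"},
    {"pred_answer": "C", "golden_choice_idx": "B"},
    {"pred_answer": "D", "golden_choice_idx": "D"},
    {"pred_answer": "",  "golden_choice_idx": "C"},  # unparseable reply, counted as wrong
]
correct_num = sum(r["pred_answer"] == r["golden_choice_idx"] for r in results)
print(correct_num / len(results))  # 0.5
```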