From 7ffac2c0e89b98c2fd60eae6cd84630dcc00a7c1 Mon Sep 17 00:00:00 2001 From: kcz358 Date: Tue, 4 Feb 2025 00:33:39 +0000 Subject: [PATCH] Add kino and kino_qwen2_5_vl Update Support navit processor Add pixels and error handling Resize for minimal smart resize Add kino qwen2_5 Fix vdc 499 for azure Fix gpt model Revise kino logic for handling inputs Add error handling for vita Use qwen vl utils to process visual --- lmms_eval/models/__init__.py | 2 + lmms_eval/models/kino.py | 372 ++++++++++++++++++ lmms_eval/models/kino_qwen_2_5.py | 367 +++++++++++++++++ lmms_eval/models/vita.py | 76 ++-- .../tasks/air_bench/_default_template_yaml | 2 +- .../tasks/alpaca_audio/alpaca_audio.yaml | 2 +- .../tasks/clotho_aqa/_default_template_yaml | 2 +- .../llava-in-the-wild/llava-in-the-wild.yaml | 2 +- .../_default_template_wilder_yaml | 2 +- lmms_eval/tasks/mathvista/mathvista.yaml | 2 +- lmms_eval/tasks/mathvista/mathvista_evals.py | 42 +- lmms_eval/tasks/openhermes/openhermes.yaml | 2 +- .../_default_template_yaml | 2 +- .../tasks/video_detail_description/utils.py | 7 + lmms_eval/tasks/wavcaps/wavcaps.yaml | 2 +- .../tasks/worldqa/_default_template_yaml | 2 +- 16 files changed, 817 insertions(+), 69 deletions(-) create mode 100644 lmms_eval/models/kino.py create mode 100644 lmms_eval/models/kino_qwen_2_5.py diff --git a/lmms_eval/models/__init__.py b/lmms_eval/models/__init__.py index df4810974..1ba6bcb70 100644 --- a/lmms_eval/models/__init__.py +++ b/lmms_eval/models/__init__.py @@ -25,6 +25,8 @@ "internvideo2": "InternVideo2", "internvl": "InternVLChat", "internvl2": "InternVL2", + "kino": "Kino", + "kino_qwen_2_5": "KinoQwen2_5", "llama_vid": "LLaMAVid", "llama_vision": "LlamaVision", "llava": "Llava", diff --git a/lmms_eval/models/kino.py b/lmms_eval/models/kino.py new file mode 100644 index 000000000..8c6f6fc95 --- /dev/null +++ b/lmms_eval/models/kino.py @@ -0,0 +1,372 @@ +import os +import warnings +from typing import List, Optional, Tuple, Union + +import librosa +import 
moviepy as mp +import numpy as np +import PIL +import torch +from accelerate import Accelerator, DistributedType +from accelerate.state import AcceleratorState +from decord import VideoReader, cpu +from qwen_vl_utils import process_vision_info +from synvo_engine.models.kino import KinoForConditionalGeneration +from synvo_engine.models.kino.processing_kino import KinoProcessor +from tqdm import tqdm +from transformers import AutoConfig, AutoProcessor + +from lmms_eval import utils +from lmms_eval.api.instance import Instance +from lmms_eval.api.model import lmms +from lmms_eval.api.registry import register_model +from lmms_eval.models.model_utils.audio_processing import downsample_audio + +warnings.filterwarnings("ignore") + +from loguru import logger as eval_logger + +DEFAULT_IMAGE_TOKEN = "" +DEFAULT_VIDEO_TOKEN = "