Skip to content

Commit

Permalink
Merge pull request #3 from aigc-apps/fix/video_caption
Browse files Browse the repository at this point in the history
Fix video caption
  • Loading branch information
bubbliiiing authored May 26, 2024
2 parents c2a5eda + ac97760 commit 234c157
Show file tree
Hide file tree
Showing 3 changed files with 14 additions and 7 deletions.
5 changes: 4 additions & 1 deletion easyanimate/video_caption/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,8 @@ EasyAnimate uses multi-modal LLMs to generate captions for frames extracted from

# Install additional requirements for video caption.
cd easyanimate/video_caption && pip install -r requirements.txt

# We strongly recommend using Docker unless you can properly handle the dependencies between vllm and torch (CUDA).
```

## How to use
Expand All @@ -29,7 +31,7 @@ EasyAnimate uses multi-modal LLMs to generate captions for frames extracted from
```shell
CUDA_VISIBLE_DEVICES=0 python caption_video_frame.py \
--video_folder="your-video-folder/"
--frame_sample_method="extract_mid_frame" \
--frame_sample_method="mid" \
--num_sampled_frames=1 \
--image_caption_model_name="llava-v1.6-vicuna-7b" \
--image_caption_prompt="Please describe this image in detail." \
Expand All @@ -53,6 +55,7 @@ EasyAnimate uses multi-modal LLMs to generate captions for frames extracted from
2. Output the video description directly. Begin with 'In this video'. \
3. Limit the video description within 100 words. \
Here is the mid-frame description: " \
--output_dir="tmp" \
--saved_path="video_summary_caption.jsonl"
```

Expand Down
7 changes: 4 additions & 3 deletions easyanimate/video_caption/requirements.txt
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
--extra-index-url https://huggingface.github.io/autogptq-index/whl/cu118/
auto_gptq==0.6.0
pandas>=2.0.0
auto_gptq
vllm
sglang[srt]
vllm==0.3.3
sglang[srt]==0.1.13
func_timeout
9 changes: 6 additions & 3 deletions easyanimate/video_caption/utils/image_captioner.py
Original file line number Diff line number Diff line change
Expand Up @@ -90,10 +90,10 @@ def __call__(self, prompt: str, image: str):

class LLaVASRT:
def __init__(self, device: str = "cuda:0", quantized: bool = True):
runtime = sgl.Runtime(model_path="liuhaotian/llava-v1.6-vicuna-7b", tokenizer_path="llava-hf/llava-1.5-7b-hf")
sgl.set_default_backend(runtime)
self.runtime = sgl.Runtime(model_path="liuhaotian/llava-v1.6-vicuna-7b", tokenizer_path="llava-hf/llava-1.5-7b-hf")
sgl.set_default_backend(self.runtime)
logger.info(
f"Start the SGLang runtime for llava-v1.6-vicuna-7b with chat template: {runtime.endpoint.chat_template.name}. "
f"Start the SGLang runtime for llava-v1.6-vicuna-7b with chat template: {self.runtime.endpoint.chat_template.name}. "
"Input parameter device and quantized do not take effect."
)
if not os.path.exists(TMP_DIR):
Expand Down Expand Up @@ -135,6 +135,9 @@ def __call__(self, prompt: Union[str, List[str]], image: Union[str, Image.Image,
return [s["answer"] for s in state], state
else:
raise ValueError("Input prompt and image must be both strings or list of strings with the same length.")

def __del__(self):
self.runtime.shutdown()


if __name__ == "__main__":
Expand Down

0 comments on commit 234c157

Please sign in to comment.