-
-
Notifications
You must be signed in to change notification settings - Fork 70
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
* add video chat * add cli chat * update chat_ui * fix tests * add video chat * add cli chat * update chat_ui * fix qwen2_5_vl video processing * fix llava video processing * update stream_gen and genereate * refactor prompt utils to support video * update video generate to support qwen and others * revert llava bunny and next (no video support) * fix qwen prompt * bump version * update notebook and deps * Update README.md * Update README.md * update deps
- Loading branch information
Showing
16 changed files
with
1,268 additions
and
112 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,204 @@ | ||
{ | ||
"cells": [ | ||
{ | ||
"cell_type": "markdown", | ||
"metadata": {}, | ||
"source": [ | ||
"# Video Understanding\n", | ||
"\n", | ||
"In this example, we will generate a description of a video using `Qwen2-VL`, `Qwen2-5-VL`, `LLava`, and `Idefics3`, with more models coming soon.\n", | ||
"\n", | ||
"This feature is currently in beta, may not work as expected.\n", | ||
"\n" | ||
] | ||
}, | ||
{ | ||
"cell_type": "markdown", | ||
"metadata": {}, | ||
"source": [ | ||
"## Install Dependencies" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"!pip install -U mlx-vlm" | ||
] | ||
}, | ||
{ | ||
"cell_type": "markdown", | ||
"metadata": {}, | ||
"source": [ | ||
"## Import Dependencies" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 1, | ||
"metadata": {}, | ||
"outputs": [ | ||
{ | ||
"name": "stderr", | ||
"output_type": "stream", | ||
"text": [ | ||
"/opt/homebrew/Caskroom/miniconda/base/envs/mlx_code/lib/python3.11/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n", | ||
" from .autonotebook import tqdm as notebook_tqdm\n", | ||
"This is a beta version of the video understanding. It may not work as expected.\n" | ||
] | ||
} | ||
], | ||
"source": [ | ||
"from pprint import pprint\n", | ||
"from mlx_vlm import load\n", | ||
"from mlx_vlm.utils import generate\n", | ||
"from mlx_vlm.video_generate import process_vision_info\n", | ||
"\n", | ||
"import mlx.core as mx" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"# Load the model and processor\n", | ||
"model, processor = load(\"mlx-community/Qwen2.5-VL-7B-Instruct-4bit\")" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 3, | ||
"metadata": {}, | ||
"outputs": [ | ||
{ | ||
"name": "stderr", | ||
"output_type": "stream", | ||
"text": [ | ||
"numpy reader: video_path=videos/fastmlx_local_ai_hub.mp4, total_frames=1134, video_fps=59.941855343141576, time=0.000s\n" | ||
] | ||
} | ||
], | ||
"source": [ | ||
"# Messages containing a video and a text query\n", | ||
"messages = [\n", | ||
" {\n", | ||
" \"role\": \"user\",\n", | ||
" \"content\": [\n", | ||
" {\n", | ||
" \"type\": \"video\",\n", | ||
" \"video\": \"videos/fastmlx_local_ai_hub.mp4\",\n", | ||
" \"max_pixels\": 360 * 360,\n", | ||
" \"fps\": 1.0,\n", | ||
" },\n", | ||
" {\"type\": \"text\", \"text\": \"Describe this video.\"},\n", | ||
" ],\n", | ||
" }\n", | ||
"]\n", | ||
"\n", | ||
"# Preparation for inference\n", | ||
"text = processor.apply_chat_template(\n", | ||
" messages, tokenize=False, add_generation_prompt=True\n", | ||
")\n", | ||
"image_inputs, video_inputs = process_vision_info(messages)\n", | ||
"inputs = processor(\n", | ||
" text=[text],\n", | ||
" images=image_inputs,\n", | ||
" videos=video_inputs,\n", | ||
" padding=True,\n", | ||
" return_tensors=\"pt\",\n", | ||
")\n" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 4, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"# Convert inputs to mlx arrays\n", | ||
"input_ids = mx.array(inputs['input_ids'])\n", | ||
"pixel_values = mx.array(inputs['pixel_values_videos'])\n", | ||
"mask = mx.array(inputs['attention_mask'])\n", | ||
"image_grid_thw = mx.array(inputs['video_grid_thw'])\n", | ||
"\n", | ||
"kwargs = {\n", | ||
" \"image_grid_thw\": image_grid_thw,\n", | ||
"}" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 9, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"kwargs[\"video\"] = \"videos/fastmlx_local_ai_hub.mp4\"\n", | ||
"kwargs[\"input_ids\"] = input_ids\n", | ||
"kwargs[\"pixel_values\"] = pixel_values\n", | ||
"kwargs[\"mask\"] = mask\n", | ||
"response = generate(model, processor, prompt=text, temp=0.7, max_tokens=100, **kwargs)" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 10, | ||
"metadata": {}, | ||
"outputs": [ | ||
{ | ||
"name": "stdout", | ||
"output_type": "stream", | ||
"text": [ | ||
"('The video appears to be a live stream or a recording of a coding session, '\n", | ||
" 'likely on a platform like Discord, as indicated by the presence of text '\n", | ||
" \"chats and a streamer's interface. The video is primarily focused on a \"\n", | ||
" 'computer screen displaying a code editor with various programming languages '\n", | ||
" 'and snippets of code. The coder seems to be explaining or demonstrating '\n", | ||
" 'something related to the code, possibly working through a programming '\n", | ||
" 'problem, explaining the logic, or showing the process of solving a problem.\\n'\n", | ||
" '\\n'\n", | ||
" 'Here are some key observations from')\n" | ||
] | ||
} | ||
], | ||
"source": [ | ||
"pprint(response)\n" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"# open video and play it\n", | ||
"from ipywidgets import Video\n", | ||
"Video.from_file(\"videos/fastmlx_local_ai_hub.mp4\", width=320, height=240)" | ||
] | ||
} | ||
], | ||
"metadata": { | ||
"kernelspec": { | ||
"display_name": "mlx_code", | ||
"language": "python", | ||
"name": "python3" | ||
}, | ||
"language_info": { | ||
"codemirror_mode": { | ||
"name": "ipython", | ||
"version": 3 | ||
}, | ||
"file_extension": ".py", | ||
"mimetype": "text/x-python", | ||
"name": "python", | ||
"nbconvert_exporter": "python", | ||
"pygments_lexer": "ipython3", | ||
"version": "3.11.10" | ||
} | ||
}, | ||
"nbformat": 4, | ||
"nbformat_minor": 2 | ||
} |
Binary file not shown.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.