From c95b4b165717e6e41ea8a1425c86748801642688 Mon Sep 17 00:00:00 2001 From: Xida Date: Fri, 7 Feb 2025 18:57:15 +0000 Subject: [PATCH 01/12] augment model_management.py to support tinyllama 25m --- .../integration_tests/llm/model_management.py | 116 +++++++++++++++++- 1 file changed, 114 insertions(+), 2 deletions(-) diff --git a/app_tests/integration_tests/llm/model_management.py b/app_tests/integration_tests/llm/model_management.py index f4660ae7a..fe81793e1 100644 --- a/app_tests/integration_tests/llm/model_management.py +++ b/app_tests/integration_tests/llm/model_management.py @@ -5,6 +5,7 @@ from dataclasses import dataclass from typing import Optional, Tuple, Dict from enum import Enum, auto +from huggingface_hub import snapshot_download from sharktank.utils.hf_datasets import Dataset, RemoteFile, get_dataset @@ -12,6 +13,8 @@ logger = logging.getLogger(__name__) +LLAMA_CPP_LOCATION = Path("/tmp/llama.cpp") + class AccuracyValidationException(RuntimeError): """Custom exception for accuracy validation failures.""" @@ -35,6 +38,7 @@ class ModelSource(Enum): HUGGINGFACE = auto() LOCAL = auto() AZURE = auto() + HUGGINGFACE_TO_GGUF_TO_IRPA = auto() @dataclass @@ -71,6 +75,11 @@ def __post_init__(self): raise ValueError("local_path required for local models") elif self.source == ModelSource.AZURE and not self.azure_config: raise ValueError("azure_config required for Azure models") + elif self.source == ModelSource.HUGGINGFACE_TO_GGUF_TO_IRPA: + if not self.repo_id: + raise ValueError( + "repo_id required for HUGGINGFACE_TO_GGUF_TO_IRPA models" + ) @dataclass @@ -108,6 +117,8 @@ def _get_model_dir(self) -> Path: / "azure" / self.config.azure_config.blob_path.replace("/", "_") ) + elif self.config.source == ModelSource.HUGGINGFACE_TO_GGUF_TO_IRPA: + return self.base_dir / self.config.repo_id.replace("/", "_") raise ValueError(f"Unsupported model source: {self.config.source}") def _download_from_huggingface(self) -> Path: @@ -143,12 +154,71 @@ def _download_from_huggingface(self) -> Path: return model_path + def _download_and_convert_from_huggingface(self) -> Path: + """Downloads model from HuggingFace and converts through GGUF to IRPA.""" + irpa_path = self.model_dir / "model.irpa" + + if not irpa_path.exists(): + logger.info( + f"Processing model {self.config.repo_id} from HuggingFace through GGUF to IRPA" + ) + + # Step 1: Download from HuggingFace + hf_model_path = self.model_dir / "model_hf_repo_clone" + if not hf_model_path.exists(): + logger.info( + f"Downloading model from HuggingFace: {self.config.repo_id}" + ) + snapshot_download( + repo_id=self.config.repo_id, + local_dir=hf_model_path, + local_dir_use_symlinks=False, + revision="main", + ) + + # Step 2: Convert to GGUF + gguf_path = self.model_dir / "model.gguf" + if not gguf_path.exists(): + logger.info("Converting model to GGUF format") + subprocess.run( + [ + "python", + LLAMA_CPP_LOCATION / "convert_hf_to_gguf.py", + hf_model_path, + "--outfile", + str(gguf_path), + "--outtype", + "f32", + ], + check=True, + ) + + # Step 3: Convert to IRPA + logger.info("Converting GGUF to IRPA format") + subprocess.run( + [ + "python", + "-m", + "sharktank.tools.dump_gguf", + f"--gguf-file={gguf_path}", + "--save", + str(irpa_path), + ], + check=True, + ) + + # Cleanup intermediate files if desired + # shutil.rmtree(hf_model_path) + # gguf_path.unlink() + + return irpa_path + def _copy_from_local(self) -> Path: """Copies model from local filesystem.""" - import shutil - model_path = self.model_dir / self.config.model_file if not 
model_path.exists(): + import shutil + logger.info(f"Copying local model from {self.config.local_path}") shutil.copy2(self.config.local_path, model_path) return model_path @@ -273,6 +343,8 @@ def process_model(self, config: ModelConfig) -> ModelArtifacts: weights_path = manager._copy_from_local() elif config.source == ModelSource.AZURE: weights_path = manager._download_from_azure() + elif config.source == ModelSource.HUGGINGFACE_TO_GGUF_TO_IRPA: + weights_path = manager._download_and_convert_from_huggingface() else: raise ValueError(f"Unsupported model source: {config.source}") @@ -323,4 +395,44 @@ def process_model(self, config: ModelConfig) -> ModelArtifacts: batch_sizes=(1, 4), device_settings=None, ), + "tiny_stories_direct": ModelConfig( + source=ModelSource.HUGGINGFACE_TO_GGUF_TO_IRPA, + repo_id="Mxode/TinyStories-LLaMA2-25M-256h-4l-GQA", + model_file="model.irpa", # This will be the final converted file name + tokenizer_id="Mxode/TinyStories-LLaMA2-25M-256h-4l-GQA", + batch_sizes=(1, 4), + device_settings=None, + ), } + + +# test like so +# from pathlib import Path +# import sys +# sys.path.append("/home/xidaren2/shark-ai") + + +# from app_tests.integration_tests.llm.model_management import ModelConfig, ModelSource, ModelProcessor +# from app_tests.integration_tests.llm.device_settings import GFX942 + +# # Setup base directory +# base_dir = Path("./model_artifacts") +# base_dir.mkdir(exist_ok=True) + +# # Configure model +# config = ModelConfig( +# source=ModelSource.HUGGINGFACE_TO_GGUF_TO_IRPA, +# repo_id="Mxode/TinyStories-LLaMA2-25M-256h-4l-GQA", +# model_file="model.irpa", +# tokenizer_id="Mxode/TinyStories-LLaMA2-25M-256h-4l-GQA", +# batch_sizes=(1, 4), +# device_settings=GFX942 +# ) + +# # Process model +# processor = ModelProcessor(base_dir) +# artifacts = processor.process_model(config) + +# # Print results +# print(f"Artifacts location: {base_dir}") +# print(f"VMFB path: {artifacts.vmfb_path}") From 1bed5083e7f4c727acfdc52af0888fec01ef8089 Mon Sep 17 00:00:00 2001 From: Cedar Date: Wed, 12 Feb 2025 11:08:35 -0800 Subject: [PATCH 02/12] clean this up to a working state --- .../integration_tests/llm/model_management.py | 147 ++++++++++++------ .../llm/shortfin/conftest.py | 2 +- .../shortfin/tinystories_llama2_25m_test.py | 121 ++++++++++++++ 3 files changed, 220 insertions(+), 50 deletions(-) create mode 100644 app_tests/integration_tests/llm/shortfin/tinystories_llama2_25m_test.py diff --git a/app_tests/integration_tests/llm/model_management.py b/app_tests/integration_tests/llm/model_management.py index fe81793e1..6ac8e0715 100644 --- a/app_tests/integration_tests/llm/model_management.py +++ b/app_tests/integration_tests/llm/model_management.py @@ -1,9 +1,12 @@ """Module for managing model artifacts through various processing stages.""" import logging +import tempfile +import zipfile +import urllib.request from pathlib import Path import subprocess from dataclasses import dataclass -from typing import Optional, Tuple, Dict +from typing import Optional, Tuple from enum import Enum, auto from huggingface_hub import snapshot_download @@ -13,7 +16,36 @@ logger = logging.getLogger(__name__) -LLAMA_CPP_LOCATION = Path("/tmp/llama.cpp") + +def get_llama_cpp_path() -> Path: + """Downloads and extracts llama.cpp if needed, returns path to installation.""" + # Use system temp directory as base + temp_base = Path(tempfile.gettempdir()) / "sharktank_llamacpp" + llama_cpp_dir = temp_base / "llama.cpp-b4696" + + # Only download and extract if not already present + if not 
llama_cpp_dir.exists(): + temp_base.mkdir(parents=True, exist_ok=True) + zip_path = temp_base / "llama.cpp.zip" + + # Download zip file + logger.info("Downloading llama.cpp...") + urllib.request.urlretrieve( + "https://github.com/ggerganov/llama.cpp/archive/refs/tags/b4696.zip", + zip_path, + ) + + # Extract zip file + logger.info("Extracting llama.cpp...") + with zipfile.ZipFile(zip_path, "r") as zip_ref: + zip_ref.extractall(temp_base) + + # Clean up zip file + zip_path.unlink() + + logger.info(f"llama.cpp installed at {llama_cpp_dir}") + + return llama_cpp_dir class AccuracyValidationException(RuntimeError): @@ -76,9 +108,9 @@ def __post_init__(self): elif self.source == ModelSource.AZURE and not self.azure_config: raise ValueError("azure_config required for Azure models") elif self.source == ModelSource.HUGGINGFACE_TO_GGUF_TO_IRPA: - if not self.repo_id: + if not self.dataset_name: raise ValueError( - "repo_id required for HUGGINGFACE_TO_GGUF_TO_IRPA models" + "dataset_name required for HUGGINGFACE_TO_GGUF_TO_IRPA models" ) @@ -118,7 +150,7 @@ def _get_model_dir(self) -> Path: / self.config.azure_config.blob_path.replace("/", "_") ) elif self.config.source == ModelSource.HUGGINGFACE_TO_GGUF_TO_IRPA: - return self.base_dir / self.config.repo_id.replace("/", "_") + return self.base_dir / self.config.dataset_name.replace("/", "_") raise ValueError(f"Unsupported model source: {self.config.source}") def _download_from_huggingface(self) -> Path: @@ -169,12 +201,8 @@ def _download_and_convert_from_huggingface(self) -> Path: logger.info( f"Downloading model from HuggingFace: {self.config.repo_id}" ) - snapshot_download( - repo_id=self.config.repo_id, - local_dir=hf_model_path, - local_dir_use_symlinks=False, - revision="main", - ) + dataset = get_dataset(self.config.dataset_name) + downloaded_files = dataset.download(local_dir=self.model_dir) # Step 2: Convert to GGUF gguf_path = self.model_dir / "model.gguf" @@ -183,8 +211,8 @@ def _download_and_convert_from_huggingface(self) -> Path: subprocess.run( [ "python", - LLAMA_CPP_LOCATION / "convert_hf_to_gguf.py", - hf_model_path, + get_llama_cpp_path() / "convert_hf_to_gguf.py", + self.model_dir, "--outfile", str(gguf_path), "--outtype", @@ -366,44 +394,65 @@ def process_model(self, config: ModelConfig) -> ModelArtifacts: ) -TEST_MODELS = { - "open_llama_3b": ModelConfig( - source=ModelSource.HUGGINGFACE, - repo_id="SlyEcho/open_llama_3b_v2_gguf", - model_file="open-llama-3b-v2-f16.gguf", - tokenizer_id="openlm-research/open_llama_3b_v2", - batch_sizes=(1, 4), - device_settings=None, +TEST_MODELS = {} + +TEST_MODELS["open_llama_3b"] = ModelConfig( + source=ModelSource.HUGGINGFACE, + repo_id="SlyEcho/open_llama_3b_v2_gguf", + model_file="open-llama-3b-v2-f16.gguf", + tokenizer_id="openlm-research/open_llama_3b_v2", + batch_sizes=(1, 4), + device_settings=None, +) + +TEST_MODELS["llama3.1_8b"] = ModelConfig( + source=ModelSource.HUGGINGFACE, + repo_id="SanctumAI/Meta-Llama-3.1-8B-Instruct-GGUF", + model_file="meta-llama-3.1-8b-instruct.f16.gguf", + tokenizer_id="NousResearch/Meta-Llama-3.1-8B", + batch_sizes=(1, 4), + device_settings=None, +) +TEST_MODELS[ + "azure_llama" +] = ModelConfig( # This model is currently unused. When you use it, check to make sure the irpa indeed still exist and remove this comment. 
+ source=ModelSource.AZURE, + azure_config=AzureConfig( + account_name="sharkblobs", + container_name="halo-models", + blob_path="llm-dev/llama3_8b/8b_f16.irpa", ), - "llama3.1_8b": ModelConfig( - source=ModelSource.HUGGINGFACE, - repo_id="SanctumAI/Meta-Llama-3.1-8B-Instruct-GGUF", - model_file="meta-llama-3.1-8b-instruct.f16.gguf", - tokenizer_id="NousResearch/Meta-Llama-3.1-8B", - batch_sizes=(1, 4), - device_settings=None, - ), - "azure_llama": ModelConfig( - source=ModelSource.AZURE, - azure_config=AzureConfig( - account_name="sharkblobs", - container_name="halo-models", - blob_path="llm-dev/llama3_8b/8b_f16.irpa", - ), - model_file="azure-llama.irpa", - tokenizer_id="openlm-research/open_llama_3b_v2", - batch_sizes=(1, 4), - device_settings=None, - ), - "tiny_stories_direct": ModelConfig( - source=ModelSource.HUGGINGFACE_TO_GGUF_TO_IRPA, - repo_id="Mxode/TinyStories-LLaMA2-25M-256h-4l-GQA", - model_file="model.irpa", # This will be the final converted file name - tokenizer_id="Mxode/TinyStories-LLaMA2-25M-256h-4l-GQA", - batch_sizes=(1, 4), - device_settings=None, + model_file="azure-llama.irpa", + tokenizer_id="openlm-research/open_llama_3b_v2", + batch_sizes=(1, 4), + device_settings=None, +) + +Dataset( + "Mxode/TinyStories-LLaMA2-25M-256h-4l-GQA", + ( + RemoteFile( + filename, + "Mxode/TinyStories-LLaMA2-25M-256h-4l-GQA", + filename, + ) + for filename in ( + "model.safetensors", + "tokenizer.json", + "tokenizer_config.json", + "config.json", + ) ), -} +) + +TEST_MODELS["tinystories_llama2_25m"] = ModelConfig( + source=ModelSource.HUGGINGFACE_TO_GGUF_TO_IRPA, + dataset_name="Mxode/TinyStories-LLaMA2-25M-256h-4l-GQA", + model_file="model.irpa", # This will be the final converted file name + tokenizer_id="Mxode/TinyStories-LLaMA2-25M-256h-4l-GQA", + batch_sizes=(1, 4), + device_settings=None, +) # test like so diff --git a/app_tests/integration_tests/llm/shortfin/conftest.py b/app_tests/integration_tests/llm/shortfin/conftest.py index 90b2d37a7..7996a1169 100644 --- a/app_tests/integration_tests/llm/shortfin/conftest.py +++ b/app_tests/integration_tests/llm/shortfin/conftest.py @@ -33,7 +33,7 @@ def test_device(request): return ret -@pytest.fixture(scope="module") +@pytest.fixture(scope="session") def model_artifacts(tmp_path_factory, request, test_device): """Prepares model artifacts in a cached directory.""" model_config = TEST_MODELS[request.param] diff --git a/app_tests/integration_tests/llm/shortfin/tinystories_llama2_25m_test.py b/app_tests/integration_tests/llm/shortfin/tinystories_llama2_25m_test.py new file mode 100644 index 000000000..33882e8f6 --- /dev/null +++ b/app_tests/integration_tests/llm/shortfin/tinystories_llama2_25m_test.py @@ -0,0 +1,121 @@ +""" +Simple smoke tests to: +- ensure the full fastapi server works +- ensure the smoke test model works so we know it's not a model issue when another test using this model fails. +""" + +from concurrent.futures import ThreadPoolExecutor, as_completed +import logging +import pytest +import requests +from typing import Dict, Any +import uuid + +logger = logging.getLogger(__name__) + +from ..model_management import AccuracyValidationException + + +pytestmark = pytest.mark.parametrize( + "model_artifacts,server", + [ + ["tinystories_llama2_25m", {"prefix_sharing": "none"}], + ], + indirect=True, +) + + +class TestLLMServer: + """Test suite for LLM server functionality.""" + + def test_basic_generation(self, server: tuple[Any, int]) -> None: + """Tests basic text generation capabilities. 
+ + Args: + server: Tuple of (process, port) from server fixture + """ + process, port = server + assert process.poll() is None, "Server process terminated unexpectedly" + expected_prefix = "to" + response = self._generate( + "Alice was so tired when she got back home so she went ", port + ) + if not expected_prefix in response: + raise AccuracyValidationException( + expected=f"{expected_prefix}...", + actual=response, + message=f"Generation did not match expected pattern.\nExpected to start with: {expected_prefix}\nActual response: {response}", + ) + + @pytest.mark.parametrize("concurrent_requests", [2, 4, 8]) + def test_concurrent_generation( + self, server: tuple[Any, int], concurrent_requests: int + ) -> None: + """Tests concurrent text generation requests. + + Args: + server: Tuple of (process, port) from server fixture + concurrent_requests: Number of concurrent requests to test + """ + process, port = server + assert process.poll() is None, "Server process terminated unexpectedly" + + prompt = "Alice was so tired when she got back home so she went " + expected_prefix = "to" + + with ThreadPoolExecutor(max_workers=concurrent_requests) as executor: + futures = [ + executor.submit(self._generate, prompt, port) + for _ in range(concurrent_requests) + ] + + for future in as_completed(futures): + response = future.result() + if not response.startswith(expected_prefix): + raise AccuracyValidationException( + expected=f"{expected_prefix}...", + actual=response, + message=f"Concurrent generation did not match expected pattern.\nExpected to start with: {expected_prefix}\nActual response: {response}", + ) + + def _generate(self, prompt: str | list[int], port: int, input_ids=False) -> str: + """Helper method to make generation request to server. + + Args: + prompt: Input text prompt + port: Server port number + + Returns: + Generated text response + + Raises: + requests.exceptions.RequestException: If request fails + AccuracyValidationException: If response format is invalid + """ + payload = { + "sampling_params": {"max_completion_tokens": 15, "temperature": 0.7}, + "rid": uuid.uuid4().hex, + "stream": False, + } + if input_ids: + payload["input_ids"] = prompt + else: + payload["text"] = prompt + response = requests.post( + f"http://localhost:{port}/generate", + headers={"Content-Type": "application/json"}, + json=payload, + timeout=30, # Add reasonable timeout + ) + response.raise_for_status() + + # Parse and validate streaming response format + data = response.text + if not data.startswith("data: "): + raise AccuracyValidationException( + expected="Response starting with 'data: '", + actual=data, + message=f"Invalid response format.\nExpected format starting with 'data: '\nActual response: {data}", + ) + + return data[6:].rstrip("\n") From e04bbf8603e96781f5882b4b985017404901fb31 Mon Sep 17 00:00:00 2001 From: Cedar Date: Wed, 12 Feb 2025 11:10:19 -0800 Subject: [PATCH 03/12] comments on usage example --- app_tests/integration_tests/llm/model_management.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/app_tests/integration_tests/llm/model_management.py b/app_tests/integration_tests/llm/model_management.py index 6ac8e0715..bccc936a3 100644 --- a/app_tests/integration_tests/llm/model_management.py +++ b/app_tests/integration_tests/llm/model_management.py @@ -455,7 +455,8 @@ def process_model(self, config: ModelConfig) -> ModelArtifacts: ) -# test like so +# Usage example +# This uses a small model that takes roughly 2 minutes to run # from pathlib import Path # import sys 
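# Illustrative sketch (an editorial aside, not one of the patches): a minimal
# standalone client for the shortfin /generate endpoint, mirroring the
# _generate() helper added in tinystories_llama2_25m_test.py above. The port
# number and prompt below are placeholder assumptions.
# import uuid
# import requests
#
# port = 8000  # assumption: wherever the shortfin server is listening
# payload = {
#     "text": "Once upon a time",
#     "sampling_params": {"max_completion_tokens": 15, "temperature": 0.7},
#     "rid": uuid.uuid4().hex,
#     "stream": False,
# }
# resp = requests.post(
#     f"http://localhost:{port}/generate", json=payload, timeout=30
# )
# resp.raise_for_status()
# # Non-streaming responses are prefixed with "data: "; strip it as the test does.
# print(resp.text[6:].rstrip("\n"))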
# sys.path.append("/home/xidaren2/shark-ai") From 7b7db314fce0e4b7de589c8fb18dae2f491827cd Mon Sep 17 00:00:00 2001 From: Cedar Date: Wed, 12 Feb 2025 11:10:52 -0800 Subject: [PATCH 04/12] remove redundant usage examples comment --- .../integration_tests/llm/model_management.py | 33 ------------------- 1 file changed, 33 deletions(-) diff --git a/app_tests/integration_tests/llm/model_management.py b/app_tests/integration_tests/llm/model_management.py index bccc936a3..5904e4c54 100644 --- a/app_tests/integration_tests/llm/model_management.py +++ b/app_tests/integration_tests/llm/model_management.py @@ -453,36 +453,3 @@ def process_model(self, config: ModelConfig) -> ModelArtifacts: batch_sizes=(1, 4), device_settings=None, ) - - -# Usage example -# This uses a small model that takes roughly 2 minutes to run -# from pathlib import Path -# import sys -# sys.path.append("/home/xidaren2/shark-ai") - - -# from app_tests.integration_tests.llm.model_management import ModelConfig, ModelSource, ModelProcessor -# from app_tests.integration_tests.llm.device_settings import GFX942 - -# # Setup base directory -# base_dir = Path("./model_artifacts") -# base_dir.mkdir(exist_ok=True) - -# # Configure model -# config = ModelConfig( -# source=ModelSource.HUGGINGFACE_TO_GGUF_TO_IRPA, -# repo_id="Mxode/TinyStories-LLaMA2-25M-256h-4l-GQA", -# model_file="model.irpa", -# tokenizer_id="Mxode/TinyStories-LLaMA2-25M-256h-4l-GQA", -# batch_sizes=(1, 4), -# device_settings=GFX942 -# ) - -# # Process model -# processor = ModelProcessor(base_dir) -# artifacts = processor.process_model(config) - -# # Print results -# print(f"Artifacts location: {base_dir}") -# print(f"VMFB path: {artifacts.vmfb_path}") From d960fa7dc13ca423faae14d3318af508af248bd1 Mon Sep 17 00:00:00 2001 From: Cedar Date: Wed, 12 Feb 2025 12:03:59 -0800 Subject: [PATCH 05/12] add golden prompts from model run using hf and add new test to CI before integration test --- .github/workflows/pkgci_shark_ai.yml | 5 +++++ .../llm/shortfin/tinystories_llama2_25m_test.py | 16 ++++++++++------ 2 files changed, 15 insertions(+), 6 deletions(-) diff --git a/.github/workflows/pkgci_shark_ai.yml b/.github/workflows/pkgci_shark_ai.yml index a7ea94137..04d2a0ff8 100644 --- a/.github/workflows/pkgci_shark_ai.yml +++ b/.github/workflows/pkgci_shark_ai.yml @@ -88,6 +88,11 @@ jobs: source ${VENV_DIR}/bin/activate uv pip install -r requirements-iree-pinned.txt + - name: Run LLM Smoke Test + run: | + source ${VENV_DIR}/bin/activate + pytest -v -s --test_device=${{ matrix.test_device }} app_tests/integration_tests/llm/shortfin/tinystories_llama2_25m_test.py --log-cli-level=INFO + - name: Run LLM Integration Tests run: | source ${VENV_DIR}/bin/activate diff --git a/app_tests/integration_tests/llm/shortfin/tinystories_llama2_25m_test.py b/app_tests/integration_tests/llm/shortfin/tinystories_llama2_25m_test.py index 33882e8f6..a32a71202 100644 --- a/app_tests/integration_tests/llm/shortfin/tinystories_llama2_25m_test.py +++ b/app_tests/integration_tests/llm/shortfin/tinystories_llama2_25m_test.py @@ -25,6 +25,11 @@ ) +# goldens are generated in: https://colab.research.google.com/drive/1pFiyvyIxk1RsHnw5gTk_gu9QiQNy9gfW?usp=sharing +GOLDEN_PROMPT = "Once upon a time" +GOLDEN_RESPONSE = ", there was a little girl named Lily." 
+ + class TestLLMServer: """Test suite for LLM server functionality.""" @@ -36,10 +41,9 @@ def test_basic_generation(self, server: tuple[Any, int]) -> None: """ process, port = server assert process.poll() is None, "Server process terminated unexpectedly" - expected_prefix = "to" - response = self._generate( - "Alice was so tired when she got back home so she went ", port - ) + prompt = GOLDEN_PROMPT + expected_prefix = GOLDEN_RESPONSE + response = self._generate(prompt, port) if not expected_prefix in response: raise AccuracyValidationException( expected=f"{expected_prefix}...", @@ -60,8 +64,8 @@ def test_concurrent_generation( process, port = server assert process.poll() is None, "Server process terminated unexpectedly" - prompt = "Alice was so tired when she got back home so she went " - expected_prefix = "to" + prompt = GOLDEN_PROMPT + expected_prefix = GOLDEN_RESPONSE with ThreadPoolExecutor(max_workers=concurrent_requests) as executor: futures = [ From 7ae99ba0c1f413f6dd7c8fd8950856c6415a0e27 Mon Sep 17 00:00:00 2001 From: Cedar Date: Thu, 13 Feb 2025 09:25:51 -0800 Subject: [PATCH 06/12] address code quality review comments --- .../llm/sglang_benchmarks/conftest.py | 2 +- .../integration_tests/llm/model_management.py | 40 +++++++++---------- .../integration_tests/llm/sglang/conftest.py | 2 +- 3 files changed, 21 insertions(+), 23 deletions(-) diff --git a/app_tests/benchmark_tests/llm/sglang_benchmarks/conftest.py b/app_tests/benchmark_tests/llm/sglang_benchmarks/conftest.py index 15aede1df..7cf2ccfe2 100644 --- a/app_tests/benchmark_tests/llm/sglang_benchmarks/conftest.py +++ b/app_tests/benchmark_tests/llm/sglang_benchmarks/conftest.py @@ -33,7 +33,7 @@ # we can replace this with an import after #890 merges TEST_MODELS = { "llama3.1_8b": ModelConfig( - source=ModelSource.HUGGINGFACE, + source=ModelSource.HUGGINGFACE_FROM_GGUF, repo_id="SanctumAI/Meta-Llama-3.1-8B-Instruct-GGUF", model_file="meta-llama-3.1-8b-instruct.f16.gguf", tokenizer_id="NousResearch/Meta-Llama-3.1-8B", diff --git a/app_tests/integration_tests/llm/model_management.py b/app_tests/integration_tests/llm/model_management.py index 5904e4c54..6516c8d1e 100644 --- a/app_tests/integration_tests/llm/model_management.py +++ b/app_tests/integration_tests/llm/model_management.py @@ -67,10 +67,10 @@ def __init__( class ModelSource(Enum): - HUGGINGFACE = auto() + HUGGINGFACE_FROM_GGUF = auto() LOCAL = auto() AZURE = auto() - HUGGINGFACE_TO_GGUF_TO_IRPA = auto() + HUGGINGFACE_FROM_SAFETENSORS = auto() @dataclass @@ -98,7 +98,7 @@ class ModelConfig: azure_config: Optional[AzureConfig] = None def __post_init__(self): - if self.source == ModelSource.HUGGINGFACE: + if self.source == ModelSource.HUGGINGFACE_FROM_GGUF: if not (self.dataset_name or self.repo_id): raise ValueError( "Either dataset_name or repo_id required for HuggingFace models" @@ -107,10 +107,10 @@ def __post_init__(self): raise ValueError("local_path required for local models") elif self.source == ModelSource.AZURE and not self.azure_config: raise ValueError("azure_config required for Azure models") - elif self.source == ModelSource.HUGGINGFACE_TO_GGUF_TO_IRPA: + elif self.source == ModelSource.HUGGINGFACE_FROM_SAFETENSORS: if not self.dataset_name: raise ValueError( - "dataset_name required for HUGGINGFACE_TO_GGUF_TO_IRPA models" + "dataset_name required for HUGGINGFACE_FROM_SAFETENSORS models" ) @@ -137,7 +137,7 @@ def __init__(self, base_dir: Path, config: ModelConfig): def _get_model_dir(self) -> Path: """Creates and returns appropriate model directory based 
on source.""" - if self.config.source == ModelSource.HUGGINGFACE: + if self.config.source == ModelSource.HUGGINGFACE_FROM_GGUF: if self.config.dataset_name: return self.base_dir / self.config.dataset_name.replace("/", "_") return self.base_dir / self.config.repo_id.replace("/", "_") @@ -149,7 +149,7 @@ def _get_model_dir(self) -> Path: / "azure" / self.config.azure_config.blob_path.replace("/", "_") ) - elif self.config.source == ModelSource.HUGGINGFACE_TO_GGUF_TO_IRPA: + elif self.config.source == ModelSource.HUGGINGFACE_FROM_SAFETENSORS: return self.base_dir / self.config.dataset_name.replace("/", "_") raise ValueError(f"Unsupported model source: {self.config.source}") @@ -365,13 +365,13 @@ def process_model(self, config: ModelConfig) -> ModelArtifacts: manager = ModelStageManager(self.base_dir, config) # Stage 1: Download weights and tokenizer (cached) - if config.source == ModelSource.HUGGINGFACE: + if config.source == ModelSource.HUGGINGFACE_FROM_GGUF: weights_path = manager._download_from_huggingface() elif config.source == ModelSource.LOCAL: weights_path = manager._copy_from_local() elif config.source == ModelSource.AZURE: weights_path = manager._download_from_azure() - elif config.source == ModelSource.HUGGINGFACE_TO_GGUF_TO_IRPA: + elif config.source == ModelSource.HUGGINGFACE_FROM_SAFETENSORS: weights_path = manager._download_and_convert_from_huggingface() else: raise ValueError(f"Unsupported model source: {config.source}") @@ -397,7 +397,7 @@ def process_model(self, config: ModelConfig) -> ModelArtifacts: TEST_MODELS = {} TEST_MODELS["open_llama_3b"] = ModelConfig( - source=ModelSource.HUGGINGFACE, + source=ModelSource.HUGGINGFACE_FROM_GGUF, repo_id="SlyEcho/open_llama_3b_v2_gguf", model_file="open-llama-3b-v2-f16.gguf", tokenizer_id="openlm-research/open_llama_3b_v2", @@ -406,7 +406,7 @@ def process_model(self, config: ModelConfig) -> ModelArtifacts: ) TEST_MODELS["llama3.1_8b"] = ModelConfig( - source=ModelSource.HUGGINGFACE, + source=ModelSource.HUGGINGFACE_FROM_GGUF, repo_id="SanctumAI/Meta-Llama-3.1-8B-Instruct-GGUF", model_file="meta-llama-3.1-8b-instruct.f16.gguf", tokenizer_id="NousResearch/Meta-Llama-3.1-8B", @@ -432,21 +432,19 @@ def process_model(self, config: ModelConfig) -> ModelArtifacts: "Mxode/TinyStories-LLaMA2-25M-256h-4l-GQA", ( RemoteFile( - filename, - "Mxode/TinyStories-LLaMA2-25M-256h-4l-GQA", - filename, - ) - for filename in ( - "model.safetensors", - "tokenizer.json", - "tokenizer_config.json", - "config.json", + file_id = "model.safetensors", + repo_id = "Mxode/TinyStories-LLaMA2-25M-256h-4l-GQA", + extra_filenames = ( + "config.json", + "tokenizer.json", + "tokenizer_config.json", + ), ) ), ) TEST_MODELS["tinystories_llama2_25m"] = ModelConfig( - source=ModelSource.HUGGINGFACE_TO_GGUF_TO_IRPA, + source=ModelSource.HUGGINGFACE_FROM_SAFETENSORS, dataset_name="Mxode/TinyStories-LLaMA2-25M-256h-4l-GQA", model_file="model.irpa", # This will be the final converted file name tokenizer_id="Mxode/TinyStories-LLaMA2-25M-256h-4l-GQA", diff --git a/app_tests/integration_tests/llm/sglang/conftest.py b/app_tests/integration_tests/llm/sglang/conftest.py index 40294268e..b1741fb7d 100644 --- a/app_tests/integration_tests/llm/sglang/conftest.py +++ b/app_tests/integration_tests/llm/sglang/conftest.py @@ -40,7 +40,7 @@ def model_artifacts(request, tmp_path_factory): tmp_dir = tmp_path_factory.mktemp("sglang_integration_tests") model_config = ModelConfig( - source=ModelSource.HUGGINGFACE, + source=ModelSource.HUGGINGFACE_FROM_GGUF, 
repo_id="SanctumAI/Meta-Llama-3.1-8B-Instruct-GGUF", model_file="meta-llama-3.1-8b-instruct.f16.gguf", tokenizer_id="NousResearch/Meta-Llama-3.1-8B", From 36a914a2e2a7137e6ab7b16d39f66675b4525eaf Mon Sep 17 00:00:00 2001 From: Cedar Date: Thu, 13 Feb 2025 09:29:18 -0800 Subject: [PATCH 07/12] add todo comment to upstream TinyStories-LLaMA2 to sharktank --- app_tests/integration_tests/llm/model_management.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/app_tests/integration_tests/llm/model_management.py b/app_tests/integration_tests/llm/model_management.py index 6516c8d1e..295f47ec1 100644 --- a/app_tests/integration_tests/llm/model_management.py +++ b/app_tests/integration_tests/llm/model_management.py @@ -8,7 +8,6 @@ from dataclasses import dataclass from typing import Optional, Tuple from enum import Enum, auto -from huggingface_hub import snapshot_download from sharktank.utils.hf_datasets import Dataset, RemoteFile, get_dataset @@ -428,6 +427,7 @@ def process_model(self, config: ModelConfig) -> ModelArtifacts: device_settings=None, ) +# TODO: upstream this to sharktank Dataset( "Mxode/TinyStories-LLaMA2-25M-256h-4l-GQA", ( From aca05738d0b9759bd2677308684147df32130615 Mon Sep 17 00:00:00 2001 From: Cedar Date: Fri, 14 Feb 2025 08:57:36 -0800 Subject: [PATCH 08/12] correct missing filenames arg to RemoteFile --- app_tests/integration_tests/llm/model_management.py | 1 + 1 file changed, 1 insertion(+) diff --git a/app_tests/integration_tests/llm/model_management.py b/app_tests/integration_tests/llm/model_management.py index 295f47ec1..e272ab41c 100644 --- a/app_tests/integration_tests/llm/model_management.py +++ b/app_tests/integration_tests/llm/model_management.py @@ -433,6 +433,7 @@ def process_model(self, config: ModelConfig) -> ModelArtifacts: ( RemoteFile( file_id = "model.safetensors", + filename= "model.safetensors", repo_id = "Mxode/TinyStories-LLaMA2-25M-256h-4l-GQA", extra_filenames = ( "config.json", From 6ba3c1e419ef8c37f3f0018f3c153694fb6e4909 Mon Sep 17 00:00:00 2001 From: Cedar Date: Fri, 14 Feb 2025 09:17:18 -0800 Subject: [PATCH 09/12] fix another remote file interface problem --- app_tests/integration_tests/llm/model_management.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/app_tests/integration_tests/llm/model_management.py b/app_tests/integration_tests/llm/model_management.py index e272ab41c..a0e9248b5 100644 --- a/app_tests/integration_tests/llm/model_management.py +++ b/app_tests/integration_tests/llm/model_management.py @@ -430,7 +430,7 @@ def process_model(self, config: ModelConfig) -> ModelArtifacts: # TODO: upstream this to sharktank Dataset( "Mxode/TinyStories-LLaMA2-25M-256h-4l-GQA", - ( + files = [ RemoteFile( file_id = "model.safetensors", filename= "model.safetensors", @@ -440,8 +440,8 @@ def process_model(self, config: ModelConfig) -> ModelArtifacts: "tokenizer.json", "tokenizer_config.json", ), - ) - ), + ), + ] ) TEST_MODELS["tinystories_llama2_25m"] = ModelConfig( From 716c9c0acc5d993f5c50da478656b5196277f1a8 Mon Sep 17 00:00:00 2001 From: Cedar Date: Fri, 14 Feb 2025 10:31:35 -0800 Subject: [PATCH 10/12] whitespace fix --- app_tests/integration_tests/llm/model_management.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/app_tests/integration_tests/llm/model_management.py b/app_tests/integration_tests/llm/model_management.py index a0e9248b5..0b51d45fb 100644 --- a/app_tests/integration_tests/llm/model_management.py +++ b/app_tests/integration_tests/llm/model_management.py @@ 
-430,18 +430,18 @@ def process_model(self, config: ModelConfig) -> ModelArtifacts: # TODO: upstream this to sharktank Dataset( "Mxode/TinyStories-LLaMA2-25M-256h-4l-GQA", - files = [ + files=[ RemoteFile( - file_id = "model.safetensors", - filename= "model.safetensors", - repo_id = "Mxode/TinyStories-LLaMA2-25M-256h-4l-GQA", - extra_filenames = ( + file_id="model.safetensors", + filename="model.safetensors", + repo_id="Mxode/TinyStories-LLaMA2-25M-256h-4l-GQA", + extra_filenames=( "config.json", "tokenizer.json", "tokenizer_config.json", ), ), - ] + ], ) TEST_MODELS["tinystories_llama2_25m"] = ModelConfig( From 831b2f4a9e22900c8e16bceadbc58f36031d5de3 Mon Sep 17 00:00:00 2001 From: Cedar Date: Fri, 14 Feb 2025 11:10:03 -0800 Subject: [PATCH 11/12] logging improvements --- app_tests/integration_tests/llm/model_management.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/app_tests/integration_tests/llm/model_management.py b/app_tests/integration_tests/llm/model_management.py index 0b51d45fb..3bc7d8a95 100644 --- a/app_tests/integration_tests/llm/model_management.py +++ b/app_tests/integration_tests/llm/model_management.py @@ -191,14 +191,14 @@ def _download_and_convert_from_huggingface(self) -> Path: if not irpa_path.exists(): logger.info( - f"Processing model {self.config.repo_id} from HuggingFace through GGUF to IRPA" + f"Processing model `{self.config.dataset_name}` from HuggingFace through GGUF to IRPA" ) # Step 1: Download from HuggingFace hf_model_path = self.model_dir / "model_hf_repo_clone" if not hf_model_path.exists(): logger.info( - f"Downloading model from HuggingFace: {self.config.repo_id}" + f"Downloading model from HuggingFace: `{self.config.dataset_name}`" ) dataset = get_dataset(self.config.dataset_name) downloaded_files = dataset.download(local_dir=self.model_dir) From 793bb675b9d8ee371d2a287d178df8938de59c90 Mon Sep 17 00:00:00 2001 From: Cedar Date: Fri, 14 Feb 2025 11:15:43 -0800 Subject: [PATCH 12/12] add note about golden response assuming greedy search & determinism --- .../llm/shortfin/tinystories_llama2_25m_test.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/app_tests/integration_tests/llm/shortfin/tinystories_llama2_25m_test.py b/app_tests/integration_tests/llm/shortfin/tinystories_llama2_25m_test.py index a32a71202..9c51f7866 100644 --- a/app_tests/integration_tests/llm/shortfin/tinystories_llama2_25m_test.py +++ b/app_tests/integration_tests/llm/shortfin/tinystories_llama2_25m_test.py @@ -27,7 +27,7 @@ # goldens are generated in: https://colab.research.google.com/drive/1pFiyvyIxk1RsHnw5gTk_gu9QiQNy9gfW?usp=sharing GOLDEN_PROMPT = "Once upon a time" -GOLDEN_RESPONSE = ", there was a little girl named Lily." +GOLDEN_RESPONSE = ", there was a little girl named Lily." # this assumes purely deterministic greedy search class TestLLMServer:
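# End-to-end usage sketch for the pipeline built up in this series
# (illustrative only; assumes shark-ai is importable and that device_settings
# exposes a GFX942 entry, as in the commented example removed by PATCH 04 --
# the paths below are placeholders).
# from pathlib import Path
# from app_tests.integration_tests.llm.model_management import (
#     ModelConfig,
#     ModelProcessor,
#     ModelSource,
# )
# from app_tests.integration_tests.llm.device_settings import GFX942
#
# base_dir = Path("./model_artifacts")
# base_dir.mkdir(exist_ok=True)
#
# config = ModelConfig(
#     source=ModelSource.HUGGINGFACE_FROM_SAFETENSORS,
#     dataset_name="Mxode/TinyStories-LLaMA2-25M-256h-4l-GQA",
#     model_file="model.irpa",
#     tokenizer_id="Mxode/TinyStories-LLaMA2-25M-256h-4l-GQA",
#     batch_sizes=(1, 4),
#     device_settings=GFX942,
# )
#
# # Downloads the safetensors checkpoint, converts it to GGUF via llama.cpp,
# # dumps it to IRPA with sharktank.tools.dump_gguf, then produces the
# # artifacts consumed by the server tests.
# artifacts = ModelProcessor(base_dir).process_model(config)
# print(f"VMFB path: {artifacts.vmfb_path}")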