diff --git a/src/sagemaker/serve/builder/model_builder.py b/src/sagemaker/serve/builder/model_builder.py index 4d1e51cb26..473097855b 100644 --- a/src/sagemaker/serve/builder/model_builder.py +++ b/src/sagemaker/serve/builder/model_builder.py @@ -20,7 +20,6 @@ from pathlib import Path -from accelerate.commands.estimate import estimate_command_parser, gather_data from sagemaker import Session from sagemaker.model import Model from sagemaker.base_predictor import PredictorBase @@ -43,7 +42,11 @@ from sagemaker.serve.utils import task from sagemaker.serve.utils.exceptions import TaskNotFoundException from sagemaker.serve.utils.predictors import _get_local_mode_predictor -from sagemaker.serve.utils.hardware_detector import _get_gpu_info, _get_gpu_info_fallback +from sagemaker.serve.utils.hardware_detector import ( + _get_gpu_info, + _get_gpu_info_fallback, + _total_inference_model_size_mib, +) from sagemaker.serve.detector.image_detector import ( auto_detect_container, _detect_framework_and_version, @@ -70,11 +73,8 @@ ModelServer.DJL_SERVING, } -MIB_CONVERSION_FACTOR = 0.00000095367431640625 -MEMORY_BUFFER_MULTIPLIER = 1.2 # 20% buffer - -# pylint: disable=attribute-defined-outside-init +# pylint: disable=attribute-defined-outside-init, disable=E1101 @dataclass class ModelBuilder(Triton, DJL, JumpStart, TGI, Transformers): """Class that builds a deployable model. @@ -719,28 +719,6 @@ def _schema_builder_init(self, model_task: str): except ValueError: raise TaskNotFoundException(f"Schema builder for {model_task} could not be found.") - def _total_inference_model_size_mib(self): - """Calculates the model size from HF accelerate - - This function gets the model size from accelerate. It also adds a - padding and converts to size MiB. When performing inference, expect - to add up to an additional 20% to the given model size as found by EleutherAI. - """ - dtypes = self.env_vars.get("dtypes", "float32") - parser = estimate_command_parser() - args = parser.parse_args([self.model, "--dtypes", dtypes]) - - output = gather_data( - args - ) # "dtype", "Largest Layer", "Total Size Bytes", "Training using Adam" - - if output is None: - raise ValueError(f"Could not get Model size for {self.model}") - - total_memory_size_mib = MEMORY_BUFFER_MULTIPLIER * output[0][2] * MIB_CONVERSION_FACTOR - logger.info("Total memory size MIB: %s", total_memory_size_mib) - return total_memory_size_mib - def _can_fit_on_single_gpu(self) -> Type[bool]: """Check if model can fit on a single GPU @@ -748,10 +726,15 @@ def _can_fit_on_single_gpu(self) -> Type[bool]: """ try: single_gpu_size_mib = self._try_fetch_gpu_info() - if self._total_inference_model_size_mib() <= single_gpu_size_mib: + if ( + _total_inference_model_size_mib(self.model, self.env_vars.get("dtypes", "float32")) + <= single_gpu_size_mib + ): logger.info( "Total inference model size MIB %s, single GPU size for instance MIB %s", - self._total_inference_model_size_mib(), + _total_inference_model_size_mib( + self.model, self.env_vars.get("dtypes", "float32") + ), single_gpu_size_mib, ) return True diff --git a/src/sagemaker/serve/builder/schema_builder.py b/src/sagemaker/serve/builder/schema_builder.py index 24900a5dc8..d0f65716d8 100644 --- a/src/sagemaker/serve/builder/schema_builder.py +++ b/src/sagemaker/serve/builder/schema_builder.py @@ -208,12 +208,18 @@ def _get_inverse(self, obj): def __repr__(self): """Placeholder docstring""" + if hasattr(self, "input_serializer") and hasattr(self, "output_serializer"): + return ( + f"SchemaBuilder(\n" + f"input_serializer={self.input_serializer}\n" + f"output_serializer={self.output_serializer}\n" + f"input_deserializer={self.input_deserializer._deserializer}\n" + f"output_deserializer={self.output_deserializer._deserializer})" + ) return ( f"SchemaBuilder(\n" - f"input_serializer={self.input_serializer}\n" - f"output_serializer={self.output_serializer}\n" - f"input_deserializer={self.input_deserializer._deserializer}\n" - f"output_deserializer={self.output_deserializer._deserializer})" + f"custom_input_translator={self.custom_input_translator}\n" + f"custom_output_translator={self.custom_output_translator}\n" ) def generate_marshalling_map(self) -> dict: diff --git a/src/sagemaker/serve/utils/hardware_detector.py b/src/sagemaker/serve/utils/hardware_detector.py index 632149dc8f..b642cc5854 100644 --- a/src/sagemaker/serve/utils/hardware_detector.py +++ b/src/sagemaker/serve/utils/hardware_detector.py @@ -18,12 +18,18 @@ from botocore.exceptions import ClientError +from accelerate.commands.estimate import estimate_command_parser, gather_data from sagemaker import Session +from sagemaker.model import Model from sagemaker import instance_types_gpu_info logger = logging.getLogger(__name__) +MIB_CONVERSION_FACTOR = 0.00000095367431640625 +MEMORY_BUFFER_MULTIPLIER = 1.2 # 20% buffer + + def _get_gpu_info(instance_type: str, session: Session) -> Tuple[int, int]: """Get GPU info for the provided instance @@ -108,3 +114,24 @@ def _format_instance_type(instance_type: str) -> str: ec2_instance = ".".join(split_instance) return ec2_instance + + +def _total_inference_model_size_mib(model: Model, dtype: str) -> int: + """Calculates the model size from HF accelerate + + This function gets the model size from accelerate. It also adds a + padding and converts to size MiB. When performing inference, expect + to add up to an additional 20% to the given model size as found by EleutherAI. + """ + args = estimate_command_parser().parse_args([model, "--dtypes", dtype]) + + output = gather_data( + args + ) # "dtype", "Largest Layer", "Total Size Bytes", "Training using Adam" + + if output is None: + raise ValueError(f"Could not get Model size for {model}") + + total_memory_size_mib = MEMORY_BUFFER_MULTIPLIER * output[0][2] * MIB_CONVERSION_FACTOR + logger.info("Total memory size MIB: %s", total_memory_size_mib) + return total_memory_size_mib diff --git a/tests/integ/sagemaker/serve/test_serve_pt_happy.py b/tests/integ/sagemaker/serve/test_serve_pt_happy.py index f533942037..5b386b992d 100644 --- a/tests/integ/sagemaker/serve/test_serve_pt_happy.py +++ b/tests/integ/sagemaker/serve/test_serve_pt_happy.py @@ -181,7 +181,6 @@ def model_builder(request): # ), f"{caught_ex} was thrown when running pytorch squeezenet local container test" -@pytest.mark.skip(reason="Failing test. Fix is pending.") @pytest.mark.skipif( PYTHON_VERSION_IS_NOT_310, # or NOT_RUNNING_ON_INF_EXP_DEV_PIPELINE, reason="The goal of these test are to test the serving components of our feature", @@ -222,8 +221,10 @@ def test_happy_pytorch_sagemaker_endpoint( ) if caught_ex: logger.exception(caught_ex) + ignore_if_worker_dies = "Worker died." in str(caught_ex) + # https://github.com/pytorch/serve/issues/3032 assert ( - False + ignore_if_worker_dies ), f"{caught_ex} was thrown when running pytorch squeezenet sagemaker endpoint test" diff --git a/tests/unit/sagemaker/serve/builder/test_model_builder.py b/tests/unit/sagemaker/serve/builder/test_model_builder.py index 3b60d13dfb..3ed0e0c5cd 100644 --- a/tests/unit/sagemaker/serve/builder/test_model_builder.py +++ b/tests/unit/sagemaker/serve/builder/test_model_builder.py @@ -53,9 +53,6 @@ ModelServer.DJL_SERVING, } -MIB_CONVERSION_FACTOR = 0.00000095367431640625 -MEMORY_BUFFER_MULTIPLIER = 1.2 # 20% buffer - mock_session = MagicMock() @@ -1205,7 +1202,7 @@ def test_build_for_transformers_happy_case( @patch("sagemaker.serve.builder.model_builder.ModelBuilder._build_for_transformers") @patch("sagemaker.serve.builder.model_builder.ModelBuilder._try_fetch_gpu_info") - @patch("sagemaker.serve.builder.model_builder.ModelBuilder._total_inference_model_size_mib") + @patch("sagemaker.serve.builder.model_builder._total_inference_model_size_mib") @patch("sagemaker.image_uris.retrieve") @patch("sagemaker.djl_inference.model.urllib") @patch("sagemaker.djl_inference.model.json") @@ -1248,7 +1245,7 @@ def test_build_for_transformers_happy_case_with_values( @patch("sagemaker.serve.builder.model_builder.ModelBuilder._build_for_djl", Mock()) @patch("sagemaker.serve.builder.model_builder._get_gpu_info") - @patch("sagemaker.serve.builder.model_builder.ModelBuilder._total_inference_model_size_mib") + @patch("sagemaker.serve.builder.model_builder._total_inference_model_size_mib") @patch("sagemaker.image_uris.retrieve") @patch("sagemaker.djl_inference.model.urllib") @patch("sagemaker.djl_inference.model.json") @@ -1293,7 +1290,7 @@ def test_build_for_transformers_happy_case_with_valid_gpu_info( @patch("sagemaker.serve.builder.model_builder.ModelBuilder._build_for_transformers", Mock()) @patch("sagemaker.serve.builder.model_builder._get_gpu_info") @patch("sagemaker.serve.builder.model_builder._get_gpu_info_fallback") - @patch("sagemaker.serve.builder.model_builder.ModelBuilder._total_inference_model_size_mib") + @patch("sagemaker.serve.builder.model_builder._total_inference_model_size_mib") @patch("sagemaker.image_uris.retrieve") @patch("sagemaker.djl_inference.model.urllib") @patch("sagemaker.djl_inference.model.json") @@ -1342,61 +1339,6 @@ def test_build_for_transformers_happy_case_with_valid_gpu_fallback( ) self.assertEqual(model_builder._can_fit_on_single_gpu(), True) - @patch("sagemaker.serve.builder.model_builder.ModelBuilder._build_for_transformers", Mock()) - @patch("sagemaker.serve.builder.model_builder.estimate_command_parser") - @patch("sagemaker.serve.builder.model_builder.gather_data") - @patch("sagemaker.image_uris.retrieve") - @patch("sagemaker.djl_inference.model.urllib") - @patch("sagemaker.djl_inference.model.json") - @patch("sagemaker.huggingface.llm_utils.urllib") - @patch("sagemaker.huggingface.llm_utils.json") - @patch("sagemaker.model_uris.retrieve") - @patch("sagemaker.serve.builder.model_builder._ServeSettings") - def test_build_for_transformers_happy_case_hugging_face_responses( - self, - mock_serveSettings, - mock_model_uris_retrieve, - mock_llm_utils_json, - mock_llm_utils_urllib, - mock_model_json, - mock_model_urllib, - mock_image_uris_retrieve, - mock_gather_data, - mock_parser, - ): - mock_setting_object = mock_serveSettings.return_value - mock_setting_object.role_arn = mock_role_arn - mock_setting_object.s3_model_data_url = mock_s3_model_data_url - - mock_model_uris_retrieve.side_effect = KeyError - mock_llm_utils_json.load.return_value = {"pipeline_tag": "text-classification"} - mock_llm_utils_urllib.request.Request.side_effect = Mock() - - mock_model_json.load.return_value = {"some": "config"} - mock_model_urllib.request.Request.side_effect = Mock() - mock_image_uris_retrieve.return_value = "https://some-image-uri" - - mock_parser.return_value = Mock() - mock_gather_data.return_value = [[1, 1, 1, 1]] - product = MIB_CONVERSION_FACTOR * 1 * MEMORY_BUFFER_MULTIPLIER - - model_builder = ModelBuilder( - model="stable-diffusion", - sagemaker_session=mock_session, - instance_type=mock_instance_type, - ) - self.assertEqual(model_builder._total_inference_model_size_mib(), product) - - mock_parser.return_value = Mock() - mock_gather_data.return_value = None - model_builder = ModelBuilder( - model="stable-diffusion", - sagemaker_session=mock_session, - instance_type=mock_instance_type, - ) - with self.assertRaises(ValueError) as _: - model_builder._total_inference_model_size_mib() - @patch("sagemaker.serve.builder.model_builder.ModelBuilder._build_for_djl") @patch("sagemaker.serve.builder.model_builder.ModelBuilder._can_fit_on_single_gpu") @patch("sagemaker.image_uris.retrieve") @@ -1556,7 +1498,7 @@ def test_try_fetch_gpu_info_throws( self.assertEqual(model_builder._can_fit_on_single_gpu(), False) @patch("sagemaker.serve.builder.model_builder.ModelBuilder._build_for_transformers", Mock()) - @patch("sagemaker.serve.builder.model_builder.ModelBuilder._total_inference_model_size_mib") + @patch("sagemaker.serve.builder.model_builder._total_inference_model_size_mib") @patch("sagemaker.image_uris.retrieve") @patch("sagemaker.djl_inference.model.urllib") @patch("sagemaker.djl_inference.model.json") diff --git a/tests/unit/sagemaker/serve/utils/test_hardware_detector.py b/tests/unit/sagemaker/serve/utils/test_hardware_detector.py index 5ec493de72..8efbadfea4 100644 --- a/tests/unit/sagemaker/serve/utils/test_hardware_detector.py +++ b/tests/unit/sagemaker/serve/utils/test_hardware_detector.py @@ -13,6 +13,7 @@ from __future__ import absolute_import from botocore.exceptions import ClientError +from unittest.mock import patch, Mock import pytest from sagemaker.serve.utils import hardware_detector @@ -21,6 +22,8 @@ VALID_INSTANCE_TYPE = "ml.g5.48xlarge" INVALID_INSTANCE_TYPE = "fl.c5.57xxlarge" EXPECTED_INSTANCE_GPU_INFO = (8, 196608) +MIB_CONVERSION_FACTOR = 0.00000095367431640625 +MEMORY_BUFFER_MULTIPLIER = 1.2 # 20% buffer def test_get_gpu_info_success(sagemaker_session, boto_session): @@ -96,3 +99,24 @@ def test_format_instance_type_without_ml_success(): formatted_instance_type = hardware_detector._format_instance_type("g5.48xlarge") assert formatted_instance_type == "g5.48xlarge" + + +@patch("sagemaker.serve.utils.hardware_detector.estimate_command_parser") +@patch("sagemaker.serve.utils.hardware_detector.gather_data") +def test_total_inference_model_size_mib( + mock_gather_data, + mock_parser, +): + mock_parser.return_value = Mock() + mock_gather_data.return_value = [[1, 1, 1, 1]] + product = MIB_CONVERSION_FACTOR * 1 * MEMORY_BUFFER_MULTIPLIER + + assert ( + hardware_detector._total_inference_model_size_mib("stable-diffusion", "float32") == product + ) + + mock_parser.return_value = Mock() + mock_gather_data.return_value = None + + with pytest.raises(ValueError): + hardware_detector._total_inference_model_size_mib("stable-diffusion", "float32")