From 3ff859f315a4dd74b930056ae39ac2f9c4e0f7ef Mon Sep 17 00:00:00 2001
From: Partho Sarthi
Date: Mon, 15 Apr 2024 11:15:15 -0700
Subject: [PATCH 1/2] Fix invalid values in cluster creation script (#935)

* Fix invalid values in cluster creation script

Signed-off-by: Partho Sarthi

* Fix return type

Signed-off-by: Partho Sarthi

---------

Signed-off-by: Partho Sarthi
---
 .../spark_rapids_pytools/cloud_api/dataproc.py   |  3 +++
 .../src/spark_rapids_pytools/cloud_api/emr.py    | 17 +++++++++++++++++
 .../spark_rapids_pytools/cloud_api/sp_types.py   |  3 +++
 .../resources/dataproc-configs.json              |  3 ++-
 .../resources/emr-configs.json                   |  3 ++-
 .../templates/cluster_template/dataproc.ms       |  5 ++++-
 .../resources/templates/cluster_template/emr.ms  |  5 +++--
 .../dataproc-create_gpu_cluster_script.ms        |  2 +-
 8 files changed, 35 insertions(+), 6 deletions(-)

diff --git a/user_tools/src/spark_rapids_pytools/cloud_api/dataproc.py b/user_tools/src/spark_rapids_pytools/cloud_api/dataproc.py
index 06e71daf2..f908f7d5b 100644
--- a/user_tools/src/spark_rapids_pytools/cloud_api/dataproc.py
+++ b/user_tools/src/spark_rapids_pytools/cloud_api/dataproc.py
@@ -165,6 +165,9 @@ def get_matching_executor_instance(self, cores_per_executor):
     def generate_cluster_configuration(self, render_args: dict):
         executor_names = ','.join([f'"test-node-e{i}"' for i in range(render_args['NUM_EXECUTOR_NODES'])])
         render_args['EXECUTOR_NAMES'] = f'[{executor_names}]'
+        image_version = self.configs.get_value('clusterInference', 'defaultImage')
+        render_args['IMAGE'] = f'"{image_version}"'
+        render_args['ZONE'] = f'"{self.cli.get_zone()}"'
         return super().generate_cluster_configuration(render_args)
 
 
diff --git a/user_tools/src/spark_rapids_pytools/cloud_api/emr.py b/user_tools/src/spark_rapids_pytools/cloud_api/emr.py
index c3cacd489..2c82f82bc 100644
--- a/user_tools/src/spark_rapids_pytools/cloud_api/emr.py
+++ b/user_tools/src/spark_rapids_pytools/cloud_api/emr.py
@@ -117,6 +117,12 @@ def create_local_submission_job(self, job_prop, ctxt) -> Any:
     def _get_prediction_model_name(self) -> str:
         return CspEnv.pretty_print(CspEnv.get_default())
 
+    def generate_cluster_configuration(self, render_args: dict):
+        image_version = self.configs.get_value('clusterInference', 'defaultImage')
+        render_args['IMAGE'] = f'"{image_version}"'
+        render_args['ZONE'] = f'"{self.cli.get_zone()}"'
+        return super().generate_cluster_configuration(render_args)
+
 
 @dataclass
 class EMRCMDDriver(CMDDriverBase):
@@ -199,6 +205,17 @@ def _build_platform_describe_node_instance(self, node: ClusterNode) -> list:
                        '--instance-types', f'{node.instance_type}']
         return cmd_params
 
+    def get_zone(self) -> str:
+        describe_cmd = ['aws ec2 describe-availability-zones',
+                        '--region', f'{self.get_region()}']
+        selected_zone = ''
+        try:
+            zones_list = json.loads(self.run_sys_cmd(describe_cmd))
+            selected_zone = zones_list['AvailabilityZones'][0]['ZoneName']
+        except Exception:  # pylint: disable=broad-except
+            self.logger.warning('Unable to extract zone from region %s', self.get_region())
+        return selected_zone
+
     def _build_platform_list_cluster(self,
                                      cluster,
                                      query_args: dict = None) -> list:
diff --git a/user_tools/src/spark_rapids_pytools/cloud_api/sp_types.py b/user_tools/src/spark_rapids_pytools/cloud_api/sp_types.py
index 0e0e2ecfa..d88e0c356 100644
--- a/user_tools/src/spark_rapids_pytools/cloud_api/sp_types.py
+++ b/user_tools/src/spark_rapids_pytools/cloud_api/sp_types.py
@@ -299,6 +299,9 @@ def get_env_var(self, key: str):
     def get_region(self) -> str:
         return self.env_vars.get('region')
 
+    def get_zone(self) -> str:
+        return self.env_vars.get('zone')
+
     def get_cmd_run_configs(self) -> dict:
         return self.env_vars.get('cmdRunnerProperties')
 
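Reviewer note: the get_zone helper above picks the first availability zone that `aws ec2 describe-availability-zones` reports for the configured region, and degrades to an empty string on any failure. For readers without the driver classes at hand, here is a standalone sketch of that logic. It shells out via subprocess instead of the driver's run_sys_cmd helper, requires the AWS CLI and credentials to be configured, and first_availability_zone is an illustrative name, not part of the codebase.

import json
import subprocess


def first_availability_zone(region: str) -> str:
    # Same shape as EMRCMDDriver.get_zone: ask the AWS CLI for the region's
    # zones, take the first one, and fall back to '' on any failure.
    cmd = ['aws', 'ec2', 'describe-availability-zones', '--region', region]
    try:
        output = subprocess.run(cmd, capture_output=True, text=True, check=True).stdout
        return json.loads(output)['AvailabilityZones'][0]['ZoneName']
    except Exception:  # deliberately broad, mirroring the patch
        return ''


if __name__ == '__main__':
    print(first_availability_zone('us-west-2') or '<no zone resolved>')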
diff --git a/user_tools/src/spark_rapids_pytools/resources/dataproc-configs.json b/user_tools/src/spark_rapids_pytools/resources/dataproc-configs.json
index b569fb220..e5c299bff 100644
--- a/user_tools/src/spark_rapids_pytools/resources/dataproc-configs.json
+++ b/user_tools/src/spark_rapids_pytools/resources/dataproc-configs.json
@@ -267,7 +267,8 @@
       "executor": {
         "n1-standard": {"vCPUs": [1, 2, 4, 8, 16, 32, 64, 96]}
       }
-    }
+    },
+    "defaultImage": "2.1.41-debian11"
   },
   "clusterSpecs": {
     "minWorkerNodes": 2,
diff --git a/user_tools/src/spark_rapids_pytools/resources/emr-configs.json b/user_tools/src/spark_rapids_pytools/resources/emr-configs.json
index e703c4723..72e3387ee 100644
--- a/user_tools/src/spark_rapids_pytools/resources/emr-configs.json
+++ b/user_tools/src/spark_rapids_pytools/resources/emr-configs.json
@@ -391,7 +391,8 @@
         {"name": "m5d.12xlarge", "vCPUs": 48},
         {"name": "m5d.16xlarge", "vCPUs": 64}
       ]
-    }
+    },
+    "defaultImage": "emr-6.10.0"
   },
   "clusterSpecs": {
     "minWorkerNodes": 2
diff --git a/user_tools/src/spark_rapids_pytools/resources/templates/cluster_template/dataproc.ms b/user_tools/src/spark_rapids_pytools/resources/templates/cluster_template/dataproc.ms
index 77093242a..d4dc50215 100644
--- a/user_tools/src/spark_rapids_pytools/resources/templates/cluster_template/dataproc.ms
+++ b/user_tools/src/spark_rapids_pytools/resources/templates/cluster_template/dataproc.ms
@@ -3,7 +3,7 @@
   "clusterUuid": "1234-5678-1234567",
   "config": {
     "gceClusterConfig": {
-      "zoneUri": "us-central1-a"
+      "zoneUri": {{{ ZONE }}}
     },
     "masterConfig": {
       "instanceNames": [
@@ -16,6 +16,9 @@
       "instanceNames": {{{ EXECUTOR_NAMES }}},
       "machineTypeUri": {{{ EXECUTOR_INSTANCE }}},
       "numInstances": {{ NUM_EXECUTOR_NODES }}
+    },
+    "softwareConfig": {
+      "imageVersion": {{{ IMAGE }}}
     }
   },
   "status": {
diff --git a/user_tools/src/spark_rapids_pytools/resources/templates/cluster_template/emr.ms b/user_tools/src/spark_rapids_pytools/resources/templates/cluster_template/emr.ms
index 817691ddd..7a598c098 100644
--- a/user_tools/src/spark_rapids_pytools/resources/templates/cluster_template/emr.ms
+++ b/user_tools/src/spark_rapids_pytools/resources/templates/cluster_template/emr.ms
@@ -6,7 +6,7 @@
       "State": "TERMINATED"
     },
     "Ec2InstanceAttributes": {
-      "Ec2AvailabilityZone": "us-west-2a"
+      "Ec2AvailabilityZone": {{{ ZONE }}}
     },
     "InstanceGroups": [
       {
@@ -25,6 +25,7 @@
         "InstanceType": {{{ DRIVER_INSTANCE }}},
         "RequestedInstanceCount": {{ NUM_DRIVER_NODES }}
       }
-    ]
+    ],
+    "ReleaseLabel": {{{ IMAGE }}}
   }
 }
diff --git a/user_tools/src/spark_rapids_pytools/resources/templates/dataproc-create_gpu_cluster_script.ms b/user_tools/src/spark_rapids_pytools/resources/templates/dataproc-create_gpu_cluster_script.ms
index 114195dfb..114764fb6 100644
--- a/user_tools/src/spark_rapids_pytools/resources/templates/dataproc-create_gpu_cluster_script.ms
+++ b/user_tools/src/spark_rapids_pytools/resources/templates/dataproc-create_gpu_cluster_script.ms
@@ -5,7 +5,7 @@ export CLUSTER_NAME="{{ CLUSTER_NAME }}"
 gcloud dataproc clusters create $CLUSTER_NAME \
     --image-version={{ IMAGE }} \
     --region {{ REGION }} \
-    --zone {{ ZONE }}-a \
+    --zone {{ ZONE }} \
     --master-machine-type {{ MASTER_MACHINE }} \
     --num-workers {{ WORKERS_COUNT }} \
     --worker-machine-type {{ WORKERS_MACHINE }} \
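Reviewer note: the generate_cluster_configuration overrides wrap IMAGE and ZONE in literal double quotes because the cluster templates reference them as triple-mustache placeholders ({{{ ZONE }}}, {{{ IMAGE }}}), which interpolate the value verbatim; the quotes must already be part of the value for the rendered JSON to stay valid. A minimal sketch of that interplay, assuming the third-party chevron Mustache library rather than the tool's actual template machinery:

import json

import chevron  # pip install chevron

TEMPLATE = ('{ "gceClusterConfig": { "zoneUri": {{{ ZONE }}} }, '
            '"softwareConfig": { "imageVersion": {{{ IMAGE }}} } }')

# Values arrive pre-quoted, the way generate_cluster_configuration builds them.
render_args = {'ZONE': '"us-central1-a"', 'IMAGE': '"2.1.41-debian11"'}

rendered = chevron.render(TEMPLATE, render_args)
print(json.loads(rendered))  # parses only because the quotes were baked into the values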
From 1cb24c26c392bd96c757a038cdaeec724b9fafb5 Mon Sep 17 00:00:00 2001
From: Partho Sarthi
Date: Mon, 15 Apr 2024 15:56:46 -0700
Subject: [PATCH 2/2] Cluster inference should not run for unsupported platform (#941)

Signed-off-by: Partho Sarthi
---
 user_tools/src/spark_rapids_pytools/rapids/qualification.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/user_tools/src/spark_rapids_pytools/rapids/qualification.py b/user_tools/src/spark_rapids_pytools/rapids/qualification.py
index affef20bd..223a3bf33 100644
--- a/user_tools/src/spark_rapids_pytools/rapids/qualification.py
+++ b/user_tools/src/spark_rapids_pytools/rapids/qualification.py
@@ -902,7 +902,7 @@ def __infer_cluster_and_update_savings(self, cluster_info_df: pd.DataFrame):
         Update savings if CPU cluster can be inferred and corresponding GPU cluster can be defined.
         :param cluster_info_df: Parsed cluster information.
         """
-        if self.ctxt.get_ctxt('cpuClusterProxy') is not None:
+        if self.ctxt.get_ctxt('cpuClusterProxy') is not None or not self.ctxt.platform.cluster_inference_supported:
             return
         # Infer the CPU cluster from the cluster information
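Reviewer note: the new guard reads cluster_inference_supported off the platform object, so qualification quietly skips inference on platforms that do not implement it instead of failing mid-run. A minimal, self-contained sketch of that control flow follows; the class and function names are simplified stand-ins for the real Qualification context, not the tool's API.

from dataclasses import dataclass


@dataclass
class Platform:
    cluster_inference_supported: bool


def maybe_infer_cluster(cpu_cluster_proxy, platform: Platform):
    # Skip when the user already supplied a CPU cluster, or when the
    # platform cannot infer one (the condition patch 2 adds).
    if cpu_cluster_proxy is not None or not platform.cluster_inference_supported:
        return None
    return 'inferred-cpu-cluster'


print(maybe_infer_cluster(None, Platform(cluster_inference_supported=False)))  # None
print(maybe_infer_cluster(None, Platform(cluster_inference_supported=True)))   # inferred-cpu-cluster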