From 77d8cbd8d32d3d2d52947a80862401d23f095bd7 Mon Sep 17 00:00:00 2001 From: Seth Zegelstein Date: Thu, 5 Sep 2024 16:42:17 +0000 Subject: [PATCH] .ci/aws: Pin p4/p5 AMIs to AMIs from 8/7/24 In order to attempt to stabilize the aws-ofi-nccl plugin GH PR CI, the plan is to pin the AMIs to the 8/7 builds, from before we started running into a bunch of CUDA-version-related issues. When these are fixed, we will unpin the AMIs. Signed-off-by: Seth Zegelstein --- .ci/aws/Jenkinsfile | 20 ++++++++++++++------ 1 file changed, 14 insertions(+), 6 deletions(-) diff --git a/.ci/aws/Jenkinsfile b/.ci/aws/Jenkinsfile index 8a5bed33e..b0c1df9b6 100644 --- a/.ci/aws/Jenkinsfile +++ b/.ci/aws/Jenkinsfile @@ -219,6 +219,14 @@ pipeline { def g4dn_region = "us-west-2" def g4dn_odcr = "cr-0e2f9cac30bb5ad5f" def g4dn_addl_args = "${base_args} --odcr-placement-group-name g4dn-placement-group" + // Pin p4/p5 to AMI's from August 7th until we figure out why ImageBuilder is broken + // p4/p5 are in different regions which is why they need different AMI ID's + def p4_al2_ami = " --ami-id ami-0325055f791f59e7b" + def p4_ub2004_ami = " --ami-id ami-05feaa67734032ae8" + def p4_ub2204_ami = " --ami-id ami-06af5c08a83958af0" + def p5_al2_ami = " --ami-id ami-0c7d6c9eddde7c8cd" + def p5_ub2004_ami = " --ami-id ami-0945f264a4dc5bbb8" + def p5_ub2204_ami = " --ami-id ami-0c97f93421701b894" // p3dn tests stages["4_p3dn_al2"] = get_test_stage_with_lock("4_p3dn_al2", env.BUILD_TAG, "alinux2", "p3dn.24xlarge", p3dn_region, p3dn_lock_label, num_instances, p3dn_odcr, p3dn_al2_addl_args) @@ -226,14 +234,14 @@ pipeline { stages["4_p3dn_ubuntu2204"] = get_test_stage_with_lock("4_p3dn_ubuntu2204", env.BUILD_TAG, "ubuntu2204", "p3dn.24xlarge", p3dn_region, p3dn_lock_label, num_instances, p3dn_odcr, p3dn_addl_args) // p4d tests - stages["4_p4d_alinux2"] = get_test_stage_with_lock("4_p4d_alinux2", env.BUILD_TAG, "alinux2", "p4d.24xlarge", p4d_region, p4d_lock_label, num_instances, p4d_odcr, p4_p5_addl_args) - 
stages["4_p4d_ubuntu2004"] = get_test_stage_with_lock("4_p4d_ubuntu2004", env.BUILD_TAG, "ubuntu2004", "p4d.24xlarge", p4d_region, p4d_lock_label, num_instances, p4d_odcr, p4_p5_addl_args) - stages["4_p4d_ubuntu2204"] = get_test_stage_with_lock("4_p4d_ubuntu2204", env.BUILD_TAG, "ubuntu2204", "p4d.24xlarge", p4d_region, p4d_lock_label, num_instances, p4d_odcr, p4_p5_addl_args) + stages["4_p4d_alinux2"] = get_test_stage_with_lock("4_p4d_alinux2", env.BUILD_TAG, "alinux2", "p4d.24xlarge", p4d_region, p4d_lock_label, num_instances, p4d_odcr, p4_p5_addl_args + p4_al2_ami) + stages["4_p4d_ubuntu2004"] = get_test_stage_with_lock("4_p4d_ubuntu2004", env.BUILD_TAG, "ubuntu2004", "p4d.24xlarge", p4d_region, p4d_lock_label, num_instances, p4d_odcr, p4_p5_addl_args + p4_ub2004_ami) + stages["4_p4d_ubuntu2204"] = get_test_stage_with_lock("4_p4d_ubuntu2204", env.BUILD_TAG, "ubuntu2204", "p4d.24xlarge", p4d_region, p4d_lock_label, num_instances, p4d_odcr, p4_p5_addl_args + p4_ub2204_ami) // p5 tests - stages["4_p5_alinux2"] = get_test_stage_with_lock("4_p5_alinux2", env.BUILD_TAG, "alinux2", "p5.48xlarge", p5_region, p5_lock_label, num_instances, p5_odcr, p4_p5_addl_args) - stages["4_p5_ubuntu2004"] = get_test_stage_with_lock("4_p5_ubuntu2004", env.BUILD_TAG, "ubuntu2004", "p5.48xlarge", p5_region, p5_lock_label, num_instances, p5_odcr, p4_p5_addl_args) - stages["4_p5_ubuntu2204"] = get_test_stage_with_lock("4_p5_ubuntu2204", env.BUILD_TAG, "ubuntu2204", "p5.48xlarge", p5_region, p5_lock_label, num_instances, p5_odcr, p4_p5_addl_args) + stages["4_p5_alinux2"] = get_test_stage_with_lock("4_p5_alinux2", env.BUILD_TAG, "alinux2", "p5.48xlarge", p5_region, p5_lock_label, num_instances, p5_odcr, p4_p5_addl_args + p5_al2_ami) + stages["4_p5_ubuntu2004"] = get_test_stage_with_lock("4_p5_ubuntu2004", env.BUILD_TAG, "ubuntu2004", "p5.48xlarge", p5_region, p5_lock_label, num_instances, p5_odcr, p4_p5_addl_args + p5_ub2004_ami) + stages["4_p5_ubuntu2204"] = 
get_test_stage_with_lock("4_p5_ubuntu2204", env.BUILD_TAG, "ubuntu2204", "p5.48xlarge", p5_region, p5_lock_label, num_instances, p5_odcr, p4_p5_addl_args + p5_ub2204_ami) // g4dn tests stages["4_g4dn_ubuntu2204"] = get_test_stage_with_lock("4_g4dn_ubuntu2204", env.BUILD_TAG, "ubuntu2204", "g4dn.12xlarge", g4dn_region, g4dn_lock_label, num_instances, g4dn_odcr, g4dn_addl_args)