From e0abaadee27a73dc6ce0e4b5200fb1210e211809 Mon Sep 17 00:00:00 2001 From: Lucas Duarte <30901918+lusoal@users.noreply.github.com> Date: Wed, 31 Jan 2024 18:45:26 -0600 Subject: [PATCH] refactor: Trainium inferentia karpenter provisioners update (#420) Co-authored-by: Sanjeev Ganjihal Co-authored-by: Vara Bonthu --- ai-ml/trainium-inferentia/addons.tf | 492 ++++++++++++------ ai-ml/trainium-inferentia/eks.tf | 2 +- .../examples/llama2/2-launch-cmd-shell-pod.sh | 2 +- .../llama2/3-llama2-neuronx-mpi-compile.sh | 2 +- .../llama2/4-llama2-neuronx-mpi-train.sh | 2 +- .../examples/llama2/5-deploy-tensorboard.sh | 2 +- .../llama2/docker/Dockerfile.tensorboard | 2 +- .../llama2/install-pre-requsites-for-ec2.sh | 2 - .../llama2-inf2/ray-service-llama2.yaml | 2 +- .../karpenter-resources/Chart.yaml | 5 + .../helm-values/karpenter-resources/README.md | 71 +++ .../templates/node-class.yaml | 56 ++ .../templates/node-pool.yaml | 36 ++ .../karpenter-resources/values.yaml | 40 ++ .../karpenter-default.yaml | 54 -- .../karpenter-inf2.yaml | 58 --- .../karpenter-trn1.yaml | 53 -- ai-ml/trainium-inferentia/main.tf | 59 +-- ai-ml/trainium-inferentia/outputs.tf | 2 +- ai-ml/trainium-inferentia/variables.tf | 33 +- .../terraform/managed-airflow-mwaa/eks.tf | 10 +- website/docs/blueprints/ai-ml/trainium.md | 10 +- website/docs/gen-ai/training/Llama2.md | 12 +- 23 files changed, 578 insertions(+), 429 deletions(-) create mode 100644 ai-ml/trainium-inferentia/helm-values/karpenter-resources/Chart.yaml create mode 100644 ai-ml/trainium-inferentia/helm-values/karpenter-resources/README.md create mode 100644 ai-ml/trainium-inferentia/helm-values/karpenter-resources/templates/node-class.yaml create mode 100644 ai-ml/trainium-inferentia/helm-values/karpenter-resources/templates/node-pool.yaml create mode 100644 ai-ml/trainium-inferentia/helm-values/karpenter-resources/values.yaml delete mode 100644 ai-ml/trainium-inferentia/karpenter-provisioners/karpenter-default.yaml delete mode 100644 ai-ml/trainium-inferentia/karpenter-provisioners/karpenter-inf2.yaml delete mode 100644 ai-ml/trainium-inferentia/karpenter-provisioners/karpenter-trn1.yaml diff --git a/ai-ml/trainium-inferentia/addons.tf b/ai-ml/trainium-inferentia/addons.tf index 464439552..104d362f4 100644 --- a/ai-ml/trainium-inferentia/addons.tf +++ b/ai-ml/trainium-inferentia/addons.tf @@ -238,6 +238,91 @@ module "eks_blueprints_addons" { } tags = local.tags + + # We are installing Karpenter resources with Helm Chart, see helm-values/ + helm_releases = { + karpenter-resources-default = { + name = "default" + description = "A Helm chart for default node pool" + chart = "${path.module}/helm-values/karpenter-resources" + values = [ + <<-EOT + clusterName: ${module.eks.cluster_name} + ec2NodeClass: + karpenterRole: ${split("/", module.eks_blueprints_addons.karpenter.node_iam_role_arn)[1]} + subnetSelectorTerms: + id: ${module.vpc.private_subnets[3]} + securityGroupSelectorTerms: + tags: + Name: ${module.eks.cluster_name}-node + nodePool: + labels: + - provisioner: default + - workload: rayhead + requirements: + - key: "karpenter.k8s.aws/instance-family" + operator: In + values: ["c5", "m5", "r5"] + - key: "karpenter.k8s.aws/instance-size" + operator: In + values: ["xlarge", "2xlarge", "4xlarge", "8xlarge", "16xlarge", "24xlarge"] + - key: "kubernetes.io/arch" + operator: In + values: ["amd64"] + - key: "karpenter.sh/capacity-type" + operator: In + values: ["spot", "on-demand"] + EOT + ] + } + karpenter-resources-inferentia = { + name = "inferentia-inf2" + 
description = "A Helm chart for karpenter inferentia-inf2" + chart = "${path.module}/helm-values/karpenter-resources" + values = [ + <<-EOT + name: inferentia-inf2 + clusterName: ${module.eks.cluster_name} + ec2NodeClass: + karpenterRole: ${split("/", module.eks_blueprints_addons.karpenter.node_iam_role_arn)[1]} + subnetSelectorTerms: + id: ${module.vpc.private_subnets[3]} + securityGroupSelectorTerms: + tags: + Name: ${module.eks.cluster_name}-node + nodePool: + labels: + - provisioner: inferentia-inf2 + - hub.jupyter.org/node-purpose: user + taints: + - key: aws.amazon.com/neuroncore + value: "true" + effect: "NoSchedule" + - key: aws.amazon.com/neuron + value: "true" + effect: "NoSchedule" + - key: hub.jupyter.org/dedicated # According to optimization docs https://z2jh.jupyter.org/en/latest/administrator/optimization.html + operator: "Equal" + value: "user" + effect: "NoSchedule" + requirements: + - key: "karpenter.k8s.aws/instance-family" + operator: In + values: ["inf2"] + - key: "karpenter.k8s.aws/instance-size" + operator: In + values: ["24xlarge", "48xlarge"] + - key: "kubernetes.io/arch" + operator: In + values: ["amd64"] + - key: "karpenter.sh/capacity-type" + operator: In + values: ["spot", "on-demand"] + EOT + ] + } + } + } #--------------------------------------------------------------- @@ -245,7 +330,7 @@ module "eks_blueprints_addons" { #--------------------------------------------------------------- module "eks_data_addons" { source = "aws-ia/eks-data-addons/aws" - version = "~> 1.2" # ensure to update this to the latest/desired version + version = "~> 1.2.9" # ensure to update this to the latest/desired version oidc_provider_arn = module.eks.oidc_provider_arn @@ -279,6 +364,83 @@ module "eks_data_addons" { }) ] } + enable_karpenter_resources = true + karpenter_resources_helm_config = { + inferentia-inf2 = { + values = [ + <<-EOT + name: inferentia-inf2 + clusterName: ${module.eks.cluster_name} + ec2NodeClass: + karpenterRole: ${split("/", module.eks_blueprints_addons.karpenter.node_iam_role_arn)[1]} + subnetSelectorTerms: + id: ${module.vpc.private_subnets[3]} + securityGroupSelectorTerms: + tags: + Name: ${module.eks.cluster_name}-node + nodePool: + labels: + - provisioner: inferentia-inf2 + - hub.jupyter.org/node-purpose: user + taints: + - key: aws.amazon.com/neuroncore + value: "true" + effect: "NoSchedule" + - key: aws.amazon.com/neuron + value: "true" + effect: "NoSchedule" + - key: hub.jupyter.org/dedicated # According to optimization docs https://z2jh.jupyter.org/en/latest/administrator/optimization.html + operator: "Equal" + value: "user" + effect: "NoSchedule" + requirements: + - key: "karpenter.k8s.aws/instance-family" + operator: In + values: ["inf2"] + - key: "karpenter.k8s.aws/instance-size" + operator: In + values: ["24xlarge", "48xlarge"] + - key: "kubernetes.io/arch" + operator: In + values: ["amd64"] + - key: "karpenter.sh/capacity-type" + operator: In + values: ["spot", "on-demand"] + EOT + ] + } + default = { + values = [ + <<-EOT + clusterName: ${module.eks.cluster_name} + ec2NodeClass: + karpenterRole: ${split("/", module.eks_blueprints_addons.karpenter.node_iam_role_arn)[1]} + subnetSelectorTerms: + id: ${module.vpc.private_subnets[3]} + securityGroupSelectorTerms: + tags: + Name: ${module.eks.cluster_name}-node + nodePool: + labels: + - provisioner: default + - workload: rayhead + requirements: + - key: "karpenter.k8s.aws/instance-family" + operator: In + values: ["c5", "m5", "r5"] + - key: "karpenter.k8s.aws/instance-size" + operator: In + values: 
["xlarge", "2xlarge", "4xlarge", "8xlarge", "16xlarge", "24xlarge"] + - key: "kubernetes.io/arch" + operator: In + values: ["amd64"] + - key: "karpenter.sh/capacity-type" + operator: In + values: ["spot", "on-demand"] + EOT + ] + } + } } #--------------------------------------------------------------- @@ -324,29 +486,6 @@ resource "aws_secretsmanager_secret_version" "grafana" { secret_string = random_password.grafana.result } -locals { - karpenter_trn1_32xl_lt_name = format("%s-trn132xl-lt", local.name) -} - -#--------------------------------------- -# Karpenter Provisioners -#--------------------------------------- -data "kubectl_path_documents" "karpenter_provisioners" { - pattern = "${path.module}/karpenter-provisioners/karpenter-*.yaml" - vars = { - azs = local.region - eks_cluster_id = local.name - launch_template_name = local.karpenter_trn1_32xl_lt_name - } -} - -resource "kubectl_manifest" "karpenter_provisioner" { - for_each = toset(data.kubectl_path_documents.karpenter_provisioners.documents) - yaml_body = each.value - - depends_on = [module.eks_blueprints_addons] -} - #tfsec:ignore:* module "s3_bucket" { source = "terraform-aws-modules/s3-bucket/aws" @@ -359,147 +498,6 @@ module "s3_bucket" { tags = local.tags } -#--------------------------------------------------------------- -# Create a Launch Template Userdata for Trainium -# Note: As of version v0.29.0, the Karpenter AWSNodeTemplate lacks the ability to configure multipleNetwork interfaces for EFA. -# To work around this limitation, we are utilizing Terraform to generate launch templates that include EFA configurations. -# These launch templates are then used as input for the AWS Node template, enabling us to achieve the desired network interface setups. -#--------------------------------------------------------------- -data "cloudinit_config" "trn1_lt" { - base64_encode = true - gzip = false - boundary = "//" - - # Prepend to existing user data supplied by AWS EKS - part { - content_type = "text/x-shellscript" - content = <<-EOT - cat <<-EOF > /etc/profile.d/bootstrap.sh - #!/bin/sh - - # Configure NVMe volumes in RAID0 configuration - # https://github.com/awslabs/amazon-eks-ami/blob/056e31f8c7477e893424abce468cb32bbcd1f079/files/bootstrap.sh#L35C121-L35C126 - # Mount will be: /mnt/k8s-disks - export LOCAL_DISKS='raid0' - - # Install Neuron monitoring tools - yum install aws-neuronx-tools-2.* -y - export PATH=/opt/aws/neuron/bin:$PATH - - # EFA Setup for Trainium and Inferentia - export FI_EFA_USE_DEVICE_RDMA=1 - export FI_PROVIDER=efa - export FI_EFA_FORK_SAFE=1 - - curl -O https://efa-installer.amazonaws.com/aws-efa-installer-latest.tar.gz - tar -xf aws-efa-installer-latest.tar.gz && cd aws-efa-installer - ./efa_installer.sh -y -g - /opt/amazon/efa/bin/fi_info -p efa - EOF - - # Source extra environment variables in bootstrap script - sed -i '/^set -o errexit/a\\nsource /etc/profile.d/bootstrap.sh' /etc/eks/bootstrap.sh - - # Bootstrap the node - B64_CLUSTER_CA=${module.eks.cluster_certificate_authority_data} - API_SERVER_URL=${module.eks.cluster_endpoint} - /etc/eks/bootstrap.sh ${local.name} --kubelet-extra-args "--node-labels=eks.amazonaws.com/nodegroup-image=${data.aws_ami.eks_gpu.id}" --b64-cluster-ca $B64_CLUSTER_CA --apiserver-endpoint $API_SERVER_URL - - EOT - } -} - -#--------------------------------------------------------------- -# This Terraform code defines a data block to fetch the most recent Amazon Machine Image (AMI) -# for an Amazon Elastic Kubernetes Service (EKS) cluster with GPU support. 
-#--------------------------------------------------------------- -data "aws_ami" "eks_gpu" { - owners = ["amazon"] - most_recent = true - - filter { - name = "name" - values = ["amazon-eks-gpu-node-${var.eks_cluster_version}-*"] - } -} - -#--------------------------------------------------------------- -# AWS Launch Template Configuration for Karpenter Trn1.32xlarge Instances -#--------------------------------------------------------------- -resource "aws_launch_template" "trn1_lt" { - name = local.karpenter_trn1_32xl_lt_name - description = "Karpenter Trn1.32xlarge Launch Template" - - user_data = data.cloudinit_config.trn1_lt.rendered - - ebs_optimized = true - - image_id = data.aws_ami.eks_gpu.id - - iam_instance_profile { - name = module.eks_blueprints_addons.karpenter.node_instance_profile_name - } - - # Commented for visibility to implement this feature in the future - # placement { - # tenancy = "default" - # availability_zone = "${local.region}d" - # group_name = local.karpenter_trn1_32xl_lt_name - # } - - metadata_options { - http_endpoint = "enabled" - http_tokens = "required" - http_put_response_hop_limit = 2 - } - - block_device_mappings { - device_name = "/dev/xvda" - ebs { - volume_size = 100 - delete_on_termination = true - volume_type = "gp3" - } - } - - monitoring { - enabled = true - } - - tag_specifications { - resource_type = "instance" - - tags = merge(local.tags, { - "karpenter.sh/discovery" = local.name - }) - } - - # First network interface with device_index=0 and network_card_index=0 - network_interfaces { - device_index = 0 - network_card_index = 0 - associate_public_ip_address = false - interface_type = "efa" - delete_on_termination = true - security_groups = [module.eks.node_security_group_id] - description = "Karpenter EFA config for Trainium" - } - - # Additional network interfaces with device_index=1 and network_card_index ranging from 1 to 7 - dynamic "network_interfaces" { - for_each = range(1, 8) # Create 7 additional network interfaces - content { - device_index = 1 - network_card_index = network_interfaces.value - associate_public_ip_address = false - interface_type = "efa" - delete_on_termination = true - security_groups = [module.eks.node_security_group_id] - description = "Karpenter EFA config for Trainium" - } - } -} - #--------------------------------------------------------------- # MPI Operator for distributed training on Trainium #--------------------------------------------------------------- @@ -516,3 +514,167 @@ resource "kubectl_manifest" "mpi_operator" { yaml_body = each.value depends_on = [module.eks.eks_cluster_id] } + +#--------------------------------------------------------------- +# Create a Launch Template Userdata for Trainium, and use it in Karpenter, deprecated +# This commented section of the pattern is commented due to lack of support in utilizing LaunchTemplates in newer Karpenter versions. 
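+# With the v1beta1 APIs, equivalent node customization can instead be supplied through the
+# EC2NodeClass userData field, which the chart under helm-values/karpenter-resources exposes
+# via the `ec2NodeClass.userData` value.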
+# See full change list https://github.com/aws/karpenter-provider-aws/blob/d1d1371ae2e1552b8fdded7d343bf24ea18bee31/designs/v1beta1-full-changelist.md#remove-speclaunchtemplate +#--------------------------------------------------------------- +# data "cloudinit_config" "trn1_lt" { +# base64_encode = true +# gzip = false +# boundary = "//" + +# # Prepend to existing user data supplied by AWS EKS +# part { +# content_type = "text/x-shellscript" +# content = <<-EOT +# cat <<-EOF > /etc/profile.d/bootstrap.sh +# #!/bin/sh + +# # Configure NVMe volumes in RAID0 configuration +# # https://github.com/awslabs/amazon-eks-ami/blob/056e31f8c7477e893424abce468cb32bbcd1f079/files/bootstrap.sh#L35C121-L35C126 +# # Mount will be: /mnt/k8s-disks +# export LOCAL_DISKS='raid0' + +# # Install Neuron monitoring tools +# yum install aws-neuronx-tools-2.* -y +# export PATH=/opt/aws/neuron/bin:$PATH + +# # EFA Setup for Trainium and Inferentia +# export FI_EFA_USE_DEVICE_RDMA=1 +# export FI_PROVIDER=efa +# export FI_EFA_FORK_SAFE=1 + +# curl -O https://efa-installer.amazonaws.com/aws-efa-installer-latest.tar.gz +# tar -xf aws-efa-installer-latest.tar.gz && cd aws-efa-installer +# ./efa_installer.sh -y -g +# /opt/amazon/efa/bin/fi_info -p efa +# EOF + +# # Source extra environment variables in bootstrap script +# sed -i '/^set -o errexit/a\\nsource /etc/profile.d/bootstrap.sh' /etc/eks/bootstrap.sh + +# # Bootstrap the node +# B64_CLUSTER_CA=${module.eks.cluster_certificate_authority_data} +# API_SERVER_URL=${module.eks.cluster_endpoint} +# /etc/eks/bootstrap.sh ${local.name} --kubelet-extra-args "--node-labels=eks.amazonaws.com/nodegroup-image=${data.aws_ami.eks_gpu.id}" --b64-cluster-ca $B64_CLUSTER_CA --apiserver-endpoint $API_SERVER_URL + +# EOT +# } +# } + +#--------------------------------------------------------------- +# This Terraform code defines a data block to fetch the most recent Amazon Machine Image (AMI) +# for an Amazon Elastic Kubernetes Service (EKS) cluster with GPU support. 
+#--------------------------------------------------------------- +# data "aws_ami" "eks_gpu" { +# owners = ["amazon"] +# most_recent = true + +# filter { +# name = "name" +# values = ["amazon-eks-gpu-node-${var.eks_cluster_version}-*"] +# } +# } + + +# locals { +# karpenter_trn1_32xl_lt_name = format("%s-trn132xl-lt", local.name) +# } + +#--------------------------------------------------------------- +# AWS Launch Template Configuration for Karpenter Trn1.32xlarge Instances +#--------------------------------------------------------------- +# resource "aws_launch_template" "trn1_lt" { +# name = local.karpenter_trn1_32xl_lt_name +# description = "Karpenter Trn1.32xlarge Launch Template" + +# user_data = data.cloudinit_config.trn1_lt.rendered + +# ebs_optimized = true + +# image_id = data.aws_ami.eks_gpu.id + +# iam_instance_profile { +# name = module.eks_blueprints_addons.karpenter.node_instance_profile_name +# } + +# # Commented for visibility to implement this feature in the future +# # placement { +# # tenancy = "default" +# # availability_zone = "${local.region}d" +# # group_name = local.karpenter_trn1_32xl_lt_name +# # } + +# metadata_options { +# http_endpoint = "enabled" +# http_tokens = "required" +# http_put_response_hop_limit = 2 +# } + +# block_device_mappings { +# device_name = "/dev/xvda" +# ebs { +# volume_size = 100 +# delete_on_termination = true +# volume_type = "gp3" +# } +# } + +# monitoring { +# enabled = true +# } + +# tag_specifications { +# resource_type = "instance" + +# tags = merge(local.tags, { +# "karpenter.sh/discovery" = local.name +# }) +# } + +# # First network interface with device_index=0 and network_card_index=0 +# network_interfaces { +# device_index = 0 +# network_card_index = 0 +# associate_public_ip_address = false +# interface_type = "efa" +# delete_on_termination = true +# security_groups = [module.eks.node_security_group_id] +# description = "Karpenter EFA config for Trainium" +# } + +# # Additional network interfaces with device_index=1 and network_card_index ranging from 1 to 7 +# dynamic "network_interfaces" { +# for_each = range(1, 8) # Create 7 additional network interfaces +# content { +# device_index = 1 +# network_card_index = network_interfaces.value +# associate_public_ip_address = false +# interface_type = "efa" +# delete_on_termination = true +# security_groups = [module.eks.node_security_group_id] +# description = "Karpenter EFA config for Trainium" +# } +# } +# } + +# #--------------------------------------- +# # Karpenter Provisioners +# #--------------------------------------- +# data "kubectl_path_documents" "karpenter_provisioners" { +# pattern = "${path.module}/karpenter-provisioners/karpenter-*.yaml" +# vars = { +# azs = local.region +# eks_cluster_id = local.name +# launch_template_name = local.karpenter_trn1_32xl_lt_name +# } +# } + +# resource "kubectl_manifest" "karpenter_provisioner" { +# for_each = toset(data.kubectl_path_documents.karpenter_provisioners.documents) +# yaml_body = each.value + +# depends_on = [module.eks_blueprints_addons] +# } diff --git a/ai-ml/trainium-inferentia/eks.tf b/ai-ml/trainium-inferentia/eks.tf index 5175673d0..621ca8fc8 100644 --- a/ai-ml/trainium-inferentia/eks.tf +++ b/ai-ml/trainium-inferentia/eks.tf @@ -15,7 +15,7 @@ module "eks" { # Filtering only Secondary CIDR private subnets starting with "100.". 
Subnet IDs where the EKS Control Plane ENIs will be created subnet_ids = compact([for subnet_id, cidr_block in zipmap(module.vpc.private_subnets, module.vpc.private_subnets_cidr_blocks) : substr(cidr_block, 0, 4) == "100." ? subnet_id : null]) - + manage_aws_auth_configmap = true aws_auth_roles = [ diff --git a/ai-ml/trainium-inferentia/examples/llama2/2-launch-cmd-shell-pod.sh b/ai-ml/trainium-inferentia/examples/llama2/2-launch-cmd-shell-pod.sh index 75cabf752..89bdd3181 100755 --- a/ai-ml/trainium-inferentia/examples/llama2/2-launch-cmd-shell-pod.sh +++ b/ai-ml/trainium-inferentia/examples/llama2/2-launch-cmd-shell-pod.sh @@ -18,7 +18,7 @@ fi ECR_REPO_URI=$(cat .ecr_repo_uri) echo -e "Using container image $ECR_REPO_URI:latest" -# Launch the cmd-shell pod using the container image created by 1-llama2-neuronx-pretrain-build-image.sh +# Launch the cmd-shell pod using the container image created by 1-llama2-neuronx-pretrain-build-image.sh kubectl apply -f - <' > temp-values.yaml + helm template my-release-name . --values temp-values.yaml + rm temp-values.yaml + ``` + + This will output the Kubernetes manifests to your terminal, allowing you to review them. diff --git a/ai-ml/trainium-inferentia/helm-values/karpenter-resources/templates/node-class.yaml b/ai-ml/trainium-inferentia/helm-values/karpenter-resources/templates/node-class.yaml new file mode 100644 index 000000000..604e7f8e8 --- /dev/null +++ b/ai-ml/trainium-inferentia/helm-values/karpenter-resources/templates/node-class.yaml @@ -0,0 +1,56 @@ +{{- if .Values.ec2NodeClass.enabled }} +apiVersion: karpenter.k8s.aws/v1beta1 +kind: EC2NodeClass +metadata: + name: {{ .Values.name }} +spec: + {{- if .Values.ec2NodeClass.amiFamily }} + amiFamily: {{ .Values.ec2NodeClass.amiFamily }} + {{- else if .Values.ec2NodeClass.amiSelectorTerms }} + amiSelectorTerms: + {{- toYaml .Values.ec2NodeClass.amiSelectorTerms | nindent 4 }} + {{- end }} + subnetSelectorTerms: + {{- if .Values.ec2NodeClass.subnetSelectorTerms.tags }} + - tags: + {{- range $key, $value := .Values.ec2NodeClass.subnetSelectorTerms.tags }} + {{ $key }}: {{ $value | quote }} + {{- end }} + {{- end }} + {{- if .Values.ec2NodeClass.subnetSelectorTerms.id }} + - id: {{ .Values.ec2NodeClass.subnetSelectorTerms.id }} + {{- end }} + securityGroupSelectorTerms: + {{- if .Values.ec2NodeClass.securityGroupSelectorTerms.name }} + - name: {{ .Values.ec2NodeClass.securityGroupSelectorTerms.name }} + {{- end }} + {{- if .Values.ec2NodeClass.securityGroupSelectorTerms.id }} + - id: {{ .Values.ec2NodeClass.securityGroupSelectorTerms.id }} + {{- end }} + {{- if .Values.ec2NodeClass.securityGroupSelectorTerms.tags }} + - tags: + {{- range $key, $value := .Values.ec2NodeClass.securityGroupSelectorTerms.tags }} + {{ $key }}: {{ $value | quote }} + {{- end }} + {{- end }} + role: {{ .Values.ec2NodeClass.karpenterRole }} + tags: + Name: karpenter-{{ .Values.name }} + metadataOptions: + httpEndpoint: {{ .Values.ec2NodeClass.metadataOptions.httpEndpoint }} + httpProtocolIPv6: {{ .Values.ec2NodeClass.metadataOptions.httpProtocolIPv6 }} + httpPutResponseHopLimit: {{ .Values.ec2NodeClass.metadataOptions.httpPutResponseHopLimit }} + httpTokens: {{ .Values.ec2NodeClass.metadataOptions.httpTokens }} + blockDeviceMappings: + - deviceName: {{ default "/dev/xvda" .Values.ec2NodeClass.blockDevice.deviceName }} + ebs: + volumeSize: {{ .Values.ec2NodeClass.blockDevice.volumeSize }} + volumeType: {{ .Values.ec2NodeClass.blockDevice.volumeType }} + encrypted: {{ .Values.ec2NodeClass.blockDevice.encrypted }} + 
deleteOnTermination: {{ .Values.ec2NodeClass.blockDevice.deleteOnTermination }} + detailedMonitoring: {{ .Values.ec2NodeClass.detailedMonitoring }} + {{- if .Values.ec2NodeClass.userData }} + userData: | + {{- .Values.ec2NodeClass.userData | nindent 4 }} + {{- end }} +{{- end }} diff --git a/ai-ml/trainium-inferentia/helm-values/karpenter-resources/templates/node-pool.yaml b/ai-ml/trainium-inferentia/helm-values/karpenter-resources/templates/node-pool.yaml new file mode 100644 index 000000000..0ac17988f --- /dev/null +++ b/ai-ml/trainium-inferentia/helm-values/karpenter-resources/templates/node-pool.yaml @@ -0,0 +1,36 @@ +{{- if .Values.nodePool.enabled }} +apiVersion: karpenter.sh/v1beta1 +kind: NodePool +metadata: + name: {{ .Values.name }} +spec: + template: + metadata: + labels: + NodePool: {{ .Values.name }} + NodeGroupType: {{ .Values.name }} + {{- with .Values.nodePool.labels }} + {{- range . }} + {{- toYaml . | nindent 8 }} + {{- end }} + {{- end }} + spec: + nodeClassRef: + name: {{ .Values.name }} + {{- with .Values.nodePool.taints }} + taints: + {{- toYaml . | nindent 8 }} + {{- end }} + {{- with .Values.nodePool.requirements }} + requirements: + {{- toYaml . | nindent 8 }} + {{- end }} + disruption: + consolidationPolicy: {{ .Values.nodePool.disruption.consolidationPolicy }} + consolidateAfter: {{ .Values.nodePool.disruption.consolidateAfter }} + expireAfter: {{ .Values.nodePool.disruption.expireAfter }} + limits: + cpu: {{ .Values.nodePool.limits.cpu }} + memory: {{ .Values.nodePool.limits.memory }} + weight: {{ .Values.nodePool.weight }} +{{- end }} diff --git a/ai-ml/trainium-inferentia/helm-values/karpenter-resources/values.yaml b/ai-ml/trainium-inferentia/helm-values/karpenter-resources/values.yaml new file mode 100644 index 000000000..456dce270 --- /dev/null +++ b/ai-ml/trainium-inferentia/helm-values/karpenter-resources/values.yaml @@ -0,0 +1,40 @@ +# Shared values +name: default +clusterName: test-cluster + +# EC2NodeClass specific values +ec2NodeClass: + enabled: true + amiFamily: AL2 + amiSelectorTerms: + subnetSelectorTerms: # tag or id see documentation, https://karpenter.sh/docs/concepts/nodeclasses/ + securityGroupSelectorTerms: # tag, name, id see documentation, https://karpenter.sh/docs/concepts/nodeclasses/ + karpenterRole: + metadataOptions: + httpEndpoint: enabled + httpProtocolIPv6: disabled + httpPutResponseHopLimit: 2 + httpTokens: required + blockDevice: + deviceName: /dev/xvda + volumeSize: 100Gi + volumeType: gp3 + encrypted: true + deleteOnTermination: true + detailedMonitoring: true + userData: + +# NodePool specific values +nodePool: + enabled: true + labels: + taints: + requirements: + disruption: + consolidationPolicy: WhenEmpty + consolidateAfter: 30s + expireAfter: 720h + limits: + cpu: "1000" + memory: 1000Gi + weight: 10 diff --git a/ai-ml/trainium-inferentia/karpenter-provisioners/karpenter-default.yaml b/ai-ml/trainium-inferentia/karpenter-provisioners/karpenter-default.yaml deleted file mode 100644 index 1ba2eb9fd..000000000 --- a/ai-ml/trainium-inferentia/karpenter-provisioners/karpenter-default.yaml +++ /dev/null @@ -1,54 +0,0 @@ ---- -apiVersion: karpenter.sh/v1alpha5 -kind: Provisioner -metadata: - name: default -spec: - providerRef: - name: default - requirements: - - key: "topology.kubernetes.io/zone" - operator: In - values: [${azs}d] - - key: "karpenter.k8s.aws/instance-category" - operator: In - values: ["c", "m", "r"] - - key: "karpenter.k8s.aws/instance-size" - operator: In - values: ["xlarge", "2xlarge", "4xlarge", "8xlarge", 
"16xlarge"] - - key: "kubernetes.io/arch" - operator: In - values: ["amd64"] - - key: "karpenter.sh/capacity-type" - operator: In - values: ["spot", "on-demand"] - labels: - provisioner: default - workload: rayhead - limits: - resources: - cpu: 1000 - memory: 20000Gi - ttlSecondsAfterEmpty: 120 # optional, but never scales down if not set - ---- -apiVersion: karpenter.k8s.aws/v1alpha1 -kind: AWSNodeTemplate -metadata: - name: default -spec: - amiFamily: AL2 - blockDeviceMappings: - - deviceName: /dev/xvda - ebs: - volumeSize: 100Gi - volumeType: gp3 - encrypted: true - subnetSelector: - Name: ${eks_cluster_id}-private* - securityGroupSelector: - Name: ${eks_cluster_id}-node* - tags: - managed-by: "karpenter" - intent: "apps" - Name: "karpenter-node-default" diff --git a/ai-ml/trainium-inferentia/karpenter-provisioners/karpenter-inf2.yaml b/ai-ml/trainium-inferentia/karpenter-provisioners/karpenter-inf2.yaml deleted file mode 100644 index 262e307d2..000000000 --- a/ai-ml/trainium-inferentia/karpenter-provisioners/karpenter-inf2.yaml +++ /dev/null @@ -1,58 +0,0 @@ ---- -apiVersion: karpenter.sh/v1alpha5 -kind: Provisioner -metadata: - name: inferentia-inf2 - namespace: karpenter -spec: - requirements: - - key: "topology.kubernetes.io/zone" - operator: In - values: [${azs}d] - - key: "karpenter.sh/capacity-type" - operator: In - values: ["spot", "on-demand"] - - key: "node.kubernetes.io/instance-type" - operator: In - values: ["inf2.24xlarge", "inf2.48xlarge"] - - key: "kubernetes.io/arch" - operator: In - values: ["amd64"] - providerRef: - name: inferentia-inf2 - labels: - provisioner: inferentia-inf2 - hub.jupyter.org/node-purpose: user - taints: - - key: aws.amazon.com/neuroncore - value: "true" - effect: "NoSchedule" - - key: aws.amazon.com/neuron - value: "true" - effect: "NoSchedule" - - key: hub.jupyter.org/dedicated # According to optimization docs https://z2jh.jupyter.org/en/latest/administrator/optimization.html - operator: "Equal" - value: "user" - effect: "NoSchedule" - ttlSecondsAfterEmpty: 300 # optional, but never scales down if not set - ---- -apiVersion: karpenter.k8s.aws/v1alpha1 -kind: AWSNodeTemplate -metadata: - name: inferentia-inf2 - namespace: karpenter -spec: - amiFamily: AL2 - blockDeviceMappings: - - deviceName: /dev/xvda - ebs: - volumeSize: 200Gi - volumeType: gp3 - encrypted: true - subnetSelector: - Name: ${eks_cluster_id}-private* - securityGroupSelector: - Name: ${eks_cluster_id}-node* - tags: - InstanceType: "inferentia" diff --git a/ai-ml/trainium-inferentia/karpenter-provisioners/karpenter-trn1.yaml b/ai-ml/trainium-inferentia/karpenter-provisioners/karpenter-trn1.yaml deleted file mode 100644 index 1297054f8..000000000 --- a/ai-ml/trainium-inferentia/karpenter-provisioners/karpenter-trn1.yaml +++ /dev/null @@ -1,53 +0,0 @@ ---- -apiVersion: karpenter.sh/v1alpha5 -kind: Provisioner -metadata: - name: trainium-trn1 - namespace: karpenter # Same namespace as Karpenter add-on installed -spec: - requirements: - - key: "topology.kubernetes.io/zone" - operator: In - values: [${azs}d] - - key: "karpenter.sh/capacity-type" - operator: In - values: ["spot", "on-demand"] - - key: "node.kubernetes.io/instance-type" - operator: In - values: ["trn1.32xlarge", "trn1n.32xlarge"] # trn1.2xlarge, trn1n.32xlarge - - key: "kubernetes.io/arch" - operator: In - values: ["amd64"] - providerRef: - name: trainium-trn1 - labels: - provisioner: trainium-trn1 - hub.jupyter.org/node-purpose: user - taints: - - key: aws.amazon.com/neuroncore - value: "true" - effect: "NoSchedule" - - 
key: aws.amazon.com/neuron - value: "true" - effect: "NoSchedule" - - key: hub.jupyter.org/dedicated # According to optimization docs https://z2jh.jupyter.org/en/latest/administrator/optimization.html - operator: "Equal" - value: "user" - effect: "NoSchedule" - ttlSecondsAfterEmpty: 300 # optional, but never scales down if not set - -# Note: As of version v0.29.0, the Karpenter AWSNodeTemplate lacks the ability to configure multipleNetwork interfaces for EFA. -# To work around this limitation, we are utilizing Terraform to generate launch templates that include EFA configurations. -# These launch templates are then used as input for the AWS Node template, enabling us to achieve the desired network interface setups. ---- -apiVersion: karpenter.k8s.aws/v1alpha1 -kind: AWSNodeTemplate -metadata: - name: trainium-trn1 - namespace: karpenter -spec: - subnetSelector: - Name: "${eks_cluster_id}-private*" - launchTemplate: "${launch_template_name}" - tags: - InstanceType: "trainium" diff --git a/ai-ml/trainium-inferentia/main.tf b/ai-ml/trainium-inferentia/main.tf index 56984ff0e..6d3fd5e26 100755 --- a/ai-ml/trainium-inferentia/main.tf +++ b/ai-ml/trainium-inferentia/main.tf @@ -2,8 +2,6 @@ provider "aws" { region = local.region } -# ECR always authenticates with `us-east-1` region -# Docs -> https://docs.aws.amazon.com/AmazonECR/latest/public/public-registries.html provider "aws" { alias = "ecr" region = "us-east-1" @@ -22,62 +20,7 @@ provider "helm" { token = data.aws_eks_cluster_auth.this.token } } -provider "kubectl" { - apply_retry_count = 30 - host = module.eks.cluster_endpoint - cluster_ca_certificate = base64decode(module.eks.cluster_certificate_authority_data) - token = data.aws_eks_cluster_auth.this.token - load_config_file = false -} - -data "aws_eks_cluster_auth" "this" { - name = module.eks.cluster_name -} - -data "aws_ecrpublic_authorization_token" "token" { - provider = aws.ecr -} - -locals { - name = var.name - region = var.region - # Trn1 and Inf2 instances are available in specific AZs in us-east-1, - # us-east-2, and us-west-2. For Trn1, the first AZ id (below) should be used. 
- az_mapping = { - "us-west-2" = ["usw2-az4", "usw2-az1"], - "us-east-1" = ["use1-az6", "use1-az5"], - "us-east-2" = ["use2-az3", "use2-az1"] - } - azs = local.az_mapping[var.region] - tags = { - Blueprint = local.name - GithubRepo = "github.com/awslabs/data-on-eks" - } -} -provider "aws" { - region = local.region -} -# ECR always authenticates with `us-east-1` region -# Docs -> https://docs.aws.amazon.com/AmazonECR/latest/public/public-registries.html -provider "aws" { - alias = "ecr" - region = "us-east-1" -} - -provider "kubernetes" { - host = module.eks.cluster_endpoint - cluster_ca_certificate = base64decode(module.eks.cluster_certificate_authority_data) - token = data.aws_eks_cluster_auth.this.token -} - -provider "helm" { - kubernetes { - host = module.eks.cluster_endpoint - cluster_ca_certificate = base64decode(module.eks.cluster_certificate_authority_data) - token = data.aws_eks_cluster_auth.this.token - } -} provider "kubectl" { apply_retry_count = 30 host = module.eks.cluster_endpoint @@ -109,4 +52,4 @@ locals { Blueprint = local.name GithubRepo = "github.com/awslabs/data-on-eks" } -} +} \ No newline at end of file diff --git a/ai-ml/trainium-inferentia/outputs.tf b/ai-ml/trainium-inferentia/outputs.tf index 35354df9e..40adfb3b3 100755 --- a/ai-ml/trainium-inferentia/outputs.tf +++ b/ai-ml/trainium-inferentia/outputs.tf @@ -1,4 +1,4 @@ output "configure_kubectl" { description = "Configure kubectl: make sure you're logged in with the correct AWS profile and run the following command to update your kubeconfig" value = "aws eks --region ${var.region} update-kubeconfig --name ${local.name}" -} \ No newline at end of file +} diff --git a/ai-ml/trainium-inferentia/variables.tf b/ai-ml/trainium-inferentia/variables.tf index 094ea931d..f4259359b 100755 --- a/ai-ml/trainium-inferentia/variables.tf +++ b/ai-ml/trainium-inferentia/variables.tf @@ -53,49 +53,48 @@ variable "enable_mpi_operator" { variable "trn1_32xl_min_size" { description = "trn1 Worker node minimum size" - type = number - default = 0 + type = number + default = 0 } variable "trn1_32xl_desired_size" { description = "trn1 Worker node desired size" - type = number - default = 0 + type = number + default = 0 } variable "trn1n_32xl_min_size" { description = "Worker node minimum size" - type = number - default = 0 + type = number + default = 0 } variable "trn1n_32xl_desired_size" { description = "Worker node desired size" - type = number - default = 0 + type = number + default = 0 } variable "inf2_24xl_min_size" { description = "Worker node minimum size" - type = number - default = 0 + type = number + default = 0 } variable "inf2_24xl_desired_size" { description = "Worker node desired size" - type = number - default = 0 + type = number + default = 0 } variable "inf2_48xl_min_size" { description = "Worker node minimum size" - type = number - default = 0 + type = number + default = 0 } variable "inf2_48xl_desired_size" { description = "Worker node desired size" - type = number - default = 0 + type = number + default = 0 } - diff --git a/schedulers/terraform/managed-airflow-mwaa/eks.tf b/schedulers/terraform/managed-airflow-mwaa/eks.tf index 7e537afa1..e107ab071 100644 --- a/schedulers/terraform/managed-airflow-mwaa/eks.tf +++ b/schedulers/terraform/managed-airflow-mwaa/eks.tf @@ -42,11 +42,11 @@ module "eks" { # MWAA needs access to the EKS control plane in order to submit a job allow_access_from_mwaa = { - description = "Access from MWAA" - protocol = "tcp" - from_port = 443 - to_port = 443 - type = "ingress" + description = 
"Access from MWAA" + protocol = "tcp" + from_port = 443 + to_port = 443 + type = "ingress" source_security_group_id = module.mwaa.mwaa_security_group_id } } diff --git a/website/docs/blueprints/ai-ml/trainium.md b/website/docs/blueprints/ai-ml/trainium.md index f845f138c..954c9f205 100644 --- a/website/docs/blueprints/ai-ml/trainium.md +++ b/website/docs/blueprints/ai-ml/trainium.md @@ -64,6 +64,10 @@ In this [example](https://github.com/awslabs/data-on-eks/tree/main/ai-ml/trainiu - Prepare the necessary etcd setup as a prerequisite for TorchX. - Create a test queue within Volcano to enable TorchX job submission to this specific queue. +:::info +**Important**: In this setup, Karpenter is utilized exclusively for `inferentia-inf2` instances, due to its current limitations in custom networking interfaces configuration. For Trainium instances, managed node groups and the Cluster Autoscaler are employed for scaling purposes. For users working with an older version of Karpenter (specifically, the `v1alpha5` APIs), please note that the configuration for Trainium with `LaunchTemplates` is still accessible. It can be found in the `data-on-eks/ai-ml/trainium-inferentia/addons.tf` file, although it is commented out at the file's end. +::: + ### Prerequisites Ensure that you have installed the following tools on your machine. @@ -83,7 +87,7 @@ git clone https://github.com/awslabs/data-on-eks.git Navigate into one of the example directories and run `install.sh` script ```bash -cd data-on-eks/ai-ml/trainium/ && chmod +x install.sh +cd data-on-eks/ai-ml/trainium-inferentia/ && chmod +x install.sh ./install.sh ``` @@ -92,12 +96,12 @@ cd data-on-eks/ai-ml/trainium/ && chmod +x install.sh Verify the Amazon EKS Cluster ```bash -aws eks describe-cluster --name trainium +aws eks describe-cluster --name trainium-inferentia ``` ```bash # Creates k8s config file to authenticate with EKS -aws eks --region us-west-2 update-kubeconfig --name trainium +aws eks --region us-west-2 update-kubeconfig --name trainium-inferentia kubectl get nodes # Output shows the EKS Managed Node group nodes diff --git a/website/docs/gen-ai/training/Llama2.md b/website/docs/gen-ai/training/Llama2.md index 8a3f2cef8..816667274 100644 --- a/website/docs/gen-ai/training/Llama2.md +++ b/website/docs/gen-ai/training/Llama2.md @@ -137,7 +137,7 @@ Navigate to examples/llama2 directory cd examples/llama2/ ``` -Run the `1-llama2-neuronx-pretrain-build-image.sh` script to build the neuronx-nemo-megatron container image and push the image into ECR. +Run the `1-llama2-neuronx-pretrain-build-image.sh` script to build the neuronx-nemo-megatron container image and push the image into ECR. When prompted for a region, enter the region in which you launched your EKS cluster, above. @@ -149,7 +149,7 @@ Note: The image building and pushing to ECR will take ~10 minutes ### Launch and connect to a CLI pod -In this step we need access to the shared FSx storage. To copy files to this storage, we’ll first launch and connect to a CLI pod running the neuronx-nemo-megatron docker image that you created above. +In this step we need access to the shared FSx storage. To copy files to this storage, we’ll first launch and connect to a CLI pod running the neuronx-nemo-megatron docker image that you created above. 
Run the following script to launch the CLI pod:

@@ -227,9 +227,9 @@ python3 neuronx-nemo-megatron/nemo/scripts/nlp_language_modeling/preprocess_data

 Note: When we later launch our training jobs in EKS, the training pods will run the training script from within the neuronx-nemo-megatron/nemo/examples directory on FSx. This is convenient, because it lets you modify your training script directly on FSx without rebuilding the neuronx-nemo-megatron container for every change.

-Modify the test_llama.sh script `/shared/neuronx-nemo-megatron/nemo/examples/nlp/language_modeling/test_llama.sh` to update the following two lines. These lines tell the training pod workers where to find the Llama tokenizer and the dataset on the FSx filesystem.
+Modify the test_llama.sh script `/shared/neuronx-nemo-megatron/nemo/examples/nlp/language_modeling/test_llama.sh` to update the following two lines. These lines tell the training pod workers where to find the Llama tokenizer and the dataset on the FSx filesystem.

-You can use any common text editor such as nano or vim to make these changes.
+You can use any common text editor such as nano or vim to make these changes.

 Run:

 ```bash
@@ -279,7 +279,7 @@ Run the pre-compilation script

 ./3-llama2-neuronx-mpi-compile.sh
 ```

-Pre-compilation will take ~10 minutes when using 4 trn1.32xlarge nodes.
+Pre-compilation will take ~10 minutes when using 4 trn1.32xlarge nodes.

 Periodically run `kubectl get pods | grep compile` and wait until you see that the compile job shows 'Completed'.

@@ -333,7 +333,7 @@ Run the following script to create a TensorBoard deployment so you can visualize

 ./5-deploy-tensorboard.sh
 ```

-Once the deployment is ready the script will output a password-protected URL for your new TensorBoard deployment.
+Once the deployment is ready, the script will output a password-protected URL for your new TensorBoard deployment.

 Launch the URL to view your training progress.
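+
+As a quick sanity check before opening the URL, you can confirm that the TensorBoard resources are up. A sketch (the `app=tensorboard` label selector is an assumption; adjust it to match the labels applied by `5-deploy-tensorboard.sh`):
+
+```bash
+# Confirm the TensorBoard pod is Running, then find the Ingress serving the URL
+kubectl get pods -l app=tensorboard
+kubectl get ingress
+```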