From e0abaadee27a73dc6ce0e4b5200fb1210e211809 Mon Sep 17 00:00:00 2001 From: Lucas Duarte <30901918+lusoal@users.noreply.github.com> Date: Wed, 31 Jan 2024 18:45:26 -0600 Subject: [PATCH] refactor: Trainium inferentia karpenter provisioners update (#420) Co-authored-by: Sanjeev Ganjihal Co-authored-by: Vara Bonthu --- ai-ml/trainium-inferentia/addons.tf | 492 ++++++++++++------ ai-ml/trainium-inferentia/eks.tf | 2 +- .../examples/llama2/2-launch-cmd-shell-pod.sh | 2 +- .../llama2/3-llama2-neuronx-mpi-compile.sh | 2 +- .../llama2/4-llama2-neuronx-mpi-train.sh | 2 +- .../examples/llama2/5-deploy-tensorboard.sh | 2 +- .../llama2/docker/Dockerfile.tensorboard | 2 +- .../llama2/install-pre-requsites-for-ec2.sh | 2 - .../llama2-inf2/ray-service-llama2.yaml | 2 +- .../karpenter-resources/Chart.yaml | 5 + .../helm-values/karpenter-resources/README.md | 71 +++ .../templates/node-class.yaml | 56 ++ .../templates/node-pool.yaml | 36 ++ .../karpenter-resources/values.yaml | 40 ++ .../karpenter-default.yaml | 54 -- .../karpenter-inf2.yaml | 58 --- .../karpenter-trn1.yaml | 53 -- ai-ml/trainium-inferentia/main.tf | 59 +-- ai-ml/trainium-inferentia/outputs.tf | 2 +- ai-ml/trainium-inferentia/variables.tf | 33 +- .../terraform/managed-airflow-mwaa/eks.tf | 10 +- website/docs/blueprints/ai-ml/trainium.md | 10 +- website/docs/gen-ai/training/Llama2.md | 12 +- 23 files changed, 578 insertions(+), 429 deletions(-) create mode 100644 ai-ml/trainium-inferentia/helm-values/karpenter-resources/Chart.yaml create mode 100644 ai-ml/trainium-inferentia/helm-values/karpenter-resources/README.md create mode 100644 ai-ml/trainium-inferentia/helm-values/karpenter-resources/templates/node-class.yaml create mode 100644 ai-ml/trainium-inferentia/helm-values/karpenter-resources/templates/node-pool.yaml create mode 100644 ai-ml/trainium-inferentia/helm-values/karpenter-resources/values.yaml delete mode 100644 ai-ml/trainium-inferentia/karpenter-provisioners/karpenter-default.yaml delete mode 100644 ai-ml/trainium-inferentia/karpenter-provisioners/karpenter-inf2.yaml delete mode 100644 ai-ml/trainium-inferentia/karpenter-provisioners/karpenter-trn1.yaml diff --git a/ai-ml/trainium-inferentia/addons.tf b/ai-ml/trainium-inferentia/addons.tf index 464439552..104d362f4 100644 --- a/ai-ml/trainium-inferentia/addons.tf +++ b/ai-ml/trainium-inferentia/addons.tf @@ -238,6 +238,91 @@ module "eks_blueprints_addons" { } tags = local.tags + + # We are installing Karpenter resources with Helm Chart, see helm-values/ + helm_releases = { + karpenter-resources-default = { + name = "default" + description = "A Helm chart for default node pool" + chart = "${path.module}/helm-values/karpenter-resources" + values = [ + <<-EOT + clusterName: ${module.eks.cluster_name} + ec2NodeClass: + karpenterRole: ${split("/", module.eks_blueprints_addons.karpenter.node_iam_role_arn)[1]} + subnetSelectorTerms: + id: ${module.vpc.private_subnets[3]} + securityGroupSelectorTerms: + tags: + Name: ${module.eks.cluster_name}-node + nodePool: + labels: + - provisioner: default + - workload: rayhead + requirements: + - key: "karpenter.k8s.aws/instance-family" + operator: In + values: ["c5", "m5", "r5"] + - key: "karpenter.k8s.aws/instance-size" + operator: In + values: ["xlarge", "2xlarge", "4xlarge", "8xlarge", "16xlarge", "24xlarge"] + - key: "kubernetes.io/arch" + operator: In + values: ["amd64"] + - key: "karpenter.sh/capacity-type" + operator: In + values: ["spot", "on-demand"] + EOT + ] + } + karpenter-resources-inferentia = { + name = "inferentia-inf2" + 
description = "A Helm chart for karpenter inferentia-inf2" + chart = "${path.module}/helm-values/karpenter-resources" + values = [ + <<-EOT + name: inferentia-inf2 + clusterName: ${module.eks.cluster_name} + ec2NodeClass: + karpenterRole: ${split("/", module.eks_blueprints_addons.karpenter.node_iam_role_arn)[1]} + subnetSelectorTerms: + id: ${module.vpc.private_subnets[3]} + securityGroupSelectorTerms: + tags: + Name: ${module.eks.cluster_name}-node + nodePool: + labels: + - provisioner: inferentia-inf2 + - hub.jupyter.org/node-purpose: user + taints: + - key: aws.amazon.com/neuroncore + value: "true" + effect: "NoSchedule" + - key: aws.amazon.com/neuron + value: "true" + effect: "NoSchedule" + - key: hub.jupyter.org/dedicated # According to optimization docs https://z2jh.jupyter.org/en/latest/administrator/optimization.html + operator: "Equal" + value: "user" + effect: "NoSchedule" + requirements: + - key: "karpenter.k8s.aws/instance-family" + operator: In + values: ["inf2"] + - key: "karpenter.k8s.aws/instance-size" + operator: In + values: ["24xlarge", "48xlarge"] + - key: "kubernetes.io/arch" + operator: In + values: ["amd64"] + - key: "karpenter.sh/capacity-type" + operator: In + values: ["spot", "on-demand"] + EOT + ] + } + } + } #--------------------------------------------------------------- @@ -245,7 +330,7 @@ module "eks_blueprints_addons" { #--------------------------------------------------------------- module "eks_data_addons" { source = "aws-ia/eks-data-addons/aws" - version = "~> 1.2" # ensure to update this to the latest/desired version + version = "~> 1.2.9" # ensure to update this to the latest/desired version oidc_provider_arn = module.eks.oidc_provider_arn @@ -279,6 +364,83 @@ module "eks_data_addons" { }) ] } + enable_karpenter_resources = true + karpenter_resources_helm_config = { + inferentia-inf2 = { + values = [ + <<-EOT + name: inferentia-inf2 + clusterName: ${module.eks.cluster_name} + ec2NodeClass: + karpenterRole: ${split("/", module.eks_blueprints_addons.karpenter.node_iam_role_arn)[1]} + subnetSelectorTerms: + id: ${module.vpc.private_subnets[3]} + securityGroupSelectorTerms: + tags: + Name: ${module.eks.cluster_name}-node + nodePool: + labels: + - provisioner: inferentia-inf2 + - hub.jupyter.org/node-purpose: user + taints: + - key: aws.amazon.com/neuroncore + value: "true" + effect: "NoSchedule" + - key: aws.amazon.com/neuron + value: "true" + effect: "NoSchedule" + - key: hub.jupyter.org/dedicated # According to optimization docs https://z2jh.jupyter.org/en/latest/administrator/optimization.html + operator: "Equal" + value: "user" + effect: "NoSchedule" + requirements: + - key: "karpenter.k8s.aws/instance-family" + operator: In + values: ["inf2"] + - key: "karpenter.k8s.aws/instance-size" + operator: In + values: ["24xlarge", "48xlarge"] + - key: "kubernetes.io/arch" + operator: In + values: ["amd64"] + - key: "karpenter.sh/capacity-type" + operator: In + values: ["spot", "on-demand"] + EOT + ] + } + default = { + values = [ + <<-EOT + clusterName: ${module.eks.cluster_name} + ec2NodeClass: + karpenterRole: ${split("/", module.eks_blueprints_addons.karpenter.node_iam_role_arn)[1]} + subnetSelectorTerms: + id: ${module.vpc.private_subnets[3]} + securityGroupSelectorTerms: + tags: + Name: ${module.eks.cluster_name}-node + nodePool: + labels: + - provisioner: default + - workload: rayhead + requirements: + - key: "karpenter.k8s.aws/instance-family" + operator: In + values: ["c5", "m5", "r5"] + - key: "karpenter.k8s.aws/instance-size" + operator: In + values: 
["xlarge", "2xlarge", "4xlarge", "8xlarge", "16xlarge", "24xlarge"] + - key: "kubernetes.io/arch" + operator: In + values: ["amd64"] + - key: "karpenter.sh/capacity-type" + operator: In + values: ["spot", "on-demand"] + EOT + ] + } + } } #--------------------------------------------------------------- @@ -324,29 +486,6 @@ resource "aws_secretsmanager_secret_version" "grafana" { secret_string = random_password.grafana.result } -locals { - karpenter_trn1_32xl_lt_name = format("%s-trn132xl-lt", local.name) -} - -#--------------------------------------- -# Karpenter Provisioners -#--------------------------------------- -data "kubectl_path_documents" "karpenter_provisioners" { - pattern = "${path.module}/karpenter-provisioners/karpenter-*.yaml" - vars = { - azs = local.region - eks_cluster_id = local.name - launch_template_name = local.karpenter_trn1_32xl_lt_name - } -} - -resource "kubectl_manifest" "karpenter_provisioner" { - for_each = toset(data.kubectl_path_documents.karpenter_provisioners.documents) - yaml_body = each.value - - depends_on = [module.eks_blueprints_addons] -} - #tfsec:ignore:* module "s3_bucket" { source = "terraform-aws-modules/s3-bucket/aws" @@ -359,147 +498,6 @@ module "s3_bucket" { tags = local.tags } -#--------------------------------------------------------------- -# Create a Launch Template Userdata for Trainium -# Note: As of version v0.29.0, the Karpenter AWSNodeTemplate lacks the ability to configure multipleNetwork interfaces for EFA. -# To work around this limitation, we are utilizing Terraform to generate launch templates that include EFA configurations. -# These launch templates are then used as input for the AWS Node template, enabling us to achieve the desired network interface setups. -#--------------------------------------------------------------- -data "cloudinit_config" "trn1_lt" { - base64_encode = true - gzip = false - boundary = "//" - - # Prepend to existing user data supplied by AWS EKS - part { - content_type = "text/x-shellscript" - content = <<-EOT - cat <<-EOF > /etc/profile.d/bootstrap.sh - #!/bin/sh - - # Configure NVMe volumes in RAID0 configuration - # https://github.com/awslabs/amazon-eks-ami/blob/056e31f8c7477e893424abce468cb32bbcd1f079/files/bootstrap.sh#L35C121-L35C126 - # Mount will be: /mnt/k8s-disks - export LOCAL_DISKS='raid0' - - # Install Neuron monitoring tools - yum install aws-neuronx-tools-2.* -y - export PATH=/opt/aws/neuron/bin:$PATH - - # EFA Setup for Trainium and Inferentia - export FI_EFA_USE_DEVICE_RDMA=1 - export FI_PROVIDER=efa - export FI_EFA_FORK_SAFE=1 - - curl -O https://efa-installer.amazonaws.com/aws-efa-installer-latest.tar.gz - tar -xf aws-efa-installer-latest.tar.gz && cd aws-efa-installer - ./efa_installer.sh -y -g - /opt/amazon/efa/bin/fi_info -p efa - EOF - - # Source extra environment variables in bootstrap script - sed -i '/^set -o errexit/a\\nsource /etc/profile.d/bootstrap.sh' /etc/eks/bootstrap.sh - - # Bootstrap the node - B64_CLUSTER_CA=${module.eks.cluster_certificate_authority_data} - API_SERVER_URL=${module.eks.cluster_endpoint} - /etc/eks/bootstrap.sh ${local.name} --kubelet-extra-args "--node-labels=eks.amazonaws.com/nodegroup-image=${data.aws_ami.eks_gpu.id}" --b64-cluster-ca $B64_CLUSTER_CA --apiserver-endpoint $API_SERVER_URL - - EOT - } -} - -#--------------------------------------------------------------- -# This Terraform code defines a data block to fetch the most recent Amazon Machine Image (AMI) -# for an Amazon Elastic Kubernetes Service (EKS) cluster with GPU support. 
-#--------------------------------------------------------------- -data "aws_ami" "eks_gpu" { - owners = ["amazon"] - most_recent = true - - filter { - name = "name" - values = ["amazon-eks-gpu-node-${var.eks_cluster_version}-*"] - } -} - -#--------------------------------------------------------------- -# AWS Launch Template Configuration for Karpenter Trn1.32xlarge Instances -#--------------------------------------------------------------- -resource "aws_launch_template" "trn1_lt" { - name = local.karpenter_trn1_32xl_lt_name - description = "Karpenter Trn1.32xlarge Launch Template" - - user_data = data.cloudinit_config.trn1_lt.rendered - - ebs_optimized = true - - image_id = data.aws_ami.eks_gpu.id - - iam_instance_profile { - name = module.eks_blueprints_addons.karpenter.node_instance_profile_name - } - - # Commented for visibility to implement this feature in the future - # placement { - # tenancy = "default" - # availability_zone = "${local.region}d" - # group_name = local.karpenter_trn1_32xl_lt_name - # } - - metadata_options { - http_endpoint = "enabled" - http_tokens = "required" - http_put_response_hop_limit = 2 - } - - block_device_mappings { - device_name = "/dev/xvda" - ebs { - volume_size = 100 - delete_on_termination = true - volume_type = "gp3" - } - } - - monitoring { - enabled = true - } - - tag_specifications { - resource_type = "instance" - - tags = merge(local.tags, { - "karpenter.sh/discovery" = local.name - }) - } - - # First network interface with device_index=0 and network_card_index=0 - network_interfaces { - device_index = 0 - network_card_index = 0 - associate_public_ip_address = false - interface_type = "efa" - delete_on_termination = true - security_groups = [module.eks.node_security_group_id] - description = "Karpenter EFA config for Trainium" - } - - # Additional network interfaces with device_index=1 and network_card_index ranging from 1 to 7 - dynamic "network_interfaces" { - for_each = range(1, 8) # Create 7 additional network interfaces - content { - device_index = 1 - network_card_index = network_interfaces.value - associate_public_ip_address = false - interface_type = "efa" - delete_on_termination = true - security_groups = [module.eks.node_security_group_id] - description = "Karpenter EFA config for Trainium" - } - } -} - #--------------------------------------------------------------- # MPI Operator for distributed training on Trainium #--------------------------------------------------------------- @@ -516,3 +514,167 @@ resource "kubectl_manifest" "mpi_operator" { yaml_body = each.value depends_on = [module.eks.eks_cluster_id] } + +#--------------------------------------------------------------- +# Create a Launch Template Userdata for Trainium, and use it in Karpenter, deprecated +# This commented section of the pattern is commented due to lack of support in utilizing LaunchTemplates in newer Karpenter versions. 
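+# With the v1beta1 APIs, equivalent node customization can instead be supplied through the
+# EC2NodeClass userData field, which the chart under helm-values/karpenter-resources exposes
+# via the `ec2NodeClass.userData` value.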
+# See full change list https://github.com/aws/karpenter-provider-aws/blob/d1d1371ae2e1552b8fdded7d343bf24ea18bee31/designs/v1beta1-full-changelist.md#remove-speclaunchtemplate +#--------------------------------------------------------------- +# data "cloudinit_config" "trn1_lt" { +# base64_encode = true +# gzip = false +# boundary = "//" + +# # Prepend to existing user data supplied by AWS EKS +# part { +# content_type = "text/x-shellscript" +# content = <<-EOT +# cat <<-EOF > /etc/profile.d/bootstrap.sh +# #!/bin/sh + +# # Configure NVMe volumes in RAID0 configuration +# # https://github.com/awslabs/amazon-eks-ami/blob/056e31f8c7477e893424abce468cb32bbcd1f079/files/bootstrap.sh#L35C121-L35C126 +# # Mount will be: /mnt/k8s-disks +# export LOCAL_DISKS='raid0' + +# # Install Neuron monitoring tools +# yum install aws-neuronx-tools-2.* -y +# export PATH=/opt/aws/neuron/bin:$PATH + +# # EFA Setup for Trainium and Inferentia +# export FI_EFA_USE_DEVICE_RDMA=1 +# export FI_PROVIDER=efa +# export FI_EFA_FORK_SAFE=1 + +# curl -O https://efa-installer.amazonaws.com/aws-efa-installer-latest.tar.gz +# tar -xf aws-efa-installer-latest.tar.gz && cd aws-efa-installer +# ./efa_installer.sh -y -g +# /opt/amazon/efa/bin/fi_info -p efa +# EOF + +# # Source extra environment variables in bootstrap script +# sed -i '/^set -o errexit/a\\nsource /etc/profile.d/bootstrap.sh' /etc/eks/bootstrap.sh + +# # Bootstrap the node +# B64_CLUSTER_CA=${module.eks.cluster_certificate_authority_data} +# API_SERVER_URL=${module.eks.cluster_endpoint} +# /etc/eks/bootstrap.sh ${local.name} --kubelet-extra-args "--node-labels=eks.amazonaws.com/nodegroup-image=${data.aws_ami.eks_gpu.id}" --b64-cluster-ca $B64_CLUSTER_CA --apiserver-endpoint $API_SERVER_URL + +# EOT +# } +# } + +#--------------------------------------------------------------- +# This Terraform code defines a data block to fetch the most recent Amazon Machine Image (AMI) +# for an Amazon Elastic Kubernetes Service (EKS) cluster with GPU support. 
+#--------------------------------------------------------------- +# data "aws_ami" "eks_gpu" { +# owners = ["amazon"] +# most_recent = true + +# filter { +# name = "name" +# values = ["amazon-eks-gpu-node-${var.eks_cluster_version}-*"] +# } +# } + + +# locals { +# karpenter_trn1_32xl_lt_name = format("%s-trn132xl-lt", local.name) +# } + +#--------------------------------------------------------------- +# AWS Launch Template Configuration for Karpenter Trn1.32xlarge Instances +#--------------------------------------------------------------- +# resource "aws_launch_template" "trn1_lt" { +# name = local.karpenter_trn1_32xl_lt_name +# description = "Karpenter Trn1.32xlarge Launch Template" + +# user_data = data.cloudinit_config.trn1_lt.rendered + +# ebs_optimized = true + +# image_id = data.aws_ami.eks_gpu.id + +# iam_instance_profile { +# name = module.eks_blueprints_addons.karpenter.node_instance_profile_name +# } + +# # Commented for visibility to implement this feature in the future +# # placement { +# # tenancy = "default" +# # availability_zone = "${local.region}d" +# # group_name = local.karpenter_trn1_32xl_lt_name +# # } + +# metadata_options { +# http_endpoint = "enabled" +# http_tokens = "required" +# http_put_response_hop_limit = 2 +# } + +# block_device_mappings { +# device_name = "/dev/xvda" +# ebs { +# volume_size = 100 +# delete_on_termination = true +# volume_type = "gp3" +# } +# } + +# monitoring { +# enabled = true +# } + +# tag_specifications { +# resource_type = "instance" + +# tags = merge(local.tags, { +# "karpenter.sh/discovery" = local.name +# }) +# } + +# # First network interface with device_index=0 and network_card_index=0 +# network_interfaces { +# device_index = 0 +# network_card_index = 0 +# associate_public_ip_address = false +# interface_type = "efa" +# delete_on_termination = true +# security_groups = [module.eks.node_security_group_id] +# description = "Karpenter EFA config for Trainium" +# } + +# # Additional network interfaces with device_index=1 and network_card_index ranging from 1 to 7 +# dynamic "network_interfaces" { +# for_each = range(1, 8) # Create 7 additional network interfaces +# content { +# device_index = 1 +# network_card_index = network_interfaces.value +# associate_public_ip_address = false +# interface_type = "efa" +# delete_on_termination = true +# security_groups = [module.eks.node_security_group_id] +# description = "Karpenter EFA config for Trainium" +# } +# } +# } + +# #--------------------------------------- +# # Karpenter Provisioners +# #--------------------------------------- +# data "kubectl_path_documents" "karpenter_provisioners" { +# pattern = "${path.module}/karpenter-provisioners/karpenter-*.yaml" +# vars = { +# azs = local.region +# eks_cluster_id = local.name +# launch_template_name = local.karpenter_trn1_32xl_lt_name +# } +# } + +# resource "kubectl_manifest" "karpenter_provisioner" { +# for_each = toset(data.kubectl_path_documents.karpenter_provisioners.documents) +# yaml_body = each.value + +# depends_on = [module.eks_blueprints_addons] +# } diff --git a/ai-ml/trainium-inferentia/eks.tf b/ai-ml/trainium-inferentia/eks.tf index 5175673d0..621ca8fc8 100644 --- a/ai-ml/trainium-inferentia/eks.tf +++ b/ai-ml/trainium-inferentia/eks.tf @@ -15,7 +15,7 @@ module "eks" { # Filtering only Secondary CIDR private subnets starting with "100.". 
Subnet IDs where the EKS Control Plane ENIs will be created subnet_ids = compact([for subnet_id, cidr_block in zipmap(module.vpc.private_subnets, module.vpc.private_subnets_cidr_blocks) : substr(cidr_block, 0, 4) == "100." ? subnet_id : null]) - + manage_aws_auth_configmap = true aws_auth_roles = [ diff --git a/ai-ml/trainium-inferentia/examples/llama2/2-launch-cmd-shell-pod.sh b/ai-ml/trainium-inferentia/examples/llama2/2-launch-cmd-shell-pod.sh index 75cabf752..89bdd3181 100755 --- a/ai-ml/trainium-inferentia/examples/llama2/2-launch-cmd-shell-pod.sh +++ b/ai-ml/trainium-inferentia/examples/llama2/2-launch-cmd-shell-pod.sh @@ -18,7 +18,7 @@ fi ECR_REPO_URI=$(cat .ecr_repo_uri) echo -e "Using container image $ECR_REPO_URI:latest" -# Launch the cmd-shell pod using the container image created by 1-llama2-neuronx-pretrain-build-image.sh +# Launch the cmd-shell pod using the container image created by 1-llama2-neuronx-pretrain-build-image.sh kubectl apply -f - <' > temp-values.yaml + helm template my-release-name . --values temp-values.yaml + rm temp-values.yaml + ``` + + This will output the Kubernetes manifests to your terminal, allowing you to review them. diff --git a/ai-ml/trainium-inferentia/helm-values/karpenter-resources/templates/node-class.yaml b/ai-ml/trainium-inferentia/helm-values/karpenter-resources/templates/node-class.yaml new file mode 100644 index 000000000..604e7f8e8 --- /dev/null +++ b/ai-ml/trainium-inferentia/helm-values/karpenter-resources/templates/node-class.yaml @@ -0,0 +1,56 @@ +{{- if .Values.ec2NodeClass.enabled }} +apiVersion: karpenter.k8s.aws/v1beta1 +kind: EC2NodeClass +metadata: + name: {{ .Values.name }} +spec: + {{- if .Values.ec2NodeClass.amiFamily }} + amiFamily: {{ .Values.ec2NodeClass.amiFamily }} + {{- else if .Values.ec2NodeClass.amiSelectorTerms }} + amiSelectorTerms: + {{- toYaml .Values.ec2NodeClass.amiSelectorTerms | nindent 4 }} + {{- end }} + subnetSelectorTerms: + {{- if .Values.ec2NodeClass.subnetSelectorTerms.tags }} + - tags: + {{- range $key, $value := .Values.ec2NodeClass.subnetSelectorTerms.tags }} + {{ $key }}: {{ $value | quote }} + {{- end }} + {{- end }} + {{- if .Values.ec2NodeClass.subnetSelectorTerms.id }} + - id: {{ .Values.ec2NodeClass.subnetSelectorTerms.id }} + {{- end }} + securityGroupSelectorTerms: + {{- if .Values.ec2NodeClass.securityGroupSelectorTerms.name }} + - name: {{ .Values.ec2NodeClass.securityGroupSelectorTerms.name }} + {{- end }} + {{- if .Values.ec2NodeClass.securityGroupSelectorTerms.id }} + - id: {{ .Values.ec2NodeClass.securityGroupSelectorTerms.id }} + {{- end }} + {{- if .Values.ec2NodeClass.securityGroupSelectorTerms.tags }} + - tags: + {{- range $key, $value := .Values.ec2NodeClass.securityGroupSelectorTerms.tags }} + {{ $key }}: {{ $value | quote }} + {{- end }} + {{- end }} + role: {{ .Values.ec2NodeClass.karpenterRole }} + tags: + Name: karpenter-{{ .Values.name }} + metadataOptions: + httpEndpoint: {{ .Values.ec2NodeClass.metadataOptions.httpEndpoint }} + httpProtocolIPv6: {{ .Values.ec2NodeClass.metadataOptions.httpProtocolIPv6 }} + httpPutResponseHopLimit: {{ .Values.ec2NodeClass.metadataOptions.httpPutResponseHopLimit }} + httpTokens: {{ .Values.ec2NodeClass.metadataOptions.httpTokens }} + blockDeviceMappings: + - deviceName: {{ default "/dev/xvda" .Values.ec2NodeClass.blockDevice.deviceName }} + ebs: + volumeSize: {{ .Values.ec2NodeClass.blockDevice.volumeSize }} + volumeType: {{ .Values.ec2NodeClass.blockDevice.volumeType }} + encrypted: {{ .Values.ec2NodeClass.blockDevice.encrypted }} + 
deleteOnTermination: {{ .Values.ec2NodeClass.blockDevice.deleteOnTermination }} + detailedMonitoring: {{ .Values.ec2NodeClass.detailedMonitoring }} + {{- if .Values.ec2NodeClass.userData }} + userData: | + {{- .Values.ec2NodeClass.userData | nindent 4 }} + {{- end }} +{{- end }} diff --git a/ai-ml/trainium-inferentia/helm-values/karpenter-resources/templates/node-pool.yaml b/ai-ml/trainium-inferentia/helm-values/karpenter-resources/templates/node-pool.yaml new file mode 100644 index 000000000..0ac17988f --- /dev/null +++ b/ai-ml/trainium-inferentia/helm-values/karpenter-resources/templates/node-pool.yaml @@ -0,0 +1,36 @@ +{{- if .Values.nodePool.enabled }} +apiVersion: karpenter.sh/v1beta1 +kind: NodePool +metadata: + name: {{ .Values.name }} +spec: + template: + metadata: + labels: + NodePool: {{ .Values.name }} + NodeGroupType: {{ .Values.name }} + {{- with .Values.nodePool.labels }} + {{- range . }} + {{- toYaml . | nindent 8 }} + {{- end }} + {{- end }} + spec: + nodeClassRef: + name: {{ .Values.name }} + {{- with .Values.nodePool.taints }} + taints: + {{- toYaml . | nindent 8 }} + {{- end }} + {{- with .Values.nodePool.requirements }} + requirements: + {{- toYaml . | nindent 8 }} + {{- end }} + disruption: + consolidationPolicy: {{ .Values.nodePool.disruption.consolidationPolicy }} + consolidateAfter: {{ .Values.nodePool.disruption.consolidateAfter }} + expireAfter: {{ .Values.nodePool.disruption.expireAfter }} + limits: + cpu: {{ .Values.nodePool.limits.cpu }} + memory: {{ .Values.nodePool.limits.memory }} + weight: {{ .Values.nodePool.weight }} +{{- end }} diff --git a/ai-ml/trainium-inferentia/helm-values/karpenter-resources/values.yaml b/ai-ml/trainium-inferentia/helm-values/karpenter-resources/values.yaml new file mode 100644 index 000000000..456dce270 --- /dev/null +++ b/ai-ml/trainium-inferentia/helm-values/karpenter-resources/values.yaml @@ -0,0 +1,40 @@ +# Shared values +name: default +clusterName: test-cluster + +# EC2NodeClass specific values +ec2NodeClass: + enabled: true + amiFamily: AL2 + amiSelectorTerms: + subnetSelectorTerms: # tag or id see documentation, https://karpenter.sh/docs/concepts/nodeclasses/ + securityGroupSelectorTerms: # tag, name, id see documentation, https://karpenter.sh/docs/concepts/nodeclasses/ + karpenterRole: + metadataOptions: + httpEndpoint: enabled + httpProtocolIPv6: disabled + httpPutResponseHopLimit: 2 + httpTokens: required + blockDevice: + deviceName: /dev/xvda + volumeSize: 100Gi + volumeType: gp3 + encrypted: true + deleteOnTermination: true + detailedMonitoring: true + userData: + +# NodePool specific values +nodePool: + enabled: true + labels: + taints: + requirements: + disruption: + consolidationPolicy: WhenEmpty + consolidateAfter: 30s + expireAfter: 720h + limits: + cpu: "1000" + memory: 1000Gi + weight: 10 diff --git a/ai-ml/trainium-inferentia/karpenter-provisioners/karpenter-default.yaml b/ai-ml/trainium-inferentia/karpenter-provisioners/karpenter-default.yaml deleted file mode 100644 index 1ba2eb9fd..000000000 --- a/ai-ml/trainium-inferentia/karpenter-provisioners/karpenter-default.yaml +++ /dev/null @@ -1,54 +0,0 @@ ---- -apiVersion: karpenter.sh/v1alpha5 -kind: Provisioner -metadata: - name: default -spec: - providerRef: - name: default - requirements: - - key: "topology.kubernetes.io/zone" - operator: In - values: [${azs}d] - - key: "karpenter.k8s.aws/instance-category" - operator: In - values: ["c", "m", "r"] - - key: "karpenter.k8s.aws/instance-size" - operator: In - values: ["xlarge", "2xlarge", "4xlarge", "8xlarge", 
"16xlarge"] - - key: "kubernetes.io/arch" - operator: In - values: ["amd64"] - - key: "karpenter.sh/capacity-type" - operator: In - values: ["spot", "on-demand"] - labels: - provisioner: default - workload: rayhead - limits: - resources: - cpu: 1000 - memory: 20000Gi - ttlSecondsAfterEmpty: 120 # optional, but never scales down if not set - ---- -apiVersion: karpenter.k8s.aws/v1alpha1 -kind: AWSNodeTemplate -metadata: - name: default -spec: - amiFamily: AL2 - blockDeviceMappings: - - deviceName: /dev/xvda - ebs: - volumeSize: 100Gi - volumeType: gp3 - encrypted: true - subnetSelector: - Name: ${eks_cluster_id}-private* - securityGroupSelector: - Name: ${eks_cluster_id}-node* - tags: - managed-by: "karpenter" - intent: "apps" - Name: "karpenter-node-default" diff --git a/ai-ml/trainium-inferentia/karpenter-provisioners/karpenter-inf2.yaml b/ai-ml/trainium-inferentia/karpenter-provisioners/karpenter-inf2.yaml deleted file mode 100644 index 262e307d2..000000000 --- a/ai-ml/trainium-inferentia/karpenter-provisioners/karpenter-inf2.yaml +++ /dev/null @@ -1,58 +0,0 @@ ---- -apiVersion: karpenter.sh/v1alpha5 -kind: Provisioner -metadata: - name: inferentia-inf2 - namespace: karpenter -spec: - requirements: - - key: "topology.kubernetes.io/zone" - operator: In - values: [${azs}d] - - key: "karpenter.sh/capacity-type" - operator: In - values: ["spot", "on-demand"] - - key: "node.kubernetes.io/instance-type" - operator: In - values: ["inf2.24xlarge", "inf2.48xlarge"] - - key: "kubernetes.io/arch" - operator: In - values: ["amd64"] - providerRef: - name: inferentia-inf2 - labels: - provisioner: inferentia-inf2 - hub.jupyter.org/node-purpose: user - taints: - - key: aws.amazon.com/neuroncore - value: "true" - effect: "NoSchedule" - - key: aws.amazon.com/neuron - value: "true" - effect: "NoSchedule" - - key: hub.jupyter.org/dedicated # According to optimization docs https://z2jh.jupyter.org/en/latest/administrator/optimization.html - operator: "Equal" - value: "user" - effect: "NoSchedule" - ttlSecondsAfterEmpty: 300 # optional, but never scales down if not set - ---- -apiVersion: karpenter.k8s.aws/v1alpha1 -kind: AWSNodeTemplate -metadata: - name: inferentia-inf2 - namespace: karpenter -spec: - amiFamily: AL2 - blockDeviceMappings: - - deviceName: /dev/xvda - ebs: - volumeSize: 200Gi - volumeType: gp3 - encrypted: true - subnetSelector: - Name: ${eks_cluster_id}-private* - securityGroupSelector: - Name: ${eks_cluster_id}-node* - tags: - InstanceType: "inferentia" diff --git a/ai-ml/trainium-inferentia/karpenter-provisioners/karpenter-trn1.yaml b/ai-ml/trainium-inferentia/karpenter-provisioners/karpenter-trn1.yaml deleted file mode 100644 index 1297054f8..000000000 --- a/ai-ml/trainium-inferentia/karpenter-provisioners/karpenter-trn1.yaml +++ /dev/null @@ -1,53 +0,0 @@ ---- -apiVersion: karpenter.sh/v1alpha5 -kind: Provisioner -metadata: - name: trainium-trn1 - namespace: karpenter # Same namespace as Karpenter add-on installed -spec: - requirements: - - key: "topology.kubernetes.io/zone" - operator: In - values: [${azs}d] - - key: "karpenter.sh/capacity-type" - operator: In - values: ["spot", "on-demand"] - - key: "node.kubernetes.io/instance-type" - operator: In - values: ["trn1.32xlarge", "trn1n.32xlarge"] # trn1.2xlarge, trn1n.32xlarge - - key: "kubernetes.io/arch" - operator: In - values: ["amd64"] - providerRef: - name: trainium-trn1 - labels: - provisioner: trainium-trn1 - hub.jupyter.org/node-purpose: user - taints: - - key: aws.amazon.com/neuroncore - value: "true" - effect: "NoSchedule" - - 
key: aws.amazon.com/neuron - value: "true" - effect: "NoSchedule" - - key: hub.jupyter.org/dedicated # According to optimization docs https://z2jh.jupyter.org/en/latest/administrator/optimization.html - operator: "Equal" - value: "user" - effect: "NoSchedule" - ttlSecondsAfterEmpty: 300 # optional, but never scales down if not set - -# Note: As of version v0.29.0, the Karpenter AWSNodeTemplate lacks the ability to configure multipleNetwork interfaces for EFA. -# To work around this limitation, we are utilizing Terraform to generate launch templates that include EFA configurations. -# These launch templates are then used as input for the AWS Node template, enabling us to achieve the desired network interface setups. ---- -apiVersion: karpenter.k8s.aws/v1alpha1 -kind: AWSNodeTemplate -metadata: - name: trainium-trn1 - namespace: karpenter -spec: - subnetSelector: - Name: "${eks_cluster_id}-private*" - launchTemplate: "${launch_template_name}" - tags: - InstanceType: "trainium" diff --git a/ai-ml/trainium-inferentia/main.tf b/ai-ml/trainium-inferentia/main.tf index 56984ff0e..6d3fd5e26 100755 --- a/ai-ml/trainium-inferentia/main.tf +++ b/ai-ml/trainium-inferentia/main.tf @@ -2,8 +2,6 @@ provider "aws" { region = local.region } -# ECR always authenticates with `us-east-1` region -# Docs -> https://docs.aws.amazon.com/AmazonECR/latest/public/public-registries.html provider "aws" { alias = "ecr" region = "us-east-1" @@ -22,62 +20,7 @@ provider "helm" { token = data.aws_eks_cluster_auth.this.token } } -provider "kubectl" { - apply_retry_count = 30 - host = module.eks.cluster_endpoint - cluster_ca_certificate = base64decode(module.eks.cluster_certificate_authority_data) - token = data.aws_eks_cluster_auth.this.token - load_config_file = false -} - -data "aws_eks_cluster_auth" "this" { - name = module.eks.cluster_name -} - -data "aws_ecrpublic_authorization_token" "token" { - provider = aws.ecr -} - -locals { - name = var.name - region = var.region - # Trn1 and Inf2 instances are available in specific AZs in us-east-1, - # us-east-2, and us-west-2. For Trn1, the first AZ id (below) should be used. 
- az_mapping = { - "us-west-2" = ["usw2-az4", "usw2-az1"], - "us-east-1" = ["use1-az6", "use1-az5"], - "us-east-2" = ["use2-az3", "use2-az1"] - } - azs = local.az_mapping[var.region] - tags = { - Blueprint = local.name - GithubRepo = "github.com/awslabs/data-on-eks" - } -} -provider "aws" { - region = local.region -} -# ECR always authenticates with `us-east-1` region -# Docs -> https://docs.aws.amazon.com/AmazonECR/latest/public/public-registries.html -provider "aws" { - alias = "ecr" - region = "us-east-1" -} - -provider "kubernetes" { - host = module.eks.cluster_endpoint - cluster_ca_certificate = base64decode(module.eks.cluster_certificate_authority_data) - token = data.aws_eks_cluster_auth.this.token -} - -provider "helm" { - kubernetes { - host = module.eks.cluster_endpoint - cluster_ca_certificate = base64decode(module.eks.cluster_certificate_authority_data) - token = data.aws_eks_cluster_auth.this.token - } -} provider "kubectl" { apply_retry_count = 30 host = module.eks.cluster_endpoint @@ -109,4 +52,4 @@ locals { Blueprint = local.name GithubRepo = "github.com/awslabs/data-on-eks" } -} +} \ No newline at end of file diff --git a/ai-ml/trainium-inferentia/outputs.tf b/ai-ml/trainium-inferentia/outputs.tf index 35354df9e..40adfb3b3 100755 --- a/ai-ml/trainium-inferentia/outputs.tf +++ b/ai-ml/trainium-inferentia/outputs.tf @@ -1,4 +1,4 @@ output "configure_kubectl" { description = "Configure kubectl: make sure you're logged in with the correct AWS profile and run the following command to update your kubeconfig" value = "aws eks --region ${var.region} update-kubeconfig --name ${local.name}" -} \ No newline at end of file +} diff --git a/ai-ml/trainium-inferentia/variables.tf b/ai-ml/trainium-inferentia/variables.tf index 094ea931d..f4259359b 100755 --- a/ai-ml/trainium-inferentia/variables.tf +++ b/ai-ml/trainium-inferentia/variables.tf @@ -53,49 +53,48 @@ variable "enable_mpi_operator" { variable "trn1_32xl_min_size" { description = "trn1 Worker node minimum size" - type = number - default = 0 + type = number + default = 0 } variable "trn1_32xl_desired_size" { description = "trn1 Worker node desired size" - type = number - default = 0 + type = number + default = 0 } variable "trn1n_32xl_min_size" { description = "Worker node minimum size" - type = number - default = 0 + type = number + default = 0 } variable "trn1n_32xl_desired_size" { description = "Worker node desired size" - type = number - default = 0 + type = number + default = 0 } variable "inf2_24xl_min_size" { description = "Worker node minimum size" - type = number - default = 0 + type = number + default = 0 } variable "inf2_24xl_desired_size" { description = "Worker node desired size" - type = number - default = 0 + type = number + default = 0 } variable "inf2_48xl_min_size" { description = "Worker node minimum size" - type = number - default = 0 + type = number + default = 0 } variable "inf2_48xl_desired_size" { description = "Worker node desired size" - type = number - default = 0 + type = number + default = 0 } - diff --git a/schedulers/terraform/managed-airflow-mwaa/eks.tf b/schedulers/terraform/managed-airflow-mwaa/eks.tf index 7e537afa1..e107ab071 100644 --- a/schedulers/terraform/managed-airflow-mwaa/eks.tf +++ b/schedulers/terraform/managed-airflow-mwaa/eks.tf @@ -42,11 +42,11 @@ module "eks" { # MWAA needs access to the EKS control plane in order to submit a job allow_access_from_mwaa = { - description = "Access from MWAA" - protocol = "tcp" - from_port = 443 - to_port = 443 - type = "ingress" + description = 
"Access from MWAA" + protocol = "tcp" + from_port = 443 + to_port = 443 + type = "ingress" source_security_group_id = module.mwaa.mwaa_security_group_id } } diff --git a/website/docs/blueprints/ai-ml/trainium.md b/website/docs/blueprints/ai-ml/trainium.md index f845f138c..954c9f205 100644 --- a/website/docs/blueprints/ai-ml/trainium.md +++ b/website/docs/blueprints/ai-ml/trainium.md @@ -64,6 +64,10 @@ In this [example](https://github.com/awslabs/data-on-eks/tree/main/ai-ml/trainiu - Prepare the necessary etcd setup as a prerequisite for TorchX. - Create a test queue within Volcano to enable TorchX job submission to this specific queue. +:::info +**Important**: In this setup, Karpenter is utilized exclusively for `inferentia-inf2` instances, due to its current limitations in custom networking interfaces configuration. For Trainium instances, managed node groups and the Cluster Autoscaler are employed for scaling purposes. For users working with an older version of Karpenter (specifically, the `v1alpha5` APIs), please note that the configuration for Trainium with `LaunchTemplates` is still accessible. It can be found in the `data-on-eks/ai-ml/trainium-inferentia/addons.tf` file, although it is commented out at the file's end. +::: + ### Prerequisites Ensure that you have installed the following tools on your machine. @@ -83,7 +87,7 @@ git clone https://github.com/awslabs/data-on-eks.git Navigate into one of the example directories and run `install.sh` script ```bash -cd data-on-eks/ai-ml/trainium/ && chmod +x install.sh +cd data-on-eks/ai-ml/trainium-inferentia/ && chmod +x install.sh ./install.sh ``` @@ -92,12 +96,12 @@ cd data-on-eks/ai-ml/trainium/ && chmod +x install.sh Verify the Amazon EKS Cluster ```bash -aws eks describe-cluster --name trainium +aws eks describe-cluster --name trainium-inferentia ``` ```bash # Creates k8s config file to authenticate with EKS -aws eks --region us-west-2 update-kubeconfig --name trainium +aws eks --region us-west-2 update-kubeconfig --name trainium-inferentia kubectl get nodes # Output shows the EKS Managed Node group nodes diff --git a/website/docs/gen-ai/training/Llama2.md b/website/docs/gen-ai/training/Llama2.md index 8a3f2cef8..816667274 100644 --- a/website/docs/gen-ai/training/Llama2.md +++ b/website/docs/gen-ai/training/Llama2.md @@ -137,7 +137,7 @@ Navigate to examples/llama2 directory cd examples/llama2/ ``` -Run the `1-llama2-neuronx-pretrain-build-image.sh` script to build the neuronx-nemo-megatron container image and push the image into ECR. +Run the `1-llama2-neuronx-pretrain-build-image.sh` script to build the neuronx-nemo-megatron container image and push the image into ECR. When prompted for a region, enter the region in which you launched your EKS cluster, above. @@ -149,7 +149,7 @@ Note: The image building and pushing to ECR will take ~10 minutes ### Launch and connect to a CLI pod -In this step we need access to the shared FSx storage. To copy files to this storage, we’ll first launch and connect to a CLI pod running the neuronx-nemo-megatron docker image that you created above. +In this step we need access to the shared FSx storage. To copy files to this storage, we’ll first launch and connect to a CLI pod running the neuronx-nemo-megatron docker image that you created above. 
Run the following script to launch the CLI pod:

@@ -227,9 +227,9 @@ python3 neuronx-nemo-megatron/nemo/scripts/nlp_language_modeling/preprocess_data

 Note: When we later launch our training jobs in EKS, the training pods will run the training script from within the neuronx-nemo-megatron/nemo/examples directory on FSx. This is convenient, because it lets you modify your training script directly on FSx without rebuilding the neuronx-nemo-megatron container for every change.

-Modify the test_llama.sh script `/shared/neuronx-nemo-megatron/nemo/examples/nlp/language_modeling/test_llama.sh` to update the following two lines. These lines tell the training pod workers where to find the Llama tokenizer and the dataset on the FSx filesystem.
+Modify the test_llama.sh script `/shared/neuronx-nemo-megatron/nemo/examples/nlp/language_modeling/test_llama.sh` to update the following two lines. These lines tell the training pod workers where to find the Llama tokenizer and the dataset on the FSx filesystem.

-You can use any common text editor such as nano or vim to make these changes.
+You can use any common text editor such as nano or vim to make these changes.

 Run:

 ```bash
@@ -279,7 +279,7 @@ Run the pre-compilation script

 ./3-llama2-neuronx-mpi-compile.sh
 ```

-Pre-compilation will take ~10 minutes when using 4 trn1.32xlarge nodes.
+Pre-compilation will take ~10 minutes when using 4 trn1.32xlarge nodes.

 Periodically run `kubectl get pods | grep compile` and wait until you see that the compile job shows 'Completed'.

@@ -333,7 +333,7 @@ Run the following script to create a TensorBoard deployment so you can visualize

 ./5-deploy-tensorboard.sh
 ```

-Once the deployment is ready the script will output a password-protected URL for your new TensorBoard deployment.
+Once the deployment is ready, the script will output a password-protected URL for your new TensorBoard deployment.

 Launch the URL to view your training progress.
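+
+As a quick sanity check before opening the URL, you can confirm that the TensorBoard resources are up. A sketch (the `app=tensorboard` label selector is an assumption; adjust it to match the labels applied by `5-deploy-tensorboard.sh`):
+
+```bash
+# Confirm the TensorBoard pod is Running, then find the Ingress serving the URL
+kubectl get pods -l app=tensorboard
+kubectl get ingress
+```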