Skip to content

Commit

Permalink
refactor: Trainium inferentia karpenter provisioners update (#420)
Browse files Browse the repository at this point in the history
Co-authored-by: Sanjeev Ganjihal <sanjeevrg7@gmail.com>
Co-authored-by: Vara Bonthu <vara.bonthu@gmail.com>
  • Loading branch information
3 people authored Feb 1, 2024
1 parent 006321d commit e0abaad
Show file tree
Hide file tree
Showing 23 changed files with 578 additions and 429 deletions.
492 changes: 327 additions & 165 deletions ai-ml/trainium-inferentia/addons.tf

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion ai-ml/trainium-inferentia/eks.tf
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@ module "eks" {
# Filtering only Secondary CIDR private subnets starting with "100.". Subnet IDs where the EKS Control Plane ENIs will be created
subnet_ids = compact([for subnet_id, cidr_block in zipmap(module.vpc.private_subnets, module.vpc.private_subnets_cidr_blocks) :
substr(cidr_block, 0, 4) == "100." ? subnet_id : null])


manage_aws_auth_configmap = true
aws_auth_roles = [
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@ fi
ECR_REPO_URI=$(cat .ecr_repo_uri)
echo -e "Using container image $ECR_REPO_URI:latest"

# Launch the cmd-shell pod using the container image created by 1-llama2-neuronx-pretrain-build-image.sh
# Launch the cmd-shell pod using the container image created by 1-llama2-neuronx-pretrain-build-image.sh
kubectl apply -f - <<EOF
apiVersion: v1
kind: Pod
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@ fi
ECR_REPO_URI=$(cat .ecr_repo_uri)
echo -e "Using container image $ECR_REPO_URI:latest"

# Launch the llama2-7B pre-compilation pod using the container image created by 1-llama2-neuronx-pretrain-build-image.sh
# Launch the llama2-7B pre-compilation pod using the container image created by 1-llama2-neuronx-pretrain-build-image.sh
sed "s|IMG_PLACEHOLDER|$ECR_REPO_URI:latest|" ./example_manifests/llama_compile.yaml | kubectl apply -f -

if [[ "$?" -eq 0 ]]; then
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@ fi
ECR_REPO_URI=$(cat .ecr_repo_uri)
echo -e "Using container image $ECR_REPO_URI:latest"

# Launch the llama2-7B training pod using the container image created by 1-llama2-neuronx-pretrain-build-image.sh
# Launch the llama2-7B training pod using the container image created by 1-llama2-neuronx-pretrain-build-image.sh
sed "s|IMG_PLACEHOLDER|$ECR_REPO_URI:latest|" ./example_manifests/llama_train.yaml | kubectl apply -f -

if [[ "$?" -eq 0 ]]; then
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -117,7 +117,7 @@ STATUS=$(curl -sI $LB_HOST|head -1)
while [[ ! $STATUS =~ Unauthorized ]]
do
echo -n "."
sleep 10
sleep 10
STATUS=$(curl -sI $LB_HOST|head -1)
done

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ RUN apt-get update && apt-get install -y nginx python3-pip apache2-utils curl
RUN python3 -m pip install --upgrade pip && python3 -m pip install tensorboard

# TB_PASSWORD is specified during container build
ARG TB_PASSWORD=""
ARG TB_PASSWORD=""

RUN htpasswd -c -b /etc/nginx/htpasswd admin $TB_PASSWORD
COPY assets/nginx_auth.conf /etc/nginx/sites-enabled/default
Original file line number Diff line number Diff line change
Expand Up @@ -81,5 +81,3 @@ install_helm
install_boto3

echo "Installation of prerequisites complete."


Original file line number Diff line number Diff line change
Expand Up @@ -90,7 +90,7 @@ spec:
memory: "700G"
aws.amazon.com/neuron: "12"
nodeSelector:
karpenter.sh/provisioner-name: inferentia-inf2
provisioner: inferentia-inf2
tolerations:
- key: aws.amazon.com/neuroncore
operator: Exists
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
# Chart metadata for the karpenter-resources Helm chart.
# apiVersion v2 is the chart API version used by Helm 3 charts.
apiVersion: v2
name: karpenter-resources
description: Helm chart for configuring custom resources for Karpenter on the cluster
# version is the chart's own version; appVersion is informational only.
version: 0.0.1
appVersion: 0.0.1
Original file line number Diff line number Diff line change
@@ -0,0 +1,71 @@
# Karpenter Resources Helm Chart

## Overview

This Helm chart is an abstraction layer for deploying different Karpenter node configurations in a Kubernetes cluster managed by Amazon EKS. Each release renders one `EC2NodeClass` and one `NodePool` (both named after the `name` value), so installing the chart several times with different values yields several node pools. It is designed to be driven from Terraform, which supplies the per-release values.

## Prerequisites

- Helm 3.x or later installed
- Terraform installed
- Access to an AWS EKS cluster

## Configuration

The chart is configured to be used with Terraform. Here is an example of how you might define Helm releases for different Karpenter configurations in your Terraform files using EKS Blueprints add-ons:

```hcl
module "eks_blueprints_addons" {
# ... other configurations ...
helm_releases = {
karpenter-resources-default = {
name = "default"
description = "A Helm chart for default node pool"
chart = "${path.module}/helm-values/karpenter-resources"
values = [
<<-EOT
clusterName: ${module.eks.cluster_name}
ec2NodeClass:
karpenterRole: ${split("/", module.eks_blueprints_addons.karpenter.node_iam_role_arn)[1]}
nodePool:
labels:
- provisioner: default
- workload: rayhead
EOT
]
}
}
}
```

## Testing the Chart with Helm Template

To review the Kubernetes manifests that will be generated by the Helm chart based on your configuration, you can use the `helm template` command. This is especially useful for validating your Terraform configurations before applying them.

1. **Change to the Chart Directory**

Navigate to the directory where your Helm chart is located.

```sh
cd path/to/helm-chart
```

2. **Run Helm Template**

Use the `helm template` command with your custom values. For example:

```sh
helm template my-release-name . --values values.yaml
```

Replace `my-release-name` with a name for your release, and `values.yaml` with the path to your custom values file.

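To test specific configurations defined in your Terraform file, you can create a temporary values file with the configuration snippet from your Terraform definition. Replace any Terraform interpolations (`${...}`) with literal values first, because Helm does not evaluate them: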

```sh
echo '<your Terraform values here>' > temp-values.yaml
helm template my-release-name . --values temp-values.yaml
rm temp-values.yaml
```

This will output the Kubernetes manifests to your terminal, allowing you to review them.
Original file line number Diff line number Diff line change
@@ -0,0 +1,56 @@
{{- /*
Renders a Karpenter EC2NodeClass named after .Values.name.

Inputs (all under .Values.ec2NodeClass unless noted):
  enabled                     - render this resource only when true
  amiFamily / amiSelectorTerms - amiFamily wins; amiSelectorTerms is only
                                 emitted when amiFamily is empty
  subnetSelectorTerms         - map with optional keys: tags, id
  securityGroupSelectorTerms  - map with optional keys: name, id, tags
  karpenterRole               - IAM role name for the nodes
  metadataOptions, blockDevice, detailedMonitoring, userData

The selector-term maps may be left unset (null) in values; they are
defaulted to an empty dict below so that field access does not abort
rendering with a nil-pointer error.
*/ -}}
{{- if .Values.ec2NodeClass.enabled }}
{{- $subnetTerms := .Values.ec2NodeClass.subnetSelectorTerms | default dict }}
{{- $sgTerms := .Values.ec2NodeClass.securityGroupSelectorTerms | default dict }}
apiVersion: karpenter.k8s.aws/v1beta1
kind: EC2NodeClass
metadata:
  name: {{ .Values.name }}
spec:
  {{- if .Values.ec2NodeClass.amiFamily }}
  amiFamily: {{ .Values.ec2NodeClass.amiFamily }}
  {{- else if .Values.ec2NodeClass.amiSelectorTerms }}
  amiSelectorTerms:
    {{- toYaml .Values.ec2NodeClass.amiSelectorTerms | nindent 4 }}
  {{- end }}
  subnetSelectorTerms:
    {{- if $subnetTerms.tags }}
    - tags:
        {{- range $key, $value := $subnetTerms.tags }}
        {{ $key }}: {{ $value | quote }}
        {{- end }}
    {{- end }}
    {{- if $subnetTerms.id }}
    - id: {{ $subnetTerms.id }}
    {{- end }}
  securityGroupSelectorTerms:
    {{- if $sgTerms.name }}
    - name: {{ $sgTerms.name }}
    {{- end }}
    {{- if $sgTerms.id }}
    - id: {{ $sgTerms.id }}
    {{- end }}
    {{- if $sgTerms.tags }}
    - tags:
        {{- range $key, $value := $sgTerms.tags }}
        {{ $key }}: {{ $value | quote }}
        {{- end }}
    {{- end }}
  role: {{ .Values.ec2NodeClass.karpenterRole }}
  tags:
    Name: karpenter-{{ .Values.name }}
  metadataOptions:
    httpEndpoint: {{ .Values.ec2NodeClass.metadataOptions.httpEndpoint }}
    httpProtocolIPv6: {{ .Values.ec2NodeClass.metadataOptions.httpProtocolIPv6 }}
    httpPutResponseHopLimit: {{ .Values.ec2NodeClass.metadataOptions.httpPutResponseHopLimit }}
    httpTokens: {{ .Values.ec2NodeClass.metadataOptions.httpTokens }}
  blockDeviceMappings:
    - deviceName: {{ default "/dev/xvda" .Values.ec2NodeClass.blockDevice.deviceName }}
      ebs:
        volumeSize: {{ .Values.ec2NodeClass.blockDevice.volumeSize }}
        volumeType: {{ .Values.ec2NodeClass.blockDevice.volumeType }}
        encrypted: {{ .Values.ec2NodeClass.blockDevice.encrypted }}
        deleteOnTermination: {{ .Values.ec2NodeClass.blockDevice.deleteOnTermination }}
  detailedMonitoring: {{ .Values.ec2NodeClass.detailedMonitoring }}
  {{- if .Values.ec2NodeClass.userData }}
  userData: |
    {{- .Values.ec2NodeClass.userData | nindent 4 }}
  {{- end }}
{{- end }}
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
{{- /*
Renders a Karpenter NodePool named after .Values.name. Its nodeClassRef
points at a node class with the same name.

Inputs (all under .Values.nodePool unless noted):
  enabled       - render this resource only when true
  labels        - list of single-key maps, merged into the node labels
  taints        - optional list, emitted verbatim
  requirements  - optional list, emitted verbatim
  disruption    - consolidationPolicy, consolidateAfter, expireAfter
  limits        - cpu, memory
  weight

consolidateAfter is skipped when consolidationPolicy is WhenUnderutilized
(unless it is "Never"), because Karpenter v1beta1 rejects that combination
and the chart default for consolidateAfter would otherwise always be merged
into the release values.
*/ -}}
{{- if .Values.nodePool.enabled }}
{{- $disruption := .Values.nodePool.disruption }}
apiVersion: karpenter.sh/v1beta1
kind: NodePool
metadata:
  name: {{ .Values.name }}
spec:
  template:
    metadata:
      labels:
        NodePool: {{ .Values.name }}
        NodeGroupType: {{ .Values.name }}
        {{- with .Values.nodePool.labels }}
        {{- range . }}
        {{- toYaml . | nindent 8 }}
        {{- end }}
        {{- end }}
    spec:
      nodeClassRef:
        name: {{ .Values.name }}
      {{- with .Values.nodePool.taints }}
      taints:
        {{- toYaml . | nindent 8 }}
      {{- end }}
      {{- with .Values.nodePool.requirements }}
      requirements:
        {{- toYaml . | nindent 8 }}
      {{- end }}
  disruption:
    consolidationPolicy: {{ $disruption.consolidationPolicy }}
    {{- if and $disruption.consolidateAfter (or (ne (toString $disruption.consolidationPolicy) "WhenUnderutilized") (eq (toString $disruption.consolidateAfter) "Never")) }}
    consolidateAfter: {{ $disruption.consolidateAfter }}
    {{- end }}
    expireAfter: {{ $disruption.expireAfter }}
  limits:
    cpu: {{ .Values.nodePool.limits.cpu }}
    memory: {{ .Values.nodePool.limits.memory }}
  weight: {{ .Values.nodePool.weight }}
{{- end }}
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
# Default values for the karpenter-resources chart.
# Every key can be overridden per release (for example from Terraform).

# Shared values
# name is used as the metadata.name of both the EC2NodeClass and the NodePool.
name: default
clusterName: test-cluster

# EC2NodeClass specific values
ec2NodeClass:
  enabled: true
  amiFamily: AL2
  # Only rendered when amiFamily is empty.
  amiSelectorTerms:
  # Select subnets by `tags` (map) or `id`.
  # See https://karpenter.sh/docs/concepts/nodeclasses/
  # Defaults to an empty map (not null) so the template can read its keys safely.
  subnetSelectorTerms: {}
  # Select security groups by `tags` (map), `name`, or `id`.
  # See https://karpenter.sh/docs/concepts/nodeclasses/
  # Defaults to an empty map (not null) so the template can read its keys safely.
  securityGroupSelectorTerms: {}
  # IAM role name used by the nodes; must be supplied per release.
  karpenterRole:
  metadataOptions:
    httpEndpoint: enabled
    httpProtocolIPv6: disabled
    httpPutResponseHopLimit: 2
    httpTokens: required
  blockDevice:
    deviceName: /dev/xvda
    volumeSize: 100Gi
    volumeType: gp3
    encrypted: true
    deleteOnTermination: true
  detailedMonitoring: true
  # Optional user data; rendered as a block scalar when set.
  userData:

# NodePool specific values
nodePool:
  enabled: true
  # List of single-key maps, e.g. `- provisioner: default`.
  labels:
  taints:
  requirements:
  disruption:
    consolidationPolicy: WhenEmpty
    consolidateAfter: 30s
    expireAfter: 720h
  limits:
    cpu: "1000"
    memory: 1000Gi
  weight: 10

This file was deleted.

This file was deleted.

Loading

0 comments on commit e0abaad

Please sign in to comment.