From 9a67d3e9bcf070e7337b1b1e274edaed853672c8 Mon Sep 17 00:00:00 2001 From: Vara Bonthu Date: Thu, 1 Feb 2024 12:24:36 -0800 Subject: [PATCH] Updates to trainium blueprint --- .../karpenter-resources/Chart.yaml | 5 -- .../helm-values/karpenter-resources/README.md | 71 ------------------- .../templates/node-class.yaml | 56 --------------- .../templates/node-pool.yaml | 36 ---------- .../karpenter-resources/values.yaml | 40 ----------- .../data-analytics/spark-operator-yunikorn.md | 2 +- 6 files changed, 1 insertion(+), 209 deletions(-) delete mode 100644 ai-ml/trainium-inferentia/helm-values/karpenter-resources/Chart.yaml delete mode 100644 ai-ml/trainium-inferentia/helm-values/karpenter-resources/README.md delete mode 100644 ai-ml/trainium-inferentia/helm-values/karpenter-resources/templates/node-class.yaml delete mode 100644 ai-ml/trainium-inferentia/helm-values/karpenter-resources/templates/node-pool.yaml delete mode 100644 ai-ml/trainium-inferentia/helm-values/karpenter-resources/values.yaml diff --git a/ai-ml/trainium-inferentia/helm-values/karpenter-resources/Chart.yaml b/ai-ml/trainium-inferentia/helm-values/karpenter-resources/Chart.yaml deleted file mode 100644 index 0c3b8474a..000000000 --- a/ai-ml/trainium-inferentia/helm-values/karpenter-resources/Chart.yaml +++ /dev/null @@ -1,5 +0,0 @@ -apiVersion: v2 -name: karpenter-resources -description: Helm chart for configuring custom resources for Karpenter on the cluster -version: 0.0.1 -appVersion: 0.0.1 diff --git a/ai-ml/trainium-inferentia/helm-values/karpenter-resources/README.md b/ai-ml/trainium-inferentia/helm-values/karpenter-resources/README.md deleted file mode 100644 index e95f582c1..000000000 --- a/ai-ml/trainium-inferentia/helm-values/karpenter-resources/README.md +++ /dev/null @@ -1,71 +0,0 @@ -# Karpenter Resources Helm Chart - -## Overview - -This Helm chart is an abstraction layer designed for deploying various configurations of Karpenter nodes in a Kubernetes cluster managed by EKS. It integrates seamlessly with Terraform, allowing users to define different node pools and settings for their Kubernetes cluster. - -## Prerequisites - -- Helm 3.x or later installed -- Terraform installed -- Access to an AWS EKS cluster - -## Configuration - -The chart is configured to be used with Terraform. Here is an example of how you might define Helm releases for different Karpenter configurations in your Terraform files using EKS Blueprints add-ons: - -```hcl -module "eks_blueprints_addons" { - # ... other configurations ... - helm_releases = { - karpenter-resources-default = { - name = "default" - description = "A Helm chart for default node pool" - chart = "${path.module}/helm-values/karpenter-resources" - values = [ - <<-EOT - clusterName: ${module.eks.cluster_name} - ec2NodeClass: - karpenterRole: ${split("/", module.eks_blueprints_addons.karpenter.node_iam_role_arn)[1]} - nodePool: - labels: - - provisioner: default - - workload: rayhead - EOT - ] - } - } -} -``` - -## Testing the Chart with Helm Template - -To review the Kubernetes manifests that will be generated by the Helm chart based on your configuration, you can use the `helm template` command. This is especially useful for validating your Terraform configurations before applying them. - -1. **Generate the Manifests** - - Navigate to the directory where your Helm chart is located. - - ```sh - cd path/to/helm-chart - ``` - -2. **Run Helm Template** - - Use the `helm template` command with your custom values. For example: - - ```sh - helm template my-release-name . --values values.yaml - ``` - - Replace `my-release-name` with a name for your release, and `values.yaml` with the path to your custom values file. - - To test specific configurations defined in your Terraform file, you can create a temporary values file with the configuration snippet from your Terraform definition: - - ```sh - echo '' > temp-values.yaml - helm template my-release-name . --values temp-values.yaml - rm temp-values.yaml - ``` - - This will output the Kubernetes manifests to your terminal, allowing you to review them. diff --git a/ai-ml/trainium-inferentia/helm-values/karpenter-resources/templates/node-class.yaml b/ai-ml/trainium-inferentia/helm-values/karpenter-resources/templates/node-class.yaml deleted file mode 100644 index 604e7f8e8..000000000 --- a/ai-ml/trainium-inferentia/helm-values/karpenter-resources/templates/node-class.yaml +++ /dev/null @@ -1,56 +0,0 @@ -{{- if .Values.ec2NodeClass.enabled }} -apiVersion: karpenter.k8s.aws/v1beta1 -kind: EC2NodeClass -metadata: - name: {{ .Values.name }} -spec: - {{- if .Values.ec2NodeClass.amiFamily }} - amiFamily: {{ .Values.ec2NodeClass.amiFamily }} - {{- else if .Values.ec2NodeClass.amiSelectorTerms }} - amiSelectorTerms: - {{- toYaml .Values.ec2NodeClass.amiSelectorTerms | nindent 4 }} - {{- end }} - subnetSelectorTerms: - {{- if .Values.ec2NodeClass.subnetSelectorTerms.tags }} - - tags: - {{- range $key, $value := .Values.ec2NodeClass.subnetSelectorTerms.tags }} - {{ $key }}: {{ $value | quote }} - {{- end }} - {{- end }} - {{- if .Values.ec2NodeClass.subnetSelectorTerms.id }} - - id: {{ .Values.ec2NodeClass.subnetSelectorTerms.id }} - {{- end }} - securityGroupSelectorTerms: - {{- if .Values.ec2NodeClass.securityGroupSelectorTerms.name }} - - name: {{ .Values.ec2NodeClass.securityGroupSelectorTerms.name }} - {{- end }} - {{- if .Values.ec2NodeClass.securityGroupSelectorTerms.id }} - - id: {{ .Values.ec2NodeClass.securityGroupSelectorTerms.id }} - {{- end }} - {{- if .Values.ec2NodeClass.securityGroupSelectorTerms.tags }} - - tags: - {{- range $key, $value := .Values.ec2NodeClass.securityGroupSelectorTerms.tags }} - {{ $key }}: {{ $value | quote }} - {{- end }} - {{- end }} - role: {{ .Values.ec2NodeClass.karpenterRole }} - tags: - Name: karpenter-{{ .Values.name }} - metadataOptions: - httpEndpoint: {{ .Values.ec2NodeClass.metadataOptions.httpEndpoint }} - httpProtocolIPv6: {{ .Values.ec2NodeClass.metadataOptions.httpProtocolIPv6 }} - httpPutResponseHopLimit: {{ .Values.ec2NodeClass.metadataOptions.httpPutResponseHopLimit }} - httpTokens: {{ .Values.ec2NodeClass.metadataOptions.httpTokens }} - blockDeviceMappings: - - deviceName: {{ default "/dev/xvda" .Values.ec2NodeClass.blockDevice.deviceName }} - ebs: - volumeSize: {{ .Values.ec2NodeClass.blockDevice.volumeSize }} - volumeType: {{ .Values.ec2NodeClass.blockDevice.volumeType }} - encrypted: {{ .Values.ec2NodeClass.blockDevice.encrypted }} - deleteOnTermination: {{ .Values.ec2NodeClass.blockDevice.deleteOnTermination }} - detailedMonitoring: {{ .Values.ec2NodeClass.detailedMonitoring }} - {{- if .Values.ec2NodeClass.userData }} - userData: | - {{- .Values.ec2NodeClass.userData | nindent 4 }} - {{- end }} -{{- end }} diff --git a/ai-ml/trainium-inferentia/helm-values/karpenter-resources/templates/node-pool.yaml b/ai-ml/trainium-inferentia/helm-values/karpenter-resources/templates/node-pool.yaml deleted file mode 100644 index 0ac17988f..000000000 --- a/ai-ml/trainium-inferentia/helm-values/karpenter-resources/templates/node-pool.yaml +++ /dev/null @@ -1,36 +0,0 @@ -{{- if .Values.nodePool.enabled }} -apiVersion: karpenter.sh/v1beta1 -kind: NodePool -metadata: - name: {{ .Values.name }} -spec: - template: - metadata: - labels: - NodePool: {{ .Values.name }} - NodeGroupType: {{ .Values.name }} - {{- with .Values.nodePool.labels }} - {{- range . }} - {{- toYaml . | nindent 8 }} - {{- end }} - {{- end }} - spec: - nodeClassRef: - name: {{ .Values.name }} - {{- with .Values.nodePool.taints }} - taints: - {{- toYaml . | nindent 8 }} - {{- end }} - {{- with .Values.nodePool.requirements }} - requirements: - {{- toYaml . | nindent 8 }} - {{- end }} - disruption: - consolidationPolicy: {{ .Values.nodePool.disruption.consolidationPolicy }} - consolidateAfter: {{ .Values.nodePool.disruption.consolidateAfter }} - expireAfter: {{ .Values.nodePool.disruption.expireAfter }} - limits: - cpu: {{ .Values.nodePool.limits.cpu }} - memory: {{ .Values.nodePool.limits.memory }} - weight: {{ .Values.nodePool.weight }} -{{- end }} diff --git a/ai-ml/trainium-inferentia/helm-values/karpenter-resources/values.yaml b/ai-ml/trainium-inferentia/helm-values/karpenter-resources/values.yaml deleted file mode 100644 index 456dce270..000000000 --- a/ai-ml/trainium-inferentia/helm-values/karpenter-resources/values.yaml +++ /dev/null @@ -1,40 +0,0 @@ -# Shared values -name: default -clusterName: test-cluster - -# EC2NodeClass specific values -ec2NodeClass: - enabled: true - amiFamily: AL2 - amiSelectorTerms: - subnetSelectorTerms: # tag or id see documentation, https://karpenter.sh/docs/concepts/nodeclasses/ - securityGroupSelectorTerms: # tag, name, id see documentation, https://karpenter.sh/docs/concepts/nodeclasses/ - karpenterRole: - metadataOptions: - httpEndpoint: enabled - httpProtocolIPv6: disabled - httpPutResponseHopLimit: 2 - httpTokens: required - blockDevice: - deviceName: /dev/xvda - volumeSize: 100Gi - volumeType: gp3 - encrypted: true - deleteOnTermination: true - detailedMonitoring: true - userData: - -# NodePool specific values -nodePool: - enabled: true - labels: - taints: - requirements: - disruption: - consolidationPolicy: WhenEmpty - consolidateAfter: 30s - expireAfter: 720h - limits: - cpu: "1000" - memory: 1000Gi - weight: 10 diff --git a/website/docs/blueprints/data-analytics/spark-operator-yunikorn.md b/website/docs/blueprints/data-analytics/spark-operator-yunikorn.md index 59ef10363..c00bd8b41 100644 --- a/website/docs/blueprints/data-analytics/spark-operator-yunikorn.md +++ b/website/docs/blueprints/data-analytics/spark-operator-yunikorn.md @@ -448,7 +448,7 @@ Step2: Execute Benchmark test Karpenter Nodepool weights with Graviton and Intel}> -### Using Karpenter Nodepool weights for runing Spark Jobs on both AWS Graviton and Intel EC2 Instances +### Using Karpenter Nodepool weights for running Spark Jobs on both AWS Graviton and Intel EC2 Instances Customers often seek to leverage AWS Graviton instances for running Spark jobs due to their cost savings and performance improvements over traditional Intel instances. However, a common challenge is the availability of Graviton instances in specific regions or availability zones, especially during times of high demand. To address this, a fallback strategy to equivalent Intel instances is desirable.