diff --git a/community/modules/compute/gke-topology-scheduler/README.md b/community/modules/compute/gke-topology-scheduler/README.md index 8d5b42913e..5aaa4fca98 100644 --- a/community/modules/compute/gke-topology-scheduler/README.md +++ b/community/modules/compute/gke-topology-scheduler/README.md @@ -45,7 +45,8 @@ No resources. | Name | Description | Type | Default | Required | |------|-------------|------|---------|:--------:| -| [gke\_cluster\_exists](#input\_gke\_cluster\_exists) | A static flag that signals to modules that a cluster has been created. | `bool` | `false` | no | +| [cluster\_id](#input\_cluster\_id) | An identifier for the GKE cluster in the format `projects/{{project}}/locations/{{location}}/clusters/{{cluster}}` | `string` | n/a | yes | +| [project\_id](#input\_project\_id) | The project ID that hosts the GKE cluster. | `string` | n/a | yes | ## Outputs diff --git a/community/modules/compute/gke-topology-scheduler/main.tf b/community/modules/compute/gke-topology-scheduler/main.tf index 3a79befcf3..677595632b 100644 --- a/community/modules/compute/gke-topology-scheduler/main.tf +++ b/community/modules/compute/gke-topology-scheduler/main.tf @@ -15,7 +15,8 @@ module "kubectl_apply" { source = "../../../../modules/management/kubectl-apply" - gke_cluster_exists = var.gke_cluster_exists + cluster_id = var.cluster_id + project_id = var.project_id apply_manifests = [ { source = "${path.module}/manifests/topology-scheduler-scripts.yaml" }, diff --git a/community/modules/compute/gke-topology-scheduler/variables.tf b/community/modules/compute/gke-topology-scheduler/variables.tf index 2fcbb93d58..0766091223 100644 --- a/community/modules/compute/gke-topology-scheduler/variables.tf +++ b/community/modules/compute/gke-topology-scheduler/variables.tf @@ -12,8 +12,12 @@ # See the License for the specific language governing permissions and # limitations under the License. -variable "gke_cluster_exists" { - description = "A static flag that signals to modules that a cluster has been created."
- type = bool - default = false +variable "project_id" { + description = "The project ID that hosts the GKE cluster." + type = string +} + +variable "cluster_id" { + description = "An identifier for the GKE cluster in the format `projects/{{project}}/locations/{{location}}/clusters/{{cluster}}`" + type = string } diff --git a/modules/compute/gke-node-pool/README.md b/modules/compute/gke-node-pool/README.md index ac8de13eb7..a1fcaa8f01 100644 --- a/modules/compute/gke-node-pool/README.md +++ b/modules/compute/gke-node-pool/README.md @@ -323,7 +323,6 @@ limitations under the License. | [disk\_type](#input\_disk\_type) | Disk type for each node. | `string` | `null` | no | | [enable\_gcfs](#input\_enable\_gcfs) | Enable the Google Container Filesystem (GCFS). See [restrictions](https://registry.terraform.io/providers/hashicorp/google/latest/docs/resources/container_cluster#gcfs_config). | `bool` | `false` | no | | [enable\_secure\_boot](#input\_enable\_secure\_boot) | Enable secure boot for the nodes. Keep enabled unless custom kernel modules need to be loaded. See [here](https://cloud.google.com/compute/shielded-vm/docs/shielded-vm#secure-boot) for more info. | `bool` | `true` | no | -| [gke\_cluster\_exists](#input\_gke\_cluster\_exists) | A static flag that signals to modules that a cluster has been created. | `bool` | `false` | no | | [gke\_version](#input\_gke\_version) | GKE version | `string` | n/a | yes | | [guest\_accelerator](#input\_guest\_accelerator) | List of the type and count of accelerator cards attached to the instance. |
list(object({| `[]` | no | | [host\_maintenance\_interval](#input\_host\_maintenance\_interval) | Specifies the frequency of planned maintenance events. | `string` | `""` | no | diff --git a/modules/compute/gke-node-pool/main.tf b/modules/compute/gke-node-pool/main.tf index 44a2b6f3a0..bcce7b04a0 100644 --- a/modules/compute/gke-node-pool/main.tf +++ b/modules/compute/gke-node-pool/main.tf @@ -375,7 +375,8 @@ resource "null_resource" "enable_tcpxo_in_workload" { module "kubectl_apply" { source = "../../management/kubectl-apply" - gke_cluster_exists = var.gke_cluster_exists + cluster_id = var.cluster_id + project_id = var.project_id apply_manifests = flatten([ for manifest in local.gpu_direct_setting.gpu_direct_manifests : [ diff --git a/modules/compute/gke-node-pool/variables.tf b/modules/compute/gke-node-pool/variables.tf index b15fc3f3ef..d3b403b564 100644 --- a/modules/compute/gke-node-pool/variables.tf +++ b/modules/compute/gke-node-pool/variables.tf @@ -24,12 +24,6 @@ variable "cluster_id" { type = string } -variable "gke_cluster_exists" { - description = "A static flag that signals to modules that a cluster has been created." - type = bool - default = false -} - variable "zones" { description = "A list of zones to be used. Zones must be in region of cluster. If null, cluster zones will be inherited. Note `zones` not `zone`; does not work with `zone` deployment variable." type = list(string) diff --git a/modules/file-system/gke-persistent-volume/README.md b/modules/file-system/gke-persistent-volume/README.md index b5967763c9..f4d94d8c3b 100644 --- a/modules/file-system/gke-persistent-volume/README.md +++ b/modules/file-system/gke-persistent-volume/README.md @@ -121,6 +121,7 @@ limitations under the License. 
| Name | Version | |------|---------| | [terraform](#requirement\_terraform) | >= 1.0 | +| [google](#requirement\_google) | >= 4.42 | | [kubectl](#requirement\_kubectl) | >= 1.7.0 | | [local](#requirement\_local) | >= 2.0.0 | @@ -128,6 +129,7 @@ limitations under the License. | Name | Version | |------|---------| +| [google](#provider\_google) | >= 4.42 | | [kubectl](#provider\_kubectl) | >= 1.7.0 | | [local](#provider\_local) | >= 2.0.0 | @@ -142,15 +144,17 @@ No modules. | [kubectl_manifest.pv](https://registry.terraform.io/providers/gavinbunney/kubectl/latest/docs/resources/manifest) | resource | | [kubectl_manifest.pvc](https://registry.terraform.io/providers/gavinbunney/kubectl/latest/docs/resources/manifest) | resource | | [local_file.debug_file](https://registry.terraform.io/providers/hashicorp/local/latest/docs/resources/file) | resource | +| [google_client_config.default](https://registry.terraform.io/providers/hashicorp/google/latest/docs/data-sources/client_config) | data source | +| [google_container_cluster.gke_cluster](https://registry.terraform.io/providers/hashicorp/google/latest/docs/data-sources/container_cluster) | data source | ## Inputs | Name | Description | Type | Default | Required | |------|-------------|------|---------|:--------:| | [capacity\_gb](#input\_capacity\_gb) | The storage capacity with which to create the persistent volume. | `number` | n/a | yes | +| [cluster\_id](#input\_cluster\_id) | An identifier for the GKE cluster in the format `projects/{{project}}/locations/{{location}}/clusters/{{cluster}}` | `string` | n/a | yes | | [filestore\_id](#input\_filestore\_id) | An identifier for a filestore with the format `projects/{{project}}/locations/{{location}}/instances/{{name}}`. | `string` | `null` | no | | [gcs\_bucket\_name](#input\_gcs\_bucket\_name) | The gcs bucket to be used with the persistent volume. 
| `string` | `null` | no | -| [gke\_cluster\_exists](#input\_gke\_cluster\_exists) | A static flag that signals to modules that a cluster has been created. | `bool` | `false` | no | | [labels](#input\_labels) | GCE resource labels to be applied to resources. Key-value pairs. | `map(string)` | n/a | yes | | [network\_storage](#input\_network\_storage) | Network attached storage mount to be configured. |
type = optional(string)
count = optional(number, 0)
gpu_driver_installation_config = optional(object({
gpu_driver_version = string
}), { gpu_driver_version = "DEFAULT" })
gpu_partition_size = optional(string)
gpu_sharing_config = optional(object({
gpu_sharing_strategy = string
max_shared_clients_per_gpu = number
}))
}))
object({| n/a | yes | diff --git a/modules/file-system/gke-persistent-volume/main.tf b/modules/file-system/gke-persistent-volume/main.tf index d12c5d6d39..5b52bcc950 100644 --- a/modules/file-system/gke-persistent-volume/main.tf +++ b/modules/file-system/gke-persistent-volume/main.tf @@ -77,6 +77,10 @@ locals { capacity = "${var.capacity_gb}Gi" } ) + + cluster_project = split("/", var.cluster_id)[1] + cluster_name = split("/", var.cluster_id)[5] + cluster_location = split("/", var.cluster_id)[3] } resource "local_file" "debug_file" { @@ -87,8 +91,23 @@ resource "local_file" "debug_file" { filename = "${path.root}/pv-pvc-debug-file-${local.filestore_name}.yaml" } +data "google_container_cluster" "gke_cluster" { + # Look the cluster up in the project encoded in cluster_id rather than the provider's default project. + project = local.cluster_project + name = local.cluster_name + location = local.cluster_location +} + +data "google_client_config" "default" {} + +provider "kubectl" { + host = "https://${data.google_container_cluster.gke_cluster.endpoint}" + cluster_ca_certificate = base64decode(data.google_container_cluster.gke_cluster.master_auth[0].cluster_ca_certificate) + token = data.google_client_config.default.access_token + load_config_file = false +} + resource "kubectl_manifest" "pv" { - count = var.gke_cluster_exists ? 1 : 0 yaml_body = local.is_gcs ? local.gcs_pv_contents : local.filestore_pv_contents lifecycle { diff --git a/modules/file-system/gke-persistent-volume/variables.tf b/modules/file-system/gke-persistent-volume/variables.tf index 96e3f31949..a72fa3857f 100644 --- a/modules/file-system/gke-persistent-volume/variables.tf +++ b/modules/file-system/gke-persistent-volume/variables.tf @@ -14,10 +14,9 @@ * limitations under the License. */ -variable "gke_cluster_exists" { - description = "A static flag that signals to modules that a cluster has been created."
- type = bool - default = false +variable "cluster_id" { + description = "An identifier for the GKE cluster in the format `projects/{{project}}/locations/{{location}}/clusters/{{cluster}}`" + type = string } variable "network_storage" { diff --git a/modules/file-system/gke-persistent-volume/versions.tf b/modules/file-system/gke-persistent-volume/versions.tf index 3fd6cf3c3a..2378b1d62b 100644 --- a/modules/file-system/gke-persistent-volume/versions.tf +++ b/modules/file-system/gke-persistent-volume/versions.tf @@ -15,6 +15,10 @@ terraform { required_version = ">= 1.0" required_providers { + google = { + source = "hashicorp/google" + version = ">= 4.42" + } kubectl = { source = "gavinbunney/kubectl" version = ">= 1.7.0" diff --git a/modules/file-system/gke-storage/README.md b/modules/file-system/gke-storage/README.md index 9d7a2fb428..ca628aaaeb 100644 --- a/modules/file-system/gke-storage/README.md +++ b/modules/file-system/gke-storage/README.md @@ -109,10 +109,11 @@ No resources. |------|-------------|------|---------|:--------:| | [access\_mode](#input\_access\_mode) | The access mode that the volume can be mounted to the host/pod. More details in [Access Modes](https://kubernetes.io/docs/concepts/storage/persistent-volumes/#access-modes)
server_ip = string,
remote_mount = string,
local_mount = string,
fs_type = string,
mount_options = string,
client_install_runner = map(string)
mount_runner = map(string)
})
list(object({| `[]` | no | -| [gke\_cluster\_exists](#input\_gke\_cluster\_exists) | A static flag that signals to modules that a cluster has been created. | `bool` | `false` | no | +| [cluster\_id](#input\_cluster\_id) | An identifier for the gke cluster resource with format projects/
content = optional(string, null)
source = optional(string, null)
template_vars = optional(map(any), null)
server_side_apply = optional(bool, false)
wait_for_rollout = optional(bool, true)
}))
object({| `{}` | no | | [kueue](#input\_kueue) | Install and configure [Kueue](https://kueue.sigs.k8s.io/docs/overview/) workload scheduler. A configuration yaml/template file can be provided with config\_path to be applied right after kueue installation. If a template file provided, its variables can be set to config\_template\_vars. |
install = optional(bool, false)
version = optional(string, "v0.5.2")
})
object({| `{}` | no | +| [project\_id](#input\_project\_id) | The project ID that hosts the gke cluster. | `string` | n/a | yes | ## Outputs diff --git a/modules/management/kubectl-apply/main.tf b/modules/management/kubectl-apply/main.tf index cc1abd05f0..5663e01580 100644 --- a/modules/management/kubectl-apply/main.tf +++ b/modules/management/kubectl-apply/main.tf @@ -15,6 +15,11 @@ */ locals { + cluster_id_parts = split("/", var.cluster_id) + cluster_name = local.cluster_id_parts[5] + cluster_location = local.cluster_id_parts[3] + project_id = var.project_id != null ? var.project_id : local.cluster_id_parts[1] + apply_manifests_map = tomap({ for index, manifest in var.apply_manifests : index => manifest }) @@ -25,8 +30,16 @@ locals { jobset_install_source = format("${path.module}/manifests/jobset-%s.yaml", try(var.jobset.version, "")) } +data "google_container_cluster" "gke_cluster" { + project = local.project_id + name = local.cluster_name + location = local.cluster_location +} + +data "google_client_config" "default" {} + module "kubectl_apply_manifests" { - for_each = var.gke_cluster_exists ? local.apply_manifests_map : {} + for_each = local.apply_manifests_map source = "./kubectl" content = each.value.content @@ -36,34 +49,34 @@ module "kubectl_apply_manifests" { wait_for_rollout = each.value.wait_for_rollout providers = { - http = http.h + kubectl = kubectl + http = http.h } } module "install_kueue" { - count = var.gke_cluster_exists ? 1 : 0 source = "./kubectl" source_path = local.install_kueue ? local.kueue_install_source : null server_side_apply = true providers = { - http = http.h + kubectl = kubectl + http = http.h } } module "install_jobset" { - count = var.gke_cluster_exists ? 1 : 0 source = "./kubectl" source_path = local.install_jobset ? local.jobset_install_source : null server_side_apply = true providers = { - http = http.h + kubectl = kubectl + http = http.h } } module "configure_kueue" { - count = var.gke_cluster_exists ? 
1 : 0 source = "./kubectl" source_path = local.install_kueue ? try(var.kueue.config_path, "") : null template_vars = local.install_kueue ? try(var.kueue.config_template_vars, null) : null @@ -73,6 +86,7 @@ module "configure_kueue" { wait_for_rollout = true providers = { - http = http.h + kubectl = kubectl + http = http.h } } diff --git a/modules/management/kubectl-apply/providers.tf b/modules/management/kubectl-apply/providers.tf index d5577975f3..74d157b93b 100644 --- a/modules/management/kubectl-apply/providers.tf +++ b/modules/management/kubectl-apply/providers.tf @@ -14,6 +14,14 @@ * limitations under the License. */ +provider "kubectl" { + host = "https://${data.google_container_cluster.gke_cluster.endpoint}" + token = data.google_client_config.default.access_token + cluster_ca_certificate = base64decode(data.google_container_cluster.gke_cluster.master_auth[0].cluster_ca_certificate) + load_config_file = false + apply_retry_count = 15 # Terraform may apply resources in parallel, leading to potential dependency issues. This retry mechanism ensures that if a resource's dependencies aren't ready, Terraform will attempt to apply it again. +} + provider "http" { alias = "h" } diff --git a/modules/management/kubectl-apply/variables.tf b/modules/management/kubectl-apply/variables.tf index 0b2f469d50..e1bd91aa97 100644 --- a/modules/management/kubectl-apply/variables.tf +++ b/modules/management/kubectl-apply/variables.tf @@ -37,10 +37,15 @@ resource "terraform_data" "jobset_validations" { } } -variable "gke_cluster_exists" { - description = "A static flag that signals to modules that a cluster has been created." - type = bool - default = false +variable "project_id" { + description = "The project ID that hosts the gke cluster." + type = string +} + +variable "cluster_id" { + description = "An identifier for the gke cluster resource with format projects/
install = optional(bool, false)
version = optional(string, "v0.8.1")
config_path = optional(string, null)
config_template_vars = optional(map(any), null)
})