Skip to content

Commit

Permalink
add internal docker registry for mirroring and eks images (#32)
Browse files Browse the repository at this point in the history
  • Loading branch information
jeanschmidt authored Feb 16, 2024
1 parent 6fb03a1 commit bd3e67c
Show file tree
Hide file tree
Showing 15 changed files with 468 additions and 48 deletions.
1 change: 1 addition & 0 deletions modules/arc/.gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -2,3 +2,4 @@
.k8s-rds-state
.terraform.lock.hcl
.terraform/
tls/
93 changes: 85 additions & 8 deletions modules/arc/Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -10,13 +10,20 @@ ifneq ($(strip $(MAXRUNNERS)),)
ADDITIONAL_VALUES := $(ADDITIONAL_VALUES) maxRunners=$(MAXRUNNERS)
endif

# Per-cluster settings read from CLUSTER_CONFIG_FILE (JSON keyed by cluster name).
# NOTE: these jq calls run without -r, so each value keeps its JSON double
# quotes; that is harmless where the variable is expanded unquoted in a recipe
# (the shell performs quote removal on the expanded text).
# BUGFIX: the original assigned KARPENTERNODEROLEARN twice with the identical
# jq query; the duplicate has been removed.
DOCKERREGISTRYBUCKET := $(shell jq '.["$(EKS_CLUSTER_NAME)"]["docker_registry_bucket"]' <"${CLUSTER_CONFIG_FILE}")
DOCKERREGISTRYINTERNALSECRETARN := $(shell jq '.["$(EKS_CLUSTER_NAME)"]["internal_registry_secret_arn"]' <"${CLUSTER_CONFIG_FILE}")
KARPENTERCONTROLERROLEARN := $(shell jq '.["$(EKS_CLUSTER_NAME)"]["karpenter_controler_role_arn"]' <"${CLUSTER_CONFIG_FILE}")
KARPENTERNODEROLE := $(shell jq '.["$(EKS_CLUSTER_NAME)"]["karpenter_node_role_name"]' <"${CLUSTER_CONFIG_FILE}")
KARPENTERNODEROLEARN := $(shell jq '.["$(EKS_CLUSTER_NAME)"]["karpenter_node_role_arn"]' <"${CLUSTER_CONFIG_FILE}")
KARPENTERSGIDS := $(shell jq -c '[.["$(EKS_CLUSTER_NAME)"]["security_group_ids"][] | {"id": .}]' <"${CLUSTER_CONFIG_FILE}")
KARPENTERSUBNETIDS := $(shell jq -c '[.["$(EKS_CLUSTER_NAME)"]["subnet_ids"][] | {"id": .}]' <"${CLUSTER_CONFIG_FILE}")

# Kubernetes namespaces and names shared by the install targets below.
# Simple (:=) assignment: no late binding is needed for any of these.
RUNNERS_NAMESPACE := actions-runners
RUNNERS_SYSTEM_NAMESPACE := actions-runner-system
KARPENTER_NAMESPACE := karpenter
DOCKER_REGISTRY_NAMESPACE := docker-registry
DOCKER_REGISTRY_INTERNAL_NAME := pytorch-internal
DOCKER_REGISTRY_TLS_SECRET_NAME := $(DOCKER_REGISTRY_INTERNAL_NAME)-tls

.PHONY: clean-k8s-rds-state
clean-k8s-rds-state:
Expand Down Expand Up @@ -53,6 +60,8 @@ update-kubectl: do-update-kubectl add-eksctl-identity-mappings

# Register the Helm chart repositories used by the install targets:
#   docker-registry-mirror - charts for the docker.io / ghcr.io pull-through mirrors
#   twuni                  - upstream docker-registry chart for the internal registry
# then refresh the local chart index.
.PHONY: helm-repo-update
helm-repo-update: update-kubectl
	helm repo add docker-registry-mirror https://t83714.github.io/docker-registry-mirror
	helm repo add twuni https://helm.twun.io
	helm repo update

.PHONY: create-runner-namespace
Expand All @@ -68,14 +77,77 @@ create-gha-arc-secret: create-runner-namespace
--from-literal=github_app_private_key="$$$(GHA_PRIVATE_KEY_VAR)" \
--dry-run=client -o yaml | kubectl apply -f -

# Create (once) a self-signed TLS certificate for the internal registry's
# in-cluster DNS name and store it as a Kubernetes TLS secret. An existing
# secret is left untouched so the cert stays stable across re-runs; delete the
# secret to force a rotation.
# BUGFIX: `tls/` is gitignored and was never created, so openssl failed on a
# fresh checkout — added `mkdir -p tls`. The target namespace is also created
# idempotently first, since this target runs before any helm
# `--create-namespace` install on a brand-new cluster.
.PHONY: create-docker-registry-tls-secret
create-docker-registry-tls-secret:
	echo "Checking if secret $(DOCKER_REGISTRY_TLS_SECRET_NAME) exists, if not creating it"
	kubectl create namespace $(DOCKER_REGISTRY_NAMESPACE) --dry-run=client -o yaml | kubectl apply -f -
	if kubectl get secret --namespace=$(DOCKER_REGISTRY_NAMESPACE) $(DOCKER_REGISTRY_TLS_SECRET_NAME) ; then \
		echo "Secret $(DOCKER_REGISTRY_TLS_SECRET_NAME) found, not changing it" ; \
	else \
		echo "Secret $(DOCKER_REGISTRY_TLS_SECRET_NAME) not found, creating it" ; \
		mkdir -p tls ; \
		openssl req \
			-new -newkey rsa:4096 -x509 -sha256 \
			-days 3650 -nodes \
			-out tls/tls.crt \
			-keyout tls/tls.key \
			-addext "subjectAltName = DNS:$(DOCKER_REGISTRY_INTERNAL_NAME).$(DOCKER_REGISTRY_NAMESPACE).svc.cluster.local" \
			-subj "/CN=$(DOCKER_REGISTRY_INTERNAL_NAME).$(DOCKER_REGISTRY_NAMESPACE).svc.cluster.local" ; \
		kubectl create secret tls $(DOCKER_REGISTRY_TLS_SECRET_NAME) \
			--namespace=$(DOCKER_REGISTRY_NAMESPACE) \
			--cert=tls/tls.crt \
			--key=tls/tls.key \
			--dry-run=client -o yaml | kubectl apply -f - ; \
	fi

# Install the three in-cluster registries (idempotent `helm upgrade --install`,
# waiting for rollout):
#   docker-registry-mirror-docker-io - pull-through cache for docker.io
#   docker-registry-mirror-ghcr-io   - pull-through cache for ghcr.io, authed
#                                      with a GitHub App token minted on the fly
#   $(DOCKER_REGISTRY_INTERNAL_NAME) - S3-backed internal registry behind TLS
#                                      (cert from create-docker-registry-tls-secret)
#                                      and htpasswd auth
# The mirrors get fixed ClusterIPs (172.20.56.113/.114) so node bootstrap
# config can address them by IP; internal-registry values live in
# k8s/internal-docker-registry.yaml.
# BUGFIX: the backtick jq calls now use -r — command-substitution output does
# not undergo shell quote removal, so without -r the S3 access/secret key
# values handed to helm contained literal JSON double-quote characters.
# ($(DOCKERREGISTRYBUCKET) is fine as-is: Make expands it before the shell
# parses the line, so its JSON quotes are removed by the shell.)
.PHONY: install-docker-registry
install-docker-registry: helm-repo-update create-docker-registry-tls-secret
	[ "$(REGION)" != "" ] || (echo "REGION not set"; exit 1)
	[ "$(CLUSTER_CONFIG_FILE)" != "" ] || (echo "CLUSTER_CONFIG_FILE not set"; exit 1)
	[ "$(EKS_CLUSTER_NAME)" != "" ] || (echo "EKS_CLUSTER_NAME not set"; exit 1)
	[ "$$DOCKER_REGISTRY_HTPASSWD" != "" ] || (echo "DOCKER_REGISTRY_HTPASSWD not set"; exit 1)
	helm upgrade --install docker-registry-mirror-docker-io docker-registry-mirror/docker-registry-mirror \
		--namespace=$(DOCKER_REGISTRY_NAMESPACE) \
		--create-namespace \
		--wait \
		--set service.type=ClusterIP \
		--set service.port=5000 \
		--set service.clusterIP=172.20.56.113 \
		--set replicaCount=3 \
		--set tolerations[0].key=$(ARC_SYS_TAINT),tolerations[0].operator="Exists",tolerations[0].effect=NoSchedule
	helm upgrade --install docker-registry-mirror-ghcr-io docker-registry-mirror/docker-registry-mirror \
		--namespace=$(DOCKER_REGISTRY_NAMESPACE) \
		--create-namespace \
		--wait \
		--set proxy.remoteurl=https://ghcr.io \
		--set proxy.username=pytorch \
		--set proxy.password=`../../venv/bin/python ../../scripts/gh_app_get_github_token.py -i $(GHA_ID) -l $(GHA_INST_ID) -k "$$$(GHA_PRIVATE_KEY_VAR)"` \
		--set service.type=ClusterIP \
		--set service.port=5000 \
		--set service.clusterIP=172.20.56.114 \
		--set replicaCount=3 \
		--set tolerations[0].key=$(ARC_SYS_TAINT),tolerations[0].operator="Exists",tolerations[0].effect=NoSchedule
	helm upgrade --install $(DOCKER_REGISTRY_INTERNAL_NAME) twuni/docker-registry \
		--namespace=$(DOCKER_REGISTRY_NAMESPACE) \
		--create-namespace \
		--wait \
		--set tlsSecretName=$(DOCKER_REGISTRY_TLS_SECRET_NAME) \
		--set s3.region=$(REGION) \
		--set s3.bucket=$(DOCKERREGISTRYBUCKET) \
		--set secrets.s3.accessKey=`jq -r '.["$(EKS_CLUSTER_NAME)"]["docker_registry_user_access_key"]' <"${CLUSTER_CONFIG_FILE}"` \
		--set secrets.s3.secretKey=`jq -r '.["$(EKS_CLUSTER_NAME)"]["docker_registry_user_secret"]' <"${CLUSTER_CONFIG_FILE}"` \
		--set secrets.htpasswd="$$DOCKER_REGISTRY_HTPASSWD" \
		--set tolerations[0].key=$(ARC_SYS_TAINT),tolerations[0].operator="Exists",tolerations[0].effect=NoSchedule \
		--set fullnameOverride=$(DOCKER_REGISTRY_INTERNAL_NAME) \
		--values k8s/internal-docker-registry.yaml
	kubectl patch svc $(DOCKER_REGISTRY_INTERNAL_NAME) --namespace=$(DOCKER_REGISTRY_NAMESPACE) --patch-file k8s/pytorch-internal-svc-patch.yaml

.PHONY: install-arc
install-arc: helm-repo-update create-gha-arc-secret
[ "$(GHA_ID)" != "" ] || (echo "GHA_ID not set"; exit 1)
[ "$(GHA_INST_ID)" != "" ] || (echo "GHA_INST_ID not set"; exit 1)
[ "$(GHA_PRIVATE_KEY_VAR)" != "" ] || (echo "GHA_PRIVATE_KEY_VAR not set"; exit 1)
[ "$$$(GHA_PRIVATE_KEY_VAR)" != "" ] || (echo "$(GHA_PRIVATE_KEY_VAR) not set"; exit 1)
helm upgrade --install arc oci://ghcr.io/actions/actions-runner-controller-charts/gha-runner-scale-set-controller \
--namespace actions-runner-system \
--namespace $(RUNNERS_SYSTEM_NAMESPACE) \
--create-namespace \
--set=replicaCount=3 \
--set githubConfigSecret.create=true \
Expand All @@ -85,8 +157,8 @@ install-arc: helm-repo-update create-gha-arc-secret
--set tolerations[0].key=$(ARC_SYS_TAINT),tolerations[0].operator="Exists",tolerations[0].effect=NoSchedule \
--wait
# If the changes are not impacting the controller pod config, it won't restart, so we need to do it manually
for pod in `kubectl get pod --namespace=actions-runner-system | grep 'arc-gha-rs-controller-' | cut -f 1 -d ' '` ; do \
kubectl delete pod $$pod --namespace=actions-runner-system ; \
for pod in `kubectl get pod --namespace=$(RUNNERS_SYSTEM_NAMESPACE) | grep 'arc-gha-rs-controller-' | cut -f 1 -d ' '` ; do \
kubectl delete pod $$pod --namespace=$(RUNNERS_SYSTEM_NAMESPACE) ; \
done

.PHONY: install-karpenter
Expand All @@ -95,7 +167,7 @@ install-karpenter: helm-repo-update
[ "$(KARPENTERCONTROLERROLEARN)" != "" ] || (echo "KARPENTERCONTROLERROLEARN not set"; exit 1)
[ "$(KARPENTERNODEROLEARN)" != "" ] || (echo "KARPENTERNODEROLEARN not set"; exit 1)
helm upgrade --install karpenter oci://public.ecr.aws/karpenter/karpenter \
--namespace karpenter \
--namespace $(KARPENTER_NAMESPACE) \
--create-namespace \
--version v0.32.1 \
--set serviceAccount.annotations."eks\.amazonaws\.com/role-arn"=$(KARPENTERCONTROLERROLEARN) \
Expand All @@ -108,7 +180,7 @@ install-karpenter: helm-repo-update
fi

.PHONY: setup-karpenter-autoscaler
setup-karpenter-autoscaler: install-karpenter
setup-karpenter-autoscaler: install-karpenter install-docker-registry
[ "$(CLUSTER_CONFIG_FILE)" != "" ] || (echo "CLUSTER_CONFIG_FILE not set"; exit 1)
[ "$(EKS_CLUSTER_NAME)" != "" ] || (echo "EKS_CLUSTER_NAME not set"; exit 1)
[ "$(EKS_ENVIRONMENT)" != "" ] || (echo "EKS_ENVIRONMENT not set"; exit 1)
Expand All @@ -119,7 +191,7 @@ setup-karpenter-autoscaler: install-karpenter
[ "$(RUNNERSCOPE)" != "" ] || (echo "RUNNERSCOPE not set"; exit 1)
../../venv/bin/python3 ../../scripts/kubectl_apply_runner_templates.py \
--template-name k8s/nodeclass.yaml \
--namespace karpenter \
--namespace $(KARPENTER_NAMESPACE) \
--arc-runner-config-files $(ARC_CFG_FILE_FOLDER)/ARC_NODE_CONFIG.yaml \
--rds-state-file $(K8S_RDS_STATE_FILE) \
--runner-scope $(RUNNERSCOPE) \
Expand All @@ -131,11 +203,15 @@ setup-karpenter-autoscaler: install-karpenter
karpentersubnetids='$(KARPENTERSUBNETIDS)' \
project=gh-ci-$(EKS_ENVIRONMENT)-arc \
projecttag=$(PROJECTTAG) \
dockerregistrymirror=`kubectl get svc --namespace=docker-registry docker-registry-mirror-docker-io -o json | jq ".spec.clusterIP" | sed 's/"//g'` \
githubregistrymirror=`kubectl get svc --namespace=docker-registry docker-registry-mirror-ghcr-io -o json | jq ".spec.clusterIP" | sed 's/"//g'` \
pytorchregistrymirror=`kubectl get svc --namespace=docker-registry $(DOCKER_REGISTRY_INTERNAL_NAME) -o json | jq ".spec.clusterIP" | sed 's/"//g'` \
DOCKERREGISTRYINTERNALSECRETARN=$(DOCKERREGISTRYINTERNALSECRETARN) \
--root-classes nodeConfig \
--label-property nodeType
../../venv/bin/python3 ../../scripts/kubectl_apply_runner_templates.py \
--template-name k8s/nodepool.yaml \
--namespace karpenter \
--namespace $(KARPENTER_NAMESPACE) \
--arc-runner-config-files $(ARC_CFG_FILE_FOLDER)/ARC_NODE_CONFIG.yaml \
--rds-state-file $(K8S_RDS_STATE_FILE) \
--runner-scope $(RUNNERSCOPE) \
Expand All @@ -147,6 +223,7 @@ setup-karpenter-autoscaler: install-karpenter
karpentersubnetids='$(KARPENTERSUBNETIDS)' \
project=gh-ci-$(EKS_ENVIRONMENT)-arc \
projecttag=$(PROJECTTAG) \
DOCKERREGISTRYINTERNALSECRETARN=$(DOCKERREGISTRYINTERNALSECRETARN) \
--root-classes nodeConfig \
--label-property nodeType

Expand Down
30 changes: 30 additions & 0 deletions modules/arc/iam_policies.tf
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
# Dedicated IAM user whose static access key is handed to the in-cluster
# internal docker registry so its S3 storage backend can authenticate.
resource "aws_iam_user" "internal_docker_registry_usr" {
  name = "internal_docker_registry-${var.environment}-${var.aws_vpc_suffix}"

  tags = {
    Project     = "runners-eks"
    Environment = var.environment
    Context     = local.cluster_name
  }
}

# Static credentials for the registry user; the key id and secret are exposed
# via outputs and injected into the helm release by the Makefile.
resource "aws_iam_access_key" "internal_docker_registry_usr_key" {
  user = aws_iam_user.internal_docker_registry_usr.name
}

# Policy allowing the registry user to act on its S3 bucket (the bucket itself
# plus all objects under it).
# NOTE(review): actions = ["*"] grants ALL AWS actions on these ARNs, not just
# s3:*; consider narrowing to the specific s3 actions the registry needs —
# confirm before tightening.
data "aws_iam_policy_document" "internal_docker_registry_usr_pol_doc" {
  statement {
    effect  = "Allow"
    actions = ["*"]
    resources = [
      aws_s3_bucket.internal_docker_registry.arn,
      "${aws_s3_bucket.internal_docker_registry.arn}/*"
    ]
  }
}

# Attach the S3 access policy above as an inline policy on the registry user.
resource "aws_iam_user_policy" "internal_docker_registry_usr_pol_attach" {
  name   = "internal_docker_registry_policy-${var.environment}-${var.aws_vpc_suffix}"
  user   = aws_iam_user.internal_docker_registry_usr.name
  policy = data.aws_iam_policy_document.internal_docker_registry_usr_pol_doc.json
}
39 changes: 39 additions & 0 deletions modules/arc/k8s/internal-docker-registry.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
# Helm values for the internal S3-backed registry (twuni/docker-registry chart),
# applied by the Makefile's install-docker-registry target. Bucket, region and
# credentials are passed separately via --set.
replicaCount: 2

service:
  type: ClusterIP
  port: 5000
  # Fixed ClusterIP: node bootstrap adds a matching /etc/hosts entry for
  # pytorch-internal.docker-registry.svc.cluster.local (see k8s/nodeclass.yaml).
  clusterIP: 172.20.56.115

# Store image layers in S3.
storage: s3

garbageCollect:
  enabled: true
  deleteUntagged: true

metrics:
  enabled: true

# Raw registry configuration in docker/distribution format.
configData:
  version: 0.1
  log:
    fields:
      service: registry
  storage:
    cache:
      blobdescriptor: inmemory
  http:
    addr: :5000
    host: https://pytorch-internal.docker-registry.svc.cluster.local:5000
    headers:
      X-Content-Type-Options: [nosniff]
    debug:
      addr: :5001
      prometheus:
        enabled: true
        path: /metrics
  health:
    storagedriver:
      enabled: true
      interval: 10s
      threshold: 3
88 changes: 87 additions & 1 deletion modules/arc/k8s/nodeclass.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,92 @@ kind: EC2NodeClass
metadata:
name: nodeclass-$(NODETYPE)
spec:
userData: |
#!/bin/bash
mv /etc/eks/bootstrap.sh /etc/eks/bootstrap.sh.bak
cat <<BEOF >/etc/eks/bootstrap.sh
#!/bin/bash
/etc/eks/bootstrap.sh.bak "\$@"
yum install docker -y
DOCKER_SECRET=\$(aws secretsmanager get-secret-value --secret-id $(DOCKERREGISTRYINTERNALSECRETARN) | jq ".SecretString" | sed 's/"//g')
DOCKER_USER=\$(echo \$DOCKER_SECRET | cut -d',' -f1)
DOCKER_PASS=\$(echo \$DOCKER_SECRET | cut -d',' -f2)
mkdir -p /etc/containerd/
cat <<EOF >>/etc/containerd/config.toml
[plugins."io.containerd.grpc.v1.cri".registry.configs."$(PYTORCHREGISTRYMIRROR):5000".auth]
username = "\$DOCKER_USER"
password = "\$DOCKER_PASS"
EOF
mkdir -p /etc/containerd/certs.d/docker.io/
cat <<EOF >>/etc/containerd/certs.d/docker.io/hosts.toml
server = "https://docker.io"
[host."http://$(DOCKERREGISTRYMIRROR):5000"]
capabilities = ["pull", "resolve"]
skip_verify = true
[host."https://docker.io"]
capabilities = ["push"]
EOF
mkdir -p /etc/containerd/certs.d/gcr.io/
cat <<EOF >>/etc/containerd/certs.d/gcr.io/hosts.toml
server = "https://gcr.io"
[host."http://$(GITHUBREGISTRYMIRROR):5000"]
capabilities = ["pull", "resolve"]
skip_verify = true
[host."https://gcr.io"]
capabilities = ["push"]
EOF
mkdir -p /etc/containerd/certs.d/pytorch-internal.docker-registry.svc.cluster.local
cat <<EOF >>/etc/containerd/certs.d/pytorch-internal.docker-registry.svc.cluster.local/hosts.toml
server = "https://pytorch-internal.docker-registry.svc.cluster.local"
[host."https://$(PYTORCHREGISTRYMIRROR):5000"]
capabilities = ["pull", "resolve", "push"]
skip_verify = true
EOF
mkdir -p /etc/docker
cat <<EOF >>/etc/docker/daemon.json
{
"insecure-registries": [
"$(DOCKERREGISTRYMIRROR)",
"$(DOCKERREGISTRYMIRROR):443",
"$(DOCKERREGISTRYMIRROR):5000",
"$(GITHUBREGISTRYMIRROR)",
"$(GITHUBREGISTRYMIRROR):443",
"$(GITHUBREGISTRYMIRROR):5000",
"$(PYTORCHREGISTRYMIRROR)",
"$(PYTORCHREGISTRYMIRROR):443",
"$(PYTORCHREGISTRYMIRROR):5000",
"pytorch-internal.docker-registry.svc.cluster.local",
"pytorch-internal.docker-registry.svc.cluster.local:443",
"pytorch-internal.docker-registry.svc.cluster.local:5000"
]
}
EOF
echo "172.20.56.115 pytorch-internal.docker-registry.svc.cluster.local" >> /etc/hosts
service containerd restart
systemctl restart docker
docker login -u "\$DOCKER_USER" -p "\$DOCKER_PASS" pytorch-internal.docker-registry.svc.cluster.local:5000
docker login -u "\$DOCKER_USER" -p "\$DOCKER_PASS" pytorch-internal.docker-registry.svc.cluster.local
BEOF
chmod +x /etc/eks/bootstrap.sh
amiFamily: AL2
subnetSelectorTerms: $(KARPENTERSUBNETIDS)
securityGroupSelectorTerms: $(KARPENTERSGIDS)
Expand All @@ -11,6 +97,6 @@ spec:
blockDeviceMappings:
- deviceName: /dev/xvda
ebs:
volumeSize: 1Tb
volumeSize: 1Ti
volumeType: gp3
deleteOnTermination: true
5 changes: 5 additions & 0 deletions modules/arc/k8s/pytorch-internal-svc-patch.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
# kubectl patch for the pytorch-internal Service (applied by the Makefile's
# install-docker-registry target): additionally exposes the registry's
# container port 5000 on service port 443, under the name "https-443".
spec:
  ports:
    - name: https-443
      port: 443
      targetPort: 5000
4 changes: 4 additions & 0 deletions modules/arc/main.tf
Original file line number Diff line number Diff line change
Expand Up @@ -9,5 +9,9 @@ terraform {
source = "hashicorp/aws"
version = ">= 5.5"
}
external = {
source = "hashicorp/external"
version = ">= 2.3"
}
}
}
28 changes: 28 additions & 0 deletions modules/arc/outputs.tf
Original file line number Diff line number Diff line change
Expand Up @@ -29,3 +29,31 @@ output "subnet_ids" {
output "security_group_ids" {
value = [module.eks.node_security_group_id]
}

# S3 bucket backing the internal docker registry.
output "docker_registry_bucket" {
  value = aws_s3_bucket.internal_docker_registry.bucket
}

output "docker_registry_bucket_arn" {
  value = aws_s3_bucket.internal_docker_registry.arn
}

# IAM user and static access key consumed by the internal registry's helm
# deployment (see modules/arc/Makefile, install-docker-registry).
output "docker_registry_user" {
  value = aws_iam_user.internal_docker_registry_usr.name
}

output "docker_registry_user_arn" {
  value = aws_iam_user.internal_docker_registry_usr.arn
}

output "docker_registry_user_access_key" {
  value = aws_iam_access_key.internal_docker_registry_usr_key.id
}

# Secret half of the registry user's access key. Marked sensitive because the
# AWS provider flags aws_iam_access_key.secret as sensitive: Terraform >= 0.15
# refuses to plan an unmarked output that refers to a sensitive value, and
# marking it also keeps the secret out of CLI output.
output "docker_registry_user_secret" {
  value     = aws_iam_access_key.internal_docker_registry_usr_key.secret
  sensitive = true
}

# ARN of the Secrets Manager secret holding the internal registry credentials;
# node userData fetches it with `aws secretsmanager get-secret-value` at
# bootstrap (see k8s/nodeclass.yaml).
output "internal_registry_secret_arn" {
  value = resource.aws_secretsmanager_secret.pytorch_internal_docker_registry_auth.arn
}
Loading

0 comments on commit bd3e67c

Please sign in to comment.