diff --git a/.github/workflows/on-pr.yml b/.github/workflows/on-pr.yml index 046a1d9..a027e8a 100644 --- a/.github/workflows/on-pr.yml +++ b/.github/workflows/on-pr.yml @@ -1,10 +1,15 @@ name: On PR + on: pull_request: {} push: branches: - main +permissions: + id-token: write + contents: read + jobs: tflint-plan: name: tflint + terraform plan diff --git a/.github/workflows/release-do-open-pr.yml b/.github/workflows/release-do-open-pr.yml index c298924..16dfe04 100644 --- a/.github/workflows/release-do-open-pr.yml +++ b/.github/workflows/release-do-open-pr.yml @@ -25,7 +25,6 @@ jobs: run: pip install virtualenv - name: Open PR - working-directory: runners shell: bash run: make open-rel-pr env: @@ -33,3 +32,4 @@ jobs: GITHUB_APP_ID: ${{ secrets.RELEASE_APP_ID }} GITHUB_APP_INSTALLATION_ID: ${{ secrets.RELEASE_APP_INSTALLATION_ID }} GITHUB_REPOSITORY: ${{ github.repository }} + FAST_RELEASE_FIREFIGHT: ${{ github.event.inputs.fast_release_firefight }} diff --git a/.github/workflows/release-on-comment-pr.yml b/.github/workflows/release-on-comment-pr.yml index 1a24982..155124d 100644 --- a/.github/workflows/release-on-comment-pr.yml +++ b/.github/workflows/release-on-comment-pr.yml @@ -23,7 +23,6 @@ jobs: - name: React to PR comment run: | make COMMENTS="PROCEED_TO_VANGUARD,ABORT_DEPLOYMENT_SHUTDOWN_VANGUARD,PROCEED_TO_PRODUCTION,CLEANUP_DEPLOYMENT" LABELS="deploy-to-vanguard,abort-vanguard,deploy-to-prod,cleanup-deployment" CHECK_REMOVE_LABELS="deploy-to-canary,,deploy-to-vanguard,deploy-to-prod" CHECK_COMMENTS="Successfully deployed to canary##Successfully deployed to vanguard#Successfully deployed to production" react-pr-comment - working-directory: runners shell: bash env: GHA_PRIVATE_KEY_DEPLOY : ${{ secrets.RELEASE_APP_PRIVATE_KEY }} diff --git a/.github/workflows/release-on-release-label.yml b/.github/workflows/release-on-release-label.yml index 163f7e1..866c0f5 100644 --- a/.github/workflows/release-on-release-label.yml +++ 
b/.github/workflows/release-on-release-label.yml @@ -57,16 +57,15 @@ jobs: aws-region: us-east-1 - name: Notify job started - working-directory: runners shell: bash run: make COMMENT_TO_ADD="Starting to deploy to canary, wait for its conclusion and I'll guide you to next steps" add-comment-to-pr env: GHA_PRIVATE_KEY_DEPLOY : ${{ secrets.RELEASE_APP_PRIVATE_KEY }} + GITHUB_APP_ID: ${{ secrets.RELEASE_APP_ID }} GITHUB_APP_INSTALLATION_ID: ${{ secrets.RELEASE_APP_INSTALLATION_ID }} GITHUB_REPOSITORY: ${{ github.repository }} - name: Terraform Apply / ARC canary (apply-arc-canary arc-canary) - working-directory: runners shell: bash run: make apply-arc-canary arc-canary env: @@ -78,7 +77,6 @@ jobs: TERRAFORM_EXTRAS: -auto-approve -lock-timeout=15m - name: Notify job success - working-directory: runners shell: bash run: make COMMENT_TO_ADD="Successfully deployed to canary, add a comment with PROCEED_TO_VANGUARD in order to proceed deploying to vanguard, or close this PR in order to abort" add-comment-to-pr env: @@ -103,7 +101,6 @@ jobs: run: pip install virtualenv - name: Notify deploy-to-canary job Failure - working-directory: runners shell: bash run: make COMMENT_TO_ADD="Something went wrong when deploying to canary, either re-run the job or close this PR to abort the deployment process" add-comment-to-pr env: @@ -154,7 +151,6 @@ jobs: aws-region: us-east-1 - name: Notify job started - working-directory: runners shell: bash run: make COMMENT_TO_ADD="Starting to deploy to vanguard, wait for its conclusion and I'll guide you to next steps" add-comment-to-pr env: @@ -164,7 +160,6 @@ jobs: GITHUB_REPOSITORY: ${{ github.repository }} - name: Double-check comment added - working-directory: runners shell: bash run: make WAIT_COMMENT="PROCEED_TO_VANGUARD" wait-check-user-comment env: @@ -174,7 +169,6 @@ jobs: GITHUB_REPOSITORY: ${{ github.repository }} - name: Double-check bot comment added - working-directory: runners shell: bash run: make WAIT_COMMENT="Successfully deployed 
to canary" wait-check-bot-comment env: @@ -184,7 +178,6 @@ jobs: GITHUB_REPOSITORY: ${{ github.repository }} - name: Check PR approval - working-directory: runners shell: bash run: make wait-check-pr-approved env: @@ -194,7 +187,6 @@ jobs: GITHUB_REPOSITORY: ${{ github.repository }} - name: Terraform Apply / ARC vanguard (apply-arc-vanguard arc-vanguard) - working-directory: runners shell: bash run: make apply-arc-vanguard arc-vanguard env: @@ -203,12 +195,9 @@ jobs: GITHUB_TOKEN: ${{ secrets.LIST_PYTORCH_RUNNERS_GITHUB_TOKEN }} KUBECONFIG: ${{ runner.temp }}/kubeconfig NO_EKSCTL: 'true' - ROCKSET_API_KEY: ${{ secrets.ROCKSET_API_KEY }} - ROCKSET_API_SERVER: ${{ secrets.ROCKSET_API_SERVER }} TERRAFORM_EXTRAS: -auto-approve -lock-timeout=15m - name: Notify job success - working-directory: runners shell: bash run: make COMMENT_TO_ADD="Successfully deployed to vanguard. In order to proceed find someone to approve this PR and then add a comment with PROCEED_TO_PRODUCTION in order to proceed deploying to production environment or ABORT_DEPLOYMENT_SHUTDOWN_VANGUARD in order to stop vanguard and close this PR" add-comment-to-pr env: @@ -233,7 +222,6 @@ jobs: run: pip install virtualenv - name: Notify deploy-to-vanguard job Failure - working-directory: runners shell: bash run: make COMMENT_TO_ADD="Something went wrong when deploying to vanguard, either re-run the job or comment ABORT_DEPLOYMENT_SHUTDOWN_VANGUARD to revert vanguard to old state, abort the deployment proces and close the PR" add-comment-to-pr env: @@ -284,7 +272,6 @@ jobs: aws-region: us-east-1 - name: Notify job started - working-directory: runners shell: bash run: make COMMENT_TO_ADD="Starting to deploy to prod, wait for its conclusion and I'll guide you to next steps" add-comment-to-pr env: @@ -294,7 +281,6 @@ jobs: GITHUB_REPOSITORY: ${{ github.repository }} - name: Double-check comment added - working-directory: runners shell: bash run: make WAIT_COMMENT="PROCEED_TO_PRODUCTION" wait-check-user-comment 
env: @@ -304,7 +290,6 @@ jobs: GITHUB_REPOSITORY: ${{ github.repository }} - name: Double-check bot comment added - working-directory: runners shell: bash run: make WAIT_COMMENT="Successfully deployed to vanguard" wait-check-bot-comment env: @@ -314,21 +299,17 @@ jobs: GITHUB_REPOSITORY: ${{ github.repository }} - name: Terraform Apply / ARC prod (arc-prod) - working-directory: runners shell: bash - run: make apply-arc-prod arc-prod + run: make apply arc-prod env: GHA_PRIVATE_KEY_CANARY: ${{ secrets.GHA_PRIVATE_KEY_CANARY }} GHA_PRIVATE_KEY: ${{ secrets.GHA_PRIVATE_KEY }} GITHUB_TOKEN: ${{ secrets.LIST_PYTORCH_RUNNERS_GITHUB_TOKEN }} KUBECONFIG: ${{ runner.temp }}/kubeconfig NO_EKSCTL: 'true' - ROCKSET_API_KEY: ${{ secrets.ROCKSET_API_KEY }} - ROCKSET_API_SERVER: ${{ secrets.ROCKSET_API_SERVER }} TERRAFORM_EXTRAS: -auto-approve -lock-timeout=15m - name: Notify job success - working-directory: runners shell: bash run: make COMMENT_TO_ADD="Successfully deployed to production, add a comment with CLEANUP_DEPLOYMENT in order to merge this PR and stop vanguard" add-comment-to-pr env: @@ -353,7 +334,6 @@ jobs: run: pip install virtualenv - name: Notify deploy-to-prod job Failure - working-directory: runners shell: bash run: make COMMENT_TO_ADD="Something went wrong when deploying to production, re-run the job given job. 
If it does not work, manual action is required" add-comment-to-pr env: @@ -406,7 +386,6 @@ jobs: aws-region: us-east-1 - name: Notify job started - working-directory: runners shell: bash run: make COMMENT_TO_ADD="Starting to revert vanguard to old state and shut it down, wait for its conclusion and I'll guide you to next steps" add-comment-to-pr env: @@ -416,7 +395,6 @@ jobs: GITHUB_REPOSITORY: ${{ github.repository }} - name: Double-check comment added - working-directory: runners shell: bash run: make WAIT_COMMENT="ABORT_DEPLOYMENT_SHUTDOWN_VANGUARD" wait-check-user-comment env: @@ -426,7 +404,6 @@ jobs: GITHUB_REPOSITORY: ${{ github.repository }} - name: Terraform Apply / ARC vanguard OFF (apply-arc-vanguard arc-vanguard-off) - working-directory: runners shell: bash run: make apply-arc-vanguard arc-vanguard-off env: @@ -435,12 +412,9 @@ jobs: GITHUB_TOKEN: ${{ secrets.LIST_PYTORCH_RUNNERS_GITHUB_TOKEN }} KUBECONFIG: ${{ runner.temp }}/kubeconfig NO_EKSCTL: 'true' - ROCKSET_API_KEY: ${{ secrets.ROCKSET_API_KEY }} - ROCKSET_API_SERVER: ${{ secrets.ROCKSET_API_SERVER }} TERRAFORM_EXTRAS: -auto-approve -lock-timeout=15m - name: Notify job success - working-directory: runners shell: bash run: make COMMENT_TO_ADD="Successfully reverted vanguard to current prod_live branch state and disabled it" add-comment-to-pr env: @@ -449,7 +423,6 @@ jobs: GITHUB_REPOSITORY: ${{ github.repository }} - name: Close PR - working-directory: runners shell: bash run: make close-pr env: @@ -474,7 +447,6 @@ jobs: run: pip install virtualenv - name: Notify abort-vanguard job Failure - working-directory: runners shell: bash run: make COMMENT_TO_ADD="Something went wrong when restoring vanguard state, THIS IS A MAJOR ISSUE, firefight starts **NOW**" add-comment-to-pr env: @@ -525,7 +497,6 @@ jobs: aws-region: us-east-1 - name: Notify job started - working-directory: runners shell: bash run: make COMMENT_TO_ADD="Finishing deployment, by shutting down vanguard and merging this PR" 
add-comment-to-pr env: @@ -535,7 +506,6 @@ jobs: GITHUB_REPOSITORY: ${{ github.repository }} - name: Double-check comment added - working-directory: runners shell: bash run: make WAIT_COMMENT="CLEANUP_DEPLOYMENT" wait-check-user-comment env: @@ -545,7 +515,6 @@ jobs: GITHUB_REPOSITORY: ${{ github.repository }} - name: Double-check bot comment added - working-directory: runners shell: bash run: make WAIT_COMMENT="Successfully deployed to production" wait-check-bot-comment env: @@ -555,7 +524,6 @@ jobs: GITHUB_REPOSITORY: ${{ github.repository }} - name: Terraform Apply / ARC vanguard OFF (arc-vanguard-off) - working-directory: runners shell: bash run: make arc-vanguard-off env: @@ -564,12 +532,9 @@ jobs: GITHUB_TOKEN: ${{ secrets.LIST_PYTORCH_RUNNERS_GITHUB_TOKEN }} KUBECONFIG: ${{ runner.temp }}/kubeconfig NO_EKSCTL: 'true' - ROCKSET_API_KEY: ${{ secrets.ROCKSET_API_KEY }} - ROCKSET_API_SERVER: ${{ secrets.ROCKSET_API_SERVER }} TERRAFORM_EXTRAS: -auto-approve -lock-timeout=15m - name: Notify job success - working-directory: runners shell: bash run: make COMMENT_TO_ADD="Successfully stopped vanguard" add-comment-to-pr env: @@ -579,7 +544,6 @@ jobs: GITHUB_REPOSITORY: ${{ github.repository }} - name: Merge PR - working-directory: runners shell: bash run: make merge-pr env: @@ -604,7 +568,6 @@ jobs: run: pip install virtualenv - name: Notify cleanup-deployment job Failure - working-directory: runners shell: bash run: make COMMENT_TO_ADD="Something went wrong when stopping vanguard and merging the PR, pleae take manual actions from now on to stabelize the status of the system" add-comment-to-pr env: diff --git a/Makefile b/Makefile index 7825ce3..ad6b968 100644 --- a/Makefile +++ b/Makefile @@ -34,3 +34,163 @@ plan: done ; \ popd ; \ done + +.PHONY: apply +apply: + cd aws ; for account in ./*/ ; do \ + pushd $$account ; \ + for region in ./*/ ; do \ + pushd $$region ; \ + echo "==== make apply: aws/$$account/$$region ============================================" ; \ + 
$(MAKE) apply || exit 1 ; \ + popd ; \ + done ; \ + popd ; \ + done + +.PHONY: apply-arc-canary +apply-arc-canary: + cd aws ; for account in ./*/ ; do \ + pushd $$account ; \ + for region in ./*/ ; do \ + pushd $$region ; \ + echo "==== make apply-arc-canary: aws/$$account/$$region ============================================" ; \ + $(MAKE) apply-arc-canary || exit 1 ; \ + popd ; \ + done ; \ + popd ; \ + done + +.PHONY: apply-arc-vanguard +apply-arc-vanguard: + cd aws ; for account in ./*/ ; do \ + pushd $$account ; \ + for region in ./*/ ; do \ + pushd $$region ; \ + echo "==== make apply-arc-vanguard: aws/$$account/$$region ============================================" ; \ + $(MAKE) apply-arc-vanguard || exit 1 ; \ + popd ; \ + done ; \ + popd ; \ + done + +.PHONY: apply-arc-prod +apply-arc-prod: + cd aws ; for account in ./*/ ; do \ + pushd $$account ; \ + for region in ./*/ ; do \ + pushd $$region ; \ + echo "==== make apply-arc-prod: aws/$$account/$$region ============================================" ; \ + $(MAKE) apply-arc-prod || exit 1 ; \ + popd ; \ + done ; \ + popd ; \ + done + +.PHONY: arc-canary +arc-canary: + cd aws ; for account in ./*/ ; do \ + pushd $$account ; \ + for region in ./*/ ; do \ + pushd $$region ; \ + echo "==== make arc-canary: aws/$$account/$$region ============================================" ; \ + $(MAKE) arc-canary || exit 1 ; \ + popd ; \ + done ; \ + popd ; \ + done + +.PHONY: arc-vanguard +arc-vanguard: + cd aws ; for account in ./*/ ; do \ + pushd $$account ; \ + for region in ./*/ ; do \ + pushd $$region ; \ + echo "==== make arc-vanguard: aws/$$account/$$region ============================================" ; \ + $(MAKE) arc-vanguard || exit 1 ; \ + popd ; \ + done ; \ + popd ; \ + done + +.PHONY: arc-vanguard-off +arc-vanguard-off: + cd aws ; for account in ./*/ ; do \ + pushd $$account ; \ + for region in ./*/ ; do \ + pushd $$region ; \ + echo "==== make arc-vanguard-off: aws/$$account/$$region 
============================================" ; \ + $(MAKE) arc-vanguard-off || exit 1 ; \ + popd ; \ + done ; \ + popd ; \ + done + +.PHONY: arc-prod +arc-prod: + cd aws ; for account in ./*/ ; do \ + pushd $$account ; \ + for region in ./*/ ; do \ + pushd $$region ; \ + echo "==== make arc-prod: aws/$$account/$$region ============================================" ; \ + $(MAKE) arc-prod || exit 1 ; \ + popd ; \ + done ; \ + popd ; \ + done + +venv/bin/pip: + virtualenv venv + venv/bin/pip install -r requirements.txt + +tf-modules/VERSIONS: venv/bin/pip Terrafile + venv/bin/python scripts/terrafile_lambdas.py -t Terrafile -m tf-modules + +.PHONY: terrafile +terrafile: tf-modules/VERSIONS + +# Deployment +.PHONY: open-rel-pr +open-rel-pr: venv/bin/pip + venv/bin/python ./scripts/deployment.py --debug open-rel-pr + +.PHONY: wait-check-deployment +wait-check-deployment: venv/bin/pip + [ "$(RELEASE_ACTION_NAME)" != "" ] || (echo "RELEASE_ACTION_NAME not set"; exit 1) + venv/bin/python ./scripts/deployment.py --debug wait-check-deployment --release-action-name "$(RELEASE_ACTION_NAME)" --comment-to-add "$(COMMENT_TO_ADD)" --ignore-if-label "$(IGNORE_IF_LABEL)" + +.PHONY: wait-check-user-comment +wait-check-user-comment: venv/bin/pip + [ "$(WAIT_COMMENT)" != "" ] || (echo "WAIT_COMMENT not set"; exit 1) + venv/bin/python ./scripts/deployment.py --debug wait-check-user-comment --comment "$(WAIT_COMMENT)" + +.PHONY: wait-check-bot-comment +wait-check-bot-comment: venv/bin/pip + [ "$(WAIT_COMMENT)" != "" ] || (echo "WAIT_COMMENT not set"; exit 1) + venv/bin/python ./scripts/deployment.py --debug wait-check-bot-comment --comment "$(WAIT_COMMENT)" + +.PHONY: react-pr-comment +react-pr-comment: venv/bin/pip + [ "$(COMMENTS)" != "" ] || (echo "COMMENTS not set"; exit 1) + [ "$(LABELS)" != "" ] || (echo "LABELS not set"; exit 1) + [ "$(CHECK_REMOVE_LABELS)" != "" ] || (echo "CHECK_REMOVE_LABELS not set"; exit 1) + [ "$(CHECK_COMMENTS)" != "" ] || (echo "CHECK_COMMENTS not set"; 
exit 1) + venv/bin/python ./scripts/deployment.py --debug react-pr-comment --comments "$(COMMENTS)" --labels "$(LABELS)" --check-remove-labels "$(CHECK_REMOVE_LABELS)" --check-comments "$(CHECK_COMMENTS)" + +.PHONY: add-comment-to-pr +add-comment-to-pr: venv/bin/pip + [ "$(COMMENT_TO_ADD)" != "" ] || (echo "COMMENT_TO_ADD not set"; exit 1) + venv/bin/python ./scripts/deployment.py --debug add-comment-to-pr --comment "$(COMMENT_TO_ADD)" + +.PHONY: wait-check-pr-approved +wait-check-pr-approved: venv/bin/pip + venv/bin/python ./scripts/deployment.py --debug wait-check-pr-approved + +.PHONY: close-pr +close-pr: venv/bin/pip + venv/bin/python ./scripts/deployment.py --debug close-pr + +.PHONY: merge-pr +merge-pr: venv/bin/pip + venv/bin/python ./scripts/deployment.py --debug merge-pr + diff --git a/aws/391835788720/us-east-1/iam_policies.tf b/aws/391835788720/us-east-1/iam_policies.tf index 22e6271..726414e 100644 --- a/aws/391835788720/us-east-1/iam_policies.tf +++ b/aws/391835788720/us-east-1/iam_policies.tf @@ -10,14 +10,14 @@ resource "aws_iam_role" "ossci_gha_terraform" { name = "ossci_gha_terraform" max_session_duration = 18000 - description = "used by pytorch-labs/pytorch-gha-infra workflows to deploy terraform configs" + description = "used by pytorch/ci-infra workflows to deploy terraform configs" assume_role_policy = jsonencode({ Version = "2012-10-17" Statement = [ { Effect = "Allow" Principal = { - Federated = "arn:aws:iam::${local.aws_region}:oidc-provider/token.actions.githubusercontent.com" + Federated = "arn:aws:iam::${local.aws_account_id}:oidc-provider/token.actions.githubusercontent.com" } Action = "sts:AssumeRoleWithWebIdentity" Condition = { diff --git a/modules/arc/Makefile b/modules/arc/Makefile index 01e7e5d..9a497a6 100644 --- a/modules/arc/Makefile +++ b/modules/arc/Makefile @@ -1,4 +1,4 @@ -SHELL := /bin/bash +SHELL := /bin/bash -o pipefail ARC_SYS_TAINT = "CriticalAddonsOnly" K8S_RDS_STATE_FILE = ".k8s-rds-state" diff --git 
a/scripts/deployment.py b/scripts/deployment.py new file mode 100644 index 0000000..ed4ef3a --- /dev/null +++ b/scripts/deployment.py @@ -0,0 +1,600 @@ +#!/usr/bin/env python3 + +import argparse +import datetime +import logging +import os +import re +import sys +import time + +from git import Repo +from github import Auth, Github, PaginatedList, CheckRun, Repository, PullRequest +from typing import List + + +RELEASE_BRANCH = 'prod_live' + +PROD_RELEASE_LABEL = 'prod-release' +CANARY_LABEL = 'deploy-to-canary' +VANGUARD_LABEL = 'deploy-to-vanguard' +FAST_RELEASE_FIREFIGHT_LABEL = 'fast-release-firefight' + +PROCEED_TO_VANGUARD_COMMENT = 'PROCEED_TO_VANGUARD' +PROCEED_TO_PROD_COMMENT = 'PROCEED_TO_PRODUCTION' +SHUTDOWN_VANGUARD_COMMENT = 'ABORT_DEPLOYMENT_SHUTDOWN_VANGUARD' +CLEANUP_DEPLOYMENT_COMMENT = 'CLEANUP_DEPLOYMENT' +ROLLBACK_PRODUCTION_COMMENT = 'TBD' + + +def nice_bool_option(s: str) -> bool: + if s.lower().strip() in ['true', '1', 't', 'y', 'yes']: + return True + elif s.lower().strip() in ['false', '0', 'f', 'n', 'no', '']: + return False + else: + raise ValueError(f'Invalid boolean option: {s}') + + +def gh_get_check_runs(repo: Repository.Repository, ref: str) -> PaginatedList.PaginatedList[CheckRun.CheckRun]: + ''' + :calls: `GET /repos/{owner}/{repo}/commits/{ref}/check-runs + ''' + return PaginatedList.PaginatedList( + CheckRun.CheckRun, + repo._requester, + f'{repo.url}/commits/{ref}/check-runs', + { + 'status': 'completed', + }, + list_item='check_runs', + ) + + +def get_gh_client(opts: argparse.Namespace) -> Github: + if opts.github_app_id and opts.github_app_key and opts.github_app_installation_id: + logging.debug(f'Using github app private key credentials') + auth = Auth.AppAuth(opts.github_app_id, opts.github_app_key).get_installation_auth(opts.github_app_installation_id) + gh = Github(auth=auth) + # this is to share the credentials with child processes (e.g. 
git) + os.environ['GITHUB_TOKEN'] = auth.token + os.environ['GIT_PASS'] = auth.token + os.environ['GIT_USER'] = opts.bot_name + return gh + + elif opts.github_token: + logging.debug('Using github token credentials') + auth = Auth.Token(opts.github_token) + return Github(auth=auth) + + else: + raise RuntimeError('No github token or app credentials provided') + + +def get_pr(repo: Repository.Repository, release_branch: str) -> PullRequest.PullRequest: + pulls = list(repo.get_pulls(state='open', base=release_branch)) + if len(pulls) > 1: + logging.error(f'Found more than one open pull request for {release_branch}: {", ".join(p.html_url for p in pulls)} by {", ".join(p.user.login for p in pulls)}') + logging.error(f'Please finish the current multiple deployments or close the pull requests before continuing') + raise RuntimeError(f'Found multiple open pull request for {release_branch}') + elif len(pulls) == 0: + logging.error(f'No open pull request for {release_branch}') + raise RuntimeError(f'No open pull request for {release_branch}') + + logging.debug(f'Found open pull request for {release_branch}: {pulls[0].html_url} by {pulls[0].user.login}') + return pulls[0] + + +def setup_local_git_auth(opts: argparse.Namespace, git_repo: Repo) -> None: + # This convoluted authentication is to avoid setting the credentials in the git config, or URL that could expose to leaks + git_repo.git.config('user.name', opts.bot_name) + git_repo.git.config('user.email', f'{opts.github_app_installation_id}+{opts.bot_name}[bot]@users.noreply.github.com') + + try: + git_repo.git.config('--unset-all', 'credential.helper') + except: + logging.debug('No credential helper set') + try: + git_repo.git.config('--unset-all', 'http.https://github.com/.extraheader') + except: + logging.debug('No extraheader for http.https://github.com/ set') + try: + git_repo.git.config('--unset-all', 'http.extraheader') + except: + logging.debug('No extraheader set') + + git_repo.git.config('credential.helper', '!f() 
{ echo "username=${GIT_USER}\npassword=${GIT_PASS}"; }; f') + + +def create_git_branch(opts: argparse.Namespace, git_repo: Repo, branch_name: str) -> None: + git_repo.create_head(branch_name) + git_repo.remotes.origin.push(branch_name) + logging.info(f'Created branch {branch_name} with ref {git_repo.head.commit.hexsha} and pushed to origin') + + +def create_git_tag(opts: argparse.Namespace, git_repo: Repo, tag_name: str) -> None: + git_repo.create_tag(tag_name) + git_repo.remotes.origin.push(tag_name) + logging.info(f'Created tag {tag_name} with ref {git_repo.head.commit.hexsha} and pushed to origin') + + +# Commands +def open_release_pr(gh: Github, opts: argparse.Namespace) -> None: + 'open-rel-pr' + repo = gh.get_repo(opts.repo) + + for pull in repo.get_pulls(state='open', base=opts.release_branch): + logging.error(f'Found open pull request for {opts.release_branch}: {pull.html_url} by {pull.user.login}') + logging.error(f'Please finish the current deployment or close the pull request before opening a new one') + raise RuntimeError(f'Found open pull request for {opts.release_branch}') + + usr = gh.get_user_by_id(opts.github_actor_id) + + right_now = datetime.datetime.now() + gh_righ_now = right_now.strftime('%Y%m%d%H%M%S') + human_right_now = right_now.strftime('%Y-%m-%dT%H:%M:%S') + + tag_name = f'arc-prod-release-{gh_righ_now}' + branch_name = f'prod-release/{gh_righ_now}' + + git_repo = Repo('.') + + setup_local_git_auth(opts, git_repo) + create_git_branch(opts, git_repo, branch_name) + create_git_tag(opts, git_repo, tag_name) + + body = f'''Prod release {human_right_now} opened by {usr.name} ({usr.login} {opts.github_actor_id}) + +Head commit: {git_repo.head.commit.hexsha} +Deployment tag: {tag_name} +Branch name: {branch_name} + +I am a bot ({opts.bot_name}), and this is an automated response. Please don't manipulate the +labels in this PR or I might get lost as I am not very smart. 
The labels are used to track the state of the deployment +and trigger the relevant jobs. As the steps proceed, I'll be adding comments to this PR to let you know what to do next. + +So lets keep our conversation in the comments here, and I'll take care of the rest. + +more details can be found in the relevant section for GHA runbook: https://fburl.com/pytorch-arc-deployment-docs''' + + pr = repo.create_pull( + title=f'Prod release {human_right_now} by {usr.name}', + body=body, + head=branch_name, + base=opts.release_branch + ) + + pr.add_to_assignees(usr) + logging.info(f'Created pull request for {opts.release_branch}: {pr.html_url}') + + labels: List[str] = [PROD_RELEASE_LABEL, ] + if opts.fast_release_firefight: + pr.create_issue_comment(PROCEED_TO_VANGUARD_COMMENT) + labels += [FAST_RELEASE_FIREFIGHT_LABEL, VANGUARD_LABEL, ] + else: + labels += [CANARY_LABEL, ] + + pr.add_to_labels(*labels) + logging.debug(f'Added labels to pull request: {", ".join(labels)}') + + pr.create_issue_comment(f'I triggered the canary release for you, please wait for it to finish. 
I\'ll let you know when it\'s done.') + + +def add_comment_to_pr(gh: Github, opts: argparse.Namespace) -> None: + 'add-comment-to-pr' + repo = gh.get_repo(opts.repo) + pull = get_pr(repo, opts.release_branch) + + pull.create_issue_comment(opts.comment) + logging.info(f'Added comment to pr: {pull.html_url}') + + +def wait_check_deployment(gh: Github, opts: argparse.Namespace) -> None: + 'wait-check-deployment' + repo = gh.get_repo(opts.repo) + pull = get_pr(repo, opts.release_branch) + + if opts.ignore_if_label.strip(): + for label in pull.get_labels(): + if label.name.strip().lower() == opts.ignore_if_label.strip().lower(): + logging.info(f'Found label {opts.ignore_if_label} on PR {pull.html_url}, ignoring this check') + return + + commit = list(pull.get_commits())[-1] + logging.debug(f'Latest commit on pr: {commit.sha}') + logging.info(f'Checking for successful release ({opts.release_action_name}) on main...') + + found_success = False + time_limit = datetime.datetime.now() + datetime.timedelta(minutes=15) + while not found_success and time_limit > datetime.datetime.now(): + for wf in gh_get_check_runs(repo, commit.sha): + if wf.name == opts.release_action_name and wf.conclusion == 'success': + found_success = True + break + + if not found_success: + time.sleep(30) + + if not found_success: + logging.error(f'No successful canary release ({opts.release_action_name}) found for {opts.release_branch}') + raise RuntimeError('No successful canary release found') + + logging.info(f'Found successful canary release ({opts.release_action_name}) on for {opts.release_branch} ({commit.sha})') + + if opts.comment_to_add: + pull.create_issue_comment(opts.comment_to_add) + logging.info(f'Added comment to pr: {pull.html_url}') + + +def wait_check_user_comment(gh: Github, opts: argparse.Namespace) -> None: + 'wait-check-user-comment' + + assert(opts.comment is not None and opts.comment != '') + + repo = gh.get_repo(opts.repo) + pull = get_pr(repo, opts.release_branch) + + 
found_coment = False + time_limit = datetime.datetime.now() + datetime.timedelta(minutes=15) + while not found_coment and time_limit > datetime.datetime.now(): + for comment in pull.get_issue_comments(): + if comment.body.strip() == opts.comment.strip() and comment.user.type.lower() != 'bot' and comment.get_reactions().totalCount != 0: + found_coment = True + break + + if not found_coment: + time.sleep(30) + + if not found_coment: + pull.create_issue_comment(f'Comment [{opts.comment}] not found on PR {pull.html_url}, so I can\'t proceed') + logging.error(f'No comment found for {opts.comment} on PR {pull.html_url}') + raise RuntimeError('No comment found') + + +def wait_check_bot_comment(gh: Github, opts: argparse.Namespace) -> None: + 'wait-check-bot-comment' + + assert(opts.comment is not None and opts.comment != '') + + repo = gh.get_repo(opts.repo) + pull = get_pr(repo, opts.release_branch) + + for comment in pull.get_issue_comments(): + if comment.body.strip().startswith(opts.comment.strip()) and comment.user.type.lower() == 'bot': + logging.info(f'Found comment for {opts.comment} on PR {pull.html_url}') + return + + logging.error(f'No comment found for [{opts.comment}] on PR {pull.html_url}') + raise RuntimeError('No comment found') + + +def react_pr_comment(gh: Github, opts: argparse.Namespace) -> None: + 'react-pr-comment' + repo = gh.get_repo(opts.repo) + pull = get_pr(repo, opts.release_branch) + + comments_lst = opts.comments.split(',') + labels_lst = opts.labels.split(',') + check_remove_labels_lst = opts.check_remove_labels.split(',') + check_comments_lst = opts.check_comments.split('#') + + if not (len(comments_lst) == len(labels_lst) == len(check_remove_labels_lst) == len(check_comments_lst)): + logging.error(f'Options "comments" ({len(comments_lst)}), "labels" ({len(labels_lst)}), "check-comments" ({len(check_comments_lst)}) and "check-remove-labels" ({len(check_remove_labels_lst)}) should be the same size!') + raise RuntimeError('Options "comments", 
"labels", "check-comments" and "check-remove-labels" should be the same size!') + + check_dict = {} + for comment, label, check_label, check_comment in zip(comments_lst, labels_lst, check_remove_labels_lst, check_comments_lst): + check_dict[comment.strip()] = { + 'label': label.strip(), + 'check_label': check_label, + 'check_comment': check_comment.strip(), + } + + issue_comments = list(pull.get_issue_comments()) + current_labels = set(l.name.lower() for l in pull.get_labels()) + + for comment in reversed(issue_comments): + if comment.user.type.lower() != 'bot' and comment.body.strip() in check_dict and comment.get_reactions().totalCount == 0: + + check = check_dict[comment.body.strip()] + if check['check_label']: + if check['check_label'].lower() not in current_labels: + comment.create_reaction('-1') + pull.create_issue_comment(f'Label {check["check_label"]} not found on PR {pull.html_url}, so I can\'t proceed with the deployment') + logging.error(f'Label {check["check_label"]} not found on PR {pull.html_url}') + raise RuntimeError(f'Required label not found on PR') + + if check['check_comment']: + found_coment = False + for c in issue_comments: + if c.body.strip().startswith(check['check_comment'].strip()): + found_coment = True + break + + if not found_coment: + comment.create_reaction('-1') + pull.create_issue_comment(f'Comment [{check["check_comment"]}] not found on PR {pull.html_url}, so I can\'t proceed with the deployment') + logging.error(f'Comment [{check["check_comment"]}] not found on PR {pull.html_url}') + raise RuntimeError(f'Required comment not found on PR') + + pull.remove_from_labels(check['check_label']) + logging.debug(f'Removed label {check["check_label"]} from PR {pull.html_url}') + + pull.add_to_labels(check['label']) + logging.debug(f'Added label {check["label"]} to PR {pull.html_url}') + + comment.create_reaction('rocket') + logging.debug(f'Added reaction to comment: {comment.html_url}') + + return + + logging.error(f'No comment found for 
# NOTE: every command function below uses its docstring as the CLI sub-command
# name (CMD_MAP is keyed by `__doc__` and parse_args() registers sub-parsers
# with `str(func.__doc__)`). The docstrings are therefore runtime data and must
# not be reworded; explanations are kept in `#` comments instead.


def close_pr(gh: Github, opts: argparse.Namespace) -> None:
    'close-pr'
    # Close the pull request that get_pr() resolves for opts.release_branch in
    # the repository named by opts.repo.
    repo = gh.get_repo(opts.repo)
    pull = get_pr(repo, opts.release_branch)

    pull.edit(state='closed')
    logging.info(f'Closed pull request for {opts.release_branch}: {pull.html_url}')


def check_pr_approved(gh: Github, opts: argparse.Namespace) -> None:
    'wait-check-pr-approved'
    # Verify that the release PR carries at least one APPROVED review and post
    # a comment on the PR describing the outcome.
    #
    # The PR author is recovered from the PR body, which must contain exactly
    # one "(<login> <digits>)" token. An approval by anybody else wins over a
    # self-approval; a self-approval is accepted but called out in the comment.
    #
    # Raises:
    #   RuntimeError: when the body does not contain exactly one author token,
    #                 or when no review is in the APPROVED state.
    repo = gh.get_repo(opts.repo)
    pull = get_pr(repo, opts.release_branch)

    # A PR without a description has body None; treat it as empty text so the
    # "exactly one user" check below reports the problem instead of re.findall
    # raising TypeError.
    # NOTE(review): the character class has no '-', so a login containing a
    # hyphen would not match - confirm against the body written when the PR
    # is opened.
    user_name_lst = re.findall(r'\(([a-zA-Z0-9_]+) [0-9]+\)', pull.body or '')
    if len(user_name_lst) != 1:
        logging.error(f'Found more than or less than one user in the PR body: {pull.html_url}')
        raise RuntimeError('Found more than or less than one user in the PR body')
    user_name = user_name_lst[0]

    # Logins of all approving reviews, in the order the API returns them.
    # NOTE(review): a review whose user is missing is skipped rather than
    # crashing on `.login` - confirm whether the API can really return that.
    # NOTE(review): every APPROVED review counts, even if the same reviewer
    # later requested changes - confirm this is the intended policy.
    approvers = [
        review.user.login
        for review in pull.get_reviews()
        if review.state == 'APPROVED' and review.user is not None
    ]

    # First preference: the earliest approval by someone other than the author.
    external_approver = next((login for login in approvers if login != user_name), None)
    if external_approver is not None:
        pull.create_issue_comment(f'PR approved by {external_approver}')
        logging.info(f'Found approved PR for {opts.release_branch}: {pull.html_url} by {external_approver}')
        return

    # Second preference: the author approved their own PR.
    if user_name in approvers:
        pull.create_issue_comment('PR self-approved, this is OK, but make sure to communicate this and avoid rolling out without letting the team know.')
        logging.info(f'Found self-approved PR for {opts.release_branch}: {pull.html_url}')
        return

    pull.create_issue_comment('PR is not approved, please make sure to get approval before merging. It is OK to self-approve, but make sure to communicate this and avoid rolling out without letting the team know.')
    logging.error(f'PR does not have any approval {opts.release_branch}: {pull.html_url}')
    raise RuntimeError('PR does not have any approval')


def merge_pr(gh: Github, opts: argparse.Namespace) -> None:
    'merge-pr'
    # Merge the release PR using the 'rebase' merge method.
    repo = gh.get_repo(opts.repo)
    pull = get_pr(repo, opts.release_branch)

    pull.merge(merge_method='rebase')
    logging.info(f'Merged pull request for {opts.release_branch}: {pull.html_url}')


# Dispatch table: CLI sub-command name (the function's docstring) -> handler.
CMD_MAP = {
    add_comment_to_pr.__doc__: add_comment_to_pr,
    check_pr_approved.__doc__: check_pr_approved,
    close_pr.__doc__: close_pr,
    merge_pr.__doc__: merge_pr,
    open_release_pr.__doc__: open_release_pr,
    react_pr_comment.__doc__: react_pr_comment,
    wait_check_bot_comment.__doc__: wait_check_bot_comment,
    wait_check_deployment.__doc__: wait_check_deployment,
    wait_check_user_comment.__doc__: wait_check_user_comment,
}


# Main
def parse_args() -> argparse.Namespace:
    # Build the command line parser and return the parsed options.
    #
    # Global options fall back to environment variables (blank values are
    # treated as unset); one sub-command per CMD_MAP entry is registered and
    # its name is stored in `release_action`. Exits with a usage error when no
    # sub-command is given.
    parser = argparse.ArgumentParser(description='Deployment script for the project')

    parser.add_argument(
        '--github-token',
        help='Github token to use for the deployment',
        default=os.environ.get('GITHUB_TOKEN', '').strip() or None,
        type=str,
        required=False
    )
    parser.add_argument(
        '--github-app-id',
        help='Github app id to use for the deployment',
        default=os.environ.get('GITHUB_APP_ID', '').strip() or None,
        type=str,
        required=False
    )
    parser.add_argument(
        '--github-app-key',
        help='Github app key to use for the deployment',
        default=os.environ.get('GHA_PRIVATE_KEY_DEPLOY', '').strip() or None,
        type=str,
        required=False
    )
    parser.add_argument(
        '--github-app-installation-id',
        help='Github app installation id to use for the deployment',
        default=os.environ.get('GITHUB_APP_INSTALLATION_ID', '').strip() or None,
        type=int,
        required=False
    )
    parser.add_argument(
        '--repo',
        help='Github repo to use for the deployment',
        default=os.environ.get('GITHUB_REPOSITORY', 'pytorch-labs/pytorch-gha-infra').strip() or None,
        type=str,
        required=False
    )
    parser.add_argument(
        '--debug',
        help='Enable debug mode',
        action='store_true'
    )
    parser.add_argument(
        '--github-actor-id',
        help='Github actor id to use for the deployment',
        default=os.environ.get('GITHUB_ACTOR_ID', '').strip() or None,
        type=int,
        required=False
    )
    parser.add_argument(
        '--release-branch',
        help='Release branch to use for the deployment',
        default=RELEASE_BRANCH,
        type=str,
        required=False
    )
    parser.add_argument(
        '--bot-name',
        help='Name of the bot to use for the deployment',
        default='pytorch-arc-pr-deployment-bot',
        type=str,
        required=False
    )

    rel_subparsers = parser.add_subparsers(help='Release actions', dest='release_action')

    open_rel_issue_parser = rel_subparsers.add_parser(
        str(open_release_pr.__doc__),
        help='Opens the release issue'
    )
    open_rel_issue_parser.add_argument(
        '--fast-release-firefight',
        help='Enable fast release firefight mode',
        type=nice_bool_option,
        default=nice_bool_option(os.environ.get('FAST_RELEASE_FIREFIGHT', 'false')),
    )

    wait_check_deployment_parser = rel_subparsers.add_parser(
        str(wait_check_deployment.__doc__),
        help='Waits for the deployment to finish'
    )
    wait_check_deployment_parser.add_argument(
        '--release-action-name',
        help='Name of the action that releases and is a requirement for the release',
        type=str,
        required=True
    )
    wait_check_deployment_parser.add_argument(
        '--comment-to-add',
        help='String to add as a comment to the release issue',
        default='',
        type=str,
        required=False
    )
    wait_check_deployment_parser.add_argument(
        '--ignore-if-label',
        help='Ignore if the PR has this label',
        default='',
        type=str,
        required=False
    )

    wait_check_user_comment_parser = rel_subparsers.add_parser(
        str(wait_check_user_comment.__doc__),
        help='waits for user comment on PR'
    )
    wait_check_user_comment_parser.add_argument(
        '--comment',
        help='String to check as a comment to the release issue',
        type=str,
        required=True,
    )

    wait_check_bot_comment_parser = rel_subparsers.add_parser(
        str(wait_check_bot_comment.__doc__),
        help='waits for bot comment on PR'
    )
    wait_check_bot_comment_parser.add_argument(
        '--comment',
        help='String to check as a comment to the release issue',
        type=str,
        required=True,
    )

    react_pr_comment_parser = rel_subparsers.add_parser(
        str(react_pr_comment.__doc__),
        help='reacts to user comment on PR, add the corresponding label based on comment'
    )
    react_pr_comment_parser.add_argument(
        '--comments',
        help='Comma separated list of comments to react to',
        type=str,
        required=True,
    )
    react_pr_comment_parser.add_argument(
        '--labels',
        help='Comma separated list of labels to add',
        type=str,
        required=True,
    )
    react_pr_comment_parser.add_argument(
        '--check-remove-labels',
        help='Comma separated list of labels to check and remove if found',
        type=str,
        required=False,
        default='',
    )
    react_pr_comment_parser.add_argument(
        '--check-comments',
        help='"#" separated list of comments to check',
        type=str,
        required=False,
        default='',
    )

    add_comment_to_pr_parser = rel_subparsers.add_parser(
        str(add_comment_to_pr.__doc__),
        help='Adds a comment to the release issue'
    )
    add_comment_to_pr_parser.add_argument(
        '--comment',
        help='String to add as a comment to the release issue',
        type=str,
        required=True,
    )

    rel_subparsers.add_parser(
        str(close_pr.__doc__),
        help='Closes the release issue'
    )

    rel_subparsers.add_parser(
        str(check_pr_approved.__doc__),
        help='Checks if the release issue is approved'
    )

    rel_subparsers.add_parser(
        str(merge_pr.__doc__),
        help='Merges the release issue'
    )

    parsed = parser.parse_args()
    # Sub-commands are optional for argparse by default; without this check a
    # bare invocation reaches main() with release_action=None and fails with an
    # opaque KeyError on CMD_MAP[None].
    if parsed.release_action is None:
        parser.error('a release action is required, choose one of: ' + ', '.join(sorted(str(name) for name in CMD_MAP)))
    return parsed


def main():
    # Entry point: parse options, configure logging on stderr, build the
    # GitHub client and run the handler selected by the sub-command.
    opts = parse_args()

    logging.basicConfig(
        format="<%(name)s:%(levelname)s> - %(message)s",
        level=logging.DEBUG if opts.debug else logging.INFO,
        stream=sys.stderr
    )

    gh = get_gh_client(opts)
    try:
        CMD_MAP[opts.release_action](gh, opts)
    finally:
        # Release the client on success as well as on failure; an exception
        # raised by the handler still propagates unchanged.
        gh.close()


if __name__ == '__main__':
    main()
external_k8s_cidr_ipv4.tf sed "s/#AWS_REGION/$(REGION)/g" <$(PROHOME)/modules/backend-file/backend-state.tf >backend-state.tf terraform get -update terraform init -backend=false terraform plan -input=false -out=backend.plan -detailed-exitcode -target=module.backend-state ${TERRAFORM_EXTRAS} ; \ ext_code=$$? ; \ if [ $$ext_code -eq 2 ] ; then \ - terraform apply backend.plan ${TERRAFORM_EXTRAS} ; \ + terraform apply ${TERRAFORM_EXTRAS} backend.plan ; \ elif [ $$ext_code -eq 0 ] ; then \ echo "Backend state already exists" ; \ else \ @@ -68,10 +69,6 @@ backend-state.tf: .account-checked dyn_locals.tf external_k8s_cidr_ipv4.tf exit 1 ; \ fi -.account-checked: - aws configure list-profiles | grep $(ACCOUNT) || (echo "Account $(ACCOUNT) not configured in ~/.aws/config", see https://docs.aws.amazon.com/cli/latest/userguide/cli-configure-files.html; exit 1) - echo OK >.account-checked - .PHONY: plan plan: .terraform/modules/modules.json terraform plan $(TERRAFORM_EXTRAS) @@ -265,49 +262,3 @@ arc-prod: inventory/eks/prod_cluster_name inventory/eks/prod_cluster_config $(PR .PHONY: eks-use-cluster eks-use-cluster: cd $(PROHOME)/modules/arc ; $(MAKE) EKS_CLUSTER_NAME=$(CLUSTER) update-kubectl - -# Deployment -.PHONY: open-rel-pr -open-rel-pr: $(PROHOME)/venv/bin/pip - $(PROHOME)/venv/bin/python ./scripts/deployment.py --debug open-rel-pr - -.PHONY: wait-check-deployment -wait-check-deployment: $(PROHOME)/venv/bin/pip - [ "$(RELEASE_ACTION_NAME)" != "" ] || (echo "RELEASE_ACTION_NAME not set"; exit 1) - $(PROHOME)/venv/bin/python ./scripts/deployment.py --debug wait-check-deployment --release-action-name "$(RELEASE_ACTION_NAME)" --comment-to-add "$(COMMENT_TO_ADD)" --ignore-if-label "$(IGNORE_IF_LABEL)" - -.PHONY: wait-check-user-comment -wait-check-user-comment: $(PROHOME)/venv/bin/pip - [ "$(WAIT_COMMENT)" != "" ] || (echo "WAIT_COMMENT not set"; exit 1) - $(PROHOME)/venv/bin/python ./scripts/deployment.py --debug wait-check-user-comment --comment "$(WAIT_COMMENT)" - 
-.PHONY: wait-check-bot-comment -wait-check-bot-comment: $(PROHOME)/venv/bin/pip - [ "$(WAIT_COMMENT)" != "" ] || (echo "WAIT_COMMENT not set"; exit 1) - $(PROHOME)/venv/bin/python ./scripts/deployment.py --debug wait-check-bot-comment --comment "$(WAIT_COMMENT)" - -.PHONY: react-pr-comment -react-pr-comment: $(PROHOME)/venv/bin/pip - [ "$(COMMENTS)" != "" ] || (echo "COMMENTS not set"; exit 1) - [ "$(LABELS)" != "" ] || (echo "LABELS not set"; exit 1) - [ "$(CHECK_REMOVE_LABELS)" != "" ] || (echo "CHECK_REMOVE_LABELS not set"; exit 1) - [ "$(CHECK_COMMENTS)" != "" ] || (echo "CHECK_COMMENTS not set"; exit 1) - $(PROHOME)/venv/bin/python ./scripts/deployment.py --debug react-pr-comment --comments "$(COMMENTS)" --labels "$(LABELS)" --check-remove-labels "$(CHECK_REMOVE_LABELS)" --check-comments "$(CHECK_COMMENTS)" - -.PHONY: add-comment-to-pr -add-comment-to-pr: $(PROHOME)/venv/bin/pip - [ "$(COMMENT_TO_ADD)" != "" ] || (echo "COMMENT_TO_ADD not set"; exit 1) - $(PROHOME)/venv/bin/python ./scripts/deployment.py --debug add-comment-to-pr --comment "$(COMMENT_TO_ADD)" - -.PHONY: wait-check-pr-approved -wait-check-pr-approved: $(PROHOME)/venv/bin/pip - $(PROHOME)/venv/bin/python ./scripts/deployment.py --debug wait-check-pr-approved - -.PHONY: close-pr -close-pr: $(PROHOME)/venv/bin/pip - $(PROHOME)/venv/bin/python ./scripts/deployment.py --debug close-pr - -.PHONY: merge-pr -merge-pr: $(PROHOME)/venv/bin/pip - $(PROHOME)/venv/bin/python ./scripts/deployment.py --debug merge-pr -