Skip to content

Commit

Permalink
Merge branch 'main' into zxiiro/cloud-account-access
Browse files Browse the repository at this point in the history
  • Loading branch information
zxiiro authored Jan 7, 2025
2 parents 2d349ef + 0272242 commit c2b921c
Show file tree
Hide file tree
Showing 15 changed files with 288 additions and 88 deletions.
21 changes: 21 additions & 0 deletions .checkov.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
# .checkov.yml - Configuration file for Checkov
#
# Repo-wide Checkov settings: paths excluded from scanning and check IDs that
# are suppressed everywhere. Resource-specific suppressions are done inline in
# the Terraform sources with `#checkov:skip=<ID>:<reason>` comments instead.

# Path configurations
# Entries are paths (globs allowed) that Checkov should not scan.
skip-path:
- arc
- '.github/workflows/arc*'

# Skip INFO and other unresolvable checks
# NOTE(review): the descriptions below are recalled from the Checkov policy
# index and should be confirmed against the Checkov version in use:
#   CKV2_AWS_61 - S3 bucket lifecycle configuration
#   CKV_AWS_355 - IAM policy allows "*" as resource for restrictable actions
#   CKV_AWS_290 - IAM policy allows write access without constraints
#   CKV_AWS_119 - DynamoDB table encrypted with a customer-managed KMS key
#   CKV2_AWS_62 - S3 bucket event notifications
#   CKV_AWS_18  - S3 bucket access logging
#   CKV_AWS_145 - S3 bucket default encryption with KMS
#   CKV_AWS_144 - S3 bucket cross-region replication
#   CKV2_AWS_16 - DynamoDB table auto scaling
skip-check:
- CKV2_AWS_61
- CKV_AWS_355
- CKV_AWS_290
- CKV_AWS_119
- CKV2_AWS_62
- CKV_AWS_18
- CKV_AWS_145
- CKV_AWS_144
- CKV2_AWS_16

# Configure Checkov's log level (useful for debugging)
# log-level: DEBUG # Available options: DEBUG, INFO, WARN, ERROR
15 changes: 15 additions & 0 deletions .editorconfig
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
# EditorConfig settings for this repository.
# `root = true` marks this as the top-most EditorConfig file, so editors do
# not continue searching parent directories for further settings.
root = true

# Defaults for every file: LF line endings and a newline at end of file.
[*]
end_of_line = lf
insert_final_newline = true

# Python
# Four-space indentation.
[*.py]
indent_style = space
indent_size = 4

# Terraform
# Two-space indentation.
[*.tf]
indent_style = space
indent_size = 2
25 changes: 9 additions & 16 deletions .github/workflows/lint-pr.yml
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,8 @@ name: Lint GitHub Actions workflows
# yamllint disable-line rule:truthy
on: [push, pull_request]

permissions: read-all

jobs:
actionlint:
runs-on: ubuntu-latest
Expand All @@ -17,24 +19,15 @@ jobs:
run: ${{ steps.get_actionlint.outputs.executable }} -color
shell: bash

commit-message:
pre-commit:
runs-on: ubuntu-latest
if: github.event_name == 'pull_request'
steps:
- name: Checkout code
uses: actions/checkout@v4
with:
# checkout at the last commit
ref: ${{ github.event.pull_request.head.sha }}
# get all history
fetch-depth: 0

- name: Install gitlint
- uses: actions/checkout@v4
- name: Install pre-commit
shell: bash
run: |
python -m pip install gitlint
- name: Run gitlint
shell: bash
python -m pip install pre-commit
- name: Run pre-commit
run: |
gitlint --commits "${{ github.event.pull_request.base.sha }}..HEAD"
pre-commit install
pre-commit run --all-files
43 changes: 43 additions & 0 deletions .github/workflows/security-scan.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
# CodeQL and Checkov scans for ci-infra
#
# Triggers: pushes and pull requests targeting main, plus a weekly schedule.
# Jobs:
#   Analyze-Python - CodeQL static analysis of the Python sources.
#   Analyze-IaC    - Checkov scan of the repository (configured by .checkov.yml).
name: "Security Scan"

# Default every job to a read-only token; jobs that need more opt in below.
permissions: read-all

on:
  push:
    branches: [ "main" ]
  pull_request:
    branches: [ "main" ]
  schedule:
    - cron: '0 12 * * 6' # Runs every Saturday at 12:00 UTC

jobs:
  Analyze-Python:
    name: Analyze Python code
    runs-on: ubuntu-latest
    # Job-level permissions replace the workflow default for this job.
    # CodeQL needs `security-events: write` to upload its results; with a
    # read-only token the upload fails and continue-on-error below hides it.
    permissions:
      actions: read
      contents: read
      security-events: write
    steps:
      - name: Checkout code
        uses: actions/checkout@v4
      - name: Initialize CodeQL
        uses: github/codeql-action/init@v3
        with:
          languages: python
      - name: Run CodeQL Analysis
        uses: github/codeql-action/analyze@v3
        # Best-effort: an analysis failure is shown on the step but does not
        # fail the job.
        continue-on-error: true

  Analyze-IaC:
    name: Analyze Infra as Code
    runs-on: ubuntu-latest
    steps:
      - name: Checkout code
        uses: actions/checkout@v4
      - name: Set up Python
        uses: actions/setup-python@v5
        with:
          python-version: '3.10'
      - name: Install Checkov
        run: pip install checkov
      - name: Run Checkov
        # --quiet limits output to failed checks; -d . scans the whole checkout.
        run: checkov --quiet -d .
5 changes: 5 additions & 0 deletions .pre-commit-config.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
# pre-commit hook configuration. Currently runs a single hook,
# editorconfig-checker, from its upstream repository.
# NOTE(review): `rev: main` is a moving branch, so the hook version can change
# between runs; replace it with a tag once upstream publishes one with hooks.
repos:
- repo: https://github.com/editorconfig-checker/editorconfig-checker
rev: main # We want to pin to a tag here but v3.0.3 does not have hooks yet.
hooks:
- id: editorconfig-checker
2 changes: 1 addition & 1 deletion ali/Terrafile
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ terraform-aws-vpc:
terraform-aws-github-runner:
source: "pytorch/test-infra"
module-root: "terraform-aws-github-runner"
tag: "v20241025-175554"
tag: "v20241216-232848"
assets:
- "runner-binaries-syncer.zip"
- "runners.zip"
Expand Down
1 change: 1 addition & 0 deletions ali/aws/391835788720/us-east-1/autoscaler-lambda-canary.tf
Original file line number Diff line number Diff line change
Expand Up @@ -89,6 +89,7 @@ module "autoscaler-lambda-canary" {

runner_iam_role_managed_policy_arns = [
aws_iam_policy.allow_ecr_on_gha_runners.arn,
aws_iam_policy.allow_secretmanager_docker_hub_token_on_gha_runners.arn,
aws_iam_policy.allow_s3_sccache_access_on_gha_runners.arn,
aws_iam_policy.allow_lambda_on_gha_runners.arn,
aws_iam_policy.allow_torchci_metrics_on_gha_runners.arn
Expand Down
1 change: 1 addition & 0 deletions ali/aws/391835788720/us-east-1/autoscaler-lambda.tf
Original file line number Diff line number Diff line change
Expand Up @@ -93,6 +93,7 @@ module "autoscaler-lambda" {

runner_iam_role_managed_policy_arns = [
aws_iam_policy.allow_ecr_on_gha_runners.arn,
aws_iam_policy.allow_secretmanager_docker_hub_token_on_gha_runners.arn,
aws_iam_policy.allow_s3_sccache_access_on_gha_runners.arn,
aws_iam_policy.allow_lambda_on_gha_runners.arn,
aws_iam_policy.allow_torchci_metrics_on_gha_runners.arn
Expand Down
92 changes: 92 additions & 0 deletions ali/aws/391835788720/us-east-1/gha_roles.tf
Original file line number Diff line number Diff line change
Expand Up @@ -54,3 +54,95 @@ resource "aws_iam_role" "gha_target_determinator_s3_read_write" {
workflow = "target-determinator-indexer"
}
}

# Role for using packer to create AMIs
#
# Assumed by GitHub Actions through the account's GitHub OIDC provider. Only
# jobs running in the `packer-build-env` environment of the repositories
# listed in the trust policy below may assume it.
resource "aws_iam_role" "gha-packer-role" {
  name = "gha-packer-role"

  # 18000 seconds = 5 hours.
  max_session_duration = 18000
  description          = "Allows PyTorch runners to run Packer to build AMIs."
  assume_role_policy = jsonencode({
    Version = "2012-10-17"
    Statement = [
      {
        Effect = "Allow"
        Principal = {
          Federated = "arn:aws:iam::${local.aws_account_id}:oidc-provider/token.actions.githubusercontent.com"
        }
        Action = "sts:AssumeRoleWithWebIdentity"
        Condition = {
          # The token must have been issued for AWS STS.
          StringEquals = {
            "token.actions.githubusercontent.com:aud" = "sts.amazonaws.com"
          }
          # The token subject must match one of these repo/environment pairs.
          StringLike = {
            "token.actions.githubusercontent.com:sub" = [
              "repo:pytorch/pytorch:environment:packer-build-env",
              "repo:pytorch/pytorch-canary:environment:packer-build-env",
              "repo:pytorch/test-infra:environment:packer-build-env"
            ]
          }
        }
      }
    ]
  })

  # EC2 and IAM permissions granted to the role for building AMIs.
  inline_policy {
    name = "gha-packer-policy"
    policy = jsonencode({
      Version = "2012-10-17"
      Statement = [
        {
          Effect = "Allow"
          Action = [
            "ec2:AssociateIamInstanceProfile",
            "ec2:AttachVolume",
            "ec2:AuthorizeSecurityGroupIngress",
            "ec2:CopyImage",
            "ec2:CreateImage",
            # Canonical casing, matching ec2:DeleteKeyPair below.
            "ec2:CreateKeyPair",
            "ec2:CreateSecurityGroup",
            "ec2:CreateSnapshot",
            "ec2:CreateTags",
            "ec2:CreateVolume",
            "ec2:DeleteKeyPair",
            "ec2:DeleteSecurityGroup",
            "ec2:DeleteSnapshot",
            "ec2:DeleteVolume",
            "ec2:DeregisterImage",
            "ec2:DescribeImageAttribute",
            "ec2:DescribeImages",
            "ec2:DescribeInstances",
            "ec2:DescribeInstanceStatus",
            "ec2:DescribeRegions",
            "ec2:DescribeSecurityGroups",
            "ec2:DescribeSnapshots",
            "ec2:DescribeSubnets",
            "ec2:DescribeTags",
            "ec2:DescribeVolumes",
            "ec2:DetachVolume",
            "ec2:GetPasswordData",
            "ec2:ModifyImageAttribute",
            "ec2:ModifyInstanceAttribute",
            "ec2:ModifySnapshotAttribute",
            "ec2:RegisterImage",
            "ec2:ReplaceIamInstanceProfileAssociation",
            "ec2:RunInstances",
            "ec2:StopInstances",
            "ec2:TerminateInstances",
            # NOTE(review): iam:PassRole on "*" lets this role pass any role
            # in the account; consider scoping it to the instance role that
            # the Packer builds use.
            "iam:PassRole",
            "iam:GetInstanceProfile"
          ]
          Resource = [
            "*",
          ]
        },
      ]
    })
  }

  tags = {
    project     = var.ali_prod_environment
    environment = "pytorch-packer-workflows"
    workflow    = "build-windows-ami"
  }
}
18 changes: 18 additions & 0 deletions ali/aws/391835788720/us-east-1/iam_policies.tf
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,7 @@ resource "aws_iam_role" "ossci_gha_terraform" {

# Attaches the AWS-managed AdministratorAccess policy to the
# ossci_gha_terraform role. The inline checkov skip below suppresses
# CKV_AWS_274 for this resource only, with the reason recorded after the colon.
resource "aws_iam_role_policy_attachment" "ossci_gha_terraform_admin" {
role = aws_iam_role.ossci_gha_terraform.name
#checkov:skip=CKV_AWS_274:Terraform needs AdministratorAccess to run
policy_arn = "arn:aws:iam::aws:policy/AdministratorAccess"
}

Expand Down Expand Up @@ -68,6 +69,23 @@ resource "aws_iam_policy" "allow_ecr_on_gha_runners" {
EOT
}

# Lets the GHA EC2 runners read one Secrets Manager secret: the read-only
# Docker Hub token. Only GetSecretValue on that single secret ARN is allowed.
resource "aws_iam_policy" "allow_secretmanager_docker_hub_token_on_gha_runners" {
  name        = "${var.ali_prod_environment}_allow_secretmanager_docker_hub_token_on_gha_runners"
  description = "Allows our GHA EC2 runners access to the read-only docker.io token"

  policy = jsonencode({
    Version = "2012-10-17"
    Statement = [
      {
        Effect   = "Allow"
        Action   = ["secretsmanager:GetSecretValue"]
        Resource = "arn:aws:secretsmanager:us-east-1:391835788720:secret:docker_hub_readonly_token-V74gSU"
      }
    ]
  })
}

// ossci-compiler-cache-circleci-v2 = linux sccache
// ossci-compiler-cache = windows sccache
resource "aws_iam_policy" "allow_s3_sccache_access_on_gha_runners" {
Expand Down
2 changes: 1 addition & 1 deletion ali/aws/391835788720/us-east-1/variables.tf
Original file line number Diff line number Diff line change
Expand Up @@ -50,5 +50,5 @@ variable "ami_filter_linux_arm64" {
variable "ami_filter_windows" {
description = "AMI for windows"
type = list
default = ["Windows 2019 GHA CI - 20240830161839"]
default = ["Windows 2019 GHA CI - 20241127165339"]
}
8 changes: 8 additions & 0 deletions ali/modules/ali_scripts/linux_post_install.sh
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,10 @@

set +e

# Log in to docker.io. It's OK if this fails; we will just fall back to an anonymous user.
# This is to mitigate https://docs.docker.com/docker-hub/download-rate-limit/#rate-limit
aws secretsmanager get-secret-value --secret-id docker_hub_readonly_token | jq --raw-output '.SecretString' | jq -r .docker_hub_readonly_token | docker login --username pytorchbot --password-stdin || true

# Log in to our ECR instance
if uname -a | grep 'amzn2023' > /dev/null ; then
echo "New amazon linux"
Expand All @@ -11,3 +15,7 @@ else
$(aws ecr get-login --no-include-email --region us-east-1)
fi

# Copy the docker config from root to ec2-user, so both users can access the
# same registries.
mkdir -p /home/ec2-user/.docker
# The config holds registry credentials, so only copy it when root actually
# has one (both logins above may have failed) and keep it private to its owner
# instead of relying on the default umask.
if [ -f /root/.docker/config.json ]; then
  cp /root/.docker/config.json /home/ec2-user/.docker/config.json
  chmod 600 /home/ec2-user/.docker/config.json
fi
chown -R ec2-user:ec2-user /home/ec2-user/.docker
3 changes: 2 additions & 1 deletion modules/backend-state/dynamo.tf
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
resource "aws_dynamodb_table" "terraform_state_lock" {
#checkov:skip=CKV_AWS_28:ALI uses this as a cache and does not need backup
count = data.external.terraform_state_bucket_exists.result.exists == "true" ? 0 : 1
name = "${var.dynamo_table_name}-${var.project}-${var.environment}"
read_capacity = 1
Expand All @@ -9,4 +10,4 @@ resource "aws_dynamodb_table" "terraform_state_lock" {
name = "LockID"
type = "S"
}
}
}
Loading

0 comments on commit c2b921c

Please sign in to comment.