Skip to content

Commit

Permalink
Merge remote-tracking branch 'upstream/main' into add-project-update-…
Browse files Browse the repository at this point in the history
…automation-app-docker
  • Loading branch information
peterzhuamazon committed Feb 19, 2025
2 parents f7a13ac + bcb0693 commit fa15082
Show file tree
Hide file tree
Showing 21 changed files with 942 additions and 24 deletions.
1 change: 1 addition & 0 deletions DEVELOPER_GUIDE.md
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@ So you want to contribute code to this project? Excellent! We're glad you're her
- `cdk deploy OpenSearchMetrics-GitHubAutomationAppEvents-S3`: Creates the S3 Bucket for the [GitHub Automation App](https://github.com/opensearch-project/automation-app) to store OpenSearch Project GitHub Events.
- `cdk deploy OpenSearchS3EventIndex-Workflow`: Creates the Lambda and Step Function to index the GitHub Events stored in the S3 Bucket to the Metrics cluster.
- `cdk deploy OpenSearchMaintainerInactivity-Workflow`: Creates the Lambda and Step Function to index Maintainer Inactivity to the Metrics cluster.
- `cdk deploy OpenSearchEventCanary-Workflow`: Creates the Lambda and Step Function that run the GitHub Label Canary used to monitor the Automation App.

### Forking and Cloning

Expand Down
12 changes: 12 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -115,6 +115,18 @@ graph LR
F --> H[Data for Debugging and Trend Analysis]
```

##### Automation App Failure Detection
The logic for detecting Automation App failures is found in [github-label-canary-monitor.ts](https://github.com/opensearch-project/automation-app/blob/main/src/call/github-label-canary-monitor.ts). A GitHub Event Canary Lambda creates and deletes a label every 10 minutes. The Automation App listens for this label event and sends a CloudWatch metric every time the event is received. If the Automation App goes down or stops working, the CloudWatch Alarm detects the missing data and goes into the ALARM state, notifying us of the outage.

```mermaid
graph LR
A[GitHub Event Canary Lambda] -->|Creates/Deletes Label every 10 minutes| B[GitHub Repository]
B -->|Label Event| C[Automation App]
C -->|Sends Metrics| D[CloudWatch Metrics]
D -->|Monitors for missing data| E[CloudWatch Alarm]
E -->|Triggers on missing data| F[Notification]
```

## Contributing

See [developer guide](DEVELOPER_GUIDE.md) and [how to contribute to this project](CONTRIBUTING.md).
Expand Down
10 changes: 9 additions & 1 deletion build.gradle
Original file line number Diff line number Diff line change
Expand Up @@ -34,10 +34,18 @@ dependencies {

implementation 'com.amazonaws:aws-java-sdk-secretsmanager:1.12.671'

implementation 'software.amazon.awssdk:s3:2.29.12'
implementation 'software.amazon.awssdk:s3:2.30.22'

implementation 'org.json:json:20240303'

implementation 'io.jsonwebtoken:jjwt-api:0.12.6'
runtimeOnly 'io.jsonwebtoken:jjwt-impl:0.12.6'
runtimeOnly 'io.jsonwebtoken:jjwt-jackson:0.12.6'

implementation 'org.bouncycastle:bcprov-jdk18on:1.79'

implementation 'org.kohsuke:github-api:1.326'

testImplementation 'org.junit.jupiter:junit-jupiter-api:5.8.1'
testRuntimeOnly 'org.junit.jupiter:junit-jupiter-engine:5.8.1'

Expand Down
62 changes: 62 additions & 0 deletions infrastructure/lib/constructs/automationAppSns.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,62 @@
/*
* Copyright OpenSearch Contributors
* SPDX-License-Identifier: Apache-2.0
*
* The OpenSearch Contributors require contributions made to
* this file be licensed under the Apache-2.0 license or a
* compatible open source license.
*/

import {Alarm, ComparisonOperator, MathExpression, Metric, TreatMissingData} from "aws-cdk-lib/aws-cloudwatch";
import { Construct } from "constructs";
import { SnsMonitors, SnsMonitorsProps } from "./snsMonitor";
import {Duration} from "aws-cdk-lib";

/**
 * Construction props for `AutomationAppSns`: the base SNS monitor props plus
 * the list of alarms to create.
 */
interface automationAppSnsProps extends SnsMonitorsProps {
  /**
   * One entry per alarm. `alertName` is used as the CloudWatch alarm name and
   * `metricName` is the metric the alarm watches.
   */
  readonly automationAppSnsAlarms: { alertName: string; metricName: string }[];
}

/**
 * SNS monitor that raises one CloudWatch alarm per configured Automation App
 * metric. Each alarm fires when the metric's 10-minute sum drops below 1,
 * i.e. when the app stopped reporting.
 */
export class AutomationAppSns extends SnsMonitors {
  private readonly automationAppSnsAlarms: Array<{ alertName: string, metricName: string}>;

  /**
   * Builds an alarm for every entry of `props.automationAppSnsAlarms`,
   * registers it in the inherited `map` under its alert name, then calls the
   * inherited `createTopic()`.
   * NOTE(review): `map` and `createTopic` live in SnsMonitors (not visible
   * here) — presumably the topic is wired to the registered alarms; confirm.
   */
  constructor(scope: Construct, id: string, props: automationAppSnsProps) {
    super(scope, id, props);
    this.automationAppSnsAlarms = props.automationAppSnsAlarms;

    for (const { alertName, metricName } of this.automationAppSnsAlarms) {
      const [alarm, alarmKey] = this.automationAppFailed(alertName, metricName);
      this.map[alarmKey] = alarm;
    }

    this.createTopic();
  }

  /**
   * Creates the "app failed" alarm for a single metric.
   *
   * @param alertName  name given to the CloudWatch alarm (also returned as the key)
   * @param metricName metric to watch inside `this.alarmNameSpace`
   * @returns the alarm together with the alert name it should be registered under
   */
  private automationAppFailed(alertName: string, metricName: string): [Alarm, string] {
    const period = Duration.minutes(10);

    // Periods without any datapoint are rewritten to 0 so that they compare
    // below the threshold instead of being skipped.
    const zeroFilledSum = new MathExpression({
      expression: "FILL(metric, 0)",
      usingMetrics: {
        metric: new Metric({
          namespace: this.alarmNameSpace,
          metricName,
          statistic: "Sum",
          period,
        }),
      },
      period,
    });

    // A single 10-minute period with a sum below 1 is enough to alarm;
    // missing data is treated as breaching as well.
    const failureAlarm = new Alarm(this, `error_alarm_${alertName}`, {
      alarmName: alertName,
      alarmDescription: "Detect GitHub Automation App failure",
      metric: zeroFilledSum,
      comparisonOperator: ComparisonOperator.LESS_THAN_THRESHOLD,
      threshold: 1,
      evaluationPeriods: 1,
      datapointsToAlarm: 1,
      treatMissingData: TreatMissingData.BREACHING,
    });

    return [failureAlarm, alertName];
  }
}

3 changes: 2 additions & 1 deletion infrastructure/lib/enums/project.ts
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@ enum Project {
RESTRICTED_PREFIX = '',
LAMBDA_PACKAGE = 'opensearch-metrics-1.0.zip',
EC2_AMI_SSM = '',
SNS_ALERT_EMAIL = 'insert@test.mail'
SNS_ALERT_EMAIL = 'insert@test.mail',
EVENT_CANARY_REPO_TARGET = '',
}
export default Project;
10 changes: 10 additions & 0 deletions infrastructure/lib/infrastructure-stack.ts
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@ import { OpenSearchWAF } from "./stacks/waf";
import { GitHubWorkflowMonitorAlarms } from "./stacks/gitHubWorkflowMonitorAlarms";
import { OpenSearchS3EventIndexWorkflowStack } from "./stacks/s3EventIndexWorkflow";
import { OpenSearchMaintainerInactivityWorkflowStack } from "./stacks/maintainerInactivityWorkflow";
import {OpenSearchEventCanaryWorkflowStack} from "./stacks/eventCanaryWorkflow";

export class InfrastructureStack extends Stack {
constructor(scope: Construct, id: string, props?: StackProps) {
Expand Down Expand Up @@ -113,6 +114,15 @@ export class InfrastructureStack extends Stack {
secretName: 'metrics-creds'
});

// Create OpenSearch Event Canary Lambda setup
const openSearchEventCanaryWorkflowStack = new OpenSearchEventCanaryWorkflowStack(app, 'OpenSearchEventCanary-Workflow', {
vpcStack: vpcStack,
lambdaPackage: Project.LAMBDA_PACKAGE,
gitHubRepoTarget: Project.EVENT_CANARY_REPO_TARGET,
gitHubAppSecret: openSearchMetricsSecretsStack.secret,
})
openSearchEventCanaryWorkflowStack.node.addDependency(vpcStack);

// Create Monitoring Dashboard

const openSearchMetricsMonitoringStack = new OpenSearchMetricsMonitoringStack(app, "OpenSearchMetrics-Monitoring", {
Expand Down
100 changes: 100 additions & 0 deletions infrastructure/lib/stacks/eventCanaryWorkflow.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,100 @@
/*
* Copyright OpenSearch Contributors
* SPDX-License-Identifier: Apache-2.0
*
* The OpenSearch Contributors require contributions made to
* this file be licensed under the Apache-2.0 license or a
* compatible open source license.
*/

import { Duration, Stack, StackProps } from "aws-cdk-lib";
import { Rule, RuleTargetInput, Schedule } from "aws-cdk-lib/aws-events";
import { SfnStateMachine } from "aws-cdk-lib/aws-events-targets";
import { Bucket } from "aws-cdk-lib/aws-s3";
import { JsonPath, StateMachine, TaskInput } from "aws-cdk-lib/aws-stepfunctions";
import { LambdaInvoke } from "aws-cdk-lib/aws-stepfunctions-tasks";
import { Construct } from 'constructs';
import { OpenSearchLambda } from "../constructs/lambda";
import { OpenSearchDomainStack } from "./opensearch";
import { VpcStack } from "./vpc";
import {Effect, ManagedPolicy, PolicyDocument, PolicyStatement, Role, ServicePrincipal} from "aws-cdk-lib/aws-iam";
import {Secret} from "aws-cdk-lib/aws-secretsmanager";

/**
 * Properties for OpenSearchEventCanaryWorkflowStack.
 */
export interface OpenSearchEventCanaryWorkflowStackProps extends StackProps {
/** VPC stack whose `vpc` and `securityGroup` are handed to the canary Lambda. */
readonly vpcStack: VpcStack;
/** File name of the Lambda zip, resolved under `../../../build/distributions/`. */
readonly lambdaPackage: string;
/** Value exported to the Lambda as the `GITHUB_REPO_TARGET` environment variable. */
readonly gitHubRepoTarget: string;
/**
 * Secret the Lambda role is allowed to read (`secretsmanager:GetSecretValue`);
 * its name and region are exported to the Lambda as `API_CREDENTIALS_SECRETS`
 * and `SECRETS_MANAGER_REGION`.
 */
readonly gitHubAppSecret: Secret;
}

/**
 * Identifiers exposed by the event canary workflow stack through its
 * `workflowComponent` property.
 */
export interface WorkflowComponent {
// Name of the state machine created by OpenSearchEventCanaryWorkflowStack.
opensearchEventCanaryWorkflowStateMachineName: string
}

/**
 * Stack that runs the GitHub event canary: a Lambda wrapped in a Step
 * Functions state machine that an EventBridge rule starts every 10 minutes.
 */
export class OpenSearchEventCanaryWorkflowStack extends Stack {
  public readonly workflowComponent: WorkflowComponent;

  /**
   * @param scope parent construct
   * @param id    construct id of this stack
   * @param props VPC stack, Lambda package name, target repo and GitHub App secret
   */
  constructor(scope: Construct, id: string, props: OpenSearchEventCanaryWorkflowStackProps) {
    super(scope, id, props);

    const { vpcStack, lambdaPackage, gitHubRepoTarget, gitHubAppSecret } = props;
    const canaryInvocation = this.createEventCanaryTask(
      this,
      vpcStack,
      lambdaPackage,
      gitHubRepoTarget,
      gitHubAppSecret,
    );

    // The state machine consists of the single Lambda invocation.
    const canaryStateMachine = new StateMachine(this, 'OpenSearchEventCanaryWorkflow', {
      stateMachineName: 'OpenSearchEventCanaryWorkflow',
      definition: canaryInvocation,
      timeout: Duration.minutes(15),
    });

    // Fire at minute 0, 10, 20, ... of every hour.
    const everyTenMinutes = Schedule.expression('cron(0/10 * * * ? *)');
    new Rule(this, 'OpenSearchEventCanaryWorkflow-Every-10mins', {
      schedule: everyTenMinutes,
      targets: [new SfnStateMachine(canaryStateMachine)],
    });

    this.workflowComponent = {
      opensearchEventCanaryWorkflowStateMachineName: canaryStateMachine.stateMachineName,
    };
  }

  /**
   * Creates the canary Lambda together with its execution role and returns the
   * Step Functions task that invokes it.
   *
   * @param scope            construct the LambdaInvoke task is created under
   * @param vpcStack         supplies the VPC and security group for the Lambda
   * @param lambdaPackage    zip file name under `build/distributions`
   * @param gitHubRepoTarget passed to the Lambda as `GITHUB_REPO_TARGET`
   * @param gitHubAppSecret  secret the Lambda may read; name and region are passed as env vars
   * @returns the LambdaInvoke task with the default retry policy attached
   */
  private createEventCanaryTask(scope: Construct, vpcStack: VpcStack, lambdaPackage: string, gitHubRepoTarget: string, gitHubAppSecret: Secret) {
    const lambdaExecutionRole = new Role(this, 'OpenSearchEventCanaryLambdaRole', {
      roleName: "OpenSearchEventCanaryLambdaRole",
      description: "OpenSearch Metrics Event Canary Lambda Execution Role",
      assumedBy: new ServicePrincipal('lambda.amazonaws.com'),
      managedPolicies: [
        ManagedPolicy.fromAwsManagedPolicyName('service-role/AWSLambdaBasicExecutionRole'),
        ManagedPolicy.fromAwsManagedPolicyName('service-role/AWSLambdaVPCAccessExecutionRole'),
      ],
    });

    // Only the one secret handed to this stack may be read by the Lambda.
    const readGitHubAppSecret = new PolicyStatement({
      effect: Effect.ALLOW,
      actions: ["secretsmanager:GetSecretValue"],
      resources: [`${gitHubAppSecret.secretFullArn}`],
    });
    lambdaExecutionRole.addToPolicy(readGitHubAppSecret);

    const canaryLambdaConstruct = new OpenSearchLambda(this, "OpenSearchMetricsEventCanaryLambdaFunction", {
      lambdaNameBase: "OpenSearchMetricsEventCanary",
      handler: "org.opensearchmetrics.lambda.EventCanaryLambda",
      lambdaZipPath: `../../../build/distributions/${lambdaPackage}`,
      vpc: vpcStack.vpc,
      securityGroup: vpcStack.securityGroup,
      role: lambdaExecutionRole,
      environment: {
        GITHUB_REPO_TARGET: gitHubRepoTarget,
        API_CREDENTIALS_SECRETS: gitHubAppSecret.secretName,
        SECRETS_MANAGER_REGION: gitHubAppSecret.env.region,
      },
    });

    // The Lambda result is discarded; only success or failure matters.
    const invokeCanary = new LambdaInvoke(scope, 'Event Canary Lambda', {
      lambdaFunction: canaryLambdaConstruct.lambda,
      resultPath: JsonPath.DISCARD,
      timeout: Duration.minutes(15),
    });
    return invokeCanary.addRetry();
  }
}
9 changes: 3 additions & 6 deletions infrastructure/lib/stacks/gitHubAutomationApp.ts
Original file line number Diff line number Diff line change
Expand Up @@ -161,15 +161,12 @@ export class GitHubAutomationApp extends Stack {
'sudo systemctl start docker',
'sudo curl -L https://github.com/docker/compose/releases/latest/download/docker-compose-$(uname -s)-$(uname -m) -o /usr/local/sbin/docker-compose',
'sudo chmod a+x /usr/local/sbin/docker-compose',
'git clone --branch 0.1.18 https://github.com/opensearch-project/automation-app.git automation-app-0.1.18',
'git clone --branch 0.3.2 https://github.com/opensearch-project/automation-app.git automation-app-0.3.2',
`aws secretsmanager get-secret-value --secret-id ${secretName} --query SecretString --output text >> automation-app-0.1.18/.env`,
'cp automation-app-0.1.18/.env automation-app-0.3.2/.env',
'cd automation-app-0.1.18/docker',
'git clone https://github.com/opensearch-project/automation-app.git --branch 0.3.6',
`aws secretsmanager get-secret-value --secret-id ${secretName} --query SecretString --output text >> automation-app/.env`,
'cd automation-app/docker',
'PORT=8080 RESOURCE_CONFIG=configs/resources/opensearch-project-resource.yml OPERATION_CONFIG=configs/operations/github-merged-pulls-monitor.yml docker-compose -p github-merged-pulls-monitor up -d',
'PORT=8081 RESOURCE_CONFIG=configs/resources/opensearch-project-resource.yml OPERATION_CONFIG=configs/operations/github-workflow-runs-monitor.yml docker-compose -p github-workflow-runs-monitor up -d',
'PORT=8082 RESOURCE_CONFIG=configs/resources/opensearch-project-only-org.yml OPERATION_CONFIG=configs/operations/github-events-to-s3.yml docker-compose -p github-events-to-s3 up -d',
'cd ../../automation-app-0.3.2/docker',
'PORT=8083 RESOURCE_CONFIG=configs/resources/opensearch-project-resource.yml OPERATION_CONFIG=configs/operations/add-meta-rfc-issues-to-os-roadmap.yml ADDITIONAL_RESOURCE_CONTEXT=true docker-compose -p github-os-roadmap up -d',
];
}
Expand Down
22 changes: 21 additions & 1 deletion infrastructure/lib/stacks/monitoringDashboard.ts
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@ import { OpenSearchLambda } from "../constructs/lambda";
import { StepFunctionSns } from "../constructs/stepFunctionSns";
import Project from "../enums/project";
import { VpcStack } from "./vpc";
import {AutomationAppSns} from "../constructs/automationAppSns";


interface OpenSearchMetricsMonitoringStackProps extends StackProps {
Expand Down Expand Up @@ -56,12 +57,13 @@ export class OpenSearchMetricsMonitoringStack extends Stack {
lambdaZipPath: `../../../build/distributions/${props.lambdaPackage}`,
role: slackLambdaRole,
environment: {
SLACK_CREDENTIALS_SECRETS: props.secrets.secretName,
API_CREDENTIALS_SECRETS: props.secrets.secretName,
SECRETS_MANAGER_REGION: props.secrets.env.region
}
});
this.snsMonitorStepFunctionExecutionsFailed();
this.snsMonitorCanaryFailed('metrics_heartbeat', `https://${Project.METRICS_HOSTED_ZONE}`, props.vpcStack);
this.snsMonitorAutomationAppFailed();
}

/**
Expand Down Expand Up @@ -117,5 +119,23 @@ export class OpenSearchMetricsMonitoringStack extends Stack {
slackLambda: this.slackLambda
});
}

/**
 * Create the SNS alarm that fires when the GitHub Event Data Lake App goes down.
 * The alarm watches the `AutomationApp_EventDataLake` metric in the
 * `GitHubLabelCanary` namespace.
 */
private snsMonitorAutomationAppFailed(): void {
  new AutomationAppSns(this, "SnsMonitors-EventDataLakeAppFailed", {
    region: this.props.region,
    accountId: this.props.account,
    alarmNameSpace: "GitHubLabelCanary",
    snsTopicName: "AutomationAppFailed",
    slackLambda: this.slackLambda,
    automationAppSnsAlarms: [
      { alertName: 'Event_data_lake_app_failed', metricName: 'AutomationApp_EventDataLake' },
    ],
  });
}
}

89 changes: 89 additions & 0 deletions infrastructure/test/event-canary-workflow-stack.test.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,89 @@
/*
* Copyright OpenSearch Contributors
* SPDX-License-Identifier: Apache-2.0
*
* The OpenSearch Contributors require contributions made to
* this file be licensed under the Apache-2.0 license or a
* compatible open source license.
*/

import { App } from "aws-cdk-lib";
import { Template } from "aws-cdk-lib/assertions";
import Project from "../lib/enums/project";
import { VpcStack } from "../lib/stacks/vpc";
import {OpenSearchEventCanaryWorkflowStack} from "../lib/stacks/eventCanaryWorkflow";
import {OpenSearchMetricsSecretsStack} from "../lib/stacks/secrets";

// Synthesizes the event canary workflow stack and checks the Lambda, the
// state machine definition and the 10-minute schedule rule in the template.
test('Event Canary Workflow Stack Test', () => {
  const cdkApp = new App();
  const testVpcStack = new VpcStack(cdkApp, 'Test-OpenSearchHealth-VPC', {});

  // Secrets Manager stack that provides the GitHub App secret.
  const secretsStack = new OpenSearchMetricsSecretsStack(cdkApp, "OpenSearchMetrics-Secrets", {
    secretName: 'metrics-creds',
  });

  const canaryStack = new OpenSearchEventCanaryWorkflowStack(cdkApp, 'OpenSearchEventCanary-Workflow', {
    vpcStack: testVpcStack,
    lambdaPackage: Project.LAMBDA_PACKAGE,
    gitHubRepoTarget: Project.EVENT_CANARY_REPO_TARGET,
    gitHubAppSecret: secretsStack.secret,
  });
  canaryStack.node.addDependency(testVpcStack);

  const synthesized = Template.fromStack(canaryStack);

  // IAM roles and the canary Lambda.
  synthesized.resourceCountIs('AWS::IAM::Role', 3);
  synthesized.resourceCountIs('AWS::Lambda::Function', 1);
  synthesized.hasResourceProperties('AWS::Lambda::Function', {
    "FunctionName": "OpenSearchMetricsEventCanaryLambda",
    "Handler": "org.opensearchmetrics.lambda.EventCanaryLambda",
  });

  // State machine: a single Lambda task with the default retry policy.
  const definitionHead = "{\"StartAt\":\"Event Canary Lambda\",\"States\":{\"Event Canary Lambda\":{\"End\":true,\"Retry\":[{\"ErrorEquals\":[\"Lambda.ClientExecutionTimeoutException\",\"Lambda.ServiceException\",\"Lambda.AWSLambdaException\",\"Lambda.SdkClientException\"],\"IntervalSeconds\":2,\"MaxAttempts\":6,\"BackoffRate\":2},{\"ErrorEquals\":[\"States.ALL\"]}],\"Type\":\"Task\",\"TimeoutSeconds\":900,\"ResultPath\":null,\"Resource\":\"arn:";
  const definitionInvoke = ":states:::lambda:invoke\",\"Parameters\":{\"FunctionName\":\"";
  const definitionTail = "\",\"Payload.$\":\"$\"}}},\"TimeoutSeconds\":900}";

  synthesized.resourceCountIs('AWS::StepFunctions::StateMachine', 1);
  synthesized.hasResourceProperties('AWS::StepFunctions::StateMachine', {
    "DefinitionString": {
      "Fn::Join": [
        "",
        [
          definitionHead,
          { "Ref": "AWS::Partition" },
          definitionInvoke,
          { "Fn::GetAtt": ["OpenSearchMetricsEventCanaryLambda358BAA07", "Arn"] },
          definitionTail,
        ],
      ],
    },
    "RoleArn": {
      "Fn::GetAtt": ["OpenSearchEventCanaryWorkflowRoleDC920D0E", "Arn"],
    },
    "StateMachineName": "OpenSearchEventCanaryWorkflow",
  });

  // EventBridge rule that starts the state machine every 10 minutes.
  const scheduledTarget = {
    "Arn": { "Ref": "OpenSearchEventCanaryWorkflowEB1017B7" },
    "Id": "Target0",
    "RoleArn": {
      "Fn::GetAtt": ["OpenSearchEventCanaryWorkflowEventsRoleA5644829", "Arn"],
    },
  };
  synthesized.resourceCountIs('AWS::Events::Rule', 1);
  synthesized.hasResourceProperties('AWS::Events::Rule', {
    "ScheduleExpression": "cron(0/10 * * * ? *)",
    "State": "ENABLED",
    "Targets": [scheduledTarget],
  });
});
Loading

0 comments on commit fa15082

Please sign in to comment.