-
Notifications
You must be signed in to change notification settings - Fork 4
/
Copy pathcheck_AWS_CloudWatch_Alarm.php
221 lines (189 loc) · 8.95 KB
/
check_AWS_CloudWatch_Alarm.php
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
#!/usr/bin/php
<?php
// check_AWS_CloudWatch_Alarm.php
//
// By Stefan Wuensch stefan_wuensch@harvard.edu 2014-08-21
//
// Usage:
// check_AWS_CloudWatch_Alarm.php [ -h ] [ -v ] --hostName string --hostData string --serviceDescription string [ --serviceData string ] --profile string [ --help ]
//
// This Nagios plugin makes a call to AWS using the AWS command-line tools. It queries AWS for the specific
// CloudWatch Alarm that represents the Nagios Host and Service. The Alarm StateValue is used to determine
// the status of the AWS Alarm.
//
// Mapping of AWS Alarm state to Nagios state, except for AWS/ELB and AWS/ApplicationELB with HTTPCode_ELB_{4,5}XX:
// AWS CloudWatch Nagios
// ------------------------------------
// OK OK
// INSUFFICIENT_DATA Warning
// ALARM Critical
// (any other state) Unknown
//
// Mapping of AWS Alarm state to Nagios state for AWS/ELB and AWS/ApplicationELB with HTTPCode_ELB_{4,5}XX (see notes inline):
// AWS CloudWatch Nagios
// ------------------------------------
// INSUFFICIENT_DATA OK
// OK Warning
// ALARM Critical
// (any other state) Unknown
// Report all PHP errors, as plain (non-HTML) text, and pin the timezone used by date().
error_reporting( E_ALL );
ini_set( 'display_errors', true );
ini_set( 'html_errors', false );
date_default_timezone_set( 'America/New_York' );

// Debug switch: when true, the script appends diagnostics to a log file.
$debug = false;

// Pull in shared constants and helpers
// (presumably where the STATE_* exit codes are defined - confirm in utils.php).
include_once( __DIR__ . '/utils.php' );

// Short flags -h / -v take no value; every long option except --help requires one.
$commandOptions = getopt(
    "hv",
    array( "hostName:", "hostData:", "serviceDescription:", "serviceData:", "profile:", "help" )
);

// Either spelling of the help flag prints the usage text and stops with UNKNOWN.
$helpRequested = isset( $commandOptions[ "h" ] ) || isset( $commandOptions[ "help" ] );
if ( $helpRequested ) {
    usage();
    exit( STATE_UNKNOWN );
}

// These options are mandatory: each must be present and non-empty.
$requiredOptions = array( "hostName", "hostData", "serviceDescription", "profile" );
foreach ( $requiredOptions as $requiredOption ) {
    if ( isset( $commandOptions[ $requiredOption ] ) && $commandOptions[ $requiredOption ] != "" ) {
        continue;
    }
    print "Error: Missing value for " . $requiredOption . "\n";
    usage();
    exit( STATE_UNKNOWN );
}
// Break the colon-separated host arguments into their fields. array_pad() fills any
// missing trailing field with null, so a value with too few fields cannot raise the
// errors that a bare list() = preg_split() assignment used to.
$hostDataFields = array_pad( explode( ':', $commandOptions[ "hostData" ], 3 ), 3, null );
$sitename       = $hostDataFields[ 0 ];
$namespace      = $hostDataFields[ 1 ];
$dimensionsName = $hostDataFields[ 2 ];

// hostName carries two fields; its first field replaces the $sitename taken from hostData.
$hostNameFields  = array_pad( explode( ':', $commandOptions[ "hostName" ], 2 ), 2, null );
$sitename        = $hostNameFields[ 0 ];
$dimensionsValue = $hostNameFields[ 1 ];
// Optional debug logging: when $debug is set, open (append mode) a log file named after
// today's date and the AWS CLI profile, and write a header for this run.
if ( $debug ) {
    ////LOG TO FILE:
    $dateString = date( "Y-m-d" );
    $safeNamespace = str_replace( "AWS/", "", $namespace ) ; // Can't have a slash in a UNIX log file name, and 'AWS' is obvious!
    // $logFile = "/var/tmp/cloudwatch/" . $dateString . "_" . $commandOptions[ "profile" ] . "_" . $safeNamespace . ".txt"; // File name specific to namespace being monitored
    $logFile = "/var/tmp/cloudwatch/" . $dateString . "_" . $commandOptions[ "profile" ] . ".txt";

    // Any exit() call after this should be preceded by fclose( $logFH ) if $debug
    $logFH = fopen( $logFile, 'a' );
    if ( $logFH === false ) {
        // die( "message" ) would end the process with exit status 0, which Nagios reads as OK.
        // A logging failure must not make the service look healthy, so exit UNKNOWN instead.
        print "Log File Cannot Be Opened.\n" ;
        exit( STATE_UNKNOWN ) ;
    }

    fwrite( $logFH, "==============================================================================================================\n" ) ;
    fwrite( $logFH, __FILE__ . " " . date( "Y-m-d H:i:s" ) . "\n\n" ) ;

    // Enable this to see $argv in the log file.
    // fwrite( $logFH, "ARGV:\n" ) ;
    // foreach( $argv as $arg ) {
    // if ( preg_match( "/ /", $arg ) ) {
    // $arg = '"' . $arg . '"' ;
    // }
    // fwrite( $logFH, $arg . " " ) ;
    // }
    // fwrite( $logFH, "\n\nARGV as object:\n" ) ;
    // ob_start();
    // var_dump( $argv );
    // $contents = ob_get_contents();
    // ob_end_clean();
    // fwrite( $logFH, $contents . "\n" );
}
// Example that works as of 2014-08-21:
// aws cloudwatch describe-alarms-for-metric --profile hwp --metric-name Latency --namespace AWS/ELB --dimensions Name=LoadBalancerName,Value=HPACWWWPr-ElasticL-JZF3JWQ62LQC
// This was the old way to get the alarm data. This assumed that there was only one instance of an alarm with
// a particular MetricName for each "--dimensions Name="
// However, now (2015) that we need to be able to have multiple alarms with the same MetricName, we need to be
// able to query for something unique.
// $awsReadAlarmCommand = "aws cloudwatch describe-alarms-for-metric" ;
// $awsReadAlarmCommand .= " --profile " . $commandOptions[ "profile" ] ;
// $awsReadAlarmCommand .= " --metric-name " . $commandOptions[ "serviceDescription" ] ;
// $awsReadAlarmCommand .= " --namespace " . $namespace ;
// $awsReadAlarmCommand .= " --dimensions Name=" . $dimensionsName . ",Value=" . $dimensionsValue ;
// Here's the new way. This assumes that the Nagios Service name is now built from [ MetricName + ": " + AlarmName ]
// Example Service Name: "HealthyHostCount: online-learning-harvard-edu Load Balancer Healthy Instance Count 1 Minute"
// Here we will split on the ": " and take the second element as the query {item} for "describe-alarms --alarm-names {item}"
// This will always return a single Alarm, because the CloudWatch Alarm Name (AlarmName) is forced to be unique for us!
// First test to make sure we got something that contains ": " which is our mandatory delimiter
// Reject a serviceDescription that lacks the ": " separator between MetricName and AlarmName.
$hasDelimiter = preg_match( "/: /", $commandOptions[ "serviceDescription" ] );
if ( ! $hasDelimiter ) {
    $errorOut = "Error - serviceDescription expected to be made up of [ MetricName + \": \" + AlarmName ] \n";
    print $errorOut;
    if ( $debug ) {
        fwrite( $logFH, $errorOut );
        fclose( $logFH );
    }
    exit( STATE_UNKNOWN );
}

// Split once on the first ": " - everything after it is the alarm name used for the --alarm-names query.
$serviceNameParts = explode( ': ', $commandOptions[ "serviceDescription" ], 2 );
$NagiosMetricName = $serviceNameParts[ 0 ];
$NagiosAlarmName  = $serviceNameParts[ 1 ];

// A non-empty serviceData argument takes precedence over the name parsed above.
// That lets the Service Name be stripped of problem characters while the real Alarm Name
// still arrives via serviceData ("_AWS_Data" in config or "$_SERVICEAWS_DATA$" macro).
$alarmNameOverride = isset( $commandOptions[ "serviceData" ] ) ? $commandOptions[ "serviceData" ] : "";
if ( $alarmNameOverride != "" ) {
    $NagiosAlarmName = $alarmNameOverride;
}
// Build the AWS CLI call. The alarm name and profile both originate from the command line,
// and an alarm name may contain characters the shell treats specially (quotes, $, backticks),
// so each value is passed through escapeshellarg() instead of being wrapped in double quotes.
// NOTE(review): escapeshellarg() can drop non-ASCII bytes depending on the process locale -
// confirm LC_CTYPE if alarm names ever contain such characters.
$awsReadAlarmCommand = "aws cloudwatch describe-alarms" ;
$awsReadAlarmCommand .= " --alarm-names " . escapeshellarg( $NagiosAlarmName ) ;
$awsReadAlarmCommand .= " --profile " . escapeshellarg( $commandOptions[ "profile" ] ) ;

if ( $debug ) {
    fwrite( $logFH, "AWS CLI command:\n" . $awsReadAlarmCommand . "\n\n" ) ;
}

// shell_exec() yields null or false when the command fails or prints nothing. Only decode
// real output, so json_decode() is never handed a non-string value.
$awsRawOutput = shell_exec( $awsReadAlarmCommand ) ;
$CloudWatchAlarmsJSON = ( is_string( $awsRawOutput ) && $awsRawOutput !== "" )
    ? json_decode( $awsRawOutput )
    : null ;

// Dump to log file the entire decoded value we got from the CLI call.
if ( $debug && ! is_null( $CloudWatchAlarmsJSON ) ) {
    fwrite( $logFH, "AWS CLI JSON output:\n" ) ;
    fwrite( $logFH, print_r( $CloudWatchAlarmsJSON, true ) . "\n\n" ) ;
}

// Check for getting something back! Anything other than a decoded JSON object
// (no output, invalid JSON, or a bare scalar) cannot be inspected further.
if ( ! is_object( $CloudWatchAlarmsJSON ) ) {
    $errorOut = "Error - no JSON data returned from \"$awsReadAlarmCommand\" - could be a problem reaching the AWS API\n" ;
    print $errorOut ;
    if ( $debug ) {
        fwrite( $logFH, $errorOut ) ;
        fclose( $logFH ) ;
    }
    exit( STATE_UNKNOWN ) ;
}
// Exactly one alarm must come back: zero means the name matched nothing, and more than
// one would be ambiguous. A missing or non-array MetricAlarms member is counted as zero
// rather than handed to count(), which throws a TypeError for such values on PHP 8.
$metricAlarms = ( is_object( $CloudWatchAlarmsJSON )
        && isset( $CloudWatchAlarmsJSON->MetricAlarms )
        && is_array( $CloudWatchAlarmsJSON->MetricAlarms ) )
    ? $CloudWatchAlarmsJSON->MetricAlarms
    : array() ;
$metricAlarmCount = count( $metricAlarms ) ;

if ( $metricAlarmCount != 1 ) {
    $errorOut = "Error - Found " . $metricAlarmCount . " MetricAlarms from \"" . $awsReadAlarmCommand . "\"\n" ;
    print $errorOut ;
    if ( $debug ) {
        fwrite( $logFH, $errorOut ) ;
        fclose( $logFH ) ;
    }
    exit( STATE_UNKNOWN ) ;
}

// We are not doing any debug log output after this, so close out the FH.
if ( $debug ) {
    fclose( $logFH ) ;
}
// The single alarm returned by the query, and its current CloudWatch state.
$alarmInstance = $CloudWatchAlarmsJSON->MetricAlarms[ 0 ];
$stateValue    = $alarmInstance->StateValue;

// ELB / Application ELB "HTTPCode_ELB_" alarms invert the usual meaning of two states:
// no data at all means no 4xx/5xx responses were seen (good), while "OK" means some were
// seen but stayed below the alarm threshold (worth a Warning).
$isElbNamespace    = ( $namespace == "AWS/ELB" || $namespace == "AWS/ApplicationELB" );
$isElbHttpCodeRule = $isElbNamespace && preg_match( "/HTTPCode_ELB_/i", $commandOptions[ "serviceDescription" ] );

if ( $isElbHttpCodeRule ) {
    $statusForNoData = STATE_OK;
    $statusForOk     = STATE_WARNING;
} else {
    $statusForNoData = STATE_WARNING;
    $statusForOk     = STATE_OK;
}

// Anything that is not one of the three known states stays UNKNOWN.
$nagiosStatus = STATE_UNKNOWN;
if ( $stateValue == "ALARM" ) {
    $nagiosStatus = STATE_CRITICAL;
}
if ( $stateValue == "INSUFFICIENT_DATA" ) {
    $nagiosStatus = $statusForNoData;
}
if ( $stateValue == "OK" ) {
    $nagiosStatus = $statusForOk;
}

// One line of plugin output: state, alarm name, reason, and when the state last changed.
printf(
    "%s: %s: %s Last state change: %s\n",
    $alarmInstance->StateValue,
    $alarmInstance->AlarmName,
    $alarmInstance->StateReason,
    $alarmInstance->StateUpdatedTimestamp
);

exit( $nagiosStatus );
//=============================================================================
/**
 * Print the command-line synopsis for this plugin to standard output.
 *
 * Emits a "Usage:" heading followed by one line containing this script's
 * path and the accepted options. Returns nothing and does not exit.
 */
function usage() {
    $synopsis = " [ -h ] [ -v ] --hostName string --hostData string --serviceDescription string [ --serviceData string ] --profile string [ --help ]\n";
    print "Usage: \n" . __FILE__ . $synopsis;
}
//=============================================================================