Skip to content

Commit 27709c2

Browse files
committed
Stationary anomaly detection functions modified to call a library function, to enable customization.
1 parent a5a4806 commit 27709c2

5 files changed

+150
-70
lines changed

distribution/zip.xml

+7
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,13 @@
2222
</includes>
2323
<outputDirectory>/lib</outputDirectory>
2424
</fileSet>
25+
<fileSet>
26+
<directory>${project.basedir}/../home/lib/anomalies</directory>
27+
<includes>
28+
<include>*.py</include>
29+
</includes>
30+
<outputDirectory>/lib/anomalies</outputDirectory>
31+
</fileSet>
2532
<fileSet>
2633
<directory>${project.basedir}/../home/lib</directory>
2734
<includes>

home/lib/anomalies/__init__.py

+28
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,28 @@
1+
# Copyright © 2021 DQOps (support@dqops.com)
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# http://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
15+
#
16+
#
17+
# Licensed under the Apache License, Version 2.0 (the "License");
18+
# you may not use this file except in compliance with the License.
19+
# You may obtain a copy of the License at
20+
#
21+
# http://www.apache.org/licenses/LICENSE-2.0
22+
#
23+
# Unless required by applicable law or agreed to in writing, software
24+
# distributed under the License is distributed on an "AS IS" BASIS,
25+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
26+
# See the License for the specific language governing permissions and
27+
# limitations under the License.
28+
#
+73
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,73 @@
1+
# Copyright © 2021 DQOps (support@dqops.com)
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# http://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
#
15+
# Licensed under the Apache License, Version 2.0 (the "License");
16+
# you may not use this file except in compliance with the License.
17+
# You may obtain a copy of the License at
18+
#
19+
# http://www.apache.org/licenses/LICENSE-2.0
20+
#
21+
# Unless required by applicable law or agreed to in writing, software
22+
# distributed under the License is distributed on an "AS IS" BASIS,
23+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
24+
# See the License for the specific language governing permissions and
25+
# limitations under the License.
26+
27+
#
28+
#
29+
# Licensed under the Apache License, Version 2.0 (the "License");
30+
# you may not use this file except in compliance with the License.
31+
# You may obtain a copy of the License at
32+
#
33+
# http://www.apache.org/licenses/LICENSE-2.0
34+
#
35+
# Unless required by applicable law or agreed to in writing, software
36+
# distributed under the License is distributed on an "AS IS" BASIS,
37+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
38+
# See the License for the specific language governing permissions and
39+
# limitations under the License.
40+
#
41+
42+
from typing import Sequence
43+
import numpy as np
44+
import scipy
45+
import scipy.stats
46+
47+
48+
def detect_upper_bound_anomaly(values_above_median: Sequence[float], degrees_of_freedom: int, tail: float):
    """Estimate the upper anomaly threshold for a sample of historical values.

    The sample is modelled with a Student's t-distribution located at the
    sample median and scaled by the sample standard deviation
    (``scipy.stats.tstd``). The threshold is the ``1 - tail`` quantile of
    that distribution.

    Args:
        values_above_median: Historical values used to fit the distribution.
        degrees_of_freedom: Degrees of freedom of the t-distribution.
        tail: Probability mass left above the returned threshold.

    Returns:
        The threshold as a plain ``float``; the sample median when all values
        are identical; ``None`` when the threshold cannot be estimated
        (fewer than two values, or a non-finite median/deviation).
    """
    values_array = np.array(values_above_median, dtype=float)

    # A sample standard deviation needs at least two observations. Returning
    # None (instead of NaN) lets callers fall back explicitly, because any
    # comparison against NaN is False and would flag every readout.
    if values_array.size < 2:
        return None

    values_median = float(np.median(values_array))
    values_std = float(scipy.stats.tstd(values_array))

    # NaN/inf in the input propagate here; there is no usable threshold then.
    if not (np.isfinite(values_median) and np.isfinite(values_std)):
        return None

    if values_std == 0:
        # No spread in the sample: the bound collapses to the median itself.
        return values_median

    # Assumption: the historical data follows t-student distribution
    upper_readout_distribution = scipy.stats.t(df=degrees_of_freedom, loc=values_median,
                                               scale=values_std)
    return float(upper_readout_distribution.ppf(1 - tail))
60+
61+
62+
def detect_lower_bound_anomaly(values_below_median: Sequence[float], degrees_of_freedom: int, tail: float):
    """Estimate the lower anomaly threshold for a sample of historical values.

    The sample is modelled with a Student's t-distribution located at the
    sample median and scaled by the sample standard deviation
    (``scipy.stats.tstd``). The threshold is the ``tail`` quantile of that
    distribution.

    Args:
        values_below_median: Historical values used to fit the distribution.
        degrees_of_freedom: Degrees of freedom of the t-distribution.
        tail: Probability mass left below the returned threshold.

    Returns:
        The threshold as a plain ``float``; the sample median when all values
        are identical; ``None`` when the threshold cannot be estimated
        (fewer than two values, or a non-finite median/deviation).
    """
    values_array = np.array(values_below_median, dtype=float)

    # A sample standard deviation needs at least two observations. Returning
    # None (instead of NaN) lets callers fall back explicitly, because any
    # comparison against NaN is False and would flag every readout.
    if values_array.size < 2:
        return None

    values_median = float(np.median(values_array))
    values_std = float(scipy.stats.tstd(values_array))

    # NaN/inf in the input propagate here; there is no usable threshold then.
    if not (np.isfinite(values_median) and np.isfinite(values_std)):
        return None

    if values_std == 0:
        # No spread in the sample: the bound collapses to the median itself.
        return values_median

    # Assumption: the historical data follows t-student distribution
    lower_readout_distribution = scipy.stats.t(df=degrees_of_freedom, loc=values_median,
                                               scale=values_std)
    return float(lower_readout_distribution.ppf(tail))

home/rules/percentile/anomaly_stationary_percentile_moving_average.py

+21-35
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@
1919
import numpy as np
2020
import scipy
2121
import scipy.stats
22+
from lib.anomalies.anomaly_detection import detect_upper_bound_anomaly, detect_lower_bound_anomaly
2223

2324

2425
# rule specific parameters object, contains values received from the quality check threshold configuration
@@ -104,30 +105,22 @@ def evaluate_rule(rule_parameters: RuleExecutionRunParameters) -> RuleExecutionR
104105
if all(readout > 0 for readout in extracted):
105106
# using a 0-based calculation (scale from 0)
106107
upper_median_multiples_array = [(readout / filtered_median_float - 1.0) for readout in extracted if readout >= filtered_median_float]
107-
upper_multiples = np.array(upper_median_multiples_array, dtype=float)
108-
upper_multiples_median = np.median(upper_multiples)
109-
upper_multiples_std = scipy.stats.tstd(upper_multiples)
108+
threshold_upper_multiple = detect_upper_bound_anomaly(values_above_median=upper_median_multiples_array,
109+
degrees_of_freedom=degrees_of_freedom, tail=tail)
110110

111-
if float(upper_multiples_std) == 0:
112-
threshold_upper = filtered_median_float
113-
else:
114-
# Assumption: the historical data follows t-student distribution
115-
upper_readout_distribution = scipy.stats.t(df=degrees_of_freedom, loc=upper_multiples_median, scale=upper_multiples_std)
116-
threshold_upper_multiple = float(upper_readout_distribution.ppf(1 - tail))
111+
if threshold_upper_multiple is not None:
117112
threshold_upper = (threshold_upper_multiple + 1.0) * filtered_median_float
113+
else:
114+
threshold_upper = rule_parameters.actual_value
118115

119116
lower_median_multiples_array = [(-1.0 / (readout / filtered_median_float)) for readout in extracted if readout <= filtered_median_float if readout != 0]
120-
lower_multiples = np.array(lower_median_multiples_array, dtype=float)
121-
lower_multiples_median = np.median(lower_multiples)
122-
lower_multiples_std = scipy.stats.tstd(lower_multiples)
117+
threshold_lower_multiple = detect_lower_bound_anomaly(values_below_median=lower_median_multiples_array,
118+
degrees_of_freedom=degrees_of_freedom, tail=tail)
123119

124-
if float(lower_multiples_std) == 0:
125-
threshold_lower = filtered_median_float
126-
else:
127-
# Assumption: the historical data follows t-student distribution
128-
lower_readout_distribution = scipy.stats.t(df=degrees_of_freedom, loc=lower_multiples_median, scale=lower_multiples_std)
129-
threshold_lower_multiple = float(lower_readout_distribution.ppf(tail))
120+
if threshold_lower_multiple is not None:
130121
threshold_lower = filtered_median_float * (-1.0 / threshold_lower_multiple)
122+
else:
123+
threshold_lower = rule_parameters.actual_value
131124

132125
passed = threshold_lower <= rule_parameters.actual_value <= threshold_upper
133126

@@ -139,28 +132,21 @@ def evaluate_rule(rule_parameters: RuleExecutionRunParameters) -> RuleExecutionR
139132
else:
140133
# using unrestricted method
141134
upper_half_filtered = [readout for readout in extracted if readout >= filtered_median_float]
142-
upper_half = np.array(upper_half_filtered, dtype=float)
143-
upper_half_median = np.median(upper_half)
144-
upper_half_std = scipy.stats.tstd(upper_half)
135+
threshold_upper_result = detect_upper_bound_anomaly(values_above_median=upper_half_filtered,
136+
degrees_of_freedom=degrees_of_freedom, tail=tail)
145137

146-
if float(upper_half_std) == 0:
147-
threshold_upper = filtered_median_float
138+
if threshold_upper_result is not None:
139+
threshold_upper = threshold_upper_result
148140
else:
149-
# Assumption: the historical data follows t-student distribution
150-
upper_readout_distribution = scipy.stats.t(df=degrees_of_freedom, loc=upper_half_median, scale=upper_half_std)
151-
threshold_upper = float(upper_readout_distribution.ppf(1 - tail))
141+
threshold_upper = rule_parameters.actual_value
152142

153143
lower_half_list = [readout for readout in extracted if readout <= filtered_median_float]
154-
lower_half = np.array(lower_half_list, dtype=float)
155-
lower_half_median = np.median(lower_half)
156-
lower_half_std = scipy.stats.tstd(lower_half)
157-
158-
if float(lower_half_std) == 0:
159-
threshold_lower = filtered_median_float
144+
threshold_lower_result = detect_lower_bound_anomaly(values_below_median=lower_half_list,
145+
degrees_of_freedom=degrees_of_freedom, tail=tail)
146+
if threshold_lower_result is not None:
147+
threshold_lower = threshold_lower_result
160148
else:
161-
# Assumption: the historical data follows t-student distribution
162-
lower_readout_distribution = scipy.stats.t(df=degrees_of_freedom, loc=lower_half_median, scale=lower_half_std)
163-
threshold_lower = float(lower_readout_distribution.ppf(tail))
149+
threshold_lower = rule_parameters.actual_value
164150

165151
passed = threshold_lower <= rule_parameters.actual_value <= threshold_upper
166152

home/rules/percentile/anomaly_stationary_percentile_moving_average_30_days.py

+21-35
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@
1919
import numpy as np
2020
import scipy
2121
import scipy.stats
22+
from lib.anomalies.anomaly_detection import detect_upper_bound_anomaly, detect_lower_bound_anomaly
2223

2324

2425
# rule specific parameters object, contains values received from the quality check threshold configuration
@@ -107,30 +108,22 @@ def evaluate_rule(rule_parameters: RuleExecutionRunParameters) -> RuleExecutionR
107108
if all(readout > 0 for readout in extracted):
108109
# using a 0-based calculation (scale from 0)
109110
upper_median_multiples_array = [(readout / filtered_median_float - 1.0) for readout in extracted if readout >= filtered_median_float]
110-
upper_multiples = np.array(upper_median_multiples_array, dtype=float)
111-
upper_multiples_median = np.median(upper_multiples)
112-
upper_multiples_std = scipy.stats.tstd(upper_multiples)
111+
threshold_upper_multiple = detect_upper_bound_anomaly(values_above_median=upper_median_multiples_array,
112+
degrees_of_freedom=degrees_of_freedom, tail=tail)
113113

114-
if float(upper_multiples_std) == 0:
115-
threshold_upper = filtered_median_float
116-
else:
117-
# Assumption: the historical data follows t-student distribution
118-
upper_readout_distribution = scipy.stats.t(df=degrees_of_freedom, loc=upper_multiples_median, scale=upper_multiples_std)
119-
threshold_upper_multiple = float(upper_readout_distribution.ppf(1 - tail))
114+
if threshold_upper_multiple is not None:
120115
threshold_upper = (threshold_upper_multiple + 1.0) * filtered_median_float
116+
else:
117+
threshold_upper = rule_parameters.actual_value
121118

122119
lower_median_multiples_array = [(-1.0 / (readout / filtered_median_float)) for readout in extracted if readout <= filtered_median_float if readout != 0]
123-
lower_multiples = np.array(lower_median_multiples_array, dtype=float)
124-
lower_multiples_median = np.median(lower_multiples)
125-
lower_multiples_std = scipy.stats.tstd(lower_multiples)
120+
threshold_lower_multiple = detect_lower_bound_anomaly(values_below_median=lower_median_multiples_array,
121+
degrees_of_freedom=degrees_of_freedom, tail=tail)
126122

127-
if float(lower_multiples_std) == 0:
128-
threshold_lower = filtered_median_float
129-
else:
130-
# Assumption: the historical data follows t-student distribution
131-
lower_readout_distribution = scipy.stats.t(df=degrees_of_freedom, loc=lower_multiples_median, scale=lower_multiples_std)
132-
threshold_lower_multiple = float(lower_readout_distribution.ppf(tail))
123+
if threshold_lower_multiple is not None:
133124
threshold_lower = filtered_median_float * (-1.0 / threshold_lower_multiple)
125+
else:
126+
threshold_lower = rule_parameters.actual_value
134127

135128
passed = threshold_lower <= rule_parameters.actual_value <= threshold_upper
136129

@@ -142,28 +135,21 @@ def evaluate_rule(rule_parameters: RuleExecutionRunParameters) -> RuleExecutionR
142135
else:
143136
# using unrestricted method
144137
upper_half_filtered = [readout for readout in extracted if readout >= filtered_median_float]
145-
upper_half = np.array(upper_half_filtered, dtype=float)
146-
upper_half_median = np.median(upper_half)
147-
upper_half_std = scipy.stats.tstd(upper_half)
138+
threshold_upper_result = detect_upper_bound_anomaly(values_above_median=upper_half_filtered,
139+
degrees_of_freedom=degrees_of_freedom, tail=tail)
148140

149-
if float(upper_half_std) == 0:
150-
threshold_upper = filtered_median_float
141+
if threshold_upper_result is not None:
142+
threshold_upper = threshold_upper_result
151143
else:
152-
# Assumption: the historical data follows t-student distribution
153-
upper_readout_distribution = scipy.stats.t(df=degrees_of_freedom, loc=upper_half_median, scale=upper_half_std)
154-
threshold_upper = float(upper_readout_distribution.ppf(1 - tail))
144+
threshold_upper = rule_parameters.actual_value
155145

156146
lower_half_list = [readout for readout in extracted if readout <= filtered_median_float]
157-
lower_half = np.array(lower_half_list, dtype=float)
158-
lower_half_median = np.median(lower_half)
159-
lower_half_std = scipy.stats.tstd(lower_half)
160-
161-
if float(lower_half_std) == 0:
162-
threshold_lower = filtered_median_float
147+
threshold_lower_result = detect_lower_bound_anomaly(values_below_median=lower_half_list,
148+
degrees_of_freedom=degrees_of_freedom, tail=tail)
149+
if threshold_lower_result is not None:
150+
threshold_lower = threshold_lower_result
163151
else:
164-
# Assumption: the historical data follows t-student distribution
165-
lower_readout_distribution = scipy.stats.t(df=degrees_of_freedom, loc=lower_half_median, scale=lower_half_std)
166-
threshold_lower = float(lower_readout_distribution.ppf(tail))
152+
threshold_lower = rule_parameters.actual_value
167153

168154
passed = threshold_lower <= rule_parameters.actual_value <= threshold_upper
169155

0 commit comments

Comments
 (0)