From a722c377b04897ac6503d742191e52ff853c634a Mon Sep 17 00:00:00 2001 From: anish-mudaraddi Date: Mon, 12 Feb 2024 16:51:02 +0000 Subject: [PATCH 1/7] BUG: fix bugs with service-status-to-influx script update script to latest version on prod fix not adding statustext to hv when nova-compute service found --- .../tests/test_service_status_to_influx.py | 48 +++++++++++-------- .../usr/local/bin/service_status_to_influx.py | 14 +++++- 2 files changed, 42 insertions(+), 20 deletions(-) diff --git a/MonitoringTools/tests/test_service_status_to_influx.py b/MonitoringTools/tests/test_service_status_to_influx.py index 06b7a26b..f05ce32a 100644 --- a/MonitoringTools/tests/test_service_status_to_influx.py +++ b/MonitoringTools/tests/test_service_status_to_influx.py @@ -21,24 +21,29 @@ def test_get_hypervisor_properties_state_up(): """ mock_hv = { "state": "up", - "memory_size": 1, - "memory_used": 2, - "memory_free": 3, + "memory_size": 2, + "memory_used": 1, "vcpus_used": 4, "vcpus": 5, } expected_result = { "hv": { "aggregate": "no-aggregate", - "memorymax": 1, - "memoryused": 2, - "memoryavailable": 3, + "memorymax": 2, + "memoryused": 1, + "memoryavailable": 1, + "memperc": 0.5, "cpuused": 4, "cpumax": 5, "cpuavailable": 1, + "cpuperc": 0.8, "agent": 1, "state": 1, "statetext": "Up", + "utilperc": 0.8, + "cpufull": 0, + "memfull": 1, + "full": 1, } } assert get_hypervisor_properties(mock_hv) == expected_result @@ -52,24 +57,29 @@ def test_get_hypervisor_properties_state_down(): """ mock_hv = { "state": "down", - "memory_size": 1, - "memory_used": 2, - "memory_free": 3, + "memory_size": 2, + "memory_used": 1, "vcpus_used": 4, "vcpus": 5, } expected_result = { "hv": { "aggregate": "no-aggregate", - "memorymax": 1, - "memoryused": 2, - "memoryavailable": 3, + "memorymax": 2, + "memoryused": 1, + "memoryavailable": 1, + "memperc": 0.5, "cpuused": 4, "cpumax": 5, "cpuavailable": 1, + "cpuperc": 0.8, "agent": 1, "state": 0, "statetext": "Down", + "utilperc": 0.8, + "cpufull": 
0, + "memfull": 1, + "full": 1, } } assert get_hypervisor_properties(mock_hv) == expected_result @@ -328,10 +338,10 @@ def test_update_with_service_statuses(mock_get_service_properties): # stubs out actually getting properties mock_get_service_properties.side_effect = [ - {"nova-compute": {"status": "enabled"}}, + {"nova-compute": {"status": 1, "statustext": "enabled"}}, {"other-service": {}}, - {"other-service": {"status": "enabled"}}, - {"nova-compute": {"status": "disabled"}}, + {"other-service": {"status": 1, "statustext": "enabled"}}, + {"nova-compute": {"status": 0, "statustext": "disabled"}}, ] res = update_with_service_statuses(mock_conn, mock_status_details) @@ -342,16 +352,16 @@ def test_update_with_service_statuses(mock_get_service_properties): # shouldn't override what's already there # add hv status == nova-compute svc status "hv1": { - "hv": {"status": "enabled"}, - "nova-compute": {"status": "enabled"}, + "hv": {"status": 1, "statustext": "enabled"}, + "nova-compute": {"status": 1, "statustext": "enabled"}, "foo": {}, "bar": {}, "other-service": {}, }, # only nova-compute status adds hv status - "hv2": {"hv": {}, "other-service": {"status": "enabled"}}, + "hv2": {"hv": {}, "other-service": {"status": 1, "statustext": "enabled"}}, # adds what doesn't exist, no "hv" so no setting status - "hv3": {"nova-compute": {"status": "disabled"}}, + "hv3": {"nova-compute": {"status": 0, "statustext": "disabled"}}, } diff --git a/MonitoringTools/usr/local/bin/service_status_to_influx.py b/MonitoringTools/usr/local/bin/service_status_to_influx.py index 4b33309c..9e155d91 100644 --- a/MonitoringTools/usr/local/bin/service_status_to_influx.py +++ b/MonitoringTools/usr/local/bin/service_status_to_influx.py @@ -20,15 +20,24 @@ def get_hypervisor_properties(hypervisor: Hypervisor) -> Dict: "aggregate": "no-aggregate", "memorymax": hypervisor["memory_size"], "memoryused": hypervisor["memory_used"], - "memoryavailable": hypervisor["memory_free"], + "memoryavailable": 
hypervisor["memory_size"] - hypervisor["memory_used"], + "memperc": hypervisor["memory_used"] / hypervisor["memory_size"], "cpumax": hypervisor["vcpus"], "cpuused": hypervisor["vcpus_used"], "cpuavailable": hypervisor["vcpus"] - hypervisor["vcpus_used"], + "cpuperc": hypervisor["vcpus_used"] / hypervisor["vcpus"], "agent": 1, "state": 1 if hypervisor["state"] == "up" else 0, "statetext": hypervisor["state"].capitalize(), } } + hv_info = hv_prop_dict["hv"] + + hv_info["utilperc"] = max(hv_info["cpuperc"], hv_info["memperc"]) + hv_info["cpufull"] = 1 if hv_info["cpuperc"] >= 0.97 else 0 + hv_info["memfull"] = 1 if hv_info["memoryavailable"] <= 8192 else 0 + hv_info["full"] = int(hv_info["memfull"] or hv_info["cpufull"]) + return hv_prop_dict @@ -137,6 +146,9 @@ def update_with_service_statuses(conn, status_details: Dict) -> Dict: service_host.update(get_service_properties(service)) if "hv" in service_host and service["binary"] == "nova-compute": service_host["hv"]["status"] = service_host["nova-compute"]["status"] + service_host["hv"]["statustext"] = service_host["nova-compute"][ + "statustext" + ] return status_details From 2de2bef14ff1c512dac0e7e2ed2f7eb55de5bfc0 Mon Sep 17 00:00:00 2001 From: anish-mudaraddi Date: Tue, 13 Feb 2024 14:08:58 +0000 Subject: [PATCH 2/7] BUG: We should be collecting max slot info not gpu capacity this was a bug in the original script - we were storing total gpus available when we should store slots available. I think this script was used for multiple purposes and this is a holdover. 
--- MonitoringTools/tests/test_slottifier.py | 24 +++++++++++++++++++++ MonitoringTools/usr/local/bin/slottifier.py | 6 ++++-- 2 files changed, 28 insertions(+), 2 deletions(-) diff --git a/MonitoringTools/tests/test_slottifier.py b/MonitoringTools/tests/test_slottifier.py index 793f3ec7..c69d1110 100644 --- a/MonitoringTools/tests/test_slottifier.py +++ b/MonitoringTools/tests/test_slottifier.py @@ -420,6 +420,30 @@ def test_calculate_slots_on_hv_gpu_available_max(): assert res.max_gpu_slots_capacity_enabled == 5 +def test_calculate_slots_on_hv_gpu_max_slots_calculated_properly(): + """ + tests calculate_slots_on_hv calculates max slots properly for gpu flavor + """ + res = calculate_slots_on_hv( + # specifies a gpu flavor + "g-flavor1", + {"gpus_required": 2, "cores_required": 10, "mem_required": 10}, + { + "compute_service_status": "enabled", + # should find 3 slots since we require 2 gpus for each slot + "gpu_capacity": 6, + "cores_available": 100, + "mem_available": 100, + "core_capacity": 100, + "mem_capacity": 100, + }, + ) + assert res.slots_available == 3 + assert res.max_gpu_slots_capacity == 3 + assert res.estimated_gpu_slots_used == 0 + assert res.max_gpu_slots_capacity_enabled == 3 + + def test_calculate_slots_on_hv_calculates_used_gpu_capacity(): """ tests calculate_slots_on_hv calculates slots properly for gpu flavor diff --git a/MonitoringTools/usr/local/bin/slottifier.py b/MonitoringTools/usr/local/bin/slottifier.py index 30f1bf90..64dfca74 100644 --- a/MonitoringTools/usr/local/bin/slottifier.py +++ b/MonitoringTools/usr/local/bin/slottifier.py @@ -173,10 +173,12 @@ def calculate_slots_on_hv( theoretical_gpu_slots_available, estimated_slots_used ) - slots_dataclass.max_gpu_slots_capacity = hv_info["gpu_capacity"] + slots_dataclass.max_gpu_slots_capacity = theoretical_gpu_slots_available if hv_info["compute_service_status"] == "enabled": - slots_dataclass.max_gpu_slots_capacity_enabled = hv_info["gpu_capacity"] + 
slots_dataclass.max_gpu_slots_capacity_enabled = ( + theoretical_gpu_slots_available + ) slots_available = min( slots_available, From a689df98918e4ca8b7c89e65e13a27b2b9683808 Mon Sep 17 00:00:00 2001 From: anish-mudaraddi Date: Tue, 13 Feb 2024 15:26:59 +0000 Subject: [PATCH 3/7] BUG: fix issue with datastring formatting service_status_to_influx requires strings to be part of metadata --- .../tests/test_service_status_to_influx.py | 92 +++++++++++++------ .../usr/local/bin/service_status_to_influx.py | 20 ++-- 2 files changed, 76 insertions(+), 36 deletions(-) diff --git a/MonitoringTools/tests/test_service_status_to_influx.py b/MonitoringTools/tests/test_service_status_to_influx.py index f05ce32a..d306d0ab 100644 --- a/MonitoringTools/tests/test_service_status_to_influx.py +++ b/MonitoringTools/tests/test_service_status_to_influx.py @@ -180,16 +180,24 @@ def test_convert_to_data_string_one_hv_one_service(mock_get_service_prop_string) Tests convert_to_data_string works with single entry in details """ mock_instance = "prod" - mock_service_details = NonCallableMock() + mock_service_details = { + "aggregate": "ag1", + "statetext": "Up", + "statustext": "Enabled", + "prop1": "val1", + } mock_details = {"hv1": {"service1": mock_service_details}} mock_get_service_prop_string.return_value = "prop1=val1" res = convert_to_data_string(mock_instance, mock_details) assert ( - res == 'ServiceStatus,host="hv1",service="service1",instance=Prod prop1=val1\n' + res == + 'ServiceStatus,host="hv1",service="service1",instance=Prod,' + 'aggregate="ag1",statetext="Up",statustext="Enabled" ' + 'prop1=val1\n' ) - mock_get_service_prop_string.assert_called_once_with(mock_service_details) + mock_get_service_prop_string.assert_called_once_with({"prop1": "val1"}) @patch("service_status_to_influx.get_service_prop_string") @@ -198,8 +206,18 @@ def test_convert_to_data_string_one_hv_multi_service(mock_get_service_prop_strin Tests convert_to_data_string works with single entry in details with 
multiple service binaries """ mock_instance = "prod" - mock_service_details_1 = NonCallableMock() - mock_service_details_2 = NonCallableMock() + mock_service_details_1 = { + "aggregate": "ag1", + "statetext": "Up", + "statustext": "Enabled", + "prop1": "val1", + } + mock_service_details_2 = { + "aggregate": "ag2", + "statetext": "Down", + "statustext": "Disabled", + "prop1": "val2", + } mock_details = { "hv1": {"service1": mock_service_details_1, "service2": mock_service_details_2} } @@ -208,11 +226,15 @@ def test_convert_to_data_string_one_hv_multi_service(mock_get_service_prop_strin res = convert_to_data_string(mock_instance, mock_details) assert res == ( - 'ServiceStatus,host="hv1",service="service1",instance=Prod prop1=val1\n' - 'ServiceStatus,host="hv1",service="service2",instance=Prod prop1=val2\n' + 'ServiceStatus,host="hv1",service="service1",instance=Prod,' + 'aggregate="ag1",statetext="Up",statustext="Enabled" ' + 'prop1=val1\n' + 'ServiceStatus,host="hv1",service="service2",instance=Prod,' + 'aggregate="ag2",statetext="Down",statustext="Disabled" ' + 'prop1=val2\n' ) mock_get_service_prop_string.assert_has_calls( - [call(mock_service_details_1), call(mock_service_details_2)] + [call({"prop1": "val1"}), call({"prop1": "val2"})] ) @@ -222,9 +244,25 @@ def test_convert_to_data_string_multi_item(mock_get_service_prop_string): Tests convert_to_data_string works with multiple entries in dict for details """ mock_instance = "prod" - mock_service_details_1 = NonCallableMock() - mock_service_details_2 = NonCallableMock() - mock_service_details_3 = NonCallableMock() + mock_service_details_1 = { + "aggregate": "ag1", + "statetext": "Up", + "statustext": "Enabled", + "prop1": "val1", + } + mock_service_details_2 = { + "aggregate": "ag2", + "statetext": "Down", + "statustext": "Disabled", + "prop1": "val2", + } + mock_service_details_3 = { + "aggregate": "ag3", + "statetext": "Up", + "statustext": "Disabled", + "prop1": "val3", + } + mock_details = { "hv1": { 
"service1": mock_service_details_1, @@ -241,15 +279,21 @@ def test_convert_to_data_string_multi_item(mock_get_service_prop_string): res = convert_to_data_string(mock_instance, mock_details) assert res == ( - 'ServiceStatus,host="hv1",service="service1",instance=Prod prop1=val1\n' - 'ServiceStatus,host="hv1",service="service2",instance=Prod prop1=val2\n' - 'ServiceStatus,host="hv2",service="service3",instance=Prod prop1=val3\n' + 'ServiceStatus,host="hv1",service="service1",instance=Prod,' + 'aggregate="ag1",statetext="Up",statustext="Enabled" ' + 'prop1=val1\n' + 'ServiceStatus,host="hv1",service="service2",instance=Prod,' + 'aggregate="ag2",statetext="Down",statustext="Disabled" ' + 'prop1=val2\n' + 'ServiceStatus,host="hv2",service="service3",instance=Prod,' + 'aggregate="ag3",statetext="Up",statustext="Disabled" ' + 'prop1=val3\n' ) mock_get_service_prop_string.assert_has_calls( [ - call(mock_service_details_1), - call(mock_service_details_2), - call(mock_service_details_3), + call({"prop1": "val1"}), + call({"prop1": "val2"}), + call({"prop1": "val3"}), ] ) @@ -261,20 +305,10 @@ def test_get_service_prop_string_empty_dict(): assert get_service_prop_string({}) == "" -def test_get_service_prop_string_with_string_props(): - """ - tests get_service_prop_string returns correct prop string - when given string props it should not suffix each property value with i - """ - props = {"statetext": "foo", "statustext": "bar", "aggregate": "baz"} - expected_result = 'statetext="foo",statustext="bar",aggregate="baz"' - assert get_service_prop_string(props) == expected_result - - -def test_get_service_prop_string_with_int_props(): +def test_get_service_prop_string(): """ tests get_service_prop_string returns correct prop string - when given int props it should suffix each property value with i + it should suffix each property value with i """ props = {"prop1": 1, "prop2": 2, "prop3": 3} expected_result = "prop1=1i,prop2=2i,prop3=3i" diff --git 
a/MonitoringTools/usr/local/bin/service_status_to_influx.py b/MonitoringTools/usr/local/bin/service_status_to_influx.py index 9e155d91..a14db92d 100644 --- a/MonitoringTools/usr/local/bin/service_status_to_influx.py +++ b/MonitoringTools/usr/local/bin/service_status_to_influx.py @@ -89,10 +89,19 @@ def convert_to_data_string(instance: str, service_details: Dict) -> str: data_string = "" for hypervisor_name, services in service_details.items(): for service_binary, service_stats in services.items(): + aggregate = service_stats.pop("aggregate") + statustext = service_stats.pop("statustext") + statetext = service_stats.pop("statetext") + data_string += ( - f'ServiceStatus,host="{hypervisor_name}",' - f'service="{service_binary}",instance={instance.capitalize()} ' - f"{get_service_prop_string(service_stats)}\n" + f'ServiceStatus' + f',host="{hypervisor_name}"' + f',service="{service_binary}"' + f',instance={instance.capitalize()}' + f',aggregate="{aggregate}"' + f',statetext="{statetext}"' + f',statustext="{statustext}"' + f" {get_service_prop_string(service_stats)}\n" ) return data_string @@ -106,10 +115,7 @@ def get_service_prop_string(service_dict: Dict) -> str: """ stats_strings = [] for stat, val in service_dict.items(): - stats_string = f'{stat}="{val}"' - if stat not in ["statetext", "statustext", "aggregate"]: - stats_string = f"{stat}={val}i" - stats_strings.append(stats_string) + stats_strings.append(f"{stat}={val}i") return ",".join(stats_strings) From 92ed68bd65162a8874ddafc022c3c98d1f649d5b Mon Sep 17 00:00:00 2001 From: anish-mudaraddi Date: Tue, 13 Feb 2024 15:39:15 +0000 Subject: [PATCH 4/7] BUG: fix if aggregate doesn't exist --- .../usr/local/bin/service_status_to_influx.py | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/MonitoringTools/usr/local/bin/service_status_to_influx.py b/MonitoringTools/usr/local/bin/service_status_to_influx.py index a14db92d..5c7861b4 100644 --- 
a/MonitoringTools/usr/local/bin/service_status_to_influx.py +++ b/MonitoringTools/usr/local/bin/service_status_to_influx.py @@ -89,20 +89,24 @@ def convert_to_data_string(instance: str, service_details: Dict) -> str: data_string = "" for hypervisor_name, services in service_details.items(): for service_binary, service_stats in services.items(): - aggregate = service_stats.pop("aggregate") statustext = service_stats.pop("statustext") statetext = service_stats.pop("statetext") - - data_string += ( + new_data_string = ( f'ServiceStatus' f',host="{hypervisor_name}"' f',service="{service_binary}"' f',instance={instance.capitalize()}' - f',aggregate="{aggregate}"' f',statetext="{statetext}"' f',statustext="{statustext}"' - f" {get_service_prop_string(service_stats)}\n" ) + + aggregate = service_stats.pop("aggregate", None) + if aggregate: + new_data_string += f',aggregate="{aggregate}"' + + new_data_string += f" {get_service_prop_string(service_stats)}\n" + data_string += new_data_string + return data_string From 18378b49a9b314f18f0d77319c3183acb21047e8 Mon Sep 17 00:00:00 2001 From: anish-mudaraddi Date: Tue, 13 Feb 2024 17:05:23 +0000 Subject: [PATCH 5/7] BUG: fix issue with decimals influx doesn't take decimals. There is some crazy bug in the original scripts which seemed to silently ignore decimal places. I changed it to calculate actual percentages and round to the nearest percent. 
I think this is accurate enough for this --- .../tests/test_service_status_to_influx.py | 26 +++++++++---------- .../usr/local/bin/service_status_to_influx.py | 9 ++++--- 2 files changed, 18 insertions(+), 17 deletions(-) diff --git a/MonitoringTools/tests/test_service_status_to_influx.py b/MonitoringTools/tests/test_service_status_to_influx.py index d306d0ab..925af497 100644 --- a/MonitoringTools/tests/test_service_status_to_influx.py +++ b/MonitoringTools/tests/test_service_status_to_influx.py @@ -32,15 +32,15 @@ def test_get_hypervisor_properties_state_up(): "memorymax": 2, "memoryused": 1, "memoryavailable": 1, - "memperc": 0.5, + "memperc": 50, "cpuused": 4, "cpumax": 5, "cpuavailable": 1, - "cpuperc": 0.8, + "cpuperc": 80, "agent": 1, "state": 1, "statetext": "Up", - "utilperc": 0.8, + "utilperc": 80, "cpufull": 0, "memfull": 1, "full": 1, @@ -68,15 +68,15 @@ def test_get_hypervisor_properties_state_down(): "memorymax": 2, "memoryused": 1, "memoryavailable": 1, - "memperc": 0.5, + "memperc": 50, "cpuused": 4, "cpumax": 5, "cpuavailable": 1, - "cpuperc": 0.8, + "cpuperc": 80, "agent": 1, "state": 0, "statetext": "Down", - "utilperc": 0.8, + "utilperc": 80, "cpufull": 0, "memfull": 1, "full": 1, @@ -194,8 +194,8 @@ def test_convert_to_data_string_one_hv_one_service(mock_get_service_prop_string) assert ( res == 'ServiceStatus,host="hv1",service="service1",instance=Prod,' - 'aggregate="ag1",statetext="Up",statustext="Enabled" ' - 'prop1=val1\n' + 'statetext="Up",statustext="Enabled",aggregate="ag1"' + ' prop1=val1\n' ) mock_get_service_prop_string.assert_called_once_with({"prop1": "val1"}) @@ -227,10 +227,10 @@ def test_convert_to_data_string_one_hv_multi_service(mock_get_service_prop_strin res = convert_to_data_string(mock_instance, mock_details) assert res == ( 'ServiceStatus,host="hv1",service="service1",instance=Prod,' - 'aggregate="ag1",statetext="Up",statustext="Enabled" ' + 'statetext="Up",statustext="Enabled",aggregate="ag1" ' 'prop1=val1\n' 
'ServiceStatus,host="hv1",service="service2",instance=Prod,' - 'aggregate="ag2",statetext="Down",statustext="Disabled" ' + 'statetext="Down",statustext="Disabled",aggregate="ag2" ' 'prop1=val2\n' ) mock_get_service_prop_string.assert_has_calls( @@ -280,13 +280,13 @@ def test_convert_to_data_string_multi_item(mock_get_service_prop_string): res = convert_to_data_string(mock_instance, mock_details) assert res == ( 'ServiceStatus,host="hv1",service="service1",instance=Prod,' - 'aggregate="ag1",statetext="Up",statustext="Enabled" ' + 'statetext="Up",statustext="Enabled",aggregate="ag1" ' 'prop1=val1\n' 'ServiceStatus,host="hv1",service="service2",instance=Prod,' - 'aggregate="ag2",statetext="Down",statustext="Disabled" ' + 'statetext="Down",statustext="Disabled",aggregate="ag2" ' 'prop1=val2\n' 'ServiceStatus,host="hv2",service="service3",instance=Prod,' - 'aggregate="ag3",statetext="Up",statustext="Disabled" ' + 'statetext="Up",statustext="Disabled",aggregate="ag3" ' 'prop1=val3\n' ) mock_get_service_prop_string.assert_has_calls( diff --git a/MonitoringTools/usr/local/bin/service_status_to_influx.py b/MonitoringTools/usr/local/bin/service_status_to_influx.py index 5c7861b4..fc77504e 100644 --- a/MonitoringTools/usr/local/bin/service_status_to_influx.py +++ b/MonitoringTools/usr/local/bin/service_status_to_influx.py @@ -21,11 +21,11 @@ def get_hypervisor_properties(hypervisor: Hypervisor) -> Dict: "memorymax": hypervisor["memory_size"], "memoryused": hypervisor["memory_used"], "memoryavailable": hypervisor["memory_size"] - hypervisor["memory_used"], - "memperc": hypervisor["memory_used"] / hypervisor["memory_size"], + "memperc": round((hypervisor["memory_used"] / hypervisor["memory_size"]) * 100), "cpumax": hypervisor["vcpus"], "cpuused": hypervisor["vcpus_used"], "cpuavailable": hypervisor["vcpus"] - hypervisor["vcpus_used"], - "cpuperc": hypervisor["vcpus_used"] / hypervisor["vcpus"], + "cpuperc": round((hypervisor["vcpus_used"] / hypervisor["vcpus"]) * 100), "agent": 
1, "state": 1 if hypervisor["state"] == "up" else 0, "statetext": hypervisor["state"].capitalize(), @@ -34,7 +34,7 @@ def get_hypervisor_properties(hypervisor: Hypervisor) -> Dict: hv_info = hv_prop_dict["hv"] hv_info["utilperc"] = max(hv_info["cpuperc"], hv_info["memperc"]) - hv_info["cpufull"] = 1 if hv_info["cpuperc"] >= 0.97 else 0 + hv_info["cpufull"] = 1 if hv_info["cpuperc"] >= 97 else 0 hv_info["memfull"] = 1 if hv_info["memoryavailable"] <= 8192 else 0 hv_info["full"] = int(hv_info["memfull"] or hv_info["cpufull"]) @@ -203,4 +203,5 @@ def main(user_args: List): if __name__ == "__main__": - main(sys.argv[1:]) + #main(sys.argv[1:]) + print(get_all_service_statuses("prod")) From 58eeca2bc8632b2eb5d3307f356a69742d1d0c2e Mon Sep 17 00:00:00 2001 From: anish-mudaraddi Date: Tue, 13 Feb 2024 17:07:55 +0000 Subject: [PATCH 6/7] BUG: remove testing print statement --- MonitoringTools/usr/local/bin/service_status_to_influx.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/MonitoringTools/usr/local/bin/service_status_to_influx.py b/MonitoringTools/usr/local/bin/service_status_to_influx.py index fc77504e..25c433cc 100644 --- a/MonitoringTools/usr/local/bin/service_status_to_influx.py +++ b/MonitoringTools/usr/local/bin/service_status_to_influx.py @@ -203,5 +203,4 @@ def main(user_args: List): if __name__ == "__main__": - #main(sys.argv[1:]) - print(get_all_service_statuses("prod")) + main(sys.argv[1:]) From 7f8c13840a31a9e2a5b6b23302bfcb0271373f7f Mon Sep 17 00:00:00 2001 From: anish-mudaraddi Date: Thu, 15 Feb 2024 13:41:02 +0000 Subject: [PATCH 7/7] BUG: change how total gpu slots is calculated take into account cores/mem when calculating theoretical gpu slots --- MonitoringTools/usr/local/bin/slottifier.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/MonitoringTools/usr/local/bin/slottifier.py b/MonitoringTools/usr/local/bin/slottifier.py index 64dfca74..b294ceaa 100644 --- a/MonitoringTools/usr/local/bin/slottifier.py 
+++ b/MonitoringTools/usr/local/bin/slottifier.py @@ -154,8 +154,10 @@ def calculate_slots_on_hv( f"gpu flavor {flavor_name} does not have 'gpunum' metadata" ) - theoretical_gpu_slots_available = ( - hv_info["gpu_capacity"] // flavor_reqs["gpus_required"] + theoretical_gpu_slots_available = min( + hv_info["gpu_capacity"] // flavor_reqs["gpus_required"], + hv_info["core_capacity"] // flavor_reqs["cores_required"], + hv_info["mem_capacity"] // flavor_reqs["mem_required"], ) estimated_slots_used = (