Skip to content

Commit

Permalink
Browse files — browse the repository at this point in the history
  • Loading branch information
vuksanv committed Jun 20, 2012
2 parents 2c8459d + 9303fdc commit a1f9d33
Show file tree
Hide file tree
Showing 13 changed files with 2,644 additions and 8 deletions.
5 changes: 4 additions & 1 deletion gpu/nvidia/README
Original file line number | Diff line number | Diff line change
Expand Up @@ -26,11 +26,14 @@ The following metrics have been implemented:
* gpu_graphics_speed
* gpu_sm_speed
* gpu_mem_speed
* gpu_max_graphics_speed
* gpu_max_sm_speed
* gpu_max_mem_speed
* gpu_temp
* gpu_util
* gpu_mem_util
* gpu_mem_used
* gpu_fan
* gpu_power_usage
* gpu_power_state
* gpu_perf_state
* gpu_ecc_mode
27 changes: 24 additions & 3 deletions gpu/nvidia/conf.d/nvidia.pyconf
Original file line number | Diff line number | Diff line change
Expand Up @@ -73,9 +73,9 @@ collection_group {
}

metric {
name_match = "([\\S]+)_power_state"
name = "\\1_power_state"
title= "\\1 Power State"
name_match = "([\\S]+)_perf_state"
name = "\\1_perf_state"
title= "\\1 Performance State"
value_threshold = 1.0
}

Expand Down Expand Up @@ -124,4 +124,25 @@ collection_group {
name = "\\1_mem_total"
title = "\\1 Memory Total"
}

metric {
name_match = "([\\S]+)_max_graphics_speed"
name = "\\1_max_graphics_speed"
title = "\\1 Max Graphics Speed"
value_threshold = 1.0
}

metric {
name_match = "([\\S]+)_max_sm_speed"
name = "\\1_max_sm_speed"
title = "\\1 Max SM Speed"
value_threshold = 1.0
}

metric {
name_match = "([\\S]+)_max_mem_speed"
name = "\\1_max_mem_speed"
title = "\\1 Max Memory Speed"
value_threshold = 1.0
}
}
17 changes: 13 additions & 4 deletions gpu/nvidia/python_modules/nvidia.py
Original file line number | Diff line number | Diff line change
@@ -1,6 +1,6 @@
# NVIDIA GPU metric module using the Python bindings for NVML
#
# (C)opyright 2011 Bernard Li <bernard@vanhpc.org>
# (C)opyright 2011, 2012 Bernard Li <bernard@vanhpc.org>
# All Rights Reserved.
#
# Redistribution and use in source and binary forms, with or without
Expand Down Expand Up @@ -99,8 +99,8 @@ def gpu_device_handler(name):
except NVMLError, nvmlError:
if NVML_ERROR_NOT_SUPPORTED == nvmlError.value:
return 'N/A'
elif (metric == 'power_state'):
state = nvmlDeviceGetPowerState(gpu_device)
elif (metric == 'perf_state'):
state = nvmlDeviceGetPerformanceState(gpu_device)
try:
int(state)
return "P%s" % state
Expand All @@ -114,6 +114,12 @@ def gpu_device_handler(name):
return nvmlDeviceGetClockInfo(gpu_device, NVML_CLOCK_MEM)
elif (metric == 'power_usage'):
return nvmlDeviceGetPowerUsage(gpu_device)
elif (metric == 'max_graphics_speed'):
return nvmlDeviceGetMaxClockInfo(gpu_device, NVML_CLOCK_GRAPHICS)
elif (metric == 'max_sm_speed'):
return nvmlDeviceGetMaxClockInfo(gpu_device, NVML_CLOCK_SM)
elif (metric == 'max_mem_speed'):
return nvmlDeviceGetMaxClockInfo(gpu_device, NVML_CLOCK_MEM)
else:
print "Handler for %s not implemented, please fix in gpu_device_handler()" % metric
os._exit(1)
Expand All @@ -138,13 +144,16 @@ def metric_init(params):
build_descriptor('gpu%s_graphics_speed' % i, gpu_device_handler, default_time_max, 'uint', 'MHz', 'both', '%u', 'GPU%s Graphics Speed' % i, 'gpu')
build_descriptor('gpu%s_sm_speed' % i, gpu_device_handler, default_time_max, 'uint', 'MHz', 'both', '%u', 'GPU%s SM Speed' % i, 'gpu')
build_descriptor('gpu%s_mem_speed' % i, gpu_device_handler, default_time_max, 'uint', 'MHz', 'both', '%u', 'GPU%s Memory Speed' % i, 'gpu')
build_descriptor('gpu%s_max_graphics_speed' % i, gpu_device_handler, default_time_max, 'uint', 'MHz', 'zero', '%u', 'GPU%s Max Graphics Speed' % i, 'gpu')
build_descriptor('gpu%s_max_sm_speed' % i, gpu_device_handler, default_time_max, 'uint', 'MHz', 'zero', '%u', 'GPU%s Max SM Speed' % i, 'gpu')
build_descriptor('gpu%s_max_mem_speed' % i, gpu_device_handler, default_time_max, 'uint', 'MHz', 'zero', '%u', 'GPU%s Max Memory Speed' % i, 'gpu')
build_descriptor('gpu%s_uuid' % i, gpu_device_handler, default_time_max, 'string', '', 'zero', '%s', 'GPU%s UUID' % i, 'gpu')
build_descriptor('gpu%s_pci_id' % i, gpu_device_handler, default_time_max, 'string', '', 'zero', '%s', 'GPU%s PCI ID' % i, 'gpu')
build_descriptor('gpu%s_temp' % i, gpu_device_handler, default_time_max, 'uint', 'C', 'both', '%u', 'Temperature of GPU %s' % i, 'gpu,temp')
build_descriptor('gpu%s_mem_total' % i, gpu_device_handler, default_time_max, 'uint', 'KB', 'zero', '%u', 'GPU%s Total Memory' %i, 'gpu')
build_descriptor('gpu%s_mem_used' % i, gpu_device_handler, default_time_max, 'uint', 'KB', 'both', '%u', 'GPU%s Used Memory' %i, 'gpu')
build_descriptor('gpu%s_ecc_mode' % i, gpu_device_handler, default_time_max, 'string', '', 'zero', '%s', 'GPU%s ECC Mode' %i, 'gpu')
build_descriptor('gpu%s_power_state' % i, gpu_device_handler, default_time_max, 'string', '', 'zero', '%s', 'GPU%s Power State' %i, 'gpu')
build_descriptor('gpu%s_perf_state' % i, gpu_device_handler, default_time_max, 'string', '', 'zero', '%s', 'GPU%s Performance State' %i, 'gpu')
build_descriptor('gpu%s_util' % i, gpu_device_handler, default_time_max, 'uint', '%', 'both', '%u', 'GPU%s Utilization' %i, 'gpu')
build_descriptor('gpu%s_mem_util' % i, gpu_device_handler, default_time_max, 'uint', '%', 'both', '%u', 'GPU%s Memory Utilization' %i, 'gpu')
build_descriptor('gpu%s_fan' % i, gpu_device_handler, default_time_max, 'uint', '%', 'both', '%u', 'GPU%s Fan Speed' %i, 'gpu')
Expand Down
Loading

0 comments on commit a1f9d33

Please sign in to comment.