From fc9378b30baea56998df7a3041c9e166759ae64b Mon Sep 17 00:00:00 2001 From: Robert Alexander Date: Mon, 7 May 2012 15:07:23 -0700 Subject: [PATCH 01/39] Add latest python bindings to module to simplify install --- gpu/nvidia/nvidia-ml-py-3.295.00/PKG-INFO | 20 + gpu/nvidia/nvidia-ml-py-3.295.00/README.txt | 139 +++ .../build/lib/nvidia_smi.py | 455 +++++++++ .../nvidia-ml-py-3.295.00/build/lib/pynvml.py | 903 ++++++++++++++++++ .../nvidia-ml-py-3.295.00/nvidia_smi.py | 455 +++++++++ gpu/nvidia/nvidia-ml-py-3.295.00/pynvml.py | 903 ++++++++++++++++++ gpu/nvidia/nvidia-ml-py-3.295.00/setup.py | 32 + 7 files changed, 2907 insertions(+) create mode 100644 gpu/nvidia/nvidia-ml-py-3.295.00/PKG-INFO create mode 100644 gpu/nvidia/nvidia-ml-py-3.295.00/README.txt create mode 100644 gpu/nvidia/nvidia-ml-py-3.295.00/build/lib/nvidia_smi.py create mode 100644 gpu/nvidia/nvidia-ml-py-3.295.00/build/lib/pynvml.py create mode 100644 gpu/nvidia/nvidia-ml-py-3.295.00/nvidia_smi.py create mode 100644 gpu/nvidia/nvidia-ml-py-3.295.00/pynvml.py create mode 100644 gpu/nvidia/nvidia-ml-py-3.295.00/setup.py diff --git a/gpu/nvidia/nvidia-ml-py-3.295.00/PKG-INFO b/gpu/nvidia/nvidia-ml-py-3.295.00/PKG-INFO new file mode 100644 index 00000000..3c0212b9 --- /dev/null +++ b/gpu/nvidia/nvidia-ml-py-3.295.00/PKG-INFO @@ -0,0 +1,20 @@ +Metadata-Version: 1.0 +Name: nvidia-ml-py +Version: 3.295.00 +Summary: Python Bindings for the NVIDIA Management Library +Home-page: http://www.nvidia.com/ +Author: NVIDIA Corporation +Author-email: nvml-bindings@nvidia.com +License: BSD +Description: UNKNOWN +Platform: UNKNOWN +Classifier: Development Status :: 5 - Production/Stable +Classifier: Intended Audience :: Developers +Classifier: Intended Audience :: System Administrators +Classifier: License :: OSI Approved :: BSD License +Classifier: Operating System :: Microsoft :: Windows +Classifier: Operating System :: POSIX :: Linux +Classifier: Programming Language :: Python +Classifier: Topic :: Software Development :: Libraries :: Python Modules +Classifier: Topic :: System :: Hardware +Classifier: Topic :: System :: Systems Administration diff --git a/gpu/nvidia/nvidia-ml-py-3.295.00/README.txt b/gpu/nvidia/nvidia-ml-py-3.295.00/README.txt new file mode 100644 index 00000000..4cfec876 --- /dev/null +++ b/gpu/nvidia/nvidia-ml-py-3.295.00/README.txt @@ -0,0 +1,139 @@ +====== +pyNVML +====== + +------------------------------------------------ +Python bindings to the NVIDIA Management Library +------------------------------------------------ + +Provides a Python interface to GPU management and monitoring functions. + +This is a wrapper around the NVML library. +For information about the NVML library, see the NVML developer page +http://developer.nvidia.com/nvidia-management-library-nvml + +Download the latest package from: +http://pypi.python.org/pypi/nvidia-ml-py/ + +Note this file can be run with 'python -m doctest -v README.txt' +although the results are system dependent + +REQUIRES +-------- +Python 2.5, or an earlier version with the ctypes module. + +INSTALLATION +------------ +sudo python setup.py install + +USAGE +----- + + >>> from pynvml import * + >>> nvmlInit() + >>> print "Driver Version:", nvmlSystemGetDriverVersion() + Driver Version: 295.00 + >>> deviceCount = nvmlDeviceGetCount() + >>> for i in range(deviceCount): + ... handle = nvmlDeviceGetHandleByIndex(i) + ... print "Device", i, ":", nvmlDeviceGetName(handle) + ... + Device 0 : Tesla C2070 + + >>> nvmlShutdown() + +Additionally, see nvidia_smi.py. 
A sample application. + +FUNCTIONS +--------- +Python methods wrap NVML functions, implemented in a C shared library. +Each function's use is the same with the following exceptions: + +- Instead of returning error codes, failing error codes are raised as + Python exceptions. + + >>> try: + ... nvmlDeviceGetCount() + ... except NVMLError as error: + ... print error + ... + Uninitialized + +- C function output parameters are returned from the corresponding + Python function left to right. + +:: + + nvmlReturn_t nvmlDeviceGetEccMode(nvmlDevice_t device, + nvmlEnableState_t *current, + nvmlEnableState_t *pending); + + >>> nvmlInit() + >>> handle = nvmlDeviceGetHandleByIndex(0) + >>> (current, pending) = nvmlDeviceGetEccMode(handle) + +- C structs are converted into Python classes. + +:: + + nvmlReturn_t DECLDIR nvmlDeviceGetMemoryInfo(nvmlDevice_t device, + nvmlMemory_t *memory); + typedef struct nvmlMemory_st { + unsigned long long total; + unsigned long long free; + unsigned long long used; + } nvmlMemory_t; + + >>> info = nvmlDeviceGetMemoryInfo(handle) + >>> print "Total memory:", info.total + Total memory: 5636292608 + >>> print "Free memory:", info.free + Free memory: 5578420224 + >>> print "Used memory:", info.used + Used memory: 57872384 + +- Python handles string buffer creation. + +:: + + nvmlReturn_t nvmlSystemGetDriverVersion(char* version, + unsigned int length); + + >>> version = nvmlSystemGetDriverVersion(); + >>> nvmlShutdown() + +For usage information see the NVML documentation. + +VARIABLES +--------- +All meaningful NVML constants and enums are exposed in Python. + +The NVML_VALUE_NOT_AVAILABLE constant is not used. Instead None is mapped to the field. + +RELEASE NOTES +------------- +Version 2.285.0 +- Added new functions for NVML 2.285. See NVML documentation for more information. +- Ported to support Python 3.0 and Python 2.0 syntax. +- Added nvidia_smi.py tool as a sample app. +Version 3.295.0 +- Added new functions for NVML 3.295. See NVML documentation for more information. +- Updated nvidia_smi.py tool + - Includes additional error handling + +COPYRIGHT +--------- +Copyright (c) 2011-2012, NVIDIA Corporation. All rights reserved. + +LICENSE +------- +Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: + +- Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. + +- Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. + +- Neither the name of the NVIDIA Corporation nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + diff --git a/gpu/nvidia/nvidia-ml-py-3.295.00/build/lib/nvidia_smi.py b/gpu/nvidia/nvidia-ml-py-3.295.00/build/lib/nvidia_smi.py new file mode 100644 index 00000000..f1a42707 --- /dev/null +++ b/gpu/nvidia/nvidia-ml-py-3.295.00/build/lib/nvidia_smi.py @@ -0,0 +1,455 @@ +##### +# Copyright (c) 2011-2012, NVIDIA Corporation. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# * Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of the NVIDIA Corporation nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +# THE POSSIBILITY OF SUCH DAMAGE. +##### + +# +# nvidia_smi +# nvml_bindings nvidia com +# +# Sample code that attempts to reproduce the output of nvidia-smi -q- x +# For many cases the output should match +# +# To Run: +# $ python +# Python 2.7 (r27:82500, Sep 16 2010, 18:02:00) +# [GCC 4.5.1 20100907 (Red Hat 4.5.1-3)] on linux2 +# Type "help", "copyright", "credits" or "license" for more information. +# >>> import nvidia_smi +# >>> print(nvidia_smi.XmlDeviceQuery()) +# ... 
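As context for the query code that follows, a minimal sketch of driving this module from a monitoring script; the output path (/tmp/gpu.xml) is an illustrative assumption, not part of the bindings:

::

    import nvidia_smi

    # XmlDeviceQuery() initializes NVML, walks every attached device,
    # and returns one XML report as a string; per-query failures are
    # embedded in the report (e.g. "N/A" for unsupported queries)
    # rather than raised, so the call is safe on hardware that lacks
    # some sensors.
    report = nvidia_smi.XmlDeviceQuery()

    # Hypothetical destination -- adjust for the local setup.
    with open('/tmp/gpu.xml', 'w') as f:
        f.write(report)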
+# + +from pynvml import * +import datetime + +# +# Helper functions +# +def GetEccByType(handle, counterType, bitType): + try: + count = str(nvmlDeviceGetTotalEccErrors(handle, bitType, counterType)) + except NVMLError as err: + count = handleError(err) + + try: + detail = nvmlDeviceGetDetailedEccErrors(handle, bitType, counterType) + deviceMemory = str(detail.deviceMemory) + registerFile = str(detail.registerFile) + l1Cache = str(detail.l1Cache) + l2Cache = str(detail.l2Cache) + except NVMLError as err: + msg = handleError(err) + deviceMemory = msg + registerFile = msg + l1Cache = msg + l2Cache = msg + strResult = '' + strResult += ' ' + deviceMemory + '\n' + strResult += ' ' + registerFile + '\n' + strResult += ' ' + l1Cache + '\n' + strResult += ' ' + l2Cache + '\n' + strResult += ' ' + count + '\n' + return strResult + +def GetEccByCounter(handle, counterType): + strResult = '' + strResult += ' \n' + strResult += str(GetEccByType(handle, counterType, NVML_SINGLE_BIT_ECC)) + strResult += ' \n' + strResult += ' \n' + strResult += str(GetEccByType(handle, counterType, NVML_DOUBLE_BIT_ECC)) + strResult += ' \n' + return strResult + +def GetEccStr(handle): + strResult = '' + strResult += ' \n' + strResult += str(GetEccByCounter(handle, NVML_VOLATILE_ECC)) + strResult += ' \n' + strResult += ' \n' + strResult += str(GetEccByCounter(handle, NVML_AGGREGATE_ECC)) + strResult += ' \n' + return strResult + +# +# Converts errors into string messages +# +def handleError(err): + if (err.value == NVML_ERROR_NOT_SUPPORTED): + return "N/A" + else: + return err.__str__() + +####### +def XmlDeviceQuery(): + + try: + # + # Initialize NVML + # + nvmlInit() + strResult = '' + + strResult += '\n' + strResult += '\n' + strResult += '\n' + + strResult += ' ' + str(datetime.date.today()) + '\n' + strResult += ' ' + str(nvmlSystemGetDriverVersion()) + '\n' + + deviceCount = nvmlDeviceGetCount() + strResult += ' ' + str(deviceCount) + '\n' + + for i in range(0, deviceCount): + handle = nvmlDeviceGetHandleByIndex(i) + + pciInfo = nvmlDeviceGetPciInfo(handle) + + strResult += ' \n' % pciInfo.busId + + strResult += ' ' + nvmlDeviceGetName(handle) + '\n' + + try: + state = ('Enabled' if (nvmlDeviceGetDisplayMode(handle) != 0) else 'Disabled') + except NVMLError as err: + state = handleError(err) + + strResult += ' ' + state + '\n' + + try: + mode = 'Enabled' if (nvmlDeviceGetPersistenceMode(handle) != 0) else 'Disabled' + except NVMLError as err: + mode = handleError(err) + + strResult += ' ' + mode + '\n' + + strResult += ' \n' + + try: + current = str(nvmlDeviceGetCurrentDriverModel(handle)) + except NVMLError as err: + current = handleError(err) + strResult += ' ' + current + '\n' + + try: + pending = str(nvmlDeviceGetPendingDriverModel(handle)) + except NVMLError as err: + pending = handleError(err) + + strResult += ' ' + pending + '\n' + + strResult += ' \n' + + try: + serial = nvmlDeviceGetSerial(handle) + except NVMLError as err: + serial = handleError(err) + + strResult += ' ' + serial + '\n' + + try: + uuid = nvmlDeviceGetUUID(handle) + except NVMLError as err: + uuid = handleError(err) + + strResult += ' ' + uuid + '\n' + + try: + vbios = nvmlDeviceGetVbiosVersion(handle) + except NVMLError as err: + vbios = handleError(err) + + strResult += ' ' + vbios + '\n' + + strResult += ' \n' + + try: + oem = nvmlDeviceGetInforomVersion(handle, NVML_INFOROM_OEM) + if oem == '': + oem = 'N/A' + except NVMLError as err: + oem = handleError(err) + + strResult += ' ' + oem + '\n' + + try: + ecc = 
nvmlDeviceGetInforomVersion(handle, NVML_INFOROM_ECC) + if ecc == '': + ecc = 'N/A' + except NVMLError as err: + ecc = handleError(err) + + strResult += ' ' + ecc + '\n' + try: + pwr = nvmlDeviceGetInforomVersion(handle, NVML_INFOROM_POWER) + if pwr == '': + pwr = 'N/A' + except NVMLError as err: + pwr = handleError(err) + + strResult += ' ' + pwr + '\n' + strResult += ' \n' + + strResult += ' \n' + strResult += ' %02X\n' % pciInfo.bus + strResult += ' %02X\n' % pciInfo.device + strResult += ' %04X\n' % pciInfo.domain + strResult += ' %08X\n' % (pciInfo.pciDeviceId) + strResult += ' %08X\n' % (pciInfo.pciSubSystemId) + strResult += ' ' + str(pciInfo.busId) + '\n' + strResult += ' \n' + + + strResult += ' \n' + + try: + gen = str(nvmlDeviceGetMaxPcieLinkGeneration(handle)) + except NVMLError as err: + gen = handleError(err) + + strResult += ' ' + gen + '\n' + + try: + gen = str(nvmlDeviceGetCurrPcieLinkGeneration(handle)) + except NVMLError as err: + gen = handleError(err) + + strResult += ' ' + gen + '\n' + strResult += ' \n' + strResult += ' \n' + + try: + width = str(nvmlDeviceGetMaxPcieLinkWidth(handle)) + 'x' + except NVMLError as err: + width = handleError(err) + + strResult += ' ' + width + '\n' + + try: + width = str(nvmlDeviceGetCurrPcieLinkWidth(handle)) + 'x' + except NVMLError as err: + width = handleError(err) + + strResult += ' ' + width + '\n' + + strResult += ' \n' + strResult += ' \n' + strResult += ' \n' + + try: + fan = str(nvmlDeviceGetFanSpeed(handle)) + ' %' + except NVMLError as err: + fan = handleError(err) + strResult += ' ' + fan + '\n' + + try: + memInfo = nvmlDeviceGetMemoryInfo(handle) + mem_total = str(memInfo.total / 1024 / 1024) + ' MB' + mem_used = str(memInfo.used / 1024 / 1024) + ' MB' + mem_free = str(memInfo.free / 1024 / 1024) + ' MB' + except NVMLError as err: + error = handleError(err) + mem_total = error + mem_used = error + mem_free = error + + strResult += ' \n' + strResult += ' ' + mem_total + '\n' + strResult += ' ' + mem_used + '\n' + strResult += ' ' + mem_free + '\n' + strResult += ' \n' + + + try: + mode = nvmlDeviceGetComputeMode(handle) + if mode == NVML_COMPUTEMODE_DEFAULT: + modeStr = 'Default' + elif mode == NVML_COMPUTEMODE_EXCLUSIVE_THREAD: + modeStr = 'Exclusive Thread' + elif mode == NVML_COMPUTEMODE_PROHIBITED: + modeStr = 'Prohibited' + elif mode == NVML_COMPUTEMODE_EXCLUSIVE_PROCESS: + modeStr = 'Exclusive Process' + else: + modeStr = 'Unknown' + except NVMLError as err: + modeStr = handleError(err) + + strResult += ' ' + modeStr + '\n' + + try: + util = nvmlDeviceGetUtilizationRates(handle) + gpu_util = str(util.gpu) + mem_util = str(util.memory) + except NVMLError as err: + error = handleError(err) + gpu_util = error + mem_util = error + + strResult += ' \n' + strResult += ' ' + gpu_util + ' %\n' + strResult += ' ' + mem_util + ' %\n' + strResult += ' \n' + + try: + (current, pending) = nvmlDeviceGetEccMode(handle) + curr_str = 'Enabled' if (current != 0) else 'Disabled' + pend_str = 'Enabled' if (pending != 0) else 'Disabled' + except NVMLError as err: + error = handleError(err) + curr_str = error + pend_str = error + + strResult += ' \n' + strResult += ' ' + curr_str + '\n' + strResult += ' ' + pend_str + '\n' + strResult += ' \n' + + strResult += ' \n' + strResult += GetEccStr(handle) + strResult += ' \n' + + try: + temp = str(nvmlDeviceGetTemperature(handle, NVML_TEMPERATURE_GPU)) + ' C' + except NVMLError as err: + temp = handleError(err) + + strResult += ' \n' + strResult += ' ' + temp + '\n' + strResult += ' \n' + + 
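Every reading in this report uses the same guard: try the NVML call and, on NVMLError, substitute the text from handleError() so a single unsupported sensor never aborts the whole document. A standalone sketch of that pattern, assuming device index 0 and only the public pynvml API:

::

    from pynvml import (nvmlInit, nvmlShutdown, nvmlDeviceGetHandleByIndex,
                        nvmlDeviceGetFanSpeed, NVMLError,
                        NVML_ERROR_NOT_SUPPORTED)

    def safe_query(query, *args):
        # Mirror handleError(): unsupported queries become "N/A",
        # any other NVML failure becomes its error string.
        try:
            return str(query(*args))
        except NVMLError as err:
            if err.value == NVML_ERROR_NOT_SUPPORTED:
                return "N/A"
            return str(err)

    nvmlInit()
    handle = nvmlDeviceGetHandleByIndex(0)
    print("fan: " + safe_query(nvmlDeviceGetFanSpeed, handle))
    nvmlShutdown()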
strResult += ' \n' + try: + perfState = nvmlDeviceGetPowerState(handle) + except NVMLError as err: + perfState = handleError(err) + strResult += ' P%s\n' % perfState + try: + powMan = nvmlDeviceGetPowerManagementMode(handle) + powManStr = 'Supported' if powMan != 0 else 'N/A' + except NVMLError as err: + powManStr = handleError(err) + strResult += ' ' + powManStr + '\n' + try: + powDraw = (nvmlDeviceGetPowerUsage(handle) / 1000.0) + powDrawStr = '%.2f W' % powDraw + except NVMLError as err: + powDrawStr = handleError(err) + strResult += ' ' + powDrawStr + '\n' + try: + powLimit = (nvmlDeviceGetPowerManagementLimit(handle) / 1000.0) + powLimitStr = '%d W' % powLimit + except NVMLError as err: + powLimitStr = handleError(err) + strResult += ' ' + powLimitStr + '\n' + strResult += ' \n' + + strResult += ' \n' + try: + graphics = str(nvmlDeviceGetClockInfo(handle, NVML_CLOCK_GRAPHICS)) + except NVMLError as err: + graphics = handleError(err) + strResult += ' ' +graphics + ' MHz\n' + try: + sm = str(nvmlDeviceGetClockInfo(handle, NVML_CLOCK_SM)) + except NVMLError as err: + sm = handleError(err) + strResult += ' ' + sm + ' MHz\n' + try: + mem = str(nvmlDeviceGetClockInfo(handle, NVML_CLOCK_MEM)) + except NVMLError as err: + mem = handleError(err) + strResult += ' ' + mem + ' MHz\n' + strResult += ' \n' + + strResult += ' \n' + try: + graphics = str(nvmlDeviceGetMaxClockInfo(handle, NVML_CLOCK_GRAPHICS)) + except NVMLError as err: + graphics = handleError(err) + strResult += ' ' + graphics + ' MHz\n' + try: + sm = str(nvmlDeviceGetMaxClockInfo(handle, NVML_CLOCK_SM)) + except NVMLError as err: + sm = handleError(err) + strResult += ' ' + sm + ' MHz\n' + try: + mem = str(nvmlDeviceGetMaxClockInfo(handle, NVML_CLOCK_MEM)) + except NVMLError as err: + mem = handleError(err) + strResult += ' ' + mem + ' MHz\n' + strResult += ' \n' + + try: + perfState = nvmlDeviceGetPowerState(handle) + perfStateStr = 'P%s' % perfState + except NVMLError as err: + perfStateStr = handleError(err) + strResult += ' ' + perfStateStr + '\n' + + strResult += ' \n' + + procstr = "" + try: + procs = nvmlDeviceGetComputeRunningProcesses(handle) + except NVMLError as err: + procs = [] + procstr = handleError(err) + + for p in procs: + procstr += ' \n' + procstr += ' %d\n' % p.pid + try: + name = str(nvmlSystemGetProcessName(p.pid)) + except NVMLError as err: + if (err.value == NVML_ERROR_NOT_FOUND): + # probably went away + continue + else: + name = handleError(err) + procstr += ' ' + name + '\n' + procstr += ' \n' + if (p.usedGpuMemory == None): + procstr += 'N\A' + else: + procstr += '%d MB\n' % (p.usedGpuMemory / 1024 / 1024) + procstr += '\n' + procstr += ' \n' + + strResult += procstr + strResult += ' \n' + strResult += ' \n' + + strResult += '\n' + + except NVMLError as err: + strResult += 'nvidia_smi.py: ' + err.__str__() + '\n' + + nvmlShutdown() + + return strResult + diff --git a/gpu/nvidia/nvidia-ml-py-3.295.00/build/lib/pynvml.py b/gpu/nvidia/nvidia-ml-py-3.295.00/build/lib/pynvml.py new file mode 100644 index 00000000..90f8bdd8 --- /dev/null +++ b/gpu/nvidia/nvidia-ml-py-3.295.00/build/lib/pynvml.py @@ -0,0 +1,903 @@ +##### +# Copyright (c) 2011-2012, NVIDIA Corporation. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# * Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. 
+# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of the NVIDIA Corporation nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +# THE POSSIBILITY OF SUCH DAMAGE. +##### + +## +# Python bindings for the NVML library +## +from ctypes import * +from ctypes.util import find_library +import sys +import threading + +## C Type mappings ## +## Enums +_nvmlEnableState_t = c_uint +NVML_FEATURE_DISABLED = 0 +NVML_FEATURE_ENABLED = 1 + +_nvmlTemperatureSensors_t = c_uint +NVML_TEMPERATURE_GPU = 0 + +_nvmlComputeMode_t = c_uint +NVML_COMPUTEMODE_DEFAULT = 0 +NVML_COMPUTEMODE_EXCLUSIVE_THREAD = 1 +NVML_COMPUTEMODE_PROHIBITED = 2 +NVML_COMPUTEMODE_EXCLUSIVE_PROCESS = 3 + +_nvmlEccBitType_t = c_uint +NVML_SINGLE_BIT_ECC = 0 +NVML_DOUBLE_BIT_ECC = 1 + +_nvmlEccCounterType_t = c_uint +NVML_VOLATILE_ECC = 0 +NVML_AGGREGATE_ECC = 1 + +_nvmlClockType_t = c_uint +NVML_CLOCK_GRAPHICS = 0 +NVML_CLOCK_SM = 1 +NVML_CLOCK_MEM = 2 + +_nvmlDriverModel_t = c_uint +NVML_DRIVER_WDDM = 0 +NVML_DRIVER_WDM = 1 + +_nvmlPstates_t = c_uint +NVML_PSTATE_0 = 0 +NVML_PSTATE_1 = 1 +NVML_PSTATE_2 = 2 +NVML_PSTATE_3 = 3 +NVML_PSTATE_4 = 4 +NVML_PSTATE_5 = 5 +NVML_PSTATE_6 = 6 +NVML_PSTATE_7 = 7 +NVML_PSTATE_8 = 8 +NVML_PSTATE_9 = 9 +NVML_PSTATE_10 = 10 +NVML_PSTATE_11 = 11 +NVML_PSTATE_12 = 12 +NVML_PSTATE_13 = 13 +NVML_PSTATE_14 = 14 +NVML_PSTATE_15 = 15 +NVML_PSTATE_UNKNOWN = 32 + +_nvmlInforomObject_t = c_uint +NVML_INFOROM_OEM = 0 +NVML_INFOROM_ECC = 1 +NVML_INFOROM_POWER = 2 + +_nvmlReturn_t = c_uint +NVML_SUCCESS = 0 +NVML_ERROR_UNINITIALIZED = 1 +NVML_ERROR_INVALID_ARGUMENT = 2 +NVML_ERROR_NOT_SUPPORTED = 3 +NVML_ERROR_NO_PERMISSION = 4 +NVML_ERROR_ALREADY_INITIALIZED = 5 +NVML_ERROR_NOT_FOUND = 6 +NVML_ERROR_INSUFFICIENT_SIZE = 7 +NVML_ERROR_INSUFFICIENT_POWER = 8 +NVML_ERROR_DRIVER_NOT_LOADED = 9 +NVML_ERROR_TIMEOUT = 10, +NVML_ERROR_UNKNOWN = 999 + +_nvmlFanState_t = c_uint +NVML_FAN_NORMAL = 0 +NVML_FAN_FAILED = 1 + +_nvmlLedColor_t = c_uint +NVML_LED_COLOR_GREEN = 0 +NVML_LED_COLOR_AMBER = 1 + +# C preprocessor defined values +nvmlFlagDefault = 0 +nvmlFlagForce = 1 + +# buffer size +NVML_DEVICE_INFOROM_VERSION_BUFFER_SIZE = 16 +NVML_DEVICE_UUID_BUFFER_SIZE = 80 +NVML_SYSTEM_DRIVER_VERSION_BUFFER_SIZE = 81 +NVML_SYSTEM_NVML_VERSION_BUFFER_SIZE = 80 +NVML_DEVICE_NAME_BUFFER_SIZE = 64 +NVML_DEVICE_SERIAL_BUFFER_SIZE = 30 +NVML_DEVICE_VBIOS_VERSION_BUFFER_SIZE = 32 + +NVML_VALUE_NOT_AVAILABLE_ulonglong = c_ulonglong(-1) + +## Lib loading ## +nvmlLib = None +libLoadLock = threading.Lock() + +## Error 
Checking ## +class NVMLError(Exception): + def __init__(self, value): + self.value = value + def __str__(self): + return str(nvmlErrorString(self.value)) + +def _nvmlCheckReturn(ret): + if (ret != NVML_SUCCESS): + raise NVMLError(ret) + return ret + +## Function access ## +def _nvmlGetFunctionPointer(name): + global nvmlLib + global libLoadLock + + libLoadLock.acquire() + try: + # ensure library was loaded + if (nvmlLib == None): + raise NVMLError(NVML_ERROR_UNINITIALIZED) + try: + return getattr(nvmlLib, name) + except AttributeError as attrError: + raise NVMLError(NVML_ERROR_NOT_SUPPORTED) + finally: + # lock is always freed + libLoadLock.release() + +## Alternative object +# Allows the object to be printed +# Allows mismatched types to be assigned +# - like None when the Structure variant requires c_uint +class nvmlFriendlyObject(object): + def __init__(self, dictionary): + for x in dictionary: + setattr(self, x, dictionary[x]) + def __str__(self): + return self.__dict__.__str__() + +def nvmlStructToFriendlyObject(struct): + d = {} + for x in struct._fields_: + key = x[0] + value = getattr(struct, key) + d[key] = value + obj = nvmlFriendlyObject(d) + return obj + +# pack the object so it can be passed to the NVML library +def nvmlFriendlyObjectToStruct(obj, model): + for x in model._fields_: + key = x[0] + value = obj.__dict__[key] + setattr(model, key, value) + return model + +## Unit structures +class struct_c_nvmlUnit_t(Structure): + pass # opaque handle +c_nvmlUnit_t = POINTER(struct_c_nvmlUnit_t) + +class c_nvmlUnitInfo_t(Structure): + _fields_ = [ + ('name', c_char * 96), + ('id', c_char * 96), + ('serial', c_char * 96), + ('firmwareVersion', c_char * 96), + ] + +class c_nvmlLedState_t(Structure): + _fields_ = [ + ('cause', c_char * 256), + ('color', _nvmlLedColor_t), + ] + +class c_nvmlPSUInfo_t(Structure): + _fields_ = [ + ('state', c_char * 256), + ('current', c_uint), + ('voltage', c_uint), + ('power', c_uint), + ] + +class c_nvmlUnitFanInfo_t(Structure): + _fields_ = [ + ('speed', c_uint), + ('state', _nvmlFanState_t), + ] + +class c_nvmlUnitFanSpeeds_t(Structure): + _fields_ = [ + ('fans', c_nvmlUnitFanInfo_t * 24), + ('count', c_uint) + ] + +## Device structures +class struct_c_nvmlDevice_t(Structure): + pass # opaque handle +c_nvmlDevice_t = POINTER(struct_c_nvmlDevice_t) + +class nvmlPciInfo_t(Structure): + _fields_ = [ + ('busId', c_char * 16), + ('domain', c_uint), + ('bus', c_uint), + ('device', c_uint), + ('pciDeviceId', c_uint), + + # Added in 2.285 + ('pciSubSystemId', c_uint), + ('reserved0', c_uint), + ('reserved1', c_uint), + ('reserved2', c_uint), + ('reserved3', c_uint), + ] + +class c_nvmlMemory_t(Structure): + _fields_ = [ + ('total', c_ulonglong), + ('free', c_ulonglong), + ('used', c_ulonglong), + ] + +# On Windows with the WDDM driver, usedGpuMemory is reported as None +# Code that processes this structure should check for None, I.E. 
+# +# if (info.usedGpuMemory == None): +# # TODO handle the error +# pass +# else: +# print("Using %d MB of memory" % (info.usedGpuMemory / 1024 / 1024)) +# +# See NVML documentation for more information +class c_nvmlProcessInfo_t(Structure): + _fields_ = [ + ('pid', c_uint), + ('usedGpuMemory', c_ulonglong), + ] + +class c_nvmlEccErrorCounts_t(Structure): + _fields_ = [ + ('l1Cache', c_ulonglong), + ('l2Cache', c_ulonglong), + ('deviceMemory', c_ulonglong), + ('registerFile', c_ulonglong), + ] + +class c_nvmlUtilization_t(Structure): + _fields_ = [ + ('gpu', c_uint), + ('memory', c_uint), + ] + +# Added in 2.285 +class c_nvmlHwbcEntry_t(Structure): + _fields_ = [ + ('hwbcId', c_uint), + ('firmwareVersion', c_char * 32), + ] + +## Event structures +class struct_c_nvmlEventSet_t(Structure): + pass # opaque handle +c_nvmlEventSet_t = POINTER(struct_c_nvmlEventSet_t) + +nvmlEventTypeSingleBitEccError = 0x0000000000000001 +nvmlEventTypeDoubleBitEccError = 0x0000000000000002 +nvmlEventTypePState = 0x0000000000000004 +nvmlEventTypeXidCriticalError = 0x0000000000000008 +nvmlEventTypeNone = 0x0000000000000000 +nvmlEventTypeAll = ( + nvmlEventTypeNone | + nvmlEventTypeSingleBitEccError | + nvmlEventTypeDoubleBitEccError | + nvmlEventTypePState | + nvmlEventTypeXidCriticalError + ) + +class c_nvmlEventData_t(Structure): + _fields_ = [ + ('device', c_nvmlDevice_t), + ('eventType', c_ulonglong), + ('reserved', c_ulonglong) + ] + +## C function wrappers ## +def nvmlInit(): + global nvmlLib + global libLoadLock + + # + # Load the library if it isn't loaded already + # + if (nvmlLib == None): + # lock to ensure only one caller loads the library + libLoadLock.acquire() + + try: + # ensure the library still isn't loaded + if (nvmlLib == None): + try: + if (sys.platform[:3] == "win"): + # cdecl calling convention + nvmlLib = cdll.nvml + else: + # assume linux + nvmlLib = CDLL("libnvidia-ml.so") + except OSError as ose: + print(ose) + _nvmlCheckReturn(NVML_ERROR_DRIVER_NOT_LOADED) + if (nvmlLib == None): + print("Failed to load NVML") + _nvmlCheckReturn(NVML_ERROR_DRIVER_NOT_LOADED) + finally: + # lock is always freed + libLoadLock.release() + + # + # Initialize the library + # + fn = _nvmlGetFunctionPointer("nvmlInit") + ret = fn() + _nvmlCheckReturn(ret) + return None + +def nvmlShutdown(): + # + # Leave the library loaded, but shutdown the interface + # + fn = _nvmlGetFunctionPointer("nvmlShutdown") + ret = fn() + _nvmlCheckReturn(ret) + return None + +# Added in 2.285 +def nvmlErrorString(result): + fn = _nvmlGetFunctionPointer("nvmlErrorString") + fn.restype = c_char_p # otherwise return is an int + ret = fn(result) + return ret + +# Added in 2.285 +def nvmlSystemGetNVMLVersion(): + c_version = create_string_buffer(NVML_SYSTEM_NVML_VERSION_BUFFER_SIZE) + fn = _nvmlGetFunctionPointer("nvmlSystemGetNVMLVersion") + ret = fn(c_version, c_uint(NVML_SYSTEM_NVML_VERSION_BUFFER_SIZE)) + _nvmlCheckReturn(ret) + return c_version.value + +# Added in 2.285 +def nvmlSystemGetProcessName(pid): + c_name = create_string_buffer(1024) + fn = _nvmlGetFunctionPointer("nvmlSystemGetProcessName") + ret = fn(c_uint(pid), c_name, c_uint(1024)) + _nvmlCheckReturn(ret) + return c_name.value + +def nvmlSystemGetDriverVersion(): + c_version = create_string_buffer(NVML_SYSTEM_DRIVER_VERSION_BUFFER_SIZE) + fn = _nvmlGetFunctionPointer("nvmlSystemGetDriverVersion") + ret = fn(c_version, c_uint(NVML_SYSTEM_DRIVER_VERSION_BUFFER_SIZE)) + _nvmlCheckReturn(ret) + return c_version.value + +# Added in 2.285 +def nvmlSystemGetHicVersion(): 
+ c_count = c_uint(0) + hics = None + fn = _nvmlGetFunctionPointer("nvmlSystemGetHicVersion") + + # get the count + ret = fn(byref(c_count), None) + + # this should only fail with insufficient size + if ((ret != NVML_SUCCESS) and + (ret != NVML_ERROR_INSUFFICIENT_SIZE)): + raise NVMLError(ret) + + # if there are no hics + if (c_count.value == 0): + return [] + + hic_array = c_nvmlHwbcEntry_t * c_count.value + hics = hic_array() + ret = fn(byref(c_count), hics) + _nvmlCheckReturn(ret) + return hics + +## Unit get functions +def nvmlUnitGetCount(): + c_count = c_uint() + fn = _nvmlGetFunctionPointer("nvmlUnitGetCount") + ret = fn(byref(c_count)) + _nvmlCheckReturn(ret) + return c_count.value + +def nvmlUnitGetHandleByIndex(index): + c_index = c_uint(index) + unit = c_nvmlUnit_t() + fn = _nvmlGetFunctionPointer("nvmlUnitGetHandleByIndex") + ret = fn(c_index, byref(unit)) + _nvmlCheckReturn(ret) + return unit + +def nvmlUnitGetUnitInfo(unit): + c_info = c_nvmlUnitInfo_t() + fn = _nvmlGetFunctionPointer("nvmlUnitGetUnitInfo") + ret = fn(unit, byref(c_info)) + _nvmlCheckReturn(ret) + return c_info + +def nvmlUnitGetLedState(unit): + c_state = c_nvmlLedState_t() + fn = _nvmlGetFunctionPointer("nvmlUnitGetLedState") + ret = fn(unit, byref(c_state)) + _nvmlCheckReturn(ret) + return c_state + +def nvmlUnitGetPsuInfo(unit): + c_info = c_nvmlPSUInfo_t() + fn = _nvmlGetFunctionPointer("nvmlUnitGetPsuInfo") + ret = fn(unit, byref(c_info)) + _nvmlCheckReturn(ret) + return c_info + +def nvmlUnitGetTemperature(unit, type): + c_temp = c_uint() + fn = _nvmlGetFunctionPointer("nvmlUnitGetTemperature") + ret = fn(unit, c_uint(type), byref(c_temp)) + _nvmlCheckReturn(ret) + return c_temp.value + +def nvmlUnitGetFanSpeedInfo(unit): + c_speeds = c_nvmlUnitFanSpeeds_t() + fn = _nvmlGetFunctionPointer("nvmlUnitGetFanSpeedInfo") + ret = fn(unit, byref(c_speeds)) + _nvmlCheckReturn(ret) + return c_speeds + +# added to API +def nvmlUnitGetDeviceCount(unit): + c_count = c_uint(0) + # query the unit to determine device count + fn = _nvmlGetFunctionPointer("nvmlUnitGetDevices") + ret = fn(unit, byref(c_count), None) + if (ret == NVML_ERROR_INSUFFICIENT_SIZE): + ret = NVML_ERROR_SUCCESS + _nvmlCheckReturn(ret) + return c_count.value + +def nvmlUnitGetDevices(unit): + c_count = c_uint(nvmlUnitGetDeviceCount(unit)) + device_array = c_nvmlDevice_t * c_count.value + c_devices = device_array() + fn = _nvmlGetFunctionPointer("nvmlUnitGetDevices") + ret = fn(unit, byref(c_count), c_devices) + _nvmlCheckReturn(ret) + return c_devices + +## Device get functions +def nvmlDeviceGetCount(): + c_count = c_uint() + fn = _nvmlGetFunctionPointer("nvmlDeviceGetCount") + ret = fn(byref(c_count)) + _nvmlCheckReturn(ret) + return c_count.value + +def nvmlDeviceGetHandleByIndex(index): + c_index = c_uint(index) + device = c_nvmlDevice_t() + fn = _nvmlGetFunctionPointer("nvmlDeviceGetHandleByIndex") + ret = fn(c_index, byref(device)) + _nvmlCheckReturn(ret) + return device + +def nvmlDeviceGetHandleBySerial(serial): + c_serial = c_char_p(serial) + device = c_nvmlDevice_t() + fn = _nvmlGetFunctionPointer("nvmlDeviceGetHandleBySerial") + ret = fn(c_serial, byref(device)) + _nvmlCheckReturn(ret) + return device + +def nvmlDeviceGetHandleByUUID(uuid): + c_uuid = c_char_p(uuid) + device = c_nvmlDevice_t() + fn = _nvmlGetFunctionPointer("nvmlDeviceGetHandleByUUID") + ret = fn(c_uuid, byref(device)) + _nvmlCheckReturn(ret) + return device + +def nvmlDeviceGetHandleByPciBusId(pciBusId): + c_busId = c_char_p(pciBusId) + device = c_nvmlDevice_t() + fn = 
_nvmlGetFunctionPointer("nvmlDeviceGetHandleByPciBusId") + ret = fn(c_busId, byref(device)) + _nvmlCheckReturn(ret) + return device + +def nvmlDeviceGetName(handle): + c_name = create_string_buffer(NVML_DEVICE_NAME_BUFFER_SIZE) + fn = _nvmlGetFunctionPointer("nvmlDeviceGetName") + ret = fn(handle, c_name, c_uint(NVML_DEVICE_NAME_BUFFER_SIZE)) + _nvmlCheckReturn(ret) + return c_name.value + +def nvmlDeviceGetSerial(handle): + c_serial = create_string_buffer(NVML_DEVICE_SERIAL_BUFFER_SIZE) + fn = _nvmlGetFunctionPointer("nvmlDeviceGetSerial") + ret = fn(handle, c_serial, c_uint(NVML_DEVICE_SERIAL_BUFFER_SIZE)) + _nvmlCheckReturn(ret) + return c_serial.value + +def nvmlDeviceGetUUID(handle): + c_uuid = create_string_buffer(NVML_DEVICE_UUID_BUFFER_SIZE) + fn = _nvmlGetFunctionPointer("nvmlDeviceGetUUID") + ret = fn(handle, c_uuid, c_uint(NVML_DEVICE_UUID_BUFFER_SIZE)) + _nvmlCheckReturn(ret) + return c_uuid.value + +def nvmlDeviceGetInforomVersion(handle, infoRomObject): + c_version = create_string_buffer(NVML_DEVICE_INFOROM_VERSION_BUFFER_SIZE) + fn = _nvmlGetFunctionPointer("nvmlDeviceGetInforomVersion") + ret = fn(handle, _nvmlInforomObject_t(infoRomObject), + c_version, c_uint(NVML_DEVICE_INFOROM_VERSION_BUFFER_SIZE)) + _nvmlCheckReturn(ret) + return c_version.value + +def nvmlDeviceGetDisplayMode(handle): + c_mode = _nvmlEnableState_t() + fn = _nvmlGetFunctionPointer("nvmlDeviceGetDisplayMode") + ret = fn(handle, byref(c_mode)) + _nvmlCheckReturn(ret) + return c_mode.value + +def nvmlDeviceGetPersistenceMode(handle): + c_state = _nvmlEnableState_t() + fn = _nvmlGetFunctionPointer("nvmlDeviceGetPersistenceMode") + ret = fn(handle, byref(c_state)) + _nvmlCheckReturn(ret) + return c_state.value + +def nvmlDeviceGetPciInfo(handle): + c_info = nvmlPciInfo_t() + fn = _nvmlGetFunctionPointer("nvmlDeviceGetPciInfo_v2") + ret = fn(handle, byref(c_info)) + _nvmlCheckReturn(ret) + return c_info + +def nvmlDeviceGetClockInfo(handle, type): + c_clock = c_uint() + fn = _nvmlGetFunctionPointer("nvmlDeviceGetClockInfo") + ret = fn(handle, _nvmlClockType_t(type), byref(c_clock)) + _nvmlCheckReturn(ret) + return c_clock.value + +# Added in 2.285 +def nvmlDeviceGetMaxClockInfo(handle, type): + c_clock = c_uint() + fn = _nvmlGetFunctionPointer("nvmlDeviceGetMaxClockInfo") + ret = fn(handle, _nvmlClockType_t(type), byref(c_clock)) + _nvmlCheckReturn(ret) + return c_clock.value + +def nvmlDeviceGetFanSpeed(handle): + c_speed = c_uint() + fn = _nvmlGetFunctionPointer("nvmlDeviceGetFanSpeed") + ret = fn(handle, byref(c_speed)) + _nvmlCheckReturn(ret) + return c_speed.value + +def nvmlDeviceGetTemperature(handle, sensor): + c_temp = c_uint() + fn = _nvmlGetFunctionPointer("nvmlDeviceGetTemperature") + ret = fn(handle, _nvmlTemperatureSensors_t(sensor), byref(c_temp)) + _nvmlCheckReturn(ret) + return c_temp.value + +# DEPRECATED use nvmlDeviceGetPerformanceState +def nvmlDeviceGetPowerState(handle): + c_pstate = _nvmlPstates_t() + fn = _nvmlGetFunctionPointer("nvmlDeviceGetPowerState") + ret = fn(handle, byref(c_pstate)) + _nvmlCheckReturn(ret) + return c_pstate.value + +def nvmlDeviceGetPerformanceState(handle): + c_pstate = _nvmlPstates_t() + fn = _nvmlGetFunctionPointer("nvmlDeviceGetPerformanceState") + ret = fn(handle, byref(c_pstate)) + _nvmlCheckReturn(ret) + return c_pstate.value + +def nvmlDeviceGetPowerManagementMode(handle): + c_pcapMode = _nvmlEnableState_t() + fn = _nvmlGetFunctionPointer("nvmlDeviceGetPowerManagementMode") + ret = fn(handle, byref(c_pcapMode)) + _nvmlCheckReturn(ret) + return 
c_pcapMode.value + +def nvmlDeviceGetPowerManagementLimit(handle): + c_limit = c_uint() + fn = _nvmlGetFunctionPointer("nvmlDeviceGetPowerManagementLimit") + ret = fn(handle, byref(c_limit)) + _nvmlCheckReturn(ret) + return c_limit.value + +def nvmlDeviceGetPowerUsage(handle): + c_watts = c_uint() + fn = _nvmlGetFunctionPointer("nvmlDeviceGetPowerUsage") + ret = fn(handle, byref(c_watts)) + _nvmlCheckReturn(ret) + return c_watts.value + +def nvmlDeviceGetMemoryInfo(handle): + c_memory = c_nvmlMemory_t() + fn = _nvmlGetFunctionPointer("nvmlDeviceGetMemoryInfo") + ret = fn(handle, byref(c_memory)) + _nvmlCheckReturn(ret) + return c_memory + +def nvmlDeviceGetComputeMode(handle): + c_mode = _nvmlComputeMode_t() + fn = _nvmlGetFunctionPointer("nvmlDeviceGetComputeMode") + ret = fn(handle, byref(c_mode)) + _nvmlCheckReturn(ret) + return c_mode.value + +def nvmlDeviceGetEccMode(handle): + c_currState = _nvmlEnableState_t() + c_pendingState = _nvmlEnableState_t() + fn = _nvmlGetFunctionPointer("nvmlDeviceGetEccMode") + ret = fn(handle, byref(c_currState), byref(c_pendingState)) + _nvmlCheckReturn(ret) + return [c_currState.value, c_pendingState.value] + +# added to API +def nvmlDeviceGetCurrentEccMode(handle): + return nvmlDeviceGetEccMode(handle)[0] + +# added to API +def nvmlDeviceGetPendingEccMode(handle): + return nvmlDeviceGetEccMode(handle)[1] + +def nvmlDeviceGetTotalEccErrors(handle, bitType, counterType): + c_count = c_ulonglong() + fn = _nvmlGetFunctionPointer("nvmlDeviceGetTotalEccErrors") + ret = fn(handle, _nvmlEccBitType_t(bitType), + _nvmlEccCounterType_t(counterType), byref(c_count)) + _nvmlCheckReturn(ret) + return c_count.value + +def nvmlDeviceGetDetailedEccErrors(handle, bitType, counterType): + c_count = c_nvmlEccErrorCounts_t() + fn = _nvmlGetFunctionPointer("nvmlDeviceGetDetailedEccErrors") + ret = fn(handle, _nvmlEccBitType_t(bitType), + _nvmlEccCounterType_t(counterType), byref(c_count)) + _nvmlCheckReturn(ret) + return c_count + +def nvmlDeviceGetUtilizationRates(handle): + c_util = c_nvmlUtilization_t() + fn = _nvmlGetFunctionPointer("nvmlDeviceGetUtilizationRates") + ret = fn(handle, byref(c_util)) + _nvmlCheckReturn(ret) + return c_util + +def nvmlDeviceGetDriverModel(handle): + c_currModel = _nvmlDriverModel_t() + c_pendingModel = _nvmlDriverModel_t() + fn = _nvmlGetFunctionPointer("nvmlDeviceGetDriverModel") + ret = fn(handle, byref(c_currModel), byref(c_pendingModel)) + _nvmlCheckReturn(ret) + return [c_currModel.value, c_pendingModel.value] + +# added to API +def nvmlDeviceGetCurrentDriverModel(handle): + return nvmlDeviceGetDriverModel(handle)[0] + +# added to API +def nvmlDeviceGetPendingDriverModel(handle): + return nvmlDeviceGetDriverModel(handle)[1] + +# Added in 2.285 +def nvmlDeviceGetVbiosVersion(handle): + c_version = create_string_buffer(NVML_DEVICE_VBIOS_VERSION_BUFFER_SIZE) + fn = _nvmlGetFunctionPointer("nvmlDeviceGetVbiosVersion") + ret = fn(handle, c_version, c_uint(NVML_DEVICE_VBIOS_VERSION_BUFFER_SIZE)) + _nvmlCheckReturn(ret) + return c_version.value + +# Added in 2.285 +def nvmlDeviceGetComputeRunningProcesses(handle): + # first call to get the size + c_count = c_uint(0) + fn = _nvmlGetFunctionPointer("nvmlDeviceGetComputeRunningProcesses") + ret = fn(handle, byref(c_count), None) + + if (ret == NVML_SUCCESS): + # special case, no running processes + return [] + elif (ret == NVML_ERROR_INSUFFICIENT_SIZE): + # typical case + # oversize the array incase more processes are created + c_count.value = c_count.value * 2 + 5 + proc_array = 
c_nvmlProcessInfo_t * c_count.value + c_procs = proc_array() + + # make the call again + ret = fn(handle, byref(c_count), c_procs) + _nvmlCheckReturn(ret) + + procs = [] + for i in range(c_count.value): + # use an alternative struct for this object + obj = nvmlStructToFriendlyObject(c_procs[i]) + if (obj.usedGpuMemory == NVML_VALUE_NOT_AVAILABLE_ulonglong.value): + # special case for WDDM on Windows, see comment above + obj.usedGpuMemory = None + procs.append(obj) + + return procs + else: + # error case + raise NVMLError(ret) + +## Set functions +def nvmlUnitSetLedState(unit, color): + fn = _nvmlGetFunctionPointer("nvmlUnitSetLedState") + ret = fn(unit, _nvmlLedColor_t(color)) + _nvmlCheckReturn(ret) + return None + +def nvmlDeviceSetPersistenceMode(handle, mode): + fn = _nvmlGetFunctionPointer("nvmlDeviceSetPersistenceMode") + ret = fn(handle, _nvmlEnableState_t(mode)) + _nvmlCheckReturn(ret) + return None + +def nvmlDeviceSetComputeMode(handle, mode): + fn = _nvmlGetFunctionPointer("nvmlDeviceSetComputeMode") + ret = fn(handle, _nvmlComputeMode_t(mode)) + _nvmlCheckReturn(ret) + return None + +def nvmlDeviceSetEccMode(handle, mode): + fn = _nvmlGetFunctionPointer("nvmlDeviceSetEccMode") + ret = fn(handle, _nvmlEnableState_t(mode)) + _nvmlCheckReturn(ret) + return None + +def nvmlDeviceClearEccErrorCounts(handle, counterType): + fn = _nvmlGetFunctionPointer("nvmlDeviceClearEccErrorCounts") + ret = fn(handle, _nvmlEccCounterType_t(counterType)) + _nvmlCheckReturn(ret) + return None + +def nvmlDeviceSetDriverModel(handle, model): + fn = _nvmlGetFunctionPointer("nvmlDeviceSetDriverModel") + ret = fn(handle, _nvmlDriverModel_t(model)) + _nvmlCheckReturn(ret) + return None + +# Added in 2.285 +def nvmlEventSetCreate(): + fn = _nvmlGetFunctionPointer("nvmlEventSetCreate") + eventSet = c_nvmlEventSet_t() + ret = fn(byref(eventSet)) + _nvmlCheckReturn(ret) + return eventSet + +# Added in 2.285 +def nvmlDeviceRegisterEvents(handle, eventTypes, eventSet): + fn = _nvmlGetFunctionPointer("nvmlDeviceRegisterEvents") + ret = fn(handle, c_ulonglong(eventTypes), eventSet) + _nvmlCheckReturn(ret) + return None + +# Added in 2.285 +def nvmlDeviceGetSupportedEventTypes(handle): + c_eventTypes = c_ulonglong() + fn = _nvmlGetFunctionPointer("nvmlDeviceGetSupportedEventTypes") + ret = fn(handle, byref(c_eventTypes)) + _nvmlCheckReturn(ret) + return c_eventTypes.value + +# Added in 2.285 +# raises NVML_ERROR_TIMEOUT exception on timeout +def nvmlEventSetWait(eventSet, timeoutms): + fn = _nvmlGetFunctionPointer("nvmlEventSetWait") + data = c_nvmlEventData_t() + ret = fn(eventSet, byref(data), c_uint(timeoutms)) + _nvmlCheckReturn(ret) + return data + +# Added in 2.285 +def nvmlEventSetFree(eventSet): + fn = _nvmlGetFunctionPointer("nvmlEventSetFree") + ret = fn(eventSet) + _nvmlCheckReturn(ret) + return None + +# Added in 2.285 +def nvmlEventDataGetPerformanceState(data): + fn = _nvmlGetFunctionPointer("nvmlEventDataGetPerformanceState") + pstate = _nvmlPstates_t() + ret = fn(byref(data), byref(pstate)) + _nvmlCheckReturn(ret) + return pstate.value + +# Added in 2.285 +def nvmlEventDataGetXidCriticalError(data): + fn = _nvmlGetFunctionPointer("nvmlEventDataGetXidCriticalError") + xid = c_uint() + ret = fn(byref(data), byref(xid)) + _nvmlCheckReturn(ret) + return xid.value + +# Added in 2.285 +def nvmlEventDataGetEccErrorCount(data): + fn = _nvmlGetFunctionPointer("nvmlEventDataGetEccErrorCount") + ecc = c_ulonglong() + ret = fn(byref(data), byref(ecc)) + _nvmlCheckReturn(ret) + return ecc.value + +# Added in 
3.295 +def nvmlDeviceOnSameBoard(handle1, handle2): + fn = _nvmlGetFunctionPointer("nvmlDeviceOnSameBoard") + onSameBoard = c_int() + ret = fn(handle1, handle2, byref(onSameBoard)) + _nvmlCheckReturn(ret) + return (onSameBoard.value != 0) + +# Added in 3.295 +def nvmlDeviceGetCurrPcieLinkGeneration(handle): + fn = _nvmlGetFunctionPointer("nvmlDeviceGetCurrPcieLinkGeneration") + gen = c_uint() + ret = fn(handle, byref(gen)) + _nvmlCheckReturn(ret) + return gen.value + +# Added in 3.295 +def nvmlDeviceGetMaxPcieLinkGeneration(handle): + fn = _nvmlGetFunctionPointer("nvmlDeviceGetMaxPcieLinkGeneration") + gen = c_uint() + ret = fn(handle, byref(gen)) + _nvmlCheckReturn(ret) + return gen.value + +# Added in 3.295 +def nvmlDeviceGetCurrPcieLinkWidth(handle): + fn = _nvmlGetFunctionPointer("nvmlDeviceGetCurrPcieLinkWidth") + width = c_uint() + ret = fn(handle, byref(width)) + _nvmlCheckReturn(ret) + return width.value + +# Added in 3.295 +def nvmlDeviceGetMaxPcieLinkWidth(handle): + fn = _nvmlGetFunctionPointer("nvmlDeviceGetMaxPcieLinkWidth") + width = c_uint() + ret = fn(handle, byref(width)) + _nvmlCheckReturn(ret) + return width.value + + + diff --git a/gpu/nvidia/nvidia-ml-py-3.295.00/nvidia_smi.py b/gpu/nvidia/nvidia-ml-py-3.295.00/nvidia_smi.py new file mode 100644 index 00000000..f1a42707 --- /dev/null +++ b/gpu/nvidia/nvidia-ml-py-3.295.00/nvidia_smi.py @@ -0,0 +1,455 @@ +##### +# Copyright (c) 2011-2012, NVIDIA Corporation. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# * Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of the NVIDIA Corporation nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +# THE POSSIBILITY OF SUCH DAMAGE. +##### + +# +# nvidia_smi +# nvml_bindings nvidia com +# +# Sample code that attempts to reproduce the output of nvidia-smi -q- x +# For many cases the output should match +# +# To Run: +# $ python +# Python 2.7 (r27:82500, Sep 16 2010, 18:02:00) +# [GCC 4.5.1 20100907 (Red Hat 4.5.1-3)] on linux2 +# Type "help", "copyright", "credits" or "license" for more information. +# >>> import nvidia_smi +# >>> print(nvidia_smi.XmlDeviceQuery()) +# ... 
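Among the functions new in 3.295 are the PCIe link queries that feed the PCIe section of this report. A minimal sketch of calling them directly, assuming device index 0; current values can legitimately sit below the maximums when the link is power-managed or slot-limited:

::

    from pynvml import (nvmlInit, nvmlShutdown, nvmlDeviceGetHandleByIndex,
                        nvmlDeviceGetCurrPcieLinkGeneration,
                        nvmlDeviceGetMaxPcieLinkGeneration,
                        nvmlDeviceGetCurrPcieLinkWidth,
                        nvmlDeviceGetMaxPcieLinkWidth)

    nvmlInit()
    handle = nvmlDeviceGetHandleByIndex(0)
    # Report current vs. maximum link generation and width.
    print("PCIe gen %d/%d, width x%d/x%d" % (
        nvmlDeviceGetCurrPcieLinkGeneration(handle),
        nvmlDeviceGetMaxPcieLinkGeneration(handle),
        nvmlDeviceGetCurrPcieLinkWidth(handle),
        nvmlDeviceGetMaxPcieLinkWidth(handle)))
    nvmlShutdown()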
+# + +from pynvml import * +import datetime + +# +# Helper functions +# +def GetEccByType(handle, counterType, bitType): + try: + count = str(nvmlDeviceGetTotalEccErrors(handle, bitType, counterType)) + except NVMLError as err: + count = handleError(err) + + try: + detail = nvmlDeviceGetDetailedEccErrors(handle, bitType, counterType) + deviceMemory = str(detail.deviceMemory) + registerFile = str(detail.registerFile) + l1Cache = str(detail.l1Cache) + l2Cache = str(detail.l2Cache) + except NVMLError as err: + msg = handleError(err) + deviceMemory = msg + registerFile = msg + l1Cache = msg + l2Cache = msg + strResult = '' + strResult += ' ' + deviceMemory + '\n' + strResult += ' ' + registerFile + '\n' + strResult += ' ' + l1Cache + '\n' + strResult += ' ' + l2Cache + '\n' + strResult += ' ' + count + '\n' + return strResult + +def GetEccByCounter(handle, counterType): + strResult = '' + strResult += ' \n' + strResult += str(GetEccByType(handle, counterType, NVML_SINGLE_BIT_ECC)) + strResult += ' \n' + strResult += ' \n' + strResult += str(GetEccByType(handle, counterType, NVML_DOUBLE_BIT_ECC)) + strResult += ' \n' + return strResult + +def GetEccStr(handle): + strResult = '' + strResult += ' \n' + strResult += str(GetEccByCounter(handle, NVML_VOLATILE_ECC)) + strResult += ' \n' + strResult += ' \n' + strResult += str(GetEccByCounter(handle, NVML_AGGREGATE_ECC)) + strResult += ' \n' + return strResult + +# +# Converts errors into string messages +# +def handleError(err): + if (err.value == NVML_ERROR_NOT_SUPPORTED): + return "N/A" + else: + return err.__str__() + +####### +def XmlDeviceQuery(): + + try: + # + # Initialize NVML + # + nvmlInit() + strResult = '' + + strResult += '\n' + strResult += '\n' + strResult += '\n' + + strResult += ' ' + str(datetime.date.today()) + '\n' + strResult += ' ' + str(nvmlSystemGetDriverVersion()) + '\n' + + deviceCount = nvmlDeviceGetCount() + strResult += ' ' + str(deviceCount) + '\n' + + for i in range(0, deviceCount): + handle = nvmlDeviceGetHandleByIndex(i) + + pciInfo = nvmlDeviceGetPciInfo(handle) + + strResult += ' \n' % pciInfo.busId + + strResult += ' ' + nvmlDeviceGetName(handle) + '\n' + + try: + state = ('Enabled' if (nvmlDeviceGetDisplayMode(handle) != 0) else 'Disabled') + except NVMLError as err: + state = handleError(err) + + strResult += ' ' + state + '\n' + + try: + mode = 'Enabled' if (nvmlDeviceGetPersistenceMode(handle) != 0) else 'Disabled' + except NVMLError as err: + mode = handleError(err) + + strResult += ' ' + mode + '\n' + + strResult += ' \n' + + try: + current = str(nvmlDeviceGetCurrentDriverModel(handle)) + except NVMLError as err: + current = handleError(err) + strResult += ' ' + current + '\n' + + try: + pending = str(nvmlDeviceGetPendingDriverModel(handle)) + except NVMLError as err: + pending = handleError(err) + + strResult += ' ' + pending + '\n' + + strResult += ' \n' + + try: + serial = nvmlDeviceGetSerial(handle) + except NVMLError as err: + serial = handleError(err) + + strResult += ' ' + serial + '\n' + + try: + uuid = nvmlDeviceGetUUID(handle) + except NVMLError as err: + uuid = handleError(err) + + strResult += ' ' + uuid + '\n' + + try: + vbios = nvmlDeviceGetVbiosVersion(handle) + except NVMLError as err: + vbios = handleError(err) + + strResult += ' ' + vbios + '\n' + + strResult += ' \n' + + try: + oem = nvmlDeviceGetInforomVersion(handle, NVML_INFOROM_OEM) + if oem == '': + oem = 'N/A' + except NVMLError as err: + oem = handleError(err) + + strResult += ' ' + oem + '\n' + + try: + ecc = 
nvmlDeviceGetInforomVersion(handle, NVML_INFOROM_ECC) + if ecc == '': + ecc = 'N/A' + except NVMLError as err: + ecc = handleError(err) + + strResult += ' ' + ecc + '\n' + try: + pwr = nvmlDeviceGetInforomVersion(handle, NVML_INFOROM_POWER) + if pwr == '': + pwr = 'N/A' + except NVMLError as err: + pwr = handleError(err) + + strResult += ' ' + pwr + '\n' + strResult += ' \n' + + strResult += ' \n' + strResult += ' %02X\n' % pciInfo.bus + strResult += ' %02X\n' % pciInfo.device + strResult += ' %04X\n' % pciInfo.domain + strResult += ' %08X\n' % (pciInfo.pciDeviceId) + strResult += ' %08X\n' % (pciInfo.pciSubSystemId) + strResult += ' ' + str(pciInfo.busId) + '\n' + strResult += ' \n' + + + strResult += ' \n' + + try: + gen = str(nvmlDeviceGetMaxPcieLinkGeneration(handle)) + except NVMLError as err: + gen = handleError(err) + + strResult += ' ' + gen + '\n' + + try: + gen = str(nvmlDeviceGetCurrPcieLinkGeneration(handle)) + except NVMLError as err: + gen = handleError(err) + + strResult += ' ' + gen + '\n' + strResult += ' \n' + strResult += ' \n' + + try: + width = str(nvmlDeviceGetMaxPcieLinkWidth(handle)) + 'x' + except NVMLError as err: + width = handleError(err) + + strResult += ' ' + width + '\n' + + try: + width = str(nvmlDeviceGetCurrPcieLinkWidth(handle)) + 'x' + except NVMLError as err: + width = handleError(err) + + strResult += ' ' + width + '\n' + + strResult += ' \n' + strResult += ' \n' + strResult += ' \n' + + try: + fan = str(nvmlDeviceGetFanSpeed(handle)) + ' %' + except NVMLError as err: + fan = handleError(err) + strResult += ' ' + fan + '\n' + + try: + memInfo = nvmlDeviceGetMemoryInfo(handle) + mem_total = str(memInfo.total / 1024 / 1024) + ' MB' + mem_used = str(memInfo.used / 1024 / 1024) + ' MB' + mem_free = str(memInfo.free / 1024 / 1024) + ' MB' + except NVMLError as err: + error = handleError(err) + mem_total = error + mem_used = error + mem_free = error + + strResult += ' \n' + strResult += ' ' + mem_total + '\n' + strResult += ' ' + mem_used + '\n' + strResult += ' ' + mem_free + '\n' + strResult += ' \n' + + + try: + mode = nvmlDeviceGetComputeMode(handle) + if mode == NVML_COMPUTEMODE_DEFAULT: + modeStr = 'Default' + elif mode == NVML_COMPUTEMODE_EXCLUSIVE_THREAD: + modeStr = 'Exclusive Thread' + elif mode == NVML_COMPUTEMODE_PROHIBITED: + modeStr = 'Prohibited' + elif mode == NVML_COMPUTEMODE_EXCLUSIVE_PROCESS: + modeStr = 'Exclusive Process' + else: + modeStr = 'Unknown' + except NVMLError as err: + modeStr = handleError(err) + + strResult += ' ' + modeStr + '\n' + + try: + util = nvmlDeviceGetUtilizationRates(handle) + gpu_util = str(util.gpu) + mem_util = str(util.memory) + except NVMLError as err: + error = handleError(err) + gpu_util = error + mem_util = error + + strResult += ' \n' + strResult += ' ' + gpu_util + ' %\n' + strResult += ' ' + mem_util + ' %\n' + strResult += ' \n' + + try: + (current, pending) = nvmlDeviceGetEccMode(handle) + curr_str = 'Enabled' if (current != 0) else 'Disabled' + pend_str = 'Enabled' if (pending != 0) else 'Disabled' + except NVMLError as err: + error = handleError(err) + curr_str = error + pend_str = error + + strResult += ' \n' + strResult += ' ' + curr_str + '\n' + strResult += ' ' + pend_str + '\n' + strResult += ' \n' + + strResult += ' \n' + strResult += GetEccStr(handle) + strResult += ' \n' + + try: + temp = str(nvmlDeviceGetTemperature(handle, NVML_TEMPERATURE_GPU)) + ' C' + except NVMLError as err: + temp = handleError(err) + + strResult += ' \n' + strResult += ' ' + temp + '\n' + strResult += ' \n' + + 
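NVML returns power draw in milliwatts and memory counters in bytes; the report converts these to watts and megabytes by dividing by 1000.0 and by 1024*1024 respectively. A small sketch of the same conversions, assuming device index 0:

::

    from pynvml import (nvmlInit, nvmlShutdown, nvmlDeviceGetHandleByIndex,
                        nvmlDeviceGetPowerUsage, nvmlDeviceGetMemoryInfo)

    nvmlInit()
    handle = nvmlDeviceGetHandleByIndex(0)

    # nvmlDeviceGetPowerUsage() reports milliwatts.
    print("power: %.2f W" % (nvmlDeviceGetPowerUsage(handle) / 1000.0))

    # nvmlDeviceGetMemoryInfo() reports bytes.
    info = nvmlDeviceGetMemoryInfo(handle)
    print("memory: %d / %d MB used" % (info.used / 1024 / 1024,
                                       info.total / 1024 / 1024))
    nvmlShutdown()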
strResult += ' \n' + try: + perfState = nvmlDeviceGetPowerState(handle) + except NVMLError as err: + perfState = handleError(err) + strResult += ' P%s\n' % perfState + try: + powMan = nvmlDeviceGetPowerManagementMode(handle) + powManStr = 'Supported' if powMan != 0 else 'N/A' + except NVMLError as err: + powManStr = handleError(err) + strResult += ' ' + powManStr + '\n' + try: + powDraw = (nvmlDeviceGetPowerUsage(handle) / 1000.0) + powDrawStr = '%.2f W' % powDraw + except NVMLError as err: + powDrawStr = handleError(err) + strResult += ' ' + powDrawStr + '\n' + try: + powLimit = (nvmlDeviceGetPowerManagementLimit(handle) / 1000.0) + powLimitStr = '%d W' % powLimit + except NVMLError as err: + powLimitStr = handleError(err) + strResult += ' ' + powLimitStr + '\n' + strResult += ' \n' + + strResult += ' \n' + try: + graphics = str(nvmlDeviceGetClockInfo(handle, NVML_CLOCK_GRAPHICS)) + except NVMLError as err: + graphics = handleError(err) + strResult += ' ' +graphics + ' MHz\n' + try: + sm = str(nvmlDeviceGetClockInfo(handle, NVML_CLOCK_SM)) + except NVMLError as err: + sm = handleError(err) + strResult += ' ' + sm + ' MHz\n' + try: + mem = str(nvmlDeviceGetClockInfo(handle, NVML_CLOCK_MEM)) + except NVMLError as err: + mem = handleError(err) + strResult += ' ' + mem + ' MHz\n' + strResult += ' \n' + + strResult += ' \n' + try: + graphics = str(nvmlDeviceGetMaxClockInfo(handle, NVML_CLOCK_GRAPHICS)) + except NVMLError as err: + graphics = handleError(err) + strResult += ' ' + graphics + ' MHz\n' + try: + sm = str(nvmlDeviceGetMaxClockInfo(handle, NVML_CLOCK_SM)) + except NVMLError as err: + sm = handleError(err) + strResult += ' ' + sm + ' MHz\n' + try: + mem = str(nvmlDeviceGetMaxClockInfo(handle, NVML_CLOCK_MEM)) + except NVMLError as err: + mem = handleError(err) + strResult += ' ' + mem + ' MHz\n' + strResult += ' \n' + + try: + perfState = nvmlDeviceGetPowerState(handle) + perfStateStr = 'P%s' % perfState + except NVMLError as err: + perfStateStr = handleError(err) + strResult += ' ' + perfStateStr + '\n' + + strResult += ' \n' + + procstr = "" + try: + procs = nvmlDeviceGetComputeRunningProcesses(handle) + except NVMLError as err: + procs = [] + procstr = handleError(err) + + for p in procs: + procstr += ' \n' + procstr += ' %d\n' % p.pid + try: + name = str(nvmlSystemGetProcessName(p.pid)) + except NVMLError as err: + if (err.value == NVML_ERROR_NOT_FOUND): + # probably went away + continue + else: + name = handleError(err) + procstr += ' ' + name + '\n' + procstr += ' \n' + if (p.usedGpuMemory == None): + procstr += 'N\A' + else: + procstr += '%d MB\n' % (p.usedGpuMemory / 1024 / 1024) + procstr += '\n' + procstr += ' \n' + + strResult += procstr + strResult += ' \n' + strResult += ' \n' + + strResult += '\n' + + except NVMLError as err: + strResult += 'nvidia_smi.py: ' + err.__str__() + '\n' + + nvmlShutdown() + + return strResult + diff --git a/gpu/nvidia/nvidia-ml-py-3.295.00/pynvml.py b/gpu/nvidia/nvidia-ml-py-3.295.00/pynvml.py new file mode 100644 index 00000000..90f8bdd8 --- /dev/null +++ b/gpu/nvidia/nvidia-ml-py-3.295.00/pynvml.py @@ -0,0 +1,903 @@ +##### +# Copyright (c) 2011-2012, NVIDIA Corporation. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# * Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. 
+#    * Redistributions in binary form must reproduce the above copyright
+#      notice, this list of conditions and the following disclaimer in the
+#      documentation and/or other materials provided with the distribution.
+#    * Neither the name of the NVIDIA Corporation nor the names of its
+#      contributors may be used to endorse or promote products derived from
+#      this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+# THE POSSIBILITY OF SUCH DAMAGE.
+#####
+
+##
+# Python bindings for the NVML library
+##
+from ctypes import *
+from ctypes.util import find_library
+import sys
+import threading
+
+## C Type mappings ##
+## Enums
+_nvmlEnableState_t = c_uint
+NVML_FEATURE_DISABLED = 0
+NVML_FEATURE_ENABLED = 1
+
+_nvmlTemperatureSensors_t = c_uint
+NVML_TEMPERATURE_GPU = 0
+
+_nvmlComputeMode_t = c_uint
+NVML_COMPUTEMODE_DEFAULT = 0
+NVML_COMPUTEMODE_EXCLUSIVE_THREAD = 1
+NVML_COMPUTEMODE_PROHIBITED = 2
+NVML_COMPUTEMODE_EXCLUSIVE_PROCESS = 3
+
+_nvmlEccBitType_t = c_uint
+NVML_SINGLE_BIT_ECC = 0
+NVML_DOUBLE_BIT_ECC = 1
+
+_nvmlEccCounterType_t = c_uint
+NVML_VOLATILE_ECC = 0
+NVML_AGGREGATE_ECC = 1
+
+_nvmlClockType_t = c_uint
+NVML_CLOCK_GRAPHICS = 0
+NVML_CLOCK_SM = 1
+NVML_CLOCK_MEM = 2
+
+_nvmlDriverModel_t = c_uint
+NVML_DRIVER_WDDM = 0
+NVML_DRIVER_WDM = 1
+
+_nvmlPstates_t = c_uint
+NVML_PSTATE_0 = 0
+NVML_PSTATE_1 = 1
+NVML_PSTATE_2 = 2
+NVML_PSTATE_3 = 3
+NVML_PSTATE_4 = 4
+NVML_PSTATE_5 = 5
+NVML_PSTATE_6 = 6
+NVML_PSTATE_7 = 7
+NVML_PSTATE_8 = 8
+NVML_PSTATE_9 = 9
+NVML_PSTATE_10 = 10
+NVML_PSTATE_11 = 11
+NVML_PSTATE_12 = 12
+NVML_PSTATE_13 = 13
+NVML_PSTATE_14 = 14
+NVML_PSTATE_15 = 15
+NVML_PSTATE_UNKNOWN = 32
+
+_nvmlInforomObject_t = c_uint
+NVML_INFOROM_OEM = 0
+NVML_INFOROM_ECC = 1
+NVML_INFOROM_POWER = 2
+
+_nvmlReturn_t = c_uint
+NVML_SUCCESS = 0
+NVML_ERROR_UNINITIALIZED = 1
+NVML_ERROR_INVALID_ARGUMENT = 2
+NVML_ERROR_NOT_SUPPORTED = 3
+NVML_ERROR_NO_PERMISSION = 4
+NVML_ERROR_ALREADY_INITIALIZED = 5
+NVML_ERROR_NOT_FOUND = 6
+NVML_ERROR_INSUFFICIENT_SIZE = 7
+NVML_ERROR_INSUFFICIENT_POWER = 8
+NVML_ERROR_DRIVER_NOT_LOADED = 9
+NVML_ERROR_TIMEOUT = 10
+NVML_ERROR_UNKNOWN = 999
+
+_nvmlFanState_t = c_uint
+NVML_FAN_NORMAL = 0
+NVML_FAN_FAILED = 1
+
+_nvmlLedColor_t = c_uint
+NVML_LED_COLOR_GREEN = 0
+NVML_LED_COLOR_AMBER = 1
+
+# C preprocessor defined values
+nvmlFlagDefault = 0
+nvmlFlagForce = 1
+
+# buffer size
+NVML_DEVICE_INFOROM_VERSION_BUFFER_SIZE = 16
+NVML_DEVICE_UUID_BUFFER_SIZE = 80
+NVML_SYSTEM_DRIVER_VERSION_BUFFER_SIZE = 81
+NVML_SYSTEM_NVML_VERSION_BUFFER_SIZE = 80
+NVML_DEVICE_NAME_BUFFER_SIZE = 64
+NVML_DEVICE_SERIAL_BUFFER_SIZE = 30
+NVML_DEVICE_VBIOS_VERSION_BUFFER_SIZE = 32
+
+NVML_VALUE_NOT_AVAILABLE_ulonglong = c_ulonglong(-1)
+
+## Lib loading ##
+nvmlLib = None
+libLoadLock = threading.Lock()
+
+## Error
Checking ## +class NVMLError(Exception): + def __init__(self, value): + self.value = value + def __str__(self): + return str(nvmlErrorString(self.value)) + +def _nvmlCheckReturn(ret): + if (ret != NVML_SUCCESS): + raise NVMLError(ret) + return ret + +## Function access ## +def _nvmlGetFunctionPointer(name): + global nvmlLib + global libLoadLock + + libLoadLock.acquire() + try: + # ensure library was loaded + if (nvmlLib == None): + raise NVMLError(NVML_ERROR_UNINITIALIZED) + try: + return getattr(nvmlLib, name) + except AttributeError as attrError: + raise NVMLError(NVML_ERROR_NOT_SUPPORTED) + finally: + # lock is always freed + libLoadLock.release() + +## Alternative object +# Allows the object to be printed +# Allows mismatched types to be assigned +# - like None when the Structure variant requires c_uint +class nvmlFriendlyObject(object): + def __init__(self, dictionary): + for x in dictionary: + setattr(self, x, dictionary[x]) + def __str__(self): + return self.__dict__.__str__() + +def nvmlStructToFriendlyObject(struct): + d = {} + for x in struct._fields_: + key = x[0] + value = getattr(struct, key) + d[key] = value + obj = nvmlFriendlyObject(d) + return obj + +# pack the object so it can be passed to the NVML library +def nvmlFriendlyObjectToStruct(obj, model): + for x in model._fields_: + key = x[0] + value = obj.__dict__[key] + setattr(model, key, value) + return model + +## Unit structures +class struct_c_nvmlUnit_t(Structure): + pass # opaque handle +c_nvmlUnit_t = POINTER(struct_c_nvmlUnit_t) + +class c_nvmlUnitInfo_t(Structure): + _fields_ = [ + ('name', c_char * 96), + ('id', c_char * 96), + ('serial', c_char * 96), + ('firmwareVersion', c_char * 96), + ] + +class c_nvmlLedState_t(Structure): + _fields_ = [ + ('cause', c_char * 256), + ('color', _nvmlLedColor_t), + ] + +class c_nvmlPSUInfo_t(Structure): + _fields_ = [ + ('state', c_char * 256), + ('current', c_uint), + ('voltage', c_uint), + ('power', c_uint), + ] + +class c_nvmlUnitFanInfo_t(Structure): + _fields_ = [ + ('speed', c_uint), + ('state', _nvmlFanState_t), + ] + +class c_nvmlUnitFanSpeeds_t(Structure): + _fields_ = [ + ('fans', c_nvmlUnitFanInfo_t * 24), + ('count', c_uint) + ] + +## Device structures +class struct_c_nvmlDevice_t(Structure): + pass # opaque handle +c_nvmlDevice_t = POINTER(struct_c_nvmlDevice_t) + +class nvmlPciInfo_t(Structure): + _fields_ = [ + ('busId', c_char * 16), + ('domain', c_uint), + ('bus', c_uint), + ('device', c_uint), + ('pciDeviceId', c_uint), + + # Added in 2.285 + ('pciSubSystemId', c_uint), + ('reserved0', c_uint), + ('reserved1', c_uint), + ('reserved2', c_uint), + ('reserved3', c_uint), + ] + +class c_nvmlMemory_t(Structure): + _fields_ = [ + ('total', c_ulonglong), + ('free', c_ulonglong), + ('used', c_ulonglong), + ] + +# On Windows with the WDDM driver, usedGpuMemory is reported as None +# Code that processes this structure should check for None, I.E. 
+# +# if (info.usedGpuMemory == None): +# # TODO handle the error +# pass +# else: +# print("Using %d MB of memory" % (info.usedGpuMemory / 1024 / 1024)) +# +# See NVML documentation for more information +class c_nvmlProcessInfo_t(Structure): + _fields_ = [ + ('pid', c_uint), + ('usedGpuMemory', c_ulonglong), + ] + +class c_nvmlEccErrorCounts_t(Structure): + _fields_ = [ + ('l1Cache', c_ulonglong), + ('l2Cache', c_ulonglong), + ('deviceMemory', c_ulonglong), + ('registerFile', c_ulonglong), + ] + +class c_nvmlUtilization_t(Structure): + _fields_ = [ + ('gpu', c_uint), + ('memory', c_uint), + ] + +# Added in 2.285 +class c_nvmlHwbcEntry_t(Structure): + _fields_ = [ + ('hwbcId', c_uint), + ('firmwareVersion', c_char * 32), + ] + +## Event structures +class struct_c_nvmlEventSet_t(Structure): + pass # opaque handle +c_nvmlEventSet_t = POINTER(struct_c_nvmlEventSet_t) + +nvmlEventTypeSingleBitEccError = 0x0000000000000001 +nvmlEventTypeDoubleBitEccError = 0x0000000000000002 +nvmlEventTypePState = 0x0000000000000004 +nvmlEventTypeXidCriticalError = 0x0000000000000008 +nvmlEventTypeNone = 0x0000000000000000 +nvmlEventTypeAll = ( + nvmlEventTypeNone | + nvmlEventTypeSingleBitEccError | + nvmlEventTypeDoubleBitEccError | + nvmlEventTypePState | + nvmlEventTypeXidCriticalError + ) + +class c_nvmlEventData_t(Structure): + _fields_ = [ + ('device', c_nvmlDevice_t), + ('eventType', c_ulonglong), + ('reserved', c_ulonglong) + ] + +## C function wrappers ## +def nvmlInit(): + global nvmlLib + global libLoadLock + + # + # Load the library if it isn't loaded already + # + if (nvmlLib == None): + # lock to ensure only one caller loads the library + libLoadLock.acquire() + + try: + # ensure the library still isn't loaded + if (nvmlLib == None): + try: + if (sys.platform[:3] == "win"): + # cdecl calling convention + nvmlLib = cdll.nvml + else: + # assume linux + nvmlLib = CDLL("libnvidia-ml.so") + except OSError as ose: + print(ose) + _nvmlCheckReturn(NVML_ERROR_DRIVER_NOT_LOADED) + if (nvmlLib == None): + print("Failed to load NVML") + _nvmlCheckReturn(NVML_ERROR_DRIVER_NOT_LOADED) + finally: + # lock is always freed + libLoadLock.release() + + # + # Initialize the library + # + fn = _nvmlGetFunctionPointer("nvmlInit") + ret = fn() + _nvmlCheckReturn(ret) + return None + +def nvmlShutdown(): + # + # Leave the library loaded, but shutdown the interface + # + fn = _nvmlGetFunctionPointer("nvmlShutdown") + ret = fn() + _nvmlCheckReturn(ret) + return None + +# Added in 2.285 +def nvmlErrorString(result): + fn = _nvmlGetFunctionPointer("nvmlErrorString") + fn.restype = c_char_p # otherwise return is an int + ret = fn(result) + return ret + +# Added in 2.285 +def nvmlSystemGetNVMLVersion(): + c_version = create_string_buffer(NVML_SYSTEM_NVML_VERSION_BUFFER_SIZE) + fn = _nvmlGetFunctionPointer("nvmlSystemGetNVMLVersion") + ret = fn(c_version, c_uint(NVML_SYSTEM_NVML_VERSION_BUFFER_SIZE)) + _nvmlCheckReturn(ret) + return c_version.value + +# Added in 2.285 +def nvmlSystemGetProcessName(pid): + c_name = create_string_buffer(1024) + fn = _nvmlGetFunctionPointer("nvmlSystemGetProcessName") + ret = fn(c_uint(pid), c_name, c_uint(1024)) + _nvmlCheckReturn(ret) + return c_name.value + +def nvmlSystemGetDriverVersion(): + c_version = create_string_buffer(NVML_SYSTEM_DRIVER_VERSION_BUFFER_SIZE) + fn = _nvmlGetFunctionPointer("nvmlSystemGetDriverVersion") + ret = fn(c_version, c_uint(NVML_SYSTEM_DRIVER_VERSION_BUFFER_SIZE)) + _nvmlCheckReturn(ret) + return c_version.value + +# Added in 2.285 +def nvmlSystemGetHicVersion(): 
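+    # Two-step NVML sizing pattern: the first call passes a NULL array to
+    # learn the element count; a ctypes array of that size is then
+    # allocated and the call repeated to fetch the entries.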
+    c_count = c_uint(0)
+    hics = None
+    fn = _nvmlGetFunctionPointer("nvmlSystemGetHicVersion")
+
+    # get the count
+    ret = fn(byref(c_count), None)
+
+    # this should only fail with insufficient size
+    if ((ret != NVML_SUCCESS) and
+        (ret != NVML_ERROR_INSUFFICIENT_SIZE)):
+        raise NVMLError(ret)
+
+    # if there are no hics
+    if (c_count.value == 0):
+        return []
+
+    hic_array = c_nvmlHwbcEntry_t * c_count.value
+    hics = hic_array()
+    ret = fn(byref(c_count), hics)
+    _nvmlCheckReturn(ret)
+    return hics
+
+## Unit get functions
+def nvmlUnitGetCount():
+    c_count = c_uint()
+    fn = _nvmlGetFunctionPointer("nvmlUnitGetCount")
+    ret = fn(byref(c_count))
+    _nvmlCheckReturn(ret)
+    return c_count.value
+
+def nvmlUnitGetHandleByIndex(index):
+    c_index = c_uint(index)
+    unit = c_nvmlUnit_t()
+    fn = _nvmlGetFunctionPointer("nvmlUnitGetHandleByIndex")
+    ret = fn(c_index, byref(unit))
+    _nvmlCheckReturn(ret)
+    return unit
+
+def nvmlUnitGetUnitInfo(unit):
+    c_info = c_nvmlUnitInfo_t()
+    fn = _nvmlGetFunctionPointer("nvmlUnitGetUnitInfo")
+    ret = fn(unit, byref(c_info))
+    _nvmlCheckReturn(ret)
+    return c_info
+
+def nvmlUnitGetLedState(unit):
+    c_state = c_nvmlLedState_t()
+    fn = _nvmlGetFunctionPointer("nvmlUnitGetLedState")
+    ret = fn(unit, byref(c_state))
+    _nvmlCheckReturn(ret)
+    return c_state
+
+def nvmlUnitGetPsuInfo(unit):
+    c_info = c_nvmlPSUInfo_t()
+    fn = _nvmlGetFunctionPointer("nvmlUnitGetPsuInfo")
+    ret = fn(unit, byref(c_info))
+    _nvmlCheckReturn(ret)
+    return c_info
+
+def nvmlUnitGetTemperature(unit, type):
+    c_temp = c_uint()
+    fn = _nvmlGetFunctionPointer("nvmlUnitGetTemperature")
+    ret = fn(unit, c_uint(type), byref(c_temp))
+    _nvmlCheckReturn(ret)
+    return c_temp.value
+
+def nvmlUnitGetFanSpeedInfo(unit):
+    c_speeds = c_nvmlUnitFanSpeeds_t()
+    fn = _nvmlGetFunctionPointer("nvmlUnitGetFanSpeedInfo")
+    ret = fn(unit, byref(c_speeds))
+    _nvmlCheckReturn(ret)
+    return c_speeds
+
+# added to API
+def nvmlUnitGetDeviceCount(unit):
+    c_count = c_uint(0)
+    # query the unit to determine device count
+    fn = _nvmlGetFunctionPointer("nvmlUnitGetDevices")
+    ret = fn(unit, byref(c_count), None)
+    if (ret == NVML_ERROR_INSUFFICIENT_SIZE):
+        ret = NVML_SUCCESS
+    _nvmlCheckReturn(ret)
+    return c_count.value
+
+def nvmlUnitGetDevices(unit):
+    c_count = c_uint(nvmlUnitGetDeviceCount(unit))
+    device_array = c_nvmlDevice_t * c_count.value
+    c_devices = device_array()
+    fn = _nvmlGetFunctionPointer("nvmlUnitGetDevices")
+    ret = fn(unit, byref(c_count), c_devices)
+    _nvmlCheckReturn(ret)
+    return c_devices
+
+## Device get functions
+def nvmlDeviceGetCount():
+    c_count = c_uint()
+    fn = _nvmlGetFunctionPointer("nvmlDeviceGetCount")
+    ret = fn(byref(c_count))
+    _nvmlCheckReturn(ret)
+    return c_count.value
+
+def nvmlDeviceGetHandleByIndex(index):
+    c_index = c_uint(index)
+    device = c_nvmlDevice_t()
+    fn = _nvmlGetFunctionPointer("nvmlDeviceGetHandleByIndex")
+    ret = fn(c_index, byref(device))
+    _nvmlCheckReturn(ret)
+    return device
+
+def nvmlDeviceGetHandleBySerial(serial):
+    c_serial = c_char_p(serial)
+    device = c_nvmlDevice_t()
+    fn = _nvmlGetFunctionPointer("nvmlDeviceGetHandleBySerial")
+    ret = fn(c_serial, byref(device))
+    _nvmlCheckReturn(ret)
+    return device
+
+def nvmlDeviceGetHandleByUUID(uuid):
+    c_uuid = c_char_p(uuid)
+    device = c_nvmlDevice_t()
+    fn = _nvmlGetFunctionPointer("nvmlDeviceGetHandleByUUID")
+    ret = fn(c_uuid, byref(device))
+    _nvmlCheckReturn(ret)
+    return device
+
+def nvmlDeviceGetHandleByPciBusId(pciBusId):
+    c_busId = c_char_p(pciBusId)
+    device = c_nvmlDevice_t()
+    fn =
_nvmlGetFunctionPointer("nvmlDeviceGetHandleByPciBusId") + ret = fn(c_busId, byref(device)) + _nvmlCheckReturn(ret) + return device + +def nvmlDeviceGetName(handle): + c_name = create_string_buffer(NVML_DEVICE_NAME_BUFFER_SIZE) + fn = _nvmlGetFunctionPointer("nvmlDeviceGetName") + ret = fn(handle, c_name, c_uint(NVML_DEVICE_NAME_BUFFER_SIZE)) + _nvmlCheckReturn(ret) + return c_name.value + +def nvmlDeviceGetSerial(handle): + c_serial = create_string_buffer(NVML_DEVICE_SERIAL_BUFFER_SIZE) + fn = _nvmlGetFunctionPointer("nvmlDeviceGetSerial") + ret = fn(handle, c_serial, c_uint(NVML_DEVICE_SERIAL_BUFFER_SIZE)) + _nvmlCheckReturn(ret) + return c_serial.value + +def nvmlDeviceGetUUID(handle): + c_uuid = create_string_buffer(NVML_DEVICE_UUID_BUFFER_SIZE) + fn = _nvmlGetFunctionPointer("nvmlDeviceGetUUID") + ret = fn(handle, c_uuid, c_uint(NVML_DEVICE_UUID_BUFFER_SIZE)) + _nvmlCheckReturn(ret) + return c_uuid.value + +def nvmlDeviceGetInforomVersion(handle, infoRomObject): + c_version = create_string_buffer(NVML_DEVICE_INFOROM_VERSION_BUFFER_SIZE) + fn = _nvmlGetFunctionPointer("nvmlDeviceGetInforomVersion") + ret = fn(handle, _nvmlInforomObject_t(infoRomObject), + c_version, c_uint(NVML_DEVICE_INFOROM_VERSION_BUFFER_SIZE)) + _nvmlCheckReturn(ret) + return c_version.value + +def nvmlDeviceGetDisplayMode(handle): + c_mode = _nvmlEnableState_t() + fn = _nvmlGetFunctionPointer("nvmlDeviceGetDisplayMode") + ret = fn(handle, byref(c_mode)) + _nvmlCheckReturn(ret) + return c_mode.value + +def nvmlDeviceGetPersistenceMode(handle): + c_state = _nvmlEnableState_t() + fn = _nvmlGetFunctionPointer("nvmlDeviceGetPersistenceMode") + ret = fn(handle, byref(c_state)) + _nvmlCheckReturn(ret) + return c_state.value + +def nvmlDeviceGetPciInfo(handle): + c_info = nvmlPciInfo_t() + fn = _nvmlGetFunctionPointer("nvmlDeviceGetPciInfo_v2") + ret = fn(handle, byref(c_info)) + _nvmlCheckReturn(ret) + return c_info + +def nvmlDeviceGetClockInfo(handle, type): + c_clock = c_uint() + fn = _nvmlGetFunctionPointer("nvmlDeviceGetClockInfo") + ret = fn(handle, _nvmlClockType_t(type), byref(c_clock)) + _nvmlCheckReturn(ret) + return c_clock.value + +# Added in 2.285 +def nvmlDeviceGetMaxClockInfo(handle, type): + c_clock = c_uint() + fn = _nvmlGetFunctionPointer("nvmlDeviceGetMaxClockInfo") + ret = fn(handle, _nvmlClockType_t(type), byref(c_clock)) + _nvmlCheckReturn(ret) + return c_clock.value + +def nvmlDeviceGetFanSpeed(handle): + c_speed = c_uint() + fn = _nvmlGetFunctionPointer("nvmlDeviceGetFanSpeed") + ret = fn(handle, byref(c_speed)) + _nvmlCheckReturn(ret) + return c_speed.value + +def nvmlDeviceGetTemperature(handle, sensor): + c_temp = c_uint() + fn = _nvmlGetFunctionPointer("nvmlDeviceGetTemperature") + ret = fn(handle, _nvmlTemperatureSensors_t(sensor), byref(c_temp)) + _nvmlCheckReturn(ret) + return c_temp.value + +# DEPRECATED use nvmlDeviceGetPerformanceState +def nvmlDeviceGetPowerState(handle): + c_pstate = _nvmlPstates_t() + fn = _nvmlGetFunctionPointer("nvmlDeviceGetPowerState") + ret = fn(handle, byref(c_pstate)) + _nvmlCheckReturn(ret) + return c_pstate.value + +def nvmlDeviceGetPerformanceState(handle): + c_pstate = _nvmlPstates_t() + fn = _nvmlGetFunctionPointer("nvmlDeviceGetPerformanceState") + ret = fn(handle, byref(c_pstate)) + _nvmlCheckReturn(ret) + return c_pstate.value + +def nvmlDeviceGetPowerManagementMode(handle): + c_pcapMode = _nvmlEnableState_t() + fn = _nvmlGetFunctionPointer("nvmlDeviceGetPowerManagementMode") + ret = fn(handle, byref(c_pcapMode)) + _nvmlCheckReturn(ret) + return 
c_pcapMode.value + +def nvmlDeviceGetPowerManagementLimit(handle): + c_limit = c_uint() + fn = _nvmlGetFunctionPointer("nvmlDeviceGetPowerManagementLimit") + ret = fn(handle, byref(c_limit)) + _nvmlCheckReturn(ret) + return c_limit.value + +def nvmlDeviceGetPowerUsage(handle): + c_watts = c_uint() + fn = _nvmlGetFunctionPointer("nvmlDeviceGetPowerUsage") + ret = fn(handle, byref(c_watts)) + _nvmlCheckReturn(ret) + return c_watts.value + +def nvmlDeviceGetMemoryInfo(handle): + c_memory = c_nvmlMemory_t() + fn = _nvmlGetFunctionPointer("nvmlDeviceGetMemoryInfo") + ret = fn(handle, byref(c_memory)) + _nvmlCheckReturn(ret) + return c_memory + +def nvmlDeviceGetComputeMode(handle): + c_mode = _nvmlComputeMode_t() + fn = _nvmlGetFunctionPointer("nvmlDeviceGetComputeMode") + ret = fn(handle, byref(c_mode)) + _nvmlCheckReturn(ret) + return c_mode.value + +def nvmlDeviceGetEccMode(handle): + c_currState = _nvmlEnableState_t() + c_pendingState = _nvmlEnableState_t() + fn = _nvmlGetFunctionPointer("nvmlDeviceGetEccMode") + ret = fn(handle, byref(c_currState), byref(c_pendingState)) + _nvmlCheckReturn(ret) + return [c_currState.value, c_pendingState.value] + +# added to API +def nvmlDeviceGetCurrentEccMode(handle): + return nvmlDeviceGetEccMode(handle)[0] + +# added to API +def nvmlDeviceGetPendingEccMode(handle): + return nvmlDeviceGetEccMode(handle)[1] + +def nvmlDeviceGetTotalEccErrors(handle, bitType, counterType): + c_count = c_ulonglong() + fn = _nvmlGetFunctionPointer("nvmlDeviceGetTotalEccErrors") + ret = fn(handle, _nvmlEccBitType_t(bitType), + _nvmlEccCounterType_t(counterType), byref(c_count)) + _nvmlCheckReturn(ret) + return c_count.value + +def nvmlDeviceGetDetailedEccErrors(handle, bitType, counterType): + c_count = c_nvmlEccErrorCounts_t() + fn = _nvmlGetFunctionPointer("nvmlDeviceGetDetailedEccErrors") + ret = fn(handle, _nvmlEccBitType_t(bitType), + _nvmlEccCounterType_t(counterType), byref(c_count)) + _nvmlCheckReturn(ret) + return c_count + +def nvmlDeviceGetUtilizationRates(handle): + c_util = c_nvmlUtilization_t() + fn = _nvmlGetFunctionPointer("nvmlDeviceGetUtilizationRates") + ret = fn(handle, byref(c_util)) + _nvmlCheckReturn(ret) + return c_util + +def nvmlDeviceGetDriverModel(handle): + c_currModel = _nvmlDriverModel_t() + c_pendingModel = _nvmlDriverModel_t() + fn = _nvmlGetFunctionPointer("nvmlDeviceGetDriverModel") + ret = fn(handle, byref(c_currModel), byref(c_pendingModel)) + _nvmlCheckReturn(ret) + return [c_currModel.value, c_pendingModel.value] + +# added to API +def nvmlDeviceGetCurrentDriverModel(handle): + return nvmlDeviceGetDriverModel(handle)[0] + +# added to API +def nvmlDeviceGetPendingDriverModel(handle): + return nvmlDeviceGetDriverModel(handle)[1] + +# Added in 2.285 +def nvmlDeviceGetVbiosVersion(handle): + c_version = create_string_buffer(NVML_DEVICE_VBIOS_VERSION_BUFFER_SIZE) + fn = _nvmlGetFunctionPointer("nvmlDeviceGetVbiosVersion") + ret = fn(handle, c_version, c_uint(NVML_DEVICE_VBIOS_VERSION_BUFFER_SIZE)) + _nvmlCheckReturn(ret) + return c_version.value + +# Added in 2.285 +def nvmlDeviceGetComputeRunningProcesses(handle): + # first call to get the size + c_count = c_uint(0) + fn = _nvmlGetFunctionPointer("nvmlDeviceGetComputeRunningProcesses") + ret = fn(handle, byref(c_count), None) + + if (ret == NVML_SUCCESS): + # special case, no running processes + return [] + elif (ret == NVML_ERROR_INSUFFICIENT_SIZE): + # typical case + # oversize the array incase more processes are created + c_count.value = c_count.value * 2 + 5 + proc_array = 
c_nvmlProcessInfo_t * c_count.value + c_procs = proc_array() + + # make the call again + ret = fn(handle, byref(c_count), c_procs) + _nvmlCheckReturn(ret) + + procs = [] + for i in range(c_count.value): + # use an alternative struct for this object + obj = nvmlStructToFriendlyObject(c_procs[i]) + if (obj.usedGpuMemory == NVML_VALUE_NOT_AVAILABLE_ulonglong.value): + # special case for WDDM on Windows, see comment above + obj.usedGpuMemory = None + procs.append(obj) + + return procs + else: + # error case + raise NVMLError(ret) + +## Set functions +def nvmlUnitSetLedState(unit, color): + fn = _nvmlGetFunctionPointer("nvmlUnitSetLedState") + ret = fn(unit, _nvmlLedColor_t(color)) + _nvmlCheckReturn(ret) + return None + +def nvmlDeviceSetPersistenceMode(handle, mode): + fn = _nvmlGetFunctionPointer("nvmlDeviceSetPersistenceMode") + ret = fn(handle, _nvmlEnableState_t(mode)) + _nvmlCheckReturn(ret) + return None + +def nvmlDeviceSetComputeMode(handle, mode): + fn = _nvmlGetFunctionPointer("nvmlDeviceSetComputeMode") + ret = fn(handle, _nvmlComputeMode_t(mode)) + _nvmlCheckReturn(ret) + return None + +def nvmlDeviceSetEccMode(handle, mode): + fn = _nvmlGetFunctionPointer("nvmlDeviceSetEccMode") + ret = fn(handle, _nvmlEnableState_t(mode)) + _nvmlCheckReturn(ret) + return None + +def nvmlDeviceClearEccErrorCounts(handle, counterType): + fn = _nvmlGetFunctionPointer("nvmlDeviceClearEccErrorCounts") + ret = fn(handle, _nvmlEccCounterType_t(counterType)) + _nvmlCheckReturn(ret) + return None + +def nvmlDeviceSetDriverModel(handle, model): + fn = _nvmlGetFunctionPointer("nvmlDeviceSetDriverModel") + ret = fn(handle, _nvmlDriverModel_t(model)) + _nvmlCheckReturn(ret) + return None + +# Added in 2.285 +def nvmlEventSetCreate(): + fn = _nvmlGetFunctionPointer("nvmlEventSetCreate") + eventSet = c_nvmlEventSet_t() + ret = fn(byref(eventSet)) + _nvmlCheckReturn(ret) + return eventSet + +# Added in 2.285 +def nvmlDeviceRegisterEvents(handle, eventTypes, eventSet): + fn = _nvmlGetFunctionPointer("nvmlDeviceRegisterEvents") + ret = fn(handle, c_ulonglong(eventTypes), eventSet) + _nvmlCheckReturn(ret) + return None + +# Added in 2.285 +def nvmlDeviceGetSupportedEventTypes(handle): + c_eventTypes = c_ulonglong() + fn = _nvmlGetFunctionPointer("nvmlDeviceGetSupportedEventTypes") + ret = fn(handle, byref(c_eventTypes)) + _nvmlCheckReturn(ret) + return c_eventTypes.value + +# Added in 2.285 +# raises NVML_ERROR_TIMEOUT exception on timeout +def nvmlEventSetWait(eventSet, timeoutms): + fn = _nvmlGetFunctionPointer("nvmlEventSetWait") + data = c_nvmlEventData_t() + ret = fn(eventSet, byref(data), c_uint(timeoutms)) + _nvmlCheckReturn(ret) + return data + +# Added in 2.285 +def nvmlEventSetFree(eventSet): + fn = _nvmlGetFunctionPointer("nvmlEventSetFree") + ret = fn(eventSet) + _nvmlCheckReturn(ret) + return None + +# Added in 2.285 +def nvmlEventDataGetPerformanceState(data): + fn = _nvmlGetFunctionPointer("nvmlEventDataGetPerformanceState") + pstate = _nvmlPstates_t() + ret = fn(byref(data), byref(pstate)) + _nvmlCheckReturn(ret) + return pstate.value + +# Added in 2.285 +def nvmlEventDataGetXidCriticalError(data): + fn = _nvmlGetFunctionPointer("nvmlEventDataGetXidCriticalError") + xid = c_uint() + ret = fn(byref(data), byref(xid)) + _nvmlCheckReturn(ret) + return xid.value + +# Added in 2.285 +def nvmlEventDataGetEccErrorCount(data): + fn = _nvmlGetFunctionPointer("nvmlEventDataGetEccErrorCount") + ecc = c_ulonglong() + ret = fn(byref(data), byref(ecc)) + _nvmlCheckReturn(ret) + return ecc.value + +# Added in 
3.295 +def nvmlDeviceOnSameBoard(handle1, handle2): + fn = _nvmlGetFunctionPointer("nvmlDeviceOnSameBoard") + onSameBoard = c_int() + ret = fn(handle1, handle2, byref(onSameBoard)) + _nvmlCheckReturn(ret) + return (onSameBoard.value != 0) + +# Added in 3.295 +def nvmlDeviceGetCurrPcieLinkGeneration(handle): + fn = _nvmlGetFunctionPointer("nvmlDeviceGetCurrPcieLinkGeneration") + gen = c_uint() + ret = fn(handle, byref(gen)) + _nvmlCheckReturn(ret) + return gen.value + +# Added in 3.295 +def nvmlDeviceGetMaxPcieLinkGeneration(handle): + fn = _nvmlGetFunctionPointer("nvmlDeviceGetMaxPcieLinkGeneration") + gen = c_uint() + ret = fn(handle, byref(gen)) + _nvmlCheckReturn(ret) + return gen.value + +# Added in 3.295 +def nvmlDeviceGetCurrPcieLinkWidth(handle): + fn = _nvmlGetFunctionPointer("nvmlDeviceGetCurrPcieLinkWidth") + width = c_uint() + ret = fn(handle, byref(width)) + _nvmlCheckReturn(ret) + return width.value + +# Added in 3.295 +def nvmlDeviceGetMaxPcieLinkWidth(handle): + fn = _nvmlGetFunctionPointer("nvmlDeviceGetMaxPcieLinkWidth") + width = c_uint() + ret = fn(handle, byref(width)) + _nvmlCheckReturn(ret) + return width.value + + + diff --git a/gpu/nvidia/nvidia-ml-py-3.295.00/setup.py b/gpu/nvidia/nvidia-ml-py-3.295.00/setup.py new file mode 100644 index 00000000..ab1eddee --- /dev/null +++ b/gpu/nvidia/nvidia-ml-py-3.295.00/setup.py @@ -0,0 +1,32 @@ +from distutils.core import setup +from sys import version + +# earlier versions don't support all classifiers +if version < '2.2.3': + from distutils.dist import DistributionMetadata + DistributionMetadata.classifiers = None + DistributionMetadata.download_url = None + +setup(name='nvidia-ml-py', + version='3.295.00', + description='Python Bindings for the NVIDIA Management Library', + py_modules=['pynvml', 'nvidia_smi'], + package_data=['Example.txt'], + license="BSD", + url="http://www.nvidia.com/", + author="NVIDIA Corporation", + author_email="nvml-bindings@nvidia.com", + classifiers=[ + 'Development Status :: 5 - Production/Stable', + 'Intended Audience :: Developers', + 'Intended Audience :: System Administrators', + 'License :: OSI Approved :: BSD License', + 'Operating System :: Microsoft :: Windows', + 'Operating System :: POSIX :: Linux', + 'Programming Language :: Python', + 'Topic :: Software Development :: Libraries :: Python Modules', + 'Topic :: System :: Hardware', + 'Topic :: System :: Systems Administration', + ], + ) + From 831d8cb6294d6030c3a714acb9a0c0afb35c65df Mon Sep 17 00:00:00 2001 From: Robert Alexander Date: Mon, 7 May 2012 15:12:29 -0700 Subject: [PATCH 02/39] Adding r295 features to the reporting end. 
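
A quick way to eyeball the values behind the new metrics (a sketch,
assuming the bundled 3.295 bindings are installed and a GPU exists at
index 0):

    >>> from pynvml import *
    >>> nvmlInit()
    >>> handle = nvmlDeviceGetHandleByIndex(0)
    >>> nvmlDeviceGetMaxClockInfo(handle, NVML_CLOCK_SM)    # MHz
    >>> nvmlDeviceGetPowerManagementLimit(handle)           # milliwatts
    >>> nvmlShutdown()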
No GUI changes --- gpu/nvidia/README | 15 ++++++++- gpu/nvidia/conf.d/nvidia.pyconf | 51 +++++++++++++++++++++++++++-- gpu/nvidia/python_modules/nvidia.py | 36 +++++++++++++++++--- 3 files changed, 93 insertions(+), 9 deletions(-) diff --git a/gpu/nvidia/README b/gpu/nvidia/README index e7776a7e..ab3d2be1 100644 --- a/gpu/nvidia/README +++ b/gpu/nvidia/README @@ -3,7 +3,9 @@ NVIDIA GPU monitoring plugin for gmond Installation instructions: * First install the Python Bindings for the NVIDIA Management Library: - http://pypi.python.org/pypi/nvidia-ml-py/ + $ cd nvidia-ml-py-* + $ sudo python setup.py install + For the latest bindings see: http://pypi.python.org/pypi/nvidia-ml-py/ You can do a site install or place it in {libdir}/ganglia/python_modules * Copy python_modules/nvidia.py to {libdir}/ganglia/python_modules * Copy conf.d/nvidia.pyconf to /etc/ganglia/conf.d @@ -34,3 +36,14 @@ The following metrics have been implemented: * gpu_power_usage * gpu_power_state * gpu_ecc_mode + +Version 2: + +The following metrics have been implemented: +* gpu_max_graphics_speed +* gpu_max_sm_speed +* gpu_max_mem_speed +* gpu_serial +* gpu_power_man_mode +* gpu_power_man_limit + diff --git a/gpu/nvidia/conf.d/nvidia.pyconf b/gpu/nvidia/conf.d/nvidia.pyconf index 4eaf8f01..08a4feb6 100644 --- a/gpu/nvidia/conf.d/nvidia.pyconf +++ b/gpu/nvidia/conf.d/nvidia.pyconf @@ -73,11 +73,16 @@ collection_group { } metric { - name_match = "([\\S]+)_power_state" - name = "\\1_power_state" - title= "\\1 Power State" + name_match = "([\\S]+)_performance_state" + name = "\\1_performance_state" + title= "\\1 Performance State" value_threshold = 1.0 } +} + +collection_group { + collect_every = 600 + time_threshold = 1200 metric { name_match = "([\\S]+)_ecc_mode" @@ -85,6 +90,21 @@ collection_group { title= "\\1 ECC Mode" value_threshold = 1.0 } + + metric { + name_match = "([\\S]+)_power_man_mode" + name = "\\1_power_man_mode" + title= "\\1 Power Management Mode" + value_threshold = 1.0 + } + + metric { + name_match = "([\\S]+)_power_man_limit" + name = "\\1_power_man_limit" + title= "\\1 Power Management Limit" + value_threshold = 1.0 + } + } collection_group { @@ -124,4 +144,29 @@ collection_group { name = "\\1_mem_total" title = "\\1 Memory Total" } + + metric { + name_match = "([\\S]+)_max_graphics_speed" + name = "\\1_max_graphics_speed" + title = "\\1 Max Graphics Clock Speed" + } + + metric { + name_match = "([\\S]+)_max_sm_speed" + name = "\\1_max_sm_speed" + title = "\\1 Max SM Clock Speed" + } + + metric { + name_match = "([\\S]+)_max_mem_speed" + name = "\\1_max_mem_speed" + title = "\\1 Max Memory Clock Speed" + } + + metric { + name_match = "([\\S]+)_serial" + name = "\\1_serial" + title = "\\1 Board Serial Number" + } } + diff --git a/gpu/nvidia/python_modules/nvidia.py b/gpu/nvidia/python_modules/nvidia.py index dc6e7301..11bfde06 100644 --- a/gpu/nvidia/python_modules/nvidia.py +++ b/gpu/nvidia/python_modules/nvidia.py @@ -90,17 +90,17 @@ def gpu_device_handler(name): elif (metric == 'ecc_mode'): try: ecc_mode = nvmlDeviceGetPendingEccMode(gpu_device) - if (ecc_mode == 0): + if (NVML_FEATURE_DISABLED == ecc_mode): return "OFF" - elif (ecc_mode == 1): + elif (NVML_FEATURE_ENABLED == ecc_mode): return "ON" else: return "UNKNOWN" except NVMLError, nvmlError: if NVML_ERROR_NOT_SUPPORTED == nvmlError.value: return 'N/A' - elif (metric == 'power_state'): - state = nvmlDeviceGetPowerState(gpu_device) + elif (metric == 'performance_state'): + state = nvmlDeviceGetPerformanceState(gpu_device) try: int(state) return 
"P%s" % state @@ -112,8 +112,26 @@ def gpu_device_handler(name): return nvmlDeviceGetClockInfo(gpu_device, NVML_CLOCK_SM) elif (metric == 'mem_speed'): return nvmlDeviceGetClockInfo(gpu_device, NVML_CLOCK_MEM) + elif (metric == 'max_graphics_speed'): + return nvmlDeviceGetMaxClockInfo(gpu_device, NVML_CLOCK_GRAPHICS) + elif (metric == 'max_sm_speed'): + return nvmlDeviceGetMaxClockInfo(gpu_device, NVML_CLOCK_SM) + elif (metric == 'max_mem_speed'): + return nvmlDeviceGetMaxClockInfo(gpu_device, NVML_CLOCK_MEM) elif (metric == 'power_usage'): return nvmlDeviceGetPowerUsage(gpu_device) + elif (metric == 'serial'): + return nvmlDeviceGetSerial(gpu_device) + elif (metric == 'power_man_mode'): + pow_man_mode = nvmlDeviceGetPowerManagementMode(gpu_device) + if (NVML_FEATURE_DISABLED == pow_man_mode): + return "OFF" + elif (NVML_FEATURE_ENABLED == pow_man_mode): + return "ON" + else: + return "UNKNOWN" + elif (metric == 'power_man_limit'): + return nvmlDeviceGetPowerManagementLimit(gpu_device) else: print "Handler for %s not implemented, please fix in gpu_device_handler()" % metric os._exit(1) @@ -144,12 +162,20 @@ def metric_init(params): build_descriptor('gpu%s_mem_total' % i, gpu_device_handler, default_time_max, 'uint', 'KB', 'zero', '%u', 'GPU%s Total Memory' %i, 'gpu') build_descriptor('gpu%s_mem_used' % i, gpu_device_handler, default_time_max, 'uint', 'KB', 'both', '%u', 'GPU%s Used Memory' %i, 'gpu') build_descriptor('gpu%s_ecc_mode' % i, gpu_device_handler, default_time_max, 'string', '', 'zero', '%s', 'GPU%s ECC Mode' %i, 'gpu') - build_descriptor('gpu%s_power_state' % i, gpu_device_handler, default_time_max, 'string', '', 'zero', '%s', 'GPU%s Power State' %i, 'gpu') + build_descriptor('gpu%s_performance_state' % i, gpu_device_handler, default_time_max, 'string', '', 'zero', '%s', 'GPU%s Performance State' %i, 'gpu') build_descriptor('gpu%s_util' % i, gpu_device_handler, default_time_max, 'uint', '%', 'both', '%u', 'GPU%s Utilization' %i, 'gpu') build_descriptor('gpu%s_mem_util' % i, gpu_device_handler, default_time_max, 'uint', '%', 'both', '%u', 'GPU%s Memory Utilization' %i, 'gpu') build_descriptor('gpu%s_fan' % i, gpu_device_handler, default_time_max, 'uint', '%', 'both', '%u', 'GPU%s Fan Speed' %i, 'gpu') build_descriptor('gpu%s_power_usage' % i, gpu_device_handler, default_time_max, 'uint', 'watts', 'both', '%u', 'GPU%s Power Usage' % i, 'gpu') + # Added for version 2.285 + build_descriptor('gpu%s_max_graphics_speed' % i, gpu_device_handler, default_time_max, 'uint', 'MHz', 'both', '%u', 'GPU%s Max Graphics Speed' % i, 'gpu') + build_descriptor('gpu%s_max_sm_speed' % i, gpu_device_handler, default_time_max, 'uint', 'MHz', 'both', '%u', 'GPU%s Max SM Speed' % i, 'gpu') + build_descriptor('gpu%s_max_mem_speed' % i, gpu_device_handler, default_time_max, 'uint', 'MHz', 'both', '%u', 'GPU%s Max Memory Speed' % i, 'gpu') + build_descriptor('gpu%s_serial' % i, gpu_device_handler, default_time_max, 'string', '', 'zero', '%s', 'GPU%s Serial' % i, 'gpu') + build_descriptor('gpu%s_power_man_mode' % i, gpu_device_handler, default_time_max, 'string', '', 'zero', '%s', 'GPU%s Power Management' % i, 'gpu') + build_descriptor('gpu%s_power_man_limit' % i, gpu_device_handler, default_time_max, 'string', '', 'zero', '%s', 'GPU%s Power Management Limit' % i, 'gpu') + return descriptors def metric_cleanup(): From 06331201d932cc96857410218b800578510fb1af Mon Sep 17 00:00:00 2001 From: Greg Rice Date: Thu, 5 Jul 2012 23:34:45 +0000 Subject: [PATCH 03/39] Changed rabbitmq module to allow multiple vhosts 
--- rabbit/python_modules/rabbitmq.py | 133 ++++++++++++++++++++---------- 1 file changed, 91 insertions(+), 42 deletions(-) diff --git a/rabbit/python_modules/rabbitmq.py b/rabbit/python_modules/rabbitmq.py index 52d1b9ef..af36917d 100644 --- a/rabbit/python_modules/rabbitmq.py +++ b/rabbit/python_modules/rabbitmq.py @@ -5,20 +5,23 @@ import urllib import time from string import Template +import itertools global url, descriptors, last_update, vhost, username, password, url_template, result, result_dict, keyToPath INTERVAL = 20 descriptors = list() username, password = "guest", "guest" stats = {} -last_update = {} +keyToPath = {} +last_update = None +#last_update = {} compiled_results = {"nodes" : None, "queues" : None, "connections" : None} #Make initial stat test time dict -for stat_type in ('queues', 'connections','exchanges', 'nodes'): - last_update[stat_type] = None - -keyToPath = {} +#for stat_type in ('queues', 'connections','exchanges', 'nodes'): +# last_update[stat_type] = None +### CONFIGURATION SECTION ### +STATS = ['nodes', 'queues'] # QUEUE METRICS # keyToPath['rmq_messages_ready'] = "%s.messages_ready" @@ -71,41 +74,63 @@ def dig_it_up(obj,path): print "Exception" return False +def refreshStats(stats = ('nodes', 'queues'), vhosts = ['/']): + + global url_template + global last_update, url, compiled_results + + now = time.time() + + if not last_update: + diff = INTERVAL + else: + diff = now - last_update + + if diff >= INTERVAL or not last_update: + print "Fetching Results after %d seconds" % INTERVAL + last_update = now + for stat in stats: + for vhost in vhosts: + result_dict = {} + urlstring = url_template.safe_substitute(stats = stat, vhost = vhost) + result = json.load(urllib.urlopen(urlstring) + # Rearrange results so entry is held in a dict keyed by name - queue name, host name, etc. + if group in ('queues', 'nodes', 'exchanges'): + for entry in result: + name = entry['name'] + result_dict[name] = entry + compiled_results[(stat, vhost)] = result_dict + + return compiled_results + def refreshGroup(group): global url_template - urlstring = url_template.safe_substitute(stats = group) + urlstring = url_template.safe_substitute(stats = group, vhost = vhost) global last_update, url, compiled_results now = time.time() - if not last_update[group]: + if not last_update[(group, vhost)]: diff = INTERVAL else: - diff = now - last_update[group] + diff = now - last_update[(group, vhost)] - if diff >= INTERVAL or not last_update[group]: + if diff >= INTERVAL or not last_update[(group, vhost)]: result_dict = {} print "Fetching stats after %d seconds" % INTERVAL result = json.load(urllib.urlopen(urlstring)) - compiled_results[group] = result - last_update[group] = now + compiled_results[(group, vhost)] = result + last_update[(group, vhost)] = now #Refresh dict by names. We'll probably move this elsewhere. 
if group in ('queues', 'nodes'): for entry in result: name_attribute = entry['name'] result_dict[name_attribute] = entry - compiled_results[group] = result_dict + compiled_results[(group,vhost)] = result_dict - return compiled_results[group] - -def getConnectionTotal(name): - result = refreshGroup('connections') - return result.length() - -def getConnectionStats(name): - pass + return compiled_results[(group, vhost)] def validatedResult(value): if not isInstance(value, bool): @@ -113,27 +138,35 @@ def validatedResult(value): else: return None -def list_queues(): +def list_queues(vhost): # Make a list of queues - results = refreshGroup('queues') + results = refreshGroup('queues', vhost = vhost) return results.keys() -def list_nodes(): - results = refreshGroup('nodes') - return results.keys() +def list_queues(vhost): + global compiled_results + queues = compiled_results[('queues', vhost)].keys() + return queues + +def list_nodes(vhost): + global compiled_results + nodes = compiled_results[('nodes', vhost)].keys() + return nodes def getQueueStat(name): #Split a name like "rmq_backing_queue_ack_egress_rate.access" #handle queue names with . in them - split_name = name.split(".") + split_name, vhost = name.split("#") + split_name = split_name.split(".") stat_name = split_name[0] queue_name = ".".join(split_name[1:]) - result = refreshGroup('queues') + # Run refreshStats to get the result object + result = refreshStats(('queues', vhost)) value = dig_it_up(result, keyToPath[stat_name] % queue_name) - print name, value + print name, values #Convert Booleans if value is True: @@ -145,9 +178,11 @@ def getQueueStat(name): def getNodeStat(name): #Split a name like "rmq_backing_queue_ack_egress_rate.access" - stat_name, node_name = name.split(".") - result = refreshGroup('nodes') + stat_name = name.split(".")[0] + node_name, vhost = name.split(".")[1].split("#") + result = refreshStats(('nodes', vhost)) value = dig_it_up(result, keyToPath[stat_name] % node_name) + print name,value #Convert Booleans if value is True: @@ -156,24 +191,38 @@ def getNodeStat(name): value = 0 return float(value) + +def product(*args, **kwds): + # replacement for itertools.product + # product('ABCD', 'xy') --> Ax Ay Bx By Cx Cy Dx Dy + pools = map(tuple, args) * kwds.get('repeat', 1) + result = [[]] + for pool in pools: + result = [x+[y] for x in result for y in pool] + for prod in result: + yield tuple(prod) def metric_init(params): ''' Create the metric definition object ''' - global descriptors, stats, vhost, username, password, urlstring, url_template, compiled_results + global descriptors, stats, vhost, username, password, urlstring, url_template, compiled_results, STATS print 'received the following params:' #Set this globally so we can refresh stats if 'host' not in params: params['host'], params['vhost'],params['username'],params['password'] = "localhost", "/", "guest", "guest" - vhost = params['vhost'] + + # Set the vhosts as a list split from params + vhosts = params['vhost'].split(',') username, password = params['username'], params['password'] host = params['host'] - url = 'http://%s:%s@%s:55672/api/$stats' % (username, password, host) + url = 'http://%s:%s@%s:55672/api/$stats/$vhost' % (username, password, host) url_template = Template(url) print params - refreshGroup("nodes") - refreshGroup("queues") + refreshStats(stats = STATS, vhosts = vhosts) + + refreshGroup("nodes", vhost = vhost) + refreshGroup("queues", vhost = vhost) def create_desc(prop): d = { @@ -194,9 +243,10 @@ def create_desc(prop): def 
buildQueueDescriptors(): - for queue in list_queues(): - for metric in QUEUE_METRICS: - name = "%s.%s" % (metric, queue) + for vhost, metric in product(vhosts, QUEUE_METRICS): + queues = list_queues(vhost) + for queue in queues: + name = "%s.%s#%s" % (metric, queue, vhost) print name d1 = create_desc({'name': name.encode('ascii','ignore'), 'call_back': getQueueStat, @@ -210,10 +260,9 @@ def buildQueueDescriptors(): descriptors.append(d1) def buildNodeDescriptors(): - for node in list_nodes(): - #node = node.split('@')[0] - for stat in NODE_METRICS: - name = '%s.%s' % (stat, node) + for vhost, metric in product(vhosts, NODE_METRICS) + for node in list_nodes(): + name = '%s.%s#%s' % (stat, node, vhost) print name d2 = create_desc({'name': name.encode('ascii','ignore'), 'call_back': getNodeStat, From 39354b4612f1c661f870cfa82482e68978eac5f1 Mon Sep 17 00:00:00 2001 From: Greg Rice Date: Thu, 5 Jul 2012 23:57:11 +0000 Subject: [PATCH 04/39] Testing code for multiple vhosts --- rabbit/python_modules/rabbitmq.py | 36 ++++++++++++------------------- 1 file changed, 14 insertions(+), 22 deletions(-) diff --git a/rabbit/python_modules/rabbitmq.py b/rabbit/python_modules/rabbitmq.py index af36917d..dd389ab5 100644 --- a/rabbit/python_modules/rabbitmq.py +++ b/rabbit/python_modules/rabbitmq.py @@ -93,9 +93,9 @@ def refreshStats(stats = ('nodes', 'queues'), vhosts = ['/']): for vhost in vhosts: result_dict = {} urlstring = url_template.safe_substitute(stats = stat, vhost = vhost) - result = json.load(urllib.urlopen(urlstring) + result = json.load(urllib.urlopen(urlstring)) # Rearrange results so entry is held in a dict keyed by name - queue name, host name, etc. - if group in ('queues', 'nodes', 'exchanges'): + if stat in ("queues", "nodes", "exchanges"): for entry in result: name = entry['name'] result_dict[name] = entry @@ -104,7 +104,7 @@ def refreshStats(stats = ('nodes', 'queues'), vhosts = ['/']): return compiled_results def refreshGroup(group): - + # No longer in use in the multiple_vhosts version global url_template urlstring = url_template.safe_substitute(stats = group, vhost = vhost) @@ -138,11 +138,6 @@ def validatedResult(value): else: return None -def list_queues(vhost): - # Make a list of queues - results = refreshGroup('queues', vhost = vhost) - return results.keys() - def list_queues(vhost): global compiled_results queues = compiled_results[('queues', vhost)].keys() @@ -157,16 +152,17 @@ def getQueueStat(name): #Split a name like "rmq_backing_queue_ack_egress_rate.access" #handle queue names with . 
in them + print name split_name, vhost = name.split("#") split_name = split_name.split(".") stat_name = split_name[0] queue_name = ".".join(split_name[1:]) # Run refreshStats to get the result object - result = refreshStats(('queues', vhost)) + result = compiled_results[('queues', vhost)] value = dig_it_up(result, keyToPath[stat_name] % queue_name) - print name, values + print name, value #Convert Booleans if value is True: @@ -180,7 +176,7 @@ def getNodeStat(name): #Split a name like "rmq_backing_queue_ack_egress_rate.access" stat_name = name.split(".")[0] node_name, vhost = name.split(".")[1].split("#") - result = refreshStats(('nodes', vhost)) + result = compiled_results[('nodes', vhost)] value = dig_it_up(result, keyToPath[stat_name] % node_name) print name,value @@ -221,9 +217,6 @@ def metric_init(params): refreshStats(stats = STATS, vhosts = vhosts) - refreshGroup("nodes", vhost = vhost) - refreshGroup("queues", vhost = vhost) - def create_desc(prop): d = { 'name' : 'XXX', @@ -260,9 +253,9 @@ def buildQueueDescriptors(): descriptors.append(d1) def buildNodeDescriptors(): - for vhost, metric in product(vhosts, NODE_METRICS) - for node in list_nodes(): - name = '%s.%s#%s' % (stat, node, vhost) + for vhost, metric in product(vhosts, NODE_METRICS): + for node in list_nodes(vhost): + name = '%s.%s#%s' % (metric, node, vhost) print name d2 = create_desc({'name': name.encode('ascii','ignore'), 'call_back': getNodeStat, @@ -290,9 +283,8 @@ def metric_cleanup(): url_template = Template(url) parameters = {"vhost":"/", "username":"guest","password":"guest", "metric_group":"rabbitmq"} metric_init(parameters) - result = refreshGroup('queues') - node_result = refreshGroup('nodes') + result = refreshStats(stats = ('queues', 'nodes'), vhosts = ('/')) print '***'*10 - getQueueStat('rmq_backing_queue_ack_egress_rate.gelf_client_three') - getNodeStat('rmq_disk_free.rmqtwo@inrmq02d1') - getNodeStat('rmq_mem_used.rmqtwo@inrmq02d1') + getQueueStat('rmq_backing_queue_ack_egress_rate.gelf_client_three#/') + getNodeStat('rmq_disk_free.rmqtwo@inrmq02d1#/') + getNodeStat('rmq_mem_used.rmqtwo@inrmq02d1#/') From 8f040d1549b54bd56880842dcdfaa2f5ee6f2b9d Mon Sep 17 00:00:00 2001 From: Greg Rice Date: Thu, 5 Jul 2012 23:58:07 +0000 Subject: [PATCH 05/39] Testing code for multiple vhosts - removed refreshGroup --- rabbit/python_modules/rabbitmq.py | 29 ----------------------------- 1 file changed, 29 deletions(-) diff --git a/rabbit/python_modules/rabbitmq.py b/rabbit/python_modules/rabbitmq.py index dd389ab5..527e1c1e 100644 --- a/rabbit/python_modules/rabbitmq.py +++ b/rabbit/python_modules/rabbitmq.py @@ -103,35 +103,6 @@ def refreshStats(stats = ('nodes', 'queues'), vhosts = ['/']): return compiled_results -def refreshGroup(group): - # No longer in use in the multiple_vhosts version - - global url_template - urlstring = url_template.safe_substitute(stats = group, vhost = vhost) - - global last_update, url, compiled_results - - now = time.time() - if not last_update[(group, vhost)]: - diff = INTERVAL - else: - diff = now - last_update[(group, vhost)] - - if diff >= INTERVAL or not last_update[(group, vhost)]: - result_dict = {} - print "Fetching stats after %d seconds" % INTERVAL - result = json.load(urllib.urlopen(urlstring)) - compiled_results[(group, vhost)] = result - last_update[(group, vhost)] = now - #Refresh dict by names. We'll probably move this elsewhere. 
- if group in ('queues', 'nodes'): - for entry in result: - name_attribute = entry['name'] - result_dict[name_attribute] = entry - compiled_results[(group,vhost)] = result_dict - - return compiled_results[(group, vhost)] - def validatedResult(value): if not isInstance(value, bool): return float(value) From 5ebe62dae5d657c7f5ed6c7f224d22906b9305dc Mon Sep 17 00:00:00 2001 From: Greg Rice Date: Mon, 9 Jul 2012 18:59:13 +0000 Subject: [PATCH 06/39] RabbitMQ module now supports multiple vhosts --- rabbit/conf.d/rabbitmq.pyconf | 2 +- rabbit/python_modules/rabbitmq.py | 15 +++++++++------ 2 files changed, 10 insertions(+), 7 deletions(-) diff --git a/rabbit/conf.d/rabbitmq.pyconf b/rabbit/conf.d/rabbitmq.pyconf index 76d44048..d7abefbb 100644 --- a/rabbit/conf.d/rabbitmq.pyconf +++ b/rabbit/conf.d/rabbitmq.pyconf @@ -13,7 +13,7 @@ modules { } param vhost { - value = "/" + value = "/,meow" } param username { value = "guest" diff --git a/rabbit/python_modules/rabbitmq.py b/rabbit/python_modules/rabbitmq.py index 527e1c1e..2bdccbb5 100644 --- a/rabbit/python_modules/rabbitmq.py +++ b/rabbit/python_modules/rabbitmq.py @@ -91,8 +91,11 @@ def refreshStats(stats = ('nodes', 'queues'), vhosts = ['/']): last_update = now for stat in stats: for vhost in vhosts: + if stat in ('nodes'): + vhost = '/' result_dict = {} urlstring = url_template.safe_substitute(stats = stat, vhost = vhost) + print urlstring result = json.load(urllib.urlopen(urlstring)) # Rearrange results so entry is held in a dict keyed by name - queue name, host name, etc. if stat in ("queues", "nodes", "exchanges"): @@ -114,9 +117,9 @@ def list_queues(vhost): queues = compiled_results[('queues', vhost)].keys() return queues -def list_nodes(vhost): +def list_nodes(): global compiled_results - nodes = compiled_results[('nodes', vhost)].keys() + nodes = compiled_results[('nodes', '/')].keys() return nodes def getQueueStat(name): @@ -147,7 +150,7 @@ def getNodeStat(name): #Split a name like "rmq_backing_queue_ack_egress_rate.access" stat_name = name.split(".")[0] node_name, vhost = name.split(".")[1].split("#") - result = compiled_results[('nodes', vhost)] + result = compiled_results[('nodes', '/')] value = dig_it_up(result, keyToPath[stat_name] % node_name) print name,value @@ -224,9 +227,9 @@ def buildQueueDescriptors(): descriptors.append(d1) def buildNodeDescriptors(): - for vhost, metric in product(vhosts, NODE_METRICS): - for node in list_nodes(vhost): - name = '%s.%s#%s' % (metric, node, vhost) + for metric in NODE_METRICS: + for node in list_nodes(): + name = '%s.%s#%s' % (metric, node, '/') print name d2 = create_desc({'name': name.encode('ascii','ignore'), 'call_back': getNodeStat, From aea3cfbce534a88752a5765db522be8a108150a4 Mon Sep 17 00:00:00 2001 From: Greg Rice Date: Mon, 9 Jul 2012 19:06:33 +0000 Subject: [PATCH 07/39] Changed sample pyconf file --- rabbit/conf.d/rabbitmq.pyconf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/rabbit/conf.d/rabbitmq.pyconf b/rabbit/conf.d/rabbitmq.pyconf index d7abefbb..f0cee349 100644 --- a/rabbit/conf.d/rabbitmq.pyconf +++ b/rabbit/conf.d/rabbitmq.pyconf @@ -13,7 +13,7 @@ modules { } param vhost { - value = "/,meow" + value = "/,vhost1,vhost2" } param username { value = "guest" From 92c400c5dd39c9549d392cb0e98d97e02890ca3d Mon Sep 17 00:00:00 2001 From: Greg Rice Date: Mon, 9 Jul 2012 19:09:16 +0000 Subject: [PATCH 08/39] Modified README. 
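
The comma-separated vhost list documented by this change matches the
vhost parameter in conf.d/rabbitmq.pyconf, e.g.:

    param vhost {
      value = "/,vhost1,vhost2"
    }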
---
 rabbit/README.mkdn | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/rabbit/README.mkdn b/rabbit/README.mkdn
index 50594517..5a468135 100644
--- a/rabbit/README.mkdn
+++ b/rabbit/README.mkdn
@@ -7,10 +7,14 @@ python module for ganglia 3.1.
 
 http://(node-ip):55672/api/queues (or nodes)
 
-This module requires simplejson, or if using a 2.6 interpreter with mod_python, json. Modify accordingly.
+This module requires simplejson, or if using a 2.6 interpreter with mod_python, the json module. Modify accordingly.
 
 The digItUp function, and the keyToPath syntax, were borrowed from the ElasticSearch module.
 
+To use multiple vhosts, separate them with commas in the vhost parameter of rabbitmq.pyconf.
+
+To get metrics besides nodes or queues, look at how buildQueueDescriptors and buildNodeDescriptors are set up, add a new descriptor builder (adjusting STATS at the top of the python file) and contribute the changes; or ask for my assistance and I'll see what I can do.
+
 ## AUTHORS
 
 Gregory Rice

From 1a6a2d843480c51b11fdd8cb0f3dc7c89c8ead81 Mon Sep 17 00:00:00 2001
From: Greg Rice
Date: Mon, 9 Jul 2012 19:11:32 +0000
Subject: [PATCH 09/39] Modified README.

---
 rabbit/README.mkdn | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/rabbit/README.mkdn b/rabbit/README.mkdn
index 5a468135..a785546e 100644
--- a/rabbit/README.mkdn
+++ b/rabbit/README.mkdn
@@ -5,7 +5,9 @@ python module for ganglia 3.1.
 
 "rabbit" sends metrics on RabbitMQ nodes using the stats api. It is based off the very similar ElasticSearch module.
 
-http://(node-ip):55672/api/queues (or nodes)
+http://(node-ip):55672/api/queues (or nodes, or exchanges)
+
+Please see http://hg.rabbitmq.com/rabbitmq-management/raw-file/rabbitmq_v2_7_1/priv/www/api/index.html for more info on the management API. That's a good place to start if you want to extend this module and include new metrics.
 
 This module requires simplejson, or if using a 2.6 interpreter with mod_python, the json module. Modify accordingly.
 

From e4c8a9688e02b22fb6bb8fdd2d06a7b5c5a0833f Mon Sep 17 00:00:00 2001
From: Andreas Lappe
Date: Thu, 26 Jul 2012 15:14:31 +0200
Subject: [PATCH 10/39] Add couchdb module.

---
 couchdb/README.mkdn               |  50 +++++
 couchdb/conf.d/couchdb.pyconf     | 207 +++++++++++++++++++
 couchdb/python_modules/couchdb.py | 321 ++++++++++++++++++++++++++++++
 3 files changed, 578 insertions(+)
 create mode 100644 couchdb/README.mkdn
 create mode 100644 couchdb/conf.d/couchdb.pyconf
 create mode 100644 couchdb/python_modules/couchdb.py

diff --git a/couchdb/README.mkdn b/couchdb/README.mkdn
new file mode 100644
index 00000000..6d15f6a6
--- /dev/null
+++ b/couchdb/README.mkdn
@@ -0,0 +1,50 @@
+couchdb
+=======
+
+python module for ganglia 3.1.
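+
+The module polls CouchDB's _stats endpoint; to inspect the raw JSON it
+parses, try e.g. (assuming the default URL):
+
+    curl 'http://127.0.0.1:5984/_stats?range=60'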
+
+## Metrics
+ * Number of authentication cache hits
+ * Number of authentication cache misses
+ * Number of times a document was read from a database
+ * Number of times a document was changed
+ * Number of open databases
+ * Number of file descriptors CouchDB has open
+ * Request time
+ * Number of bulk requests
+ * Number of clients for continuous _changes
+ * Number of HTTP requests
+ * Number of temporary view reads
+ * Number of view reads
+ * Number of HTTP COPY requests
+ * Number of HTTP DELETE requests
+ * Number of HTTP GET requests
+ * Number of HTTP HEAD requests
+ * Number of HTTP POST requests
+ * Number of HTTP PUT requests
+ * Number of HTTP 200 OK responses
+ * Number of HTTP 201 Created responses
+ * Number of HTTP 202 Accepted responses
+ * Number of HTTP 301 Moved Permanently responses
+ * Number of HTTP 304 Not Modified responses
+ * Number of HTTP 400 Bad Request responses
+ * Number of HTTP 401 Unauthorized responses
+ * Number of HTTP 403 Forbidden responses
+ * Number of HTTP 404 Not Found responses
+ * Number of HTTP 405 Method Not Allowed responses
+ * Number of HTTP 409 Conflict responses
+ * Number of HTTP 412 Precondition Failed responses
+ * Number of HTTP 500 Internal Server Error responses
+
+## Parameters
+ * stats_url (The URL to query for CouchDB _stats. Default: 'http://127.0.0.1:5984/_stats')
+ * refresh_rate (The time in seconds between polling the stats. Either 60, 300 or 900. Default: 60)
+
+## Notes
+ * This has been tested with:
+   - python 2.7.1 on Mac OS X
+   - python 2.7.3 on Ubuntu 12.04
+
+## AUTHORS
+
+Andreas Lappe
diff --git a/couchdb/conf.d/couchdb.pyconf b/couchdb/conf.d/couchdb.pyconf
new file mode 100644
index 00000000..3463bcee
--- /dev/null
+++ b/couchdb/conf.d/couchdb.pyconf
@@ -0,0 +1,207 @@
+#
+
+modules {
+  module {
+    name = 'couchdb'
+    language = 'python'
+
+    param stats_url {
+      value = 'http://localhost:5984/_stats'
+    }
+
+    param refresh_rate {
+      value = '60'
+    }
+  }
+}
+
+collection_group {
+  collect_every = 10
+  time_threshold = 20
+
+  metric {
+    name = 'couchdb_couchdb_auth_cache_hits'
+    title = 'Number of authentication cache hits'
+    value_threshold = 1.0
+  }
+
+  metric {
+    name = 'couchdb_couchdb_auth_cache_misses'
+    title = 'Number of authentication cache misses'
+    value_threshold = 1.0
+  }
+
+  metric {
+    name = 'couchdb_couchdb_database_reads'
+    title = 'Number of times a document was read from a database'
+    value_threshold = 1.0
+  }
+
+  metric {
+    name = 'couchdb_couchdb_database_writes'
+    title = 'Number of times a document was changed'
+    value_threshold = 1.0
+  }
+
+  metric {
+    name = 'couchdb_couchdb_open_databases'
+    title = 'Number of open databases'
+    value_threshold = 1.0
+  }
+
+  metric {
+    name = 'couchdb_couchdb_open_os_files'
+    title = 'Number of file descriptors CouchDB has open'
+    value_threshold = 1.0
+  }
+
+  metric {
+    name = 'couchdb_couchdb_request_time'
+    title = 'Request Time'
+    value_threshold = 1.0
+  }
+
+  metric {
+    name = 'couchdb_httpd_bulk_requests'
+    title = 'Number of bulk requests'
+    value_threshold = 1.0
+  }
+
+  metric {
+    name = 'couchdb_httpd_clients_requesting_changes'
+    title = 'Number of clients for continuous _changes'
+    value_threshold = 1.0
+  }
+
+  metric {
+    name = 'couchdb_httpd_requests'
+    title = 'Number of HTTP requests'
+    value_threshold = 1.0
+  }
+
+  metric {
+    name = 'couchdb_httpd_temporary_view_reads'
+    title = 'Number of temporary view reads'
+    value_threshold = 1.0
+  }
+
+  metric {
+    name = 'couchdb_httpd_view_reads'
+    title = 'Number of view reads'
+    value_threshold = 1.0
+ } + + metric { + name = 'couchdb_httpd_request_methods_COPY' + title = 'Number of HTTP COPY requests' + value_threshold = 1.0 + } + + metric { + name = 'couchdb_httpd_request_methods_DELETE' + title = 'Number of HTTP DELETE requests' + value_threshold = 1.0 + } + + metric { + name = 'couchdb_httpd_request_methods_GET' + title = 'Number of HTTP GET requests' + value_threshold = 1.0 + } + + metric { + name = 'couchdb_httpd_request_methods_HEAD' + title = 'Number of HTTP HEAD requests' + value_threshold = 1.0 + } + + metric { + name = 'couchdb_httpd_request_methods_POST' + title = 'Number of HTTP POST requests' + value_threshold = 1.0 + } + + metric { + name = 'couchdb_httpd_request_methods_PUT' + title = 'Number of HTTP PUT requests' + value_threshold = 1.0 + } + + metric { + name = 'couchdb_httpd_status_codes_200' + title = 'Number of HTTP 200 OK responses' + value_threshold = 1.0 + } + + metric { + name = 'couchdb_httpd_status_codes_201' + title = 'Number of HTTP 201 Created responses' + value_threshold = 1.0 + } + + metric { + name = 'couchdb_httpd_status_codes_202' + title = 'Number of HTTP 202 Accepted responses' + value_threshold = 1.0 + } + + metric { + name = 'couchdb_httpd_status_codes_301' + title = 'Number of HTTP 301 Moved Permanently responses' + value_threshold = 1.0 + } + + metric { + name = 'couchdb_httpd_status_codes_304' + title = 'Number of HTTP 304 Not Modified responses' + value_threshold = 1.0 + } + + metric { + name = 'couchdb_httpd_status_codes_400' + title = 'Number of HTTP 400 Bad Request responses' + value_threshold = 1.0 + } + + metric { + name = 'couchdb_httpd_status_codes_401' + title = 'Number of HTTP 401 Unauthorized responses' + value_threshold = 1.0 + } + + metric { + name = 'couchdb_httpd_status_codes_403' + title = 'Number of HTTP 403 Forbidden responses' + value_threshold = 1.0 + } + + metric { + name = 'couchdb_httpd_status_codes_404' + title = 'Number of HTTP 404 Not Found responses' + value_threshold = 1.0 + } + + metric { + name = 'couchdb_httpd_status_codes_405' + title = 'Number of HTTP 405 Method Not Allowed responses' + value_threshold = 1.0 + } + + metric { + name = 'couchdb_httpd_status_codes_409' + title = 'Number of HTTP 409 Conflict responses' + value_threshold = 1.0 + } + + metric { + name = 'couchdb_httpd_status_codes_412' + title = 'Number of HTTP 412 Precondition Failed responses' + value_threshold = 1.0 + } + + metric { + name = 'couchdb_httpd_status_codes_500' + title = 'Number of HTTP 500 Internal Server Error responses' + value_threshold = 1.0 + } +} diff --git a/couchdb/python_modules/couchdb.py b/couchdb/python_modules/couchdb.py new file mode 100644 index 00000000..a24f21f4 --- /dev/null +++ b/couchdb/python_modules/couchdb.py @@ -0,0 +1,321 @@ +### This script reports couchdb metrics to ganglia. 
+ +### License to use, modify, and distribute under the GPL +### http://www.gnu.org/licenses/gpl.txt +import logging +import os +import subprocess +import sys +import threading +import time +import traceback +import urllib2 +import json + +logging.basicConfig(level=logging.ERROR) + +_Worker_Thread = None + +class UpdateCouchdbThread(threading.Thread): + + def __init__(self, params): + threading.Thread.__init__(self) + self.running = False + self.shuttingdown = False + self.refresh_rate = int(params['refresh_rate']) + self.metrics = {} + self.settings = {} + self.stats_url = params['stats_url'] + self._metrics_lock = threading.Lock() + self._settings_lock = threading.Lock() + + def shutdown(self): + self.shuttingdown = True + if not self.running: + return + self.join() + + def run(self): + global _Lock + + self.running = True + + while not self.shuttingdown: + time.sleep(self.refresh_rate) + self.refresh_metrics() + + self.running = False + + @staticmethod + def _get_couchdb_stats(url, refresh_rate): + if refresh_rate == 60 or refresh_rate == 300 or refresh_rate == 900: + url += '?range=' + str(refresh_rate) + else: + logging.warning('The specified refresh_rate of %d is invalid and has been substituted with 60!' % refresh_rate) + url += '?range=60' + + c = urllib2.urlopen(url) + json_data = c.read() + c.close() + + data = json.loads(json_data) + couchdb = data['couchdb'] + httpd = data['httpd'] + request_methods = data['httpd_request_methods'] + status_codes = data['httpd_status_codes'] + + result = {} + for first_level_key in data: + for second_level_key in data[first_level_key]: + value = data[first_level_key][second_level_key]['current'] + if value is None: + value = 0 + else: + if second_level_key in ['open_databases', 'open_os_files', 'clients_requesting_changes']: + print second_level_key + ': ' + str(value) + value = int(value) + else: + # We need to devide by the range as couchdb provides no per second values + value = float(value) / refresh_rate + result['couchdb_' + first_level_key + '_' + second_level_key ] = value + + return result + + def refresh_metrics(self): + logging.debug('refresh metrics') + + try: + logging.debug(' opening URL: ' + str(self.stats_url)) + data = UpdateCouchdbThread._get_couchdb_stats(self.stats_url, self.refresh_rate) + except: + logging.warning('error refreshing metrics') + logging.warning(traceback.print_exc(file=sys.stdout)) + + try: + self._metrics_lock.acquire() + self.metrics = {} + for k, v in data.items(): + self.metrics[k] = v + except: + logging.warning('error refreshing metrics') + logging.warning(traceback.print_exc(file=sys.stdout)) + return False + + finally: + self._metrics_lock.release() + + if not self.metrics: + logging.warning('error refreshing metrics') + return False + + logging.debug('success refreshing metrics') + logging.debug('metrics: ' + str(self.metrics)) + + return True + + def metric_of(self, name): + logging.debug('getting metric: ' + name) + + try: + if name in self.metrics: + try: + self._metrics_lock.acquire() + logging.debug('metric: %s = %s' % (name, self.metrics[name])) + return self.metrics[name] + finally: + self._metrics_lock.release() + except: + logging.warning('failed to fetch ' + name) + return 0 + + def setting_of(self, name): + logging.debug('getting setting: ' + name) + + try: + if name in self.settings: + try: + self._settings_lock.acquire() + logging.debug('setting: %s = %s' % (name, self.settings[name])) + return self.settings[name] + finally: + self._settings_lock.release() + except: + 
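+            # On any failure fall back to 0 so gmond still gets a numeric value.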
logging.warning('failed to fetch ' + name) + return 0 + +def metric_init(params): + logging.debug('init: ' + str(params)) + global _Worker_Thread + + METRIC_DEFAULTS = { + 'units': 'requests/s', + 'groups': 'couchdb', + 'slope': 'both', + 'value_type': 'float', + 'format': '%.3f', + 'description': '', + 'call_back': metric_of + } + + descriptions = dict( + couchdb_couchdb_auth_cache_hits={ + 'units': 'hits/s', + 'description': 'Number of authentication cache hits'}, + couchdb_couchdb_auth_cache_misses={ + 'units': 'misses/s', + 'description': 'Number of authentication cache misses'}, + couchdb_couchdb_database_reads={ + 'units': 'reads/s', + 'description': 'Number of times a document was read from a database'}, + couchdb_couchdb_database_writes={ + 'units': 'writes/s', + 'description': 'Number of times a document was changed'}, + couchdb_couchdb_open_databases={ + 'value_type': 'uint', + 'format': '%d', + 'units': 'databases', + 'description': 'Number of open databases'}, + couchdb_couchdb_open_os_files={ + 'value_type': 'uint', + 'format': '%d', + 'units': 'files', + 'description': 'Number of file descriptors CouchDB has open'}, + couchdb_couchdb_request_time={ + 'units': 'ms', + 'description': 'Request time'}, + couchdb_httpd_bulk_requests={ + 'description': 'Number of bulk requests'}, + couchdb_httpd_clients_requesting_changes={ + 'value_type': 'uint', + 'format': '%d', + 'units': 'clients', + 'description': 'Number of clients for continuous _changes'}, + couchdb_httpd_requests={ + 'description': 'Number of HTTP requests'}, + couchdb_httpd_temporary_view_reads={ + 'units': 'reads', + 'description': 'Number of temporary view reads'}, + couchdb_httpd_view_reads={ + 'description': 'Number of view reads'}, + couchdb_httpd_request_methods_COPY={ + 'description': 'Number of HTTP COPY requests'}, + couchdb_httpd_request_methods_DELETE={ + 'description': 'Number of HTTP DELETE requests'}, + couchdb_httpd_request_methods_GET={ + 'description': 'Number of HTTP GET requests'}, + couchdb_httpd_request_methods_HEAD={ + 'description': 'Number of HTTP HEAD requests'}, + couchdb_httpd_request_methods_POST={ + 'description': 'Number of HTTP POST requests'}, + couchdb_httpd_request_methods_PUT={ + 'description': 'Number of HTTP PUT requests'}, + couchdb_httpd_status_codes_200={ + 'units': 'responses/s', + 'description': 'Number of HTTP 200 OK responses'}, + couchdb_httpd_status_codes_201={ + 'units': 'responses/s', + 'description': 'Number of HTTP 201 Created responses'}, + couchdb_httpd_status_codes_202={ + 'units': 'responses/s', + 'description': 'Number of HTTP 202 Accepted responses'}, + couchdb_httpd_status_codes_301={ + 'units': 'responses/s', + 'description': 'Number of HTTP 301 Moved Permanently responses'}, + couchdb_httpd_status_codes_304={ + 'units': 'responses/s', + 'description': 'Number of HTTP 304 Not Modified responses'}, + couchdb_httpd_status_codes_400={ + 'units': 'responses/s', + 'description': 'Number of HTTP 400 Bad Request responses'}, + couchdb_httpd_status_codes_401={ + 'units': 'responses/s', + 'description': 'Number of HTTP 401 Unauthorized responses'}, + couchdb_httpd_status_codes_403={ + 'units': 'responses/s', + 'description': 'Number of HTTP 403 Forbidden responses'}, + couchdb_httpd_status_codes_404={ + 'units': 'responses/s', + 'description': 'Number of HTTP 404 Not Found responses'}, + couchdb_httpd_status_codes_405={ + 'units': 'responses/s', + 'description': 'Number of HTTP 405 Method Not Allowed responses'}, + couchdb_httpd_status_codes_409={ + 'units': 'responses/s', 
+ 'description': 'Number of HTTP 409 Conflict responses'}, + couchdb_httpd_status_codes_412={ + 'units': 'responses/s', + 'description': 'Number of HTTP 412 Precondition Failed responses'}, + couchdb_httpd_status_codes_500={ + 'units': 'responses/s', + 'description': 'Number of HTTP 500 Internal Server Error responses'}) + + if _Worker_Thread is not None: + raise Exception('Worker thread already exists') + + _Worker_Thread = UpdateCouchdbThread(params) + _Worker_Thread.refresh_metrics() + _Worker_Thread.start() + + descriptors = [] + + for name, desc in descriptions.iteritems(): + d = desc.copy() + d['name'] = str(name) + [ d.setdefault(key, METRIC_DEFAULTS[key]) for key in METRIC_DEFAULTS.iterkeys() ] + descriptors.append(d) + + return descriptors + +def metric_of(name): + global _Worker_Thread + return _Worker_Thread.metric_of(name) + +def setting_of(name): + global _Worker_Thread + return _Worker_Thread.setting_of(name) + +def metric_cleanup(): + global _Worker_Thread + if _Worker_Thread is not None: + _Worker_Thread.shutdown() + logging.shutdown() + pass + +if __name__ == '__main__': + from optparse import OptionParser + + try: + logging.debug('running from the cmd line') + parser = OptionParser() + parser.add_option('-u', '--URL', dest='stats_url', default='http://127.0.0.1:5984/_stats', help='URL for couchdb stats page') + parser.add_option('-q', '--quiet', dest='quiet', action='store_true', default=False) + parser.add_option('-r', '--refresh-rate', dest='refresh_rate', default=60) + parser.add_option('-d', '--debug', dest='debug', action='store_true', default=False) + + (options, args) = parser.parse_args() + + descriptors = metric_init({ + 'stats_url': options.stats_url, + 'refresh_rate': options.refresh_rate + }) + + if options.debug: + from pprint import pprint + pprint(descriptors) + + for d in descriptors: + v = d['call_back'](d['name']) + + if not options.quiet: + print ' {0}: {1} {2} [{3}]' . format(d['name'], v, d['units'], d['description']) + + os._exit(1) + + except KeyboardInterrupt: + time.sleep(0.2) + os._exit(1) + except StandardError: + traceback.print_exc() + os._exit(1) + finally: + metric_cleanup() From d8b9c7b88bc8b1aee29530f28a3da68ba1653d0a Mon Sep 17 00:00:00 2001 From: Andreas Lappe Date: Fri, 27 Jul 2012 17:41:33 +0200 Subject: [PATCH 11/39] =?UTF-8?q?Fixes=20email=E2=80=A6?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- couchdb/README.mkdn | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/couchdb/README.mkdn b/couchdb/README.mkdn index 6d15f6a6..a1f1774a 100644 --- a/couchdb/README.mkdn +++ b/couchdb/README.mkdn @@ -47,4 +47,4 @@ python module for ganglia 3.1. ## AUTHORS -Andreas Lappe +Andreas Lappe From dc527090ac5193ea4ac50e9198ebe616f6ee9ac0 Mon Sep 17 00:00:00 2001 From: Jeff Buchbinder Date: Mon, 30 Jul 2012 12:33:29 -0400 Subject: [PATCH 12/39] Fix for RHEL/CentOS old Python versions. --- elasticsearch/python_modules/elasticsearch.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/elasticsearch/python_modules/elasticsearch.py b/elasticsearch/python_modules/elasticsearch.py index 8f0fa895..c51938d0 100755 --- a/elasticsearch/python_modules/elasticsearch.py +++ b/elasticsearch/python_modules/elasticsearch.py @@ -1,6 +1,8 @@ #! 
/usr/bin/python -import json +try: import simplejson as json +except ImportError: import json + import time import urllib From a8146c90f6b0d032d63b62cf6b3094398efde59d Mon Sep 17 00:00:00 2001 From: Andreas Lappe Date: Mon, 30 Jul 2012 19:39:48 +0200 Subject: [PATCH 13/39] Fixes spelling error of configuration key. --- php_fpm/conf.d/php_fpm.pyconf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/php_fpm/conf.d/php_fpm.pyconf b/php_fpm/conf.d/php_fpm.pyconf index 59c9e2f8..a95ead0e 100644 --- a/php_fpm/conf.d/php_fpm.pyconf +++ b/php_fpm/conf.d/php_fpm.pyconf @@ -9,7 +9,7 @@ modules { value = 'localhost' } - param port { + param ports { value = '9000' } From bd119050c77e32dc2e1c6b7a3ce3a1ec31848a3d Mon Sep 17 00:00:00 2001 From: efraser Date: Fri, 3 Aug 2012 08:59:19 +1200 Subject: [PATCH 14/39] Added EMC RecoverPoint module --- recoverpoint/README.mkdn | 18 ++++ recoverpoint/recoverpoint.py | 142 +++++++++++++++++++++++++++++++ recoverpoint/recoverpoint.pyconf | 28 ++++++ 3 files changed, 188 insertions(+) create mode 100644 recoverpoint/README.mkdn create mode 100755 recoverpoint/recoverpoint.py create mode 100644 recoverpoint/recoverpoint.pyconf diff --git a/recoverpoint/README.mkdn b/recoverpoint/README.mkdn new file mode 100644 index 00000000..2a0ba382 --- /dev/null +++ b/recoverpoint/README.mkdn @@ -0,0 +1,18 @@ +EMC RecoverPoint +=============== + +This is a GMOND Python Module that gets metrics from EMC RecoverPoint replication appliances. + +## DEPENDS + * python YAML + * paramiko modules + * ssh access to the recoverpoint appliance (paramiko can use ssh keys if required) + +## USAGE + * Save the recoverpoint.pyconf into /etc/ganglia/conf.d directory and update the management IP and sitenames (the sitenames have been lowercase'd) + * Save the recoverpoint.py into your ganglia python module dir eg: /usr/lib/ganglia/python_modules. Update the username/passwords if necessary. + * Restart gmond and a "recoverpoint" host should appear in ganglia. + +## AUTHOR + +Author: Evan Fraser <evan.fraser@trademe.co.nz> \ No newline at end of file diff --git a/recoverpoint/recoverpoint.py b/recoverpoint/recoverpoint.py new file mode 100755 index 00000000..8e3486bb --- /dev/null +++ b/recoverpoint/recoverpoint.py @@ -0,0 +1,142 @@ +#!/usr/bin/python +# Name: recoverpoint.py +# Desc: Ganglia Python module for gathering EMC recoverpoint statistics via SSH +# Author: Evan Fraser (evan.fraser@trademe.co.nz) +# Date: 01/08/2012 + + +import yaml +import warnings +import pprint +import time +import re + +with warnings.catch_warnings(): + warnings.simplefilter("ignore") + import paramiko + +descriptors = list() +NIMETRICS = { + 'time' : 0, + 'data' : {} +} +#This is the minimum interval between querying the RPA for metrics. +#Each ssh query takes 1.6s so we limit the interval between getting metrics to this interval. 
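+#NIMETRICS_CACHE_MAX is in seconds; get_metrics() answers from the cached
+#NIMETRICS['data'] until that long has passed since the last SSH poll.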
+NIMETRICS_CACHE_MAX = 5 + +ipaddr = '' + +#Example of data structure: +#{'RPA statistics': {'Site 1 RPA 1': {'Compression CPU usage': '0.00%', +# 'Latency (ms)': 12, +# 'Packet loss': '0.00%', +# 'Traffic': {'Application': {'SAN': '0 bps', +# 'WAN': '432 bps'}, +# 'Application (writes)': 0, +# 'Compression': 0}}, + +def define_metrics(Desc_Skel, statsDict): + for rpa in statsDict['RPA statistics']: + #pprint.pprint(statsDict['RPA statistics'][rpa]) + for metric in statsDict['RPA statistics'][rpa].keys(): + if "Latency (ms)" in metric: + descriptors.append(create_desc(Desc_Skel, { + "name" : (rpa.lower()).replace(' ','_') + '_latency', + "units" : "ms", + "description" : "latency in ms", + "groups" : "Latency" + })) + if "Traffic" in metric: + #define the Application/[SAN|WAN] metrics + for net in statsDict['RPA statistics'][rpa]['Traffic']['Application'].keys(): + #print net + descriptors.append(create_desc(Desc_Skel, { + "name" : (rpa.lower()).replace(' ','_') + '_' + net.lower(), + "units" : "bits/sec", + "description" : net + ' traffic', + "groups" : net + " Traffic", + })) + + return descriptors + +def create_desc(skel, prop): + d = skel.copy() + for k,v in prop.iteritems(): + d[k] = v + return d + +def get_metrics(name): + global NIMETRICS,ipaddr + # if interval since last check > NIMETRICS_CACHE_MAX get metrics again + metrics = {} + if (time.time() - NIMETRICS['time']) > NIMETRICS_CACHE_MAX: + + sshcon = paramiko.SSHClient() + sshcon.set_missing_host_key_policy(paramiko.AutoAddPolicy()) + sshcon.connect(ipaddr, username='monitor',password='monitor',look_for_keys='False') + stdin, stdout, sterr = sshcon.exec_command("get_system_statistics") + rawmetrics = yaml.load(stdout) + for rpa in rawmetrics['RPA statistics']: + for metric in rawmetrics['RPA statistics'][rpa]: + if "Latency (ms)" in metric: + metrics[(rpa.lower()).replace(' ','_') + '_latency'] = rawmetrics['RPA statistics'][rpa]['Latency (ms)'] + if "Traffic" in metric: + #store the Application/[SAN|WAN] metrics + for net in rawmetrics['RPA statistics'][rpa]['Traffic']['Application'].keys(): + traffic,junk = rawmetrics['RPA statistics'][rpa]['Traffic']['Application'][net].split() + metrics[(rpa.lower()).replace(' ','_') + '_' + net.lower()] = int(traffic) + + + NIMETRICS = { + 'time': time.time(), + 'data': metrics + } + else: + metrics = NIMETRICS['data'] + return metrics[name] + + + +def metric_init(params): + global descriptors, Desc_Skel, ipaddr + print '[recoverpoint] Recieved the following parameters' + print params + ipaddr = params['mgmtip'] + print ipaddr + spoof_string = ipaddr + ':recoverpoint' + Desc_Skel = { + 'name' : 'XXX', + 'call_back' : get_metrics, + 'time_max' : 60, + 'value_type' : 'double', + 'format' : '%0f', + 'units' : 'XXX', + 'slope' : 'both', + 'description' : 'XXX', + 'groups' : 'netiron', + 'spoof_host' : spoof_string + } + + sshcon = paramiko.SSHClient() + sshcon.set_missing_host_key_policy(paramiko.AutoAddPolicy()) + sshcon.connect(ipaddr, username='monitor',password='monitor',look_for_keys='False') + stdin, stdout, sterr = sshcon.exec_command("get_system_statistics") + statsDict = yaml.load(stdout) + sshcon.close() + descriptors = define_metrics(Desc_Skel, statsDict) + + return descriptors + +# For CLI Debuging: +if __name__ == '__main__': + params = { + 'mgmtip' : '192.168.1.100', + } + descriptors = metric_init(params) + while True: + for d in descriptors: + v = d['call_back'](d['name']) + print 'value for %s is %u' % (d['name'], v) + print 'Sleeping 5 seconds' + time.sleep(5) +#exit(0) diff 
--git a/recoverpoint/recoverpoint.pyconf b/recoverpoint/recoverpoint.pyconf
new file mode 100644
index 00000000..448b2a62
--- /dev/null
+++ b/recoverpoint/recoverpoint.pyconf
@@ -0,0 +1,28 @@
+# Name: recoverpoint.pyconf
+# Author: Evan Fraser (evan.fraser@trademe.co.nz)
+# Desc: Config file for the ganglia gmond recoverpoint module.
+# Date: 03/08/2012
+# To use: Save this file in /etc/ganglia/conf.d/, update the mgmtip value to the IP address of one of your RecoverPoint management IPs, and change the name_match lines below to match your site names.
+
+modules {
+  module {
+    name = "recoverpoint"
+    language = "python"
+    param mgmtip {
+      value = '192.168.1.100'
+    }
+  }
+}
+#/* Collection groups for the
+#   example python module */
+collection_group {
+  collect_every = 20
+  time_threshold = 50
+  metric {
+    name_match = "site1(.+)"
+  }
+  metric {
+    name_match = "site2(.+)"
+  }
+}
+

From 4ffa92ab172d8f71b9ee5ded43a748a7e6e21907 Mon Sep 17 00:00:00 2001
From: Evan Fraser
Date: Fri, 3 Aug 2012 15:23:54 +1200
Subject: [PATCH 15/39] Adding Fibrechannel switch module

---
 fibrechannel/README.mkdn         |  28 ++++
 fibrechannel/fibrechannel.py     | 271 +++++++++++++++++++++++++++++++
 fibrechannel/fibrechannel.pyconf |  25 +++
 3 files changed, 324 insertions(+)
 create mode 100644 fibrechannel/README.mkdn
 create mode 100755 fibrechannel/fibrechannel.py
 create mode 100644 fibrechannel/fibrechannel.pyconf

diff --git a/fibrechannel/README.mkdn b/fibrechannel/README.mkdn
new file mode 100644
index 00000000..15df7ceb
--- /dev/null
+++ b/fibrechannel/README.mkdn
@@ -0,0 +1,28 @@
+Brocade FibreChannel
+This is a gmond python module that allows SNMP polling of FibreChannel switches to get interface packet and throughput metrics.
+
+ * It works for Brocade FC switches, and probably for any other SNMP-enabled switch.
+ * It requires pysnmp (available in the debian repositories)
+ * Handles polling multiple switches from a single gmond.
+ * Spoofs the switch hostname, so each switch shows up separately in ganglia + +## DEPENDS + * python pysnmp + +## USAGE + * Save the fibrechannel.pyconf into directory and update the switch(s) name & IP's + * Save the fibrechannel.py into your ganglia python module dir eg: /usr/lib/ganglia/python_modules + * Update SNMP community / ports if necessary + +If you're handling a large number of metrics, you may wish to set your sysctl settings as below: + +net.core.rmem_max=104857600 +net.core.rmem_default=104857600 +vm.dirty_ratio=100 +vm.dirty_background_ratio=100 +vm.dirty_expire_centisecs=720000 + +## AUTHOR + +Author: Evan Fraser <evan.fraser@trademe.co.nz> + diff --git a/fibrechannel/fibrechannel.py b/fibrechannel/fibrechannel.py new file mode 100755 index 00000000..3e693f06 --- /dev/null +++ b/fibrechannel/fibrechannel.py @@ -0,0 +1,271 @@ +#!/usr/bin/python +# Name: fibrechannel.py +# Desc: Ganglia module for polling Brocade Fibrechannel switches via snmnp (probably work with any snmp capable device) +# Author: Evan Fraser evan.fraser@trademe.co.nz +# Date: August 2012 +# Copyright: GPL + +import sys +import os +import re +import time +import pprint +from pysnmp.entity.rfc3413.oneliner import cmdgen +NIPARAMS = {} + +NIMETRICS = { + 'time' : 0, + 'data' : {} +} +LAST_NIMETRICS = dict(NIMETRICS) +NIMETRICS_CACHE_MAX = 5 + +descriptors = list() + +oidDict = { + 'ifIndex' : (1,3,6,1,2,1,2,2,1,1), + 'ifDescr' : (1,3,6,1,2,1,2,2,1,2), + 'ifInOctets' : (1,3,6,1,2,1,2,2,1,10), + 'ifInUcastPkts' : (1,3,6,1,2,1,2,2,1,11), + 'ifInErrors' : (1,3,6,1,2,1,2,2,1,14), + 'ifOutOctets' : (1,3,6,1,2,1,2,2,1,16), + 'ifOutUcastPkts' : (1,3,6,1,2,1,2,2,1,17), + 'ifOutErrors' : (1,3,6,1,2,1,2,2,1,20), + } +#oidDict = { +# 'ifIndex' : (1,3,6,1,2,1,2,2,1,1), +# 'ifName' : (1,3,6,1,2,1,31,1,1,1,1), +# 'ifAlias' : (1,3,6,1,2,1,31,1,1,1,18), +# 'ifHCInOctets' : (1,3,6,1,2,1,31,1,1,1,6), +# 'ifHCOutOctets' : (1,3,6,1,2,1,31,1,1,1,10), +# 'ifInUcastPkts' : (1,3,6,1,2,1,2,2,1,11), +# 'ifOutUcastPkts' : (1,3,6,1,2,1,2,2,1,17), +# } + +def get_metrics(): + """Return all metrics""" + + global NIMETRICS, LAST_NIMETRICS + + # if interval since last check > NIMETRICS_CACHE_MAX get metrics again + if (time.time() - NIMETRICS['time']) > NIMETRICS_CACHE_MAX: + metrics = {} + for para in NIPARAMS.keys(): + if para.startswith('switch_'): + ipaddr,name = NIPARAMS[para].split(':') + snmpTable = runSnmp(oidDict,ipaddr) + newmetrics = buildDict(oidDict,snmpTable,name) + metrics = dict(newmetrics, **metrics) + + # update cache + LAST_NIMETRICS = dict(NIMETRICS) + NIMETRICS = { + 'time': time.time(), + 'data': metrics + } + + return [NIMETRICS, LAST_NIMETRICS] + +def get_delta(name): + """Return change over time for the requested metric""" + + # get metrics + [curr_metrics, last_metrics] = get_metrics() + try: + delta = float(curr_metrics['data'][name] - last_metrics['data'][name])/(curr_metrics['time'] - last_metrics['time']) + #print delta + if delta < 0: + print "Less than 0" + delta = 0 + except StandardError: + delta = 0 + + return delta + +# Separate routine to perform SNMP queries and returns table (dict) +def runSnmp(oidDict,ip): + + # cmdgen only takes tuples, oid strings don't work +## ifIndex = (1,3,6,1,2,1,2,2,1,1) +## ifName = (1,3,6,1,2,1,31,1,1,1,1) +## ifAlias = (1,3,6,1,2,1,31,1,1,1,18) +## ifHCInOctets = (1,3,6,1,2,1,31,1,1,1,6) +## ifHCOutOctets = (1,3,6,1,2,1,31,1,1,1,10) + +# 'ifIndex' : (1,3,6,1,2,1,2,2,1,1), +# 'ifDescr' : (1,3,6,1,2,1,2,2,1,2), +# 'ifInOctets' : (1,3,6,1,2,1,2,2,1,10), +# 'ifInUcastPkts' : 
(1,3,6,1,2,1,2,2,1,11), +# 'ifInErrors' : (1,3,6,1,2,1,2,2,1,14), +# 'ifOutOctets' : (1,3,6,1,2,1,2,2,1,16), +# 'ifOutUcastPkts' : (1,3,6,1,2,1,2,2,1,17), +# 'ifOutErrors' : (1,3,6,1,2,1,2,2,1,20), + + #Runs the SNMP query, The order that oid's are passed determines the order in the results + errorIndication, errorStatus, errorIndex, varBindTable = cmdgen.CommandGenerator().nextCmd( + # SNMP v2 + cmdgen.CommunityData('test-agent', 'public'), + cmdgen.UdpTransportTarget((ip, 161)), + oidDict['ifIndex'], + oidDict['ifDescr'], + oidDict['ifInOctets'], + oidDict['ifInErrors'], + oidDict['ifInUcastPkts'], + oidDict['ifOutOctets'], + oidDict['ifOutErrors'], + oidDict['ifOutUcastPkts'], + ) + #pprint.pprint(varBindTable) + # Check for SNMP errors + if errorIndication: + print errorIndication + else: + if errorStatus: + print '%s at %s\n' % ( + errorStatus.prettyPrint(), errorIndex and varBindTable[-1][int(errorIndex)-1] or '?' + ) + else: + return(varBindTable) + +def buildDict(oidDict,t,switch): # passed a list of tuples, build's a dict based on the alias name + builtdict = {} + + for line in t: + # if t[t.index(line)][2][1] != '': + string = str(t[t.index(line)][1][1]) # this is the ifDescr + #print string + match = re.search(r'FC port', string) + if match and t[t.index(line)][0][1] != '': + #alias = str(t[t.index(line)][0][1]) + index = str(t[t.index(line)][0][1]) + temp = str(t[t.index(line)][1][1]) #(use ifDescr) + #lowercase the name, change spaces + '/' to '_' + name = ((temp.lower()).replace(' ','_')).replace('/','_') + #print name + inoct = str(t[t.index(line)][2][1]) + builtdict[switch+'_'+name+'_bitsin'] = int(inoct) * 8 + outoct = str(t[t.index(line)][5][1]) + builtdict[switch+'_'+name+'_bitsout'] = int(outoct) * 8 + inpkt = str(t[t.index(line)][4][1]) + builtdict[switch+'_'+name+'_pktsin'] = int(inpkt) + outpkt = str(t[t.index(line)][7][1]) + builtdict[switch+'_'+name+'_pktsout'] = int(outpkt) + #if match and t[t.index(line)][0][1] != '': + # alias = str(t[t.index(line)][0][1]) + # index = str(t[t.index(line)][1][1]) + # name = str(t[t.index(line)][2][1]) + # hcinoct = str(t[t.index(line)][3][1]) + # builtdict[switch+'_'+alias+'_bitsin'] = int(hcinoct) * 8 + # hcoutoct = str(t[t.index(line)][4][1]) + # builtdict[switch+'_'+alias+'_bitsout'] = int(hcoutoct) * 8 + # hcinpkt = str(t[t.index(line)][5][1]) + # builtdict[switch+'_'+alias+'_pktsin'] = int(hcinpkt) + # hcoutpkt = str(t[t.index(line)][6][1]) + # builtdict[switch+'_'+alias+'_pktsout'] = int(hcoutpkt) + + #pprint.pprint(builtdict) + return builtdict + +# define_metrics will run an snmp query on an ipaddr, find interfaces, build descriptors and set spoof_host +# define_metrics is called from metric_init +def define_metrics(Desc_Skel, ipaddr, switch): + snmpTable = runSnmp(oidDict,ipaddr) + aliasdict = buildDict(oidDict,snmpTable,switch) + spoof_string = ipaddr + ':' + switch + #print newdict + + for key in aliasdict.keys(): + if "bitsin" in key: + descriptors.append(create_desc(Desc_Skel, { + "name" : key, + "units" : "bits/sec", + "description" : "received bits per sec", + "groups" : "Throughput", + "spoof_host" : spoof_string, + })) + elif "bitsout" in key: + descriptors.append(create_desc(Desc_Skel, { + "name" : key, + "units" : "bits/sec", + "description" : "transmitted bits per sec", + "groups" : "Throughput", + "spoof_host" : spoof_string, + })) + elif "pktsin" in key: + descriptors.append(create_desc(Desc_Skel, { + "name" : key, + "units" : "pkts/sec", + "description" : "received packets per sec", + "groups" : "Packets", 
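+                # spoof_host makes the metrics report under the switch's own
+                # hostname, so each switch appears as a separate host in ganglia.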
+ "spoof_host" : spoof_string, + })) + elif "pktsout" in key: + descriptors.append(create_desc(Desc_Skel, { + "name" : key, + "units" : "pkts/sec", + "description" : "transmitted packets per sec", + "groups" : "Packets", + "spoof_host" : spoof_string, + })) + + + return descriptors + +def metric_init(params): + global descriptors, Desc_Skel, _Worker_Thread, Debug, newdict + + print '[switch] Received the following parameters' + print params + + #Import the params into the global NIPARAMS + for key in params: + NIPARAMS[key] = params[key] + + Desc_Skel = { + 'name' : 'XXX', + 'call_back' : get_delta, + 'time_max' : 60, + 'value_type' : 'double', + 'format' : '%0f', + 'units' : 'XXX', + 'slope' : 'both', + 'description' : 'XXX', + 'groups' : 'switch', + } + + # Find all the switch's passed in params + for para in params.keys(): + if para.startswith('switch_'): + #Get ipaddr + name of switchs from params + ipaddr,name = params[para].split(':') + # pass skel, ip and name to define_metrics to create descriptors + descriptors = define_metrics(Desc_Skel, ipaddr, name) + #Return the descriptors back to gmond + return descriptors + +def create_desc(skel, prop): + d = skel.copy() + for k,v in prop.iteritems(): + d[k] = v + return d + + +def metric_cleanup(): + '''Clean up the metric module.''' + pass + +# For CLI Debuging: +if __name__ == '__main__': + params = { + 'switch_1' : '192.168.1.1:switch1', + #'switch_2' : '192.168.1.2:switch2', + } + descriptors = metric_init(params) + print len(descriptors) + while True: + for d in descriptors: + v = d['call_back'](d['name']) + print 'value for %s is %u' % (d['name'], v) + print 'Sleeping 5 seconds' + time.sleep(5) +#exit(0) diff --git a/fibrechannel/fibrechannel.pyconf b/fibrechannel/fibrechannel.pyconf new file mode 100644 index 00000000..9a0990a4 --- /dev/null +++ b/fibrechannel/fibrechannel.pyconf @@ -0,0 +1,25 @@ +modules { + module { + name = "fibrechannel" + language = "python" + param switch_1 { + # ip:hostname + value = '192.168.1.1:switch1' + } + #param switch_2 { + # value = '192.168.1.2:switch2' + #} + } +} +#/* Collection groups for the +# example python module */ +collection_group { + collect_every = 20 + time_threshold = 50 + metric { + name_match = "(.+)in" + } + metric { + name_match = "(.+)out" + } + } From f6211b7b0f3703e5779f04ca5b15352b70200415 Mon Sep 17 00:00:00 2001 From: Evan Fraser Date: Mon, 6 Aug 2012 12:59:30 +1200 Subject: [PATCH 16/39] Added FC error metrics --- fibrechannel/fibrechannel.py | 50 ++++++++++++++++-------------------- 1 file changed, 22 insertions(+), 28 deletions(-) diff --git a/fibrechannel/fibrechannel.py b/fibrechannel/fibrechannel.py index 3e693f06..ce192292 100755 --- a/fibrechannel/fibrechannel.py +++ b/fibrechannel/fibrechannel.py @@ -32,15 +32,6 @@ 'ifOutUcastPkts' : (1,3,6,1,2,1,2,2,1,17), 'ifOutErrors' : (1,3,6,1,2,1,2,2,1,20), } -#oidDict = { -# 'ifIndex' : (1,3,6,1,2,1,2,2,1,1), -# 'ifName' : (1,3,6,1,2,1,31,1,1,1,1), -# 'ifAlias' : (1,3,6,1,2,1,31,1,1,1,18), -# 'ifHCInOctets' : (1,3,6,1,2,1,31,1,1,1,6), -# 'ifHCOutOctets' : (1,3,6,1,2,1,31,1,1,1,10), -# 'ifInUcastPkts' : (1,3,6,1,2,1,2,2,1,11), -# 'ifOutUcastPkts' : (1,3,6,1,2,1,2,2,1,17), -# } def get_metrics(): """Return all metrics""" @@ -86,11 +77,6 @@ def get_delta(name): def runSnmp(oidDict,ip): # cmdgen only takes tuples, oid strings don't work -## ifIndex = (1,3,6,1,2,1,2,2,1,1) -## ifName = (1,3,6,1,2,1,31,1,1,1,1) -## ifAlias = (1,3,6,1,2,1,31,1,1,1,18) -## ifHCInOctets = (1,3,6,1,2,1,31,1,1,1,6) -## ifHCOutOctets = 
(1,3,6,1,2,1,31,1,1,1,10) # 'ifIndex' : (1,3,6,1,2,1,2,2,1,1), # 'ifDescr' : (1,3,6,1,2,1,2,2,1,2), @@ -141,7 +127,6 @@ def buildDict(oidDict,t,switch): # passed a list of tuples, build's a dict based temp = str(t[t.index(line)][1][1]) #(use ifDescr) #lowercase the name, change spaces + '/' to '_' name = ((temp.lower()).replace(' ','_')).replace('/','_') - #print name inoct = str(t[t.index(line)][2][1]) builtdict[switch+'_'+name+'_bitsin'] = int(inoct) * 8 outoct = str(t[t.index(line)][5][1]) @@ -150,19 +135,11 @@ def buildDict(oidDict,t,switch): # passed a list of tuples, build's a dict based builtdict[switch+'_'+name+'_pktsin'] = int(inpkt) outpkt = str(t[t.index(line)][7][1]) builtdict[switch+'_'+name+'_pktsout'] = int(outpkt) - #if match and t[t.index(line)][0][1] != '': - # alias = str(t[t.index(line)][0][1]) - # index = str(t[t.index(line)][1][1]) - # name = str(t[t.index(line)][2][1]) - # hcinoct = str(t[t.index(line)][3][1]) - # builtdict[switch+'_'+alias+'_bitsin'] = int(hcinoct) * 8 - # hcoutoct = str(t[t.index(line)][4][1]) - # builtdict[switch+'_'+alias+'_bitsout'] = int(hcoutoct) * 8 - # hcinpkt = str(t[t.index(line)][5][1]) - # builtdict[switch+'_'+alias+'_pktsin'] = int(hcinpkt) - # hcoutpkt = str(t[t.index(line)][6][1]) - # builtdict[switch+'_'+alias+'_pktsout'] = int(hcoutpkt) - + inerrors = str(t[t.index(line)][3][1]) + builtdict[switch+'_'+name+'_inerrors'] = int(inerrors) + outerrors = str(t[t.index(line)][6][1]) + builtdict[switch+'_'+name+'_outerrors'] = int(outerrors) + #pprint.pprint(builtdict) return builtdict @@ -173,6 +150,7 @@ def define_metrics(Desc_Skel, ipaddr, switch): aliasdict = buildDict(oidDict,snmpTable,switch) spoof_string = ipaddr + ':' + switch #print newdict + #pprint.pprint(aliasdict.keys()) for key in aliasdict.keys(): if "bitsin" in key: @@ -207,6 +185,22 @@ def define_metrics(Desc_Skel, ipaddr, switch): "groups" : "Packets", "spoof_host" : spoof_string, })) + elif "inerrors" in key: + descriptors.append(create_desc(Desc_Skel, { + "name" : key, + "units" : "errors", + "description" : "inbound packet errors", + "groups" : "Packets", + "spoof_host" : spoof_string, + })) + elif "outerrors" in key: + descriptors.append(create_desc(Desc_Skel, { + "name" : key, + "units" : "errors", + "description" : "outbound packet errors", + "groups" : "Packets", + "spoof_host" : spoof_string, + })) return descriptors From 81ad2efec3957245fc34d0ad4c73b629c443b502 Mon Sep 17 00:00:00 2001 From: Ori Livneh Date: Tue, 7 Aug 2012 09:54:25 -0700 Subject: [PATCH 17/39] Add additional module for memcached Although there is already one memcached Python module, mine approaches things someone differently, and adds aggregated stats about the max age of items in slabs -- metrics that are useful to us at Wikimedia and hopefully will be elsewhere, too. 
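
Roughly, the aggregation works like the following sketch (the module's own
median() also handles even-length series):

    ages = [30, 120, 600]                     # oldest item age per slab, seconds
    age_min = min(ages)                       # 30
    age_max = max(ages)                       # 600
    age_mean = sum(ages) / float(len(ages))   # 250.0
    age_median = sorted(ages)[len(ages) // 2] # 120, the odd-length case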
--- memcached_maxage/README.md | 21 ++ memcached_maxage/conf.d/memcached.pyconf | 133 ++++++++ memcached_maxage/python_modules/every.py | 70 +++++ memcached_maxage/python_modules/memcached.py | 137 +++++++++ .../python_modules/memcached_metrics.py | 284 ++++++++++++++++++ 5 files changed, 645 insertions(+) create mode 100644 memcached_maxage/README.md create mode 100644 memcached_maxage/conf.d/memcached.pyconf create mode 100644 memcached_maxage/python_modules/every.py create mode 100644 memcached_maxage/python_modules/memcached.py create mode 100644 memcached_maxage/python_modules/memcached_metrics.py diff --git a/memcached_maxage/README.md b/memcached_maxage/README.md new file mode 100644 index 00000000..7ba8fcef --- /dev/null +++ b/memcached_maxage/README.md @@ -0,0 +1,21 @@ +python-memcached-gmond +====================== + + +This is a Python Gmond module for Memcached, compatible with both Python 2 and +3. In addition to the usual datapoints provided by "stats", this module +aggregates max age metrics from "stats items". All metrics are available in a +"memcached" collection group. + +If you've installed ganglia at the standard locations, you should be able to +install this module by copying `memcached.pyconf` to `/etc/ganglia/conf.d` and +`memcached.py`, `memcached_metrics.py`, and 'every.py' to +`/usr/lib/ganglia/python_modules`. The memcached server's host and port can be +specified in the configuration in memcached.pyconf. + +For more information, see the section [Gmond Python metric modules][1] in the +Ganglia documentation. + +Author: Ori Livneh + + [1]: http://sourceforge.net/apps/trac/ganglia/wiki/ganglia_gmond_python_modules diff --git a/memcached_maxage/conf.d/memcached.pyconf b/memcached_maxage/conf.d/memcached.pyconf new file mode 100644 index 00000000..1552487a --- /dev/null +++ b/memcached_maxage/conf.d/memcached.pyconf @@ -0,0 +1,133 @@ +# Gmond configuration for memcached metric module +# Install to /etc/ganglia/conf.d + +modules { + module { + name = "memcached" + language = "python" + param host { + value = "127.0.0.1" + } + param port { + value = "11211" + } + } +} + +collection_group { + collect_every = 10 + time_threshold = 60 + + metric { + name = "curr_items" + title = "curr_items" + } + metric { + name = "total_items" + title = "total_items" + } + metric { + name = "bytes" + title = "bytes" + } + metric { + name = "curr_connections" + title = "curr_connections" + } + metric { + name = "total_connections" + title = "total_connections" + } + metric { + name = "connection_structures" + title = "connection_structures" + } + metric { + name = "cmd_get" + title = "cmd_get" + } + metric { + name = "cmd_set" + title = "cmd_set" + } + metric { + name = "get_hits" + title = "get_hits" + } + metric { + name = "get_misses" + title = "get_misses" + } + metric { + name = "delete_hits" + title = "delete_hits" + } + metric { + name = "delete_misses" + title = "delete_misses" + } + metric { + name = "incr_hits" + title = "incr_hits" + } + metric { + name = "incr_misses" + title = "incr_misses" + } + metric { + name = "decr_hits" + title = "decr_hits" + } + metric { + name = "decr_misses" + title = "decr_misses" + } + metric { + name = "cas_hits" + title = "cas_hits" + } + metric { + name = "cas_misses" + title = "cas_misses" + } + metric { + name = "evictions" + title = "evictions" + } + metric { + name = "bytes_read" + title = "bytes_read" + } + metric { + name = "bytes_written" + title = "bytes_written" + } + metric { + name = "limit_maxbytes" + title = "limit_maxbytes" + } + 
metric { + name = "threads" + title = "threads" + } + metric { + name = "conn_yields" + title = "conn_yields" + } + metric { + name = "age_mean" + title = "age_mean" + } + metric { + name = "age_median" + title = "age_median" + } + metric { + name = "age_min" + title = "age_min" + } + metric { + name = "age_max" + title = "age_max" + } +} diff --git a/memcached_maxage/python_modules/every.py b/memcached_maxage/python_modules/every.py new file mode 100644 index 00000000..117bf6ba --- /dev/null +++ b/memcached_maxage/python_modules/every.py @@ -0,0 +1,70 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +""" + Every + + Python decorator; decorated function is called on a set interval. + + :author: Ori Livneh + :copyright: (c) 2012 Wikimedia Foundation + :license: GPL, version 2 or later +""" +from __future__ import division +from datetime import timedelta +import signal +import sys +import threading + + +# pylint: disable=C0111, W0212, W0613, W0621 + + +__all__ = ('every', ) + + +def total_seconds(delta): + """ + Get total seconds of timedelta object. Equivalent to + timedelta.total_seconds(), which was introduced in Python 2.7. + """ + us = (delta.microseconds + (delta.seconds + delta.days * 24 * 3600) * 10**6) + return us / 1000000.0 + + +def handle_sigint(signal, frame): + """ + Attempt to kill all child threads and exit. Installing this as a sigint + handler allows the program to run indefinitely if unmolested, but still + terminate gracefully on Ctrl-C. + """ + for thread in threading.enumerate(): + if thread.isAlive(): + thread._Thread__stop() + sys.exit(0) + + +def every(*args, **kwargs): + """ + Decorator; calls decorated function on a set interval. Arguments to every() + are passed on to the constructor of datetime.timedelta(), which accepts the + following arguments: days, seconds, microseconds, milliseconds, minutes, + hours, weeks. This decorator is intended for functions with side effects; + the return value is discarded. + """ + interval = total_seconds(timedelta(*args, **kwargs)) + def decorator(func): + def poll(): + func() + threading.Timer(interval, poll).start() + poll() + return func + return decorator + + +def join(): + """Pause until sigint""" + signal.signal(signal.SIGINT, handle_sigint) + signal.pause() + + +every.join = join diff --git a/memcached_maxage/python_modules/memcached.py b/memcached_maxage/python_modules/memcached.py new file mode 100644 index 00000000..a4db94de --- /dev/null +++ b/memcached_maxage/python_modules/memcached.py @@ -0,0 +1,137 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +""" + Python Gmond Module for Memcached + + This module declares a "memcached" collection group. For more information, + including installation instructions, see: + + http://sourceforge.net/apps/trac/ganglia/wiki/ganglia_gmond_python_modules + + When invoked as a standalone script, this module will attempt to use the + default configuration to query memcached every 10 seconds and print out the + results. + + Based on a suggestion from Domas Mitzuas, this module also reports the min, + max, median and mean of the 'age' metric across slabs, as reported by the + "stats items" memcached command. 
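+
+    For instance, the helper aggregations defined below behave like this:
+
+        >>> median([4, 1, 3])
+        3
+        >>> mean([1, 2, 3, 6])
+        3.0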
+ + :copyright: (c) 2012 Wikimedia Foundation + :author: Ori Livneh + :license: GPL, v2 or later +""" +from __future__ import division, print_function + +from threading import Timer + +import logging +import os +import pprint +import sys +import telnetlib + +logging.basicConfig(level=logging.DEBUG) + +# Hack: load a file from the current module's directory, because gmond doesn't +# know how to work with Python packages. (To be fair, neither does Python.) +sys.path.insert(0, os.path.dirname(__file__)) +from memcached_metrics import descriptors +from every import every +sys.path.pop(0) + + +# Default configuration +config = { + 'host' : '127.0.0.1', + 'port' : 11211, +} + +stats = {} +client = telnetlib.Telnet() + + +def median(values): + """Calculate median of series""" + values = sorted(values) + length = len(values) + mid = length // 2 + if (length % 2): + return values[mid] + else: + return (values[mid - 1] + values[mid]) / 2 + + +def mean(values): + """Calculate mean (average) of series""" + return sum(values) / len(values) + + +def cast(value): + """Cast value to float or int, if possible""" + try: + return float(value) if '.' in value else int(value) + except ValueError: + return value + + +def query(command): + """Send `command` to memcached and stream response""" + client.write(command.encode('ascii') + b'\n') + while True: + line = client.read_until(b'\r\n').decode('ascii').strip() + if not line or line == 'END': + break + (_, metric, value) = line.split(None, 2) + yield metric, cast(value) + + +@every(seconds=10) +def update_stats(): + """Refresh stats by polling memcached server""" + try: + client.open(**config) + stats.update(query('stats')) + ages = [v for k, v in query('stats items') if k.endswith('age')] + if not ages: + return {'age_min': 0, 'age_max': 0, 'age_mean': 0, 'age_median': 0} + stats.update({ + 'age_min' : min(ages), + 'age_max' : max(ages), + 'age_mean' : mean(ages), + 'age_median' : median(ages) + }) + finally: + client.close() + logging.info("Updated stats: %s", pprint.pformat(stats, indent=4)) + + +# +# Gmond Interface +# + +def metric_handler(name): + """Get the value for a particular metric; part of Gmond interface""" + return stats[name] + + +def metric_init(params): + """Initialize; part of Gmond interface""" + print('[memcached] memcached stats') + config.update(params) + for metric in descriptors: + metric['call_back'] = metric_handler + return descriptors + + +def metric_cleanup(): + """Teardown; part of Gmond interface""" + client.close() + + +if __name__ == '__main__': + # When invoked as standalone script, run a self-test by querying each + # metric descriptor and printing it out. 
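+    # update_stats() keeps polling on its @every(seconds=10) timer, and
+    # every.join() blocks on signal.pause() until interrupted with Ctrl-C.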
+ for metric in metric_init({}): + value = metric['call_back'](metric['name']) + print(( "%s => " + metric['format'] ) % ( metric['name'], value )) + every.join() diff --git a/memcached_maxage/python_modules/memcached_metrics.py b/memcached_maxage/python_modules/memcached_metrics.py new file mode 100644 index 00000000..9677a94d --- /dev/null +++ b/memcached_maxage/python_modules/memcached_metrics.py @@ -0,0 +1,284 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- + +descriptors = [ { + "slope": "both", + "time_max": 60, + "description": "Current number of items stored by this instance", + "format": "%d", + "value_type": "uint", + "groups": "memcached", + "units": "items", + "name": "curr_items" + }, + { + "slope": "positive", + "time_max": 60, + "description": "Total number of items stored during the life of this instance", + "format": "%d", + "value_type": "uint", + "groups": "memcached", + "units": "items", + "name": "total_items" + }, + { + "slope": "both", + "time_max": 60, + "description": "Current number of bytes used by this server to store items", + "format": "%d", + "value_type": "uint", + "groups": "memcached", + "units": "items", + "name": "bytes" + }, + { + "slope": "both", + "time_max": 60, + "description": "Current number of open connections", + "format": "%d", + "value_type": "uint", + "groups": "memcached", + "units": "items", + "name": "curr_connections" + }, + { + "slope": "positive", + "time_max": 60, + "description": "Total number of connections opened since the server started running", + "format": "%d", + "value_type": "uint", + "groups": "memcached", + "units": "items", + "name": "total_connections" + }, + { + "slope": "both", + "time_max": 60, + "description": "Number of connection structures allocated by the server", + "format": "%d", + "value_type": "uint", + "groups": "memcached", + "units": "items", + "name": "connection_structures" + }, + { + "slope": "positive", + "time_max": 60, + "description": "Total number of retrieval requests (get operations)", + "format": "%d", + "value_type": "uint", + "groups": "memcached", + "units": "items", + "name": "cmd_get" + }, + { + "slope": "positive", + "time_max": 60, + "description": "Total number of storage requests (set operations)", + "format": "%d", + "value_type": "uint", + "groups": "memcached", + "units": "items", + "name": "cmd_set" + }, + { + "slope": "positive", + "time_max": 60, + "description": "Number of keys that have been requested and found present", + "format": "%d", + "value_type": "uint", + "groups": "memcached", + "units": "items", + "name": "get_hits" + }, + { + "slope": "positive", + "time_max": 60, + "description": "Number of items that have been requested and not found", + "format": "%d", + "value_type": "uint", + "groups": "memcached", + "units": "items", + "name": "get_misses" + }, + { + "slope": "positive", + "time_max": 60, + "description": "Number of keys that have been deleted and found present", + "format": "%d", + "value_type": "uint", + "groups": "memcached", + "units": "items", + "name": "delete_hits" + }, + { + "slope": "positive", + "time_max": 60, + "description": "Number of items that have been delete and not found", + "format": "%d", + "value_type": "uint", + "groups": "memcached", + "units": "items", + "name": "delete_misses" + }, + { + "slope": "positive", + "time_max": 60, + "description": "Number of keys that have been incremented and found present", + "format": "%d", + "value_type": "uint", + "groups": "memcached", + "units": "items", + "name": "incr_hits" + }, + { + "slope": 
"positive", + "time_max": 60, + "description": "Number of items that have been incremented and not found", + "format": "%d", + "value_type": "uint", + "groups": "memcached", + "units": "items", + "name": "incr_misses" + }, + { + "slope": "positive", + "time_max": 60, + "description": "Number of keys that have been decremented and found present", + "format": "%d", + "value_type": "uint", + "groups": "memcached", + "units": "items", + "name": "decr_hits" + }, + { + "slope": "positive", + "time_max": 60, + "description": "Number of items that have been decremented and not found", + "format": "%d", + "value_type": "uint", + "groups": "memcached", + "units": "items", + "name": "decr_misses" + }, + { + "slope": "positive", + "time_max": 60, + "description": "Number of keys that have been compared and swapped and found present", + "format": "%d", + "value_type": "uint", + "groups": "memcached", + "units": "items", + "name": "cas_hits" + }, + { + "slope": "positive", + "time_max": 60, + "description": "Number of items that have been compared and swapped and not found", + "format": "%d", + "value_type": "uint", + "groups": "memcached", + "units": "items", + "name": "cas_misses" + }, + { + "slope": "positive", + "time_max": 60, + "description": "Number of valid items removed from cache to free memory for new items", + "format": "%d", + "value_type": "uint", + "groups": "memcached", + "units": "items", + "name": "evictions" + }, + { + "slope": "positive", + "time_max": 60, + "description": "Total number of bytes read by this server from network", + "format": "%d", + "value_type": "uint", + "groups": "memcached", + "units": "items", + "name": "bytes_read" + }, + { + "slope": "positive", + "time_max": 60, + "description": "Total number of bytes sent by this server to network", + "format": "%d", + "value_type": "uint", + "groups": "memcached", + "units": "items", + "name": "bytes_written" + }, + { + "slope": "zero", + "time_max": 60, + "description": "Number of bytes this server is permitted to use for storage", + "format": "%d", + "value_type": "uint", + "groups": "memcached", + "units": "items", + "name": "limit_maxbytes" + }, + { + "slope": "positive", + "time_max": 60, + "description": "Number of worker threads requested", + "format": "%d", + "value_type": "uint", + "groups": "memcached", + "units": "items", + "name": "threads" + }, + { + "slope": "positive", + "time_max": 60, + "description": "Number of yields for connections", + "format": "%d", + "value_type": "uint", + "groups": "memcached", + "units": "items", + "name": "conn_yields" + }, + { + "slope": "positive", + "time_max": 60, + "description": "Age of the oldest item within slabs (mean)", + "format": "%.2f", + "value_type": "float", + "groups": "memcached", + "units": "items", + "name": "age_mean" + }, + { + "slope": "positive", + "time_max": 60, + "description": "Age of the oldest item within slabs (median)", + "format": "%.2f", + "value_type": "float", + "groups": "memcached", + "units": "items", + "name": "age_median" + }, + { + "slope": "positive", + "time_max": 60, + "description": "Age of the oldest item within slabs (min)", + "format": "%.2f", + "value_type": "float", + "groups": "memcached", + "units": "items", + "name": "age_min" + }, + { + "slope": "positive", + "time_max": 60, + "description": "The age of the oldest item within slabs (max)", + "format": "%.2f", + "value_type": "float", + "groups": "memcached", + "units": "items", + "name": "age_max" + } +] From 4d1f4d544b751a9c346bed0d43b6b598a968d11f Mon Sep 17 00:00:00 2001 
From: Andreas Lappe Date: Sun, 12 Aug 2012 13:42:55 +0200 Subject: [PATCH 18/39] Add jenkins module. --- jenkins/README.mkdn | 29 ++++ jenkins/conf.d/jenkins.pyconf | 83 ++++++++++ jenkins/python_modules/jenkins.py | 254 ++++++++++++++++++++++++++++++ 3 files changed, 366 insertions(+) create mode 100644 jenkins/README.mkdn create mode 100644 jenkins/conf.d/jenkins.pyconf create mode 100644 jenkins/python_modules/jenkins.py diff --git a/jenkins/README.mkdn b/jenkins/README.mkdn new file mode 100644 index 00000000..33309f55 --- /dev/null +++ b/jenkins/README.mkdn @@ -0,0 +1,29 @@ +jenkins +======= + +python module for ganglia 3.1. + +## Metrics +* Number of total executors +* Number of busy executors +* Length of the queue +* Total number of jobs +* Number of jobs with blue status +* Number of jobs with red status +* Number of jobs with yellow status +* Number of jobs with grey status +* Number of jobs with aborted status +* Number of jobs with not-built status +* Number of jobs with disabled status + +## Parameters + * base_url (The URL to query for Jenkins statistics. Default: 'http://127.0.0.1:8080' + +## Notes + * This has been tested with: + - python 2.7.1 on Mac OS X + - python 2.7.3 on Ubuntu 12.04 + +## AUTHORS + +Andreas Lappe diff --git a/jenkins/conf.d/jenkins.pyconf b/jenkins/conf.d/jenkins.pyconf new file mode 100644 index 00000000..8f64efbb --- /dev/null +++ b/jenkins/conf.d/jenkins.pyconf @@ -0,0 +1,83 @@ +# + +modules { + module { + name = 'jenkins' + language = 'python' + + param base_url { + value = 'http://127.0.0.1:8080' + } + } +} + +collection_group { + collect_every = 10 + time_threshold = 20 + + metric { + name = 'jenkins_overallload_busy_executors' + title = 'Number of busy executors on master and slaves' + value_threshold = 1.0 + } + + metric { + name = 'jenkins_overallload_queue_length' + title = 'Length of the queue on master and slaves' + value_threshold = 1.0 + } + + metric { + name = 'jenkins_overallload_total_executors' + title = 'Number of executors on master and slaves' + value_threshold = 1.0 + } + + metric { + name = 'jenkins_jobs_total' + title = 'Total number of jobs' + value_threshold = 1.0 + } + + metric { + name = 'jenkins_jobs_blue' + title = 'Number of jobs with status blue' + value_threshold = 1.0 + } + + metric { + name = 'jenkins_jobs_red' + title = 'Number of jobs with status red' + value_threshold = 1.0 + } + + metric { + name = 'jenkins_jobs_yellow' + title = 'Number of jobs with status yellow' + value_threshold = 1.0 + } + + metric { + name = 'jenkins_jobs_grey' + title = 'Number of jobs with status grey' + value_threshold = 1.0 + } + + metric { + name = 'jenkins_jobs_aborted' + title = 'Number of jobs with status aborted' + value_threshold = 1.0 + } + + metric { + name = 'jenkins_jobs_notbuilt' + title = 'Number of jobs with status notbuilt' + value_threshold = 1.0 + } + + metric { + name = 'jenkins_jobs_disabled' + title = 'Number of jobs with status disabled' + value_threshold = 1.0 + } +} diff --git a/jenkins/python_modules/jenkins.py b/jenkins/python_modules/jenkins.py new file mode 100644 index 00000000..e7cef064 --- /dev/null +++ b/jenkins/python_modules/jenkins.py @@ -0,0 +1,254 @@ +### This script reports jenkins metrics to ganglia. 
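+### It queries the Jenkins remote API at <base_url>/api/json, using a tree
+### filter for job colors and overall load, and derives per-color job counts.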
+ +### License to use, modify, and distribute under the GPL +### http://www.gnu.org/licenses/gpl.txt +import logging +import os +import subprocess +import sys +import threading +import time +import traceback +import urllib2 +import json + +logging.basicConfig(level=logging.ERROR) + +_Worker_Thread = None + +class UpdateJenkinsThread(threading.Thread): + + def __init__(self, params): + threading.Thread.__init__(self) + self.running = False + self.shuttingdown = False + self.metrics = {} + self.settings = {} + self.refresh_rate = 60 + self.base_url = params['base_url'] + self._metrics_lock = threading.Lock() + self._settings_lock = threading.Lock() + + def shutdown(self): + self.shuttingdown = True + if not self.running: + return + self.join() + + def run(self): + global _Lock + + self.running = True + + while not self.shuttingdown: + time.sleep(self.refresh_rate) + self.refresh_metrics() + + self.running = False + + @staticmethod + def _get_jenkins_statistics(url): + + url += '/api/json' + url += '?tree=jobs[color],overallLoad[busyExecutors[min[latest]],queueLength[min[latest]],totalExecutors[min[latest]]]' + + c = urllib2.urlopen(url) + json_data = c.read() + c.close() + + data = json.loads(json_data) + + result = {} + result['jenkins_overallload_busy_executors'] = data['overallLoad']['busyExecutors']['min']['latest'] + result['jenkins_overallload_queue_length'] = data['overallLoad']['queueLength']['min']['latest'] + result['jenkins_overallload_total_executors'] = data['overallLoad']['totalExecutors']['min']['latest'] + result['jenkins_jobs_total'] = len(data['jobs']) + result['jenkins_jobs_red'] = result['jenkins_jobs_yellow'] = result['jenkins_jobs_grey'] = result['jenkins_jobs_disabled'] = result['jenkins_jobs_aborted'] = result['jenkins_jobs_notbuilt'] = result['jenkins_jobs_blue'] = 0 + + # Possible values: http://javadoc.jenkins-ci.org/hudson/model/BallColor.html + colors = ['red', 'yellow', 'grey', 'disabled', 'aborted', 'notbuilt', 'blue'] + for color in colors: + result['jenkins_jobs_' + color] = 0 + for job in data['jobs']: + color = job['color'] + for c in colors: + if color == c or color == c + '_anime': + result['jenkins_jobs_' + c] += 1 + return result + + def refresh_metrics(self): + logging.debug('refresh metrics') + + try: + logging.debug(' opening URL: ' + str(self.base_url)) + data = UpdateJenkinsThread._get_jenkins_statistics(self.base_url) + except: + logging.warning('error refreshing metrics') + logging.warning(traceback.print_exc(file=sys.stdout)) + + try: + self._metrics_lock.acquire() + self.metrics = {} + for k, v in data.items(): + self.metrics[k] = v + except: + logging.warning('error refreshing metrics') + logging.warning(traceback.print_exc(file=sys.stdout)) + return False + + finally: + self._metrics_lock.release() + + if not self.metrics: + logging.warning('error refreshing metrics') + return False + + logging.debug('success refreshing metrics') + logging.debug('metrics: ' + str(self.metrics)) + + return True + + def metric_of(self, name): + logging.debug('getting metric: ' + name) + + try: + if name in self.metrics: + try: + self._metrics_lock.acquire() + logging.debug('metric: %s = %s' % (name, self.metrics[name])) + return self.metrics[name] + finally: + self._metrics_lock.release() + except: + logging.warning('failed to fetch ' + name) + return 0 + + def setting_of(self, name): + logging.debug('getting setting: ' + name) + + try: + if name in self.settings: + try: + self._settings_lock.acquire() + logging.debug('setting: %s = %s' % (name, 
self.settings[name])) + return self.settings[name] + finally: + self._settings_lock.release() + except: + logging.warning('failed to fetch ' + name) + return 0 + +def metric_init(params): + logging.debug('init: ' + str(params)) + global _Worker_Thread + + METRIC_DEFAULTS = { + 'units': 'jobs', + 'groups': 'jenkins', + 'slope': 'both', + 'value_type': 'uint', + 'format': '%d', + 'description': '', + 'call_back': metric_of + } + + descriptions = dict( + jenkins_overallload_busy_executors = { + 'value_type': 'float', + 'format': '%.3f', + 'units': 'executors', + 'description': 'Number of busy executors (master and slaves)'}, + jenkins_overallload_queue_length = { + 'value_type': 'float', + 'format': '%.3f', + 'units': 'queued items', + 'description': 'Length of the queue (master and slaves)'}, + jenkins_overallload_total_executors = { + 'value_type': 'float', + 'format': '%.3f', + 'units': 'executors', + 'description': 'Number of executors (master and slaves)'}, + jenkins_jobs_total = { + 'description': 'Total number of jobs'}, + jenkins_jobs_blue = { + 'description': 'Blue jobs'}, + jenkins_jobs_red = { + 'description': 'Red jobs'}, + jenkins_jobs_yellow = { + 'description': 'Yellow jobs'}, + jenkins_jobs_grey = { + 'description': 'Grey jobs'}, + jenkins_jobs_disabled = { + 'description': 'Disabled jobs'}, + jenkins_jobs_aborted = { + 'description': 'Aborted jobs'}, + jenkins_jobs_notbuilt = { + 'description': 'Not-built jobs'}) + + if _Worker_Thread is not None: + raise Exception('Worker thread already exists') + + _Worker_Thread = UpdateJenkinsThread(params) + _Worker_Thread.refresh_metrics() + _Worker_Thread.start() + + descriptors = [] + + for name, desc in descriptions.iteritems(): + d = desc.copy() + d['name'] = str(name) + [ d.setdefault(key, METRIC_DEFAULTS[key]) for key in METRIC_DEFAULTS.iterkeys() ] + descriptors.append(d) + return descriptors + +def metric_of(name): + global _Worker_Thread + return _Worker_Thread.metric_of(name) + +def setting_of(name): + global _Worker_Thread + return _Worker_Thread.setting_of(name) + +def metric_cleanup(): + global _Worker_Thread + if _Worker_Thread is not None: + _Worker_Thread.shutdown() + logging.shutdown() + pass + +if __name__ == '__main__': + from optparse import OptionParser + + try: + logging.debug('running from the cmd line') + parser = OptionParser() + parser.add_option('-u', '--URL', dest='base_url', default='http://127.0.0.1:8080', help='Base-URL for jenkins api (default: http://127.0.0.1:8080)') + parser.add_option('-q', '--quiet', dest='quiet', action='store_true', default=False) + parser.add_option('-d', '--debug', dest='debug', action='store_true', default=False) + + (options, args) = parser.parse_args() + + descriptors = metric_init({ + 'base_url': options.base_url, + }) + + if options.debug: + from pprint import pprint + pprint(descriptors) + + for d in descriptors: + v = d['call_back'](d['name']) + + if not options.quiet: + print ' {0}: {1} {2} [{3}]' . 
format(d['name'], v, d['units'], d['description']) + + os._exit(1) + + except KeyboardInterrupt: + time.sleep(0.2) + os._exit(1) + except StandardError: + traceback.print_exc() + os._exit(1) + finally: + metric_cleanup() From 1d9a0c06842d0f761b443d43daa5d4785cf94d9a Mon Sep 17 00:00:00 2001 From: Evan Fraser Date: Mon, 13 Aug 2012 17:00:23 +1200 Subject: [PATCH 19/39] Added Netapp API based volume latency and iop module --- netapp_api/README.mkdn | 19 ++ netapp_api/conf.d/netapp_api.pyconf | 18 ++ netapp_api/python_modules/netapp_api.py | 346 ++++++++++++++++++++++++ 3 files changed, 383 insertions(+) create mode 100644 netapp_api/README.mkdn create mode 100644 netapp_api/conf.d/netapp_api.pyconf create mode 100755 netapp_api/python_modules/netapp_api.py diff --git a/netapp_api/README.mkdn b/netapp_api/README.mkdn new file mode 100644 index 00000000..d062a179 --- /dev/null +++ b/netapp_api/README.mkdn @@ -0,0 +1,19 @@ +NetApp Filer API metrics +======================== + +This is a GMOND Python Module that gathers metrics from NetApp appliances via the Netapp Data ONTAP APIs. +The API allows counter access to many more metrics than available through SNMP. + +This module currently gathers per volume Read/Write/Average IOPs and Latency and handles multiple filers. + +## DEPENDS + * Netapp Managemability SDK 5.0 (download from now.netapp.com to /opt/netapp) + +## USAGE + * Save the netapp_api.pyconf into /etc/ganglia/conf.d + * Save the netapp_api.py into your ganglia python module dir eg: /usr/lib/ganglia/python_modules. + * Update the Username, password, IP and filer name. + * Restart gmond and the volume latency & iop metrics should appear in ganglia. + +## AUTHOR + * Author: Evan Fraser <evan.fraser@trademe.co.nz> diff --git a/netapp_api/conf.d/netapp_api.pyconf b/netapp_api/conf.d/netapp_api.pyconf new file mode 100644 index 00000000..906f3c8f --- /dev/null +++ b/netapp_api/conf.d/netapp_api.pyconf @@ -0,0 +1,18 @@ +modules { + module { + name = "netapp_api" + language = "python" + } +} +#/* Collection groups for the +# example python module */ +collection_group { + collect_every = 15 + time_threshold = 70 + metric { + name_match = "(.+)latency" + } + metric { + name_match = "(.+)ops" + } +} diff --git a/netapp_api/python_modules/netapp_api.py b/netapp_api/python_modules/netapp_api.py new file mode 100755 index 00000000..c417f158 --- /dev/null +++ b/netapp_api/python_modules/netapp_api.py @@ -0,0 +1,346 @@ +#!/usr/bin/python +import sys +import time +import pprint +import unicodedata +import os + +sys.path.append("/opt/netapp/lib/python/NetApp") +from NaServer import * + +descriptors = list() +params = {} +filerdict = {} +FASMETRICS = { + 'time' : 0, + 'data' : {} +} +LAST_FASMETRICS = dict(FASMETRICS) +#This is the minimum interval between querying the RPA for metrics +FASMETRICS_CACHE_MAX = 10 + +def get_metrics(name): + global FASMETRICS, LAST_FASMETRICS, FASMETRICS_CACHE_MAX, params + max_records = 10 + metrics = {} + if (time.time() - FASMETRICS['time']) > FASMETRICS_CACHE_MAX: + + for filer in filerdict.keys(): + s = NaServer(filerdict[filer]['ipaddr'], 1, 3) + out = s.set_transport_type('HTTPS') + if (out and out.results_errno() != 0) : + r = out.results_reason() + print ("Connection to filer failed: " + r + "\n") + sys.exit(2) + + out = s.set_style('LOGIN') + if (out and out.results_errno() != 0) : + r = out.results_reason() + print ("Connection to filer failed: " + r + "\n") + sys.exit(2) + out = s.set_admin_user(filerdict[filer]['user'], filerdict[filer]['password']) + perf_in 
= NaElement("perf-object-get-instances-iter-start") + #Hard coding volume object for testing + obj_name = "volume" + perf_in.child_add_string("objectname", obj_name) + #Create object of type counters + counters = NaElement("counters") + #Add counter names to the object + counters.child_add_string("counter", "total_ops") + counters.child_add_string("counter", "avg_latency") + counters.child_add_string("counter", "read_ops") + counters.child_add_string("counter", "read_latency") + counters.child_add_string("counter", "write_ops") + counters.child_add_string("counter", "write_latency") + + perf_in.child_add(counters) + + #Invoke API + out = s.invoke_elem(perf_in) + + if(out.results_status() == "failed"): + print(out.results_reason() + "\n") + sys.exit(2) + + iter_tag = out.child_get_string("tag") + num_records = 1 + + filername = filerdict[filer]['name'] + + while(int(num_records) != 0): + perf_in = NaElement("perf-object-get-instances-iter-next") + perf_in.child_add_string("tag", iter_tag) + perf_in.child_add_string("maximum", max_records) + out = s.invoke_elem(perf_in) + + if(out.results_status() == "failed"): + print(out.results_reason() + "\n") + sys.exit(2) + + num_records = out.child_get_int("records") + + if(num_records > 0) : + instances_list = out.child_get("instances") + instances = instances_list.children_get() + + for inst in instances: + inst_name = unicodedata.normalize('NFKD',inst.child_get_string("name")).encode('ascii','ignore') + counters_list = inst.child_get("counters") + counters = counters_list.children_get() + + for counter in counters: + counter_name = unicodedata.normalize('NFKD',counter.child_get_string("name")).encode('ascii','ignore') + counter_value = counter.child_get_string("value") + counter_unit = counter.child_get_string("unit") + metrics[filername + '_vol_' + inst_name + '_' + counter_name] = float(counter_value) + # update cache + LAST_FASMETRICS = dict(FASMETRICS) + FASMETRICS = { + 'time': time.time(), + 'data': metrics + } + + + else: + metrics = FASMETRICS['data'] + #print name + #calculate change in values and return + if 'total_ops' in name: + try: + delta = float(FASMETRICS['data'][name] - LAST_FASMETRICS['data'][name])/(FASMETRICS['time'] - LAST_FASMETRICS['time']) + if delta < 0: + print "Less than 0" + delta = 0 + except StandardError: + delta = 0 + #This is the Operations per second + return delta + + elif 'avg_latency' in name: + try: + #T1 and T2 + #(T2_lat - T1_lat) / (T2_ops - T1_ops) + #Find the metric name of the base counter + total_ops_name = name.replace('avg_latency', 'total_ops') + #Calculate latency in time (div 100 to change to ms) + return float((FASMETRICS['data'][name] - LAST_FASMETRICS['data'][name]) / (FASMETRICS['data'][total_ops_name] -LAST_FASMETRICS['data'][total_ops_name])) / 100 + except StandardError: + return 0 + elif 'read_ops' in name: + + try: + delta = float(FASMETRICS['data'][name] - LAST_FASMETRICS['data'][name])/(FASMETRICS['time'] - LAST_FASMETRICS['time']) + if delta < 0: + print "Less than 0" + delta = 0 + except StandardError: + delta = 0 + return delta + + elif 'read_latency' in name: + try: + read_ops_name = name.replace('read_latency', 'read_ops') + return float((FASMETRICS['data'][name] - LAST_FASMETRICS['data'][name]) / (FASMETRICS['data'][read_ops_name] -LAST_FASMETRICS['data'][read_ops_name])) / 100 + except StandardError: + return 0 + elif 'write_ops' in name: + try: + delta = float(FASMETRICS['data'][name] - LAST_FASMETRICS['data'][name])/(FASMETRICS['time'] - LAST_FASMETRICS['time']) + if delta < 0: 
+ print "Less than 0" + delta = 0 + except StandardError: + delta = 0 + return delta + + elif 'write_latency' in name: + try: + write_ops_name = name.replace('write_latency', 'write_ops') + return float((FASMETRICS['data'][name] - LAST_FASMETRICS['data'][name]) / (FASMETRICS['data'][write_ops_name] -LAST_FASMETRICS['data'][write_ops_name])) / 100 + except StandardError: + return 0 + + + return 0 + + + +def create_desc(skel, prop): + d = skel.copy() + for k,v in prop.iteritems(): + d[k] = v + return d + +def define_metrics(Desc_Skel,params): + max_records = 10 + for filer in params.keys(): + s = NaServer(params[filer]['ipaddr'], 1, 3) + out = s.set_transport_type('HTTPS') + if (out and out.results_errno() != 0) : + r = out.results_reason() + print ("Connection to filer failed: " + r + "\n") + sys.exit(2) + + out = s.set_style('LOGIN') + if (out and out.results_errno() != 0) : + r = out.results_reason() + print ("Connection to filer failed: " + r + "\n") + sys.exit(2) + out = s.set_admin_user(params[filer]['user'], params[filer]['password']) + perf_in = NaElement("perf-object-get-instances-iter-start") + #Hard coded volume, only volume stats gathered at present + obj_name = "volume" + perf_in.child_add_string("objectname", obj_name) + #Create object of type counters + counters = NaElement("counters") + #Add counter names to the object + counters.child_add_string("counter", "total_ops") + counters.child_add_string("counter", "avg_latency") + counters.child_add_string("counter", "read_ops") + counters.child_add_string("counter", "read_latency") + counters.child_add_string("counter", "write_ops") + counters.child_add_string("counter", "write_latency") + + perf_in.child_add(counters) + + #Invoke API + out = s.invoke_elem(perf_in) + + if(out.results_status() == "failed"): + print(out.results_reason() + "\n") + sys.exit(2) + + iter_tag = out.child_get_string("tag") + num_records = 1 + filername = params[filer]['name'] + + while(int(num_records) != 0): + perf_in = NaElement("perf-object-get-instances-iter-next") + perf_in.child_add_string("tag", iter_tag) + perf_in.child_add_string("maximum", max_records) + out = s.invoke_elem(perf_in) + + if(out.results_status() == "failed"): + print(out.results_reason() + "\n") + sys.exit(2) + + num_records = out.child_get_int("records") + + if(num_records > 0) : + instances_list = out.child_get("instances") + instances = instances_list.children_get() + + for inst in instances: + inst_name = unicodedata.normalize('NFKD',inst.child_get_string("name")).encode('ascii','ignore') + #print ("Instance = " + inst_name + "\n") + counters_list = inst.child_get("counters") + counters = counters_list.children_get() + + for counter in counters: + counter_name = unicodedata.normalize('NFKD',counter.child_get_string("name")).encode('ascii','ignore') + counter_value = counter.child_get_string("value") + counter_unit = counter.child_get_string("unit") + if 'total_ops' in counter_name: + descriptors.append(create_desc(Desc_Skel, { + "name" : filername + '_vol_' + inst_name + '_' + counter_name, + "units" : 'iops', + "description" : "volume iops", + "spoof_host" : params[filer]['ipaddr'] + ':' + params[filer]['name'], + "groups" : "iops" + })) + elif 'avg_latency' in counter_name: + descriptors.append(create_desc(Desc_Skel, { + "name" : filername + '_vol_' + inst_name + '_' + counter_name, + "units" : 'ms', + "description" : "volume avg latency", + "spoof_host" : params[filer]['ipaddr'] + ':' + params[filer]['name'], + "groups" : "latency" + })) + elif 'read_ops' in counter_name: + 
descriptors.append(create_desc(Desc_Skel, { + "name" : filername + '_vol_' + inst_name + '_' + counter_name, + "units" : 'iops', + "description" : "volume read iops", + "spoof_host" : params[filer]['ipaddr'] + ':' + params[filer]['name'], + "groups" : "iops" + })) + elif 'read_latency' in counter_name: + descriptors.append(create_desc(Desc_Skel, { + "name" : filername + '_vol_' + inst_name + '_' + counter_name, + "units" : 'ms', + "description" : "volume read latency", + "spoof_host" : params[filer]['ipaddr'] + ':' + params[filer]['name'], + "groups" : "latency" + })) + elif 'write_ops' in counter_name: + descriptors.append(create_desc(Desc_Skel, { + "name" : filername + '_vol_' + inst_name + '_' + counter_name, + "units" : 'iops', + "description" : "volume write iops", + "spoof_host" : params[filer]['ipaddr'] + ':' + params[filer]['name'], + "groups" : "iops" + })) + elif 'write_latency' in counter_name: + descriptors.append(create_desc(Desc_Skel, { + "name" : filername + '_vol_' + inst_name + '_' + counter_name, + "units" : 'ms', + "description" : "volume write latency", + "spoof_host" : params[filer]['ipaddr'] + ':' + params[filer]['name'], + "groups" : "latency" + })) + + return descriptors + +def metric_init(params): + global descriptors,filerdict + print 'netapp_stats] Received the following parameters' + pprint.pprint(params) + params = { + 'filer1' : { + 'name' : 'filer1.localdomain', + 'ipaddr' : '192.168.1.100', + 'user' : 'root', + 'password' : 'password', + }, + } + + filerdict = dict(params) + Desc_Skel = { + 'name' : 'XXX', + 'call_back' : get_metrics, + 'time_max' : 60, + 'value_type' : 'double', + 'format' : '%0f', + 'units' : 'XXX', + 'slope' : 'both', + 'description' : 'XXX', + 'groups' : 'netiron', + 'spoof_host' : 'XXX', + } + + # Run define_metrics + descriptors = define_metrics(Desc_Skel,params) + + return descriptors + +# For CLI Debugging: +if __name__ == '__main__': + #global params + params = { + 'filer1' : { + 'name' : 'filer1.localdomain', + 'ipaddr' : '192.168.1.100', + 'user' : 'root', + 'password' : 'password', + }, + } + descriptors = metric_init(params) + pprint.pprint(descriptors) + #print len(descriptors) + while True: + for d in descriptors: + v = d['call_back'](d['name']) + #print v + print 'value for %s is %.2f' % (d['name'], v) + print 'Sleeping 5 seconds' + time.sleep(5) From adc01a52e324e18fe827c75facba8446f2d37a2e Mon Sep 17 00:00:00 2001 From: Evan Fraser Date: Mon, 13 Aug 2012 17:12:47 +1200 Subject: [PATCH 20/39] Added name,desc and author header --- netapp_api/python_modules/netapp_api.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/netapp_api/python_modules/netapp_api.py b/netapp_api/python_modules/netapp_api.py index c417f158..50089c16 100755 --- a/netapp_api/python_modules/netapp_api.py +++ b/netapp_api/python_modules/netapp_api.py @@ -1,4 +1,9 @@ #!/usr/bin/python +#Name: netapp_api.py +#Desc: Uses Netapp Data Ontap API to get per volume latency & iops metrics. 
Download the Manageability SDK from now.netapp.com
+#Author: Evan Fraser
+#Date: 13/08/2012
+
 import sys
 import time
 import pprint

From 021ffbfb30b87062935b7962c5637acafdc0ef32 Mon Sep 17 00:00:00 2001
From: Evan Fraser
Date: Wed, 15 Aug 2012 16:38:42 +1200
Subject: [PATCH 21/39] Added Consistency Group Lag metrics

---
 recoverpoint/README.mkdn         |  6 ++-
 recoverpoint/recoverpoint.py     | 64 +++++++++++++++++++++++++++++---
 recoverpoint/recoverpoint.pyconf |  9 +++++
 3 files changed, 72 insertions(+), 7 deletions(-)

diff --git a/recoverpoint/README.mkdn b/recoverpoint/README.mkdn
index 2a0ba382..84504077 100644
--- a/recoverpoint/README.mkdn
+++ b/recoverpoint/README.mkdn
@@ -3,6 +3,10 @@ EMC RecoverPoint

 This is a GMOND Python Module that gets metrics from EMC RecoverPoint replication appliances.

+Currently gathers:
+ * Per RPA WAN/SAN traffic and Latency
+ * Per Consistency Group Write, Data and Time lags.
+
 ## DEPENDS
  * python YAML
  * paramiko modules
@@ -15,4 +19,4 @@ This is a GMOND Python Module that gets metrics from EMC RecoverPoint replicatio

 ## AUTHOR

-Author: Evan Fraser <evan.fraser@trademe.co.nz>
\ No newline at end of file
+Author: Evan Fraser <evan.fraser@trademe.co.nz>
diff --git a/recoverpoint/recoverpoint.py b/recoverpoint/recoverpoint.py
index 8e3486bb..05fead4c 100755
--- a/recoverpoint/recoverpoint.py
+++ b/recoverpoint/recoverpoint.py
@@ -22,7 +22,7 @@
 }
 #This is the minimum interval between querying the RPA for metrics.
 #Each ssh query takes 1.6s so we limit the interval between getting metrics to this interval.
-NIMETRICS_CACHE_MAX = 5
+NIMETRICS_CACHE_MAX = 10

 ipaddr = ''

@@ -56,7 +56,26 @@ def define_metrics(Desc_Skel, statsDict):
                     "description" : net + ' traffic',
                     "groups" : net + " Traffic",
                     }))
-
+    #Define Consistency Group metrics; these are painfully nested in the dict.
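+    #(Sketch of the assumed YAML shape, with illustrative group/replica names
+    # and values; only the 'Group', 'Link stats', 'Replication' and 'Lag' keys
+    # are taken from the code below:
+    #   statsDict['Group'][<cg>]['Link stats'][<replica>]['Replication']['Lag']
+    #     -> {'Writes': 123, 'Data': '1.5GB', 'Time': '10 sec'} )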
+ for group in statsDict['Group']: + for repname in statsDict['Group'][group]['Link stats']: + #Define CG Lag metrics + for lagfields in statsDict['Group'][group]['Link stats'][repname]['Replication']['Lag']: + #print lagfields + ' = ' + str(statsDict['Group'][group]['Link stats'][repname]['Replication']['Lag'][lagfields]) + lagunit = '' + if 'Writes' in lagfields: + lagunit = 'Writes' + elif 'Data' in lagfields: + lagunit = 'Bytes' + elif 'Time' in lagfields: + lagunit = 'Seconds' + descriptors.append(create_desc(Desc_Skel, { + "name" : group + '_Lag_' + lagfields, + "units" : lagunit, + "description" : group + ' Lag ' + lagunit, + "groups" : 'Lag', + })) + return descriptors def create_desc(skel, prop): @@ -74,8 +93,11 @@ def get_metrics(name): sshcon = paramiko.SSHClient() sshcon.set_missing_host_key_policy(paramiko.AutoAddPolicy()) sshcon.connect(ipaddr, username='monitor',password='monitor',look_for_keys='False') - stdin, stdout, sterr = sshcon.exec_command("get_system_statistics") - rawmetrics = yaml.load(stdout) + stdin, stdout, sterr = sshcon.exec_command("get_system_statistics;get_group_statistics") + rawdata = stdout.read() + #Group stats don't leave a space after the colon in some places + rawmetrics = yaml.safe_load(rawdata.replace(':N',': N')) + #Get RPA metrics for rpa in rawmetrics['RPA statistics']: for metric in rawmetrics['RPA statistics'][rpa]: if "Latency (ms)" in metric: @@ -86,6 +108,31 @@ def get_metrics(name): traffic,junk = rawmetrics['RPA statistics'][rpa]['Traffic']['Application'][net].split() metrics[(rpa.lower()).replace(' ','_') + '_' + net.lower()] = int(traffic) + for group in rawmetrics['Group']: + for repname in rawmetrics['Group'][group]['Link stats']: + for lagfields in rawmetrics['Group'][group]['Link stats'][repname]['Replication']['Lag']: + print lagfields + ' = ' + str(rawmetrics['Group'][group]['Link stats'][repname]['Replication']['Lag'][lagfields]) + if 'Data' in lagfields: + #Convert 12.34(GB|MB|KB) to bytes + datastr = rawmetrics['Group'][group]['Link stats'][repname]['Replication']['Lag'][lagfields] + print datastr + amount = float(datastr[:-2]) + unitstr = datastr[-2:] + if 'MB' in unitstr: + amount = amount * 1024 * 1024 + elif 'KB' in unitstr: + amount = amount * 1024 + elif 'GB' in unitstr: + amount = amount * 1024 * 1024 * 1024 + metrics[group + '_Lag_' + lagfields] = amount + #metrics[group + '_Lag_' + lagfields] = float(rawmetrics['Group'][group]['Link stats'][repname]['Replication']['Lag'][lagfields]) + elif 'Time' in lagfields: + #Strip 'sec' from value, convert to float. 
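+                    #e.g. an assumed raw value of '12 sec': [:-3] drops the
+                    #trailing 'sec', and float('12 ') parses fine.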
+                    lagtime = float(rawmetrics['Group'][group]['Link stats'][repname]['Replication']['Lag'][lagfields][:-3])
+                    metrics[group + '_Lag_' + lagfields] = lagtime
+                else:
+                    #Writes Lag
+                    metrics[group + '_Lag_' + lagfields] = float(rawmetrics['Group'][group]['Link stats'][repname]['Replication']['Lag'][lagfields])

     NIMETRICS = {
         'time': time.time(),
@@ -120,8 +167,10 @@ def metric_init(params):
     sshcon = paramiko.SSHClient()
     sshcon.set_missing_host_key_policy(paramiko.AutoAddPolicy())
     sshcon.connect(ipaddr, username='monitor',password='monitor',look_for_keys='False')
-    stdin, stdout, sterr = sshcon.exec_command("get_system_statistics")
-    statsDict = yaml.load(stdout)
+    stdin, stdout, sterr = sshcon.exec_command("get_system_statistics;get_group_statistics")
+    rawdata = stdout.read()
+    #Group stats don't leave a space after the colon in some places
+    statsDict = yaml.safe_load(rawdata.replace(':N',': N'))
     sshcon.close()

     descriptors = define_metrics(Desc_Skel, statsDict)
@@ -131,8 +180,11 @@ def metric_init(params):
 if __name__ == '__main__':
     params = {
         'mgmtip' : '192.168.1.100',
+        }
     descriptors = metric_init(params)
+    pprint.pprint(descriptors)
+    print len(descriptors)
     while True:
         for d in descriptors:
             v = d['call_back'](d['name'])
diff --git a/recoverpoint/recoverpoint.pyconf b/recoverpoint/recoverpoint.pyconf
index 448b2a62..b5329441 100644
--- a/recoverpoint/recoverpoint.pyconf
+++ b/recoverpoint/recoverpoint.pyconf
@@ -24,5 +24,14 @@ collection_group {
   metric {
     name_match = "site2(.+)"
   }
+  metric {
+    name_match = "(.+)Time"
+  }
+  metric {
+    name_match = "(.+)Data"
   }
+  metric {
+    name_match = "(.+)Writes"
+  }
+}

From 7d0652c79f163a9ae5ccecb5365b698cd64f422b Mon Sep 17 00:00:00 2001
From: Evan Fraser
Date: Thu, 16 Aug 2012 12:04:15 +1200
Subject: [PATCH 22/39] Added CG WAN/SAN and journal lag metrics

---
 recoverpoint/README.mkdn         |  2 +-
 recoverpoint/recoverpoint.py     | 70 ++++++++++++----
 recoverpoint/recoverpoint.pyconf | 20 ++++---
 3 files changed, 74 insertions(+), 18 deletions(-)

diff --git a/recoverpoint/README.mkdn b/recoverpoint/README.mkdn
index 84504077..5c25b7b6 100644
--- a/recoverpoint/README.mkdn
+++ b/recoverpoint/README.mkdn
@@ -5,7 +5,7 @@ This is a GMOND Python Module that gets metrics from EMC RecoverPoint replicatio

 Currently gathers:
  * Per RPA WAN/SAN traffic and Latency
- * Per Consistency Group Write, Data and Time lags.
+ * Per Consistency Group Write, Data, Time and Journal Lags, as well as WAN and SAN traffic.

 ## DEPENDS
  * python YAML
diff --git a/recoverpoint/recoverpoint.py b/recoverpoint/recoverpoint.py
index 05fead4c..71630905 100755
--- a/recoverpoint/recoverpoint.py
+++ b/recoverpoint/recoverpoint.py
@@ -47,7 +47,7 @@ def define_metrics(Desc_Skel, statsDict):
                     "groups" : "Latency"
                     }))
         if "Traffic" in metric:
-            #define the Application/[SAN|WAN] metrics
+            #define the Appliance/[SAN|WAN] metrics
             for net in statsDict['RPA statistics'][rpa]['Traffic']['Application'].keys():
                 #print net
                 descriptors.append(create_desc(Desc_Skel, {
@@ -56,12 +56,38 @@ def define_metrics(Desc_Skel, statsDict):
                     "description" : net + ' traffic',
                     "groups" : net + " Traffic",
                     }))
+    #Define Consistency Group metrics; these are painfully nested in the dict.
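+    #(Assumption: statsDict['Group'][<cg>]['Copy stats'][<policy>] holds a
+    #'SAN traffic' sub-dict for some copies and a 'Journal' sub-dict for
+    #others, so each branch below matches a different policy entry.)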
for group in statsDict['Group']: + #CG SAN and Journal lag are under the policies + for policyname in statsDict['Group'][group]['Copy stats']: + if 'SAN traffic' in statsDict['Group'][group]['Copy stats'][policyname]: + descriptors.append(create_desc(Desc_Skel, { + "name" : group + '_SAN_Traffic', + "units" : 'Bits/s', + "description" : group + ' SAN Traffic', + "groups" : 'SAN Traffic', + })) + elif 'Journal' in statsDict['Group'][group]['Copy stats'][policyname]: + descriptors.append(create_desc(Desc_Skel, { + "name" : group + '_Journal_Lag', + "units" : 'Bytes', + "description" : group + ' Journal Lag', + "groups" : 'Lag', + })) + + #CG Lag and WAN stats are in the Link stats section for repname in statsDict['Group'][group]['Link stats']: + #Define CG WAN traffic metrics + descriptors.append(create_desc(Desc_Skel, { + "name" : group + '_WAN_Traffic', + "units" : 'Bits/s', + "description" : group + ' WAN Traffic', + "groups" : 'WAN Traffic', + })) + #Define CG Lag metrics for lagfields in statsDict['Group'][group]['Link stats'][repname]['Replication']['Lag']: - #print lagfields + ' = ' + str(statsDict['Group'][group]['Link stats'][repname]['Replication']['Lag'][lagfields]) lagunit = '' if 'Writes' in lagfields: lagunit = 'Writes' @@ -70,12 +96,12 @@ def define_metrics(Desc_Skel, statsDict): elif 'Time' in lagfields: lagunit = 'Seconds' descriptors.append(create_desc(Desc_Skel, { - "name" : group + '_Lag_' + lagfields, - "units" : lagunit, - "description" : group + ' Lag ' + lagunit, - "groups" : 'Lag', - })) - + "name" : group + '_Lag_' + lagfields, + "units" : lagunit, + "description" : group + ' Lag ' + lagunit, + "groups" : 'Lag', + })) + return descriptors def create_desc(skel, prop): @@ -106,16 +132,36 @@ def get_metrics(name): #store the Application/[SAN|WAN] metrics for net in rawmetrics['RPA statistics'][rpa]['Traffic']['Application'].keys(): traffic,junk = rawmetrics['RPA statistics'][rpa]['Traffic']['Application'][net].split() - metrics[(rpa.lower()).replace(' ','_') + '_' + net.lower()] = int(traffic) + metrics[(rpa.lower()).replace(' ','_') + '_' + net.lower()] = float(traffic) for group in rawmetrics['Group']: + #CG SAN and Journal lag are under the policies + for policyname in rawmetrics['Group'][group]['Copy stats']: + #Get CG SAN metrics (remove 'Mbps' from end + convert to float and then bits) + if 'SAN traffic' in rawmetrics['Group'][group]['Copy stats'][policyname]: + metrics[group + '_SAN_Traffic'] = float(rawmetrics['Group'][group]['Copy stats'][policyname]['SAN traffic']['Current throughput'][:-4]) * 1024 * 1024 + elif 'Journal' in rawmetrics['Group'][group]['Copy stats'][policyname]: + datastr = rawmetrics['Group'][group]['Copy stats'][policyname]['Journal']['Journal lag'] + amount = float(datastr[:-2]) + unitstr = datastr[-2:] + if 'MB' in unitstr: + amount = amount * 1024 * 1024 + elif 'KB' in unitstr: + amount = amount * 1024 + elif 'GB' in unitstr: + amount = amount * 1024 * 1024 * 1024 + metrics[group + '_Journal_Lag'] = amount + #CG Lag and WAN stats are in the Link stats section for repname in rawmetrics['Group'][group]['Link stats']: + #Get CG WAN metrics (remove 'Mbps' from end + convert to float and then bits) + metrics[group + '_WAN_Traffic'] = float(rawmetrics['Group'][group]['Link stats'][repname]['Replication']['WAN traffic'][:-4]) * 1024 * 1024 + + #Get CG Lag metrics for lagfields in rawmetrics['Group'][group]['Link stats'][repname]['Replication']['Lag']: - print lagfields + ' = ' + str(rawmetrics['Group'][group]['Link 
stats'][repname]['Replication']['Lag'][lagfields]) if 'Data' in lagfields: #Convert 12.34(GB|MB|KB) to bytes datastr = rawmetrics['Group'][group]['Link stats'][repname]['Replication']['Lag'][lagfields] - print datastr + #print datastr amount = float(datastr[:-2]) unitstr = datastr[-2:] if 'MB' in unitstr: @@ -125,7 +171,7 @@ def get_metrics(name): elif 'GB' in unitstr: amount = amount * 1024 * 1024 * 1024 metrics[group + '_Lag_' + lagfields] = amount - #metrics[group + '_Lag_' + lagfields] = float(rawmetrics['Group'][group]['Link stats'][repname]['Replication']['Lag'][lagfields]) + elif 'Time' in lagfields: #Strip 'sec' from value, convert to float. lagtime = float(rawmetrics['Group'][group]['Link stats'][repname]['Replication']['Lag'][lagfields][:-3]) diff --git a/recoverpoint/recoverpoint.pyconf b/recoverpoint/recoverpoint.pyconf index b5329441..5d2c5c75 100644 --- a/recoverpoint/recoverpoint.pyconf +++ b/recoverpoint/recoverpoint.pyconf @@ -19,19 +19,29 @@ collection_group { collect_every = 20 time_threshold = 50 metric { - name_match = "site1(.+)" + name_match = "(.+)_wan" } metric { - name_match = "site2(.+)" + name_match = "(.+)_lan" + } + metric { + name_match = "(.+)_latency" } metric { name_match = "(.+)Time" - } + } metric { name_match = "(.+)Data" - } + } metric { name_match = "(.+)Writes" - } + } + metric { + name_match = "(.+)Traffic" + } + metric { + name_match = "(.+)Lag" + } + } From 7e40f7ea624757066e4fa56cb1da62f8f9aebcb9 Mon Sep 17 00:00:00 2001 From: Evan Fraser Date: Thu, 16 Aug 2012 14:44:42 +1200 Subject: [PATCH 23/39] Change fibrechannel readme to emphasis sysctl settings importance --- fibrechannel/README.mkdn | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/fibrechannel/README.mkdn b/fibrechannel/README.mkdn index 15df7ceb..1d137e71 100644 --- a/fibrechannel/README.mkdn +++ b/fibrechannel/README.mkdn @@ -13,8 +13,7 @@ This is gmond python module that allows SNMP polling of Fibrechannel switches to * Save the fibrechannel.pyconf into directory and update the switch(s) name & IP's * Save the fibrechannel.py into your ganglia python module dir eg: /usr/lib/ganglia/python_modules * Update SNMP community / ports if necessary - -If you're handling a large number of metrics, you may wish to set your sysctl settings as below: + * If FC metrics aren't appearing increase your net.core.rmem_max and default settings as below: net.core.rmem_max=104857600 net.core.rmem_default=104857600 From aa701025a9aaa597ab962a2ef4f0791e33dbac0f Mon Sep 17 00:00:00 2001 From: Evan Fraser Date: Thu, 16 Aug 2012 14:47:29 +1200 Subject: [PATCH 24/39] Update fibrechannel/README.mkdn --- fibrechannel/README.mkdn | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/fibrechannel/README.mkdn b/fibrechannel/README.mkdn index 1d137e71..e8314a06 100644 --- a/fibrechannel/README.mkdn +++ b/fibrechannel/README.mkdn @@ -16,9 +16,13 @@ This is gmond python module that allows SNMP polling of Fibrechannel switches to * If FC metrics aren't appearing increase your net.core.rmem_max and default settings as below: net.core.rmem_max=104857600 + net.core.rmem_default=104857600 + vm.dirty_ratio=100 + vm.dirty_background_ratio=100 + vm.dirty_expire_centisecs=720000 ## AUTHOR From edcce95c037839716895c38bfc932d6ec431279b Mon Sep 17 00:00:00 2001 From: Evan Fraser Date: Thu, 16 Aug 2012 14:48:16 +1200 Subject: [PATCH 25/39] Update fibrechannel/README.mkdn --- fibrechannel/README.mkdn | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fibrechannel/README.mkdn 
b/fibrechannel/README.mkdn index e8314a06..90bca5ab 100644 --- a/fibrechannel/README.mkdn +++ b/fibrechannel/README.mkdn @@ -13,7 +13,7 @@ This is gmond python module that allows SNMP polling of Fibrechannel switches to * Save the fibrechannel.pyconf into directory and update the switch(s) name & IP's * Save the fibrechannel.py into your ganglia python module dir eg: /usr/lib/ganglia/python_modules * Update SNMP community / ports if necessary - * If FC metrics aren't appearing increase your net.core.rmem_max and default settings as below: + * If FC metrics aren't appearing, increase your net.core.rmem_max and default settings as below: net.core.rmem_max=104857600 From 637613a49351430fcf21647fbe472e6f4fd47bcb Mon Sep 17 00:00:00 2001 From: Jeff Buchbinder Date: Tue, 28 Aug 2012 14:05:12 -0400 Subject: [PATCH 26/39] Add new elasticsearch report. --- elasticsearch/graph.d/es_report.json | 14 ++++++++++++++ 1 file changed, 14 insertions(+) create mode 100644 elasticsearch/graph.d/es_report.json diff --git a/elasticsearch/graph.d/es_report.json b/elasticsearch/graph.d/es_report.json new file mode 100644 index 00000000..a8cf0e6c --- /dev/null +++ b/elasticsearch/graph.d/es_report.json @@ -0,0 +1,14 @@ +{ + "report_name" : "es_report", + "report_type" : "standard", + "title" : "Elasticsearch", + "vertical_label" : "ms", + "series" : [ + { "metric": "es_fetch_time", "color": "BBBBBB", "label": "Fetch", "type": "line" }, + { "metric": "es_get_time", "color": "00FF00", "label": "Get", "line_width": "2", "type": "line" }, + { "metric": "es_flush_time", "color": "FF0000", "label": "Flush", "line_width": "2", "type": "line" }, + { "metric": "es_gc_time", "color": "2030F4", "label": "GC", "line_width": "2", "type": "line" }, + { "metric": "es_indexing_delete_time", "color": "FF30F4", "label": "Indexing Delete", "line_width": "2", "type": "line" }, + { "metric": "es_indexing_index_time", "color": "20FFF4", "label": "Indexing Index", "line_width": "2", "type": "line" } + ] +} From 36a3a283b943a57e0f74a6aac6f8c3f475220e50 Mon Sep 17 00:00:00 2001 From: Evan Fraser Date: Mon, 3 Sep 2012 12:18:29 +1200 Subject: [PATCH 27/39] Added recoverpoint protection window metrics --- recoverpoint/recoverpoint.py | 19 +++++++++++++++++++ recoverpoint/recoverpoint.pyconf | 3 +++ 2 files changed, 22 insertions(+) diff --git a/recoverpoint/recoverpoint.py b/recoverpoint/recoverpoint.py index 71630905..ab93f140 100755 --- a/recoverpoint/recoverpoint.py +++ b/recoverpoint/recoverpoint.py @@ -75,6 +75,13 @@ def define_metrics(Desc_Skel, statsDict): "description" : group + ' Journal Lag', "groups" : 'Lag', })) + #Protection window + descriptors.append(create_desc(Desc_Skel, { + "name" : group + '_Protection_Window', + "units" : 'mins', + "description" : group + ' Protection Window', + "groups" : 'Protection', + })) #CG Lag and WAN stats are in the Link stats section for repname in statsDict['Group'][group]['Link stats']: @@ -151,6 +158,18 @@ def get_metrics(name): elif 'GB' in unitstr: amount = amount * 1024 * 1024 * 1024 metrics[group + '_Journal_Lag'] = amount + #Protection Window is in Journal section + prowindowstr = rawmetrics['Group'][group]['Copy stats'][policyname]['Journal']['Protection window']['Current']['Value'] + protectmins = 0 + protimelist = prowindowstr.split(' ') + if 'hr' in protimelist: + hrindex = protimelist.index('hr') + protectmins = protectmins + (int(protimelist[int(hrindex) - 1]) * 60) + if 'min' in protimelist: + minindex = protimelist.index('min') + protectmins = protectmins + 
int(protimelist[int(minindex) -1]) + metrics[group + '_Protection_Window'] = float(protectmins) + #CG Lag and WAN stats are in the Link stats section for repname in rawmetrics['Group'][group]['Link stats']: #Get CG WAN metrics (remove 'Mbps' from end + convert to float and then bits) diff --git a/recoverpoint/recoverpoint.pyconf b/recoverpoint/recoverpoint.pyconf index 5d2c5c75..a63031a0 100644 --- a/recoverpoint/recoverpoint.pyconf +++ b/recoverpoint/recoverpoint.pyconf @@ -42,6 +42,9 @@ collection_group { metric { name_match = "(.+)Lag" } + metric { + name_match = "(.+)Window" + } } From ccab9dea33566a3bf6642222abb15592b95b2744 Mon Sep 17 00:00:00 2001 From: Evan Fraser Date: Mon, 3 Sep 2012 12:24:07 +1200 Subject: [PATCH 28/39] updated readme --- recoverpoint/README.mkdn | 1 + 1 file changed, 1 insertion(+) diff --git a/recoverpoint/README.mkdn b/recoverpoint/README.mkdn index 5c25b7b6..5202cd0f 100644 --- a/recoverpoint/README.mkdn +++ b/recoverpoint/README.mkdn @@ -6,6 +6,7 @@ This is a GMOND Python Module that gets metrics from EMC RecoverPoint replicatio Currently gathers: * Per RPA WAN/SAN traffic and Latency * Per Consistency Group Write, Data, Time and Journal Lags, as well as WAN and SAN traffic. + * Per Consistency Group Protection Window metrics. ## DEPENDS * python YAML From bcbf2f504aeb018a212875b1109e0e583c841630 Mon Sep 17 00:00:00 2001 From: Barnaby Gray Date: Sun, 9 Sep 2012 10:22:46 +0100 Subject: [PATCH 29/39] Change value_type on a few metrics to double to handle indexes > 4G in size. --- elasticsearch/python_modules/elasticsearch.py | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/elasticsearch/python_modules/elasticsearch.py b/elasticsearch/python_modules/elasticsearch.py index c51938d0..0540ef80 100755 --- a/elasticsearch/python_modules/elasticsearch.py +++ b/elasticsearch/python_modules/elasticsearch.py @@ -169,6 +169,7 @@ def metric_init(params): 'units' : 'Bytes', 'format' : '%.0f', 'description': 'Java Heap Committed (Bytes)', + 'value_type' : 'double', })) descriptors.append(create_desc({ @@ -176,6 +177,7 @@ def metric_init(params): 'units' : 'Bytes', 'format' : '%.0f', 'description': 'Java Heap Used (Bytes)', + 'value_type' : 'double', })) descriptors.append(create_desc({ @@ -183,6 +185,7 @@ def metric_init(params): 'units' : 'Bytes', 'format' : '%.0f', 'description': 'Java Non Heap Committed (Bytes)', + 'value_type' : 'double', })) descriptors.append(create_desc({ @@ -190,6 +193,7 @@ def metric_init(params): 'units' : 'Bytes', 'format' : '%.0f', 'description': 'Java Non Heap Used (Bytes)', + 'value_type' : 'double', })) descriptors.append(create_desc({ @@ -235,6 +239,7 @@ def metric_init(params): 'format' : '%.0f', 'slope' : 'positive', 'description': 'RX (Bytes)', + 'value_type' : 'double', })) descriptors.append(create_desc({ @@ -272,6 +277,7 @@ def metric_init(params): 'units' : 'Bytes', 'format' : '%.0f', 'description': 'Index Size (Bytes)', + 'value_type' : 'double', })) descriptors.append(create_desc({ @@ -320,6 +326,7 @@ def metric_init(params): 'format' : '%.0f', 'slope' : 'positive', 'description': 'Merges size (total)', + 'value_type' : 'double', })) descriptors.append(create_desc({ @@ -351,6 +358,7 @@ def metric_init(params): 'units' : 'docs', 'format' : '%.0f', 'description': 'Number of Documents', + 'value_type' : 'double', })) descriptors.append(create_desc({ @@ -358,6 +366,7 @@ def metric_init(params): 'units' : 'docs', 'format' : '%.0f', 'description': 'Number of Documents Deleted', + 'value_type' : 'double', })) 
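+    #(Note: gmond's 'uint' value_type is a 32-bit unsigned counter, so byte
+    #and document counts past 2^32 (~4G, the case this commit targets) would
+    #wrap; 'double' trades exact integers for range and stays exact to 2^53.)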
descriptors.append(create_desc({ @@ -380,6 +389,7 @@ def metric_init(params): 'units' : 'Bytes', 'format' : '%.0f', 'description': 'Field Cache Size', + 'value_type' : 'double', })) descriptors.append(create_desc({ @@ -400,6 +410,7 @@ def metric_init(params): 'units' : 'Bytes', 'format' : '%.0f', 'description': 'Filter Cache Size', + 'value_type' : 'double', })) descriptors.append(create_desc({ @@ -511,6 +522,7 @@ def metric_init(params): 'name' : 'es_indexing_delete_total', 'units' : 'docs', 'format' : '%d', + 'slope' : 'positive', 'description': 'Delete Total', })) @@ -526,6 +538,7 @@ def metric_init(params): 'name' : 'es_indexing_index_total', 'units' : 'docs', 'format' : '%d', + 'slope' : 'positive', 'description': 'Indexing Documents Total', })) From 791562a2296509adbf2e9bcb7836006bb820c049 Mon Sep 17 00:00:00 2001 From: Mathias Fussenegger Date: Sun, 16 Sep 2012 06:25:26 +0200 Subject: [PATCH 30/39] reformat accordig to pep8 --- elasticsearch/python_modules/elasticsearch.py | 945 ++++++++++-------- 1 file changed, 531 insertions(+), 414 deletions(-) diff --git a/elasticsearch/python_modules/elasticsearch.py b/elasticsearch/python_modules/elasticsearch.py index 0540ef80..77fe293e 100755 --- a/elasticsearch/python_modules/elasticsearch.py +++ b/elasticsearch/python_modules/elasticsearch.py @@ -1,26 +1,30 @@ #! /usr/bin/python -try: import simplejson as json -except ImportError: import json +try: + import simplejson as json + assert json # silence pyflakes +except ImportError: + import json import time import urllib global url, last_update, keyToPath -def dig_it_up(obj,path): + +def dig_it_up(obj, path): try: - if type(path) in (str,unicode): + if type(path) in (str, unicode): path = path.split('.') - return reduce(lambda x,y:x[y],path,obj) + return reduce(lambda x, y: x[y], path, obj) except: return False # Set IP address and JSON Url -url="http://localhost:9200/_cluster/nodes/_local/stats?all=true" +url = "http://localhost:9200/_cluster/nodes/_local/stats?all=true" # short name to full path for stats -keyToPath=dict() +keyToPath = dict() # Initial time modification stamp - Used to determine # when JSON is updated @@ -32,8 +36,10 @@ def dig_it_up(obj,path): keyToPath['es_cache_field_eviction'] = "nodes.%s.indices.cache.field_evictions" keyToPath['es_cache_field_size'] = "nodes.%s.indices.cache.field_size_in_bytes" keyToPath['es_cache_filter_count'] = "nodes.%s.indices.cache.filter_count" -keyToPath['es_cache_filter_evictions'] = "nodes.%s.indices.cache.filter_evictions" -keyToPath['es_cache_filter_size'] = "nodes.%s.indices.cache.filter_size_in_bytes" +keyToPath[ + 'es_cache_filter_evictions'] = "nodes.%s.indices.cache.filter_evictions" +keyToPath[ + 'es_cache_filter_size'] = "nodes.%s.indices.cache.filter_size_in_bytes" ## DOCS keyToPath['es_docs_count'] = "nodes.%s.indices.docs.count" @@ -48,12 +54,14 @@ def dig_it_up(obj,path): keyToPath['es_get_exists_total'] = "nodes.%s.indices.get.exists_total" keyToPath['es_get_time'] = "nodes.%s.indices.get.time_in_millis" keyToPath['es_get_total'] = "nodes.%s.indices.get.total" -keyToPath['es_get_missing_time'] = "nodes.%s.indices.get.missing_time_in_millis" +keyToPath[ + 'es_get_missing_time'] = "nodes.%s.indices.get.missing_time_in_millis" keyToPath['es_get_missing_total'] = "nodes.%s.indices.get.missing_total" ## INDEXING keyToPath['es_indexing_delete_time'] = "nodes.%s.indices.indexing.delete_time_in_millis" -keyToPath['es_indexing_delete_total'] = "nodes.%s.indices.indexing.delete_total" +keyToPath[ + 'es_indexing_delete_total'] = 
"nodes.%s.indices.indexing.delete_total" keyToPath['es_indexing_index_time'] = "nodes.%s.indices.indexing.index_time_in_millis" keyToPath['es_indexing_index_total'] = "nodes.%s.indices.indexing.index_total" @@ -63,7 +71,8 @@ def dig_it_up(obj,path): keyToPath['es_merges_current_size'] = "nodes.%s.indices.merges.current_size_in_bytes" keyToPath['es_merges_total'] = "nodes.%s.indices.merges.total" keyToPath['es_merges_total_docs'] = "nodes.%s.indices.merges.total_docs" -keyToPath['es_merges_total_size'] = "nodes.%s.indices.merges.total_size_in_bytes" +keyToPath[ + 'es_merges_total_size'] = "nodes.%s.indices.merges.total_size_in_bytes" keyToPath['es_merges_time'] = "nodes.%s.indices.merges.total_time_in_millis" ## REFRESH @@ -85,7 +94,8 @@ def dig_it_up(obj,path): ## MEM keyToPath['es_heap_committed'] = "nodes.%s.jvm.mem.heap_committed_in_bytes" keyToPath['es_heap_used'] = "nodes.%s.jvm.mem.heap_used_in_bytes" -keyToPath['es_non_heap_committed'] = "nodes.%s.jvm.mem.non_heap_committed_in_bytes" +keyToPath[ + 'es_non_heap_committed'] = "nodes.%s.jvm.mem.non_heap_committed_in_bytes" keyToPath['es_non_heap_used'] = "nodes.%s.jvm.mem.non_heap_used_in_bytes" ## THREADS @@ -108,7 +118,9 @@ def dig_it_up(obj,path): keyToPath['es_http_total_open'] = "nodes.%s.http.total_opened" # PROCESS METRICS # -keyToPath['es_open_file_descriptors'] = "nodes.%s.process.open_file_descriptors" +keyToPath[ + 'es_open_file_descriptors'] = "nodes.%s.process.open_file_descriptors" + def getStat(name): global last_update, result, url @@ -122,21 +134,23 @@ def getStat(name): last_update = now node = result['nodes'].keys()[0] - val = dig_it_up(result, keyToPath[name] % node ) + val = dig_it_up(result, keyToPath[name] % node) # Check to make sure we have a valid result # JsonPath returns False if no match found - if not isinstance(val,bool): + if not isinstance(val, bool): return int(val) else: return None + def create_desc(prop): d = Desc_Skel.copy() - for k,v in prop.iteritems(): + for k, v in prop.iteritems(): d[k] = v return d + def metric_init(params): global result, url, descriptors, Desc_Skel @@ -153,406 +167,510 @@ def metric_init(params): params["metric_group"] = "elasticsearch" Desc_Skel = { - 'name' : 'XXX', - 'call_back' : getStat, - 'time_max' : 60, - 'value_type' : 'uint', - 'units' : 'units', - 'slope' : 'both', - 'format' : '%d', - 'description' : 'XXX', - 'groups' : params["metric_group"], + 'name': 'XXX', + 'call_back': getStat, + 'time_max': 60, + 'value_type': 'uint', + 'units': 'units', + 'slope': 'both', + 'format': '%d', + 'description': 'XXX', + 'groups': params["metric_group"], } - descriptors.append(create_desc({ - 'name' : 'es_heap_committed', - 'units' : 'Bytes', - 'format' : '%.0f', - 'description': 'Java Heap Committed (Bytes)', - 'value_type' : 'double', - })) - - descriptors.append(create_desc({ - 'name' : 'es_heap_used', - 'units' : 'Bytes', - 'format' : '%.0f', - 'description': 'Java Heap Used (Bytes)', - 'value_type' : 'double', - })) - - descriptors.append(create_desc({ - 'name' : 'es_non_heap_committed', - 'units' : 'Bytes', - 'format' : '%.0f', - 'description': 'Java Non Heap Committed (Bytes)', - 'value_type' : 'double', - })) - - descriptors.append(create_desc({ - 'name' : 'es_non_heap_used', - 'units' : 'Bytes', - 'format' : '%.0f', - 'description': 'Java Non Heap Used (Bytes)', - 'value_type' : 'double', - })) - - descriptors.append(create_desc({ - 'name' : 'es_threads', - 'units' : 'threads', - 'format' : '%d', - 'description': 'Threads (open)', - })) - - 
descriptors.append(create_desc({ - 'name' : 'es_threads_peak', - 'units' : 'threads', - 'format' : '%d', - 'description': 'Threads Peak (open)', - })) - - descriptors.append(create_desc({ - 'name' : 'es_gc_time', - 'units' : 'ms', - 'format' : '%d', - 'slope' : 'positive', - 'description': 'Java GC Time (ms)', - })) - - descriptors.append(create_desc({ - 'name' : 'es_transport_open', - 'units' : 'sockets', - 'format' : '%d', - 'description': 'Transport Open (sockets)', - })) - - descriptors.append(create_desc({ - 'name' : 'es_transport_rx_count', - 'units' : 'rx', - 'format' : '%d', - 'slope' : 'positive', - 'description': 'RX Count', - })) - - descriptors.append(create_desc({ - 'name' : 'es_transport_rx_size', - 'units' : 'Bytes', - 'format' : '%.0f', - 'slope' : 'positive', - 'description': 'RX (Bytes)', - 'value_type' : 'double', - })) - - descriptors.append(create_desc({ - 'name' : 'es_transport_tx_count', - 'units' : 'tx', - 'format' : '%d', - 'slope' : 'positive', - 'description': 'TX Count', - })) - - descriptors.append(create_desc({ - 'name' : 'es_transport_tx_size', - 'units' : 'Bytes', - 'format' : '%.0f', - 'slope' : 'positive', - 'description': 'TX (Bytes)', - })) - - descriptors.append(create_desc({ - 'name' : 'es_http_current_open', - 'units' : 'sockets', - 'format' : '%d', - 'description': 'HTTP Open (sockets)', - })) - - descriptors.append(create_desc({ - 'name' : 'es_http_total_open', - 'units' : 'sockets', - 'format' : '%d', - 'description': 'HTTP Open (sockets)', - })) - - descriptors.append(create_desc({ - 'name' : 'es_indices_size', - 'units' : 'Bytes', - 'format' : '%.0f', - 'description': 'Index Size (Bytes)', - 'value_type' : 'double', - })) - - descriptors.append(create_desc({ - 'name' : 'es_gc_count', - 'format' : '%d', - 'slope' : 'positive', - 'description': 'Java GC Count', - })) - - descriptors.append(create_desc({ - 'name' : 'es_merges_current', - 'format' : '%d', - 'description': 'Merges (current)', - })) - - descriptors.append(create_desc({ - 'name' : 'es_merges_current_docs', - 'format' : '%d', - 'description': 'Merges (docs)', - })) - - descriptors.append(create_desc({ - 'name' : 'es_merges_total', - 'format' : '%d', - 'slope' : 'positive', - 'description': 'Merges (total)', - })) - - descriptors.append(create_desc({ - 'name' : 'es_merges_total_docs', - 'format' : '%d', - 'slope' : 'positive', - 'description': 'Merges (total docs)', - })) - - descriptors.append(create_desc({ - 'name' : 'es_merges_current_size', - 'units' : 'Bytes', - 'format' : '%.0f', - 'description': 'Merges size (current)', - })) - - descriptors.append(create_desc({ - 'name' : 'es_merges_total_size', - 'units' : 'Bytes', - 'format' : '%.0f', - 'slope' : 'positive', - 'description': 'Merges size (total)', - 'value_type' : 'double', - })) - - descriptors.append(create_desc({ - 'name' : 'es_merges_time', - 'units' : 'ms', - 'format' : '%d', - 'slope' : 'positive', - 'description': 'Merges Time (ms)', - })) - - descriptors.append(create_desc({ - 'name' : 'es_refresh_total', - 'units' : 'refreshes', - 'format' : '%d', - 'slope' : 'positive', - 'description': 'Total Refresh', - })) - - descriptors.append(create_desc({ - 'name' : 'es_refresh_time', - 'units' : 'ms', - 'format' : '%d', - 'slope' : 'positive', - 'description': 'Total Refresh Time', - })) - - descriptors.append(create_desc({ - 'name' : 'es_docs_count', - 'units' : 'docs', - 'format' : '%.0f', - 'description': 'Number of Documents', - 'value_type' : 'double', - })) - - descriptors.append(create_desc({ - 'name' : 
'es_docs_deleted', - 'units' : 'docs', - 'format' : '%.0f', - 'description': 'Number of Documents Deleted', - 'value_type' : 'double', - })) - - descriptors.append(create_desc({ - 'name' : 'es_open_file_descriptors', - 'units' : 'files', - 'format' : '%d', - 'description': 'Open File Descriptors', - })) - - descriptors.append(create_desc({ - 'name' : 'es_cache_field_eviction', - 'units' : 'units', - 'format' : '%d', - 'slope' : 'positive', - 'description': 'Field Cache Evictions', - })) - - descriptors.append(create_desc({ - 'name' : 'es_cache_field_size', - 'units' : 'Bytes', - 'format' : '%.0f', - 'description': 'Field Cache Size', - 'value_type' : 'double', - })) - - descriptors.append(create_desc({ - 'name' : 'es_cache_filter_count', - 'format' : '%d', - 'description': 'Filter Cache Count', - })) - - descriptors.append(create_desc({ - 'name' : 'es_cache_filter_evictions', - 'format' : '%d', - 'slope' : 'positive', - 'description': 'Filter Cache Evictions', - })) - - descriptors.append(create_desc({ - 'name' : 'es_cache_filter_size', - 'units' : 'Bytes', - 'format' : '%.0f', - 'description': 'Filter Cache Size', - 'value_type' : 'double', - })) - - descriptors.append(create_desc({ - 'name' : 'es_query_current', - 'units' : 'Queries', - 'format' : '%d', - 'description': 'Current Queries', - })) - - descriptors.append(create_desc({ - 'name' : 'es_query_time', - 'units' : 'ms', - 'format' : '%d', - 'slope' : 'positive', - 'description': 'Total Query Time', - })) - - descriptors.append(create_desc({ - 'name' : 'es_fetch_current', - 'units' : 'fetches', - 'format' : '%d', - 'description': 'Current Fetches', - })) - - descriptors.append(create_desc({ - 'name' : 'es_fetch_total', - 'units' : 'fetches', - 'format' : '%d', - 'slope' : 'positive', - 'description': 'Total Fetches', - })) - - descriptors.append(create_desc({ - 'name' : 'es_fetch_time', - 'units' : 'ms', - 'format' : '%d', - 'slope' : 'positive', - 'description': 'Total Fetch Time', - })) - - descriptors.append(create_desc({ - 'name' : 'es_flush_total', - 'units' : 'flushes', - 'format' : '%d', - 'description': 'Total Flushes', - })) - - descriptors.append(create_desc({ - 'name' : 'es_flush_time', - 'units' : 'ms', - 'format' : '%d', - 'slope' : 'positive', - 'description': 'Total Flush Time', - })) - - descriptors.append(create_desc({ - 'name' : 'es_get_exists_time', - 'units' : 'ms', - 'format' : '%d', - 'slope' : 'positive', - 'description': 'Exists Time', - })) - - descriptors.append(create_desc({ - 'name' : 'es_get_exists_total', - 'units' : 'total', - 'format' : '%d', - 'description': 'Exists Total', - })) - - descriptors.append(create_desc({ - 'name' : 'es_get_time', - 'units' : 'ms', - 'format' : '%d', - 'slope' : 'positive', - 'description': 'Get Time', - })) - - descriptors.append(create_desc({ - 'name' : 'es_get_total', - 'units' : 'total', - 'format' : '%d', - 'description': 'Get Total', - })) - descriptors.append(create_desc({ - 'name' : 'es_get_missing_time', - 'units' : 'ms', - 'format' : '%d', - 'slope' : 'positive', - 'description': 'Missing Time', - })) - - descriptors.append(create_desc({ - 'name' : 'es_get_missing_total', - 'units' : 'total', - 'format' : '%d', - 'description': 'Missing Total', - })) - - descriptors.append(create_desc({ - 'name' : 'es_indexing_delete_time', - 'units' : 'ms', - 'format' : '%d', - 'slope' : 'positive', - 'description': 'Delete Time', - })) - - descriptors.append(create_desc({ - 'name' : 'es_indexing_delete_total', - 'units' : 'docs', - 'format' : '%d', - 'slope' : 'positive', - 
'description': 'Delete Total', - })) - - descriptors.append(create_desc({ - 'name' : 'es_indexing_index_time', - 'units' : 'ms', - 'format' : '%d', - 'slope' : 'positive', - 'description': 'Indexing Time', - })) - - descriptors.append(create_desc({ - 'name' : 'es_indexing_index_total', - 'units' : 'docs', - 'format' : '%d', - 'slope' : 'positive', - 'description': 'Indexing Documents Total', - })) - - descriptors.append(create_desc({ - 'name' : 'es_query_total', - 'units' : 'Queries', - 'format' : '%d', - 'slope' : 'positive', - 'description': 'Total Queries', - })) + descriptors.append( + create_desc({ + 'name': 'es_heap_committed', + 'units': 'Bytes', + 'format': '%.0f', + 'description': 'Java Heap Committed (Bytes)', + 'value_type': 'double' + }) + ) + + descriptors.append( + create_desc({ + 'name': 'es_heap_used', + 'units': 'Bytes', + 'format': '%.0f', + 'description': 'Java Heap Used (Bytes)', + 'value_type': 'double' + }) + ) + + descriptors.append( + create_desc({ + 'name': 'es_non_heap_committed', + 'units': 'Bytes', + 'format': '%.0f', + 'description': 'Java Non Heap Committed (Bytes)', + 'value_type': 'double' + }) + ) + + descriptors.append( + create_desc({ + 'name': 'es_non_heap_used', + 'units': 'Bytes', + 'format': '%.0f', + 'description': 'Java Non Heap Used (Bytes)', + 'value_type': 'double' + }) + ) + + descriptors.append( + create_desc({ + 'name': 'es_threads', + 'units': 'threads', + 'format': '%d', + 'description': 'Threads (open)', + }) + ) + + descriptors.append( + create_desc({ + 'name': 'es_threads_peak', + 'units': 'threads', + 'format': '%d', + 'description': 'Threads Peak (open)', + }) + ) + + descriptors.append( + create_desc({ + 'name': 'es_gc_time', + 'units': 'ms', + 'format': '%d', + 'slope': 'positive', + 'description': 'Java GC Time (ms)' + }) + ) + + descriptors.append( + create_desc({ + 'name': 'es_transport_open', + 'units': 'sockets', + 'format': '%d', + 'description': 'Transport Open (sockets)', + }) + ) + + descriptors.append( + create_desc({ + 'name': 'es_transport_rx_count', + 'units': 'rx', + 'format': '%d', + 'slope': 'positive', + 'description': 'RX Count' + }) + ) + + descriptors.append( + create_desc({ + 'name': 'es_transport_rx_size', + 'units': 'Bytes', + 'format': '%.0f', + 'slope': 'positive', + 'description': 'RX (Bytes)', + 'value_type': 'double', + }) + ) + + descriptors.append( + create_desc({ + 'name': 'es_transport_tx_count', + 'units': 'tx', + 'format': '%d', + 'slope': 'positive', + 'description': 'TX Count' + }) + ) + + descriptors.append( + create_desc({ + 'name': 'es_transport_tx_size', + 'units': 'Bytes', + 'format': '%.0f', + 'slope': 'positive', + 'description': 'TX (Bytes)', + }) + ) + + descriptors.append( + create_desc({ + 'name': 'es_http_current_open', + 'units': 'sockets', + 'format': '%d', + 'description': 'HTTP Open (sockets)', + }) + ) + + descriptors.append( + create_desc({ + 'name': 'es_http_total_open', + 'units': 'sockets', + 'format': '%d', + 'description': 'HTTP Open (sockets)', + }) + ) + + descriptors.append( + create_desc({ + 'name': 'es_indices_size', + 'units': 'Bytes', + 'format': '%.0f', + 'description': 'Index Size (Bytes)', + 'value_type': 'double', + }) + ) + + descriptors.append( + create_desc({ + 'name': 'es_gc_count', + 'format': '%d', + 'slope': 'positive', + 'description': 'Java GC Count', + }) + ) + + descriptors.append( + create_desc({ + 'name': 'es_merges_current', + 'format': '%d', + 'description': 'Merges (current)', + }) + ) + + descriptors.append( + create_desc({ + 'name': 
'es_merges_current_docs', + 'format': '%d', + 'description': 'Merges (docs)', + }) + ) + + descriptors.append( + create_desc({ + 'name': 'es_merges_total', + 'format': '%d', + 'slope': 'positive', + 'description': 'Merges (total)', + }) + ) + + descriptors.append( + create_desc({ + 'name': 'es_merges_total_docs', + 'format': '%d', + 'slope': 'positive', + 'description': 'Merges (total docs)', + }) + ) + + descriptors.append( + create_desc({ + 'name': 'es_merges_current_size', + 'units': 'Bytes', + 'format': '%.0f', + 'description': 'Merges size (current)', + }) + ) + + descriptors.append( + create_desc({ + 'name': 'es_merges_total_size', + 'units': 'Bytes', + 'format': '%.0f', + 'slope': 'positive', + 'description': 'Merges size (total)', + 'value_type': 'double', + }) + ) + + descriptors.append( + create_desc({ + 'name': 'es_merges_time', + 'units': 'ms', + 'format': '%d', + 'slope': 'positive', + 'description': 'Merges Time (ms)' + }) + ) + + descriptors.append( + create_desc({ + 'name': 'es_refresh_total', + 'units': 'refreshes', + 'format': '%d', + 'slope': 'positive', + 'description': 'Total Refresh' + }) + ) + + descriptors.append( + create_desc({ + 'name': 'es_refresh_time', + 'units': 'ms', + 'format': '%d', + 'slope': 'positive', + 'description': 'Total Refresh Time' + }) + ) + + descriptors.append( + create_desc({ + 'name': 'es_docs_count', + 'units': 'docs', + 'format': '%.0f', + 'description': 'Number of Documents', + 'value_type': 'double' + }) + ) + + descriptors.append( + create_desc({ + 'name': 'es_docs_deleted', + 'units': 'docs', + 'format': '%.0f', + 'description': 'Number of Documents Deleted', + 'value_type': 'double' + }) + ) + + descriptors.append( + create_desc({ + 'name': 'es_open_file_descriptors', + 'units': 'files', + 'format': '%d', + 'description': 'Open File Descriptors', + }) + ) + + descriptors.append( + create_desc({ + 'name': 'es_cache_field_eviction', + 'units': 'units', + 'format': '%d', + 'slope': 'positive', + 'description': 'Field Cache Evictions', + }) + ) + + descriptors.append( + create_desc({ + 'name': 'es_cache_field_size', + 'units': 'Bytes', + 'format': '%.0f', + 'description': 'Field Cache Size', + 'value_type': 'double', + }) + ) + + descriptors.append( + create_desc({ + 'name': 'es_cache_filter_count', + 'format': '%d', + 'description': 'Filter Cache Count', + }) + ) + + descriptors.append( + create_desc({ + 'name': 'es_cache_filter_evictions', + 'format': '%d', + 'slope': 'positive', + 'description': 'Filter Cache Evictions', + }) + ) + + descriptors.append( + create_desc({ + 'name': 'es_cache_filter_size', + 'units': 'Bytes', + 'format': '%.0f', + 'description': 'Filter Cache Size', + 'value_type': 'double' + }) + ) + + descriptors.append( + create_desc({ + 'name': 'es_query_current', + 'units': 'Queries', + 'format': '%d', + 'description': 'Current Queries', + }) + ) + + descriptors.append( + create_desc({ + 'name': 'es_query_time', + 'units': 'ms', + 'format': '%d', + 'slope': 'positive', + 'description': 'Total Query Time' + }) + ) + + descriptors.append( + create_desc({ + 'name': 'es_fetch_current', + 'units': 'fetches', + 'format': '%d', + 'description': 'Current Fetches', + }) + ) + + descriptors.append( + create_desc({ + 'name': 'es_fetch_total', + 'units': 'fetches', + 'format': '%d', + 'slope': 'positive', + 'description': 'Total Fetches' + }) + ) + + descriptors.append( + create_desc({ + 'name': 'es_fetch_time', + 'units': 'ms', + 'format': '%d', + 'slope': 'positive', + 'description': 'Total Fetch Time' + }) + ) + + 
descriptors.append( + create_desc({ + 'name': 'es_flush_total', + 'units': 'flushes', + 'format': '%d', + 'description': 'Total Flushes', + }) + ) + + descriptors.append( + create_desc({ + 'name': 'es_flush_time', + 'units': 'ms', + 'format': '%d', + 'slope': 'positive', + 'description': 'Total Flush Time' + }) + ) + + descriptors.append( + create_desc({ + 'name': 'es_get_exists_time', + 'units': 'ms', + 'format': '%d', + 'slope': 'positive', + 'description': 'Exists Time' + }) + ) + + descriptors.append( + create_desc({ + 'name': 'es_get_exists_total', + 'units': 'total', + 'format': '%d', + 'description': 'Exists Total', + }) + ) + + descriptors.append( + create_desc({ + 'name': 'es_get_time', + 'units': 'ms', + 'format': '%d', + 'slope': 'positive', + 'description': 'Get Time' + }) + ) + + descriptors.append( + create_desc({ + 'name': 'es_get_total', + 'units': 'total', + 'format': '%d', + 'description': 'Get Total', + }) + ) + + descriptors.append( + create_desc({ + 'name': 'es_get_missing_time', + 'units': 'ms', + 'format': '%d', + 'slope': 'positive', + 'description': 'Missing Time' + }) + ) + + descriptors.append( + create_desc({ + 'name': 'es_get_missing_total', + 'units': 'total', + 'format': '%d', + 'description': 'Missing Total', + }) + ) + + descriptors.append( + create_desc({ + 'name': 'es_indexing_delete_time', + 'units': 'ms', + 'format': '%d', + 'slope': 'positive', + 'description': 'Delete Time' + }) + ) + + descriptors.append( + create_desc({ + 'name': 'es_indexing_delete_total', + 'units': 'docs', + 'format': '%d', + 'slope': 'positive', + 'description': 'Delete Total' + }) + ) + + descriptors.append( + create_desc({ + 'name': 'es_indexing_index_time', + 'units': 'ms', + 'format': '%d', + 'slope': 'positive', + 'description': 'Indexing Time' + }) + ) + + descriptors.append( + create_desc({ + 'name': 'es_indexing_index_total', + 'units': 'docs', + 'format': '%d', + 'slope': 'positive', + 'description': 'Indexing Documents Total' + }) + ) + + descriptors.append( + create_desc({ + 'name': 'es_query_total', + 'units': 'Queries', + 'format': '%d', + 'slope': 'positive', + 'description': 'Total Queries' + }) + ) return descriptors - + + def metric_cleanup(): - pass + pass #This code is for debugging and unit testing if __name__ == '__main__': @@ -560,4 +678,3 @@ def metric_cleanup(): for d in descriptors: v = d['call_back'](d['name']) print 'value for %s is %s' % (d['name'], str(v)) - From 2f29c553f98a252c0a070e9f3f750cf7dab06b8f Mon Sep 17 00:00:00 2001 From: Mathias Fussenegger Date: Sun, 16 Sep 2012 06:45:43 +0200 Subject: [PATCH 31/39] reduce the use of globals --- elasticsearch/python_modules/elasticsearch.py | 151 +++++++++--------- 1 file changed, 75 insertions(+), 76 deletions(-) diff --git a/elasticsearch/python_modules/elasticsearch.py b/elasticsearch/python_modules/elasticsearch.py index 77fe293e..e96a0a95 100755 --- a/elasticsearch/python_modules/elasticsearch.py +++ b/elasticsearch/python_modules/elasticsearch.py @@ -8,20 +8,8 @@ import time import urllib +from functools import partial -global url, last_update, keyToPath - - -def dig_it_up(obj, path): - try: - if type(path) in (str, unicode): - path = path.split('.') - return reduce(lambda x, y: x[y], path, obj) - except: - return False - -# Set IP address and JSON Url -url = "http://localhost:9200/_cluster/nodes/_local/stats?all=true" # short name to full path for stats keyToPath = dict() @@ -122,8 +110,17 @@ def dig_it_up(obj, path): 'es_open_file_descriptors'] = "nodes.%s.process.open_file_descriptors" -def 
getStat(name): - global last_update, result, url +def dig_it_up(obj, path): + try: + if type(path) in (str, unicode): + path = path.split('.') + return reduce(lambda x, y: x[y], path, obj) + except: + return False + + +def getStat(result, url, name): + global last_update # If time delta is > 20 seconds, then update the JSON results now = time.time() @@ -144,42 +141,44 @@ def getStat(name): return None -def create_desc(prop): - d = Desc_Skel.copy() +def create_desc(skel, prop): + d = skel.copy() for k, v in prop.iteritems(): d[k] = v return d def metric_init(params): - global result, url, descriptors, Desc_Skel + descriptors = [] print '[elasticsearch] Received the following parameters' print params + host = params.get('host', 'http://localhost:9200/') + url = '{0}_cluster/nodes/_local/stats?all=true'.format(host) + # First iteration - Grab statistics print '[elasticsearch] Fetching ' + url result = json.load(urllib.urlopen(url)) - descriptors = [] - - if "metric_group" not in params: - params["metric_group"] = "elasticsearch" + metric_group = params.get('metric_group', 'elasticsearch') Desc_Skel = { 'name': 'XXX', - 'call_back': getStat, + 'call_back': partial(getStat, result, url), 'time_max': 60, 'value_type': 'uint', 'units': 'units', 'slope': 'both', 'format': '%d', 'description': 'XXX', - 'groups': params["metric_group"], + 'groups': metric_group, } + _create_desc = partial(create_desc, Desc_Skel) + descriptors.append( - create_desc({ + _create_desc({ 'name': 'es_heap_committed', 'units': 'Bytes', 'format': '%.0f', @@ -189,7 +188,7 @@ def metric_init(params): ) descriptors.append( - create_desc({ + _create_desc({ 'name': 'es_heap_used', 'units': 'Bytes', 'format': '%.0f', @@ -199,7 +198,7 @@ def metric_init(params): ) descriptors.append( - create_desc({ + _create_desc({ 'name': 'es_non_heap_committed', 'units': 'Bytes', 'format': '%.0f', @@ -209,7 +208,7 @@ def metric_init(params): ) descriptors.append( - create_desc({ + _create_desc({ 'name': 'es_non_heap_used', 'units': 'Bytes', 'format': '%.0f', @@ -219,7 +218,7 @@ def metric_init(params): ) descriptors.append( - create_desc({ + _create_desc({ 'name': 'es_threads', 'units': 'threads', 'format': '%d', @@ -228,7 +227,7 @@ def metric_init(params): ) descriptors.append( - create_desc({ + _create_desc({ 'name': 'es_threads_peak', 'units': 'threads', 'format': '%d', @@ -237,7 +236,7 @@ def metric_init(params): ) descriptors.append( - create_desc({ + _create_desc({ 'name': 'es_gc_time', 'units': 'ms', 'format': '%d', @@ -247,7 +246,7 @@ def metric_init(params): ) descriptors.append( - create_desc({ + _create_desc({ 'name': 'es_transport_open', 'units': 'sockets', 'format': '%d', @@ -256,7 +255,7 @@ def metric_init(params): ) descriptors.append( - create_desc({ + _create_desc({ 'name': 'es_transport_rx_count', 'units': 'rx', 'format': '%d', @@ -266,7 +265,7 @@ def metric_init(params): ) descriptors.append( - create_desc({ + _create_desc({ 'name': 'es_transport_rx_size', 'units': 'Bytes', 'format': '%.0f', @@ -277,7 +276,7 @@ def metric_init(params): ) descriptors.append( - create_desc({ + _create_desc({ 'name': 'es_transport_tx_count', 'units': 'tx', 'format': '%d', @@ -287,7 +286,7 @@ def metric_init(params): ) descriptors.append( - create_desc({ + _create_desc({ 'name': 'es_transport_tx_size', 'units': 'Bytes', 'format': '%.0f', @@ -297,7 +296,7 @@ def metric_init(params): ) descriptors.append( - create_desc({ + _create_desc({ 'name': 'es_http_current_open', 'units': 'sockets', 'format': '%d', @@ -306,7 +305,7 @@ def metric_init(params): 
) descriptors.append( - create_desc({ + _create_desc({ 'name': 'es_http_total_open', 'units': 'sockets', 'format': '%d', @@ -315,7 +314,7 @@ def metric_init(params): ) descriptors.append( - create_desc({ + _create_desc({ 'name': 'es_indices_size', 'units': 'Bytes', 'format': '%.0f', @@ -325,7 +324,7 @@ def metric_init(params): ) descriptors.append( - create_desc({ + _create_desc({ 'name': 'es_gc_count', 'format': '%d', 'slope': 'positive', @@ -334,7 +333,7 @@ def metric_init(params): ) descriptors.append( - create_desc({ + _create_desc({ 'name': 'es_merges_current', 'format': '%d', 'description': 'Merges (current)', @@ -342,7 +341,7 @@ def metric_init(params): ) descriptors.append( - create_desc({ + _create_desc({ 'name': 'es_merges_current_docs', 'format': '%d', 'description': 'Merges (docs)', @@ -350,7 +349,7 @@ def metric_init(params): ) descriptors.append( - create_desc({ + _create_desc({ 'name': 'es_merges_total', 'format': '%d', 'slope': 'positive', @@ -359,7 +358,7 @@ def metric_init(params): ) descriptors.append( - create_desc({ + _create_desc({ 'name': 'es_merges_total_docs', 'format': '%d', 'slope': 'positive', @@ -368,7 +367,7 @@ def metric_init(params): ) descriptors.append( - create_desc({ + _create_desc({ 'name': 'es_merges_current_size', 'units': 'Bytes', 'format': '%.0f', @@ -377,7 +376,7 @@ def metric_init(params): ) descriptors.append( - create_desc({ + _create_desc({ 'name': 'es_merges_total_size', 'units': 'Bytes', 'format': '%.0f', @@ -388,7 +387,7 @@ def metric_init(params): ) descriptors.append( - create_desc({ + _create_desc({ 'name': 'es_merges_time', 'units': 'ms', 'format': '%d', @@ -398,7 +397,7 @@ def metric_init(params): ) descriptors.append( - create_desc({ + _create_desc({ 'name': 'es_refresh_total', 'units': 'refreshes', 'format': '%d', @@ -408,7 +407,7 @@ def metric_init(params): ) descriptors.append( - create_desc({ + _create_desc({ 'name': 'es_refresh_time', 'units': 'ms', 'format': '%d', @@ -418,7 +417,7 @@ def metric_init(params): ) descriptors.append( - create_desc({ + _create_desc({ 'name': 'es_docs_count', 'units': 'docs', 'format': '%.0f', @@ -428,7 +427,7 @@ def metric_init(params): ) descriptors.append( - create_desc({ + _create_desc({ 'name': 'es_docs_deleted', 'units': 'docs', 'format': '%.0f', @@ -438,7 +437,7 @@ def metric_init(params): ) descriptors.append( - create_desc({ + _create_desc({ 'name': 'es_open_file_descriptors', 'units': 'files', 'format': '%d', @@ -447,7 +446,7 @@ def metric_init(params): ) descriptors.append( - create_desc({ + _create_desc({ 'name': 'es_cache_field_eviction', 'units': 'units', 'format': '%d', @@ -457,7 +456,7 @@ def metric_init(params): ) descriptors.append( - create_desc({ + _create_desc({ 'name': 'es_cache_field_size', 'units': 'Bytes', 'format': '%.0f', @@ -467,7 +466,7 @@ def metric_init(params): ) descriptors.append( - create_desc({ + _create_desc({ 'name': 'es_cache_filter_count', 'format': '%d', 'description': 'Filter Cache Count', @@ -475,7 +474,7 @@ def metric_init(params): ) descriptors.append( - create_desc({ + _create_desc({ 'name': 'es_cache_filter_evictions', 'format': '%d', 'slope': 'positive', @@ -484,7 +483,7 @@ def metric_init(params): ) descriptors.append( - create_desc({ + _create_desc({ 'name': 'es_cache_filter_size', 'units': 'Bytes', 'format': '%.0f', @@ -494,7 +493,7 @@ def metric_init(params): ) descriptors.append( - create_desc({ + _create_desc({ 'name': 'es_query_current', 'units': 'Queries', 'format': '%d', @@ -503,7 +502,7 @@ def metric_init(params): ) descriptors.append( - 
create_desc({ + _create_desc({ 'name': 'es_query_time', 'units': 'ms', 'format': '%d', @@ -513,7 +512,7 @@ def metric_init(params): ) descriptors.append( - create_desc({ + _create_desc({ 'name': 'es_fetch_current', 'units': 'fetches', 'format': '%d', @@ -522,7 +521,7 @@ def metric_init(params): ) descriptors.append( - create_desc({ + _create_desc({ 'name': 'es_fetch_total', 'units': 'fetches', 'format': '%d', @@ -532,7 +531,7 @@ def metric_init(params): ) descriptors.append( - create_desc({ + _create_desc({ 'name': 'es_fetch_time', 'units': 'ms', 'format': '%d', @@ -542,7 +541,7 @@ def metric_init(params): ) descriptors.append( - create_desc({ + _create_desc({ 'name': 'es_flush_total', 'units': 'flushes', 'format': '%d', @@ -551,7 +550,7 @@ def metric_init(params): ) descriptors.append( - create_desc({ + _create_desc({ 'name': 'es_flush_time', 'units': 'ms', 'format': '%d', @@ -561,7 +560,7 @@ def metric_init(params): ) descriptors.append( - create_desc({ + _create_desc({ 'name': 'es_get_exists_time', 'units': 'ms', 'format': '%d', @@ -571,7 +570,7 @@ def metric_init(params): ) descriptors.append( - create_desc({ + _create_desc({ 'name': 'es_get_exists_total', 'units': 'total', 'format': '%d', @@ -580,7 +579,7 @@ def metric_init(params): ) descriptors.append( - create_desc({ + _create_desc({ 'name': 'es_get_time', 'units': 'ms', 'format': '%d', @@ -590,7 +589,7 @@ def metric_init(params): ) descriptors.append( - create_desc({ + _create_desc({ 'name': 'es_get_total', 'units': 'total', 'format': '%d', @@ -599,7 +598,7 @@ def metric_init(params): ) descriptors.append( - create_desc({ + _create_desc({ 'name': 'es_get_missing_time', 'units': 'ms', 'format': '%d', @@ -609,7 +608,7 @@ def metric_init(params): ) descriptors.append( - create_desc({ + _create_desc({ 'name': 'es_get_missing_total', 'units': 'total', 'format': '%d', @@ -618,7 +617,7 @@ def metric_init(params): ) descriptors.append( - create_desc({ + _create_desc({ 'name': 'es_indexing_delete_time', 'units': 'ms', 'format': '%d', @@ -628,7 +627,7 @@ def metric_init(params): ) descriptors.append( - create_desc({ + _create_desc({ 'name': 'es_indexing_delete_total', 'units': 'docs', 'format': '%d', @@ -638,7 +637,7 @@ def metric_init(params): ) descriptors.append( - create_desc({ + _create_desc({ 'name': 'es_indexing_index_time', 'units': 'ms', 'format': '%d', @@ -648,7 +647,7 @@ def metric_init(params): ) descriptors.append( - create_desc({ + _create_desc({ 'name': 'es_indexing_index_total', 'units': 'docs', 'format': '%d', @@ -658,7 +657,7 @@ def metric_init(params): ) descriptors.append( - create_desc({ + _create_desc({ 'name': 'es_query_total', 'units': 'Queries', 'format': '%d', @@ -674,7 +673,7 @@ def metric_cleanup(): #This code is for debugging and unit testing if __name__ == '__main__': - metric_init({}) + descriptors = metric_init({}) for d in descriptors: v = d['call_back'](d['name']) print 'value for %s is %s' % (d['name'], str(v)) From 9cb755a825a2b50d99d39f801e80a1b91efab0ce Mon Sep 17 00:00:00 2001 From: Mathias Fussenegger Date: Sun, 16 Sep 2012 07:23:50 +0200 Subject: [PATCH 32/39] Add the ability to read index specific statistics --- elasticsearch/conf.d/elasticsearch.pyconf | 14 ++++ elasticsearch/python_modules/elasticsearch.py | 66 +++++++++++++++++-- 2 files changed, 73 insertions(+), 7 deletions(-) diff --git a/elasticsearch/conf.d/elasticsearch.pyconf b/elasticsearch/conf.d/elasticsearch.pyconf index 15789c37..6b3a500d 100644 --- a/elasticsearch/conf.d/elasticsearch.pyconf +++ 
b/elasticsearch/conf.d/elasticsearch.pyconf
@@ -9,6 +9,20 @@ modules {
     value = "elasticsearch"
   }
 
+  param host {
+    value = "http://localhost:9200/"
+  }
+
+  # In order to get index-specific stats, specify each index separated by
+  # whitespace.
+  #
+  # Indices can be grouped by using commas,
+  # e.g. index3,index4 will give statistics (docs_count, etc.) for both
+  # index3 and index4
+  param indices {
+    value = "*"
+#    value = "index1 index2 index3,index4"
+  }
   }
 }
 
diff --git a/elasticsearch/python_modules/elasticsearch.py b/elasticsearch/python_modules/elasticsearch.py
index e96a0a95..1f1444da 100755
--- a/elasticsearch/python_modules/elasticsearch.py
+++ b/elasticsearch/python_modules/elasticsearch.py
@@ -119,7 +119,7 @@ def dig_it_up(obj, path):
         return False
 
 
-def getStat(result, url, name):
+def update_result(result, url):
     global last_update
 
     # If time delta is > 20 seconds, then update the JSON results
@@ -130,6 +130,22 @@ def getStat(result, url, name):
         result = json.load(urllib.urlopen(url))
         last_update = now
 
+    return result
+
+
+def get_stat_index(result, url, path, name):
+    result = update_result(result, url)
+    val = dig_it_up(result, path)
+
+    if not isinstance(val, bool):
+        return int(val)
+    else:
+        return None
+
+
+def getStat(result, url, name):
+    result = update_result(result, url)
+
     node = result['nodes'].keys()[0]
     val = dig_it_up(result, keyToPath[name] % node)
 
@@ -148,24 +164,48 @@ def create_desc(skel, prop):
     return d
 
 
+def get_indices_descriptors(index, skel, result, url):
+    metric_tpl = 'es_index_{0}_{{0}}'.format(index)
+    callback = partial(get_stat_index, result, url)
+    _create_desc = partial(create_desc, skel)
+
+    descriptors = [
+        _create_desc({
+            'call_back': partial(callback, '_all.primaries.docs.count'),
+            'name': metric_tpl.format('docs_count'),
+            'description': 'document count for index {0}'.format(index),
+        }),
+        _create_desc({
+            'call_back': partial(callback, '_all.primaries.store.size_in_bytes'),
+            'name': metric_tpl.format('size'),
+            'description': 'size in bytes for index {0}'.format(index),
+            'units': 'Bytes',
+            'format': '%.0f',
+            'value_type': 'double'
+        })
+    ]
+
+    return descriptors
+
+
 def metric_init(params):
     descriptors = []
 
-    print '[elasticsearch] Received the following parameters'
-    print params
+    print('[elasticsearch] Received the following parameters')
+    print(params)
 
     host = params.get('host', 'http://localhost:9200/')
-    url = '{0}_cluster/nodes/_local/stats?all=true'.format(host)
+    url_cluster = '{0}_cluster/nodes/_local/stats?all=true'.format(host)
 
     # First iteration - Grab statistics
-    print '[elasticsearch] Fetching ' + url
-    result = json.load(urllib.urlopen(url))
+    print('[elasticsearch] Fetching ' + url_cluster)
+    result = json.load(urllib.urlopen(url_cluster))
 
     metric_group = params.get('metric_group', 'elasticsearch')
 
     Desc_Skel = {
         'name': 'XXX',
-        'call_back': partial(getStat, result, url),
+        'call_back': partial(getStat, result, url_cluster),
         'time_max': 60,
         'value_type': 'uint',
         'units': 'units',
@@ -175,6 +215,17 @@ def metric_init(params):
         'groups': metric_group,
     }
 
+    indices = params.get('indices', '*').split()
+    for index in indices:
+        url_indices = '{0}{1}/_stats'.format(host, index)
+        print('[elasticsearch] Fetching ' + url_indices)
+
+        r_indices = json.load(urllib.urlopen(url_indices))
+        descriptors += get_indices_descriptors(index,
+                                               Desc_Skel,
+                                               r_indices,
+                                               url_indices)
+
     _create_desc = partial(create_desc, Desc_Skel)
 
     descriptors.append(
@@ -671,6 +722,7 @@ def metric_init(params):
 def metric_cleanup():
     pass
 
+
 #This code is for debugging and unit testing
 if __name__ == '__main__':
     descriptors = metric_init({})
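The two elasticsearch patches above share one technique: `functools.partial` bakes state (the cached JSON result and the URL it came from) into each descriptor's `call_back`, so gmond keeps invoking the callback with just a metric name and no module-level globals are needed. A minimal, self-contained sketch of that pattern in the shape of patch 32's `get_stat_index` — the `fake_result` payload is invented for illustration, and `dig_it_up` is simplified to plain-string paths::

    # Sketch of the partial-bound callback pattern from the patches above.
    # fake_result is invented; the real module caches a /_stats JSON response.
    from functools import partial, reduce

    def dig_it_up(obj, path):
        # Walk a dotted path like '_all.primaries.docs.count' into nested dicts.
        try:
            return reduce(lambda x, y: x[y], path.split('.'), obj)
        except (KeyError, TypeError):
            return False

    def get_stat_index(result, url, path, name):
        # result, url and path were bound at init time; gmond passes only 'name'.
        val = dig_it_up(result, path)
        return int(val) if not isinstance(val, bool) else None

    fake_result = {'_all': {'primaries': {'docs': {'count': 42}}}}
    callback = partial(get_stat_index, fake_result,
                       'http://localhost:9200/myindex/_stats')
    docs_cb = partial(callback, '_all.primaries.docs.count')
    print(docs_cb('es_index_myindex_docs_count'))  # -> 42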
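Both redis fixes above hinge on the shape of the INFO reply: one `key:value` pair per line, blank lines between sections, and (in newer Redis releases) `# Section` header lines that would crash the naive `line.split(":")`. A self-contained sketch of the hardened parsing step — the sample payload is made up, but mirrors the real format::

    # Sketch of the INFO parsing that PATCH 33 hardens; sample data invented.
    sample_info = ("# Server\n"
                   "redis_version:2.6.0\n"
                   "connected_clients:7\n"
                   "\n"
                   "expired_keys:42\n")

    def parse_info(info, wanted):
        values = {}
        for line in info.splitlines():
            if "" == line:
                continue
            if "#" == line[0]:  # section headers carry no metric value
                continue
            n, v = line.split(":", 1)
            if n in wanted:
                values[n] = int(v)
        return values

    print(parse_info(sample_info, set(["connected_clients", "expired_keys"])))
    # -> {'connected_clients': 7, 'expired_keys': 42}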
From a6d1e9f923bab24bbf85e10a52d9638a7616439e Mon Sep 17 00:00:00 2001
From: "Jacob V. Rasmussen"
Date: Wed, 26 Sep 2012 19:20:54 +0200
Subject: [PATCH 35/39] Added Another PHP Cache status module

---
 apc_status/README.mkdn                  |  13 +++
 apc_status/conf.d/apc_status.pyconf     |  97 +++++++++++++++++++++
 apc_status/document_root/apc-json.php   |  44 ++++++++++
 apc_status/python_modules/apc_status.py | 110 ++++++++++++++++++++++++
 4 files changed, 264 insertions(+)
 create mode 100644 apc_status/README.mkdn
 create mode 100644 apc_status/conf.d/apc_status.pyconf
 create mode 100644 apc_status/document_root/apc-json.php
 create mode 100644 apc_status/python_modules/apc_status.py

diff --git a/apc_status/README.mkdn b/apc_status/README.mkdn
new file mode 100644
index 00000000..9dde1757
--- /dev/null
+++ b/apc_status/README.mkdn
@@ -0,0 +1,13 @@
+apc_status
+===============
+
+Python module for Ganglia 3.1.
+
+"apc_status" sends metrics on Another PHP Cache (APC) process status, referring to
+apc-json.php.
+
+To use this module, copy apc-json.php into your web server's document root.
+
+## AUTHOR
+
+Jacob V. Rasmussen
diff --git a/apc_status/conf.d/apc_status.pyconf b/apc_status/conf.d/apc_status.pyconf
new file mode 100644
index 00000000..d3e8413d
--- /dev/null
+++ b/apc_status/conf.d/apc_status.pyconf
@@ -0,0 +1,97 @@
+modules {
+  module {
+    name = "apc_status"
+    language = "python"
+
+    # URL of the apc-json.php script, which translates the APC status figures to JSON
+    param url {
+      value = "http://localhost/apc-json.php"
+    }
+
+    # Which metric group these metrics should be put into
+    param metric_group {
+      value = "apc_cache"
+    }
+  }
+}
+
+collection_group {
+  collect_every = 30
+  time_threshold = 90
+
+  metric {
+    name = "apc_mem_size"
+    title = "Total Memory"
+    value_threshold = 0
+  }
+  metric {
+    name = "apc_mem_avail"
+    title = "Free Memory"
+    value_threshold = 0
+  }
+  metric {
+    name = "apc_mem_used"
+    title = "Used Memory"
+    value_threshold = 0
+  }
+  metric {
+    name = "apc_num_slots"
+    title = "Number of Slots"
+    value_threshold = 0
+  }
+  metric {
+    name = "apc_num_hits"
+    title = "Number of Cache Hits"
+    value_threshold = 0
+  }
+  metric {
+    name = "apc_num_misses"
+    title = "Number of Cache Misses"
+    value_threshold = 0
+  }
+  metric {
+    name = "apc_num_inserts"
+    title = "Number of Cache Inserts"
+    value_threshold = 0
+  }
+  metric {
+    name = "apc_expunges"
+    title = "Number of Cache Deletes"
+    value_threshold = 0
+  }
+  metric {
+    name = "apc_num_entries"
+    title = "Cached Files"
+    value_threshold = 0
+  }
+  metric {
+    name = "apc_num_seg"
+    title = "Segments"
+    value_threshold = 0
+  }
+  metric {
+    name = "apc_uptime"
+    title = "Uptime"
+    value_threshold = 0
+  }
+  metric {
+    name = "apc_request_rate"
+    title = "Request Rate (hits, misses)"
+    value_threshold = 0.0
+  }
+  metric {
+    name = "apc_hit_rate"
+    title = "Hit Rate"
+    value_threshold = 0.0
+  }
+  metric {
+    name = "apc_miss_rate"
+    title = "Miss Rate"
+    value_threshold = 0.0
+  }
+  metric {
+    name = "apc_insert_rate"
+    title = "Insert Rate"
+    value_threshold = 0.0
+  }
+}
diff --git a/apc_status/document_root/apc-json.php b/apc_status/document_root/apc-json.php
new file mode 100644
index 00000000..b5d9e89e
--- /dev/null
+++ b/apc_status/document_root/apc-json.php
@@ -0,0 +1,44 @@
+ Date: Mon, 1 Oct 2012 09:29:26 -0500
Subject: [PATCH 36/39] kill on timeout with sudo

---
 passenger/python_modules/passenger.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/passenger/python_modules/passenger.py b/passenger/python_modules/passenger.py
index 5d4f4e8f..20e9af59 100644
--- a/passenger/python_modules/passenger.py
+++ b/passenger/python_modules/passenger.py
@@ -190,7 +190,7 @@ def timeout_command(command, timeout):
         time.sleep(0.2)
         now = datetime.datetime.now()
         if (now - start).seconds> timeout:
-            os.kill(process.pid, signal.SIGKILL)
+            os.system("sudo kill %s" % process.pid)
             os.waitpid(-1, os.WNOHANG)
             return None
     return process.stdout.readlines()
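For context, `timeout_command` polls a `passenger-status` child process and kills it if it runs too long; the patch swaps `os.kill` for a `sudo kill` shell-out because the child is started under sudo and shrugs off a plain signal from the unprivileged gmond user. Only the lines quoted in the hunk above are verbatim — the surrounding loop below is a reconstruction under that assumption (it also uses the empty-list return that PATCH 38 further below introduces)::

    # Reconstructed sketch of timeout_command; everything outside the hunk's
    # context lines is an assumption about the rest of the module.
    import datetime
    import os
    import subprocess
    import time

    def timeout_command(command, timeout):
        """Run command, killing it if it exceeds timeout seconds."""
        start = datetime.datetime.now()
        process = subprocess.Popen(command, shell=True,
                                   stdout=subprocess.PIPE)
        while process.poll() is None:
            time.sleep(0.2)
            now = datetime.datetime.now()
            if (now - start).seconds > timeout:
                # A plain os.kill() fails against a sudo-spawned child, so
                # shell out to sudo kill instead.
                os.system("sudo kill %s" % process.pid)
                os.waitpid(-1, os.WNOHANG)
                return []  # empty list keeps callers' iteration code working
        return process.stdout.readlines()

    # e.g.: lines = timeout_command("sudo passenger-status", 10)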
File "/usr/lib64/ganglia/python_modules/nvidia.py", line 102 elif (metric == 'perf_state' || metric == 'performance_state'): ^ SyntaxError: invalid syntax --- gpu/nvidia/python_modules/nvidia.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gpu/nvidia/python_modules/nvidia.py b/gpu/nvidia/python_modules/nvidia.py index c8ae6bec..84957bb4 100644 --- a/gpu/nvidia/python_modules/nvidia.py +++ b/gpu/nvidia/python_modules/nvidia.py @@ -99,7 +99,7 @@ def gpu_device_handler(name): except NVMLError, nvmlError: if NVML_ERROR_NOT_SUPPORTED == nvmlError.value: return 'N/A' - elif (metric == 'perf_state' || metric == 'performance_state'): + elif (metric == 'perf_state' or metric == 'performance_state'): state = nvmlDeviceGetPerformanceState(gpu_device) try: int(state) From 83858b3128aee67cb4273e7d81f72d35c60e966c Mon Sep 17 00:00:00 2001 From: Nathan L Smith Date: Wed, 3 Oct 2012 14:40:02 -0500 Subject: [PATCH 38/39] return empty list when passenger status times out --- passenger/python_modules/passenger.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/passenger/python_modules/passenger.py b/passenger/python_modules/passenger.py index 20e9af59..7f082244 100644 --- a/passenger/python_modules/passenger.py +++ b/passenger/python_modules/passenger.py @@ -192,7 +192,7 @@ def timeout_command(command, timeout): if (now - start).seconds> timeout: os.system("sudo kill %s" % process.pid) os.waitpid(-1, os.WNOHANG) - return None + return [] return process.stdout.readlines() if __name__ == '__main__': From 148b86ed9ef3d3fb9979f26a1e6ec247dc3b1748 Mon Sep 17 00:00:00 2001 From: Martin Walsh Date: Wed, 10 Oct 2012 12:13:30 -0500 Subject: [PATCH 39/39] deepcopy dicts to avoid passing around references --- network/netstats/python_modules/netstats.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/network/netstats/python_modules/netstats.py b/network/netstats/python_modules/netstats.py index d22360d8..feb8cdbb 100644 --- a/network/netstats/python_modules/netstats.py +++ b/network/netstats/python_modules/netstats.py @@ -1,6 +1,7 @@ import sys import re import time +import copy PARAMS = {} @@ -13,7 +14,7 @@ tcpext_file = "/proc/net/netstat" snmp_file = "/proc/net/snmp" -LAST_METRICS = dict(METRICS) +LAST_METRICS = copy.deepcopy(METRICS) METRICS_CACHE_MAX = 5 stats_pos = {} @@ -188,10 +189,10 @@ def get_metrics(): if re.match("TcpExt: [0-9]", line): metrics = re.split("\s+", line) - file.close + file.close() # update cache - LAST_METRICS = dict(METRICS) + LAST_METRICS = copy.deepcopy(METRICS) METRICS = { 'time': time.time(), 'tcpext': metrics @@ -216,7 +217,7 @@ def get_metrics(): METRICS['tcp'] = re.split("\s+", line) - file.close + file.close() return [METRICS, LAST_METRICS]