diff --git a/apc_status/README.mkdn b/apc_status/README.mkdn new file mode 100644 index 00000000..9dde1757 --- /dev/null +++ b/apc_status/README.mkdn @@ -0,0 +1,13 @@ +apc_status +=============== + +python module for ganglia 3.1. + +"apc_status" sends metrics on Alternative PHP Cache process status referring to +apc-json.php. + +To use this you will need to copy apc-json.php to your webdir. + +## AUTHOR + +Jacob V. Rasmussen diff --git a/apc_status/conf.d/apc_status.pyconf b/apc_status/conf.d/apc_status.pyconf new file mode 100644 index 00000000..d3e8413d --- /dev/null +++ b/apc_status/conf.d/apc_status.pyconf @@ -0,0 +1,97 @@ +modules { + module { + name = "apc_status" + language = "python" + + # URL of the resident apc-json.php script, which will translate the APC figures to JSON + param url { + value = "http://localhost/apc-json.php" + } + + # Which metric group should these metrics be put into + param metric_group { + value = "apc_cache" + } + } +} + +collection_group { + collect_every = 30 + time_threshold = 90 + + metric { + name = "apc_mem_size" + title = "Total Memory" + value_threshold = 0 + } + metric { + name = "apc_mem_avail" + title = "Free Memory" + value_threshold = 0 + } + metric { + name = "apc_mem_used" + title = "Used Memory" + value_threshold = 0 + } + metric { + name = "apc_num_slots" + title = "Number of Slots" + value_threshold = 0 + } + metric { + name = "apc_num_hits" + title = "Number of Cache Hits" + value_threshold = 0 + } + metric { + name = "apc_num_misses" + title = "Number of Cache Misses" + value_threshold = 0 + } + metric { + name = "apc_num_inserts" + title = "Number of Cache Inserts" + value_threshold = 0 + } + metric { + name = "apc_expunges" + title = "Number of Cache Deletes" + value_threshold = 0 + } + metric { + name = "apc_num_entries" + title = "Cached Files" + value_threshold = 0 + } + metric { + name = "apc_num_seg" + title = "Segments" + value_threshold = 0 + } + metric { + name = "apc_uptime" + title = "Uptime" + 
value_threshold = 0 + } + metric { + name = "apc_request_rate" + title = "Request Rate (hits, misses)" + value_threshold = 0.0 + } + metric { + name = "apc_hit_rate" + title = "Hit Rate" + value_threshold = 0.0 + } + metric { + name = "apc_miss_rate" + title = "Miss Rate" + value_threshold = 0.0 + } + metric { + name = "apc_insert_rate" + title = "Insert Rate" + value_threshold = 0.0 + } +} diff --git a/apc_status/document_root/apc-json.php b/apc_status/document_root/apc-json.php new file mode 100644 index 00000000..b5d9e89e --- /dev/null +++ b/apc_status/document_root/apc-json.php @@ -0,0 +1,44 @@ + diff --git a/couchdb/conf.d/couchdb.pyconf b/couchdb/conf.d/couchdb.pyconf new file mode 100644 index 00000000..3463bcee --- /dev/null +++ b/couchdb/conf.d/couchdb.pyconf @@ -0,0 +1,207 @@ +# + +modules { + module { + name = 'couchdb' + language = 'python' + + param stats_url { + value = 'http://localhost:5984/_stats' + } + + param refresh_rate { + value = '60' + } + } +} + +collection_group { + collect_every = 10 + time_threshold = 20 + + metric { + name = 'couchdb_couchdb_auth_cache_hits' + title = 'Number of authentication cache hits' + value_threshold = 1.0 + } + + metric { + name = 'couchdb_couchdb_auth_cache_misses' + title = 'Number of authentication cache misses' + value_threshold = 1.0 + } + + metric { + name = 'couchdb_couchdb_database_reads' + title = 'Number of times a document was read from a database' + value_threshold = 1.0 + } + + metric { + name = 'couchdb_couchdb_database_writes' + title = 'Number of times a document was changed' + value_threshold = 1.0 + } + + metric { + name = 'couchdb_couchdb_open_databases' + title = 'Number of open databases' + value_threshold = 1.0 + } + + metric { + name = 'couchdb_couchdb_open_os_files' + title = 'Number of file descriptors CouchDB has open' + value_threshold = 1.0 + } + + metric { + name = 'couchdb_couchdb_request_time' + title = 'Request Time' + value_threshold = 1.0 + } + + metric { + name = 
'couchdb_httpd_bulk_requests' + title = 'Number of bulk requests' + value_threshold = 1.0 + } + + metric { + name = 'couchdb_httpd_clients_requesting_changes' + title = 'Number of clients for continuous _changes' + value_threshold = 1.0 + } + + metric { + name = 'couchdb_httpd_requests' + title = 'Number of HTTP requests' + value_threshold = 1.0 + } + + metric { + name = 'couchdb_httpd_temporary_view_reads' + title = 'Number of temporary view reads' + value_threshold = 1.0 + } + + metric { + name = 'couchdb_httpd_view_reads' + title = 'Number of view reads' + value_threshold = 1.0 + } + + metric { + name = 'couchdb_httpd_request_methods_COPY' + title = 'Number of HTTP COPY requests' + value_threshold = 1.0 + } + + metric { + name = 'couchdb_httpd_request_methods_DELETE' + title = 'Number of HTTP DELETE requests' + value_threshold = 1.0 + } + + metric { + name = 'couchdb_httpd_request_methods_GET' + title = 'Number of HTTP GET requests' + value_threshold = 1.0 + } + + metric { + name = 'couchdb_httpd_request_methods_HEAD' + title = 'Number of HTTP HEAD requests' + value_threshold = 1.0 + } + + metric { + name = 'couchdb_httpd_request_methods_POST' + title = 'Number of HTTP POST requests' + value_threshold = 1.0 + } + + metric { + name = 'couchdb_httpd_request_methods_PUT' + title = 'Number of HTTP PUT requests' + value_threshold = 1.0 + } + + metric { + name = 'couchdb_httpd_status_codes_200' + title = 'Number of HTTP 200 OK responses' + value_threshold = 1.0 + } + + metric { + name = 'couchdb_httpd_status_codes_201' + title = 'Number of HTTP 201 Created responses' + value_threshold = 1.0 + } + + metric { + name = 'couchdb_httpd_status_codes_202' + title = 'Number of HTTP 202 Accepted responses' + value_threshold = 1.0 + } + + metric { + name = 'couchdb_httpd_status_codes_301' + title = 'Number of HTTP 301 Moved Permanently responses' + value_threshold = 1.0 + } + + metric { + name = 'couchdb_httpd_status_codes_304' + title = 'Number of HTTP 304 Not Modified 
responses' + value_threshold = 1.0 + } + + metric { + name = 'couchdb_httpd_status_codes_400' + title = 'Number of HTTP 400 Bad Request responses' + value_threshold = 1.0 + } + + metric { + name = 'couchdb_httpd_status_codes_401' + title = 'Number of HTTP 401 Unauthorized responses' + value_threshold = 1.0 + } + + metric { + name = 'couchdb_httpd_status_codes_403' + title = 'Number of HTTP 403 Forbidden responses' + value_threshold = 1.0 + } + + metric { + name = 'couchdb_httpd_status_codes_404' + title = 'Number of HTTP 404 Not Found responses' + value_threshold = 1.0 + } + + metric { + name = 'couchdb_httpd_status_codes_405' + title = 'Number of HTTP 405 Method Not Allowed responses' + value_threshold = 1.0 + } + + metric { + name = 'couchdb_httpd_status_codes_409' + title = 'Number of HTTP 409 Conflict responses' + value_threshold = 1.0 + } + + metric { + name = 'couchdb_httpd_status_codes_412' + title = 'Number of HTTP 412 Precondition Failed responses' + value_threshold = 1.0 + } + + metric { + name = 'couchdb_httpd_status_codes_500' + title = 'Number of HTTP 500 Internal Server Error responses' + value_threshold = 1.0 + } +} diff --git a/couchdb/python_modules/couchdb.py b/couchdb/python_modules/couchdb.py new file mode 100644 index 00000000..a24f21f4 --- /dev/null +++ b/couchdb/python_modules/couchdb.py @@ -0,0 +1,321 @@ +### This script reports couchdb metrics to ganglia. 
+ +### License to use, modify, and distribute under the GPL +### http://www.gnu.org/licenses/gpl.txt +import logging +import os +import subprocess +import sys +import threading +import time +import traceback +import urllib2 +import json + +logging.basicConfig(level=logging.ERROR) + +_Worker_Thread = None + +class UpdateCouchdbThread(threading.Thread): + + def __init__(self, params): + threading.Thread.__init__(self) + self.running = False + self.shuttingdown = False + self.refresh_rate = int(params['refresh_rate']) + self.metrics = {} + self.settings = {} + self.stats_url = params['stats_url'] + self._metrics_lock = threading.Lock() + self._settings_lock = threading.Lock() + + def shutdown(self): + self.shuttingdown = True + if not self.running: + return + self.join() + + def run(self): + global _Lock + + self.running = True + + while not self.shuttingdown: + time.sleep(self.refresh_rate) + self.refresh_metrics() + + self.running = False + + @staticmethod + def _get_couchdb_stats(url, refresh_rate): + if refresh_rate == 60 or refresh_rate == 300 or refresh_rate == 900: + url += '?range=' + str(refresh_rate) + else: + logging.warning('The specified refresh_rate of %d is invalid and has been substituted with 60!' 
% refresh_rate) + url += '?range=60' + + c = urllib2.urlopen(url) + json_data = c.read() + c.close() + + data = json.loads(json_data) + couchdb = data['couchdb'] + httpd = data['httpd'] + request_methods = data['httpd_request_methods'] + status_codes = data['httpd_status_codes'] + + result = {} + for first_level_key in data: + for second_level_key in data[first_level_key]: + value = data[first_level_key][second_level_key]['current'] + if value is None: + value = 0 + else: + if second_level_key in ['open_databases', 'open_os_files', 'clients_requesting_changes']: + print second_level_key + ': ' + str(value) + value = int(value) + else: + # We need to divide by the range as couchdb provides no per second values + value = float(value) / refresh_rate + result['couchdb_' + first_level_key + '_' + second_level_key ] = value + + return result + + def refresh_metrics(self): + logging.debug('refresh metrics') + + try: + logging.debug(' opening URL: ' + str(self.stats_url)) + data = UpdateCouchdbThread._get_couchdb_stats(self.stats_url, self.refresh_rate) + except: + logging.warning('error refreshing metrics') + logging.warning(traceback.print_exc(file=sys.stdout)) + + try: + self._metrics_lock.acquire() + self.metrics = {} + for k, v in data.items(): + self.metrics[k] = v + except: + logging.warning('error refreshing metrics') + logging.warning(traceback.print_exc(file=sys.stdout)) + return False + + finally: + self._metrics_lock.release() + + if not self.metrics: + logging.warning('error refreshing metrics') + return False + + logging.debug('success refreshing metrics') + logging.debug('metrics: ' + str(self.metrics)) + + return True + + def metric_of(self, name): + logging.debug('getting metric: ' + name) + + try: + if name in self.metrics: + try: + self._metrics_lock.acquire() + logging.debug('metric: %s = %s' % (name, self.metrics[name])) + return self.metrics[name] + finally: + self._metrics_lock.release() + except: + logging.warning('failed to fetch ' + name) + return 
0 + + def setting_of(self, name): + logging.debug('getting setting: ' + name) + + try: + if name in self.settings: + try: + self._settings_lock.acquire() + logging.debug('setting: %s = %s' % (name, self.settings[name])) + return self.settings[name] + finally: + self._settings_lock.release() + except: + logging.warning('failed to fetch ' + name) + return 0 + +def metric_init(params): + logging.debug('init: ' + str(params)) + global _Worker_Thread + + METRIC_DEFAULTS = { + 'units': 'requests/s', + 'groups': 'couchdb', + 'slope': 'both', + 'value_type': 'float', + 'format': '%.3f', + 'description': '', + 'call_back': metric_of + } + + descriptions = dict( + couchdb_couchdb_auth_cache_hits={ + 'units': 'hits/s', + 'description': 'Number of authentication cache hits'}, + couchdb_couchdb_auth_cache_misses={ + 'units': 'misses/s', + 'description': 'Number of authentication cache misses'}, + couchdb_couchdb_database_reads={ + 'units': 'reads/s', + 'description': 'Number of times a document was read from a database'}, + couchdb_couchdb_database_writes={ + 'units': 'writes/s', + 'description': 'Number of times a document was changed'}, + couchdb_couchdb_open_databases={ + 'value_type': 'uint', + 'format': '%d', + 'units': 'databases', + 'description': 'Number of open databases'}, + couchdb_couchdb_open_os_files={ + 'value_type': 'uint', + 'format': '%d', + 'units': 'files', + 'description': 'Number of file descriptors CouchDB has open'}, + couchdb_couchdb_request_time={ + 'units': 'ms', + 'description': 'Request time'}, + couchdb_httpd_bulk_requests={ + 'description': 'Number of bulk requests'}, + couchdb_httpd_clients_requesting_changes={ + 'value_type': 'uint', + 'format': '%d', + 'units': 'clients', + 'description': 'Number of clients for continuous _changes'}, + couchdb_httpd_requests={ + 'description': 'Number of HTTP requests'}, + couchdb_httpd_temporary_view_reads={ + 'units': 'reads', + 'description': 'Number of temporary view reads'}, + couchdb_httpd_view_reads={ + 
'description': 'Number of view reads'}, + couchdb_httpd_request_methods_COPY={ + 'description': 'Number of HTTP COPY requests'}, + couchdb_httpd_request_methods_DELETE={ + 'description': 'Number of HTTP DELETE requests'}, + couchdb_httpd_request_methods_GET={ + 'description': 'Number of HTTP GET requests'}, + couchdb_httpd_request_methods_HEAD={ + 'description': 'Number of HTTP HEAD requests'}, + couchdb_httpd_request_methods_POST={ + 'description': 'Number of HTTP POST requests'}, + couchdb_httpd_request_methods_PUT={ + 'description': 'Number of HTTP PUT requests'}, + couchdb_httpd_status_codes_200={ + 'units': 'responses/s', + 'description': 'Number of HTTP 200 OK responses'}, + couchdb_httpd_status_codes_201={ + 'units': 'responses/s', + 'description': 'Number of HTTP 201 Created responses'}, + couchdb_httpd_status_codes_202={ + 'units': 'responses/s', + 'description': 'Number of HTTP 202 Accepted responses'}, + couchdb_httpd_status_codes_301={ + 'units': 'responses/s', + 'description': 'Number of HTTP 301 Moved Permanently responses'}, + couchdb_httpd_status_codes_304={ + 'units': 'responses/s', + 'description': 'Number of HTTP 304 Not Modified responses'}, + couchdb_httpd_status_codes_400={ + 'units': 'responses/s', + 'description': 'Number of HTTP 400 Bad Request responses'}, + couchdb_httpd_status_codes_401={ + 'units': 'responses/s', + 'description': 'Number of HTTP 401 Unauthorized responses'}, + couchdb_httpd_status_codes_403={ + 'units': 'responses/s', + 'description': 'Number of HTTP 403 Forbidden responses'}, + couchdb_httpd_status_codes_404={ + 'units': 'responses/s', + 'description': 'Number of HTTP 404 Not Found responses'}, + couchdb_httpd_status_codes_405={ + 'units': 'responses/s', + 'description': 'Number of HTTP 405 Method Not Allowed responses'}, + couchdb_httpd_status_codes_409={ + 'units': 'responses/s', + 'description': 'Number of HTTP 409 Conflict responses'}, + couchdb_httpd_status_codes_412={ + 'units': 'responses/s', + 'description': 
'Number of HTTP 412 Precondition Failed responses'}, + couchdb_httpd_status_codes_500={ + 'units': 'responses/s', + 'description': 'Number of HTTP 500 Internal Server Error responses'}) + + if _Worker_Thread is not None: + raise Exception('Worker thread already exists') + + _Worker_Thread = UpdateCouchdbThread(params) + _Worker_Thread.refresh_metrics() + _Worker_Thread.start() + + descriptors = [] + + for name, desc in descriptions.iteritems(): + d = desc.copy() + d['name'] = str(name) + [ d.setdefault(key, METRIC_DEFAULTS[key]) for key in METRIC_DEFAULTS.iterkeys() ] + descriptors.append(d) + + return descriptors + +def metric_of(name): + global _Worker_Thread + return _Worker_Thread.metric_of(name) + +def setting_of(name): + global _Worker_Thread + return _Worker_Thread.setting_of(name) + +def metric_cleanup(): + global _Worker_Thread + if _Worker_Thread is not None: + _Worker_Thread.shutdown() + logging.shutdown() + pass + +if __name__ == '__main__': + from optparse import OptionParser + + try: + logging.debug('running from the cmd line') + parser = OptionParser() + parser.add_option('-u', '--URL', dest='stats_url', default='http://127.0.0.1:5984/_stats', help='URL for couchdb stats page') + parser.add_option('-q', '--quiet', dest='quiet', action='store_true', default=False) + parser.add_option('-r', '--refresh-rate', dest='refresh_rate', default=60) + parser.add_option('-d', '--debug', dest='debug', action='store_true', default=False) + + (options, args) = parser.parse_args() + + descriptors = metric_init({ + 'stats_url': options.stats_url, + 'refresh_rate': options.refresh_rate + }) + + if options.debug: + from pprint import pprint + pprint(descriptors) + + for d in descriptors: + v = d['call_back'](d['name']) + + if not options.quiet: + print ' {0}: {1} {2} [{3}]' . 
format(d['name'], v, d['units'], d['description']) + + os._exit(1) + + except KeyboardInterrupt: + time.sleep(0.2) + os._exit(1) + except StandardError: + traceback.print_exc() + os._exit(1) + finally: + metric_cleanup() diff --git a/elasticsearch/conf.d/elasticsearch.pyconf b/elasticsearch/conf.d/elasticsearch.pyconf index 15789c37..6b3a500d 100644 --- a/elasticsearch/conf.d/elasticsearch.pyconf +++ b/elasticsearch/conf.d/elasticsearch.pyconf @@ -9,6 +9,20 @@ modules { value = "elasticsearch" } + param host { + value = "http://localhost:9200/" + } + + # In order to get index specific stats specify each index separated by + # whitespace. + # + # indices can be grouped by using comma, + # e.g. index3,index4 will give statistics (docs_count, etc.) for both + # index3 and index4 + param indices { + value = "*" +# value = "index1 index2 index3,index4" + } } } diff --git a/elasticsearch/graph.d/es_report.json b/elasticsearch/graph.d/es_report.json new file mode 100644 index 00000000..a8cf0e6c --- /dev/null +++ b/elasticsearch/graph.d/es_report.json @@ -0,0 +1,14 @@ +{ + "report_name" : "es_report", + "report_type" : "standard", + "title" : "Elasticsearch", + "vertical_label" : "ms", + "series" : [ + { "metric": "es_fetch_time", "color": "BBBBBB", "label": "Fetch", "type": "line" }, + { "metric": "es_get_time", "color": "00FF00", "label": "Get", "line_width": "2", "type": "line" }, + { "metric": "es_flush_time", "color": "FF0000", "label": "Flush", "line_width": "2", "type": "line" }, + { "metric": "es_gc_time", "color": "2030F4", "label": "GC", "line_width": "2", "type": "line" }, + { "metric": "es_indexing_delete_time", "color": "FF30F4", "label": "Indexing Delete", "line_width": "2", "type": "line" }, + { "metric": "es_indexing_index_time", "color": "20FFF4", "label": "Indexing Index", "line_width": "2", "type": "line" } + ] +} diff --git a/elasticsearch/python_modules/elasticsearch.py b/elasticsearch/python_modules/elasticsearch.py index 8f0fa895..1f1444da 100755 --- 
a/elasticsearch/python_modules/elasticsearch.py +++ b/elasticsearch/python_modules/elasticsearch.py @@ -1,24 +1,18 @@ #! /usr/bin/python -import json +try: + import simplejson as json + assert json # silence pyflakes +except ImportError: + import json + import time import urllib +from functools import partial -global url, last_update, keyToPath - -def dig_it_up(obj,path): - try: - if type(path) in (str,unicode): - path = path.split('.') - return reduce(lambda x,y:x[y],path,obj) - except: - return False - -# Set IP address and JSON Url -url="http://localhost:9200/_cluster/nodes/_local/stats?all=true" # short name to full path for stats -keyToPath=dict() +keyToPath = dict() # Initial time modification stamp - Used to determine # when JSON is updated @@ -30,8 +24,10 @@ def dig_it_up(obj,path): keyToPath['es_cache_field_eviction'] = "nodes.%s.indices.cache.field_evictions" keyToPath['es_cache_field_size'] = "nodes.%s.indices.cache.field_size_in_bytes" keyToPath['es_cache_filter_count'] = "nodes.%s.indices.cache.filter_count" -keyToPath['es_cache_filter_evictions'] = "nodes.%s.indices.cache.filter_evictions" -keyToPath['es_cache_filter_size'] = "nodes.%s.indices.cache.filter_size_in_bytes" +keyToPath[ + 'es_cache_filter_evictions'] = "nodes.%s.indices.cache.filter_evictions" +keyToPath[ + 'es_cache_filter_size'] = "nodes.%s.indices.cache.filter_size_in_bytes" ## DOCS keyToPath['es_docs_count'] = "nodes.%s.indices.docs.count" @@ -46,12 +42,14 @@ def dig_it_up(obj,path): keyToPath['es_get_exists_total'] = "nodes.%s.indices.get.exists_total" keyToPath['es_get_time'] = "nodes.%s.indices.get.time_in_millis" keyToPath['es_get_total'] = "nodes.%s.indices.get.total" -keyToPath['es_get_missing_time'] = "nodes.%s.indices.get.missing_time_in_millis" +keyToPath[ + 'es_get_missing_time'] = "nodes.%s.indices.get.missing_time_in_millis" keyToPath['es_get_missing_total'] = "nodes.%s.indices.get.missing_total" ## INDEXING keyToPath['es_indexing_delete_time'] = 
"nodes.%s.indices.indexing.delete_time_in_millis" -keyToPath['es_indexing_delete_total'] = "nodes.%s.indices.indexing.delete_total" +keyToPath[ + 'es_indexing_delete_total'] = "nodes.%s.indices.indexing.delete_total" keyToPath['es_indexing_index_time'] = "nodes.%s.indices.indexing.index_time_in_millis" keyToPath['es_indexing_index_total'] = "nodes.%s.indices.indexing.index_total" @@ -61,7 +59,8 @@ def dig_it_up(obj,path): keyToPath['es_merges_current_size'] = "nodes.%s.indices.merges.current_size_in_bytes" keyToPath['es_merges_total'] = "nodes.%s.indices.merges.total" keyToPath['es_merges_total_docs'] = "nodes.%s.indices.merges.total_docs" -keyToPath['es_merges_total_size'] = "nodes.%s.indices.merges.total_size_in_bytes" +keyToPath[ + 'es_merges_total_size'] = "nodes.%s.indices.merges.total_size_in_bytes" keyToPath['es_merges_time'] = "nodes.%s.indices.merges.total_time_in_millis" ## REFRESH @@ -83,7 +82,8 @@ def dig_it_up(obj,path): ## MEM keyToPath['es_heap_committed'] = "nodes.%s.jvm.mem.heap_committed_in_bytes" keyToPath['es_heap_used'] = "nodes.%s.jvm.mem.heap_used_in_bytes" -keyToPath['es_non_heap_committed'] = "nodes.%s.jvm.mem.non_heap_committed_in_bytes" +keyToPath[ + 'es_non_heap_committed'] = "nodes.%s.jvm.mem.non_heap_committed_in_bytes" keyToPath['es_non_heap_used'] = "nodes.%s.jvm.mem.non_heap_used_in_bytes" ## THREADS @@ -106,10 +106,21 @@ def dig_it_up(obj,path): keyToPath['es_http_total_open'] = "nodes.%s.http.total_opened" # PROCESS METRICS # -keyToPath['es_open_file_descriptors'] = "nodes.%s.process.open_file_descriptors" +keyToPath[ + 'es_open_file_descriptors'] = "nodes.%s.process.open_file_descriptors" + + +def dig_it_up(obj, path): + try: + if type(path) in (str, unicode): + path = path.split('.') + return reduce(lambda x, y: x[y], path, obj) + except: + return False + -def getStat(name): - global last_update, result, url +def update_result(result, url): + global last_update # If time delta is > 20 seconds, then update the JSON results now = 
time.time() @@ -119,430 +130,602 @@ def getStat(name): result = json.load(urllib.urlopen(url)) last_update = now + return result + + +def get_stat_index(result, url, path, name): + result = update_result(result, url) + val = dig_it_up(result, path) + + if not isinstance(val, bool): + return int(val) + else: + return None + + +def getStat(result, url, name): + result = update_result(result, url) + node = result['nodes'].keys()[0] - val = dig_it_up(result, keyToPath[name] % node ) + val = dig_it_up(result, keyToPath[name] % node) # Check to make sure we have a valid result # JsonPath returns False if no match found - if not isinstance(val,bool): + if not isinstance(val, bool): return int(val) else: return None -def create_desc(prop): - d = Desc_Skel.copy() - for k,v in prop.iteritems(): + +def create_desc(skel, prop): + d = skel.copy() + for k, v in prop.iteritems(): d[k] = v return d -def metric_init(params): - global result, url, descriptors, Desc_Skel - print '[elasticsearch] Received the following parameters' - print params +def get_indices_descriptors(index, skel, result, url): + metric_tpl = 'es_index_{0}_{{0}}'.format(index) + callback = partial(get_stat_index, result, url) + _create_desc = partial(create_desc, skel) + + descriptors = [ + _create_desc({ + 'call_back': partial(callback, '_all.primaries.docs.count'), + 'name': metric_tpl.format('docs_count'), + 'description': 'document count for index {0}'.format(index), + }), + _create_desc({ + 'call_back': partial(callback, '_all.primaries.store.size_in_bytes'), + 'name': metric_tpl.format('size'), + 'description': 'size in bytes for index {0}'.format(index), + 'units': 'Bytes', + 'format': '%.0f', + 'value_type': 'double' + }) + ] - # First iteration - Grab statistics - print '[elasticsearch] Fetching ' + url - result = json.load(urllib.urlopen(url)) + return descriptors + +def metric_init(params): descriptors = [] - if "metric_group" not in params: - params["metric_group"] = "elasticsearch" + 
print('[elasticsearch] Received the following parameters') + print(params) + + host = params.get('host', 'http://localhost:9200/') + url_cluster = '{0}_cluster/nodes/_local/stats?all=true'.format(host) + + # First iteration - Grab statistics + print('[elasticsearch] Fetching ' + url_cluster) + result = json.load(urllib.urlopen(url_cluster)) + + metric_group = params.get('metric_group', 'elasticsearch') Desc_Skel = { - 'name' : 'XXX', - 'call_back' : getStat, - 'time_max' : 60, - 'value_type' : 'uint', - 'units' : 'units', - 'slope' : 'both', - 'format' : '%d', - 'description' : 'XXX', - 'groups' : params["metric_group"], + 'name': 'XXX', + 'call_back': partial(getStat, result, url_cluster), + 'time_max': 60, + 'value_type': 'uint', + 'units': 'units', + 'slope': 'both', + 'format': '%d', + 'description': 'XXX', + 'groups': metric_group, } - descriptors.append(create_desc({ - 'name' : 'es_heap_committed', - 'units' : 'Bytes', - 'format' : '%.0f', - 'description': 'Java Heap Committed (Bytes)', - })) - - descriptors.append(create_desc({ - 'name' : 'es_heap_used', - 'units' : 'Bytes', - 'format' : '%.0f', - 'description': 'Java Heap Used (Bytes)', - })) - - descriptors.append(create_desc({ - 'name' : 'es_non_heap_committed', - 'units' : 'Bytes', - 'format' : '%.0f', - 'description': 'Java Non Heap Committed (Bytes)', - })) - - descriptors.append(create_desc({ - 'name' : 'es_non_heap_used', - 'units' : 'Bytes', - 'format' : '%.0f', - 'description': 'Java Non Heap Used (Bytes)', - })) - - descriptors.append(create_desc({ - 'name' : 'es_threads', - 'units' : 'threads', - 'format' : '%d', - 'description': 'Threads (open)', - })) - - descriptors.append(create_desc({ - 'name' : 'es_threads_peak', - 'units' : 'threads', - 'format' : '%d', - 'description': 'Threads Peak (open)', - })) - - descriptors.append(create_desc({ - 'name' : 'es_gc_time', - 'units' : 'ms', - 'format' : '%d', - 'slope' : 'positive', - 'description': 'Java GC Time (ms)', - })) - - 
descriptors.append(create_desc({ - 'name' : 'es_transport_open', - 'units' : 'sockets', - 'format' : '%d', - 'description': 'Transport Open (sockets)', - })) - - descriptors.append(create_desc({ - 'name' : 'es_transport_rx_count', - 'units' : 'rx', - 'format' : '%d', - 'slope' : 'positive', - 'description': 'RX Count', - })) - - descriptors.append(create_desc({ - 'name' : 'es_transport_rx_size', - 'units' : 'Bytes', - 'format' : '%.0f', - 'slope' : 'positive', - 'description': 'RX (Bytes)', - })) - - descriptors.append(create_desc({ - 'name' : 'es_transport_tx_count', - 'units' : 'tx', - 'format' : '%d', - 'slope' : 'positive', - 'description': 'TX Count', - })) - - descriptors.append(create_desc({ - 'name' : 'es_transport_tx_size', - 'units' : 'Bytes', - 'format' : '%.0f', - 'slope' : 'positive', - 'description': 'TX (Bytes)', - })) - - descriptors.append(create_desc({ - 'name' : 'es_http_current_open', - 'units' : 'sockets', - 'format' : '%d', - 'description': 'HTTP Open (sockets)', - })) - - descriptors.append(create_desc({ - 'name' : 'es_http_total_open', - 'units' : 'sockets', - 'format' : '%d', - 'description': 'HTTP Open (sockets)', - })) - - descriptors.append(create_desc({ - 'name' : 'es_indices_size', - 'units' : 'Bytes', - 'format' : '%.0f', - 'description': 'Index Size (Bytes)', - })) - - descriptors.append(create_desc({ - 'name' : 'es_gc_count', - 'format' : '%d', - 'slope' : 'positive', - 'description': 'Java GC Count', - })) - - descriptors.append(create_desc({ - 'name' : 'es_merges_current', - 'format' : '%d', - 'description': 'Merges (current)', - })) - - descriptors.append(create_desc({ - 'name' : 'es_merges_current_docs', - 'format' : '%d', - 'description': 'Merges (docs)', - })) - - descriptors.append(create_desc({ - 'name' : 'es_merges_total', - 'format' : '%d', - 'slope' : 'positive', - 'description': 'Merges (total)', - })) - - descriptors.append(create_desc({ - 'name' : 'es_merges_total_docs', - 'format' : '%d', - 'slope' : 'positive', - 
'description': 'Merges (total docs)', - })) - - descriptors.append(create_desc({ - 'name' : 'es_merges_current_size', - 'units' : 'Bytes', - 'format' : '%.0f', - 'description': 'Merges size (current)', - })) - - descriptors.append(create_desc({ - 'name' : 'es_merges_total_size', - 'units' : 'Bytes', - 'format' : '%.0f', - 'slope' : 'positive', - 'description': 'Merges size (total)', - })) - - descriptors.append(create_desc({ - 'name' : 'es_merges_time', - 'units' : 'ms', - 'format' : '%d', - 'slope' : 'positive', - 'description': 'Merges Time (ms)', - })) - - descriptors.append(create_desc({ - 'name' : 'es_refresh_total', - 'units' : 'refreshes', - 'format' : '%d', - 'slope' : 'positive', - 'description': 'Total Refresh', - })) - - descriptors.append(create_desc({ - 'name' : 'es_refresh_time', - 'units' : 'ms', - 'format' : '%d', - 'slope' : 'positive', - 'description': 'Total Refresh Time', - })) - - descriptors.append(create_desc({ - 'name' : 'es_docs_count', - 'units' : 'docs', - 'format' : '%.0f', - 'description': 'Number of Documents', - })) - - descriptors.append(create_desc({ - 'name' : 'es_docs_deleted', - 'units' : 'docs', - 'format' : '%.0f', - 'description': 'Number of Documents Deleted', - })) - - descriptors.append(create_desc({ - 'name' : 'es_open_file_descriptors', - 'units' : 'files', - 'format' : '%d', - 'description': 'Open File Descriptors', - })) - - descriptors.append(create_desc({ - 'name' : 'es_cache_field_eviction', - 'units' : 'units', - 'format' : '%d', - 'slope' : 'positive', - 'description': 'Field Cache Evictions', - })) - - descriptors.append(create_desc({ - 'name' : 'es_cache_field_size', - 'units' : 'Bytes', - 'format' : '%.0f', - 'description': 'Field Cache Size', - })) - - descriptors.append(create_desc({ - 'name' : 'es_cache_filter_count', - 'format' : '%d', - 'description': 'Filter Cache Count', - })) - - descriptors.append(create_desc({ - 'name' : 'es_cache_filter_evictions', - 'format' : '%d', - 'slope' : 'positive', - 
'description': 'Filter Cache Evictions', - })) - - descriptors.append(create_desc({ - 'name' : 'es_cache_filter_size', - 'units' : 'Bytes', - 'format' : '%.0f', - 'description': 'Filter Cache Size', - })) - - descriptors.append(create_desc({ - 'name' : 'es_query_current', - 'units' : 'Queries', - 'format' : '%d', - 'description': 'Current Queries', - })) - - descriptors.append(create_desc({ - 'name' : 'es_query_time', - 'units' : 'ms', - 'format' : '%d', - 'slope' : 'positive', - 'description': 'Total Query Time', - })) - - descriptors.append(create_desc({ - 'name' : 'es_fetch_current', - 'units' : 'fetches', - 'format' : '%d', - 'description': 'Current Fetches', - })) - - descriptors.append(create_desc({ - 'name' : 'es_fetch_total', - 'units' : 'fetches', - 'format' : '%d', - 'slope' : 'positive', - 'description': 'Total Fetches', - })) - - descriptors.append(create_desc({ - 'name' : 'es_fetch_time', - 'units' : 'ms', - 'format' : '%d', - 'slope' : 'positive', - 'description': 'Total Fetch Time', - })) - - descriptors.append(create_desc({ - 'name' : 'es_flush_total', - 'units' : 'flushes', - 'format' : '%d', - 'description': 'Total Flushes', - })) - - descriptors.append(create_desc({ - 'name' : 'es_flush_time', - 'units' : 'ms', - 'format' : '%d', - 'slope' : 'positive', - 'description': 'Total Flush Time', - })) - - descriptors.append(create_desc({ - 'name' : 'es_get_exists_time', - 'units' : 'ms', - 'format' : '%d', - 'slope' : 'positive', - 'description': 'Exists Time', - })) - - descriptors.append(create_desc({ - 'name' : 'es_get_exists_total', - 'units' : 'total', - 'format' : '%d', - 'description': 'Exists Total', - })) - - descriptors.append(create_desc({ - 'name' : 'es_get_time', - 'units' : 'ms', - 'format' : '%d', - 'slope' : 'positive', - 'description': 'Get Time', - })) - - descriptors.append(create_desc({ - 'name' : 'es_get_total', - 'units' : 'total', - 'format' : '%d', - 'description': 'Get Total', - })) - descriptors.append(create_desc({ - 'name' : 
'es_get_missing_time', - 'units' : 'ms', - 'format' : '%d', - 'slope' : 'positive', - 'description': 'Missing Time', - })) - - descriptors.append(create_desc({ - 'name' : 'es_get_missing_total', - 'units' : 'total', - 'format' : '%d', - 'description': 'Missing Total', - })) - - descriptors.append(create_desc({ - 'name' : 'es_indexing_delete_time', - 'units' : 'ms', - 'format' : '%d', - 'slope' : 'positive', - 'description': 'Delete Time', - })) - - descriptors.append(create_desc({ - 'name' : 'es_indexing_delete_total', - 'units' : 'docs', - 'format' : '%d', - 'description': 'Delete Total', - })) - - descriptors.append(create_desc({ - 'name' : 'es_indexing_index_time', - 'units' : 'ms', - 'format' : '%d', - 'slope' : 'positive', - 'description': 'Indexing Time', - })) - - descriptors.append(create_desc({ - 'name' : 'es_indexing_index_total', - 'units' : 'docs', - 'format' : '%d', - 'description': 'Indexing Documents Total', - })) - - descriptors.append(create_desc({ - 'name' : 'es_query_total', - 'units' : 'Queries', - 'format' : '%d', - 'slope' : 'positive', - 'description': 'Total Queries', - })) + indices = params.get('indices', '*').split() + for index in indices: + url_indices = '{0}{1}/_stats'.format(host, index) + print('[elasticsearch] Fetching ' + url_indices) + + r_indices = json.load(urllib.urlopen(url_indices)) + descriptors += get_indices_descriptors(index, + Desc_Skel, + r_indices, + url_indices) + + _create_desc = partial(create_desc, Desc_Skel) + + descriptors.append( + _create_desc({ + 'name': 'es_heap_committed', + 'units': 'Bytes', + 'format': '%.0f', + 'description': 'Java Heap Committed (Bytes)', + 'value_type': 'double' + }) + ) + + descriptors.append( + _create_desc({ + 'name': 'es_heap_used', + 'units': 'Bytes', + 'format': '%.0f', + 'description': 'Java Heap Used (Bytes)', + 'value_type': 'double' + }) + ) + + descriptors.append( + _create_desc({ + 'name': 'es_non_heap_committed', + 'units': 'Bytes', + 'format': '%.0f', + 'description': 
'Java Non Heap Committed (Bytes)', + 'value_type': 'double' + }) + ) + + descriptors.append( + _create_desc({ + 'name': 'es_non_heap_used', + 'units': 'Bytes', + 'format': '%.0f', + 'description': 'Java Non Heap Used (Bytes)', + 'value_type': 'double' + }) + ) + + descriptors.append( + _create_desc({ + 'name': 'es_threads', + 'units': 'threads', + 'format': '%d', + 'description': 'Threads (open)', + }) + ) + + descriptors.append( + _create_desc({ + 'name': 'es_threads_peak', + 'units': 'threads', + 'format': '%d', + 'description': 'Threads Peak (open)', + }) + ) + + descriptors.append( + _create_desc({ + 'name': 'es_gc_time', + 'units': 'ms', + 'format': '%d', + 'slope': 'positive', + 'description': 'Java GC Time (ms)' + }) + ) + + descriptors.append( + _create_desc({ + 'name': 'es_transport_open', + 'units': 'sockets', + 'format': '%d', + 'description': 'Transport Open (sockets)', + }) + ) + + descriptors.append( + _create_desc({ + 'name': 'es_transport_rx_count', + 'units': 'rx', + 'format': '%d', + 'slope': 'positive', + 'description': 'RX Count' + }) + ) + + descriptors.append( + _create_desc({ + 'name': 'es_transport_rx_size', + 'units': 'Bytes', + 'format': '%.0f', + 'slope': 'positive', + 'description': 'RX (Bytes)', + 'value_type': 'double', + }) + ) + + descriptors.append( + _create_desc({ + 'name': 'es_transport_tx_count', + 'units': 'tx', + 'format': '%d', + 'slope': 'positive', + 'description': 'TX Count' + }) + ) + + descriptors.append( + _create_desc({ + 'name': 'es_transport_tx_size', + 'units': 'Bytes', + 'format': '%.0f', + 'slope': 'positive', + 'description': 'TX (Bytes)', + }) + ) + + descriptors.append( + _create_desc({ + 'name': 'es_http_current_open', + 'units': 'sockets', + 'format': '%d', + 'description': 'HTTP Open (sockets)', + }) + ) + + descriptors.append( + _create_desc({ + 'name': 'es_http_total_open', + 'units': 'sockets', + 'format': '%d', + 'description': 'HTTP Open (sockets)', + }) + ) + + descriptors.append( + _create_desc({ + 
'name': 'es_indices_size', + 'units': 'Bytes', + 'format': '%.0f', + 'description': 'Index Size (Bytes)', + 'value_type': 'double', + }) + ) + + descriptors.append( + _create_desc({ + 'name': 'es_gc_count', + 'format': '%d', + 'slope': 'positive', + 'description': 'Java GC Count', + }) + ) + + descriptors.append( + _create_desc({ + 'name': 'es_merges_current', + 'format': '%d', + 'description': 'Merges (current)', + }) + ) + + descriptors.append( + _create_desc({ + 'name': 'es_merges_current_docs', + 'format': '%d', + 'description': 'Merges (docs)', + }) + ) + + descriptors.append( + _create_desc({ + 'name': 'es_merges_total', + 'format': '%d', + 'slope': 'positive', + 'description': 'Merges (total)', + }) + ) + + descriptors.append( + _create_desc({ + 'name': 'es_merges_total_docs', + 'format': '%d', + 'slope': 'positive', + 'description': 'Merges (total docs)', + }) + ) + + descriptors.append( + _create_desc({ + 'name': 'es_merges_current_size', + 'units': 'Bytes', + 'format': '%.0f', + 'description': 'Merges size (current)', + }) + ) + + descriptors.append( + _create_desc({ + 'name': 'es_merges_total_size', + 'units': 'Bytes', + 'format': '%.0f', + 'slope': 'positive', + 'description': 'Merges size (total)', + 'value_type': 'double', + }) + ) + + descriptors.append( + _create_desc({ + 'name': 'es_merges_time', + 'units': 'ms', + 'format': '%d', + 'slope': 'positive', + 'description': 'Merges Time (ms)' + }) + ) + + descriptors.append( + _create_desc({ + 'name': 'es_refresh_total', + 'units': 'refreshes', + 'format': '%d', + 'slope': 'positive', + 'description': 'Total Refresh' + }) + ) + + descriptors.append( + _create_desc({ + 'name': 'es_refresh_time', + 'units': 'ms', + 'format': '%d', + 'slope': 'positive', + 'description': 'Total Refresh Time' + }) + ) + + descriptors.append( + _create_desc({ + 'name': 'es_docs_count', + 'units': 'docs', + 'format': '%.0f', + 'description': 'Number of Documents', + 'value_type': 'double' + }) + ) + + descriptors.append( + 
_create_desc({ + 'name': 'es_docs_deleted', + 'units': 'docs', + 'format': '%.0f', + 'description': 'Number of Documents Deleted', + 'value_type': 'double' + }) + ) + + descriptors.append( + _create_desc({ + 'name': 'es_open_file_descriptors', + 'units': 'files', + 'format': '%d', + 'description': 'Open File Descriptors', + }) + ) + + descriptors.append( + _create_desc({ + 'name': 'es_cache_field_eviction', + 'units': 'units', + 'format': '%d', + 'slope': 'positive', + 'description': 'Field Cache Evictions', + }) + ) + + descriptors.append( + _create_desc({ + 'name': 'es_cache_field_size', + 'units': 'Bytes', + 'format': '%.0f', + 'description': 'Field Cache Size', + 'value_type': 'double', + }) + ) + + descriptors.append( + _create_desc({ + 'name': 'es_cache_filter_count', + 'format': '%d', + 'description': 'Filter Cache Count', + }) + ) + + descriptors.append( + _create_desc({ + 'name': 'es_cache_filter_evictions', + 'format': '%d', + 'slope': 'positive', + 'description': 'Filter Cache Evictions', + }) + ) + + descriptors.append( + _create_desc({ + 'name': 'es_cache_filter_size', + 'units': 'Bytes', + 'format': '%.0f', + 'description': 'Filter Cache Size', + 'value_type': 'double' + }) + ) + + descriptors.append( + _create_desc({ + 'name': 'es_query_current', + 'units': 'Queries', + 'format': '%d', + 'description': 'Current Queries', + }) + ) + + descriptors.append( + _create_desc({ + 'name': 'es_query_time', + 'units': 'ms', + 'format': '%d', + 'slope': 'positive', + 'description': 'Total Query Time' + }) + ) + + descriptors.append( + _create_desc({ + 'name': 'es_fetch_current', + 'units': 'fetches', + 'format': '%d', + 'description': 'Current Fetches', + }) + ) + + descriptors.append( + _create_desc({ + 'name': 'es_fetch_total', + 'units': 'fetches', + 'format': '%d', + 'slope': 'positive', + 'description': 'Total Fetches' + }) + ) + + descriptors.append( + _create_desc({ + 'name': 'es_fetch_time', + 'units': 'ms', + 'format': '%d', + 'slope': 'positive', + 
'description': 'Total Fetch Time' + }) + ) + + descriptors.append( + _create_desc({ + 'name': 'es_flush_total', + 'units': 'flushes', + 'format': '%d', + 'description': 'Total Flushes', + }) + ) + + descriptors.append( + _create_desc({ + 'name': 'es_flush_time', + 'units': 'ms', + 'format': '%d', + 'slope': 'positive', + 'description': 'Total Flush Time' + }) + ) + + descriptors.append( + _create_desc({ + 'name': 'es_get_exists_time', + 'units': 'ms', + 'format': '%d', + 'slope': 'positive', + 'description': 'Exists Time' + }) + ) + + descriptors.append( + _create_desc({ + 'name': 'es_get_exists_total', + 'units': 'total', + 'format': '%d', + 'description': 'Exists Total', + }) + ) + + descriptors.append( + _create_desc({ + 'name': 'es_get_time', + 'units': 'ms', + 'format': '%d', + 'slope': 'positive', + 'description': 'Get Time' + }) + ) + + descriptors.append( + _create_desc({ + 'name': 'es_get_total', + 'units': 'total', + 'format': '%d', + 'description': 'Get Total', + }) + ) + + descriptors.append( + _create_desc({ + 'name': 'es_get_missing_time', + 'units': 'ms', + 'format': '%d', + 'slope': 'positive', + 'description': 'Missing Time' + }) + ) + + descriptors.append( + _create_desc({ + 'name': 'es_get_missing_total', + 'units': 'total', + 'format': '%d', + 'description': 'Missing Total', + }) + ) + + descriptors.append( + _create_desc({ + 'name': 'es_indexing_delete_time', + 'units': 'ms', + 'format': '%d', + 'slope': 'positive', + 'description': 'Delete Time' + }) + ) + + descriptors.append( + _create_desc({ + 'name': 'es_indexing_delete_total', + 'units': 'docs', + 'format': '%d', + 'slope': 'positive', + 'description': 'Delete Total' + }) + ) + + descriptors.append( + _create_desc({ + 'name': 'es_indexing_index_time', + 'units': 'ms', + 'format': '%d', + 'slope': 'positive', + 'description': 'Indexing Time' + }) + ) + + descriptors.append( + _create_desc({ + 'name': 'es_indexing_index_total', + 'units': 'docs', + 'format': '%d', + 'slope': 'positive', + 
'description': 'Indexing Documents Total' + }) + ) + + descriptors.append( + _create_desc({ + 'name': 'es_query_total', + 'units': 'Queries', + 'format': '%d', + 'slope': 'positive', + 'description': 'Total Queries' + }) + ) return descriptors - + + def metric_cleanup(): - pass + pass + #This code is for debugging and unit testing if __name__ == '__main__': - metric_init({}) + descriptors = metric_init({}) for d in descriptors: v = d['call_back'](d['name']) print 'value for %s is %s' % (d['name'], str(v)) - diff --git a/fibrechannel/README.mkdn b/fibrechannel/README.mkdn new file mode 100644 index 00000000..90bca5ab --- /dev/null +++ b/fibrechannel/README.mkdn @@ -0,0 +1,31 @@ +Brocade FibreChannel +This is gmond python module that allows SNMP polling of Fibrechannel switches to get interface packet and throughput metrics. + + * It works for Brocade FC switches, and probably for any other SNMP enabled switch. + * It requires pysnmp (available in debian repositorys) + * Handles polling multiple switches from a single gmond. 
+ * Spoofs the switch hostname, so each switch shows up separately in ganglia + +## DEPENDS + * python pysnmp + +## USAGE + * Save the fibrechannel.pyconf into directory and update the switch(s) name & IP's + * Save the fibrechannel.py into your ganglia python module dir eg: /usr/lib/ganglia/python_modules + * Update SNMP community / ports if necessary + * If FC metrics aren't appearing, increase your net.core.rmem_max and default settings as below: + +net.core.rmem_max=104857600 + +net.core.rmem_default=104857600 + +vm.dirty_ratio=100 + +vm.dirty_background_ratio=100 + +vm.dirty_expire_centisecs=720000 + +## AUTHOR + +Author: Evan Fraser <evan.fraser@trademe.co.nz> + diff --git a/fibrechannel/fibrechannel.py b/fibrechannel/fibrechannel.py new file mode 100755 index 00000000..ce192292 --- /dev/null +++ b/fibrechannel/fibrechannel.py @@ -0,0 +1,265 @@ +#!/usr/bin/python +# Name: fibrechannel.py +# Desc: Ganglia module for polling Brocade Fibrechannel switches via snmnp (probably work with any snmp capable device) +# Author: Evan Fraser evan.fraser@trademe.co.nz +# Date: August 2012 +# Copyright: GPL + +import sys +import os +import re +import time +import pprint +from pysnmp.entity.rfc3413.oneliner import cmdgen +NIPARAMS = {} + +NIMETRICS = { + 'time' : 0, + 'data' : {} +} +LAST_NIMETRICS = dict(NIMETRICS) +NIMETRICS_CACHE_MAX = 5 + +descriptors = list() + +oidDict = { + 'ifIndex' : (1,3,6,1,2,1,2,2,1,1), + 'ifDescr' : (1,3,6,1,2,1,2,2,1,2), + 'ifInOctets' : (1,3,6,1,2,1,2,2,1,10), + 'ifInUcastPkts' : (1,3,6,1,2,1,2,2,1,11), + 'ifInErrors' : (1,3,6,1,2,1,2,2,1,14), + 'ifOutOctets' : (1,3,6,1,2,1,2,2,1,16), + 'ifOutUcastPkts' : (1,3,6,1,2,1,2,2,1,17), + 'ifOutErrors' : (1,3,6,1,2,1,2,2,1,20), + } + +def get_metrics(): + """Return all metrics""" + + global NIMETRICS, LAST_NIMETRICS + + # if interval since last check > NIMETRICS_CACHE_MAX get metrics again + if (time.time() - NIMETRICS['time']) > NIMETRICS_CACHE_MAX: + metrics = {} + for para in NIPARAMS.keys(): + if 
para.startswith('switch_'): + ipaddr,name = NIPARAMS[para].split(':') + snmpTable = runSnmp(oidDict,ipaddr) + newmetrics = buildDict(oidDict,snmpTable,name) + metrics = dict(newmetrics, **metrics) + + # update cache + LAST_NIMETRICS = dict(NIMETRICS) + NIMETRICS = { + 'time': time.time(), + 'data': metrics + } + + return [NIMETRICS, LAST_NIMETRICS] + +def get_delta(name): + """Return change over time for the requested metric""" + + # get metrics + [curr_metrics, last_metrics] = get_metrics() + try: + delta = float(curr_metrics['data'][name] - last_metrics['data'][name])/(curr_metrics['time'] - last_metrics['time']) + #print delta + if delta < 0: + print "Less than 0" + delta = 0 + except StandardError: + delta = 0 + + return delta + +# Separate routine to perform SNMP queries and returns table (dict) +def runSnmp(oidDict,ip): + + # cmdgen only takes tuples, oid strings don't work + +# 'ifIndex' : (1,3,6,1,2,1,2,2,1,1), +# 'ifDescr' : (1,3,6,1,2,1,2,2,1,2), +# 'ifInOctets' : (1,3,6,1,2,1,2,2,1,10), +# 'ifInUcastPkts' : (1,3,6,1,2,1,2,2,1,11), +# 'ifInErrors' : (1,3,6,1,2,1,2,2,1,14), +# 'ifOutOctets' : (1,3,6,1,2,1,2,2,1,16), +# 'ifOutUcastPkts' : (1,3,6,1,2,1,2,2,1,17), +# 'ifOutErrors' : (1,3,6,1,2,1,2,2,1,20), + + #Runs the SNMP query, The order that oid's are passed determines the order in the results + errorIndication, errorStatus, errorIndex, varBindTable = cmdgen.CommandGenerator().nextCmd( + # SNMP v2 + cmdgen.CommunityData('test-agent', 'public'), + cmdgen.UdpTransportTarget((ip, 161)), + oidDict['ifIndex'], + oidDict['ifDescr'], + oidDict['ifInOctets'], + oidDict['ifInErrors'], + oidDict['ifInUcastPkts'], + oidDict['ifOutOctets'], + oidDict['ifOutErrors'], + oidDict['ifOutUcastPkts'], + ) + #pprint.pprint(varBindTable) + # Check for SNMP errors + if errorIndication: + print errorIndication + else: + if errorStatus: + print '%s at %s\n' % ( + errorStatus.prettyPrint(), errorIndex and varBindTable[-1][int(errorIndex)-1] or '?' 
+ ) + else: + return(varBindTable) + +def buildDict(oidDict,t,switch): # passed a list of tuples, build's a dict based on the alias name + builtdict = {} + + for line in t: + # if t[t.index(line)][2][1] != '': + string = str(t[t.index(line)][1][1]) # this is the ifDescr + #print string + match = re.search(r'FC port', string) + if match and t[t.index(line)][0][1] != '': + #alias = str(t[t.index(line)][0][1]) + index = str(t[t.index(line)][0][1]) + temp = str(t[t.index(line)][1][1]) #(use ifDescr) + #lowercase the name, change spaces + '/' to '_' + name = ((temp.lower()).replace(' ','_')).replace('/','_') + inoct = str(t[t.index(line)][2][1]) + builtdict[switch+'_'+name+'_bitsin'] = int(inoct) * 8 + outoct = str(t[t.index(line)][5][1]) + builtdict[switch+'_'+name+'_bitsout'] = int(outoct) * 8 + inpkt = str(t[t.index(line)][4][1]) + builtdict[switch+'_'+name+'_pktsin'] = int(inpkt) + outpkt = str(t[t.index(line)][7][1]) + builtdict[switch+'_'+name+'_pktsout'] = int(outpkt) + inerrors = str(t[t.index(line)][3][1]) + builtdict[switch+'_'+name+'_inerrors'] = int(inerrors) + outerrors = str(t[t.index(line)][6][1]) + builtdict[switch+'_'+name+'_outerrors'] = int(outerrors) + + #pprint.pprint(builtdict) + return builtdict + +# define_metrics will run an snmp query on an ipaddr, find interfaces, build descriptors and set spoof_host +# define_metrics is called from metric_init +def define_metrics(Desc_Skel, ipaddr, switch): + snmpTable = runSnmp(oidDict,ipaddr) + aliasdict = buildDict(oidDict,snmpTable,switch) + spoof_string = ipaddr + ':' + switch + #print newdict + #pprint.pprint(aliasdict.keys()) + + for key in aliasdict.keys(): + if "bitsin" in key: + descriptors.append(create_desc(Desc_Skel, { + "name" : key, + "units" : "bits/sec", + "description" : "received bits per sec", + "groups" : "Throughput", + "spoof_host" : spoof_string, + })) + elif "bitsout" in key: + descriptors.append(create_desc(Desc_Skel, { + "name" : key, + "units" : "bits/sec", + "description" : 
"transmitted bits per sec", + "groups" : "Throughput", + "spoof_host" : spoof_string, + })) + elif "pktsin" in key: + descriptors.append(create_desc(Desc_Skel, { + "name" : key, + "units" : "pkts/sec", + "description" : "received packets per sec", + "groups" : "Packets", + "spoof_host" : spoof_string, + })) + elif "pktsout" in key: + descriptors.append(create_desc(Desc_Skel, { + "name" : key, + "units" : "pkts/sec", + "description" : "transmitted packets per sec", + "groups" : "Packets", + "spoof_host" : spoof_string, + })) + elif "inerrors" in key: + descriptors.append(create_desc(Desc_Skel, { + "name" : key, + "units" : "errors", + "description" : "inbound packet errors", + "groups" : "Packets", + "spoof_host" : spoof_string, + })) + elif "outerrors" in key: + descriptors.append(create_desc(Desc_Skel, { + "name" : key, + "units" : "errors", + "description" : "outbound packet errors", + "groups" : "Packets", + "spoof_host" : spoof_string, + })) + + + return descriptors + +def metric_init(params): + global descriptors, Desc_Skel, _Worker_Thread, Debug, newdict + + print '[switch] Received the following parameters' + print params + + #Import the params into the global NIPARAMS + for key in params: + NIPARAMS[key] = params[key] + + Desc_Skel = { + 'name' : 'XXX', + 'call_back' : get_delta, + 'time_max' : 60, + 'value_type' : 'double', + 'format' : '%0f', + 'units' : 'XXX', + 'slope' : 'both', + 'description' : 'XXX', + 'groups' : 'switch', + } + + # Find all the switch's passed in params + for para in params.keys(): + if para.startswith('switch_'): + #Get ipaddr + name of switchs from params + ipaddr,name = params[para].split(':') + # pass skel, ip and name to define_metrics to create descriptors + descriptors = define_metrics(Desc_Skel, ipaddr, name) + #Return the descriptors back to gmond + return descriptors + +def create_desc(skel, prop): + d = skel.copy() + for k,v in prop.iteritems(): + d[k] = v + return d + + +def metric_cleanup(): + '''Clean up the metric 
module.''' + pass + +# For CLI Debuging: +if __name__ == '__main__': + params = { + 'switch_1' : '192.168.1.1:switch1', + #'switch_2' : '192.168.1.2:switch2', + } + descriptors = metric_init(params) + print len(descriptors) + while True: + for d in descriptors: + v = d['call_back'](d['name']) + print 'value for %s is %u' % (d['name'], v) + print 'Sleeping 5 seconds' + time.sleep(5) +#exit(0) diff --git a/fibrechannel/fibrechannel.pyconf b/fibrechannel/fibrechannel.pyconf new file mode 100644 index 00000000..9a0990a4 --- /dev/null +++ b/fibrechannel/fibrechannel.pyconf @@ -0,0 +1,25 @@ +modules { + module { + name = "fibrechannel" + language = "python" + param switch_1 { + # ip:hostname + value = '192.168.1.1:switch1' + } + #param switch_2 { + # value = '192.168.1.2:switch2' + #} + } +} +#/* Collection groups for the +# example python module */ +collection_group { + collect_every = 20 + time_threshold = 50 + metric { + name_match = "(.+)in" + } + metric { + name_match = "(.+)out" + } + } diff --git a/gpu/nvidia/README b/gpu/nvidia/README index 8f5d1cf6..6aaaef93 100644 --- a/gpu/nvidia/README +++ b/gpu/nvidia/README @@ -3,7 +3,9 @@ NVIDIA GPU monitoring plugin for gmond Installation instructions: * First install the Python Bindings for the NVIDIA Management Library: - http://pypi.python.org/pypi/nvidia-ml-py/ + $ cd nvidia-ml-py-* + $ sudo python setup.py install + For the latest bindings see: http://pypi.python.org/pypi/nvidia-ml-py/ You can do a site install or place it in {libdir}/ganglia/python_modules * Copy python_modules/nvidia.py to {libdir}/ganglia/python_modules * Copy conf.d/nvidia.pyconf to /etc/ganglia/conf.d @@ -37,3 +39,14 @@ The following metrics have been implemented: * gpu_power_usage * gpu_perf_state * gpu_ecc_mode + +Version 2: + +The following metrics have been implemented: +* gpu_max_graphics_speed +* gpu_max_sm_speed +* gpu_max_mem_speed +* gpu_serial +* gpu_power_man_mode +* gpu_power_man_limit + diff --git a/gpu/nvidia/conf.d/nvidia.pyconf 
b/gpu/nvidia/conf.d/nvidia.pyconf index 5c3355b2..19022862 100644 --- a/gpu/nvidia/conf.d/nvidia.pyconf +++ b/gpu/nvidia/conf.d/nvidia.pyconf @@ -78,6 +78,11 @@ collection_group { title= "\\1 Performance State" value_threshold = 1.0 } +} + +collection_group { + collect_every = 600 + time_threshold = 1200 metric { name_match = "([\\S]+)_ecc_mode" @@ -85,6 +90,21 @@ collection_group { title= "\\1 ECC Mode" value_threshold = 1.0 } + + metric { + name_match = "([\\S]+)_power_man_mode" + name = "\\1_power_man_mode" + title= "\\1 Power Management Mode" + value_threshold = 1.0 + } + + metric { + name_match = "([\\S]+)_power_man_limit" + name = "\\1_power_man_limit" + title= "\\1 Power Management Limit" + value_threshold = 1.0 + } + } collection_group { @@ -128,21 +148,28 @@ collection_group { metric { name_match = "([\\S]+)_max_graphics_speed" name = "\\1_max_graphics_speed" - title = "\\1 Max Graphics Speed" + title = "\\1 Max Graphics Clock Speed" value_threshold = 1.0 } metric { name_match = "([\\S]+)_max_sm_speed" name = "\\1_max_sm_speed" - title = "\\1 Max SM Speed" + title = "\\1 Max SM Clock Speed" value_threshold = 1.0 } metric { name_match = "([\\S]+)_max_mem_speed" name = "\\1_max_mem_speed" - title = "\\1 Max Memory Speed" + title = "\\1 Max Memory Clock Speed" value_threshold = 1.0 } + + metric { + name_match = "([\\S]+)_serial" + name = "\\1_serial" + title = "\\1 Board Serial Number" + } } + diff --git a/gpu/nvidia/nvidia-ml-py-3.295.00/PKG-INFO b/gpu/nvidia/nvidia-ml-py-3.295.00/PKG-INFO new file mode 100644 index 00000000..3c0212b9 --- /dev/null +++ b/gpu/nvidia/nvidia-ml-py-3.295.00/PKG-INFO @@ -0,0 +1,20 @@ +Metadata-Version: 1.0 +Name: nvidia-ml-py +Version: 3.295.00 +Summary: Python Bindings for the NVIDIA Management Library +Home-page: http://www.nvidia.com/ +Author: NVIDIA Corporation +Author-email: nvml-bindings@nvidia.com +License: BSD +Description: UNKNOWN +Platform: UNKNOWN +Classifier: Development Status :: 5 - Production/Stable +Classifier: 
Intended Audience :: Developers +Classifier: Intended Audience :: System Administrators +Classifier: License :: OSI Approved :: BSD License +Classifier: Operating System :: Microsoft :: Windows +Classifier: Operating System :: POSIX :: Linux +Classifier: Programming Language :: Python +Classifier: Topic :: Software Development :: Libraries :: Python Modules +Classifier: Topic :: System :: Hardware +Classifier: Topic :: System :: Systems Administration diff --git a/gpu/nvidia/nvidia-ml-py-3.295.00/README.txt b/gpu/nvidia/nvidia-ml-py-3.295.00/README.txt new file mode 100644 index 00000000..4cfec876 --- /dev/null +++ b/gpu/nvidia/nvidia-ml-py-3.295.00/README.txt @@ -0,0 +1,139 @@ +====== +pyNVML +====== + +------------------------------------------------ +Python bindings to the NVIDIA Management Library +------------------------------------------------ + +Provides a Python interface to GPU management and monitoring functions. + +This is a wrapper around the NVML library. +For information about the NVML library, see the NVML developer page +http://developer.nvidia.com/nvidia-management-library-nvml + +Download the latest package from: +http://pypi.python.org/pypi/nvidia-ml-py/ + +Note this file can be run with 'python -m doctest -v README.txt' +although the results are system dependent + +REQUIRES +-------- +Python 2.5, or an earlier version with the ctypes module. + +INSTALLATION +------------ +sudo python setup.py install + +USAGE +----- + + >>> from pynvml import * + >>> nvmlInit() + >>> print "Driver Version:", nvmlSystemGetDriverVersion() + Driver Version: 295.00 + >>> deviceCount = nvmlDeviceGetCount() + >>> for i in range(deviceCount): + ... handle = nvmlDeviceGetHandleByIndex(i) + ... print "Device", i, ":", nvmlDeviceGetName(handle) + ... + Device 0 : Tesla C2070 + + >>> nvmlShutdown() + +Additionally, see nvidia_smi.py. A sample application. + +FUNCTIONS +--------- +Python methods wrap NVML functions, implemented in a C shared library. 
+Each function's use is the same with the following exceptions: + +- Instead of returning error codes, failing error codes are raised as + Python exceptions. + + >>> try: + ... nvmlDeviceGetCount() + ... except NVMLError as error: + ... print error + ... + Uninitialized + +- C function output parameters are returned from the corresponding + Python function left to right. + +:: + + nvmlReturn_t nvmlDeviceGetEccMode(nvmlDevice_t device, + nvmlEnableState_t *current, + nvmlEnableState_t *pending); + + >>> nvmlInit() + >>> handle = nvmlDeviceGetHandleByIndex(0) + >>> (current, pending) = nvmlDeviceGetEccMode(handle) + +- C structs are converted into Python classes. + +:: + + nvmlReturn_t DECLDIR nvmlDeviceGetMemoryInfo(nvmlDevice_t device, + nvmlMemory_t *memory); + typedef struct nvmlMemory_st { + unsigned long long total; + unsigned long long free; + unsigned long long used; + } nvmlMemory_t; + + >>> info = nvmlDeviceGetMemoryInfo(handle) + >>> print "Total memory:", info.total + Total memory: 5636292608 + >>> print "Free memory:", info.free + Free memory: 5578420224 + >>> print "Used memory:", info.used + Used memory: 57872384 + +- Python handles string buffer creation. + +:: + + nvmlReturn_t nvmlSystemGetDriverVersion(char* version, + unsigned int length); + + >>> version = nvmlSystemGetDriverVersion(); + >>> nvmlShutdown() + +For usage information see the NVML documentation. + +VARIABLES +--------- +All meaningful NVML constants and enums are exposed in Python. + +The NVML_VALUE_NOT_AVAILABLE constant is not used. Instead None is mapped to the field. + +RELEASE NOTES +------------- +Version 2.285.0 +- Added new functions for NVML 2.285. See NVML documentation for more information. +- Ported to support Python 3.0 and Python 2.0 syntax. +- Added nvidia_smi.py tool as a sample app. +Version 3.295.0 +- Added new functions for NVML 3.295. See NVML documentation for more information. 
+- Updated nvidia_smi.py tool + - Includes additional error handling + +COPYRIGHT +--------- +Copyright (c) 2011-2012, NVIDIA Corporation. All rights reserved. + +LICENSE +------- +Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: + +- Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. + +- Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. + +- Neither the name of the NVIDIA Corporation nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + diff --git a/gpu/nvidia/nvidia-ml-py-3.295.00/build/lib/nvidia_smi.py b/gpu/nvidia/nvidia-ml-py-3.295.00/build/lib/nvidia_smi.py new file mode 100644 index 00000000..f1a42707 --- /dev/null +++ b/gpu/nvidia/nvidia-ml-py-3.295.00/build/lib/nvidia_smi.py @@ -0,0 +1,455 @@ +##### +# Copyright (c) 2011-2012, NVIDIA Corporation. All rights reserved. 
+# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# * Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of the NVIDIA Corporation nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +# THE POSSIBILITY OF SUCH DAMAGE. +##### + +# +# nvidia_smi +# nvml_bindings nvidia com +# +# Sample code that attempts to reproduce the output of nvidia-smi -q- x +# For many cases the output should match +# +# To Run: +# $ python +# Python 2.7 (r27:82500, Sep 16 2010, 18:02:00) +# [GCC 4.5.1 20100907 (Red Hat 4.5.1-3)] on linux2 +# Type "help", "copyright", "credits" or "license" for more information. +# >>> import nvidia_smi +# >>> print(nvidia_smi.XmlDeviceQuery()) +# ... 
+# + +from pynvml import * +import datetime + +# +# Helper functions +# +def GetEccByType(handle, counterType, bitType): + try: + count = str(nvmlDeviceGetTotalEccErrors(handle, bitType, counterType)) + except NVMLError as err: + count = handleError(err) + + try: + detail = nvmlDeviceGetDetailedEccErrors(handle, bitType, counterType) + deviceMemory = str(detail.deviceMemory) + registerFile = str(detail.registerFile) + l1Cache = str(detail.l1Cache) + l2Cache = str(detail.l2Cache) + except NVMLError as err: + msg = handleError(err) + deviceMemory = msg + registerFile = msg + l1Cache = msg + l2Cache = msg + strResult = '' + strResult += ' ' + deviceMemory + '\n' + strResult += ' ' + registerFile + '\n' + strResult += ' ' + l1Cache + '\n' + strResult += ' ' + l2Cache + '\n' + strResult += ' ' + count + '\n' + return strResult + +def GetEccByCounter(handle, counterType): + strResult = '' + strResult += ' \n' + strResult += str(GetEccByType(handle, counterType, NVML_SINGLE_BIT_ECC)) + strResult += ' \n' + strResult += ' \n' + strResult += str(GetEccByType(handle, counterType, NVML_DOUBLE_BIT_ECC)) + strResult += ' \n' + return strResult + +def GetEccStr(handle): + strResult = '' + strResult += ' \n' + strResult += str(GetEccByCounter(handle, NVML_VOLATILE_ECC)) + strResult += ' \n' + strResult += ' \n' + strResult += str(GetEccByCounter(handle, NVML_AGGREGATE_ECC)) + strResult += ' \n' + return strResult + +# +# Converts errors into string messages +# +def handleError(err): + if (err.value == NVML_ERROR_NOT_SUPPORTED): + return "N/A" + else: + return err.__str__() + +####### +def XmlDeviceQuery(): + + try: + # + # Initialize NVML + # + nvmlInit() + strResult = '' + + strResult += '\n' + strResult += '\n' + strResult += '\n' + + strResult += ' ' + str(datetime.date.today()) + '\n' + strResult += ' ' + str(nvmlSystemGetDriverVersion()) + '\n' + + deviceCount = nvmlDeviceGetCount() + strResult += ' ' + str(deviceCount) + '\n' + + for i in range(0, deviceCount): + handle = 
nvmlDeviceGetHandleByIndex(i) + + pciInfo = nvmlDeviceGetPciInfo(handle) + + strResult += ' \n' % pciInfo.busId + + strResult += ' ' + nvmlDeviceGetName(handle) + '\n' + + try: + state = ('Enabled' if (nvmlDeviceGetDisplayMode(handle) != 0) else 'Disabled') + except NVMLError as err: + state = handleError(err) + + strResult += ' ' + state + '\n' + + try: + mode = 'Enabled' if (nvmlDeviceGetPersistenceMode(handle) != 0) else 'Disabled' + except NVMLError as err: + mode = handleError(err) + + strResult += ' ' + mode + '\n' + + strResult += ' \n' + + try: + current = str(nvmlDeviceGetCurrentDriverModel(handle)) + except NVMLError as err: + current = handleError(err) + strResult += ' ' + current + '\n' + + try: + pending = str(nvmlDeviceGetPendingDriverModel(handle)) + except NVMLError as err: + pending = handleError(err) + + strResult += ' ' + pending + '\n' + + strResult += ' \n' + + try: + serial = nvmlDeviceGetSerial(handle) + except NVMLError as err: + serial = handleError(err) + + strResult += ' ' + serial + '\n' + + try: + uuid = nvmlDeviceGetUUID(handle) + except NVMLError as err: + uuid = handleError(err) + + strResult += ' ' + uuid + '\n' + + try: + vbios = nvmlDeviceGetVbiosVersion(handle) + except NVMLError as err: + vbios = handleError(err) + + strResult += ' ' + vbios + '\n' + + strResult += ' \n' + + try: + oem = nvmlDeviceGetInforomVersion(handle, NVML_INFOROM_OEM) + if oem == '': + oem = 'N/A' + except NVMLError as err: + oem = handleError(err) + + strResult += ' ' + oem + '\n' + + try: + ecc = nvmlDeviceGetInforomVersion(handle, NVML_INFOROM_ECC) + if ecc == '': + ecc = 'N/A' + except NVMLError as err: + ecc = handleError(err) + + strResult += ' ' + ecc + '\n' + try: + pwr = nvmlDeviceGetInforomVersion(handle, NVML_INFOROM_POWER) + if pwr == '': + pwr = 'N/A' + except NVMLError as err: + pwr = handleError(err) + + strResult += ' ' + pwr + '\n' + strResult += ' \n' + + strResult += ' \n' + strResult += ' %02X\n' % pciInfo.bus + strResult += ' %02X\n' % 
pciInfo.device + strResult += ' %04X\n' % pciInfo.domain + strResult += ' %08X\n' % (pciInfo.pciDeviceId) + strResult += ' %08X\n' % (pciInfo.pciSubSystemId) + strResult += ' ' + str(pciInfo.busId) + '\n' + strResult += ' \n' + + + strResult += ' \n' + + try: + gen = str(nvmlDeviceGetMaxPcieLinkGeneration(handle)) + except NVMLError as err: + gen = handleError(err) + + strResult += ' ' + gen + '\n' + + try: + gen = str(nvmlDeviceGetCurrPcieLinkGeneration(handle)) + except NVMLError as err: + gen = handleError(err) + + strResult += ' ' + gen + '\n' + strResult += ' \n' + strResult += ' \n' + + try: + width = str(nvmlDeviceGetMaxPcieLinkWidth(handle)) + 'x' + except NVMLError as err: + width = handleError(err) + + strResult += ' ' + width + '\n' + + try: + width = str(nvmlDeviceGetCurrPcieLinkWidth(handle)) + 'x' + except NVMLError as err: + width = handleError(err) + + strResult += ' ' + width + '\n' + + strResult += ' \n' + strResult += ' \n' + strResult += ' \n' + + try: + fan = str(nvmlDeviceGetFanSpeed(handle)) + ' %' + except NVMLError as err: + fan = handleError(err) + strResult += ' ' + fan + '\n' + + try: + memInfo = nvmlDeviceGetMemoryInfo(handle) + mem_total = str(memInfo.total / 1024 / 1024) + ' MB' + mem_used = str(memInfo.used / 1024 / 1024) + ' MB' + mem_free = str(memInfo.free / 1024 / 1024) + ' MB' + except NVMLError as err: + error = handleError(err) + mem_total = error + mem_used = error + mem_free = error + + strResult += ' \n' + strResult += ' ' + mem_total + '\n' + strResult += ' ' + mem_used + '\n' + strResult += ' ' + mem_free + '\n' + strResult += ' \n' + + + try: + mode = nvmlDeviceGetComputeMode(handle) + if mode == NVML_COMPUTEMODE_DEFAULT: + modeStr = 'Default' + elif mode == NVML_COMPUTEMODE_EXCLUSIVE_THREAD: + modeStr = 'Exclusive Thread' + elif mode == NVML_COMPUTEMODE_PROHIBITED: + modeStr = 'Prohibited' + elif mode == NVML_COMPUTEMODE_EXCLUSIVE_PROCESS: + modeStr = 'Exclusive Process' + else: + modeStr = 'Unknown' + except NVMLError 
as err: + modeStr = handleError(err) + + strResult += ' ' + modeStr + '\n' + + try: + util = nvmlDeviceGetUtilizationRates(handle) + gpu_util = str(util.gpu) + mem_util = str(util.memory) + except NVMLError as err: + error = handleError(err) + gpu_util = error + mem_util = error + + strResult += ' \n' + strResult += ' ' + gpu_util + ' %\n' + strResult += ' ' + mem_util + ' %\n' + strResult += ' \n' + + try: + (current, pending) = nvmlDeviceGetEccMode(handle) + curr_str = 'Enabled' if (current != 0) else 'Disabled' + pend_str = 'Enabled' if (pending != 0) else 'Disabled' + except NVMLError as err: + error = handleError(err) + curr_str = error + pend_str = error + + strResult += ' \n' + strResult += ' ' + curr_str + '\n' + strResult += ' ' + pend_str + '\n' + strResult += ' \n' + + strResult += ' \n' + strResult += GetEccStr(handle) + strResult += ' \n' + + try: + temp = str(nvmlDeviceGetTemperature(handle, NVML_TEMPERATURE_GPU)) + ' C' + except NVMLError as err: + temp = handleError(err) + + strResult += ' \n' + strResult += ' ' + temp + '\n' + strResult += ' \n' + + strResult += ' \n' + try: + perfState = nvmlDeviceGetPowerState(handle) + except NVMLError as err: + perfState = handleError(err) + strResult += ' P%s\n' % perfState + try: + powMan = nvmlDeviceGetPowerManagementMode(handle) + powManStr = 'Supported' if powMan != 0 else 'N/A' + except NVMLError as err: + powManStr = handleError(err) + strResult += ' ' + powManStr + '\n' + try: + powDraw = (nvmlDeviceGetPowerUsage(handle) / 1000.0) + powDrawStr = '%.2f W' % powDraw + except NVMLError as err: + powDrawStr = handleError(err) + strResult += ' ' + powDrawStr + '\n' + try: + powLimit = (nvmlDeviceGetPowerManagementLimit(handle) / 1000.0) + powLimitStr = '%d W' % powLimit + except NVMLError as err: + powLimitStr = handleError(err) + strResult += ' ' + powLimitStr + '\n' + strResult += ' \n' + + strResult += ' \n' + try: + graphics = str(nvmlDeviceGetClockInfo(handle, NVML_CLOCK_GRAPHICS)) + except NVMLError as 
err: + graphics = handleError(err) + strResult += ' ' +graphics + ' MHz\n' + try: + sm = str(nvmlDeviceGetClockInfo(handle, NVML_CLOCK_SM)) + except NVMLError as err: + sm = handleError(err) + strResult += ' ' + sm + ' MHz\n' + try: + mem = str(nvmlDeviceGetClockInfo(handle, NVML_CLOCK_MEM)) + except NVMLError as err: + mem = handleError(err) + strResult += ' ' + mem + ' MHz\n' + strResult += ' \n' + + strResult += ' \n' + try: + graphics = str(nvmlDeviceGetMaxClockInfo(handle, NVML_CLOCK_GRAPHICS)) + except NVMLError as err: + graphics = handleError(err) + strResult += ' ' + graphics + ' MHz\n' + try: + sm = str(nvmlDeviceGetMaxClockInfo(handle, NVML_CLOCK_SM)) + except NVMLError as err: + sm = handleError(err) + strResult += ' ' + sm + ' MHz\n' + try: + mem = str(nvmlDeviceGetMaxClockInfo(handle, NVML_CLOCK_MEM)) + except NVMLError as err: + mem = handleError(err) + strResult += ' ' + mem + ' MHz\n' + strResult += ' \n' + + try: + perfState = nvmlDeviceGetPowerState(handle) + perfStateStr = 'P%s' % perfState + except NVMLError as err: + perfStateStr = handleError(err) + strResult += ' ' + perfStateStr + '\n' + + strResult += ' \n' + + procstr = "" + try: + procs = nvmlDeviceGetComputeRunningProcesses(handle) + except NVMLError as err: + procs = [] + procstr = handleError(err) + + for p in procs: + procstr += ' \n' + procstr += ' %d\n' % p.pid + try: + name = str(nvmlSystemGetProcessName(p.pid)) + except NVMLError as err: + if (err.value == NVML_ERROR_NOT_FOUND): + # probably went away + continue + else: + name = handleError(err) + procstr += ' ' + name + '\n' + procstr += ' \n' + if (p.usedGpuMemory == None): + procstr += 'N\A' + else: + procstr += '%d MB\n' % (p.usedGpuMemory / 1024 / 1024) + procstr += '\n' + procstr += ' \n' + + strResult += procstr + strResult += ' \n' + strResult += ' \n' + + strResult += '\n' + + except NVMLError as err: + strResult += 'nvidia_smi.py: ' + err.__str__() + '\n' + + nvmlShutdown() + + return strResult + diff --git 
a/gpu/nvidia/nvidia-ml-py-3.295.00/build/lib/pynvml.py b/gpu/nvidia/nvidia-ml-py-3.295.00/build/lib/pynvml.py new file mode 100644 index 00000000..90f8bdd8 --- /dev/null +++ b/gpu/nvidia/nvidia-ml-py-3.295.00/build/lib/pynvml.py @@ -0,0 +1,903 @@ +##### +# Copyright (c) 2011-2012, NVIDIA Corporation. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# * Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of the NVIDIA Corporation nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +# THE POSSIBILITY OF SUCH DAMAGE. 
+##### + +## +# Python bindings for the NVML library +## +from ctypes import * +from ctypes.util import find_library +import sys +import threading + +## C Type mappings ## +## Enums +_nvmlEnableState_t = c_uint +NVML_FEATURE_DISABLED = 0 +NVML_FEATURE_ENABLED = 1 + +_nvmlTemperatureSensors_t = c_uint +NVML_TEMPERATURE_GPU = 0 + +_nvmlComputeMode_t = c_uint +NVML_COMPUTEMODE_DEFAULT = 0 +NVML_COMPUTEMODE_EXCLUSIVE_THREAD = 1 +NVML_COMPUTEMODE_PROHIBITED = 2 +NVML_COMPUTEMODE_EXCLUSIVE_PROCESS = 3 + +_nvmlEccBitType_t = c_uint +NVML_SINGLE_BIT_ECC = 0 +NVML_DOUBLE_BIT_ECC = 1 + +_nvmlEccCounterType_t = c_uint +NVML_VOLATILE_ECC = 0 +NVML_AGGREGATE_ECC = 1 + +_nvmlClockType_t = c_uint +NVML_CLOCK_GRAPHICS = 0 +NVML_CLOCK_SM = 1 +NVML_CLOCK_MEM = 2 + +_nvmlDriverModel_t = c_uint +NVML_DRIVER_WDDM = 0 +NVML_DRIVER_WDM = 1 + +_nvmlPstates_t = c_uint +NVML_PSTATE_0 = 0 +NVML_PSTATE_1 = 1 +NVML_PSTATE_2 = 2 +NVML_PSTATE_3 = 3 +NVML_PSTATE_4 = 4 +NVML_PSTATE_5 = 5 +NVML_PSTATE_6 = 6 +NVML_PSTATE_7 = 7 +NVML_PSTATE_8 = 8 +NVML_PSTATE_9 = 9 +NVML_PSTATE_10 = 10 +NVML_PSTATE_11 = 11 +NVML_PSTATE_12 = 12 +NVML_PSTATE_13 = 13 +NVML_PSTATE_14 = 14 +NVML_PSTATE_15 = 15 +NVML_PSTATE_UNKNOWN = 32 + +_nvmlInforomObject_t = c_uint +NVML_INFOROM_OEM = 0 +NVML_INFOROM_ECC = 1 +NVML_INFOROM_POWER = 2 + +_nvmlReturn_t = c_uint +NVML_SUCCESS = 0 +NVML_ERROR_UNINITIALIZED = 1 +NVML_ERROR_INVALID_ARGUMENT = 2 +NVML_ERROR_NOT_SUPPORTED = 3 +NVML_ERROR_NO_PERMISSION = 4 +NVML_ERROR_ALREADY_INITIALIZED = 5 +NVML_ERROR_NOT_FOUND = 6 +NVML_ERROR_INSUFFICIENT_SIZE = 7 +NVML_ERROR_INSUFFICIENT_POWER = 8 +NVML_ERROR_DRIVER_NOT_LOADED = 9 +NVML_ERROR_TIMEOUT = 10, +NVML_ERROR_UNKNOWN = 999 + +_nvmlFanState_t = c_uint +NVML_FAN_NORMAL = 0 +NVML_FAN_FAILED = 1 + +_nvmlLedColor_t = c_uint +NVML_LED_COLOR_GREEN = 0 +NVML_LED_COLOR_AMBER = 1 + +# C preprocessor defined values +nvmlFlagDefault = 0 +nvmlFlagForce = 1 + +# buffer size +NVML_DEVICE_INFOROM_VERSION_BUFFER_SIZE = 16 
+NVML_DEVICE_UUID_BUFFER_SIZE = 80 +NVML_SYSTEM_DRIVER_VERSION_BUFFER_SIZE = 81 +NVML_SYSTEM_NVML_VERSION_BUFFER_SIZE = 80 +NVML_DEVICE_NAME_BUFFER_SIZE = 64 +NVML_DEVICE_SERIAL_BUFFER_SIZE = 30 +NVML_DEVICE_VBIOS_VERSION_BUFFER_SIZE = 32 + +NVML_VALUE_NOT_AVAILABLE_ulonglong = c_ulonglong(-1) + +## Lib loading ## +nvmlLib = None +libLoadLock = threading.Lock() + +## Error Checking ## +class NVMLError(Exception): + def __init__(self, value): + self.value = value + def __str__(self): + return str(nvmlErrorString(self.value)) + +def _nvmlCheckReturn(ret): + if (ret != NVML_SUCCESS): + raise NVMLError(ret) + return ret + +## Function access ## +def _nvmlGetFunctionPointer(name): + global nvmlLib + global libLoadLock + + libLoadLock.acquire() + try: + # ensure library was loaded + if (nvmlLib == None): + raise NVMLError(NVML_ERROR_UNINITIALIZED) + try: + return getattr(nvmlLib, name) + except AttributeError as attrError: + raise NVMLError(NVML_ERROR_NOT_SUPPORTED) + finally: + # lock is always freed + libLoadLock.release() + +## Alternative object +# Allows the object to be printed +# Allows mismatched types to be assigned +# - like None when the Structure variant requires c_uint +class nvmlFriendlyObject(object): + def __init__(self, dictionary): + for x in dictionary: + setattr(self, x, dictionary[x]) + def __str__(self): + return self.__dict__.__str__() + +def nvmlStructToFriendlyObject(struct): + d = {} + for x in struct._fields_: + key = x[0] + value = getattr(struct, key) + d[key] = value + obj = nvmlFriendlyObject(d) + return obj + +# pack the object so it can be passed to the NVML library +def nvmlFriendlyObjectToStruct(obj, model): + for x in model._fields_: + key = x[0] + value = obj.__dict__[key] + setattr(model, key, value) + return model + +## Unit structures +class struct_c_nvmlUnit_t(Structure): + pass # opaque handle +c_nvmlUnit_t = POINTER(struct_c_nvmlUnit_t) + +class c_nvmlUnitInfo_t(Structure): + _fields_ = [ + ('name', c_char * 96), + ('id', c_char 
* 96), + ('serial', c_char * 96), + ('firmwareVersion', c_char * 96), + ] + +class c_nvmlLedState_t(Structure): + _fields_ = [ + ('cause', c_char * 256), + ('color', _nvmlLedColor_t), + ] + +class c_nvmlPSUInfo_t(Structure): + _fields_ = [ + ('state', c_char * 256), + ('current', c_uint), + ('voltage', c_uint), + ('power', c_uint), + ] + +class c_nvmlUnitFanInfo_t(Structure): + _fields_ = [ + ('speed', c_uint), + ('state', _nvmlFanState_t), + ] + +class c_nvmlUnitFanSpeeds_t(Structure): + _fields_ = [ + ('fans', c_nvmlUnitFanInfo_t * 24), + ('count', c_uint) + ] + +## Device structures +class struct_c_nvmlDevice_t(Structure): + pass # opaque handle +c_nvmlDevice_t = POINTER(struct_c_nvmlDevice_t) + +class nvmlPciInfo_t(Structure): + _fields_ = [ + ('busId', c_char * 16), + ('domain', c_uint), + ('bus', c_uint), + ('device', c_uint), + ('pciDeviceId', c_uint), + + # Added in 2.285 + ('pciSubSystemId', c_uint), + ('reserved0', c_uint), + ('reserved1', c_uint), + ('reserved2', c_uint), + ('reserved3', c_uint), + ] + +class c_nvmlMemory_t(Structure): + _fields_ = [ + ('total', c_ulonglong), + ('free', c_ulonglong), + ('used', c_ulonglong), + ] + +# On Windows with the WDDM driver, usedGpuMemory is reported as None +# Code that processes this structure should check for None, I.E. 
#
# if (info.usedGpuMemory == None):
#     # TODO handle the error
#     pass
# else:
#     print("Using %d MB of memory" % (info.usedGpuMemory / 1024 / 1024))
#
# See NVML documentation for more information
class c_nvmlProcessInfo_t(Structure):
    _fields_ = [
        ('pid', c_uint),
        ('usedGpuMemory', c_ulonglong),
    ]

class c_nvmlEccErrorCounts_t(Structure):
    _fields_ = [
        ('l1Cache', c_ulonglong),
        ('l2Cache', c_ulonglong),
        ('deviceMemory', c_ulonglong),
        ('registerFile', c_ulonglong),
    ]

class c_nvmlUtilization_t(Structure):
    _fields_ = [
        ('gpu', c_uint),
        ('memory', c_uint),
    ]

# Added in 2.285
class c_nvmlHwbcEntry_t(Structure):
    _fields_ = [
        ('hwbcId', c_uint),
        ('firmwareVersion', c_char * 32),
    ]

## Event structures
class struct_c_nvmlEventSet_t(Structure):
    pass # opaque handle
c_nvmlEventSet_t = POINTER(struct_c_nvmlEventSet_t)

nvmlEventTypeSingleBitEccError = 0x0000000000000001
nvmlEventTypeDoubleBitEccError = 0x0000000000000002
nvmlEventTypePState            = 0x0000000000000004
nvmlEventTypeXidCriticalError  = 0x0000000000000008
nvmlEventTypeNone              = 0x0000000000000000
nvmlEventTypeAll = (
        nvmlEventTypeNone |
        nvmlEventTypeSingleBitEccError |
        nvmlEventTypeDoubleBitEccError |
        nvmlEventTypePState |
        nvmlEventTypeXidCriticalError
        )

class c_nvmlEventData_t(Structure):
    _fields_ = [
        ('device', c_nvmlDevice_t),
        ('eventType', c_ulonglong),
        ('reserved', c_ulonglong)
    ]

## C function wrappers ##
#
# Each wrapper below follows the same pattern: resolve the native symbol via
# _nvmlGetFunctionPointer, call it with ctypes arguments, funnel the return
# code through _nvmlCheckReturn (which raises NVMLError on anything other
# than NVML_SUCCESS), and return the output value(s).
#
def nvmlInit():
    """Load the NVML shared library if needed and initialize NVML.

    Thread-safe: library loading is serialized with libLoadLock.
    Raises NVMLError(NVML_ERROR_DRIVER_NOT_LOADED) when the library
    cannot be loaded.
    """
    global nvmlLib
    global libLoadLock

    #
    # Load the library if it isn't loaded already
    #
    if (nvmlLib == None):
        # lock to ensure only one caller loads the library
        libLoadLock.acquire()

        try:
            # ensure the library still isn't loaded
            if (nvmlLib == None):
                try:
                    if (sys.platform[:3] == "win"):
                        # cdecl calling convention
                        nvmlLib = cdll.nvml
                    else:
                        # assume linux
                        nvmlLib = CDLL("libnvidia-ml.so")
                except OSError as ose:
                    print(ose)
                    _nvmlCheckReturn(NVML_ERROR_DRIVER_NOT_LOADED)
                if (nvmlLib == None):
                    print("Failed to load NVML")
                    _nvmlCheckReturn(NVML_ERROR_DRIVER_NOT_LOADED)
        finally:
            # lock is always freed
            libLoadLock.release()

    #
    # Initialize the library
    #
    fn = _nvmlGetFunctionPointer("nvmlInit")
    ret = fn()
    _nvmlCheckReturn(ret)
    return None

def nvmlShutdown():
    #
    # Leave the library loaded, but shutdown the interface
    #
    fn = _nvmlGetFunctionPointer("nvmlShutdown")
    ret = fn()
    _nvmlCheckReturn(ret)
    return None

# Added in 2.285
def nvmlErrorString(result):
    """Return the driver's human-readable string for an nvmlReturn_t code."""
    fn = _nvmlGetFunctionPointer("nvmlErrorString")
    fn.restype = c_char_p # otherwise return is an int
    ret = fn(result)
    return ret

# Added in 2.285
def nvmlSystemGetNVMLVersion():
    c_version = create_string_buffer(NVML_SYSTEM_NVML_VERSION_BUFFER_SIZE)
    fn = _nvmlGetFunctionPointer("nvmlSystemGetNVMLVersion")
    ret = fn(c_version, c_uint(NVML_SYSTEM_NVML_VERSION_BUFFER_SIZE))
    _nvmlCheckReturn(ret)
    return c_version.value

# Added in 2.285
def nvmlSystemGetProcessName(pid):
    c_name = create_string_buffer(1024)
    fn = _nvmlGetFunctionPointer("nvmlSystemGetProcessName")
    ret = fn(c_uint(pid), c_name, c_uint(1024))
    _nvmlCheckReturn(ret)
    return c_name.value

def nvmlSystemGetDriverVersion():
    c_version = create_string_buffer(NVML_SYSTEM_DRIVER_VERSION_BUFFER_SIZE)
    fn = _nvmlGetFunctionPointer("nvmlSystemGetDriverVersion")
    ret = fn(c_version, c_uint(NVML_SYSTEM_DRIVER_VERSION_BUFFER_SIZE))
    _nvmlCheckReturn(ret)
    return c_version.value

# Added in 2.285
def nvmlSystemGetHicVersion():
    """Return a ctypes array of c_nvmlHwbcEntry_t, or [] when no HICs exist.

    Uses the two-call size-probe pattern: first call with a NULL buffer to
    obtain the count, second call to fill the array.
    """
    c_count = c_uint(0)
    hics = None
    fn = _nvmlGetFunctionPointer("nvmlSystemGetHicVersion")

    # get the count
    ret = fn(byref(c_count), None)

    # this should only fail with insufficient size
    if ((ret != NVML_SUCCESS) and
        (ret != NVML_ERROR_INSUFFICIENT_SIZE)):
        raise NVMLError(ret)

    # if there are no hics
    if (c_count.value == 0):
        return []

    hic_array = c_nvmlHwbcEntry_t * c_count.value
    hics = hic_array()
    ret = fn(byref(c_count), hics)
    _nvmlCheckReturn(ret)
    return hics

## Unit get functions
def nvmlUnitGetCount():
    c_count = c_uint()
    fn = _nvmlGetFunctionPointer("nvmlUnitGetCount")
    ret = fn(byref(c_count))
    _nvmlCheckReturn(ret)
    return c_count.value

def nvmlUnitGetHandleByIndex(index):
    c_index = c_uint(index)
    unit = c_nvmlUnit_t()
    fn = _nvmlGetFunctionPointer("nvmlUnitGetHandleByIndex")
    ret = fn(c_index, byref(unit))
    _nvmlCheckReturn(ret)
    return unit

def nvmlUnitGetUnitInfo(unit):
    c_info = c_nvmlUnitInfo_t()
    fn = _nvmlGetFunctionPointer("nvmlUnitGetUnitInfo")
    ret = fn(unit, byref(c_info))
    _nvmlCheckReturn(ret)
    return c_info

def nvmlUnitGetLedState(unit):
    c_state = c_nvmlLedState_t()
    fn = _nvmlGetFunctionPointer("nvmlUnitGetLedState")
    ret = fn(unit, byref(c_state))
    _nvmlCheckReturn(ret)
    return c_state

def nvmlUnitGetPsuInfo(unit):
    c_info = c_nvmlPSUInfo_t()
    fn = _nvmlGetFunctionPointer("nvmlUnitGetPsuInfo")
    ret = fn(unit, byref(c_info))
    _nvmlCheckReturn(ret)
    return c_info

def nvmlUnitGetTemperature(unit, type):
    c_temp = c_uint()
    fn = _nvmlGetFunctionPointer("nvmlUnitGetTemperature")
    ret = fn(unit, c_uint(type), byref(c_temp))
    _nvmlCheckReturn(ret)
    return c_temp.value

def nvmlUnitGetFanSpeedInfo(unit):
    c_speeds = c_nvmlUnitFanSpeeds_t()
    fn = _nvmlGetFunctionPointer("nvmlUnitGetFanSpeedInfo")
    ret = fn(unit, byref(c_speeds))
    _nvmlCheckReturn(ret)
    return c_speeds

# added to API
def nvmlUnitGetDeviceCount(unit):
    """Return the number of devices attached to *unit*.

    Probes nvmlUnitGetDevices with a NULL buffer; the expected
    NVML_ERROR_INSUFFICIENT_SIZE response carries the count.
    """
    c_count = c_uint(0)
    # query the unit to determine device count
    fn = _nvmlGetFunctionPointer("nvmlUnitGetDevices")
    ret = fn(unit, byref(c_count), None)
    if (ret == NVML_ERROR_INSUFFICIENT_SIZE):
        # BUG FIX: this was "ret = NVML_ERROR_SUCCESS", a name that does not
        # exist anywhere in this module (the success constant is NVML_SUCCESS),
        # so the expected insufficient-size path raised NameError instead of
        # returning the count.
        ret = NVML_SUCCESS
    _nvmlCheckReturn(ret)
    return c_count.value

def nvmlUnitGetDevices(unit):
    c_count = c_uint(nvmlUnitGetDeviceCount(unit))
    device_array = c_nvmlDevice_t * c_count.value
    c_devices = device_array()
    fn = _nvmlGetFunctionPointer("nvmlUnitGetDevices")
    ret = fn(unit, byref(c_count), c_devices)
    _nvmlCheckReturn(ret)
    return c_devices

## Device get functions
def nvmlDeviceGetCount():
    c_count = c_uint()
    fn = _nvmlGetFunctionPointer("nvmlDeviceGetCount")
    ret = fn(byref(c_count))
    _nvmlCheckReturn(ret)
    return c_count.value

def nvmlDeviceGetHandleByIndex(index):
    c_index = c_uint(index)
    device = c_nvmlDevice_t()
    fn = _nvmlGetFunctionPointer("nvmlDeviceGetHandleByIndex")
    ret = fn(c_index, byref(device))
    _nvmlCheckReturn(ret)
    return device

def nvmlDeviceGetHandleBySerial(serial):
    c_serial = c_char_p(serial)
    device = c_nvmlDevice_t()
    fn = _nvmlGetFunctionPointer("nvmlDeviceGetHandleBySerial")
    ret = fn(c_serial, byref(device))
    _nvmlCheckReturn(ret)
    return device

def nvmlDeviceGetHandleByUUID(uuid):
    c_uuid = c_char_p(uuid)
    device = c_nvmlDevice_t()
    fn = _nvmlGetFunctionPointer("nvmlDeviceGetHandleByUUID")
    ret = fn(c_uuid, byref(device))
    _nvmlCheckReturn(ret)
    return device

def nvmlDeviceGetHandleByPciBusId(pciBusId):
    c_busId = c_char_p(pciBusId)
    device = c_nvmlDevice_t()
    fn = _nvmlGetFunctionPointer("nvmlDeviceGetHandleByPciBusId")
    ret = fn(c_busId, byref(device))
    _nvmlCheckReturn(ret)
    return device

def nvmlDeviceGetName(handle):
    c_name = create_string_buffer(NVML_DEVICE_NAME_BUFFER_SIZE)
    fn = _nvmlGetFunctionPointer("nvmlDeviceGetName")
    ret = fn(handle, c_name, c_uint(NVML_DEVICE_NAME_BUFFER_SIZE))
    _nvmlCheckReturn(ret)
    return c_name.value

def nvmlDeviceGetSerial(handle):
    c_serial = create_string_buffer(NVML_DEVICE_SERIAL_BUFFER_SIZE)
    fn = _nvmlGetFunctionPointer("nvmlDeviceGetSerial")
    ret = fn(handle, c_serial, c_uint(NVML_DEVICE_SERIAL_BUFFER_SIZE))
    _nvmlCheckReturn(ret)
    return c_serial.value

def nvmlDeviceGetUUID(handle):
    c_uuid = create_string_buffer(NVML_DEVICE_UUID_BUFFER_SIZE)
    fn = _nvmlGetFunctionPointer("nvmlDeviceGetUUID")
    ret = fn(handle, c_uuid, c_uint(NVML_DEVICE_UUID_BUFFER_SIZE))
    _nvmlCheckReturn(ret)
    return c_uuid.value

def nvmlDeviceGetInforomVersion(handle, infoRomObject):
    c_version = create_string_buffer(NVML_DEVICE_INFOROM_VERSION_BUFFER_SIZE)
    fn = _nvmlGetFunctionPointer("nvmlDeviceGetInforomVersion")
    ret = fn(handle, _nvmlInforomObject_t(infoRomObject),
             c_version, c_uint(NVML_DEVICE_INFOROM_VERSION_BUFFER_SIZE))
    _nvmlCheckReturn(ret)
    return c_version.value

def nvmlDeviceGetDisplayMode(handle):
    c_mode = _nvmlEnableState_t()
    fn = _nvmlGetFunctionPointer("nvmlDeviceGetDisplayMode")
    ret = fn(handle, byref(c_mode))
    _nvmlCheckReturn(ret)
    return c_mode.value

def nvmlDeviceGetPersistenceMode(handle):
    c_state = _nvmlEnableState_t()
    fn = _nvmlGetFunctionPointer("nvmlDeviceGetPersistenceMode")
    ret = fn(handle, byref(c_state))
    _nvmlCheckReturn(ret)
    return c_state.value

def nvmlDeviceGetPciInfo(handle):
    # NOTE: binds the _v2 native entry point, which fills the extended
    # nvmlPciInfo_t (pciSubSystemId and reserved fields added in 2.285).
    c_info = nvmlPciInfo_t()
    fn = _nvmlGetFunctionPointer("nvmlDeviceGetPciInfo_v2")
    ret = fn(handle, byref(c_info))
    _nvmlCheckReturn(ret)
    return c_info

def nvmlDeviceGetClockInfo(handle, type):
    c_clock = c_uint()
    fn = _nvmlGetFunctionPointer("nvmlDeviceGetClockInfo")
    ret = fn(handle, _nvmlClockType_t(type), byref(c_clock))
    _nvmlCheckReturn(ret)
    return c_clock.value

# Added in 2.285
def nvmlDeviceGetMaxClockInfo(handle, type):
    c_clock = c_uint()
    fn = _nvmlGetFunctionPointer("nvmlDeviceGetMaxClockInfo")
    ret = fn(handle, _nvmlClockType_t(type), byref(c_clock))
    _nvmlCheckReturn(ret)
    return c_clock.value

def nvmlDeviceGetFanSpeed(handle):
    c_speed = c_uint()
    fn = _nvmlGetFunctionPointer("nvmlDeviceGetFanSpeed")
    ret = fn(handle, byref(c_speed))
    _nvmlCheckReturn(ret)
    return c_speed.value

def nvmlDeviceGetTemperature(handle, sensor):
    c_temp = c_uint()
    fn = _nvmlGetFunctionPointer("nvmlDeviceGetTemperature")
    ret = fn(handle, _nvmlTemperatureSensors_t(sensor), byref(c_temp))
    _nvmlCheckReturn(ret)
    return c_temp.value

# DEPRECATED use nvmlDeviceGetPerformanceState
def nvmlDeviceGetPowerState(handle):
    c_pstate = _nvmlPstates_t()
    fn = _nvmlGetFunctionPointer("nvmlDeviceGetPowerState")
    ret = fn(handle, byref(c_pstate))
    _nvmlCheckReturn(ret)
    return c_pstate.value

def nvmlDeviceGetPerformanceState(handle):
    c_pstate = _nvmlPstates_t()
    fn = _nvmlGetFunctionPointer("nvmlDeviceGetPerformanceState")
    ret = fn(handle, byref(c_pstate))
    _nvmlCheckReturn(ret)
    return c_pstate.value

def nvmlDeviceGetPowerManagementMode(handle):
    c_pcapMode = _nvmlEnableState_t()
    fn = _nvmlGetFunctionPointer("nvmlDeviceGetPowerManagementMode")
    ret = fn(handle, byref(c_pcapMode))
    _nvmlCheckReturn(ret)
    return c_pcapMode.value

def nvmlDeviceGetPowerManagementLimit(handle):
    c_limit = c_uint()
    fn = _nvmlGetFunctionPointer("nvmlDeviceGetPowerManagementLimit")
    ret = fn(handle, byref(c_limit))
    _nvmlCheckReturn(ret)
    return c_limit.value

def nvmlDeviceGetPowerUsage(handle):
    # Despite the local name, callers in nvidia_smi.py divide this value by
    # 1000 to get watts, i.e. the raw reading is treated as milliwatts.
    c_watts = c_uint()
    fn = _nvmlGetFunctionPointer("nvmlDeviceGetPowerUsage")
    ret = fn(handle, byref(c_watts))
    _nvmlCheckReturn(ret)
    return c_watts.value

def nvmlDeviceGetMemoryInfo(handle):
    c_memory = c_nvmlMemory_t()
    fn = _nvmlGetFunctionPointer("nvmlDeviceGetMemoryInfo")
    ret = fn(handle, byref(c_memory))
    _nvmlCheckReturn(ret)
    return c_memory

def nvmlDeviceGetComputeMode(handle):
    c_mode = _nvmlComputeMode_t()
    fn = _nvmlGetFunctionPointer("nvmlDeviceGetComputeMode")
    ret = fn(handle, byref(c_mode))
    _nvmlCheckReturn(ret)
    return c_mode.value

def nvmlDeviceGetEccMode(handle):
    """Return [currentEccMode, pendingEccMode] as NVML_FEATURE_* values."""
    c_currState = _nvmlEnableState_t()
    c_pendingState = _nvmlEnableState_t()
    fn = _nvmlGetFunctionPointer("nvmlDeviceGetEccMode")
    ret = fn(handle, byref(c_currState), byref(c_pendingState))
    _nvmlCheckReturn(ret)
    return [c_currState.value, c_pendingState.value]

# added to API
def nvmlDeviceGetCurrentEccMode(handle):
    return nvmlDeviceGetEccMode(handle)[0]

# added to API
def nvmlDeviceGetPendingEccMode(handle):
    return nvmlDeviceGetEccMode(handle)[1]

def nvmlDeviceGetTotalEccErrors(handle, bitType, counterType):
    c_count = c_ulonglong()
    fn = _nvmlGetFunctionPointer("nvmlDeviceGetTotalEccErrors")
    ret = fn(handle, _nvmlEccBitType_t(bitType),
             _nvmlEccCounterType_t(counterType), byref(c_count))
    _nvmlCheckReturn(ret)
    return c_count.value

def nvmlDeviceGetDetailedEccErrors(handle, bitType, counterType):
    c_count = c_nvmlEccErrorCounts_t()
    fn = _nvmlGetFunctionPointer("nvmlDeviceGetDetailedEccErrors")
    ret = fn(handle, _nvmlEccBitType_t(bitType),
             _nvmlEccCounterType_t(counterType), byref(c_count))
    _nvmlCheckReturn(ret)
    return c_count

def nvmlDeviceGetUtilizationRates(handle):
    c_util = c_nvmlUtilization_t()
    fn = _nvmlGetFunctionPointer("nvmlDeviceGetUtilizationRates")
    ret = fn(handle, byref(c_util))
    _nvmlCheckReturn(ret)
    return c_util

def nvmlDeviceGetDriverModel(handle):
    """Return [currentDriverModel, pendingDriverModel] (NVML_DRIVER_*)."""
    c_currModel = _nvmlDriverModel_t()
    c_pendingModel = _nvmlDriverModel_t()
    fn = _nvmlGetFunctionPointer("nvmlDeviceGetDriverModel")
    ret = fn(handle, byref(c_currModel), byref(c_pendingModel))
    _nvmlCheckReturn(ret)
    return [c_currModel.value, c_pendingModel.value]

# added to API
def nvmlDeviceGetCurrentDriverModel(handle):
    return nvmlDeviceGetDriverModel(handle)[0]

# added to API
def nvmlDeviceGetPendingDriverModel(handle):
    return nvmlDeviceGetDriverModel(handle)[1]

# Added in 2.285
def nvmlDeviceGetVbiosVersion(handle):
    c_version = create_string_buffer(NVML_DEVICE_VBIOS_VERSION_BUFFER_SIZE)
    fn = _nvmlGetFunctionPointer("nvmlDeviceGetVbiosVersion")
    ret = fn(handle, c_version, c_uint(NVML_DEVICE_VBIOS_VERSION_BUFFER_SIZE))
    _nvmlCheckReturn(ret)
    return c_version.value

# Added in 2.285
def nvmlDeviceGetComputeRunningProcesses(handle):
    """Return a list of nvmlFriendlyObject(pid, usedGpuMemory) entries.

    usedGpuMemory is normalized to None when the driver reports the
    not-available sentinel (WDDM on Windows, see comment above
    c_nvmlProcessInfo_t).
    """
    # first call to get the size
    c_count = c_uint(0)
    fn = _nvmlGetFunctionPointer("nvmlDeviceGetComputeRunningProcesses")
    ret = fn(handle, byref(c_count), None)

    if (ret == NVML_SUCCESS):
        # special case, no running processes
        return []
    elif (ret == NVML_ERROR_INSUFFICIENT_SIZE):
        # typical case
        # oversize the array incase more processes are created
        c_count.value = c_count.value * 2 + 5
        proc_array = c_nvmlProcessInfo_t * c_count.value
        c_procs = proc_array()

        # make the call again
        ret = fn(handle, byref(c_count), c_procs)
        _nvmlCheckReturn(ret)

        procs = []
        for i in range(c_count.value):
            # use an alternative struct for this object
            obj = nvmlStructToFriendlyObject(c_procs[i])
            if (obj.usedGpuMemory == NVML_VALUE_NOT_AVAILABLE_ulonglong.value):
                # special case for WDDM on Windows, see comment above
                obj.usedGpuMemory = None
            procs.append(obj)

        return procs
    else:
        # error case
        raise NVMLError(ret)

## Set functions
def nvmlUnitSetLedState(unit, color):
    fn = _nvmlGetFunctionPointer("nvmlUnitSetLedState")
    ret = fn(unit, _nvmlLedColor_t(color))
    _nvmlCheckReturn(ret)
    return None

def nvmlDeviceSetPersistenceMode(handle, mode):
    fn = _nvmlGetFunctionPointer("nvmlDeviceSetPersistenceMode")
    ret = fn(handle, _nvmlEnableState_t(mode))
    _nvmlCheckReturn(ret)
    return None

def nvmlDeviceSetComputeMode(handle, mode):
    fn = _nvmlGetFunctionPointer("nvmlDeviceSetComputeMode")
    ret = fn(handle, _nvmlComputeMode_t(mode))
    _nvmlCheckReturn(ret)
    return None

def nvmlDeviceSetEccMode(handle, mode):
    fn = _nvmlGetFunctionPointer("nvmlDeviceSetEccMode")
    ret = fn(handle, _nvmlEnableState_t(mode))
    _nvmlCheckReturn(ret)
    return None

def nvmlDeviceClearEccErrorCounts(handle, counterType):
    fn = _nvmlGetFunctionPointer("nvmlDeviceClearEccErrorCounts")
    ret = fn(handle, _nvmlEccCounterType_t(counterType))
    _nvmlCheckReturn(ret)
    return None

def nvmlDeviceSetDriverModel(handle, model):
    fn = _nvmlGetFunctionPointer("nvmlDeviceSetDriverModel")
    ret = fn(handle, _nvmlDriverModel_t(model))
    _nvmlCheckReturn(ret)
    return None

# Added in 2.285
+def nvmlEventSetCreate(): + fn = _nvmlGetFunctionPointer("nvmlEventSetCreate") + eventSet = c_nvmlEventSet_t() + ret = fn(byref(eventSet)) + _nvmlCheckReturn(ret) + return eventSet + +# Added in 2.285 +def nvmlDeviceRegisterEvents(handle, eventTypes, eventSet): + fn = _nvmlGetFunctionPointer("nvmlDeviceRegisterEvents") + ret = fn(handle, c_ulonglong(eventTypes), eventSet) + _nvmlCheckReturn(ret) + return None + +# Added in 2.285 +def nvmlDeviceGetSupportedEventTypes(handle): + c_eventTypes = c_ulonglong() + fn = _nvmlGetFunctionPointer("nvmlDeviceGetSupportedEventTypes") + ret = fn(handle, byref(c_eventTypes)) + _nvmlCheckReturn(ret) + return c_eventTypes.value + +# Added in 2.285 +# raises NVML_ERROR_TIMEOUT exception on timeout +def nvmlEventSetWait(eventSet, timeoutms): + fn = _nvmlGetFunctionPointer("nvmlEventSetWait") + data = c_nvmlEventData_t() + ret = fn(eventSet, byref(data), c_uint(timeoutms)) + _nvmlCheckReturn(ret) + return data + +# Added in 2.285 +def nvmlEventSetFree(eventSet): + fn = _nvmlGetFunctionPointer("nvmlEventSetFree") + ret = fn(eventSet) + _nvmlCheckReturn(ret) + return None + +# Added in 2.285 +def nvmlEventDataGetPerformanceState(data): + fn = _nvmlGetFunctionPointer("nvmlEventDataGetPerformanceState") + pstate = _nvmlPstates_t() + ret = fn(byref(data), byref(pstate)) + _nvmlCheckReturn(ret) + return pstate.value + +# Added in 2.285 +def nvmlEventDataGetXidCriticalError(data): + fn = _nvmlGetFunctionPointer("nvmlEventDataGetXidCriticalError") + xid = c_uint() + ret = fn(byref(data), byref(xid)) + _nvmlCheckReturn(ret) + return xid.value + +# Added in 2.285 +def nvmlEventDataGetEccErrorCount(data): + fn = _nvmlGetFunctionPointer("nvmlEventDataGetEccErrorCount") + ecc = c_ulonglong() + ret = fn(byref(data), byref(ecc)) + _nvmlCheckReturn(ret) + return ecc.value + +# Added in 3.295 +def nvmlDeviceOnSameBoard(handle1, handle2): + fn = _nvmlGetFunctionPointer("nvmlDeviceOnSameBoard") + onSameBoard = c_int() + ret = fn(handle1, handle2, 
byref(onSameBoard)) + _nvmlCheckReturn(ret) + return (onSameBoard.value != 0) + +# Added in 3.295 +def nvmlDeviceGetCurrPcieLinkGeneration(handle): + fn = _nvmlGetFunctionPointer("nvmlDeviceGetCurrPcieLinkGeneration") + gen = c_uint() + ret = fn(handle, byref(gen)) + _nvmlCheckReturn(ret) + return gen.value + +# Added in 3.295 +def nvmlDeviceGetMaxPcieLinkGeneration(handle): + fn = _nvmlGetFunctionPointer("nvmlDeviceGetMaxPcieLinkGeneration") + gen = c_uint() + ret = fn(handle, byref(gen)) + _nvmlCheckReturn(ret) + return gen.value + +# Added in 3.295 +def nvmlDeviceGetCurrPcieLinkWidth(handle): + fn = _nvmlGetFunctionPointer("nvmlDeviceGetCurrPcieLinkWidth") + width = c_uint() + ret = fn(handle, byref(width)) + _nvmlCheckReturn(ret) + return width.value + +# Added in 3.295 +def nvmlDeviceGetMaxPcieLinkWidth(handle): + fn = _nvmlGetFunctionPointer("nvmlDeviceGetMaxPcieLinkWidth") + width = c_uint() + ret = fn(handle, byref(width)) + _nvmlCheckReturn(ret) + return width.value + + + diff --git a/gpu/nvidia/nvidia-ml-py-3.295.00/nvidia_smi.py b/gpu/nvidia/nvidia-ml-py-3.295.00/nvidia_smi.py new file mode 100644 index 00000000..f1a42707 --- /dev/null +++ b/gpu/nvidia/nvidia-ml-py-3.295.00/nvidia_smi.py @@ -0,0 +1,455 @@ +##### +# Copyright (c) 2011-2012, NVIDIA Corporation. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# * Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. 
+# * Neither the name of the NVIDIA Corporation nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +# THE POSSIBILITY OF SUCH DAMAGE. +##### + +# +# nvidia_smi +# nvml_bindings nvidia com +# +# Sample code that attempts to reproduce the output of nvidia-smi -q- x +# For many cases the output should match +# +# To Run: +# $ python +# Python 2.7 (r27:82500, Sep 16 2010, 18:02:00) +# [GCC 4.5.1 20100907 (Red Hat 4.5.1-3)] on linux2 +# Type "help", "copyright", "credits" or "license" for more information. +# >>> import nvidia_smi +# >>> print(nvidia_smi.XmlDeviceQuery()) +# ... 
+# + +from pynvml import * +import datetime + +# +# Helper functions +# +def GetEccByType(handle, counterType, bitType): + try: + count = str(nvmlDeviceGetTotalEccErrors(handle, bitType, counterType)) + except NVMLError as err: + count = handleError(err) + + try: + detail = nvmlDeviceGetDetailedEccErrors(handle, bitType, counterType) + deviceMemory = str(detail.deviceMemory) + registerFile = str(detail.registerFile) + l1Cache = str(detail.l1Cache) + l2Cache = str(detail.l2Cache) + except NVMLError as err: + msg = handleError(err) + deviceMemory = msg + registerFile = msg + l1Cache = msg + l2Cache = msg + strResult = '' + strResult += ' ' + deviceMemory + '\n' + strResult += ' ' + registerFile + '\n' + strResult += ' ' + l1Cache + '\n' + strResult += ' ' + l2Cache + '\n' + strResult += ' ' + count + '\n' + return strResult + +def GetEccByCounter(handle, counterType): + strResult = '' + strResult += ' \n' + strResult += str(GetEccByType(handle, counterType, NVML_SINGLE_BIT_ECC)) + strResult += ' \n' + strResult += ' \n' + strResult += str(GetEccByType(handle, counterType, NVML_DOUBLE_BIT_ECC)) + strResult += ' \n' + return strResult + +def GetEccStr(handle): + strResult = '' + strResult += ' \n' + strResult += str(GetEccByCounter(handle, NVML_VOLATILE_ECC)) + strResult += ' \n' + strResult += ' \n' + strResult += str(GetEccByCounter(handle, NVML_AGGREGATE_ECC)) + strResult += ' \n' + return strResult + +# +# Converts errors into string messages +# +def handleError(err): + if (err.value == NVML_ERROR_NOT_SUPPORTED): + return "N/A" + else: + return err.__str__() + +####### +def XmlDeviceQuery(): + + try: + # + # Initialize NVML + # + nvmlInit() + strResult = '' + + strResult += '\n' + strResult += '\n' + strResult += '\n' + + strResult += ' ' + str(datetime.date.today()) + '\n' + strResult += ' ' + str(nvmlSystemGetDriverVersion()) + '\n' + + deviceCount = nvmlDeviceGetCount() + strResult += ' ' + str(deviceCount) + '\n' + + for i in range(0, deviceCount): + handle = 
nvmlDeviceGetHandleByIndex(i) + + pciInfo = nvmlDeviceGetPciInfo(handle) + + strResult += ' \n' % pciInfo.busId + + strResult += ' ' + nvmlDeviceGetName(handle) + '\n' + + try: + state = ('Enabled' if (nvmlDeviceGetDisplayMode(handle) != 0) else 'Disabled') + except NVMLError as err: + state = handleError(err) + + strResult += ' ' + state + '\n' + + try: + mode = 'Enabled' if (nvmlDeviceGetPersistenceMode(handle) != 0) else 'Disabled' + except NVMLError as err: + mode = handleError(err) + + strResult += ' ' + mode + '\n' + + strResult += ' \n' + + try: + current = str(nvmlDeviceGetCurrentDriverModel(handle)) + except NVMLError as err: + current = handleError(err) + strResult += ' ' + current + '\n' + + try: + pending = str(nvmlDeviceGetPendingDriverModel(handle)) + except NVMLError as err: + pending = handleError(err) + + strResult += ' ' + pending + '\n' + + strResult += ' \n' + + try: + serial = nvmlDeviceGetSerial(handle) + except NVMLError as err: + serial = handleError(err) + + strResult += ' ' + serial + '\n' + + try: + uuid = nvmlDeviceGetUUID(handle) + except NVMLError as err: + uuid = handleError(err) + + strResult += ' ' + uuid + '\n' + + try: + vbios = nvmlDeviceGetVbiosVersion(handle) + except NVMLError as err: + vbios = handleError(err) + + strResult += ' ' + vbios + '\n' + + strResult += ' \n' + + try: + oem = nvmlDeviceGetInforomVersion(handle, NVML_INFOROM_OEM) + if oem == '': + oem = 'N/A' + except NVMLError as err: + oem = handleError(err) + + strResult += ' ' + oem + '\n' + + try: + ecc = nvmlDeviceGetInforomVersion(handle, NVML_INFOROM_ECC) + if ecc == '': + ecc = 'N/A' + except NVMLError as err: + ecc = handleError(err) + + strResult += ' ' + ecc + '\n' + try: + pwr = nvmlDeviceGetInforomVersion(handle, NVML_INFOROM_POWER) + if pwr == '': + pwr = 'N/A' + except NVMLError as err: + pwr = handleError(err) + + strResult += ' ' + pwr + '\n' + strResult += ' \n' + + strResult += ' \n' + strResult += ' %02X\n' % pciInfo.bus + strResult += ' %02X\n' % 
pciInfo.device + strResult += ' %04X\n' % pciInfo.domain + strResult += ' %08X\n' % (pciInfo.pciDeviceId) + strResult += ' %08X\n' % (pciInfo.pciSubSystemId) + strResult += ' ' + str(pciInfo.busId) + '\n' + strResult += ' \n' + + + strResult += ' \n' + + try: + gen = str(nvmlDeviceGetMaxPcieLinkGeneration(handle)) + except NVMLError as err: + gen = handleError(err) + + strResult += ' ' + gen + '\n' + + try: + gen = str(nvmlDeviceGetCurrPcieLinkGeneration(handle)) + except NVMLError as err: + gen = handleError(err) + + strResult += ' ' + gen + '\n' + strResult += ' \n' + strResult += ' \n' + + try: + width = str(nvmlDeviceGetMaxPcieLinkWidth(handle)) + 'x' + except NVMLError as err: + width = handleError(err) + + strResult += ' ' + width + '\n' + + try: + width = str(nvmlDeviceGetCurrPcieLinkWidth(handle)) + 'x' + except NVMLError as err: + width = handleError(err) + + strResult += ' ' + width + '\n' + + strResult += ' \n' + strResult += ' \n' + strResult += ' \n' + + try: + fan = str(nvmlDeviceGetFanSpeed(handle)) + ' %' + except NVMLError as err: + fan = handleError(err) + strResult += ' ' + fan + '\n' + + try: + memInfo = nvmlDeviceGetMemoryInfo(handle) + mem_total = str(memInfo.total / 1024 / 1024) + ' MB' + mem_used = str(memInfo.used / 1024 / 1024) + ' MB' + mem_free = str(memInfo.free / 1024 / 1024) + ' MB' + except NVMLError as err: + error = handleError(err) + mem_total = error + mem_used = error + mem_free = error + + strResult += ' \n' + strResult += ' ' + mem_total + '\n' + strResult += ' ' + mem_used + '\n' + strResult += ' ' + mem_free + '\n' + strResult += ' \n' + + + try: + mode = nvmlDeviceGetComputeMode(handle) + if mode == NVML_COMPUTEMODE_DEFAULT: + modeStr = 'Default' + elif mode == NVML_COMPUTEMODE_EXCLUSIVE_THREAD: + modeStr = 'Exclusive Thread' + elif mode == NVML_COMPUTEMODE_PROHIBITED: + modeStr = 'Prohibited' + elif mode == NVML_COMPUTEMODE_EXCLUSIVE_PROCESS: + modeStr = 'Exclusive Process' + else: + modeStr = 'Unknown' + except NVMLError 
as err: + modeStr = handleError(err) + + strResult += ' ' + modeStr + '\n' + + try: + util = nvmlDeviceGetUtilizationRates(handle) + gpu_util = str(util.gpu) + mem_util = str(util.memory) + except NVMLError as err: + error = handleError(err) + gpu_util = error + mem_util = error + + strResult += ' \n' + strResult += ' ' + gpu_util + ' %\n' + strResult += ' ' + mem_util + ' %\n' + strResult += ' \n' + + try: + (current, pending) = nvmlDeviceGetEccMode(handle) + curr_str = 'Enabled' if (current != 0) else 'Disabled' + pend_str = 'Enabled' if (pending != 0) else 'Disabled' + except NVMLError as err: + error = handleError(err) + curr_str = error + pend_str = error + + strResult += ' \n' + strResult += ' ' + curr_str + '\n' + strResult += ' ' + pend_str + '\n' + strResult += ' \n' + + strResult += ' \n' + strResult += GetEccStr(handle) + strResult += ' \n' + + try: + temp = str(nvmlDeviceGetTemperature(handle, NVML_TEMPERATURE_GPU)) + ' C' + except NVMLError as err: + temp = handleError(err) + + strResult += ' \n' + strResult += ' ' + temp + '\n' + strResult += ' \n' + + strResult += ' \n' + try: + perfState = nvmlDeviceGetPowerState(handle) + except NVMLError as err: + perfState = handleError(err) + strResult += ' P%s\n' % perfState + try: + powMan = nvmlDeviceGetPowerManagementMode(handle) + powManStr = 'Supported' if powMan != 0 else 'N/A' + except NVMLError as err: + powManStr = handleError(err) + strResult += ' ' + powManStr + '\n' + try: + powDraw = (nvmlDeviceGetPowerUsage(handle) / 1000.0) + powDrawStr = '%.2f W' % powDraw + except NVMLError as err: + powDrawStr = handleError(err) + strResult += ' ' + powDrawStr + '\n' + try: + powLimit = (nvmlDeviceGetPowerManagementLimit(handle) / 1000.0) + powLimitStr = '%d W' % powLimit + except NVMLError as err: + powLimitStr = handleError(err) + strResult += ' ' + powLimitStr + '\n' + strResult += ' \n' + + strResult += ' \n' + try: + graphics = str(nvmlDeviceGetClockInfo(handle, NVML_CLOCK_GRAPHICS)) + except NVMLError as 
err: + graphics = handleError(err) + strResult += ' ' +graphics + ' MHz\n' + try: + sm = str(nvmlDeviceGetClockInfo(handle, NVML_CLOCK_SM)) + except NVMLError as err: + sm = handleError(err) + strResult += ' ' + sm + ' MHz\n' + try: + mem = str(nvmlDeviceGetClockInfo(handle, NVML_CLOCK_MEM)) + except NVMLError as err: + mem = handleError(err) + strResult += ' ' + mem + ' MHz\n' + strResult += ' \n' + + strResult += ' \n' + try: + graphics = str(nvmlDeviceGetMaxClockInfo(handle, NVML_CLOCK_GRAPHICS)) + except NVMLError as err: + graphics = handleError(err) + strResult += ' ' + graphics + ' MHz\n' + try: + sm = str(nvmlDeviceGetMaxClockInfo(handle, NVML_CLOCK_SM)) + except NVMLError as err: + sm = handleError(err) + strResult += ' ' + sm + ' MHz\n' + try: + mem = str(nvmlDeviceGetMaxClockInfo(handle, NVML_CLOCK_MEM)) + except NVMLError as err: + mem = handleError(err) + strResult += ' ' + mem + ' MHz\n' + strResult += ' \n' + + try: + perfState = nvmlDeviceGetPowerState(handle) + perfStateStr = 'P%s' % perfState + except NVMLError as err: + perfStateStr = handleError(err) + strResult += ' ' + perfStateStr + '\n' + + strResult += ' \n' + + procstr = "" + try: + procs = nvmlDeviceGetComputeRunningProcesses(handle) + except NVMLError as err: + procs = [] + procstr = handleError(err) + + for p in procs: + procstr += ' \n' + procstr += ' %d\n' % p.pid + try: + name = str(nvmlSystemGetProcessName(p.pid)) + except NVMLError as err: + if (err.value == NVML_ERROR_NOT_FOUND): + # probably went away + continue + else: + name = handleError(err) + procstr += ' ' + name + '\n' + procstr += ' \n' + if (p.usedGpuMemory == None): + procstr += 'N\A' + else: + procstr += '%d MB\n' % (p.usedGpuMemory / 1024 / 1024) + procstr += '\n' + procstr += ' \n' + + strResult += procstr + strResult += ' \n' + strResult += ' \n' + + strResult += '\n' + + except NVMLError as err: + strResult += 'nvidia_smi.py: ' + err.__str__() + '\n' + + nvmlShutdown() + + return strResult + diff --git 
a/gpu/nvidia/nvidia-ml-py-3.295.00/pynvml.py b/gpu/nvidia/nvidia-ml-py-3.295.00/pynvml.py new file mode 100644 index 00000000..90f8bdd8 --- /dev/null +++ b/gpu/nvidia/nvidia-ml-py-3.295.00/pynvml.py @@ -0,0 +1,903 @@ +##### +# Copyright (c) 2011-2012, NVIDIA Corporation. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# * Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of the NVIDIA Corporation nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +# THE POSSIBILITY OF SUCH DAMAGE. 
+##### + +## +# Python bindings for the NVML library +## +from ctypes import * +from ctypes.util import find_library +import sys +import threading + +## C Type mappings ## +## Enums +_nvmlEnableState_t = c_uint +NVML_FEATURE_DISABLED = 0 +NVML_FEATURE_ENABLED = 1 + +_nvmlTemperatureSensors_t = c_uint +NVML_TEMPERATURE_GPU = 0 + +_nvmlComputeMode_t = c_uint +NVML_COMPUTEMODE_DEFAULT = 0 +NVML_COMPUTEMODE_EXCLUSIVE_THREAD = 1 +NVML_COMPUTEMODE_PROHIBITED = 2 +NVML_COMPUTEMODE_EXCLUSIVE_PROCESS = 3 + +_nvmlEccBitType_t = c_uint +NVML_SINGLE_BIT_ECC = 0 +NVML_DOUBLE_BIT_ECC = 1 + +_nvmlEccCounterType_t = c_uint +NVML_VOLATILE_ECC = 0 +NVML_AGGREGATE_ECC = 1 + +_nvmlClockType_t = c_uint +NVML_CLOCK_GRAPHICS = 0 +NVML_CLOCK_SM = 1 +NVML_CLOCK_MEM = 2 + +_nvmlDriverModel_t = c_uint +NVML_DRIVER_WDDM = 0 +NVML_DRIVER_WDM = 1 + +_nvmlPstates_t = c_uint +NVML_PSTATE_0 = 0 +NVML_PSTATE_1 = 1 +NVML_PSTATE_2 = 2 +NVML_PSTATE_3 = 3 +NVML_PSTATE_4 = 4 +NVML_PSTATE_5 = 5 +NVML_PSTATE_6 = 6 +NVML_PSTATE_7 = 7 +NVML_PSTATE_8 = 8 +NVML_PSTATE_9 = 9 +NVML_PSTATE_10 = 10 +NVML_PSTATE_11 = 11 +NVML_PSTATE_12 = 12 +NVML_PSTATE_13 = 13 +NVML_PSTATE_14 = 14 +NVML_PSTATE_15 = 15 +NVML_PSTATE_UNKNOWN = 32 + +_nvmlInforomObject_t = c_uint +NVML_INFOROM_OEM = 0 +NVML_INFOROM_ECC = 1 +NVML_INFOROM_POWER = 2 + +_nvmlReturn_t = c_uint +NVML_SUCCESS = 0 +NVML_ERROR_UNINITIALIZED = 1 +NVML_ERROR_INVALID_ARGUMENT = 2 +NVML_ERROR_NOT_SUPPORTED = 3 +NVML_ERROR_NO_PERMISSION = 4 +NVML_ERROR_ALREADY_INITIALIZED = 5 +NVML_ERROR_NOT_FOUND = 6 +NVML_ERROR_INSUFFICIENT_SIZE = 7 +NVML_ERROR_INSUFFICIENT_POWER = 8 +NVML_ERROR_DRIVER_NOT_LOADED = 9 +NVML_ERROR_TIMEOUT = 10, +NVML_ERROR_UNKNOWN = 999 + +_nvmlFanState_t = c_uint +NVML_FAN_NORMAL = 0 +NVML_FAN_FAILED = 1 + +_nvmlLedColor_t = c_uint +NVML_LED_COLOR_GREEN = 0 +NVML_LED_COLOR_AMBER = 1 + +# C preprocessor defined values +nvmlFlagDefault = 0 +nvmlFlagForce = 1 + +# buffer size +NVML_DEVICE_INFOROM_VERSION_BUFFER_SIZE = 16 
+NVML_DEVICE_UUID_BUFFER_SIZE = 80 +NVML_SYSTEM_DRIVER_VERSION_BUFFER_SIZE = 81 +NVML_SYSTEM_NVML_VERSION_BUFFER_SIZE = 80 +NVML_DEVICE_NAME_BUFFER_SIZE = 64 +NVML_DEVICE_SERIAL_BUFFER_SIZE = 30 +NVML_DEVICE_VBIOS_VERSION_BUFFER_SIZE = 32 + +NVML_VALUE_NOT_AVAILABLE_ulonglong = c_ulonglong(-1) + +## Lib loading ## +nvmlLib = None +libLoadLock = threading.Lock() + +## Error Checking ## +class NVMLError(Exception): + def __init__(self, value): + self.value = value + def __str__(self): + return str(nvmlErrorString(self.value)) + +def _nvmlCheckReturn(ret): + if (ret != NVML_SUCCESS): + raise NVMLError(ret) + return ret + +## Function access ## +def _nvmlGetFunctionPointer(name): + global nvmlLib + global libLoadLock + + libLoadLock.acquire() + try: + # ensure library was loaded + if (nvmlLib == None): + raise NVMLError(NVML_ERROR_UNINITIALIZED) + try: + return getattr(nvmlLib, name) + except AttributeError as attrError: + raise NVMLError(NVML_ERROR_NOT_SUPPORTED) + finally: + # lock is always freed + libLoadLock.release() + +## Alternative object +# Allows the object to be printed +# Allows mismatched types to be assigned +# - like None when the Structure variant requires c_uint +class nvmlFriendlyObject(object): + def __init__(self, dictionary): + for x in dictionary: + setattr(self, x, dictionary[x]) + def __str__(self): + return self.__dict__.__str__() + +def nvmlStructToFriendlyObject(struct): + d = {} + for x in struct._fields_: + key = x[0] + value = getattr(struct, key) + d[key] = value + obj = nvmlFriendlyObject(d) + return obj + +# pack the object so it can be passed to the NVML library +def nvmlFriendlyObjectToStruct(obj, model): + for x in model._fields_: + key = x[0] + value = obj.__dict__[key] + setattr(model, key, value) + return model + +## Unit structures +class struct_c_nvmlUnit_t(Structure): + pass # opaque handle +c_nvmlUnit_t = POINTER(struct_c_nvmlUnit_t) + +class c_nvmlUnitInfo_t(Structure): + _fields_ = [ + ('name', c_char * 96), + ('id', c_char 
* 96), + ('serial', c_char * 96), + ('firmwareVersion', c_char * 96), + ] + +class c_nvmlLedState_t(Structure): + _fields_ = [ + ('cause', c_char * 256), + ('color', _nvmlLedColor_t), + ] + +class c_nvmlPSUInfo_t(Structure): + _fields_ = [ + ('state', c_char * 256), + ('current', c_uint), + ('voltage', c_uint), + ('power', c_uint), + ] + +class c_nvmlUnitFanInfo_t(Structure): + _fields_ = [ + ('speed', c_uint), + ('state', _nvmlFanState_t), + ] + +class c_nvmlUnitFanSpeeds_t(Structure): + _fields_ = [ + ('fans', c_nvmlUnitFanInfo_t * 24), + ('count', c_uint) + ] + +## Device structures +class struct_c_nvmlDevice_t(Structure): + pass # opaque handle +c_nvmlDevice_t = POINTER(struct_c_nvmlDevice_t) + +class nvmlPciInfo_t(Structure): + _fields_ = [ + ('busId', c_char * 16), + ('domain', c_uint), + ('bus', c_uint), + ('device', c_uint), + ('pciDeviceId', c_uint), + + # Added in 2.285 + ('pciSubSystemId', c_uint), + ('reserved0', c_uint), + ('reserved1', c_uint), + ('reserved2', c_uint), + ('reserved3', c_uint), + ] + +class c_nvmlMemory_t(Structure): + _fields_ = [ + ('total', c_ulonglong), + ('free', c_ulonglong), + ('used', c_ulonglong), + ] + +# On Windows with the WDDM driver, usedGpuMemory is reported as None +# Code that processes this structure should check for None, I.E. 
+# +# if (info.usedGpuMemory == None): +# # TODO handle the error +# pass +# else: +# print("Using %d MB of memory" % (info.usedGpuMemory / 1024 / 1024)) +# +# See NVML documentation for more information +class c_nvmlProcessInfo_t(Structure): + _fields_ = [ + ('pid', c_uint), + ('usedGpuMemory', c_ulonglong), + ] + +class c_nvmlEccErrorCounts_t(Structure): + _fields_ = [ + ('l1Cache', c_ulonglong), + ('l2Cache', c_ulonglong), + ('deviceMemory', c_ulonglong), + ('registerFile', c_ulonglong), + ] + +class c_nvmlUtilization_t(Structure): + _fields_ = [ + ('gpu', c_uint), + ('memory', c_uint), + ] + +# Added in 2.285 +class c_nvmlHwbcEntry_t(Structure): + _fields_ = [ + ('hwbcId', c_uint), + ('firmwareVersion', c_char * 32), + ] + +## Event structures +class struct_c_nvmlEventSet_t(Structure): + pass # opaque handle +c_nvmlEventSet_t = POINTER(struct_c_nvmlEventSet_t) + +nvmlEventTypeSingleBitEccError = 0x0000000000000001 +nvmlEventTypeDoubleBitEccError = 0x0000000000000002 +nvmlEventTypePState = 0x0000000000000004 +nvmlEventTypeXidCriticalError = 0x0000000000000008 +nvmlEventTypeNone = 0x0000000000000000 +nvmlEventTypeAll = ( + nvmlEventTypeNone | + nvmlEventTypeSingleBitEccError | + nvmlEventTypeDoubleBitEccError | + nvmlEventTypePState | + nvmlEventTypeXidCriticalError + ) + +class c_nvmlEventData_t(Structure): + _fields_ = [ + ('device', c_nvmlDevice_t), + ('eventType', c_ulonglong), + ('reserved', c_ulonglong) + ] + +## C function wrappers ## +def nvmlInit(): + global nvmlLib + global libLoadLock + + # + # Load the library if it isn't loaded already + # + if (nvmlLib == None): + # lock to ensure only one caller loads the library + libLoadLock.acquire() + + try: + # ensure the library still isn't loaded + if (nvmlLib == None): + try: + if (sys.platform[:3] == "win"): + # cdecl calling convention + nvmlLib = cdll.nvml + else: + # assume linux + nvmlLib = CDLL("libnvidia-ml.so") + except OSError as ose: + print(ose) + _nvmlCheckReturn(NVML_ERROR_DRIVER_NOT_LOADED) + 
if (nvmlLib == None): + print("Failed to load NVML") + _nvmlCheckReturn(NVML_ERROR_DRIVER_NOT_LOADED) + finally: + # lock is always freed + libLoadLock.release() + + # + # Initialize the library + # + fn = _nvmlGetFunctionPointer("nvmlInit") + ret = fn() + _nvmlCheckReturn(ret) + return None + +def nvmlShutdown(): + # + # Leave the library loaded, but shutdown the interface + # + fn = _nvmlGetFunctionPointer("nvmlShutdown") + ret = fn() + _nvmlCheckReturn(ret) + return None + +# Added in 2.285 +def nvmlErrorString(result): + fn = _nvmlGetFunctionPointer("nvmlErrorString") + fn.restype = c_char_p # otherwise return is an int + ret = fn(result) + return ret + +# Added in 2.285 +def nvmlSystemGetNVMLVersion(): + c_version = create_string_buffer(NVML_SYSTEM_NVML_VERSION_BUFFER_SIZE) + fn = _nvmlGetFunctionPointer("nvmlSystemGetNVMLVersion") + ret = fn(c_version, c_uint(NVML_SYSTEM_NVML_VERSION_BUFFER_SIZE)) + _nvmlCheckReturn(ret) + return c_version.value + +# Added in 2.285 +def nvmlSystemGetProcessName(pid): + c_name = create_string_buffer(1024) + fn = _nvmlGetFunctionPointer("nvmlSystemGetProcessName") + ret = fn(c_uint(pid), c_name, c_uint(1024)) + _nvmlCheckReturn(ret) + return c_name.value + +def nvmlSystemGetDriverVersion(): + c_version = create_string_buffer(NVML_SYSTEM_DRIVER_VERSION_BUFFER_SIZE) + fn = _nvmlGetFunctionPointer("nvmlSystemGetDriverVersion") + ret = fn(c_version, c_uint(NVML_SYSTEM_DRIVER_VERSION_BUFFER_SIZE)) + _nvmlCheckReturn(ret) + return c_version.value + +# Added in 2.285 +def nvmlSystemGetHicVersion(): + c_count = c_uint(0) + hics = None + fn = _nvmlGetFunctionPointer("nvmlSystemGetHicVersion") + + # get the count + ret = fn(byref(c_count), None) + + # this should only fail with insufficient size + if ((ret != NVML_SUCCESS) and + (ret != NVML_ERROR_INSUFFICIENT_SIZE)): + raise NVMLError(ret) + + # if there are no hics + if (c_count.value == 0): + return [] + + hic_array = c_nvmlHwbcEntry_t * c_count.value + hics = hic_array() + ret = 
fn(byref(c_count), hics) + _nvmlCheckReturn(ret) + return hics + +## Unit get functions +def nvmlUnitGetCount(): + c_count = c_uint() + fn = _nvmlGetFunctionPointer("nvmlUnitGetCount") + ret = fn(byref(c_count)) + _nvmlCheckReturn(ret) + return c_count.value + +def nvmlUnitGetHandleByIndex(index): + c_index = c_uint(index) + unit = c_nvmlUnit_t() + fn = _nvmlGetFunctionPointer("nvmlUnitGetHandleByIndex") + ret = fn(c_index, byref(unit)) + _nvmlCheckReturn(ret) + return unit + +def nvmlUnitGetUnitInfo(unit): + c_info = c_nvmlUnitInfo_t() + fn = _nvmlGetFunctionPointer("nvmlUnitGetUnitInfo") + ret = fn(unit, byref(c_info)) + _nvmlCheckReturn(ret) + return c_info + +def nvmlUnitGetLedState(unit): + c_state = c_nvmlLedState_t() + fn = _nvmlGetFunctionPointer("nvmlUnitGetLedState") + ret = fn(unit, byref(c_state)) + _nvmlCheckReturn(ret) + return c_state + +def nvmlUnitGetPsuInfo(unit): + c_info = c_nvmlPSUInfo_t() + fn = _nvmlGetFunctionPointer("nvmlUnitGetPsuInfo") + ret = fn(unit, byref(c_info)) + _nvmlCheckReturn(ret) + return c_info + +def nvmlUnitGetTemperature(unit, type): + c_temp = c_uint() + fn = _nvmlGetFunctionPointer("nvmlUnitGetTemperature") + ret = fn(unit, c_uint(type), byref(c_temp)) + _nvmlCheckReturn(ret) + return c_temp.value + +def nvmlUnitGetFanSpeedInfo(unit): + c_speeds = c_nvmlUnitFanSpeeds_t() + fn = _nvmlGetFunctionPointer("nvmlUnitGetFanSpeedInfo") + ret = fn(unit, byref(c_speeds)) + _nvmlCheckReturn(ret) + return c_speeds + +# added to API +def nvmlUnitGetDeviceCount(unit): + c_count = c_uint(0) + # query the unit to determine device count + fn = _nvmlGetFunctionPointer("nvmlUnitGetDevices") + ret = fn(unit, byref(c_count), None) + if (ret == NVML_ERROR_INSUFFICIENT_SIZE): + ret = NVML_ERROR_SUCCESS + _nvmlCheckReturn(ret) + return c_count.value + +def nvmlUnitGetDevices(unit): + c_count = c_uint(nvmlUnitGetDeviceCount(unit)) + device_array = c_nvmlDevice_t * c_count.value + c_devices = device_array() + fn = 
_nvmlGetFunctionPointer("nvmlUnitGetDevices") + ret = fn(unit, byref(c_count), c_devices) + _nvmlCheckReturn(ret) + return c_devices + +## Device get functions +def nvmlDeviceGetCount(): + c_count = c_uint() + fn = _nvmlGetFunctionPointer("nvmlDeviceGetCount") + ret = fn(byref(c_count)) + _nvmlCheckReturn(ret) + return c_count.value + +def nvmlDeviceGetHandleByIndex(index): + c_index = c_uint(index) + device = c_nvmlDevice_t() + fn = _nvmlGetFunctionPointer("nvmlDeviceGetHandleByIndex") + ret = fn(c_index, byref(device)) + _nvmlCheckReturn(ret) + return device + +def nvmlDeviceGetHandleBySerial(serial): + c_serial = c_char_p(serial) + device = c_nvmlDevice_t() + fn = _nvmlGetFunctionPointer("nvmlDeviceGetHandleBySerial") + ret = fn(c_serial, byref(device)) + _nvmlCheckReturn(ret) + return device + +def nvmlDeviceGetHandleByUUID(uuid): + c_uuid = c_char_p(uuid) + device = c_nvmlDevice_t() + fn = _nvmlGetFunctionPointer("nvmlDeviceGetHandleByUUID") + ret = fn(c_uuid, byref(device)) + _nvmlCheckReturn(ret) + return device + +def nvmlDeviceGetHandleByPciBusId(pciBusId): + c_busId = c_char_p(pciBusId) + device = c_nvmlDevice_t() + fn = _nvmlGetFunctionPointer("nvmlDeviceGetHandleByPciBusId") + ret = fn(c_busId, byref(device)) + _nvmlCheckReturn(ret) + return device + +def nvmlDeviceGetName(handle): + c_name = create_string_buffer(NVML_DEVICE_NAME_BUFFER_SIZE) + fn = _nvmlGetFunctionPointer("nvmlDeviceGetName") + ret = fn(handle, c_name, c_uint(NVML_DEVICE_NAME_BUFFER_SIZE)) + _nvmlCheckReturn(ret) + return c_name.value + +def nvmlDeviceGetSerial(handle): + c_serial = create_string_buffer(NVML_DEVICE_SERIAL_BUFFER_SIZE) + fn = _nvmlGetFunctionPointer("nvmlDeviceGetSerial") + ret = fn(handle, c_serial, c_uint(NVML_DEVICE_SERIAL_BUFFER_SIZE)) + _nvmlCheckReturn(ret) + return c_serial.value + +def nvmlDeviceGetUUID(handle): + c_uuid = create_string_buffer(NVML_DEVICE_UUID_BUFFER_SIZE) + fn = _nvmlGetFunctionPointer("nvmlDeviceGetUUID") + ret = fn(handle, c_uuid, 
c_uint(NVML_DEVICE_UUID_BUFFER_SIZE)) + _nvmlCheckReturn(ret) + return c_uuid.value + +def nvmlDeviceGetInforomVersion(handle, infoRomObject): + c_version = create_string_buffer(NVML_DEVICE_INFOROM_VERSION_BUFFER_SIZE) + fn = _nvmlGetFunctionPointer("nvmlDeviceGetInforomVersion") + ret = fn(handle, _nvmlInforomObject_t(infoRomObject), + c_version, c_uint(NVML_DEVICE_INFOROM_VERSION_BUFFER_SIZE)) + _nvmlCheckReturn(ret) + return c_version.value + +def nvmlDeviceGetDisplayMode(handle): + c_mode = _nvmlEnableState_t() + fn = _nvmlGetFunctionPointer("nvmlDeviceGetDisplayMode") + ret = fn(handle, byref(c_mode)) + _nvmlCheckReturn(ret) + return c_mode.value + +def nvmlDeviceGetPersistenceMode(handle): + c_state = _nvmlEnableState_t() + fn = _nvmlGetFunctionPointer("nvmlDeviceGetPersistenceMode") + ret = fn(handle, byref(c_state)) + _nvmlCheckReturn(ret) + return c_state.value + +def nvmlDeviceGetPciInfo(handle): + c_info = nvmlPciInfo_t() + fn = _nvmlGetFunctionPointer("nvmlDeviceGetPciInfo_v2") + ret = fn(handle, byref(c_info)) + _nvmlCheckReturn(ret) + return c_info + +def nvmlDeviceGetClockInfo(handle, type): + c_clock = c_uint() + fn = _nvmlGetFunctionPointer("nvmlDeviceGetClockInfo") + ret = fn(handle, _nvmlClockType_t(type), byref(c_clock)) + _nvmlCheckReturn(ret) + return c_clock.value + +# Added in 2.285 +def nvmlDeviceGetMaxClockInfo(handle, type): + c_clock = c_uint() + fn = _nvmlGetFunctionPointer("nvmlDeviceGetMaxClockInfo") + ret = fn(handle, _nvmlClockType_t(type), byref(c_clock)) + _nvmlCheckReturn(ret) + return c_clock.value + +def nvmlDeviceGetFanSpeed(handle): + c_speed = c_uint() + fn = _nvmlGetFunctionPointer("nvmlDeviceGetFanSpeed") + ret = fn(handle, byref(c_speed)) + _nvmlCheckReturn(ret) + return c_speed.value + +def nvmlDeviceGetTemperature(handle, sensor): + c_temp = c_uint() + fn = _nvmlGetFunctionPointer("nvmlDeviceGetTemperature") + ret = fn(handle, _nvmlTemperatureSensors_t(sensor), byref(c_temp)) + _nvmlCheckReturn(ret) + return 
c_temp.value + +# DEPRECATED use nvmlDeviceGetPerformanceState +def nvmlDeviceGetPowerState(handle): + c_pstate = _nvmlPstates_t() + fn = _nvmlGetFunctionPointer("nvmlDeviceGetPowerState") + ret = fn(handle, byref(c_pstate)) + _nvmlCheckReturn(ret) + return c_pstate.value + +def nvmlDeviceGetPerformanceState(handle): + c_pstate = _nvmlPstates_t() + fn = _nvmlGetFunctionPointer("nvmlDeviceGetPerformanceState") + ret = fn(handle, byref(c_pstate)) + _nvmlCheckReturn(ret) + return c_pstate.value + +def nvmlDeviceGetPowerManagementMode(handle): + c_pcapMode = _nvmlEnableState_t() + fn = _nvmlGetFunctionPointer("nvmlDeviceGetPowerManagementMode") + ret = fn(handle, byref(c_pcapMode)) + _nvmlCheckReturn(ret) + return c_pcapMode.value + +def nvmlDeviceGetPowerManagementLimit(handle): + c_limit = c_uint() + fn = _nvmlGetFunctionPointer("nvmlDeviceGetPowerManagementLimit") + ret = fn(handle, byref(c_limit)) + _nvmlCheckReturn(ret) + return c_limit.value + +def nvmlDeviceGetPowerUsage(handle): + c_watts = c_uint() + fn = _nvmlGetFunctionPointer("nvmlDeviceGetPowerUsage") + ret = fn(handle, byref(c_watts)) + _nvmlCheckReturn(ret) + return c_watts.value + +def nvmlDeviceGetMemoryInfo(handle): + c_memory = c_nvmlMemory_t() + fn = _nvmlGetFunctionPointer("nvmlDeviceGetMemoryInfo") + ret = fn(handle, byref(c_memory)) + _nvmlCheckReturn(ret) + return c_memory + +def nvmlDeviceGetComputeMode(handle): + c_mode = _nvmlComputeMode_t() + fn = _nvmlGetFunctionPointer("nvmlDeviceGetComputeMode") + ret = fn(handle, byref(c_mode)) + _nvmlCheckReturn(ret) + return c_mode.value + +def nvmlDeviceGetEccMode(handle): + c_currState = _nvmlEnableState_t() + c_pendingState = _nvmlEnableState_t() + fn = _nvmlGetFunctionPointer("nvmlDeviceGetEccMode") + ret = fn(handle, byref(c_currState), byref(c_pendingState)) + _nvmlCheckReturn(ret) + return [c_currState.value, c_pendingState.value] + +# added to API +def nvmlDeviceGetCurrentEccMode(handle): + return nvmlDeviceGetEccMode(handle)[0] + +# added to 
API +def nvmlDeviceGetPendingEccMode(handle): + return nvmlDeviceGetEccMode(handle)[1] + +def nvmlDeviceGetTotalEccErrors(handle, bitType, counterType): + c_count = c_ulonglong() + fn = _nvmlGetFunctionPointer("nvmlDeviceGetTotalEccErrors") + ret = fn(handle, _nvmlEccBitType_t(bitType), + _nvmlEccCounterType_t(counterType), byref(c_count)) + _nvmlCheckReturn(ret) + return c_count.value + +def nvmlDeviceGetDetailedEccErrors(handle, bitType, counterType): + c_count = c_nvmlEccErrorCounts_t() + fn = _nvmlGetFunctionPointer("nvmlDeviceGetDetailedEccErrors") + ret = fn(handle, _nvmlEccBitType_t(bitType), + _nvmlEccCounterType_t(counterType), byref(c_count)) + _nvmlCheckReturn(ret) + return c_count + +def nvmlDeviceGetUtilizationRates(handle): + c_util = c_nvmlUtilization_t() + fn = _nvmlGetFunctionPointer("nvmlDeviceGetUtilizationRates") + ret = fn(handle, byref(c_util)) + _nvmlCheckReturn(ret) + return c_util + +def nvmlDeviceGetDriverModel(handle): + c_currModel = _nvmlDriverModel_t() + c_pendingModel = _nvmlDriverModel_t() + fn = _nvmlGetFunctionPointer("nvmlDeviceGetDriverModel") + ret = fn(handle, byref(c_currModel), byref(c_pendingModel)) + _nvmlCheckReturn(ret) + return [c_currModel.value, c_pendingModel.value] + +# added to API +def nvmlDeviceGetCurrentDriverModel(handle): + return nvmlDeviceGetDriverModel(handle)[0] + +# added to API +def nvmlDeviceGetPendingDriverModel(handle): + return nvmlDeviceGetDriverModel(handle)[1] + +# Added in 2.285 +def nvmlDeviceGetVbiosVersion(handle): + c_version = create_string_buffer(NVML_DEVICE_VBIOS_VERSION_BUFFER_SIZE) + fn = _nvmlGetFunctionPointer("nvmlDeviceGetVbiosVersion") + ret = fn(handle, c_version, c_uint(NVML_DEVICE_VBIOS_VERSION_BUFFER_SIZE)) + _nvmlCheckReturn(ret) + return c_version.value + +# Added in 2.285 +def nvmlDeviceGetComputeRunningProcesses(handle): + # first call to get the size + c_count = c_uint(0) + fn = _nvmlGetFunctionPointer("nvmlDeviceGetComputeRunningProcesses") + ret = fn(handle, 
byref(c_count), None) + + if (ret == NVML_SUCCESS): + # special case, no running processes + return [] + elif (ret == NVML_ERROR_INSUFFICIENT_SIZE): + # typical case + # oversize the array incase more processes are created + c_count.value = c_count.value * 2 + 5 + proc_array = c_nvmlProcessInfo_t * c_count.value + c_procs = proc_array() + + # make the call again + ret = fn(handle, byref(c_count), c_procs) + _nvmlCheckReturn(ret) + + procs = [] + for i in range(c_count.value): + # use an alternative struct for this object + obj = nvmlStructToFriendlyObject(c_procs[i]) + if (obj.usedGpuMemory == NVML_VALUE_NOT_AVAILABLE_ulonglong.value): + # special case for WDDM on Windows, see comment above + obj.usedGpuMemory = None + procs.append(obj) + + return procs + else: + # error case + raise NVMLError(ret) + +## Set functions +def nvmlUnitSetLedState(unit, color): + fn = _nvmlGetFunctionPointer("nvmlUnitSetLedState") + ret = fn(unit, _nvmlLedColor_t(color)) + _nvmlCheckReturn(ret) + return None + +def nvmlDeviceSetPersistenceMode(handle, mode): + fn = _nvmlGetFunctionPointer("nvmlDeviceSetPersistenceMode") + ret = fn(handle, _nvmlEnableState_t(mode)) + _nvmlCheckReturn(ret) + return None + +def nvmlDeviceSetComputeMode(handle, mode): + fn = _nvmlGetFunctionPointer("nvmlDeviceSetComputeMode") + ret = fn(handle, _nvmlComputeMode_t(mode)) + _nvmlCheckReturn(ret) + return None + +def nvmlDeviceSetEccMode(handle, mode): + fn = _nvmlGetFunctionPointer("nvmlDeviceSetEccMode") + ret = fn(handle, _nvmlEnableState_t(mode)) + _nvmlCheckReturn(ret) + return None + +def nvmlDeviceClearEccErrorCounts(handle, counterType): + fn = _nvmlGetFunctionPointer("nvmlDeviceClearEccErrorCounts") + ret = fn(handle, _nvmlEccCounterType_t(counterType)) + _nvmlCheckReturn(ret) + return None + +def nvmlDeviceSetDriverModel(handle, model): + fn = _nvmlGetFunctionPointer("nvmlDeviceSetDriverModel") + ret = fn(handle, _nvmlDriverModel_t(model)) + _nvmlCheckReturn(ret) + return None + +# Added in 2.285 
+def nvmlEventSetCreate(): + fn = _nvmlGetFunctionPointer("nvmlEventSetCreate") + eventSet = c_nvmlEventSet_t() + ret = fn(byref(eventSet)) + _nvmlCheckReturn(ret) + return eventSet + +# Added in 2.285 +def nvmlDeviceRegisterEvents(handle, eventTypes, eventSet): + fn = _nvmlGetFunctionPointer("nvmlDeviceRegisterEvents") + ret = fn(handle, c_ulonglong(eventTypes), eventSet) + _nvmlCheckReturn(ret) + return None + +# Added in 2.285 +def nvmlDeviceGetSupportedEventTypes(handle): + c_eventTypes = c_ulonglong() + fn = _nvmlGetFunctionPointer("nvmlDeviceGetSupportedEventTypes") + ret = fn(handle, byref(c_eventTypes)) + _nvmlCheckReturn(ret) + return c_eventTypes.value + +# Added in 2.285 +# raises NVML_ERROR_TIMEOUT exception on timeout +def nvmlEventSetWait(eventSet, timeoutms): + fn = _nvmlGetFunctionPointer("nvmlEventSetWait") + data = c_nvmlEventData_t() + ret = fn(eventSet, byref(data), c_uint(timeoutms)) + _nvmlCheckReturn(ret) + return data + +# Added in 2.285 +def nvmlEventSetFree(eventSet): + fn = _nvmlGetFunctionPointer("nvmlEventSetFree") + ret = fn(eventSet) + _nvmlCheckReturn(ret) + return None + +# Added in 2.285 +def nvmlEventDataGetPerformanceState(data): + fn = _nvmlGetFunctionPointer("nvmlEventDataGetPerformanceState") + pstate = _nvmlPstates_t() + ret = fn(byref(data), byref(pstate)) + _nvmlCheckReturn(ret) + return pstate.value + +# Added in 2.285 +def nvmlEventDataGetXidCriticalError(data): + fn = _nvmlGetFunctionPointer("nvmlEventDataGetXidCriticalError") + xid = c_uint() + ret = fn(byref(data), byref(xid)) + _nvmlCheckReturn(ret) + return xid.value + +# Added in 2.285 +def nvmlEventDataGetEccErrorCount(data): + fn = _nvmlGetFunctionPointer("nvmlEventDataGetEccErrorCount") + ecc = c_ulonglong() + ret = fn(byref(data), byref(ecc)) + _nvmlCheckReturn(ret) + return ecc.value + +# Added in 3.295 +def nvmlDeviceOnSameBoard(handle1, handle2): + fn = _nvmlGetFunctionPointer("nvmlDeviceOnSameBoard") + onSameBoard = c_int() + ret = fn(handle1, handle2, 
byref(onSameBoard)) + _nvmlCheckReturn(ret) + return (onSameBoard.value != 0) + +# Added in 3.295 +def nvmlDeviceGetCurrPcieLinkGeneration(handle): + fn = _nvmlGetFunctionPointer("nvmlDeviceGetCurrPcieLinkGeneration") + gen = c_uint() + ret = fn(handle, byref(gen)) + _nvmlCheckReturn(ret) + return gen.value + +# Added in 3.295 +def nvmlDeviceGetMaxPcieLinkGeneration(handle): + fn = _nvmlGetFunctionPointer("nvmlDeviceGetMaxPcieLinkGeneration") + gen = c_uint() + ret = fn(handle, byref(gen)) + _nvmlCheckReturn(ret) + return gen.value + +# Added in 3.295 +def nvmlDeviceGetCurrPcieLinkWidth(handle): + fn = _nvmlGetFunctionPointer("nvmlDeviceGetCurrPcieLinkWidth") + width = c_uint() + ret = fn(handle, byref(width)) + _nvmlCheckReturn(ret) + return width.value + +# Added in 3.295 +def nvmlDeviceGetMaxPcieLinkWidth(handle): + fn = _nvmlGetFunctionPointer("nvmlDeviceGetMaxPcieLinkWidth") + width = c_uint() + ret = fn(handle, byref(width)) + _nvmlCheckReturn(ret) + return width.value + + + diff --git a/gpu/nvidia/nvidia-ml-py-3.295.00/setup.py b/gpu/nvidia/nvidia-ml-py-3.295.00/setup.py new file mode 100644 index 00000000..ab1eddee --- /dev/null +++ b/gpu/nvidia/nvidia-ml-py-3.295.00/setup.py @@ -0,0 +1,32 @@ +from distutils.core import setup +from sys import version + +# earlier versions don't support all classifiers +if version < '2.2.3': + from distutils.dist import DistributionMetadata + DistributionMetadata.classifiers = None + DistributionMetadata.download_url = None + +setup(name='nvidia-ml-py', + version='3.295.00', + description='Python Bindings for the NVIDIA Management Library', + py_modules=['pynvml', 'nvidia_smi'], + package_data=['Example.txt'], + license="BSD", + url="http://www.nvidia.com/", + author="NVIDIA Corporation", + author_email="nvml-bindings@nvidia.com", + classifiers=[ + 'Development Status :: 5 - Production/Stable', + 'Intended Audience :: Developers', + 'Intended Audience :: System Administrators', + 'License :: OSI Approved :: BSD License', + 
'Operating System :: Microsoft :: Windows', + 'Operating System :: POSIX :: Linux', + 'Programming Language :: Python', + 'Topic :: Software Development :: Libraries :: Python Modules', + 'Topic :: System :: Hardware', + 'Topic :: System :: Systems Administration', + ], + ) + diff --git a/gpu/nvidia/python_modules/nvidia.py b/gpu/nvidia/python_modules/nvidia.py index a023c3a0..84957bb4 100644 --- a/gpu/nvidia/python_modules/nvidia.py +++ b/gpu/nvidia/python_modules/nvidia.py @@ -90,16 +90,16 @@ def gpu_device_handler(name): elif (metric == 'ecc_mode'): try: ecc_mode = nvmlDeviceGetPendingEccMode(gpu_device) - if (ecc_mode == 0): + if (NVML_FEATURE_DISABLED == ecc_mode): return "OFF" - elif (ecc_mode == 1): + elif (NVML_FEATURE_ENABLED == ecc_mode): return "ON" else: return "UNKNOWN" except NVMLError, nvmlError: if NVML_ERROR_NOT_SUPPORTED == nvmlError.value: return 'N/A' - elif (metric == 'perf_state'): + elif (metric == 'perf_state' or metric == 'performance_state'): state = nvmlDeviceGetPerformanceState(gpu_device) try: int(state) @@ -112,14 +112,26 @@ def gpu_device_handler(name): return nvmlDeviceGetClockInfo(gpu_device, NVML_CLOCK_SM) elif (metric == 'mem_speed'): return nvmlDeviceGetClockInfo(gpu_device, NVML_CLOCK_MEM) - elif (metric == 'power_usage'): - return nvmlDeviceGetPowerUsage(gpu_device) elif (metric == 'max_graphics_speed'): return nvmlDeviceGetMaxClockInfo(gpu_device, NVML_CLOCK_GRAPHICS) elif (metric == 'max_sm_speed'): return nvmlDeviceGetMaxClockInfo(gpu_device, NVML_CLOCK_SM) elif (metric == 'max_mem_speed'): return nvmlDeviceGetMaxClockInfo(gpu_device, NVML_CLOCK_MEM) + elif (metric == 'power_usage'): + return nvmlDeviceGetPowerUsage(gpu_device) + elif (metric == 'serial'): + return nvmlDeviceGetSerial(gpu_device) + elif (metric == 'power_man_mode'): + pow_man_mode = nvmlDeviceGetPowerManagementMode(gpu_device) + if (NVML_FEATURE_DISABLED == pow_man_mode): + return "OFF" + elif (NVML_FEATURE_ENABLED == pow_man_mode): + return "ON" + else: + 
return "UNKNOWN" + elif (metric == 'power_man_limit'): + return nvmlDeviceGetPowerManagementLimit(gpu_device) else: print "Handler for %s not implemented, please fix in gpu_device_handler()" % metric os._exit(1) @@ -159,6 +171,14 @@ def metric_init(params): build_descriptor('gpu%s_fan' % i, gpu_device_handler, default_time_max, 'uint', '%', 'both', '%u', 'GPU%s Fan Speed' %i, 'gpu') build_descriptor('gpu%s_power_usage' % i, gpu_device_handler, default_time_max, 'uint', 'watts', 'both', '%u', 'GPU%s Power Usage' % i, 'gpu') + # Added for version 2.285 + build_descriptor('gpu%s_max_graphics_speed' % i, gpu_device_handler, default_time_max, 'uint', 'MHz', 'both', '%u', 'GPU%s Max Graphics Speed' % i, 'gpu') + build_descriptor('gpu%s_max_sm_speed' % i, gpu_device_handler, default_time_max, 'uint', 'MHz', 'both', '%u', 'GPU%s Max SM Speed' % i, 'gpu') + build_descriptor('gpu%s_max_mem_speed' % i, gpu_device_handler, default_time_max, 'uint', 'MHz', 'both', '%u', 'GPU%s Max Memory Speed' % i, 'gpu') + build_descriptor('gpu%s_serial' % i, gpu_device_handler, default_time_max, 'string', '', 'zero', '%s', 'GPU%s Serial' % i, 'gpu') + build_descriptor('gpu%s_power_man_mode' % i, gpu_device_handler, default_time_max, 'string', '', 'zero', '%s', 'GPU%s Power Management' % i, 'gpu') + build_descriptor('gpu%s_power_man_limit' % i, gpu_device_handler, default_time_max, 'string', '', 'zero', '%s', 'GPU%s Power Management Limit' % i, 'gpu') + return descriptors def metric_cleanup(): diff --git a/jenkins/README.mkdn b/jenkins/README.mkdn new file mode 100644 index 00000000..33309f55 --- /dev/null +++ b/jenkins/README.mkdn @@ -0,0 +1,29 @@ +jenkins +======= + +python module for ganglia 3.1. 
+ +## Metrics +* Number of total executors +* Number of busy executors +* Length of the queue +* Total number of jobs +* Number of jobs with blue status +* Number of jobs with red status +* Number of jobs with yellow status +* Number of jobs with grey status +* Number of jobs with aborted status +* Number of jobs with not-built status +* Number of jobs with disabled status + +## Parameters + * base_url (The URL to query for Jenkins statistics. Default: 'http://127.0.0.1:8080' + +## Notes + * This has been tested with: + - python 2.7.1 on Mac OS X + - python 2.7.3 on Ubuntu 12.04 + +## AUTHORS + +Andreas Lappe diff --git a/jenkins/conf.d/jenkins.pyconf b/jenkins/conf.d/jenkins.pyconf new file mode 100644 index 00000000..8f64efbb --- /dev/null +++ b/jenkins/conf.d/jenkins.pyconf @@ -0,0 +1,83 @@ +# + +modules { + module { + name = 'jenkins' + language = 'python' + + param base_url { + value = 'http://127.0.0.1:8080' + } + } +} + +collection_group { + collect_every = 10 + time_threshold = 20 + + metric { + name = 'jenkins_overallload_busy_executors' + title = 'Number of busy executors on master and slaves' + value_threshold = 1.0 + } + + metric { + name = 'jenkins_overallload_queue_length' + title = 'Length of the queue on master and slaves' + value_threshold = 1.0 + } + + metric { + name = 'jenkins_overallload_total_executors' + title = 'Number of executors on master and slaves' + value_threshold = 1.0 + } + + metric { + name = 'jenkins_jobs_total' + title = 'Total number of jobs' + value_threshold = 1.0 + } + + metric { + name = 'jenkins_jobs_blue' + title = 'Number of jobs with status blue' + value_threshold = 1.0 + } + + metric { + name = 'jenkins_jobs_red' + title = 'Number of jobs with status red' + value_threshold = 1.0 + } + + metric { + name = 'jenkins_jobs_yellow' + title = 'Number of jobs with status yellow' + value_threshold = 1.0 + } + + metric { + name = 'jenkins_jobs_grey' + title = 'Number of jobs with status grey' + value_threshold = 1.0 + } + + 
metric { + name = 'jenkins_jobs_aborted' + title = 'Number of jobs with status aborted' + value_threshold = 1.0 + } + + metric { + name = 'jenkins_jobs_notbuilt' + title = 'Number of jobs with status notbuilt' + value_threshold = 1.0 + } + + metric { + name = 'jenkins_jobs_disabled' + title = 'Number of jobs with status disabled' + value_threshold = 1.0 + } +} diff --git a/jenkins/python_modules/jenkins.py b/jenkins/python_modules/jenkins.py new file mode 100644 index 00000000..e7cef064 --- /dev/null +++ b/jenkins/python_modules/jenkins.py @@ -0,0 +1,254 @@ +### This script reports jenkins metrics to ganglia. + +### License to use, modify, and distribute under the GPL +### http://www.gnu.org/licenses/gpl.txt +import logging +import os +import subprocess +import sys +import threading +import time +import traceback +import urllib2 +import json + +logging.basicConfig(level=logging.ERROR) + +_Worker_Thread = None + +class UpdateJenkinsThread(threading.Thread): + + def __init__(self, params): + threading.Thread.__init__(self) + self.running = False + self.shuttingdown = False + self.metrics = {} + self.settings = {} + self.refresh_rate = 60 + self.base_url = params['base_url'] + self._metrics_lock = threading.Lock() + self._settings_lock = threading.Lock() + + def shutdown(self): + self.shuttingdown = True + if not self.running: + return + self.join() + + def run(self): + global _Lock + + self.running = True + + while not self.shuttingdown: + time.sleep(self.refresh_rate) + self.refresh_metrics() + + self.running = False + + @staticmethod + def _get_jenkins_statistics(url): + + url += '/api/json' + url += '?tree=jobs[color],overallLoad[busyExecutors[min[latest]],queueLength[min[latest]],totalExecutors[min[latest]]]' + + c = urllib2.urlopen(url) + json_data = c.read() + c.close() + + data = json.loads(json_data) + + result = {} + result['jenkins_overallload_busy_executors'] = data['overallLoad']['busyExecutors']['min']['latest'] + 
result['jenkins_overallload_queue_length'] = data['overallLoad']['queueLength']['min']['latest'] + result['jenkins_overallload_total_executors'] = data['overallLoad']['totalExecutors']['min']['latest'] + result['jenkins_jobs_total'] = len(data['jobs']) + result['jenkins_jobs_red'] = result['jenkins_jobs_yellow'] = result['jenkins_jobs_grey'] = result['jenkins_jobs_disabled'] = result['jenkins_jobs_aborted'] = result['jenkins_jobs_notbuilt'] = result['jenkins_jobs_blue'] = 0 + + # Possible values: http://javadoc.jenkins-ci.org/hudson/model/BallColor.html + colors = ['red', 'yellow', 'grey', 'disabled', 'aborted', 'notbuilt', 'blue'] + for color in colors: + result['jenkins_jobs_' + color] = 0 + for job in data['jobs']: + color = job['color'] + for c in colors: + if color == c or color == c + '_anime': + result['jenkins_jobs_' + c] += 1 + return result + + def refresh_metrics(self): + logging.debug('refresh metrics') + + try: + logging.debug(' opening URL: ' + str(self.base_url)) + data = UpdateJenkinsThread._get_jenkins_statistics(self.base_url) + except: + logging.warning('error refreshing metrics') + logging.warning(traceback.print_exc(file=sys.stdout)) + + try: + self._metrics_lock.acquire() + self.metrics = {} + for k, v in data.items(): + self.metrics[k] = v + except: + logging.warning('error refreshing metrics') + logging.warning(traceback.print_exc(file=sys.stdout)) + return False + + finally: + self._metrics_lock.release() + + if not self.metrics: + logging.warning('error refreshing metrics') + return False + + logging.debug('success refreshing metrics') + logging.debug('metrics: ' + str(self.metrics)) + + return True + + def metric_of(self, name): + logging.debug('getting metric: ' + name) + + try: + if name in self.metrics: + try: + self._metrics_lock.acquire() + logging.debug('metric: %s = %s' % (name, self.metrics[name])) + return self.metrics[name] + finally: + self._metrics_lock.release() + except: + logging.warning('failed to fetch ' + name) + return 
0 + + def setting_of(self, name): + logging.debug('getting setting: ' + name) + + try: + if name in self.settings: + try: + self._settings_lock.acquire() + logging.debug('setting: %s = %s' % (name, self.settings[name])) + return self.settings[name] + finally: + self._settings_lock.release() + except: + logging.warning('failed to fetch ' + name) + return 0 + +def metric_init(params): + logging.debug('init: ' + str(params)) + global _Worker_Thread + + METRIC_DEFAULTS = { + 'units': 'jobs', + 'groups': 'jenkins', + 'slope': 'both', + 'value_type': 'uint', + 'format': '%d', + 'description': '', + 'call_back': metric_of + } + + descriptions = dict( + jenkins_overallload_busy_executors = { + 'value_type': 'float', + 'format': '%.3f', + 'units': 'executors', + 'description': 'Number of busy executors (master and slaves)'}, + jenkins_overallload_queue_length = { + 'value_type': 'float', + 'format': '%.3f', + 'units': 'queued items', + 'description': 'Length of the queue (master and slaves)'}, + jenkins_overallload_total_executors = { + 'value_type': 'float', + 'format': '%.3f', + 'units': 'executors', + 'description': 'Number of executors (master and slaves)'}, + jenkins_jobs_total = { + 'description': 'Total number of jobs'}, + jenkins_jobs_blue = { + 'description': 'Blue jobs'}, + jenkins_jobs_red = { + 'description': 'Red jobs'}, + jenkins_jobs_yellow = { + 'description': 'Yellow jobs'}, + jenkins_jobs_grey = { + 'description': 'Grey jobs'}, + jenkins_jobs_disabled = { + 'description': 'Disabled jobs'}, + jenkins_jobs_aborted = { + 'description': 'Aborted jobs'}, + jenkins_jobs_notbuilt = { + 'description': 'Not-built jobs'}) + + if _Worker_Thread is not None: + raise Exception('Worker thread already exists') + + _Worker_Thread = UpdateJenkinsThread(params) + _Worker_Thread.refresh_metrics() + _Worker_Thread.start() + + descriptors = [] + + for name, desc in descriptions.iteritems(): + d = desc.copy() + d['name'] = str(name) + [ d.setdefault(key, METRIC_DEFAULTS[key]) 
for key in METRIC_DEFAULTS.iterkeys() ] + descriptors.append(d) + return descriptors + +def metric_of(name): + global _Worker_Thread + return _Worker_Thread.metric_of(name) + +def setting_of(name): + global _Worker_Thread + return _Worker_Thread.setting_of(name) + +def metric_cleanup(): + global _Worker_Thread + if _Worker_Thread is not None: + _Worker_Thread.shutdown() + logging.shutdown() + pass + +if __name__ == '__main__': + from optparse import OptionParser + + try: + logging.debug('running from the cmd line') + parser = OptionParser() + parser.add_option('-u', '--URL', dest='base_url', default='http://127.0.0.1:8080', help='Base-URL for jenkins api (default: http://127.0.0.1:8080)') + parser.add_option('-q', '--quiet', dest='quiet', action='store_true', default=False) + parser.add_option('-d', '--debug', dest='debug', action='store_true', default=False) + + (options, args) = parser.parse_args() + + descriptors = metric_init({ + 'base_url': options.base_url, + }) + + if options.debug: + from pprint import pprint + pprint(descriptors) + + for d in descriptors: + v = d['call_back'](d['name']) + + if not options.quiet: + print ' {0}: {1} {2} [{3}]' . format(d['name'], v, d['units'], d['description']) + + os._exit(1) + + except KeyboardInterrupt: + time.sleep(0.2) + os._exit(1) + except StandardError: + traceback.print_exc() + os._exit(1) + finally: + metric_cleanup() diff --git a/memcached_maxage/README.md b/memcached_maxage/README.md new file mode 100644 index 00000000..7ba8fcef --- /dev/null +++ b/memcached_maxage/README.md @@ -0,0 +1,21 @@ +python-memcached-gmond +====================== + + +This is a Python Gmond module for Memcached, compatible with both Python 2 and +3. In addition to the usual datapoints provided by "stats", this module +aggregates max age metrics from "stats items". All metrics are available in a +"memcached" collection group. 
+ +If you've installed ganglia at the standard locations, you should be able to +install this module by copying `memcached.pyconf` to `/etc/ganglia/conf.d` and +`memcached.py`, `memcached_metrics.py`, and 'every.py' to +`/usr/lib/ganglia/python_modules`. The memcached server's host and port can be +specified in the configuration in memcached.pyconf. + +For more information, see the section [Gmond Python metric modules][1] in the +Ganglia documentation. + +Author: Ori Livneh + + [1]: http://sourceforge.net/apps/trac/ganglia/wiki/ganglia_gmond_python_modules diff --git a/memcached_maxage/conf.d/memcached.pyconf b/memcached_maxage/conf.d/memcached.pyconf new file mode 100644 index 00000000..1552487a --- /dev/null +++ b/memcached_maxage/conf.d/memcached.pyconf @@ -0,0 +1,133 @@ +# Gmond configuration for memcached metric module +# Install to /etc/ganglia/conf.d + +modules { + module { + name = "memcached" + language = "python" + param host { + value = "127.0.0.1" + } + param port { + value = "11211" + } + } +} + +collection_group { + collect_every = 10 + time_threshold = 60 + + metric { + name = "curr_items" + title = "curr_items" + } + metric { + name = "total_items" + title = "total_items" + } + metric { + name = "bytes" + title = "bytes" + } + metric { + name = "curr_connections" + title = "curr_connections" + } + metric { + name = "total_connections" + title = "total_connections" + } + metric { + name = "connection_structures" + title = "connection_structures" + } + metric { + name = "cmd_get" + title = "cmd_get" + } + metric { + name = "cmd_set" + title = "cmd_set" + } + metric { + name = "get_hits" + title = "get_hits" + } + metric { + name = "get_misses" + title = "get_misses" + } + metric { + name = "delete_hits" + title = "delete_hits" + } + metric { + name = "delete_misses" + title = "delete_misses" + } + metric { + name = "incr_hits" + title = "incr_hits" + } + metric { + name = "incr_misses" + title = "incr_misses" + } + metric { + name = "decr_hits" + 
title = "decr_hits" + } + metric { + name = "decr_misses" + title = "decr_misses" + } + metric { + name = "cas_hits" + title = "cas_hits" + } + metric { + name = "cas_misses" + title = "cas_misses" + } + metric { + name = "evictions" + title = "evictions" + } + metric { + name = "bytes_read" + title = "bytes_read" + } + metric { + name = "bytes_written" + title = "bytes_written" + } + metric { + name = "limit_maxbytes" + title = "limit_maxbytes" + } + metric { + name = "threads" + title = "threads" + } + metric { + name = "conn_yields" + title = "conn_yields" + } + metric { + name = "age_mean" + title = "age_mean" + } + metric { + name = "age_median" + title = "age_median" + } + metric { + name = "age_min" + title = "age_min" + } + metric { + name = "age_max" + title = "age_max" + } +} diff --git a/memcached_maxage/python_modules/every.py b/memcached_maxage/python_modules/every.py new file mode 100644 index 00000000..117bf6ba --- /dev/null +++ b/memcached_maxage/python_modules/every.py @@ -0,0 +1,70 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +""" + Every + + Python decorator; decorated function is called on a set interval. + + :author: Ori Livneh + :copyright: (c) 2012 Wikimedia Foundation + :license: GPL, version 2 or later +""" +from __future__ import division +from datetime import timedelta +import signal +import sys +import threading + + +# pylint: disable=C0111, W0212, W0613, W0621 + + +__all__ = ('every', ) + + +def total_seconds(delta): + """ + Get total seconds of timedelta object. Equivalent to + timedelta.total_seconds(), which was introduced in Python 2.7. + """ + us = (delta.microseconds + (delta.seconds + delta.days * 24 * 3600) * 10**6) + return us / 1000000.0 + + +def handle_sigint(signal, frame): + """ + Attempt to kill all child threads and exit. Installing this as a sigint + handler allows the program to run indefinitely if unmolested, but still + terminate gracefully on Ctrl-C. 
+ """ + for thread in threading.enumerate(): + if thread.isAlive(): + thread._Thread__stop() + sys.exit(0) + + +def every(*args, **kwargs): + """ + Decorator; calls decorated function on a set interval. Arguments to every() + are passed on to the constructor of datetime.timedelta(), which accepts the + following arguments: days, seconds, microseconds, milliseconds, minutes, + hours, weeks. This decorator is intended for functions with side effects; + the return value is discarded. + """ + interval = total_seconds(timedelta(*args, **kwargs)) + def decorator(func): + def poll(): + func() + threading.Timer(interval, poll).start() + poll() + return func + return decorator + + +def join(): + """Pause until sigint""" + signal.signal(signal.SIGINT, handle_sigint) + signal.pause() + + +every.join = join diff --git a/memcached_maxage/python_modules/memcached.py b/memcached_maxage/python_modules/memcached.py new file mode 100644 index 00000000..a4db94de --- /dev/null +++ b/memcached_maxage/python_modules/memcached.py @@ -0,0 +1,137 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +""" + Python Gmond Module for Memcached + + This module declares a "memcached" collection group. For more information, + including installation instructions, see: + + http://sourceforge.net/apps/trac/ganglia/wiki/ganglia_gmond_python_modules + + When invoked as a standalone script, this module will attempt to use the + default configuration to query memcached every 10 seconds and print out the + results. + + Based on a suggestion from Domas Mitzuas, this module also reports the min, + max, median and mean of the 'age' metric across slabs, as reported by the + "stats items" memcached command. 
+ + :copyright: (c) 2012 Wikimedia Foundation + :author: Ori Livneh + :license: GPL, v2 or later +""" +from __future__ import division, print_function + +from threading import Timer + +import logging +import os +import pprint +import sys +import telnetlib + +logging.basicConfig(level=logging.DEBUG) + +# Hack: load a file from the current module's directory, because gmond doesn't +# know how to work with Python packages. (To be fair, neither does Python.) +sys.path.insert(0, os.path.dirname(__file__)) +from memcached_metrics import descriptors +from every import every +sys.path.pop(0) + + +# Default configuration +config = { + 'host' : '127.0.0.1', + 'port' : 11211, +} + +stats = {} +client = telnetlib.Telnet() + + +def median(values): + """Calculate median of series""" + values = sorted(values) + length = len(values) + mid = length // 2 + if (length % 2): + return values[mid] + else: + return (values[mid - 1] + values[mid]) / 2 + + +def mean(values): + """Calculate mean (average) of series""" + return sum(values) / len(values) + + +def cast(value): + """Cast value to float or int, if possible""" + try: + return float(value) if '.' 
in value else int(value) + except ValueError: + return value + + +def query(command): + """Send `command` to memcached and stream response""" + client.write(command.encode('ascii') + b'\n') + while True: + line = client.read_until(b'\r\n').decode('ascii').strip() + if not line or line == 'END': + break + (_, metric, value) = line.split(None, 2) + yield metric, cast(value) + + +@every(seconds=10) +def update_stats(): + """Refresh stats by polling memcached server""" + try: + client.open(**config) + stats.update(query('stats')) + ages = [v for k, v in query('stats items') if k.endswith('age')] + if not ages: + return {'age_min': 0, 'age_max': 0, 'age_mean': 0, 'age_median': 0} + stats.update({ + 'age_min' : min(ages), + 'age_max' : max(ages), + 'age_mean' : mean(ages), + 'age_median' : median(ages) + }) + finally: + client.close() + logging.info("Updated stats: %s", pprint.pformat(stats, indent=4)) + + +# +# Gmond Interface +# + +def metric_handler(name): + """Get the value for a particular metric; part of Gmond interface""" + return stats[name] + + +def metric_init(params): + """Initialize; part of Gmond interface""" + print('[memcached] memcached stats') + config.update(params) + for metric in descriptors: + metric['call_back'] = metric_handler + return descriptors + + +def metric_cleanup(): + """Teardown; part of Gmond interface""" + client.close() + + +if __name__ == '__main__': + # When invoked as standalone script, run a self-test by querying each + # metric descriptor and printing it out. 
+ for metric in metric_init({}): + value = metric['call_back'](metric['name']) + print(( "%s => " + metric['format'] ) % ( metric['name'], value )) + every.join() diff --git a/memcached_maxage/python_modules/memcached_metrics.py b/memcached_maxage/python_modules/memcached_metrics.py new file mode 100644 index 00000000..9677a94d --- /dev/null +++ b/memcached_maxage/python_modules/memcached_metrics.py @@ -0,0 +1,284 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- + +descriptors = [ { + "slope": "both", + "time_max": 60, + "description": "Current number of items stored by this instance", + "format": "%d", + "value_type": "uint", + "groups": "memcached", + "units": "items", + "name": "curr_items" + }, + { + "slope": "positive", + "time_max": 60, + "description": "Total number of items stored during the life of this instance", + "format": "%d", + "value_type": "uint", + "groups": "memcached", + "units": "items", + "name": "total_items" + }, + { + "slope": "both", + "time_max": 60, + "description": "Current number of bytes used by this server to store items", + "format": "%d", + "value_type": "uint", + "groups": "memcached", + "units": "items", + "name": "bytes" + }, + { + "slope": "both", + "time_max": 60, + "description": "Current number of open connections", + "format": "%d", + "value_type": "uint", + "groups": "memcached", + "units": "items", + "name": "curr_connections" + }, + { + "slope": "positive", + "time_max": 60, + "description": "Total number of connections opened since the server started running", + "format": "%d", + "value_type": "uint", + "groups": "memcached", + "units": "items", + "name": "total_connections" + }, + { + "slope": "both", + "time_max": 60, + "description": "Number of connection structures allocated by the server", + "format": "%d", + "value_type": "uint", + "groups": "memcached", + "units": "items", + "name": "connection_structures" + }, + { + "slope": "positive", + "time_max": 60, + "description": "Total number of retrieval requests 
(get operations)", + "format": "%d", + "value_type": "uint", + "groups": "memcached", + "units": "items", + "name": "cmd_get" + }, + { + "slope": "positive", + "time_max": 60, + "description": "Total number of storage requests (set operations)", + "format": "%d", + "value_type": "uint", + "groups": "memcached", + "units": "items", + "name": "cmd_set" + }, + { + "slope": "positive", + "time_max": 60, + "description": "Number of keys that have been requested and found present", + "format": "%d", + "value_type": "uint", + "groups": "memcached", + "units": "items", + "name": "get_hits" + }, + { + "slope": "positive", + "time_max": 60, + "description": "Number of items that have been requested and not found", + "format": "%d", + "value_type": "uint", + "groups": "memcached", + "units": "items", + "name": "get_misses" + }, + { + "slope": "positive", + "time_max": 60, + "description": "Number of keys that have been deleted and found present", + "format": "%d", + "value_type": "uint", + "groups": "memcached", + "units": "items", + "name": "delete_hits" + }, + { + "slope": "positive", + "time_max": 60, + "description": "Number of items that have been delete and not found", + "format": "%d", + "value_type": "uint", + "groups": "memcached", + "units": "items", + "name": "delete_misses" + }, + { + "slope": "positive", + "time_max": 60, + "description": "Number of keys that have been incremented and found present", + "format": "%d", + "value_type": "uint", + "groups": "memcached", + "units": "items", + "name": "incr_hits" + }, + { + "slope": "positive", + "time_max": 60, + "description": "Number of items that have been incremented and not found", + "format": "%d", + "value_type": "uint", + "groups": "memcached", + "units": "items", + "name": "incr_misses" + }, + { + "slope": "positive", + "time_max": 60, + "description": "Number of keys that have been decremented and found present", + "format": "%d", + "value_type": "uint", + "groups": "memcached", + "units": "items", + "name": 
"decr_hits" + }, + { + "slope": "positive", + "time_max": 60, + "description": "Number of items that have been decremented and not found", + "format": "%d", + "value_type": "uint", + "groups": "memcached", + "units": "items", + "name": "decr_misses" + }, + { + "slope": "positive", + "time_max": 60, + "description": "Number of keys that have been compared and swapped and found present", + "format": "%d", + "value_type": "uint", + "groups": "memcached", + "units": "items", + "name": "cas_hits" + }, + { + "slope": "positive", + "time_max": 60, + "description": "Number of items that have been compared and swapped and not found", + "format": "%d", + "value_type": "uint", + "groups": "memcached", + "units": "items", + "name": "cas_misses" + }, + { + "slope": "positive", + "time_max": 60, + "description": "Number of valid items removed from cache to free memory for new items", + "format": "%d", + "value_type": "uint", + "groups": "memcached", + "units": "items", + "name": "evictions" + }, + { + "slope": "positive", + "time_max": 60, + "description": "Total number of bytes read by this server from network", + "format": "%d", + "value_type": "uint", + "groups": "memcached", + "units": "items", + "name": "bytes_read" + }, + { + "slope": "positive", + "time_max": 60, + "description": "Total number of bytes sent by this server to network", + "format": "%d", + "value_type": "uint", + "groups": "memcached", + "units": "items", + "name": "bytes_written" + }, + { + "slope": "zero", + "time_max": 60, + "description": "Number of bytes this server is permitted to use for storage", + "format": "%d", + "value_type": "uint", + "groups": "memcached", + "units": "items", + "name": "limit_maxbytes" + }, + { + "slope": "positive", + "time_max": 60, + "description": "Number of worker threads requested", + "format": "%d", + "value_type": "uint", + "groups": "memcached", + "units": "items", + "name": "threads" + }, + { + "slope": "positive", + "time_max": 60, + "description": "Number of yields 
for connections", + "format": "%d", + "value_type": "uint", + "groups": "memcached", + "units": "items", + "name": "conn_yields" + }, + { + "slope": "positive", + "time_max": 60, + "description": "Age of the oldest item within slabs (mean)", + "format": "%.2f", + "value_type": "float", + "groups": "memcached", + "units": "items", + "name": "age_mean" + }, + { + "slope": "positive", + "time_max": 60, + "description": "Age of the oldest item within slabs (median)", + "format": "%.2f", + "value_type": "float", + "groups": "memcached", + "units": "items", + "name": "age_median" + }, + { + "slope": "positive", + "time_max": 60, + "description": "Age of the oldest item within slabs (min)", + "format": "%.2f", + "value_type": "float", + "groups": "memcached", + "units": "items", + "name": "age_min" + }, + { + "slope": "positive", + "time_max": 60, + "description": "The age of the oldest item within slabs (max)", + "format": "%.2f", + "value_type": "float", + "groups": "memcached", + "units": "items", + "name": "age_max" + } +] diff --git a/netapp_api/README.mkdn b/netapp_api/README.mkdn new file mode 100644 index 00000000..d062a179 --- /dev/null +++ b/netapp_api/README.mkdn @@ -0,0 +1,19 @@ +NetApp Filer API metrics +======================== + +This is a GMOND Python Module that gathers metrics from NetApp appliances via the Netapp Data ONTAP APIs. +The API allows counter access to many more metrics than available through SNMP. + +This module currently gathers per volume Read/Write/Average IOPs and Latency and handles multiple filers. + +## DEPENDS + * NetApp Manageability SDK 5.0 (download from now.netapp.com to /opt/netapp) + +## USAGE + * Save the netapp_api.pyconf into /etc/ganglia/conf.d + * Save the netapp_api.py into your ganglia python module dir eg: /usr/lib/ganglia/python_modules. + * Update the Username, password, IP and filer name. + * Restart gmond and the volume latency & iop metrics should appear in ganglia. 
+ +## AUTHOR + * Author: Evan Fraser <evan.fraser@trademe.co.nz> diff --git a/netapp_api/conf.d/netapp_api.pyconf b/netapp_api/conf.d/netapp_api.pyconf new file mode 100644 index 00000000..906f3c8f --- /dev/null +++ b/netapp_api/conf.d/netapp_api.pyconf @@ -0,0 +1,18 @@ +modules { + module { + name = "netapp_api" + language = "python" + } +} +#/* Collection groups for the +# example python module */ +collection_group { + collect_every = 15 + time_threshold = 70 + metric { + name_match = "(.+)latency" + } + metric { + name_match = "(.+)ops" + } +} diff --git a/netapp_api/python_modules/netapp_api.py b/netapp_api/python_modules/netapp_api.py new file mode 100755 index 00000000..50089c16 --- /dev/null +++ b/netapp_api/python_modules/netapp_api.py @@ -0,0 +1,351 @@ +#!/usr/bin/python +#Name: netapp_api.py +#Desc: Uses Netapp Data Ontap API to get per volume latency & iops metrics. Download the Manageability SDK from now.netapp.com +#Author: Evan Fraser +#Date: 13/08/2012 + +import sys +import time +import pprint +import unicodedata +import os + +sys.path.append("/opt/netapp/lib/python/NetApp") +from NaServer import * + +descriptors = list() +params = {} +filerdict = {} +FASMETRICS = { + 'time' : 0, + 'data' : {} +} +LAST_FASMETRICS = dict(FASMETRICS) +#This is the minimum interval between querying the filer for metrics +FASMETRICS_CACHE_MAX = 10 + +def get_metrics(name): + global FASMETRICS, LAST_FASMETRICS, FASMETRICS_CACHE_MAX, params + max_records = 10 + metrics = {} + if (time.time() - FASMETRICS['time']) > FASMETRICS_CACHE_MAX: + + for filer in filerdict.keys(): + s = NaServer(filerdict[filer]['ipaddr'], 1, 3) + out = s.set_transport_type('HTTPS') + if (out and out.results_errno() != 0) : + r = out.results_reason() + print ("Connection to filer failed: " + r + "\n") + sys.exit(2) + + out = s.set_style('LOGIN') + if (out and out.results_errno() != 0) : + r = out.results_reason() + print ("Connection to filer failed: " + r + "\n") + sys.exit(2) + out = 
s.set_admin_user(filerdict[filer]['user'], filerdict[filer]['password']) + perf_in = NaElement("perf-object-get-instances-iter-start") + #Hard coding volume object for testing + obj_name = "volume" + perf_in.child_add_string("objectname", obj_name) + #Create object of type counters + counters = NaElement("counters") + #Add counter names to the object + counters.child_add_string("counter", "total_ops") + counters.child_add_string("counter", "avg_latency") + counters.child_add_string("counter", "read_ops") + counters.child_add_string("counter", "read_latency") + counters.child_add_string("counter", "write_ops") + counters.child_add_string("counter", "write_latency") + + perf_in.child_add(counters) + + #Invoke API + out = s.invoke_elem(perf_in) + + if(out.results_status() == "failed"): + print(out.results_reason() + "\n") + sys.exit(2) + + iter_tag = out.child_get_string("tag") + num_records = 1 + + filername = filerdict[filer]['name'] + + while(int(num_records) != 0): + perf_in = NaElement("perf-object-get-instances-iter-next") + perf_in.child_add_string("tag", iter_tag) + perf_in.child_add_string("maximum", max_records) + out = s.invoke_elem(perf_in) + + if(out.results_status() == "failed"): + print(out.results_reason() + "\n") + sys.exit(2) + + num_records = out.child_get_int("records") + + if(num_records > 0) : + instances_list = out.child_get("instances") + instances = instances_list.children_get() + + for inst in instances: + inst_name = unicodedata.normalize('NFKD',inst.child_get_string("name")).encode('ascii','ignore') + counters_list = inst.child_get("counters") + counters = counters_list.children_get() + + for counter in counters: + counter_name = unicodedata.normalize('NFKD',counter.child_get_string("name")).encode('ascii','ignore') + counter_value = counter.child_get_string("value") + counter_unit = counter.child_get_string("unit") + metrics[filername + '_vol_' + inst_name + '_' + counter_name] = float(counter_value) + # update cache + LAST_FASMETRICS = 
dict(FASMETRICS) + FASMETRICS = { + 'time': time.time(), + 'data': metrics + } + + + else: + metrics = FASMETRICS['data'] + #print name + #calculate change in values and return + if 'total_ops' in name: + try: + delta = float(FASMETRICS['data'][name] - LAST_FASMETRICS['data'][name])/(FASMETRICS['time'] - LAST_FASMETRICS['time']) + if delta < 0: + print "Less than 0" + delta = 0 + except StandardError: + delta = 0 + #This is the Operations per second + return delta + + elif 'avg_latency' in name: + try: + #T1 and T2 + #(T2_lat - T1_lat) / (T2_ops - T1_ops) + #Find the metric name of the base counter + total_ops_name = name.replace('avg_latency', 'total_ops') + #Calculate latency in time (div 100 to change to ms) + return float((FASMETRICS['data'][name] - LAST_FASMETRICS['data'][name]) / (FASMETRICS['data'][total_ops_name] -LAST_FASMETRICS['data'][total_ops_name])) / 100 + except StandardError: + return 0 + elif 'read_ops' in name: + + try: + delta = float(FASMETRICS['data'][name] - LAST_FASMETRICS['data'][name])/(FASMETRICS['time'] - LAST_FASMETRICS['time']) + if delta < 0: + print "Less than 0" + delta = 0 + except StandardError: + delta = 0 + return delta + + elif 'read_latency' in name: + try: + read_ops_name = name.replace('read_latency', 'read_ops') + return float((FASMETRICS['data'][name] - LAST_FASMETRICS['data'][name]) / (FASMETRICS['data'][read_ops_name] -LAST_FASMETRICS['data'][read_ops_name])) / 100 + except StandardError: + return 0 + elif 'write_ops' in name: + try: + delta = float(FASMETRICS['data'][name] - LAST_FASMETRICS['data'][name])/(FASMETRICS['time'] - LAST_FASMETRICS['time']) + if delta < 0: + print "Less than 0" + delta = 0 + except StandardError: + delta = 0 + return delta + + elif 'write_latency' in name: + try: + write_ops_name = name.replace('write_latency', 'write_ops') + return float((FASMETRICS['data'][name] - LAST_FASMETRICS['data'][name]) / (FASMETRICS['data'][write_ops_name] -LAST_FASMETRICS['data'][write_ops_name])) / 100 + except 
StandardError: + return 0 + + + return 0 + + + +def create_desc(skel, prop): + d = skel.copy() + for k,v in prop.iteritems(): + d[k] = v + return d + +def define_metrics(Desc_Skel,params): + max_records = 10 + for filer in params.keys(): + s = NaServer(params[filer]['ipaddr'], 1, 3) + out = s.set_transport_type('HTTPS') + if (out and out.results_errno() != 0) : + r = out.results_reason() + print ("Connection to filer failed: " + r + "\n") + sys.exit(2) + + out = s.set_style('LOGIN') + if (out and out.results_errno() != 0) : + r = out.results_reason() + print ("Connection to filer failed: " + r + "\n") + sys.exit(2) + out = s.set_admin_user(params[filer]['user'], params[filer]['password']) + perf_in = NaElement("perf-object-get-instances-iter-start") + #Hard coded volume, only volume stats gathered at present + obj_name = "volume" + perf_in.child_add_string("objectname", obj_name) + #Create object of type counters + counters = NaElement("counters") + #Add counter names to the object + counters.child_add_string("counter", "total_ops") + counters.child_add_string("counter", "avg_latency") + counters.child_add_string("counter", "read_ops") + counters.child_add_string("counter", "read_latency") + counters.child_add_string("counter", "write_ops") + counters.child_add_string("counter", "write_latency") + + perf_in.child_add(counters) + + #Invoke API + out = s.invoke_elem(perf_in) + + if(out.results_status() == "failed"): + print(out.results_reason() + "\n") + sys.exit(2) + + iter_tag = out.child_get_string("tag") + num_records = 1 + filername = params[filer]['name'] + + while(int(num_records) != 0): + perf_in = NaElement("perf-object-get-instances-iter-next") + perf_in.child_add_string("tag", iter_tag) + perf_in.child_add_string("maximum", max_records) + out = s.invoke_elem(perf_in) + + if(out.results_status() == "failed"): + print(out.results_reason() + "\n") + sys.exit(2) + + num_records = out.child_get_int("records") + + if(num_records > 0) : + instances_list = 
out.child_get("instances") + instances = instances_list.children_get() + + for inst in instances: + inst_name = unicodedata.normalize('NFKD',inst.child_get_string("name")).encode('ascii','ignore') + #print ("Instance = " + inst_name + "\n") + counters_list = inst.child_get("counters") + counters = counters_list.children_get() + + for counter in counters: + counter_name = unicodedata.normalize('NFKD',counter.child_get_string("name")).encode('ascii','ignore') + counter_value = counter.child_get_string("value") + counter_unit = counter.child_get_string("unit") + if 'total_ops' in counter_name: + descriptors.append(create_desc(Desc_Skel, { + "name" : filername + '_vol_' + inst_name + '_' + counter_name, + "units" : 'iops', + "description" : "volume iops", + "spoof_host" : params[filer]['ipaddr'] + ':' + params[filer]['name'], + "groups" : "iops" + })) + elif 'avg_latency' in counter_name: + descriptors.append(create_desc(Desc_Skel, { + "name" : filername + '_vol_' + inst_name + '_' + counter_name, + "units" : 'ms', + "description" : "volume avg latency", + "spoof_host" : params[filer]['ipaddr'] + ':' + params[filer]['name'], + "groups" : "latency" + })) + elif 'read_ops' in counter_name: + descriptors.append(create_desc(Desc_Skel, { + "name" : filername + '_vol_' + inst_name + '_' + counter_name, + "units" : 'iops', + "description" : "volume read iops", + "spoof_host" : params[filer]['ipaddr'] + ':' + params[filer]['name'], + "groups" : "iops" + })) + elif 'read_latency' in counter_name: + descriptors.append(create_desc(Desc_Skel, { + "name" : filername + '_vol_' + inst_name + '_' + counter_name, + "units" : 'ms', + "description" : "volume read latency", + "spoof_host" : params[filer]['ipaddr'] + ':' + params[filer]['name'], + "groups" : "latency" + })) + elif 'write_ops' in counter_name: + descriptors.append(create_desc(Desc_Skel, { + "name" : filername + '_vol_' + inst_name + '_' + counter_name, + "units" : 'iops', + "description" : "volume write iops", + 
"spoof_host" : params[filer]['ipaddr'] + ':' + params[filer]['name'], + "groups" : "iops" + })) + elif 'write_latency' in counter_name: + descriptors.append(create_desc(Desc_Skel, { + "name" : filername + '_vol_' + inst_name + '_' + counter_name, + "units" : 'ms', + "description" : "volume write latency", + "spoof_host" : params[filer]['ipaddr'] + ':' + params[filer]['name'], + "groups" : "latency" + })) + + return descriptors + +def metric_init(params): + global descriptors,filerdict + print 'netapp_stats] Received the following parameters' + pprint.pprint(params) + params = { + 'filer1' : { + 'name' : 'filer1.localdomain', + 'ipaddr' : '192.168.1.100', + 'user' : 'root', + 'password' : 'password', + }, + } + + filerdict = dict(params) + Desc_Skel = { + 'name' : 'XXX', + 'call_back' : get_metrics, + 'time_max' : 60, + 'value_type' : 'double', + 'format' : '%0f', + 'units' : 'XXX', + 'slope' : 'both', + 'description' : 'XXX', + 'groups' : 'netiron', + 'spoof_host' : 'XXX', + } + + # Run define_metrics + descriptors = define_metrics(Desc_Skel,params) + + return descriptors + +# For CLI Debugging: +if __name__ == '__main__': + #global params + params = { + 'filer1' : { + 'name' : 'filer1.localdomain', + 'ipaddr' : '192.168.1.100', + 'user' : 'root', + 'password' : 'password', + }, + } + descriptors = metric_init(params) + pprint.pprint(descriptors) + #print len(descriptors) + while True: + for d in descriptors: + v = d['call_back'](d['name']) + #print v + print 'value for %s is %.2f' % (d['name'], v) + print 'Sleeping 5 seconds' + time.sleep(5) diff --git a/network/netstats/python_modules/netstats.py b/network/netstats/python_modules/netstats.py index d22360d8..feb8cdbb 100644 --- a/network/netstats/python_modules/netstats.py +++ b/network/netstats/python_modules/netstats.py @@ -1,6 +1,7 @@ import sys import re import time +import copy PARAMS = {} @@ -13,7 +14,7 @@ tcpext_file = "/proc/net/netstat" snmp_file = "/proc/net/snmp" -LAST_METRICS = dict(METRICS) 
+LAST_METRICS = copy.deepcopy(METRICS) METRICS_CACHE_MAX = 5 stats_pos = {} @@ -188,10 +189,10 @@ def get_metrics(): if re.match("TcpExt: [0-9]", line): metrics = re.split("\s+", line) - file.close + file.close() # update cache - LAST_METRICS = dict(METRICS) + LAST_METRICS = copy.deepcopy(METRICS) METRICS = { 'time': time.time(), 'tcpext': metrics @@ -216,7 +217,7 @@ def get_metrics(): METRICS['tcp'] = re.split("\s+", line) - file.close + file.close() return [METRICS, LAST_METRICS] diff --git a/passenger/python_modules/passenger.py b/passenger/python_modules/passenger.py index 5d4f4e8f..7f082244 100644 --- a/passenger/python_modules/passenger.py +++ b/passenger/python_modules/passenger.py @@ -190,9 +190,9 @@ def timeout_command(command, timeout): time.sleep(0.2) now = datetime.datetime.now() if (now - start).seconds> timeout: - os.kill(process.pid, signal.SIGKILL) + os.system("sudo kill %s" % process.pid) os.waitpid(-1, os.WNOHANG) - return None + return [] return process.stdout.readlines() if __name__ == '__main__': diff --git a/php_fpm/conf.d/php_fpm.pyconf b/php_fpm/conf.d/php_fpm.pyconf index 59c9e2f8..a95ead0e 100644 --- a/php_fpm/conf.d/php_fpm.pyconf +++ b/php_fpm/conf.d/php_fpm.pyconf @@ -9,7 +9,7 @@ modules { value = 'localhost' } - param port { + param ports { value = '9000' } diff --git a/rabbit/README.mkdn b/rabbit/README.mkdn index 50594517..a785546e 100644 --- a/rabbit/README.mkdn +++ b/rabbit/README.mkdn @@ -5,12 +5,18 @@ python module for ganglia 3.1. "rabbit" sends metrics on RabbitMQ nodes using the stats api. It is based off the very similar ElasticSearch module. -http://(node-ip):55672/api/queues (or nodes) +http://(node-ip):55672/api/queues (or nodes, or exchanges) -This module requires simplejson, or if using a 2.6 interpreter with mod_python, json. Modify accordingly. +Please see http://hg.rabbitmq.com/rabbitmq-management/raw-file/rabbitmq_v2_7_1/priv/www/api/index.html for more info on the management API. 
That's a good place to start if you want to extend this module and include new metrics. + +This module requires simplejson, or if using a 2.6 interpreter with mod_python, the json module. Modify accordingly. The digItUp function, and the keyToPath syntax, were borrowed from the ElasticSearch module. +To use multiple vhosts, separate them by comma in the vhosts file. + +To get metrics besides nodes or queues, either check out how the buildQueueDescriptors and buildNodeDescriptors were set up and make a new descriptor builder/modify stats at the top of the python file and contribute the changes, or ask for my assistance and I'll see what I can do. + ## AUTHORS Gregory Rice diff --git a/rabbit/conf.d/rabbitmq.pyconf b/rabbit/conf.d/rabbitmq.pyconf index 76d44048..f0cee349 100644 --- a/rabbit/conf.d/rabbitmq.pyconf +++ b/rabbit/conf.d/rabbitmq.pyconf @@ -13,7 +13,7 @@ modules { } param vhost { - value = "/" + value = "/,vhost1,vhost2" } param username { value = "guest" diff --git a/rabbit/python_modules/rabbitmq.py b/rabbit/python_modules/rabbitmq.py index 52d1b9ef..2bdccbb5 100644 --- a/rabbit/python_modules/rabbitmq.py +++ b/rabbit/python_modules/rabbitmq.py @@ -5,20 +5,23 @@ import urllib import time from string import Template +import itertools global url, descriptors, last_update, vhost, username, password, url_template, result, result_dict, keyToPath INTERVAL = 20 descriptors = list() username, password = "guest", "guest" stats = {} -last_update = {} +keyToPath = {} +last_update = None +#last_update = {} compiled_results = {"nodes" : None, "queues" : None, "connections" : None} #Make initial stat test time dict -for stat_type in ('queues', 'connections','exchanges', 'nodes'): - last_update[stat_type] = None - -keyToPath = {} +#for stat_type in ('queues', 'connections','exchanges', 'nodes'): +# last_update[stat_type] = None +### CONFIGURATION SECTION ### +STATS = ['nodes', 'queues'] # QUEUE METRICS # keyToPath['rmq_messages_ready'] = "%s.messages_ready" @@ -71,41 
+74,37 @@ def dig_it_up(obj,path): print "Exception" return False -def refreshGroup(group): - +def refreshStats(stats = ('nodes', 'queues'), vhosts = ['/']): global url_template - urlstring = url_template.safe_substitute(stats = group) - global last_update, url, compiled_results now = time.time() - if not last_update[group]: + + if not last_update: diff = INTERVAL else: - diff = now - last_update[group] - - if diff >= INTERVAL or not last_update[group]: - result_dict = {} - print "Fetching stats after %d seconds" % INTERVAL - result = json.load(urllib.urlopen(urlstring)) - compiled_results[group] = result - last_update[group] = now - #Refresh dict by names. We'll probably move this elsewhere. - if group in ('queues', 'nodes'): - for entry in result: - name_attribute = entry['name'] - result_dict[name_attribute] = entry - compiled_results[group] = result_dict - - return compiled_results[group] - -def getConnectionTotal(name): - result = refreshGroup('connections') - return result.length() - -def getConnectionStats(name): - pass + diff = now - last_update + + if diff >= INTERVAL or not last_update: + print "Fetching Results after %d seconds" % INTERVAL + last_update = now + for stat in stats: + for vhost in vhosts: + if stat in ('nodes'): + vhost = '/' + result_dict = {} + urlstring = url_template.safe_substitute(stats = stat, vhost = vhost) + print urlstring + result = json.load(urllib.urlopen(urlstring)) + # Rearrange results so entry is held in a dict keyed by name - queue name, host name, etc. 
+ if stat in ("queues", "nodes", "exchanges"): + for entry in result: + name = entry['name'] + result_dict[name] = entry + compiled_results[(stat, vhost)] = result_dict + + return compiled_results def validatedResult(value): if not isInstance(value, bool): @@ -113,24 +112,28 @@ def validatedResult(value): else: return None -def list_queues(): - # Make a list of queues - results = refreshGroup('queues') - return results.keys() +def list_queues(vhost): + global compiled_results + queues = compiled_results[('queues', vhost)].keys() + return queues def list_nodes(): - results = refreshGroup('nodes') - return results.keys() + global compiled_results + nodes = compiled_results[('nodes', '/')].keys() + return nodes def getQueueStat(name): #Split a name like "rmq_backing_queue_ack_egress_rate.access" #handle queue names with . in them - split_name = name.split(".") + print name + split_name, vhost = name.split("#") + split_name = split_name.split(".") stat_name = split_name[0] queue_name = ".".join(split_name[1:]) - result = refreshGroup('queues') + # Run refreshStats to get the result object + result = compiled_results[('queues', vhost)] value = dig_it_up(result, keyToPath[stat_name] % queue_name) print name, value @@ -145,9 +148,11 @@ def getQueueStat(name): def getNodeStat(name): #Split a name like "rmq_backing_queue_ack_egress_rate.access" - stat_name, node_name = name.split(".") - result = refreshGroup('nodes') + stat_name = name.split(".")[0] + node_name, vhost = name.split(".")[1].split("#") + result = compiled_results[('nodes', '/')] value = dig_it_up(result, keyToPath[stat_name] % node_name) + print name,value #Convert Booleans if value is True: @@ -156,24 +161,35 @@ def getNodeStat(name): value = 0 return float(value) + +def product(*args, **kwds): + # replacement for itertools.product + # product('ABCD', 'xy') --> Ax Ay Bx By Cx Cy Dx Dy + pools = map(tuple, args) * kwds.get('repeat', 1) + result = [[]] + for pool in pools: + result = [x+[y] for x in result for 
y in pool] + for prod in result: + yield tuple(prod) def metric_init(params): ''' Create the metric definition object ''' - global descriptors, stats, vhost, username, password, urlstring, url_template, compiled_results + global descriptors, stats, vhost, username, password, urlstring, url_template, compiled_results, STATS print 'received the following params:' #Set this globally so we can refresh stats if 'host' not in params: params['host'], params['vhost'],params['username'],params['password'] = "localhost", "/", "guest", "guest" - vhost = params['vhost'] + + # Set the vhosts as a list split from params + vhosts = params['vhost'].split(',') username, password = params['username'], params['password'] host = params['host'] - url = 'http://%s:%s@%s:55672/api/$stats' % (username, password, host) + url = 'http://%s:%s@%s:55672/api/$stats/$vhost' % (username, password, host) url_template = Template(url) print params - refreshGroup("nodes") - refreshGroup("queues") + refreshStats(stats = STATS, vhosts = vhosts) def create_desc(prop): d = { @@ -194,9 +210,10 @@ def create_desc(prop): def buildQueueDescriptors(): - for queue in list_queues(): - for metric in QUEUE_METRICS: - name = "%s.%s" % (metric, queue) + for vhost, metric in product(vhosts, QUEUE_METRICS): + queues = list_queues(vhost) + for queue in queues: + name = "%s.%s#%s" % (metric, queue, vhost) print name d1 = create_desc({'name': name.encode('ascii','ignore'), 'call_back': getQueueStat, @@ -210,10 +227,9 @@ def buildQueueDescriptors(): descriptors.append(d1) def buildNodeDescriptors(): - for node in list_nodes(): - #node = node.split('@')[0] - for stat in NODE_METRICS: - name = '%s.%s' % (stat, node) + for metric in NODE_METRICS: + for node in list_nodes(): + name = '%s.%s#%s' % (metric, node, '/') print name d2 = create_desc({'name': name.encode('ascii','ignore'), 'call_back': getNodeStat, @@ -241,9 +257,8 @@ def metric_cleanup(): url_template = Template(url) parameters = {"vhost":"/", 
"username":"guest","password":"guest", "metric_group":"rabbitmq"} metric_init(parameters) - result = refreshGroup('queues') - node_result = refreshGroup('nodes') + result = refreshStats(stats = ('queues', 'nodes'), vhosts = ('/')) print '***'*10 - getQueueStat('rmq_backing_queue_ack_egress_rate.gelf_client_three') - getNodeStat('rmq_disk_free.rmqtwo@inrmq02d1') - getNodeStat('rmq_mem_used.rmqtwo@inrmq02d1') + getQueueStat('rmq_backing_queue_ack_egress_rate.gelf_client_three#/') + getNodeStat('rmq_disk_free.rmqtwo@inrmq02d1#/') + getNodeStat('rmq_mem_used.rmqtwo@inrmq02d1#/') diff --git a/recoverpoint/README.mkdn b/recoverpoint/README.mkdn new file mode 100644 index 00000000..5202cd0f --- /dev/null +++ b/recoverpoint/README.mkdn @@ -0,0 +1,23 @@ +EMC RecoverPoint +=============== + +This is a GMOND Python Module that gets metrics from EMC RecoverPoint replication appliances. + +Currently gathers: + * Per RPA WAN/SAN traffic and Latency + * Per Consistency Group Write, Data, Time and Journal Lags, as well as WAN and SAN traffic. + * Per Consistency Group Protection Window metrics. + +## DEPENDS + * python YAML + * paramiko modules + * ssh access to the recoverpoint appliance (paramiko can use ssh keys if required) + +## USAGE + * Save the recoverpoint.pyconf into /etc/ganglia/conf.d directory and update the management IP and sitenames (the sitenames have been lowercase'd) + * Save the recoverpoint.py into your ganglia python module dir eg: /usr/lib/ganglia/python_modules. Update the username/passwords if necessary. + * Restart gmond and a "recoverpoint" host should appear in ganglia. 
+ +## AUTHOR + +Author: Evan Fraser <evan.fraser@trademe.co.nz> diff --git a/recoverpoint/recoverpoint.py b/recoverpoint/recoverpoint.py new file mode 100755 index 00000000..ab93f140 --- /dev/null +++ b/recoverpoint/recoverpoint.py @@ -0,0 +1,259 @@ +#!/usr/bin/python +# Name: recoverpoint.py +# Desc: Ganglia Python module for gathering EMC recoverpoint statistics via SSH +# Author: Evan Fraser (evan.fraser@trademe.co.nz) +# Date: 01/08/2012 + + +import yaml +import warnings +import pprint +import time +import re + +with warnings.catch_warnings(): + warnings.simplefilter("ignore") + import paramiko + +descriptors = list() +NIMETRICS = { + 'time' : 0, + 'data' : {} +} +#This is the minimum interval between querying the RPA for metrics. +#Each ssh query takes 1.6s so we limit the interval between getting metrics to this interval. +NIMETRICS_CACHE_MAX = 10 + +ipaddr = '' + +#Example of data structure: +#{'RPA statistics': {'Site 1 RPA 1': {'Compression CPU usage': '0.00%', +# 'Latency (ms)': 12, +# 'Packet loss': '0.00%', +# 'Traffic': {'Application': {'SAN': '0 bps', +# 'WAN': '432 bps'}, +# 'Application (writes)': 0, +# 'Compression': 0}}, + +def define_metrics(Desc_Skel, statsDict): + for rpa in statsDict['RPA statistics']: + #pprint.pprint(statsDict['RPA statistics'][rpa]) + for metric in statsDict['RPA statistics'][rpa].keys(): + if "Latency (ms)" in metric: + descriptors.append(create_desc(Desc_Skel, { + "name" : (rpa.lower()).replace(' ','_') + '_latency', + "units" : "ms", + "description" : "latency in ms", + "groups" : "Latency" + })) + if "Traffic" in metric: + #define the Appliance/[SAN|WAN] metrics + for net in statsDict['RPA statistics'][rpa]['Traffic']['Application'].keys(): + #print net + descriptors.append(create_desc(Desc_Skel, { + "name" : (rpa.lower()).replace(' ','_') + '_' + net.lower(), + "units" : "bits/sec", + "description" : net + ' traffic', + "groups" : net + " Traffic", + })) + + #Define Consistency Group metrics this is paintfully nested 
in the dict.
+    for group in statsDict['Group']:
+        #CG SAN and Journal lag are under the policies
+        for policyname in statsDict['Group'][group]['Copy stats']:
+            if 'SAN traffic' in statsDict['Group'][group]['Copy stats'][policyname]:
+                descriptors.append(create_desc(Desc_Skel, {
+                    "name"        : group + '_SAN_Traffic',
+                    "units"       : 'Bits/s',
+                    "description" : group + ' SAN Traffic',
+                    "groups"      : 'SAN Traffic',
+                    }))
+            elif 'Journal' in statsDict['Group'][group]['Copy stats'][policyname]:
+                descriptors.append(create_desc(Desc_Skel, {
+                    "name"        : group + '_Journal_Lag',
+                    "units"       : 'Bytes',
+                    "description" : group + ' Journal Lag',
+                    "groups"      : 'Lag',
+                    }))
+                #Protection window
+                descriptors.append(create_desc(Desc_Skel, {
+                    "name"        : group + '_Protection_Window',
+                    "units"       : 'mins',
+                    "description" : group + ' Protection Window',
+                    "groups"      : 'Protection',
+                    }))
+
+        #CG Lag and WAN stats are in the Link stats section
+        for repname in statsDict['Group'][group]['Link stats']:
+            #Define CG WAN traffic metrics
+            descriptors.append(create_desc(Desc_Skel, {
+                "name"        : group + '_WAN_Traffic',
+                "units"       : 'Bits/s',
+                "description" : group + ' WAN Traffic',
+                "groups"      : 'WAN Traffic',
+                }))
+
+            #Define CG Lag metrics
+            for lagfields in statsDict['Group'][group]['Link stats'][repname]['Replication']['Lag']:
+                lagunit = ''
+                if 'Writes' in lagfields:
+                    lagunit = 'Writes'
+                elif 'Data' in lagfields:
+                    lagunit = 'Bytes'
+                elif 'Time' in lagfields:
+                    lagunit = 'Seconds'
+                descriptors.append(create_desc(Desc_Skel, {
+                    "name"        : group + '_Lag_' + lagfields,
+                    "units"       : lagunit,
+                    "description" : group + ' Lag ' + lagunit,
+                    "groups"      : 'Lag',
+                    }))
+
+    return descriptors
+
+def create_desc(skel, prop):
+    d = skel.copy()
+    for k,v in prop.iteritems():
+        d[k] = v
+    return d
+
+def get_metrics(name):
+    global NIMETRICS,ipaddr
+    # if interval since last check > NIMETRICS_CACHE_MAX get metrics again
+    metrics = {}
+    if (time.time() - NIMETRICS['time']) > NIMETRICS_CACHE_MAX:
+
+        sshcon = paramiko.SSHClient()
+        sshcon.set_missing_host_key_policy(paramiko.AutoAddPolicy())
+        sshcon.connect(ipaddr, username='monitor',password='monitor',look_for_keys=False)
+        stdin, stdout, sterr = sshcon.exec_command("get_system_statistics;get_group_statistics")
+        rawdata = stdout.read()
+        #Group stats don't leave a space after the colon in some places
+        rawmetrics = yaml.safe_load(rawdata.replace(':N',': N'))
+        #Get RPA metrics
+        for rpa in rawmetrics['RPA statistics']:
+            for metric in rawmetrics['RPA statistics'][rpa]:
+                if "Latency (ms)" in metric:
+                    metrics[(rpa.lower()).replace(' ','_') + '_latency'] = rawmetrics['RPA statistics'][rpa]['Latency (ms)']
+                if "Traffic" in metric:
+                    #store the Application/[SAN|WAN] metrics
+                    for net in rawmetrics['RPA statistics'][rpa]['Traffic']['Application'].keys():
+                        traffic,junk = rawmetrics['RPA statistics'][rpa]['Traffic']['Application'][net].split()
+                        metrics[(rpa.lower()).replace(' ','_') + '_' + net.lower()] = float(traffic)
+
+        for group in rawmetrics['Group']:
+            #CG SAN and Journal lag are under the policies
+            for policyname in rawmetrics['Group'][group]['Copy stats']:
+                #Get CG SAN metrics (remove 'Mbps' from end + convert to float and then bits)
+                if 'SAN traffic' in rawmetrics['Group'][group]['Copy stats'][policyname]:
+                    metrics[group + '_SAN_Traffic'] = float(rawmetrics['Group'][group]['Copy stats'][policyname]['SAN traffic']['Current throughput'][:-4]) * 1024 * 1024
+                elif 'Journal' in rawmetrics['Group'][group]['Copy stats'][policyname]:
+                    datastr = rawmetrics['Group'][group]['Copy stats'][policyname]['Journal']['Journal lag']
+                    amount = float(datastr[:-2])
+                    unitstr = datastr[-2:]
+                    if 'MB' in unitstr:
+                        amount = amount * 1024 * 1024
+                    elif 'KB' in unitstr:
+                        amount = amount * 1024
+                    elif 'GB' in unitstr:
+                        amount = amount * 1024 * 1024 * 1024
+                    metrics[group + '_Journal_Lag'] = amount
+                    #Protection Window is in Journal section
+                    prowindowstr = rawmetrics['Group'][group]['Copy stats'][policyname]['Journal']['Protection window']['Current']['Value']
+                    protectmins = 0
+                    protimelist = prowindowstr.split(' ')
+                    if 'hr' in protimelist:
+                        hrindex = protimelist.index('hr')
+                        protectmins = protectmins + (int(protimelist[int(hrindex) - 1]) * 60)
+                    if 'min' in protimelist:
+                        minindex = protimelist.index('min')
+                        protectmins = protectmins + int(protimelist[int(minindex) -1])
+                    metrics[group + '_Protection_Window'] = float(protectmins)
+
+            #CG Lag and WAN stats are in the Link stats section
+            for repname in rawmetrics['Group'][group]['Link stats']:
+                #Get CG WAN metrics (remove 'Mbps' from end + convert to float and then bits)
+                metrics[group + '_WAN_Traffic'] = float(rawmetrics['Group'][group]['Link stats'][repname]['Replication']['WAN traffic'][:-4]) * 1024 * 1024
+
+                #Get CG Lag metrics
+                for lagfields in rawmetrics['Group'][group]['Link stats'][repname]['Replication']['Lag']:
+                    if 'Data' in lagfields:
+                        #Convert 12.34(GB|MB|KB) to bytes
+                        datastr = rawmetrics['Group'][group]['Link stats'][repname]['Replication']['Lag'][lagfields]
+                        #print datastr
+                        amount = float(datastr[:-2])
+                        unitstr = datastr[-2:]
+                        if 'MB' in unitstr:
+                            amount = amount * 1024 * 1024
+                        elif 'KB' in unitstr:
+                            amount = amount * 1024
+                        elif 'GB' in unitstr:
+                            amount = amount * 1024 * 1024 * 1024
+                        metrics[group + '_Lag_' + lagfields] = amount
+
+                    elif 'Time' in lagfields:
+                        #Strip 'sec' from value, convert to float.
+                        lagtime = float(rawmetrics['Group'][group]['Link stats'][repname]['Replication']['Lag'][lagfields][:-3])
+                        metrics[group + '_Lag_' + lagfields] = lagtime
+                    else:
+                        #Writes Lag
+                        metrics[group + '_Lag_' + lagfields] = float(rawmetrics['Group'][group]['Link stats'][repname]['Replication']['Lag'][lagfields])
+
+        NIMETRICS = {
+            'time': time.time(),
+            'data': metrics
+            }
+    else:
+        metrics = NIMETRICS['data']
+    return metrics[name]
+
+
+
+def metric_init(params):
+    global descriptors, Desc_Skel, ipaddr
+    print '[recoverpoint] Recieved the following parameters'
+    print params
+    ipaddr = params['mgmtip']
+    print ipaddr
+    spoof_string = ipaddr + ':recoverpoint'
+    Desc_Skel = {
+        'name'        : 'XXX',
+        'call_back'   : get_metrics,
+        'time_max'    : 60,
+        'value_type'  : 'double',
+        'format'      : '%0f',
+        'units'       : 'XXX',
+        'slope'       : 'both',
+        'description' : 'XXX',
+        'groups'      : 'netiron',
+        'spoof_host'  : spoof_string
+        }
+
+    sshcon = paramiko.SSHClient()
+    sshcon.set_missing_host_key_policy(paramiko.AutoAddPolicy())
+    sshcon.connect(ipaddr, username='monitor',password='monitor',look_for_keys=False)
+    stdin, stdout, sterr = sshcon.exec_command("get_system_statistics;get_group_statistics")
+    rawdata = stdout.read()
+    #Group stats don't leave a space after the colon in some places
+    statsDict = yaml.safe_load(rawdata.replace(':N',': N'))
+    sshcon.close()
+    descriptors = define_metrics(Desc_Skel, statsDict)
+
+    return descriptors
+
+# For CLI Debugging:
+if __name__ == '__main__':
+    params = {
+        'mgmtip' : '192.168.1.100',
+
+        }
+    descriptors = metric_init(params)
+    pprint.pprint(descriptors)
+    print len(descriptors)
+    while True:
+        for d in descriptors:
+            v = d['call_back'](d['name'])
+            print 'value for %s is %u' % (d['name'], v)
+        print 'Sleeping 5 seconds'
+        time.sleep(5)
+#exit(0)
diff --git a/recoverpoint/recoverpoint.pyconf b/recoverpoint/recoverpoint.pyconf
new file mode 100644
index 00000000..a63031a0
--- /dev/null
+++ b/recoverpoint/recoverpoint.pyconf
@@ -0,0 +1,50 @@
+# Name: 
recoverpoint.pyconf +# Author: Evan Fraser (evan.fraser@trademe.co.nz) +# Desc: Config file for the ganglia gmond recoverpoint module. +# Date: 03/08/2012 +# To use: Save this file in /etc/ganglia/conf.d/, update the mgmtip value to the IP address of one of your RecoverPoint management IP's and change the name_match lines below to match your site names. + +modules { + module { + name = "recoverpoint" + language = "python" + param mgmtip { + value = '192.168.1.100' + } + } +} +#/* Collection groups for the +# example python module */ +collection_group { + collect_every = 20 + time_threshold = 50 + metric { + name_match = "(.+)_wan" + } + metric { + name_match = "(.+)_lan" + } + metric { + name_match = "(.+)_latency" + } + metric { + name_match = "(.+)Time" + } + metric { + name_match = "(.+)Data" + } + metric { + name_match = "(.+)Writes" + } + metric { + name_match = "(.+)Traffic" + } + metric { + name_match = "(.+)Lag" + } + metric { + name_match = "(.+)Window" + } + +} + diff --git a/redis/python_modules/redis.py b/redis/python_modules/redis.py index 92557f73..ed318bd4 100644 --- a/redis/python_modules/redis.py +++ b/redis/python_modules/redis.py @@ -28,6 +28,8 @@ def metric_handler(name): for line in info.splitlines()[1:]: if "" == line: continue + if "#" == line[0]: + continue n, v = line.split(":") if n in metric_handler.descriptors: metric_handler.info[n] = int(v) # TODO Use value_type. @@ -64,7 +66,6 @@ def metric_init(params={}): "expired_keys": {"units": "keys"}, "pubsub_channels": {"units": "channels"}, "pubsub_patterns": {"units": "patterns"}, - "vm_enabled": {"units": "yes/no"}, "master_last_io_seconds_ago": {"units": "seconds ago"}, } metric_handler.descriptors = {}