diff --git a/src/ipahealthcheck/core/core.py b/src/ipahealthcheck/core/core.py index 1b71fd0a..8b4d3ba6 100644 --- a/src/ipahealthcheck/core/core.py +++ b/src/ipahealthcheck/core/core.py @@ -45,29 +45,81 @@ def find_plugins(name, registry): def run_plugin(plugin, available=(), timeout=constants.DEFAULT_TIMEOUT): + plugin_name = F"{plugin.__class__.__module__}:{plugin.__class__.__name__}" + timed_out = {} + def signal_handler(signum, frame): - raise TimeoutError('Request timed out') + # Our exception will likely be caught by code called by the plugin, + # which may not log the traceback, or may even catch the exception + # itself! So while we throw the exception in order to interrupt the + # plugin code, we also stash a copy of it so that we can log it after + # the plugin returns. + timed_out["exception"] = TimeoutError( + f"Check {plugin_name} cancelled after {timeout} sec" + ) + logger.error("--- %s ---", timed_out["exception"]) + traceback.print_stack() + timed_out["stack"] = traceback.format_stack() + logger.error( + "--- The following messages were logged by the check after it" + " was cancelled. They may not indicate the reason why the plugin" + " took too long to run. ---" + ) + raise timed_out["exception"] # manually calculate duration when we create results of our own start = datetime.now(tz=timezone.utc) signal.signal(signal.SIGALRM, signal_handler) signal.alarm(timeout) + try: for result in plugin.check(): if result is None: # Treat no result as success, fudge start time result = Result(plugin, constants.SUCCESS, start=start) yield result - except TimeoutError as e: - yield Result(plugin, constants.ERROR, exception=str(e), - start=start) + except TimeoutError: + # The plugin code _may_ have raised our original TimeoutError however + # we also have to deal with cases where it did not raise anything! In + # the finally block below we handle such cases, and since we already + # stashed the TimeoutError that we raised in timed_out[0], we don't + # need to handle it here. + pass except Exception as e: - logger.debug('Exception raised: %s', e) - logger.debug(traceback.format_exc()) + # We've got no way to know whether the plugin's own exception was + # related to our TimeoutError; if it was then we will yield a + # result here based on the plugin's own exception, and _also_ + # later on in the finally block. + logger.exception( + "Exception raised in check %r", + plugin_name + ) yield Result(plugin, constants.CRITICAL, exception=str(e), traceback=traceback.format_exc(), start=start) finally: + if timed_out: + logger.error( + "--- Increasing the timeout in" + " /etc/ipahealthcheck/ipahealthcheck.conf may allow this" + " healthcheck to complete before the deadline. ---" + ) + # If the plugin already handled our TimeoutError properly (i.e, + # logged it and raised its own exception with a sensible error + # message) then there is no need to yield our own error result + # here; however we can't rely on the plugin doing so. It may have + # caught our TimeoutError and logged something misleading; in that + # case let's err on the side of caution and return an additional + # result. + yield Result(plugin, constants.CRITICAL, + exception=str(timed_out["exception"]), + key="healthcheck_timeout", start=start, + traceback=timed_out["stack"]) + else: + logging.debug( + "--- Check %s complete after %s sec ---", + plugin_name, datetime.now(tz=timezone.utc) - start + ) signal.alarm(0) @@ -141,7 +193,7 @@ def run_service_plugins(plugins, source, check): ) continue - logger.debug('Calling check %s', plugin) + logger.debug('--- Calling check %s ---', plugin) for result in plugin.check(): # always run the service checks so dependencies work if result is not None and result.result == constants.SUCCESS: @@ -176,7 +228,7 @@ def run_plugins(plugins, available, source, check, if not source_or_check_matches(plugin, source, check): continue - logger.debug("Calling check %s", plugin) + logger.debug("--- Calling check %s ---", plugin) if not set(plugin.requires).issubset(available): logger.debug('Skipping %s:%s because %s service(s) not running', plugin.__class__.__module__, diff --git a/tests/test_plugins.py b/tests/test_plugins.py index d6ec9528..337b3961 100644 --- a/tests/test_plugins.py +++ b/tests/test_plugins.py @@ -13,11 +13,61 @@ def test_timeout(): """ Test that timeouts are detected. """ - class plugin1(Plugin): + class plugin_slow_passing_exception_up(Plugin): + ''' + Some plugins will not catch unexpected exceptions, and they will be + passed up to the caller. + ''' def check(self): time.sleep(5) - class plugin2(Plugin): + class plugin_slow_catching_exception_and_ignoring(Plugin): + ''' + Some plugins will catch unexpected exceptions and handle them in + unpredictable ways, such as ignoring them. + ''' + def check(self): + try: + time.sleep(5) + except Exception: + pass + return # ugly syntax to let us yield no results + yield + + class plugin_slow_catching_exception_and_handling(Plugin): + ''' + Some plugins will catch unexpected exceptions but handle them + sanely, such as by yielding an error result. + + In this case, the user will get two failure results; one from + the plugin itself, and one from the exception handler. + ''' + def check(self): + try: + time.sleep(5) + except Exception as e: + yield Result( + self, constants.ERROR, + key='test', msg='fail', + exception=str(e) + ) + + class plugin_slow_raising_exception(Plugin): + ''' + Some plugins will catch unexpected exceptions and handle them in + unpredictable ways, such as raising their own exception. + + The user will get one failure result, from the exception handler. + ''' + def check(self): + try: + time.sleep(5) + except Exception as e: + raise Exception( + "I didn't expect an exception to be thrown" + ) from e + + class plugin_fast(Plugin): def check(self): yield Result(self, constants.SUCCESS, key='test', msg='pass') @@ -25,16 +75,53 @@ def check(self): r = Registry() # Register the plugins - r(plugin1) - r(plugin2) + r(plugin_slow_passing_exception_up) + r(plugin_slow_catching_exception_and_ignoring) + r(plugin_slow_catching_exception_and_handling) + r(plugin_slow_raising_exception) + r(plugin_fast) # Collect the results results = run_plugins(r.get_plugins(), (), None, None, {}, timeout=1) - assert len(results.results) == 2 + assert 7 == len(results.results) + + assert results.results[0].kw.get('exception') == 'Check' \ + ' test_plugins:plugin_slow_passing_exception_up' \ + ' cancelled after 1 sec' + assert results.results[0].result == constants.CRITICAL + assert results.results[0].kw.get('key') == 'healthcheck_timeout' + + assert results.results[1].kw.get('exception') == 'Check' \ + ' test_plugins:plugin_slow_catching_exception_and_ignoring' \ + ' cancelled after 1 sec' + assert results.results[1].result == constants.CRITICAL + assert results.results[1].kw.get('key') == 'healthcheck_timeout' + + assert results.results[2].kw.get('exception') == 'Check' \ + ' test_plugins:plugin_slow_catching_exception_and_handling' \ + ' cancelled after 1 sec' + assert results.results[2].result == constants.ERROR + assert results.results[2].kw.get('msg') == 'fail' + assert results.results[2].kw.get('key') == 'test' + + assert results.results[3].kw.get('exception') == 'Check' \ + ' test_plugins:plugin_slow_catching_exception_and_handling' \ + ' cancelled after 1 sec' + assert results.results[3].result == constants.CRITICAL + assert results.results[3].kw.get('key') == 'healthcheck_timeout' + + assert results.results[4].kw.get('exception') == "I didn't expect an" \ + " exception to be thrown" + assert results.results[4].result == constants.CRITICAL + assert not results.results[4].kw.get('key') - assert results.results[0].result == constants.ERROR - assert results.results[0].kw.get('exception') == 'Request timed out' + assert results.results[5].kw.get('exception') == 'Check' \ + ' test_plugins:plugin_slow_raising_exception cancelled after 1 sec' + assert results.results[5].result == constants.CRITICAL + assert results.results[5].kw.get('key') == 'healthcheck_timeout' - assert results.results[1].result == constants.SUCCESS - assert results.results[1].kw.get('msg') == 'pass' + assert results.results[6].kw.get('msg') == 'pass' + assert not results.results[6].kw.get('exception') + assert results.results[6].result == constants.SUCCESS + assert results.results[6].kw.get('key') == 'test'