From e1ecf58cff57270d5befa9bd4ea89b011582ab3e Mon Sep 17 00:00:00 2001 From: Oleg Pudeyev Date: Sat, 26 Oct 2024 13:54:25 -0400 Subject: [PATCH] track failed probes --- lib/datadog/di/error.rb | 6 ++++++ lib/datadog/di/probe_manager.rb | 25 ++++++++++++++++++++++--- lib/datadog/di/remote.rb | 6 ++++++ spec/datadog/di/probe_manager_spec.rb | 11 +++++++++-- 4 files changed, 43 insertions(+), 5 deletions(-) diff --git a/lib/datadog/di/error.rb b/lib/datadog/di/error.rb index 5a90d259959..89989660bc3 100644 --- a/lib/datadog/di/error.rb +++ b/lib/datadog/di/error.rb @@ -26,6 +26,12 @@ class AgentCommunicationError < Error # that does not in fact exist anywhere (e.g. due to a misspelling). class DITargetNotDefined < Error end + + # Raised when trying to install a probe whose installation failed + # earlier in the same process. This exception should contain the + # original exception report from initial installation attempt. + class ProbePreviouslyFailed < Error + end end end end diff --git a/lib/datadog/di/probe_manager.rb b/lib/datadog/di/probe_manager.rb index ac6f41caf80..fa854133366 100644 --- a/lib/datadog/di/probe_manager.rb +++ b/lib/datadog/di/probe_manager.rb @@ -20,6 +20,7 @@ def initialize(settings, instrumenter, probe_notification_builder, @logger = logger @installed_probes = {} @pending_probes = {} + @failed_probes = {} @definition_trace_point = TracePoint.trace(:end) do |tp| begin @@ -56,9 +57,21 @@ def clear_hooks attr_reader :installed_probes attr_reader :pending_probes + # Probes that failed to instrument for reasons other than the target is + # not yet loaded are added to this collection, so that we do not try + # to instrument them every time remote configuration is processed. + attr_reader :failed_probes + # config is one probe info def add_probe(probe) # TODO lock here? + + # Probe failed to install previously, do not try to install it again. + if msg = failed_probes[probe.id] + # TODO test this path + raise Error::ProbePreviouslyFailed, msg + end + begin instrumenter.hook(probe, &method(:probe_executed_callback)) @@ -76,11 +89,17 @@ def add_probe(probe) false end rescue => exc + # In "propagate all exceptions" mode we will try to instrument again. raise if settings.dynamic_instrumentation.internal.propagate_all_exceptions - # Silence all exceptions? - # TODO should we propagate here and rescue upstream? + logger.warn("Error processing probe configuration: #{exc.class}: #{exc}") - nil + # TODO report probe as failed to agent since we won't attempt to + # install it again. + + # TODO add top stack frame to message + failed_probes[probe.id] = "#{exc.class}: #{exc}" + + raise end def remove_other_probes(probe_ids) diff --git a/lib/datadog/di/remote.rb b/lib/datadog/di/remote.rb index 9d956b8d547..cefac27edc9 100644 --- a/lib/datadog/di/remote.rb +++ b/lib/datadog/di/remote.rb @@ -59,6 +59,12 @@ def receivers(telemetry) component.logger.warn("Unhandled exception adding probe in DI remote receiver: #{e.class}: #{e}") + # If a probe fails to install, we will mark the content + # as errored. On subsequent remote configuration application + # attemps, probe manager will raise the "previously errored" + # exception and we'll rescue it here, again marking the + # content as errored but with a somewhat different exception + # message. content.errored("Error applying dynamic instrumentation configuration: #{e.class.name} #{e.message}: #{Array(e.backtrace).join("\n")}") end diff --git a/spec/datadog/di/probe_manager_spec.rb b/spec/datadog/di/probe_manager_spec.rb index b0422f97f94..4e720a11b67 100644 --- a/spec/datadog/di/probe_manager_spec.rb +++ b/spec/datadog/di/probe_manager_spec.rb @@ -66,6 +66,7 @@ class ProbeManagerSpecTestClass; end expect(manager.add_probe(probe)).to be true expect(manager.pending_probes.length).to eq 0 + expect(manager.failed_probes.length).to eq 0 expect(manager.installed_probes.length).to eq 1 expect(manager.installed_probes["3ecfd456-2d7c-4359-a51f-d4cc44141ffe"]).to be(probe) @@ -88,11 +89,12 @@ class ProbeManagerSpecTestClass; end expect(manager.pending_probes["3ecfd456-2d7c-4359-a51f-d4cc44141ffe"]).to be(probe) expect(manager.installed_probes.length).to eq 0 + expect(manager.failed_probes.length).to eq 0 end end context 'when there is an exception during instrumentation' do - it 'returns nil, logs warning and drops probe' do + it 'logs warning, drops probe and reraises the exception' do expect(logger).to receive(:warn) do |msg| expect(msg).to match(/Error processing probe configuration.*Instrumentation error/) end @@ -105,11 +107,16 @@ class ProbeManagerSpecTestClass; end expect(probe_notification_builder).not_to receive(:build_installed) expect(probe_notifier_worker).not_to receive(:add_status) - expect(manager.add_probe(probe)).to be nil + expect do + manager.add_probe(probe) + end.to raise_error(RuntimeError, 'Instrumentation error') expect(manager.pending_probes.length).to eq 0 expect(manager.installed_probes.length).to eq 0 + + expect(manager.failed_probes.length).to eq 1 + expect(manager.failed_probes[probe.id]).to match(/Instrumentation error/) end end end