Skip to content

Commit

Permalink
track failed probes
Browse files Browse the repository at this point in the history
  • Loading branch information
p committed Oct 26, 2024
1 parent 91425a6 commit e1ecf58
Show file tree
Hide file tree
Showing 4 changed files with 43 additions and 5 deletions.
6 changes: 6 additions & 0 deletions lib/datadog/di/error.rb
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,12 @@ class AgentCommunicationError < Error
# that does not in fact exist anywhere (e.g. due to a misspelling).
class DITargetNotDefined < Error
end

# Raised when trying to install a probe whose installation failed
# earlier in the same process. This exception should contain the
# original exception report from initial installation attempt.
class ProbePreviouslyFailed < Error
end
end
end
end
25 changes: 22 additions & 3 deletions lib/datadog/di/probe_manager.rb
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@ def initialize(settings, instrumenter, probe_notification_builder,
@logger = logger
@installed_probes = {}
@pending_probes = {}
@failed_probes = {}

@definition_trace_point = TracePoint.trace(:end) do |tp|
begin
Expand Down Expand Up @@ -56,9 +57,21 @@ def clear_hooks
attr_reader :installed_probes
attr_reader :pending_probes

# Probes that failed to instrument for reasons other than the target is
# not yet loaded are added to this collection, so that we do not try
# to instrument them every time remote configuration is processed.
attr_reader :failed_probes

# config is one probe info
def add_probe(probe)
# TODO lock here?

# Probe failed to install previously, do not try to install it again.
if msg = failed_probes[probe.id]
# TODO test this path
raise Error::ProbePreviouslyFailed, msg
end

begin
instrumenter.hook(probe, &method(:probe_executed_callback))

Expand All @@ -76,11 +89,17 @@ def add_probe(probe)
false
end
rescue => exc
# In "propagate all exceptions" mode we will try to instrument again.
raise if settings.dynamic_instrumentation.internal.propagate_all_exceptions
# Silence all exceptions?
# TODO should we propagate here and rescue upstream?

logger.warn("Error processing probe configuration: #{exc.class}: #{exc}")
nil
# TODO report probe as failed to agent since we won't attempt to
# install it again.

# TODO add top stack frame to message
failed_probes[probe.id] = "#{exc.class}: #{exc}"

raise
end

def remove_other_probes(probe_ids)
Expand Down
6 changes: 6 additions & 0 deletions lib/datadog/di/remote.rb
Original file line number Diff line number Diff line change
Expand Up @@ -59,6 +59,12 @@ def receivers(telemetry)

component.logger.warn("Unhandled exception adding probe in DI remote receiver: #{e.class}: #{e}")

# If a probe fails to install, we will mark the content
# as errored. On subsequent remote configuration application
# attemps, probe manager will raise the "previously errored"
# exception and we'll rescue it here, again marking the
# content as errored but with a somewhat different exception
# message.
content.errored("Error applying dynamic instrumentation configuration: #{e.class.name} #{e.message}: #{Array(e.backtrace).join("\n")}")
end

Expand Down
11 changes: 9 additions & 2 deletions spec/datadog/di/probe_manager_spec.rb
Original file line number Diff line number Diff line change
Expand Up @@ -66,6 +66,7 @@ class ProbeManagerSpecTestClass; end
expect(manager.add_probe(probe)).to be true

expect(manager.pending_probes.length).to eq 0
expect(manager.failed_probes.length).to eq 0

expect(manager.installed_probes.length).to eq 1
expect(manager.installed_probes["3ecfd456-2d7c-4359-a51f-d4cc44141ffe"]).to be(probe)
Expand All @@ -88,11 +89,12 @@ class ProbeManagerSpecTestClass; end
expect(manager.pending_probes["3ecfd456-2d7c-4359-a51f-d4cc44141ffe"]).to be(probe)

expect(manager.installed_probes.length).to eq 0
expect(manager.failed_probes.length).to eq 0
end
end

context 'when there is an exception during instrumentation' do
it 'returns nil, logs warning and drops probe' do
it 'logs warning, drops probe and reraises the exception' do
expect(logger).to receive(:warn) do |msg|
expect(msg).to match(/Error processing probe configuration.*Instrumentation error/)
end
Expand All @@ -105,11 +107,16 @@ class ProbeManagerSpecTestClass; end
expect(probe_notification_builder).not_to receive(:build_installed)
expect(probe_notifier_worker).not_to receive(:add_status)

expect(manager.add_probe(probe)).to be nil
expect do
manager.add_probe(probe)
end.to raise_error(RuntimeError, 'Instrumentation error')

expect(manager.pending_probes.length).to eq 0

expect(manager.installed_probes.length).to eq 0

expect(manager.failed_probes.length).to eq 1
expect(manager.failed_probes[probe.id]).to match(/Instrumentation error/)
end
end
end
Expand Down

0 comments on commit e1ecf58

Please sign in to comment.