Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Report datadog.tracer.abandoned_spans health metric #3032

Open
wants to merge 6 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
28 changes: 21 additions & 7 deletions ddtrace/tracer/abandonedspans.go
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,8 @@ import (
"sync/atomic"
"time"

"gopkg.in/DataDog/dd-trace-go.v1/ddtrace/ext"
"gopkg.in/DataDog/dd-trace-go.v1/ddtrace/internal"
"gopkg.in/DataDog/dd-trace-go.v1/internal/log"
)

Expand Down Expand Up @@ -77,27 +79,36 @@ type abandonedSpanCandidate struct {
TraceID, SpanID uint64
Start int64
Finished bool
Integration string
}

func newAbandonedSpanCandidate(s *span, finished bool) *abandonedSpanCandidate {
var component string
if v, ok := s.Meta[ext.Component]; ok {
component = v
} else {
component = "manual"
}
// finished is explicit instead of implicit as s.finished may be not set
// at the moment of calling this method.
// Also, locking is not required as it's called while the span is already locked or it's
// being initialized.
return &abandonedSpanCandidate{
Name: s.Name,
TraceID: s.TraceID,
SpanID: s.SpanID,
Start: s.Start,
Finished: finished,
c := &abandonedSpanCandidate{
Name: s.Name,
TraceID: s.TraceID,
SpanID: s.SpanID,
Start: s.Start,
Finished: finished,
Integration: component,
}
return c
}

// String takes a span and returns a human-readable string representing that span.
func (s *abandonedSpanCandidate) String() string {
age := now() - s.Start
a := fmt.Sprintf("%d sec", age/1e9)
return fmt.Sprintf("[name: %s, span_id: %d, trace_id: %d, age: %s],", s.Name, s.SpanID, s.TraceID, a)
return fmt.Sprintf("[name: %s, integration: %s, span_id: %d, trace_id: %d, age: %s],", s.Name, s.Integration, s.SpanID, s.TraceID, a)
}

type abandonedSpansDebugger struct {
Expand Down Expand Up @@ -292,6 +303,9 @@ func formatAbandonedSpans(b *bucket[uint64, *abandonedSpanCandidate], interval *
if interval != nil && curTime-s.Start < interval.Nanoseconds() {
continue
}
if t, ok := internal.GetGlobalTracer().(*tracer); ok {
t.statsd.Incr("datadog.tracer.abandoned_spans", []string{"name:" + s.Name, "integration:" + s.Integration}, 1)
}
spanCount++
msg := s.String()
sb.WriteString(msg)
Expand Down
55 changes: 54 additions & 1 deletion ddtrace/tracer/abandonedspans_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,9 @@ import (
"testing"
"time"

"gopkg.in/DataDog/dd-trace-go.v1/ddtrace/ext"
"gopkg.in/DataDog/dd-trace-go.v1/internal/log"
"gopkg.in/DataDog/dd-trace-go.v1/internal/statsdtest"
"gopkg.in/DataDog/dd-trace-go.v1/internal/version"

"github.com/stretchr/testify/assert"
Expand Down Expand Up @@ -58,11 +60,62 @@ func assertProcessedSpans(assert *assert.Assertions, t *tracer, startedSpans, fi

func formatSpanString(s *span) string {
s.Lock()
msg := fmt.Sprintf("[name: %s, span_id: %d, trace_id: %d, age: %s],", s.Name, s.SpanID, s.TraceID, spanAge(s))
var integration string
if v, ok := s.Meta[ext.Component]; ok {
integration = v
} else {
integration = "manual"
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@mtoffl01 @hannahkm QQ: wouldn't a blank integration already indicate it's a span created manually? Is this required somewhere else to have manual as value?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I believe the main concern is that leaving it blank might give the impression of being misconfigured/missing information. If we manually set it to manual here, it'll give a little more clarity.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes, exactly what Hannah said.
From being on the troubleshooting side, an explicit value (e.g. "manual") is a good way to differentiate from an unset/missing value.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

LGTM then

}
msg := fmt.Sprintf("[name: %s, integration: %s, span_id: %d, trace_id: %d, age: %s],", s.Name, integration, s.SpanID, s.TraceID, spanAge(s))
s.Unlock()
return msg
}

func TestAbandonedSpansMetric(t *testing.T) {
assert := assert.New(t)
var tg statsdtest.TestStatsdClient
tp := new(log.RecordLogger)
tickerInterval = 100 * time.Millisecond
t.Run("finished", func(t *testing.T) {
tp.Reset()
tg.Reset()
defer setTestTime()()
tracer, _, _, stop := startTestTracer(t, WithLogger(tp), WithDebugSpansMode(500*time.Millisecond), withStatsdClient(&tg))
defer stop()
s := tracer.StartSpan("operation", StartTime(spanStart)).(*span)
s.Finish()
assertProcessedSpans(assert, tracer, 1, 1)
assert.Empty(tg.GetCallsByName("datadog.tracer.abandoned_spans"))
})
t.Run("open", func(t *testing.T) {
tp.Reset()
tg.Reset()
defer setTestTime()()
tracer, _, _, stop := startTestTracer(t, WithLogger(tp), WithDebugSpansMode(500*time.Millisecond), withStatsdClient(&tg))
defer stop()
tracer.StartSpan("operation", StartTime(spanStart), Tag(ext.Component, "some_integration_name"))
assertProcessedSpans(assert, tracer, 1, 0)
calls := tg.GetCallsByName("datadog.tracer.abandoned_spans")
assert.Len(calls, 1)
call := calls[0]
assert.Equal([]string{"name:operation", "integration:some_integration_name"}, call.Tags())
})
t.Run("both", func(t *testing.T) {
tp.Reset()
tg.Reset()
defer setTestTime()()
tracer, _, _, stop := startTestTracer(t, WithLogger(tp), WithDebugSpansMode(500*time.Millisecond), withStatsdClient(&tg))
defer stop()
sf := tracer.StartSpan("op", StartTime(spanStart)).(*span)
sf.Finish()
s := tracer.StartSpan("op2", StartTime(spanStart)).(*span)
assertProcessedSpans(assert, tracer, 2, 1)
calls := tg.GetCallsByName("datadog.tracer.abandoned_spans")
assert.Len(calls, 1)
s.Finish()
})
}

func TestReportAbandonedSpans(t *testing.T) {
assert := assert.New(t)
tp := new(log.RecordLogger)
Expand Down
41 changes: 39 additions & 2 deletions internal/statsdtest/statsdtest.go
Original file line number Diff line number Diff line change
Expand Up @@ -49,8 +49,16 @@ type TestStatsdCall struct {
rate float64
}

func (c *TestStatsdCall) Tags() []string {
return c.tags
func (t TestStatsdCall) Name() string {
return t.name
}

func (t TestStatsdCall) Tags() []string {
return t.tags
}

func (t TestStatsdCall) IntVal() int64 {
return t.intVal
}

func (tg *TestStatsdClient) addCount(name string, value int64) {
Expand Down Expand Up @@ -225,6 +233,35 @@ func (tg *TestStatsdClient) CallsByName() map[string]int {
return counts
}

// GetCallsByName returns a slice of TestStatsdCalls with the provided name on the TestStatsdClient
// It's useful if you want to use any TestStatsdCall method calls on the result(s)
func (tg *TestStatsdClient) GetCallsByName(name string) (calls []TestStatsdCall) {
tg.mu.RLock()
defer tg.mu.RUnlock()
for _, c := range tg.gaugeCalls {
if c.Name() == name {
calls = append(calls, c)
}
}
for _, c := range tg.incrCalls {
if c.Name() == name {
calls = append(calls, c)
}
}
for _, c := range tg.countCalls {
if c.Name() == name {
calls = append(calls, c)
}
}
for _, c := range tg.timingCalls {
if c.Name() == name {
calls = append(calls, c)
}
}
return calls
}

// FilterCallsByName returns a slice of TestStatsdCalls with the provided name, from the list of provided TestStatsdCalls
func FilterCallsByName(calls []TestStatsdCall, name string) []TestStatsdCall {
var matches []TestStatsdCall
for _, c := range calls {
Expand Down
Loading