diff --git a/pkg/cli/tsdump_test.go b/pkg/cli/tsdump_test.go
index 837c1810b8c1..8e4969850c34 100644
--- a/pkg/cli/tsdump_test.go
+++ b/pkg/cli/tsdump_test.go
@@ -164,7 +164,7 @@ func parseDDInput(t *testing.T, input string, w *datadogWriter) {
 			(data != nil && data.Metric != metricName ||
 				(data != nil && source != nameValueTimestamp[1])) {
 			if data != nil {
-				err := w.emitDataDogMetrics([]DatadogSeries{*data})
+				_, err := w.emitDataDogMetrics([]DatadogSeries{*data})
 				require.NoError(t, err)
 			}
 			data = &DatadogSeries{
@@ -182,7 +182,7 @@ func parseDDInput(t *testing.T, input string, w *datadogWriter) {
 			Timestamp: ts,
 		})
 	}
-	err := w.emitDataDogMetrics([]DatadogSeries{*data})
+	_, err := w.emitDataDogMetrics([]DatadogSeries{*data})
 	require.NoError(t, err)
 }
 
diff --git a/pkg/cli/tsdump_upload.go b/pkg/cli/tsdump_upload.go
index 936bfbe60c1b..806e69e58aca 100644
--- a/pkg/cli/tsdump_upload.go
+++ b/pkg/cli/tsdump_upload.go
@@ -37,6 +37,10 @@ const (
 	DatadogSeriesTypeRate
 	DatadogSeriesTypeGauge
 )
+const (
+	UploadStatusSuccess = "Success"
+	UploadStatusFailure = "Failed"
+)
 
 var (
 	// each site in datadog has a different host name. ddSiteToHostMap
@@ -207,12 +211,14 @@ func dump(kv *roachpb.KeyValue) (*DatadogSeries, error) {
 
 var printLock syncutil.Mutex
 
-func (d *datadogWriter) emitDataDogMetrics(data []DatadogSeries) error {
+func (d *datadogWriter) emitDataDogMetrics(data []DatadogSeries) ([]string, error) {
 	tags := getUploadTags(d)
 
+	emittedMetrics := make([]string, len(data))
 	for i := 0; i < len(data); i++ {
 		data[i].Tags = append(data[i].Tags, tags...)
 		data[i].Metric = d.namePrefix + data[i].Metric
+		emittedMetrics[i] = data[i].Metric
 	}
 
 	// When running in init mode, we insert zeros with the current
@@ -242,7 +248,7 @@ func (d *datadogWriter) emitDataDogMetrics(data []DatadogSeries) error {
 		)
 	}()
 
-	return d.flush(data)
+	return emittedMetrics, d.flush(data)
 }
 
 func getUploadTags(d *datadogWriter) []string {
@@ -294,7 +300,8 @@ func (d *datadogWriter) flush(data []DatadogSeries) error {
 	}
 
 	retryOpts := base.DefaultRetryOptions()
-	retryOpts.MaxRetries = 5
+	retryOpts.MaxBackoff = 2 * time.Second
+	retryOpts.MaxRetries = 100
 	var req *http.Request
 	for retry := retry.Start(retryOpts); retry.Next(); {
 		req, err = http.NewRequest("POST", d.targetURL, &zipBuf)
@@ -344,12 +351,20 @@ func (d *datadogWriter) upload(fileName string) error {
 	var wg sync.WaitGroup
 	ch := make(chan []DatadogSeries, 4000)
 
-	// errorsInDDUpload wraps the error collection in a mutex since
+	// metricsUploadState wraps the failed metrics collection in a mutex since
 	// they're collected concurrently.
-	var errorsInDDUpload struct {
+	var metricsUploadState struct {
 		syncutil.Mutex
-		errors []string
+		// We use a map because we want the unique metric names that failed across all tsdump upload requests.
+		uploadFailedMetrics     map[string]struct{}
+		isSingleUploadSucceeded bool
 	}
+	metricsUploadState.uploadFailedMetrics = make(map[string]struct{})
+	metricsUploadState.isSingleUploadSucceeded = false
+
+	markSuccessOnce := sync.OnceFunc(func() {
+		metricsUploadState.isSingleUploadSucceeded = true
+	})
 
 	// Note(davidh): This was previously set at 1000 and we'd get regular
 	// 400s from Datadog with the cryptic `Unable to decompress payload`
@@ -358,16 +373,19 @@
 	for i := 0; i < 10; i++ {
 		go func() {
 			for data := range ch {
-				err := d.emitDataDogMetrics(data)
+				emittedMetrics, err := d.emitDataDogMetrics(data)
 				if err != nil {
 					func() {
-						errorsInDDUpload.Lock()
-						defer errorsInDDUpload.Unlock()
-						errorsInDDUpload.errors = append(errorsInDDUpload.errors,
-							fmt.Sprintf("retries exhausted for datadog upload for series %s with error %v\n", data[0].Metric, err))
+						metricsUploadState.Lock()
+						defer metricsUploadState.Unlock()
+						for _, metric := range emittedMetrics {
+							metricsUploadState.uploadFailedMetrics[metric] = struct{}{}
+						}
+					}()
+				} else {
+					func() {
+						markSuccessOnce()
 					}()
-					wg.Done()
-					return
 				}
 				wg.Done()
 			}
@@ -394,11 +412,35 @@ func (d *datadogWriter) upload(fileName string) error {
 	year, month, day := d.uploadTime.Date()
 	dashboardLink := fmt.Sprintf(datadogDashboardURLFormat, debugTimeSeriesDumpOpts.clusterLabel, d.uploadID, day, int(month), year, fromUnixTimestamp, toUnixTimestamp)
 
-	if len(errorsInDDUpload.errors) != 0 {
-		fmt.Printf("\n%d upload errors occurred:\n%s\n", len(errorsInDDUpload.errors), strings.Join(errorsInDDUpload.errors, "\n"))
+	var uploadStatus string
+	if metricsUploadState.isSingleUploadSucceeded {
+		uploadStatus = UploadStatusSuccess
+	} else {
+		uploadStatus = UploadStatusFailure
+	}
+	fmt.Printf("\nUpload status: %s\n", uploadStatus)
+
+	if metricsUploadState.isSingleUploadSucceeded {
+		if len(metricsUploadState.uploadFailedMetrics) != 0 {
+			fmt.Printf("The Tsdump upload to Datadog succeeded, but %d metrics failed to upload."+
+				" These failures can be due to transient network errors. If any of these metrics are critical for your investigation,"+
+				" please re-upload the Tsdump:\n%s\n", len(metricsUploadState.uploadFailedMetrics), strings.Join(func() []string {
+				var failedMetricsList []string
+				index := 1
+				for metric := range metricsUploadState.uploadFailedMetrics {
+					metric = fmt.Sprintf("%d) %s", index, metric)
+					failedMetricsList = append(failedMetricsList, metric)
+					index++
+				}
+				return failedMetricsList
+			}(), "\n"))
+		}
+
+		fmt.Println("\nupload id:", d.uploadID)
+		fmt.Printf("datadog dashboard link: %s\n", dashboardLink)
+	} else {
+		fmt.Println("All metric uploads failed. Please re-upload the Tsdump.")
 	}
 
-	fmt.Println("\nupload id:", d.uploadID)
-	fmt.Printf("datadog dashboard link: %s\n", dashboardLink)
 	close(ch)
 	return nil
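Note on the retry tuning in flush: trading 5 attempts for 100 attempts capped at a 2-second backoff keeps the wait between tries bounded while riding out transient Datadog errors. The real code uses CockroachDB's base.DefaultRetryOptions() and the util/retry package; the standalone helper below is purely illustrative of the capped-exponential-backoff shape, using only the standard library.

package main

import (
	"errors"
	"fmt"
	"time"
)

// retryWithCappedBackoff retries op with exponential backoff capped at
// maxBackoff, giving up after maxRetries failed attempts. Hypothetical
// helper, not CockroachDB's retry API.
func retryWithCappedBackoff(op func() error, maxRetries int, maxBackoff time.Duration) error {
	backoff := 50 * time.Millisecond
	var err error
	for attempt := 0; attempt <= maxRetries; attempt++ {
		if err = op(); err == nil {
			return nil
		}
		if attempt == maxRetries {
			break // retries exhausted; no point sleeping again
		}
		time.Sleep(backoff)
		backoff *= 2
		if backoff > maxBackoff {
			backoff = maxBackoff // cap, mirroring retryOpts.MaxBackoff = 2 * time.Second
		}
	}
	return fmt.Errorf("retries exhausted: %w", err)
}

func main() {
	calls := 0
	err := retryWithCappedBackoff(func() error {
		calls++
		if calls < 3 {
			return errors.New("transient error from Datadog")
		}
		return nil
	}, 100, 2*time.Second)
	fmt.Println("err:", err, "after", calls, "calls")
}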
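Note on the concurrency bookkeeping in upload: ten goroutines drain the channel, so failed metric names are recorded in a mutex-guarded map (set semantics deduplicate a metric that fails in several batches), while sync.OnceFunc (Go 1.21+) lets the first successful batch flip the status flag exactly once. A self-contained sketch of the pattern follows; the batch data and worker logic are hypothetical stand-ins, not the tsdump code itself.

package main

import (
	"fmt"
	"sync"
)

func main() {
	// Mutex-guarded state shared by concurrent uploaders, shaped like
	// metricsUploadState in the diff.
	var state struct {
		sync.Mutex
		failed    map[string]struct{} // unique metric names that failed
		succeeded bool
	}
	state.failed = make(map[string]struct{})

	// sync.OnceFunc returns a function that runs its argument at most once,
	// so concurrent successful batches race safely to set the flag.
	markSuccessOnce := sync.OnceFunc(func() { state.succeeded = true })

	// Hypothetical metric batches; even-indexed batches "fail" to upload.
	batches := [][]string{{"sql.conns", "sql.txns"}, {"sql.conns"}, {"gossip.bytes"}}

	var wg sync.WaitGroup
	for i, batch := range batches {
		wg.Add(1)
		go func(i int, batch []string) {
			defer wg.Done()
			if i%2 == 0 {
				// Record every metric in the failed batch under the lock;
				// the map collapses duplicates across batches.
				state.Lock()
				defer state.Unlock()
				for _, m := range batch {
					state.failed[m] = struct{}{}
				}
				return
			}
			markSuccessOnce()
		}(i, batch)
	}
	wg.Wait()

	fmt.Println("succeeded:", state.succeeded, "unique failed metrics:", len(state.failed))
}

Running this prints "succeeded: true unique failed metrics: 3": sql.conns fails in two batches but is counted once, which is exactly why the diff switched from an error slice to a map keyed by metric name.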