From 6c736b6cfae8123487b4dc787973f1ab778d9b36 Mon Sep 17 00:00:00 2001 From: Gerrit Renker Date: Wed, 2 Aug 2023 09:07:57 -0400 Subject: [PATCH] [aws-cpp-sdk-core]: increase STS reliability and retries This fixes issues we have repeatedly experienced when using STS for authentication in a large Kubernetes cluster with heavy load on STS: 1. The default connect timeout of 1s is too low. Connections can slow down, for example when kube DNS is under very high load. A value of 30 seconds has proven to be robust. 2. The default retry parameters are too low: authentication would frequently fail whenever STS was under higher load. The new retry settings have worked in production for about 2 years. --- .../source/auth/STSCredentialsProvider.cpp | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/src/aws-cpp-sdk-core/source/auth/STSCredentialsProvider.cpp b/src/aws-cpp-sdk-core/source/auth/STSCredentialsProvider.cpp index 7747d86951c..f30eb561d5d 100644 --- a/src/aws-cpp-sdk-core/source/auth/STSCredentialsProvider.cpp +++ b/src/aws-cpp-sdk-core/source/auth/STSCredentialsProvider.cpp @@ -100,12 +100,19 @@ STSAssumeRoleWebIdentityCredentialsProvider::STSAssumeRoleWebIdentityCredentials Aws::Client::ClientConfiguration config; config.scheme = Aws::Http::Scheme::HTTPS; config.region = tmpRegion; + // Set the Connect Timeout to 30s. Default of 1s causes a timeout when STS is under load. + config.connectTimeoutMs = 30000; Aws::Vector retryableErrors; retryableErrors.push_back("IDPCommunicationError"); retryableErrors.push_back("InvalidIdentityToken"); - config.retryStrategy = Aws::MakeShared(STS_ASSUME_ROLE_WEB_IDENTITY_LOG_TAG, retryableErrors, 3/*maxRetries*/); + // The retry parameters are optimized for STS to still respond when under heavy load in production. 
+ config.retryStrategy = Aws::MakeShared( + STS_ASSUME_ROLE_WEB_IDENTITY_LOG_TAG, + retryableErrors, + 9, /*maxRetries*/ + 588 /*scaleFactor*/); m_client = Aws::MakeUnique(STS_ASSUME_ROLE_WEB_IDENTITY_LOG_TAG, config); m_initialized = true;