From 48fc82a95f100c0c47a42bb13a02cb2654554905 Mon Sep 17 00:00:00 2001 From: BinBin He Date: Fri, 27 Dec 2024 17:52:17 +0000 Subject: [PATCH] Terminally exit on unrecoverable exceptions for RCI. --- agent/app/agent.go | 17 +++++- agent/app/agent_test.go | 129 +++++++++++++++++++++++++++++++++++++++- agent/app/errors.go | 12 ++++ 3 files changed, 153 insertions(+), 5 deletions(-) diff --git a/agent/app/agent.go b/agent/app/agent.go index 6a4d0bc7ab3..fa51b0f93ba 100644 --- a/agent/app/agent.go +++ b/agent/app/agent.go @@ -447,10 +447,15 @@ func (agent *ecsAgent) doStart(containerChangeEventStream *eventstream.EventStre // Register the container instance err = agent.registerContainerInstance(client, vpcSubnetAttributes) if err != nil { - if isTransient(err) { + if isTerminal(err) { + // On unrecoverable error codes, agent should terminally exit. + seelog.Warnf("Agent will terminally exit, unable to register container instance: %v", err) + return exitcodes.ExitTerminal + } else if isTransient(err) { return exitcodes.ExitError } - return exitcodes.ExitTerminal + // Other errors are considered recoverable and will be retried. + return exitcodes.ExitError } // Load Managed Daemon images asynchronously @@ -840,7 +845,13 @@ func (agent *ecsAgent) registerContainerInstance( logger.Critical("Instance registration attempt with an invalid parameter", logger.Fields{ field.Error: err, }) - return err + return terminalError{err} + } + if utils.IsAWSErrorCodeEqual(err, ecsmodel.ErrCodeClientException) { + logger.Critical("Instance registration attempt with client performing invalid action", logger.Fields{ + field.Error: err, + }) + return terminalError{err} } if _, ok := err.(apierrors.AttributeError); ok { attributeErrorMsg := "" diff --git a/agent/app/agent_test.go b/agent/app/agent_test.go index 9dc8bbafa36..b2290a78ed3 100644 --- a/agent/app/agent_test.go +++ b/agent/app/agent_test.go @@ -222,7 +222,7 @@ func TestDoStartNewTaskEngineError(t *testing.T) { assert.Equal(t, exitcodes.ExitTerminal, exitCode) } -func TestDoStartRegisterContainerInstanceErrorTerminal(t *testing.T) { +func TestDoStartRegisterContainerInstanceErrorTransient(t *testing.T) { ctrl, credentialsManager, state, imageManager, client, dockerClient, _, _, execCmdMgr, _ := setup(t) defer ctrl.Finish() @@ -282,7 +282,7 @@ func TestDoStartRegisterContainerInstanceErrorTerminal(t *testing.T) { exitCode := agent.doStart(eventstream.NewEventStream("events", ctx), credentialsManager, state, imageManager, client, execCmdMgr) - assert.Equal(t, exitcodes.ExitTerminal, exitCode) + assert.Equal(t, exitcodes.ExitCode, exitCode) } func TestDoStartRegisterContainerInstanceErrorNonTerminal(t *testing.T) { @@ -1499,6 +1499,131 @@ func TestRegisterContainerInstanceInvalidParameterTerminalError(t *testing.T) { credentialsManager, state, imageManager, client, execCmdMgr) assert.Equal(t, exitcodes.ExitTerminal, exitCode) } + +func TestRegisterContainerInstanceInvalidClientError(t *testing.T) { + ctrl, credentialsManager, state, imageManager, client, + dockerClient, _, _, execCmdMgr, _ := setup(t) + defer ctrl.Finish() + + mockCredentialsProvider := app_mocks.NewMockCredentialsProvider(ctrl) + mockMobyPlugins := mock_mobypkgwrapper.NewMockPlugins(ctrl) + mockEC2Metadata := mock_ec2.NewMockEC2MetadataClient(ctrl) + mockPauseLoader := mock_loader.NewMockLoader(ctrl) + + mockPauseLoader.EXPECT().IsLoaded(gomock.Any()).Return(false, nil).AnyTimes() + mockPauseLoader.EXPECT().LoadImage(gomock.Any(), gomock.Any(), gomock.Any()).Return(nil, nil).AnyTimes() + mockEC2Metadata.EXPECT().PrimaryENIMAC().Return("mac", nil) + mockEC2Metadata.EXPECT().VPCID(gomock.Eq("mac")).Return("vpc-id", nil) + mockEC2Metadata.EXPECT().SubnetID(gomock.Eq("mac")).Return("subnet-id", nil) + mockServiceConnectManager := mock_serviceconnect.NewMockManager(ctrl) + mockServiceConnectManager.EXPECT().IsLoaded(gomock.Any()).Return(true, nil).AnyTimes() + mockServiceConnectManager.EXPECT().GetLoadedAppnetVersion().AnyTimes() + mockServiceConnectManager.EXPECT().GetCapabilitiesForAppnetInterfaceVersion("").AnyTimes() + mockServiceConnectManager.EXPECT().SetECSClient(gomock.Any(), gomock.Any()).AnyTimes() + + mockDaemonManager := mock_daemonmanager.NewMockDaemonManager(ctrl) + mockDaemonManagers := map[string]dm.DaemonManager{md.EbsCsiDriver: mockDaemonManager} + mockDaemonManager.EXPECT().IsLoaded(gomock.Any()).Return(true, nil).AnyTimes() + mockDaemonManager.EXPECT().LoadImage(gomock.Any(), gomock.Any()).Return(nil, nil).AnyTimes() + + dockerClient.EXPECT().SupportedVersions().Return(apiVersions).AnyTimes() + + gomock.InOrder( + client.EXPECT().GetHostResources().Return(testHostResource, nil), + mockCredentialsProvider.EXPECT().Retrieve(gomock.Any()).Return(awsv2.Credentials{}, nil), + mockMobyPlugins.EXPECT().Scan().AnyTimes().Return([]string{}, nil), + dockerClient.EXPECT().ListPluginsWithFilters(gomock.Any(), gomock.Any(), gomock.Any(), + gomock.Any()).AnyTimes().Return([]string{}, nil), + client.EXPECT().RegisterContainerInstance(gomock.Any(), gomock.Any(), gomock.Any(), gomock.Any(), gomock.Any(), + gomock.Any()).Return("", "", awserr.New("ClientException", "", nil)), + ) + mockEC2Metadata.EXPECT().OutpostARN().Return("", nil) + + cfg := getTestConfig() + ctx, cancel := context.WithCancel(context.TODO()) + // Cancel the context to cancel async routines + defer cancel() + agent := &ecsAgent{ + ctx: ctx, + ec2MetadataClient: mockEC2Metadata, + cfg: &cfg, + pauseLoader: mockPauseLoader, + credentialsCache: awsv2.NewCredentialsCache(mockCredentialsProvider), + dockerClient: dockerClient, + mobyPlugins: mockMobyPlugins, + terminationHandler: func(taskEngineState dockerstate.TaskEngineState, dataClient data.Client, taskEngine engine.TaskEngine, cancel context.CancelFunc) { + }, + serviceconnectManager: mockServiceConnectManager, + daemonManagers: mockDaemonManagers, + } + + exitCode := agent.doStart(eventstream.NewEventStream("events", ctx), + credentialsManager, state, imageManager, client, execCmdMgr) + assert.Equal(t, exitcodes.ExitTerminal, exitCode) +} + +func TestRegisterContainerInstanceThrottlingExceptionError(t *testing.T) { + ctrl, credentialsManager, state, imageManager, client, + dockerClient, _, _, execCmdMgr, _ := setup(t) + defer ctrl.Finish() + + mockCredentialsProvider := app_mocks.NewMockCredentialsProvider(ctrl) + mockMobyPlugins := mock_mobypkgwrapper.NewMockPlugins(ctrl) + mockEC2Metadata := mock_ec2.NewMockEC2MetadataClient(ctrl) + mockPauseLoader := mock_loader.NewMockLoader(ctrl) + + mockPauseLoader.EXPECT().IsLoaded(gomock.Any()).Return(false, nil).AnyTimes() + mockPauseLoader.EXPECT().LoadImage(gomock.Any(), gomock.Any(), gomock.Any()).Return(nil, nil).AnyTimes() + mockEC2Metadata.EXPECT().PrimaryENIMAC().Return("mac", nil) + mockEC2Metadata.EXPECT().VPCID(gomock.Eq("mac")).Return("vpc-id", nil) + mockEC2Metadata.EXPECT().SubnetID(gomock.Eq("mac")).Return("subnet-id", nil) + mockServiceConnectManager := mock_serviceconnect.NewMockManager(ctrl) + mockServiceConnectManager.EXPECT().IsLoaded(gomock.Any()).Return(true, nil).AnyTimes() + mockServiceConnectManager.EXPECT().GetLoadedAppnetVersion().AnyTimes() + mockServiceConnectManager.EXPECT().GetCapabilitiesForAppnetInterfaceVersion("").AnyTimes() + mockServiceConnectManager.EXPECT().SetECSClient(gomock.Any(), gomock.Any()).AnyTimes() + + mockDaemonManager := mock_daemonmanager.NewMockDaemonManager(ctrl) + mockDaemonManagers := map[string]dm.DaemonManager{md.EbsCsiDriver: mockDaemonManager} + mockDaemonManager.EXPECT().IsLoaded(gomock.Any()).Return(true, nil).AnyTimes() + mockDaemonManager.EXPECT().LoadImage(gomock.Any(), gomock.Any()).Return(nil, nil).AnyTimes() + + dockerClient.EXPECT().SupportedVersions().Return(apiVersions).AnyTimes() + + gomock.InOrder( + client.EXPECT().GetHostResources().Return(testHostResource, nil), + mockCredentialsProvider.EXPECT().Retrieve(gomock.Any()).Return(awsv2.Credentials{}, nil), + mockMobyPlugins.EXPECT().Scan().AnyTimes().Return([]string{}, nil), + dockerClient.EXPECT().ListPluginsWithFilters(gomock.Any(), gomock.Any(), gomock.Any(), + gomock.Any()).AnyTimes().Return([]string{}, nil), + client.EXPECT().RegisterContainerInstance(gomock.Any(), gomock.Any(), gomock.Any(), gomock.Any(), gomock.Any(), + gomock.Any()).Return("", "", awserr.New("ThrottlingException", "", nil)), + ) + mockEC2Metadata.EXPECT().OutpostARN().Return("", nil) + + cfg := getTestConfig() + ctx, cancel := context.WithCancel(context.TODO()) + // Cancel the context to cancel async routines + defer cancel() + agent := &ecsAgent{ + ctx: ctx, + ec2MetadataClient: mockEC2Metadata, + cfg: &cfg, + pauseLoader: mockPauseLoader, + credentialsCache: awsv2.NewCredentialsCache(mockCredentialsProvider), + dockerClient: dockerClient, + mobyPlugins: mockMobyPlugins, + terminationHandler: func(taskEngineState dockerstate.TaskEngineState, dataClient data.Client, taskEngine engine.TaskEngine, cancel context.CancelFunc) { + }, + serviceconnectManager: mockServiceConnectManager, + daemonManagers: mockDaemonManagers, + } + + exitCode := agent.doStart(eventstream.NewEventStream("events", ctx), + credentialsManager, state, imageManager, client, execCmdMgr) + assert.Equal(t, exitcodes.ExitError, exitCode) +} + func TestMergeTags(t *testing.T) { ec2Key := "ec2Key" ec2Value := "ec2Value" diff --git a/agent/app/errors.go b/agent/app/errors.go index 06b02349b84..76eacb50b92 100644 --- a/agent/app/errors.go +++ b/agent/app/errors.go @@ -24,6 +24,18 @@ func isTransient(err error) bool { return ok } +type terminalError struct { + error +} + +// isTerminal returns true if the error is already wrapped as an unrecoverable condition +// which will allow agent to exit terminally. +func isTerminal(err error) bool { + // Check if the error is already wrapped as a terminalError + _, terminal := err.(terminalError) + return terminal +} + // clusterMismatchError represents a mismatch in cluster name between the // state file and the config object type clusterMismatchError struct {