Skip to content

Commit

Permalink
Merge pull request #473 from kerthcet/fix/server-restart
Browse files Browse the repository at this point in the history
Fix: counting server restart times error
  • Loading branch information
elezar authored Feb 2, 2024
2 parents 9d2f04b + 564f250 commit 2fe4ca6
Showing 1 changed file with 10 additions and 8 deletions.
18 changes: 10 additions & 8 deletions internal/plugin/server.go
Original file line number Diff line number Diff line change
Expand Up @@ -163,7 +163,15 @@ func (plugin *NvidiaDevicePlugin) Serve() error {
go func() {
lastCrashTime := time.Now()
restartCount := 0

for {
// quit if it has been restarted too often
// i.e. if the server has crashed more than 5 times without running for more than one hour between crashes
if restartCount > 5 {
// quit
klog.Fatalf("GRPC server for '%s' has repeatedly crashed recently. Quitting", plugin.rm.Resource())
}

klog.Infof("Starting GRPC server for '%s'", plugin.rm.Resource())
err := plugin.server.Serve(sock)
if err == nil {
Expand All @@ -172,25 +180,19 @@ func (plugin *NvidiaDevicePlugin) Serve() error {

klog.Infof("GRPC server for '%s' crashed with error: %v", plugin.rm.Resource(), err)

// restart if it has not been too often
// i.e. if server has crashed more than 5 times and it didn't last more than one hour each time
if restartCount > 5 {
// quit
klog.Fatalf("GRPC server for '%s' has repeatedly crashed recently. Quitting", plugin.rm.Resource())
}
timeSinceLastCrash := time.Since(lastCrashTime).Seconds()
lastCrashTime = time.Now()
if timeSinceLastCrash > 3600 {
// it has been more than one hour since the last crash; reset the count
// so that it reflects only the recent crash frequency
restartCount = 1
restartCount = 0
} else {
restartCount++
}
}
}()

// Wait for server to start by launching a blocking connexion
// Wait for server to start by launching a blocking connection
conn, err := plugin.dial(plugin.socket, 5*time.Second)
if err != nil {
return err
Expand Down

0 comments on commit 2fe4ca6

Please sign in to comment.