Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Network Monitor causes race conditions when multiple interfaces are connected #2130

Closed
hurricanehrndz opened this issue Jun 13, 2024 · 2 comments
Labels
bug Something isn't working client

Comments

@hurricanehrndz
Copy link
Contributor

hurricanehrndz commented Jun 13, 2024

Describe the problem

When more than one network interface is connected to the same network, the network monitor will try and restart the engine multiple times in succession

This error will occur

2024-05-29T17:14:13-04:00 INFO client/internal/connect.go:339: using 43196 as wireguard port: 43195 is in use
2024-05-29T17:14:13-04:00 INFO client/internal/routemanager/manager.go:93: Routing setup complete
2024-05-29T17:14:13-04:00 ERRO client/internal/engine.go:313: failed creating tunnel interface utun4319: [resource busy]
2024-05-29T17:14:13-04:00 INFO client/internal/routemanager/manager.go:117: Routing cleanup complete
2024-05-29T17:14:13-04:00 ERRO client/internal/connect.go:261: error while starting Netbird Connection Engine: create wg interface: resource busy

To Reproduce

Steps to reproduce the behavior:
Using macOS device, connected via ethernet and wifi, wait, disconnect, watch netbird fail

Expected behavior

For a resource conflict and race condition not to occur

Are you using NetBird Cloud?

No

NetBird version

0.28/master

** Possible Solution **

Add a de-bounce mechanism to avoid multiple engine restarts within 3s of a restart

From a1f8b0ed4c5d49bb0c00243f5b5a11becae3cf1e Mon Sep 17 00:00:00 2001
From: Carlos Hernandez <[email protected]>
Date: Fri, 31 May 2024 08:16:30 -0600
Subject: [PATCH] Ignore route changes if last was less than 2s ago

---
 client/internal/engine.go | 71 ++++++++++++++++++++++++++-------------
 1 file changed, 48 insertions(+), 23 deletions(-)

diff --git a/client/internal/engine.go b/client/internal/engine.go
index 2cbde13a..915e3ea0 100644
--- a/client/internal/engine.go
+++ b/client/internal/engine.go
@@ -152,6 +152,8 @@ type Engine struct {
 	wgProbe     *Probe
 
 	wgConnWorker sync.WaitGroup
+
+	lastNetworkChangeTimestamp *NetworkChangeTimestamp
 }
 
 // Peer is an instance of the Connection Peer
@@ -160,6 +162,17 @@ type Peer struct {
 	WgAllowedIps string
 }
 
+// NetworkChangeTimestamp is a thread-safe structure that holds the timestamp of the last network change
+type NetworkChangeTimestamp struct {
+	mu    sync.Mutex
+	value int64
+}
+
+// NewNetworkChangeTimestamp creates a new NetworkChangeTimestamp instance with the current time as the initial value
+func NewNetworkChangeTimestamp() *NetworkChangeTimestamp {
+	return &NetworkChangeTimestamp{value: time.Now().Unix()}
+}
+
 // NewEngine creates a new Connection Engine
 func NewEngine(
 	clientCtx context.Context,
@@ -199,25 +212,25 @@ func NewEngineWithProbes(
 	relayProbe *Probe,
 	wgProbe *Probe,
 ) *Engine {
-
 	return &Engine{
-		clientCtx:      clientCtx,
-		clientCancel:   clientCancel,
-		signal:         signalClient,
-		mgmClient:      mgmClient,
-		peerConns:      make(map[string]*peer.Conn),
-		syncMsgMux:     &sync.Mutex{},
-		config:         config,
-		mobileDep:      mobileDep,
-		STUNs:          []*stun.URI{},
-		TURNs:          []*stun.URI{},
-		networkSerial:  0,
-		sshServerFunc:  nbssh.DefaultSSHServer,
-		statusRecorder: statusRecorder,
-		mgmProbe:       mgmProbe,
-		signalProbe:    signalProbe,
-		relayProbe:     relayProbe,
-		wgProbe:        wgProbe,
+		clientCtx:                  clientCtx,
+		clientCancel:               clientCancel,
+		signal:                     signalClient,
+		mgmClient:                  mgmClient,
+		peerConns:                  make(map[string]*peer.Conn),
+		syncMsgMux:                 &sync.Mutex{},
+		config:                     config,
+		mobileDep:                  mobileDep,
+		STUNs:                      []*stun.URI{},
+		TURNs:                      []*stun.URI{},
+		networkSerial:              0,
+		sshServerFunc:              nbssh.DefaultSSHServer,
+		statusRecorder:             statusRecorder,
+		mgmProbe:                   mgmProbe,
+		signalProbe:                signalProbe,
+		relayProbe:                 relayProbe,
+		wgProbe:                    wgProbe,
+		lastNetworkChangeTimestamp: &NetworkChangeTimestamp{},
 	}
 }
 
@@ -228,6 +241,7 @@ func (e *Engine) Stop() error {
 	if e.cancel != nil {
 		e.cancel()
 	}
+	e.cancel = nil
 
 	// stopping network monitor first to avoid starting the engine again
 	if e.networkMonitor != nil {
@@ -358,7 +372,6 @@ func (e *Engine) Start() error {
 // modifyPeers updates peers that have been modified (e.g. IP address has been changed).
 // It closes the existing connection, removes it from the peerConns map, and creates a new one.
 func (e *Engine) modifyPeers(peersUpdate []*mgmProto.RemotePeerConfig) error {
-
 	// first, check if peers have been modified
 	var modified []*mgmProto.RemotePeerConfig
 	for _, p := range peersUpdate {
@@ -481,7 +494,8 @@ func sendSignal(message *sProto.Message, s signal.Client) error {
 
 // SignalOfferAnswer signals either an offer or an answer to remote peer
 func SignalOfferAnswer(offerAnswer peer.OfferAnswer, myKey wgtypes.Key, remoteKey wgtypes.Key, s signal.Client,
-	isAnswer bool) error {
+	isAnswer bool,
+) error {
 	var t sProto.Body_Type
 	if isAnswer {
 		t = sProto.Body_ANSWER
@@ -539,7 +553,6 @@ func isNil(server nbssh.Server) bool {
 }
 
 func (e *Engine) updateSSH(sshConf *mgmProto.SSHConfig) error {
-
 	if !e.config.ServerSSHAllowed {
 		log.Warnf("running SSH server is not permitted")
 		return nil
@@ -672,7 +685,6 @@ func (e *Engine) updateTURNs(turns []*mgmProto.ProtectedHostConfig) error {
 }
 
 func (e *Engine) updateNetworkMap(networkMap *mgmProto.NetworkMap) error {
-
 	// intentionally leave it before checking serial because for now it can happen that peer IP changed but serial didn't
 	if networkMap.GetPeerConfig() != nil {
 		err := e.updateConfig(networkMap.GetPeerConfig())
@@ -1110,7 +1122,7 @@ func (e *Engine) receiveSignalEvents() {
 
 func (e *Engine) parseNATExternalIPMappings() []string {
 	var mappedIPs []string
-	var ignoredIFaces = make(map[string]interface{})
+	ignoredIFaces := make(map[string]interface{})
 	for _, iFace := range e.config.IFaceBlackList {
 		ignoredIFaces[iFace] = nil
 	}
@@ -1411,7 +1423,20 @@ func (e *Engine) startNetworkMonitor() {
 	e.networkMonitor = networkmonitor.New()
 	go func() {
 		err := e.networkMonitor.Start(e.ctx, func() {
+			log.Infof("Network monitor detected network change")
+			e.lastNetworkChangeTimestamp.mu.Lock()
+			defer e.lastNetworkChangeTimestamp.mu.Unlock()
+			now := time.Now().Unix()
+			if e.lastNetworkChangeTimestamp.value + 3 > now  || e.cancel == nil {
+				log.Infof("Network monitor, skipping engine restart")
+				return
+			}
 			log.Infof("Network monitor detected network change, restarting engine")
+			e.lastNetworkChangeTimestamp.value = now
+			if e.cancel == nil {
+				log.Info("restart already in progress, skipping")
+				return
+			}
 			if err := e.Stop(); err != nil {
 				log.Errorf("Failed to stop engine: %v", err)
 			}
@hurricanehrndz
Copy link
Contributor Author

@bcmmbaga you may want to look at this because of #2126

@hurricanehrndz
Copy link
Contributor Author

hurricanehrndz commented Jul 3, 2024

closed by #2225

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Labels
bug Something isn't working client
Projects
None yet
Development

No branches or pull requests

2 participants