Voice client optimizations

- disco.voice.udp: follow the nonce format used by Discord for `xsalsa20_poly1305_suffix`
- disco.voice.udp: account for the 12-byte nonce of AES256 on decrypt
- disco.voice.udp: account for the different header size of `_rtpsize` modes on decrypt
elderlabs · Apr 18, 2024 · c18845d · c18845d
1 parent d988d6a
commit c18845d
Show file tree

Hide file tree

Showing 2 changed files with 33 additions and 21 deletions.
diff --git a/disco/voice/client.py b/disco/voice/client.py
@@ -56,6 +56,7 @@ class VoiceClient(LoggingClass):
 
     SUPPORTED_MODES = {
         'aead_aes256_gcm_rtpsize',
+        'aead_aes256_gcm',
         'aead_xchacha20_poly1305_rtpsize',
         'xsalsa20_poly1305_lite_rtpsize',
         'xsalsa20_poly1305_lite',
@@ -70,7 +71,7 @@ def __init__(self, client, server_id, is_dm=False, max_reconnects=5, encoder='js
         self.server_id = server_id
         self.channel_id = None
         self.is_dm = is_dm
-        self.encoder = encoder or JSONEncoder
+        self.encoder = JSONEncoder
         self.max_reconnects = max_reconnects
         self.video_enabled = False
         self.media = None
@@ -164,7 +165,7 @@ def ssrc_rtcp(self):
         return self.ssrc + 3
 
     def set_state(self, state):
-        self.log.debug('[{}] state {} -> {}'.format(self.channel_id, self.state, state))
+        self.log.debug('[{}] state {} -> {}'.format(self.channel_id or '-', self.state, state))
         prev_state = self.state
         self.state = state
         self.state_emitter.emit(state, prev_state)
@@ -391,7 +392,7 @@ def on_voice_speaking(self, data):
             priority=bool(data['speaking'] & SpeakingFlags.PRIORITY),
         )
 
-        self.client.gw.events.emit('VoiceSpeaking', payload)
+        self.client.events.emit('VoiceSpeaking', payload)
 
     def on_message(self, msg):
         try:
@@ -469,7 +470,7 @@ def on_close(self, code=None, reason=None):
                 self.log.warning(f'[{self.channel_id}] Session invalidated. Spawning fresh connection to channel.')
                 return self.connect(self.channel_id, mute=self.mute, deaf=self.deaf, video=self.video_enabled)
 
-        wait_time = 0
+        wait_time = (self._reconnects * 5) - 5
 
         self.log.info('[{}] {} in {} second{}'.format(self.channel_id, 'Resuming' if self._identified else 'Reconnecting', wait_time, 's' if wait_time != 1 else ''))
         gevent_sleep(wait_time)
@@ -490,7 +491,7 @@ def connect(self, channel_id, timeout=10, **kwargs):
             if self.state == VoiceState.CONNECTED:
                 self.log.debug('[{}] Moving to channel {}'.format(self.channel_id, channel_id))
             else:
-                self.log.debug('[{}] Attempting connection to channel id {}'.format(self.channel_id, channel_id))
+                self.log.debug('[{}] Attempting connection to channel id {}'.format(self.channel_id or '-', channel_id))
                 self.set_state(VoiceState.AWAITING_ENDPOINT)
 
         self.set_voice_state(channel_id, **kwargs)
@@ -532,7 +533,7 @@ def disconnect(self):
         if self.client.state.voice_states.get(self._session_id):
             del self.client.state.voice_states[self._session_id]
 
-        return self.client.gw.events.emit('VoiceDisconnect', self)
+        return self.client.events.emit('VoiceDisconnect', self)
 
     def send_frame(self, *args, **kwargs):
         self.udp.send_frame(*args, **kwargs)

diff --git a/disco/voice/udp.py b/disco/voice/udp.py
@@ -6,7 +6,6 @@
 
 try:
     from nacl.secret import SecretBox
-    from nacl.utils import random as nacl_random
 except ImportError:
     warnings_warn('nacl is not installed, voice support is disabled')
 
@@ -114,7 +113,7 @@ def increment_timestamp(self, by):
     def setup_encryption(self, encryption_key):
         if 'xsalsa20' in self.vc.mode:
             self._secret_box = SecretBox(encryption_key)
-        elif self.vc.mode in ('aead_xchacha20_poly1305_rtpsize', 'aead_aes256_gcm_rtpsize'):
+        elif self.vc.mode in ('aead_xchacha20_poly1305_rtpsize', 'aead_aes256_gcm', 'aead_aes256_gcm_rtpsize'):
             self._secret_box = AEScrypt(encryption_key, self.vc.mode)
 
     def send_frame(self, frame, sequence=None, timestamp=None, incr_timestamp=None):
@@ -123,21 +122,20 @@ def send_frame(self, frame, sequence=None, timestamp=None, incr_timestamp=None):
         struct_pack_into('>I', self._rtp_audio_header, 4, timestamp or self.timestamp)  # BE, unsigned int
         struct_pack_into('>i', self._rtp_audio_header, 8, self.vc.ssrc_audio)  # BE, int
 
-        if self.vc.mode == 'aead_aes256_gcm_rtpsize':
+        if self.vc.mode in ('aead_aes256_gcm', 'aead_aes256_gcm_rtpsize'):
             nonce = bytearray(12)  # 96-bits
         else:
             nonce = bytearray(24)  # 192-bits is 24 bytes
 
-        if self.vc.mode in ('xsalsa20_poly1305_lite', 'xsalsa20_poly1305_lite_rtpsize', 'aead_xchacha20_poly1305_rtpsize', 'aead_aes256_gcm_rtpsize'):
+        if self.vc.mode != 'xsalsa20_poly1305':
             # Use an incrementing number as a nonce, only first 4 bytes of the nonce is padded on
             self._nonce += 1
             if self._nonce > MAX_UINT32:
                 self._nonce = 0
             struct_pack_into('>I', nonce, 0, self._nonce)  # BE, unsigned int
+        if self.vc.mode in ('xsalsa20_poly1305_lite', 'xsalsa20_poly1305_lite_rtpsize', 'aead_xchacha20_poly1305_rtpsize', 'aead_aes256_gcm', 'aead_aes256_gcm_rtpsize'):
             nonce_padding = nonce[:4]
         elif self.vc.mode == 'xsalsa20_poly1305_suffix':
-            # Generate a nonce
-            nonce = nacl_random(SecretBox.NONCE_SIZE)
             nonce_padding = nonce
         elif self.vc.mode == 'xsalsa20_poly1305':
             # Nonce is the header
@@ -147,7 +145,7 @@ def send_frame(self, frame, sequence=None, timestamp=None, incr_timestamp=None):
             raise Exception('Voice mode `{}` is not supported.'.format(self.vc.mode))
 
         # Encrypt the payload with the nonce
-        if self.vc.mode in ('aead_xchacha20_poly1305_rtpsize', 'aead_aes256_gcm_rtpsize'):
+        if self.vc.mode in ('aead_xchacha20_poly1305_rtpsize', 'aead_aes256_gcm', 'aead_aes256_gcm_rtpsize'):
             payload = self._secret_box.encrypt(plaintext=frame, nonce=bytes(nonce), aad=bytes(self._rtp_audio_header))
         else:
             payload = self._secret_box.encrypt(plaintext=frame, nonce=bytes(nonce))
@@ -210,7 +208,7 @@ def run(self):
                     data=data[8:],
                 )
 
-                self.vc.client.gw.events.emit('RTCPData', payload)
+                self.vc.client.events.emit('RTCPData', payload)
             else:
                 sequence, timestamp, ssrc = struct_unpack_from('>HII', data, 2)  # BE, unsigned short, 2x unsigned int
 
@@ -238,7 +236,11 @@ def run(self):
                     self.log.debug('[{}] [VoiceData] Received unsupported payload type, {}'.format(self.vc.channel_id, rtp.payload_type))
                     continue
 
-                nonce = bytearray(24)
+                if self.vc.mode in ('aead_aes256_gcm', 'aead_aes256_gcm_rtpsize'):
+                    nonce = bytearray(12)  # 96-bits
+                else:
+                    nonce = bytearray(24)  # 192-bits is 24 bytes
+
                 if self.vc.mode in ('xsalsa20_poly1305_lite', 'xsalsa20_poly1305_lite_rtpsize', 'aead_xchacha20_poly1305_rtpsize', 'aead_aes256_gcm', 'aead_aes256_gcm_rtpsize'):
                     nonce[:4] = data[-4:]
                     data = data[:-4]
@@ -251,13 +253,22 @@ def run(self):
                     self.log.debug('[{}] [VoiceData] Unsupported Encryption Mode, {}'.format(self.vc.channel_id, self.vc.mode))
                     continue
 
+                header_size = 12
+                if '_rtpsize' in self.vc.mode:
+                    header_size += (rtp.csrc_count * 4)
+                    if rtp.extension:
+                        header_size += 4
+                    ctxt = data[header_size:]  # plus strip whatever additional bs is before the payload
+                else:
+                    ctxt = data[12:]
+
                 try:
-                    if self.vc.mode in ('aead_xchacha20_poly1305_rtpsize', 'aead_aes256_gcm_rtpsize'):
-                        data = self._secret_box.decrypt(ciphertext=bytes(data[12:]), nonce=bytes(nonce), aad=bytes(rtp))
+                    if self.vc.mode in ('aead_xchacha20_poly1305_rtpsize', 'aead_aes256_gcm', 'aead_aes256_gcm_rtpsize'):
+                        data = self._secret_box.decrypt(ciphertext=bytes(ctxt), nonce=bytes(nonce), aad=bytes(data[:header_size]))
                     else:
-                        data = self._secret_box.decrypt(ciphertext=bytes(data[12:]), nonce=bytes(nonce))
-                except Exception:
-                    self.log.debug('[{}] [VoiceData] Failed to decode data from ssrc {}'.format(self.vc.channel_id, rtp.ssrc))
+                        data = self._secret_box.decrypt(ciphertext=bytes(ctxt), nonce=bytes(nonce))
+                except Exception as e:
+                    self.log.debug('[{}] [VoiceData] Failed to decode data from ssrc {}: {} - {}'.format(self.vc.channel_id, rtp.ssrc, e.__class__.__name__, e))
                     continue
 
                 # RFC3550 Section 5.1 (Padding)
@@ -313,7 +324,7 @@ def run(self):
                     data=data,
                 )
 
-                self.vc.client.gw.events.emit('VoiceData', payload)
+                self.vc.client.events.emit('VoiceData', payload)
 
     def send(self, data):
         self.conn.sendto(data, (self.ip, self.port))