我正在尝试解密从使用
xsalsa20_poly1305
加密模式的 Discord 收到的语音数据。我的目标是录制并使用音频与 AI 聊天。
我能做错什么吗?感谢您的帮助!
我的代码:
async def record_audio(udp_socket, ssrc, secret_key):
box = nacl.secret.SecretBox(bytes(secret_key)) # TODO: Fix decryption xsalsa20_poly1305
print("Listening for audio data...")
try:
response, _ = udp_socket.recvfrom(74)
print(f"Received response: {response}")
# Process the response...
except socket.timeout:
print("IP discovery timeout")
except Exception as e:
print(f"Unexpected error during IP discovery: {e}")
return None, None
while True:
print("Waiting for audio data...")
try:
ready, _, _ = select.select([udp_socket], [], [], 5.0)
if udp_socket in ready:
data, addr = udp_socket.recvfrom(65536) # Adjust buffer size as necessary
print(f"Received {len(data)} bytes from {addr}: {data.hex()}")
if len(data) > 12:
# Extract the RTP header
header = data[:12]
# Construct the nonce
nonce = header + b'\x00' * 12
print(f"Nonce: {len(nonce)} bytes")
# Get the encrypted audio data
encrypted = data[12:]
print(f"Encrypted audio data: {len(encrypted)} bytes")
#The rest of the data is the encrypted audio data (Should be 48 - 24 = 24 bytes)
#nonce = data[:12]
#print(f"Nonce: {nonce}")
#if len(nonce) < 12:
# nonce.ljust(24, b'\x00')
#remaining 12 bytes can be zeros or another fixed pattern
#nonce = nonce_part + bytes(12)
#copy the RTP header to get the nonce
#nonce = bytearray(24)
#nonce[:12] = data[:12]#data[:12]
#get the encrypted audio data
#encrypted = data[12:]
print(f"Encrypted audio data: {bytes(encrypted)}")
try:
audio_data = box.decrypt(bytes(data), bytes(nonce))
print("Received audio data")
except Exception as e:
print(f"Decryption error: {e}")
except Exception as e:
print(f"Error receiving audio data: {e}")
break
编辑:密钥直接从从 Discord 接收的操作码 4 对象传递。
输出:
Waiting for audio data...
Received 48 bytes from ('66.22.243.22', 50023): 81c9000700013adfaf6439133ca81bfcd2b35eb743f2a4af0165e3cf0517d8efee5dae36ec6653c88a2d625064af33d6
Nonce: 24 bytes
Nonce: b'\x81\xc9\x00\x07\x00\x01:\xdf\xafd9\x13\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00'
Encrypted audio data: 36 bytes
Encrypted audio data: b'<\xa8\x1b\xfc\xd2\xb3^\xb7C\xf2\xa4\xaf\x01e\xe3\xcf\x05\x17\xd8\xef\xee]\xae6\xecfS\xc8\x8a-bPd\xaf3\xd6'
Decryption error: Decryption failed. Ciphertext failed verification
出于安全原因,我无法包含我的 Discord 令牌,但有一些测试数据:
{'op': 4, 'd': {'video_codec': 'H264', 'secure_frames_version': 0, 'secret_key': [20, 115, 239, 10, 206, 186, 11, 248, 52, 47, 193, 69, 170, 89, 146, 187, 215, 181, 4, 177, 173, 132, 50, 212, 141, 194, 52, 217, 219, 17, 111, 5], 'mode': 'xsalsa20_poly1305', 'media_session_id': '77d90ef5c4aa124c0dcd6d39bbe88f9f', 'audio_codec': 'opus'}}
Udp socket: <socket.socket fd=604, family=2, type=2, proto=0, laddr=('0.0.0.0', 56866)>
SSRC: 112825
Secret key: [20, 115, 239, 10, 206, 186, 11, 248, 52, 47, 193, 69, 170, 89, 146, 187, 215, 181, 4, 177, 173, 132, 50, 212, 141, 194, 52, 217, 219, 17, 111, 5]
Received data: b'\x81\xc9\x00\x07\x00\x00gIZ.Y\xaf\xf8\x94\xb4a}?gm"\xc6R\x02\\\x13\xaf>@\xf0\xe8\xca\xd0\x90\xf3\x16\x89h\x14\x81s\xa0\x00\xf3$v\x99|'
字节数组中的所有内容:
Data: [129, 201, 0, 7, 0, 1, 87, 149, 132, 179, 156, 19, 161, 42, 79, 112, 160, 142, 72, 43, 68, 43, 225, 201, 66, 97, 38, 88, 120, 123, 192, 102, 18, 163, 126, 210, 96, 21, 113, 212, 66, 63, 102, 7, 123, 24, 141, 1]
RTP header: [129, 201, 0, 7, 0, 1, 87, 149, 132, 179, 156, 19]
Nonce: [129, 201, 0, 7, 0, 1, 87, 149, 132, 179, 156, 19, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
Voice data: [161, 42, 79, 112, 160, 142, 72, 43, 68, 43, 225, 201, 66, 97, 38, 88, 120, 123, 192, 102, 18, 163, 126, 210, 96, 21, 113, 212, 66, 63, 102, 7, 123, 24, 141, 1]
所以我找到了解决方案。当语音通道中无人时,Discord 似乎会发送大量垃圾数据,然后是五个静默包(0xF8、0xFF、0xFE),然后再次发送垃圾、无法解密的数据。为了接收一些 XSalsa20_Poly1305 兼容字节,必须有人位于语音通道中并且必须说话或发出任何声音。我最初没有在语音中进行测试,因为我认为静音与语音的格式相同,如果我解密它,我就可以开始了...
顺便说一句,感谢 Topaco 也指出了我的实现中的一些关键问题!