请注意,我了解 TCP 连接的流式传输特性,我的问题与此无关。我怀疑的是 Linux 套接字实现中存在 bug。
更新:根据评论中的建议,我修改了代码,使其不仅检查 recv() 的返回值是否为 -1,还检查它是否为任何负值。这只是以防万一,结果没有变化。
我有一个用 C 编写的非常简单的 TCP 客户端/服务器应用程序。 该项目的完整代码可在 github 上找到。
客户端运行多个并行线程,每个线程执行以下操作:
/*
 * Send exactly `len` bytes from `buff` over the socket `fd`.
 *
 * When `by_frags` is true the data is handed to send() in pieces of random
 * size (1 .. remaining bytes); otherwise each send() is offered everything
 * that is still pending.
 *
 * send() on a stream socket may accept fewer bytes than requested, so the
 * position is advanced by the count actually accepted and the remainder is
 * retried instead of being reported as a failure.
 *
 * Returns `len` once every byte has been accepted, or (size_t) -1 if send()
 * fails or makes no progress. A zero `len` returns 0 without calling send().
 */
static size_t send_ex(int fd, const uint8_t *buff, size_t len, bool by_frags)
{
    size_t pos = 0;

    while ( pos < len )
    {
        size_t chunk_len = len - pos;
        ssize_t sent;

        if ( by_frags )
        {
            /* Random fragment length in [1, remaining]. */
            chunk_len = ((size_t) random() % chunk_len) + 1;
        }

        sent = send(fd, (const char *) &buff[pos], chunk_len, 0);
        if ( sent <= 0 ) {
            /* Error, or no progress at all: bail out rather than spin. */
            return (size_t) -1;
        }
        pos += (size_t) sent;
    }
    return len;
}
/*
 * Client worker thread: repeatedly connects to the server, sends one fixed
 * 16-byte test block and closes the connection.
 *
 * `arg` is a connection_ctx_t *. ctx->serveraddr is the address passed to
 * connect(), and ctx->sent_packs is incremented after each block that was
 * sent in full.
 *
 * The number of iterations, the fragmentation flag and the pause between
 * connections are read from `count`, `frags` and `delay`, which are declared
 * outside this function (presumably command-line settings -- confirm).
 *
 * The loop stops at the first socket(), connect() or send failure; a failing
 * close() is only reported. Always returns NULL.
 */
static void *connection_task(void *arg)
{
    connection_ctx_t *ctx = (connection_ctx_t *) arg;
    const uint32_t buff[4] = {0xAA55AA55, 0x12345678, 0x12345678, 0x12345678};
    size_t sent;
    int res, fd, i;

    for ( i = 0; i < count; i++ )
    {
        fd = socket(AF_INET, SOCK_STREAM, 0);
        if ( fd < 0 ) {
            fprintf(stderr, "Can't create socket!\n");
            break;
        }

        res = connect(fd, (struct sockaddr *) ctx->serveraddr, sizeof(struct sockaddr_in));
        if ( res < 0 ) {
            fprintf(stderr, "Connect failed!\n");
            close(fd);
            break;
        }

        /*
         * send_ex() takes a byte pointer and returns size_t; keep the result
         * in a size_t so the (size_t) -1 failure value is not squeezed
         * through an int before the comparison.
         */
        sent = send_ex(fd, (const uint8_t *) buff, sizeof(buff), frags);
        if ( sent != sizeof(buff) ) {
            fprintf(stderr, "Send failed!\n");
            close(fd);
            break;
        }
        ctx->sent_packs++;

        res = close(fd);
        if ( res < 0 ) {
            fprintf(stderr, "CLI: Close Failed!!\n");
        }
        msleep(delay);
    }
    return NULL;
}
服务器端为每个传入连接运行一个线程,该线程执行以下操作:
/* Per-connection state handed to client_task(), which frees it when done. */
typedef struct client_ctx_s {
/* Peer address; client_task() uses it to label its log output. */
struct sockaddr_in addr;
/* Connection socket; client_task() receives from it and closes it. */
int fd;
} client_ctx_t;
/*
 * Server-side connection thread: receives one 16-byte block from the client,
 * checks that its first 32-bit word is 0xAA55AA55 and prints the four words.
 *
 * `arg` is a heap-allocated client_ctx_t *. This function takes ownership of
 * it: the socket is closed and the context is freed on every path.
 *
 * Diagnostics go to stderr for a recv() failure, for a connection closed in
 * the middle of a block, and (once per connection) for a wrong first word.
 * A connection closed before any byte arrived is not reported.
 *
 * NOTE(review): the marker is compared in host byte order, so this presumably
 * relies on client and server sharing endianness -- confirm.
 *
 * Always returns NULL.
 */
void *client_task(void *arg)
{
    client_ctx_t *client = (client_ctx_t *) arg;
    char peer[INET_ADDRSTRLEN];
    unsigned int port = ntohs(client->addr.sin_port);
    uint32_t buff[4] = {0};
    uint8_t *bytes = (uint8_t *) buff;
    size_t pos = 0;
    int corrupted = 0;

    /*
     * Format the peer address once into a thread-private buffer. inet_ntoa()
     * returns a shared static buffer and is not required to be thread-safe,
     * which matters with one thread per connection.
     */
    if ( inet_ntop(AF_INET, &client->addr.sin_addr, peer, sizeof(peer)) == NULL ) {
        peer[0] = '?';
        peer[1] = '\0';
    }

    while ( pos < sizeof(buff) )
    {
        size_t free_space = sizeof(buff) - pos;
        ssize_t chunk_len = recv(client->fd, &bytes[pos], free_space, 0);

        if ( chunk_len < 0 ) {
            /* Capture errno before any other library call can change it. */
            int err = errno;

            fprintf(stderr, "%s:%u: ERROR: recv failed (errno = %d; pos = %zu)!\n",
                    peer, port, err, pos);
            goto out;
        }
        if ( chunk_len == 0 ) {
            /* Orderly shutdown by the peer; only a partial block is an error. */
            if ( pos != 0 ) {
                fprintf(stderr, "%s:%u: ERROR: incomplete data block (pos = %zu)!\n",
                        peer, port, pos);
            }
            goto out;
        }

        assert((size_t) chunk_len <= free_space);
        pos += (size_t) chunk_len;

        /* Check the marker as soon as the first word is complete, once. */
        if ( !corrupted && pos >= sizeof(buff[0]) && buff[0] != 0xAA55AA55 ) {
            corrupted = 1;
            fprintf(stderr, "%s:%u: ERROR: data corrupted (%08x)!\n",
                    peer, port, (unsigned int) buff[0]);
        }
    }

    fprintf(stdout, "%s:%u: %08x %08x %08x %08x\n",
            peer, port,
            (unsigned int) buff[0], (unsigned int) buff[1],
            (unsigned int) buff[2], (unsigned int) buff[3]);

out:
    debug("Connection closed\n");
    /*
     * Report a failing close() instead of asserting: an assert would abort
     * the whole server, or hide the failure entirely under NDEBUG.
     */
    if ( close(client->fd) != 0 ) {
        int err = errno;

        fprintf(stderr, "%s:%u: ERROR: close failed (errno = %d)!\n",
                peer, port, err);
    }
    free(client);
    return NULL;
}
当客户端运行一千个发送线程并且每个线程重复连接-发送-断开一百次时出现的问题(
./client -t 1000 -c 100 -d 0 -f
):
此行为在本地主机和真实网络连接上都是可重复的。
使用wireshark检查损坏数据的TCP流表明:
我真的不敢相信这个问题出在 Linux TCP/IP 实现上。 谁能解释一下我的代码有什么问题吗?
使用 python3 实现的服务器时,我观察到了相同的行为(C 客户端使用 -f(分片)选项)。而且丢失的始终只是序列的开头(第一个块?)。
#!/usr/bin/env python3
import threading
import socketserver
# This file is meant to be executed as a script; the assertion fails when it
# is imported as a module instead.
assert "__main__" == __name__
# Serializes print() calls made from concurrently running handler threads.
_mutex = threading.Lock()
# Hex encoding of the 16-byte block each connection is expected to deliver.
_expected = "55aa55aa785634127856341278563412"
class _Handler(socketserver.BaseRequestHandler):
    """Collects everything one connection sends and reports unexpected payloads."""

    def handle(self):
        # Keep reading until recv() returns an empty bytes object, i.e. the
        # peer has closed its side, then hex-encode the whole payload.
        _data = b"".join(iter(lambda: self.request.recv(1024), b"")).hex()

        # The expected payload needs no report.
        if _data == _expected:
            return

        # "head case": what arrived is a tail of the expected payload, so only
        # leading bytes are missing. Anything else is "other case".
        _case = "head case" if _expected.endswith(_data) else "other case"

        # Hold the lock so lines from different handler threads do not interleave.
        with _mutex:
            print(f"{_case}: {_data}", flush = True)
class _Server(socketserver.ThreadingMixIn, socketserver.TCPServer):
    """TCP server that handles every accepted connection in its own thread."""

    # This has to be a class attribute: TCPServer binds the listening socket
    # inside __init__, so assigning it on the instance afterwards comes too
    # late to influence the bind.
    allow_reuse_address = True


# Listen on localhost:5050 and serve until the process is interrupted.
with _Server(("localhost", 5050), _Handler) as _server:
    _server.serve_forever()