A CUDA stream is a queue of tasks: memory copies, event firings, event waits, kernel launches, callbacks, and so on.
But these queues do not have unlimited capacity. In fact, empirically, I find the limit is not particularly high, e.g. on the order of thousands rather than millions.
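For concreteness, a single stream can hold a mix of item types. The following minimal sketch (my illustration, not part of the original question) queues one of each kind; the kernel, buffer sizes, and stream names are arbitrary:

#include <cstdio>

__global__ void Work(int *p) { if (threadIdx.x == 0) p[0] += 1; }

void CUDART_CB HostNote(void *) { std::printf("host callback ran\n"); }

int main() {
  cudaStream_t s1, s2;
  cudaStreamCreate(&s1);
  cudaStreamCreate(&s2);
  cudaEvent_t evt;
  cudaEventCreate(&evt);

  int *h = nullptr, *d = nullptr;
  cudaMallocHost((void **)&h, sizeof(int)); // pinned host memory so the copy is truly async
  cudaMalloc((void **)&d, sizeof(int));
  *h = 40;

  cudaMemcpyAsync(d, h, sizeof(int), cudaMemcpyHostToDevice, s1); // memory copy
  Work<<<1, 32, 0, s1>>>(d);                                      // kernel launch
  cudaEventRecord(evt, s1);                                       // event firing (record)
  cudaLaunchHostFunc(s1, HostNote, nullptr);                      // host callback

  cudaStreamWaitEvent(s2, evt, 0);                                // event wait: s2 waits for s1's kernel
  Work<<<1, 32, 0, s2>>>(d);                                      // second kernel, ordered after the first
  cudaMemcpyAsync(h, d, sizeof(int), cudaMemcpyDeviceToHost, s2); // memory copy back

  cudaStreamSynchronize(s1);
  cudaStreamSynchronize(s2);
  std::printf("result: %d\n", *h); // expect 42

  cudaFree(d);
  cudaFreeHost(h);
  cudaEventDestroy(evt);
  cudaStreamDestroy(s1);
  cudaStreamDestroy(s2);
  return 0;
}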
My question:
Is the size/capacity of a CUDA stream fixed regardless of the kind of item queued, or does the capacity behave differently depending on the types of operations/tasks you enqueue?
The "capacity" behaves differently depending on the operations/tasks you enqueue.
If we queue a single host function/callback among many kernel launches, then on a Tesla V100 with CUDA 11.4 I observe a "capacity" of roughly 1000 queued items. If instead I alternate kernel launches with host functions, I observe a capacity of roughly 100 queued items.
Here are two demonstrations:
// test case with alternating kernels and callbacks
$ cat t2042a.cu
#include <iostream>
#include <vector>
#include <memory>
#include <mutex>
#include <condition_variable>

#define CUDACHECK(x) x

// empty kernel
__global__ void NoOpKernel() {}

// used to block the stream until the host signals
class Event {
 private:
  std::mutex mtx_condition_;
  std::condition_variable condition_;
  bool signalled = false;

 public:
  void Signal() {
    {
      std::lock_guard<decltype(mtx_condition_)> lock(mtx_condition_);
      signalled = true;
    }
    condition_.notify_all();
  }

  void Wait() {
    std::unique_lock<decltype(mtx_condition_)> lock(mtx_condition_);
    while (!signalled) {
      condition_.wait(lock);
    }
  }
};

void CUDART_CB block_op_host_fn(void* arg) {
  Event* evt = (Event*)arg;
  evt->Wait();
}

int main() {
  cudaStream_t stream;
  CUDACHECK(cudaStreamCreate(&stream));

  int num_events = 60; // 50 is okay, 60 will hang
  std::vector<std::shared_ptr<Event>> event_vec;
  for (int i = 0; i < num_events; i++) {
    std::cout << "Queuing NoOp " << i << std::endl;
    NoOpKernel<<<1, 128, 0, stream>>>(); // HERE : is where it hangs
    std::cout << "Queued NoOp " << i << std::endl;
    event_vec.push_back(std::make_shared<Event>());
    cudaLaunchHostFunc(stream, block_op_host_fn, event_vec.back().get());
    std::cout << "Queued block_op " << i << std::endl;
  }

  for (int i = 0; i < num_events; i++) {
    event_vec[i]->Signal();
  }

  // clean up
  CUDACHECK(cudaDeviceSynchronize());
  CUDACHECK(cudaStreamDestroy(stream));
  return 0;
}
$ nvcc -o t2042a t2042a.cu
$ ./t2042a
Queuing NoOp 0
Queued NoOp 0
Queued block_op 0
Queuing NoOp 1
Queued NoOp 1
Queued block_op 1
Queuing NoOp 2
Queued NoOp 2
Queued block_op 2
... <snip>
Queuing NoOp 55
Queued NoOp 55
Queued block_op 55
Queuing NoOp 56
Queued NoOp 56
Queued block_op 56
Queuing NoOp 57
^C
$
// test case with a single callback and many kernels
$ cat t2042.cu
#include <iostream>
#include <vector>
#include <memory>
#include <mutex>
#include <condition_variable>
#include <cstdlib>

#define CUDACHECK(x) x

// empty kernel
__global__ void NoOpKernel() {}

// used to block the stream until the host signals
class Event {
 private:
  std::mutex mtx_condition_;
  std::condition_variable condition_;
  bool signalled = false;

 public:
  void Signal() {
    {
      std::lock_guard<decltype(mtx_condition_)> lock(mtx_condition_);
      signalled = true;
    }
    condition_.notify_all();
  }

  void Wait() {
    std::unique_lock<decltype(mtx_condition_)> lock(mtx_condition_);
    while (!signalled) {
      condition_.wait(lock);
    }
  }
};

void CUDART_CB block_op_host_fn(void* arg) {
  Event* evt = (Event*)arg;
  evt->Wait();
}

int main(int argc, char *argv[]) {
  cudaStream_t stream;
  CUDACHECK(cudaStreamCreate(&stream));

  int num_loops = 2000; // more than the queue can hold, so the launch loop will hang
  int num_events = 0;
  std::vector<std::shared_ptr<Event>> event_vec;
  if (argc > 1) num_loops = atoi(argv[1]);

  for (int i = 0; i < num_loops; i++) {
    std::cout << "Queuing NoOp " << i << std::endl;
    NoOpKernel<<<1, 128, 0, stream>>>(); // HERE : is where it hangs
    std::cout << "Queued NoOp " << i << std::endl;
    if (i == 0) {
      // queue the blocking host function only once, near the head of the stream
      num_events++;
      event_vec.push_back(std::make_shared<Event>());
      cudaLaunchHostFunc(stream, block_op_host_fn, event_vec.back().get());
      std::cout << "Queued block_op " << i << std::endl;
    }
  }

  for (int i = 0; i < num_events; i++) {
    event_vec[i]->Signal();
  }

  // clean up
  CUDACHECK(cudaDeviceSynchronize());
  CUDACHECK(cudaStreamDestroy(stream));
  return 0;
}
$ nvcc -o t2042 t2042.cu
$ ./t2042
... <snip>
Queuing NoOp 1019
Queued NoOp 1019
Queuing NoOp 1020
Queued NoOp 1020
Queuing NoOp 1021
Queued NoOp 1021
Queuing NoOp 1022
^C
$
(The code hangs when the queue becomes "full"; at that point I kill it with ctrl-C.)
Apart from queuing more and more items until nothing more fits, how can I determine this capacity?
Currently there is no specification of this in CUDA, and no explicit way to query it at runtime.
CUDA should tell us, via its API, how full a stream/queue is. I have filed a bug with NVIDIA requesting that stream fullness be exposed.
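So for now the only option is the empirical approach mentioned in the question. As an illustration, here is a minimal probe sketch (my own, not part of the original answer) that relies on the behavior demonstrated above, namely that a launch into a "full" stream blocks the calling thread: it plugs the stream with a blocking host function, times each subsequent kernel launch, and treats the first launch that stalls as the point where the queue filled. The 100 ms threshold, the 10000-launch ceiling, and the 5-second release delay are arbitrary choices.

#include <chrono>
#include <condition_variable>
#include <iostream>
#include <mutex>
#include <thread>

__global__ void NoOpKernel() {}

// same blocking-event idea as in the test cases above
class Event {
  std::mutex m_;
  std::condition_variable cv_;
  bool signalled_ = false;
 public:
  void Signal() {
    { std::lock_guard<std::mutex> lock(m_); signalled_ = true; }
    cv_.notify_all();
  }
  void Wait() {
    std::unique_lock<std::mutex> lock(m_);
    cv_.wait(lock, [this] { return signalled_; });
  }
};

void CUDART_CB block_op_host_fn(void *arg) { static_cast<Event *>(arg)->Wait(); }

int main() {
  cudaStream_t stream;
  cudaStreamCreate(&stream);

  Event evt;
  // plug the stream at its head so nothing queued behind it can drain
  cudaLaunchHostFunc(stream, block_op_host_fn, &evt);

  // release the plug after a fixed delay so the probe cannot hang forever
  std::thread unblocker([&evt] {
    std::this_thread::sleep_for(std::chrono::seconds(5));
    evt.Signal();
  });

  int capacity = -1;
  for (int i = 0; i < 10000; i++) {
    auto t0 = std::chrono::steady_clock::now();
    NoOpKernel<<<1, 1, 0, stream>>>();   // returns quickly while the queue has room
    auto dt = std::chrono::steady_clock::now() - t0;
    if (dt > std::chrono::milliseconds(100)) {  // this launch had to wait for queue space
      capacity = i;
      break;
    }
  }

  unblocker.join();
  cudaDeviceSynchronize();
  cudaStreamDestroy(stream);

  if (capacity >= 0)
    std::cout << "launch " << capacity << " stalled: apparent capacity ~" << capacity
              << " queued kernels (plus the one host function)" << std::endl;
  else
    std::cout << "no launch stalled within the probe range" << std::endl;
  return 0;
}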