有没有办法在 Thrust 的一个内核调用中执行一个
reduce_by_key
操作和一个 reduce
(或者理想情况下另一个 reduce_by_key
)操作?除了获得计算速度外,假设我想这样做是因为第一个 reduce_by_key
操作的输出值数量太大,无法存储在内存中。
我一直在想
transform_output_iterator
是否可以在这里提供帮助,但还没有找到解决方案。
一个简单的演示,但不是我的实际用例,可能是找到矩阵中每一行的最大值中的最小值,其中该矩阵被展平并存储在
device_vector
.
以下代码使用固定数量的临时存储槽来保存有限个候选最小值,从而计算所有行最大值中的最小值。之后,再执行一次 min reduce 以找到全局最小值。
思路是通过 transform_output_iterator 直接更新最小值。这可以通过原子操作(如果临时最小值以原始指针形式给出)或通过锁(如果临时最小值以迭代器形式给出;本回答未展示)来完成。
为避免原子竞争,临时最小值的数量不能太少。
对于大小为 1 的 1G 段,即每个输入元素都会有一个原子操作,我在 A100 GPU 上观察到以下时序。
time approach 1 (standard): 13.2674 ms.
time approach 2 (fused): 38.0479 ms. (minimaSlots = 1)
time approach 2 (fused): 23.9251 ms. (minimaSlots = 1024)
time approach 2 (fused): 10.1109 ms. (minimaSlots = 1024 * 1024)
#include <cuda_runtime.h>

#include <thrust/device_vector.h>
#include <thrust/fill.h>
#include <thrust/functional.h>
#include <thrust/iterator/discard_iterator.h>
#include <thrust/iterator/transform_output_iterator.h>
#include <thrust/reduce.h>
#include <thrust/sequence.h>

#include <cstdlib>
#include <iostream>
#include <limits>
#include <vector>
// Functor for thrust::make_transform_output_iterator: every value written
// through the iterator is folded into one of `size` global minimum slots via
// atomicMin, then passed through unchanged (the wrapped iterator discards it).
//
// The slot is picked from the calling thread's flat global id, so with a large
// enough `size` the atomic traffic is spread across many addresses instead of
// serializing on a single one.
//
// Preconditions: `minPtr` points to `size` device ints, pre-filled with the
// identity for min (e.g. INT_MAX). `size` must be > 0.
template<size_t size>
struct UpdateMinimumOp{
    static_assert(size > 0, "UpdateMinimumOp needs at least one minimum slot");

    int* minPtr;  // device pointer to the `size` minimum slots

    explicit UpdateMinimumOp(int* ptr) : minPtr(ptr) {}

    // const: Thrust may invoke the functor through a const copy inside the
    // iterator machinery; the functor itself is never mutated.
    __device__
    int operator()(int value) const {
        // Select the output slot from the global thread id so concurrent
        // threads hit different slots and atomic contention stays low.
        const size_t pos = size_t(threadIdx.x) + size_t(blockIdx.x) * size_t(blockDim.x);
        const size_t minPos = pos % size;
        atomicMin(minPtr + minPos, value);
        return value;  // forwarded to the wrapped (discard) iterator
    }
};
// Demo: find the minimum over all per-row maxima of a flattened matrix.
// Approach 1 materializes every row maximum (O(N) temp storage); approach 2
// fuses the min into the reduce_by_key output via UpdateMinimumOp, needing
// only `minimaSlots` ints of temp storage. Requires a CUDA device.
int main(){
    // Abort on the first CUDA runtime error instead of silently reporting
    // garbage timings from a sticky earlier failure.
    auto check = [](cudaError_t err){
        if (err != cudaSuccess){
            std::cerr << "CUDA error: " << cudaGetErrorString(err) << "\n";
            std::exit(EXIT_FAILURE);
        }
    };

    cudaEvent_t a; check(cudaEventCreate(&a));
    cudaEvent_t b; check(cudaEventCreate(&b));
    float t = 0.0f;

    // 1G segments of size 1: keys 0,1,2,... make every element its own
    // segment, i.e. one atomic per input element in approach 2.
    const size_t N = 1ull << 30;
    thrust::device_vector<int> keys(N);
    thrust::device_vector<int> values(N);
    thrust::sequence(keys.begin(), keys.end(), 0);
    thrust::sequence(values.begin(), values.end(), 1);

    // Approach 1 (for timing comparison): max reduce_by_key into a full-size
    // buffer, then a separate min reduce over all segment maxima.
    thrust::device_vector<int> maxima(N);
    check(cudaEventRecord(a));
    thrust::reduce_by_key(
        keys.begin(),
        keys.end(),
        values.begin(),
        thrust::make_discard_iterator(),  // the segment keys are not needed
        maxima.begin(),
        thrust::equal_to<int>{},
        thrust::maximum<int>{}
    );
    const int minimumApproach1 = thrust::reduce(
        maxima.begin(), maxima.end(),
        std::numeric_limits<int>::max(), thrust::minimum<int>{});
    check(cudaEventRecord(b));
    check(cudaEventSynchronize(b));
    check(cudaEventElapsedTime(&t, a, b));
    std::cout << "time approach 1 (standard): " << t << " ms. minimum: " <<minimumApproach1 << "\n";

    // Approach 2: fuse the max reduce_by_key with the computation of the
    // minimaSlots smallest maxima, then min-reduce only those slots.
    // More slots -> less atomic contention (see timings in the write-up).
    //constexpr size_t minimaSlots = 1;
    //constexpr size_t minimaSlots = 1024;
    constexpr size_t minimaSlots = 1024*1024;
    thrust::device_vector<int> minima_of_maxima(minimaSlots);
    // Slots must start at the min-identity so atomicMin works correctly.
    thrust::fill(minima_of_maxima.begin(), minima_of_maxima.end(), std::numeric_limits<int>::max());
    auto minimaOfMaximaIterator = thrust::make_transform_output_iterator(
        thrust::make_discard_iterator(),
        UpdateMinimumOp<minimaSlots>{minima_of_maxima.data().get()}
    );
    check(cudaEventRecord(a));
    thrust::reduce_by_key(
        keys.begin(),
        keys.end(),
        values.begin(),
        thrust::make_discard_iterator(),
        minimaOfMaximaIterator,  // each segment maximum updates one min slot
        thrust::equal_to<int>{},
        thrust::maximum<int>{}
    );
    const int minimumApproach2 = thrust::reduce(
        minima_of_maxima.begin(), minima_of_maxima.end(),
        std::numeric_limits<int>::max(), thrust::minimum<int>{});
    check(cudaEventRecord(b));
    check(cudaEventSynchronize(b));
    check(cudaEventElapsedTime(&t, a, b));
    std::cout << "time approach 2 (fused): " << t << " ms. minimum: " << minimumApproach2 << "\n";

    check(cudaEventDestroy(a));
    check(cudaEventDestroy(b));
}