有没有办法在 Thrust 的一个内核调用中执行一个
reduce_by_key
操作和一个 reduce
(或者理想情况下另一个 reduce_by_key
)操作?除了获得计算速度外,假设我想这样做是因为第一个 reduce_by_key
操作的输出值数量太大,无法存储在内存中。
我一直在想
transform_output_iterator
是否可以在这里提供帮助,但还没有找到解决方案。
一个简单的演示,但不是我的实际用例,可能是找到矩阵中每一行的最大值中的最小值,其中该矩阵被展平并存储在
device_vector
.
以下代码使用固定数量的临时存储槽来保存有限个候选最小值,从而计算所有行最大值中的最小值。之后,再执行一次 min reduce 以找到全局最小值。
思路是通过 transform_output_iterator 直接更新最小值。这可以通过原子操作(如果临时最小值以原始指针形式给出)或通过锁(如果临时最小值以迭代器形式给出;本回答未展示)来完成。
为避免原子竞争,临时最小值的数量不能太少。
对于大小为 1 的 1G 段,即每个输入元素都会有一个原子操作,我在 A100 GPU 上观察到以下时序。
time approach 1 (standard): 13.2674 ms.
time approach 2 (fused): 38.0479 ms. (minimaSlots = 1)
time approach 2 (fused): 23.9251 ms. (minimaSlots = 1024)
time approach 2 (fused): 10.1109 ms. (minimaSlots = 1024 * 1024)
#include <cuda_runtime.h>

#include <thrust/device_vector.h>
#include <thrust/fill.h>
#include <thrust/functional.h>
#include <thrust/iterator/discard_iterator.h>
#include <thrust/iterator/transform_output_iterator.h>
#include <thrust/reduce.h>
#include <thrust/sequence.h>

#include <cstdlib>
#include <iostream>
#include <limits>
#include <vector>
// Functor for thrust::make_transform_output_iterator: every value written
// through the iterator is folded into one of `size` global minimum slots via
// atomicMin, then passed through unchanged (the wrapped iterator discards it).
//
// The slot is picked from the calling thread's flat global id, so with a large
// enough `size` the atomic traffic is spread across many addresses instead of
// serializing on a single one.
//
// Preconditions: `minPtr` points to `size` device ints, pre-filled with the
// identity for min (e.g. INT_MAX). `size` must be > 0.
template<size_t size>
struct UpdateMinimumOp{
    static_assert(size > 0, "UpdateMinimumOp needs at least one minimum slot");

    int* minPtr;  // device pointer to the `size` minimum slots

    explicit UpdateMinimumOp(int* ptr) : minPtr(ptr) {}

    // const: Thrust may invoke the functor through a const copy inside the
    // iterator machinery; the functor itself is never mutated.
    __device__
    int operator()(int value) const {
        // Select the output slot from the global thread id so concurrent
        // threads hit different slots and atomic contention stays low.
        const size_t pos = size_t(threadIdx.x) + size_t(blockIdx.x) * size_t(blockDim.x);
        const size_t minPos = pos % size;
        atomicMin(minPtr + minPos, value);
        return value;  // forwarded to the wrapped (discard) iterator
    }
};
// Demo: find the minimum over all per-row maxima of a flattened matrix.
// Approach 1 materializes every row maximum (O(N) temp storage); approach 2
// fuses the min into the reduce_by_key output via UpdateMinimumOp, needing
// only `minimaSlots` ints of temp storage. Requires a CUDA device.
int main(){
    // Abort on the first CUDA runtime error instead of silently reporting
    // garbage timings from a sticky earlier failure.
    auto check = [](cudaError_t err){
        if (err != cudaSuccess){
            std::cerr << "CUDA error: " << cudaGetErrorString(err) << "\n";
            std::exit(EXIT_FAILURE);
        }
    };

    cudaEvent_t a; check(cudaEventCreate(&a));
    cudaEvent_t b; check(cudaEventCreate(&b));
    float t = 0.0f;

    // 1G segments of size 1: keys 0,1,2,... make every element its own
    // segment, i.e. one atomic per input element in approach 2.
    const size_t N = 1ull << 30;
    thrust::device_vector<int> keys(N);
    thrust::device_vector<int> values(N);
    thrust::sequence(keys.begin(), keys.end(), 0);
    thrust::sequence(values.begin(), values.end(), 1);

    // Approach 1 (for timing comparison): max reduce_by_key into a full-size
    // buffer, then a separate min reduce over all segment maxima.
    thrust::device_vector<int> maxima(N);
    check(cudaEventRecord(a));
    thrust::reduce_by_key(
        keys.begin(),
        keys.end(),
        values.begin(),
        thrust::make_discard_iterator(),  // the segment keys are not needed
        maxima.begin(),
        thrust::equal_to<int>{},
        thrust::maximum<int>{}
    );
    const int minimumApproach1 = thrust::reduce(
        maxima.begin(), maxima.end(),
        std::numeric_limits<int>::max(), thrust::minimum<int>{});
    check(cudaEventRecord(b));
    check(cudaEventSynchronize(b));
    check(cudaEventElapsedTime(&t, a, b));
    std::cout << "time approach 1 (standard): " << t << " ms. minimum: " <<minimumApproach1 << "\n";

    // Approach 2: fuse the max reduce_by_key with the computation of the
    // minimaSlots smallest maxima, then min-reduce only those slots.
    // More slots -> less atomic contention (see timings in the write-up).
    //constexpr size_t minimaSlots = 1;
    //constexpr size_t minimaSlots = 1024;
    constexpr size_t minimaSlots = 1024*1024;
    thrust::device_vector<int> minima_of_maxima(minimaSlots);
    // Slots must start at the min-identity so atomicMin works correctly.
    thrust::fill(minima_of_maxima.begin(), minima_of_maxima.end(), std::numeric_limits<int>::max());
    auto minimaOfMaximaIterator = thrust::make_transform_output_iterator(
        thrust::make_discard_iterator(),
        UpdateMinimumOp<minimaSlots>{minima_of_maxima.data().get()}
    );
    check(cudaEventRecord(a));
    thrust::reduce_by_key(
        keys.begin(),
        keys.end(),
        values.begin(),
        thrust::make_discard_iterator(),
        minimaOfMaximaIterator,  // each segment maximum updates one min slot
        thrust::equal_to<int>{},
        thrust::maximum<int>{}
    );
    const int minimumApproach2 = thrust::reduce(
        minima_of_maxima.begin(), minima_of_maxima.end(),
        std::numeric_limits<int>::max(), thrust::minimum<int>{});
    check(cudaEventRecord(b));
    check(cudaEventSynchronize(b));
    check(cudaEventElapsedTime(&t, a, b));
    std::cout << "time approach 2 (fused): " << t << " ms. minimum: " << minimumApproach2 << "\n";

    check(cudaEventDestroy(a));
    check(cudaEventDestroy(b));
}