How to efficiently set the bits of a bit vector in CUDA?

Problem description · votes: 0 · answers: 1

The task is the same as in How to set bits of a bit vector efficiently in parallel?, but for CUDA.

Consider a bit vector containing N bits (N is large, e.g. 4G) and an array of M numbers (M is also large, e.g. 1G), each in the range 0..N-1, indicating which bit of the vector must be set to 1. The bit vector is simply an array of integers, specifically uint32_t.
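
For reference, the sequential CPU version of the task is just a few lines (a sketch; the bit vector is assumed to be zero-initialized, and BitwiseSetCpu is a name used only for illustration):

#include <cstdint>

// Sequential reference: for every i, set bit indices[i] of the bit vector.
void BitwiseSetCpu(uint32_t n_indices, const uint32_t *indices, uint32_t *bitset) {
  for (uint32_t i = 0; i < n_indices; ++i) {
    const uint32_t index = indices[i];
    bitset[index >> 5] |= 1u << (index & 31);  // word = index/32, bit = index%32
  }
}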

I tried a simple implementation using atomicOr() on global memory:

__global__ void BitwiseSet(const uint32_t n_indices, const uint32_t *indices,
      const uint32_t n_bits, uint32_t *bitset)
{
  const uint32_t n_threads = blockDim.x * gridDim.x;
  const uint32_t i_thread = threadIdx.x + blockDim.x * blockIdx.x;
  // grid-stride loop over the index array
  for(uint32_t i=i_thread; i<n_indices; i +=n_threads) {
    const uint32_t index = indices[i];
    assert(index < n_bits);
    const uint32_t i_word = index >> 5;  // which 32-bit word of the bitset
    const uint32_t i_bit = index & 31;   // which bit within that word
    atomicOr(bitset+i_word, 1u<<(i_bit));
  }
}
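
For context, this is roughly how the kernel is set up and launched (a sketch; the grid/block sizes below are illustrative choices, not necessarily the exact configuration used for the timings):

// Hypothetical host-side setup and launch for BitwiseSet.
const uint32_t n_bits = 4000000000u;               // ~4G bits (fits in uint32_t)
const uint32_t n_indices = 1000000000u;            // ~1G indices
const size_t n_words = (size_t(n_bits) + 31) / 32;
uint32_t *d_indices = nullptr, *d_bitset = nullptr;
cudaMalloc(&d_indices, size_t(n_indices) * sizeof(uint32_t));
cudaMalloc(&d_bitset, n_words * sizeof(uint32_t));
cudaMemset(d_bitset, 0, n_words * sizeof(uint32_t));
// ... copy the index array (sorted or unsorted) into d_indices ...
const int block = 256;                             // assumed block size
const int grid = 1024;                             // assumed grid size, enough to saturate the GPU
BitwiseSet<<<grid, block>>>(n_indices, d_indices, n_bits, d_bitset);
cudaDeviceSynchronize();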

It produces interesting results for 4G bits and 1G indices:

  • RTX3090: 0.0383266 sec. for sorted indices vs. 0.332674 sec. for unsorted (8.68x improvement)
  • RTX2080: 0.0564464 sec. for sorted indices vs. 1.23666 sec. for unsorted (21.91x improvement)

So the device apparently coalesces/combines multiple atomicOr() operations within a warp, and the L1 cache probably kicks in as well, so that 32-bit stores to conflicting indices (which is what sorted indices produce) end up faster than stores to non-conflicting indices (the unsorted case).

Can we improve the sorted and/or the unsorted case further?
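
For instance, would it help to make the intra-warp combining explicit, so that lanes hitting the same 32-bit word issue only one atomicOr per group? A minimal sketch of that idea (not benchmarked; __reduce_or_sync requires compute capability 8.0+, so it would run on the RTX3090 but not on the RTX2080):

__global__ void BitwiseSetWarpAggregated(const uint32_t n_indices, const uint32_t *indices,
      const uint32_t n_bits, uint32_t *bitset)
{
  const uint32_t n_threads = blockDim.x * gridDim.x;
  const uint32_t i_thread = threadIdx.x + blockDim.x * blockIdx.x;
  for(uint32_t i=i_thread; i<n_indices; i +=n_threads) {
    const uint32_t index = indices[i];
    const uint32_t i_word = index >> 5;
    const uint32_t my_bit = 1u << (index & 31);
    // group the currently active lanes by the word they target
    const uint32_t peers = __match_any_sync(__activemask(), i_word);
    // OR together the bits of all lanes in the group (sm_80+ intrinsic)
    const uint32_t group_bits = __reduce_or_sync(peers, my_bit);
    // the lowest lane of each group issues the single atomic for that word
    if((threadIdx.x & 31) == __ffs(peers) - 1)
      atomicOr(bitset + i_word, group_bits);
  }
}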

Update: to answer the comments, a solution for any of the cases is of interest, sorted or unsorted, with or without duplicates. Unsorted with duplicates is the most general case, so it is the most interesting one.

Update 2: following the suggestion to use vectorized memory access, I implemented this:

__global__ void BitwiseSet(const uint32_t n_indices, const uint32_t *indices,
      const uint32_t n_bits, uint32_t *bitset)
{
  const uint32_t n_threads = blockDim.x * gridDim.x;
  const uint32_t i_thread = threadIdx.x + blockDim.x * blockIdx.x;
  const uint32_t n_vectors = n_indices / 4;
  for(uint32_t i=i_thread; i<n_vectors; i +=n_threads) {
    // load 4 indices with a single 16-byte read
    const uint4 v_index = reinterpret_cast<const uint4*>(indices)[i];
    assert(v_index.x < n_bits);
    assert(v_index.y < n_bits);
    assert(v_index.z < n_bits);
    assert(v_index.w < n_bits);
    uint4 vi_word, vi_bit;
    vi_word.x = v_index.x >> 5;
    vi_word.y = v_index.y >> 5;
    vi_word.z = v_index.z >> 5;
    vi_word.w = v_index.w >> 5;
    vi_bit.x = v_index.x & 31;
    vi_bit.y = v_index.y & 31;
    vi_bit.z = v_index.z & 31;
    vi_bit.w = v_index.w & 31;
    atomicOr(bitset+vi_word.x, 1u<<vi_bit.x);
    atomicOr(bitset+vi_word.y, 1u<<vi_bit.y);
    atomicOr(bitset+vi_word.z, 1u<<vi_bit.z);
    atomicOr(bitset+vi_word.w, 1u<<vi_bit.w);
  }
  // the first few threads handle the remaining n_indices % 4 indices
  if(i_thread < 4) {
    const uint32_t tail_start = n_vectors*4;
    const uint32_t tail_len = n_indices - tail_start;
    if(i_thread < tail_len) {
      const uint32_t index = indices[tail_start+i_thread];
      assert(index < n_bits);
      const uint32_t i_word = index >> 5;
      const uint32_t i_bit = index & 31;
      atomicOr(bitset+i_word, 1u<<i_bit);
    }
  }
}

But at least on the RTX2080 it is slower (I don't have the eGPU with the RTX3090 at hand right now to test):

  • RTX2080: 0.0815998 sec. for sorted vs. 1.39829 sec. for unsorted (17.14x ratio)
c++ algorithm parallel-processing cuda bit-manipulation
1 Answer

0 votes

This is not a complete answer, but I have too much code for a comment.

Your code is mostly limited by the scattered atomic writes.

So you can hardly expect to saturate the memory bus; you could only do that if every write made full use of a cache line (i.e. only coalesced writes).

However, you can speed it up (perhaps by up to 30%) by prefetching the data with memcpy_async.

You need to prefetch enough data to hide the latency.

#include <cuda/pipeline>  // cuda::pipeline, cuda::make_pipeline, cuda::memcpy_async

// prefetch count cannot be greater than 8!
// my_blockdim must match the blockDim.x the kernel is launched with.
template <int my_blockdim, int prefetchcount>
__global__ void BitwiseSet(const uint32_t n_indices, const uint32_t *indices,
      const uint32_t n_bits, uint32_t *bitset)
{
  constexpr uint32_t buffersize = my_blockdim * prefetchcount;
  __shared__ uint32_t s_indices[buffersize];
  auto pipeline = cuda::make_pipeline(); // pipeline with thread_scope_thread

  // every block handles its own contiguous section of the data
  const uint32_t per_block = (n_indices + gridDim.x - 1) / gridDim.x;
  const uint32_t start = blockIdx.x * per_block;
  const uint32_t end = min(n_indices, start + per_block);
  const uint32_t len = end > start ? end - start : 0;

  // i is an offset relative to `start`; an out-of-range prefetch still commits
  // an (empty) stage so that producer commits and consumer waits stay balanced
  const auto prefetch = [&](uint32_t i) {
    //pipeline.producer_acquire(); //no-op for thread_scope_thread
    if (i < len) {
      cuda::memcpy_async(&s_indices[i % buffersize], &indices[start + i],
                         sizeof(uint32_t), pipeline);
    }
    pipeline.producer_commit();
  };

  // prime the pump: keep prefetchcount batches in flight per thread
  for (int stage = 0; stage < prefetchcount; stage++)
    prefetch(threadIdx.x + my_blockdim * stage);

  for (uint32_t i = threadIdx.x; i < len; i += my_blockdim) {
    pipeline.consumer_wait();                         // wait for the oldest batch
    const uint32_t index = s_indices[i % buffersize]; // fast because mod by a constant
    pipeline.consumer_release();
    prefetch(i + buffersize);                         // refill the pipeline for a later iteration
    assert(index < n_bits);
    const uint32_t i_word = index >> 5;
    const uint32_t i_bit = index & 31;
    atomicOr(bitset + i_word, (1u << i_bit));
  }
}

You can unroll this further if you want, by issuing several memcpy_async() calls per pipeline.producer_commit() and adjusting the rest accordingly.
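
For completeness, the template block size has to match the block size you launch with; a launch could look like this (256, 4 and the grid size are arbitrary illustrative values, and d_indices/d_bitset stand for whatever device buffers you use):

constexpr int kBlockDim = 256;   // must equal the launch block size
constexpr int kPrefetch = 4;     // pipeline depth, must not exceed 8
const int n_blocks = 512;        // assumed grid size, tune per GPU
BitwiseSet<kBlockDim, kPrefetch><<<n_blocks, kBlockDim>>>(n_indices, d_indices, n_bits, d_bitset);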
