如何避免在CUDA中隐式使用本地内存？

Question

我正在开发 CUDA 软件路径跟踪渲染器，现在我陷入了 L1TEX 本地加载/存储访问模式次优的问题。 NCU 告诉我，瓶颈出现在以下代码中（指标为

L2 Theoretical Sectors Local

）。下面的代码基本上是无堆栈的线性BVH遍历，我用

\**\

注释标记了本地内存访问。

__device__ float ray_intersect_bvh(
    const Ray& ray,
    const cudaTextureObject_t bvh_leaves,
    const cudaTextureObject_t node_fronts,
    const cudaTextureObject_t node_backs,
    ConstF4Ptr cached_nodes,
    const PrecomputedArray& verts,
    int& min_index,
    int& min_obj_idx,
    float& prim_u,
    float& prim_v,
    const int node_num,
    const int cache_num,
    float min_dist
) {
    bool valid_cache = false;         // whether we find valid node-intersection in cached nodes
    int node_idx     = 0;             // BVH tree node index
    float aabb_tmin  = 0;             // minimum intersection time of AABB (positive)
    // The following while lobe checks cached (in shared memory) node intersection
    // near root layers of the BVH tree are cached for faster access
    while (node_idx < cache_num && !valid_cache) {
        const LinearNode node(
            cached_nodes[node_idx],
            cached_nodes[node_idx + cache_num]
        );
        bool intersect_node = node.aabb.intersect(ray, aabb_tmin) && aabb_tmin < min_dist;
        int all_offset = node.aabb.base(), gmem_index = node.aabb.prim_cnt();
        int increment = (!intersect_node) * all_offset + (intersect_node && all_offset != 1) * 1;

        node_idx += increment;

        if (intersect_node && all_offset == 1) {
            valid_cache = true;
            node_idx = gmem_index;
        }
    }
    // if we find a valid intersection in cached nodes, we continue traversal in texture memory
    if (valid_cache) {
        while (node_idx < node_num) {
            const LinearNode node(tex1Dfetch<float4>(node_fronts, node_idx),
                            tex1Dfetch<float4>(node_backs, node_idx));
            bool intersect_node = node.aabb.intersect(ray, aabb_tmin) && aabb_tmin < min_dist;
            int beg_idx = 0, end_idx = 0;
            node.get_range(beg_idx, end_idx);
            
            int increment = (!intersect_node) * (end_idx < 0 ? -end_idx : 1) + int(intersect_node);
            if (intersect_node && end_idx > 0) {
                end_idx += beg_idx;
                // For BVH leaf node: traverse all the triangles within the range
                for (int idx = beg_idx; idx < end_idx; idx ++) {   /* Up to 44%.12 L2 theoretical sectors are accessed for this line */
                    /* SASS for the above line: LDL.LU, load 32 bit. */
                    bool valid  = false;
                    int2 obj_prim_idx = tex1Dfetch<int2>(bvh_leaves, idx);
                    float it_u = 0, it_v = 0, dist = Primitive::intersect(ray, verts, obj_prim_idx.y, it_u, it_v, obj_prim_idx.x >= 0);
                    valid = dist > EPSILON && dist < min_dist;
                    min_dist = valid ? dist : min_dist;
                    prim_u   = valid ? it_u : prim_u;
                    prim_v   = valid ? it_v : prim_v;
                    min_index = valid ? obj_prim_idx.y : min_index;
                    min_obj_idx = valid ? obj_prim_idx.x : min_obj_idx;
                }
            }
            node_idx += increment;
        }
    }
    return min_dist;
}

此外，AABB 相交测试也存在本地内存问题。这更加严重，因为 NCU 报告了本地内存存储模式中存在更大的问题：

__device__ bool intersect(const Ray& ray, float& t_near) const {
    // Vec3 is a simple encapsulation of float3
    Vec3 invDir = ray.d.rcp();
    Vec3 t1s = (mini - ray.o) * invDir;             // long scoreboard
    Vec3 t2s = (maxi - ray.o) * invDir;

    float tmin = t1s.minimize(t2s).max_elem();
    float tmax = t1s.maximize(t2s).min_elem();
    t_near = tmin;
    return (tmax > tmin) && (tmax > 0);             /* SASS: STL, store 32 bit. local memory access problem */
}

其他一些背景：

```
-maxrregcount=56
```
。我的设备是 RTX3060 笔记本电脑，应该不会出现意外的寄存器溢出，因为 NCU 告诉我上述两个函数中的最大活动寄存器是 52，所以应该足够了。
也没有动态本地数组索引（仅共享、全局、纹理访问）。
NCU告诉我：平均只有1.1（商店） |每个线程使用每个扇区传输的 32 个字节中的 15.8 个（负载）。我不知道为什么这么低。

我想知道为什么编译器会进行这种隐式本地内存访问，我认为应该有足够的寄存器可供使用。我也想知道如何避免这种情况。我知道CPP中有一个关键字

register

，虽然它基本上没什么用，但它有点暗示有办法提示编译器。

Answer 1

我经常遇到这个问题。当发生这种情况时，我在“共享”内存中创建一个结构并将其作为引用传递。如果您需要每个线程的参数都不同，请确保使用大小为

blockDim

的数组。

struct SharedData_t {
    Ray& ray;    //const params as normal members
    cudaTextureObject_t bvh_leaves;
    cudaTextureObject_t node_fronts;
    cudaTextureObject_t node_backs;
    mutable ConstF4Ptr cached_nodes; //non-const as mutable
    PrecomputedArray& verts;
    mutable int& min_index;
    mutable int& min_obj_idx;
    mutable float& prim_u;
    mutable float& prim_v;
    int node_num;
    int cache_num;
    mutable float min_dist;

    void init(/*lots of params*/); //you cannot construct a shared struct
};

...
__device__ __noinline__ SharedData_t& CallMeAtInitTime() {
  __shared__ SharedData_t a;
  a.init(/*lots of params*/)
  return a;
}

//pass the struct as const ref to protect the non-mutable members
__device__ float ray_intersect_bvh(const SharedData_t& a) {
... do stuff
 
}

顺便说一句，您确实知道 3090 具有光线追踪硬件来完成您想要做的事情吗？

https://raytracing-docs.nvidia.com/

如何避免在CUDA中隐式使用本地内存？

问题描述投票：0回答：1

1个回答

最新问题

如何避免在CUDA中隐式使用本地内存？

问题描述 投票：0回答：1

1个回答

最新问题

问题描述投票：0回答：1