我正在开发 CUDA 软件路径跟踪渲染器,现在我陷入了 L1TEX 本地加载/存储访问模式次优的问题。 NCU 告诉我,瓶颈出现在以下代码中(指标为
L2 Theoretical Sectors Local
)。下面的代码基本上是无堆栈的线性BVH遍历,我用\**\
注释标记了本地内存访问。
__device__ float ray_intersect_bvh(
const Ray& ray,
const cudaTextureObject_t bvh_leaves,
const cudaTextureObject_t node_fronts,
const cudaTextureObject_t node_backs,
ConstF4Ptr cached_nodes,
const PrecomputedArray& verts,
int& min_index,
int& min_obj_idx,
float& prim_u,
float& prim_v,
const int node_num,
const int cache_num,
float min_dist
) {
bool valid_cache = false; // whether we find valid node-intersection in cached nodes
int node_idx = 0; // BVH tree node index
float aabb_tmin = 0; // minimum intersection time of AABB (positive)
// The following while lobe checks cached (in shared memory) node intersection
// near root layers of the BVH tree are cached for faster access
while (node_idx < cache_num && !valid_cache) {
const LinearNode node(
cached_nodes[node_idx],
cached_nodes[node_idx + cache_num]
);
bool intersect_node = node.aabb.intersect(ray, aabb_tmin) && aabb_tmin < min_dist;
int all_offset = node.aabb.base(), gmem_index = node.aabb.prim_cnt();
int increment = (!intersect_node) * all_offset + (intersect_node && all_offset != 1) * 1;
node_idx += increment;
if (intersect_node && all_offset == 1) {
valid_cache = true;
node_idx = gmem_index;
}
}
// if we find a valid intersection in cached nodes, we continue traversal in texture memory
if (valid_cache) {
while (node_idx < node_num) {
const LinearNode node(tex1Dfetch<float4>(node_fronts, node_idx),
tex1Dfetch<float4>(node_backs, node_idx));
bool intersect_node = node.aabb.intersect(ray, aabb_tmin) && aabb_tmin < min_dist;
int beg_idx = 0, end_idx = 0;
node.get_range(beg_idx, end_idx);
int increment = (!intersect_node) * (end_idx < 0 ? -end_idx : 1) + int(intersect_node);
if (intersect_node && end_idx > 0) {
end_idx += beg_idx;
// For BVH leaf node: traverse all the triangles within the range
for (int idx = beg_idx; idx < end_idx; idx ++) { /* Up to 44%.12 L2 theoretical sectors are accessed for this line */
/* SASS for the above line: LDL.LU, load 32 bit. */
bool valid = false;
int2 obj_prim_idx = tex1Dfetch<int2>(bvh_leaves, idx);
float it_u = 0, it_v = 0, dist = Primitive::intersect(ray, verts, obj_prim_idx.y, it_u, it_v, obj_prim_idx.x >= 0);
valid = dist > EPSILON && dist < min_dist;
min_dist = valid ? dist : min_dist;
prim_u = valid ? it_u : prim_u;
prim_v = valid ? it_v : prim_v;
min_index = valid ? obj_prim_idx.y : min_index;
min_obj_idx = valid ? obj_prim_idx.x : min_obj_idx;
}
}
node_idx += increment;
}
}
return min_dist;
}
此外,AABB 相交测试也存在本地内存问题。这更加严重,因为 NCU 报告了本地内存存储模式中存在更大的问题:
__device__ bool intersect(const Ray& ray, float& t_near) const {
// Vec3 is a simple encapsulation of float3
Vec3 invDir = ray.d.rcp();
Vec3 t1s = (mini - ray.o) * invDir; // long scoreboard
Vec3 t2s = (maxi - ray.o) * invDir;
float tmin = t1s.minimize(t2s).max_elem();
float tmax = t1s.maximize(t2s).min_elem();
t_near = tmin;
return (tmax > tmin) && (tmax > 0); /* SASS: STL, store 32 bit. local memory access problem */
}
其他一些背景:
-maxrregcount=56
。我的设备是 RTX3060 笔记本电脑,应该不会出现意外的寄存器溢出,因为 NCU 告诉我上述两个函数中的最大活动寄存器是 52,所以应该足够了。我想知道为什么编译器会进行这种隐式本地内存访问,我认为应该有足够的寄存器可供使用。我也想知道如何避免这种情况。我知道CPP中有一个关键字
register
,虽然它基本上没什么用,但它有点暗示有办法提示编译器。
我经常遇到这个问题。当发生这种情况时,我在“共享”内存中创建一个结构并将其作为引用传递。 如果您需要每个线程的参数都不同,请确保使用大小为
blockDim
的数组。
struct SharedData_t {
Ray& ray; //const params as normal members
cudaTextureObject_t bvh_leaves;
cudaTextureObject_t node_fronts;
cudaTextureObject_t node_backs;
mutable ConstF4Ptr cached_nodes; //non-const as mutable
PrecomputedArray& verts;
mutable int& min_index;
mutable int& min_obj_idx;
mutable float& prim_u;
mutable float& prim_v;
int node_num;
int cache_num;
mutable float min_dist;
void init(/*lots of params*/); //you cannot construct a shared struct
};
...
__device__ __noinline__ SharedData_t& CallMeAtInitTime() {
__shared__ SharedData_t a;
a.init(/*lots of params*/)
return a;
}
//pass the struct as const ref to protect the non-mutable members
__device__ float ray_intersect_bvh(const SharedData_t& a) {
... do stuff
}
顺便说一句,您确实知道 3090 具有光线追踪硬件来完成您想要做的事情吗?https://raytracing-docs.nvidia.com/