在以下构建树的最小可重现示例中,当使用多个块时,根据其位置插入实体(因此是四叉树/八叉树的一维版本),某些块会覆盖其他块的插入,因此树中的主体数量不等于给予内核的主体数量。尽管使用了 threadfences(可能是不必要的量),将树数组标记为易失性,并使用“-Xptxas -dlcm=cg”禁用 L1 缓存,但仍然如此。这是在 Quadro P600 (
nvcc -o example -arch=sm_61 -G -g -Xptxas -dlcm=cg example.cu
) 和 A30 (nvcc -o example -arch=sm_80 -G -g -Xptxas -dlcm=cg example.cu
) 上测试的。
#include <vector>
#include <cstdio>
#include <cuda_runtime.h>
#include <device_launch_parameters.h>
#include <cstdint>
#include <iostream>
#define CUDA_MIN_WIDTH 1e-10
typedef unsigned long long int cu_size_t;
__device__ cu_size_t first_free_index = 1;
__device__ extern bool return_val = true;
struct OctreeNodeCUDA
{
double bounds;
double width;
size_t children[2];
size_t entity_index;
bool is_leaf;
int locked;
};
struct Body
{
double pos;
};
void hce(cudaError_t error)
{
if (error != cudaSuccess)
{
printf("CUDA error: %s\n", cudaGetErrorString(error));
exit(EXIT_FAILURE);
}
}
__device__ void kill_kernel()
{
return_val = false;
__threadfence(); // ensure store issued before trap
asm("trap;"); // kill kernel with error
}
__device__ u_char get_index(const double pos, const double bounds, double width)
{
u_char index = 0;
if (pos >= bounds + width / 2)
{
index |= 1;
}
return index;
}
__device__ bool subdivide(volatile OctreeNodeCUDA *nodes, size_t const num_nodes, size_t const node_index)
{
if (nodes[node_index].width <= CUDA_MIN_WIDTH)
{
printf("Cannot subdivide further, reached minimum width: %e\n.", nodes[node_index].width);
printf("Exiting...");
return false;
}
double new_width = nodes[node_index].width / 2;
cu_size_t local_first_free_index = atomicAdd(&first_free_index, static_cast<cu_size_t>(2));
for (u_char i = 0; i < 2; i++)
{
double nx = nodes[node_index].bounds + (i & 1) * new_width;
if (local_first_free_index >= num_nodes)
{
printf("GPU array capacity exceeded. Exiting...\n");
return false;
}
else
{
nodes[node_index].children[i] = local_first_free_index;
nodes[local_first_free_index].bounds = nx;
nodes[local_first_free_index].width = new_width;
}
local_first_free_index++;
}
return true;
}
__global__ void insert_entities_kernel(volatile OctreeNodeCUDA *nodes, size_t const num_nodes, Body const *entities, size_t const num_entities, size_t const num_threads)
{
// Distribute entities among threads
size_t chunk_size = static_cast<size_t>(ceilf(num_entities / num_threads)) + 1;
size_t thread_id = threadIdx.x + blockIdx.x * blockDim.x;
size_t start = chunk_size * thread_id;
size_t end = min(start + chunk_size, num_entities);
size_t current_node_index = 0;
size_t locked_index;
if (start < num_entities)
{
while (start < end)
{
current_node_index = 0;
Body const &e = entities[start];
while (nodes[current_node_index].is_leaf == false && nodes[current_node_index].locked == 0)
{
u_char index = get_index(e.pos, nodes[current_node_index].bounds, nodes[current_node_index].width);
current_node_index = nodes[current_node_index].children[index];
}
__threadfence();
__syncthreads();
if (atomicCAS((int *)&nodes[current_node_index].locked, 0, 1) == 0)
{
locked_index = current_node_index;
if (nodes[current_node_index].entity_index == SIZE_MAX)
{
nodes[current_node_index].entity_index = start;
nodes[current_node_index].is_leaf = true;
}
else
{
size_t const other_e_index = nodes[current_node_index].entity_index;
Body const &other_e = entities[other_e_index];
nodes[current_node_index].entity_index = SIZE_MAX;
while (true)
{
bool local_return_val = subdivide(nodes, num_nodes, current_node_index);
if (local_return_val == false)
{
kill_kernel();
}
nodes[current_node_index].is_leaf = false;
u_char index = get_index(e.pos, nodes[current_node_index].bounds, nodes[current_node_index].width);
u_char other_index = get_index(other_e.pos, nodes[current_node_index].bounds, nodes[current_node_index].width);
if (index == other_index)
{
current_node_index = nodes[current_node_index].children[index];
}
else
{
nodes[nodes[current_node_index].children[index]].entity_index = start;
nodes[nodes[current_node_index].children[index]].is_leaf = true;
nodes[nodes[current_node_index].children[other_index]].entity_index = other_e_index;
nodes[nodes[current_node_index].children[other_index]].is_leaf = true;
break;
}
}
}
start++;
__threadfence();
atomicExch((int *)&nodes[locked_index].locked, 0);
}
__syncthreads();
__threadfence();
}
}
}
void traverse_tree(std::vector<OctreeNodeCUDA> const &nodes, size_t const node_index, size_t &num_bodies)
{
if (nodes[node_index].is_leaf == true && nodes[node_index].entity_index < SIZE_MAX)
{
num_bodies++;
}
for (size_t i = 0; i < 2; i++)
{
if (nodes[node_index].children[i] != 0)
traverse_tree(nodes, nodes[node_index].children[i], num_bodies);
}
if (node_index == 0)
{
std::cout << "Number of bodies: " << num_bodies << std::endl;
}
}
int main()
{
Body *d_entities;
OctreeNodeCUDA *d_nodes;
size_t num_bodies = 10;
std::vector<OctreeNodeCUDA> nodes(num_bodies * 20);
for (int i = 0; i < 10; i++)
{
for (auto &node : nodes)
{
node.bounds = 0;
node.width = 0;
node.entity_index = SIZE_MAX;
node.is_leaf = true;
node.locked = 0;
for (int j = 0; j < 2; j++)
{
node.children[j] = 0;
}
}
nodes[0].bounds = -0.1;
nodes[0].width = 1.5;
std::vector<Body> entities(num_bodies);
// Initialize entities with random positions between 0 and 1
for (auto &entity : entities)
{
entity.pos = static_cast<double>(rand()) / RAND_MAX;
}
hce(cudaMalloc(&d_nodes, nodes.size() * sizeof(OctreeNodeCUDA)));
hce(cudaMemcpy(d_nodes, nodes.data(), nodes.size() * sizeof(OctreeNodeCUDA), cudaMemcpyHostToDevice));
hce(cudaMalloc(&d_entities, entities.size() * sizeof(Body)));
hce(cudaMemcpy(d_entities, entities.data(), entities.size() * sizeof(Body), cudaMemcpyHostToDevice));
cu_size_t new_index = 1;
hce(cudaMemcpyToSymbol(first_free_index, &new_index, sizeof(cu_size_t), 0, cudaMemcpyHostToDevice));
int num_blocks = 10; //! If 10 blocks are used, the bodies in the tree != num_bodies
int num_threads = 1;
insert_entities_kernel<<<num_blocks, num_threads>>>(d_nodes, nodes.size(), d_entities, entities.size(), num_blocks * num_threads);
hce(cudaGetLastError());
hce(cudaMemcpy(nodes.data(), d_nodes, nodes.size() * sizeof(OctreeNodeCUDA), cudaMemcpyDeviceToHost));
bool device_return_val;
hce(cudaMemcpyFromSymbol(&device_return_val, return_val, sizeof(bool), 0, cudaMemcpyDeviceToHost));
cudaFree(d_nodes);
cudaFree(d_entities);
if (!device_return_val)
{
std::cout << "Error: GPU tree insertion failed" << std::endl;
exit(EXIT_FAILURE);
}
size_t bodies_in_tree = 0;
traverse_tree(nodes, 0, bodies_in_tree);
}
}
这是一个示例结果(应该是 10 个物体):
Number of bodies: 3
Number of bodies: 8
Number of bodies: 8
Number of bodies: 2
Number of bodies: 5
Number of bodies: 4
Number of bodies: 6
Number of bodies: 4
Number of bodies: 4
Number of bodies: 7
正如 Robert Crovella 正确指出的那样,线程 X 的树遍历可能会因为节点被锁定而停止,但在线程 X 尝试锁定自身之前,该节点可能会被线程 Y 解锁,从而导致它在节点中插入主体那不是一片叶子。更改线路:
if (atomicCAS((int *)&nodes[current_node_index].locked, 0, 1) == 0)
到 if (nodes[current_node_index].is_leaf == true && atomicCAS((int *)&nodes[current_node_index].locked, 0, 1) == 0)
解决了问题。
谢谢罗伯特!