尽管使用了 volatile、threadfence 并禁用 L1 缓存，但块之间的全局内存访问不一致

Question

在以下构建树的最小可重现示例中，当使用多个块时，根据其位置插入实体（因此是四叉树/八叉树的一维版本），某些块会覆盖其他块的插入，因此树中的主体数量不等于给予内核的主体数量。尽管使用了 threadfences（可能是不必要的量），将树数组标记为易失性，并使用“-Xptxas -dlcm=cg”禁用 L1 缓存，但仍然如此。这是在 Quadro P600 (

nvcc -o example -arch=sm_61 -G -g -Xptxas -dlcm=cg example.cu

) 和 A30 (

nvcc -o example -arch=sm_80 -G -g -Xptxas -dlcm=cg example.cu

) 上测试的。

#include <vector>
#include <cstdio>
#include <cuda_runtime.h>
#include <device_launch_parameters.h>
#include <cstdint>
#include <iostream>

#define CUDA_MIN_WIDTH 1e-10

typedef unsigned long long int cu_size_t;

__device__ cu_size_t first_free_index = 1;
__device__ extern bool return_val = true;

struct OctreeNodeCUDA
{
    double bounds;
    double width;
    size_t children[2];
    size_t entity_index;
    bool is_leaf;
    int locked;
};

struct Body
{
    double pos;
};

void hce(cudaError_t error)
{
    if (error != cudaSuccess)
    {
        printf("CUDA error: %s\n", cudaGetErrorString(error));
        exit(EXIT_FAILURE);
    }
}

__device__ void kill_kernel()
{
    return_val = false;
    __threadfence(); // ensure store issued before trap
    asm("trap;");    // kill kernel with error
}

__device__ u_char get_index(const double pos, const double bounds, double width)
{
    u_char index = 0;
    if (pos >= bounds + width / 2)
    {
        index |= 1;
    }
    return index;
}

__device__ bool subdivide(volatile OctreeNodeCUDA *nodes, size_t const num_nodes, size_t const node_index)
{
    if (nodes[node_index].width <= CUDA_MIN_WIDTH)
    {
        printf("Cannot subdivide further, reached minimum width: %e\n.", nodes[node_index].width);
        printf("Exiting...");
        return false;
    }

    double new_width = nodes[node_index].width / 2;
    cu_size_t local_first_free_index = atomicAdd(&first_free_index, static_cast<cu_size_t>(2));
    for (u_char i = 0; i < 2; i++)
    {
        double nx = nodes[node_index].bounds + (i & 1) * new_width;
        if (local_first_free_index >= num_nodes)
        {
            printf("GPU array capacity exceeded. Exiting...\n");
            return false;
        }
        else
        {
            nodes[node_index].children[i] = local_first_free_index;
            nodes[local_first_free_index].bounds = nx;
            nodes[local_first_free_index].width = new_width;
        }
        local_first_free_index++;
    }
    return true;
}

__global__ void insert_entities_kernel(volatile OctreeNodeCUDA *nodes, size_t const num_nodes, Body const *entities, size_t const num_entities, size_t const num_threads)
{
    // Distribute entities among threads
    size_t chunk_size = static_cast<size_t>(ceilf(num_entities / num_threads)) + 1;
    size_t thread_id = threadIdx.x + blockIdx.x * blockDim.x;
    size_t start = chunk_size * thread_id;
    size_t end = min(start + chunk_size, num_entities);

    size_t current_node_index = 0;
    size_t locked_index;
    if (start < num_entities)
    {
        while (start < end)
        {
            current_node_index = 0;
            Body const &e = entities[start];
            while (nodes[current_node_index].is_leaf == false && nodes[current_node_index].locked == 0)
            {
                u_char index = get_index(e.pos, nodes[current_node_index].bounds, nodes[current_node_index].width);
                current_node_index = nodes[current_node_index].children[index];
            }
            __threadfence();
            __syncthreads();
            if (atomicCAS((int *)&nodes[current_node_index].locked, 0, 1) == 0)
            {
                locked_index = current_node_index;
                if (nodes[current_node_index].entity_index == SIZE_MAX)
                {
                    nodes[current_node_index].entity_index = start;
                    nodes[current_node_index].is_leaf = true;
                }
                else
                {
                    size_t const other_e_index = nodes[current_node_index].entity_index;
                    Body const &other_e = entities[other_e_index];
                    nodes[current_node_index].entity_index = SIZE_MAX;
                    while (true)
                    {
                        bool local_return_val = subdivide(nodes, num_nodes, current_node_index);
                        if (local_return_val == false)
                        {
                            kill_kernel();
                        }
                        nodes[current_node_index].is_leaf = false;

                        u_char index = get_index(e.pos, nodes[current_node_index].bounds, nodes[current_node_index].width);
                        u_char other_index = get_index(other_e.pos, nodes[current_node_index].bounds, nodes[current_node_index].width);
                        if (index == other_index)
                        {
                            current_node_index = nodes[current_node_index].children[index];
                        }
                        else
                        {
                            nodes[nodes[current_node_index].children[index]].entity_index = start;
                            nodes[nodes[current_node_index].children[index]].is_leaf = true;
                            nodes[nodes[current_node_index].children[other_index]].entity_index = other_e_index;
                            nodes[nodes[current_node_index].children[other_index]].is_leaf = true;
                            break;
                        }
                    }
                }
                start++;
                __threadfence();
                atomicExch((int *)&nodes[locked_index].locked, 0);
            }
            __syncthreads();
            __threadfence();
        }
    }
}

void traverse_tree(std::vector<OctreeNodeCUDA> const &nodes, size_t const node_index, size_t &num_bodies)
{
    if (nodes[node_index].is_leaf == true && nodes[node_index].entity_index < SIZE_MAX)
    {
        num_bodies++;
    }
    for (size_t i = 0; i < 2; i++)
    {
        if (nodes[node_index].children[i] != 0)
            traverse_tree(nodes, nodes[node_index].children[i], num_bodies);
    }
    if (node_index == 0)
    {
        std::cout << "Number of bodies: " << num_bodies << std::endl;
    }
}

int main()
{
    Body *d_entities;
    OctreeNodeCUDA *d_nodes;

    size_t num_bodies = 10;
    std::vector<OctreeNodeCUDA> nodes(num_bodies * 20);
    for (int i = 0; i < 10; i++)
    {
        for (auto &node : nodes)
        {
            node.bounds = 0;
            node.width = 0;
            node.entity_index = SIZE_MAX;
            node.is_leaf = true;
            node.locked = 0;
            for (int j = 0; j < 2; j++)
            {
                node.children[j] = 0;
            }
        }
        nodes[0].bounds = -0.1;
        nodes[0].width = 1.5;

        std::vector<Body> entities(num_bodies);
        // Initialize entities with random positions between 0 and 1
        for (auto &entity : entities)
        {
            entity.pos = static_cast<double>(rand()) / RAND_MAX;
        }

        hce(cudaMalloc(&d_nodes, nodes.size() * sizeof(OctreeNodeCUDA)));
        hce(cudaMemcpy(d_nodes, nodes.data(), nodes.size() * sizeof(OctreeNodeCUDA), cudaMemcpyHostToDevice));

        hce(cudaMalloc(&d_entities, entities.size() * sizeof(Body)));
        hce(cudaMemcpy(d_entities, entities.data(), entities.size() * sizeof(Body), cudaMemcpyHostToDevice));

        cu_size_t new_index = 1;
        hce(cudaMemcpyToSymbol(first_free_index, &new_index, sizeof(cu_size_t), 0, cudaMemcpyHostToDevice));

        int num_blocks = 10; //! If 10 blocks are used, the bodies in the tree != num_bodies
        int num_threads = 1;

        insert_entities_kernel<<<num_blocks, num_threads>>>(d_nodes, nodes.size(), d_entities, entities.size(), num_blocks * num_threads);
        hce(cudaGetLastError());

        hce(cudaMemcpy(nodes.data(), d_nodes, nodes.size() * sizeof(OctreeNodeCUDA), cudaMemcpyDeviceToHost));

        bool device_return_val;

        hce(cudaMemcpyFromSymbol(&device_return_val, return_val, sizeof(bool), 0, cudaMemcpyDeviceToHost));

        cudaFree(d_nodes);
        cudaFree(d_entities);

        if (!device_return_val)
        {
            std::cout << "Error: GPU tree insertion failed" << std::endl;
            exit(EXIT_FAILURE);
        }

        size_t bodies_in_tree = 0;

        traverse_tree(nodes, 0, bodies_in_tree);
    }
}

这是一个示例结果（应该是 10 个物体）：

Number of bodies: 3
Number of bodies: 8
Number of bodies: 8
Number of bodies: 2
Number of bodies: 5
Number of bodies: 4
Number of bodies: 6
Number of bodies: 4
Number of bodies: 4
Number of bodies: 7

Answer 1

正如 Robert Crovella 正确指出的那样，线程 X 的树遍历可能会因为节点被锁定而停止，但在线程 X 尝试锁定自身之前，该节点可能会被线程 Y 解锁，从而导致它在节点中插入主体那不是一片叶子。更改线路：

if (atomicCAS((int *)&nodes[current_node_index].locked, 0, 1) == 0)

到

if (nodes[current_node_index].is_leaf == true && atomicCAS((int *)&nodes[current_node_index].locked, 0, 1) == 0)

解决了问题。

谢谢罗伯特！

尽管使用了 volatile、threadfence 并禁用 L1 缓存，但块之间的全局内存访问不一致

问题描述投票：0回答：1

1个回答

最新问题

尽管使用了 volatile、threadfence 并禁用 L1 缓存，但块之间的全局内存访问不一致

问题描述 投票：0回答：1

1个回答

最新问题

问题描述投票：0回答：1