CUBLAS 与行主数据的矩阵乘法

问题描述 投票:0回答:1

我在这里阅读了一些相关的文章,并成功使用 cuBLAS 进行行主矩阵乘法:

A*B(列主序)= B*A(行主序)

我编写了一个包装器来执行此操作,以便我可以传递行主矩阵 A 和 B 并相应地返回结果。

// Row-major GEMM wrapper: out = alpha * op1(A) * op2(B) + beta * out, where
// A (in1) is row1 x col1 and B (in2) is row2 x col2, both stored row-major,
// and op is identity (CUBLAS_OP_N) or transpose (CUBLAS_OP_T).
//
// cuBLAS is column-major; a row-major r x c buffer read column-major with
// ld = c is the transpose. We therefore compute C^T = op2(B)^T * op1(A)^T:
// each operand keeps its storage leading dimension (col1 / col2), each
// caller transpose flag is forwarded to the cuBLAS slot holding that
// operand (trans2 with in2, trans1 with in1), and the m/n/k dimensions are
// derived from the flags instead of being hard-coded for the OP_N/OP_N case.
//
// Precondition (caller's responsibility): the inner dimensions must agree,
// i.e. cols of op1(A) == rows of op2(B).
cublasStatus_t gemm_float(cuDataF &out, cublasHandle_t &handle, const cuDataF &in1, int row1, int col1, cublasOperation_t trans1, const cuDataF &in2, int row2, int col2, cublasOperation_t trans2, float alpha, float beta) {
        // Dimensions of the row-major result C = op1(A) * op2(B).
        const int m = (trans1 == CUBLAS_OP_N) ? row1 : col1;  // rows of op1(A)
        const int k = (trans1 == CUBLAS_OP_N) ? col1 : row1;  // inner dimension
        const int n = (trans2 == CUBLAS_OP_N) ? col2 : row2;  // cols of op2(B)

        // Column-major call computes C^T (n x m), which in memory is the
        // row-major C (m x n) with leading dimension n.
        return cublasSgemm(handle, trans2, trans1,
                            n, m, k,
                            &alpha,
                            in2.data(), col2,
                            in1.data(), col1,
                            &beta,
                            out.data(), n);
  }

但现在我的问题是,如果我想

A*transpose(B)
(按行主序),该怎么做?

我尝试用手推导数学,但答案是错误的。

例如:

A(2x3) = {1,2,3,4,5,6} (row majored) 
B(2x3) = {7,8,9,10,11,12} (row majored)
A*transpose(B)?
c++ cuda cublas
1个回答
0
投票

所以你的

A
矩阵和
B
矩阵,解释为 2x3,是:

  A             B
1 2 3         7  8  9
4 5 6        10  11 12

因此

transpose(B)=B'
是:

  B'
7  10
8  11
9  12

产品

C=AxB'
是:

   C
50  68
122 167

同样对于类似的 3x2 测试用例,我们得到:

  C          =     A   x    B'
23  53  83        7  8     1  3  5
29  67  105       9  10    2  4  6
35  81  127       11 12

这似乎有效:

# cat t232.cu
#include <cstdio>      // fprintf, stderr
#include <cstdlib>     // exit, EXIT_FAILURE
#include <iostream>
#include <vector>

#include <cuda.h>
#include <cuda_runtime.h>
#include <cublas_v2.h>


extern void gpuAssert(cudaError_t code, const char *file, int line);
void cublasAssert(cublasStatus_t code, const char *file, int line);

// See below
#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
#define cublasErrchk(ans) { cublasAssert((ans), __FILE__, __LINE__); }


// Computes C = A * transpose(B) for row-major data, where:
//   A is data_1 viewed as a data_1_rows x data_1_columns row-major matrix;
//   (data_2_rows, data_2_columns) describe the shape of B AFTER
//   transposition, i.e. the data_2 buffer itself holds a
//   data_2_columns x data_2_rows row-major matrix.
// Returns C, data_1_rows x data_2_columns, row-major.
//
// cuBLAS is column-major; a row-major buffer read column-major is its
// transpose, so C^T is computed with CUBLAS_OP_T applied to B's buffer
// (leading dimension data_2_rows) and CUBLAS_OP_N applied to A's.
std::vector<float> CUDA_mult_MAT_T(const std::vector<float> &data_1 , const uint64_t data_1_rows, const uint64_t data_1_columns,
                                 const std::vector<float> &data_2 , const uint64_t data_2_rows, const uint64_t data_2_columns)
{
    // Inner dimensions must agree, otherwise cuBLAS reads out of bounds.
    if (data_1_columns != data_2_rows)
    {
        std::cerr << "CUDA_mult_MAT_T: incompatible inner dimensions ("
                  << data_1_columns << " vs " << data_2_rows << ")" << std::endl;
        exit(EXIT_FAILURE);
    }

    cublasHandle_t handle;

    cublasErrchk(cublasCreate(&handle));

    std::vector<float> result(data_1_rows * data_2_columns); // Host buffer for the product

    /*----------------------------------------------------------------------------------------------*/

    float* GPU_data_1 = nullptr;
    gpuErrchk(cudaMalloc((void**)&GPU_data_1 , data_1.size()*sizeof(float))); // Allocate device memory for A
    gpuErrchk(cudaMemcpy(GPU_data_1, data_1.data(), data_1.size()*sizeof(float), cudaMemcpyHostToDevice)); // Upload A

    float* GPU_data_2 = nullptr;
    gpuErrchk(cudaMalloc((void**)&GPU_data_2 ,data_2.size()*sizeof(float))); // Allocate device memory for B
    gpuErrchk(cudaMemcpy(GPU_data_2, data_2.data(), data_2.size()*sizeof(float), cudaMemcpyHostToDevice)); // Upload B

    float* GPU_result = nullptr;
    gpuErrchk(cudaMalloc((void**)&GPU_result , result.size()*sizeof(float))); // Allocate device memory for C

    /*----------------------------------------------------------------------------------------------*/


    const float alpha = 1.f;
    const float beta  = 0.f;

    // Column-major view: C^T (data_2_columns x data_1_rows) =
    //   op_T(B buffer, ld = data_2_rows) * op_N(A buffer, ld = data_1_columns).
    // The memory layout of C^T (column-major) equals row-major C.
    cublasErrchk(
               cublasSgemm(handle , CUBLAS_OP_T, CUBLAS_OP_N,
                       data_2_columns , data_1_rows ,data_1_columns,
                       &alpha , GPU_data_2 , data_2_rows,
                       GPU_data_1 , data_1_columns,
                       &beta , GPU_result , data_2_columns)

               ); // Perform the multiplication



    gpuErrchk(cudaMemcpy(result.data() , GPU_result , result.size() * sizeof(float) , cudaMemcpyDeviceToHost)); // Download C (blocking copy, so no extra sync needed)

    gpuErrchk(cudaFree(GPU_data_1)); // Free device memory
    gpuErrchk(cudaFree(GPU_data_2)); // Free device memory
    gpuErrchk(cudaFree(GPU_result)); // Free device memory

    cublasErrchk(cublasDestroy_v2(handle));

    return result;
}
int main()
{
    // A = {1,2,3; 4,5,6} (2x3 row-major). The second buffer holds the
    // row-major 2x3 matrix {7,8,9; 10,11,12}; the (3, 2) arguments give the
    // shape of its TRANSPOSE, which is the matrix actually multiplied.
    const auto r1 = CUDA_mult_MAT_T({1 , 2 , 3 , 4 , 5 , 6} , 2 , 3 ,
                                  {7 , 8 , 9 , 10 , 11 , 12} , 3 , 2);
    /*
    Effective product (expected output: 50 68 122 167):
              7  10
    1 2 3  x  8  11
    4 5 6     9  12
    */

    for(const auto& value: r1){std::cout << value << " " ;}
    std::cout << std::endl;

    // A = {7,8; 9,10; 11,12} (3x2 row-major). The second buffer holds the
    // row-major 3x2 matrix {1,2; 3,4; 5,6}; (2, 3) is its transposed shape.
    const auto r2 = CUDA_mult_MAT_T({7 , 8 , 9 , 10 , 11 , 12} , 3 , 2 ,
                                  {1 , 2 , 3 , 4 , 5 , 6} , 2 , 3);
    /*
    Effective product (expected output: 23 53 83 29 67 105 35 81 127):
    7  8
    9  10  x  1  3  5
    11 12     2  4  6
    */


    for(const auto& value: r2){std::cout << value << " " ;}
    std::cout << std::endl;

    return 0;
}

// Shamelessly stolen from https://stackoverflow.com/a/14038590
// Shamelessly stolen from https://stackoverflow.com/a/14038590
// Print a diagnostic and terminate if a CUDA runtime call failed;
// no-op on cudaSuccess.
void gpuAssert(cudaError_t code, const char *file, int line)
{
    if (code == cudaSuccess) return;

    fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
    exit(code);
}

// Print a diagnostic and terminate if a cuBLAS call failed;
// no-op on CUBLAS_STATUS_SUCCESS.
void cublasAssert(cublasStatus_t code, const char *file, int line)
{
    if (code == CUBLAS_STATUS_SUCCESS) return;

    // Map the status code to its symbolic name for the message.
    const char *name = nullptr;
    switch (code)
    {
        case CUBLAS_STATUS_SUCCESS:          name = "CUBLAS_STATUS_SUCCESS.";          break;
        case CUBLAS_STATUS_NOT_INITIALIZED:  name = "CUBLAS_STATUS_NOT_INITIALIZED.";  break;
        case CUBLAS_STATUS_ALLOC_FAILED:     name = "CUBLAS_STATUS_ALLOC_FAILED.";     break;
        case CUBLAS_STATUS_INVALID_VALUE:    name = "CUBLAS_STATUS_INVALID_VALUE.";    break;
        case CUBLAS_STATUS_ARCH_MISMATCH:    name = "CUBLAS_STATUS_ARCH_MISMATCH.";    break;
        case CUBLAS_STATUS_MAPPING_ERROR:    name = "CUBLAS_STATUS_MAPPING_ERROR.";    break;
        case CUBLAS_STATUS_EXECUTION_FAILED: name = "CUBLAS_STATUS_EXECUTION_FAILED."; break;
        case CUBLAS_STATUS_INTERNAL_ERROR:   name = "CUBLAS_STATUS_INTERNAL_ERROR.";   break;
        case CUBLAS_STATUS_NOT_SUPPORTED:    name = "CUBLAS_STATUS_NOT_SUPPORTED.";    break;
        case CUBLAS_STATUS_LICENSE_ERROR:    name = "CUBLAS_STATUS_LICENSE_ERROR.";    break;
        default:                             name = "<unknown>.";                      break;
    }

    std::cerr <<  "CUBLAS error.\nError code: ";
    std::cerr << name;
    std::cerr << "\nFile: "<< file << "\n";
    std::cerr << "Line: "<< line <<std::endl;

    exit(EXIT_FAILURE);
}
# nvcc -o t232 t232.cu -lcublas
# compute-sanitizer ./t232
========= COMPUTE-SANITIZER
50 68 122 167
23 53 83 29 67 105 35 81 127
========= ERROR SUMMARY: 0 errors
#

请注意,我保留了两个矩阵的“输入”描述,以便与它们的实际计算方式保持一致。 在计算例程中,我所做的唯一更改是将第一个转置说明符从“OP_N”更改为“OP_T”(因为第二个输入矩阵显示为该行主标题/公式中的第一个计算矩阵)并且我更改了第一个计算矩阵(第二个输入矩阵 - 进行转置的矩阵)的主维从

data_2_columns 更改为 data_2_rows。

© www.soinside.com 2019 - 2024. All rights reserved.