I've read some related posts here and successfully used cuBLAS for row-major matrix multiplication:

A*B (column-major) = B*A (row-major)

That is, a row-major buffer handed to column-major cuBLAS reads as the transpose, and since transpose(A*B) = transpose(B)*transpose(A), asking cuBLAS for B*A yields a buffer that reads back as A*B in row-major order. I wrote a wrapper to do this, so I can pass row-major matrices A and B and get the result back accordingly.
cublasStatus_t gemm_float(cuDataF &out, cublasHandle_t &handle,
                          const cuDataF &in1, int row1, int col1, cublasOperation_t trans1,
                          const cuDataF &in2, int row2, int col2, cublasOperation_t trans2,
                          float alpha, float beta) {
    /*if (trans1 == CUBLAS_OP_N && trans2 == CUBLAS_OP_T) {
    }*/
    // Row-major C = A*B via column-major cuBLAS: pass B first, then A,
    // using each matrix's column count as its leading dimension.
    return cublasSgemm(handle, trans1, trans2,
                       col2, row1, col1,
                       &alpha,
                       in2.data(), col2,
                       in1.data(), col1,
                       &beta,
                       out.data(), col2);
}
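For a plain row-major multiply, the wrapper is called with both ops set to CUBLAS_OP_N. A minimal usage sketch follows; it is hypothetical in that it assumes cuDataF can be constructed from host values, sized for the output, and exposes the .data() device pointer used above:

// Hypothetical usage; cuDataF's constructors are assumptions here.
cuDataF A({1.f, 2.f, 3.f, 4.f, 5.f, 6.f});    // 2x3, row-major
cuDataF B({7.f, 8.f, 9.f, 10.f, 11.f, 12.f}); // 3x2, row-major
cuDataF C(2 * 2);                             // 2x2 output buffer

cublasHandle_t handle;
cublasCreate(&handle);
// Row-major C = A*B: {58, 64, 139, 154}
gemm_float(C, handle, A, 2, 3, CUBLAS_OP_N, B, 3, 2, CUBLAS_OP_N, 1.f, 0.f);
cublasDestroy(handle);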
But now my question is: what do I do if I want

A*transpose(B)

in row-major? I tried to derive the math by hand, but the answer came out wrong. For example:
A(2x3) = {1,2,3,4,5,6} (row-major)
B(2x3) = {7,8,9,10,11,12} (row-major)
A*transpose(B)?
So your A and B matrices, interpreted as 2x3, are:

A          B
1 2 3      7  8  9
4 5 6      10 11 12
and therefore transpose(B) = B' is:

B'
7 10
8 11
9 12
The product C = A x B' is:

C
 50  68
122 167
Likewise, for the analogous 3x2 test case, we get:

C         =   A      x  B'
23 53  83     7  8      1 3 5
29 67 105     9 10      2 4 6
35 81 127    11 12
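As a sanity check (not part of the original answer), a few lines of host-only C++ reproduce these numbers directly from the row-major arrays, using C(i,j) = sum_k A(i,k)*B(j,k):

#include <cstdio>
#include <vector>

// CPU reference for row-major C = A * transpose(B):
// A is MxK, B is NxK, C is MxN with C(i,j) = sum_k A(i,k)*B(j,k).
std::vector<float> abt_reference(const std::vector<float> &A,
                                 const std::vector<float> &B,
                                 int M, int N, int K)
{
    std::vector<float> C(M * N, 0.f);
    for (int i = 0; i < M; ++i)
        for (int j = 0; j < N; ++j)
            for (int k = 0; k < K; ++k)
                C[i * N + j] += A[i * K + k] * B[j * K + k];
    return C;
}

int main()
{
    // 2x3 case: prints 50 68 122 167
    for (float v : abt_reference({1, 2, 3, 4, 5, 6}, {7, 8, 9, 10, 11, 12}, 2, 2, 3))
        printf("%g ", v);
    printf("\n");
    // 3x2 case: prints 23 53 83 29 67 105 35 81 127
    for (float v : abt_reference({7, 8, 9, 10, 11, 12}, {1, 2, 3, 4, 5, 6}, 3, 3, 2))
        printf("%g ", v);
    printf("\n");
    return 0;
}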
The following seems to work:
# cat t232.cu
#include <cuda.h>
#include <cuda_runtime.h>
#include <cublas_v2.h>
#include <cstdio>   // fprintf, stderr
#include <cstdlib>  // exit, EXIT_FAILURE
#include <iostream>
#include <vector>
extern void gpuAssert(cudaError_t code, const char *file, int line);
void cublasAssert(cublasStatus_t code, const char *file, int line);
// See below
#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
#define cublasErrchk(ans) { cublasAssert((ans), __FILE__, __LINE__); }
std::vector<float> CUDA_mult_MAT_T(const std::vector<float> &data_1, const uint64_t data_1_rows, const uint64_t data_1_columns,
                                   const std::vector<float> &data_2, const uint64_t data_2_rows, const uint64_t data_2_columns)
{
    cublasHandle_t handle;
    cublasErrchk(cublasCreate(&handle));

    std::vector<float> result(data_1_rows * data_2_columns); // Vector holding the result of the multiplication
    /*----------------------------------------------------------------------------------------------*/
    float* GPU_data_1 = nullptr;
    gpuErrchk(cudaMalloc((void**)&GPU_data_1, data_1.size()*sizeof(float)));                               // Allocate memory on the GPU
    gpuErrchk(cudaMemcpy(GPU_data_1, data_1.data(), data_1.size()*sizeof(float), cudaMemcpyHostToDevice)); // Copy data from data_1 to GPU_data_1

    float* GPU_data_2 = nullptr;
    gpuErrchk(cudaMalloc((void**)&GPU_data_2, data_2.size()*sizeof(float)));                               // Allocate memory on the GPU
    gpuErrchk(cudaMemcpy(GPU_data_2, data_2.data(), data_2.size()*sizeof(float), cudaMemcpyHostToDevice)); // Copy data from data_2 to GPU_data_2

    float* GPU_result = nullptr;
    gpuErrchk(cudaMalloc((void**)&GPU_result, result.size()*sizeof(float)));                               // Allocate memory on the GPU
    /*----------------------------------------------------------------------------------------------*/
    const float alpha = 1.f;
    const float beta  = 0.f;
    cublasErrchk(
        cublasSgemm(handle, CUBLAS_OP_T, CUBLAS_OP_N,
                    data_2_columns, data_1_rows, data_1_columns,
                    &alpha, GPU_data_2, data_2_rows,
                    GPU_data_1, data_1_columns,
                    &beta, GPU_result, data_2_columns)
    ); // Perform multiplication

    gpuErrchk(cudaMemcpy(result.data(), GPU_result, result.size()*sizeof(float), cudaMemcpyDeviceToHost)); // Copy back to the vector 'result'

    gpuErrchk(cudaFree(GPU_data_1)); // Free GPU memory
    gpuErrchk(cudaFree(GPU_data_2)); // Free GPU memory
    gpuErrchk(cudaFree(GPU_result)); // Free GPU memory
    cublasErrchk(cublasDestroy_v2(handle));

    return result;
}
int main()
{
    const auto r1 = CUDA_mult_MAT_T({1, 2, 3, 4, 5, 6},    2, 3,
                                    {7, 8, 9, 10, 11, 12}, 3, 2);
    /*
    Effective row-major product (the 2x3 worked example above):
              7 10
    1 2 3  x  8 11  =   50  68
    4 5 6     9 12     122 167
    */
    for(const auto& value: r1){std::cout << value << " ";}
    std::cout << std::endl;

    const auto r2 = CUDA_mult_MAT_T({7, 8, 9, 10, 11, 12}, 3, 2,
                                    {1, 2, 3, 4, 5, 6},    2, 3);
    /*
    Effective row-major product (the 3x2 worked example above):
     7  8                23 53  83
     9 10  x  1 3 5  =   29 67 105
    11 12     2 4 6      35 81 127
    */
    for(const auto& value: r2){std::cout << value << " ";}
    std::cout << std::endl;

    return 0;
}
// Shamelessly stolen from https://stackoverflow.com/a/14038590
void gpuAssert(cudaError_t code, const char *file, int line)
{
    if (code != cudaSuccess)
    {
        fprintf(stderr, "GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
        exit(code);
    }
}
void cublasAssert(cublasStatus_t code, const char *file, int line)
{
    if (code != CUBLAS_STATUS_SUCCESS)
    {
        std::cerr << "CUBLAS error.\nError code: ";
        switch(code)
        {
            case CUBLAS_STATUS_SUCCESS:          {std::cerr << "CUBLAS_STATUS_SUCCESS.";          break;}
            case CUBLAS_STATUS_NOT_INITIALIZED:  {std::cerr << "CUBLAS_STATUS_NOT_INITIALIZED.";  break;}
            case CUBLAS_STATUS_ALLOC_FAILED:     {std::cerr << "CUBLAS_STATUS_ALLOC_FAILED.";     break;}
            case CUBLAS_STATUS_INVALID_VALUE:    {std::cerr << "CUBLAS_STATUS_INVALID_VALUE.";    break;}
            case CUBLAS_STATUS_ARCH_MISMATCH:    {std::cerr << "CUBLAS_STATUS_ARCH_MISMATCH.";    break;}
            case CUBLAS_STATUS_MAPPING_ERROR:    {std::cerr << "CUBLAS_STATUS_MAPPING_ERROR.";    break;}
            case CUBLAS_STATUS_EXECUTION_FAILED: {std::cerr << "CUBLAS_STATUS_EXECUTION_FAILED."; break;}
            case CUBLAS_STATUS_INTERNAL_ERROR:   {std::cerr << "CUBLAS_STATUS_INTERNAL_ERROR.";   break;}
            case CUBLAS_STATUS_NOT_SUPPORTED:    {std::cerr << "CUBLAS_STATUS_NOT_SUPPORTED.";    break;}
            case CUBLAS_STATUS_LICENSE_ERROR:    {std::cerr << "CUBLAS_STATUS_LICENSE_ERROR.";    break;}
            default:                             {std::cerr << "<unknown>.";                      break;}
        }
        std::cerr << "\nFile: " << file << "\n";
        std::cerr << "Line: " << line << std::endl;
        exit(EXIT_FAILURE);
    }
}
# nvcc -o t232 t232.cu -lcublas
# compute-sanitizer ./t232
========= COMPUTE-SANITIZER
50 68 122 167
23 53 83 29 67 105 35 81 127
========= ERROR SUMMARY: 0 errors
#
Note that I have kept the "input" descriptions of the two matrices consistent with how they are actually computed. In the computation routine, the only changes I made were to switch the first transpose specifier from CUBLAS_OP_N to CUBLAS_OP_T (because the second input matrix appears as the first computed matrix in the row-major formula above), and to change the leading dimension of the first computed matrix (the second input matrix, i.e. the one being transposed) from data_2_columns to data_2_rows.
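To make those two changes concrete, here are the two calls side by side, using the variable names from CUDA_mult_MAT_T (the first call is the plain row-major A*B pattern from the wrapper at the top):

// Plain row-major A*B (the original wrapper pattern):
cublasSgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N,
            data_2_columns, data_1_rows, data_1_columns,
            &alpha, GPU_data_2, data_2_columns,
            GPU_data_1, data_1_columns,
            &beta, GPU_result, data_2_columns);

// Row-major A*transpose(B): only two arguments differ.
// The first op becomes CUBLAS_OP_T, and since that matrix is transposed,
// its leading dimension becomes data_2_rows instead of data_2_columns.
cublasSgemm(handle, CUBLAS_OP_T, CUBLAS_OP_N,
            data_2_columns, data_1_rows, data_1_columns,
            &alpha, GPU_data_2, data_2_rows,
            GPU_data_1, data_1_columns,
            &beta, GPU_result, data_2_columns);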