为了编译matmul.f90文件,我在cmd中使用了此命令:
gfortran -O3 -march=native -funroll-all-loops matmul.f90
为了编译matmul.c文件,我在cmd中使用了此命令:
gcc -O3 -march=native -funroll-all-loops matmul.c
这是Fortran代码:
!> Benchmark: cache-blocked multiplication of two N x N double-precision
!> matrices filled with random values, timed with cpu_time.
!>
!> The accumulated result is summed and printed after the timing so that the
!> product is actually consumed; without a read of C an optimizing compiler is
!> free to discard the timed loops as dead code.
program matmul_fortran
  implicit none
  integer, parameter :: dp = selected_real_kind(15, 307) ! double precision kind
  integer, parameter :: N = 1024          ! matrix size
  integer, parameter :: BLOCK_SIZE = 32   ! block size
  real(dp), dimension(N, N) :: A, B, C
  integer :: i, j, k, i_block, j_block, k_block
  real(dp) :: start, finish

  ! initialize matrices A and B with random values
  call random_seed()
  call random_number(A)
  call random_number(B)
  C = 0.0_dp ! set matrix C to zero values

  call cpu_time(start)
  ! Blocked multiplication: the three outer loops walk over
  ! BLOCK_SIZE x BLOCK_SIZE tiles, the three inner loops over one tile.
  ! min(...) clamps the tile end in case N is not a multiple of BLOCK_SIZE.
  do i_block = 1, N, BLOCK_SIZE
    do j_block = 1, N, BLOCK_SIZE
      do k_block = 1, N, BLOCK_SIZE
        do i = i_block, min(i_block + BLOCK_SIZE - 1, N)
          do j = j_block, min(j_block + BLOCK_SIZE - 1, N)
            ! The innermost loop runs over k, the first (fastest-varying)
            ! index of both C and A, so memory is accessed contiguously.
            do k = k_block, min(k_block + BLOCK_SIZE - 1, N)
              C(k, i) = C(k, i) + A(k, j)*B(j, i)
            end do
          end do
        end do
      end do
    end do
  end do
  call cpu_time(finish)

  print *, "Fortran Matrix Multiplication Time: ", finish - start, " seconds"
  ! Reading C here keeps the timed computation observable.
  print *, "Checksum (sum of C): ", sum(C)
end program matmul_fortran
这是C代码:
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#define N 1024 // matrix size
#define BLOCK_SIZE 32 // block size
/*
 * Fill an N*N array of doubles with pseudo-random values between 0 and 1.
 *
 * matrix: pointer to the first of N*N doubles to overwrite.
 *
 * Values are drawn from rand(), one call per element in storage order, so
 * the generated sequence follows whatever seed srand() last set.
 */
void initialize_matrix(double *matrix) {
    double *cursor = matrix;
    double *const stop = matrix + N * N;

    while (cursor != stop) {
        *cursor++ = (double)rand() / RAND_MAX;
    }
}
/*
 * Benchmark entry point: allocates three N x N matrices, fills A and B with
 * random values, runs a cache-blocked multiplication into C and prints the
 * elapsed CPU time measured with clock().
 *
 * Returns 0 on success, EXIT_FAILURE if any matrix cannot be allocated.
 *
 * A checksum of C is printed after the timing so that the product is
 * actually consumed; without a read of C an optimizing compiler is free to
 * discard the timed loops as dead code.
 */
int main(void) {
    double *A, *B, *C;
    clock_t start, end;
    double checksum = 0.0;
    const size_t bytes = (size_t)N * N * sizeof(double);

    A = malloc(bytes);
    B = malloc(bytes);
    C = malloc(bytes);
    if (A == NULL || B == NULL || C == NULL) {
        fprintf(stderr, "Failed to allocate matrices\n");
        /* free(NULL) is a no-op, so releasing all three is safe. */
        free(A);
        free(B);
        free(C);
        return EXIT_FAILURE;
    }

    // set matrix C to zero values
    for (int i = 0; i < N * N; i++)
        C[i] = 0.0;

    // Initialize matrices
    srand(time(NULL));
    initialize_matrix(A);
    initialize_matrix(B);

    start = clock();
    // Blocked multiplication: the three outer loops walk over
    // BLOCK_SIZE x BLOCK_SIZE tiles, the three inner loops over one tile.
    // The "&& x < N" tests clamp the tile end when N is not a multiple of
    // BLOCK_SIZE.
    for (int i_block = 0; i_block < N; i_block += BLOCK_SIZE) {
        for (int j_block = 0; j_block < N; j_block += BLOCK_SIZE) {
            for (int k_block = 0; k_block < N; k_block += BLOCK_SIZE) {
                for (int i = i_block; i < i_block + BLOCK_SIZE && i < N; i++) {
                    for (int j = j_block; j < j_block + BLOCK_SIZE && j < N; j++) {
                        // With row-major storage this accumulates
                        // C[i][k] += A[j][k] * B[i][j]; the innermost index k
                        // is the contiguous one for both C and A.
                        for (int k = k_block; k < k_block + BLOCK_SIZE && k < N; k++) {
                            C[i*N + k] += A[j*N + k]*B[i*N + j];
                        }
                    }
                }
            }
        }
    }
    end = clock();

    printf("C Matrix Multiplication Time: %.6f seconds\n", ((double)(end - start)) / CLOCKS_PER_SEC);

    // Reading C here keeps the timed computation observable.
    for (int i = 0; i < N * N; i++)
        checksum += C[i];
    printf("Checksum (sum of C): %.6f\n", checksum);

    free(A);
    free(B);
    free(C);
    return 0;
}
提前感谢!
我来部分地回答我自己的问题。我在自己的笔记本电脑上编译并运行了上述矩阵乘法代码,该笔记本电脑配备AMD Ryzen 8845HS处理器,运行Ubuntu 22.04.5。编译命令与之前相同,依次为Fortran和C:
gfortran -O3 -march=native -funroll-all-loops matmul.f90
gcc -O3 -march=native -funroll-all-loops matmul.c
无论如何,当我使用Intel Fortran和Intel C编译器(安装在Windows 10 PC上的带有i7 6700处理器的PC)时,情况完全不同。对于尺寸1024 x 1024的矩阵,Fortran编译的代码需要0.079秒,而C代码为0.077秒。 C代码实际上更快。
我使用的Intel Fortran编译器的编译选项是:ifx -fast matmul.f90 /heap-arrays
我使用的Intel C编译器的编译选项是:
ifc -fast matmul.c