MPI_Scatterv with an MPI derived datatype is much slower (23x). Why is this?


ifort version: (IFORT) 2021.8.0 20221119. OS: WSL Ubuntu 20.04 LTS.

  1. I have a (1000x1000x1000) 3D array to distribute among processes. In the first approach, I flatten the array and then scatter it among the processes; this takes about 7.86 s.

  2. In the second approach, I use an MPI derived datatype to scatter the 3D array, and I noticed it takes about 165.34 s. Gathering the same data back, however, takes only about 14.24 s.

  3. What causes this inconsistency? I expected Scatterv to take about the same time as Gatherv.

Here is the code:

program ex_scatterv
  use mpi
  use iso_fortran_env, only : real64
  implicit none

!allocate arrays
  real(real64), allocatable,dimension(:,:,:) :: array, array_local
  real(real64), allocatable,dimension(:) :: array_flat, array_local_flat
  integer :: rank, num_procs, i, j, k
  integer :: nx, ny, nz, str_idx, end_idx, local_size, local_size_flat
  integer, dimension(:), allocatable :: sendcounts, displacements
  integer :: sizes(3), sub_sizes(3), starts(3), recv_starts(3), recv_sizes(3), &
    send_type, resize_send_type, recv_type, resize_recv_type
  integer(kind=MPI_ADDRESS_KIND) :: lb, extent, lb_resize !extents and lower bounds use MPI_ADDRESS_KIND
  real(real64) :: start_time
  integer :: mpierr
  call mpi_init(mpierr)
  call mpi_comm_size(mpi_comm_world, num_procs, mpierr)
  call mpi_comm_rank(mpi_comm_world, rank, mpierr)

!size of array
  nx=1000
  ny=1000
  nz=1000

  if(rank==0) then
    if(num_procs>nx) then
      print*, "Number of procs should be less than or equal to first dimension of the array"
      call MPI_Abort(mpi_comm_world, 1, mpierr)
    endif
  endif

  start_time=MPI_Wtime()
!allocate in the root rank
  if(rank==0) then
    allocate(array(nx,ny,nz))
    allocate(array_flat(nx*ny*nz))
  else !for other procs allocate with zero size
    allocate(array(0,0,0))
  endif

!assign values to the array
  if(rank==0) then
    do k=1,nz
      do j=1,ny
        do i=1,nx
          array(i,j,k) = (i-1)+(j-1)*nx+(k-1)*nx*ny
        end do
      end do
    end do
!print*, "Before scattering..."
!print*, array
!flatten the 3D array
    forall(k=1:nz, j=1:ny, i=1:nx) array_flat(k+(j-1)*nz+(i-1)*ny*nz)=array(i,j,k)
  endif

!distribute the 3d array among different procs
  call distribute_points(nx, rank, num_procs, str_idx, end_idx)
  local_size = end_idx - str_idx + 1
  local_size_flat = local_size*ny*nz

!allocate local(for each rank) arrays
  allocate(array_local_flat(local_size_flat))
  allocate(array_local(local_size, ny, nz))

!allocate sendcounts and displacements arrays for the scatter
  allocate(sendcounts(num_procs), displacements(num_procs))

!gather displacements and sendcounts for all ranks
  call MPI_Allgather(str_idx, 1, MPI_INTEGER, displacements, 1, MPI_INTEGER, &
    MPI_COMM_WORLD, mpierr)
  call MPI_Allgather(local_size, 1, MPI_INTEGER, sendcounts, 1, &
    MPI_INTEGER, MPI_COMM_WORLD, mpierr)

!total sendcounts and displacements
  sendcounts = sendcounts*ny*nz
  displacements = displacements - 1 !Array index starts with 0 in MPI (C)
  displacements = displacements*ny*nz

!scatter the flattened array among procs
  call MPI_Scatterv(array_flat, sendcounts, displacements, MPI_DOUBLE_PRECISION, &
    array_local_flat, local_size*ny*nz, MPI_DOUBLE_PRECISION, 0, MPI_COMM_WORLD, &
    mpierr)

!form 3D array from flattened local array
  forall(k=1:nz, j=1:ny, i=1:local_size) array_local(i,j,k) = &
    array_local_flat(k+(j-1)*nz+(i-1)*ny*nz)

!print*, "Scattered array: ", rank
!print*, array_local
  if(rank==0) then
    print*, "Time taken by flatten and scatter: ", MPI_Wtime()-start_time
  endif

  call MPI_Barrier(mpi_comm_world, mpierr)
!deallocate(array_flat, array_local_flat)

  start_time=MPI_Wtime()
!Scattering using an MPI subarray type
  sizes = [nx, ny, nz]
  recv_sizes=[local_size, ny, nz]
  sub_sizes = [1, ny, nz]
  starts = [0, 0, 0]
  recv_starts = [0, 0, 0]

!to get extent of MPI_DOUBLE_PRECISION
  call MPI_Type_get_extent(MPI_DOUBLE_PRECISION, lb, extent, mpierr)

!create a mpi subarray data type for sending data
  call MPI_Type_create_subarray(3, sizes, sub_sizes, starts, &
    MPI_ORDER_FORTRAN, MPI_DOUBLE_PRECISION, send_type, mpierr)
  lb_resize=0
!resize the send subarray for starting at correct location for next send
  call MPI_Type_create_resized(send_type, lb_resize, extent, &
    resize_send_type, mpierr)
  call MPI_Type_commit(resize_send_type, mpierr)

!create a mpi subarray data type for receiving data
  call MPI_Type_create_subarray(3, recv_sizes, sub_sizes, recv_starts, &
    MPI_ORDER_FORTRAN, MPI_DOUBLE_PRECISION, recv_type, mpierr)

!resize the receive subarray for starting at correct location for next receive
  call MPI_Type_create_resized(recv_type, lb_resize, extent, &
    resize_recv_type, mpierr)
  call MPI_Type_commit(resize_recv_type, mpierr)

!sendcounts and displacement for sending and receiving subarrays
  sendcounts=sendcounts/(ny*nz)
  displacements = displacements/(ny*nz)

  if(rank==0) then
    print*, "Time taken for creating MPI type subarrays: ", MPI_Wtime()-start_time
  endif

  call MPI_Barrier(mpi_comm_world, mpierr)
  start_time=MPI_Wtime()
!scatter the subarrays
  call MPI_Scatterv(array, sendcounts, displacements, resize_send_type, &
    array_local, local_size, resize_recv_type, 0, MPI_COMM_WORLD, mpierr)

  if(rank==0) then
    print*, "Time taken for scattering using MPI type subarrays: ", MPI_Wtime()-start_time
  endif
  call MPI_Barrier(mpi_comm_world, mpierr)
!print the scattered array
!print*, "Scattered array with subarray: ", rank
!print*, array_local

!do some computations on the scattered local arrays
  array_local = array_local+1

  call MPI_Barrier(mpi_comm_world, mpierr)
  start_time=MPI_Wtime()
!Gather the local arrays to global (array) using the same subarrays
  call MPI_Gatherv(array_local, local_size, resize_recv_type, array, &
    sendcounts, displacements, resize_send_type, 0, MPI_COMM_WORLD, mpierr)

  if(rank==0) then
    print*, "Time taken by MPI_Type_create_subarray Gathering: ", MPI_Wtime()-start_time
  endif

!if(rank==0) then
! print*, "Gathered array: ------------------"
! print*, array
!endif
  call MPI_Finalize(mpierr)



  contains

  subroutine distribute_points(npts, rank, size, start_idx, end_idx)
    implicit none

    integer, intent(in) :: npts, size, rank
    integer, intent(out) :: start_idx, end_idx
    integer :: pts_per_proc

    pts_per_proc = npts/size

    if(rank < mod(npts, size)) then
      pts_per_proc=pts_per_proc + 1
    end if

    if(rank < mod(npts, size)) then
      start_idx = rank * pts_per_proc + 1
      end_idx = (rank + 1) * pts_per_proc
    else
      start_idx = mod(npts, size) + rank*pts_per_proc + 1
      end_idx = mod(npts, size) + (rank + 1) * pts_per_proc
    end if

  end subroutine distribute_points
end program ex_scatterv
fortran mpi distributed-computing intel-oneapi intel-mpi
1 Answer

There are several reasons why MPI derived datatypes can be slower than user-level pack-and-send operations.

I have explored this in https://arxiv.org/abs/1809.10778.
  • Unlike plain buffers, data described by a derived datatype is not streamed straight from memory: it is read, written into a compact buffer, and then read again for sending. (With an expensive network card it may actually be streamed straight from memory, but don't count on it.) So using derived types can cost you bandwidth; see the sketch after this list.
  • MPI may not send the whole message in one go, for instance because it is reluctant to create really large internal buffers.
  • If you do the send in a timed loop, you run into the fact that MPI is stateless, so it will repeatedly allocate and release its internal buffers.
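
To make the first bullet concrete, here is a minimal two-rank sketch (not from the answer; the program name, the sizes, and variable names such as packed are illustrative). It sends one strided x-slab of an (nx,ny,nz) array twice: once directly with a subarray datatype, and once after copying it by hand into a contiguous buffer, which is essentially what the flatten-and-scatter approach in the question does. The derived-type path forces the MPI library to perform an equivalent pack internally.

program pack_vs_datatype
  use mpi
  use iso_fortran_env, only : real64
  implicit none
  integer, parameter :: nx=256, ny=256, nz=256
  real(real64), allocatable :: array(:,:,:), packed(:)
  integer :: rank, num_procs, slab_type, mpierr, j, k
  real(real64) :: t

  call mpi_init(mpierr)
  call mpi_comm_rank(mpi_comm_world, rank, mpierr)
  call mpi_comm_size(mpi_comm_world, num_procs, mpierr)
  if(num_procs < 2) call mpi_abort(mpi_comm_world, 1, mpierr)

!full array only on the sender; the contiguous buffer on both ranks
  if(rank==0) then
    allocate(array(nx,ny,nz))
    array = 1.0_real64
  else
    allocate(array(0,0,0))
  endif
  allocate(packed(ny*nz))

!subarray type describing the strided slab array(1,:,:)
  call MPI_Type_create_subarray(3, [nx,ny,nz], [1,ny,nz], [0,0,0], &
    MPI_ORDER_FORTRAN, MPI_DOUBLE_PRECISION, slab_type, mpierr)
  call MPI_Type_commit(slab_type, mpierr)

!1) send the strided slab directly with the derived type
  t = MPI_Wtime()
  if(rank==0) then
    call MPI_Send(array, 1, slab_type, 1, 0, MPI_COMM_WORLD, mpierr)
  else if(rank==1) then
    call MPI_Recv(packed, ny*nz, MPI_DOUBLE_PRECISION, 0, 0, &
      MPI_COMM_WORLD, MPI_STATUS_IGNORE, mpierr)
  endif
  if(rank==0) print*, "derived-type send: ", MPI_Wtime()-t

!2) user-level packing: copy the slab into a contiguous buffer, then send it
  t = MPI_Wtime()
  if(rank==0) then
    forall(k=1:nz, j=1:ny) packed(j+(k-1)*ny) = array(1,j,k)
    call MPI_Send(packed, ny*nz, MPI_DOUBLE_PRECISION, 1, 1, MPI_COMM_WORLD, mpierr)
  else if(rank==1) then
    call MPI_Recv(packed, ny*nz, MPI_DOUBLE_PRECISION, 0, 1, &
      MPI_COMM_WORLD, MPI_STATUS_IGNORE, mpierr)
  endif
  if(rank==0) print*, "copy + contiguous send: ", MPI_Wtime()-t

  call MPI_Type_free(slab_type, mpierr)
  call MPI_Finalize(mpierr)
end program pack_vs_datatype

On many systems the explicit copy plus contiguous send is the cheaper of the two, which mirrors the bandwidth penalty described in the first bullet; the exact ratio depends on the MPI implementation and the network hardware.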

In your specific case, as some commenters have pointed out, the irregular layout of your data may also be making your Scatterv inefficient.
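
As an aside, one way to remove that irregularity (a sketch under the assumption that you are free to change the decomposition direction; this is not stated in the answer) is to split the array along its last dimension instead of the first. In Fortran's column-major order each rank's block array(:,:,k0:k1) is then a single contiguous run of memory, so MPI_Scatterv needs only plain MPI_DOUBLE_PRECISION counts and displacements, and neither a derived type nor a flattening copy.

program ex_scatterv_lastdim
  use mpi
  use iso_fortran_env, only : real64
  implicit none
  integer, parameter :: nx=100, ny=100, nz=100
  real(real64), allocatable :: array(:,:,:), array_local(:,:,:)
  integer, allocatable :: sendcounts(:), displacements(:)
  integer :: rank, num_procs, local_nz, r, mpierr

  call mpi_init(mpierr)
  call mpi_comm_rank(mpi_comm_world, rank, mpierr)
  call mpi_comm_size(mpi_comm_world, num_procs, mpierr)

!block distribution of nz, same rule as distribute_points in the question
  allocate(sendcounts(num_procs), displacements(num_procs))
  do r = 0, num_procs-1
    sendcounts(r+1) = nz/num_procs
    if(r < mod(nz, num_procs)) sendcounts(r+1) = sendcounts(r+1) + 1
  end do
  displacements(1) = 0
  do r = 2, num_procs
    displacements(r) = displacements(r-1) + sendcounts(r-1)
  end do
  local_nz = sendcounts(rank+1)

!convert plane counts to element counts: each z-plane holds nx*ny doubles
  sendcounts = sendcounts*nx*ny
  displacements = displacements*nx*ny

  if(rank==0) then
    allocate(array(nx,ny,nz))
    array = 0.0_real64
  else
    allocate(array(0,0,0))
  endif
  allocate(array_local(nx,ny,local_nz))

!each rank receives one contiguous block; no subarray type, no extra copy
  call MPI_Scatterv(array, sendcounts, displacements, MPI_DOUBLE_PRECISION, &
    array_local, nx*ny*local_nz, MPI_DOUBLE_PRECISION, 0, MPI_COMM_WORLD, mpierr)

  call MPI_Finalize(mpierr)
end program ex_scatterv_lastdim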
