使用@turbo优化 Julia 函数

问题描述 投票:0回答:0

我正在 Julia 中编写流体动力学 3D 代码,并且我正在努力使其速度相当快(与我的研究生导师用于他的研究的 F90 代码相当)。现在,我设法使我用 Julia 编写的 1D 比我导师的等效 C++ 更快(但比 F90 慢),但是当我将维度提升到 2D 时,特别是在计算网格上的变量通量。

一维中的原始函数是:

function p2f(F,UPr)
    F[1] = UPr[1]*UPr[2]
    F[2] = UPr[1]*UPr[2]*UPr[2] + UPr[3]
    F[3] = UPr[2]*(0.5*UPr[1]*(UPr[2]^2) + 
        gamma/(gamma-1)*UPr[3])
end
function HLLFluxes(U,F,UPrim)
    FL = zeros(neq)
    FR = zeros(neq)
    wavespeed(UPrim)
    for i in 1:nx+1
        if sl[i] >= 0
            @views p2f(F[:,i],UPrim[:,i])
        elseif sr[i] <= 0
            @views p2f(F[:,i],UPrim[:,i+1])
        else
            @views p2f(FL,UPrim[:,i])
            @views p2f(FR,UPrim[:,i+1])
            @views F[:,i] = (sr[i]*FL[:] .- sl[i]*FR[:] .+ sl[i]*sr[i]*(U[:,i+1] .- U[:,i]))./(
            sr[i]-sl[i])
        end
    end
end

p2f
函数只是为了使函数更具可读性,但是当我尝试将宏
@turbo
放在外部循环上时,它不起作用,因为它既不理解if也不理解函数
p2f
,这样就能够使用我想出的宏:

function HLLFluxes(U,F,UPrim)
    wavespeed(UPrim)
    A = sl .>= 0
    B = .!A .* (sr .<= 0)
    C = .!A .* .!B
    @turbo for i in 1:nx+1
        flr = UPrim[1,i]*UPrim[2,i]
        flv = UPrim[1,i]*UPrim[2,i]*UPrim[2,i] + UPrim[3,i]
        flp = UPrim[2,i]*(0.5*UPrim[1,i]*UPrim[2,i]*UPrim[2,i] + gamma/(gamma-1)*UPrim[3,i])
        
        frr = UPrim[1,i+1]*UPrim[2,i+1]
        frv = UPrim[1,i+1]*UPrim[2,i+1]*UPrim[2,i+1] + UPrim[3,i+1]
        frp = UPrim[2,i+1]*(0.5*UPrim[1,i+1]*UPrim[2,i+1]*UPrim[2,i+1] + gamma/(gamma-1)*UPrim[3,i+1])
        
        F[1,i] = flr*A[i] + frr*B[i] + C[i]*(sr[i]*flr - sl[i]*frr + sl[i]*sr[i]*(U[1,i+1] - 
                U[1,i]))/(sr[i]-sl[i])
        F[2,i] = flv*A[i] + frv*B[i] + C[i]*(sr[i]*flv - sl[i]*frv + sl[i]*sr[i]*(U[2,i+1] - 
                U[2,i]))/(sr[i]-sl[i])
        F[3,i] = flp*A[i] + frp*B[i] + C[i]*(sr[i]*flp - sl[i]*frp + sl[i]*sr[i]*(U[3,i+1] - 
                U[3,i]))/(sr[i]-sl[i])
    end
end

这有点难看,但从

2.026 ms
28586
分配到
26.840 μs
101
分配(用 @benchmark 测量),所以当我尝试将函数缩放到 2D 时,执行时间只减少了一半并且分配的数量实际上从 4M 增加到 10M,而且由于某种原因,它在模拟中引入了随时间缩放的错误,并在一段时间后开始输出 NaN。这是 2D 函数:

function HLLFluxes(U,F,G,UPrim)
    wavespeed(UPrim)
    @turbo begin
    AA .= sl .>= 0
    BB .= .!AA .* sr .<= 0
    CC .= .!AA .* .!BB
    AAA .= sd .>= 0
    BBB .= .!AAA .* su .<= 0
    CCC .= .!AAA .* .!BBB
    end
    @turbo for i in 1:nx+1
        for j in 1:ny+1
            flr = UPrim[1,i,j]*UPrim[2,i,j]
            flvx = UPrim[1,i,j]*UPrim[2,i,j]*UPrim[2,i,j] + UPrim[4,i,j]
            flvy = UPrim[1,i,j]*UPrim[2,i,j]*UPrim[3,i,j]
            flp = UPrim[2,i,j]*(0.5*UPrim[1,i,j]*(UPrim[2,i,j]^2 + UPrim[3,i,j]^2) + 
                gamma/(gamma-1)*UPrim[4,i,j])
            frr = UPrim[1,i+1,j]*UPrim[2,i+1,j]
            frvx = UPrim[1,i+1,j]*UPrim[2,i+1,j]*UPrim[2,i+1,j] + UPrim[4,i+1,j]
            frvy = UPrim[1,i+1,j]*UPrim[2,i+1,j]*UPrim[3,i+1,j]
            frp = UPrim[2,i+1,j]*(0.5*UPrim[1,i+1,j]*(UPrim[2,i+1,j]^2 + UPrim[3,i+1,j]^2) + 
                gamma/(gamma-1)*UPrim[4,i+1,j])
            F[1,i,j] = AA[i,j]*flr + BB[i,j]*frr + CC[i,j]*(sr[i,j]*flr - sl[i,j]*frr + sl[i,j]*sr[i,j]*(
                    U[1,i+1,j] - U[1,i,j]))/(sr[i,j]-sl[i,j])
            F[2,i,j] = AA[i,j]*flvx + BB[i,j]*frvx + CC[i,j]*(sr[i,j]*flvx - sl[i,j]*frvx + sl[i,j]*sr[i,j]*(
                    U[2,i+1,j] - U[2,i,j]))/(sr[i,j]-sl[i,j])
            F[3,i,j] = AA[i,j]*flvy + BB[i,j]*frvy + CC[i,j]*(sr[i,j]*flvy - sl[i,j]*frvy + sl[i,j]*sr[i,j]*(
                    U[3,i+1,j] - U[3,i,j]))/(sr[i,j]-sl[i,j])
            F[4,i,j] = AA[i,j]*flp + BB[i,j]*frp + CC[i,j]*(sr[i,j]*flp - sl[i,j]*frp + sl[i,j]*sr[i,j]*(
                    U[4,i+1,j] - U[4,i,j]))/(sr[i,j]-sl[i,j])

            glr = UPrim[1,i,j]*UPrim[3,i,j]
            glvx = UPrim[1,i,j]*UPrim[2,i,j]*UPrim[3,i,j]
            glvy = UPrim[1,i,j]*UPrim[3,i,j]*UPrim[3,i,j] + UPrim[4,i,j]
            glp = UPrim[3,i,j]*(0.5*UPrim[1,i,j]*(UPrim[2,i,j]^2 + UPrim[3,i,j]^2) + 
                gamma/(gamma-1)*UPrim[4,i,j])
            grr = UPrim[1,i,j+1]*UPrim[3,i,j+1]
            grvx = UPrim[1,i,j+1]*UPrim[2,i,j+1]*UPrim[3,i,j+1]
            grvy = UPrim[1,i,j+1]*UPrim[3,i,j+1]*UPrim[3,i,j+1] + UPrim[4,i,j+1]
            grp = UPrim[3,i,j+1]*(0.5*UPrim[1,i,j+1]*(UPrim[2,i,j+1]^2 + UPrim[3,i,j+1]^2) + 
                gamma/(gamma-1)*UPrim[4,i,j+1])
            G[1,i,j] = AAA[i,j]*glr + BBB[i,j]*grr + CCC[i,j]*(su[i,j]*glr - sd[i,j]*grr + sd[i,j]*su[i,j]*(
                    U[1,i,j+1] - U[1,i,j]))/(su[i,j]-sd[i,j])
            G[2,i,j] = AAA[i,j]*glvx + BBB[i,j]*grvx + CCC[i,j]*(su[i,j]*glvx - sd[i,j]*grvx + sd[i,j]*su[i,j]*(
                    U[2,i,j+1] - U[2,i,j]))/(su[i,j]-sd[i,j])
            G[3,i,j] = AAA[i,j]*glvy + BBB[i,j]*grvy + CCC[i,j]*(su[i,j]*glvy - sd[i,j]*grvy + sd[i,j]*su[i,j]*(
                    U[3,i,j+1] - U[3,i,j]))/(su[i,j]-sd[i,j])
            G[4,i,j] = AAA[i,j]*glp + BBB[i,j]*grp + CCC[i,j]*(su[i,j]*glp - sd[i,j]*grp + sd[i,j]*su[i,j]*(
                    U[4,i,j+1] - U[4,i,j]))/(su[i,j]-sd[i,j])
        end
    end
end

所以我的问题是,

@turbo
只减少一半时间并进行更多分配是正常的吗?为什么它会在模拟中引入奇怪的错误?那么有没有办法可以进一步提高执行速度呢?如有任何帮助,我们将不胜感激。

optimization julia nested-loops numerical-computing
© www.soinside.com 2019 - 2024. All rights reserved.