运行具有多个函数的代码时,R 中的replicate() 最快替代方法是什么?

问题描述 投票:0回答:1
我有这段代码,其中使用六个函数进行蒙特卡洛模拟,然后计算我使用

replicate()

 指定的数据帧的每一行的结果概率分布的第 95 个百分位数。我的实际数据框有 11,628 行。我使用 
system.time()
 来估计假设我只有 5 行需要多长时间。结果表明,每 5 行的运行时间为 50 秒,因此我将忍受 32 个小时以上的时间来处理 11,628 行。是否有更快、更有效的替代方案来执行我的代码?

这是我的数据框

final.distrib

install.packages("partitions") library(partitions) # Identify building stock distribution in percentage # Get 6 elements that sum up to 1 z <- compositions(n = 20, m = 6, include.zero = FALSE) # Convert partition to matrix z <- as.matrix.partition(z) # Transpose matrix z <- t(z) # Multiply by 0.05 so that elements take any value from 0 to 1 by incremental step of 0.05 z <- z * 0.05 distrib.percent <- data.frame(z) colnames(distrib.percent) <- c("LWA", "LWB", "SCA", "SCB", "RCA", "RCB") # Identify household count without rounding off # Multiply the percentages of distribution with the total number of households (971 households) distrib.count <- data.frame(z*971) colnames(distrib.count) <- c("LWA", "LWB", "SCA", "SCB", "RCA", "RCB") #Round off values, making sure each row sums up to 971. smart.round <- function(x) { y <- floor(x) indices <- tail(order(x-y), round(sum(x)) - sum(y)) y[indices] <- y[indices] + 1 y } final.distrib <- data.frame(t(apply(distrib.count, 1, smart.round))) colnames(final.distrib) <- c("LWA", "LWB", "SCA", "SCB", "RCA", "RCB")
这是具有六个功能的代码:

install.packages("triangle") library(triangle) function.LWA <- function() { n_sim <- 10000 LWA.cost <- rtriangle(n_sim, a = 0.9*8407.29, b = 1.1*8407.29, c = 8407.29) random.area <- rlnorm(100000, meanlog = 3.61, sdlog = 0.82) LWA.area <- head(random.area[random.area >= 5 & random.area <= 200], n_sim) LWA.loss <- rtriangle (n_sim, a = 0.0890, b = 0.6655, c = 0.3212) LWA.cost*LWA.area*LWA.loss } function.LWB <- function() { n_sim <- 10000 LWB.cost <- rtriangle(n_sim, a = 0.9*10328.43, b = 1.1*10328.43, c = 10328.43) random.area <- rlnorm(100000, meanlog = 3.61, sdlog = 0.82) LWB.area <- head(random.area[random.area >= 5 & random.area <= 200], n_sim) LWB.loss <- rtriangle(n_sim, a = 0.0423, b = 0.5572, c = 0.2038) LWB.cost*LWB.area*LWB.loss } function.SCA <- function() { n_sim <- 10000 SCA.cost <- rtriangle(n_sim, a = 0.9*12055.57, b = 1.1*12055.57, c = 12055.57) random.area <- rlnorm(100000, meanlog = 3.61, sdlog = 0.82) SCA.area <- head(random.area[random.area >= 5 & random.area <= 200], n_sim) SCA.loss <- rtriangle(n_sim, a = 0.0453, b = 0.5665, c = 0.2136) SCA.cost*SCA.area*SCA.loss } function.SCB <- function() { n_sim <- 10000 SCB.cost <- rtriangle(n_sim, a = 0.9*17935.86, b = 1.1*17935.86, c = 17935.86) random.area <- rlnorm(100000, meanlog = 3.61, sdlog = 0.82) SCB.area <- head(random.area[random.area >= 5 & random.area <= 200], n_sim) SCB.loss <- rtriangle(n_sim, a = 0.0645, b = 0.6361, c = 0.2791) SCB.cost*SCB.area*SCB.loss } function.RCA <- function() { n_sim <- 10000 RCA.cost <- rtriangle(n_sim, a = 0.9*19363.57, b = 1.1*19363.57, c = 19363.57) random.area <- rlnorm(100000, meanlog = 3.61, sdlog = 0.82) RCA.area <- head(random.area[random.area >= 5 & random.area <= 200], n_sim) RCA.loss <- rtriangle(n_sim, a = 0.0263, b = 0.5009, c = 0.1537) RCA.cost*RCA.area*RCA.loss } function.RCB <- function() { n_sim <- 10000 RCB.cost <- rtriangle(n_sim, a = 0.9*25182.71, b = 1.1*25182.71, c = 25182.71) random.area <- rlnorm(100000, meanlog = 3.61, sdlog = 0.82) RCB.area <- head(random.area[random.area >= 5 & random.area <= 200], n_sim) RCB.loss <- rtriangle(n_sim, a = 0.0583, b = 0.6078, c = 0.2519) RCB.cost*RCB.area*RCB.loss } # Function to calculate the direct loss and return the 95th percentile calculate_percentile <- function(row) { set.seed(1) # for comparability across scenarios direct.loss <- (rowSums(replicate(n=row["LWA"], function.LWA()))) + (rowSums(replicate(n=row["LWB"], function.LWB()))) + (rowSums(replicate(n=row["SCA"], function.SCA()))) + (rowSums(replicate(n=row["SCB"], function.SCB()))) + (rowSums(replicate(n=row["RCA"], function.RCA()))) + (rowSums(replicate(n=row["RCB"], function.RCB()))) quantile(direct.loss, probs = 0.95) # Specify 95th percentile } # Apply the function to each row of the data frame (containing house count) and store the results loss.PEISVII.95p <- setNames(data.frame(apply(final.distrib, 1, calculate_percentile)), "PEIS VII loss")
谢谢你。

r simulation montecarlo replicate
1个回答
0
投票
一些改进以及并行处理可以将 16 个核心的时间缩短到大约一个小时。

首先,不要使用

replicate

,而是根据需要获取每行的成本/损失/面积样本总数,并在执行 
rowsums
 之前将其放入矩阵中。

其次,您生成的对数正态样本比截断所需的要多。一些快速计算表明,仅 3% 的过采样就足以几乎保证 5 到 200 之间有足够的样本:

# probability that a sample will fall outside [5, 200] (p <- diff(plnorm(c(5, 200), 3.61, 0.82))) #> [1] 0.9728997 # minimum number of log-normal samples needed min_n <- min(final.distrib)*1e4 # upper bound probability of not generating enough samples within [5, 200] -expm1(pbinom(min_n, 1.03*min_n, p, FALSE, TRUE)*6*nrow(final.distrib)) #> [1] 1.550331e-13
完整代码(超过150行):

n_sim <- 1e4 abc <- array(c(0.9*8407.29, 1.1*8407.29, 8407.29, 0.0890, 0.6655, 0.3212, 0.9*10328.43, 1.1*10328.43, 10328.43, 0.0423, 0.5572, 0.2038, 0.9*12055.57, 1.1*12055.57, 12055.57, 0.0453, 0.5665, 0.2136, 0.9*17935.86, 1.1*17935.86, 17935.86, 0.0645, 0.6361, 0.2791, 0.9*19363.57, 1.1*19363.57, 19363.57, 0.0263, 0.5009, 0.1537, 0.9*25182.71, 1.1*25182.71, 25182.71, 0.0583, 0.6078, 0.2519), c(3, 2, 6)) rtri <- function(n, a, b, c) { fc <- (c - a)/(b - a) U <- runif(n) blna <- U < fc r <- numeric(n) r[blna] <- a + sqrt(U[blna]*(b - a)*(c - a)) r[!blna] <- b - sqrt((1 - U[!blna])*(b - a)*(b - c)) r } f.random.area <- function(n_rep) { n <- n_sim*n_rep x <- rlnorm(n*1.029, 3.61, 0.82) matrix(x[abs(x - 102.5) < 97.5][1:n], n_sim, n_rep) } f <- function(i, n_rep) { n <- n_sim*n_rep cost <- rtri(n, abc[1, 1, i], abc[2, 1, i], abc[3, 1, i]) loss <- rtri(n, abc[1, 2, i], abc[2, 2, i], abc[3, 2, i]) rowsums(cost*loss*f.random.area(n_rep)) } loss.quantile <- function(n_rep, p = 0.95) { quantile(rowsums(mapply(\(i) f(i, n_rep[i]), seq_along(n_rep))), probs = p) } library(parallel) cl <- makeCluster(detectCores() - 1) # 15 cores clusterExport(cl, c("f", "loss.quantile", "f.random.area", "abc", "n_sim", "rtri")) invisible(clusterEvalQ(cl, library(Rfast))) # for `rowsums` # loss.PEISVII.95p <- unlist(parLapply(cl, asplit(final.distrib, 1), loss.quantile)) # full calculation # time 150 rows system.time(loss.PEISVII.95p <- unlist( parLapply(cl, asplit(final.distrib[1:150,], 1), loss.quantile) )) #> user system elapsed #> 0.03 0.00 46.66
    
© www.soinside.com 2019 - 2024. All rights reserved.