运行具有多个函数的代码时，R 中的replicate() 最快替代方法是什么？

Question

我有这段代码，其中使用六个函数进行蒙特卡洛模拟，然后计算我使用

replicate()

 指定的数据帧的每一行的结果概率分布的第 95 个百分位数。我的实际数据框有 11,628 行。我使用

system.time()

 来估计假设我只有 5 行需要多长时间。结果表明，每 5 行的运行时间为 50 秒，因此我将忍受 32 个小时以上的时间来处理 11,628 行。是否有更快、更有效的替代方案来执行我的代码？

这是我的数据框

final.distrib

。

install.packages("partitions")
library(partitions)

# Identify building stock distribution in percentage
# Get 6 elements that sum up to 1
z <- compositions(n = 20, m = 6, include.zero = FALSE)

# Convert partition to matrix
z <- as.matrix.partition(z)

# Transpose matrix
z <- t(z)

# Multiply by 0.05 so that elements take any value from 0 to 1 by incremental step of 0.05
z <- z * 0.05
distrib.percent <- data.frame(z)
colnames(distrib.percent) <- c("LWA", "LWB", "SCA", "SCB", "RCA", "RCB")

# Identify household count without rounding off
# Multiply the percentages of distribution with the total number of households (971 households)
distrib.count <- data.frame(z*971)
colnames(distrib.count) <- c("LWA", "LWB", "SCA", "SCB", "RCA", "RCB")

#Round off values, making sure each row sums up to 971.
smart.round <- function(x) {
  y <- floor(x)
  indices <- tail(order(x-y), round(sum(x)) - sum(y))
  y[indices] <- y[indices] + 1
  y
}

final.distrib <- data.frame(t(apply(distrib.count, 1, smart.round)))
colnames(final.distrib) <- c("LWA", "LWB", "SCA", "SCB", "RCA", "RCB")

这是具有六个功能的代码：

install.packages("triangle")
library(triangle)

function.LWA <- function() {
  n_sim <- 10000
  LWA.cost <- rtriangle(n_sim, a = 0.9*8407.29, b = 1.1*8407.29, c = 8407.29) 
  random.area <- rlnorm(100000, meanlog = 3.61, sdlog = 0.82) 
  LWA.area <- head(random.area[random.area >= 5 & random.area <= 200], n_sim) 
  LWA.loss <- rtriangle (n_sim, a = 0.0890, b = 0.6655, c = 0.3212) 
  LWA.cost*LWA.area*LWA.loss
}

function.LWB <- function() {
  n_sim <- 10000
  LWB.cost <- rtriangle(n_sim, a = 0.9*10328.43, b = 1.1*10328.43, c = 10328.43)
  random.area <- rlnorm(100000, meanlog = 3.61, sdlog = 0.82) 
  LWB.area <- head(random.area[random.area >= 5 & random.area <= 200], n_sim)
  LWB.loss <- rtriangle(n_sim, a = 0.0423, b = 0.5572, c = 0.2038)
  LWB.cost*LWB.area*LWB.loss
}

function.SCA <- function() {
  n_sim <- 10000
  SCA.cost <- rtriangle(n_sim, a = 0.9*12055.57, b = 1.1*12055.57, c = 12055.57)
  random.area <- rlnorm(100000, meanlog = 3.61, sdlog = 0.82) 
  SCA.area <- head(random.area[random.area >= 5 & random.area <= 200], n_sim)
  SCA.loss <- rtriangle(n_sim, a = 0.0453, b = 0.5665, c = 0.2136)
  SCA.cost*SCA.area*SCA.loss
}

function.SCB <- function() {
  n_sim <- 10000
  SCB.cost <- rtriangle(n_sim, a = 0.9*17935.86, b = 1.1*17935.86, c = 17935.86)
  random.area <- rlnorm(100000, meanlog = 3.61, sdlog = 0.82) 
  SCB.area <- head(random.area[random.area >= 5 & random.area <= 200], n_sim)
  SCB.loss <- rtriangle(n_sim, a = 0.0645, b = 0.6361, c = 0.2791)
  SCB.cost*SCB.area*SCB.loss
}

function.RCA <- function() {
  n_sim <- 10000
  RCA.cost <- rtriangle(n_sim, a = 0.9*19363.57, b = 1.1*19363.57, c = 19363.57)
  random.area <- rlnorm(100000, meanlog = 3.61, sdlog = 0.82) 
  RCA.area <- head(random.area[random.area >= 5 & random.area <= 200], n_sim)
  RCA.loss <- rtriangle(n_sim, a = 0.0263, b = 0.5009, c = 0.1537)
  RCA.cost*RCA.area*RCA.loss
}

function.RCB <- function() {
  n_sim <- 10000
  RCB.cost <- rtriangle(n_sim, a = 0.9*25182.71, b = 1.1*25182.71, c = 25182.71)
  random.area <- rlnorm(100000, meanlog = 3.61, sdlog = 0.82) 
  RCB.area <- head(random.area[random.area >= 5 & random.area <= 200], n_sim)
  RCB.loss <- rtriangle(n_sim, a = 0.0583, b = 0.6078, c = 0.2519)
  RCB.cost*RCB.area*RCB.loss
}

# Function to calculate the direct loss and return the 95th percentile
calculate_percentile <- function(row) {
  set.seed(1) # for comparability across scenarios
  direct.loss <- (rowSums(replicate(n=row["LWA"], function.LWA()))) +
                 (rowSums(replicate(n=row["LWB"], function.LWB()))) +
                 (rowSums(replicate(n=row["SCA"], function.SCA()))) +
                 (rowSums(replicate(n=row["SCB"], function.SCB()))) +
                 (rowSums(replicate(n=row["RCA"], function.RCA()))) +
                 (rowSums(replicate(n=row["RCB"], function.RCB())))
  quantile(direct.loss, probs = 0.95) # Specify 95th percentile
}

# Apply the function to each row of the data frame (containing house count) and store the results
loss.PEISVII.95p <- setNames(data.frame(apply(final.distrib, 1, calculate_percentile)), "PEIS VII loss")

谢谢你。

Answer 1

一些改进以及并行处理可以将 16 个核心的时间缩短到大约一个小时。

首先，不要使用

replicate

，而是根据需要获取每行的成本/损失/面积样本总数，并在执行

rowsums

 之前将其放入矩阵中。

其次，您生成的对数正态样本比截断所需的要多。一些快速计算表明，仅 3% 的过采样就足以几乎保证 5 到 200 之间有足够的样本：

# probability that a sample will fall outside [5, 200]
(p <- diff(plnorm(c(5, 200), 3.61, 0.82)))
#> [1] 0.9728997
# minimum number of log-normal samples needed
min_n <- min(final.distrib)*1e4
# upper bound probability of not generating enough samples within [5, 200]
-expm1(pbinom(min_n, 1.03*min_n, p, FALSE, TRUE)*6*nrow(final.distrib))
#> [1] 1.550331e-13

完整代码（超过150行）：

n_sim <- 1e4
abc <- array(c(0.9*8407.29, 1.1*8407.29, 8407.29,
               0.0890, 0.6655, 0.3212,
               0.9*10328.43, 1.1*10328.43, 10328.43,
               0.0423, 0.5572, 0.2038,
               0.9*12055.57, 1.1*12055.57, 12055.57,
               0.0453, 0.5665, 0.2136,
               0.9*17935.86, 1.1*17935.86, 17935.86,
               0.0645, 0.6361, 0.2791,
               0.9*19363.57, 1.1*19363.57, 19363.57,
               0.0263, 0.5009, 0.1537,
               0.9*25182.71, 1.1*25182.71, 25182.71,
               0.0583, 0.6078, 0.2519), c(3, 2, 6))

rtri <- function(n, a, b, c) {
  fc <- (c - a)/(b - a)
  U <- runif(n)
  blna <- U < fc
  r <- numeric(n)
  r[blna] <- a + sqrt(U[blna]*(b - a)*(c - a))
  r[!blna] <- b - sqrt((1 - U[!blna])*(b - a)*(b - c))
  r
}

f.random.area <- function(n_rep) {
  n <- n_sim*n_rep
  x <- rlnorm(n*1.029, 3.61, 0.82)
  matrix(x[abs(x - 102.5) < 97.5][1:n], n_sim, n_rep)
}

f <- function(i, n_rep) {
  n <- n_sim*n_rep
  cost <- rtri(n, abc[1, 1, i], abc[2, 1, i], abc[3, 1, i])
  loss <- rtri(n, abc[1, 2, i], abc[2, 2, i], abc[3, 2, i])
  rowsums(cost*loss*f.random.area(n_rep))
}

loss.quantile <- function(n_rep, p = 0.95) {
  quantile(rowsums(mapply(\(i) f(i, n_rep[i]), seq_along(n_rep))), probs = p)
}

library(parallel)

cl <- makeCluster(detectCores() - 1) # 15 cores
clusterExport(cl, c("f", "loss.quantile", "f.random.area", "abc", "n_sim", "rtri"))
invisible(clusterEvalQ(cl, library(Rfast))) # for `rowsums`
# loss.PEISVII.95p <- unlist(parLapply(cl, asplit(final.distrib, 1), loss.quantile)) # full calculation
# time 150 rows
system.time(loss.PEISVII.95p <- unlist(
  parLapply(cl, asplit(final.distrib[1:150,], 1), loss.quantile)
))
#>    user  system elapsed 
#>    0.03    0.00   46.66

运行具有多个函数的代码时，R 中的replicate() 最快替代方法是什么？

问题描述投票：0回答：1

1个回答

最新问题

运行具有多个函数的代码时，R 中的replicate() 最快替代方法是什么？

问题描述 投票：0回答：1

1个回答

最新问题

问题描述投票：0回答：1