replicate()
指定的数据帧的每一行的结果概率分布的第 95 个百分位数。我的实际数据框有 11,628 行。我使用
system.time()
来估计假设我只有 5 行需要多长时间。结果表明,每 5 行的运行时间为 50 秒,因此我将忍受 32 个小时以上的时间来处理 11,628 行。是否有更快、更有效的替代方案来执行我的代码?这是我的数据框
final.distrib
。
install.packages("partitions")
library(partitions)
# Identify building stock distribution in percentage
# Get 6 elements that sum up to 1
z <- compositions(n = 20, m = 6, include.zero = FALSE)
# Convert partition to matrix
z <- as.matrix.partition(z)
# Transpose matrix
z <- t(z)
# Multiply by 0.05 so that elements take any value from 0 to 1 by incremental step of 0.05
z <- z * 0.05
distrib.percent <- data.frame(z)
colnames(distrib.percent) <- c("LWA", "LWB", "SCA", "SCB", "RCA", "RCB")
# Identify household count without rounding off
# Multiply the percentages of distribution with the total number of households (971 households)
distrib.count <- data.frame(z*971)
colnames(distrib.count) <- c("LWA", "LWB", "SCA", "SCB", "RCA", "RCB")
#Round off values, making sure each row sums up to 971.
smart.round <- function(x) {
y <- floor(x)
indices <- tail(order(x-y), round(sum(x)) - sum(y))
y[indices] <- y[indices] + 1
y
}
final.distrib <- data.frame(t(apply(distrib.count, 1, smart.round)))
colnames(final.distrib) <- c("LWA", "LWB", "SCA", "SCB", "RCA", "RCB")
这是具有六个功能的代码:
install.packages("triangle")
library(triangle)
function.LWA <- function() {
n_sim <- 10000
LWA.cost <- rtriangle(n_sim, a = 0.9*8407.29, b = 1.1*8407.29, c = 8407.29)
random.area <- rlnorm(100000, meanlog = 3.61, sdlog = 0.82)
LWA.area <- head(random.area[random.area >= 5 & random.area <= 200], n_sim)
LWA.loss <- rtriangle (n_sim, a = 0.0890, b = 0.6655, c = 0.3212)
LWA.cost*LWA.area*LWA.loss
}
function.LWB <- function() {
n_sim <- 10000
LWB.cost <- rtriangle(n_sim, a = 0.9*10328.43, b = 1.1*10328.43, c = 10328.43)
random.area <- rlnorm(100000, meanlog = 3.61, sdlog = 0.82)
LWB.area <- head(random.area[random.area >= 5 & random.area <= 200], n_sim)
LWB.loss <- rtriangle(n_sim, a = 0.0423, b = 0.5572, c = 0.2038)
LWB.cost*LWB.area*LWB.loss
}
function.SCA <- function() {
n_sim <- 10000
SCA.cost <- rtriangle(n_sim, a = 0.9*12055.57, b = 1.1*12055.57, c = 12055.57)
random.area <- rlnorm(100000, meanlog = 3.61, sdlog = 0.82)
SCA.area <- head(random.area[random.area >= 5 & random.area <= 200], n_sim)
SCA.loss <- rtriangle(n_sim, a = 0.0453, b = 0.5665, c = 0.2136)
SCA.cost*SCA.area*SCA.loss
}
function.SCB <- function() {
n_sim <- 10000
SCB.cost <- rtriangle(n_sim, a = 0.9*17935.86, b = 1.1*17935.86, c = 17935.86)
random.area <- rlnorm(100000, meanlog = 3.61, sdlog = 0.82)
SCB.area <- head(random.area[random.area >= 5 & random.area <= 200], n_sim)
SCB.loss <- rtriangle(n_sim, a = 0.0645, b = 0.6361, c = 0.2791)
SCB.cost*SCB.area*SCB.loss
}
function.RCA <- function() {
n_sim <- 10000
RCA.cost <- rtriangle(n_sim, a = 0.9*19363.57, b = 1.1*19363.57, c = 19363.57)
random.area <- rlnorm(100000, meanlog = 3.61, sdlog = 0.82)
RCA.area <- head(random.area[random.area >= 5 & random.area <= 200], n_sim)
RCA.loss <- rtriangle(n_sim, a = 0.0263, b = 0.5009, c = 0.1537)
RCA.cost*RCA.area*RCA.loss
}
function.RCB <- function() {
n_sim <- 10000
RCB.cost <- rtriangle(n_sim, a = 0.9*25182.71, b = 1.1*25182.71, c = 25182.71)
random.area <- rlnorm(100000, meanlog = 3.61, sdlog = 0.82)
RCB.area <- head(random.area[random.area >= 5 & random.area <= 200], n_sim)
RCB.loss <- rtriangle(n_sim, a = 0.0583, b = 0.6078, c = 0.2519)
RCB.cost*RCB.area*RCB.loss
}
# Function to calculate the direct loss and return the 95th percentile
calculate_percentile <- function(row) {
set.seed(1) # for comparability across scenarios
direct.loss <- (rowSums(replicate(n=row["LWA"], function.LWA()))) +
(rowSums(replicate(n=row["LWB"], function.LWB()))) +
(rowSums(replicate(n=row["SCA"], function.SCA()))) +
(rowSums(replicate(n=row["SCB"], function.SCB()))) +
(rowSums(replicate(n=row["RCA"], function.RCA()))) +
(rowSums(replicate(n=row["RCB"], function.RCB())))
quantile(direct.loss, probs = 0.95) # Specify 95th percentile
}
# Apply the function to each row of the data frame (containing house count) and store the results
loss.PEISVII.95p <- setNames(data.frame(apply(final.distrib, 1, calculate_percentile)), "PEIS VII loss")
谢谢你。
首先,不要使用
replicate
,而是根据需要获取每行的成本/损失/面积样本总数,并在执行
rowsums
之前将其放入矩阵中。其次,您生成的对数正态样本比截断所需的要多。一些快速计算表明,仅 3% 的过采样就足以几乎保证 5 到 200 之间有足够的样本:
# probability that a sample will fall outside [5, 200]
(p <- diff(plnorm(c(5, 200), 3.61, 0.82)))
#> [1] 0.9728997
# minimum number of log-normal samples needed
min_n <- min(final.distrib)*1e4
# upper bound probability of not generating enough samples within [5, 200]
-expm1(pbinom(min_n, 1.03*min_n, p, FALSE, TRUE)*6*nrow(final.distrib))
#> [1] 1.550331e-13
完整代码(超过150行):
n_sim <- 1e4
abc <- array(c(0.9*8407.29, 1.1*8407.29, 8407.29,
0.0890, 0.6655, 0.3212,
0.9*10328.43, 1.1*10328.43, 10328.43,
0.0423, 0.5572, 0.2038,
0.9*12055.57, 1.1*12055.57, 12055.57,
0.0453, 0.5665, 0.2136,
0.9*17935.86, 1.1*17935.86, 17935.86,
0.0645, 0.6361, 0.2791,
0.9*19363.57, 1.1*19363.57, 19363.57,
0.0263, 0.5009, 0.1537,
0.9*25182.71, 1.1*25182.71, 25182.71,
0.0583, 0.6078, 0.2519), c(3, 2, 6))
rtri <- function(n, a, b, c) {
fc <- (c - a)/(b - a)
U <- runif(n)
blna <- U < fc
r <- numeric(n)
r[blna] <- a + sqrt(U[blna]*(b - a)*(c - a))
r[!blna] <- b - sqrt((1 - U[!blna])*(b - a)*(b - c))
r
}
f.random.area <- function(n_rep) {
n <- n_sim*n_rep
x <- rlnorm(n*1.029, 3.61, 0.82)
matrix(x[abs(x - 102.5) < 97.5][1:n], n_sim, n_rep)
}
f <- function(i, n_rep) {
n <- n_sim*n_rep
cost <- rtri(n, abc[1, 1, i], abc[2, 1, i], abc[3, 1, i])
loss <- rtri(n, abc[1, 2, i], abc[2, 2, i], abc[3, 2, i])
rowsums(cost*loss*f.random.area(n_rep))
}
loss.quantile <- function(n_rep, p = 0.95) {
quantile(rowsums(mapply(\(i) f(i, n_rep[i]), seq_along(n_rep))), probs = p)
}
library(parallel)
cl <- makeCluster(detectCores() - 1) # 15 cores
clusterExport(cl, c("f", "loss.quantile", "f.random.area", "abc", "n_sim", "rtri"))
invisible(clusterEvalQ(cl, library(Rfast))) # for `rowsums`
# loss.PEISVII.95p <- unlist(parLapply(cl, asplit(final.distrib, 1), loss.quantile)) # full calculation
# time 150 rows
system.time(loss.PEISVII.95p <- unlist(
parLapply(cl, asplit(final.distrib[1:150,], 1), loss.quantile)
))
#> user system elapsed
#> 0.03 0.00 46.66