不想删除问题,因为附加了评论 - 这对我来说是一个非常简单的错误
我有一个函数内部的代码块。它返回一个列表的列表(即,对象
dfTemp
将是一个列表的列表)。当我浏览到我的函数时,我可以看到所有内部列表的第一个值是 NA
。
dfTemp <- parLapply(cl, 1:X,
function(j) {
library(matrixStats) #matrixStats needs to be loaded within each worker node
result <- colMeans2(
mapply(
function(i) rowMeans2(
matrix(
sample(sample(bdf[, 1], i), B * i, replace = TRUE), B, i),
na.rm = TRUE),
N1:N2),
na.rm = TRUE)
return(result)
}
)
这是
dfTemp
的结构 - 请注意所有列表中第一行的 NA 值:
Browse[1]> str(dfTemp)
List of 100
$ : num [1:15] NA 0.598 0.948 0.179 0.284 ...
$ : num [1:15] NA 0.355 0.834 0.312 0.243 ...
$ : num [1:15] NA 0.425 0.521 0.361 0.296 ...
$ : num [1:15] NA 0.8166 0.0939 0.155 0.351 ...
$ : num [1:15] NA 0 0.197 0.413 0.219 ...
我遇到的另一个问题(我认为可能相关)是,当我尝试将列表列表转换为数据框时,它通过连接所有值来命名列。
data.frame': 15 obs. of 2 variables:
$ c.NA..0.598119527934818..0.947653884049345..0.178908501367397..: num NA 0.598 0.948 0.179 0.284 ...
$ c.NA..0.354694348429857..0.833605540582925..0.312442033011144..: num NA 0.355 0.834 0.312 0.243 ...
如何解决这两个(假定相关)问题?
#### LIBRARIES ####
library(parallel)
library(tidyverse)
library(matrixStats)
#### BOOTSTRAPPING FUNCTIONS ####
bootstrap_function <- function(column, N1, N2, B, X, param) {
bdf <- as.data.frame(column)
colnames(bdf) <- names(column)
cl <- makeCluster(detectCores() - 1) # Enable parallel processing with n-1 cores
clusterExport(cl, c("bdf", "N1", "N2", "B", "X", "param"), envir = environment()) # Export the objects for parallel processing
dfTemp <- switch(
param,
"mean" = {
parLapply(cl, 1:X,
function(j) {
library(matrixStats) #matrixStats needs to be loaded within each worker node
result <- colMeans2(
mapply(
function(i) rowMeans2(
matrix(
sample(sample(bdf[, 1], i), B * i, replace = TRUE), B, i),
na.rm = TRUE),
N1:N2),
na.rm = TRUE)
return(result)
}
)
},
"sd" = {
parLapply(cl, 1:X,
function(j) {
browser()
library(matrixStats) #matrixStats needs to be loaded within each worker node
result <- colSds(
mapply(
function(i) rowSds(
matrix(
sample(sample(bdf[, 1], i), B * i, replace = TRUE), B, i),
na.rm = TRUE),
N1:N2),
na.rm = TRUE)
return(result)
}
)
}
)
stopCluster(cl)
browser() #use for debugging...
# Convert to dataframe and then name columns
dfTemp <- as.data.frame(dfTemp)
#colnames(dfTemp) <- paste0("est", 1:X)
# Add identifiers
dfTemp <- cbind("Group" = rep(names(column), nrow(dfTemp)), dfTemp) # Add column with group identifier
dfTemp <- cbind("n" = row.names(dfTemp), dfTemp) # Add columnw with sample size identifer
return(dfTemp)
}
#### PREP THE DATA ####
df <- data.frame(
"1.A"= c(10.7,9.7,10.7,11.9,10,10.5,9,10.9,9.6,11.8,8.7,11.9,10.7,10.4,12.7),
"1.B"= c(11.7,10.2,10.9,11.4,10.3,9.8,9.7,10.2,10.6,8.6,9.1,9.8,13.3,9.8,8.3),
"2.A"= c(11.6,10.6,9.9,10,11.3,10.4,11.2,8.3,9.2,11.2,11.3,11.2,11,8,9.2),
"2.B"= c(10.7,11.5,10.1,8.9,11.5,9.5,12.1,10.7,8.2,10.2,9.6,10.4,8.3,11.1,9.4)
)
#### RUN THE ANALYSIS ####
# Replace any zero values with NA
df[df == 0] <- NA
# define lower and upper bounds for sample sizes to estimate
N1 <- 1
N2 <- nrow(df)
# set number of bootstrap replicates
B <- 100
# set number of times to repeat the estimate
X <- 100
# define which parameter to estimate - functions for "mean" and "sd" are supported
param <- "sd"
# Apply the bootstrap function to each column of the data frame
dfBoot <- as.data.frame(
do.call(
rbind, lapply(seq_along(df), function(x) {
browser() # used to enter break mode inside lapply function call for debugging
colname <- names(df)[x]
print(paste("Processing column:", colname))
bootstrap_function(column = df[x], N1 = N1, N2 = N2, B = B, X = X, param = param)
})
)
)
好吧,我解决了我自己的问题。我将代码转换为使用 apply,然后在每个内部函数中添加浏览器语句,以便我可以看到发生了什么。
长话短说,问题在于尝试计算样本量 1 的标准差。样本下限至少需要为 2。我犯了愚蠢的错误。简单修复:
# define lower and upper bounds for sample sizes to estimate
N1 <- 2 # needs to be at least 2
N2 <- nrow(df)