我想为两个因子变量的每个组合生成一些简单的计算,并将结果存储在数据框中。数据如下:
# Sample data: 45 rows with factor columns SPECIES and DRAINAGE, a numeric
# CATCH_QTY, and two marker columns. TAGGED is "T" or NA, RECAP is "RC" or NA,
# and RECAP is filled on exactly the rows where TAGGED is missing.
df <- local({
  species_names <- c("SWAN", "DUCK", "GOOS", "PASS", "FALC")
  species_counts <- c(10, 4, 12, 9, 10)
  n_rows <- sum(species_counts)
  # Rows that carry "RC" in RECAP and NA in TAGGED.
  recap_rows <- c(7, 10, 21, 22, 32, 37, 45)

  data.frame(
    SPECIES = factor(rep(species_names, times = species_counts)),
    DRAINAGE = factor(rep(c("Central", "Upper", "West"), times = 15)),
    CATCH_QTY = c(
      1, 1, 2, 5, 6, 1, 2, 1, 1, 1,
      1, 1, 3, 1, 1, 2, 1, 1, 2, 2,
      rep(1, 25)
    ),
    TAGGED = replace(rep("T", n_rows), recap_rows, NA),
    RECAP = replace(rep(NA_character_, n_rows), recap_rows, "RC")
  )
})
这是函数:
# Summarise catch and tag counts for a single SPECIES/DRAINAGE combination.
#
# Arguments:
#   dat   - data frame with SPECIES, DRAINAGE, CATCH_QTY, TAGGED and RECAP columns
#   yr    - value placed in the first column of the result
#   spp   - value compared against SPECIES
#   drain - value compared against DRAINAGE
#
# The last expression is an assignment, so the cbind() result is returned
# invisibly.
myfunction <- function(dat, yr, spp, drain){
# keep only the rows matching the requested combination
dat <- dat %>% filter(SPECIES == spp, DRAINAGE == drain)
# NOTE(review): `<<-` writes estimatea outside this function's scope on every call
# NumCaught sums CATCH_QTY (ignoring NA); NewTags and Recaps count non-NA values
estimatea <<-
dat %>%
summarise(NumCaught = sum(CATCH_QTY, na.rm = T),
NewTags = sum(!is.na(TAGGED)),
Recaps = sum(!is.na(RECAP)),
TotTags = sum(NewTags+Recaps))
# the four summary values are bound, without column names, after yr, spp and drain
dataTest1 <- cbind(yr, spp, drain, estimatea$NumCaught, estimatea$NewTags,
estimatea$Recaps, estimatea$TotTags)
}
我主要在尝试嵌套 for 循环,但一直没能把输出存储到数据框中:我迭代的变量是因子而不是数字,因此 Stack Exchange 上的许多现有答案都不适用。这里(here)有一个关于迭代因子的答案,但它并未说明如何存储输出。
我尝试的一些例子:
# First attempt: nested for loops writing into `out`.
# `out` starts as an empty list, which has no dimensions.
out <- list()
# seq_along() yields the integer positions 1, 2, ... of the levels, so i and j
# are numbers rather than the level names stored in SPECIES and DRAINAGE
for (i in seq_along(levels(df$SPECIES))) {
for (j in seq_along(levels(df$DRAINAGE))) {
# two subscripts are used here on the dimensionless list `out`
out[i,j] <- myfunction(df, "2023", i, j)
}
}
Error in out[i, j] <- myfunction(df, "2023", i, j) :
incorrect number of subscripts on matrix
# Second attempt: the same nested loops with both subscripts shifted by one.
# NOTE(review): `out` is presumably still the list created earlier, so the
# assignment below again uses two subscripts on an object without dimensions.
for (i in seq_along(levels(df$SPECIES))) {
for (j in seq_along(levels(df$DRAINAGE))) {
# i and j are still integer positions, not level names
out[i+1,j+1] <- myfunction(df, "2023", i, j)
}
}
Error in out[i + 1, j + 1] <- myfunction(df, "2023", i, j) :
incorrect number of subscripts on matrix
我还考虑了一些非 for 循环选项,例如,
# Non-loop attempts.
# Build every pairing of the SPECIES and DRAINAGE values, then de-duplicate,
# drop rows containing NA and rename the columns to spp and drain.
combos <- expand.grid(df$SPECIES, df$DRAINAGE) %>% distinct() %>%
drop_na() %>% rename(spp = Var1, drain = Var2)
# passes the whole spp and drain columns in one call instead of one pair at a time
test <- myfunction(df, "2023", combos$spp, combos$drain) #generates incorrect results
# mapply() supplies its arguments by position, so x lands in `dat` and each
# drain value lands in `yr`
sapply(combos$spp, function(x) mapply(myfunction,x,combos$drain))
# apply() hands each column of combos to myfunction as its first argument (`dat`)
apply(combos, 2, FUN = myfunction)
Error in UseMethod("filter") :
no applicable method for 'filter' applied to an object of class "character"
理想情况下,输出数据框看起来像这样:
# Illustrative target layout: one row per species/drainage pair, with the
# year first and the four summary values in V4, V5, v6 and V7.
# NOTE(review): the third row's values (4, 3, 1, 4) are what the sample data
# give for GOOS/West rather than GOOS/Upper -- confirm the intended drain.
desired_out <- data.frame(
  yr = "2023",  # length-1 value, recycled to all three rows
  spp = rep(c("DUCK", "GOOS"), times = c(2, 1)),
  drain = c("West", "Central", "Upper"),
  V4 = c(1, 3, 4),
  V5 = c(1, 1, 3),
  v6 = c(0, 0, 1),
  V7 = c(1, 1, 4)
)
要获得所需的输出,使用 dplyr 函数即可完成您需要的操作,无需借助循环或 *apply 系列函数:
# One summary row per SPECIES/DRAINAGE combination present in df.
# Grouping is done per operation with `.by`, so the result carries no leftover
# grouping and keeps the groups in order of first appearance.
df %>%
  summarise(
    NumCaught = sum(CATCH_QTY, na.rm = TRUE),  # total catch, ignoring NA
    NewTags = sum(!is.na(TAGGED)),             # rows with a TAGGED value
    Recaps = sum(!is.na(RECAP)),               # rows with a RECAP value
    TotTags = NewTags + Recaps,
    .by = c(SPECIES, DRAINAGE)
  )
#> SPECIES DRAINAGE NumCaught NewTags Recaps TotTags
#> 1 SWAN Central 9 2 2 4
#> 2 SWAN Upper 8 3 0 3
#> 3 SWAN West 4 3 0 3
#> 4 DUCK Upper 2 2 0 2
#> 5 DUCK West 1 1 0 1
#> 6 DUCK Central 3 1 0 1
#> 7 GOOS West 4 3 1 4
#> 8 GOOS Central 6 3 1 4
#> 9 GOOS Upper 5 4 0 4
#> 10 PASS West 3 3 0 3
#> 11 PASS Central 3 3 0 3
#> 12 PASS Upper 3 2 1 3
#> 13 FALC West 4 3 1 4
#> 14 FALC Central 3 2 1 3
#> 15 FALC Upper 3 3 0 3
如果您的实际需求更复杂,确实需要某种循环,我推荐使用 *apply 系列函数。在您的示例中,嵌套的 lapply() 可以得到相同的结果:
# Summarise one SPECIES/DRAINAGE combination as a data frame.
# The function simply returns the summarised data frame; there is no need to
# update `estimatea` outside the function.
#
# Arguments:
#   dat   - data frame with SPECIES, DRAINAGE, CATCH_QTY, TAGGED and RECAP columns
#   yr    - accepted so existing calls keep working; not used in the body
#   spp   - SPECIES value to keep
#   drain - DRAINAGE value to keep
#
# Returns the columns SPECIES, DRAINAGE, NumCaught, NewTags, Recaps, TotTags.
myfunction <- function(dat, yr, spp, drain) {
  dat %>%
    filter(SPECIES == spp, DRAINAGE == drain) %>%
    summarise(
      NumCaught = sum(CATCH_QTY, na.rm = TRUE),
      NewTags = sum(!is.na(TAGGED)),
      Recaps = sum(!is.na(RECAP)),
      # NewTags and Recaps are already single values per group at this point
      TotTags = NewTags + Recaps,
      .by = c(SPECIES, DRAINAGE)
    )
}
# Walk every SPECIES level and, inside it, every DRAINAGE level, collecting
# one summary data frame per pair. The level names themselves are iterated
# (levels(), not seq_along()) because the character labels are needed, not
# their integer positions.
outputs <- lapply(
  levels(df$SPECIES),
  function(species_level) {
    summarise_drainage <- function(drainage_level) {
      myfunction(df, "2023", species_level, drainage_level)
    }
    lapply(levels(df$DRAINAGE), summarise_drainage)
  }
)
# Stack the nested list of data frames into a single data frame
do.call(bind_rows, outputs)
#> SPECIES DRAINAGE NumCaught NewTags Recaps TotTags
#> 1 DUCK Central 3 1 0 1
#> 2 DUCK Upper 2 2 0 2
#> 3 DUCK West 1 1 0 1
#> 4 FALC Central 3 2 1 3
#> 5 FALC Upper 3 3 0 3
#> 6 FALC West 4 3 1 4
#> 7 GOOS Central 6 3 1 4
#> 8 GOOS Upper 5 4 0 4
#> 9 GOOS West 4 3 1 4
#> 10 PASS Central 3 3 0 3
#> 11 PASS Upper 3 2 1 3
#> 12 PASS West 3 3 0 3
#> 13 SWAN Central 9 2 2 4
#> 14 SWAN Upper 8 3 0 3
#> 15 SWAN West 4 3 0 3
通常建议谨慎使用 <<- 运算符,因为它会修改函数作用域之外的(全局)变量。在 R 中,使用 *apply 系列函数通常更清晰。