我正在使用下面的代码对总体(population)进行分层随机抽样,各层的目标样本量已按性别和地区预先确定。可以看出,相同的代码重复了三次(每次都用 rbind 把结果追加到样本中)。有没有办法编写一个循环或函数来重复这个过程,这样我就不必把同一段代码写好几遍?
library(tidyverse)
# Synthetic population: the four region/sex combinations below, repeated
# 50,000 times (200,000 rows in total).
population <- data.frame(
  region = rep(c("North", "South", "North", "South"), times = 50000),
  sex = rep(c("Male", "Female", "Female", "Male"), times = 50000)
)

# Sampling targets: `total` is the number of rows to draw from the stratum
# identified by `region` and `sex`.
targets <- data.frame(
  region = c("North", "South", "North", "South"),
  sex = c("Male", "Female", "Female", "Male"),
  total = c(20, 25, 20, 30)
)
# Stratified sample built by hand, one stratum (row of `targets`) at a time.
# `n` is the index of the `targets` row currently being drawn.
n <- 0

# Stratum 1: this draw starts the sample.
n <- n + 1
sample <- population %>%
  filter(region == targets$region[[n]], sex == targets$sex[[n]]) %>%
  slice_sample(n = targets$total[[n]])

# Stratum 2: draw and append to the sample.
n <- n + 1
sample <- rbind(
  sample,
  population %>%
    filter(region == targets$region[[n]], sex == targets$sex[[n]]) %>%
    slice_sample(n = targets$total[[n]])
)

# Stratum 3: draw and append to the sample.
n <- n + 1
sample <- rbind(
  sample,
  population %>%
    filter(region == targets$region[[n]], sex == targets$sex[[n]]) %>%
    slice_sample(n = targets$total[[n]])
)

# Stratum 4: draw and append to the sample.
n <- n + 1
sample <- rbind(
  sample,
  population %>%
    filter(region == targets$region[[n]], sex == targets$sex[[n]]) %>%
    slice_sample(n = targets$total[[n]])
)
感谢 Harry Smith 的帮助,我找到了答案。只需对他的代码稍作调整:
# Stratified random sample: one draw per row of `targets`, where each row
# names a region/sex stratum and the number of rows to take from it.
# The per-stratum samples are collected in a list and stacked at the end.

# Number of strata, taken from `targets` instead of a hard-coded 4 so the
# loop stays correct when strata are added or removed.
n <- nrow(targets)

# Preallocate the result list rather than growing it inside the loop.
sample_list <- vector("list", n)

# seq_len(n) is empty when n == 0, whereas 1:n would iterate over c(1, 0).
for (i in seq_len(n)) {
  sample <- population %>%
    filter(
      region == targets$region[[i]],
      sex == targets$sex[[i]]
    ) %>%
    slice_sample(n = targets$total[[i]])
  sample_list[[i]] <- sample
}

# Combine the per-stratum samples into one data frame.
a <- bind_rows(sample_list)
可以像下面这样写:它把每一层抽样得到的数据框(data frame)保存到一个列表中。
# Draw one random sample per row of `targets` (a region/sex stratum and the
# number of rows to take from it) and keep each sample as a list element.

# Number of strata, taken from `targets` instead of a hard-coded 4.
n <- nrow(targets)

# Preallocate the result list rather than growing it inside the loop.
sample_list <- vector("list", n)

# seq_len(n) is empty when n == 0, whereas 1:n would iterate over c(1, 0).
for (i in seq_len(n)) {
  # The original wrapped this pipeline in rbind() with a single argument,
  # which binds nothing; the wrapper is dropped.
  sample <- population %>%
    filter(
      region == targets$region[[i]],
      sex == targets$sex[[i]]
    ) %>%
    slice_sample(n = targets$total[[i]])
  sample_list[[i]] <- sample
}
假设数据中标识分层的指示变量为 h。然后通过循环逐层迭代,根据该指示变量提取相应的观测值。
# Load the data and set up the per-stratum sample sizes.
# NOTE(review): the original call `csv.()` is not an R function; read.csv()
# is presumably what was meant. "data.csv" is a placeholder path -- replace
# it with the real file.
data <- read.csv("data.csv")

# NOTE(review): `a`, `b`, `c` are placeholders for the desired sample size of
# each stratum -- replace them with actual numbers, one per stratum.
sizes <- c(a, b, c)

# Stratum indicator: column `h` of the data.
indicator <- data$h

# Overall sample size and each stratum's share of it.
total_size <- sum(sizes)
ratio <- sizes / total_size
先创建一个空容器来存储最终样本,然后用循环遍历每一层 i,并根据该层所占的比例计算当前层所需的样本量。
# Stratified sample from `data`: for each stratum i, draw
# `slice_sample_size` rows at random (without replacement) and stack the
# per-stratum draws into `sample1`.
# NOTE(review): assumes the strata in `indicator` are coded 1, 2, ...,
# length(sizes), in the same order as `sizes` -- confirm against the data.

# Collect the per-stratum draws in a preallocated list and bind once at the
# end, instead of growing a data frame with rbind() on every iteration.
sample_parts <- vector("list", length(sizes))

# seq_along() is empty-safe, unlike 1:length(sizes) when `sizes` is empty.
for (i in seq_along(sizes)) {
  # Rows belonging to stratum i. The original subset on `ind`, which is never
  # defined; the indicator vector is named `indicator`. which() keeps NA
  # comparisons from producing all-NA rows.
  slice <- data[which(indicator == i), ]
  # Sample size for this stratum, derived from its share of the total.
  slice_sample_size <- round(ratio[i] * total_size)
  # Random row positions within the stratum; this errors if the stratum has
  # fewer rows than requested.
  slice_sample <- slice[sample.int(nrow(slice), slice_sample_size), ]
  sample_parts[[i]] <- slice_sample
}

# Stack all strata; yields NULL when there are no strata, as before.
sample1 <- do.call(rbind, sample_parts)
或者对数据框 df 采用同样的做法:
slice_data <- df[df$h == slice[i], ]
for (i in 1:n_slice) {
slice_data <- df[df$g1 == strata[i], ]
slice_sample <- slice_data[sample.int(nrow(slice_data), sample_sizes[i]), ]