我的数据集包含821049个变量和18列。我想抽取9列进行分层抽样。这些是“ BASKETS_NZ”,“ PIS”,“ PIS_AP”,“ PIS_DV”,“ PIS_PL”,“ PIS_SDV”,“ PIS_SHOPS”,“ PIS_SR”,“ QUANTITY”。我的分层变量是ID = 1:821049。如何选择变量的间隔?如何设置采样大小?
dpt(rbind(head(WKA_ohneJB,10),tail(WKA_ohneJB,10)))
structure(list(X = c(1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L,
821039L, 821040L, 821041L, 821042L, 821043L, 821044L, 821045L,
821046L, 821047L, 821048L), BASKETS_NZ = c(1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L),
LOGONS = c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 0L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), PIS = c(71L, 39L, 50L, 4L,
13L, 4L, 30L, 65L, 13L, 31L, 111L, 33L, 3L, 46L, 11L, 8L,
17L, 68L, 65L, 15L), PIS_AP = c(14L, 2L, 4L, 0L, 0L, 0L,
1L, 0L, 2L, 1L, 13L, 0L, 0L, 2L, 1L, 0L, 3L, 8L, 0L, 1L),
PIS_DV = c(3L, 19L, 4L, 1L, 0L, 0L, 6L, 2L, 2L, 3L, 38L,
8L, 0L, 5L, 2L, 0L, 1L, 0L, 3L, 2L), PIS_PL = c(0L, 5L, 8L,
2L, 0L, 0L, 0L, 24L, 0L, 6L, 32L, 8L, 0L, 0L, 4L, 0L, 0L,
0L, 0L, 0L), PIS_SDV = c(18L, 0L, 11L, 0L, 0L, 0L, 0L, 0L,
0L, 1L, 6L, 0L, 0L, 13L, 0L, 0L, 1L, 15L, 1L, 0L), PIS_SHOPS = c(3L,
24L, 13L, 3L, 0L, 0L, 6L, 28L, 2L, 11L, 71L, 16L, 2L, 5L,
6L, 0L, 1L, 0L, 3L, 2L), PIS_SR = c(19L, 0L, 14L, 0L, 0L,
0L, 2L, 23L, 0L, 3L, 6L, 0L, 0L, 20L, 0L, 0L, 3L, 32L, 1L,
0L), QUANTITY = c(13L, 2L, 18L, 1L, 14L, 1L, 4L, 2L, 5L,
1L, 5L, 2L, 2L, 4L, 1L, 3L, 2L, 8L, 17L, 8L), WKA = c(1L,
1L, 1L, 1L, 1L, 1L, 0L, 0L, 1L, 0L, 1L, 1L, 1L, 1L, 1L, 1L,
0L, 0L, 1L, 1L), NEW_CUST = c(0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 1L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L), EXIST_CUST = c(1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 0L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L), WEB_CUST = c(1L, 0L, 0L, 0L, 1L, 1L, 0L,
1L, 1L, 1L, 1L, 1L, 1L, 0L, 0L, 0L, 0L, 0L, 0L, 1L), MOBILE_CUST = c(0L,
1L, 1L, 1L, 0L, 0L, 1L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
1L, 0L, 1L, 0L), TABLET_CUST = c(0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 1L, 1L, 1L, 0L, 1L, 0L, 0L),
LOGON_CUST_STEP2 = c(0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L)), row.names = c(1L,
2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L, 821039L, 821040L, 821041L,
821042L, 821043L, 821044L, 821045L, 821046L, 821047L, 821048L
), class = "data.frame")
要解决此问题,是从每个组中抽取分层样本。将数据分组在一起的潜在方法是将9列粘贴在一起或使用dplyr的groupby函数。
使用解决方案是此问题How to get around error "factor has new levels" in cross-validation glm?,并以dplyr样式进行更新。
此dplyr_stratified函数将采用所需的采样率和任意数量的列,并将返回包含采样行的数据帧。请参阅下面的示例以获取2列。
set.seed(1)
x <- rnorm(n = 100)
y <- rep(x = c("A","B"), times = c(50,50))
z <- rep(x = c("D","E","F"), times = c(33,33,34))
data <- data.frame(x, y=sample(y, replace = TRUE), z=sample(z, replace=TRUE))
#optional tag row for later identification:
data$rowid<-1:nrow(data)
dplyr_stratified <- function(df, percent, ...){
columns<-enquos(...)
#group then sample each group
out<-df %>% group_by(!!!columns) %>% slice( sample(1:n(), percent*n()))
}
testgroup<-dplyr_stratified(data, 0.2, z, y)
testgroup