我有一个数据集,想按列把它分成大小相等的两部分。例如,如果有 10 列,我想得到两个各含 5 列的数据集。可以看到,这可以通过
choose(10, 5) / 2 = 126
种不同的方式来完成。我怎样才能确保每次得到的分割都互不相同?
我已经走到这一步了。但是
sample()
函数不会给我我想要的东西。
# the data
data <- structure(list(V1 = c(0, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 0, 0,
0, 0, 1, 0, 1, 1, 1), V2 = c(0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0,
0, 0, 1, 1, 1, 0, 1, 1, 1), V3 = c(0, 0, 1, 0, 0, 1, 0, 0, 1,
0, 1, 1, 0, 0, 1, 1, 0, 1, 1, 0), V4 = c(0, 0, 1, 1, 0, 0, 0,
1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0)), class = "data.frame", row.names = c(NA,
-20L))

# Draw ONE random split of the columns into two disjoint halves.
n1 <- ncol(data)
# Number of columns in the first half. The dataset can have an odd number of
# columns; in that case the first half receives the extra column. The size is
# computed explicitly so that sample() is never handed a fractional size.
n2 <- ceiling(n1 / 2)
x1 <- colnames(data)

# Plain call instead of a scalar ifelse() used only for its assignment side
# effect (which also printed a stray value at top level).
MM <- sample(x1, n2)

# First half: the sampled columns, in the sampled order.
s1 <- data[MM]
s1

# Second half: every column that was not sampled, in the original order.
s2 <- data[!(x1 %in% MM)]
s2

# s1 and s2 share no columns; together they cover every column of data.
mylist <- list(s1, s2)
mylist
我想把所有可能的对半拆分结果都收集到一个列表中。
您可以使用
gtools::combinations
获取所有组合并使用它来生成分割:
library(gtools)
#' Enumerate every distinct split of a data frame's columns into two halves
#'
#' For an even number of columns both halves have `ncol(dat) / 2` columns and
#' each unordered pair of halves is returned exactly once. For an odd number
#' of columns `grp1` receives the extra column.
#'
#' @param dat A data frame (or matrix) with at least two columns.
#' @return An unnamed list with one element per split. Each element is a list
#'   with components `grp1` and `grp2`, the two column subsets of `dat`
#'   (columns kept in their original order, never dropped to a vector).
split_data <- function(dat) {
  nc <- ncol(dat)
  if (is.null(nc) || nc < 2L) {
    stop("`dat` must have at least two columns", call. = FALSE)
  }

  # Size of grp1; for odd `nc` the first group takes the extra column, which
  # also keeps the `r` argument of combinations() a whole number.
  k <- ceiling(nc / 2)
  sel <- combinations(nc, k)

  if (nc %% 2L == 0L) {
    # With equally sized halves every split shows up twice (once as the
    # selection, once as its complement). combinations() emits the rows in
    # lexicographic order, so the first half of the rows are exactly the sets
    # containing column 1 and their complements make up the second half.
    # drop = FALSE keeps a matrix even when a single row remains (nc == 2).
    sel <- sel[seq_len(nrow(sel) / 2L), , drop = FALSE]
  }
  # For odd `nc` the two groups differ in size, so no selection is the
  # complement of another and every row is a distinct split.

  lapply(seq_len(nrow(sel)), function(i) {
    in_grp1 <- seq_len(nc) %in% sel[i, ]
    # drop = FALSE: a one-column group must stay a data frame.
    list(grp1 = dat[, in_grp1, drop = FALSE],
         grp2 = dat[, !in_grp1, drop = FALSE])
  })
}
# Print the structure of the splits computed for the 4-column example `data`;
# the commented lines that follow show the printed result.
str(split_data(data))
# List of 3
# $ :List of 2
# ..$ grp1:'data.frame': 20 obs. of 2 variables:
# .. ..$ V1: num [1:20] 0 1 0 1 1 1 1 1 1 0 ...
# .. ..$ V2: num [1:20] 0 1 0 0 0 0 1 1 0 0 ...
# ..$ grp2:'data.frame': 20 obs. of 2 variables:
# .. ..$ V3: num [1:20] 0 0 1 0 0 1 0 0 1 0 ...
# .. ..$ V4: num [1:20] 0 0 1 1 0 0 0 1 0 0 ...
# $ :List of 2
# ..$ grp1:'data.frame': 20 obs. of 2 variables:
# .. ..$ V1: num [1:20] 0 1 0 1 1 1 1 1 1 0 ...
# .. ..$ V3: num [1:20] 0 0 1 0 0 1 0 0 1 0 ...
# ..$ grp2:'data.frame': 20 obs. of 2 variables:
# .. ..$ V2: num [1:20] 0 1 0 0 0 0 1 1 0 0 ...
# .. ..$ V4: num [1:20] 0 0 1 1 0 0 0 1 0 0 ...
# $ :List of 2
# ..$ grp1:'data.frame': 20 obs. of 2 variables:
# .. ..$ V1: num [1:20] 0 1 0 1 1 1 1 1 1 0 ...
# .. ..$ V4: num [1:20] 0 0 1 1 0 0 0 1 0 0 ...
# ..$ grp2:'data.frame': 20 obs. of 2 variables:
# .. ..$ V2: num [1:20] 0 1 0 0 0 0 1 1 0 0 ...
# .. ..$ V3: num [1:20] 0 0 1 0 0 1 0 0 1 0 ...