我在列表中有数据集,其中分类已指定为单词或短语。 我想在一个序列中找到最常见的分类。
这是我的数据:
## Create some example data: three data sets that share one `id` sequence.
## Each data set has 17 rows (id = 0, 0.5, ..., 8) and a `class` column of
## labels; the first ten labels are the same in all three data sets.
id <- seq(0, 8, by = 0.5)

# Labels shared by every data set (rows 1-10). The vector is deliberately not
# called `class`, so the name of base::class() is not shadowed.
common_labels <- c(
  "Frequent", "Often", "Occasional", "Often", "Not Seen",
  "Frequent", "Rare", "Occasional", "Very Rare", "Absent"
)

# Column names are given explicitly instead of being derived from the
# names of the variables passed to data.frame().
df <- data.frame(
  id = id,
  class = c(
    common_labels,
    "Frequent", "Frequent", "Very Rare", "Often",
    "Not Seen", "Occasional", "Rare"
  )
)
print(df)

## Another two data sets to form a list:
df1 <- data.frame(
  id = id,
  class = c(
    common_labels,
    "Not Seen", "Occasional", "Rare", "Occasional",
    "Often", "Very Rare", "Occasional"
  )
)
df2 <- data.frame(
  id = id,
  class = c(
    common_labels,
    "Frequent", "Often", "Occasional", "Often",
    "Not Seen", "Frequent", "Rare"
  )
)

## make a list
filez <- list(df, df1, df2)
我得到输出:
> print(df)
id class
1 0.0 Frequent
2 0.5 Often
3 1.0 Occasional
4 1.5 Often
5 2.0 Not Seen
6 2.5 Frequent
7 3.0 Rare
8 3.5 Occasional
9 4.0 Very Rare
10 4.5 Absent
11 5.0 Frequent
12 5.5 Frequent
13 6.0 Very Rare
14 6.5 Often
15 7.0 Not Seen
16 7.5 Occasional
17 8.0 Rare
然后我为类分配一个数值(对我来说,使用数字比使用字符串更有意义)
# Lookup table from class label to numeric code (Frequent = 1 ... Absent = 7).
class_codes <- c("Frequent" = 1, "Often" = 2, "Occasional" = 3, "Rare" = 4,
                 "Very Rare" = 5, "Not Seen" = 6, "Absent" = 7)

# Add a numeric `classnum` column to every data frame in the list.
# - Assign into filez[[i]]: `filez$classnum <- ...` would append a new element
#   to the list instead of adding a column to each data frame.
# - Read the labels from the i-th data set rather than always from `df`.
# - Indexing the named lookup vector needs no extra package; as.character()
#   keeps the lookup by name even if `class` is stored as a factor.
# Labels that are missing from `class_codes` become NA.
for (i in seq_along(filez)) {
  labels <- as.character(filez[[i]]$class)
  filez[[i]]$classnum <- unname(class_codes[labels])
}
我现在该如何找出每个序列分组中最常见的分类?我试过使用 cut 和 aggregate,感觉已经很接近了,但输出中每个分组返回的都是“numeric”这个词。我该怎么做才能让它返回 1-7 之间的数字?
# Break points for binning the ids: (0,2], (2,4], (4,6], (6,8].
new_seq <- seq(0,8,2)
# NOTE(review): the loop index `i` is never used in the body, so every pass
# redefines aggr_func and recomputes the same `final`.
for(i in 1:length(filez)){
# aggr_func: bin the first column of one data frame into the `new_seq`
# intervals and aggregate its third column per interval.
# The parameter `filez` shadows the global list of the same name; inside the
# function it is a single element of that list.
# Presumably column 1 is `id` and column 3 is `classnum` -- TODO confirm.
aggr_func <- function(filez){
filez$group <- cut(filez[,1], new_seq)
# NOTE: base R's mode() reports the storage mode of its argument (for example
# "numeric"), not the most frequent value, which is why every group in the
# output below shows the word "numeric".
output <- aggregate(filez[,3], by = list(filez$group), FUN =mode)
return(output)
}
# Apply aggr_func to each element of the list; `final` holds one result per
# element.
final <- lapply(filez, aggr_func)
}
> print(final[[1]])
Group.1 x
1 (0,2] numeric
2 (2,4] numeric
3 (4,6] numeric
4 (6,8] numeric
使用 lapply 代替 for 循环;另外,正如评论中提到的,我们需要自定义求“众数”(mode)的函数。
# Stack every data frame in the list into a single data frame (row-wise)
out <- do.call(rbind, filez)

# Encode "class" as an integer: each label's 1-based position in this
# ordered vector (labels not listed here become NA)
class_order <- c("Frequent", "Often", "Occasional", "Rare",
                 "Very Rare", "Not Seen", "Absent")
out$classnum <- match(out$class, class_order)
现在求众数,使用这篇帖子中的 2 个函数(Mode 和 Modes):How to find the statistical mode?
# Statistical-mode helpers. Base R's mode() returns the storage mode of an
# object, so the most frequent value has to be computed by hand.

# Mode(): the single most frequent value of x. On ties, the value that
# appears first in x wins.
Mode <- function(x) {
  ux <- unique(x)
  ux[which.max(tabulate(match(x, ux)))]
}

# Modes(): every value of x that reaches the highest frequency, in order of
# first appearance.
Modes <- function(x) {
  ux <- unique(x)
  counts <- tabulate(match(x, ux))
  ux[counts == max(counts)]
}

# Interval each id falls into. id == 0 lies outside (0,2], so its group is NA
# and aggregate() leaves those rows out of the result.
id_group <- cut(out$id, seq(0, 8, 2))

# Group by Mode
aggregate(out["classnum"], list(id_group), FUN = Mode)
# Group.1 classnum
# 1 (0,2] 2
# 2 (2,4] 1
# 3 (4,6] 7
# 4 (6,8] 2

# Group by Modes (returns all tied values per group)
aggregate(out["classnum"], list(id_group), FUN = Modes)
# Group.1 classnum
# 1 (0,2] 2
# 2 (2,4] 1, 4, 3, 5
# 3 (4,6] 7, 1
# 4 (6,8] 2, 3