我有以下数据框:
> test_df
sample pers expl resp
1 1 1 a 1
2 2 2 a 1
3 3 3 a 1
4 4 4 a 1
5 5 5 a 1
6 6 1 b 1
7 7 2 b 1
8 8 3 b 0
9 9 4 b 1
10 10 5 b 1
11 11 1 c 0
12 12 2 c 1
13 13 3 c 0
14 14 4 c 1
15 15 5 c 0
16 16 1 d 1
17 17 2 d 1
18 18 3 d 0
19 19 4 d 1
20 20 5 d 1
21 21 1 e 1
22 22 2 e 1
23 23 3 e 0
24 24 4 e 1
25 25 5 e 1
26 26 1 f 0
27 27 2 f 0
28 28 3 f 0
29 29 4 f 0
30 30 5 f 0
31 31 1 g 1
32 32 2 g 1
33 33 3 g 0
34 34 4 g 0
35 35 5 g 0
36 36 6 a 1
37 37 7 a 1
38 38 8 a 1
39 39 9 a 1
40 40 10 a 1
41 41 6 h 1
42 42 7 h 1
43 43 8 h 1
44 44 9 h 1
45 45 10 h 1
46 46 6 i 1
47 47 7 i 1
48 48 8 i 1
49 49 9 i 1
50 50 10 i 1
51 51 6 d 1
52 52 7 d 1
53 53 8 d 1
54 54 9 d 1
55 55 10 d 1
56 56 6 d2 1
57 57 7 d2 1
58 58 8 d2 1
59 59 9 d2 1
60 60 6 e 1
61 61 7 e 1
62 62 8 e 1
63 63 9 e 1
64 64 10 e 1
65 65 6 e2 1
66 66 7 e2 1
67 67 8 e2 1
68 68 9 e2 1
69 69 10 e2 1
70 70 6 g 1
71 71 7 g 0
72 72 8 g 1
73 73 9 g 1
74 74 10 g 0
我想测试是否有任何统计上的显着差异。响应变量是“resp”,解释变量是“expl”,随机变量是“pers”。 我应用以下命令:
model <- glmer(resp~expl+(1|pers), data=test_df, family=binomial)
但是我收到以下警告消息:
Warning message:
In checkConv(attr(opt, "derivs"), opt$par, ctrl = control$checkConv, :
Hessian is numerically singular: parameters are not uniquely determined
我对建模还比较陌生,我想知道这里的问题是什么。
提前致谢。
Jay 的链接给出了一个很好的答案,但我想我可以添加一些细节。
# Here's your data, except I dropped the `sample` column, as it wasn't used.
tt <- structure(list(pers=structure(c(1L, 2L, 3L, 4L, 5L, 1L, 2L, 3L, 4L, 5L,
1L, 2L, 3L, 4L, 5L, 1L, 2L, 3L, 4L, 5L, 1L, 2L, 3L, 4L, 5L, 1L, 2L, 3L, 4L,
5L, 1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L, 6L, 7L, 8L, 9L, 10L, 6L, 7L, 8L,
9L, 10L, 6L, 7L, 8L, 9L, 10L, 6L, 7L, 8L, 9L, 6L, 7L, 8L, 9L, 10L, 6L, 7L, 8L,
9L, 10L, 6L, 7L, 8L, 9L, 10L), levels=c("1", "2", "3", "4", "5", "6", "7",
"8", "9", "10"), class="factor"), expl=structure(c(1L, 1L, 1L, 1L, 1L, 2L, 2L,
2L, 2L, 2L, 3L, 3L, 3L, 3L, 3L, 4L, 4L, 4L, 4L, 4L, 6L, 6L, 6L, 6L, 6L, 8L,
8L, 8L, 8L, 8L, 9L, 9L, 9L, 9L, 9L, 1L, 1L, 1L, 1L, 1L, 10L, 10L, 10L, 10L,
10L, 11L, 11L, 11L, 11L, 11L, 4L, 4L, 4L, 4L, 4L, 5L, 5L, 5L, 5L, 6L, 6L, 6L,
6L, 6L, 7L, 7L, 7L, 7L, 7L, 9L, 9L, 9L, 9L, 9L), levels=c("a", "b", "c", "d",
"d2", "e", "e2", "f", "g", "h", "i"), class="factor"), resp=c(1L, 1L, 1L, 1L,
1L, 1L, 1L, 0L, 1L, 1L, 0L, 1L, 0L, 1L, 0L, 1L, 1L, 0L, 1L, 1L, 1L, 1L, 0L,
1L, 1L, 0L, 0L, 0L, 0L, 0L, 1L, 1L, 0L, 0L, 0L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 0L, 1L, 1L, 0L)), row.names=c(NA, -74L),
class="data.frame")
如果我们运行
table()
,我们将对您的数据有一个很好的概览。
table(tt)
# , , resp = 0
#
# expl
# pers a b c d d2 e e2 f g h i
# 1 0 0 1 0 0 0 0 1 0 0 0
# 2 0 0 0 0 0 0 0 1 0 0 0
# 3 0 1 1 1 0 1 0 1 1 0 0
# 4 0 0 0 0 0 0 0 1 1 0 0
# 5 0 0 1 0 0 0 0 1 1 0 0
# 6 0 0 0 0 0 0 0 0 0 0 0
# 7 0 0 0 0 0 0 0 0 1 0 0
# 8 0 0 0 0 0 0 0 0 0 0 0
# 9 0 0 0 0 0 0 0 0 0 0 0
# 10 0 0 0 0 0 0 0 0 1 0 0
#
# , , resp = 1
#
# expl
# pers a b c d d2 e e2 f g h i
# 1 1 1 0 1 0 1 0 0 1 0 0
# 2 1 1 1 1 0 1 0 0 1 0 0
# 3 1 0 0 0 0 0 0 0 0 0 0
# 4 1 1 1 1 0 1 0 0 0 0 0
# 5 1 1 0 1 0 1 0 0 0 0 0
# 6 1 0 0 1 1 1 1 0 1 1 1
# 7 1 0 0 1 1 1 1 0 0 1 1
# 8 1 0 0 1 1 1 1 0 1 1 1
# 9 1 0 0 1 1 1 1 0 1 1 1
# 10 1 0 0 1 0 1 1 0 0 1 1
#
正如你所看到的,它非常不平衡(
resp = 1
比resp = 0
的情况更多)并且稀疏(两个表中都有很多零,包括一些全零的行和列)
这还远远不足以让
glmer()
算法达成共识。我们可以尝试并对数据进行重新采样,看看需要什么样的数据大小才能让 glmer()
满意。这当然不是可靠的统计数据,仅用于演示目的。
set.seed(1)
tt2 <- rbind(tt, as.data.frame(lapply(tt, sample, 500, rep=TRUE)))
table(tt2)
# , , resp = 0
#
# expl
# pers a b c d d2 e e2 f g h i
# 1 0 0 1 0 0 1 2 1 2 1 1
# 2 2 1 0 1 2 1 3 1 0 0 0
# 3 2 1 3 6 1 2 0 3 1 1 2
# 4 1 1 0 2 0 2 0 1 3 2 1
# 5 4 1 2 3 1 1 3 4 3 0 1
# 6 1 0 0 2 0 0 0 1 6 0 1
# 7 1 0 2 6 0 2 1 2 1 0 0
# 8 3 0 2 1 1 1 0 1 2 0 2
# 9 3 0 0 2 0 1 0 2 1 1 1
# 10 3 1 0 1 0 2 1 0 3 0 0
#
# , , resp = 1
#
# expl
# pers a b c d d2 e e2 f g h i
# 1 8 4 3 8 3 6 3 5 5 1 1
# 2 6 3 3 2 0 4 4 2 4 1 1
# 3 5 1 1 4 1 4 3 2 7 1 5
# 4 11 3 4 5 4 4 3 3 5 1 3
# 5 10 5 2 7 2 5 0 3 6 6 1
# 6 5 0 0 8 3 8 6 2 6 3 4
# 7 4 2 4 8 5 6 1 3 10 3 2
# 8 8 4 1 7 1 8 2 3 5 3 5
# 9 7 2 4 6 4 6 3 3 9 2 4
# 10 4 3 3 3 1 9 3 3 5 5 2
#
library(lme4)
model <- glmer(resp~expl+(1|pers), data=tt2, family=binomial)
生成了 400 行额外的行,它仍然抱怨 singular fit,但生成了 500 行,它似乎很满意。 虽然您的模型并不是特别复杂,但数据的性质(不平衡、机器人解释变量和随机变量的多个级别)需要比您拥有的样本多得多的样本。