R 中 lag() 的奇怪行为

问题描述 投票:0回答:1

我正在用代码从一个较大的数据集中筛选出一个较小的数据集。我筛选 24 个月以下、并且变量 b9(表示孩子是否与母亲同住)等于 0 的儿童。然而,在某些情况下,可能有不止一个 24 个月以下的孩子与母亲同住;这种情况下,我只想保留按 caseid 和出生史 (bidx) 排序后最小的那个孩子。我使用的是下面的代码:它运行时没有任何报错,却返回了一个空数据框,而按理说结果不应为空。这应该是一个逻辑错误。

# Sample data frame (50 rows), assigned to `KRdata` so that the pipelines
# below can be run on it directly.

KRdata <- structure(list(v001 = c(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1), v002 = c(107,
107, 107, 107, 113, 113, 113, 119, 125, 131, 137, 137, 137, 143,
143, 143, 149, 149, 15, 15, 15, 15, 15, 15, 156, 156, 21, 21,
3, 3, 33, 39, 46, 52, 52, 52, 58, 64, 64, 64, 64, 76, 76, 76,
76, 82, 82, 82, 82, 82), caseid = c("       1 107  2", "       1 107  2",
"       1 107 10", "       1 107 10", "       1 113  2", "       1 113  2",
"       1 113  2", "       1 119  3", "       1 125  2", "       1 131  2",
"       1 137  2", "       1 137  2", "       1 137  2", "       1 143  6",
"       1 143  6", "       1 143 10", "       1 149  4", "       1 149  4",
"       1  15  7", "       1  15 18", "       1  15 21", "       1  15 25",
"       1  15 27", "       1  15 27", "       1 156  2", "       1 156  8",
"       1  21  2", "       1  21  7", "       1   3  4", "       1   3  4",
"       1  33  3", "       1  39 11", "       1  46  3", "       1  52  2",
"       1  52  5", "       1  52  5", "       1  58  4", "       1  64  2",
"       1  64  3", "       1  64  8", "       1  64  8", "       1  76 22",
"       1  76 25", "       1  76 29", "       1  76 29", "       1  82  7",
"       1  82  7", "       1  82 15", "       1  82 21", "       1  82 21"
), v021 = c(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1), v022 = c(2, 2, 2, 2, 2,
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
2, 2, 2), v023 = c(2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2), bidx = c(1, 2,
1, 2, 1, 2, 3, 1, 1, 1, 1, 2, 3, 1, 2, 1, 1, 2, 1, 1, 1, 1, 1,
2, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 2, 1, 1, 1,
2, 1, 2, 1, 1, 2), b9 = c(0, NA, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, NA, 0, 0, 0, 0, 0), age = c(30,
40, 2, 41, 3, 14, 41, 21, 19, 5, 1, 27, 57, 10, 59, 16, 7, 53,
29, 50, 33, 5, 2, 35, 0, 10, 31, 22, 0, 55, 14, 17, 1, 34, 23,
54, 6, 55, 0, 11, 45, 43, 29, 43, 57, 5, 44, 27, 5, 39)), row.names = c(NA,
50L), class = "data.frame")


# Build KRiycf from KRdata: keep rows with age < 24 and a non-missing b9 equal
# to 0, sort by caseid and bidx, then keep only the first row of each caseid.
# Intent (per the question): children under 24 months living with their
# mother, reduced to the youngest child per caseid.

KRiycf <- KRdata %>%
  subset(age < 24 & b9==0 & !is.na(b9)) %>% # children under 24 months living at home
  arrange(caseid, bidx) %>% # sort so each caseid's rows are adjacent, lowest bidx first
  # Keep a row when the previous row's caseid is missing or different, i.e. the
  # first row of every run of equal caseid values.
  # NOTE(review): this only works if `lag` resolves to dplyr::lag(). If
  # stats::lag() is the one found, the values are presumably returned
  # unshifted, so the comparison is never TRUE and the result is empty --
  # confirm with environment(lag), or call dplyr::lag() explicitly.
  subset(is.na(lag(caseid)) | caseid!=lag(caseid)) # select just the youngest

#Above code gives me an empty dataframe

我尝试了几件事

# Attempt 1: same pipeline, but without the explicit is.na(b9) test and with
# `default = NA` passed to lag().
# NOTE(review): `default` is an argument of dplyr::lag(); if `lag` resolves to
# stats::lag() instead, the extra argument is presumably swallowed by `...`
# without an error, which would match the "no error, empty result" symptom --
# TODO confirm.
KRiycf <- KRdata %>%
  subset(age < 24 & b9 == 0) %>%
  arrange(caseid, bidx) %>%
  subset(is.na(lag(caseid, default = NA)) | caseid != lag(caseid, default = NA))
# If lag() were not returning NA for the first row, that could be the problem,
# so an explicit default was tried here -- it did not help.

# Attempt 2: filter out missing b9 explicitly before sorting. The code is the
# same as the first pipeline shown in the question.
KRiycf <- KRdata %>%
  subset(age < 24 & b9==0 & !is.na(b9)) %>% 
  arrange(caseid, bidx) %>% 
  subset(is.na(lag(caseid)) | caseid!=lag(caseid))
# I thought an NA value in b9 might interfere with the sorting, but this did
# not work either.
r sorting dplyr lag large-data
1个回答
0
投票

如果您的目标是对每个 `caseid` 只保留符合条件的最小的孩子,我认为您根本不需要 `lag()`。相反,您可以使用 `dplyr::slice_min()`,并通过 `by = caseid` 按 `caseid` 分组:

library(dplyr)

# For each caseid, keep the single youngest child among the rows with
# age < 24 and b9 equal to 0.
KRdata %>%
  # `%in%` never returns NA, so rows with a missing b9 are dropped here
  # without a separate is.na() test.
  subset(age < 24 & b9 %in% 0) %>%
  # with_ties = FALSE guarantees exactly one row per caseid even when two
  # children share the same minimum age (e.g. twins); by default slice_min()
  # would return every tied row.
  slice_min(age, by = caseid, with_ties = FALSE)

输出:

   v001 v002          caseid v021 v022 v023 bidx b9 age
1     1  107        1 107 10    1    2    2    1  0   2
2     1  113        1 113  2    1    2    2    1  0   3
3     1  119        1 119  3    1    2    2    1  0  21
4     1  125        1 125  2    1    2    2    1  0  19
5     1  131        1 131  2    1    2    2    1  0   5
6     1  137        1 137  2    1    2    2    1  0   1
7     1  143        1 143  6    1    2    2    1  0  10
8     1  143        1 143 10    1    2    2    1  0  16
9     1  149        1 149  4    1    2    2    1  0   7
10    1   15        1  15 25    1    2    2    1  0   5
11    1   15        1  15 27    1    2    2    1  0   2
12    1  156        1 156  2    1    2    2    1  0   0
13    1  156        1 156  8    1    2    2    1  0  10
14    1   21        1  21  7    1    2    2    1  0  22
15    1    3        1   3  4    1    2    2    1  0   0
16    1   33        1  33  3    1    2    2    1  0  14
17    1   39        1  39 11    1    2    2    1  0  17
18    1   46        1  46  3    1    2    2    1  0   1
19    1   52        1  52  5    1    2    2    1  0  23
20    1   58        1  58  4    1    2    2    1  0   6
21    1   64        1  64  3    1    2    2    1  0   0
22    1   64        1  64  8    1    2    2    1  0  11
23    1   82        1  82  7    1    2    2    1  0   5
24    1   82        1  82 21    1    2    2    1  0   5

您没有提供这些数据所需的输出,所以如果这不完全正确,请告诉我,我会更新!

© www.soinside.com 2019 - 2024. All rights reserved.