这是我的玩具数据集
# Toy data: three columns whose NAs sit in leading, interior and trailing
# positions.
df <- tribble(
  ~x, ~y, ~z,
   7, NA,  4,
   8,  2, NA,
  NA, NA, NA,
  NA,  4,  6
)
我想得到一个数据框,对每个变量给出两个计数:一是该列中第一个数字与最后一个数字之间的 NA 个数;
二是该列中第一个数字与最后一行之间的 NA 个数。
因此,对于此示例,期望的结果是:
# Expected result: one row per variable holding the two NA counts.
desired_df <- tribble(
  ~vars,
  ~na_count_between_1st_last_num,
  ~na_count_between_1st_num_last_row,
  "x", 0, 2,
  "y", 1, 1,
  "z", 2, 2
)
如何获得所需的输出?
zoo 包中的 na.trim 默认从两端修剪 NA;
如果指定 sides = "left"
或 sides = "right"
,则只从左端或右端修剪,因此:
library(dplyr)
library(tibble)
library(tidyr)
library(zoo)
# Reshape to long form, then count for each variable the NAs that remain
# after trimming NAs from both ends (na1) and from the left end only (na2).
df %>%
  pivot_longer(cols = everything()) %>%
  group_by(name) %>%
  summarise(
    na1 = sum(is.na(na.trim(value))),
    na2 = sum(is.na(na.trim(value, sides = "left")))
  ) %>%
  ungroup()
结果为:
# A tibble: 3 x 3
name na1 na2
<chr> <int> <int>
1 x 0 2
2 y 1 1
3 z 2 2
下面是一个使用基础 R(base R)的思路:
# Positions spanning the first through the last non-NA element of `x`.
#
# x: an atomic vector that may contain NAs.
# Returns an integer vector of consecutive positions, or integer(0) when
# `x` has no non-NA element.
f1 <- function(x) {
  i1 <- which(!is.na(x))
  # `:` fails on zero-length operands, so bail out when nothing is observed.
  if (length(i1) == 0L) {
    return(integer(0))
  }
  i1[1L]:i1[length(i1)]
}
# Positions spanning the first non-NA element of `x` through its last
# element.
#
# x: an atomic vector that may contain NAs.
# Returns an integer vector of consecutive positions, or integer(0) when
# `x` has no non-NA element.
f2 <- function(x) {
  i1 <- which(!is.na(x))
  # `:` fails on a zero-length operand, so bail out when nothing is observed.
  if (length(i1) == 0L) {
    return(integer(0))
  }
  i1[1L]:length(x)
}
# For every column, count the NAs inside the window returned by f1()
# (values.x) and inside the window returned by f2() (values.y), then join
# the two stacked results on the column name (`ind`).
# vapply() pins the per-column result to a single integer, so the shape of
# the input to stack() does not depend on the data.
merge(
  stack(vapply(df, function(i) sum(is.na(i[f1(i)])), integer(1))),
  stack(vapply(df, function(i) sum(is.na(i[f2(i)])), integer(1))),
  by = "ind"
)
# ind values.x values.y
#1 x 0 2
#2 y 1 1
#3 z 2 2
下面是使用两个函数的一种做法:
# Count the NAs lying between the first and the last non-NA element of `x`.
fun1 <- function(x) {
  observed <- !is.na(x)
  # FALSE only on the leading run of NAs.
  from_first <- cumsum(observed) > 0
  # FALSE only on the trailing run of NAs.
  up_to_last <- rev(cumsum(rev(observed)) > 0)
  inner <- x[from_first & up_to_last]
  sum(is.na(inner))
}
# Count the NAs lying between the first non-NA element of `x` and its last
# element.
fun2 <- function(x) {
  observed <- !is.na(x)
  # FALSE only on the leading run of NAs.
  from_first <- cumsum(observed) > 0
  tail_part <- x[from_first]
  sum(is.na(tail_part))
}
之后,只需对 data.frame 进行汇总(summarise)并重塑即可:
# Apply both counters to every column, then reshape to one row per variable.
# across() names its outputs "<column>_<fn>". The greedy first capture group
# splits at the last underscore, so multi-character column names work too.
# The ".value" sentinel spreads m1/m2 into their own columns directly.
df %>%
  summarise(across(everything(), list(m1 = fun1, m2 = fun2))) %>%
  pivot_longer(
    cols = everything(),
    names_pattern = "^(.*)_(.*)$",
    names_to = c("vars", ".value")
  ) %>%
  arrange(vars)
# A tibble: 3 x 3
vars m1 m2
<chr> <int> <int>
1 x 0 2
2 y 1 1
3 z 2 2
下面是使用 data.table::nafill
的另一种方案:
library(data.table)
# NAs that survive a next-observation-carried-backward fill are the trailing
# NAs of each column; those that survive a last-observation-carried-forward
# fill are the leading NAs.
natrail <- colSums(is.na(as.data.table(nafill(df, type = "nocb"))))
nastart <- colSums(is.na(as.data.table(nafill(df, type = "locf"))))
# All NAs minus the leading ones: NAs between the first number and last row.
n1last <- nrow(df) - colSums(!is.na(df)) - nastart
# Remove the trailing NAs as well. In an all-NA column every NA is counted
# as both leading and trailing, so clamp at zero rather than go negative.
n1num <- pmax(n1last - natrail, 0)
cbind(
  na_count_between_1st_last_num = n1num,
  na_count_between_1st_num_last_row = n1last
)
输出:
na_count_between_1st_last_num na_count_between_1st_num_last_row
x 0 2
y 1 1
z 2 2
a 1 2
b 0 0
d 0 1
数据:
# Example data. The comment after each column lists the two expected counts:
# NAs between the first and last number, then NAs between the first number
# and the last row.
df <- data.frame(
  x = c(7, 8, NA, NA),   # 0 2
  y = c(NA, 2, NA, 4),   # 1 1
  z = c(4, NA, NA, 6),   # 2 2
  a = c(1, NA, 1, NA),   # 1 2
  b = c(NA, NA, 1, 1),   # 0 0
  d = c(NA, 1, 1, NA)    # 0 1
)