我有一个包含不完整行的数据框(或数据表,如果这更容易的话):
ID Var1 Var2 Var3
1 2 5 1
2 12 3
3 8
4 4
# Sample input in wide format: each row is complete up to some column and
# holds only NA from there on.
d <- data.frame(
  ID   = seq_len(4),
  Var1 = c(2, 12, 8, 4),
  Var2 = c(5, 3, rep(NA, 2)),
  Var3 = c(1, rep(NA, 3))
)
library(data.table)
# Same sample data as a data.table, parsed from whitespace-separated text
# lines; the literal "NA" tokens are read as missing values.
d <- fread(
  text = c(
    "ID Var1 Var2 Var3",
    "1 2 5 1",
    "2 12 3 NA",
    "3 8 NA NA",
    "4 4 NA NA"
  )
)
空单元格始终位于行的末尾。
我想用该行最后一个非空单元格中的值填充每行中的空单元格,例如:
ID Var1 Var2 Var3
1 2 5 1
2 12 3 -> 3
3 8 -> 8 -> 8
4 4 -> 4 -> 4
我该怎么做?
另一个答案使用 `collapse` 包,其优点是速度特别快(比 data.table 还快):
library(collapse)
# MARGIN = 1 applies na_locf (last observation carried forward) to each row,
# so trailing NAs take the last non-missing value of that row.
dapply(X = d, FUN = na_locf, MARGIN = 1)
# ID Var1 Var2 Var3
# 1 1 2 5 1
# 2 2 12 3 3
# 3 3 8 8 8
# 4 4 4 4 4
微基准:
# Unit: microseconds
# expr min lq mean median uq max neval
# collapse 69.5 112.95 244.847 135.45 161.7 9964.4 100
# dt 592.9 788.70 1237.643 874.70 1186.6 14563.1 100
# tidyr 32283.2 36170.80 41293.420 40501.55 43809.1 75417.8 100
# Reduce_dt 645.0 803.70 1083.373 954.05 1222.6 2367.5 100
# Reduce_TiC 383.9 499.25 661.475 586.40 687.6 5179.1 100
基准代码:
# Benchmark of the row-wise fill approaches.
# Assumes `d` is the data.table with trailing NAs and that data.table,
# collapse and tidyr (with the %>% pipe) are attached.
microbenchmark::microbenchmark(
  collapse = dapply(d, na_locf, MARGIN = 1),
  dt = data.table::transpose(
    setnafill(
      data.table::transpose(d, keep.names = "ID"),
      type = "locf",
      # column 1 of the transposed table holds the former column names
      cols = seq_len(nrow(d)) + 1L
    ),
    make.names = "ID"
  ),
  tidyr = d %>%
    pivot_longer(-ID) %>%
    fill(value, .direction = "down") %>%
    pivot_wider(),
  # `:=` modifies a data.table by reference. Without the copy, the first
  # evaluation would fill `d` itself and every later evaluation (of any
  # expression) would be timed on data that no longer contains NAs.
  # The cost of the copy is included in this timing.
  Reduce_dt = data.table::copy(d)[, (sprintf("Var%d", 1:3)) := Reduce(
    \(x, y) ifelse(is.na(y), x, y),
    .SD,
    accumulate = TRUE
  ), .SDcols = sprintf("Var%d", 1:3)],
  # On a data.table `d[-1]` drops the first ROW; go through the column list
  # to drop the ID column instead (also works for a plain data.frame).
  Reduce_TiC = Reduce(
    \(x, y) ifelse(is.na(y), x, y),
    as.list(d)[-1L],
    accumulate = TRUE
  )
)
转置并填补缺失,然后再次转置。
library(data.table)
# setnafill() fills down columns, so transpose first (rows become columns),
# fill with "locf", then transpose back. `keep.names = "ID"` stores the
# former column names in a first column called "ID", and `make.names = "ID"`
# turns that column back into the column names on the way back.
transpose(
  setnafill(
    transpose(d, keep.names = "ID"),
    type = "locf",
    # Skip column 1 (the stored names); the remaining nrow(d) columns are the
    # original rows. seq_len() stays well-defined when `d` has zero rows.
    cols = seq_len(nrow(d)) + 1L
  ),
  make.names = "ID"
)
# ID Var1 Var2 Var3
# <int> <int> <int> <int>
# 1: 1 2 5 1
# 2: 2 12 3 3
# 3: 3 8 8 8
# 4: 4 4 4 4
这是一种只使用 base R 的方法:
# Sample input; the outer parentheses print the object after assignment.
(start <- data.frame(
  ID = 1:4,
  Var1 = c(2L, 12L, 8L, 4L),
  Var2 = c(5L, 3L, NA, NA),
  Var3 = c(1L, NA, NA, NA)
))

# Helper column: the last non-missing value found in each row.
start$last <- apply(start, 1, function(r) {
  observed <- r[!is.na(r)]
  observed[[length(observed)]]
})
start$last

# Replace every NA in a row with that row's `last` value, then drop the
# helper column again. apply() returns one column per input row, hence t().
(fin <- data.frame(t(apply(start, 1, function(r) {
  values <- r[-length(r)]
  values[is.na(values)] <- r[[length(r)]]
  values
}))))
一种使用 `tidyr::pivot_longer()`/`fill()`/`pivot_wider()` 的做法:
library(tidyverse) # `tidyr` for pivot_*/fill, `dplyr` for group_by/ungroup
# toy data
aux <- tibble::tribble(
  ~ID, ~Var1, ~Var2, ~Var3,
  1, 2, 5, 1,
  2, 12, 3, NA,
  3, 8, NA, NA,
  4, 4, NA, NA
)
# Pivot to long format, fill down within each ID, and pivot back.
# Grouping by ID keeps a value from one row from leaking into the next row
# should a row ever start with NA.
new_aux <- aux %>%
  pivot_longer(-ID) %>%
  group_by(ID) %>%
  fill(value, .direction = "down") %>%
  ungroup() %>%
  pivot_wider()
输出:
> new_aux
# A tibble: 4 × 4
ID Var1 Var2 Var3
<dbl> <dbl> <dbl> <dbl>
1 1 2 5 1
2 2 12 3 3
3 3 8 8 8
4 4 4 4 4
创建于 2024-05-28,使用 reprex v2.1.0
# Walk the columns left to right, keeping the running value wherever the next
# column is NA; `:=` writes the filled columns back into `d` by reference.
d[, (cols) := Reduce(
  function(carried, current) ifelse(is.na(current), carried, current),
  .SD,
  accumulate = TRUE
), .SDcols = cols]
或者
# Left-to-right fold: fcoalesce(y, x) keeps the current column `y` where it is
# present and otherwise falls back to `x`, the value carried from the left.
# (A right fold with `right = TRUE` would fill backwards from the right-hand
# columns and leave trailing NAs untouched.)
d[, (cols) := Reduce(\(x, y) fcoalesce(y, x), .SD, accumulate = TRUE), .SDcols = cols]
输出
# ID Var1 Var2 Var3
# <int> <int> <int> <int>
# 1: 1 2 5 1
# 2: 2 12 3 3
# 3: 3 8 8 8
# 4: 4 4 4 4
其中
# Columns to fill, and the input data with its trailing NAs. The input must
# be created before running the `:=` statements, which modify `d` by
# reference.
cols <- sprintf("Var%d", 1:3)
d <- data.table(
  ID = 1:4,
  Var1 = c(2L, 12L, 8L, 4L),
  Var2 = c(5L, 3L, NA, NA),
  Var3 = c(1L, NA, NA, NA)
)
您可以直接使用 `Reduce`,如下所示:
# `df` is the data.frame with trailing NAs; `df[-1]` drops the ID column.
# Fold over the remaining columns left to right, carrying the previous value
# forward wherever the current column is NA.
df[-1] <- Reduce(\(x, y) ifelse(is.na(y), x, y), df[-1], accumulate = TRUE)
或
# Arithmetic variant of the same fold: the NA-ignoring row sum of (x, y) is
# x when y is NA and x + y otherwise; subtracting x where y is present
# leaves y.
df[-1] <- Reduce(
  function(x, y) rowSums(cbind(x, y), na.rm = TRUE) - x * !is.na(y),
  df[-1],
  accumulate = TRUE
)
这给出了
> df
ID Var1 Var2 Var3
1 1 2 5 1
2 2 12 3 3
3 3 8 8 8
4 4 4 4 4