跨列查找多个时间报告

Question

我正在尝试评估伤害数据集。数据来自 4 个来源（医院、全科医生、自我报告、死亡），每个来源均以年为单位给出受伤时间（连续变量）。一个人可能会在一个或多个来源中报告受伤情况。我想知道其他来源是否报告了医院受伤情况（0.25 以内的任何受伤时间都被视为同一伤害）。

因此，我想创建一个列“Hospital_elsewhere_1”，其中如果“Hospital_1”列中有一个时间，则“Hospital_elsewhere_1”列将显示“医院”，如果任何其他列（不包括“医院”）中的时间在 0.25 以内，它将显示“医院”还包括由 | 分隔的文本来源。

例如，如果 Hospital_1 中受伤年龄为 65.44 岁，GP_1 中受伤年龄为 65.42 岁，自我报告中受伤年龄为 65.43 岁，则会显示“Hospital|GP|self_report”。

我想对每个医院列都这样做，这样就会有一个 Hospital_elsewhere_(i)

下面是一个示例数据集

library(tibble)
set.seed(123)

example_data <- tibble(
  id = 1:30,
  Hospital_1 = sample(c(NA, round(runif(25, 1, 80), 2)), 30, replace = TRUE),
  Hospital_2 = sample(c(NA, round(runif(25, 1, 80), 2)), 30, replace = TRUE),
  Hospital_3 = sample(c(NA, round(runif(25, 1, 80), 2)), 30, replace = TRUE),
  Hospital_4 = sample(c(NA, round(runif(25, 1, 80), 2)), 30, replace = TRUE),
  GP_1 = sample(c(NA, round(runif(25, 1, 80), 2)), 30, replace = TRUE),
  GP_2 = sample(c(NA, round(runif(25, 1, 80), 2)), 30, replace = TRUE),
  GP_3 = sample(c(NA, round(runif(25, 1, 80), 2)), 30, replace = TRUE),
  GP_4 = sample(c(NA, round(runif(25, 1, 80), 2)), 30, replace = TRUE),
  GP_5 = sample(c(NA, round(runif(25, 1, 80), 2)), 30, replace = TRUE),
  GP_6 = sample(c(NA, round(runif(25, 1, 80), 2)), 30, replace = TRUE),
  GP_7 = sample(c(NA, round(runif(25, 1, 80), 2)), 30, replace = TRUE),
  GP_8 = sample(c(NA, round(runif(25, 1, 80), 2)), 30, replace = TRUE),
  GP_9 = sample(c(NA, round(runif(25, 1, 80), 2)), 30, replace = TRUE),
  GP_10 = sample(c(NA, round(runif(25, 1, 80), 2)), 30, replace = TRUE),
  self_report_1 = sample(c(NA, round(runif(15, 1, 80), 2)), 30, replace = TRUE),
  self_report_2 = sample(c(NA, round(runif(15, 1, 80), 2)), 30, replace = TRUE),
  self_report_3 = sample(c(NA, round(runif(15, 1, 80), 2)), 30, replace = TRUE),
  self_report_4 = sample(c(NA, round(runif(15, 1, 80), 2)), 30, replace = TRUE),
  death_1 = sample(c(NA, round(runif(25, 1, 80), 2)), 30, replace = TRUE)
)

for (i in 1:10) {
  index <- sample(1:30, 1)  
  gp_value <- round(runif(1, 1, 80), 2)  
  example_data[index, paste0("GP_", 1:4)] <- gp_value 
  example_data[index, paste0("Hospital_", 1:4)] <- gp_value + runif(1, -0.25, 0.25)
}

Answer 1

每个位置使用一 (

list

) 列转换数据：

library(dplyr)
library(purrr)
library(tidyr)
(list_df <- example_data %>%
  pivot_longer(names_to = c("Type", "nr"), 
               names_pattern = "(.*)_(\\d*)", 
               cols = -id,
               names_transform = list(nr = as.integer)) %>%
  summarize(vals = list(value), .by = c(id, Type)) %>%
  pivot_wider(names_from = Type, values_from = vals) %>%
  relocate(GP, .after = Hospital))
# # A tibble: 30 × 5
#       id Hospital  GP         self_report death    
#    <int> <list>    <list>     <list>      <list>   
#  1     1 <dbl [4]> <dbl [10]> <dbl [4]>   <dbl [1]>
#  2     2 <dbl [4]> <dbl [10]> <dbl [4]>   <dbl [1]>
#  3     3 <dbl [4]> <dbl [10]> <dbl [4]>   <dbl [1]>
# [...]

对于每个

Hospital

检查时间范围内是否有任何位置：

detect_reportings <- function(other, Hospital) {
  imap_chr(set_names(Hospital, paste0("Hospital_", seq_along(Hospital))),
           ~ if_else(any(abs(other - .x) <= .25), cur_column(), NA_character_)) %>%
  list()
}

此函数最终将为每个其他位置返回长度为

（== 医院数量）的字符向量，如果在时间范围内没有事件，则为

NA

，否则返回位置名称：

(dist_check <- list_df %>%
  group_by(id) %>%
  mutate(across(GP:death, ~ detect_reportings(.x[[1]], Hospital[[1]])),
         Hospital = set_names(if_else(is.na(Hospital[[1]]), NA_character_, "Hospital"),
                              paste0("Hospital_", seq_along(Hospital[[1]]))) %>%
                     list()))
# # A tibble: 30 × 5
# # Groups:   id [30]
#       id Hospital  GP        self_report death    
#    <int> <list>    <list>    <list>      <list>   
#  1     1 <chr [4]> <chr [4]> <chr [4]>   <chr [4]>
#  2     2 <chr [4]> <chr [4]> <chr [4]>   <chr [4]>
#  3     3 <chr [4]> <chr [4]> <chr [4]>   <chr [4]>
# [...]

dist_check$GP[[16]]
## Hospital_3 and Hospital_4 were within the .25 range
# Hospital_1 Hospital_2 Hospital_3 Hospital_4 
#         NA         NA       "GP"       "GP" 
example_data[16, c(4:5, 8, 14)]
# # A tibble: 1 × 4
#   Hospital_3 Hospital_4  GP_3  GP_9
#        <dbl>      <dbl> <dbl> <dbl>
# 1       25.6       19.9  25.4  19.7

最后一步是合并这些检查：

merge_reportings <- function(...) {
  cbind(...) %>%
    as_tibble() %>%
    unite("result", sep = "|", na.rm = TRUE) %>%
    t() %>%
    c() %>%
    set_names(paste0("Hospital_elsewhere_", seq_along(..1))) %>%
    as.list() %>%
    as_tibble()
}
res <- dist_check %>%
  reframe(merge_reportings(Hospital[[1]], GP[[1]], self_report[[1]], death[[1]]))
print(res, n = 10L)

# # A tibble: 30 × 5
#       id Hospital_elsewhere_1 Hospital_elsewhere_2 Hospital_elsewhere_3 Hospital_elsewhere_4
#    <int> <chr>                <chr>                <chr>                <chr>               
#  1     1 Hospital             Hospital             Hospital             "Hospital"          
#  2     2 Hospital             Hospital             Hospital             "Hospital"          
#  3     3 Hospital             Hospital             Hospital             "Hospital"          
#  4     4 Hospital             Hospital             Hospital             "Hospital"          
#  5     5 Hospital             Hospital             Hospital             "Hospital"          
#  6     6 Hospital             Hospital             Hospital|GP          "Hospital|GP"       
#  7     7 Hospital|GP          Hospital|GP          Hospital|GP          "Hospital|GP"       
#  8     8 Hospital|GP          Hospital|GP          Hospital|GP          "Hospital|GP"       
#  9     9 Hospital             Hospital             Hospital             "Hospital"          
# 10    10 Hospital             Hospital             Hospital             "Hospital"          
# # ℹ 20 more rows
# # ℹ Use `print(n = ...)` to see more rows

跨列查找多个时间报告

问题描述投票：0回答：1

1个回答

最新问题

跨列查找多个时间报告

问题描述 投票：0回答：1

1个回答

最新问题

问题描述投票：0回答：1