我正在研究性传播疾病(例如淋病)及其可能的后果(例如女性宫外孕)。我的数据集 (df_gono) 包含按年份类别、年龄类别、性别和艾滋病毒状况分类的纵向流行病学数据。它还有一个变量 n_gono,用于记录每个 age_cat、year_cat 和 HIV 状态下的淋病诊断次数(从 0 到 n 的整数)。我想创建一个变量 status_gono 来表示患者的“淋病状态”:患者在某个 age_cat 和 year_cat 中至少被诊断一次之后,该变量即取值为 1。换句话说,对于每个患者,一旦变量 n_gono 大于 0,该行及其后续所有行的 status_gono 都取 1。预期结果应为 df_gono2。
# Example input data for the question. Rows are listed patient by patient;
# the trailing "# A" ... "# E" markers show which patient each run belongs to.
df_gono <- data.frame(
  # Patients A-E contribute 6, 3, 5, 4 and 3 rows respectively (21 in total).
  patient = rep(c("A", "B", "C", "D", "E"), times = c(6, 3, 5, 4, 3)),
  sex = rep("Female", times = 21),
  age_current_cat = c(
    5, 5, 6, 6, 6, 7, # A
    7, 8, 8,          # B
    5, 6, 6, 7, 7,    # C
    3, 3, 4, 4,       # D
    6, 7, 7           # E
  ),
  calyear_current_cat = c(
    2, 2, 2, 3, 4, 4, # A
    2, 2, 3,          # B
    2, 2, 3, 3, 4,    # C
    2, 3, 3, 4,       # D
    4, 4, 4           # E
  ),
  age_cat = c(
    "30-34", "30-34", "35-39", "35-39", "35-39", "40-44", # A
    "40-44", "45-49", "45-49",                            # B
    "30-34", "35-39", "35-39", "40-44", "40-44",          # C
    "20-24", "20-24", "25-29", "25-29",                   # D
    "35-39", "40-44", "40-44"                             # E
  ),
  year_cat = c(
    "2011-2013", "2011-2013", "2011-2013",
    "2014-2017", "2018-2020", "2018-2020",                         # A
    "2011-2013", "2011-2013", "2014-2017",                         # B
    "2011-2013", "2011-2013", "2014-2017", "2014-2017", "2018-2020", # C
    "2011-2013", "2014-2017", "2014-2017", "2018-2020",            # D
    "2018-2020", "2018-2020", "2018-2020"                          # E
  ),
  hiv = c(
    0, 1, 1, 1, 1, 1, # A
    0, 0, 0,          # B
    0, 0, 0, 0, 0,    # C
    0, 0, 0, 0,       # D
    0, 0, 1           # E
  ),
  pat_year = c(
    0.05475702, 2.52635181, 0.41957563,
    3.00068446, 1.57973990, 1.91649555,                         # A
    0.08555784, 2.91512663, 2.08076660,                         # B
    0.08418891, 2.91649555, 2.08350445, 0.91718001, 3.49623546, # C
    3.00068446, 0.83299110, 2.16769336, 0.99657769,             # D
    0.25188227, 0.18343600, 2.22861054                          # E
  ),
  # Number of gonorrhoea diagnoses recorded on each row (0 = none).
  n_gono = c(
    0, 1, 0, 2, 0, 0, # A
    0, 4, 4,          # B
    0, 0, 0, 0, 0,    # C
    7, 1, 1, 0,       # D
    0, 2, 7           # E
  )
)
# Expected result: the same columns as the input data plus `status_gono`,
# which is 1 from a patient's first row with n_gono > 0 onwards and 0 before.
df_gono2 <- data.frame(
  # Patients A-E contribute 6, 3, 5, 4 and 3 rows respectively (21 in total).
  patient = rep(c("A", "B", "C", "D", "E"), times = c(6, 3, 5, 4, 3)),
  sex = rep("Female", times = 21),
  age_current_cat = c(
    5, 5, 6, 6, 6, 7, # A
    7, 8, 8,          # B
    5, 6, 6, 7, 7,    # C
    3, 3, 4, 4,       # D
    6, 7, 7           # E
  ),
  calyear_current_cat = c(
    2, 2, 2, 3, 4, 4, # A
    2, 2, 3,          # B
    2, 2, 3, 3, 4,    # C
    2, 3, 3, 4,       # D
    4, 4, 4           # E
  ),
  age_cat = c(
    "30-34", "30-34", "35-39", "35-39", "35-39", "40-44", # A
    "40-44", "45-49", "45-49",                            # B
    "30-34", "35-39", "35-39", "40-44", "40-44",          # C
    "20-24", "20-24", "25-29", "25-29",                   # D
    "35-39", "40-44", "40-44"                             # E
  ),
  year_cat = c(
    "2011-2013", "2011-2013", "2011-2013",
    "2014-2017", "2018-2020", "2018-2020",                         # A
    "2011-2013", "2011-2013", "2014-2017",                         # B
    "2011-2013", "2011-2013", "2014-2017", "2014-2017", "2018-2020", # C
    "2011-2013", "2014-2017", "2014-2017", "2018-2020",            # D
    "2018-2020", "2018-2020", "2018-2020"                          # E
  ),
  hiv = c(
    0, 1, 1, 1, 1, 1, # A
    0, 0, 0,          # B
    0, 0, 0, 0, 0,    # C
    0, 0, 0, 0,       # D
    0, 0, 1           # E
  ),
  pat_year = c(
    0.05475702, 2.52635181, 0.41957563,
    3.00068446, 1.57973990, 1.91649555,                         # A
    0.08555784, 2.91512663, 2.08076660,                         # B
    0.08418891, 2.91649555, 2.08350445, 0.91718001, 3.49623546, # C
    3.00068446, 0.83299110, 2.16769336, 0.99657769,             # D
    0.25188227, 0.18343600, 2.22861054                          # E
  ),
  n_gono = c(
    0, 1, 0, 2, 0, 0, # A
    0, 4, 4,          # B
    0, 0, 0, 0, 0,    # C
    7, 1, 1, 0,       # D
    0, 2, 7           # E
  ),
  # Target flag, written out by hand.
  status_gono = c(
    0, 1, 1, 1, 1, 1, # A: first diagnosis on the 2nd row
    0, 1, 1,          # B: first diagnosis on the 2nd row
    0, 0, 0, 0, 0,    # C: never diagnosed
    1, 1, 1, 1,       # D: diagnosed on the 1st row
    0, 1, 1           # E: first diagnosis on the 2nd row
  )
)
可以试试下面这个解决方案,您需要先安装 dplyr 和 tidyr 这两个包:
library(dplyr)
#>
#> Attaching package: 'dplyr'
#> The following objects are masked from 'package:stats':
#>
#> filter, lag
#> The following objects are masked from 'package:base':
#>
#> intersect, setdiff, setequal, union
library(tidyr)
# Data re-declared inside the reprex so the example runs on its own. It
# carries the hand-written `status_gono` column, which the computed flag is
# compared against further down.
df_gono2 <- data.frame(
  # Patients A-E contribute 6, 3, 5, 4 and 3 rows respectively (21 in total).
  patient = rep(c("A", "B", "C", "D", "E"), times = c(6, 3, 5, 4, 3)),
  sex = rep("Female", times = 21),
  age_current_cat = c(
    5, 5, 6, 6, 6, 7, # A
    7, 8, 8,          # B
    5, 6, 6, 7, 7,    # C
    3, 3, 4, 4,       # D
    6, 7, 7           # E
  ),
  calyear_current_cat = c(
    2, 2, 2, 3, 4, 4, # A
    2, 2, 3,          # B
    2, 2, 3, 3, 4,    # C
    2, 3, 3, 4,       # D
    4, 4, 4           # E
  ),
  age_cat = c(
    "30-34", "30-34", "35-39", "35-39", "35-39", "40-44", # A
    "40-44", "45-49", "45-49",                            # B
    "30-34", "35-39", "35-39", "40-44", "40-44",          # C
    "20-24", "20-24", "25-29", "25-29",                   # D
    "35-39", "40-44", "40-44"                             # E
  ),
  year_cat = c(
    "2011-2013", "2011-2013", "2011-2013",
    "2014-2017", "2018-2020", "2018-2020",                         # A
    "2011-2013", "2011-2013", "2014-2017",                         # B
    "2011-2013", "2011-2013", "2014-2017", "2014-2017", "2018-2020", # C
    "2011-2013", "2014-2017", "2014-2017", "2018-2020",            # D
    "2018-2020", "2018-2020", "2018-2020"                          # E
  ),
  hiv = c(
    0, 1, 1, 1, 1, 1, # A
    0, 0, 0,          # B
    0, 0, 0, 0, 0,    # C
    0, 0, 0, 0,       # D
    0, 0, 1           # E
  ),
  pat_year = c(
    0.05475702, 2.52635181, 0.41957563,
    3.00068446, 1.57973990, 1.91649555,                         # A
    0.08555784, 2.91512663, 2.08076660,                         # B
    0.08418891, 2.91649555, 2.08350445, 0.91718001, 3.49623546, # C
    3.00068446, 0.83299110, 2.16769336, 0.99657769,             # D
    0.25188227, 0.18343600, 2.22861054                          # E
  ),
  n_gono = c(
    0, 1, 0, 2, 0, 0, # A
    0, 4, 4,          # B
    0, 0, 0, 0, 0,    # C
    7, 1, 1, 0,       # D
    0, 2, 7           # E
  ),
  # Expected flag, used as the reference column in the comparison.
  status_gono = c(
    0, 1, 1, 1, 1, 1, # A
    0, 1, 1,          # B
    0, 0, 0, 0, 0,    # C
    1, 1, 1, 1,       # D
    0, 1, 1           # E
  )
)
# Derive a per-patient "ever diagnosed" flag: status_gono2 becomes 1 on the
# first row of a patient where n_gono >= 1 and stays 1 on all of that
# patient's later rows. The result depends on the existing row order within
# each patient, because the flag is carried downwards row by row.
df_gono2 <- df_gono2 |>
  group_by(patient) |>
  # Mark diagnosis rows with 1 and leave all other rows NA so that fill()
  # has something to carry forward.
  mutate(status_gono2 = ifelse(n_gono >= 1, 1, NA)) |>
  # On grouped data the fill happens within each patient, so a flag never
  # leaks into the next patient's rows.
  fill(status_gono2, .direction = "down") |>
  # Drop the grouping once it is no longer needed; otherwise every later verb
  # keeps operating per patient (e.g. select() re-adding `patient` with an
  # "Adding missing grouping variables" message).
  ungroup() |>
  # Rows before a patient's first diagnosis are still NA: they are status 0.
  replace_na(list(status_gono2 = 0))

# Show the expected and the computed flag side by side; `patient` is selected
# explicitly now that the data is no longer grouped.
df_gono2 |>
  select(patient, status_gono, status_gono2) |>
  print(n = 30)
#> # A tibble: 21 × 3
#>    patient status_gono status_gono2
#>    <chr>         <dbl>        <dbl>
#>  1 A                 0            0
#>  2 A                 1            1
#>  3 A                 1            1
#>  4 A                 1            1
#>  5 A                 1            1
#>  6 A                 1            1
#>  7 B                 0            0
#>  8 B                 1            1
#>  9 B                 1            1
#> 10 C                 0            0
#> 11 C                 0            0
#> 12 C                 0            0
#> 13 C                 0            0
#> 14 C                 0            0
#> 15 D                 1            1
#> 16 D                 1            1
#> 17 D                 1            1
#> 18 D                 1            1
#> 19 E                 0            0
#> 20 E                 1            1
#> 21 E                 1            1
创建于 2024-08-27,使用 reprex v2.1.0