我的数据中有一列包含最近的基因集,其中记录了每个基因的基因组区域和链。我想从该列中提取出各个基因名称,放入一个新列,并把每个基因拆分到单独的一行。基因名称位于 |+| 或 |-| 之后。
原始数据
# Example input: one row per anchor. `nearestTargetGeneAnchorB` holds a
# comma-separated list of "region|strand|gene" entries.
original <- structure(
  list(
    id = c(1, 2),
    distance = c(0, 167155),
    nearestTargetGeneAnchorB = c(
      "chr1:248838178-248849517|+|ZNF672,chr1:248826377-248826443|+|MIR3124,chr1:248810452-248825955|-|SH3BP5L",
      "chr1:248323630-248324568|-|OR2M7,chr1:248294616-248295578|-|OR2T12"
    )
  ),
  class = c("tbl_df", "tbl", "data.frame"),
  # Compact form for automatic row names 1..2.
  row.names = c(NA, -2L)
)
如何将表格转换成这样?
# Desired output: one row per gene. `id`, `distance` and the full anchor
# string are repeated for every gene of the same input row, and the new
# fourth column holds the extracted gene symbol.
fixed <- structure(
  list(
    id = c(1, 1, 1, 2, 2),
    distance = c(0, 0, 0, 167155, 167155),
    nearestTargetGeneAnchorB = c(
      "chr1:248838178-248849517|+|ZNF672,chr1:248826377-248826443|+|MIR3124,chr1:248810452-248825955|-|SH3BP5L",
      "chr1:248838178-248849517|+|ZNF672,chr1:248826377-248826443|+|MIR3124,chr1:248810452-248825955|-|SH3BP5L",
      "chr1:248838178-248849517|+|ZNF672,chr1:248826377-248826443|+|MIR3124,chr1:248810452-248825955|-|SH3BP5L",
      "chr1:248323630-248324568|-|OR2M7,chr1:248294616-248295578|-|OR2T12",
      # Was "OR2T13": a typo, since this row repeats the same input string
      # as the row above.
      "chr1:248323630-248324568|-|OR2M7,chr1:248294616-248295578|-|OR2T12"
    ),
    # Last symbol corrected from "OR2T13" to match the input data.
    `...4` = c("ZNF672", "MIR3124", "SH3BP5L", "OR2M7", "OR2T12")
  ),
  class = c("tbl_df", "tbl", "data.frame"),
  # Compact form for automatic row names 1..5.
  row.names = c(NA, -5L)
)
# Split the comma-separated anchor column into one row per gene, then keep
# only the gene symbol that follows the last "|" of each entry.
library(dplyr) # mutate() and select() live here, not in tidyr
library(tidyr) # separate_rows()
original %>%
  separate_rows(nearestTargetGeneAnchorB, sep = ",") %>%
  # Remove everything up to and including the final "|". Unlike capturing
  # "\\w+", this keeps symbols that contain non-word characters (for
  # example "HLA-A") intact.
  mutate(variable = sub(".*\\|", "", nearestTargetGeneAnchorB)) %>%
  select(-nearestTargetGeneAnchorB)
# A tibble: 5 × 3
id distance variable
<dbl> <dbl> <chr>
1 1 0 ZNF672
2 1 0 MIR3124
3 1 0 SH3BP5L
4 2 167155 OR2M7
5 2 167155 OR2T12