# Example input: a VCF-like table in which the header row (REF, ALT, INFO,
# <patient ID>) is repeated as ordinary data rows. The name of the fourth
# column, and the fourth value of each repeated header row, is a patient ID.
# The non-syntactic column name `2257588` must be quoted with backticks,
# otherwise the expression does not parse. The result is bound to `df`
# because the code further down reads the data under that name.
df <- structure(list(REF = c("T", "C", "T", "REF", "T", "C", "REF",
"G", "T"), ALT = c("G", "G", "C", "ALT", "G", "G", "ALT", "A",
"C"), INFO = c("AC=1;AF=0.500;AN=2;BaseQRankSum=2.56;DB;DP=394;ExcessHet=3.0103;FS=0.000;MLEAC=1;MLEAF=0.500;MQ=60.00;MQRankSum=0.00;QD=14.20;ReadPosRankSum=1.08;SOR=0.674",
"AC=1;AF=0.500;AN=2;BaseQRankSum=3.000e-03;DB;DP=263;ExcessHet=3.0103;FS=0.451;MLEAC=1;MLEAF=0.500;MQ=60.00;MQRankSum=0.00;QD=12.84;ReadPosRankSum=0.041;SOR=0.661",
"AC=2;AF=1.00;AN=2;DB;DP=405;ExcessHet=3.0103;FS=0.000;MLEAC=2;MLEAF=1.00;MQ=60.00;QD=33.41;SOR=0.858",
"INFO", "AC=1;AF=0.500;AN=2;BaseQRankSum=3.85;DB;DP=498;ExcessHet=3.0103;FS=0.000;MLEAC=1;MLEAF=0.500;MQ=60.00;MQRankSum=0.00;QD=13.45;ReadPosRankSum=1.05;SOR=0.736",
"AC=1;AF=0.500;AN=2;BaseQRankSum=1.11;DB;DP=350;ExcessHet=3.0103;FS=0.000;MLEAC=1;MLEAF=0.500;MQ=60.00;MQRankSum=0.00;QD=12.85;ReadPosRankSum=-4.640e-01;SOR=0.687",
"INFO", "AC=1;AF=0.500;AN=2;BaseQRankSum=-1.635e+00;DB;DP=467;ExcessHet=3.0103;FS=0.522;MLEAC=1;MLEAF=0.500;MQ=60.00;MQRankSum=0.00;QD=10.95;ReadPosRankSum=0.905;SOR=0.750",
"AC=2;AF=1.00;AN=2;BaseQRankSum=-5.710e-01;DB;DP=478;ExcessHet=3.0103;FS=0.000;MLEAC=2;MLEAF=1.00;MQ=60.00;MQRankSum=0.00;QD=31.82;ReadPosRankSum=-1.297e+00;SOR=0.369"
), `2257588` = c("0/1:186,205:391:99:0|1:11854457_G_A:5581,0,3667",
"0/1:134,125:259:99:3353,0,3693", "1/1:0,400:403:99:13391,1205,0",
"2257689", "0/1:255,240:495:99:6688,0,6524", "0/1:188,160:348:99:4501,0,5131",
"2258942", "0/1:256,207:463:99:5099,0,7033", "1/1:1,477:478:99:15237,1392,0"
)), row.names = c(NA, -9L), class = c("tbl_df", "tbl", "data.frame"
))
如您所见,列名作为普通的数据行在数据框中重复出现了多次。前三列的名称始终一致,但第四列的名称是患者 ID,我想把它提取出来作为一个新列。期望的输出如下:
# Desired result: the repeated header rows are gone, the genotype column is
# named VAR, and each row carries the patient ID taken from the header row
# that preceded it.
structure(
  list(
    REF = c("T", "C", "T", "T", "C", "G", "T"),
    ALT = c("G", "G", "C", "G", "G", "A", "C"),
    INFO = c(
      "AC=1;AF=0.500;AN=2;BaseQRankSum=2.56;DB;DP=394;ExcessHet=3.0103;FS=0.000;MLEAC=1;MLEAF=0.500;MQ=60.00;MQRankSum=0.00;QD=14.20;ReadPosRankSum=1.08;SOR=0.674",
      "AC=1;AF=0.500;AN=2;BaseQRankSum=3.000e-03;DB;DP=263;ExcessHet=3.0103;FS=0.451;MLEAC=1;MLEAF=0.500;MQ=60.00;MQRankSum=0.00;QD=12.84;ReadPosRankSum=0.041;SOR=0.661",
      "AC=2;AF=1.00;AN=2;DB;DP=405;ExcessHet=3.0103;FS=0.000;MLEAC=2;MLEAF=1.00;MQ=60.00;QD=33.41;SOR=0.858",
      "AC=1;AF=0.500;AN=2;BaseQRankSum=3.85;DB;DP=498;ExcessHet=3.0103;FS=0.000;MLEAC=1;MLEAF=0.500;MQ=60.00;MQRankSum=0.00;QD=13.45;ReadPosRankSum=1.05;SOR=0.736",
      "AC=1;AF=0.500;AN=2;BaseQRankSum=1.11;DB;DP=350;ExcessHet=3.0103;FS=0.000;MLEAC=1;MLEAF=0.500;MQ=60.00;MQRankSum=0.00;QD=12.85;ReadPosRankSum=-4.640e-01;SOR=0.687",
      "AC=1;AF=0.500;AN=2;BaseQRankSum=-1.635e+00;DB;DP=467;ExcessHet=3.0103;FS=0.522;MLEAC=1;MLEAF=0.500;MQ=60.00;MQRankSum=0.00;QD=10.95;ReadPosRankSum=0.905;SOR=0.750",
      "AC=2;AF=1.00;AN=2;BaseQRankSum=-5.710e-01;DB;DP=478;ExcessHet=3.0103;FS=0.000;MLEAC=2;MLEAF=1.00;MQ=60.00;MQRankSum=0.00;QD=31.82;ReadPosRankSum=-1.297e+00;SOR=0.369"
    ),
    VAR = c(
      "0/1:186,205:391:99:0|1:11854457_G_A:5581,0,3667",
      "0/1:134,125:259:99:3353,0,3693",
      "1/1:0,400:403:99:13391,1205,0",
      "0/1:255,240:495:99:6688,0,6524",
      "0/1:188,160:348:99:4501,0,5131",
      "0/1:256,207:463:99:5099,0,7033",
      "1/1:1,477:478:99:15237,1392,0"
    ),
    # Three rows for the first patient, two each for the second and third.
    ID = rep(c(2257588, 2257689, 2258942), times = c(3L, 2L, 2L))
  ),
  class = c("tbl_df", "tbl", "data.frame"),
  row.names = c(NA, -7L)
)
如果您能提供一些帮助,我将非常感激。
首先,重新导入数据,并且不把第一行当作标题(这样原来的列名也会变成普通的数据行,列名则变为 V1–V4):
# Round-trip the data through a temporary CSV file and read it back with
# header = FALSE: the original header line then comes back as an ordinary
# first data row, and the columns get the default names V1, V2, ...
tmp <- tempfile(fileext = ".csv")
write.csv(df, tmp, row.names = FALSE)
df1 <- read.csv(tmp, header = FALSE)
# The temporary file is no longer needed once it has been read back.
unlink(tmp)
然后,把标题行第四列中的患者 ID 提取到新列,并使用 tidyr::fill 将其向下填充到后面的数据行,最后删除重复的标题行:
library(dplyr)
library(tidyr)
# Turn the repeated header rows into an ID column.
# A header row is recognised by V1 == "REF"; its fourth value is the ID.
df2 <- df1 %>%
  # Convert only header rows to numeric. Converting every row would emit an
  # "NAs introduced by coercion" warning for the genotype strings and would
  # mistake any purely numeric data value for an ID.
  mutate(ID = as.numeric(if_else(V1 == "REF", V4, NA_character_))) %>%
  # Carry each ID down to the data rows that follow its header row.
  fill(ID) %>%
  # Drop the header rows now that their ID has been propagated.
  filter(V1 != "REF") %>%
  rename(
    REF = V1,
    ALT = V2,
    INFO = V3,
    VAR = V4
  )