我有以下结构的巨大文件(> 25 MB):
ITEM: TIMESTEP
0
ITEM: NUMBER OF ATOMS
14748
ITEM: BOX BOUNDS ss ss ss
-1.3314357502021994e+02 1.1517122459132779e+02
-1.3499049172495816e+02 1.5089532983410831e+02
-4.1105766276524565e+02 4.1825892946863183e+02
ITEM: ATOMS id x y z c_q[1] c_q[2] c_q[3] c_q[4] c_shape[1] c_shape[2] c_shape[3] mol q type
40 -48.7871 -80.7758 -383.704 0 0 0 0 0 0 0 1 1 2
41 -51.0027 -77.7928 -384.047 0 0 0 0 0 0 0 1 0 39
42 -52.2822 -74.6121 -381.792 0 0 0 0 0 0 0 1 1 22
43 -53.6295 -75.5845 -378.205 0 0 0 0 0 0 0 1 0 35
44 -57.267 -76.4807 -378.221 0 0 0 0 0 0 0 1 0 28
45 -57.4497 -77.4334 -381.768 0 0 0 0 0 0 0 1 0 37
46 -56.4146 -80.8904 -381.67 0 0 0 0 0 0 0 1 0 40
47 -58.4181 -80.8895 -378.383 0 0 0 0 0 0 0 1 0 21
48 -61.5076 -80.3974 -380.283 0 0 0 0 0 0 0 1 0 31
49 -60.9285 -83.196 -382.86 0 0 0 0 0 0 0 1 1 22
50 -59.8865 -85.4222 -379.651 0 0 0 0 0 0 0 1 -1 27
51 -63.3418 -84.4792 -378.177 0 0 0 0 0 0 0 1 0 30
52 -64.9464 -85.538 -381.729 0 0 0 0 0 0 0 1 1 22
53 -62.8596 -88.8378 -381.581 0 0 0 0 0 0 0 1 1 22
54 -63.7873 -90.0464 -377.83 0 0 0 0 0 0 0 1 0 39
55 -67.4433 -88.4767 -378.467 0 0 0 0 0 0 0 1 0 26
56 -67.8255 -91.1638 -381.32 0 0 0 0 0 0 0 1 1 32
57 -66.194 -94.1545 -379.992 0 0 0 0 0 0 0 1 0 36
58 -68.004 -96.1613 -376.969 0 0 0 0 0 0 0 1 0 37
59 -65.5268 -97.9105 -375.094 0 0 0 0 0 0 0 1 -1 27
60 -65.3757 -98.4001 -371.262 0 0 0 0 0 0 0 1 0 31
...
17627 49.6773 50.5787 392.185 0 0 0 0 0 0 0 1992 -1 42
17628 45.488 56.5193 406.41 0 0 0 0 0 0 0 1992 -1 42
有进一步的部分,用于随后的时间步骤,它们都以相同的方式开始,例如这是下一个
ITEM: TIMESTEP
100000
ITEM: NUMBER OF ATOMS
14748
ITEM: BOX BOUNDS ss ss ss
-1.4837772831937818e+02 1.3859288720579670e+02
-1.4339607509329937e+02 1.6977859136366405e+02
-4.8020962419356266e+02 5.1578309694876850e+02
ITEM: ATOMS id x y z c_q[1] c_q[2] c_q[3] c_q[4] c_shape[1] c_shape[2] c_shape[3] mol q type
40 -70.2744 -123.319 -430.593 0 0 0 0 0 0 0 1 1 2
41 -71.6868 -119.878 -432.341 0 0 0 0 0 0 0 1 0 39
42 -70.2556 -116.45 -430.652 0 0 0 0 0 0 0 1 1 22
43 -71.1458 -115.417 -427.107 0 0 0 0 0 0 0 1 0 35
44 -74.6494 -113.816 -426.925 0 0 0 0 0 0 0 1 0 28
45 -76.0197 -115.121 -429.907 0 0 0 0 0 0 0 1 0 37
46 -77.4939 -118.11 -428.852 0 0 0 0 0 0 0 1 0 40
47 -78.461 -116.067 -425.641 0 0 0 0 0 0 0 1 0 21
48 -80.6403 -113.891 -427.878 0 0 0 0 0 0 0 1 0 31
49 -82.5667 -116.737 -429.45 0 0 0 0 0 0 0 1 1 22
50 -82.8208 -118.067 -425.623 0 0 0 0 0 0 0 1 -1 27
51 -84.3263 -114.418 -424.728 0 0 0 0 0 0 0 1 0 30
52 -86.7863 -115.369 -427.978 0 0 0 0 0 0 0 1 1 22
53 -87.636 -118.593 -426.541 0 0 0 0 0 0 0 1 1 22
54 -88.4756 -117.592 -422.645 0 0 0 0 0 0 0 1 0 39
55 -90.0285 -114.297 -424.2 0 0 0 0 0 0 0 1 0 26
56 -92.7283 -116.552 -425.923 0 0 0 0 0 0 0 1 1 32
57 -93.5545 -119.262 -423.443 0 0 0 0 0 0 0 1 0 36
58 -95.5213 -118.131 -420.219 0 0 0 0 0 0 0 1 0 37
59 -94.5936 -120.806 -417.701 0 0 0 0 0 0 0 1 -1 27
60 -94.3178 -120.163 -413.806 0 0 0 0 0 0 0 1 0 31
...
我想将整个文件读为一个大蒂布尔,其中time_step作为附加列,即带有colnames:
time_step, ATOMS_id, x, y, z, c_q[1], c_q[2], c_q[3], c_q[4], c_shape[1], c_shape[2], c_shape[3], mol, q, type
。 (原子和框边界的数量在这里不重要。)
我可以用像
这样的东西“手动”library(dplyr)
library(stringr)
all_dumps <- tibble (
condition = character(),
timestep = numeric(),
atom_id = numeric(),
atom_type = numeric(),
mol_id = numeric(),
x = numeric(),
y = numeric(),
z = numeric()
)
raw_lines <- readLines('dna_temper_A.dump')
for (line in raw_lines) {
if (line != "ITEM: TIMESTEP") {
line_count_in_block = line_count_in_block + 1
if (line_count_in_block == 1) {
time_step = line
}
else if (line_count_in_block > 8) {
line_split <- unlist(str_split(line, ' ')) %>% as.numeric()
all_dumps <- add_row (
all_dumps,
timestep = as.numeric(time_step),
atom_id = line_split[1],
atom_type = line_split[14],
mol_id = line_split[12],
x = line_split[2],
y = line_split[3],
z = line_split[4]
)
}
}
else {
line_count_in_block = 0
}
}
(仅保留我真正想稍后再使用的那些列。)
,这是超级缓慢/效率低下的! 是否有更好的方法来执行此操作(以真正的泰迪尔风格)?
lines <- strsplit(raw_lines, "\n")[[1]]
loc_timestep <- grep("ITEM: TIMESTEP", lines)
loc_atoms <- grep("ITEM: ATOMS id x y z", lines)
timesteps <- lines[loc_timestep + 1]
loc_starts <- loc_atoms + 1
# each table ends before next "TIMESTEP", except last table ends at end of file
loc_ends <- c(loc_timestep[-1] - 1, length(lines))
loc_tbl_lines <- mapply(seq, loc_starts, loc_ends) |>
unlist()
len_tbl <- loc_ends - loc_starts + 1
# add column names
# loc_tbl_lines <- c(loc_atoms[[1]], loc_tbl_lines)
tbl_lines <- lines[loc_tbl_lines] |>
paste(collapse = "\n")
col_names <- gsub("ITEM: ATOMS ", "", lines[[loc_atoms[[1]]]])
col_names <- strsplit(col_names, "\\s+")[[1]]
dat <- read.table(
text = tbl_lines,
col.names = col_names
)
dat <- cbind(timestep = rep(timesteps, len_tbl), dat)
恢复:
#> head(dat)
timestep id x y z c_q.1. c_q.2. c_q.3. c_q.4. c_shape.1.
1 0 40 -48.7871 -80.7758 -383.704 0 0 0 0 0
2 0 41 -51.0027 -77.7928 -384.047 0 0 0 0 0
3 0 42 -52.2822 -74.6121 -381.792 0 0 0 0 0
4 0 43 -53.6295 -75.5845 -378.205 0 0 0 0 0
5 0 44 -57.2670 -76.4807 -378.221 0 0 0 0 0
6 0 45 -57.4497 -77.4334 -381.768 0 0 0 0 0
c_shape.2. c_shape.3. mol q type
1 0 0 1 1 2
2 0 0 1 0 39
3 0 0 1 1 22
4 0 0 1 0 35
5 0 0 1 0 28
6 0 0 1 0 37
#> tail(dat)
timestep id x y z c_q.1. c_q.2. c_q.3. c_q.4.
39 100000 55 -90.0285 -114.297 -424.200 0 0 0 0
40 100000 56 -92.7283 -116.552 -425.923 0 0 0 0
41 100000 57 -93.5545 -119.262 -423.443 0 0 0 0
42 100000 58 -95.5213 -118.131 -420.219 0 0 0 0
43 100000 59 -94.5936 -120.806 -417.701 0 0 0 0
44 100000 60 -94.3178 -120.163 -413.806 0 0 0 0
c_shape.1. c_shape.2. c_shape.3. mol q type
39 0 0 0 1 0 26
40 0 0 0 1 1 32
41 0 0 0 1 0 36
42 0 0 0 1 0 37
43 0 0 0 1 -1 27
44 0 0 0 1 0 31