我想找到一个图案的位置,并对其进行过滤,我想找一个函数来返回图案 "gaaaa "在每行30到34之间的起始位置。
我正在寻找一个函数来返回模式 "gaaaa "在30和34之间的每一行的起始位置。
我解释一下,目前这里是我用函数str_locate_all得到的结果。
library(stringr)
Sequence <- data.frame(All = c("ggcgaagcagugcucccaguguuuuagagcuagaaauagcaaguuaaaauaaggcuaguccguuaucaacuugaaaaaguggcaccgagucggugcuu",
"aggacaacucgcuccacggccguuuuagagcuagaaauagcaaguuaaaauaaggcuaguccguuaucaacuugaaaaaguggcaccgagucggugcuu",
"cugaaauggcagcagaaacguuuuagagcuagaaauagcaaguuaaaauaaggcuaguccguuaucaacuugaaaaaguggcaccgagucggugcaacaaa",
"ggucaaagaggaggagcucguuuuagagcuagaaauagcaaguuaaaauaaggcuaguccguuaucaacuugaaaaaguggcaccgagucggugcuu"))
str_locate_all(pattern = 'gaaa', Sequence$All)
[[1]]
start end
[1,] 33 36
[2,] 73 76
[[2]]
start end
[1,] 34 37
[2,] 74 77
[[3]]
start end
[1,] 3 6
[2,] 15 18
[3,] 32 35
[4,] 72 75
[[4]]
start end
[1,] 32 35
[2,] 72 75
这是我想得到的结果。
start
1 33
2 34
3 32
4 32
谢谢你!
Sequence$start <-
sapply(str_locate_all(pattern = 'gaaa', Sequence$All),
function(z) { ind <- which(30 <= z[,1] & z[,1] <= 34); if (length(ind)) z[ind[1],1] else NA })
Sequence[,2,drop=FALSE]
# start
# 1 33
# 2 34
# 3 32
# 4 32
一个 dplyr
和 purrr
解决办法可能是。
map_dfr(.x = str_locate_all(pattern = "gaaa", Sequence$All),
~ as.data.frame(.x) %>%
filter(start %in% c(30:34)),
.id = "ID")
ID start end
1 1 33 36
2 2 34 37
3 3 32 35
4 4 32 35
这里有一个方法。它使用的是 str_locate_all
问题中的指令,并将其过滤在 lapply
循环。
found <- str_locate_all(pattern = 'gaaa', Sequence$All)
found <- lapply(found, function(x){
y <- x[, 'start']
data.frame(start = y[y >= 30 & y <= 34])
})
do.call(rbind, found)
# start
#1 33
#2 34
#3 32
#4 32
这里是另一种方法。它只搜索原始字符串的一个子字符串。
first <- 30
last <- 34
tmp <- substr(Sequence$All, first, last + nchar('gaaa') - 1)
data.frame(start = str_locate(pattern = 'gaaa', tmp)[, 1] + first - 1)
以下是目前已有的3个答案的时间。r2evans矿区和 tmfmnk.
我只发输入量较大的结果,因为这才是应该重要的时机。
library(stringr)
library(dplyr)
library(purrr)
r2evans <- function(){
Sequence$start <-
sapply(str_locate_all(pattern = 'gaaa', Sequence$All),
function(z) { ind <- which(30 <= z[,1] & z[,1] <= 34); if (length(ind)) z[ind[1],1] else NA })
Sequence[,2,drop=FALSE]
}
rui <- function(){
first <- 30
last <- 34
tmp <- substr(Sequence$All, first, last + nchar('gaaa') - 1)
data.frame(start = str_locate(pattern = 'gaaa', tmp)[, 1] + first - 1)
}
tmfmnk <- function(){
map_dfr(.x = str_locate_all(pattern = "gaaa", Sequence$All),
~ as.data.frame(.x) %>%
filter(start %in% c(30:34)),
.id = "ID")
}
library(microbenchmark)
for(i in 1:8) Sequence <- rbind(Sequence, Sequence)
dim(Sequence)
#[1] 1024 1
mb <- microbenchmark(
revans = f1(),
rui = f2()
#tmfmnk = f3()
)
print(mb, unit = 'relative', order = 'median')
#Unit: relative
# expr min lq mean median uq max neval
# rui 1.00000 1.00000 1.00000 1.00000 1.00000 1.00000 100
# r2evans 19.66135 17.52724 16.28008 15.47317 16.20747 5.60779 100
# tmfmnk 1529.51644 1235.86285 1079.56958 1073.49131 1072.39265 317.95638 100