查找模式并过滤起始位置

问题描述 投票:0回答:2

我想找到一个图案的位置,并对其进行过滤,我想找一个函数来返回图案 "gaaaa "在每行30到34之间的起始位置。

我正在寻找一个函数来返回模式 "gaaaa "在30和34之间的每一行的起始位置。

我解释一下,目前这里是我用函数str_locate_all得到的结果。

library(stringr)
Sequence <- data.frame(All = c("ggcgaagcagugcucccaguguuuuagagcuagaaauagcaaguuaaaauaaggcuaguccguuaucaacuugaaaaaguggcaccgagucggugcuu",
"aggacaacucgcuccacggccguuuuagagcuagaaauagcaaguuaaaauaaggcuaguccguuaucaacuugaaaaaguggcaccgagucggugcuu",
"cugaaauggcagcagaaacguuuuagagcuagaaauagcaaguuaaaauaaggcuaguccguuaucaacuugaaaaaguggcaccgagucggugcaacaaa",
"ggucaaagaggaggagcucguuuuagagcuagaaauagcaaguuaaaauaaggcuaguccguuaucaacuugaaaaaguggcaccgagucggugcuu"))
str_locate_all(pattern = 'gaaa', Sequence$All)

[[1]]
     start end
[1,]    33  36
[2,]    73  76

[[2]]
     start end
[1,]    34  37
[2,]    74  77

[[3]]
     start end
[1,]     3   6
[2,]    15  18
[3,]    32  35
[4,]    72  75

[[4]]
     start end
[1,]    32  35
[2,]    72  75

这是我想得到的结果。

       start
1         33
2         34
3         32
4         32

谢谢你!

r design-patterns filter location
2个回答
1
投票
Sequence$start <- 
  sapply(str_locate_all(pattern = 'gaaa', Sequence$All),
         function(z) { ind <- which(30 <= z[,1] & z[,1] <= 34); if (length(ind)) z[ind[1],1] else NA })
Sequence[,2,drop=FALSE]
#   start
# 1    33
# 2    34
# 3    32
# 4    32

1
投票

一个 dplyrpurrr 解决办法可能是。

map_dfr(.x = str_locate_all(pattern = "gaaa", Sequence$All),
        ~ as.data.frame(.x) %>%
         filter(start %in% c(30:34)),
        .id = "ID")

  ID start end
1  1    33  36
2  2    34  37
3  3    32  35
4  4    32  35

1
投票

这里有一个方法。它使用的是 str_locate_all 问题中的指令,并将其过滤在 lapply 循环。

found <- str_locate_all(pattern = 'gaaa', Sequence$All)
found <- lapply(found, function(x){
  y <- x[, 'start']
  data.frame(start = y[y >= 30 & y <= 34])
})
do.call(rbind, found)
#  start
#1    33
#2    34
#3    32
#4    32

这里是另一种方法。它只搜索原始字符串的一个子字符串。

first <- 30
last <- 34
tmp <- substr(Sequence$All, first, last + nchar('gaaa') - 1)
data.frame(start = str_locate(pattern = 'gaaa', tmp)[, 1] + first - 1)

时序

以下是目前已有的3个答案的时间。r2evans矿区和 tmfmnk.

我只发输入量较大的结果,因为这才是应该重要的时机。

library(stringr)
library(dplyr)
library(purrr)

r2evans <- function(){
  Sequence$start <-
    sapply(str_locate_all(pattern = 'gaaa', Sequence$All),
           function(z) { ind <- which(30 <= z[,1] & z[,1] <= 34); if (length(ind)) z[ind[1],1] else NA })
  Sequence[,2,drop=FALSE]
}

rui <- function(){
  first <- 30
  last <- 34
  tmp <- substr(Sequence$All, first, last + nchar('gaaa') - 1)
  data.frame(start = str_locate(pattern = 'gaaa', tmp)[, 1] + first - 1)
}

tmfmnk <- function(){
  map_dfr(.x = str_locate_all(pattern = "gaaa", Sequence$All),
          ~ as.data.frame(.x) %>%
            filter(start %in% c(30:34)),
          .id = "ID")
}


library(microbenchmark)

for(i in 1:8) Sequence <- rbind(Sequence, Sequence)
dim(Sequence)
#[1] 1024    1

mb <- microbenchmark(
  revans = f1(),
  rui = f2()
  #tmfmnk = f3()
)
print(mb, unit = 'relative', order = 'median')
#Unit: relative
#    expr        min         lq       mean     median         uq       max neval
#     rui    1.00000    1.00000    1.00000    1.00000    1.00000   1.00000   100
# r2evans   19.66135   17.52724   16.28008   15.47317   16.20747   5.60779   100
#  tmfmnk 1529.51644 1235.86285 1079.56958 1073.49131 1072.39265 317.95638   100
© www.soinside.com 2019 - 2024. All rights reserved.