修改 data.table 中的列:用最后一个有效标记替换字符串

问题描述 投票:0回答:1

我想修改 data.table 中的一列:把每个字符串替换为其最后一个有效标记;如果最后一个标记缺失,则该行保留为 NA。

可重现的示例:

# Reproducible input: 15 rows, one per Team / Player / Year combination.
# `Str` holds "|"-separated tokens. The class attribute is set by hand (dput
# style), so the object is only usable as a data.table once the package is
# loaded.
input_DT <- structure(
  list(
    Team = rep(c("T1", "T2"), times = c(10L, 5L)),
    Player = c(
      "T1.P1", "T1.P2", "T1.P3", "T1.P4", "T1.P5",
      "T1.P1", "T1.P2", "T1.P3", "T1.P4", "T1.P5",
      "T2.P1", "T2.P2", "T2.P3", "T2.P4", "T2.P5"
    ),
    Year = rep(c(2011L, 2012L, 2011L), each = 5L),
    Str = c(
      # T1, 2011
      "a bin|a 3 2|a north|an|a c d|a bin",
      "a bin|a 3 2|a north|an|a c d|a 3 2",
      "a bin|a 3 2|a north|an|a c d|a north",
      "a bin|a 3 2|a north|an|a c d|an",
      "a bin|a 3 2|a north|an|a c d|a c d",
      # T1, 2012
      "a bin|a north|an|a bin",
      "a bin|a north|an",
      "a bin|a north|an|a north",
      "a bin|a north|an|an",
      "a bin|a north|an",
      # T2, 2011
      "an|bin|4eight|12|an",
      "an|bin|4eight|12",
      "an|bin|4eight|12|bin",
      "an|bin|4eight|12|4eight",
      "an|bin|4eight|12|12"
    )
  ),
  row.names = c(NA, -15L),
  class = c("data.table", "data.frame")
)

> print(input_DT)
      Team Player  Year                                  Str
    <char> <char> <int>                               <char>
 1:     T1  T1.P1  2011   a bin|a 3 2|a north|an|a c d|a bin
 2:     T1  T1.P2  2011   a bin|a 3 2|a north|an|a c d|a 3 2
 3:     T1  T1.P3  2011 a bin|a 3 2|a north|an|a c d|a north
 4:     T1  T1.P4  2011      a bin|a 3 2|a north|an|a c d|an
 5:     T1  T1.P5  2011   a bin|a 3 2|a north|an|a c d|a c d
 6:     T1  T1.P1  2012               a bin|a north|an|a bin
 7:     T1  T1.P2  2012                     a bin|a north|an
 8:     T1  T1.P3  2012             a bin|a north|an|a north
 9:     T1  T1.P4  2012                  a bin|a north|an|an
10:     T1  T1.P5  2012                     a bin|a north|an
11:     T2  T2.P1  2011                  an|bin|4eight|12|an
12:     T2  T2.P2  2011                     an|bin|4eight|12
13:     T2  T2.P3  2011                 an|bin|4eight|12|bin
14:     T2  T2.P4  2011              an|bin|4eight|12|4eight
15:     T2  T2.P5  2011                  an|bin|4eight|12|12

# Expected result: the same Team / Player / Year rows as input_DT, with `Var`
# holding the last "|"-separated token of `Str`.
# Rows 7, 10 and 12 are NA_character_ (not ""): their strings are one token
# short of the longest string in their Team/Year group, so the last token is
# treated as missing.
output_DT <- structure(
  list(
    Team = c(
      "T1", "T1", "T1", "T1", "T1",
      "T1", "T1", "T1", "T1", "T1",
      "T2", "T2", "T2", "T2", "T2"
    ),
    Player = c(
      "T1.P1", "T1.P2", "T1.P3", "T1.P4", "T1.P5",
      "T1.P1", "T1.P2", "T1.P3", "T1.P4", "T1.P5",
      "T2.P1", "T2.P2", "T2.P3", "T2.P4", "T2.P5"
    ),
    Year = c(
      2011L, 2011L, 2011L, 2011L, 2011L,
      2012L, 2012L, 2012L, 2012L, 2012L,
      2011L, 2011L, 2011L, 2011L, 2011L
    ),
    Var = c(
      "a bin", "a 3 2", "a north", "an", "a c d",
      "a bin", NA_character_, "a north", "an", NA_character_,
      "an", NA_character_, "bin", "4eight", "12"
    )
  ),
  row.names = c(NA, -15L),
  class = c("data.table", "data.frame")
)

> print(output_DT)
      Team Player  Year     Var
    <char> <char> <int>  <char>
 1:     T1  T1.P1  2011   a bin
 2:     T1  T1.P2  2011   a 3 2
 3:     T1  T1.P3  2011 a north
 4:     T1  T1.P4  2011      an
 5:     T1  T1.P5  2011   a c d
 6:     T1  T1.P1  2012   a bin
 7:     T1  T1.P2  2012      NA
 8:     T1  T1.P3  2012 a north
 9:     T1  T1.P4  2012      an
10:     T1  T1.P5  2012      NA
11:     T2  T2.P1  2011      an
12:     T2  T2.P2  2011      NA
13:     T2  T2.P3  2011     bin
14:     T2  T2.P4  2011  4eight
15:     T2  T2.P5  2011      12

备注:

  • 在 input_DT 中,“Str”列中的字符串包含每个“Team”中每个“Year”的所有标记,通过“|”连接;
  • “Str”列 (input_DT) 中每个字符串的最后一个标记,就是要保留在“Var”列 (output_DT) 中的有效标记,除非该字符串没有达到最大“长度”(这里的“长度”指同一团队、同一年份内各字符串所含标记数的最大值),也就是缺少最后一个标记。在这种情况下,“Var”为 NA(参见 output_DT 中的第 7、10 和 12 行)。

我尝试过的:

# Attempt: keep the last "|"-separated token of each string, or NA when the
# string has fewer "|" separators than the maximum.
# NOTE: every call below receives the whole column `input_DT$Str` and there is
# no `by =`, so max() is taken over all 15 rows instead of per Team/Year.
# Only rows that reach the global maximum therefore receive a token.
output_DT$Var <- 
  input_DT[,
           data.table::fifelse(
             # TRUE where the row's separator count equals the overall maximum
             stringr::str_count(input_DT$Str,"\\|")==max(stringr::str_count(input_DT$Str,"\\|")),
             # the unnamed -1 is word()'s `start`; a negative index counts
             # from the end, so this picks the last token
             stringr::word(input_DT$Str,
                  sep = stringr::fixed("|"),-1
             ),NA_character_
           )
  ]

> print(output_DT)
    Team Player Year     Var
 1:   T1  T1.P1 2011   a bin
 2:   T1  T1.P2 2011   a 3 2
 3:   T1  T1.P3 2011 a north
 4:   T1  T1.P4 2011      an
 5:   T1  T1.P5 2011   a c d
 6:   T1  T1.P1 2012    <NA>
 7:   T1  T1.P2 2012    <NA>
 8:   T1  T1.P3 2012    <NA>
 9:   T1  T1.P4 2012    <NA>
10:   T1  T1.P5 2012    <NA>
11:   T2  T2.P1 2011    <NA>
12:   T2  T2.P2 2011    <NA>
13:   T2  T2.P3 2011    <NA>
14:   T2  T2.P4 2011    <NA>
15:   T2  T2.P5 2011    <NA> 

在我尝试的方式中,最大字符串“长度”(即字符串中标记数的最大值)是基于“Str”列中的所有字符串计算的,而不是按每个团队和年份分别计算的。

提前致谢。

r string data.table token
1个回答
0
投票

包含多个步骤的解决方案可以接受吗?

library(data.table)
library(tidyverse)
# Sample data: 15 rows, one per Team / Player / Year combination. `Str` is a
# "|"-separated list of tokens.
input_DT <- structure(
  list(
    Team = rep(c("T1", "T2"), times = c(10L, 5L)),
    Player = c(
      "T1.P1", "T1.P2", "T1.P3", "T1.P4", "T1.P5",
      "T1.P1", "T1.P2", "T1.P3", "T1.P4", "T1.P5",
      "T2.P1", "T2.P2", "T2.P3", "T2.P4", "T2.P5"
    ),
    Year = rep(c(2011L, 2012L, 2011L), each = 5L),
    Str = c(
      # T1, 2011
      "a bin|a 3 2|a north|an|a c d|a bin",
      "a bin|a 3 2|a north|an|a c d|a 3 2",
      "a bin|a 3 2|a north|an|a c d|a north",
      "a bin|a 3 2|a north|an|a c d|an",
      "a bin|a 3 2|a north|an|a c d|a c d",
      # T1, 2012
      "a bin|a north|an|a bin",
      "a bin|a north|an",
      "a bin|a north|an|a north",
      "a bin|a north|an|an",
      "a bin|a north|an",
      # T2, 2011
      "an|bin|4eight|12|an",
      "an|bin|4eight|12",
      "an|bin|4eight|12|bin",
      "an|bin|4eight|12|4eight",
      "an|bin|4eight|12|12"
    )
  ),
  row.names = c(NA, -15L),
  class = c("data.table", "data.frame")
)

# A string with k "|" separators holds k + 1 tokens, so comparing separator
# counts is equivalent to comparing token counts.

# Step 1: largest separator count within each Team/Year group.
CountPipes <- input_DT[,
  .(maxPipes = max(str_count(Str, "\\|"))),
  by = c("Team", "Year")
]

# Step 2: attach the group maximum to every row. The join columns are spelled
# out so the merge does not depend on whichever column names the two tables
# happen to share.
input_DT2 <- merge(input_DT, CountPipes, by = c("Team", "Year"))

# Step 3: separator count of each individual string.
input_DT2[, Pipes := str_count(Str, "\\|")]

# Step 4: keep the last token (everything after the final "|") only for rows
# that reach their group's maximum; all other rows are missing their last
# token. NA_character_ keeps both branches of fifelse() the same type instead
# of relying on coercion of a logical NA.
input_DT2[,
  Var := fifelse(
    Pipes == maxPipes,
    str_extract(Str, "[^|]+$"),
    NA_character_
  )
]
input_DT2
#> Key: <Team, Year>
#>       Team  Year Player                                  Str maxPipes Pipes
#>     <char> <int> <char>                               <char>    <int> <int>
#>  1:     T1  2011  T1.P1   a bin|a 3 2|a north|an|a c d|a bin        5     5
#>  2:     T1  2011  T1.P2   a bin|a 3 2|a north|an|a c d|a 3 2        5     5
#>  3:     T1  2011  T1.P3 a bin|a 3 2|a north|an|a c d|a north        5     5
#>  4:     T1  2011  T1.P4      a bin|a 3 2|a north|an|a c d|an        5     5
#>  5:     T1  2011  T1.P5   a bin|a 3 2|a north|an|a c d|a c d        5     5
#>  6:     T1  2012  T1.P1               a bin|a north|an|a bin        3     3
#>  7:     T1  2012  T1.P2                     a bin|a north|an        3     2
#>  8:     T1  2012  T1.P3             a bin|a north|an|a north        3     3
#>  9:     T1  2012  T1.P4                  a bin|a north|an|an        3     3
#> 10:     T1  2012  T1.P5                     a bin|a north|an        3     2
#> 11:     T2  2011  T2.P1                  an|bin|4eight|12|an        4     4
#> 12:     T2  2011  T2.P2                     an|bin|4eight|12        4     3
#> 13:     T2  2011  T2.P3                 an|bin|4eight|12|bin        4     4
#> 14:     T2  2011  T2.P4              an|bin|4eight|12|4eight        4     4
#> 15:     T2  2011  T2.P5                  an|bin|4eight|12|12        4     4
#>         Var
#>      <char>
#>  1:   a bin
#>  2:   a 3 2
#>  3: a north
#>  4:      an
#>  5:   a c d
#>  6:   a bin
#>  7:    <NA>
#>  8: a north
#>  9:      an
#> 10:    <NA>
#> 11:      an
#> 12:    <NA>
#> 13:     bin
#> 14:  4eight
#> 15:      12

创建于 2024 年 10 月 17 日,使用 reprex v2.1.1

© www.soinside.com 2019 - 2024. All rights reserved.