我想使用
interval_left_join
对两个时间对齐的数据帧进行精确连接:
df1 <- structure(list(Utterance = c("(5.127)", ">like how old 's your mom¿ ",
"(0.855)", "eh six:ty:::-one= ", "(0.101)", "(0.166)", "=NOW ",
"(0.622)", "!how! this always plays out ", "(0.726)", "[when was] that¿= ",
"[yes] ", "(0.163)", "=!this! was on °Wednesday° ", "(0.273)",
"oka[y] "), Starttime_ms = c(0, 5127, 6830, 7685, 9889, 176800,
176966, 177372, 177994, 179328, 180054, 180135, 180668, 180831,
181720, 181993), Endtime_ms = c(5127, 6830, 7685, 9889, 9990,
176966, 177372, 177994, 179328, 180054, 180668, 180555, 180831,
181720, 181993, 182398)), row.names = c(NA, -16L), class = c("tbl_df",
"tbl", "data.frame"))
df2 <- structure(list(Q_segment = c(">like I don 't understand< ", "sorry like how old 's your mom¿",
"[when was] that¿=", "=[did she let you in?]", "=holy [sh:]i:[:t",
"does it happen] often? ", "[like] ", "[what] type of:: tours is it ",
"is it [(like a long] ti:me¿", "[or]"), Starttime_ms = c(5127,
5963, 180054, 253226, 345450, 347454, 348459, 478500, 480669,
481763), Endtime_ms = c(5963, 6830, 180668, 254156, 347454, 348459,
348724, 480669, 481763, 482000)), row.names = c(NA, -10L), class = c("tbl_df",
"tbl", "data.frame"))
连接这两个ata帧会产生比预期更多的连接,大概是因为当interval_left_join
的其中一个和
Starttime_ms
的
Endtime_ms
与
df2
或
Endtime_ms
的第一毫秒相同时,Starttime_ms
连接df2
的:
# install package "IRanges":
# if (!requireNamespace("BiocManager", quietly = TRUE))
# install.packages("BiocManager")
#
# BiocManager::install("IRanges")
library(BiocManager)
library(fuzzyjoin)
library(data.table)
interval_left_join(x = df1,
y = df2,
by = c("Starttime_ms", "Endtime_ms"))
# A tibble: 17 × 6
Utterance Starttime_ms.x Endtime_ms.x Q_segment Starttime_ms.y Endtime_ms.y
<chr> <dbl> <dbl> <chr> <dbl> <dbl>
1 "(5.127)" 0 5127 >like 5127 5963
2 ">like how old 's your mom¿ " 5127 6830 >like 5127 5963
3 ">like how old 's your mom¿ " 5127 6830 how old 's your mom¿ 5963 6830
4 "(0.855)" 6830 7685 how old 's your mom¿ 5963 6830
5 "eh six:ty:::-one= " 7685 9889 NA NA NA
6 "(0.101)" 9889 9990 NA NA NA
7 "(0.166)" 176800 176966 NA NA NA
8 "=NOW " 176966 177372 NA NA NA
9 "(0.622)" 177372 177994 NA NA NA
10 "!how! this always plays out " 177994 179328 NA NA NA
11 "(0.726)" 179328 180054 [when was] that¿= 180054 180668
12 "[when was] that¿= " 180054 180668 [when was] that¿= 180054 180668
13 "[yes] " 180135 180555 [when was] that¿= 180054 180668
14 "(0.163)" 180668 180831 [when was] that¿= 180054 180668
15 "=!this! was on °Wednesday° " 180831 181720 NA NA NA
16 "(0.273)" 181720 181993 NA NA NA
17 "oka[y] " 181993 182398 NA NA NA
如何更精确地连接两个数据框,使得Starttime_ms
的两者
Endtime_ms
和df2
落入
df1
各自的时间间隔?
期望的结果是这样的:
Utterance Starttime_ms.x Endtime_ms.x Q_segment Starttime_ms.y Endtime_ms.y
<chr> <dbl> <dbl> <chr> <dbl> <dbl>
1 "(5.127)" 0 5127 NA NA NA
2 ">like how old 's your mom¿ " 5127 6830 >like 5127 5963
3 ">like how old 's your mom¿ " 5127 6830 how old 's your mom¿ 5963 6830
4 "(0.855)" 6830 7685 NA NA NA
5 "eh six:ty:::-one= " 7685 9889 NA NA NA
6 "(0.101)" 9889 9990 NA NA NA
7 "(0.166)" 176800 176966 NA NA NA
8 "=NOW " 176966 177372 NA NA NA
9 "(0.622)" 177372 177994 NA NA NA
10 "!how! this always plays out " 177994 179328 NA NA NA
11 "(0.726)" 179328 180054 NA NA NA
12 "[when was] that¿= " 180054 180668 [when was] that¿= 180054 180668
13 "[yes] " 180135 180555 [when was] that¿= 180054 180668
14 "(0.163)" 180668 180831 NA NA NA
15 "=!this! was on °Wednesday° " 180831 181720 NA NA NA
16 "(0.273)" 181720 181993 NA NA NA
17 "oka[y] " 181993 182398 NA NA NA
dplyr::join_by()
?它使您可以更好地控制您的加入。
df1 |>
dplyr::left_join(
df2,
dplyr::join_by(
y$Starttime_ms >= x$Starttime_ms,
y$Endtime_ms <= x$Endtime_ms
)
) |>
dplyr::select(1:4) # remove duplicate columns
#> # A tibble: 17 x 4
#> Utterance Starttime_ms.x Endtime_ms.x Q_segment
#> <chr> <dbl> <dbl> <chr>
#> 1 "(5.127)" 0 5127 <NA>
#> 2 ">like how old 's your mom¿ " 5127 6830 ">like I don 't ~
#> 3 ">like how old 's your mom¿ " 5127 6830 "sorry like how ~
#> 4 "(0.855)" 6830 7685 <NA>
#> 5 "eh six:ty:::-one= " 7685 9889 <NA>
#> 6 "(0.101)" 9889 9990 <NA>
#> 7 "(0.166)" 176800 176966 <NA>
#> 8 "=NOW " 176966 177372 <NA>
#> 9 "(0.622)" 177372 177994 <NA>
#> 10 "!how! this always plays out " 177994 179328 <NA>
#> 11 "(0.726)" 179328 180054 <NA>
#> 12 "[when was] that¿= " 180054 180668 "[when was] that~
#> 13 "[yes] " 180135 180555 <NA>
#> 14 "(0.163)" 180668 180831 <NA>
#> 15 "=!this! was on °Wednesday° " 180831 181720 <NA>
#> 16 "(0.273)" 181720 181993 <NA>
#> 17 "oka[y] " 181993 182398 <NA>