我有这个数据:
simulated_states = c("A", "E", "B", "B", "A", "C", "D", "A", "B", "D", "A", "D",
"D", "E", "D", "D", "D", "E", "A", "A", "A", "B", "A", "C", "C",
"D", "A", "A", "D", "A", "D", "A", "A", "A", "C", "C", "D", "A",
"C", "C", "D", "E", "C", "C", "C", "E", "B", "A", "E", "E", "C",
"C", "D", "E", "C", "E", "E", "A", "E", "B", "A", "A", "E", "E",
"C", "E", "C", "C", "C", "D", "E", "D", "C", "D", "A", "B", "B",
"E", "B", "A", "E", "C", "C", "D", "B", "B", "A", "C", "B", "A",
"D", "A", "D", "E", "C", "D", "D", "A", "A", "C")
我知道如何计算转移概率:
calculate_transition_probs <- function(states) {
transitions <- data.frame(
from = states,
to = c(states[-1], NA)
)
transition_counts <- table(transitions, useNA = "always")
transition_df <- as.data.frame(transition_counts)
colnames(transition_df) <- c("from", "to", "count")
transition_df <- transition_df[!is.na(transition_df$to), ]
transition_df <- transition_df %>%
group_by(from) %>%
mutate(percent = count / sum(count) * 100) %>%
ungroup()
transition_df <- transition_df[, c("from", "to", "count", "percent")]
transition_df <- transition_df[order(transition_df$from, transition_df$to), ]
return(transition_df)
}
transition_probs <- calculate_transition_probs(simulated_states)
结果如下所示:
from to count percent
A A 7 26.923077
A B 3 11.538462
A C 6 23.076923
A D 5 19.230769
A E 5 19.230769
B A 7 58.333333
B B 3 25.000000
B C 0 0.000000
B D 1 8.333333
B E 1 8.333333
C A 0 0.000000
C B 1 4.545455
C C 9 40.909091
C D 9 40.909091
C E 3 13.636364
D A 9 42.857143
D B 1 4.761905
D C 1 4.761905
D D 4 19.047619
D E 6 28.571429
E A 2 11.111111
E B 4 22.222222
E C 7 38.888889
E D 2 11.111111
E E 3 16.666667
现在,我想扩展它来计算 n 步概率的转移概率。
例如
如何编写一个函数来执行 n 步?
例如对于 5 个步骤,输出应如下所示:
from1 from2 from3 from4 from5 to count percent
A A A A A A 0 0
A A A A A E 0 0
A A A A A B 0 0
A A A A A C 0 0
A A A A A D 0 0
您的
transition_counts
可以通过将条目除以行总和来转换为转换矩阵(为了简单起见,我将其称为 A
):
A <- transition_counts / rowSums(transition_counts)
那么,两步转移概率就是简单的
A %*% A
to
from A B C D E
A 0.2435780 0.12229330 0.2404796 0.2137937 0.1798554
B 0.3478582 0.15229446 0.1709910 0.1581451 0.1707112
C 0.2169913 0.07974223 0.2398662 0.2642168 0.1991834
D 0.2565411 0.13608217 0.2385630 0.1738936 0.1949202
E 0.2256817 0.12838088 0.2548378 0.2386595 0.1524402
请注意,行总和仍然为 1。然后是 3 步
A %*% A %*% A
或者为了简化,我们可以使用
expm
包,它具有方便的 %^%
功能:
library(expm)
A %^% 3
这个函数可以让你计算第n步。
A %^% 10
to
from A B C D E
A 0.2494011 0.1199961 0.2341925 0.2148659 0.1815444
B 0.2494024 0.1199966 0.2341917 0.2148651 0.1815442
C 0.2494006 0.1199957 0.2341927 0.2148664 0.1815446
D 0.2494013 0.1199962 0.2341924 0.2148656 0.1815445
E 0.2494010 0.1199962 0.2341926 0.2148660 0.1815442
上式接近稳态,由解给出:
qr.solve(rbind(t(A) - diag(5), rep(1, 5)), c(rep(0,5), 1))
# A B C D E
# 0.2494012 0.1199961 0.2341924 0.2148659 0.1815444
假设您想要跟踪中间状态,将滞后列组装在数据框中,将状态粘贴在一起(使用
tidyverse::unite
),然后调用 table
应该可以:
library(dplyr) # for %>% operator
library(tidyverse) # for unite function
n = length(simulated_states)
data.frame(matrix(c(simulated_states[seq(1,n-2)],
simulated_states[seq(2,n-1)],
simulated_states[seq(3,n)]),
ncol=3)) %>%
{unite(data=.,col='d',names(.),sep='')} %>% table
作为函数:
require(tidyverse)
calculate_transition_probs = function(states, m){
n = length(states)
lagged_states = c()
for (i in 1:m){
lagged_states = c(lagged_states,
simulated_states[seq(i,n-m+i)])
}
return(data.frame(matrix(lagged_states,
ncol=m)) %>%
{unite(data=.,col='d',names(.),sep='')} %>%
table)
}
尝试下面的功能。我添加了另一个参数
step
来控制你想要的步数。
如果您设置了
step = 1
,输出与您到目前为止所做的相同。
calculate_transition_probs <- function(states, step = 1) {
nc <- step+1
lagged_mat <- matrix(
states[sequence(rep(length(states), nc), 1:nc)],
ncol = nc
)
trans_prob <- lagged_mat %>%
as.data.frame(stringsAsFactors = TRUE) %>%
head(-step) %>%
group_by(pick(everything()), .drop = FALSE) %>%
summarise(count = n(), .groups = "drop_last") %>%
mutate(percent = count / sum(count) * 100) %>%
ungroup()
names(trans_prob)[1:nc] <- c(paste0("from", 1:step), "to")
return(trans_prob)
}
calculate_transition_probs(simulated_states, step = 3)
# # A tibble: 625 × 6
# from1 from2 from3 to count percent
# <fct> <fct> <fct> <fct> <int> <dbl>
# 1 A A A A 0 0
# 2 A A A B 1 50
# 3 A A A C 1 50
# 4 A A A D 0 0
# 5 A A A E 0 0
# 6 A A B A 1 100
# 7 A A B B 0 0
# 8 A A B C 0 0
# 9 A A B D 0 0
# 10 A A B E 0 0
# # ℹ 615 more rows