I can use logisticPCA from the logisticPCA package (https://cran.r-project.org/web/packages/logisticPCA/logisticPCA.pdf) to get the PC scores and loadings. But I can't find a way to extract the eigenvalues, or the variance explained, captured by each principal component.
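What I have so far (a minimal sketch, assuming a binary 0/1 data matrix dat):

library(logisticPCA)
fit <- logisticPCA(dat, k = 2, m = 4)
scores   <- fit$PCs   # principal component scores (n x k)
loadings <- fit$U     # loadings (d x k)

But I see nothing in the fitted object that gives per-component eigenvalues or percent variance explained.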
I ran into the same problem. The solution I found was to copy the function that computes logisticPCA out of the package source, extract the eigenvalues, and compute each component's proportion of the sum of the eigenvalues.
The function, here called "logisticPCA2", can then be used exactly like the packaged one, and the explained variance can be extracted:
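Note: the copied source relies on helpers from logisticPCA that are not all exported, so load the package first and, if needed, pull the internal ones into scope. A minimal setup sketch (the ::: names below assume they are unchanged in your installed version):

library(logisticPCA)
# reach the package-internal helpers the copied source calls
inv.logit.mat      <- logisticPCA:::inv.logit.mat
log_like_Bernoulli <- logisticPCA:::log_like_Bernoulli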
logisticPCA2 <- function(x, k = 2, m = 4, quiet = TRUE, partial_decomp = FALSE,
max_iters = 1000, conv_criteria = 1e-5, random_start = FALSE,
start_U, start_mu, main_effects = TRUE, validation, M, use_irlba) {
if (!missing(M)) {
m = M
warning("M is deprecated. Use m instead. ",
"Using m = ", m)
}
if (!missing(use_irlba)) {
partial_decomp = use_irlba
warning("use_irlba is deprecated. Use partial_decomp instead. ",
"Using partial_decomp = ", partial_decomp)
}
if (partial_decomp) {
if (!requireNamespace("RSpectra", quietly = TRUE)) {
message("RSpectra must be installed to use partial_decomp")
partial_decomp = FALSE
}
}
q = as.matrix(2 * x - 1)
missing_mat = is.na(q)
q[is.na(q)] <- 0 # forces Z to be equal to theta when data is missing
n = nrow(q)
d = ncol(q)
if (k >= d & partial_decomp) {
message("k >= dimension. Setting partial_decomp = FALSE")
partial_decomp = FALSE
k = d
}
if (m == 0) {
m = 4
solve_M = TRUE
if (!missing(validation)) {
if (ncol(validation) != ncol(x)) {
stop("validation does not have the same variables as x")
}
validation = as.matrix(validation)
q_val = 2 * validation - 1
q_val[is.na(q_val)] <- 0
}
} else {
solve_M = FALSE
}
if (main_effects) {
if (!missing(start_mu)) {
mu = start_mu
} else {
mu = colMeans(m * q)
}
} else {
mu = rep(0, d)
}
# Initialize #
##################
if (!missing(start_U)) {
U = sweep(start_U, 2, sqrt(colSums(start_U^2)), "/")
} else if (random_start) {
U = matrix(rnorm(d * k), d, k)
U = qr.Q(qr(U))
} else {
if (partial_decomp) {
udv = RSpectra::svds(scale(q, center = main_effects, scale = FALSE), k = k)
} else {
udv = svd(scale(q, center = main_effects, scale = FALSE))
}
U = matrix(udv$v[, 1:k], d, k)
}
# etaTeta = crossprod(eta)
qTq = crossprod(q)
loss_trace = numeric(max_iters + 1)
eta = m * q + missing_mat * outer(rep(1, n), mu)
theta = outer(rep(1, n), mu) + scale(eta, center = mu, scale = FALSE) %*% tcrossprod(U)
loglike <- log_like_Bernoulli(q = q, theta = theta)
loss_trace[1] = (-loglike) / sum(q!=0)
ptm <- proc.time()
if (!quiet) {
cat(0, " ", loss_trace[1], "")
cat("0 hours elapsed\n")
}
for (i in 1:max_iters) {
last_U = U
last_m = m
last_mu = mu
if (solve_M) {
if (missing(validation)) {
Phat = inv.logit.mat(theta)
M_slope = sum(((Phat - x) * (q %*% tcrossprod(U)))[q != 0])
M_curve = sum((Phat * (1 - Phat) * (q %*% tcrossprod(U))^2)[q != 0])
} else {
lpca_obj = structure(list(mu = mu, U = U, m = m),
class = "lpca")
Phat = predict(lpca_obj, newdata = validation, type = "response")
M_slope = sum(((Phat - validation) * (q_val %*% tcrossprod(U)))[q_val != 0])
M_curve = sum((Phat * (1 - Phat) * (q_val %*% tcrossprod(U))^2)[q_val != 0])
}
m = max(m - M_slope / M_curve, 0)
eta = m * q + missing_mat * outer(rep(1, n), mu)
theta = outer(rep(1, n), mu) + scale(eta, center = mu, scale = FALSE) %*% tcrossprod(U)
}
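    # majorization step: Z is the working response built from the quadratic
    # bound on the Bernoulli log-likelihood at the current theta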
Z = as.matrix(theta + 4 * q * (1 - inv.logit.mat(q * theta)))
if (main_effects) {
mu = as.numeric(colMeans(Z - eta %*% tcrossprod(U)))
}
eta = m * q + missing_mat * outer(rep(1, n), mu)
mat_temp = crossprod(scale(eta, center = mu, scale = FALSE), Z)
mat_temp = mat_temp + t(mat_temp) - crossprod(eta) + n * outer(mu, mu)
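    # the updated U comes from the leading k eigenvectors of mat_temp;
    # its eigenvalues are what explainedVariance is computed from below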
# RSpectra could give poor estimates of e-vectors
# so I switch to standard eigen if it does
repeat {
if (partial_decomp) {
eig = RSpectra::eigs_sym(mat_temp, k = min(k + 2, d))
}
if (!partial_decomp || any(eig$values[1:k] < 0)) {
eig = eigen(mat_temp, symmetric = TRUE)
if (!quiet & partial_decomp) {
cat("RSpectra::eigs_sym returned negative values.\n")
}
}
#####################################################
U = matrix(eig$vectors[, 1:k], d, k)
theta = outer(rep(1, n), mu) + scale(eta, center = mu, scale = FALSE) %*% tcrossprod(U)
this_loglike <- log_like_Bernoulli(q = q, theta = theta)
if (!partial_decomp | this_loglike >= loglike) {
loglike = this_loglike
break
} else {
partial_decomp = FALSE
warning("RSpectra::eigs_sym was too inaccurate in iteration ", i , ". Switched to base::eigen")
}
}
loss_trace[i + 1] = (-loglike) / sum(q!=0)
if (!quiet) {
time_elapsed = as.numeric(proc.time() - ptm)[3]
tot_time = max_iters / i * time_elapsed
time_remain = tot_time - time_elapsed
cat(i, " ", loss_trace[i + 1], "")
cat(round(time_elapsed / 3600, 1), "hours elapsed. Max", round(time_remain / 3600, 1), "hours remain.\n")
}
if (i > 4) {
      # when solving for m, the monotonicity does not apply
if (solve_M) {
if (abs(loss_trace[i] - loss_trace[i + 1]) < conv_criteria) {
break
}
} else {
if ((loss_trace[i] - loss_trace[i + 1]) < conv_criteria) {
break
}
}
}
}
# test if loss function increases
if ((loss_trace[i + 1] - loss_trace[i]) > (1e-10)) {
U = last_U
mu = last_mu
m = last_m
i = i - 1
if (!solve_M) {
warning("Algorithm stopped because deviance increased.\nThis should not happen!")
}
}
# calculate the null log likelihood for % deviance explained
if (main_effects) {
null_proportions = colMeans(x, na.rm = TRUE)
} else {
null_proportions = rep(0.5, d)
}
null_loglikes <- null_proportions * log(null_proportions) +
(1 - null_proportions) * log(1 - null_proportions)
null_loglike = sum((null_loglikes * colSums(q!=0))[!(null_proportions %in% c(0, 1))])
eta = m * q + missing_mat * outer(rep(1, n), mu)
  # calculate the proportion of variance explained by each of the k components,
  # using the eigenvalues of the final weighted cross-product matrix (mat_temp)
  eig_values = sort(eig$values[1:k], decreasing = TRUE)
  explainedVariance = eig_values / sum(eig_values)
object <- list(mu = mu,
U = U,
PCs = scale(eta, center = mu, scale = FALSE) %*% U,
explainedVariance = explainedVariance,
m = m,
M = m, # need to deprecate after 0.1.1
iters = i,
loss_trace = loss_trace[1:(i + 1)],
prop_deviance_expl = 1 - loglike / null_loglike)
class(object) <- "lpca"
object
}
logpca_model = logisticPCA2(data, k = 9, m = 8)
logpca_model$explainedVariance
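The k proportions are relative to the retained components, so they sum to 1. A quick check and a scree-style plot (using the object fitted above):

sum(logpca_model$explainedVariance)  # should be 1
barplot(logpca_model$explainedVariance,
        names.arg = paste0("PC", 1:9),
        ylab = "proportion of retained eigenvalue sum")

For the deviance explained by all k components together, logpca_model$prop_deviance_expl is still available, as in the original package.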