How to get the amount of variance explained by each principal component of a logistic PCA in R?


I can get the PC scores and loadings using logisticPCA from the logisticPCA package (https://cran.r-project.org/web/packages/logisticPCA/logisticPCA.pdf). But I cannot find a way to extract the eigenvalues, or the variation explained, for each PC.
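
For reference, a minimal sketch of the fit (with a made-up binary matrix dat); the returned "lpca" object exposes scores and loadings, but nothing resembling eigenvalues:

library(logisticPCA)

# hypothetical binary data for illustration
dat <- matrix(rbinom(100 * 10, 1, 0.5), 100, 10)

fit <- logisticPCA(dat, k = 2, m = 4)
fit$PCs    # PC scores
fit$U      # loadings
names(fit) # no per-component eigenvalues or explained variance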

1 Answer
I ran into the same problem. The solution I found was to copy the function that computes the logistic PCA out of the package, extract the eigenvalues, and compute each component's share of the eigenvalue total.
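
In isolation, the idea is just a normalization of the eigenvalues; a toy sketch with made-up values:

# hypothetical eigenvalues of the k retained components
eig_values <- c(5.2, 3.1, 1.7)
eig_values / sum(eig_values) # proportion of the top-k total per component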

The modified function, called "logisticPCA2" below, can then be called just like the packaged one, and the explained variance can be extracted from its output:
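
Note that the function body calls two helpers from the package's namespace, so the package must be installed and loaded; log_like_Bernoulli is not exported, so it has to be pulled in with ::: (a minimal setup, assuming the current CRAN version):

library(logisticPCA)

# internal helpers used inside logisticPCA2
inv.logit.mat <- logisticPCA:::inv.logit.mat
log_like_Bernoulli <- logisticPCA:::log_like_Bernoulli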

logisticPCA2 <- function(x, k = 2, m = 4, quiet = TRUE, partial_decomp = FALSE,
                         max_iters = 1000, conv_criteria = 1e-5, random_start = FALSE,
                         start_U, start_mu, main_effects = TRUE, validation, M, use_irlba) {
  if (!missing(M)) {
    m = M
    warning("M is deprecated. Use m instead. ",
            "Using m = ", m)
  }
  if (!missing(use_irlba)) {
    partial_decomp = use_irlba
    warning("use_irlba is deprecated. Use partial_decomp instead. ",
            "Using partial_decomp = ", partial_decomp)
  }
  if (partial_decomp) {
    if (!requireNamespace("RSpectra", quietly = TRUE)) {
      message("RSpectra must be installed to use partial_decomp")
      partial_decomp = FALSE
    }
  }
  
  q = as.matrix(2 * x - 1)
  missing_mat = is.na(q)
  q[is.na(q)] <- 0 # forces Z to be equal to theta when data is missing
  n = nrow(q)
  d = ncol(q)
  
  if (k >= d & partial_decomp) {
    message("k >= dimension. Setting partial_decomp = FALSE")
    partial_decomp = FALSE
    k = d
  }
  
  if (m == 0) {
    m = 4
    solve_M = TRUE
    if (!missing(validation)) {
      if (ncol(validation) != ncol(x)) {
        stop("validation does not have the same variables as x")
      }
      validation = as.matrix(validation)
      q_val = 2 * validation - 1
      q_val[is.na(q_val)] <- 0
    }
  } else {
    solve_M = FALSE
  }
  
  if (main_effects) {
    if (!missing(start_mu)) {
      mu = start_mu
    } else {
      mu = colMeans(m * q)
    }
  } else {
    mu = rep(0, d)
  }
  
  # Initialize #
  ##################
  if (!missing(start_U)) {
    U = sweep(start_U, 2, sqrt(colSums(start_U^2)), "/")
  } else if (random_start) {
    U = matrix(rnorm(d * k), d, k)
    U = qr.Q(qr(U))
  } else {
    if (partial_decomp) {
      udv = RSpectra::svds(scale(q, center = main_effects, scale = FALSE), k = k)
    } else {
      udv = svd(scale(q, center = main_effects, scale = FALSE))
    }
    U = matrix(udv$v[, 1:k], d, k)
  }
  
  # etaTeta = crossprod(eta)
  qTq = crossprod(q)
  
  loss_trace = numeric(max_iters + 1)
  eta = m * q + missing_mat * outer(rep(1, n), mu)
  theta = outer(rep(1, n), mu) + scale(eta, center = mu, scale = FALSE) %*% tcrossprod(U)
  loglike <- log_like_Bernoulli(q = q, theta = theta)
  loss_trace[1] = (-loglike) / sum(q!=0)
  ptm <- proc.time()
  
  if (!quiet) {
    cat(0, "  ", loss_trace[1], "")
    cat("0 hours elapsed\n")
  }
  
  for (i in 1:max_iters) {
    last_U = U
    last_m = m
    last_mu = mu
    
    if (solve_M) {
      if (missing(validation)) {
        Phat = inv.logit.mat(theta)
        M_slope = sum(((Phat - x) * (q %*% tcrossprod(U)))[q != 0])
        M_curve = sum((Phat * (1 - Phat) * (q %*% tcrossprod(U))^2)[q != 0])
      } else {
        lpca_obj = structure(list(mu = mu, U = U, m = m),
                             class = "lpca")
        Phat = predict(lpca_obj, newdata = validation, type = "response")
        M_slope = sum(((Phat - validation) * (q_val %*% tcrossprod(U)))[q_val != 0])
        M_curve = sum((Phat * (1 - Phat) * (q_val %*% tcrossprod(U))^2)[q_val != 0])
      }
      m = max(m - M_slope / M_curve, 0)
      
      eta = m * q + missing_mat * outer(rep(1, n), mu)
      theta = outer(rep(1, n), mu) + scale(eta, center = mu, scale = FALSE) %*% tcrossprod(U)
    }
    
    Z = as.matrix(theta + 4 * q * (1 - inv.logit.mat(q * theta)))
    if (main_effects) {
      mu = as.numeric(colMeans(Z - eta %*% tcrossprod(U)))
    }
    
    eta = m * q + missing_mat * outer(rep(1, n), mu)
    
    mat_temp = crossprod(scale(eta, center = mu, scale = FALSE), Z)
    mat_temp = mat_temp + t(mat_temp) - crossprod(eta) + n * outer(mu, mu)
    
    # RSpectra could give poor estimates of e-vectors
    # so I switch to standard eigen if it does
    repeat {
      if (partial_decomp) {
        eig = RSpectra::eigs_sym(mat_temp, k = min(k + 2, d))
      }
      if (!partial_decomp || any(eig$values[1:k] < 0)) {
        eig = eigen(mat_temp, symmetric = TRUE)
        if (!quiet & partial_decomp) {
          cat("RSpectra::eigs_sym returned negative values.\n")
        }
      }
      #####################################################
      U = matrix(eig$vectors[, 1:k], d, k)
      
      
      theta = outer(rep(1, n), mu) + scale(eta, center = mu, scale = FALSE) %*% tcrossprod(U)
      this_loglike <- log_like_Bernoulli(q = q, theta = theta)
      
      if (!partial_decomp | this_loglike >= loglike) {
        loglike = this_loglike
        break
      } else {
        partial_decomp = FALSE
        warning("RSpectra::eigs_sym was too inaccurate in iteration ", i , ". Switched to base::eigen")
      }
    }
    
    loss_trace[i + 1] = (-loglike) / sum(q!=0)
    
    if (!quiet) {
      time_elapsed = as.numeric(proc.time() - ptm)[3]
      tot_time = max_iters / i * time_elapsed
      time_remain = tot_time - time_elapsed
      cat(i, "  ", loss_trace[i + 1], "")
      cat(round(time_elapsed / 3600, 1), "hours elapsed. Max", round(time_remain / 3600, 1), "hours remain.\n")
    }
    if (i > 4) {
    # when solving for m, the monotonicity does not apply
      if (solve_M) {
        if (abs(loss_trace[i] - loss_trace[i + 1]) < conv_criteria) {
          break
        }
      } else {
        if ((loss_trace[i] - loss_trace[i + 1]) < conv_criteria) {
          break
        }
      }
    }
  }
  
  # test if loss function increases
  if ((loss_trace[i + 1] - loss_trace[i]) > (1e-10)) {
    U = last_U
    mu = last_mu
    m = last_m
    i = i - 1
    
    if (!solve_M) {
      warning("Algorithm stopped because deviance increased.\nThis should not happen!")
    }
  }
  
  # calculate the null log likelihood for % deviance explained
  if (main_effects) {
    null_proportions = colMeans(x, na.rm = TRUE)
  } else {
    null_proportions = rep(0.5, d)
  }
  null_loglikes <- null_proportions * log(null_proportions) +
    (1 - null_proportions) * log(1 - null_proportions)
  null_loglike = sum((null_loglikes * colSums(q!=0))[!(null_proportions %in% c(0, 1))])
  
  eta = m * q + missing_mat * outer(rep(1, n), mu)
  
  # calculate explained variance: each retained component's eigenvalue as a
  # proportion of the sum of the top k eigenvalues
  eig_sorted = sort(eig$values[1:k], decreasing = TRUE)
  explainedVariance = eig_sorted / sum(eig_sorted)
  
  object <- list(mu = mu,
                 U = U,
                 PCs = scale(eta, center = mu, scale = FALSE) %*% U,
                 explainedVariance = explainedVariance,
                 m = m,
                 M = m, # need to deprecate after 0.1.1
                 iters = i,
                 loss_trace = loss_trace[1:(i + 1)],
                 prop_deviance_expl = 1 - loglike / null_loglike)
  class(object) <- "lpca"
  object
}

# fit as usual; `data` is your binary data matrix
logpca_model = logisticPCA2(data, k = 9, m = 8)
logpca_model$explainedVariance
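
Note that these proportions are relative to the sum of the k retained eigenvalues, not to the total variation across all variables, so they sum to 1 by construction. A quick sanity check and a scree-style plot (a sketch, continuing from logpca_model above):

sum(logpca_model$explainedVariance) # equals 1 by construction

# scree-style bar plot of the per-component proportions
barplot(logpca_model$explainedVariance,
        names.arg = paste0("PC", seq_along(logpca_model$explainedVariance)),
        ylab = "Proportion of top-k eigenvalue sum")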