如何使用两个数据框通过for循环运行线性回归

问题描述 投票:0回答:1

我是第一次在这里发帖,不过已经是 Stack Overflow 的长期用户了。另外,我是 R 的初学者,所以请多多包涵。我正在尝试用一个含 4 列的数据框(这些列全都是因变量)和一个含 194 行、212 列的数据框(作为自变量)进行简单的线性回归分析。我还有另外 5 个数据框,要在同样的分析中用作因变量。

用我当前的代码,我已经得到了预期的结果,但我需要对它进行扩展:我试图再添加一层 for 循环(用于遍历因变量的各列),但这样一来我还需要同时创建更多的空列表来保存结果。

我想知道我将如何实现?

我当前的for循环是:

# Response columns (first column of the source table dropped) and the
# candidate predictors (rows 1-175, first two columns dropped).
y <- data.frame(Green_Class_Commercial[, -1])
x <- data.frame(lagvar[1:175, c(-1, -2)])

# Regress the first response column on each predictor column in turn and
# collect one row of summary statistics per predictor.
# Building the rows with lapply() and binding them once avoids growing `out`
# cell by cell, and seq_along() gives an empty iteration (instead of running
# over c(1, 0) as 1:length(x) would) when `x` has no columns.
out <- do.call(rbind, lapply(seq_along(x), function(i) {
  m <- summary(lm(y[, 1] ~ x[, i]))      # fit and summarise one model
  data.frame(
    Variable = names(x)[i],              # predictor name
    Intercept = m$coefficients[1, 1],    # intercept estimate
    Coefficient = m$coefficients[2, 1],  # slope estimate
    `P-val` = m$coefficients[2, 4],      # p-value of the slope
    `R-square` = m$r.squared,            # R-squared of the fit
    check.names = FALSE,                 # keep "P-val" / "R-square" verbatim
    stringsAsFactors = FALSE             # Variable stays character on R < 4.0
  )
}))
head(out)

输出如下:

> head(out)
               Variable Intercept   Coefficient     P-val     R-square
1                GDP.SC 0.2540527 -4.722220e-07 0.7032087 8.411229e-04
2               GDP.SC1 0.1148311  3.107631e-07 0.7959237 3.899366e-04
3               GDP.SC2 0.1609010  4.998762e-08 0.9673014 9.855831e-06
4               GDP.SC3 0.1353608  1.959274e-07 0.8746321 1.468544e-04
5               GDP.SC4 0.1439931  1.487237e-07 0.9064221 8.200597e-05
6 CivilianLaborForce.SC 0.2595231 -4.078450e-08 0.7716514 4.881398e-04
> 

所以这是我要运行回归的变量

# The x variable: GDP.SC plus four copies of it shifted down by 1 to 4 rows,
# with the vacated leading rows filled with NA (20 rows in total).
structure(
  list(
    GDP.SC  = c(rep(154698, 10), rep(160138.4, 10)),
    GDP.SC1 = c(NA, rep(154698, 10), rep(160138.4, 9)),
    GDP.SC2 = c(rep(NA, 2), rep(154698, 10), rep(160138.4, 8)),
    GDP.SC3 = c(rep(NA, 3), rep(154698, 10), rep(160138.4, 7)),
    GDP.SC4 = c(rep(NA, 4), rep(154698, 10), rep(160138.4, 6))
  ),
  row.names = c(NA, 20L),
  class = "data.frame"
)

# The y variable: a 20-row data frame with an integer column X (1:20) and four
# numeric columns ComBus, ComCon, ComNoo and ComOO; ComCon contains NA values.
# NOTE(review): presumably a sample of Green_Class_Commercial, whose first
# column the regression script drops with [, -1] -- confirm against the caller.
structure(list(X = 1:20, ComBus = c(0.83, 0, 0.23, 0.09, 0.1, 
0.11, 0.15, 0.18, 0.37, 0.19, 0, 0.18, 0.09, 0.1, 0.03, 0.5, 
0.14, 0.17, 0.11, 0.06), ComCon = c(NA, 0, 0, 0, 0, 0.5, 0, 0, 
NA, 0.67, 0, 0, 0, 0, 0.5, 0, 0, NA, 1, 0), ComNoo = c(0.25, 
0.14, 0.38, 0.17, 0.14, 0.33, 0.44, 0.05, 0.04, 0.1, 0.18, 0.06, 
0.23, 0.14, 0.5, 0.14, 0.5, 0, 0.14, 0.23), ComOO = c(0, 0, 0, 
0, 0, 0.33, 0, 0, 0, 0.18, 0.22, 0.15, 0, 0, 0.17, 0, 0, 0, 0, 
0)), row.names = c(NA, 20L), class = "data.frame")

r loops dataframe for-loop linear-regression
1个回答
1
投票

好的,这样对您可行吗?如果您不介意的话,我用 apply 系列函数代替了循环。

### Some dummy dataframes
# Build one dummy data frame with `n` rows: v1 and v2 are standard-normal
# draws, v3 and v4 are uniform draws between 1 and 1000.
# rnorm(n) replaces rnorm(1:10): passing a vector as the sample size only
# works because rnorm() then falls back to the vector's length, which hides
# the intended number of draws.
make_dummy_df <- function(n = 10) {
  data.frame(
    v1 = rnorm(n),
    v2 = rnorm(n),
    v3 = runif(n, 1, 1000),
    v4 = runif(n, 1, 1000)
  )
}

# Four independent 10-row data frames. They are created in the same order as
# the random draws were made before (x, x2, y, y2), so a preceding set.seed()
# still produces the same values.
x <- make_dummy_df()
x2 <- make_dummy_df()
y <- make_dummy_df()
y2 <- make_dummy_df()

###
# I tend to prefer the apply family of functions to replace loops where possible.
# lm_func() regresses the first column of y_df on each column of x_df in turn.
#
# Arguments:
#   y_df  data frame of dependent variables; only its first column is used
#   x_df  data frame of independent variables; one model is fitted per column
# Returns:
#   a list with one fitted lm object per column of x_df, named after the columns
#
# lapply() walks the columns of x_df directly. apply(x_df, MARGIN = 2, ...)
# would first coerce the whole data frame to a matrix, so a single non-numeric
# column would turn every predictor into character.

lm_func <- function(y_df, x_df) {
  lapply(x_df, function(x) {
    lm(y_df[, 1] ~ x)
  })
}

results_list <- lm_func(y, x)

# the output is one list element per lm. I like to keep the whole lm output just in case you need to go back to it

# we can then turn that list back into a dataframe using rbindlist from data.table
# and get what I think is your desired output using glance from broom

library(data.table)
library(broom)

# one glance() summary per model, kept as a list if you wish
results_list_glance <- lapply(results_list, glance)

# the same summaries stacked into one table; "var_name" holds the list names
results_glance <- rbindlist(results_list_glance, idcol = "var_name")

# to run the function over several y data frames while keeping the x data frame fixed you can use mapply

results_list_m <- mapply(lm_func,
                       y_df = list(y, y2),
                       MoreArgs = list(    # other arguments you want to keep fixed
                         x_df = x
                       ),
                       SIMPLIFY = FALSE    # spelled out: `F` is a variable that can be reassigned
)

# the output is a little fiendish because it will be a list of lists
# we can include the rbindlist and glance into the function to make the output a little simpler:


# lm_func_bind() regresses the first column of y_df on each column of x_df and
# returns the glance() summaries of those models stacked into a single table.
#
# Arguments:
#   y_df  data frame of dependent variables; only its first column is used
#   x_df  data frame of independent variables; one model is fitted per column
# Returns:
#   the result of rbindlist(): one row per column of x_df, with the column
#   name stored in "var_name"
#
# lapply() walks the columns of x_df directly. apply(x_df, MARGIN = 2, ...)
# would first coerce the whole data frame to a matrix, so a single non-numeric
# column would turn every predictor into character.
lm_func_bind <- function(y_df, x_df) {
  models <- lapply(x_df, function(x) {
    lm(y_df[, 1] ~ x)
  })
  rbindlist(lapply(models, glance), idcol = "var_name")
}
# summary table for a single pair of data frames
results_glance_df <- lm_func_bind(y, x)

# one summary table per y data frame, each fitted against the same x data frame
results_list_dfs <- mapply(lm_func_bind,
                           y_df = list(y, y2),
                           MoreArgs = list(    # other arguments you want to keep fixed
                             x_df = x
                           ),
                           SIMPLIFY = FALSE    # spelled out: `F` is a variable that can be reassigned
)

如果还有可以改进的地方,请告诉我。如果您不熟悉 apply 系列函数和 rbindlist 之类的函数,建议查阅它们的文档。祝好!

P.S. 一般来说,反复拟合大量线性模型并不理想,因为总会有一些模型仅凭偶然就显得显著(即多重比较问题)。不过,这更多是统计问题,而不是编码问题!

© www.soinside.com 2019 - 2024. All rights reserved.