据我了解,对数据进行标准化是为了保证结果的稳健性。但是,我不清楚在分别使用 glmnet 包的 cv.glmnet 函数(LASSO 回归)和 caret 包的 knnreg 函数(KNN 回归)时,应该如何对数据进行标准化,才能使两种回归的输出可以相互比较。
我首先尝试在 cv.glmnet 函数中使用
standardize = TRUE
进行 LASSO 回归。据我了解,该选项会在拟合之前对预测变量进行标准化,并在输出时把回归系数还原(“去标准化”)到原始尺度。
当我使用 knnreg 函数时,我首先使用
scale(...)
对数值型数据进行缩放,接着创建数值型虚拟变量,最后运行 KNN 回归。然而,输出结果仍然处于缩放后的尺度,因此无法比较两种回归的系数(beta)和 MSE。
有没有办法对 KNN 回归的结果进行“反缩放”(还原到原始尺度),或者对 LASSO 的结果进行“重新缩放”?还是说我看待这个问题的方式本身就是错误的?
提前感谢大家的帮助。
## Cross-validated LASSO regression on the Hitters salary data
# NOTE(review): `Hitters` must already be available when this runs (it is
# presumably the ISLR data set -- no library() call for it is visible here;
# TODO confirm).
set.seed(34064064)
library(glmnet)

# Drop incomplete rows before building the design matrix.
Hitters <- na.omit(Hitters)

# Predictor matrix with factors expanded to dummies; column 1 of the model
# matrix is removed. The response is the raw (unscaled) Salary.
x <- model.matrix(Salary ~ ., Hitters)[, -1]
y <- Hitters$Salary

# 10-fold CV over a fixed grid of 30 lambda values, exp(-2) ... exp(4),
# scored by mean squared error. alpha = 1 selects the LASSO penalty.
cv <- cv.glmnet(
  x, y,
  lambda = exp(seq(-2, 4, length.out = 30)),
  nfolds = 10,
  alpha = 1,
  standardize = TRUE,
  type.measure = "mse"
)
best.lambda <- cv$lambda.min

# Refit at the CV-selected lambda only.
# NOTE(review): the glmnet documentation advises against supplying a single
# lambda value; consider predict(cv, newx = x, s = "lambda.min") instead.
fit <- glmnet(x, y, lambda = best.lambda, alpha = 1, standardize = TRUE)

# In-sample error: predictions are made on the same rows used for fitting.
y.pred <- predict(fit, newx = x)
training.mse <- mean((y - y.pred)^2)
print(training.mse)
##Code for KNN regression
library(caret)
#' Randomly split a data set into training and test subsets.
#'
#' @param d A data frame or matrix whose rows are observations.
#' @param p Proportion of rows assigned to the test set;
#'   `floor(p * nrow(d))` rows are drawn without replacement.
#' @return A list with elements `train` and `test`, each holding the
#'   corresponding rows of `d` in their original order.
fn.split <- function(d, p = 0.2) {
  n <- nrow(d)
  aux <- seq_len(n)
  # sample.int(n, k) consumes the RNG exactly like sample(1:n, k).
  id.test <- sort(sample.int(n, size = floor(p * n), replace = FALSE))
  # Pick the training rows by set difference instead of negative indexing:
  # d[-integer(0), ] selects zero rows, which would empty the training set
  # whenever the test set turns out to be empty.
  id.train <- setdiff(aux, id.test)
  # drop = FALSE keeps single-column inputs as data frames / matrices.
  list(
    train = d[id.train, , drop = FALSE],
    test = d[id.test, , drop = FALSE]
  )
}
set.seed(34064064)
hitters.na <- na.omit(Hitters)
split.data <- fn.split(d = hitters.na, p = 0.3)
train <- split.data$train
test <- split.data$test

# Standardize every numeric column with the TRAINING mean and sd, and apply
# those same parameters to the test set. Standardizing the test set with its
# own statistics would put the two sets on different scales, so the KNN
# distances computed at prediction time would not match those seen in
# training.
# NOTE: the response Salary is numeric and is therefore scaled here too, so
# the MSE printed below is in standardized units rather than salary units.
# This is kept as-is because the corrected preprocessing further down the
# file excludes the response.
num.cols <- names(train)[vapply(train, is.numeric, logical(1))]
train.mean <- vapply(train[num.cols], mean, numeric(1))
train.sd <- vapply(train[num.cols], sd, numeric(1))
train[num.cols] <- Map(
  function(v, m, s) (v - m) / s,
  train[num.cols], train.mean, train.sd
)
test[num.cols] <- Map(
  function(v, m, s) (v - m) / s,
  test[num.cols], train.mean, train.sd
)

# Recode the two-level factors as 0/1 dummy variables.
train$League <- ifelse(train$League == "N", 1, 0)
train$NewLeague <- ifelse(train$NewLeague == "N", 1, 0)
train$Division <- ifelse(train$Division == "W", 1, 0)
test$League <- ifelse(test$League == "N", 1, 0)
test$NewLeague <- ifelse(test$NewLeague == "N", 1, 0)
test$Division <- ifelse(test$Division == "W", 1, 0)

# Fit a 20-nearest-neighbour regression and score it on the held-out rows.
knn.reg <- knnreg(Salary ~ ., data = train, k = 20)
pred <- predict(knn.reg, newdata = test)
mse <- mean((test$Salary - pred)^2) # MSE on test data
mse
我想我已经找到了自己所犯的错误。在原来的帖子中,我把响应变量“Salary”(薪水)和所有数值型预测变量一起做了标准化,这是不正确的。下面是我现在认为正确的做法。
# Standardize the numeric predictors only; the response Salary stays in its
# original units so the resulting MSE is on the salary scale.
#
# Start again from the raw split: `train` / `test` are standardized and
# dummy-coded in place by the earlier preprocessing, so reading Salary back
# from them would return the scaled response.
train.raw <- split.data$train
test.raw <- split.data$test

# Drop the response by name rather than by position (column 19).
train.p <- train.raw[, names(train.raw) != "Salary", drop = FALSE]
test.p <- test.raw[, names(test.raw) != "Salary", drop = FALSE]

# Centre and scale with the TRAINING mean/sd, and reuse those parameters for
# the test set so both sets share one scale for the KNN distance computation.
num.cols <- names(train.p)[vapply(train.p, is.numeric, logical(1))]
train.mean <- vapply(train.p[num.cols], mean, numeric(1))
train.sd <- vapply(train.p[num.cols], sd, numeric(1))
train.p[num.cols] <- Map(
  function(v, m, s) (v - m) / s,
  train.p[num.cols], train.mean, train.sd
)
test.p[num.cols] <- Map(
  function(v, m, s) (v - m) / s,
  test.p[num.cols], train.mean, train.sd
)

# Create dummy variables after standardisation so the 0/1 codes are not scaled.
test.p$League <- ifelse(test.p$League == "N", 1, 0)
test.p$NewLeague <- ifelse(test.p$NewLeague == "N", 1, 0)
test.p$Division <- ifelse(test.p$Division == "W", 1, 0)
train.p$League <- ifelse(train.p$League == "N", 1, 0)
train.p$NewLeague <- ifelse(train.p$NewLeague == "N", 1, 0)
train.p$Division <- ifelse(train.p$Division == "W", 1, 0)

# Add the un-scaled response Salary back as the last column.
test.p$Salary <- test.raw$Salary
train.p$Salary <- train.raw$Salary