I have this code that runs a logistic regression:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import RocCurveDisplay, roc_auc_score, confusion_matrix
from loguru import logger
data_path = "../data/creditcard.csv"
df = pd.read_csv(data_path)
df = df.drop("Time", axis=1)
print(df.head())
print(f"Shape: {df.shape}")

# Randomly sample 50% of the normal data points and pull all of the
# anomalies out of the data frame as separate data frames.
normal = df[df.Class == 0].sample(frac=0.5, random_state=2020).reset_index(drop=True)
anomaly = df[df.Class == 1]
print(f"Normal: {normal.shape}")
print(f"Anomalies: {anomaly.shape}")

# Split the normal and anomaly sets into train/test
normal_train, normal_test = train_test_split(normal, test_size=0.2, random_state=2020)
anomaly_train, anomaly_test = train_test_split(anomaly, test_size=0.2, random_state=2020)

# From there, split train into train/validate
normal_train, normal_validate = train_test_split(normal_train, test_size=0.25, random_state=2020)
anomaly_train, anomaly_validate = train_test_split(anomaly_train, test_size=0.25, random_state=2020)

# Create the full sets
x_train = pd.concat((normal_train, anomaly_train))
x_test = pd.concat((normal_test, anomaly_test))
x_validate = pd.concat((normal_validate, anomaly_validate))

y_train = np.array(x_train["Class"])
y_test = np.array(x_test["Class"])
y_validate = np.array(x_validate["Class"])

x_train = x_train.drop("Class", axis=1)
x_test = x_test.drop("Class", axis=1)
x_validate = x_validate.drop("Class", axis=1)

print("Training sets:\nx_train: {} \ny_train: {}".format(x_train.shape, y_train.shape))
print("Testing sets:\nx_test: {} \ny_test: {}".format(x_test.shape, y_test.shape))
print("Validation sets:\nx_validate: {} \ny_validate: {}".format(x_validate.shape, y_validate.shape))

# Scale the data (fit on all features so train/test/validate share one scaling)
scaler = StandardScaler()
scaler.fit(pd.concat((normal, anomaly)).drop("Class", axis=1))
x_train = scaler.transform(x_train)
x_test = scaler.transform(x_test)
x_validate = scaler.transform(x_validate)
def train(sk_model, x_train, y_train):
    # Fit the model and report accuracy on the training set
    sk_model = sk_model.fit(x_train, y_train)
    train_acc = sk_model.score(x_train, y_train)
    logger.info(f"Train Accuracy: {train_acc:.3%}")

def evaluate(sk_model, x_test, y_test):
    # Accuracy, AUC, ROC curve, and confusion matrix on the test set
    eval_acc = sk_model.score(x_test, y_test)
    preds = sk_model.predict(x_test)
    auc_score = roc_auc_score(y_test, preds)
    print(f"Auc Score: {auc_score:.3%}")
    print(f"Eval Accuracy: {eval_acc:.3%}")
    roc_plot = RocCurveDisplay.from_estimator(sk_model, x_test, y_test, name='Scikit-learn ROC Curve')
    plt.savefig("sklearn_roc_plot.png")
    plt.show()
    plt.clf()
    conf_matrix = confusion_matrix(y_test, preds)
    ax = sns.heatmap(conf_matrix, annot=True, fmt='g')
    ax.invert_xaxis()
    ax.invert_yaxis()
    plt.ylabel('Actual')
    plt.xlabel('Predicted')
    plt.title("Confusion Matrix")
    plt.savefig("sklearn_conf_matrix.png")
sk_model = LogisticRegression(random_state=None, max_iter=400, solver='newton-cg')
# sk_model = LogisticRegression(random_state=None, max_iter=1, solver='newton-cg')
train(sk_model, x_train, y_train)
evaluate(sk_model, x_test, y_test)
I'm using the credit card fraud detection data from here (in case you want to reproduce the results I'm about to discuss).
The thing is, as you can see, we have
sk_model = LogisticRegression(random_state=None, max_iter=400, solver='newton-cg')
and with that I get
2024-11-13 16:56:34.087 | INFO | __main__:train:84 - Train Accuracy: 99.894%
Auc Score: 85.341%
Eval Accuracy: 99.874%
But if I change it to
sk_model = LogisticRegression(random_state=None, max_iter=10, solver='newton-cg')
I get exactly the same results!
And if I push it to the extreme,
sk_model = LogisticRegression(random_state=None, max_iter=1, solver='newton-cg')
I get the expected warning:
optimize.py:318: ConvergenceWarning: newton-cg failed to converge at loss = 0.1314439039348997. Increase the number of iterations.
but I get better results!
2024-11-13 16:58:03.127 | INFO | __main__:train:84 - Train Accuracy: 99.897%
Auc Score: 86.858%
Eval Accuracy: 99.888%
Why does this happen? I'm struggling to grasp what max_iter means in this situation (I've written a pure-Python logistic regression with gradient descent, and there I more or less understand it). Can someone clarify why this happens?
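One way to investigate is to check the fitted model's n_iter_ attribute, which scikit-learn's LogisticRegression exposes to report how many iterations the solver actually ran before stopping. A minimal sketch, assuming the x_train/y_train arrays from the code above:

# Compare how many Newton-CG iterations actually run under different caps.
# If the solver converges before hitting the cap, raising max_iter changes nothing.
for cap in (400, 10, 1):
    model = LogisticRegression(max_iter=cap, solver='newton-cg')
    model.fit(x_train, y_train)
    print(f"max_iter={cap}: stopped after {model.n_iter_[0]} iteration(s)")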
I think this has to do with accuracy.
A model's accuracy is the proportion of correct predictions (true positives and true negatives) out of all predictions made:
Accuracy = (TP + TN) / (TP + TN + FP + FN)
In creditcard.csv, the overwhelming majority of labels (the "Class" column) are 0 (only 492 of the ~280k rows are 1; all the rest are 0), which means the model can easily reach a high accuracy by, for example, predicting 0 for every input.
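To make that concrete, here is a minimal sketch of that all-zeros baseline, reusing the df loaded in the question (the label counts come from the dataset itself: 492 frauds out of 284,807 rows):

# Accuracy of a model that predicts "not fraud" for every transaction.
y_all = df["Class"].to_numpy()
baseline_preds = np.zeros_like(y_all)
baseline_acc = (baseline_preds == y_all).mean()
print(f"All-zeros baseline accuracy: {baseline_acc:.3%}")  # ~99.8%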
If we care more about the positive class, we should look at precision/recall instead of accuracy.
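For example, a short sketch with scikit-learn's precision_score and recall_score, reusing the fitted sk_model and the test split from the question:

from sklearn.metrics import precision_score, recall_score

preds = sk_model.predict(x_test)
# Precision: of the transactions flagged as fraud, how many really are fraud.
# Recall: of the actual frauds, how many were flagged.
print(f"Precision: {precision_score(y_test, preds):.3%}")
print(f"Recall: {recall_score(y_test, preds):.3%}")

Unlike accuracy, both of these drop sharply if the model misses the rare positive class.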