我目前正在创建一个模型,用来预测生物体污染毒性测定的结果。
由于还没有合适的数据集,这段代码尚未实际运行过。我只是想问一下我的代码是否正确,欢迎任何批评。另外,如果我遗漏了什么,或者还应该加入什么内容,请告诉我。
此外,我也在考虑使用更多的模型,例如 RandomForestRegressor 和 Boosting(AdaBoost、GradientBoost)。我应该考虑这些模型吗?另外,当我最终拿到数据时,是否应该把某些模型从测试中剔除?
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.model_selection import train_test_split

# TODO: fill in the path to the csv file with data (concentration and mortality)
df = pd.read_csv('')

# basic chart
sns.scatterplot(data=df, x='Concentration', y='Mortality')

# Feature matrix and target.  X is kept two-dimensional (a one-column
# DataFrame) because scikit-learn estimators expect 2-D feature input.
X = df[['Concentration']]
y = df['Mortality']

# basic split of training vs testing
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.1, random_state=101
)

# Linear model
lr_model = LinearRegression()
lr_model.fit(X_train, y_train)
lr_preds = lr_model.predict(X_test)

# Test-set errors.  RMSE is the square root of the mean *squared* error.
lr_mae = mean_absolute_error(y_test, lr_preds)
lr_rmse = np.sqrt(mean_squared_error(y_test, lr_preds))
print(f'MAE: {lr_mae}')
print(f'RMSE: {lr_rmse}')

# Fitted line over a grid of concentrations.  The grid is wrapped in a
# DataFrame with the training column name so that predict() receives the
# same feature names the model was fitted with.
concentration_range = np.arange(0, 100)  # adjustable based on min/max concentration
concentration_grid = pd.DataFrame({'Concentration': concentration_range})
concentration_preds = lr_model.predict(concentration_grid)

plt.figure(figsize=(12, 6), dpi=200)
sns.scatterplot(data=df, x='Concentration', y='Mortality')
plt.plot(concentration_range, concentration_preds)
# Polynomial model
# Function for testing models
def run_model(model, X_train, y_train, X_test, y_test):
    """Fit ``model``, print its test-set metrics and plot its fitted curve.

    The model is fitted on the training data, evaluated on the test data
    (MAE, RMSE and R^2 are printed), and then used to predict over the
    concentration grid ``np.arange(0, 100)``.  The predictions are drawn
    on top of a scatter plot of the module-level ``df``.

    Relies on module-level names: ``df``, ``np``, ``pd``, ``plt``, ``sns``,
    ``mean_absolute_error`` and ``mean_squared_error``.

    Args:
        model: Estimator exposing ``fit(X, y)`` and ``predict(X)``.
        X_train: Training features; a single feature column is expected,
            since the plotting grid has exactly one column.
        y_train: Training targets.
        X_test: Test features.
        y_test: Test targets.

    Returns:
        None.  Metrics are printed and the plot is drawn on a new figure.
    """
    # Imported locally so the function does not need a change to the
    # file-level import lines.
    from sklearn.metrics import r2_score

    # Fit model
    model.fit(X_train, y_train)

    # Get metrics
    preds = model.predict(X_test)
    rmse = np.sqrt(mean_squared_error(y_test, preds))
    mae = mean_absolute_error(y_test, preds)
    r2 = r2_score(y_test, preds)
    print(f'MAE: {mae}')
    print(f'RMSE: {rmse}')
    print(f'R^2: {r2}')

    # Plot results model signal range
    concentration_range = np.arange(0, 100)  # again this is adjustable
    concentration_grid = concentration_range.reshape(-1, 1)
    if hasattr(X_train, 'columns'):
        # The model was fitted on a DataFrame: reuse its column name(s) so
        # predict() sees the same feature names as fit() did.
        concentration_grid = pd.DataFrame(
            concentration_grid, columns=X_train.columns
        )
    concentration_preds = model.predict(concentration_grid)

    plt.figure(figsize=(12, 8), dpi=200)
    sns.scatterplot(x='Concentration', y='Mortality', data=df, color='black')
    plt.plot(concentration_range, concentration_preds)
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsRegressor
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import PolynomialFeatures
from sklearn.svm import SVR  # Support Vector Regression
from sklearn.tree import DecisionTreeRegressor

# Polynomial regression: polynomial feature expansion followed by a
# linear model.
pipe = make_pipeline(
    PolynomialFeatures(degree=2),  # degree is adjustable
    LinearRegression(),
)
run_model(pipe, X_train, y_train, X_test, y_test)

# K-Nearest Neighbors model
k_values = list(range(1, 11))
for k in k_values:
    # Label each run; otherwise the ten metric printouts are
    # indistinguishable from one another.
    print(f'KNN, k = {k}')
    model = KNeighborsRegressor(n_neighbors=k)
    run_model(model, X_train, y_train, X_test, y_test)

# Decision Tree model
model = DecisionTreeRegressor()
run_model(model, X_train, y_train, X_test, y_test)

# SVR Model
# NOTE(review): SVR is commonly sensitive to feature scale; consider
# whether the concentration values should be standardized first.
svr = SVR()
param_grid = {
    'C': [0.01, 0.1, 1, 5, 10, 100, 1000],
    'gamma': ['auto', 'scale'],
}
grid = GridSearchCV(svr, param_grid)
run_model(grid, X_train, y_train, X_test, y_test)
# Report which hyper-parameter combination the search selected.
print(f'Best SVR parameters: {grid.best_params_}')
您的代码看起来不错,我会考虑再添加 R^2(决定系数),用来衡量模型的整体拟合优度。