# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
from sklearn.model_selection import RandomizedSearchCV
from datetime import datetime
import joblib
# Load the data
data = pd.read_csv("data.csv")
# Data preprocessing
# Convert the "eligible_date" column to datetime
data['eligible_date'] = pd.to_datetime(data['eligible_date'], format='%Y-%m-%d')
# Feature engineering
data['year'] = data['eligible_date'].dt.year
data['month'] = data['eligible_date'].dt.month
data['day'] = data['eligible_date'].dt.day
data['weekday'] = data['eligible_date'].dt.weekday
# Label encoding for the "country" and "package" columns
label_encoder = LabelEncoder()
data['country_encoded'] = label_encoder.fit_transform(data['country'])
data['package_encoded'] = label_encoder.fit_transform(data['package'])
data['model_encoded'] = label_encoder.fit_transform(data['model'])
joblib.dump(label_encoder, 'label_encoder.pkl')
# Drop unnecessary columns
data.drop(['eligible_date', 'country', 'package','model'], axis=1, inplace=True)
# Split the data into features and target
X = data.drop('payout', axis=1)
y = data['payout']
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# Define an XGBoost model with hyperparameters
model = XGBRegressor(
n_estimators=1500,
learning_rate=0.01,
max_depth=3,
subsample=0.8,
colsample_bytree=0.8,
objective='reg:squarederror',
random_state=42
)
# Train the model
model.fit(X_train, y_train)
joblib.dump(model, 'xgboost_model.pkl')
# Make predictions on the test set
y_pred = model.predict(X_test)
# Evaluate the model
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
print("Mean Squared Error:", mse)
print("R-squared:", r2)
# You can also use cross-validation to assess model performance
kf = KFold(n_splits=5, shuffle=True, random_state=42)
cv_scores = -cross_val_score(model, X, y, cv=kf, scoring='neg_mean_squared_error')
print("Cross-Validation Mean Squared Error:", np.mean(cv_scores))
# Hyperparameter tuning using RandomizedSearchCV
param_dist = {
'n_estimators': [100, 500, 1000, 1500],
'max_depth': [3, 5, 7, 9],
'learning_rate': [0.01, 0.05, 0.1],
'subsample': [0.7, 0.8, 0.9],
'colsample_bytree': [0.7, 0.8, 0.9]
}
random_search = RandomizedSearchCV(model, param_distributions=param_dist, n_iter=10, scoring='neg_mean_squared_error', cv=kf, random_state=42)
random_search.fit(X, y)
print("Best Hyperparameters:", random_search.best_params_)
#### testing phase
loaded_label_encoder = joblib.load('label_encoder.pkl')
loaded_model = joblib.load('xgboost_model.pkl')
# Create a function to predict total payout for a given month
def predict_total_payout_for_month(target_time, model, encoder):
# Create a sample DataFrame with one row for the specified month
sampledata = pd.DataFrame({
'eligible_date': pd.to_datetime([target_time], format='%Y-%m-%d'),
#'eligible_date': [datetime.strptime(target_month, '%Y-%m')],
'country': ['Malaysia'], # Replace with a valid country name
'package': ['Express 100K'],
'model': ['Express'],# Replace with a valid package name
'payout': [0] # This value will be overwritten
})
#data['eligible_date'] = pd.to_datetime(data['eligible_date'], format='%Y-%m-%d')
# Data preprocessing
# sampledata['month'] = sampledata['eligible_date'].dt.to_period('M')
sampledata['year'] = sampledata['eligible_date'].dt.year
sampledata['month'] = sampledata['eligible_date'].dt.month
sampledata['day'] = sampledata['eligible_date'].dt.day
sampledata['weekday'] = sampledata['eligible_date'].dt.weekday
# Assuming "country" and "package" columns need encoding
sampledata['country_encoded'] = encoder.fit_transform(sampledata['country'])
sampledata['package_encoded'] = encoder.fit_transform(sampledata['package'])
sampledata['model_encoded'] = encoder.fit_transform(sampledata['model'])
print(sampledata)
# Prepare the input features (X) for prediction
X_new = sampledata[['year', 'month', 'day', 'weekday', 'country_encoded', 'package_encoded', 'model_encoded']] # Make sure the columns match those used during training
print(X_new)
# Use the trained model to make predictions
predicted_payout = model.predict(X_new)
return predicted_payout # Return the predicted payout for the specified month
# Specify the target month for prediction
target = '2023-11-15' # Replace with the month you want to predict
# Use the function to make predictions
predicted_total_payout = predict_total_payout_for_month(target, loaded_model, loaded_label_encoder)
print(f"Predicted Total Payout for {target}: {predicted_total_payout}")
运行测试部分,编码值不匹配。例如,
malaysia
应编码为79
,但无论我在测试中输入哪个国家/地区,它都会编码为0
。我正在加载 labeldata pickel 来转换测试数据。
如何调整才能将训练阶段的标签应用到测试阶段?
问题在于您正在将新的 LabelEncoder 拟合到测试数据。要将与训练阶段相同的编码应用于测试数据,您应该使用训练期间使用的 LabelEncoder 的
transform
方法。
def predict_total_payout_for_month(target_time, model, encoder):
# some code ...
sampledata['country_encoded'] = encoder.transform(sampledata['country'])
sampledata['package_encoded'] = encoder.transform(sampledata['package'])
sampledata['model_encoded'] = encoder.transform(sampledata['model'])
# some code