测试模型给出错误:“y 包含以前未见过的标签”

问题描述 投票:0回答:1
# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
from sklearn.model_selection import RandomizedSearchCV
from datetime import datetime
import joblib

# Load the data
data = pd.read_csv("data.csv")

# Data preprocessing
# Convert the "eligible_date" column to datetime
data['eligible_date'] = pd.to_datetime(data['eligible_date'], format='%Y-%m-%d')

# Feature engineering
data['year'] = data['eligible_date'].dt.year
data['month'] = data['eligible_date'].dt.month
data['day'] = data['eligible_date'].dt.day
data['weekday'] = data['eligible_date'].dt.weekday

# Label encoding for the "country" and "package" columns
label_encoder = LabelEncoder()
data['country_encoded'] = label_encoder.fit_transform(data['country'])
data['package_encoded'] = label_encoder.fit_transform(data['package'])
data['model_encoded'] = label_encoder.fit_transform(data['model'])

joblib.dump(label_encoder, 'label_encoder.pkl')

# Drop unnecessary columns
data.drop(['eligible_date', 'country', 'package','model'], axis=1, inplace=True)

# Split the data into features and target
X = data.drop('payout', axis=1)
y = data['payout']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Define an XGBoost model with hyperparameters
model = XGBRegressor(
    n_estimators=1500,
    learning_rate=0.01,
    max_depth=3,
    subsample=0.8,
    colsample_bytree=0.8,
    objective='reg:squarederror',
    random_state=42
)

# Train the model
model.fit(X_train, y_train)

joblib.dump(model, 'xgboost_model.pkl')

# Make predictions on the test set
y_pred = model.predict(X_test)

# Evaluate the model
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print("Mean Squared Error:", mse)
print("R-squared:", r2)

# You can also use cross-validation to assess model performance
kf = KFold(n_splits=5, shuffle=True, random_state=42)
cv_scores = -cross_val_score(model, X, y, cv=kf, scoring='neg_mean_squared_error')

print("Cross-Validation Mean Squared Error:", np.mean(cv_scores))

# Hyperparameter tuning using RandomizedSearchCV
param_dist = {
    'n_estimators': [100, 500, 1000, 1500],
    'max_depth': [3, 5, 7, 9],
    'learning_rate': [0.01, 0.05, 0.1],
    'subsample': [0.7, 0.8, 0.9],
    'colsample_bytree': [0.7, 0.8, 0.9]
}

random_search = RandomizedSearchCV(model, param_distributions=param_dist, n_iter=10, scoring='neg_mean_squared_error', cv=kf, random_state=42)
random_search.fit(X, y)

print("Best Hyperparameters:", random_search.best_params_)

#### testing phase

loaded_label_encoder = joblib.load('label_encoder.pkl')
loaded_model = joblib.load('xgboost_model.pkl')

# Create a function to predict total payout for a given month
def predict_total_payout_for_month(target_time, model, encoder):
    # Create a sample DataFrame with one row for the specified month
    sampledata = pd.DataFrame({
        'eligible_date': pd.to_datetime([target_time], format='%Y-%m-%d'),
        #'eligible_date': [datetime.strptime(target_month, '%Y-%m')],
        'country': ['Malaysia'],  # Replace with a valid country name
        'package': ['Express 100K'],
        'model': ['Express'],# Replace with a valid package name
        'payout': [0]  # This value will be overwritten
    })
#data['eligible_date'] = pd.to_datetime(data['eligible_date'], format='%Y-%m-%d')
    # Data preprocessing
    # sampledata['month'] = sampledata['eligible_date'].dt.to_period('M')
    sampledata['year'] = sampledata['eligible_date'].dt.year
    sampledata['month'] = sampledata['eligible_date'].dt.month
    sampledata['day'] = sampledata['eligible_date'].dt.day
    sampledata['weekday'] = sampledata['eligible_date'].dt.weekday
    # Assuming "country" and "package" columns need encoding
    sampledata['country_encoded'] = encoder.fit_transform(sampledata['country'])
    sampledata['package_encoded'] = encoder.fit_transform(sampledata['package'])
    sampledata['model_encoded'] = encoder.fit_transform(sampledata['model'])
    print(sampledata)

    # Prepare the input features (X) for prediction
    X_new = sampledata[['year', 'month', 'day', 'weekday', 'country_encoded', 'package_encoded', 'model_encoded']]  # Make sure the columns match those used during training
    print(X_new)

    # Use the trained model to make predictions
    predicted_payout = model.predict(X_new)

    return predicted_payout  # Return the predicted payout for the specified month

# Specify the target month for prediction
target = '2023-11-15'  # Replace with the month you want to predict

# Use the function to make predictions
predicted_total_payout = predict_total_payout_for_month(target, loaded_model, loaded_label_encoder)

print(f"Predicted Total Payout for {target}: {predicted_total_payout}")

运行测试部分,编码值不匹配。例如,

malaysia
应编码为
79
,但无论我在测试中输入哪个国家/地区,它都会编码为
0
。我正在加载 labeldata pickel 来转换测试数据。

如何调整才能将训练阶段的标签应用到测试阶段?

python machine-learning scikit-learn xgboost label-encoding
1个回答
0
投票

问题在于您正在将新的 LabelEncoder 拟合到测试数据。要将与训练阶段相同的编码应用于测试数据,您应该使用训练期间使用的 LabelEncoder 的

transform
方法。

def predict_total_payout_for_month(target_time, model, encoder):
    # some code ...
    sampledata['country_encoded'] = encoder.transform(sampledata['country'])
    sampledata['package_encoded'] = encoder.transform(sampledata['package'])
    sampledata['model_encoded'] = encoder.transform(sampledata['model'])
    # some code
© www.soinside.com 2019 - 2024. All rights reserved.