I am trying to follow this tutorial with my own zero-inflated dataset: https://aws.amazon.com/getting-started/hands-on/machine-learning-tutorial-train-a-model/
I am getting this error:
UnexpectedStatusException: Error for HyperParameterTuning job xgbtune-230402-0248: Failed. Reason: No training job succeeded after 5 attempts. For more details, check the training job failures by listing the training jobs for the hyperparameter tuning job.
How do I list the training jobs for the hyperparameter tuning job?
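From the boto3 documentation, I assume the SageMaker client's list_training_jobs_for_hyper_parameter_tuning_job call is what the error message refers to; this is a sketch of what I would try, using the tuning job name from the error:

import boto3

sm_client = boto3.client("sagemaker")
# List the individual training jobs spawned by the tuning job named in the error
response = sm_client.list_training_jobs_for_hyper_parameter_tuning_job(
    HyperParameterTuningJobName="xgbtune-230402-0248"
)
for summary in response["TrainingJobSummaries"]:
    print(summary["TrainingJobName"], summary["TrainingJobStatus"])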
The code I am running is:
tuning_job_name_prefix = "xgbtune"
training_job_name_prefix = "xgbtrain"
xgb_model_name = "xgb-stn-130-model"
endpoint_name_prefix = "xgb-stn-130-dev"
train_instance_count = 1
train_instance_type = "ml.m4.xlarge"
predictor_instance_count = 1
predictor_instance_type = "ml.m4.xlarge"
clarify_instance_count = 1
clarify_instance_type = "ml.m4.xlarge"
static_hyperparams = {
    "eval_metric": "rmse",
    "objective": "reg:tweedie",
    "num_round": "50"
}
xgb_estimator = XGBoost(
    entry_point="xgboost_train.py",
    output_path=estimator_output_uri,
    code_location=estimator_output_uri,
    hyperparameters=static_hyperparams,
    role=sagemaker_role,
    instance_count=train_instance_count,
    instance_type=train_instance_type,
    framework_version="1.3-1",
    base_job_name=training_job_name_prefix
)
hyperparameter_ranges = {
    "eta": ContinuousParameter(0, 1),
    "subsample": ContinuousParameter(0.7, 0.95),
    "colsample_bytree": ContinuousParameter(0.7, 0.95),
    "max_depth": IntegerParameter(1, 5)
}
objective_metric_name = "validation:rmse"
# Setting up the tuner object
tuner_config_dict = {
    "estimator": xgb_estimator,
    "max_jobs": 10,
    "max_parallel_jobs": 2,
    "objective_metric_name": objective_metric_name,
    "objective_type": "Minimize",
    "hyperparameter_ranges": hyperparameter_ranges,
    "base_tuning_job_name": tuning_job_name_prefix,
    "strategy": "Random"  ## can be changed to Bayesian optimization for better results, though it takes longer
}
tuner = HyperparameterTuner(**tuner_config_dict)
s3_input_train = TrainingInput(
    s3_data="s3://{}/{}".format(read_bucket, train_data_key),
    content_type="csv", s3_data_type="S3Prefix"
)
s3_input_validation = TrainingInput(
    s3_data="s3://{}/{}".format(read_bucket, validation_data_key),
    content_type="csv", s3_data_type="S3Prefix"
)
tuner.fit(inputs={"train": s3_input_train, "validation": s3_input_validation}, include_cls_metadata=False, wait=False)
tuner.wait()
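To see why each attempt failed, I assume I can then describe each of those training jobs and read its FailureReason (a sketch building on the listing call above):

for summary in response["TrainingJobSummaries"]:
    detail = sm_client.describe_training_job(TrainingJobName=summary["TrainingJobName"])
    # FailureReason should only be present on failed jobs
    print(summary["TrainingJobName"], detail.get("FailureReason"))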
The xgboost_train.py file is shown below.
import tempfile
import boto3
import argparse
import os
import joblib
import json
import pandas as pd
import xgboost as xgb
from sklearn.metrics import roc_auc_score
from sklearn.metrics import mean_squared_error
if __name__ == "__main__":
    parser = argparse.ArgumentParser()

    # Hyperparameters and algorithm parameters are described here
    parser.add_argument("--num_round", type=int, default=100)
    parser.add_argument("--max_depth", type=int, default=3)
    parser.add_argument("--eta", type=float, default=0.2)
    parser.add_argument("--subsample", type=float, default=1)
    parser.add_argument("--colsample_bytree", type=float, default=0.8)
    parser.add_argument("--objective", default="reg:tweedie")
    parser.add_argument("--eval_metric", default="rmse")
    parser.add_argument("--nfold", type=int, default=1)
    parser.add_argument("--early_stopping_rounds", type=int, default=3)

    bucket_name = "xgboost-stations-data-store"

    # SageMaker specific arguments. Defaults are set in the environment variables
    # Location of input training data
    parser.add_argument("--train_data_dir", type=str, default="s3://xgboost-stations-data-store/xgb-all-data/xgb-stn-130-train.csv")
    # Location of input validation data
    parser.add_argument("--validation_data_dir", type=str, default="s3://xgboost-stations-data-store/xgb-all-data/xgb-stn-130-val.csv")
    # Location where the trained model will be stored. Default set by SageMaker, /opt/ml/model
    parser.add_argument("--model_dir", type=str, default="s3://xgboost-stations-data-store/xgb-model-output/")
    # Location where model artifacts will be stored. Default set by SageMaker, /opt/ml/output/data
    parser.add_argument("--output_data_dir", type=str, default="s3://xgboost-stations-data-store/xgb-model-output/metrics.json")

    args = parser.parse_args()
    # Load the training data and split features from the label column
    data_train = pd.read_csv(args.train_data_dir)
    train = data_train.drop("Unnamed: 0", axis=1)
    train = train.drop("dt_ts", axis=1)
    train = train.drop("target_130", axis=1)
    label_train = pd.DataFrame(data_train["target_130"])
    dtrain = xgb.DMatrix(train, label=label_train)

    # Same preprocessing for the validation data
    data_validation = pd.read_csv(args.validation_data_dir)
    validation = data_validation.drop("Unnamed: 0", axis=1)
    validation = validation.drop("dt_ts", axis=1)
    validation = validation.drop("target_130", axis=1)
    label_validation = pd.DataFrame(data_validation["target_130"])
    dvalidation = xgb.DMatrix(validation, label=label_validation)
    params = {
        "max_depth": args.max_depth,
        "eta": args.eta,
        "objective": args.objective,
        "subsample": args.subsample,
        "colsample_bytree": args.colsample_bytree
    }
    num_boost_round = args.num_round
    nfold = args.nfold
    early_stopping_rounds = args.early_stopping_rounds

    ## need to use walk-forward validation and NOT k-fold cross-validation
    cv_results = xgb.cv(
        params=params,
        dtrain=dtrain,
        num_boost_round=num_boost_round,
        nfold=nfold,
        early_stopping_rounds=early_stopping_rounds,
        metrics=["rmse"],
        seed=42,
    )
    # Retrain on the full training set, using the number of boosting rounds selected by CV
    model = xgb.train(params=params, dtrain=dtrain, num_boost_round=len(cv_results))

    train_pred = model.predict(dtrain)
    validation_pred = model.predict(dvalidation)

    train_rmse = mean_squared_error(label_train, train_pred, squared=False)
    validation_rmse = mean_squared_error(label_validation, validation_pred, squared=False)
    print(f"train_rmse:{train_rmse:.2f}")
    print(f"validation_rmse:{validation_rmse:.2f}")

    metrics_data = {
        "hyperparameters": params,
        "regression_metrics": {
            "validation:rmse": {"value": validation_rmse},
            "train:rmse": {"value": train_rmse}
        }
    }
    # Save the evaluation metrics to S3 (the object key must be relative to the
    # bucket, not a full s3:// URI)
    metrics_location = "xgb-model-output/metrics.json"
    s3 = boto3.resource("s3")
    s3object = s3.Object(bucket_name, metrics_location)
    s3object.put(Body=bytes(json.dumps(metrics_data).encode("UTF-8")))

    # Serialize the model to a temp file and upload it to S3
    s3_client = boto3.client("s3")
    key = "xgb-model-output/model.pkl"
    with tempfile.TemporaryFile() as fp:
        joblib.dump(model, fp)
        fp.seek(0)
        s3_client.put_object(Body=fp.read(), Bucket=bucket_name, Key=key)
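One thing I am unsure about: the tutorial's training script takes its data and model paths from the environment variables SageMaker sets inside the training container, while my script hard-codes S3 URIs. I don't know whether this difference is what makes the training jobs fail, but for reference the tutorial-style pattern looks roughly like this (SM_CHANNEL_TRAIN, SM_CHANNEL_VALIDATION, SM_MODEL_DIR, and SM_OUTPUT_DATA_DIR are the standard SageMaker training variables):

import os
import argparse

parser = argparse.ArgumentParser()
# Defaults come from environment variables set by SageMaker inside the container
parser.add_argument("--train_data_dir", type=str, default=os.environ.get("SM_CHANNEL_TRAIN"))
parser.add_argument("--validation_data_dir", type=str, default=os.environ.get("SM_CHANNEL_VALIDATION"))
parser.add_argument("--model_dir", type=str, default=os.environ.get("SM_MODEL_DIR"))
parser.add_argument("--output_data_dir", type=str, default=os.environ.get("SM_OUTPUT_DATA_DIR"))
args = parser.parse_args()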