UnexpectedStatusException: Error for HyperParameterTuning job xgbtune-230402-0248: Failed. Reason: No training job succeeded after 5 attempts


I am trying to follow this tutorial with my own zero-inflated dataset: https://aws.amazon.com/getting-started/hands-on/machine-learning-tutorial-train-a-model/

I get the error:

UnexpectedStatusException: Error for HyperParameterTuning job xgbtune-230402-0248: Failed. Reason: No training job succeeded after 5 attempts. For more details, check the training job failures by listing the training jobs for the hyperparameter tuning job.

How do I list the training jobs for a hyperparameter tuning job?
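For reference, the per-training-job statuses (and each job's failure reason) can be pulled with the boto3 SageMaker client. A minimal sketch, assuming the tuning job name from the error message:

import boto3

sm = boto3.client("sagemaker")

# List every training job launched by the tuning job, newest first
resp = sm.list_training_jobs_for_hyper_parameter_tuning_job(
    HyperParameterTuningJobName="xgbtune-230402-0248",
    SortBy="CreationTime",
    SortOrder="Descending",
)

for summary in resp["TrainingJobSummaries"]:
    name = summary["TrainingJobName"]
    # describe_training_job returns a FailureReason field for failed jobs
    detail = sm.describe_training_job(TrainingJobName=name)
    print(name, summary["TrainingJobStatus"], detail.get("FailureReason", ""))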

The code I run is:

# Imports assumed from the tutorial (not shown in the original snippet)
from sagemaker.xgboost.estimator import XGBoost
from sagemaker.tuner import ContinuousParameter, IntegerParameter, HyperparameterTuner
from sagemaker.inputs import TrainingInput

tuning_job_name_prefix = "xgbtune"
training_job_name_prefix = "xgbtrain"
xgb_model_name = "xgb-stn-130-model"
endpoint_name_prefix = "xgb-stn-130-dev"
train_instance_count = 1
train_instance_type = "ml.m4.xlarge"
predictor_instance_count = 1
predictor_instance_type = "ml.m4.xlarge"
clarify_instance_count = 1
clarify_instance_type = "ml.m4.xlarge"

static_hyperparams = {  
                        "eval_metric" : "rmse",
                        "objective": "reg:tweedie",
                        "num_round": "50"
                      }

xgb_estimator = XGBoost(
                        entry_point="xgboost_train.py",
                        output_path=estimator_output_uri,
                        code_location=estimator_output_uri,
                        hyperparameters=static_hyperparams,
                        role=sagemaker_role,
                        instance_count=train_instance_count,
                        instance_type=train_instance_type,
                        framework_version="1.3-1",
                        base_job_name=training_job_name_prefix
                    )    
hyperparameter_ranges = {
    "eta": ContinuousParameter(0, 1),
    "subsample": ContinuousParameter(0.7, 0.95),
    "colsample_bytree": ContinuousParameter(0.7, 0.95),
    "max_depth": IntegerParameter(1, 5)
}

objective_metric_name = "validation:rmse"

# Setting up tuner object
tuner_config_dict = {
                     "estimator" : xgb_estimator,
                     "max_jobs" : 10,
                     "max_parallel_jobs" : 2,
                     "objective_metric_name" : objective_metric_name,
                     "objective_type" : 'Minimize',
                     "hyperparameter_ranges" : hyperparameter_ranges,
                     "base_tuning_job_name" : tuning_job_name_prefix,
                     "strategy" : "Random"   ##  can be changed to BAYESIAN optimizaiton for better results. It takes longer time though.
                    }
tuner = HyperparameterTuner(**tuner_config_dict)

s3_input_train = TrainingInput(s3_data="s3://{}/{}".format(read_bucket, train_data_key),
                               content_type="csv", s3_data_type="S3Prefix")
s3_input_validation = TrainingInput(s3_data="s3://{}/{}".format(read_bucket, validation_data_key),
                                    content_type="csv", s3_data_type="S3Prefix")

tuner.fit(inputs={"train": s3_input_train, "validation": s3_input_validation}, include_cls_metadata=False, wait=False)
tuner.wait()
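One mismatch worth flagging (my observation, not something the tutorial calls out): in script mode, SageMaker records the tuning objective by regex-matching the training job's logs against metric_definitions, and the script below prints validation_rmse:... while the objective metric is named validation:rmse. A hypothetical definition that would bridge the two, added before the tuner is constructed (adjust the regex to the real log format):

# Hypothetical: tell SageMaker how to parse the objective out of the job logs
tuner_config_dict["metric_definitions"] = [
    {"Name": "validation:rmse", "Regex": "validation_rmse:([0-9\\.]+)"}
]
tuner = HyperparameterTuner(**tuner_config_dict)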

The xgboost_train.py file is as follows.

import tempfile
import boto3
import argparse
import os
import joblib
import json
import pandas as pd
import xgboost as xgb
from sklearn.metrics import roc_auc_score
from sklearn.metrics import mean_squared_error

if __name__ == "__main__":
    parser = argparse.ArgumentParser()

    # Hyperparameters and algorithm parameters are described here
    parser.add_argument("--num_round", type=int, default=100)
    parser.add_argument("--max_depth", type=int, default=3)
    parser.add_argument("--eta", type=float, default=0.2)
    parser.add_argument("--subsample", type=float, default=1)
    parser.add_argument("--colsample_bytree", type=float, default=0.8)
    parser.add_argument("--objective", default="reg:tweedie")
    parser.add_argument("--eval_metric", default="rmse")
    parser.add_argument("--nfold", type=int, default=1)
    parser.add_argument("--early_stopping_rounds", type=int, default=3)
    
    bucket_name = "xgboost-stations-data-store"

    # SageMaker-specific arguments (hard-coded S3 locations here, rather than the
    # SM_CHANNEL_*/SM_MODEL_DIR environment-variable defaults SageMaker provides)
    # Location of input training data
    parser.add_argument("--train_data_dir", type=str, default="s3://xgboost-stations-data-store/xgb-all-data/xgb-stn-130-train.csv")
    # Location of input validation data
    parser.add_argument("--validation_data_dir", type=str, default="s3://xgboost-stations-data-store/xgb-all-data/xgb-stn-130-val.csv")
    # Location where the trained model will be stored
    parser.add_argument("--model_dir", type=str, default="s3://xgboost-stations-data-store/xgb-model-output/")
    # Location where evaluation metrics will be stored
    parser.add_argument("--output_data_dir", type=str, default="s3://xgboost-stations-data-store/xgb-model-output/metrics.json")
    
    args = parser.parse_args()

    data_train = pd.read_csv(args.train_data_dir)
    train = data_train.drop("Unnamed: 0", axis=1)
    train = train.drop("dt_ts", axis=1)
    train = train.drop("target_130", axis=1)
    #print(train.columns)
    label_train = pd.DataFrame(data_train["target_130"])
    dtrain = xgb.DMatrix(train, label=label_train)
    
    
    data_validation = pd.read_csv(args.validation_data_dir)
    #print(data_validation.columns)    
    validation = data_validation.drop("Unnamed: 0", axis=1)
    validation = validation.drop("dt_ts", axis=1)
    validation = validation.drop("target_130", axis=1)
    label_validation = pd.DataFrame(data_validation["target_130"])
    dvalidation = xgb.DMatrix(validation, label=label_validation)

    params = {"max_depth": args.max_depth,
              "eta": args.eta,
              "objective": args.objective,
              "subsample" : args.subsample,
              "colsample_bytree":args.colsample_bytree
             }
    
    num_boost_round = args.num_round
    nfold = args.nfold
    early_stopping_rounds = args.early_stopping_rounds
    
    
    ## need to use walk-forward validation and NOT k-fold cross-validation
    cv_results = xgb.cv(
        params=params,
        dtrain=dtrain,
        num_boost_round=num_boost_round,
        nfold=nfold,
        early_stopping_rounds=early_stopping_rounds,
        metrics=["rmse"],
        seed=42,
    )
    
    model = xgb.train(params=params, dtrain=dtrain, num_boost_round=len(cv_results))
    
    train_pred = model.predict(dtrain)
    validation_pred = model.predict(dvalidation)
    
    train_rmse = mean_squared_error(label_train, train_pred, squared=False)
    validation_rmse = mean_squared_error(label_validation, validation_pred,squared=False)
    
    print(f"train_rmse:{train_rmse:.2f}")
    print(f"validation_rmse:{validation_rmse:.2f}")

    metrics_data = {"hyperparameters" : params,
                    "regression_metrics": {"validation:rmse": {"value": validation_rmse},
                                                      "train:rmse": {"value": train_rmse}
                                                     }
                   }

    
    
    
    # Save the evaluation metrics to the location specified by output_data_dir

    metrics_location = f"s3://xgboost-stations-data-store/xgb-model-output/metrics.json"
    
    # Save the model to the location specified by model_dir
    model_location = args.model_dir
    
    s3 = boto3.resource('s3')
    s3object = s3.Object(bucket_name, metrics_location)

    s3object.put(
        Body=(bytes(json.dumps(metrics_data).encode('UTF-8')))
    )    

    s3_client = boto3.client("s3")
    key = "xgb-model-output/model.pkl"

    # Serialize the trained model and upload it to S3
    with tempfile.TemporaryFile() as fp:
        joblib.dump(model, fp)
        fp.seek(0)
        s3_client.put_object(Body=fp.read(), Bucket=bucket_name, Key=key)
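For comparison, the tutorial's script takes its data and output locations from the environment variables SageMaker sets inside the training container, as the comments above intend, rather than hard-coded S3 URIs (reading s3:// paths with pandas inside the container also requires s3fs, which the container may not ship with). A minimal sketch of that pattern:

import os

# SageMaker mounts each input channel under /opt/ml/input/data/<channel>
# and exposes the directory paths via SM_CHANNEL_* environment variables;
# these are directories, so join them with the csv filename before read_csv
parser.add_argument("--train_data_dir", type=str, default=os.environ.get("SM_CHANNEL_TRAIN"))
parser.add_argument("--validation_data_dir", type=str, default=os.environ.get("SM_CHANNEL_VALIDATION"))
# Anything written under SM_MODEL_DIR / SM_OUTPUT_DATA_DIR is uploaded to S3 by SageMaker
parser.add_argument("--model_dir", type=str, default=os.environ.get("SM_MODEL_DIR"))
parser.add_argument("--output_data_dir", type=str, default=os.environ.get("SM_OUTPUT_DATA_DIR"))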
Tags: amazon-web-services, xgboost