我正在调整一些
sklearn
和 tensorflow
模型,我将超参数作为字典获取,例如 {'l1': 0.001, 'lr':0.001}
,并在我的调整功能中将它们保存在更大的字典中:
def optunize_hyperparameters(
    X_tr: Union[List[np.ndarray], pd.DataFrame, np.ndarray],
    y_tr: Union[dict, pd.DataFrame, np.ndarray],
    Objective: Callable,
    builder_func: Callable,
    model_name: str,
    fit_params: dict,
    log_path: str,
    n_trials: int = 10,
    **kwargs,
) -> dict:
    """
    Optimize a model's hyperparameters with Optuna (random sampling).

    Loads previously found hyperparameters from the JSON log file if an
    entry for ``model_name`` exists; otherwise runs an Optuna study and
    saves the result to the log.

    Parameters
    ----------
    X_tr:
        Training features.
    y_tr:
        Training targets.
    Objective:
        Optuna objective callable class; instantiated as
        ``Objective(X_tr, y_tr, builder_func=..., fit_params=..., **kwargs)``.
    builder_func:
        Callable that builds a model from hyperparameters.
    model_name:
        Log key, in the format "<model name>_qx" for x in {5,25,50,75,95}.
    fit_params:
        Parameters to pass to model.fit().
    log_path:
        Path to the JSON hyperparameter log file, e.g. "tuning_log.txt".
    n_trials:
        Number of Optuna trials when optimization is needed.
    **kwargs:
        Extra keyword arguments forwarded to ``Objective``.

    Returns
    -------
    best_hps:
        A dictionary that can be passed as **kwargs to builder_func.

    NOTE(review): this version does no file locking; concurrent jobs
    sharing one log file can corrupt it.
    """
    # Create an empty JSON log if the file does not exist yet.
    if not os.path.exists(log_path):
        with open(log_path, 'w') as f:
            json.dump({}, f)
    # Load the log. Fall back to an empty dict on an unreadable or
    # corrupted file instead of leaving `log` undefined (the original
    # only printed the error and then crashed with NameError), and
    # catch JSONDecodeError, which OSError does not cover.
    try:
        with open(log_path, 'r') as f:
            log = json.load(f)
        print("Successfully loaded existing hyperparameters.")
    except (OSError, json.JSONDecodeError) as e:
        print(e)
        log = {}
    # Reuse existing hyperparameters when present, optimize otherwise.
    try:
        best_hps = log[model_name]
        print("Existing hyperparameters found, loading...")
    except KeyError:
        print("No existing hyperparameters found, optimizing hyperparameters...")
        study = optuna.create_study(
            sampler=optuna.samplers.RandomSampler(),
            pruner=optuna.pruners.SuccessiveHalvingPruner(),
            direction='maximize'
        )
        study.optimize(
            Objective(
                X_tr, y_tr,
                builder_func=builder_func,
                fit_params=fit_params,
                **kwargs
            ),
            n_trials=n_trials,
            n_jobs=-1
        )
        best_hps = study.best_params
    # Add hps to log and save it (idempotent when they already existed).
    log[model_name] = best_hps
    with open(log_path, 'w') as f:
        json.dump(log, f)
    return best_hps
我正在使用 slurm 在 Compute Canada 上并行提交多个作业(大约 32 个)。我似乎随机得到
json.decoder.JSONDecodeError: Expecting value: line 1 column 1 (char 0)
，对于任何特定的作业或任何特定的模型都不一致。我无法在本地复制这个问题，所以我甚至不知道如何调试它。将文件从 .json
更改为 .txt
似乎有一点帮助。过去,我也遇到过文件中其他行的解码错误,因此可能不一定是第一个字符的问题,尽管第一个字符的问题似乎是最常见的。我看过tuning_log.txt
,看起来不错:
{"MT1_2012": {"l1": 0.01, "lr": 0.01}, "MT3_1997": {"l1": 1e-06, "lr": 0.01}, ...}
我已经浏览过有关此问题的其他帖子,但它们都与获取网络数据有关,并且问题似乎与我的不同(这似乎是随机且不一致的)。
发生这种情况的原因是因为我正在并行运行多个作业,这些作业试图打开并写入同一个文件。我通过使用文件锁定解决了这个问题。我这样修改了我的函数(我需要更新我的文档字符串)
def optunize_hyperparameters(
    X_tr: Union[List[np.ndarray], pd.DataFrame, np.ndarray],
    y_tr: Union[dict, pd.DataFrame, np.ndarray],
    Objective: Callable,
    builder_func: Callable,
    model_name: str,
    fit_params: dict,
    log_path: str,
    n_trials: int = 10,
    **kwargs,
) -> dict:
    """
    Optimize a model's hyperparameters with Optuna (random sampling).

    Safe for many concurrent jobs sharing one log file: every read and
    write happens under an exclusive portalocker lock, and the log is
    re-read immediately before the final merge-and-write so entries
    written by other jobs during optimization are not clobbered.

    Parameters
    ----------
    X_tr:
        Training features.
    y_tr:
        Training targets.
    Objective:
        Optuna objective callable class; instantiated as
        ``Objective(X_tr, y_tr, builder_func=..., fit_params=..., **kwargs)``.
    builder_func:
        Callable that builds a model from hyperparameters.
    model_name:
        Log key, in the format "<model name>_qx" for x in {5,25,50,75,95}.
    fit_params:
        Parameters to pass to model.fit().
    log_path:
        Path to the JSON hyperparameter log file, e.g. "tuning_log.txt".
    n_trials:
        Number of Optuna trials when optimization is needed.
    **kwargs:
        Extra keyword arguments forwarded to ``Objective``.

    Returns
    -------
    best_hps:
        A dictionary that can be passed as **kwargs to builder_func.
    """
    # Open with 'a+': atomically creates the file if missing and never
    # truncates. The original check-then-open('w') both raced with other
    # jobs (TOCTOU) and truncated the file BEFORE the lock was taken,
    # which is exactly what produced the random JSONDecodeErrors.
    with open(log_path, 'a+') as f:
        portalocker.lock(f, portalocker.LOCK_EX)
        f.seek(0)
        content = f.read()
        # A brand-new (or just-created) file is empty, not valid JSON.
        log = json.loads(content) if content.strip() else {}
        portalocker.unlock(f)
    # Reuse existing hyperparameters when present, optimize otherwise.
    try:
        best_hps = log[model_name]
        print("Existing hyperparameters found, loading...")
    except KeyError:
        print("No existing hyperparameters found, optimizing hyperparameters...")
        study = optuna.create_study(
            sampler=optuna.samplers.RandomSampler(),
            pruner=optuna.pruners.SuccessiveHalvingPruner(),
            direction='maximize'
        )
        study.optimize(
            Objective(
                X_tr, y_tr,
                builder_func=builder_func,
                fit_params=fit_params,
                **kwargs
            ),
            n_trials=n_trials,
            n_jobs=-1
        )
        best_hps = study.best_params
        # Merge the new result under the lock. Re-read the log first:
        # optimization is long, and other jobs may have written entries
        # since our initial read — merging into the stale `log` would
        # silently drop their results.
        with open(log_path, 'r+') as f:
            portalocker.lock(f, portalocker.LOCK_EX)
            content = f.read()
            log = json.loads(content) if content.strip() else {}
            log[model_name] = best_hps
            f.seek(0)
            json.dump(log, f)
            f.truncate()
            # Flush to disk before releasing the lock so the next locker
            # sees a complete file (important on shared/network filesystems).
            f.flush()
            os.fsync(f.fileno())
            portalocker.unlock(f)
    return best_hps