I'm a researcher working on model-based reinforcement learning. I added a normalizing flow model to my code so that the flow produces better simulated samples, and I spent a long time integrating and debugging the two parts. The code runs now, but training is extremely slow: from 10 pm last night to 10 am this morning it completed only 3 epochs, i.e. 3000 steps, which works out to roughly 14 seconds per environment step. That can't be normal. What could be causing it?
Here is my train function:
def train(args, env_sampler, predict_env, agent, env_pool, model_pool, couple_flow, rewarder, model_agent, rollout_schedule):
    total_step = 0
    rollout_length = 1
    rollout_depth = 2
    exploration_before_start(args, env_sampler, env_pool, agent)
    for epoch_step in range(args.num_epoch):
        start_step = total_step
        train_policy_steps = 0
        for i in count():
            cur_step = total_step - start_step
            if cur_step >= args.epoch_length and len(env_pool) > args.min_pool_size:  # epoch_length is 1000
                break

            if args.use_algo == 'discriminator':
                if i == 0:
                    train_predict_model(args, env_pool, predict_env)
                    print("completed model!!!!!!!!!!!!")
                # function2
                train_couple_flow(args, env_pool, predict_env, agent, rollout_depth, rewarder, total_step)
                if cur_step > 0 and cur_step % args.model_train_freq == 0 and args.real_ratio < 1.0:
                    print("start train model")
                    train_predict_model(args, env_pool, predict_env)
                    print("end train model")
                    new_rollout_length = set_rollout_length(epoch_step, rollout_schedule)
                    if rollout_length != new_rollout_length:
                        rollout_length = new_rollout_length
                        model_pool = resize_model_pool(args, rollout_length, model_pool)
                    print("start rollouting")
                    MPC_rollout_model(args, predict_env, agent, model_pool, env_pool, rollout_length, rewarder, total_step)
                    # rollout_model(args, predict_env, agent, model_pool, env_pool, rollout_length)
                    print("end rollouting")

            elif args.use_algo == 'flowrl':
                if i == 0:
                    train_predict_model(args, env_pool, predict_env)
                # function1
                train_predict_model_by_couple_flow(args, env_pool, predict_env, agent, rollout_depth, rewarder, model_agent, cur_step, total_step)
                if cur_step > 0 and cur_step % args.model_train_freq == 0 and args.real_ratio < 1.0:
                    # if cur_step > 0 and cur_step % 500 == 0 and args.real_ratio < 1.0:
                    new_rollout_length = set_rollout_length(epoch_step, rollout_schedule)
                    if rollout_length != new_rollout_length:
                        rollout_length = new_rollout_length
                        model_pool = resize_model_pool(args, rollout_length, model_pool)
                    rollout_model(args, predict_env, agent, model_pool, env_pool, rollout_length)

            elif args.use_algo == 'mbpo':
                if cur_step > 0 and cur_step % args.model_train_freq == 0 and args.real_ratio < 1.0:
                    train_predict_model(args, env_pool, predict_env)
                    new_rollout_length = set_rollout_length(epoch_step, rollout_schedule)
                    if rollout_length != new_rollout_length:
                        rollout_length = new_rollout_length
                        model_pool = resize_model_pool(args, rollout_length, model_pool)
                    rollout_model(args, predict_env, agent, model_pool, env_pool, rollout_length)

            cur_state, action, next_state, reward, done, info = env_sampler.sample(agent)
            env_pool.push(cur_state, action, reward, next_state, done)

            if len(env_pool) > args.min_pool_size:
                train_policy_steps += train_policy_repeats(args, total_step, train_policy_steps, cur_step, env_pool, model_pool, agent)

            total_step += 1

            if total_step % args.epoch_length == 0:
                '''
                avg_reward_len = min(len(env_sampler.path_rewards), 5)
                avg_reward = sum(env_sampler.path_rewards[-avg_reward_len:]) / avg_reward_len
                logging.info("Step Reward: " + str(total_step) + " " + str(env_sampler.path_rewards[-1]) + " " + str(avg_reward))
                print(total_step, env_sampler.path_rewards[-1], avg_reward)
                '''
                env_sampler.current_state = None
                sum_reward = 0
                done = False
                test_step = 0
                while (not done) and (test_step != args.max_path_length):
                    cur_state, action, next_state, reward, done, info = env_sampler.sample(agent, eval_t=True)
                    sum_reward += reward
                    test_step += 1
                # logger.record_tabular("total_step", total_step)
                # logger.record_tabular("sum_reward", sum_reward)
                # logger.dump_tabular()
                folder_path = f"./results/{args.env_name}"
                if not os.path.exists(folder_path):
                    os.makedirs(folder_path)
                if args.use_algo == 'discriminator':
                    file_name = f"{folder_path}/{args.env_name}_discriminator_{now02}.txt"
                elif args.use_algo == 'flowrl':
                    file_name = f"{folder_path}/{args.env_name}_flowRL_{now02}.txt"
                elif args.use_algo == 'mbpo':
                    file_name = f"{folder_path}/{args.env_name}_mbpo_{now02}.txt"
                with open(file_name, "a") as file:
                    file.write(f"{total_step}\t{sum_reward}\n")
                logging.info("Step Reward: " + str(total_step) + " " + str(sum_reward))
                print(total_step, sum_reward)
                torch.cuda.empty_cache()
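To figure out where the 12 hours actually go, I plan to wrap each phase of the inner loop (flow training, model training, rollouts, policy updates, environment stepping) with a wall-clock timer. This is only a minimal sketch: timed and phase_times are names I made up, and the calls in the trailing comments refer to the functions in my train() above.

import time
from collections import defaultdict

import torch

phase_times = defaultdict(float)

def timed(name, fn, *args, **kwargs):
    # Accumulate the wall-clock time of fn under `name`; synchronize so
    # pending GPU kernels are counted against the phase that launched them.
    if torch.cuda.is_available():
        torch.cuda.synchronize()
    t0 = time.perf_counter()
    result = fn(*args, **kwargs)
    if torch.cuda.is_available():
        torch.cuda.synchronize()
    phase_times[name] += time.perf_counter() - t0
    return result

# Inside the inner loop, e.g. for the 'flowrl' branch:
#   timed("flow", train_predict_model_by_couple_flow, args, env_pool, predict_env,
#         agent, rollout_depth, rewarder, model_agent, cur_step, total_step)
#   timed("model", train_predict_model, args, env_pool, predict_env)
#   timed("rollout", rollout_model, args, predict_env, agent, model_pool, env_pool, rollout_length)
#   train_policy_steps += timed("policy", train_policy_repeats, args, total_step,
#                               train_policy_steps, cur_step, env_pool, model_pool, agent)
# and once per epoch:
#   print(dict(phase_times))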
I have tried increasing the flow's batch_size from 100 to 256 and then to 500, reducing file I/O, and switching to a better GPU (Tesla V100-SXM2-32GB), but none of it seems to make much difference.
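Since switching to a V100 changed almost nothing, I also want to double-check that the flow is actually running on the GPU rather than the CPU. A quick sanity check (assuming couple_flow, the module passed into train(), is a torch.nn.Module):

import torch

print(torch.cuda.is_available())              # expect True
print(next(couple_flow.parameters()).device)  # expect cuda:0, not cpu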
Did you ever manage to improve the performance? I have also just started with model-based reinforcement learning and am shocked by how slow it is.