我正在训练一个 RL 代理来优化作业车间(job shop)制造系统中的调度。我的方法基于这份代码:https://github.com/AndreasKuhnle/SimRLFab。我把环境迁移到了 Gymnasium 环境,并将 Python 版本从 3.6 更新到 3.10。我正在测试不同的算法,例如 PPO、TRPO 和 DQN。在训练期间,我注意到每个 episode 的平均奖励(即 TensorBoard 中的 ep_rew_mean)随着时间的推移而下降,这与我的预期相反——它本应上升。奖励函数是机器的利用率,应该被最大化。这种行为的原因可能是什么?
我使用的是一个“自制”的、比较简单的 Gymnasium 环境。由于我不算专家,在我看来它像是学会了最小化奖励,尽管不应该如此。我的想法对吗?据我的理解,利用率应该被最大化,因此奖励是正值,其计算方式为
r_util = exp(util/1.5) - 1
来自 TensorBoard 的 ep_rew_mean 曲线:
TensorBoard 中的损失曲线。它似乎至少学到了一些东西,虽然我不确定它学到的是不是错误的东西。
step 函数(其中调用了奖励计算)如下:
def step(self, actions):
    """Advance the simulation by one agent decision.

    Hands *actions* to the transport agent waiting for a decision, resumes
    the SimPy simulation until the next decision point, and returns the
    Gymnasium 5-tuple ``(obs, reward, terminated, truncated, info)``.

    NOTE(review): ``reward`` stays ``None`` (and ``round(reward, 5)`` below
    would raise) if ``Transport.agents_waiting_for_action`` is empty on
    entry — confirm the simulation guarantees a waiting agent per step.
    """
    reward = None
    terminal = False
    states = None
    truncated = False
    info = {}
    self.step_counter += 1
    # print(self.counter, "Agent-Action: ", int(actions))
    # Periodically export logs/statistics unless logging is disabled.
    if (self.step_counter % self.parameters['EXPORT_FREQUENCY'] == 0 or self.step_counter % self.max_episode_timesteps == 0) \
            and not self.parameters['EXPORT_NO_LOGS']:
        self.export_statistics(self.step_counter, self.count_episode)
    # Step budget exhausted -> episode is truncated (time limit), not terminated.
    if self.step_counter == self.max_episode_timesteps:
        print("Last episode action ", datetime.now())
        truncated = True
    # If multiple transport agents then for loop required
    # NOTE(review): the list is mutated (pop) while being iterated; with more
    # than one waiting agent every second entry would be skipped. This only
    # behaves as intended with a single transport agent — verify.
    for agent in Transport.agents_waiting_for_action:
        agent = Transport.agents_waiting_for_action.pop(0)
        # Map the flat gym action onto the agent's next transport decision.
        if self.parameters['TRANSP_AGENT_ACTION_MAPPING'] == 'direct':
            agent.next_action = [int(actions)]
        elif self.parameters['TRANSP_AGENT_ACTION_MAPPING'] == 'resource':
            agent.next_action = [int(actions[0]), int(actions[1])]
        agent.state_before = None
        # Wake the simulation process, then block until it reaches the next
        # decision point (the events are stored in the parameters dict).
        self.parameters['continue_criteria'].succeed()
        self.parameters['continue_criteria'] = self.env.event()
        self.env.run(until=self.parameters['step_criteria'])  # Waiting until action is processed in simulation environment
        # Simulation is now in state after action processing
        reward, terminal = agent.calculate_reward(actions)
        if terminal:
            print("Last episode action ", datetime.now())
            self.export_statistics(self.step_counter, self.count_episode)
    # The sim run above re-queues the agent that needs the next decision.
    agent = Transport.agents_waiting_for_action[0]
    states = agent.calculate_state()  # Calculate state for next action determination
    # Back-fill action/reward/validity into the pending statistics row, then
    # open a new row for the next step.
    if self.parameters['TRANSP_AGENT_ACTION_MAPPING'] == 'direct':
        self.statistics['stat_agent_reward'][-1][3] = [int(actions)]
    elif self.parameters['TRANSP_AGENT_ACTION_MAPPING'] == 'resource':
        self.statistics['stat_agent_reward'][-1][3] = [int(actions[0]), int(actions[1])]
    self.statistics['stat_agent_reward'][-1][4] = round(reward, 5)
    self.statistics['stat_agent_reward'][-1][5] = agent.next_action_valid
    self.statistics['stat_agent_reward'].append([self.count_episode, self.step_counter, round(self.env.now, 5),
                                                 None, None, None, states])
    # done = truncated or terminal
    #if truncated:
    #self.reset()
    return states, reward, terminal, truncated, info
奖励函数是这样计算的:
def calculate_reward(self, action):
    """Compute the reward for the last processed action.

    Returns a ``(reward, terminal)`` pair. The dense reward type is picked
    via ``parameters['TRANSP_AGENT_REWARD']``; when an explicit episode
    limit is configured the dense reward is suppressed and a sparse reward
    is paid out on the terminal step instead.
    """
    terminal = False
    reward = self.parameters['TRANSP_AGENT_REWARD_INVALID_ACTION']  # = 0.0
    # Still within the invalid-action budget -> compute the dense reward.
    if self.invalid_counter < self.parameters['TRANSP_AGENT_MAX_INVALID_ACTIONS']:  # If true, then invalid action selected
        reward_type = self.parameters['TRANSP_AGENT_REWARD']
        if reward_type == "valid_action":
            reward = get_reward_valid_action(self, reward)
        elif reward_type == "utilization":
            reward = get_reward_utilization(self, reward)
    else:
        # Too many invalid actions in a row: reset the budget, pay nothing.
        self.invalid_counter = 0
        reward = 0.0
        # result_terminal = True
    if self.next_action_valid:
        self.invalid_counter = 0
        self.counter_action_subsets[0] += 1
        destination = self.next_action_destination
        if destination != -1 and self.next_action_origin != -1:
            # Track entry (to machine) vs exit (to sink) moves separately.
            if destination.type == 'machine':
                self.counter_action_subsets[1] += 1
            elif destination.type == 'sink':
                self.counter_action_subsets[2] += 1
    # If explicit episode limits are set in configuration
    episode_limit = self.parameters['TRANSP_AGENT_REWARD_EPISODE_LIMIT']
    if episode_limit > 0:
        reward = 0.0  # dense reward suppressed in sparse-reward mode
        limit_type = self.parameters['TRANSP_AGENT_REWARD_EPISODE_LIMIT_TYPE']
        limit_reached = (
            (limit_type == 'valid' and self.counter_action_subsets[0] == episode_limit)
            or (limit_type == 'entry' and self.counter_action_subsets[1] == episode_limit)
            or (limit_type == 'exit' and self.counter_action_subsets[2] == episode_limit)
            or (limit_type == 'time' and self.env.now - self.last_reward_calc_time > episode_limit)
        )
        if limit_reached:
            terminal = True
            self.last_reward_calc_time = self.env.now
            self.invalid_counter = 0
            self.counter_action_subsets = [0, 0, 0]
    if terminal:
        # Terminal step: pay out the configured sparse reward.
        sparse_type = self.parameters['TRANSP_AGENT_REWARD_SPARSE']
        if sparse_type == "utilization":
            reward = get_reward_sparse_utilization(self)
        elif sparse_type == "waiting_time":
            reward = get_reward_sparse_waiting_time(self)
        elif sparse_type == "valid_action":
            reward = get_reward_sparse_valid_action(self)
    else:
        self.last_reward_calc_time = self.env.now
    self.latest_reward = reward
    return reward, terminal
def get_reward_utilization(transport_resource, invalid_reward):
result_reward = invalid_reward
if transport_resource.next_action_destination == -1 or transport_resource.next_action_origin == -1: # Waiting or empty action selected
result_reward = transport_resource.parameters['TRANSP_AGENT_REWARD_WAITING_ACTION'] # = 0.0
elif transport_resource.next_action_valid:
util = 0.0
for mach in transport_resource.resources['machines']:
util += mach.get_utilization_step() # calculation of utilization of machines
util = util / transport_resource.parameters['NUM_MACHINES']
transport_resource.last_reward_calc = util
result_reward = np.exp(util / 1.5) - 1.0
if transport_resource.next_action_destination.type == 'machine':
result_reward = transport_resource.parameters['TRANSP_AGENT_REWARD_SUBSET_WEIGHTS'][0] * result_reward # here the weight is = 1.0
else:
result_reward = transport_resource.parameters['TRANSP_AGENT_REWARD_SUBSET_WEIGHTS'][1] * result_reward # here the weight is = 1.0
return result_reward
reset 函数如下所示:
def reset(self):
    """Start a new episode and return the initial ``(obs, info)`` pair."""
    print("####### Reset Environment #######")
    self.count_episode += 1
    self.step_counter = 0
    # Swap in the alternative production scenario once the configured
    # episode count is reached (exact match, fires at most once).
    if self.count_episode == self.parameters['CHANGE_SCENARIO_AFTER_EPISODES']:
        self.change_production_parameters()
    print("Sim start time: ", self.statistics['sim_start_time'])
    # Setup and start simulation
    # NOTE(review): the SimPy environment is only run here on the very first
    # reset (env.now == 0.0); later resets continue the same simulation
    # instead of rebuilding it — confirm this is the intended semantics.
    if self.env.now == 0.0:
        print('Run machine shop simpy environment')
        self.env.run(until=self.parameters['step_criteria'])
    first_transport = self.resources['transps'][0]
    return np.array(first_transport.calculate_state()), {}
我已经检查过奖励函数,据我所知它的工作方式与我预期的一致。此外,我还核对了传到 TensorBoard 的奖励与我日志文件中记录的奖励是否一致。我读过这篇文章《为什么 ep_rew_mean 会随着时间的推移而下降?》,但它没有帮到我。有谁知道为什么每个 episode 的平均奖励会随时间下降吗?注:如果需要,我可以提供更多代码。提前致谢!
编辑:我的完整代码可以在这里找到:JSP_Environment