Whenever I try to train my policy using a memory buffer, I keep running into backward-pass errors or in-place autograd errors.
Here is the function causing the problem:
def reinforce(pi, episodes, max_steps=100, actor_fpath='actor.pth', critic_fpath='critic.pth', decay=False):
    '''
    In this function, instead of using advantage estimation and training online, we implement policy gradients
    using gains and the REINFORCE algorithm for on-policy optimization
    '''
    optimizer_actor = optim.AdamW(pi.parameters(), lr=p_lr)
    running_avg_reward = 0
    alpha = 1e-2
    mem = ReplayMemory(maxlen=2*max_steps)
    for e in range(episodes):
        state = sample_non_colliding(sample_state, is_colliding, sample_bounds)
        traj = []
        done = False
        steps = 0
        while not done and steps < max_steps:
            # Sample an action and step the environment
            action, logprob = pi.sample(state)
            next_state, reward, done = step_env(state, action)
            # Update running average of reward
            running_avg_reward = (1 - alpha) * running_avg_reward + alpha * reward
            # Append this transition to our trajectory
            traj.append([state, action, next_state, reward, logprob])
            steps += 1
        # Compute the gain at all steps and add this to our replay buffer
        traj = compute_gain(traj)
        mem.add_trajectory(traj)
        # Train our policy
        data = DataLoader(mem, batch_size=policy_bs, shuffle=True)
        for i, batch in enumerate(data):
            if i > 10: break
            gains = batch['gains']
            logprobs = batch['logprobs']
            weight = (gains - running_avg_reward).unsqueeze(1).detach()
            actor_loss = (-logprobs * weight).sum()
            optimizer_actor.zero_grad()
            actor_loss.backward(retain_graph=True)
            optimizer_actor.step()
        if e % 100 == 0:
            success_rate, avg_reward_per_step, avg_reward_per_episode = evaluate_policy(pi, trials=50, maxsteps=50)
            print(f'Episode: {e}, Success rate: {success_rate}, average reward per step: {avg_reward_per_step}, avg reward per episode {avg_reward_per_episode}')
        if decay: pi.decay_variance()
If I don't set retain_graph=True in actor_loss.backward(), I get this error:
Sinas-MacBook-Pro:actor_critic shazeghi$ python vanilla_pg.py
/Library/Frameworks/Python.framework/Versions/3.11/lib/python3.11/site-packages/torch/autograd/graph.py:825: UserWarning: Error detected in torch::autograd::CopySlices. Traceback of forward call that caused the error:
File "/Users/shazeghi/Documents/Comp_Robotics/advanced_robotics/actor_critic/vanilla_pg.py", line 237, in <module>
reinforce(pi, 5)
File "/Users/shazeghi/Documents/Comp_Robotics/advanced_robotics/actor_critic/vanilla_pg.py", line 123, in reinforce
mem.add_trajectory(traj)
File "/Users/shazeghi/Documents/Comp_Robotics/advanced_robotics/actor_critic/policy.py", line 95, in add_trajectory
self.push(s, a, ns, r, lp, g)
File "/Users/shazeghi/Documents/Comp_Robotics/advanced_robotics/actor_critic/policy.py", line 85, in push
self.logprobs[self.position] = lp
(Triggered internally at /Users/runner/work/pytorch/pytorch/pytorch/torch/csrc/autograd/python_anomaly_mode.cpp:115.)
return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass
Traceback (most recent call last):
File "/Users/shazeghi/Documents/Comp_Robotics/advanced_robotics/actor_critic/vanilla_pg.py", line 237, in <module>
reinforce(pi, 5)
File "/Users/shazeghi/Documents/Comp_Robotics/advanced_robotics/actor_critic/vanilla_pg.py", line 135, in reinforce
actor_loss.backward()
File "/Library/Frameworks/Python.framework/Versions/3.11/lib/python3.11/site-packages/torch/_tensor.py", line 581, in backward
torch.autograd.backward(
File "/Library/Frameworks/Python.framework/Versions/3.11/lib/python3.11/site-packages/torch/autograd/__init__.py", line 347, in backward
_engine_run_backward(
File "/Library/Frameworks/Python.framework/Versions/3.11/lib/python3.11/site-packages/torch/autograd/graph.py", line 825, in _engine_run_backward
return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
RuntimeError: Trying to backward through the graph a second time (or directly access saved tensors after they have already been freed). Saved intermediate values of the graph are freed when you call .backward() or autograd.grad(). Specify retain_graph=True if you need to backward through the graph a second time or if you need to access saved tensors after calling backward.
Now, if I do set retain_graph=True, I get this error instead:
actor_critic shazeghi$ python vanilla_pg.py
/Library/Frameworks/Python.framework/Versions/3.11/lib/python3.11/site-packages/torch/autograd/graph.py:825: UserWarning: Error detected in AddmmBackward0. Traceback of forward call that caused the error:
File "/Users/shazeghi/Documents/Comp_Robotics/advanced_robotics/actor_critic/vanilla_pg.py", line 237, in <module>
reinforce(pi, 5)
File "/Users/shazeghi/Documents/Comp_Robotics/advanced_robotics/actor_critic/vanilla_pg.py", line 111, in reinforce
action, logprob = pi.sample(state)
File "/Users/shazeghi/Documents/Comp_Robotics/advanced_robotics/actor_critic/policy.py", line 38, in sample
mu = self.forward(state)
File "/Users/shazeghi/Documents/Comp_Robotics/advanced_robotics/actor_critic/policy.py", line 23, in forward
out2 = self.tanh(self.layer2(out1))
File "/Library/Frameworks/Python.framework/Versions/3.11/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1736, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
File "/Library/Frameworks/Python.framework/Versions/3.11/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1747, in _call_impl
return forward_call(*args, **kwargs)
File "/Library/Frameworks/Python.framework/Versions/3.11/lib/python3.11/site-packages/torch/nn/modules/linear.py", line 125, in forward
return F.linear(input, self.weight, self.bias)
(Triggered internally at /Users/runner/work/pytorch/pytorch/pytorch/torch/csrc/autograd/python_anomaly_mode.cpp:115.)
return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass
Traceback (most recent call last):
File "/Users/shazeghi/Documents/Comp_Robotics/advanced_robotics/actor_critic/vanilla_pg.py", line 237, in <module>
reinforce(pi, 5)
File "/Users/shazeghi/Documents/Comp_Robotics/advanced_robotics/actor_critic/vanilla_pg.py", line 135, in reinforce
actor_loss.backward(retain_graph = True)
File "/Library/Frameworks/Python.framework/Versions/3.11/lib/python3.11/site-packages/torch/_tensor.py", line 581, in backward
torch.autograd.backward(
File "/Library/Frameworks/Python.framework/Versions/3.11/lib/python3.11/site-packages/torch/autograd/__init__.py", line 347, in backward
_engine_run_backward(
File "/Library/Frameworks/Python.framework/Versions/3.11/lib/python3.11/site-packages/torch/autograd/graph.py", line 825, in _engine_run_backward
return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
RuntimeError: one of the variables needed for gradient computation has been modified by an inplace operation: [torch.FloatTensor [12, 2]], which is output 0 of AsStridedBackward0, is at version 3; expected version 1 instead. Hint: the backtrace further above shows the operation that failed to compute its gradient. The variable in question was changed in there or anywhere later. Good luck!
This is strange, because the same loss computation and gradients work fine whenever I train my policy online, but as soon as I use the memory buffer I run into this problem. In case it's needed, here is some additional code defining my Policy and ReplayMemory classes:
class Policy(nn.Module):
    '''
    Learns a parametrized actor network that takes in state (x_t, y_t, xdot_t, ydot_t) and returns mu_x, mu_y,
    the means of 2 Gaussians. We sample these Gaussians to obtain action a_t = (u_x, u_y)
    '''
    def __init__(self, hidden_size, input_size=4, output_size=2, var=0.1, var_decay=0.99) -> None:
        super(Policy, self).__init__()
        self.layer1 = nn.Linear(input_size, hidden_size)
        self.layer2 = nn.Linear(hidden_size, output_size)
        self.var = var
        self.gamma = var_decay
        self.relu = nn.LeakyReLU()
        self.tanh = nn.Tanh()

    def forward(self, state: torch.tensor):
        out1 = self.relu(self.layer1(state))
        out2 = self.tanh(self.layer2(out1))
        return out2

    def decay_variance(self):
        self.var *= self.gamma

    def sample(self, state: torch.tensor) -> torch.Tensor:
        '''
        This method samples an action given a state.
        It does this by computing a predicted mean mu for each action DoF and sampling
        from a Gaussian using that mean and its own fixed variance.
        Returns the action and the logprobs of that action
        '''
        mu = self.forward(state)
        distr = dist.Normal(mu, self.var)
        # action = torch.clip(distr.sample(), -1, 1).clone()
        action = distr.sample()
        # print(f'action: {action}, {action.shape}') DEBUG PRINT
        logprobs = distr.log_prob(action)
        # print(logprobs) DEBUG PRINT
        return action, logprobs


class ReplayMemory(Dataset):
    def __init__(self, maxlen=500, s_shape=4, a_shape=2, dtype=torch.float32):
        self.maxlen = maxlen
        self.current_size = 0
        self.position = 0
        # Pre-allocate fixed-size tensors for the circular buffer
        self.states = torch.zeros(size=(maxlen, s_shape), dtype=dtype)
        self.actions = torch.zeros(size=(maxlen, a_shape), dtype=dtype)
        self.rewards = torch.zeros(size=(maxlen,), dtype=dtype)
        self.next_states = torch.zeros(size=(maxlen, s_shape), dtype=dtype)
        self.gains = torch.zeros(size=(maxlen,), dtype=dtype)
        self.logprobs = torch.zeros(size=(maxlen, a_shape), dtype=dtype)

    def __len__(self):
        return self.current_size

    def __getitem__(self, idx):
        return {
            'state': self.states[idx],
            'action': self.actions[idx],
            'reward': self.rewards[idx],
            'next_state': self.next_states[idx],
            'gains': self.gains[idx],
            'logprobs': self.logprobs[idx]
        }

    def push(self, s, a, ns, r, lp, g):
        # Store transition in the circular buffer
        self.states[self.position] = s
        self.actions[self.position] = a
        self.rewards[self.position] = r
        self.next_states[self.position] = ns
        self.logprobs[self.position] = lp
        self.gains[self.position] = g
        # Update position and size
        self.position = (self.position + 1) % self.maxlen
        self.current_size = min(self.current_size + 1, self.maxlen)

    def add_trajectory(self, trajectory):
        for t in trajectory:
            s, a, ns, r, lp, g = t
            self.push(s, a, ns, r, lp, g)
I've tried detaching various tensors, such as the advantage and the weight. I've tried storing the gains and logprobs in plain lists instead of the ReplayMemory. I've also tried implementing an offline a2c version similar to the REINFORCE implementation above but with a value network, and I always run into problems with actor_loss.backward.
After some reading, I realized the in-place error happens because PyTorch does not want us to compute the loss for the policy update from the logprobs stored in the ReplayMemory. The stored logprobs still carry the autograd graph from the moment they were sampled, and by the time we update the policy that graph no longer matches the current weights of the policy network. I was able to fix this by computing the logprobs on the fly with my policy and then computing the loss, instead of reading them out of the ReplayMemory.
Storing the logprobs in the ReplayMemory is still useful, since it enables things like the importance sampling and clipping used in PPO, and it lets us see how the policy distribution shifts over time (see the sketch after the code below). Here is the new code that works:
# Train our policy
data = DataLoader(mem, batch_size=bs, shuffle=True)
for i, batch in enumerate(data):
    if i > 10: break
    gains = batch['gains']
    states = batch['state']
    actions = batch['action']
    logprobs = pi.get_logprob(states, actions)
    weight = (gains - running_avg_reward).unsqueeze(1).detach()
    actor_loss = (-logprobs * weight).sum()
    optimizer_actor.zero_grad()
    actor_loss.backward()
    optimizer_actor.step()
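The get_logprob method itself is not shown above; as a minimal sketch, assuming the same Gaussian head used in sample (the exact name and signature are just an assumption based on how it is called), it could look like:

def get_logprob(self, states: torch.Tensor, actions: torch.Tensor) -> torch.Tensor:
    # Fresh forward pass, so the resulting graph is attached to the *current* weights
    mu = self.forward(states)
    distr = dist.Normal(mu, self.var)
    # Actions come from the buffer and carry no graph; only the predicted means require grad
    return distr.log_prob(actions)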
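As a rough illustration of why the buffered logprobs are still worth keeping, here is a hypothetical PPO-style clipped update (not part of my actual code) in which the stored, detached logprobs play the role of the old policy and only the freshly computed ones carry gradients:

# Hypothetical PPO-style clipped objective using buffered logprobs as the old policy
old_logprobs = batch['logprobs'].detach()          # recorded at collection time, no graph
new_logprobs = pi.get_logprob(states, actions)     # attached to the current policy weights
ratio = torch.exp(new_logprobs.sum(-1) - old_logprobs.sum(-1))
advantage = (gains - running_avg_reward).detach()
clipped = torch.clamp(ratio, 1 - 0.2, 1 + 0.2) * advantage
actor_loss = -torch.min(ratio * advantage, clipped).mean()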