I am new to DRL, and I would like to use DRL (with PyTorch or TensorFlow) to solve a simple optimization problem. However, there is a bug in my code. Can anyone help me debug it? Thanks in advance!

Error: `mean(): argument 'input' (position 1) must be Tensor, not list`
The code tries to solve the following optimization problem:

Min 2*X1^2 + 4*X2
subject to X1 + 5*X2 >= 5

I know there are better alternatives for solving optimization problems, but this is only an illustrative proof of concept for developing more complex models. Most of the code available online uses predefined environments; here, however, we need to define our own environment.
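For reference, one such alternative can verify the target solution. Below is a minimal sketch with `scipy.optimize.minimize` (assuming SciPy is available; it is separate from the DRL code, and the variable names are mine). Since the objective decreases as X2 decreases, the constraint is active at the optimum, which gives X1 = 0.2, X2 = 0.96 and an objective value of 3.92:

```python
# Sanity check (not part of the DRL code): solve
#   min 2*x1^2 + 4*x2   s.t.   x1 + 5*x2 >= 5
# directly, to know what the learned policy should converge to.
from scipy.optimize import minimize

objective = lambda x: 2 * x[0] ** 2 + 4 * x[1]
constraints = [{"type": "ineq", "fun": lambda x: x[0] + 5 * x[1] - 5}]

result = minimize(objective, x0=[0.0, 1.0], constraints=constraints)
print(result.x, result.fun)  # ~[0.2, 0.96], ~3.92
```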
```python
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import gym
import matplotlib.pyplot as plt
# Define a custom Gym environment for the constrained optimization problem
class CustomConstrainedEnv(gym.Env):
    def __init__(self):
        super(CustomConstrainedEnv, self).__init__()
        self.action_space = gym.spaces.Box(low=-1, high=1, shape=(2,), dtype=np.float32)
        self.observation_space = gym.spaces.Box(low=-1, high=1, shape=(2,), dtype=np.float32)

    def reset(self):
        self.state = torch.rand(2) * 2 - 1
        return self.state

    def step(self, action):
        self.state = torch.clamp(self.state, -1, 1)
        x1, x2 = self.state
        objective = 2 * x1**2 + 4 * x2
        constraint = x1 + 5 * x2 - 5
        reward = -objective
        penalty = -1e6 * torch.max(torch.tensor([0.0]), constraint)
        done = False
        return self.state, reward + penalty, done, {}
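# Note: as written, step() clamps the state from reset() but never applies
# `action` to it, so the reward depends only on the randomly sampled state.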
# Define a neural network model for the policy
class PolicyModel(nn.Module):
    def __init__(self, num_actions):
        super(PolicyModel, self).__init__()
        self.fc1 = nn.Linear(2, 64)
        self.fc2 = nn.Linear(64, 64)
        self.mean_head = nn.Linear(64, num_actions)
        self.std_head = nn.Linear(64, num_actions)

    def forward(self, x):
        x = torch.relu(self.fc1(x))
        x = torch.relu(self.fc2(x))
        mean = torch.tanh(self.mean_head(x))
        std = self.softplus(self.std_head(x))  # custom softplus defined below
        return mean, std

    def softplus(self, x):
        return torch.log(1 + torch.exp(x))
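# The hand-rolled softplus above keeps the predicted standard deviation
# strictly positive, as required for sampling with torch.normal.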
# Hyperparameters
learning_rate = 0.001
gamma = 0.99
num_epochs = 500
num_episodes = 100

# Create the custom environment
env = CustomConstrainedEnv()

# Build the policy model
num_actions = env.action_space.shape[0]
policy_model = PolicyModel(num_actions)
optimizer = optim.Adam(policy_model.parameters(), lr=learning_rate)
# Training loop
reward_history = []
for epoch in range(num_epochs):
    states, actions, rewards, old_means, old_stds, returns, advantages = [], [], [], [], [], [], []
    for episode in range(num_episodes):
        state = env.reset()
        done = False
        # while not done:
        for _ in range(10):
            action_means, action_stds = policy_model(state)
            action = torch.normal(action_means, action_stds)
            action = torch.clamp(action, -1, 1)
            new_state, reward, done, _ = env.step(action)
            old_mean, old_std = action_means, action_stds
            states.append(state)
            actions.append(action)
            rewards.append(reward)
            old_means.append(old_mean)
            old_stds.append(old_std)
            state = new_state

    discounted_reward = 0
    advantage = 0
    for t in reversed(range(len(rewards))):
        discounted_reward = rewards[t] + gamma * discounted_reward
        advantage = discounted_reward - old_means[t]
        returns.insert(0, discounted_reward)
        advantages.insert(0, advantage)

    advantages = (advantages - torch.mean(advantages)) / (torch.std(advantages) + 1e-8)

    policy_loss = []
    for t in range(len(states)):
        action_means, action_stds = policy_model(states[t])
        action_dist = torch.distributions.Normal(action_means, action_stds)
        new_action_probs = action_dist.log_prob(actions[t])
        old_action_probs = action_dist.log_prob(actions[t])
        prob_ratio = torch.exp(new_action_probs - old_action_probs)
        surrogate_loss = torch.min(
            prob_ratio * advantages[t],
            torch.clamp(prob_ratio, 1 - 0.2, 1 + 0.2) * advantages[t]
        )
        policy_loss.append(-surrogate_loss)
    policy_loss = torch.stack(policy_loss).mean()

    optimizer.zero_grad()
    policy_loss.backward()
    optimizer.step()
    # Evaluate the learned policy
    total_rewards = []
    for _ in range(10):
        state = env.reset()
        done = False
        episode_reward = 0
        # while not done:
        for _ in range(10):
            action_means, _ = policy_model(state)
            action = action_means
            state, reward, done, _ = env.step(action)
            episode_reward += reward
        total_rewards.append(episode_reward)

    avg_reward = np.mean(total_rewards)
    reward_history.append(avg_reward)
    print(f"Epoch {epoch + 1}/{num_epochs}, Average Reward: {avg_reward}")
# Plot the learning progress
plt.plot(reward_history)
plt.xlabel('Epoch')
plt.ylabel('Average Reward')
plt.title('PPO Learning Progress')
plt.show()
```
The problem is that you are trying to compute the mean of a Python list of tensor values: by the time you normalize, `advantages` is still a list, and `torch.mean` only accepts a tensor. So you should either loop over the values and average them yourself, or (more simply) convert the list into a single tensor before calling `torch.mean`.
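A minimal sketch of the conversion approach, keeping the rest of the training loop as posted (the `.detach()` is an extra precaution I am adding so the stored old-policy values do not participate in the backward pass):

```python
# Replace the failing line
#   advantages = (advantages - torch.mean(advantages)) / (torch.std(advantages) + 1e-8)
# with:
advantages = torch.stack(advantages).detach()  # list of tensors -> one tensor
advantages = (advantages - advantages.mean()) / (advantages.std() + 1e-8)
```

The same pattern applies wherever a list of tensors is fed to a reduction. For example, in the evaluation loop you can accumulate `episode_reward += reward.item()` so that `np.mean(total_rewards)` operates on plain floats.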