mean(): argument 'input' (position 1) must be Tensor, not list


I am new to DRL and want to use DRL with PyTorch/TensorFlow to solve a simple optimization problem. However, my code throws an error. Could anyone help me debug it? Thanks in advance!

Error: mean(): argument 'input' (position 1) must be Tensor, not list

The code tries to solve the following optimization problem:

Min  2*X1^2 + 4*X2
subject to  X1 + 5*X2 >= 5

I know there are better alternatives for solving an optimization problem like this, but it is only an illustrative example toward developing more complex models. Most of the code available online relies on predefined environments; here, however, we need to define our own environment.

import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import gym
import matplotlib.pyplot as plt

# Define a custom Gym environment for the constrained optimization problem
class CustomConstrainedEnv(gym.Env):
    def __init__(self):
        super(CustomConstrainedEnv, self).__init__()
        self.action_space = gym.spaces.Box(low=-1, high=1, shape=(2,), dtype=np.float32)
        self.observation_space = gym.spaces.Box(low=-1, high=1, shape=(2,), dtype=np.float32)

    def reset(self):
        self.state = torch.rand(2) * 2 - 1
        return self.state

    def step(self, action):
        self.state = torch.clamp(self.state, -1, 1)
        x1, x2 = self.state
        objective = 2 * x1**2 + 4 * x2
        constraint = x1 + 5 * x2 - 5

        reward = -objective
        penalty = -1e6 * torch.max(torch.tensor([0.0]), constraint)

        done = False
        return self.state, reward + penalty, done, {}

# Define a neural network model for the policy
class PolicyModel(nn.Module):
    def __init__(self, num_actions):
        super(PolicyModel, self).__init__()
        self.fc1 = nn.Linear(2, 64)
        self.fc2 = nn.Linear(64, 64)
        self.mean_head = nn.Linear(64, num_actions)
        self.std_head = nn.Linear(64, num_actions)

    def forward(self, x):
        x = torch.relu(self.fc1(x))
        x = torch.relu(self.fc2(x))
        mean = torch.tanh(self.mean_head(x))
        std = self.softplus(self.std_head(x))  # Use the built-in softplus
        return mean, std

    def softplus(self, x):
        return torch.log(1 + torch.exp(x))

# Hyperparameters
learning_rate = 0.001
gamma = 0.99
num_epochs = 500
num_episodes = 100

# Create the custom environment
env = CustomConstrainedEnv()

# Build the policy model
num_actions = env.action_space.shape[0]
policy_model = PolicyModel(num_actions)
optimizer = optim.Adam(policy_model.parameters(), lr=learning_rate)

# Training loop
reward_history = []

for epoch in range(num_epochs):
    states, actions, rewards, old_means, old_stds, returns, advantages = [], [], [], [], [], [], []

    for episode in range(num_episodes):
        state = env.reset()
        done = False

        #while not done:
        for _ in range(10):
            action_means, action_stds = policy_model(state)
            action = torch.normal(action_means, action_stds)
            action = torch.clamp(action, -1, 1)

            new_state, reward, done, _ = env.step(action)
            old_mean, old_std = action_means, action_stds

            states.append(state)
            actions.append(action)
            rewards.append(reward)
            old_means.append(old_mean)
            old_stds.append(old_std)

            state = new_state

        discounted_reward = 0
        advantage = 0
        for t in reversed(range(len(rewards))):
            discounted_reward = rewards[t] + gamma * discounted_reward
            advantage = discounted_reward - old_means[t]
            returns.insert(0, discounted_reward)
            advantages.insert(0, advantage)

        advantages = (advantages - torch.mean(advantages)) / (torch.std(advantages) + 1e-8)

        policy_loss = []
        for t in range(len(states)):
            action_means, action_stds = policy_model(states[t])
            action_dist = torch.distributions.Normal(action_means, action_stds)
            new_action_probs = action_dist.log_prob(actions[t])
            old_action_probs = action_dist.log_prob(actions[t])
            prob_ratio = torch.exp(new_action_probs - old_action_probs)

            surrogate_loss = torch.min(
                prob_ratio * advantages[t],
                torch.clamp(prob_ratio, 1 - 0.2, 1 + 0.2) * advantages[t]
            )
            policy_loss.append(-surrogate_loss)

        policy_loss = torch.stack(policy_loss).mean()

        optimizer.zero_grad()
        policy_loss.backward()
        optimizer.step()

    # Evaluate the learned policy
    total_rewards = []
    for _ in range(10):
        state = env.reset()
        done = False
        episode_reward = 0

        #while not done:
        for _ in range(10):    
            action_means, _ = policy_model(state)
            action = action_means
            state, reward, done, _ = env.step(action)
            episode_reward += reward

        total_rewards.append(episode_reward)

    avg_reward = np.mean(total_rewards)
    reward_history.append(avg_reward)

    print(f"Epoch {epoch + 1}/{num_epochs}, Average Reward: {avg_reward}")

# Plot the learning progress
plt.plot(reward_history)
plt.xlabel('Epoch')
plt.ylabel('Average Reward')
plt.title('PPO Learning Progress')
plt.show()
python deep-learning pytorch
1 Answer

The problem is that you are passing a Python list of tensor values to torch.mean() instead of a tensor (this happens at the line that normalizes `advantages`). You should either turn that list into a single tensor first, or loop over the values and average them yourself, before computing the mean.
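A minimal sketch of that idea, applied to the `advantages` list built in the training loop (this assumes every entry of the list is a tensor of the same shape, as in the posted code):

# advantages is currently a Python list of tensors; torch.mean() only accepts a tensor.
# Stack the list into one tensor first, then normalize it.
advantages = torch.stack(advantages)
advantages = (advantages - advantages.mean()) / (advantages.std() + 1e-8)

If a similar error shows up elsewhere, for example where np.mean(total_rewards) is called on a list of tensors in the evaluation loop, converting the entries to plain Python floats with .item() (or stacking them into a tensor in the same way) is a simple workaround.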
