I am new to DRL, and I would like to use DRL (with PyTorch or TensorFlow) to solve a simple optimization problem. However, there is a bug in my code. Can anyone help me debug it? Thanks in advance!

Error: `mean(): argument 'input' (position 1) must be Tensor, not list`
The code tries to solve the following optimization problem:

Min 2*X1^2 + 4*X2
subject to X1 + 5*X2 >= 5

I know there are better alternatives for solving optimization problems, but this is only an illustrative proof of concept for developing more complex models. Most of the code available online uses predefined environments; here, however, we need to define our own environment.
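For reference, one such alternative can verify the target solution. Below is a minimal sketch with `scipy.optimize.minimize` (assuming SciPy is available; it is separate from the DRL code, and the variable names are mine). Since the objective decreases as X2 decreases, the constraint is active at the optimum, which gives X1 = 0.2, X2 = 0.96 and an objective value of 3.92:

```python
# Sanity check (not part of the DRL code): solve
#   min 2*x1^2 + 4*x2   s.t.   x1 + 5*x2 >= 5
# directly, to know what the learned policy should converge to.
from scipy.optimize import minimize

objective = lambda x: 2 * x[0] ** 2 + 4 * x[1]
constraints = [{"type": "ineq", "fun": lambda x: x[0] + 5 * x[1] - 5}]

result = minimize(objective, x0=[0.0, 1.0], constraints=constraints)
print(result.x, result.fun)  # ~[0.2, 0.96], ~3.92
```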
```python
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import gym
import matplotlib.pyplot as plt
# Define a custom Gym environment for the constrained optimization problem
class CustomConstrainedEnv(gym.Env):
    def __init__(self):
        super(CustomConstrainedEnv, self).__init__()
        self.action_space = gym.spaces.Box(low=-1, high=1, shape=(2,), dtype=np.float32)
        self.observation_space = gym.spaces.Box(low=-1, high=1, shape=(2,), dtype=np.float32)

    def reset(self):
        self.state = torch.rand(2) * 2 - 1
        return self.state

    def step(self, action):
        self.state = torch.clamp(self.state, -1, 1)
        x1, x2 = self.state
        objective = 2 * x1**2 + 4 * x2
        constraint = x1 + 5 * x2 - 5
        reward = -objective
        penalty = -1e6 * torch.max(torch.tensor([0.0]), constraint)
        done = False
        return self.state, reward + penalty, done, {}
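# Note: as written, step() clamps the state from reset() but never applies
# `action` to it, so the reward depends only on the randomly sampled state.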
# Define a neural network model for the policy
class PolicyModel(nn.Module):
    def __init__(self, num_actions):
        super(PolicyModel, self).__init__()
        self.fc1 = nn.Linear(2, 64)
        self.fc2 = nn.Linear(64, 64)
        self.mean_head = nn.Linear(64, num_actions)
        self.std_head = nn.Linear(64, num_actions)

    def forward(self, x):
        x = torch.relu(self.fc1(x))
        x = torch.relu(self.fc2(x))
        mean = torch.tanh(self.mean_head(x))
        std = self.softplus(self.std_head(x))  # custom softplus defined below
        return mean, std

    def softplus(self, x):
        return torch.log(1 + torch.exp(x))
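# The hand-rolled softplus above keeps the predicted standard deviation
# strictly positive, as required for sampling with torch.normal.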
# Hyperparameters
learning_rate = 0.001
gamma = 0.99
num_epochs = 500
num_episodes = 100

# Create the custom environment
env = CustomConstrainedEnv()

# Build the policy model
num_actions = env.action_space.shape[0]
policy_model = PolicyModel(num_actions)
optimizer = optim.Adam(policy_model.parameters(), lr=learning_rate)
# Training loop
reward_history = []
for epoch in range(num_epochs):
    states, actions, rewards, old_means, old_stds, returns, advantages = [], [], [], [], [], [], []
    for episode in range(num_episodes):
        state = env.reset()
        done = False
        # while not done:
        for _ in range(10):
            action_means, action_stds = policy_model(state)
            action = torch.normal(action_means, action_stds)
            action = torch.clamp(action, -1, 1)
            new_state, reward, done, _ = env.step(action)
            old_mean, old_std = action_means, action_stds
            states.append(state)
            actions.append(action)
            rewards.append(reward)
            old_means.append(old_mean)
            old_stds.append(old_std)
            state = new_state

    discounted_reward = 0
    advantage = 0
    for t in reversed(range(len(rewards))):
        discounted_reward = rewards[t] + gamma * discounted_reward
        advantage = discounted_reward - old_means[t]
        returns.insert(0, discounted_reward)
        advantages.insert(0, advantage)

    advantages = (advantages - torch.mean(advantages)) / (torch.std(advantages) + 1e-8)

    policy_loss = []
    for t in range(len(states)):
        action_means, action_stds = policy_model(states[t])
        action_dist = torch.distributions.Normal(action_means, action_stds)
        new_action_probs = action_dist.log_prob(actions[t])
        old_action_probs = action_dist.log_prob(actions[t])
        prob_ratio = torch.exp(new_action_probs - old_action_probs)
        surrogate_loss = torch.min(
            prob_ratio * advantages[t],
            torch.clamp(prob_ratio, 1 - 0.2, 1 + 0.2) * advantages[t]
        )
        policy_loss.append(-surrogate_loss)
    policy_loss = torch.stack(policy_loss).mean()

    optimizer.zero_grad()
    policy_loss.backward()
    optimizer.step()
    # Evaluate the learned policy
    total_rewards = []
    for _ in range(10):
        state = env.reset()
        done = False
        episode_reward = 0
        # while not done:
        for _ in range(10):
            action_means, _ = policy_model(state)
            action = action_means
            state, reward, done, _ = env.step(action)
            episode_reward += reward
        total_rewards.append(episode_reward)

    avg_reward = np.mean(total_rewards)
    reward_history.append(avg_reward)
    print(f"Epoch {epoch + 1}/{num_epochs}, Average Reward: {avg_reward}")
# Plot the learning progress
plt.plot(reward_history)
plt.xlabel('Epoch')
plt.ylabel('Average Reward')
plt.title('PPO Learning Progress')
plt.show()
```
The problem is that you are trying to compute the mean of a Python list of tensor values: by the time you normalize, `advantages` is still a list, and `torch.mean` only accepts a tensor. So you should either loop over the values and average them yourself, or (more simply) convert the list into a single tensor before calling `torch.mean`.
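A minimal sketch of the conversion approach, keeping the rest of the training loop as posted (the `.detach()` is an extra precaution I am adding so the stored old-policy values do not participate in the backward pass):

```python
# Replace the failing line
#   advantages = (advantages - torch.mean(advantages)) / (torch.std(advantages) + 1e-8)
# with:
advantages = torch.stack(advantages).detach()  # list of tensors -> one tensor
advantages = (advantages - advantages.mean()) / (advantages.std() + 1e-8)
```

The same pattern applies wherever a list of tensors is fed to a reduction. For example, in the evaluation loop you can accumulate `episode_reward += reward.item()` so that `np.mean(total_rewards)` operates on plain floats.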