I'm working through some of the OpenAI Gym problems and seem to have been stumped by Lunar Lander. My Deep Q-Learning agent appears to run, but it is struggling to produce a network that reliably achieves a positive score.
I've tried a range of different hyperparameters, including the network architecture (number of layers), the learning rate, and epsilon (for epsilon-greedy action selection). Unfortunately, none of them seem to make a significant difference to performance. Is there something wrong with my implementation, or should I be making more extreme changes to my hyperparameters?
All of my code is below. I've organized it so that it should (hopefully) run for you without any adjustments.
##IMPORTS##
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import numpy as np
import pandas as pd
import random
import sys
from time import time
from collections import deque, defaultdict, namedtuple
import gym
##HYPERPARAMS##
env = gym.make('LunarLander-v2')
NUM_ACTIONS = env.action_space.n
STATE_DIM = env.observation_space.shape[0]
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
MAX_EPISODES = 500
EPSILON = 0.05
REPLAY_SIZE = 10000
BATCH_SIZE = 64
GAMMA = 0.99
LR = 1e-4
UPDATE_RATE = 10       # number of gradient updates between target-network syncs
MIN_MEMORY_SIZE = 500  # transitions collected before learning starts
##Set Up Classes##
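# Q-network: two hidden layers (16 and 32 units) mapping a state vector to one Q-value per action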
class DQNet(nn.Module):
    def __init__(self, stateDim, actionDim):
        super().__init__()
        # Hidden Layer 1
        self.fc1 = nn.Sequential(
            nn.Linear(in_features=stateDim, out_features=16),
            nn.ReLU(True))
        # Hidden Layer 2
        self.fc2 = nn.Sequential(
            nn.Linear(in_features=16, out_features=32),
            nn.ReLU(True))
        # Output Layer
        # No activation function because we are estimating Q values
        self.fcOutput = nn.Linear(in_features=32, out_features=actionDim)

    def forward(self, x):
        """Forward Pass"""
        out = self.fc1(x)
        out = self.fc2(out)
        out = self.fcOutput(out)
        return out
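# Fixed-size experience replay buffer; returns a uniform random minibatch of stored transitions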
class ReplayMemory(object):
    def __init__(self, replaySize, batchSize):
        self.batchSize = batchSize
        self.memory = deque(maxlen=replaySize)
        self.experience = namedtuple("Experience",
                                     field_names=["State", "Action", "NextState", "Reward"])

    def addExperience(self, state, action, nextState, reward):
        experience = self.experience(state, action, nextState, reward)
        self.memory.append(experience)

    def sample(self):
        batchSize = min(self.batchSize, len(self))
        return random.sample(self.memory, batchSize)

    def __len__(self):
        return len(self.memory)
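# DQN agent: a policy network that is trained every step and a target network that is synced to it every UPDATE_RATE updates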
class Agent:
    def __init__(self, stateSize, numActions):
        self.stateSize = stateSize
        self.numActions = numActions
        self.netPolicy = DQNet(stateSize, numActions).to(device)
        self.netTarget = DQNet(stateSize, numActions).to(device)
        self.netTarget.load_state_dict(self.netPolicy.state_dict())
        self.optimizer = optim.Adam(self.netPolicy.parameters(), lr=LR)
        self.loss = nn.MSELoss()
        self.memory = ReplayMemory(REPLAY_SIZE, BATCH_SIZE)
        self.numUpdates = 0
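
    # Performs one gradient step on a sampled minibatch, using the target network for the Bellman targets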
    def update(self):
        batch = self.memory.sample()
        states = [experience.State for experience in batch]
        states = torch.tensor(states, dtype=torch.float32, device=device)
        actions = [experience.Action for experience in batch]
        actions = torch.tensor(actions, dtype=torch.int64, device=device)
        actions = actions.unsqueeze(1)
        rewards = [experience.Reward for experience in batch]
        rewards = torch.tensor(rewards, dtype=torch.float32, device=device)
        nextStates = [experience.NextState for experience in batch if experience.NextState is not None]
        nextStates = torch.tensor(nextStates, dtype=torch.float32, device=device)
        # Boolean mask marking which transitions in the batch have a next state (kept on the same device for indexing)
        noNextStateFilter = torch.tensor([experience.NextState is not None for experience in batch],
                                         dtype=torch.bool, device=device)
        self.netPolicy.train()
        # Generate all Q values (the Q value of every action for each state),
        # then keep only the Q values of the actions that were actually taken
        allQVals = self.netPolicy(states)
        predictedQVals = torch.gather(input=allQVals, dim=1, index=actions)
        with torch.no_grad():
            self.netTarget.eval()
            allTargetQVals = self.netTarget(nextStates)
            maxTargetQVals = torch.zeros(len(batch), device=device)
            maxTargetQVals[noNextStateFilter] = allTargetQVals.max(dim=1)[0]
        trueQVals = rewards + (GAMMA * maxTargetQVals)
        trueQVals = trueQVals.unsqueeze(1)  # make it a 2-D tensor of shape (BATCH_SIZE, 1)
        loss = self.loss(predictedQVals, trueQVals)
        self.optimizer.zero_grad()
        loss.backward()
        # Gradient clipping for training stability (rescale gradients so their total norm is at most 3)
        nn.utils.clip_grad_norm_(self.netPolicy.parameters(), 3)
        self.optimizer.step()
        self.numUpdates += 1
        if self.numUpdates % UPDATE_RATE == 0:
            # print(f"Update: {self.numUpdates} - Overriding Target Network...")
            self.netTarget.load_state_dict(self.netPolicy.state_dict())
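
    # Epsilon-greedy action selection: with probability epsilon pick a random non-greedy action, otherwise take the greedy action from the policy network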
    def act_epsilon_greedy(self, state, epsilon):
        if epsilon > 1 or epsilon < 0:
            raise ValueError('Value of epsilon must be between 0 and 1')
        with torch.no_grad():
            self.netPolicy.eval()
            state = torch.tensor(state, dtype=torch.float32, device=device)
            out = self.netPolicy(state)
            maxAction = int(out.argmax())
        if random.random() < epsilon:
            otherActions = [action for action in range(NUM_ACTIONS) if action != maxAction]
            action = random.choice(otherActions)
        else:
            action = maxAction
        return action

    def learn(self, state, action, nextState, reward):
        self.memory.addExperience(state, action, nextState, reward)
##Training##
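# Run episodes with a fixed epsilon; gradient updates begin once the replay buffer holds more than MIN_MEMORY_SIZE transitions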
currentEpisode = 1
scoresList = []
agent = Agent(stateSize=STATE_DIM, numActions=NUM_ACTIONS)
while currentEpisode < MAX_EPISODES + 1:
    state = env.reset()
    state = state[0]
    score = 0
    terminated = False
    while not terminated:
        action = agent.act_epsilon_greedy(state, EPSILON)
        nextState, reward, terminated, truncated, info = env.step(action)
        agent.learn(state, action, nextState, reward)
        state = nextState
        score += reward
        if len(agent.memory) > MIN_MEMORY_SIZE:
            agent.update()
    if len(agent.memory) > MIN_MEMORY_SIZE:
        scoresList.append(score)
        print(f"Current Episode: {currentEpisode} - Current Score: {score}")
    currentEpisode += 1