I'm working through some of the OpenAI Gym problems and seem to have been stumped by Lunar Lander. My Deep Q-Learning agent appears to run, but it is struggling to produce a network that reliably achieves a positive score.
I've tried a range of different hyperparameters, including the network architecture (number of layers), the learning rate, and epsilon (for epsilon-greedy action selection). Unfortunately, none of them seem to make a significant difference to performance. Is there something wrong with my implementation, or should I be making more extreme changes to my hyperparameters?
All of my code is below. I've organized it so that it should (hopefully) run for you without any adjustments.
##IMPORTS##
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import numpy as np
import pandas as pd
import random
import sys
from time import time
from collections import deque, defaultdict, namedtuple
import gym
##HYPERPARAMS##
env = gym.make('LunarLander-v2')
NUM_ACTIONS = env.action_space.n
STATE_DIM = env.observation_space.shape[0]
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
MAX_EPISODES = 500
EPSILON = 0.05
REPLAY_SIZE = 10000
BATCH_SIZE = 64
GAMMA = 0.99
LR = 1e-4
UPDATE_RATE = 10       # number of gradient updates between target-network syncs
MIN_MEMORY_SIZE = 500  # transitions collected before learning starts
##Set Up Classes##
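# Q-network: two hidden layers (16 and 32 units) mapping a state vector to one Q-value per action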
class DQNet(nn.Module):
    def __init__(self, stateDim, actionDim):
        super().__init__()
        # Hidden Layer 1
        self.fc1 = nn.Sequential(
            nn.Linear(in_features=stateDim, out_features=16),
            nn.ReLU(True))
        # Hidden Layer 2
        self.fc2 = nn.Sequential(
            nn.Linear(in_features=16, out_features=32),
            nn.ReLU(True))
        # Output Layer
        # No activation function because we are estimating Q values
        self.fcOutput = nn.Linear(in_features=32, out_features=actionDim)

    def forward(self, x):
        """Forward Pass"""
        out = self.fc1(x)
        out = self.fc2(out)
        out = self.fcOutput(out)
        return out
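# Fixed-size experience replay buffer; returns a uniform random minibatch of stored transitions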
class ReplayMemory(object):
    def __init__(self, replaySize, batchSize):
        self.batchSize = batchSize
        self.memory = deque(maxlen=replaySize)
        self.experience = namedtuple("Experience",
                                     field_names=["State", "Action", "NextState", "Reward"])

    def addExperience(self, state, action, nextState, reward):
        experience = self.experience(state, action, nextState, reward)
        self.memory.append(experience)

    def sample(self):
        batchSize = min(self.batchSize, len(self))
        return random.sample(self.memory, batchSize)

    def __len__(self):
        return len(self.memory)
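# DQN agent: a policy network that is trained every step and a target network that is synced to it every UPDATE_RATE updates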
class Agent:
    def __init__(self, stateSize, numActions):
        self.stateSize = stateSize
        self.numActions = numActions
        self.netPolicy = DQNet(stateSize, numActions).to(device)
        self.netTarget = DQNet(stateSize, numActions).to(device)
        self.netTarget.load_state_dict(self.netPolicy.state_dict())
        self.optimizer = optim.Adam(self.netPolicy.parameters(), lr=LR)
        self.loss = nn.MSELoss()
        self.memory = ReplayMemory(REPLAY_SIZE, BATCH_SIZE)
        self.numUpdates = 0
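
    # Performs one gradient step on a sampled minibatch, using the target network for the Bellman targets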
    def update(self):
        batch = self.memory.sample()
        states = [experience.State for experience in batch]
        states = torch.tensor(states, dtype=torch.float32, device=device)
        actions = [experience.Action for experience in batch]
        actions = torch.tensor(actions, dtype=torch.int64, device=device)
        actions = actions.unsqueeze(1)
        rewards = [experience.Reward for experience in batch]
        rewards = torch.tensor(rewards, dtype=torch.float32, device=device)
        nextStates = [experience.NextState for experience in batch if experience.NextState is not None]
        nextStates = torch.tensor(nextStates, dtype=torch.float32, device=device)
        # Boolean mask marking which transitions in the batch have a next state (kept on the same device for indexing)
        noNextStateFilter = torch.tensor([experience.NextState is not None for experience in batch],
                                         dtype=torch.bool, device=device)
        self.netPolicy.train()
        # Generate all Q values (the Q value of every action for each state),
        # then keep only the Q values of the actions that were actually taken
        allQVals = self.netPolicy(states)
        predictedQVals = torch.gather(input=allQVals, dim=1, index=actions)
        with torch.no_grad():
            self.netTarget.eval()
            allTargetQVals = self.netTarget(nextStates)
            maxTargetQVals = torch.zeros(len(batch), device=device)
            maxTargetQVals[noNextStateFilter] = allTargetQVals.max(dim=1)[0]
        trueQVals = rewards + (GAMMA * maxTargetQVals)
        trueQVals = trueQVals.unsqueeze(1)  # make it a 2-D tensor of shape (BATCH_SIZE, 1)
        loss = self.loss(predictedQVals, trueQVals)
        self.optimizer.zero_grad()
        loss.backward()
        # Gradient clipping for training stability (rescale gradients so their total norm is at most 3)
        nn.utils.clip_grad_norm_(self.netPolicy.parameters(), 3)
        self.optimizer.step()
        self.numUpdates += 1
        if self.numUpdates % UPDATE_RATE == 0:
            # print(f"Update: {self.numUpdates} - Overriding Target Network...")
            self.netTarget.load_state_dict(self.netPolicy.state_dict())
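
    # Epsilon-greedy action selection: with probability epsilon pick a random non-greedy action, otherwise take the greedy action from the policy network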
    def act_epsilon_greedy(self, state, epsilon):
        if epsilon > 1 or epsilon < 0:
            raise ValueError('Value of epsilon must be between 0 and 1')
        with torch.no_grad():
            self.netPolicy.eval()
            state = torch.tensor(state, dtype=torch.float32, device=device)
            out = self.netPolicy(state)
            maxAction = int(out.argmax())
        if random.random() < epsilon:
            otherActions = [action for action in range(NUM_ACTIONS) if action != maxAction]
            action = random.choice(otherActions)
        else:
            action = maxAction
        return action

    def learn(self, state, action, nextState, reward):
        self.memory.addExperience(state, action, nextState, reward)
##Training##
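# Run episodes with a fixed epsilon; gradient updates begin once the replay buffer holds more than MIN_MEMORY_SIZE transitions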
currentEpisode = 1
scoresList = []
agent = Agent(stateSize=STATE_DIM, numActions=NUM_ACTIONS)
while currentEpisode < MAX_EPISODES + 1:
    state = env.reset()
    state = state[0]
    score = 0
    terminated = False
    while not terminated:
        action = agent.act_epsilon_greedy(state, EPSILON)
        nextState, reward, terminated, truncated, info = env.step(action)
        agent.learn(state, action, nextState, reward)
        state = nextState
        score += reward
        if len(agent.memory) > MIN_MEMORY_SIZE:
            agent.update()
    if len(agent.memory) > MIN_MEMORY_SIZE:
        scoresList.append(score)
        print(f"Current Episode: {currentEpisode} - Current Score: {score}")
    currentEpisode += 1