I'm trying to implement a Monte Carlo method for solving Blackjack, following the approach in Sutton and Barto. Everything seems correct, but when I look at the Q-table, the values suggest sticking in far too many situations. I can't tell whether my approach is flawed or whether I've implemented the code incorrectly.
The agent class, which implements an epsilon-greedy policy:
from collections import defaultdict

import gym
import numpy as np


class MonteCarloAgentEpsilonGreedy:
    def __init__(self, env, discount_factor, epsilon):
        self.env = env
        self.gamma = discount_factor
        self.epsilon = epsilon  # Exploration rate
        # Dictionaries to store the Q-values, cumulative returns and visit counts
        self.Q_values = defaultdict(lambda: np.zeros(env.action_space.n))
        self.Returns = defaultdict(lambda: np.zeros(env.action_space.n))
        self.N = defaultdict(lambda: np.zeros(env.action_space.n))

    def get_action(self, obs):
        """
        Choose an action for a given observation (epsilon-greedy policy).

        Args:
            obs: The observation for which the action is to be determined.

        Returns:
            action: The action with the highest Q-value with probability 1 - epsilon,
                otherwise a random action.
        """
        if np.random.rand() < self.epsilon:
            action = self.env.action_space.sample()  # Choose a random action
        else:
            action = int(np.argmax(self.Q_values[obs]))  # Choose the action with the highest Q-value
        return action

    def update_Q_values(self, episode):
        """
        Update Q-values based on the episode.

        Args:
            episode: List of (state, action, reward) tuples.
        """
        G = 0
        for state, action, reward in reversed(episode):
            G = self.gamma * G + reward
            self.Returns[state][action] += G
            self.N[state][action] += 1
            # Update rule: Q is the average of all returns observed for (state, action)
            self.Q_values[state][action] = self.Returns[state][action] / self.N[state][action]
And this is the main:
if __name__ == "__main__":
    env = gym.make('Blackjack-v1', natural=False, sab=False)
    # Create an instance of the MonteCarloAgent class
    agent = MonteCarloAgentEpsilonGreedy(env, discount_factor=0.9, epsilon=0.1)

    num_episodes = 1000000
    for e in range(num_episodes):
        episode = []
        terminated = False
        truncated = False
        # Choose initial state randomly
        observation, info = env.reset()
        while not (terminated or truncated):  # Loop for each episode
            action = agent.get_action(observation)
            next_obs, reward, terminated, truncated, info = env.step(action)
            episode.append((observation, action, reward))
        agent.update_Q_values(episode)

    env.close()
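A quick way to check whether the learned policy really sticks too often is to roll out the greedy policy (no exploration) for a batch of episodes and look at the average reward. A rough sketch under that assumption; the evaluate_greedy helper is my own name, not part of the code above:

def evaluate_greedy(agent, env, episodes=10_000):
    """Play the greedy policy derived from agent.Q_values and return the mean reward."""
    total = 0.0
    for _ in range(episodes):
        obs, info = env.reset()
        terminated = truncated = False
        reward = 0.0
        while not (terminated or truncated):
            action = int(np.argmax(agent.Q_values[obs]))  # greedy, no exploration
            obs, reward, terminated, truncated, info = env.step(action)
        total += reward  # in Blackjack the only non-zero reward is the final one
    return total / episodes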
These are examples of the Q-values I get for different observations:

(15, 10, 0): array([-0.57322077, -0.57813051])
(19, 3, 0): array([ 0.39937642, -0.67754011])
(17, 10, 0): array([-0.45902484, -0.68447894])
(11, 8, 0): array([-0.47658631, -0.47728385])
(12, 10, 0): array([-0.54324405, -0.5438698 ])
(20, 10, 0): array([ 0.44418773, -0.84017038])
(11, 10, 0): array([-0.54170763, -0.54247852])
(15, 3, 0): array([-0.24095023, -0.49996364])
(18, 6, 0): array([ 0.28397257, -0.6047619 ])
(20, 4, 0): array([ 0.65904186, -0.87462687])
(13, 8, 0): array([-0.50007986, -0.50656757])
(13, 6, 0): array([-0.14338235, -0.38048843])
(17, 5, 0): array([-0.03217932, -0.57848101])
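For readability: each key is a Blackjack-v1 observation (player sum, dealer's showing card, usable ace), and the two array entries are the action values for action 0 (stick) and action 1 (hit). A small sketch, using a helper loop of my own, for printing the greedy choice per visited state:

ACTIONS = {0: 'stick', 1: 'hit'}
for (player_sum, dealer_card, usable_ace), q in sorted(agent.Q_values.items()):
    greedy = int(np.argmax(q))
    print(f"sum={player_sum:2d} dealer={dealer_card:2d} usable_ace={usable_ace} "
          f"-> {ACTIONS[greedy]}  Q={q}")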
The main had a small bug: the state was never updated to the next state. So the correct main is:
if __name__ == "__main__":
    env = gym.make('Blackjack-v1', natural=False, sab=False)
    # Create an instance of the QlearningAgent class
    agent = QlearningAgent(env, discount_factor=1, exploration_rate=0.1, learning_rate=0.1)

    num_episodes = 1000000
    for e in range(num_episodes):
        terminated = False
        truncated = False
        # Choose initial state randomly
        obs, info = env.reset()
        while not (truncated or terminated):  # Loop for each episode
            action = agent.get_action(obs)
            next_obs, reward, terminated, truncated, info = env.step(action)
            agent.update_Q_values(obs, action, reward, next_obs)
            obs = next_obs
        if e % 10000 == 0:
            print(f'Episode {e}/{num_episodes}')

    agent.plot_Q_values()
    env.close()
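The QlearningAgent class referenced here is not shown (nor is plot_Q_values). Its update presumably follows the standard one-step Q-learning rule; a minimal sketch of what it could look like, reusing the same defaultdict layout as the Monte Carlo agent (my reconstruction, not the original class):

class QlearningAgent:
    def __init__(self, env, discount_factor, exploration_rate, learning_rate):
        self.env = env
        self.gamma = discount_factor
        self.epsilon = exploration_rate
        self.alpha = learning_rate
        self.Q_values = defaultdict(lambda: np.zeros(env.action_space.n))

    def get_action(self, obs):
        # Epsilon-greedy, same as in the Monte Carlo agent
        if np.random.rand() < self.epsilon:
            return self.env.action_space.sample()
        return int(np.argmax(self.Q_values[obs]))

    def update_Q_values(self, obs, action, reward, next_obs):
        # One-step Q-learning target: r + gamma * max_a' Q(s', a').
        # (A more standard signature would also take the `terminated` flag and
        # drop the bootstrap term on terminal steps.)
        td_target = reward + self.gamma * np.max(self.Q_values[next_obs])
        self.Q_values[obs][action] += self.alpha * (td_target - self.Q_values[obs][action])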
With this change, the results are similar to those shown in the book.
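For reference, the missing state update is also what breaks the original Monte Carlo main: since observation is never advanced, every step of an episode is recorded against the initial state. A sketch of the same fix applied to the Monte Carlo episode loop (my adaptation, not part of the answer above):

while not (terminated or truncated):
    action = agent.get_action(observation)
    next_obs, reward, terminated, truncated, info = env.step(action)
    episode.append((observation, action, reward))
    observation = next_obs  # advance the state so the episode records the real trajectory
agent.update_Q_values(episode)  # Monte Carlo update from the completed episode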