import random
import gym
import numpy as np
from collections import deque
from keras.models import Sequential
from keras.layers import Dense
from keras.optimizers import Adam
import os
env = gym.make('CartPole-v0')
state_size = env.observation_space.shape[0]
action_size = env.action_space.n
batch_size = 32
n_episodes = 1000
output_dir = 'model_output/cartpole'
if not os.path.exists(output_dir):
os.makedirs(output_dir)
class DQNAgent:
def __init__(self, state_size, action_size):
self.state_size = state_size
self.action_size = action_size
self.memory = deque(maxlen=2000)
self.gamma = 0.9
self.epsilon = 1.0
self.epsilon_decay = 0.995
self.epsilon_min = 0.05
self._learning_rate = 0.01
self.model = self._build_model()
def _build_model(self):
model = Sequential()
model.add(Dense(24, input_dim = self.state_size, activation='relu'))
model.add(Dense(24,activation='relu'))
model.add(Dense(24,activation='relu'))
model.add(Dense(24,activation='relu'))
model.add(Dense(24,activation='relu'))
model.add(Dense(24,activation='relu'))
model.add(Dense(24,activation='relu'))
model.add(Dense(24,activation='relu'))
model.add(Dense(24,activation='relu'))
model.add(Dense(24,activation='relu'))
model.add(Dense(24,activation='relu'))
model.add(Dense(24,activation='relu'))
model.add(Dense(24,activation='relu'))
model.add(Dense(24,activation='relu'))
model.add(Dense(24,activation='relu'))
model.add(Dense(24,activation='relu'))
model.add(Dense(24,activation='relu'))
model.add(Dense(24,activation='relu'))
model.add(Dense(24,activation='relu'))
model.add(Dense(24,activation='relu'))
model.add(Dense(24,activation='relu'))
model.add(Dense(24,activation='relu'))
model.add(Dense(24,activation='relu'))
model.add(Dense(24,activation='relu'))
model.add(Dense(24,activation='relu'))
model.add(Dense(50,activation='relu'))
model.add(Dense(self.action_size, activation='sigmoid'))
model.compile(loss='mse', optimizer=Adam(lr=self._learning_rate))
return model
def remember(self, state, action, reward, next_state, done):
self.memory.append((self, state, action, reward, next_state, done))
def act(self, state):
if np.random.rand() <= self.epsilon:
return random.randrange(self.action_size)
act_values = self.model.predict(state)
return np.argmax(act_values[0])
def replay(self, batch_size):
minibatch = random.sample(self.memory, batch_size)
print(len(minibatch))
for state, action, reward, next_state, done in minibatch:
target = reward
if not done:
target = (reward + self.gamma*np.amax(self.model.predict(next_state)[0]))
target_f = self.model.predict(state)
target_f[0][action] = target
self.model.fit(state, target_f, epochs=1, verboss=0)
if self.epsilon > self.epsilon_min:
self.epsilon *= self.epsilon_decay
def load(self,name):
self.model.load_weights(name)
def save(self, name):
self.model.save_weights(name)
agent = DQNAgent(state_size, action_size)
done = False
for e in range(n_episodes):
state = env.reset()
state = np.reshape(state, [1, state_size])
if agent.epsilon > agent.epsilon_min:
agent.epsilon *= agent.epsilon_decay
for time in range(5000):
# env.render()
action = agent.act(state)
next_state, reward, done, _ = env.step(action)
reward = reward if not done else -10
next_state = np.reshape(next_state, [1, state_size])
agent.remember(state, action, reward, next_state, done)
state = next_state
if done:
print("episode: {}/{}, score: {}, e: {:.2}".format(e, n_episodes, time, agent.epsilon))
break
if len(agent.memory) > batch_size:
agent.replay(batch_size)
if e % 50 == 0:
agent.save(output_dir + "weights_" + '{:04d}'.format(e) + ".hdf5")
我正在为 openai 健身房中的 cartpole 环境创建算法,但我收到此错误:
回溯(最近一次调用最后一次): 文件“C:/Users/ardao/Desktop/Ardaficial Intelligence/DQNs/CartPole.py”,第 145 行,位于 代理.重播(batch_size) 文件“C:/Users/ardao/Desktop/Ardaficial Intelligence/DQNs/CartPole.py”,第 93 行,重播中 对于状态、动作、奖励、next_state,以小批量完成: ValueError:需要解压的值太多(预计为 5)
我正在关注本教程:https://www.youtube.com/watch?v=OYhFoMySoVs&t=2444s
谢谢
阿尔达
一般来说,只有当左侧的变量较少时,Python 才能分配所有解压的值,才会出现此错误。每个值都分配给一个变量。当变量的数量和值的数量相同时,解包起作用。每个值都有一个对应的变量。因此,在gym的弃用版本中,env.step()有4个解压值,即
obs, reward, done, info = env.step(action)
但是,在最新版本的gym中,step()函数返回一个被截断的附加变量。所以,你可以用以下方式替换原来的:
obs, reward, terminated, truncated, info = env.step(action)
例如,如果您使用 Blackjack-v1,那么您将得到以下输出。
(17, 7, False) 0.0 False False {}
希望,这有帮助。
您刚刚添加了一个额外的自我。这应该可以解决它。 如果你仔细想想,这个错误是非常不言自明的。
需要解压的值太多(预计有 5 个)
在该行中,您可以看到有 6。youtube 中代码的验证显示了相同的内容。但当你刚开始时,这些很容易被忽视。祝你好运,我鼓励你花点时间喘口气,下次慢慢地再看一遍。也许你可以自己解决。
self.memory.append((state, action, reward, next_state, done))
如果打印出函数 env.step(action) 那么你会看到得到 5 个值
这是冰冻湖泊环境的示例:
(1, 0.0, 假, 假, {'prob': 0.3333333333333333})
所以我们有 (new_sate,reward,done,truncated,info),只需确保添加“truncated”,它是一个布尔值,因为它指示是否在 MDP 之外满足了截断条件。
希望这有帮助,这里有一个链接可以帮助您进一步理解这些值