I am trying to build a DQN model as part of my PhD work, and before implementing it with real data I want to prototype it with dummy data.
Using the same setup with plain Q-learning the approach works, but when I converted it to a DQN to make it more advanced and adaptable, I started running into problems in the training phase. I also added GPU acceleration, but it did not help at all. I wonder whether this is due to the size of the dummy dataset or something else I cannot figure out.
Any help or guidance is appreciated.
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import random
import pandas as pd
from collections import deque
# Set device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Running on device:", device)
# Dummy data setup
data = {
    'message_size': np.random.randint(1000, 70000, size=1000),
    'cpu_usage': np.random.uniform(40, 100, size=1000),
    'submission_time': np.random.uniform(0, 300, size=1000)
}
dummy_data = pd.DataFrame(data)
# Parameters
MAX_BLOCK_SIZE = 32768
ALPHA = 0.1
GAMMA = 0.9
EPSILON = 1.0
EPSILON_MIN = 0.01
EPSILON_DECAY = 0.99
BATCH_SIZE = 32
EPISODES = 1000
# DQN model
class DQN(nn.Module):
    def __init__(self, input_dim):
        super(DQN, self).__init__()
        self.fc1 = nn.Linear(input_dim, 64)
        self.fc2 = nn.Linear(64, 64)
        self.fc3 = nn.Linear(64, 1)

    def forward(self, x):
        x = torch.relu(self.fc1(x))
        x = torch.relu(self.fc2(x))
        return self.fc3(x)
# Initialize models and optimizer
dqn = DQN(input_dim=2).to(device)
target_model = DQN(input_dim=2).to(device)
target_model.load_state_dict(dqn.state_dict())
optimizer = optim.Adam(dqn.parameters(), lr=ALPHA)
memory = deque(maxlen=2000)
# Block choice function
def block_choice(state):
    if random.random() < EPSILON:
        return random.randint(1, int(state[0] // MAX_BLOCK_SIZE) + 1)
    else:
        state_tensor = torch.FloatTensor(state).to(device)
        return torch.argmax(dqn(state_tensor)).item() + 1
# Reward function based on utility
def utility_function_rewarding(total_latency, cpu_per_block, max_latency=300, max_cpu=100):
    latency_reward = max(0, 1 - (total_latency / max_latency))
    cpu_reward = max(0, 1 - (cpu_per_block / max_cpu))
    return latency_reward + cpu_reward
# Training function
def dqn_training(batch_size):
    if len(memory) < batch_size:
        return
    batch = random.sample(memory, batch_size)
    states, actions, rewards, next_states, dones = zip(*batch)
    # Move data to device
    states = torch.FloatTensor(states).to(device)
    rewards = torch.FloatTensor(rewards).to(device)
    next_states = torch.FloatTensor(next_states).to(device)
    dones = torch.FloatTensor(dones).to(device)
    state_action_values = dqn(states)
    next_state_values = target_model(next_states).max(1)[0]
    expected_values = rewards + (GAMMA * next_state_values * (1 - dones))
    loss = nn.functional.mse_loss(state_action_values, expected_values)
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
# Store transitions in memory
def store_transition(state, action, reward, next_state, done):
    memory.append((state, action, reward, next_state, done))
# Main training loop
for episode in range(EPISODES):
    print(f"Starting Episode {episode + 1}/{EPISODES}")
    row = dummy_data.sample().iloc[0]
    state = [row['submission_time'], row['cpu_usage']]
    total_reward = 0
    done = False
    while not done:
        action = block_choice(state)
        next_row = dummy_data.sample().iloc[0]
        next_latency = next_row['submission_time']
        next_cpu = next_row['cpu_usage'] / action
        next_state = [next_latency, next_cpu]
        reward = utility_function_rewarding(next_latency, next_cpu)
        total_reward += reward
        done = episode == EPISODES - 1
        store_transition(state, action, reward, next_state, done)
        state = next_state
        dqn_training(BATCH_SIZE)
    # Update epsilon for exploration-exploitation balance
    if EPSILON > EPSILON_MIN:
        EPSILON *= EPSILON_DECAY
    print(f"Episode {episode + 1}/{EPISODES} - Total Reward: {total_reward}")
First, this code has a tensor shape mismatch:
UserWarning: Using a target size (torch.Size([32])) that is different to the input size (torch.Size([32, 1])). This will likely lead to incorrect results due to broadcasting. Please ensure they have the same size.
loss = nn.functional.mse_loss(state_action_values, expected_values)
Either flatten state_action_values, converting it from [32, 1] to [32], or unsqueeze expected_values, converting it from [32] to [32, 1], so that both arguments passed to mse_loss have the same shape.
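A minimal sketch of the first option (squeezing the network output), assuming the rest of dqn_training stays exactly as posted; the unsqueeze alternative is shown as a comment:

# Inside dqn_training, after the batch tensors have been built:
state_action_values = dqn(states).squeeze(1)               # [32, 1] -> [32]
next_state_values = target_model(next_states).max(1)[0]    # [32]
expected_values = rewards + (GAMMA * next_state_values * (1 - dones))
loss = nn.functional.mse_loss(state_action_values, expected_values)  # shapes now match

# Alternatively, keep the [32, 1] output and expand the target instead:
# loss = nn.functional.mse_loss(state_action_values, expected_values.unsqueeze(1))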
Second, the episodes never terminate. In the main training loop the done flag is set from the condition episode == EPISODES - 1, which means that for every episode except the last one, done stays False and the inner while not done loop runs indefinitely.
Solution: cap the number of steps per episode with something like MAX_STEPS_PER_EPISODE, as in the sketch below.
MAX_STEPS_PER_EPISODE = 100 # You can adjust this value as needed
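One way to wire it into the main loop (a sketch, assuming everything else stays as posted, not a definitive fix): replace the open-ended while not done loop with a bounded step counter and mark the transition as terminal once the cap is reached.

for episode in range(EPISODES):
    row = dummy_data.sample().iloc[0]
    state = [row['submission_time'], row['cpu_usage']]
    total_reward = 0
    for step in range(MAX_STEPS_PER_EPISODE):
        action = block_choice(state)
        next_row = dummy_data.sample().iloc[0]
        next_state = [next_row['submission_time'], next_row['cpu_usage'] / action]
        reward = utility_function_rewarding(next_state[0], next_state[1])
        total_reward += reward
        done = step == MAX_STEPS_PER_EPISODE - 1  # episode ends when the step cap is hit
        store_transition(state, action, reward, next_state, done)
        state = next_state
        dqn_training(BATCH_SIZE)
    # Decay epsilon once per episode, as in the original loop
    if EPSILON > EPSILON_MIN:
        EPSILON *= EPSILON_DECAY
    print(f"Episode {episode + 1}/{EPISODES} - Total Reward: {total_reward}")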