Reinforcement learning DQN PyTorch error: “the number of sizes provided (1) must be greater or equal to the number of dimensions in the tensor (2)”
I am attempting to use a deep Q-network (DQN) for the CarRacing OpenAI Gym environment. The general idea is to pass the agent a stack of four greyscale observations of shape (96, 96, 4), with the DQN returning the optimal action. I’ve run into a problem with mismatched tensors: I think my DQN is returning 32 vectors of 5 Q-values where a single vector of Q-values is expected, but I’m not entirely sure what the problem is in my implementation. Happy for some pointers.
This is the error in question
---------------------------------------------------------------------------
RuntimeError Traceback (most recent call last)
/tmp/ipykernel_12/2562658152.py in <module>
24
25 #agent.load('acrobot.128x128.DQN.pt')
---> 26 agent.train(max_episodes, lambda x : min(x) >= -90, criterion_episodes)
27
28 # visualise one episode
/tmp/ipykernel_12/2258452059.py in train(self, max_episodes, stop_criterion, criterion_episodes)
155
156 # update Q-network
--> 157 self.update()
158
159 state = next_state
/tmp/ipykernel_12/2258452059.py in update(self)
103
104 # targets are TD targets
--> 105 targets[n, :] = self.target_qnet(state).detach()
106 print(f"Update targets shape is {targets.shape}")
107 if terminated:
RuntimeError: expand(torch.FloatTensor{[32, 5]}, size=[5]): the number of sizes provided (1) must be greater or equal to the number of dimensions in the tensor (2)
This is my code for the Q-network.
class QNetworkHistory(nn.Module): #96,96,4
def __init__(self, state, hidden_sizes, output_size, learning_rate):
super().__init__()
# create network layers
layers = nn.ModuleList()
# input layer
layers.append(nn.Conv2d(4, 8, kernel_size = 3)) #input channels, output channels, kernel_size
layers.append(nn.MaxPool2d(2, 2)) #kernel size
layers.append(nn.Conv2d(8, 16,kernel_size = 3)) #input channels, output channels, kernel_size
layers.append(nn.MaxPool2d(2,2)) #kernel size
layers.append(nn.Conv2d(16, 32,kernel_size = 3)) #input channels, output channels, kernel_size
layers.append(nn.MaxPool2d(2,2)) #kernel size
#flatten
layers.append(nn.Flatten())
#linear
layers.append(nn.Linear(100, hidden_sizes[0]))
layers.append(nn.ReLU())
# hidden layers
for i in range(len(hidden_sizes)-1):
layers.append(nn.Linear(hidden_sizes[i], hidden_sizes[i+1]))
layers.append(nn.ReLU())
# output layer
layers.append(nn.Linear(hidden_sizes[-1], output_size))
# combine layers into feed-forward network
self.net = nn.Sequential(*layers)
# select loss function and optimizer
# note: original paper uses modified MSE loss and RMSprop
self.criterion = nn.MSELoss()
self.optimizer = torch.optim.Adam(self.net.parameters(), lr=learning_rate)
def forward(self, x):
# return output of Q-network for the input x
return self.net(x)
def update(self, inputs, targets):
# update network weights for a minibatch of inputs and targets:
self.optimizer.zero_grad()
outputs = self.net(inputs)
loss = self.criterion(outputs, targets)
loss.backward()
self.optimizer.step()
def copy_from(self, qnetwork):
# copy weights from another Q-network
self.net.load_state_dict(qnetwork.net.state_dict())
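Passing a single stacked state through this network seems to reproduce the shape in the error message. The snippet below is a minimal check I ran (the output size of 5 matches the number of discrete actions, and on my PyTorch version Conv2d accepts an unbatched (C, H, W) input):
import torch

# minimal check: one stacked greyscale state of shape (4, 96, 96), no batch dimension
qnet = QNetworkHistory(96 * 96 * 4, (128, 128), 5, 0.001)
state = torch.zeros(4, 96, 96)
print(qnet(state).shape)  # torch.Size([32, 5]) on my machine, i.e. 32 vectors of 5 Q-values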
This is the code for my DQN agent.
class AgentDQNHistory():
def __init__(self, env, gamma,
hidden_sizes=(32, 32),
learning_rate=0.001,
epsilon=0.1,
replay_size=10000,
minibatch_size=32,
target_update=20):
# check if the state space has correct type
continuous = isinstance(env.observation_space, spaces.Box) and len(env.observation_space.shape) == 3
assert continuous, 'Observation space must be continuous with shape (n,n,n)'
self.state_dims = (env.observation_space.shape[0]*env.observation_space.shape[1])*(4)
self.W = env.observation_space.shape[0]
self.H = env.observation_space.shape[1]
self.C = 4
# check if the action space has correct type
assert isinstance(env.action_space, spaces.Discrete), 'Action space must be discrete'
self.num_actions = env.action_space.n
# create Q-networks for action-value function
self.qnet = QNetworkHistory(self.state_dims, hidden_sizes, self.num_actions, learning_rate)
self.target_qnet = QNetworkHistory(self.state_dims, hidden_sizes, self.num_actions, learning_rate)
# copy weights from Q-network to target Q-network
self.target_qnet.copy_from(self.qnet)
# initialise replay buffer
self.replay = deque(maxlen=replay_size)
self.env = env
self.gamma = gamma
self.epsilon = epsilon
self.minibatch_size = minibatch_size
self.target_update = target_update
self.target_update_idx = 0
self.history = []
self.steps = 0
def rgb2gray(self, state):
return np.dot(state[...,:], [0.2989, 0.5870, 0.1140])
def store_history(self, state):
if len(self.history) < 4:
self.history.append(state)
else:
self.history.pop(0)
self.history.append(state)
def behaviour(self, state):
# exploratory behaviour policy
if len(self.history)<4:
return self.env.action_space.sample()
else:
if rng.uniform() >= self.epsilon:
# convert state to torch format
if not torch.is_tensor(state):
state = torch.tensor(state, dtype=torch.float)
# exploitation with probability 1-epsilon; break ties randomly
q = self.qnet(state).detach()
j = rng.permutation(self.num_actions)
#print(j)
return j[q[0][j].argmax().item()]
else:
# exploration with probability epsilon
return self.env.action_space.sample()
def policy(self, state):
# convert state to torch format
if not torch.is_tensor(state):
state = torch.tensor(state, dtype=torch.float)
# greedy policy
q = self.qnet(state).detach()
return q.argmax().item()
def update(self):
# update Q-network if there is enough experience
if len(self.replay) >= self.minibatch_size:
# select mini-batch of experiences uniformly at random without replacement
minibatch = rng.choice(self.replay, size=self.minibatch_size, replace=False)
# calculate inputs and targets for the transitions in the mini-batch
inputs = torch.zeros((self.minibatch_size, self.C, self.W, self.H))
#print(inputs.shape)
targets = torch.zeros((self.minibatch_size, self.num_actions))
#print(targets.shape)
for n, (state, action, reward, next_state, terminated) in enumerate(minibatch):
# inputs are states
inputs[n, :, :, :] = state
# targets are TD targets
targets[n, :] = self.target_qnet(state).detach()
if terminated:
targets[n, action] = reward
else:
targets[n, action] = reward + self.gamma*self.target_qnet(next_state).detach().max()
# train Q-network on the mini-batch
self.qnet.update(inputs, targets)
# periodically copy weights from Q-network to target Q-network
self.target_update_idx += 1
if self.target_update_idx % self.target_update == 0:
self.target_qnet.copy_from(self.qnet)
def train(self, max_episodes, stop_criterion, criterion_episodes):
# train the agent for a number of episodes
rewards = []
num_steps = 0
for episode in range(max_episodes):
#initial observation
state, _ = env.reset()
state = self.rgb2gray(state)
# convert state to torch format
state = torch.tensor(state, dtype=torch.float)
self.store_history(state)
terminated = False
truncated = False
rewards.append(0)
while not (terminated or truncated):
# select action by following behaviour policy
state = self.history #calling last four states
state = torch.stack(state) #converting list to tensor, thus combining states
action = self.behaviour(state) #passing last four states to agent and returning behaviour
# send the action to the environment
next_state, reward, terminated, truncated, _ = env.step(action)
# convert next state to torch format and add experience to replay buffer
next_state = self.rgb2gray(next_state)
next_state = torch.tensor(next_state, dtype=torch.float)
self.store_history(next_state)
self.replay.append((state, action, reward, next_state, terminated))
# update Q-network
self.update()
state = next_state
rewards[-1] += reward
num_steps += 1
print(f'\rEpisode {episode+1} done: steps = {num_steps}, rewards = {rewards[episode]} ', end='')
if episode >= criterion_episodes-1 and stop_criterion(rewards[-criterion_episodes:]):
print(f'\nStopping criterion satisfied after {episode} episodes')
break
# plot rewards received during training
plt.figure(dpi=100)
plt.plot(range(1, len(rewards)+1), rewards, label=f'Rewards')
plt.xlabel('Episodes')
plt.ylabel('Rewards per episode')
plt.legend(loc='lower right')
plt.grid()
plt.show()
def save(self, path):
# save network weights to a file
torch.save(self.qnet.state_dict(), path)
def load(self, path):
# load network weights from a file
self.qnet.load_state_dict(torch.load(path))
self.target_qnet.copy_from(self.qnet)
Here is the code for initializing the environment and agent.
gamma = 0.99
hidden_sizes = (128, 128)
learning_rate = 0.001
epsilon = 0.10
replay_size = 10000
minibatch_size = 64
target_update = 20
max_episodes = 100
max_steps = 1000
criterion_episodes = 30
agent = AgentDQNHistory(env,
gamma=gamma,
hidden_sizes=hidden_sizes,
learning_rate=learning_rate,
epsilon=epsilon,
replay_size=replay_size,
minibatch_size=minibatch_size,
target_update=target_update)
#agent.load('acrobot.128x128.DQN.pt')
agent.train(max_episodes, lambda x : min(x) >= -90, criterion_episodes)
# visualise one episode
state, _ = env.reset()
state = agent.rgb2gray(state)
agent.store_history(state)
terminated = False
truncated = False
steps = 0
total_reward = 0
while not (terminated or truncated or steps > max_steps):
# take action based on policy
state_stack = agent.history
state_stack = torch.stack(state_stack)
action = agent.policy(state_stack)
# environment receives the action and returns:
# next observation, reward, terminated, truncated, and additional information (if applicable)
state, reward, terminated, truncated, info = env.step(action)
state = agent.rgb2gray(state)
agent.store_history(state)
total_reward += reward
steps += 1
print(f'Reward: {total_reward}')
# store RGB frames for the entire episode
frames = env.render()
# close the environment
env.close()
# create and play video clip using the frames and given fps
clip = mpy.ImageSequenceClip(frames, fps=15)
clip.ipython_display(rd_kwargs=dict(logger=None), maxduration = 100)
I’ve tried altering the layers in my DQN, but the problem still persists, and a quick shape check (below) hasn’t cleared things up for me either. Grateful for some help, as I am quite new to this.
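Here is that shape check. It mirrors the conv stack from QNetworkHistory and assumes 96x96 greyscale frames; the flattened feature size seems to depend on whether the input has a batch dimension, since nn.Flatten() by default treats dim 0 as the batch dimension:
import torch
import torch.nn as nn

# same conv stack as in QNetworkHistory, assuming 96x96 greyscale frames
conv = nn.Sequential(
    nn.Conv2d(4, 8, kernel_size=3), nn.MaxPool2d(2, 2),
    nn.Conv2d(8, 16, kernel_size=3), nn.MaxPool2d(2, 2),
    nn.Conv2d(16, 32, kernel_size=3), nn.MaxPool2d(2, 2),
)
flatten = nn.Flatten()  # default start_dim=1 keeps dim 0 and flattens the rest

batched = torch.zeros(1, 4, 96, 96)   # stacked state with an explicit batch dimension
unbatched = torch.zeros(4, 96, 96)    # stacked state as I currently pass it

print(conv(batched).shape)             # torch.Size([1, 32, 10, 10])
print(conv(unbatched).shape)           # torch.Size([32, 10, 10])
print(flatten(conv(batched)).shape)    # torch.Size([1, 3200])
print(flatten(conv(unbatched)).shape)  # torch.Size([32, 100])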