Reinforcement Learning DQN PyTorch error: “the number of sizes provided (1) must be greater or equal to the number of dimensions in the tensor (2)”

I am attempting to use a deep Q-network (DQN) for the OpenAI Gym CarRacing environment. The general idea is to pass the agent a stack of four greyscale observations of the environment, shape (96, 96, 4), with the DQN returning the optimal action. I’ve run into a problem with mismatched tensor shapes: judging by the error, my DQN seems to be returning 32 vectors of 5 Q-values rather than a single vector, but I’m not entirely sure what the underlying problem is in my implementation. Happy for some pointers.
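For reference, here is a minimal shape probe (dummy zero inputs, same conv/pool sizes as the Q-network code below) that illustrates what I mean by 32 vectors; the shapes in the comments are what the layer arithmetic should give on a PyTorch version that accepts unbatched (C, H, W) input to Conv2d.

import torch
import torch.nn as nn

# same conv/pool stack as in QNetworkHistory below
conv = nn.Sequential(
    nn.Conv2d(4, 8, kernel_size=3), nn.MaxPool2d(2, 2),
    nn.Conv2d(8, 16, kernel_size=3), nn.MaxPool2d(2, 2),
    nn.Conv2d(16, 32, kernel_size=3), nn.MaxPool2d(2, 2),
)
flatten = nn.Flatten()  # default start_dim=1

single = torch.zeros(4, 96, 96)      # one stacked state, no batch dimension
batch = torch.zeros(32, 4, 96, 96)   # a minibatch of stacked states

print(conv(single).shape)            # torch.Size([32, 10, 10])
print(flatten(conv(single)).shape)   # torch.Size([32, 100]) -- looks like a batch of 32
print(conv(batch).shape)             # torch.Size([32, 32, 10, 10])
print(flatten(conv(batch)).shape)    # torch.Size([32, 3200])

The [32, 100] shape from the single stacked state seems consistent with the [32, 5] tensor mentioned in the error message below.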

This is the error in question:

---------------------------------------------------------------------------
RuntimeError                              Traceback (most recent call last)
/tmp/ipykernel_12/2562658152.py in <module>
     24 
     25 #agent.load('acrobot.128x128.DQN.pt')
---> 26 agent.train(max_episodes, lambda x : min(x) >= -90, criterion_episodes)
     27 
     28 # visualise one episode

/tmp/ipykernel_12/2258452059.py in train(self, max_episodes, stop_criterion, criterion_episodes)
    155 
    156                 # update Q-network
--> 157                 self.update()
    158 
    159                 state = next_state

/tmp/ipykernel_12/2258452059.py in update(self)
    103 
    104                 # targets are TD targets
--> 105                 targets[n, :] = self.target_qnet(state).detach()
    106                 print(f"Update targets shape is {targets.shape}")
    107                 if terminated:

RuntimeError: expand(torch.FloatTensor{[32, 5]}, size=[5]): the number of sizes provided (1) must be greater or equal to the number of dimensions in the tensor (2)

This is my code for the Q-network:

class QNetworkHistory(nn.Module):  # expects a stack of 4 greyscale frames of shape (96, 96)
    def __init__(self, state, hidden_sizes, output_size, learning_rate):
        super().__init__()

        # create network layers
        layers = nn.ModuleList()

        # convolutional layers: Conv2d(in_channels, out_channels, kernel_size)
        layers.append(nn.Conv2d(4, 8, kernel_size=3))
        layers.append(nn.MaxPool2d(2, 2))
        layers.append(nn.Conv2d(8, 16, kernel_size=3))
        layers.append(nn.MaxPool2d(2, 2))
        layers.append(nn.Conv2d(16, 32, kernel_size=3))
        layers.append(nn.MaxPool2d(2, 2))

        # flatten conv output before the fully connected layers
        layers.append(nn.Flatten())

        # first fully connected layer
        layers.append(nn.Linear(100, hidden_sizes[0]))
        layers.append(nn.ReLU())
        
        # hidden layers
        for i in range(len(hidden_sizes)-1):
            layers.append(nn.Linear(hidden_sizes[i], hidden_sizes[i+1]))
            layers.append(nn.ReLU())
        
        # output layer
        layers.append(nn.Linear(hidden_sizes[-1], output_size))
        
    
        # combine layers into feed-forward network
        self.net = nn.Sequential(*layers)
        
        # select loss function and optimizer
        # note: original paper uses modified MSE loss and RMSprop
        self.criterion = nn.MSELoss()
        self.optimizer = torch.optim.Adam(self.net.parameters(), lr=learning_rate)
       

    def forward(self, x):
        # return output of Q-network for the input x
        return self.net(x)
    
    def update(self, inputs, targets):
        # update network weights for a minibatch of inputs and targets:
        self.optimizer.zero_grad()
        outputs = self.net(inputs)
        loss = self.criterion(outputs, targets)
        loss.backward()
        self.optimizer.step()
    
    def copy_from(self, qnetwork):
        # copy weights from another Q-network
        self.net.load_state_dict(qnetwork.net.state_dict())

This is the code for my DQN agent:

class AgentDQNHistory():
    def __init__(self, env, gamma,
                 hidden_sizes=(32, 32),
                 learning_rate=0.001,
                 epsilon=0.1,
                 replay_size=10000,
                 minibatch_size=32,
                 target_update=20):        
        
        
        # check if the state space has correct type
        continuous = isinstance(env.observation_space, spaces.Box) and len(env.observation_space.shape) == 3
        assert continuous, 'Observation space must be continuous with shape (n,n,n)'
        self.state_dims = (env.observation_space.shape[0]*env.observation_space.shape[1])*(4)
        
        self.W = env.observation_space.shape[0]
        self.H = env.observation_space.shape[1]
        self.C = 4
        
        # check if the action space has correct type
        assert isinstance(env.action_space, spaces.Discrete), 'Action space must be discrete'
        self.num_actions = env.action_space.n
        
        # create Q-networks for action-value function
        self.qnet = QNetworkHistory(self.state_dims, hidden_sizes, self.num_actions, learning_rate)
        self.target_qnet = QNetworkHistory(self.state_dims, hidden_sizes, self.num_actions, learning_rate)
        
        # copy weights from Q-network to target Q-network
        self.target_qnet.copy_from(self.qnet)
        
        # initialise replay buffer
        self.replay = deque(maxlen=replay_size)
        
        self.env = env
        self.gamma = gamma
        self.epsilon = epsilon
        self.minibatch_size = minibatch_size
        self.target_update = target_update
        self.target_update_idx = 0
        self.history = []
        self.steps = 0
    
    def rgb2gray(self, state):
        return np.dot(state[...,:], [0.2989, 0.5870, 0.1140])
    
    def store_history(self, state):
        if len(self.history) < 4:
            self.history.append(state)
        else:
            self.history.pop(0)
            self.history.append(state)
    
    
    def behaviour(self, state):
        # exploratory behaviour policy
        
        if len(self.history)<4:
            return self.env.action_space.sample()
        else:
            if rng.uniform() >= self.epsilon:
                # convert state to torch format
                if not torch.is_tensor(state):
                    state = torch.tensor(state, dtype=torch.float)
            
                # exploitation with probability 1-epsilon; break ties randomly
                q = self.qnet(state).detach()
                j = rng.permutation(self.num_actions)
                #print(j)
                return j[q[0][j].argmax().item()]
            else:
                # exploration with probability epsilon
                return self.env.action_space.sample()        
        
    def policy(self, state):
        # convert state to torch format
        if not torch.is_tensor(state):
            state = torch.tensor(state, dtype=torch.float)
        
        # greedy policy
        q = self.qnet(state).detach()
        return q.argmax().item()
        
    def update(self):
        # update Q-network if there is enough experience
        if len(self.replay) >= self.minibatch_size:
            # select mini-batch of experiences uniformly at random without replacement
            minibatch = rng.choice(self.replay, size=self.minibatch_size, replace=False)
            
            # calculate inputs and targets for the transitions in the mini-batch
            inputs = torch.zeros((self.minibatch_size, self.C, self.W, self.H))
            #print(inputs.shape)
            targets = torch.zeros((self.minibatch_size, self.num_actions))
            #print(targets.shape)
            
            for n, (state, action, reward, next_state, terminated) in enumerate(minibatch):
                
                # inputs are states
                inputs[n, :, :, :] = state
                
                # targets are TD targets
                targets[n, :] = self.target_qnet(state).detach()
               
                if terminated:
                    targets[n, action] = reward
                else:
                    targets[n, action] = reward + self.gamma*self.target_qnet(next_state).detach().max()
            
            # train Q-network on the mini-batch
            self.qnet.update(inputs, targets)
        
        # periodically copy weights from Q-network to target Q-network
        self.target_update_idx += 1
        if self.target_update_idx % self.target_update == 0:
            self.target_qnet.copy_from(self.qnet)
            
    def train(self, max_episodes, stop_criterion, criterion_episodes):
        # train the agent for a number of episodes
        rewards = []
        num_steps = 0
        for episode in range(max_episodes):
            
            #initial observation
            state, _ = env.reset()
            state = self.rgb2gray(state)
            
            # convert state to torch format
            state = torch.tensor(state, dtype=torch.float)
            self.store_history(state)
            
            terminated = False
            truncated = False
            rewards.append(0)
            
            while not (terminated or truncated):
                # select action by following behaviour policy
                state = self.history #calling last four states
                state = torch.stack(state) #converting list to tensor, thus combining states
                
               
                action = self.behaviour(state) #passing last four states to agent and returning behaviour

                # send the action to the environment
                next_state, reward, terminated, truncated, _ = env.step(action)

                # convert next state to torch format and add experience to replay buffer                
                next_state = self.rgb2gray(next_state)

                next_state = torch.tensor(next_state, dtype=torch.float)
                self.store_history(next_state)
                self.replay.append((state, action, reward, next_state, terminated))                
                
                # update Q-network
                self.update()
                
                state = next_state
                rewards[-1] += reward
                num_steps += 1
                
            print(f'\rEpisode {episode+1} done: steps = {num_steps}, rewards = {rewards[episode]}     ', end='')
            
            if episode >= criterion_episodes-1 and stop_criterion(rewards[-criterion_episodes:]):
                print(f'\nStopping criterion satisfied after {episode} episodes')
                break
            
        # plot rewards received during training
        plt.figure(dpi=100)
        plt.plot(range(1, len(rewards)+1), rewards, label=f'Rewards')

        plt.xlabel('Episodes')
        plt.ylabel('Rewards per episode')
        plt.legend(loc='lower right')
        plt.grid()
        plt.show()    
    
    def save(self, path):
        # save network weights to a file
        torch.save(self.qnet.state_dict(), path)
        
    def load(self, path):
        # load network weights from a file
        self.qnet.load_state_dict(torch.load(path))
        self.target_qnet.copy_from(self.qnet)

Here is the code for setting the hyperparameters and initialising the agent:



gamma = 0.99
hidden_sizes = (128, 128)
learning_rate = 0.001
epsilon = 0.10
replay_size = 10000
minibatch_size = 64
target_update = 20
max_episodes = 100
max_steps = 1000
criterion_episodes = 30


agent = AgentDQNHistory(env,
                 gamma=gamma,
                 hidden_sizes=hidden_sizes,
                 learning_rate=learning_rate,
                 epsilon=epsilon,
                 replay_size=replay_size,
                 minibatch_size=minibatch_size,
                 target_update=target_update)

#agent.load('acrobot.128x128.DQN.pt')
agent.train(max_episodes, lambda x : min(x) >= -90, criterion_episodes)

# visualise one episode
state, _ = env.reset()
state = agent.rgb2gray(state)
agent.store_history(state)

terminated = False
truncated = False
steps = 0
total_reward = 0
while not (terminated or truncated or steps > max_steps):
    # take action based on policy    
    state_stack = agent.history
    state_stack = torch.stack(state_stack)
    action = agent.policy(state_stack)
    
    # environment receives the action and returns:
    # next observation, reward, terminated, truncated, and additional information (if applicable)
    state, reward, terminated, truncated, info = env.step(action)
    state = agent.rgb2gray(state)
    agent.store_history(state)
    total_reward += reward
    steps += 1
    
print(f'Reward: {total_reward}')

# store RGB frames for the entire episode
frames = env.render()

# close the environment
env.close()

# create and play video clip using the frames and given fps
clip = mpy.ImageSequenceClip(frames, fps=15)
clip.ipython_display(rd_kwargs=dict(logger=None), maxduration = 100)

I’ve tried altering the layers in my DQN, but the problem persists. I’d be grateful for some help, as I am quite new to this.