DQN converges for CartPole but not for LunarLander

I'm new to reinforcement learning and was going off the 2015 paper to implement a DQN. I got it to converge on CartPole, but it won't converge on LunarLander. I'm not sure whether it's a hyperparameter issue, an architecture issue, or whether I've coded something incorrectly. Any help or advice is appreciated.

import math
import random
from collections import deque

import gym
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F

env = gym.make("LunarLander-v2")  # assumed; the env setup wasn't shown in the post

class Model(nn.Module):
    # Q-network: 8-dim LunarLander observation in, one Q-value per discrete action (4) out
    def __init__(self, in_features=8, h1=64, h2=128, h3=64, out_features=4) -> None:
        super().__init__()
        self.fc1 = nn.Linear(in_features, h1)
        self.fc2 = nn.Linear(h1, h2)
        self.fc3 = nn.Linear(h2, h3)
        self.out = nn.Linear(h3, out_features)

    def forward(self, x):
        x = F.relu(self.fc1(x))
        x = F.dropout(x, 0.2)
        x = F.relu(self.fc2(x))
        x = F.dropout(x, 0.2)
        x = F.relu(self.fc3(x))
        x = self.out(x)
        return x

policy_network = Model()

def epsilon_decay(epsilon, t, min_exploration_prob, total_episodes):
    # linear decay, clipped at the minimum exploration probability
    epsilon = max(epsilon - t / total_episodes, min_exploration_prob)
    return epsilon

# hyperparameters
learning_rate = 0.01
discount_factor = 0.8
exploration_prob = 1.0
min_exploration_prob = 0.1
decay = 0.999
epochs = 5000
replay_buffer_batch_size = 128
min_replay_buffer_size = 5000
replay_buffer = deque(maxlen=min_replay_buffer_size)

# target network, initialised from the policy network
target_network = Model()
target_network.load_state_dict(policy_network.state_dict())

optimizer = torch.optim.Adam(policy_network.parameters(), learning_rate)
loss_function = nn.MSELoss()

rewards = []
losses = []
loss = -100

for i in range(epochs):
    exploration_prob = epsilon_decay(exploration_prob, i, min_exploration_prob, epochs)
    terminal = False

    # sync the target network every 30 episodes
    if i % 30 == 0:
        target_network.load_state_dict(policy_network.state_dict())

    current_state = env.reset()
    rewardsum = 0
    p = False

    while not terminal:
        # env.render()

        # epsilon-greedy action selection
        if np.random.rand() < exploration_prob:
            action = env.action_space.sample()
        else:
            state_tensor = torch.tensor(np.array([current_state]), dtype=torch.float32)
            with torch.no_grad():
                q_values = policy_network(state_tensor)
            action = torch.argmax(q_values).item()

        next_state, reward, terminal, info = env.step(action)
        rewardsum += reward
        replay_buffer.append((current_state, action, terminal, reward, next_state))

        # train once the replay buffer has filled up
        if len(replay_buffer) >= min_replay_buffer_size:
            minibatch = random.sample(replay_buffer, replay_buffer_batch_size)
            batch_states = torch.tensor([transition[0] for transition in minibatch], dtype=torch.float32)
            batch_actions = torch.tensor([transition[1] for transition in minibatch], dtype=torch.int64)
            batch_terminal = torch.tensor([transition[2] for transition in minibatch], dtype=torch.bool)
            batch_rewards = torch.tensor([transition[3] for transition in minibatch], dtype=torch.float32)
            batch_next_states = torch.tensor([transition[4] for transition in minibatch], dtype=torch.float32)

            # TD target: r + gamma * max_a' Q_target(s', a'), zeroed for terminal transitions
            with torch.no_grad():
                q_values_next = target_network(batch_next_states).detach()
            max_q_values_next = q_values_next.max(1)[0]
            y = batch_rewards + (discount_factor * max_q_values_next * (~batch_terminal))

            # Q(s, a) for the actions actually taken
            q_values = policy_network(batch_states).gather(1, batch_actions.unsqueeze(-1)).squeeze(-1)

            loss = loss_function(y, q_values)
            losses.append(loss)

            optimizer.zero_grad()
            loss.backward()
            torch.nn.utils.clip_grad_norm_(policy_network.parameters(), 10)
            optimizer.step()

            if i % 100 == 0 and not p:
                print(loss)
                p = True

        current_state = next_state

    rewards.append(rewardsum)

torch.save(policy_network, 'lunar_game.pth')
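In case it helps, this is roughly the greedy rollout I use afterwards to sanity-check the saved network; it assumes the same LunarLander-v2 environment and the old four-value gym step API as the training loop above, and the file path is just the one I saved to.

import gym
import numpy as np
import torch

env = gym.make("LunarLander-v2")
policy_network = torch.load("lunar_game.pth")  # full model was pickled with torch.save above
policy_network.eval()  # switch the module to eval mode

for episode in range(5):
    state = env.reset()
    done = False
    total_reward = 0.0
    while not done:
        # always take the greedy action, no exploration
        state_tensor = torch.tensor(np.array([state]), dtype=torch.float32)
        with torch.no_grad():
            q_values = policy_network(state_tensor)
        action = torch.argmax(q_values).item()
        state, reward, done, info = env.step(action)
        total_reward += reward
    print(f"episode {episode}: return {total_reward:.1f}")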

submitted by /u/BigSmoke42169
