From 443217a3f68a58380ec75622574fd2ab2d33945e Mon Sep 17 00:00:00 2001
From: romue
Date: Thu, 17 Jun 2021 16:15:08 +0200
Subject: [PATCH] added own dqn

---
 algorithms/_base.py | 140 ++++++++++++++++++++++++++++----------------
 1 file changed, 89 insertions(+), 51 deletions(-)

diff --git a/algorithms/_base.py b/algorithms/_base.py
index 15637c8..767087b 100644
--- a/algorithms/_base.py
+++ b/algorithms/_base.py
@@ -1,4 +1,4 @@
-from typing import Tuple, NamedTuple
+from typing import NamedTuple, Union
 from collections import namedtuple, deque
 import numpy as np
 import random
@@ -6,16 +6,17 @@ import torch
 import torch.nn as nn
 import torch.nn.functional as F
 from stable_baselines3.common.utils import polyak_update
+from stable_baselines3.common.buffers import ReplayBuffer
+import copy


 class Experience(NamedTuple):
-    observation: np.ndarray
+    observation:      np.ndarray
     next_observation: np.ndarray
-    action: int
-    reward: float
-    done : bool
-    priority: float = 1
-    info : dict = {}
+    action:           np.ndarray
+    reward:           Union[float, np.ndarray]
+    done :            Union[bool, np.ndarray]
+    priority:         np.ndarray = 1


 class BaseBuffer:
@@ -31,7 +32,12 @@ class BaseBuffer:

     def sample(self, k):
         sample = random.choices(self.experience, k=k)
-        return sample
+        observations = torch.stack([torch.from_numpy(e.observation) for e in sample], 0).float()
+        next_observations = torch.stack([torch.from_numpy(e.next_observation) for e in sample], 0).float()
+        actions = torch.tensor([e.action for e in sample]).long()
+        rewards = torch.tensor([e.reward for e in sample]).float().view(-1, 1)
+        dones = torch.tensor([e.done for e in sample]).float().view(-1, 1)
+        return Experience(observations, next_observations, actions, rewards, dones)


 class PERBuffer(BaseBuffer):
@@ -50,16 +56,16 @@ class BaseDQN(nn.Module):
     def __init__(self):
         super(BaseDQN, self).__init__()
         self.net = nn.Sequential(
-            nn.Linear(3*5*5, 64),
+            nn.Linear(3*5*5, 128),
             nn.ReLU(),
-            nn.Linear(64, 64),
+            nn.Linear(128, 128),
             nn.ReLU(),
-            nn.Linear(64, 9)
+            nn.Linear(128, 9)
         )

-    def act(self, x):
+    def act(self, x) -> np.ndarray:
         with torch.no_grad():
-            action = self.net(x.view(x.shape[0], -1)).argmax(-1).item()
+            action = self.net(x.view(x.shape[0], -1)).max(-1)[1].numpy()
         return action

     def forward(self, x):
@@ -70,17 +76,17 @@ class BaseDQN(nn.Module):


 class BaseQlearner:
-    def __init__(self, q_net, target_q_net, env, buffer, target_update, warmup, eps_end,
+    def __init__(self, q_net, target_q_net, env, buffer, target_update, eps_end,
                  n_agents=1, gamma=0.99, train_every_n_steps=4, n_grad_steps=1,
                  exploration_fraction=0.2, batch_size=64, lr=1e-4, reg_weight=0.0):
         self.q_net = q_net
         self.target_q_net = target_q_net
-        self.target_q_net.load_state_dict(self.q_net.state_dict())
+        #self.q_net.apply(self.weights_init)
+        polyak_update(self.q_net.parameters(), self.target_q_net.parameters(), 1)
         self.target_q_net.eval()
         self.env = env
         self.buffer = buffer
         self.target_update = target_update
-        self.warmup = warmup
         self.eps = 1.
         self.eps_end = eps_end
         self.exploration_fraction = exploration_fraction
@@ -90,73 +96,97 @@ class BaseQlearner:
         self.n_grad_steps = n_grad_steps
         self.lr = lr
         self.reg_weight = reg_weight
-        self.optimizer = torch.optim.Adam(self.q_net.parameters(), lr=self.lr)
+        self.n_agents = n_agents
         self.device = 'cpu'
-        self.running_reward = deque(maxlen=10)
-        self.running_loss = deque(maxlen=10)
+        self.optimizer = torch.optim.Adam(self.q_net.parameters(), lr=self.lr)
+        self.running_reward = deque(maxlen=30)
+        self.running_loss = deque(maxlen=30)

     def to(self, device):
         self.device = device
         return self

+    @staticmethod
+    def weights_init(module, activation='relu'):
+        if isinstance(module, (nn.Linear, nn.Conv2d)):
+            nn.init.xavier_normal_(module.weight, gain=torch.nn.init.calculate_gain(activation))
+            if module.bias is not None:
+                module.bias.data.fill_(0.0)
+
     def anneal_eps(self, step, n_steps):
         fraction = min(float(step) / int(self.exploration_fraction*n_steps), 1.0)
-        eps = 1 + fraction * (self.eps_end - 1)
-        return eps
+        self.eps = 1 + fraction * (self.eps_end - 1)
+
+    def get_action(self, obs) -> Union[int, np.ndarray]:
+        o = torch.from_numpy(obs).unsqueeze(0) if self.n_agents <= 1 else torch.from_numpy(obs)
+        if np.random.rand() > self.eps:
+            action = self.q_net.act(o.float())
+        else:
+            action = np.array([self.env.action_space.sample() for _ in range(self.n_agents)])
+        return action

     def learn(self, n_steps):
-        step, eps = 0, 1
+        step = 0
         while step < n_steps:
             obs, done = self.env.reset(), False
             total_reward = 0
             while not done:

-                action = self.q_net.act(torch.from_numpy(obs).unsqueeze(0).float()) \
-                    if np.random.rand() > eps else env.action_space.sample()
+                action = self.get_action(obs)

-                next_obs, reward, done, info = env.step(action)
+                next_obs, reward, done, info = self.env.step(action if not len(action) == 1 else action[0])

-                experience = Experience(observation=obs, next_observation=next_obs, action=action, reward=reward, done=done)  # do we really need to copy?
+                experience = Experience(observation=obs.copy(), next_observation=next_obs.copy(),
+                                        action=action, reward=reward, done=done)  # do we really need to copy?
                 self.buffer.add(experience)

                 # end of step routine
                 obs = next_obs
                 step += 1
                 total_reward += reward

-                eps = self.anneal_eps(step, n_steps)
+                self.anneal_eps(step, n_steps)

                 if step % self.train_every_n_steps == 0:
                     self.train()

                 if step % self.target_update == 0:
-                    self.target_q_net.load_state_dict(self.q_net.state_dict())
+                    print('UPDATE')
+                    polyak_update(self.q_net.parameters(), self.target_q_net.parameters(), 1.0)

             self.running_reward.append(total_reward)
             if step % 10 == 0:
-                print(f'Step: {step} ({(step/n_steps)*100:.2f}%)\tRunning reward: {sum(list(self.running_reward))/len(self.running_reward)}\t'
-                      f' eps: {eps:.4f}\tRunning loss: {sum(list(self.running_loss))/len(self.running_loss)}')
+                print(f'Step: {step} ({(step/n_steps)*100:.2f}%)\tRunning reward: {sum(list(self.running_reward))/len(self.running_reward):.2f}\t'
+                      f' eps: {self.eps:.4f}\tRunning loss: {sum(list(self.running_loss))/len(self.running_loss):.4f}')

+    def _training_routine(self, obs, next_obs, action):
+        current_q_values = self.q_net(obs)
+        current_q_values = torch.gather(current_q_values, dim=1, index=action)
+        next_q_values_raw = self.target_q_net(next_obs).max(dim=-1)[0].reshape(-1, 1).detach()
+        return current_q_values, next_q_values_raw

     def train(self):
+        if len(self.buffer) < self.batch_size: return
         for _ in range(self.n_grad_steps):
+            experience = self.buffer.sample(self.batch_size)
+            #print(experience.observation.shape, experience.next_observation.shape, experience.action.shape, experience.reward.shape, experience.done.shape)
+            if self.n_agents <= 1:
+                pred_q, target_q_raw = self._training_routine(experience.observation, experience.next_observation, experience.action)
+            else:
+                pred_q, target_q_raw = torch.zeros((self.batch_size, 1)), torch.zeros((self.batch_size, 1))
+                for agent_i in range(self.n_agents):
+                    q_values, next_q_values_raw = self._training_routine(experience.observation[:, agent_i],
+                                                                         experience.next_observation[:, agent_i],
+                                                                         experience.action[:, agent_i].unsqueeze(-1)
+                                                                         )
+                    pred_q += q_values
+                    target_q_raw += next_q_values_raw

-            obs = torch.stack([torch.from_numpy(e.observation) for e in experience], 0).float()
-            next_obs = torch.stack([torch.from_numpy(e.next_observation) for e in experience], 0).float()
-            actions = torch.tensor([e.action for e in experience]).long()
-            rewards = torch.tensor([e.reward for e in experience]).float()
-            dones = torch.tensor([e.done for e in experience]).float()
-
-            next_q_values = self.target_q_net(next_obs).detach().max(-1)[0]
-            target_q_values = rewards + (1. - dones) * self.gamma * next_q_values
-
-
-            q_values = self.q_net(obs).gather(-1, actions.unsqueeze(0))
-
-            delta = q_values - target_q_values
-            loss = torch.mean(self.reg_weight * q_values + torch.pow(delta, 2))
+            target_q = experience.reward + (1 - experience.done) * self.gamma * target_q_raw
+            loss = torch.mean(self.reg_weight * pred_q + torch.pow(pred_q - target_q, 2))
+            print(target_q)

+            # log loss
             self.running_loss.append(loss.item())
-
             # Optimize the model
             self.optimizer.zero_grad()
             loss.backward()
@@ -164,25 +194,33 @@ class BaseQlearner:
             self.optimizer.step()


+
 if __name__ == '__main__':
     from environments.factory.simple_factory import SimpleFactory, DirtProperties, MovementProperties
     from algorithms.reg_dqn import RegDQN
+    from stable_baselines3.common.vec_env import DummyVecEnv
+
+
+    N_AGENTS = 1
+
     dirt_props = DirtProperties(clean_amount=3, gain_amount=0.2, max_global_amount=30,
                                 max_local_amount=5, spawn_frequency=1, max_spawn_ratio=0.05)
     move_props = MovementProperties(allow_diagonal_movement=True,
                                     allow_square_movement=True,
                                     allow_no_op=False)

-    env = SimpleFactory(dirt_properties=dirt_props, movement_properties=move_props, n_agents=1, pomdp_radius=2, max_steps=400, omit_agent_slice_in_obs=False)
-    #print(env.action_space)
+    env = SimpleFactory(dirt_properties=dirt_props, movement_properties=move_props, n_agents=N_AGENTS, pomdp_radius=2, max_steps=400, omit_agent_slice_in_obs=False)
+    env = DummyVecEnv([lambda: env])
+    print(env)
     from stable_baselines3.dqn import DQN
-    #dqn = RegDQN('MlpPolicy', env, verbose=True, buffer_size = 50000, learning_starts = 25000, batch_size = 64, target_update_interval = 5000, exploration_fraction = 0.25, exploration_final_eps = 0.025)
-    #print(dqn.policy)
+    #dqn = RegDQN('MlpPolicy', env, verbose=True, buffer_size = 50000, learning_starts = 64, batch_size = 64,
+    #             target_update_interval = 5000, exploration_fraction = 0.25, exploration_final_eps = 0.025,
+    #             train_freq=4, gradient_steps=1, reg_weight=0.05)
     #dqn.learn(100000)
     print(env.observation_space, env.action_space)
     dqn, target_dqn = BaseDQN(), BaseDQN()
-    learner = BaseQlearner(dqn, target_dqn, env, BaseBuffer(50000), target_update=5000, warmup=25000, lr=1e-4, gamma=0.99,
-                           train_every_n_steps=4, eps_end=0.05, reg_weight=0.1, exploration_fraction=0.25, batch_size=64)
+    learner = BaseQlearner(dqn, target_dqn, env, BaseBuffer(50000), target_update=10000, lr=0.0001, gamma=0.99, n_agents=N_AGENTS,
+                           train_every_n_steps=4, eps_end=0.05, n_grad_steps=1, reg_weight=0.05, exploration_fraction=0.25, batch_size=64)
     learner.learn(100000)
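For context, the new train() step optimizes a regularized TD objective: the squared one-step TD error plus a penalty proportional to the predicted Q-value, scaled by reg_weight (the same reg_weight also shows up in the commented-out RegDQN call above). The snippet below is a minimal, self-contained sketch of that loss outside the patch; the function name regularized_td_loss and the toy tensors are illustrative only and not part of the commit.

import torch

def regularized_td_loss(pred_q, reward, done, next_q, gamma=0.99, reg_weight=0.05):
    # Bootstrapped target: r + (1 - done) * gamma * max_a' Q_target(s', a'),
    # mirroring `target_q = experience.reward + (1 - experience.done) * self.gamma * target_q_raw`.
    target_q = reward + (1.0 - done) * gamma * next_q
    # Squared TD error plus a Q-value penalty, as in
    # `torch.mean(self.reg_weight * pred_q + torch.pow(pred_q - target_q, 2))`.
    return torch.mean(reg_weight * pred_q + (pred_q - target_q) ** 2)

# Toy batch of 4 transitions; tensors follow the buffer's (-1, 1) shape convention.
pred_q = torch.tensor([[1.0], [0.5], [2.0], [0.1]])
reward = torch.tensor([[0.0], [1.0], [0.0], [1.0]])
done   = torch.tensor([[0.0], [0.0], [1.0], [0.0]])
next_q = torch.tensor([[0.8], [0.3], [0.9], [0.2]])
print(regularized_td_loss(pred_q, reward, done, next_q))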