added own dqn

commit 443217a3f6
parent 26d7705e19
Author: romue
Date:   2021-06-17 16:15:08 +02:00


@@ -1,4 +1,4 @@
-from typing import Tuple, NamedTuple
+from typing import NamedTuple, Union
 from collections import namedtuple, deque
 import numpy as np
 import random
@@ -6,16 +6,17 @@ import torch
 import torch.nn as nn
 import torch.nn.functional as F
 from stable_baselines3.common.utils import polyak_update
+from stable_baselines3.common.buffers import ReplayBuffer
+import copy


 class Experience(NamedTuple):
     observation: np.ndarray
     next_observation: np.ndarray
-    action: int
-    reward: float
-    done : bool
-    priority: float = 1
+    action: np.ndarray
+    reward: Union[float, np.ndarray]
+    done : Union[bool, np.ndarray]
+    priority: np.ndarray = 1
+    info : dict = {}


 class BaseBuffer:
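Note on the Experience change above: the looser field types let the same NamedTuple carry either a single transition or a stacked batch, which the reworked BaseBuffer.sample() in the next hunk relies on. A minimal sketch (not part of the commit):

```python
import numpy as np

# A single transition, exactly as learn() stores it into the buffer.
single = Experience(observation=np.zeros((3, 5, 5), dtype=np.float32),
                    next_observation=np.zeros((3, 5, 5), dtype=np.float32),
                    action=np.array([4]), reward=0.0, done=False)
```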
@@ -31,7 +32,12 @@ class BaseBuffer:
     def sample(self, k):
         sample = random.choices(self.experience, k=k)
-        return sample
+        observations = torch.stack([torch.from_numpy(e.observation) for e in sample], 0).float()
+        next_observations = torch.stack([torch.from_numpy(e.next_observation) for e in sample], 0).float()
+        actions = torch.tensor([e.action for e in sample]).long()
+        rewards = torch.tensor([e.reward for e in sample]).float().view(-1, 1)
+        dones = torch.tensor([e.done for e in sample]).float().view(-1, 1)
+        return Experience(observations, next_observations, actions, rewards, dones)


 class PERBuffer(BaseBuffer):
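For reference, a rough usage sketch of the new sample(): it now returns a ready-to-train Experience of stacked tensors instead of a raw list of transitions. The BaseBuffer constructor and add() are assumed to behave as they are used elsewhere in this file:

```python
import numpy as np

buffer = BaseBuffer(100)
for _ in range(16):
    buffer.add(Experience(observation=np.zeros((3, 5, 5), dtype=np.float32),
                          next_observation=np.zeros((3, 5, 5), dtype=np.float32),
                          action=1, reward=0.0, done=False))

batch = buffer.sample(k=8)
# torch.Size([8, 3, 5, 5]) torch.Size([8]) torch.Size([8, 1]) torch.Size([8, 1])
print(batch.observation.shape, batch.action.shape, batch.reward.shape, batch.done.shape)
```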
@@ -50,16 +56,16 @@ class BaseDQN(nn.Module):
     def __init__(self):
         super(BaseDQN, self).__init__()
         self.net = nn.Sequential(
-            nn.Linear(3*5*5, 64),
+            nn.Linear(3*5*5, 128),
             nn.ReLU(),
-            nn.Linear(64, 64),
+            nn.Linear(128, 128),
             nn.ReLU(),
-            nn.Linear(64, 9)
+            nn.Linear(128, 9)
         )

-    def act(self, x):
+    def act(self, x) -> np.ndarray:
         with torch.no_grad():
-            action = self.net(x.view(x.shape[0], -1)).argmax(-1).item()
+            action = self.net(x.view(x.shape[0], -1)).max(-1)[1].numpy()
         return action

     def forward(self, x):
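Quick check of the updated act(): for a batch of observations it now returns the greedy actions as a numpy integer array rather than a single Python int, which is what the multi-agent get_action() further down expects. Sketch only:

```python
import torch

net = BaseDQN()
obs = torch.zeros(2, 3, 5, 5)  # e.g. one 3x5x5 observation per agent
actions = net.act(obs)         # np.ndarray of shape (2,), one greedy action per row
print(actions)
```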
@@ -70,17 +76,17 @@ class BaseDQN(nn.Module):
 class BaseQlearner:
-    def __init__(self, q_net, target_q_net, env, buffer, target_update, warmup, eps_end,
+    def __init__(self, q_net, target_q_net, env, buffer, target_update, eps_end, n_agents=1,
                  gamma=0.99, train_every_n_steps=4, n_grad_steps=1,
                  exploration_fraction=0.2, batch_size=64, lr=1e-4, reg_weight=0.0):
         self.q_net = q_net
         self.target_q_net = target_q_net
-        self.target_q_net.load_state_dict(self.q_net.state_dict())
+        #self.q_net.apply(self.weights_init)
+        polyak_update(self.q_net.parameters(), self.target_q_net.parameters(), 1)
         self.target_q_net.eval()
         self.env = env
         self.buffer = buffer
         self.target_update = target_update
-        self.warmup = warmup
         self.eps = 1.
         self.eps_end = eps_end
         self.exploration_fraction = exploration_fraction
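The load_state_dict() call is replaced here by stable-baselines3's polyak_update(); with tau=1 this is still a hard copy of the online weights into the target network, while a small tau would give the usual soft update theta_target <- tau*theta + (1-tau)*theta_target. Illustrative sketch (the Linear layers are stand-ins, not from the commit):

```python
import torch.nn as nn
from stable_baselines3.common.utils import polyak_update

q_net, target_q_net = nn.Linear(4, 2), nn.Linear(4, 2)                   # stand-ins for BaseDQN
polyak_update(q_net.parameters(), target_q_net.parameters(), tau=1.0)    # hard sync, as in the commit
polyak_update(q_net.parameters(), target_q_net.parameters(), tau=0.005)  # soft-update alternative
```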
@@ -90,73 +96,97 @@ class BaseQlearner:
         self.n_grad_steps = n_grad_steps
         self.lr = lr
         self.reg_weight = reg_weight
-        self.optimizer = torch.optim.Adam(self.q_net.parameters(), lr=self.lr)
+        self.n_agents = n_agents
         self.device = 'cpu'
-        self.running_reward = deque(maxlen=10)
-        self.running_loss = deque(maxlen=10)
+        self.optimizer = torch.optim.Adam(self.q_net.parameters(), lr=self.lr)
+        self.running_reward = deque(maxlen=30)
+        self.running_loss = deque(maxlen=30)

     def to(self, device):
         self.device = device
         return self

+    @staticmethod
+    def weights_init(module, activation='relu'):
+        if isinstance(module, (nn.Linear, nn.Conv2d)):
+            nn.init.xavier_normal_(module.weight, gain=torch.nn.init.calculate_gain(activation))
+            if module.bias is not None:
+                module.bias.data.fill_(0.0)
+
     def anneal_eps(self, step, n_steps):
         fraction = min(float(step) / int(self.exploration_fraction*n_steps), 1.0)
-        eps = 1 + fraction * (self.eps_end - 1)
-        return eps
+        self.eps = 1 + fraction * (self.eps_end - 1)

+    def get_action(self, obs) -> Union[int, np.ndarray]:
+        o = torch.from_numpy(obs).unsqueeze(0) if self.n_agents <= 1 else torch.from_numpy(obs)
+        if np.random.rand() > self.eps:
+            action = self.q_net.act(o.float())
+        else:
+            action = np.array([self.env.action_space.sample() for _ in range(self.n_agents)])
+        return action
     def learn(self, n_steps):
-        step, eps = 0, 1
+        step = 0
         while step < n_steps:
             obs, done = self.env.reset(), False
             total_reward = 0
             while not done:
-                action = self.q_net.act(torch.from_numpy(obs).unsqueeze(0).float()) \
-                    if np.random.rand() > eps else env.action_space.sample()
-                next_obs, reward, done, info = env.step(action)
-                experience = Experience(observation=obs, next_observation=next_obs, action=action, reward=reward, done=done)  # do we really need to copy?
+                action = self.get_action(obs)
+                next_obs, reward, done, info = self.env.step(action if not len(action) == 1 else action[0])
+                experience = Experience(observation=obs.copy(), next_observation=next_obs.copy(),
+                                        action=action, reward=reward, done=done)  # do we really need to copy?
                 self.buffer.add(experience)
                 # end of step routine
                 obs = next_obs
                 step += 1
                 total_reward += reward
-                eps = self.anneal_eps(step, n_steps)
+                self.anneal_eps(step, n_steps)
                 if step % self.train_every_n_steps == 0:
                     self.train()
                 if step % self.target_update == 0:
-                    self.target_q_net.load_state_dict(self.q_net.state_dict())
+                    print('UPDATE')
+                    polyak_update(self.q_net.parameters(), self.target_q_net.parameters(), 1.0)
             self.running_reward.append(total_reward)
             if step % 10 == 0:
-                print(f'Step: {step} ({(step/n_steps)*100:.2f}%)\tRunning reward: {sum(list(self.running_reward))/len(self.running_reward)}\t'
-                      f' eps: {eps:.4f}\tRunning loss: {sum(list(self.running_loss))/len(self.running_loss)}')
+                print(f'Step: {step} ({(step/n_steps)*100:.2f}%)\tRunning reward: {sum(list(self.running_reward))/len(self.running_reward):.2f}\t'
+                      f' eps: {self.eps:.4f}\tRunning loss: {sum(list(self.running_loss))/len(self.running_loss):.4f}')
+    def _training_routine(self, obs, next_obs, action):
+        current_q_values = self.q_net(obs)
+        current_q_values = torch.gather(current_q_values, dim=1, index=action)
+        next_q_values_raw = self.target_q_net(next_obs).max(dim=-1)[0].reshape(-1, 1).detach()
+        return current_q_values, next_q_values_raw
+
     def train(self):
+        if len(self.buffer) < self.batch_size: return
         for _ in range(self.n_grad_steps):
             experience = self.buffer.sample(self.batch_size)
-            obs = torch.stack([torch.from_numpy(e.observation) for e in experience], 0).float()
-            next_obs = torch.stack([torch.from_numpy(e.next_observation) for e in experience], 0).float()
-            actions = torch.tensor([e.action for e in experience]).long()
-            rewards = torch.tensor([e.reward for e in experience]).float()
-            dones = torch.tensor([e.done for e in experience]).float()
-            next_q_values = self.target_q_net(next_obs).detach().max(-1)[0]
-            target_q_values = rewards + (1. - dones) * self.gamma * next_q_values
-            q_values = self.q_net(obs).gather(-1, actions.unsqueeze(0))
-            delta = q_values - target_q_values
-            loss = torch.mean(self.reg_weight * q_values + torch.pow(delta, 2))
-            # log loss
+            #print(experience.observation.shape, experience.next_observation.shape, experience.action.shape, experience.reward.shape, experience.done.shape)
+            if self.n_agents <= 1:
+                pred_q, target_q_raw = self._training_routine(experience.observation, experience.next_observation, experience.action)
+            else:
+                pred_q, target_q_raw = torch.zeros((self.batch_size, 1)), torch.zeros((self.batch_size, 1))
+                for agent_i in range(self.n_agents):
+                    q_values, next_q_values_raw = self._training_routine(experience.observation[:, agent_i],
+                                                                         experience.next_observation[:, agent_i],
+                                                                         experience.action[:, agent_i].unsqueeze(-1)
+                                                                         )
+                    pred_q += q_values
+                    target_q_raw += next_q_values_raw
+            target_q = experience.reward + (1 - experience.done) * self.gamma * target_q_raw
+            loss = torch.mean(self.reg_weight * pred_q + torch.pow(pred_q - target_q, 2))
+            print(target_q)
             self.running_loss.append(loss.item())

             # Optimize the model
             self.optimizer.zero_grad()
             loss.backward()
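For clarity, a small numeric sketch of the loss assembled in train() above: a one-step TD target plus the reg_weight * Q(s, a) penalty kept from the old code. The values are made up; in the multi-agent branch pred_q and target_q_raw are sums over agents:

```python
import torch

gamma, reg_weight = 0.99, 0.05
reward, done = torch.tensor([[1.0]]), torch.tensor([[0.0]])
pred_q = torch.tensor([[0.5]])        # Q(s, a) from the online net
target_q_raw = torch.tensor([[0.8]])  # max_a' Q_target(s', a')

target_q = reward + (1 - done) * gamma * target_q_raw              # 1 + 0.99 * 0.8 = 1.792
loss = torch.mean(reg_weight * pred_q + (pred_q - target_q) ** 2)  # 0.025 + (0.5 - 1.792)^2
print(target_q.item(), loss.item())                                # 1.792, ~1.6943
```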
@@ -164,25 +194,33 @@ class BaseQlearner:
             self.optimizer.step()


 if __name__ == '__main__':
     from environments.factory.simple_factory import SimpleFactory, DirtProperties, MovementProperties
     from algorithms.reg_dqn import RegDQN
+    from stable_baselines3.common.vec_env import DummyVecEnv
+    N_AGENTS = 1

     dirt_props = DirtProperties(clean_amount=3, gain_amount=0.2, max_global_amount=30,
                                 max_local_amount=5, spawn_frequency=1, max_spawn_ratio=0.05)
     move_props = MovementProperties(allow_diagonal_movement=True,
                                     allow_square_movement=True,
                                     allow_no_op=False)
-    env = SimpleFactory(dirt_properties=dirt_props, movement_properties=move_props, n_agents=1, pomdp_radius=2, max_steps=400, omit_agent_slice_in_obs=False)
-    #print(env.action_space)
-    print(env)
+    env = SimpleFactory(dirt_properties=dirt_props, movement_properties=move_props, n_agents=N_AGENTS, pomdp_radius=2, max_steps=400, omit_agent_slice_in_obs=False)
+    env = DummyVecEnv([lambda: env])

     from stable_baselines3.dqn import DQN
-    #dqn = RegDQN('MlpPolicy', env, verbose=True, buffer_size = 50000, learning_starts = 25000, batch_size = 64, target_update_interval = 5000, exploration_fraction = 0.25, exploration_final_eps = 0.025)
-    #print(dqn.policy)
+    #dqn = RegDQN('MlpPolicy', env, verbose=True, buffer_size = 50000, learning_starts = 64, batch_size = 64,
+    #             target_update_interval = 5000, exploration_fraction = 0.25, exploration_final_eps = 0.025,
+    #             train_freq=4, gradient_steps=1, reg_weight=0.05)
     #dqn.learn(100000)

     print(env.observation_space, env.action_space)
     dqn, target_dqn = BaseDQN(), BaseDQN()
-    learner = BaseQlearner(dqn, target_dqn, env, BaseBuffer(50000), target_update=5000, warmup=25000, lr=1e-4, gamma=0.99,
-                           train_every_n_steps=4, eps_end=0.05, reg_weight=0.1, exploration_fraction=0.25, batch_size=64)
+    learner = BaseQlearner(dqn, target_dqn, env, BaseBuffer(50000), target_update=10000, lr=0.0001, gamma=0.99, n_agents=N_AGENTS,
+                           train_every_n_steps=4, eps_end=0.05, n_grad_steps=1, reg_weight=0.05, exploration_fraction=0.25, batch_size=64)
     learner.learn(100000)
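With the __main__ settings above (eps_end=0.05, exploration_fraction=0.25, n_steps=100000), anneal_eps() gives a linear schedule that reaches eps_end after the first 25000 steps and then stays there. A standalone sketch of that schedule:

```python
eps_end, exploration_fraction, n_steps = 0.05, 0.25, 100000

for step in (0, 12500, 25000, 50000):
    fraction = min(float(step) / int(exploration_fraction * n_steps), 1.0)
    eps = 1 + fraction * (eps_end - 1)
    print(step, round(eps, 3))  # 1.0, 0.525, 0.05, 0.05
```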