Merge remote-tracking branch 'origin/main'

Commit 01e7b752b8 by steffen-illium, 2021-07-13 11:12:17 +02:00
7 changed files with 623 additions and 226 deletions


@@ -1,226 +0,0 @@
from typing import NamedTuple, Union
from collections import namedtuple, deque
import numpy as np
import random
import torch
import torch.nn as nn
import torch.nn.functional as F
from stable_baselines3.common.utils import polyak_update
from stable_baselines3.common.buffers import ReplayBuffer
import copy


class Experience(NamedTuple):
    observation: np.ndarray
    next_observation: np.ndarray
    action: np.ndarray
    reward: Union[float, np.ndarray]
    done: Union[bool, np.ndarray]
    priority: np.ndarray = 1


class BaseBuffer:
    def __init__(self, size: int):
        self.size = size
        self.experience = deque(maxlen=size)

    def __len__(self):
        return len(self.experience)

    def add(self, experience):
        self.experience.append(experience)

    def sample(self, k):
        sample = random.choices(self.experience, k=k)
        observations = torch.stack([torch.from_numpy(e.observation) for e in sample], 0).float()
        next_observations = torch.stack([torch.from_numpy(e.next_observation) for e in sample], 0).float()
        actions = torch.tensor([e.action for e in sample]).long()
        rewards = torch.tensor([e.reward for e in sample]).float().view(-1, 1)
        dones = torch.tensor([e.done for e in sample]).float().view(-1, 1)
        return Experience(observations, next_observations, actions, rewards, dones)


class PERBuffer(BaseBuffer):
    def __init__(self, size, alpha=0.2):
        super(PERBuffer, self).__init__(size)
        self.alpha = alpha

    def sample(self, k):
        pr = [abs(e.priority)**self.alpha for e in self.experience]
        pr = np.array(pr) / sum(pr)
        idxs = random.choices(range(len(self)), weights=pr, k=k)
        pass


class BaseDQN(nn.Module):
    def __init__(self):
        super(BaseDQN, self).__init__()
        self.net = nn.Sequential(
            nn.Flatten(),
            nn.Linear(3*5*5, 64),
            nn.ELU(),
            nn.Linear(64, 64),
            nn.ELU()
        )
        self.value_head = nn.Linear(64, 1)
        self.advantage_head = nn.Linear(64, 9)

    def act(self, x) -> np.ndarray:
        with torch.no_grad():
            action = self.forward(x).max(-1)[1].numpy()
        return action

    def forward(self, x):
        features = self.net(x)
        advantages = self.advantage_head(features)
        values = self.value_head(features)
        return values + (advantages - advantages.mean())

    def random_action(self):
        return random.randrange(0, 5)


class BaseQlearner:
    def __init__(self, q_net, target_q_net, env, buffer, target_update, eps_end, n_agents=1,
                 gamma=0.99, train_every_n_steps=4, n_grad_steps=1,
                 exploration_fraction=0.2, batch_size=64, lr=1e-4, reg_weight=0.0):
        self.q_net = q_net
        self.target_q_net = target_q_net
        self.q_net.apply(self.weights_init)
        self.target_q_net.eval()
        self.env = env
        self.buffer = buffer
        self.target_update = target_update
        self.eps = 1.
        self.eps_end = eps_end
        self.exploration_fraction = exploration_fraction
        self.batch_size = batch_size
        self.gamma = gamma
        self.train_every_n_steps = train_every_n_steps
        self.n_grad_steps = n_grad_steps
        self.lr = lr
        self.reg_weight = reg_weight
        self.n_agents = n_agents
        self.device = 'cpu'
        self.optimizer = torch.optim.AdamW(self.q_net.parameters(), lr=self.lr)
        self.running_reward = deque(maxlen=5)
        self.running_loss = deque(maxlen=5)
        self._n_updates = 0

    def to(self, device):
        self.device = device
        return self

    @staticmethod
    def weights_init(module, activation='relu'):
        if isinstance(module, (nn.Linear, nn.Conv2d)):
            nn.init.xavier_normal_(module.weight, gain=torch.nn.init.calculate_gain(activation))
            if module.bias is not None:
                module.bias.data.fill_(0.0)

    def anneal_eps(self, step, n_steps):
        fraction = min(float(step) / int(self.exploration_fraction*n_steps), 1.0)
        self.eps = 1 + fraction * (self.eps_end - 1)

    def get_action(self, obs) -> Union[int, np.ndarray]:
        o = torch.from_numpy(obs).unsqueeze(0) if self.n_agents <= 1 else torch.from_numpy(obs)
        if np.random.rand() > self.eps:
            action = self.q_net.act(o.float())
        else:
            action = np.array([self.env.action_space.sample() for _ in range(self.n_agents)])
        return action

    def learn(self, n_steps):
        step = 0
        while step < n_steps:
            obs, done = self.env.reset(), False
            total_reward = 0
            while not done:
                action = self.get_action(obs)
                next_obs, reward, done, info = self.env.step(action if not len(action) == 1 else action[0])
                experience = Experience(observation=obs, next_observation=next_obs, action=action, reward=reward, done=done)  # do we really need to copy?
                self.buffer.add(experience)
                # end of step routine
                obs = next_obs
                step += 1
                total_reward += reward
                self.anneal_eps(step, n_steps)
                if step % self.train_every_n_steps == 0:
                    self.train()
                    self._n_updates += 1
                if step % self.target_update == 0:
                    print('UPDATE')
                    polyak_update(self.q_net.parameters(), self.target_q_net.parameters(), 1)
            self.running_reward.append(total_reward)
            if step % 10 == 0:
                print(f'Step: {step} ({(step/n_steps)*100:.2f}%)\tRunning reward: {sum(list(self.running_reward))/len(self.running_reward):.2f}\t'
                      f' eps: {self.eps:.4f}\tRunning loss: {sum(list(self.running_loss))/len(self.running_loss):.4f}\tUpdates:{self._n_updates}')

    def _training_routine(self, obs, next_obs, action):
        current_q_values = self.q_net(obs)
        current_q_values = torch.gather(current_q_values, dim=-1, index=action)
        next_q_values_raw = self.target_q_net(next_obs).max(dim=-1)[0].reshape(-1, 1).detach()
        return current_q_values, next_q_values_raw

    def train(self):
        if len(self.buffer) < self.batch_size: return
        for _ in range(self.n_grad_steps):
            experience = self.buffer.sample(self.batch_size)
            # print(experience.observation.shape, experience.next_observation.shape, experience.action.shape, experience.reward.shape, experience.done.shape)
            if self.n_agents <= 1:
                pred_q, target_q_raw = self._training_routine(experience.observation, experience.next_observation, experience.action)
            else:
                pred_q, target_q_raw = torch.zeros((self.batch_size, 1)), torch.zeros((self.batch_size, 1))
                for agent_i in range(self.n_agents):
                    q_values, next_q_values_raw = self._training_routine(experience.observation[:, agent_i],
                                                                         experience.next_observation[:, agent_i],
                                                                         experience.action[:, agent_i].unsqueeze(-1)
                                                                         )
                    pred_q += q_values
                    target_q_raw += next_q_values_raw
            target_q = experience.reward + (1 - experience.done) * self.gamma * target_q_raw
            loss = torch.mean(self.reg_weight * pred_q + torch.pow(pred_q - target_q, 2))
            # log loss
            self.running_loss.append(loss.item())
            # Optimize the model
            self.optimizer.zero_grad()
            loss.backward()
            torch.nn.utils.clip_grad_norm_(self.q_net.parameters(), 10)
            self.optimizer.step()


if __name__ == '__main__':
    from environments.factory.simple_factory import SimpleFactory, DirtProperties, MovementProperties
    from algorithms.reg_dqn import RegDQN
    from stable_baselines3.common.vec_env import DummyVecEnv

    N_AGENTS = 1

    dirt_props = DirtProperties(clean_amount=3, gain_amount=0.2, max_global_amount=30,
                                max_local_amount=5, spawn_frequency=1, max_spawn_ratio=0.05)
    move_props = MovementProperties(allow_diagonal_movement=True,
                                    allow_square_movement=True,
                                    allow_no_op=False)
    env = SimpleFactory(dirt_properties=dirt_props, movement_properties=move_props, n_agents=N_AGENTS, pomdp_radius=2, max_steps=400, omit_agent_slice_in_obs=False, combin_agent_slices_in_obs=True)
    # env = DummyVecEnv([lambda: env])
    from stable_baselines3.dqn import DQN
    # dqn = RegDQN('MlpPolicy', env, verbose=True, buffer_size=40000, learning_starts=0, batch_size=64, learning_rate=0.0008,
    #              target_update_interval=3500, exploration_fraction=0.25, exploration_final_eps=0.05,
    #              train_freq=4, gradient_steps=1, reg_weight=0.05, seed=69)
    # dqn.learn(100000)
    dqn, target_dqn = BaseDQN(), BaseDQN()
    learner = BaseQlearner(dqn, target_dqn, env, BaseBuffer(40000), target_update=3500, lr=0.0008, gamma=0.99, n_agents=N_AGENTS,
                           train_every_n_steps=4, eps_end=0.05, n_grad_steps=1, reg_weight=0.05, exploration_fraction=0.25, batch_size=64)
    learner.learn(100000)

algorithms/common.py (new file, 182 lines)

@@ -0,0 +1,182 @@
from typing import NamedTuple, Union
from collections import deque, OrderedDict
import numpy as np
import random
import torch
import torch.nn as nn


class Experience(NamedTuple):
    # can be used for a single (s_t, a, r, s_{t+1}) tuple
    # or for a batch of tuples
    observation: np.ndarray
    next_observation: np.ndarray
    action: np.ndarray
    reward: Union[float, np.ndarray]
    done: Union[bool, np.ndarray]
    episode: int = -1
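    # Illustrative shapes only (assuming the 3x5x5 factory observations used elsewhere in this repo):
    #   single step:  obs (3, 5, 5), next_obs (3, 5, 5), action (1,), reward float, done bool, episode 0
    #   batch of 64:  obs (64, 3, 5, 5), next_obs (64, 3, 5, 5), action (64, 1), reward (64, 1), done (64, 1)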


class BaseLearner:
    def __init__(self, env, n_agents=1, train_every=('step', 4), n_grad_steps=1):
        assert train_every[0] in ['step', 'episode'], 'train_every[0] must be one of ["step", "episode"]'
        self.env = env
        self.n_agents = n_agents
        self.n_grad_steps = n_grad_steps
        self.train_every = train_every
        self.device = 'cpu'
        self.n_updates = 0
        self.step = 0
        self.episode_step = 0
        self.episode = 0
        self.running_reward = deque(maxlen=5)

    def to(self, device):
        self.device = device
        for attr, value in self.__dict__.items():
            if isinstance(value, nn.Module):
                value = value.to(self.device)
        return self

    def get_action(self, obs) -> Union[int, np.ndarray]:
        pass

    def on_new_experience(self, experience):
        pass

    def on_step_end(self, n_steps):
        pass

    def on_episode_end(self, n_steps):
        pass

    def train(self):
        pass

    def learn(self, n_steps):
        train_type, train_freq = self.train_every
        while self.step < n_steps:
            obs, done = self.env.reset(), False
            total_reward = 0
            self.episode_step = 0
            while not done:
                action = self.get_action(obs)
                next_obs, reward, done, info = self.env.step(action if not len(action) == 1 else action[0])
                experience = Experience(observation=obs, next_observation=next_obs,
                                        action=action, reward=reward,
                                        done=done, episode=self.episode)  # do we really need to copy?
                self.on_new_experience(experience)
                # end of step routine
                obs = next_obs
                total_reward += reward
                self.step += 1
                self.episode_step += 1
                self.on_step_end(n_steps)
                if train_type == 'step' and (self.step % train_freq == 0):
                    self.train()
                    self.n_updates += 1
            self.on_episode_end(n_steps)
            if train_type == 'episode' and (self.episode % train_freq == 0):
                self.train()
                self.n_updates += 1
            self.running_reward.append(total_reward)
            self.episode += 1
            try:
                if self.step % 10 == 0:
                    print(
                        f'Step: {self.step} ({(self.step / n_steps) * 100:.2f}%)\tRunning reward: {sum(list(self.running_reward)) / len(self.running_reward):.2f}\t'
                        f' eps: {self.eps:.4f}\tRunning loss: {sum(list(self.running_loss)) / len(self.running_loss):.4f}\tUpdates:{self.n_updates}')
            except Exception as e:
                pass


class BaseBuffer:
    def __init__(self, size: int):
        self.size = size
        self.experience = deque(maxlen=size)

    def __len__(self):
        return len(self.experience)

    def add(self, experience):
        self.experience.append(experience)

    def sample(self, k, cer=4):
        # combined experience replay: pad the random sample with the `cer` most recent transitions
        sample = random.choices(self.experience, k=k-cer)
        for i in range(1, cer + 1):
            sample += [self.experience[-i]]
        observations = torch.stack([torch.from_numpy(e.observation) for e in sample], 0).float()
        next_observations = torch.stack([torch.from_numpy(e.next_observation) for e in sample], 0).float()
        actions = torch.tensor([e.action for e in sample]).long()
        rewards = torch.tensor([e.reward for e in sample]).float().view(-1, 1)
        dones = torch.tensor([e.done for e in sample]).float().view(-1, 1)
        return Experience(observations, next_observations, actions, rewards, dones)


def soft_update(local_model, target_model, tau):
    # taken from https://github.com/BY571/Munchausen-RL/blob/master/M-DQN.ipynb
    for target_param, local_param in zip(target_model.parameters(), local_model.parameters()):
        target_param.data.copy_(tau*local_param.data + (1.-tau)*target_param.data)
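
# Polyak averaging: theta_target <- tau * theta_local + (1 - tau) * theta_target.
# tau=1.0 copies the local weights outright (a hard update); smaller tau blends them in gradually.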


def mlp_maker(dims, flatten=False, activation='elu', activation_last='identity'):
    activations = {'elu': nn.ELU, 'relu': nn.ReLU, 'sigmoid': nn.Sigmoid,
                   'leaky_relu': nn.LeakyReLU, 'tanh': nn.Tanh,
                   'gelu': nn.GELU, 'identity': nn.Identity}
    layers = [('Flatten', nn.Flatten())] if flatten else []
    for i in range(1, len(dims)):
        layers.append((f'Layer #{i - 1}: Linear', nn.Linear(dims[i - 1], dims[i])))
        activation_str = activation if i != len(dims)-1 else activation_last
        layers.append((f'Layer #{i - 1}: {activation_str.capitalize()}', activations[activation_str]()))
    return nn.Sequential(OrderedDict(layers))
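
# A rough sketch of what mlp_maker builds (illustration only):
#   mlp_maker([3*5*5, 64, 64, 9], flatten=True)
#   -> Flatten -> Linear(75, 64) -> ELU -> Linear(64, 64) -> ELU -> Linear(64, 9) -> Identity
# i.e. every hidden layer gets `activation`, while the output layer gets `activation_last`.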


class BaseDQN(nn.Module):
    def __init__(self, dims=[3*5*5, 64, 64, 9]):
        super(BaseDQN, self).__init__()
        self.net = mlp_maker(dims, flatten=True)

    @torch.no_grad()
    def act(self, x) -> np.ndarray:
        action = self.forward(x).max(-1)[1].numpy()
        return action

    def forward(self, x):
        return self.net(x)


class BaseDDQN(BaseDQN):
    def __init__(self,
                 backbone_dims=[3*5*5, 64, 64],
                 value_dims=[64, 1],
                 advantage_dims=[64, 9]):
        super(BaseDDQN, self).__init__(backbone_dims)
        self.net = mlp_maker(backbone_dims, flatten=True)
        self.value_head = mlp_maker(value_dims)
        self.advantage_head = mlp_maker(advantage_dims)

    def forward(self, x):
        features = self.net(x)
        advantages = self.advantage_head(features)
        values = self.value_head(features)
        # center the advantages per sample, i.e. over the action dimension
        return values + (advantages - advantages.mean(-1, keepdim=True))
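
# Dueling aggregation as used above: Q(s, a) = V(s) + A(s, a) - mean_a' A(s, a'),
# which keeps the value/advantage split identifiable (dueling DQN, Wang et al., 2016).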


class QTRANtestNet(nn.Module):
    def __init__(self, backbone_dims=[3*5*5, 64, 64], q_head=[64, 9]):
        super(QTRANtestNet, self).__init__()
        self.backbone = mlp_maker(backbone_dims, flatten=True, activation_last='elu')
        self.q_head = mlp_maker(q_head)

    def forward(self, x):
        features = self.backbone(x)
        qs = self.q_head(features)
        return qs, features

    @torch.no_grad()
    def act(self, x) -> np.ndarray:
        action = self.forward(x)[0].max(-1)[1].numpy()
        return action

algorithms/m_q_learner.py (new file, 53 lines)

@@ -0,0 +1,53 @@
import torch
import torch.nn.functional as F
from algorithms.q_learner import QLearner


class MQLearner(QLearner):
    # Munchausen Q-Learning
    def __init__(self, *args, temperature=0.03, alpha=0.9, clip_l0=-1.0, **kwargs):
        super(MQLearner, self).__init__(*args, **kwargs)
        assert self.n_agents == 1, 'M-DQN currently only supports single agent training'
        self.temperature = temperature
        self.alpha = alpha
        self.clip0 = clip_l0

    def tau_ln_pi(self, qs):
        # computes tau * ln(softmax(qs / temperature)) for each action
        # Custom log-sum-exp trick from page 18 to compute the log-policy terms
        v_k = qs.max(-1)[0].unsqueeze(-1)
        advantage = qs - v_k
        logsum = torch.logsumexp(advantage / self.temperature, -1).unsqueeze(-1)
        tau_ln_pi = advantage - self.temperature * logsum
        return tau_ln_pi
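
    # Why the trick is valid (explanatory note): ln softmax(q/tau)_a = (q_a - v)/tau - logsumexp((q - v)/tau)
    # for any shift v, so choosing v = max_a q_a keeps every exponential <= 1 and
    #   tau * ln pi(a|s) = (q_a - v) - tau * logsumexp((q - v)/tau)
    # is exactly what this method returns, computed in a numerically stable way.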

    def train(self):
        if len(self.buffer) < self.batch_size: return
        for _ in range(self.n_grad_steps):
            # train_every is a ('step' | 'episode', frequency) tuple; use its frequency for CER
            experience = self.buffer.sample(self.batch_size, cer=self.train_every[-1])
            with torch.no_grad():
                q_target_next = self.target_q_net(experience.next_observation)
                tau_log_pi_next = self.tau_ln_pi(q_target_next)

                q_k_targets = self.target_q_net(experience.observation)
                log_pi = self.tau_ln_pi(q_k_targets)

                pi_target = F.softmax(q_target_next / self.temperature, dim=-1)
                q_target = (self.gamma * (pi_target * (q_target_next - tau_log_pi_next) * (1 - experience.done)).sum(-1)).unsqueeze(-1)

                munchausen_addon = log_pi.gather(-1, experience.action)

                munchausen_reward = (experience.reward + self.alpha * torch.clamp(munchausen_addon, min=self.clip0, max=0))

                # Compute Q targets for current states
                m_q_target = munchausen_reward + q_target
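                # Munchausen-DQN target (Vieillard et al., 2020):
                #   y = r + alpha * clip(tau * ln pi(a|s), l0, 0)
                #       + gamma * sum_a' pi(a'|s') * (q_target(s', a') - tau * ln pi(a'|s'))
                # i.e. a soft Bellman backup plus a scaled log-policy bonus on the taken action.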
            # Get expected Q values from local model
            q_k = self.q_net(experience.observation)
            pred_q = q_k.gather(-1, experience.action)

            # Compute loss
            loss = torch.mean(self.reg_weight * pred_q + torch.pow(pred_q - m_q_target, 2))
            self._backprop_loss(loss)

algorithms/q_learner.py (new file, 122 lines)

@@ -0,0 +1,122 @@
from typing import Union
import gym
import torch
import torch.nn as nn
import numpy as np
from collections import deque
from pathlib import Path
import yaml
from algorithms.common import BaseLearner, BaseBuffer, soft_update, Experience


class QLearner(BaseLearner):
    def __init__(self, q_net, target_q_net, env, buffer_size=1e5, target_update=3000, eps_end=0.05, n_agents=1,
                 gamma=0.99, train_every=('step', 4), n_grad_steps=1, tau=1.0, max_grad_norm=10, weight_decay=1e-2,
                 exploration_fraction=0.2, batch_size=64, lr=1e-4, reg_weight=0.0, eps_start=1):
        super(QLearner, self).__init__(env, n_agents, train_every, n_grad_steps)
        self.q_net = q_net
        self.target_q_net = target_q_net
        self.target_q_net.eval()
        soft_update(self.q_net, self.target_q_net, tau=1.0)
        self.buffer = BaseBuffer(int(buffer_size))  # deque maxlen must be an int (the default buffer_size is the float 1e5)
        self.target_update = target_update
        self.eps = eps_start
        self.eps_start = eps_start
        self.eps_end = eps_end
        self.exploration_fraction = exploration_fraction
        self.batch_size = batch_size
        self.gamma = gamma
        self.tau = tau
        self.reg_weight = reg_weight
        self.weight_decay = weight_decay
        self.lr = lr
        self.optimizer = torch.optim.AdamW(self.q_net.parameters(),
                                           lr=self.lr,
                                           weight_decay=self.weight_decay)
        self.max_grad_norm = max_grad_norm
        self.running_reward = deque(maxlen=5)
        self.running_loss = deque(maxlen=5)
        self.n_updates = 0

    def save(self, path):
        path = Path(path)  # no-op if already instance of Path
        path.mkdir(parents=True, exist_ok=True)
        hparams = {k: v for k, v in self.__dict__.items() if not(isinstance(v, BaseBuffer) or
                                                                 isinstance(v, torch.optim.Optimizer) or
                                                                 isinstance(v, gym.Env) or
                                                                 isinstance(v, nn.Module))
                   }
        hparams.update({'class': self.__class__.__name__})
        with (path / 'hparams.yaml').open('w') as outfile:
            yaml.dump(hparams, outfile)
        torch.save(self.q_net, path / 'q_net.pt')

    def anneal_eps(self, step, n_steps):
        fraction = min(float(step) / int(self.exploration_fraction*n_steps), 1.0)
        # linear schedule from eps_start down to eps_end over the first exploration_fraction * n_steps steps
        self.eps = self.eps_start + fraction * (self.eps_end - self.eps_start)
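
    # Worked example (values from the __main__ block below, assuming they stay unchanged):
    #   n_steps=100000, exploration_fraction=0.25, eps_start=1.0, eps_end=0.025
    #   -> eps falls linearly from 1.0 to 0.025 over the first 25000 steps, then stays at 0.025.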

    def get_action(self, obs) -> Union[int, np.ndarray]:
        o = torch.from_numpy(obs).unsqueeze(0) if self.n_agents <= 1 else torch.from_numpy(obs)
        if np.random.rand() > self.eps:
            action = self.q_net.act(o.float())
        else:
            action = np.array([self.env.action_space.sample() for _ in range(self.n_agents)])
        return action

    def on_new_experience(self, experience):
        self.buffer.add(experience)

    def on_step_end(self, n_steps):
        self.anneal_eps(self.step, n_steps)
        if self.step % self.target_update == 0:
            print('UPDATE')
            soft_update(self.q_net, self.target_q_net, tau=self.tau)

    def _training_routine(self, obs, next_obs, action):
        current_q_values = self.q_net(obs)
        current_q_values = torch.gather(current_q_values, dim=-1, index=action)
        next_q_values_raw = self.target_q_net(next_obs).max(dim=-1)[0].reshape(-1, 1).detach()
        return current_q_values, next_q_values_raw

    def _backprop_loss(self, loss):
        # log loss
        self.running_loss.append(loss.item())
        # Optimize the model
        self.optimizer.zero_grad()
        loss.backward()
        torch.nn.utils.clip_grad_norm_(self.q_net.parameters(), self.max_grad_norm)
        self.optimizer.step()

    def train(self):
        if len(self.buffer) < self.batch_size: return
        for _ in range(self.n_grad_steps):
            experience = self.buffer.sample(self.batch_size, cer=self.train_every[-1])
            pred_q, target_q_raw = self._training_routine(experience.observation,
                                                          experience.next_observation,
                                                          experience.action)
            target_q = experience.reward + (1 - experience.done) * self.gamma * target_q_raw
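            # Standard DQN target: y = r + gamma * (1 - done) * max_a' Q_target(s', a').
            # The loss below adds reg_weight * Q(s, a) to the squared TD error, nudging the
            # predicted Q-values towards smaller magnitudes (cf. the reg_weight used by the
            # RegDQN variant elsewhere in this repository).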
            loss = torch.mean(self.reg_weight * pred_q + torch.pow(pred_q - target_q, 2))
            self._backprop_loss(loss)


if __name__ == '__main__':
    from environments.factory.simple_factory import SimpleFactory, DirtProperties, MovementProperties
    from algorithms.common import BaseDDQN
    from algorithms.vdn_learner import VDNLearner
    from algorithms.udr_learner import UDRLearner

    N_AGENTS = 1

    dirt_props = DirtProperties(clean_amount=3, gain_amount=0.2, max_global_amount=30,
                                max_local_amount=5, spawn_frequency=1, max_spawn_ratio=0.05)
    move_props = MovementProperties(allow_diagonal_movement=True,
                                    allow_square_movement=True,
                                    allow_no_op=False)
    env = SimpleFactory(dirt_properties=dirt_props, movement_properties=move_props, n_agents=N_AGENTS, pomdp_radius=2, max_steps=400, omit_agent_slice_in_obs=False, combin_agent_slices_in_obs=True)

    dqn, target_dqn = BaseDDQN(), BaseDDQN()
    learner = QLearner(dqn, target_dqn, env, 40000, target_update=3500, lr=0.0007, gamma=0.99, n_agents=N_AGENTS, tau=0.95, max_grad_norm=10,
                       train_every=('step', 4), eps_end=0.025, n_grad_steps=1, reg_weight=0.1, exploration_fraction=0.25, batch_size=64)
    # learner.save(Path(__file__).parent / 'test' / 'testexperiment1337')
    learner.learn(100000)


@@ -0,0 +1,48 @@
import torch
from algorithms.q_learner import QLearner


class QTRANLearner(QLearner):
    def __init__(self, *args, weight_opt=1., weigt_nopt=1., **kwargs):
        super(QTRANLearner, self).__init__(*args, **kwargs)
        assert self.n_agents >= 2, 'QTRANLearner requires more than one agent, use QLearner instead'
        self.weight_opt = weight_opt
        self.weigt_nopt = weigt_nopt

    def _training_routine(self, obs, next_obs, action):
        # todo remove - is inherited - only used while implementing qtran
        current_q_values = self.q_net(obs)
        current_q_values = torch.gather(current_q_values, dim=-1, index=action)
        next_q_values_raw = self.target_q_net(next_obs).max(dim=-1)[0].reshape(-1, 1).detach()
        return current_q_values, next_q_values_raw

    def local_qs(self, observations, actions):
        Q_jt = torch.zeros(actions.shape[0], 1)  # float placeholder to sum up individual q values
        features = []
        for agent_i in range(self.n_agents):
            q_values_agent_i, features_agent_i = self.q_net(observations[:, agent_i])  # Individual action-value network
            q_values_agent_i = torch.gather(q_values_agent_i, dim=-1, index=actions[:, agent_i].unsqueeze(-1))
            Q_jt += q_values_agent_i
            features.append(features_agent_i)
        feature_sum = torch.stack(features, 0).sum(0)  # (n_agents x hdim) -> hdim
        return Q_jt

    def train(self):
        if len(self.buffer) < self.batch_size: return
        for _ in range(self.n_grad_steps):
            experience = self.buffer.sample(self.batch_size, cer=self.train_every[-1])
            Q_jt_prime = self.local_qs(experience.observation, experience.action)  # sum of individual q-vals
            Q_jt = None
            V_jt = None
            pred_q, target_q_raw = torch.zeros((self.batch_size, 1)), torch.zeros((self.batch_size, 1))
            for agent_i in range(self.n_agents):
                q_values, next_q_values_raw = self._training_routine(experience.observation[:, agent_i],
                                                                     experience.next_observation[:, agent_i],
                                                                     experience.action[:, agent_i].unsqueeze(-1))
                pred_q += q_values
                target_q_raw += next_q_values_raw
            target_q = experience.reward + (1 - experience.done) * self.gamma * target_q_raw
            loss = torch.mean(self.reg_weight * pred_q + torch.pow(pred_q - target_q, 2))
            self._backprop_loss(loss)

algorithms/udr_learner.py (new file, 178 lines)

@@ -0,0 +1,178 @@
import random
from typing import Union, List
from collections import deque
import numpy as np
import torch
import torch.nn as nn
from algorithms.common import BaseBuffer, Experience, BaseLearner, BaseDQN, mlp_maker
from collections import defaultdict


class UDRLBuffer(BaseBuffer):
    def __init__(self, size):
        super(UDRLBuffer, self).__init__(0)
        self.experience = defaultdict(list)
        self.size = size

    def add(self, experience):
        self.experience[experience.episode].append(experience)
        if len(self.experience) > self.size:
            self.sort_and_prune()

    def select_time_steps(self, episode: List[Experience]):
        T = len(episode)  # max horizon
        t1 = random.randint(0, T - 1)
        t2 = random.randint(t1 + 1, T)
        return t1, t2, T

    def sort_and_prune(self):
        scores = []
        for k, episode_experience in self.experience.items():
            r = sum([e.reward for e in episode_experience])
            scores.append((r, k))
        sorted_scores = sorted(scores, reverse=True)
        return sorted_scores

    def sample(self, batch_size, cer=0):
        random_episode_keys = random.choices(list(self.experience.keys()), k=batch_size)
        lsts = (obs, desired_rewards, horizons, actions) = [], [], [], []
        for ek in random_episode_keys:
            episode = self.experience[ek]
            t1, t2, T = self.select_time_steps(episode)
            t2 = T  # TODO only good for episodic envs
            observation = episode[t1].observation
            desired_reward = sum([experience.reward for experience in episode[t1:t2]])
            horizon = t2 - t1
            action = episode[t1].action
            for lst, val in zip(lsts, [observation, desired_reward, horizon, action]):
                lst.append(val)
        return (torch.stack([torch.from_numpy(o) for o in obs], 0).float(),
                torch.tensor(desired_rewards).view(-1, 1).float(),
                torch.tensor(horizons).view(-1, 1).float(),
                torch.tensor(actions))
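
# Each sampled training example is therefore a supervised "hindsight command" tuple:
#   input  = (s_t, desired_return = rewards actually collected from t to the episode end,
#             desired_horizon = number of remaining steps)
#   label  = a_t,
# which is the construction used by upside-down RL (Schmidhuber / Srivastava et al., 2019).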


class UDRLearner(BaseLearner):
    # Upside Down Reinforcement Learner
    def __init__(self, env, desired_reward, desired_horizon,
                 behavior_fn=None, buffer_size=100, n_warm_up_episodes=8, best_x=20,
                 batch_size=128, lr=1e-3, n_agents=1, train_every=('episode', 4), n_grad_steps=1):
        super(UDRLearner, self).__init__(env, n_agents, train_every, n_grad_steps)
        assert self.n_agents == 1, 'UDRL currently only supports single agent training'
        self.behavior_fn = behavior_fn
        self.buffer_size = buffer_size
        self.n_warm_up_episodes = n_warm_up_episodes
        self.buffer = UDRLBuffer(buffer_size)
        self.batch_size = batch_size
        self.mode = 'train'
        self.best_x = best_x
        self.desired_reward = desired_reward
        self.desired_horizon = desired_horizon
        self.lr = lr
        self.optimizer = torch.optim.AdamW(self.behavior_fn.parameters(), lr=lr)
        self.running_loss = deque(maxlen=self.n_grad_steps*5)

    def sample_exploratory_commands(self):
        top_x = self.buffer.sort_and_prune()[:self.best_x]
        # The exploratory desired horizon dh0 is set to the mean of the lengths of the selected episodes
        new_desired_horizon = np.mean([len(self.buffer.experience[k]) for _, k in top_x])
        # save all top_X cumulative returns in a list
        returns = [r for r, _ in top_x]
        # from these returns calc the mean and std
        mean_returns = np.mean([r for r, _ in top_x])
        std_returns = np.std(returns)
        # sample desired reward from a uniform distribution given the mean and the std
        new_desired_reward = np.random.uniform(mean_returns, mean_returns + std_returns)
        self.exploratory_commands = (new_desired_reward, new_desired_horizon)
        return torch.tensor([[new_desired_reward]]).float(), torch.tensor([[new_desired_horizon]]).float()

    def on_new_experience(self, experience):
        self.buffer.add(experience)
        self.desired_reward = self.desired_reward - torch.tensor(experience.reward).float().view(1, 1)

    def on_step_end(self, n_steps):
        one = torch.tensor([1.]).float().view(1, 1)
        self.desired_horizon -= one
        self.desired_horizon = self.desired_horizon if self.desired_horizon >= 1. else one

    def on_episode_end(self, n_steps):
        self.desired_reward, self.desired_horizon = self.sample_exploratory_commands()

    def get_action(self, obs) -> Union[int, np.ndarray]:
        o = torch.from_numpy(obs).unsqueeze(0) if self.n_agents <= 1 else torch.from_numpy(obs)
        bf_out = self.behavior_fn(o.float(), self.desired_reward, self.desired_horizon)
        dist = torch.distributions.Categorical(bf_out)
        sample = dist.sample()
        return [sample.item()]  # [self.env.action_space.sample()]

    def _backprop_loss(self, loss):
        # log loss
        self.running_loss.append(loss.item())
        # Optimize the model
        self.optimizer.zero_grad()
        loss.backward()
        # torch.nn.utils.clip_grad_norm_(self.behavior_fn.parameters(), 10)
        self.optimizer.step()

    def train(self):
        if len(self.buffer) < self.n_warm_up_episodes: return
        for _ in range(self.n_grad_steps):
            experience = self.buffer.sample(self.batch_size)
            bf_out = self.behavior_fn(*experience[:3])
            labels = experience[-1]
            # bf_out already holds probabilities (BF ends in a softmax), so train with NLL on the
            # log-probs rather than CrossEntropyLoss, which would apply a second softmax internally.
            loss = nn.NLLLoss()(torch.log(bf_out + 1e-8), labels.squeeze())
            mean_entropy = torch.distributions.Categorical(bf_out).entropy().mean()
            self._backprop_loss(loss - 0.03*mean_entropy)
        print(f'Running loss: {np.mean(list(self.running_loss)):.3f}\tRunning reward: {np.mean(self.running_reward):.2f}'
              f'\td_r: {self.desired_reward.item():.2f}\ttd_h: {self.desired_horizon.item()}')


class BF(BaseDQN):
    def __init__(self, dims=[5*5*3, 64]):
        super(BF, self).__init__(dims)
        self.net = mlp_maker(dims, activation_last='identity')
        self.command_net = mlp_maker([2, 64], activation_last='sigmoid')
        self.common_branch = mlp_maker([64, 64, 64, 9])

    def forward(self, observation, desired_reward, horizon):
        command = torch.cat((desired_reward*(0.02), horizon*(0.01)), dim=-1)
        obs_out = self.net(torch.flatten(observation, start_dim=1))
        command_out = self.command_net(command)
        combined = obs_out*command_out
        out = self.common_branch(combined)
        return torch.softmax(out, -1)
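
# Note on the command scaling: with the initial commands used below (desired_reward=200, desired_horizon=400),
# 200 * 0.02 = 4.0 and 400 * 0.01 = 4.0, so both command components enter the sigmoid gate on a similar,
# moderate scale; the gate then multiplicatively modulates the observation embedding.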


if __name__ == '__main__':
    from environments.factory.simple_factory import SimpleFactory, DirtProperties, MovementProperties
    from algorithms.common import BaseDDQN
    from algorithms.vdn_learner import VDNLearner

    N_AGENTS = 1

    dirt_props = DirtProperties(clean_amount=3, gain_amount=0.2, max_global_amount=30,
                                max_local_amount=5, spawn_frequency=1, max_spawn_ratio=0.05)
    move_props = MovementProperties(allow_diagonal_movement=True,
                                    allow_square_movement=True,
                                    allow_no_op=False)
    env = SimpleFactory(dirt_properties=dirt_props, movement_properties=move_props, n_agents=N_AGENTS, pomdp_radius=2,
                        max_steps=400, omit_agent_slice_in_obs=False, combin_agent_slices_in_obs=True)

    bf = BF()
    desired_reward = torch.tensor([200.]).view(1, 1).float()
    desired_horizon = torch.tensor([400.]).view(1, 1).float()
    learner = UDRLearner(env, behavior_fn=bf,
                         train_every=('episode', 2),
                         buffer_size=40,
                         best_x=10,
                         lr=1e-3,
                         batch_size=64,
                         n_warm_up_episodes=12,
                         n_grad_steps=4,
                         desired_reward=desired_reward,
                         desired_horizon=desired_horizon)
    # learner.save(Path(__file__).parent / 'test' / 'testexperiment1337')
    learner.learn(500000)

algorithms/vdn_learner.py (new file, 40 lines)

@@ -0,0 +1,40 @@
from typing import Union
import torch
import numpy as np
from algorithms.q_learner import QLearner


class VDNLearner(QLearner):
    def __init__(self, *args, **kwargs):
        super(VDNLearner, self).__init__(*args, **kwargs)
        assert self.n_agents >= 2, 'VDN requires more than one agent, use QLearner instead'

    def get_action(self, obs) -> Union[int, np.ndarray]:
        o = torch.from_numpy(obs).unsqueeze(0) if self.n_agents <= 1 else torch.from_numpy(obs)
        eps = np.random.rand(self.n_agents)
        greedy = eps > self.eps
        agent_actions = None
        actions = []
        for i in range(self.n_agents):
            if greedy[i]:
                if agent_actions is None: agent_actions = self.q_net.act(o.float())
                action = agent_actions[i]
            else:
                action = self.env.action_space.sample()
            actions.append(action)
        return np.array(actions)

    def train(self):
        if len(self.buffer) < self.batch_size: return
        for _ in range(self.n_grad_steps):
            experience = self.buffer.sample(self.batch_size, cer=self.train_every[-1])
            pred_q, target_q_raw = torch.zeros((self.batch_size, 1)), torch.zeros((self.batch_size, 1))
            for agent_i in range(self.n_agents):
                q_values, next_q_values_raw = self._training_routine(experience.observation[:, agent_i],
                                                                     experience.next_observation[:, agent_i],
                                                                     experience.action[:, agent_i].unsqueeze(-1))
                pred_q += q_values
                target_q_raw += next_q_values_raw
            target_q = experience.reward + (1 - experience.done) * self.gamma * target_q_raw
            loss = torch.mean(self.reg_weight * pred_q + torch.pow(pred_q - target_q, 2))
            self._backprop_loss(loss)
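
# Value decomposition as implemented above (VDN, Sunehag et al., 2018):
#   Q_tot(s, a_1, ..., a_n) = sum_i Q_i(s_i, a_i)
# The joint TD target adds the discounted sum of each agent's greedy target Q-value to the stored
# reward, so credit assignment is handled implicitly by the additive decomposition.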