diff --git a/.gitignore b/.gitignore index 4994491..59fa333 100644 --- a/.gitignore +++ b/.gitignore @@ -702,3 +702,4 @@ $RECYCLE.BIN/ # End of https://www.toptal.com/developers/gitignore/api/linux,unity,macos,python,windows,pycharm,notepadpp,visualstudiocode,latex /studies/e_1/ +/studies/curious_study/ diff --git a/algorithms/common.py b/algorithms/common.py deleted file mode 100644 index 3ebb1d2..0000000 --- a/algorithms/common.py +++ /dev/null @@ -1,221 +0,0 @@ -from typing import NamedTuple, Union -from collections import deque, OrderedDict, defaultdict -import numpy as np -import random - -import pandas as pd -import torch -import torch.nn as nn - -from tqdm import trange - -class Experience(NamedTuple): - # can be use for a single (s_t, a, r s_{t+1}) tuple - # or for a batch of tuples - observation: np.ndarray - next_observation: np.ndarray - action: np.ndarray - reward: Union[float, np.ndarray] - done : Union[bool, np.ndarray] - episode: int = -1 - - -class BaseLearner: - def __init__(self, env, n_agents=1, train_every=('step', 4), n_grad_steps=1, stack_n_frames=1): - assert train_every[0] in ['step', 'episode'], 'train_every[0] must be one of ["step", "episode"]' - self.env = env - self.n_agents = n_agents - self.n_grad_steps = n_grad_steps - self.train_every = train_every - self.stack_n_frames = deque(maxlen=stack_n_frames) - self.device = 'cpu' - self.n_updates = 0 - self.step = 0 - self.episode_step = 0 - self.episode = 0 - self.running_reward = deque(maxlen=5) - - def to(self, device): - self.device = device - for attr, value in self.__dict__.items(): - if isinstance(value, nn.Module): - value = value.to(self.device) - return self - - def get_action(self, obs) -> Union[int, np.ndarray]: - pass - - def on_new_experience(self, experience): - pass - - def on_step_end(self, n_steps): - pass - - def on_episode_end(self, n_steps): - pass - - def on_all_done(self): - pass - - def train(self): - pass - - def reward(self, r): - return r - - def learn(self, n_steps): - train_type, train_freq = self.train_every - while self.step < n_steps: - obs, done = self.env.reset(), False - total_reward = 0 - self.episode_step = 0 - while not done: - - action = self.get_action(obs) - - next_obs, reward, done, info = self.env.step(action if not len(action) == 1 else action[0]) - - experience = Experience(observation=obs, next_observation=next_obs, - action=action, reward=self.reward(reward), - done=done, episode=self.episode) # do we really need to copy? 
- self.on_new_experience(experience) - # end of step routine - obs = next_obs - total_reward += reward - self.step += 1 - self.episode_step += 1 - self.on_step_end(n_steps) - if train_type == 'step' and (self.step % train_freq == 0): - self.train() - self.n_updates += 1 - self.on_episode_end(n_steps) - if train_type == 'episode' and (self.episode % train_freq == 0): - self.train() - self.n_updates += 1 - - self.running_reward.append(total_reward) - self.episode += 1 - try: - if self.step % 100 == 0: - print( - f'Step: {self.step} ({(self.step / n_steps) * 100:.2f}%)\tRunning reward: {sum(list(self.running_reward)) / len(self.running_reward):.2f}\t' - f' eps: {self.eps:.4f}\tRunning loss: {sum(list(self.running_loss)) / len(self.running_loss):.4f}\tUpdates:{self.n_updates}') - except Exception as e: - pass - self.on_all_done() - - def evaluate(self, n_episodes=100, render=False): - with torch.no_grad(): - data = [] - for eval_i in trange(n_episodes): - obs, done = self.env.reset(), False - while not done: - action = self.get_action(obs) - next_obs, reward, done, info = self.env.step(action if not len(action) == 1 else action[0]) - if render: self.env.render() - obs = next_obs # srsly i'm so stupid - info.update({'reward': reward, 'eval_episode': eval_i}) - data.append(info) - return pd.DataFrame(data).fillna(0) - - - -class BaseBuffer: - def __init__(self, size: int): - self.size = size - self.experience = deque(maxlen=size) - - def __len__(self): - return len(self.experience) - - def add(self, exp: Experience): - self.experience.append(exp) - - def sample(self, k, cer=4): - sample = random.choices(self.experience, k=k-cer) - for i in range(cer): sample += [self.experience[-i]] - observations = torch.stack([torch.from_numpy(e.observation) for e in sample], 0).float() - next_observations = torch.stack([torch.from_numpy(e.next_observation) for e in sample], 0).float() - actions = torch.tensor([e.action for e in sample]).long() - rewards = torch.tensor([e.reward for e in sample]).float().view(-1, 1) - dones = torch.tensor([e.done for e in sample]).float().view(-1, 1) - #print(observations.shape, next_observations.shape, actions.shape, rewards.shape, dones.shape) - return Experience(observations, next_observations, actions, rewards, dones) - - -class TrajectoryBuffer(BaseBuffer): - def __init__(self, size): - super(TrajectoryBuffer, self).__init__(size) - self.experience = defaultdict(list) - - def add(self, exp: Experience): - self.experience[exp.episode].append(exp) - if len(self.experience) > self.size: - oldest_traj_key = list(sorted(self.experience.keys()))[0] - del self.experience[oldest_traj_key] - - -def soft_update(local_model, target_model, tau): - # taken from https://github.com/BY571/Munchausen-RL/blob/master/M-DQN.ipynb - for target_param, local_param in zip(target_model.parameters(), local_model.parameters()): - target_param.data.copy_(tau*local_param.data + (1.-tau)*target_param.data) - - -def mlp_maker(dims, flatten=False, activation='elu', activation_last='identity'): - activations = {'elu': nn.ELU, 'relu': nn.ReLU, 'sigmoid': nn.Sigmoid, - 'leaky_relu': nn.LeakyReLU, 'tanh': nn.Tanh, - 'gelu': nn.GELU, 'identity': nn.Identity} - layers = [('Flatten', nn.Flatten())] if flatten else [] - for i in range(1, len(dims)): - layers.append((f'Layer #{i - 1}: Linear', nn.Linear(dims[i - 1], dims[i]))) - activation_str = activation if i != len(dims)-1 else activation_last - layers.append((f'Layer #{i - 1}: {activation_str.capitalize()}', activations[activation_str]())) - return 
nn.Sequential(OrderedDict(layers)) - - -class BaseDQN(nn.Module): - def __init__(self, dims=[3*5*5, 64, 64, 9]): - super(BaseDQN, self).__init__() - self.net = mlp_maker(dims, flatten=True) - - @torch.no_grad() - def act(self, x) -> np.ndarray: - action = self.forward(x).max(-1)[1].numpy() - return action - - def forward(self, x): - return self.net(x) - - -class BaseDDQN(BaseDQN): - def __init__(self, - backbone_dims=[3*5*5, 64, 64], - value_dims=[64, 1], - advantage_dims=[64, 9], - activation='elu'): - super(BaseDDQN, self).__init__(backbone_dims) - self.net = mlp_maker(backbone_dims, activation=activation, flatten=True) - self.value_head = mlp_maker(value_dims) - self.advantage_head = mlp_maker(advantage_dims) - - def forward(self, x): - features = self.net(x) - advantages = self.advantage_head(features) - values = self.value_head(features) - return values + (advantages - advantages.mean()) - - -class BaseICM(nn.Module): - def __init__(self, backbone_dims=[2*3*5*5, 64, 64], head_dims=[2*64, 64, 9]): - super(BaseICM, self).__init__() - self.backbone = mlp_maker(backbone_dims, flatten=True, activation_last='relu', activation='relu') - self.icm = mlp_maker(head_dims) - self.ce = nn.CrossEntropyLoss() - - def forward(self, s0, s1, a): - phi_s0 = self.backbone(s0) - phi_s1 = self.backbone(s1) - cat = torch.cat((phi_s0, phi_s1), dim=1) - a_prime = torch.softmax(self.icm(cat), dim=-1) - ce = self.ce(a_prime, a) - return dict(prediction=a_prime, loss=ce) \ No newline at end of file diff --git a/algorithms/m_q_learner.py b/algorithms/m_q_learner.py deleted file mode 100644 index bd57597..0000000 --- a/algorithms/m_q_learner.py +++ /dev/null @@ -1,77 +0,0 @@ -import numpy as np -import torch -import torch.nn.functional as F -from algorithms.q_learner import QLearner - - -class MQLearner(QLearner): - # Munchhausen Q-Learning - def __init__(self, *args, temperature=0.03, alpha=0.9, clip_l0=-1.0, **kwargs): - super(MQLearner, self).__init__(*args, **kwargs) - assert self.n_agents == 1, 'M-DQN currently only supports single agent training' - self.temperature = temperature - self.alpha = alpha - self.clip0 = clip_l0 - - def tau_ln_pi(self, qs): - # computes log(softmax(qs/temperature)) - # Custom log-sum-exp trick from page 18 to compute the log-policy terms - v_k = qs.max(-1)[0].unsqueeze(-1) - advantage = qs - v_k - logsum = torch.logsumexp(advantage / self.temperature, -1).unsqueeze(-1) - tau_ln_pi = advantage - self.temperature * logsum - return tau_ln_pi - - def train(self): - if len(self.buffer) < self.batch_size: return - for _ in range(self.n_grad_steps): - - experience = self.buffer.sample(self.batch_size, cer=self.train_every[-1]) - - with torch.no_grad(): - q_target_next = self.target_q_net(experience.next_observation) - tau_log_pi_next = self.tau_ln_pi(q_target_next) - - q_k_targets = self.target_q_net(experience.observation) - log_pi = self.tau_ln_pi(q_k_targets) - - pi_target = F.softmax(q_target_next / self.temperature, dim=-1) - q_target = (self.gamma * (pi_target * (q_target_next - tau_log_pi_next) * (1 - experience.done)).sum(-1)).unsqueeze(-1) - - munchausen_addon = log_pi.gather(-1, experience.action) - - munchausen_reward = (experience.reward + self.alpha * torch.clamp(munchausen_addon, min=self.clip0, max=0)) - - # Compute Q targets for current states - m_q_target = munchausen_reward + q_target - - # Get expected Q values from local model - q_k = self.q_net(experience.observation) - pred_q = q_k.gather(-1, experience.action) - - # Compute loss - loss = torch.mean(self.reg_weight * 
pred_q + torch.pow(pred_q - m_q_target, 2)) - self._backprop_loss(loss) - -from tqdm import trange -from collections import deque -class MQICMLearner(MQLearner): - def __init__(self, *args, icm, **kwargs): - super(MQICMLearner, self).__init__(*args, **kwargs) - self.icm = icm - self.icm_optimizer = torch.optim.AdamW(self.icm.parameters()) - self.normalize_reward = deque(maxlen=1000) - - def on_all_done(self): - from collections import deque - losses = deque(maxlen=100) - for b in trange(10000): - batch = self.buffer.sample(128, 0) - s0, s1, a = batch.observation, batch.next_observation, batch.action - loss = self.icm(s0, s1, a.squeeze())['loss'] - self.icm_optimizer.zero_grad() - loss.backward() - self.icm_optimizer.step() - losses.append(loss.item()) - if b%100 == 0: - print(np.mean(losses)) diff --git a/algorithms/marl/__init__.py b/algorithms/marl/__init__.py new file mode 100644 index 0000000..f1c46d1 --- /dev/null +++ b/algorithms/marl/__init__.py @@ -0,0 +1,4 @@ +from algorithms.marl.base_ac import BaseActorCritic +from algorithms.marl.iac import LoopIAC +from algorithms.marl.snac import LoopSNAC +from algorithms.marl.seac import LoopSEAC \ No newline at end of file diff --git a/algorithms/marl/base_ac.py b/algorithms/marl/base_ac.py new file mode 100644 index 0000000..e2d2fe9 --- /dev/null +++ b/algorithms/marl/base_ac.py @@ -0,0 +1,176 @@ +import torch +from typing import Union, List +import numpy as np +from torch.distributions import Categorical +from algorithms.marl.memory import MARLActorCriticMemory +from algorithms.utils import add_env_props, instantiate_class +from pathlib import Path +import pandas as pd +from collections import deque +ListOrTensor = Union[List, torch.Tensor] + + +class BaseActorCritic: + def __init__(self, cfg): + add_env_props(cfg) + self.__training = True + self.cfg = cfg + self.n_agents = cfg['env']['n_agents'] + self.setup() + + def setup(self): + self.net = instantiate_class(self.cfg['agent']) + self.optimizer = torch.optim.RMSprop(self.net.parameters(), lr=3e-4, eps=1e-5) + + @classmethod + def _as_torch(cls, x): + if isinstance(x, np.ndarray): + return torch.from_numpy(x) + elif isinstance(x, List): + return torch.tensor(x) + elif isinstance(x, (int, float)): + return torch.tensor([x]) + return x + + def train(self): + self.__training = False + networks = [self.net] if not isinstance(self.net, List) else self.net + for net in networks: + net.train() + + def eval(self): + self.__training = False + networks = [self.net] if not isinstance(self.net, List) else self.net + for net in networks: + net.eval() + + def load_state_dict(self, path: Path): + pass + + def get_actions(self, out) -> ListOrTensor: + actions = [Categorical(logits=logits).sample().item() for logits in out['logits']] + return actions + + def init_hidden(self) -> dict[ListOrTensor]: + pass + + def forward(self, + observations: ListOrTensor, + actions: ListOrTensor, + hidden_actor: ListOrTensor, + hidden_critic: ListOrTensor + ): + pass + + + @torch.no_grad() + def train_loop(self, checkpointer=None): + env = instantiate_class(self.cfg['env']) + n_steps, max_steps = [self.cfg['algorithm'][k] for k in ['n_steps', 'max_steps']] + global_steps = 0 + reward_queue = deque(maxlen=2000) + while global_steps < max_steps: + tm = MARLActorCriticMemory(self.n_agents) + obs = env.reset() + last_hiddens = self.init_hidden() + last_action, reward = [-1] * self.n_agents, [0.] 
* self.n_agents + done, rew_log = [False] * self.n_agents, 0 + tm.add(action=last_action, **last_hiddens) + + while not all(done): + + out = self.forward(obs, last_action, **last_hiddens) + action = self.get_actions(out) + next_obs, reward, done, info = env.step(action) + next_obs = next_obs + if isinstance(done, bool): done = [done] * self.n_agents + + tm.add(observation=obs, action=action, reward=reward, done=done) + obs = next_obs + last_action = action + last_hiddens = dict(hidden_actor=out.get('hidden_actor', None), + hidden_critic=out.get('hidden_critic', None) + ) + + if len(tm) >= n_steps or all(done): + tm.add(observation=next_obs) + if self.__training: + with torch.inference_mode(False): + self.learn(tm) + tm.reset() + tm.add(action=last_action, **last_hiddens) + global_steps += 1 + rew_log += sum(reward) + reward_queue.extend(reward) + + if checkpointer is not None: + checkpointer.step([ + (f'agent#{i}', agent) + for i, agent in enumerate([self.net] if not isinstance(self.net, List) else self.net) + ]) + + if global_steps >= max_steps: break + print(f'reward at step: {global_steps} = {rew_log}') + + @torch.inference_mode(True) + def eval_loop(self, n_episodes, render=False): + env = instantiate_class(self.cfg['env']) + episode, results = 0, [] + while episode < n_episodes: + obs = env.reset() + last_hiddens = self.init_hidden() + last_action, reward = [-1] * self.n_agents, [0.] * self.n_agents + done, rew_log, eps_rew = [False] * self.n_agents, 0, torch.zeros(self.n_agents) + while not all(done): + if render: env.render() + + out = self.forward(obs, last_action, **last_hiddens) + action = self.get_actions(out) + next_obs, reward, done, info = env.step(action) + + if isinstance(done, bool): done = [done] * obs.shape[0] + obs = next_obs + last_action = action + last_hiddens = dict(hidden_actor=out.get('hidden_actor', None), + hidden_critic=out.get('hidden_critic', None) + ) + eps_rew += torch.tensor(reward) + results.append(eps_rew.tolist() + [sum(eps_rew).item()] + [episode]) + episode += 1 + agent_columns = [f'agent#{i}' for i in range(self.cfg['env']['n_agents'])] + results = pd.DataFrame(results, columns=agent_columns + ['sum', 'episode']) + results = pd.melt(results, id_vars=['episode'], value_vars=agent_columns + ['sum'], value_name='reward', var_name='agent') + return results + + @staticmethod + def compute_advantages(critic, reward, done, gamma): + return (reward + gamma * (1.0 - done) * critic[:, 1:].detach()) - critic[:, :-1] + + def actor_critic(self, tm, network, gamma, entropy_coef, vf_coef, **kwargs): + obs, actions, done, reward = tm.observation, tm.action, tm.done, tm.reward + + out = network(obs, actions, tm.hidden_actor, tm.hidden_critic) + logits = out['logits'][:, :-1] # last one only needed for v_{t+1} + critic = out['critic'] + + entropy_loss = Categorical(logits=logits).entropy().mean(-1) + advantages = self.compute_advantages(critic, reward, done, gamma) + value_loss = advantages.pow(2).mean(-1) # n_agent + + # policy loss + log_ap = torch.log_softmax(logits, -1) + log_ap = torch.gather(log_ap, dim=-1, index=actions[:, 1:].unsqueeze(-1)).squeeze() + a2c_loss = -(advantages.detach() * log_ap).mean(-1) + # weighted loss + loss = a2c_loss + vf_coef*value_loss - entropy_coef * entropy_loss + + return loss.mean() + + def learn(self, tm: MARLActorCriticMemory, **kwargs): + loss = self.actor_critic(tm, self.net, **self.cfg['algorithm'], **kwargs) + # remove next_obs, will be added in next iter + self.optimizer.zero_grad() + loss.backward() + 
torch.nn.utils.clip_grad_norm_(self.net.parameters(), 0.5) + self.optimizer.step() + diff --git a/algorithms/marl/example_config.yaml b/algorithms/marl/example_config.yaml new file mode 100644 index 0000000..b8a7bd3 --- /dev/null +++ b/algorithms/marl/example_config.yaml @@ -0,0 +1,24 @@ +agent: + classname: algorithms.marl.networks.RecurrentAC + n_agents: 2 + obs_emb_size: 96 + action_emb_size: 16 + hidden_size_actor: 64 + hidden_size_critic: 64 + use_agent_embedding: False +env: + classname: environments.factory.make + env_name: "DirtyFactory-v0" + n_agents: 2 + max_steps: 250 + pomdp_r: 2 + stack_n_frames: 0 + individual_rewards: True +method: algorithms.marl.LoopSEAC +algorithm: + gamma: 0.99 + entropy_coef: 0.01 + vf_coef: 0.5 + n_steps: 5 + max_steps: 1000000 + diff --git a/algorithms/marl/iac.py b/algorithms/marl/iac.py new file mode 100644 index 0000000..a04668f --- /dev/null +++ b/algorithms/marl/iac.py @@ -0,0 +1,58 @@ +import torch +from algorithms.marl.base_ac import BaseActorCritic +from algorithms.utils import instantiate_class +from pathlib import Path +from natsort import natsorted +from algorithms.marl.memory import MARLActorCriticMemory + + +class LoopIAC(BaseActorCritic): + + def __init__(self, cfg): + super(LoopIAC, self).__init__(cfg) + + def setup(self): + self.net = [ + instantiate_class(self.cfg['agent']) for _ in range(self.n_agents) + ] + self.optimizer = [ + torch.optim.RMSprop(self.net[ag_i].parameters(), lr=3e-4, eps=1e-5) for ag_i in range(self.n_agents) + ] + + def load_state_dict(self, path: Path): + paths = natsorted(list(path.glob('*.pt'))) + print(list(paths)) + for path, net in zip(paths, self.net): + net.load_state_dict(torch.load(path)) + + @staticmethod + def merge_dicts(ds): # todo could be recursive for more than 1 hierarchy + d = {} + for k in ds[0].keys(): + d[k] = [d[k] for d in ds] + return d + + def init_hidden(self): + ha = [net.init_hidden_actor() for net in self.net] + hc = [net.init_hidden_critic() for net in self.net] + return dict(hidden_actor=ha, hidden_critic=hc) + + def forward(self, observations, actions, hidden_actor, hidden_critic): + outputs = [ + net( + self._as_torch(observations[ag_i]).unsqueeze(0).unsqueeze(0), # agents x time + self._as_torch(actions[ag_i]).unsqueeze(0), + hidden_actor[ag_i], + hidden_critic[ag_i] + ) for ag_i, net in enumerate(self.net) + ] + return self.merge_dicts(outputs) + + def learn(self, tms: MARLActorCriticMemory, **kwargs): + for ag_i in range(self.n_agents): + tm, net = tms(ag_i), self.net[ag_i] + loss = self.actor_critic(tm, net, **self.cfg['algorithm'], **kwargs) + self.optimizer[ag_i].zero_grad() + loss.backward() + torch.nn.utils.clip_grad_norm_(net.parameters(), 0.5) + self.optimizer[ag_i].step() \ No newline at end of file diff --git a/algorithms/marl/memory.py b/algorithms/marl/memory.py new file mode 100644 index 0000000..2b15e07 --- /dev/null +++ b/algorithms/marl/memory.py @@ -0,0 +1,131 @@ +import torch +from typing import Union, List +from torch import Tensor +import numpy as np + + +class ActorCriticMemory(object): + def __init__(self): + self.reset() + + def reset(self): + self.__states = [] + self.__actions = [] + self.__rewards = [] + self.__dones = [] + self.__hiddens_actor = [] + self.__hiddens_critic = [] + + def __len__(self): + return len(self.__states) + + @property + def observation(self): + return torch.stack(self.__states, 0).unsqueeze(0) # 1 x timesteps x hidden dim + + @property + def hidden_actor(self): + if len(self.__hiddens_actor) == 1: + return self.__hiddens_actor[0] 
+ return torch.stack(self.__hiddens_actor, 0) # layers x timesteps x hidden dim + + @property + def hidden_critic(self): + if len(self.__hiddens_critic) == 1: + return self.__hiddens_critic[0] + return torch.stack(self.__hiddens_critic, 0) # layers x timesteps x hidden dim + + @property + def reward(self): + return torch.tensor(self.__rewards).float().unsqueeze(0) # 1 x timesteps + + @property + def action(self): + return torch.tensor(self.__actions).long().unsqueeze(0) # 1 x timesteps+1 + + @property + def done(self): + return torch.tensor(self.__dones).float().unsqueeze(0) # 1 x timesteps + + def add_observation(self, state: Union[Tensor, np.ndarray]): + self.__states.append(state if isinstance(state, Tensor) else torch.from_numpy(state)) + + def add_hidden_actor(self, hidden: Tensor): + # 1x layers x hidden dim + if len(hidden.shape) < 3: hidden = hidden.unsqueeze(0) + self.__hiddens_actor.append(hidden) + + def add_hidden_critic(self, hidden: Tensor): + # 1x layers x hidden dim + if len(hidden.shape) < 3: hidden = hidden.unsqueeze(0) + self.__hiddens_critic.append(hidden) + + def add_action(self, action: int): + self.__actions.append(action) + + def add_reward(self, reward: float): + self.__rewards.append(reward) + + def add_done(self, done: bool): + self.__dones.append(done) + + def add(self, **kwargs): + for k, v in kwargs.items(): + func = getattr(ActorCriticMemory, f'add_{k}') + func(self, v) + + +class MARLActorCriticMemory(object): + def __init__(self, n_agents): + self.n_agents = n_agents + self.memories = [ + ActorCriticMemory() for _ in range(n_agents) + ] + + def __call__(self, agent_i): + return self.memories[agent_i] + + def __len__(self): + return len(self.memories[0]) # todo add assertion check! + + def reset(self): + for mem in self.memories: + mem.reset() + + def add(self, **kwargs): + # todo try catch - print all possible functions + for agent_i in range(self.n_agents): + for k, v in kwargs.items(): + func = getattr(ActorCriticMemory, f'add_{k}') + func(self.memories[agent_i], v[agent_i]) + + @property + def observation(self): + all_obs = [mem.observation for mem in self.memories] + return torch.cat(all_obs, 0) # agents x timesteps+1 x ... + + @property + def action(self): + all_actions = [mem.action for mem in self.memories] + return torch.cat(all_actions, 0) # agents x timesteps+1 x ... + + @property + def done(self): + all_dones = [mem.done for mem in self.memories] + return torch.cat(all_dones, 0).float() # agents x timesteps x ... + + @property + def reward(self): + all_rewards = [mem.reward for mem in self.memories] + return torch.cat(all_rewards, 0).float() # agents x timesteps x ... 
+ + @property + def hidden_actor(self): + all_ha = [mem.hidden_actor for mem in self.memories] + return torch.cat(all_ha, 0) # agents x layers x x timesteps x hidden dim + + @property + def hidden_critic(self): + all_hc = [mem.hidden_critic for mem in self.memories] + return torch.cat(all_hc, 0) # agents x layers x timesteps x hidden dim + diff --git a/algorithms/marl/networks.py b/algorithms/marl/networks.py new file mode 100644 index 0000000..a60f9c4 --- /dev/null +++ b/algorithms/marl/networks.py @@ -0,0 +1,91 @@ +import torch +import torch.nn as nn +import numpy as np +import torch.nn.functional as F +from torch.nn.utils import spectral_norm + + +class RecurrentAC(nn.Module): + def __init__(self, observation_size, n_actions, obs_emb_size, + action_emb_size, hidden_size_actor, hidden_size_critic, + n_agents, use_agent_embedding=True): + super(RecurrentAC, self).__init__() + observation_size = np.prod(observation_size) + self.n_layers = 1 + self.use_agent_embedding = use_agent_embedding + self.hidden_size_actor = hidden_size_actor + self.hidden_size_critic = hidden_size_critic + self.action_emb_size = action_emb_size + self.obs_proj = nn.Linear(observation_size, obs_emb_size) + self.action_emb = nn.Embedding(n_actions+1, action_emb_size, padding_idx=0) + self.agent_emb = nn.Embedding(n_agents, action_emb_size) + mix_in_size = obs_emb_size+action_emb_size if not use_agent_embedding else obs_emb_size+n_agents*action_emb_size + self.mix = nn.Sequential(nn.Tanh(), + nn.Linear(mix_in_size, obs_emb_size), + nn.Tanh(), + nn.Linear(obs_emb_size, obs_emb_size) + ) + self.gru_actor = nn.GRU(obs_emb_size, hidden_size_actor, batch_first=True, num_layers=self.n_layers) + self.gru_critic = nn.GRU(obs_emb_size, hidden_size_critic, batch_first=True, num_layers=self.n_layers) + self.action_head = nn.Sequential( + spectral_norm(nn.Linear(hidden_size_actor, hidden_size_actor)), + nn.Tanh(), + nn.Linear(hidden_size_actor, n_actions) + ) + self.critic_head = nn.Sequential( + nn.Linear(hidden_size_critic, hidden_size_critic), + nn.Tanh(), + nn.Linear(hidden_size_critic, 1) + ) + #self.action_head[-1].weight.data.uniform_(-3e-3, 3e-3) + #self.action_head[-1].bias.data.uniform_(-3e-3, 3e-3) + + def init_hidden_actor(self): + return torch.zeros(1, self.n_layers, self.hidden_size_actor) + + def init_hidden_critic(self): + return torch.zeros(1, self.n_layers, self.hidden_size_critic) + + def forward(self, observations, actions, hidden_actor=None, hidden_critic=None): + n_agents, t, *_ = observations.shape + obs_emb = self.obs_proj(observations.view(n_agents, t, -1).float()) + action_emb = self.action_emb(actions+1) # shift by one due to padding idx + agent_emb = self.agent_emb( + torch.cat([torch.arange(0, n_agents, 1).view(-1, 1)]*t, 1) + ) + x_t = torch.cat((obs_emb, action_emb), -1) \ + if not self.use_agent_embedding else torch.cat((obs_emb, agent_emb, action_emb), -1) + + + mixed_x_t = self.mix(x_t) + output_p, _ = self.gru_actor(input=mixed_x_t, hx=hidden_actor.swapaxes(1, 0)) + output_c, _ = self.gru_critic(input=mixed_x_t, hx=hidden_critic.swapaxes(1, 0)) + + logits = self.action_head(output_p) + critic = self.critic_head(output_c).squeeze(-1) + return dict(logits=logits, critic=critic, hidden_actor=output_p, hidden_critic=output_c) + + + +class NormalizedLinear(nn.Linear): + def __init__(self, in_features: int, out_features: int, + device=None, dtype=None, trainable_magnitude=False): + super(NormalizedLinear, self).__init__(in_features, out_features, False, device, dtype) + self.d_sqrt = in_features**0.5 
+ self.trainable_magnitude = trainable_magnitude + self.scale = nn.Parameter(torch.tensor([1.]), requires_grad=trainable_magnitude) + + def forward(self, input): + normalized_input = F.normalize(input, dim=-1, p=2, eps=1e-5) + normalized_weight = F.normalize(self.weight, dim=-1, p=2, eps=1e-5) + return F.linear(normalized_input, normalized_weight) * self.d_sqrt * self.scale + + +class L2Norm(nn.Module): + def __init__(self, in_features, trainable_magnitude=False): + super(L2Norm, self).__init__() + self.d_sqrt = in_features**0.5 + self.scale = nn.Parameter(torch.tensor([1.]), requires_grad=trainable_magnitude) + + def forward(self, x): + return F.normalize(x, dim=-1, p=2, eps=1e-5) * self.d_sqrt * self.scale \ No newline at end of file diff --git a/algorithms/marl/seac.py b/algorithms/marl/seac.py new file mode 100644 index 0000000..5ed9a24 --- /dev/null +++ b/algorithms/marl/seac.py @@ -0,0 +1,55 @@ +import torch +from torch.distributions import Categorical +from algorithms.marl.iac import LoopIAC +from algorithms.marl.memory import MARLActorCriticMemory + + +class LoopSEAC(LoopIAC): + def __init__(self, cfg): + super(LoopSEAC, self).__init__(cfg) + + def actor_critic(self, tm, networks, gamma, entropy_coef, vf_coef, **kwargs): + obs, actions, done, reward = tm.observation, tm.action, tm.done, tm.reward + outputs = [net(obs, actions, tm.hidden_actor, tm.hidden_critic) for net in networks] + + with torch.inference_mode(True): + true_action_logp = torch.stack([ + torch.log_softmax(out['logits'][ag_i, :-1], -1) + .gather(index=actions[ag_i, 1:, None], dim=-1) + for ag_i, out in enumerate(outputs) + ], 0).squeeze() + + losses = [] + + for ag_i, out in enumerate(outputs): + logits = out['logits'][:, :-1] # last one only needed for v_{t+1} + critic = out['critic'] + + entropy_loss = Categorical(logits=logits[ag_i]).entropy().mean() + advantages = self.compute_advantages(critic, reward, done, gamma) + + # policy loss + log_ap = torch.log_softmax(logits, -1) + log_ap = torch.gather(log_ap, dim=-1, index=actions[:, 1:].unsqueeze(-1)).squeeze() + + # importance weights + iw = (log_ap - true_action_logp).exp().detach() # importance_weights + + a2c_loss = (-iw*log_ap * advantages.detach()).mean(-1) + + + value_loss = (iw*advantages.pow(2)).mean(-1) # n_agent + + # weighted loss + loss = (a2c_loss + vf_coef*value_loss - entropy_coef * entropy_loss).mean() + losses.append(loss) + + return losses + + def learn(self, tms: MARLActorCriticMemory, **kwargs): + losses = self.actor_critic(tms, self.net, **self.cfg['algorithm'], **kwargs) + for ag_i, loss in enumerate(losses): + self.optimizer[ag_i].zero_grad() + loss.backward() + torch.nn.utils.clip_grad_norm_(self.net[ag_i].parameters(), 0.5) + self.optimizer[ag_i].step() \ No newline at end of file diff --git a/algorithms/marl/snac.py b/algorithms/marl/snac.py new file mode 100644 index 0000000..3d312a8 --- /dev/null +++ b/algorithms/marl/snac.py @@ -0,0 +1,32 @@ +from algorithms.marl.base_ac import BaseActorCritic +import torch +from torch.distributions import Categorical +from pathlib import Path + + +class LoopSNAC(BaseActorCritic): + def __init__(self, cfg): + super().__init__(cfg) + + def load_state_dict(self, path: Path): + path2weights = list(path.glob('*.pt')) + assert len(path2weights) == 1, f'Expected a single set of weights but got {len(path2weights)}' + self.net.load_state_dict(torch.load(path2weights[0])) + + def init_hidden(self): + hidden_actor = self.net.init_hidden_actor() + hidden_critic = self.net.init_hidden_critic() + return 
dict(hidden_actor=torch.cat([hidden_actor] * self.n_agents, 0), + hidden_critic=torch.cat([hidden_critic] * self.n_agents, 0) + ) + + def get_actions(self, out): + actions = Categorical(logits=out['logits']).sample().squeeze() + return actions + + def forward(self, observations, actions, hidden_actor, hidden_critic): + out = self.net(self._as_torch(observations).unsqueeze(1), + self._as_torch(actions).unsqueeze(1), + hidden_actor, hidden_critic + ) + return out \ No newline at end of file diff --git a/algorithms/q_learner.py b/algorithms/q_learner.py deleted file mode 100644 index 04f3a86..0000000 --- a/algorithms/q_learner.py +++ /dev/null @@ -1,127 +0,0 @@ -from typing import Union -import gym -import torch -import torch.nn as nn -import numpy as np -from collections import deque -from pathlib import Path -import yaml -from algorithms.common import BaseLearner, BaseBuffer, soft_update, Experience - - -class QLearner(BaseLearner): - def __init__(self, q_net, target_q_net, env, buffer_size=1e5, target_update=3000, eps_end=0.05, n_agents=1, - gamma=0.99, train_every=('step', 4), n_grad_steps=1, tau=1.0, max_grad_norm=10, weight_decay=1e-2, - exploration_fraction=0.2, batch_size=64, lr=1e-4, reg_weight=0.0, eps_start=1): - super(QLearner, self).__init__(env, n_agents, train_every, n_grad_steps) - self.q_net = q_net - self.target_q_net = target_q_net - self.target_q_net.eval() - #soft_update(cls.q_net, cls.target_q_net, tau=1.0) - self.buffer = BaseBuffer(buffer_size) - self.target_update = target_update - self.eps = eps_start - self.eps_start = eps_start - self.eps_end = eps_end - self.exploration_fraction = exploration_fraction - self.batch_size = batch_size - self.gamma = gamma - self.tau = tau - self.reg_weight = reg_weight - self.weight_decay = weight_decay - self.lr = lr - self.optimizer = torch.optim.AdamW(self.q_net.parameters(), lr=self.lr, weight_decay=self.weight_decay) - self.max_grad_norm = max_grad_norm - self.running_reward = deque(maxlen=5) - self.running_loss = deque(maxlen=5) - self.n_updates = 0 - - def save(self, path): - path = Path(path) # no-op if already instance of Path - path.mkdir(parents=True, exist_ok=True) - hparams = {k: v for k, v in self.__dict__.items() if not(isinstance(v, BaseBuffer) or - isinstance(v, torch.optim.Optimizer) or - isinstance(v, gym.Env) or - isinstance(v, nn.Module)) - } - hparams.update({'class': self.__class__.__name__}) - with (path / 'hparams.yaml').open('w') as outfile: - yaml.dump(hparams, outfile) - torch.save(self.q_net, path / 'q_net.pt') - - def anneal_eps(self, step, n_steps): - fraction = min(float(step) / int(self.exploration_fraction*n_steps), 1.0) - self.eps = 1 + fraction * (self.eps_end - 1) - - def get_action(self, obs) -> Union[int, np.ndarray]: - o = torch.from_numpy(obs).unsqueeze(0) if self.n_agents <= 1 else torch.from_numpy(obs) - if np.random.rand() > self.eps: - action = self.q_net.act(o.float()) - else: - action = np.array([self.env.action_space.sample() for _ in range(self.n_agents)]) - return action - - def on_new_experience(self, experience): - self.buffer.add(experience) - - def on_step_end(self, n_steps): - self.anneal_eps(self.step, n_steps) - if self.step % self.target_update == 0: - print('UPDATE') - soft_update(self.q_net, self.target_q_net, tau=self.tau) - - def _training_routine(self, obs, next_obs, action): - current_q_values = self.q_net(obs) - current_q_values = torch.gather(current_q_values, dim=-1, index=action) - next_q_values_raw = self.target_q_net(next_obs).max(dim=-1)[0].reshape(-1, 1).detach() 
- return current_q_values, next_q_values_raw - - def _backprop_loss(self, loss): - # log loss - self.running_loss.append(loss.item()) - # Optimize the model - self.optimizer.zero_grad() - loss.backward() - torch.nn.utils.clip_grad_norm_(self.q_net.parameters(), self.max_grad_norm) - self.optimizer.step() - - def train(self): - if len(self.buffer) < self.batch_size: return - for _ in range(self.n_grad_steps): - experience = self.buffer.sample(self.batch_size, cer=self.train_every[-1]) - pred_q, target_q_raw = self._training_routine(experience.observation, - experience.next_observation, - experience.action) - target_q = experience.reward + (1 - experience.done) * self.gamma * target_q_raw - loss = torch.mean(self.reg_weight * pred_q + torch.pow(pred_q - target_q, 2)) - self._backprop_loss(loss) - - - -if __name__ == '__main__': - from environments.factory.factory_dirt import DirtFactory, DirtProperties, MovementProperties - from algorithms.common import BaseDDQN, BaseICM - from algorithms.m_q_learner import MQLearner, MQICMLearner - from algorithms.vdn_learner import VDNLearner - - N_AGENTS = 1 - - with (Path(f'../environments/factory/env_default_param.yaml')).open('r') as f: - env_kwargs = yaml.load(f, Loader=yaml.FullLoader) - - env = DirtFactory(**env_kwargs) - obs_shape = np.prod(env.observation_space.shape) - n_actions = env.action_space.n - - dqn, target_dqn = BaseDDQN(backbone_dims=[obs_shape, 128, 128], advantage_dims=[128, n_actions], value_dims=[128, 1], activation='leaky_relu'),\ - BaseDDQN(backbone_dims=[obs_shape, 128, 128], advantage_dims=[128, n_actions], value_dims=[128, 1], activation='leaky_relu') - - icm = BaseICM(backbone_dims=[obs_shape, 64, 32], head_dims=[2*32, 64, n_actions]) - - learner = MQICMLearner(dqn, target_dqn, env, 50000, icm=icm, - target_update=5000, lr=0.0007, gamma=0.99, n_agents=N_AGENTS, tau=0.95, max_grad_norm=10, - train_every=('step', 4), eps_end=0.025, n_grad_steps=1, reg_weight=0.1, exploration_fraction=0.25, - batch_size=64, weight_decay=1e-3 - ) - #learner.save(Path(__file__).parent / 'test' / 'testexperiment1337') - learner.learn(100000) diff --git a/algorithms/reg_dqn.py b/algorithms/reg_dqn.py deleted file mode 100644 index 34ec42b..0000000 --- a/algorithms/reg_dqn.py +++ /dev/null @@ -1,52 +0,0 @@ -import numpy as np -import torch -import stable_baselines3 as sb3 -from stable_baselines3.common import logger - - -class RegDQN(sb3.dqn.DQN): - def __init__(self, *args, reg_weight=0.1, **kwargs): - super().__init__(*args, **kwargs) - self.reg_weight = reg_weight - - def train(self, gradient_steps: int, batch_size: int = 100) -> None: - # Update learning rate according to schedule - self._update_learning_rate(self.policy.optimizer) - - losses = [] - for _ in range(gradient_steps): - # Sample replay buffer - replay_data = self.replay_buffer.sample(batch_size, env=self._vec_normalize_env) - - with torch.no_grad(): - # Compute the next Q-values using the target network - next_q_values = self.q_net_target(replay_data.next_observations) - # Follow greedy policy: use the one with the highest value - next_q_values, _ = next_q_values.max(dim=1) - # Avoid potential broadcast issue - next_q_values = next_q_values.reshape(-1, 1) - # 1-step TD target - target_q_values = replay_data.rewards + (1 - replay_data.dones) * self.gamma * next_q_values - - # Get current Q-values estimates - current_q_values = self.q_net(replay_data.observations) - - # Retrieve the q-values for the actions from the replay buffer - current_q_values = torch.gather(current_q_values, 
dim=1, index=replay_data.actions.long()) - - delta = current_q_values - target_q_values - loss = torch.mean(self.reg_weight * current_q_values + torch.pow(delta, 2)) - losses.append(loss.item()) - - # Optimize the policy - self.policy.optimizer.zero_grad() - loss.backward() - # Clip gradient norm - torch.nn.utils.clip_grad_norm_(self.policy.parameters(), self.max_grad_norm) - self.policy.optimizer.step() - - # Increase update counter - self._n_updates += gradient_steps - - logger.record("train/n_updates", self._n_updates, exclude="tensorboard") - logger.record("train/loss", np.mean(losses)) \ No newline at end of file diff --git a/algorithms/utils.py b/algorithms/utils.py index d72046a..f79429c 100644 --- a/algorithms/utils.py +++ b/algorithms/utils.py @@ -3,14 +3,51 @@ import torch import numpy as np import yaml from pathlib import Path -from salina import instantiate_class -from salina import TAgent -from salina.agents.gyma import ( - AutoResetGymAgent, - _torch_type, - _format_frame, - _torch_cat_dict -) + + +def load_class(classname): + from importlib import import_module + module_path, class_name = classname.rsplit(".", 1) + module = import_module(module_path) + c = getattr(module, class_name) + return c + + +def instantiate_class(arguments): + from importlib import import_module + + d = dict(arguments) + classname = d["classname"] + del d["classname"] + module_path, class_name = classname.rsplit(".", 1) + module = import_module(module_path) + c = getattr(module, class_name) + return c(**d) + + +def get_class(arguments): + from importlib import import_module + + if isinstance(arguments, dict): + classname = arguments["classname"] + module_path, class_name = classname.rsplit(".", 1) + module = import_module(module_path) + c = getattr(module, class_name) + return c + else: + classname = arguments.classname + module_path, class_name = classname.rsplit(".", 1) + module = import_module(module_path) + c = getattr(module, class_name) + return c + + +def get_arguments(arguments): + from importlib import import_module + d = dict(arguments) + if "classname" in d: + del d["classname"] + return d def load_yaml_file(path: Path): @@ -21,90 +58,29 @@ def load_yaml_file(path: Path): def add_env_props(cfg): env = instantiate_class(cfg['env'].copy()) - cfg['agent'].update(dict(observation_size=env.observation_space.shape, + cfg['agent'].update(dict(observation_size=list(env.observation_space.shape), n_actions=env.action_space.n)) +class Checkpointer(object): + def __init__(self, experiment_name, root, config, total_steps, n_checkpoints): + self.path = root / experiment_name + self.checkpoint_indices = list(np.linspace(1, total_steps, n_checkpoints, dtype=int) - 1) + self.__current_checkpoint = 0 + self.__current_step = 0 + self.path.mkdir(exist_ok=True, parents=True) + with (self.path / 'config.yaml').open('w') as outfile: + yaml.dump(config, outfile, default_flow_style=False) + def save_experiment(self, name: str, model): + cpt_path = self.path / f'checkpoint_{self.__current_checkpoint}' + cpt_path.mkdir(exist_ok=True, parents=True) + torch.save(model.state_dict(), cpt_path / f'{name}.pt') -AGENT_PREFIX = 'agent#' -REWARD = 'reward' -CUMU_REWARD = 'cumulated_reward' -OBS = 'env_obs' -SEP = '_' -ACTION = 'action' - - -def access_str(agent_i, name, prefix=''): - return f'{prefix}{AGENT_PREFIX}{agent_i}{SEP}{name}' - - -class AutoResetGymMultiAgent(AutoResetGymAgent): - def __init__(self, *args, **kwargs): - super(AutoResetGymMultiAgent, self).__init__(*args, **kwargs) - - def per_agent_values(self, 
name, values): - return {access_str(agent_i, name): value - for agent_i, value in zip(range(self.n_agents), values)} - - def _initialize_envs(self, n): - super()._initialize_envs(n) - n_agents_list = [self.envs[i].unwrapped.n_agents for i in range(n)] - assert all(n_agents == n_agents_list[0] for n_agents in n_agents_list), \ - 'All envs must have the same number of agents.' - self.n_agents = n_agents_list[0] - - def _reset(self, k, save_render): - ret = super()._reset(k, save_render) - obs = ret['env_obs'].squeeze() - self.cumulated_reward[k] = [0.0]*self.n_agents - obs = self.per_agent_values(OBS, [_format_frame(obs[i]) for i in range(self.n_agents)]) - cumu_rew = self.per_agent_values(CUMU_REWARD, torch.zeros(self.n_agents, 1).float().unbind()) - rewards = self.per_agent_values(REWARD, torch.zeros(self.n_agents, 1).float().unbind()) - ret.update(cumu_rew) - ret.update(rewards) - ret.update(obs) - for remove in ['env_obs', 'cumulated_reward', 'reward']: - del ret[remove] - return ret - - def _step(self, k, action, save_render): - self.timestep[k] += 1 - env = self.envs[k] - if len(action.size()) == 0: - action = action.item() - assert isinstance(action, int) - else: - action = np.array(action.tolist()) - o, r, d, _ = env.step(action) - self.cumulated_reward[k] = [x+y for x, y in zip(r, self.cumulated_reward[k])] - observation = self.per_agent_values(OBS, [_format_frame(o[i]) for i in range(self.n_agents)]) - if d: - self.is_running[k] = False - if save_render: - image = env.render(mode="image").unsqueeze(0) - observation["rendering"] = image - rewards = self.per_agent_values(REWARD, torch.tensor(r).float().view(-1, 1).unbind()) - cumulated_rewards = self.per_agent_values(CUMU_REWARD, torch.tensor(self.cumulated_reward[k]).float().view(-1, 1).unbind()) - ret = { - **observation, - **rewards, - **cumulated_rewards, - "done": torch.tensor([d]), - "initial_state": torch.tensor([False]), - "timestep": torch.tensor([self.timestep[k]]) - } - return _torch_type(ret) - - -class CombineActionsAgent(TAgent): - def __init__(self): - super().__init__() - self.pattern = fr'^{AGENT_PREFIX}\d{SEP}{ACTION}$' - - def forward(self, t, **kwargs): - keys = list(self.workspace.keys()) - action_keys = sorted([k for k in keys if bool(re.match(self.pattern, k))]) - actions = torch.cat([self.get((k, t)) for k in action_keys], 0) - actions = actions if len(action_keys) <= 1 else actions.unsqueeze(0) - self.set((f'action', t), actions) + def step(self, to_save): + if self.__current_step in self.checkpoint_indices: + print(f'Checkpointing #{self.__current_checkpoint}') + for name, model in to_save: + self.save_experiment(name, model) + self.__current_checkpoint += 1 + self.__current_step += 1 \ No newline at end of file diff --git a/algorithms/vdn_learner.py b/algorithms/vdn_learner.py deleted file mode 100644 index d2e7067..0000000 --- a/algorithms/vdn_learner.py +++ /dev/null @@ -1,55 +0,0 @@ -from typing import Union -import torch -import numpy as np -import pandas as pd -from algorithms.q_learner import QLearner - - -class VDNLearner(QLearner): - def __init__(self, *args, **kwargs): - super(VDNLearner, self).__init__(*args, **kwargs) - assert self.n_agents >= 2, 'VDN requires more than one agent, use QLearner instead' - - def get_action(self, obs) -> Union[int, np.ndarray]: - o = torch.from_numpy(obs).unsqueeze(0) if self.n_agents <= 1 else torch.from_numpy(obs) - eps = np.random.rand(self.n_agents) - greedy = eps > self.eps - agent_actions = None - actions = [] - for i in range(self.n_agents): - if greedy[i]: - 
if agent_actions is None: agent_actions = self.q_net.act(o.float()) - action = agent_actions[i] - else: - action = self.env.action_space.sample() - actions.append(action) - return np.array(actions) - - def train(self): - if len(self.buffer) < self.batch_size: return - for _ in range(self.n_grad_steps): - experience = self.buffer.sample(self.batch_size, cer=self.train_every_n_steps) - pred_q, target_q_raw = torch.zeros((self.batch_size, 1)), torch.zeros((self.batch_size, 1)) - for agent_i in range(self.n_agents): - q_values, next_q_values_raw = self._training_routine(experience.observation[:, agent_i], - experience.next_observation[:, agent_i], - experience.action[:, agent_i].unsqueeze(-1)) - pred_q += q_values - target_q_raw += next_q_values_raw - target_q = experience.reward + (1 - experience.done) * self.gamma * target_q_raw - loss = torch.mean(self.reg_weight * pred_q + torch.pow(pred_q - target_q, 2)) - self._backprop_loss(loss) - - def evaluate(self, n_episodes=100, render=False): - with torch.no_grad(): - data = [] - for eval_i in range(n_episodes): - obs, done = self.env.reset(), False - while not done: - action = self.get_action(obs) - next_obs, reward, done, info = self.env.step(action) - if render: self.env.render() - obs = next_obs # srsly i'm so stupid - info.update({'reward': reward, 'eval_episode': eval_i}) - data.append(info) - return pd.DataFrame(data).fillna(0) diff --git a/environments/factory/__init__.py b/environments/factory/__init__.py index 23346f9..a34815a 100644 --- a/environments/factory/__init__.py +++ b/environments/factory/__init__.py @@ -1,22 +1,25 @@ -def make(env_name, pomdp_r=2, max_steps=400, stack_n_frames=3, n_agents=1, individual_rewards=False): +def make(env_name, pomdp_r=2, max_steps=400, stack_n_frames=3, n_agents=1, individual_rewards=False): import yaml from pathlib import Path from environments.factory.combined_factories import DirtItemFactory from environments.factory.factory_item import ItemFactory, ItemProperties - from environments.factory.factory_dirt import DirtProperties, DirtFactory - from environments.utility_classes import MovementProperties, ObservationProperties, AgentRenderOptions + from environments.factory.factory_dirt import DirtProperties, DirtFactory, RewardsDirt + from environments.utility_classes import AgentRenderOptions with (Path(__file__).parent / 'levels' / 'parameters' / f'{env_name}.yaml').open('r') as stream: dictionary = yaml.load(stream, Loader=yaml.FullLoader) - obs_props = ObservationProperties(render_agents=AgentRenderOptions.COMBINED, - frames_to_stack=stack_n_frames, pomdp_r=pomdp_r) + obs_props = dict(render_agents=AgentRenderOptions.COMBINED, + pomdp_r=pomdp_r, + indicate_door_area=True, + show_global_position_info=False, + frames_to_stack=stack_n_frames) - factory_kwargs = dict(n_agents=n_agents, individual_rewards=individual_rewards, - max_steps=max_steps, obs_prop=obs_props, - mv_prop=MovementProperties(**dictionary['movement_props']), - dirt_prop=DirtProperties(**dictionary['dirt_props']), - record_episodes=False, verbose=False, **dictionary['factory_props'] + factory_kwargs = dict(**dictionary, + n_agents=n_agents, + individual_rewards=individual_rewards, + max_steps=max_steps, + obs_prop=obs_props, + verbose=False, ) - return DirtFactory(**factory_kwargs).__enter__() diff --git a/environments/factory/levels/parameters/DirtyFactory-v0.yaml b/environments/factory/levels/parameters/DirtyFactory-v0.yaml index 4bd0f2a..e9f7de4 100644 --- a/environments/factory/levels/parameters/DirtyFactory-v0.yaml +++ 
b/environments/factory/levels/parameters/DirtyFactory-v0.yaml @@ -1,8 +1,12 @@ -movement_props: +parse_doors: True +doors_have_area: True +done_at_collision: False +level_name: "rooms" +mv_prop: allow_diagonal_movement: True allow_square_movement: True allow_no_op: False -dirt_props: +dirt_prop: initial_dirt_ratio: 0.35 initial_dirt_spawn_r_var : 0.1 clean_amount: 0.34 @@ -12,8 +16,15 @@ dirt_props: spawn_frequency: 0 max_spawn_ratio: 0.05 dirt_smear_amount: 0.0 - agent_can_interact: True -factory_props: - parse_doors: True - level_name: "rooms" - doors_have_area: False + done_when_clean: True +rewards_base: + MOVEMENTS_VALID: 0 + MOVEMENTS_FAIL: 0 + NOOP: 0 + USE_DOOR_VALID: 0 + USE_DOOR_FAIL: 0 + COLLISION: 0 +rewards_dirt: + CLEAN_UP_VALID: 1 + CLEAN_UP_FAIL: 0 + CLEAN_UP_LAST_PIECE: 5 \ No newline at end of file diff --git a/requirements.txt b/requirements.txt index 270a9fd..24d52cf 100644 --- a/requirements.txt +++ b/requirements.txt @@ -6,7 +6,7 @@ matplotlib>=3.4.1 stable-baselines3>=1.0 pygame>=2.1.0 gym>=0.18.0 -networkx>=2.6.1 +networkx>=2.6.3 simplejson>=3.17.5 PyYAML>=6.0 -git+https://github.com/facebookresearch/salina.git@main#egg=salina \ No newline at end of file +einops \ No newline at end of file diff --git a/studies/normalization_study.py b/studies/normalization_study.py new file mode 100644 index 0000000..ccfb67c --- /dev/null +++ b/studies/normalization_study.py @@ -0,0 +1,24 @@ +from algorithms.utils import Checkpointer +from pathlib import Path +from algorithms.utils import load_yaml_file, add_env_props, instantiate_class, load_class +from algorithms.marl import LoopSNAC, LoopIAC, LoopSEAC + + +#study_root = Path(__file__).parent / 'curious_study' +study_root = Path('/Users/romue/PycharmProjects/EDYS/algorithms/marl') + +for i in range(0, 5): + for name in ['example_config']: + cfg = load_yaml_file(study_root / f'{name}.yaml') + add_env_props(cfg) + + env = instantiate_class(cfg['env']) + net = instantiate_class(cfg['agent']) + max_steps = cfg['algorithm']['max_steps'] + n_steps = cfg['algorithm']['n_steps'] + + checkpointer = Checkpointer(f'{name}#{i}', study_root, cfg, max_steps, 250) + + loop = load_class(cfg['method'])(cfg) + df = loop.train_loop(checkpointer) + diff --git a/studies/playground_file.py b/studies/playground_file.py new file mode 100644 index 0000000..e32f60a --- /dev/null +++ b/studies/playground_file.py @@ -0,0 +1,32 @@ +import numpy as np +import pandas as pd +from pathlib import Path +import matplotlib.pyplot as plt +import seaborn as sns + +study_root = Path(__file__).parent / 'entropy_study' +names_all = ['basic_gru', 'layernorm_gru', 'spectralnorm_gru', 'nonorm_gru'] +names_only_1 = ['L2OnlyAh_gru', 'L2OnlyChAh_gru', 'L2OnlyMix_gru', 'basic_gru'] +names_only_2 = ['L2NoCh_gru', 'L2NoAh_gru', 'nomix_gru', 'basic_gru'] + +names = names_only_2 +#names = ['nonorm_gru'] +# /Users/romue/PycharmProjects/EDYS/studies/normalization_study/basic_gru#3 +csvs = [] +for name in ['basic_gru', 'nonorm_gru', 'spectralnorm_gru']: + for run in range(0, 1): + try: + df = pd.read_csv(study_root / f'{name}#{run}' / 'results.csv') + df = df[df.agent == 'sum'] + df = df.groupby(['checkpoint', 'run']).mean().reset_index() + df['method'] = name + df['run_'] = run + + df.reward = df.reward.rolling(15).mean() + csvs.append(df) + except Exception as e: + print(f'skipped {run}\t {name}') + +csvs = pd.concat(csvs).rename(columns={"checkpoint": "steps*2e3", "B": "c"}) +sns.lineplot(data=csvs, x='steps*2e3', y='reward', hue='method', palette='husl', ci='sd', linewidth=1.8) 
+plt.savefig('entropy.png') \ No newline at end of file diff --git a/studies/sat_mad.py b/studies/sat_mad.py deleted file mode 100644 index 6bf3e22..0000000 --- a/studies/sat_mad.py +++ /dev/null @@ -1,139 +0,0 @@ -from salina.agents.gyma import AutoResetGymAgent -from salina.agents import Agents, TemporalAgent -from salina.rl.functional import _index, gae -import torch -import torch.nn as nn -from torch.distributions import Categorical -from salina import TAgent, Workspace, get_arguments, get_class, instantiate_class -from pathlib import Path -import numpy as np -from tqdm import tqdm -import time -from algorithms.utils import ( - add_env_props, - load_yaml_file, - CombineActionsAgent, - AutoResetGymMultiAgent, - access_str, - AGENT_PREFIX, REWARD, CUMU_REWARD, OBS, SEP -) - - -class A2CAgent(TAgent): - def __init__(self, observation_size, hidden_size, n_actions, agent_id): - super().__init__() - observation_size = np.prod(observation_size) - print(observation_size) - self.agent_id = agent_id - self.model = nn.Sequential( - nn.Flatten(), - nn.Linear(observation_size, hidden_size), - nn.ELU(), - nn.Linear(hidden_size, hidden_size), - nn.ELU(), - nn.Linear(hidden_size, hidden_size), - nn.ELU() - ) - self.action_head = nn.Linear(hidden_size, n_actions) - self.critic_head = nn.Linear(hidden_size, 1) - - def get_obs(self, t): - observation = self.get((f'env/{access_str(self.agent_id, OBS)}', t)) - return observation - - def forward(self, t, stochastic, **kwargs): - observation = self.get_obs(t) - features = self.model(observation) - scores = self.action_head(features) - probs = torch.softmax(scores, dim=-1) - critic = self.critic_head(features).squeeze(-1) - if stochastic: - action = torch.distributions.Categorical(probs).sample() - else: - action = probs.argmax(1) - self.set((f'{access_str(self.agent_id, "action")}', t), action) - self.set((f'{access_str(self.agent_id, "action_probs")}', t), probs) - self.set((f'{access_str(self.agent_id, "critic")}', t), critic) - - -if __name__ == '__main__': - # Setup workspace - uid = time.time() - workspace = Workspace() - n_agents = 2 - - # load config - cfg = load_yaml_file(Path(__file__).parent / 'sat_mad.yaml') - add_env_props(cfg) - cfg['env'].update({'n_agents': n_agents}) - - # instantiate agent and env - env_agent = AutoResetGymMultiAgent( - get_class(cfg['env']), - get_arguments(cfg['env']), - n_envs=1 - ) - - a2c_agents = [instantiate_class({**cfg['agent'], - 'agent_id': agent_id}) - for agent_id in range(n_agents)] - - # combine agents - acquisition_agent = TemporalAgent(Agents(env_agent, *a2c_agents, CombineActionsAgent())) - acquisition_agent.seed(69) - - # optimizers & other parameters - cfg_optim = cfg['algorithm']['optimizer'] - optimizers = [get_class(cfg_optim)(a2c_agent.parameters(), **get_arguments(cfg_optim)) - for a2c_agent in a2c_agents] - n_timesteps = cfg['algorithm']['n_timesteps'] - - # Decision making loop - best = -float('inf') - with tqdm(range(int(cfg['algorithm']['max_epochs'] / n_timesteps))) as pbar: - for epoch in pbar: - workspace.zero_grad() - if epoch > 0: - workspace.copy_n_last_steps(1) - acquisition_agent(workspace, t=1, n_steps=n_timesteps-1, stochastic=True) - else: - acquisition_agent(workspace, t=0, n_steps=n_timesteps, stochastic=True) - - for agent_id in range(n_agents): - critic, done, action_probs, reward, action = workspace[ - access_str(agent_id, 'critic'), - "env/done", - access_str(agent_id, 'action_probs'), - access_str(agent_id, 'reward', 'env/'), - access_str(agent_id, 'action') - ] - td = 
gae(critic, reward, done, 0.98, 0.25) - td_error = td ** 2 - critic_loss = td_error.mean() - entropy_loss = Categorical(action_probs).entropy().mean() - action_logp = _index(action_probs, action).log() - a2c_loss = action_logp[:-1] * td.detach() - a2c_loss = a2c_loss.mean() - loss = ( - -0.001 * entropy_loss - + 1.0 * critic_loss - - 0.1 * a2c_loss - ) - optimizer = optimizers[agent_id] - optimizer.zero_grad() - loss.backward() - #torch.nn.utils.clip_grad_norm_(a2c_agents[agent_id].parameters(), .5) - optimizer.step() - - # Compute the cumulated reward on final_state - rews = '' - for agent_i in range(n_agents): - creward = workspace['env/'+access_str(agent_i, CUMU_REWARD)] - creward = creward[done] - if creward.size()[0] > 0: - rews += f'{AGENT_PREFIX}{agent_i}: {creward.mean().item():.2f} | ' - """if cum_r > best: - torch.save(a2c_agent.state_dict(), Path(__file__).parent / f'agent_{uid}.pt') - best = cum_r""" - pbar.set_description(rews, refresh=True) - diff --git a/studies/sat_mad.yaml b/studies/sat_mad.yaml deleted file mode 100644 index 7e7ca71..0000000 --- a/studies/sat_mad.yaml +++ /dev/null @@ -1,27 +0,0 @@ -agent: - classname: studies.sat_mad.A2CAgent - observation_size: 4*5*5 - hidden_size: 128 - n_actions: 10 - -env: - classname: environments.factory.make - env_name: "DirtyFactory-v0" - n_agents: 1 - pomdp_r: 2 - max_steps: 400 - stack_n_frames: 3 - individual_rewards: True - -algorithm: - max_epochs: 1000000 - n_envs: 1 - n_timesteps: 10 - discount_factor: 0.99 - entropy_coef: 0.01 - critic_coef: 1.0 - gae: 0.25 - optimizer: - classname: torch.optim.Adam - lr: 0.0003 - weight_decay: 0.0 \ No newline at end of file diff --git a/studies/viz_policy.py b/studies/viz_policy.py new file mode 100644 index 0000000..b8ffd78 --- /dev/null +++ b/studies/viz_policy.py @@ -0,0 +1,34 @@ +import pandas as pd +from algorithms.marl import LoopSNAC, LoopIAC, LoopSEAC +from pathlib import Path +from algorithms.utils import load_yaml_file +from tqdm import trange +study = 'curious_study' +study_root = Path(__file__).parent / study + +#['L2NoAh_gru', 'L2NoCh_gru', 'nomix_gru']: +render = True +eval_eps = 3 +for run in range(0, 5): + for name in ['basic_gru']:#['L2OnlyAh_gru', 'L2OnlyChAh_gru', 'L2OnlyMix_gru']: #['layernorm_gru', 'basic_gru', 'nonorm_gru', 'spectralnorm_gru']: + cfg = load_yaml_file(Path(__file__).parent / study / f'{name}.yaml') + p_root = Path(study_root / f'{name}#{run}') + dfs = [] + for i in trange(500): + path = p_root / f'checkpoint_{i}' + + snac = LoopSEAC(cfg) + snac.load_state_dict(path) + snac.eval() + + df = snac.eval_loop(render=render, n_episodes=eval_eps) + df['checkpoint'] = i + dfs.append(df) + + results = pd.concat(dfs) + results['run'] = run + results.to_csv(p_root / 'results.csv', index=False) + +#sns.lineplot(data=results, x='checkpoint', y='reward', hue='agent', palette='husl') + +#plt.savefig(f'{experiment_name}.png') \ No newline at end of file diff --git a/studies/viz_salina.py b/studies/viz_salina.py deleted file mode 100644 index fcf6f32..0000000 --- a/studies/viz_salina.py +++ /dev/null @@ -1,39 +0,0 @@ -from salina.agents import Agents, TemporalAgent -import torch -from salina import Workspace, get_arguments, get_class, instantiate_class -from pathlib import Path -from salina.agents.gyma import GymAgent -import time -from algorithms.utils import load_yaml_file, add_env_props - -if __name__ == '__main__': - # Setup workspace - uid = time.time() - workspace = Workspace() - weights = 
Path('/Users/romue/PycharmProjects/EDYS/studies/agent_1636994369.145843.pt') - - cfg = load_yaml_file(Path(__file__).parent / 'sat_mad.yaml') - add_env_props(cfg) - cfg['env'].update({'n_agents': 2}) - - # instantiate agent and env - env_agent = GymAgent( - get_class(cfg['env']), - get_arguments(cfg['env']), - n_envs=1 - ) - - agents = [] - for _ in range(2): - a2c_agent = instantiate_class(cfg['agent']) - if weights: - a2c_agent.load_state_dict(torch.load(weights)) - agents.append(a2c_agent) - - # combine agents - acquisition_agent = TemporalAgent(Agents(env_agent, *agents)) - acquisition_agent.seed(42) - - acquisition_agent(workspace, t=0, n_steps=400, stochastic=False, save_render=True) - -
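
For reference, below is a minimal end-to-end sketch of how the pieces added in this diff fit together: a training run driven by one of the new loop classes (LoopSEAC, LoopSNAC or LoopIAC from algorithms.marl), checkpointing via algorithms.utils.Checkpointer, and a later evaluation pass, modelled on studies/normalization_study.py and studies/viz_policy.py. The study directory, experiment name and checkpoint index are illustrative assumptions, not values taken from this diff.

    from pathlib import Path

    import pandas as pd

    from algorithms.marl import LoopSEAC                      # or LoopSNAC / LoopIAC
    from algorithms.utils import Checkpointer, load_class, load_yaml_file

    # Hypothetical output location; the shipped config lives in algorithms/marl/.
    study_root = Path('studies') / 'my_marl_study'
    cfg = load_yaml_file(Path('algorithms') / 'marl' / 'example_config.yaml')

    max_steps = cfg['algorithm']['max_steps']
    checkpointer = Checkpointer('example_config#0', study_root, cfg, max_steps, n_checkpoints=250)

    # cfg['method'] names e.g. algorithms.marl.LoopSEAC; BaseActorCritic.__init__ already
    # resolves env properties and the agent network via add_env_props/instantiate_class.
    loop = load_class(cfg['method'])(cfg)
    loop.train_loop(checkpointer)

    # Evaluation: reload any checkpoint directory written above and roll out a few episodes.
    evaluator = LoopSEAC(cfg)
    evaluator.load_state_dict(study_root / 'example_config#0' / 'checkpoint_0')
    evaluator.eval()
    results: pd.DataFrame = evaluator.eval_loop(n_episodes=3, render=False)
    print(results.groupby('agent').reward.mean())

train_loop writes one agent#i.pt file per network into numbered checkpoint_* directories through the Checkpointer, and eval_loop returns a long-format DataFrame with episode, agent and reward columns, which is what studies/viz_policy.py concatenates into results.csv.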