Mirror of https://github.com/illiumst/marl-factory-grid.git (synced 2025-10-31 04:37:25 +01:00)
first commit for our new MARL algorithms library; contains working implementations of IAC, SNAC and SEAC
.gitignore (1 change, vendored)
@@ -702,3 +702,4 @@ $RECYCLE.BIN/

# End of https://www.toptal.com/developers/gitignore/api/linux,unity,macos,python,windows,pycharm,notepadpp,visualstudiocode,latex
/studies/e_1/
/studies/curious_study/

algorithms/common.py (deleted)
@@ -1,221 +0,0 @@
from typing import NamedTuple, Union
from collections import deque, OrderedDict, defaultdict
import numpy as np
import random

import pandas as pd
import torch
import torch.nn as nn

from tqdm import trange


class Experience(NamedTuple):
    # can be used for a single (s_t, a, r, s_{t+1}) tuple
    # or for a batch of tuples
    observation:      np.ndarray
    next_observation: np.ndarray
    action:           np.ndarray
    reward:           Union[float, np.ndarray]
    done:             Union[bool, np.ndarray]
    episode:          int = -1


class BaseLearner:
    def __init__(self, env, n_agents=1, train_every=('step', 4), n_grad_steps=1, stack_n_frames=1):
        assert train_every[0] in ['step', 'episode'], 'train_every[0] must be one of ["step", "episode"]'
        self.env = env
        self.n_agents = n_agents
        self.n_grad_steps = n_grad_steps
        self.train_every = train_every
        self.stack_n_frames = deque(maxlen=stack_n_frames)
        self.device = 'cpu'
        self.n_updates = 0
        self.step = 0
        self.episode_step = 0
        self.episode = 0
        self.running_reward = deque(maxlen=5)

    def to(self, device):
        self.device = device
        for attr, value in self.__dict__.items():
            if isinstance(value, nn.Module):
                value.to(self.device)
        return self

    def get_action(self, obs) -> Union[int, np.ndarray]:
        pass

    def on_new_experience(self, experience):
        pass

    def on_step_end(self, n_steps):
        pass

    def on_episode_end(self, n_steps):
        pass

    def on_all_done(self):
        pass

    def train(self):
        pass

    def reward(self, r):
        return r

    def learn(self, n_steps):
        train_type, train_freq = self.train_every
        while self.step < n_steps:
            obs, done = self.env.reset(), False
            total_reward = 0
            self.episode_step = 0
            while not done:

                action = self.get_action(obs)

                next_obs, reward, done, info = self.env.step(action if not len(action) == 1 else action[0])

                experience = Experience(observation=obs, next_observation=next_obs,
                                        action=action, reward=self.reward(reward),
                                        done=done, episode=self.episode)  # do we really need to copy?
                self.on_new_experience(experience)
                # end of step routine
                obs = next_obs
                total_reward += reward
                self.step += 1
                self.episode_step += 1
                self.on_step_end(n_steps)
                if train_type == 'step' and (self.step % train_freq == 0):
                    self.train()
                    self.n_updates += 1
            self.on_episode_end(n_steps)
            if train_type == 'episode' and (self.episode % train_freq == 0):
                self.train()
                self.n_updates += 1

            self.running_reward.append(total_reward)
            self.episode += 1
            try:
                if self.step % 100 == 0:
                    print(
                        f'Step: {self.step} ({(self.step / n_steps) * 100:.2f}%)\tRunning reward: {sum(list(self.running_reward)) / len(self.running_reward):.2f}\t'
                        f' eps: {self.eps:.4f}\tRunning loss: {sum(list(self.running_loss)) / len(self.running_loss):.4f}\tUpdates:{self.n_updates}')
            except Exception:
                # self.eps / self.running_loss only exist on some subclasses; skip logging otherwise
                pass
        self.on_all_done()

    def evaluate(self, n_episodes=100, render=False):
        with torch.no_grad():
            data = []
            for eval_i in trange(n_episodes):
                obs, done = self.env.reset(), False
                while not done:
                    action = self.get_action(obs)
                    next_obs, reward, done, info = self.env.step(action if not len(action) == 1 else action[0])
                    if render: self.env.render()
                    obs = next_obs
                    info.update({'reward': reward, 'eval_episode': eval_i})
                    data.append(info)
        return pd.DataFrame(data).fillna(0)


class BaseBuffer:
    def __init__(self, size: int):
        self.size = size
        self.experience = deque(maxlen=size)

    def __len__(self):
        return len(self.experience)

    def add(self, exp: Experience):
        self.experience.append(exp)

    def sample(self, k, cer=4):
        sample = random.choices(self.experience, k=k-cer)
        # combined experience replay: always include the cer most recent transitions (index -0 hit the oldest)
        for i in range(cer): sample += [self.experience[-(i+1)]]
        observations = torch.stack([torch.from_numpy(e.observation) for e in sample], 0).float()
        next_observations = torch.stack([torch.from_numpy(e.next_observation) for e in sample], 0).float()
        actions = torch.tensor([e.action for e in sample]).long()
        rewards = torch.tensor([e.reward for e in sample]).float().view(-1, 1)
        dones = torch.tensor([e.done for e in sample]).float().view(-1, 1)
        return Experience(observations, next_observations, actions, rewards, dones)


class TrajectoryBuffer(BaseBuffer):
    def __init__(self, size):
        super(TrajectoryBuffer, self).__init__(size)
        self.experience = defaultdict(list)

    def add(self, exp: Experience):
        self.experience[exp.episode].append(exp)
        if len(self.experience) > self.size:
            oldest_traj_key = min(self.experience.keys())
            del self.experience[oldest_traj_key]


def soft_update(local_model, target_model, tau):
    # taken from https://github.com/BY571/Munchausen-RL/blob/master/M-DQN.ipynb
    for target_param, local_param in zip(target_model.parameters(), local_model.parameters()):
        target_param.data.copy_(tau*local_param.data + (1.-tau)*target_param.data)


def mlp_maker(dims, flatten=False, activation='elu', activation_last='identity'):
    activations = {'elu': nn.ELU, 'relu': nn.ReLU, 'sigmoid': nn.Sigmoid,
                   'leaky_relu': nn.LeakyReLU, 'tanh': nn.Tanh,
                   'gelu': nn.GELU, 'identity': nn.Identity}
    layers = [('Flatten', nn.Flatten())] if flatten else []
    for i in range(1, len(dims)):
        layers.append((f'Layer #{i - 1}: Linear', nn.Linear(dims[i - 1], dims[i])))
        activation_str = activation if i != len(dims)-1 else activation_last
        layers.append((f'Layer #{i - 1}: {activation_str.capitalize()}', activations[activation_str]()))
    return nn.Sequential(OrderedDict(layers))


class BaseDQN(nn.Module):
    def __init__(self, dims=[3*5*5, 64, 64, 9]):
        super(BaseDQN, self).__init__()
        self.net = mlp_maker(dims, flatten=True)

    @torch.no_grad()
    def act(self, x) -> np.ndarray:
        action = self.forward(x).max(-1)[1].numpy()
        return action

    def forward(self, x):
        return self.net(x)


class BaseDDQN(BaseDQN):
    def __init__(self,
                 backbone_dims=[3*5*5, 64, 64],
                 value_dims=[64, 1],
                 advantage_dims=[64, 9],
                 activation='elu'):
        super(BaseDDQN, self).__init__(backbone_dims)
        self.net = mlp_maker(backbone_dims, activation=activation, flatten=True)
        self.value_head     = mlp_maker(value_dims)
        self.advantage_head = mlp_maker(advantage_dims)

    def forward(self, x):
        features = self.net(x)
        advantages = self.advantage_head(features)
        values = self.value_head(features)
        return values + (advantages - advantages.mean(-1, keepdim=True))  # dueling aggregation per sample, not over the whole batch


class BaseICM(nn.Module):
    def __init__(self, backbone_dims=[2*3*5*5, 64, 64], head_dims=[2*64, 64, 9]):
        super(BaseICM, self).__init__()
        self.backbone = mlp_maker(backbone_dims, flatten=True, activation_last='relu', activation='relu')
        self.icm = mlp_maker(head_dims)
        self.ce = nn.CrossEntropyLoss()

    def forward(self, s0, s1, a):
        phi_s0 = self.backbone(s0)
        phi_s1 = self.backbone(s1)
        cat = torch.cat((phi_s0, phi_s1), dim=1)
        logits = self.icm(cat)
        a_prime = torch.softmax(logits, dim=-1)
        ce = self.ce(logits, a)  # CrossEntropyLoss expects raw logits, not softmax probabilities
        return dict(prediction=a_prime, loss=ce)
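
Editor's note: for orientation, a hypothetical usage sketch of the dueling network this (now removed) module provided; shapes follow the defaults above, the data is random and purely illustrative.

import torch
from algorithms.common import BaseDDQN  # module as it existed before this commit

net = BaseDDQN(backbone_dims=[3*5*5, 64, 64], value_dims=[64, 1], advantage_dims=[64, 9])
obs = torch.randn(8, 3, 5, 5)   # batch of 8 observations: 3 channels, 5x5 field of view
q_values = net(obs)             # (8, 9): V(s) plus centered A(s, a) per action
greedy = net.act(obs)           # numpy array of argmax actions, shape (8,)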

algorithms/m_q_learner.py (deleted)

@@ -1,77 +0,0 @@
import numpy as np
import torch
import torch.nn.functional as F
from collections import deque
from tqdm import trange
from algorithms.q_learner import QLearner


class MQLearner(QLearner):
    # Munchausen Q-Learning (M-DQN, Vieillard et al., NeurIPS 2020)
    def __init__(self, *args, temperature=0.03, alpha=0.9, clip_l0=-1.0, **kwargs):
        super(MQLearner, self).__init__(*args, **kwargs)
        assert self.n_agents == 1, 'M-DQN currently only supports single-agent training'
        self.temperature = temperature
        self.alpha = alpha
        self.clip0 = clip_l0

    def tau_ln_pi(self, qs):
        # computes log(softmax(qs/temperature))
        # custom log-sum-exp trick from page 18 to compute the log-policy terms
        v_k = qs.max(-1)[0].unsqueeze(-1)
        advantage = qs - v_k
        logsum = torch.logsumexp(advantage / self.temperature, -1).unsqueeze(-1)
        tau_ln_pi = advantage - self.temperature * logsum
        return tau_ln_pi

    def train(self):
        if len(self.buffer) < self.batch_size: return
        for _ in range(self.n_grad_steps):

            experience = self.buffer.sample(self.batch_size, cer=self.train_every[-1])

            with torch.no_grad():
                q_target_next = self.target_q_net(experience.next_observation)
                tau_log_pi_next = self.tau_ln_pi(q_target_next)

                q_k_targets = self.target_q_net(experience.observation)
                log_pi = self.tau_ln_pi(q_k_targets)

                pi_target = F.softmax(q_target_next / self.temperature, dim=-1)
                q_target = (self.gamma * (pi_target * (q_target_next - tau_log_pi_next) * (1 - experience.done)).sum(-1)).unsqueeze(-1)

                munchausen_addon = log_pi.gather(-1, experience.action)

                munchausen_reward = (experience.reward + self.alpha * torch.clamp(munchausen_addon, min=self.clip0, max=0))

                # compute Q targets for current states
                m_q_target = munchausen_reward + q_target

            # get expected Q values from local model
            q_k = self.q_net(experience.observation)
            pred_q = q_k.gather(-1, experience.action)

            # compute loss
            loss = torch.mean(self.reg_weight * pred_q + torch.pow(pred_q - m_q_target, 2))
            self._backprop_loss(loss)


class MQICMLearner(MQLearner):
    def __init__(self, *args, icm, **kwargs):
        super(MQICMLearner, self).__init__(*args, **kwargs)
        self.icm = icm
        self.icm_optimizer = torch.optim.AdamW(self.icm.parameters())
        self.normalize_reward = deque(maxlen=1000)

    def on_all_done(self):
        losses = deque(maxlen=100)
        for b in trange(10000):
            batch = self.buffer.sample(128, 0)
            s0, s1, a = batch.observation, batch.next_observation, batch.action
            loss = self.icm(s0, s1, a.squeeze())['loss']
            self.icm_optimizer.zero_grad()
            loss.backward()
            self.icm_optimizer.step()
            losses.append(loss.item())
            if b % 100 == 0:
                print(np.mean(losses))
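
Editor's note (my gloss, not part of the commit): train() above implements the Munchausen-DQN target, i.e. an entropy-regularized ("soft") bootstrap plus a scaled, clipped log-policy bonus on the reward:

\hat{q} = r + \alpha \big[\tau \ln \pi(a \mid s)\big]_{l_0}^{0}
  + \gamma \sum_{a'} \pi(a' \mid s') \big( q_{\bar\theta}(s', a') - \tau \ln \pi(a' \mid s') \big),
\qquad \pi = \operatorname{softmax}(q_{\bar\theta} / \tau)

Here tau is temperature, alpha is alpha and l_0 is clip_l0 from the constructor; q_{\bar\theta} denotes the target network.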
algorithms/marl/__init__.py (new file, 4 lines)
@@ -0,0 +1,4 @@
from algorithms.marl.base_ac import BaseActorCritic
from algorithms.marl.iac import LoopIAC
from algorithms.marl.snac import LoopSNAC
from algorithms.marl.seac import LoopSEAC
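
Editor's note: a hedged end-to-end sketch (illustrative, not part of the commit; it assumes load_yaml_file returns the parsed config dict). The three loop classes share the BaseActorCritic interface, so swapping the algorithm is a one-line change.

from pathlib import Path
from algorithms.utils import load_yaml_file
from algorithms.marl import LoopIAC, LoopSNAC, LoopSEAC

cfg = load_yaml_file(Path('algorithms/marl/example_config.yaml'))
study = LoopSEAC(cfg)                     # or LoopIAC(cfg) / LoopSNAC(cfg)
study.train_loop()                        # n_steps / max_steps come from cfg['algorithm']
results = study.eval_loop(n_episodes=10)  # tidy DataFrame: agent, reward, episode
print(results.groupby('agent')['reward'].mean())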
algorithms/marl/base_ac.py (new file, 176 lines)
@@ -0,0 +1,176 @@
import torch
from typing import Union, List
import numpy as np
from torch.distributions import Categorical
from algorithms.marl.memory import MARLActorCriticMemory
from algorithms.utils import add_env_props, instantiate_class
from pathlib import Path
import pandas as pd
from collections import deque

ListOrTensor = Union[List, torch.Tensor]


class BaseActorCritic:
    def __init__(self, cfg):
        add_env_props(cfg)
        self.__training = True
        self.cfg = cfg
        self.n_agents = cfg['env']['n_agents']
        self.setup()

    def setup(self):
        self.net = instantiate_class(self.cfg['agent'])
        self.optimizer = torch.optim.RMSprop(self.net.parameters(), lr=3e-4, eps=1e-5)

    @classmethod
    def _as_torch(cls, x):
        if isinstance(x, np.ndarray):
            return torch.from_numpy(x)
        elif isinstance(x, List):
            return torch.tensor(x)
        elif isinstance(x, (int, float)):
            return torch.tensor([x])
        return x
    def train(self):
        self.__training = True  # was erroneously set to False, which silently disabled learning in train_loop
        networks = [self.net] if not isinstance(self.net, List) else self.net
        for net in networks:
            net.train()

    def eval(self):
        self.__training = False
        networks = [self.net] if not isinstance(self.net, List) else self.net
        for net in networks:
            net.eval()

    def load_state_dict(self, path: Path):
        pass

    def get_actions(self, out) -> ListOrTensor:
        actions = [Categorical(logits=logits).sample().item() for logits in out['logits']]
        return actions

    def init_hidden(self) -> dict:
        # expected to return {'hidden_actor': ..., 'hidden_critic': ...}
        pass

    def forward(self,
                observations:  ListOrTensor,
                actions:       ListOrTensor,
                hidden_actor:  ListOrTensor,
                hidden_critic: ListOrTensor
                ):
        pass

    @torch.no_grad()
    def train_loop(self, checkpointer=None):
        env = instantiate_class(self.cfg['env'])
        n_steps, max_steps = [self.cfg['algorithm'][k] for k in ['n_steps', 'max_steps']]
        global_steps = 0
        reward_queue = deque(maxlen=2000)
        while global_steps < max_steps:
            tm = MARLActorCriticMemory(self.n_agents)
            obs = env.reset()
            last_hiddens        = self.init_hidden()
            last_action, reward = [-1] * self.n_agents, [0.] * self.n_agents
            done, rew_log       = [False] * self.n_agents, 0
            tm.add(action=last_action, **last_hiddens)

            while not all(done):

                out = self.forward(obs, last_action, **last_hiddens)
                action = self.get_actions(out)
                next_obs, reward, done, info = env.step(action)
                if isinstance(done, bool): done = [done] * self.n_agents

                tm.add(observation=obs, action=action, reward=reward, done=done)
                obs = next_obs
                last_action = action
                last_hiddens = dict(hidden_actor=out.get('hidden_actor', None),
                                    hidden_critic=out.get('hidden_critic', None)
                                    )

                if len(tm) >= n_steps or all(done):
                    tm.add(observation=next_obs)
                    if self.__training:
                        with torch.inference_mode(False):
                            self.learn(tm)
                    tm.reset()
                    tm.add(action=last_action, **last_hiddens)
                global_steps += 1
                rew_log += sum(reward)
                reward_queue.extend(reward)

                if checkpointer is not None:
                    checkpointer.step([
                        (f'agent#{i}', agent)
                        for i, agent in enumerate([self.net] if not isinstance(self.net, List) else self.net)
                    ])

                if global_steps >= max_steps: break
            print(f'reward at step: {global_steps} = {rew_log}')

    @torch.inference_mode(True)
    def eval_loop(self, n_episodes, render=False):
        env = instantiate_class(self.cfg['env'])
        episode, results = 0, []
        while episode < n_episodes:
            obs = env.reset()
            last_hiddens           = self.init_hidden()
            last_action, reward    = [-1] * self.n_agents, [0.] * self.n_agents
            done, rew_log, eps_rew = [False] * self.n_agents, 0, torch.zeros(self.n_agents)
            while not all(done):
                if render: env.render()

                out    = self.forward(obs, last_action, **last_hiddens)
                action = self.get_actions(out)
                next_obs, reward, done, info = env.step(action)

                if isinstance(done, bool): done = [done] * self.n_agents  # align with train_loop; obs may not expose .shape
                obs = next_obs
                last_action = action
                last_hiddens = dict(hidden_actor=out.get('hidden_actor', None),
                                    hidden_critic=out.get('hidden_critic', None)
                                    )
                eps_rew += torch.tensor(reward)
            results.append(eps_rew.tolist() + [sum(eps_rew).item()] + [episode])
            episode += 1
        agent_columns = [f'agent#{i}' for i in range(self.cfg['env']['n_agents'])]
        results = pd.DataFrame(results, columns=agent_columns + ['sum', 'episode'])
        results = pd.melt(results, id_vars=['episode'], value_vars=agent_columns + ['sum'], value_name='reward', var_name='agent')
        return results

    @staticmethod
    def compute_advantages(critic, reward, done, gamma):
        return (reward + gamma * (1.0 - done) * critic[:, 1:].detach()) - critic[:, :-1]

    def actor_critic(self, tm, network, gamma, entropy_coef, vf_coef, **kwargs):
        obs, actions, done, reward = tm.observation, tm.action, tm.done, tm.reward

        out = network(obs, actions, tm.hidden_actor, tm.hidden_critic)
        logits = out['logits'][:, :-1]  # the last step is only needed for v_{t+1}
        critic = out['critic']

        entropy_loss = Categorical(logits=logits).entropy().mean(-1)
        advantages = self.compute_advantages(critic, reward, done, gamma)
        value_loss = advantages.pow(2).mean(-1)  # per agent

        # policy loss
        log_ap = torch.log_softmax(logits, -1)
        log_ap = torch.gather(log_ap, dim=-1, index=actions[:, 1:].unsqueeze(-1)).squeeze()
        a2c_loss = -(advantages.detach() * log_ap).mean(-1)
        # weighted loss
        loss = a2c_loss + vf_coef * value_loss - entropy_coef * entropy_loss

        return loss.mean()

    def learn(self, tm: MARLActorCriticMemory, **kwargs):
        loss = self.actor_critic(tm, self.net, **self.cfg['algorithm'], **kwargs)
        # remove next_obs, will be added in next iter
        self.optimizer.zero_grad()
        loss.backward()
        torch.nn.utils.clip_grad_norm_(self.net.parameters(), 0.5)
        self.optimizer.step()

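
Editor's note (my paraphrase of compute_advantages and actor_critic above): over each rollout window of T steps the learner uses one-step TD advantages and the standard A2C objective,

A_t = r_t + \gamma (1 - d_t) V(s_{t+1}) - V(s_t)
\mathcal{L} = -\tfrac{1}{T} \sum_t \operatorname{sg}[A_t] \log \pi(a_t \mid s_t)
  + c_v \, \tfrac{1}{T} \sum_t A_t^2 - c_e \, \mathcal{H}[\pi]

where sg is stop-gradient (detach), c_v = vf_coef and c_e = entropy_coef.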
algorithms/marl/example_config.yaml (new file, 24 lines)
@@ -0,0 +1,24 @@
agent:
  classname:           algorithms.marl.networks.RecurrentAC
  n_agents:            2
  obs_emb_size:        96
  action_emb_size:     16
  hidden_size_actor:   64
  hidden_size_critic:  64
  use_agent_embedding: False
env:
  classname:          environments.factory.make
  env_name:           "DirtyFactory-v0"
  n_agents:           2
  max_steps:          250
  pomdp_r:            2
  stack_n_frames:     0
  individual_rewards: True
method:               algorithms.marl.LoopSEAC
algorithm:
  gamma:              0.99
  entropy_coef:       0.01
  vf_coef:            0.5
  n_steps:            5
  max_steps:          1000000
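
Editor's note: to illustrate how these blocks are consumed (a sketch; the classname convention matches instantiate_class from algorithms/utils.py further down):

from algorithms.utils import instantiate_class

env_cfg = {'classname': 'environments.factory.make',   # module path + attribute
           'env_name': 'DirtyFactory-v0', 'n_agents': 2, 'max_steps': 250,
           'pomdp_r': 2, 'stack_n_frames': 0, 'individual_rewards': True}
env = instantiate_class(env_cfg)  # equivalent to environments.factory.make(env_name=..., ...)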
algorithms/marl/iac.py (new file, 58 lines)
@@ -0,0 +1,58 @@
import torch
from algorithms.marl.base_ac import BaseActorCritic
from algorithms.utils import instantiate_class
from pathlib import Path
from natsort import natsorted
from algorithms.marl.memory import MARLActorCriticMemory


class LoopIAC(BaseActorCritic):

    def __init__(self, cfg):
        super(LoopIAC, self).__init__(cfg)

    def setup(self):
        self.net = [
            instantiate_class(self.cfg['agent']) for _ in range(self.n_agents)
        ]
        self.optimizer = [
            torch.optim.RMSprop(self.net[ag_i].parameters(), lr=3e-4, eps=1e-5) for ag_i in range(self.n_agents)
        ]

    def load_state_dict(self, path: Path):
        paths = natsorted(list(path.glob('*.pt')))
        print(list(paths))
        for weights_path, net in zip(paths, self.net):
            net.load_state_dict(torch.load(weights_path))

    @staticmethod
    def merge_dicts(ds):  # todo: could be made recursive for more than one hierarchy level
        d = {}
        for k in ds[0].keys():
            d[k] = [d_i[k] for d_i in ds]  # renamed loop variable; it used to shadow the result dict
        return d

    def init_hidden(self):
        ha = [net.init_hidden_actor()  for net in self.net]
        hc = [net.init_hidden_critic() for net in self.net]
        return dict(hidden_actor=ha, hidden_critic=hc)

    def forward(self, observations, actions, hidden_actor, hidden_critic):
        outputs = [
            net(
                self._as_torch(observations[ag_i]).unsqueeze(0).unsqueeze(0),  # agents x time
                self._as_torch(actions[ag_i]).unsqueeze(0),
                hidden_actor[ag_i],
                hidden_critic[ag_i]
                ) for ag_i, net in enumerate(self.net)
        ]
        return self.merge_dicts(outputs)

    def learn(self, tms: MARLActorCriticMemory, **kwargs):
        for ag_i in range(self.n_agents):
            tm, net = tms(ag_i), self.net[ag_i]
            loss = self.actor_critic(tm, net, **self.cfg['algorithm'], **kwargs)
            self.optimizer[ag_i].zero_grad()
            loss.backward()
            torch.nn.utils.clip_grad_norm_(net.parameters(), 0.5)
            self.optimizer[ag_i].step()
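
Editor's note: merge_dicts simply transposes the list of per-agent output dicts into one dict of per-agent lists; a quick check of that contract (hypothetical values):

from algorithms.marl.iac import LoopIAC

merged = LoopIAC.merge_dicts([{'logits': 1, 'critic': 2},
                              {'logits': 3, 'critic': 4}])
assert merged == {'logits': [1, 3], 'critic': [2, 4]}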
algorithms/marl/memory.py (new file, 131 lines)
@@ -0,0 +1,131 @@
import torch
from typing import Union, List
from torch import Tensor
import numpy as np


class ActorCriticMemory(object):
    def __init__(self):
        self.reset()

    def reset(self):
        self.__states  = []
        self.__actions = []
        self.__rewards = []
        self.__dones   = []
        self.__hiddens_actor  = []
        self.__hiddens_critic = []

    def __len__(self):
        return len(self.__states)

    @property
    def observation(self):
        return torch.stack(self.__states, 0).unsqueeze(0)      # 1 x timesteps x obs dims

    @property
    def hidden_actor(self):
        if len(self.__hiddens_actor) == 1:
            return self.__hiddens_actor[0]
        return torch.stack(self.__hiddens_actor, 0)  # layers x timesteps x hidden dim

    @property
    def hidden_critic(self):
        if len(self.__hiddens_critic) == 1:
            return self.__hiddens_critic[0]
        return torch.stack(self.__hiddens_critic, 0)  # layers x timesteps x hidden dim

    @property
    def reward(self):
        return torch.tensor(self.__rewards).float().unsqueeze(0)  # 1 x timesteps

    @property
    def action(self):
        return torch.tensor(self.__actions).long().unsqueeze(0)  # 1 x timesteps+1

    @property
    def done(self):
        return torch.tensor(self.__dones).float().unsqueeze(0)  # 1 x timesteps

    def add_observation(self, state: Union[Tensor, np.ndarray]):
        self.__states.append(state if isinstance(state, Tensor) else torch.from_numpy(state))

    def add_hidden_actor(self, hidden: Tensor):
        # 1 x layers x hidden dim
        if len(hidden.shape) < 3: hidden = hidden.unsqueeze(0)
        self.__hiddens_actor.append(hidden)

    def add_hidden_critic(self, hidden: Tensor):
        # 1 x layers x hidden dim
        if len(hidden.shape) < 3: hidden = hidden.unsqueeze(0)
        self.__hiddens_critic.append(hidden)

    def add_action(self, action: int):
        self.__actions.append(action)

    def add_reward(self, reward: float):
        self.__rewards.append(reward)

    def add_done(self, done: bool):
        self.__dones.append(done)

    def add(self, **kwargs):
        for k, v in kwargs.items():
            func = getattr(ActorCriticMemory, f'add_{k}')
            func(self, v)


class MARLActorCriticMemory(object):
    def __init__(self, n_agents):
        self.n_agents = n_agents
        self.memories = [
            ActorCriticMemory() for _ in range(n_agents)
        ]

    def __call__(self, agent_i):
        return self.memories[agent_i]

    def __len__(self):
        return len(self.memories[0])  # todo: add an assertion that all memories have equal length

    def reset(self):
        for mem in self.memories:
            mem.reset()

    def add(self, **kwargs):
        # todo: wrap in try/except and list the available add_* methods on failure
        for agent_i in range(self.n_agents):
            for k, v in kwargs.items():
                func = getattr(ActorCriticMemory, f'add_{k}')
                func(self.memories[agent_i], v[agent_i])

    @property
    def observation(self):
        all_obs = [mem.observation for mem in self.memories]
        return torch.cat(all_obs, 0)  # agents x timesteps+1 x ...

    @property
    def action(self):
        all_actions = [mem.action for mem in self.memories]
        return torch.cat(all_actions, 0)  # agents x timesteps+1 x ...

    @property
    def done(self):
        all_dones = [mem.done for mem in self.memories]
        return torch.cat(all_dones, 0).float()  # agents x timesteps x ...

    @property
    def reward(self):
        all_rewards = [mem.reward for mem in self.memories]
        return torch.cat(all_rewards, 0).float()  # agents x timesteps x ...

    @property
    def hidden_actor(self):
        all_ha = [mem.hidden_actor for mem in self.memories]
        return torch.cat(all_ha, 0)  # agents x layers x timesteps x hidden dim

    @property
    def hidden_critic(self):
        all_hc = [mem.hidden_critic for mem in self.memories]
        return torch.cat(all_hc, 0)  # agents x layers x timesteps x hidden dim
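
Editor's note: a hedged shape-check sketch (illustrative values, not part of the commit) of how train_loop fills the memory and reads it back as stacked tensors:

import numpy as np
from algorithms.marl.memory import MARLActorCriticMemory

tm = MARLActorCriticMemory(n_agents=2)
tm.add(action=[-1, -1])                      # bootstrap entry, as train_loop does
for _ in range(5):                           # five environment steps
    tm.add(observation=[np.zeros((3, 5, 5), dtype=np.float32)] * 2,
           action=[0, 1], reward=[0.0, 0.1], done=[False, False])
tm.add(observation=[np.zeros((3, 5, 5), dtype=np.float32)] * 2)  # extra obs for v_{t+1}

print(tm.observation.shape)  # (2, 6, 3, 5, 5): agents x timesteps+1 x obs dims
print(tm.action.shape)       # (2, 6): agents x timesteps+1 (includes the -1 bootstrap)
print(tm.reward.shape)       # (2, 5): agents x timesteps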
algorithms/marl/networks.py (new file, 91 lines)
@@ -0,0 +1,91 @@
import torch
import torch.nn as nn
import numpy as np
import torch.nn.functional as F
from torch.nn.utils import spectral_norm


class RecurrentAC(nn.Module):
    def __init__(self, observation_size, n_actions, obs_emb_size,
                 action_emb_size, hidden_size_actor, hidden_size_critic,
                 n_agents, use_agent_embedding=True):
        super(RecurrentAC, self).__init__()
        observation_size = np.prod(observation_size)
        self.n_layers = 1
        self.use_agent_embedding = use_agent_embedding
        self.hidden_size_actor  = hidden_size_actor
        self.hidden_size_critic = hidden_size_critic
        self.action_emb_size    = action_emb_size
        self.obs_proj   = nn.Linear(observation_size, obs_emb_size)
        self.action_emb = nn.Embedding(n_actions + 1, action_emb_size, padding_idx=0)
        self.agent_emb  = nn.Embedding(n_agents, action_emb_size)
        mix_in_size = obs_emb_size + action_emb_size if not use_agent_embedding \
            else obs_emb_size + 2 * action_emb_size  # the agent embedding adds one action_emb_size block, independent of n_agents
        self.mix = nn.Sequential(nn.Tanh(),
                                 nn.Linear(mix_in_size, obs_emb_size),
                                 nn.Tanh(),
                                 nn.Linear(obs_emb_size, obs_emb_size)
                                 )
        self.gru_actor   = nn.GRU(obs_emb_size, hidden_size_actor,  batch_first=True, num_layers=self.n_layers)
        self.gru_critic  = nn.GRU(obs_emb_size, hidden_size_critic, batch_first=True, num_layers=self.n_layers)
        self.action_head = nn.Sequential(
            spectral_norm(nn.Linear(hidden_size_actor, hidden_size_actor)),
            nn.Tanh(),
            nn.Linear(hidden_size_actor, n_actions)
        )
        self.critic_head = nn.Sequential(
            nn.Linear(hidden_size_critic, hidden_size_critic),
            nn.Tanh(),
            nn.Linear(hidden_size_critic, 1)
        )
        # self.action_head[-1].weight.data.uniform_(-3e-3, 3e-3)
        # self.action_head[-1].bias.data.uniform_(-3e-3, 3e-3)

    def init_hidden_actor(self):
        return torch.zeros(1, self.n_layers, self.hidden_size_actor)

    def init_hidden_critic(self):
        return torch.zeros(1, self.n_layers, self.hidden_size_critic)

    def forward(self, observations, actions, hidden_actor=None, hidden_critic=None):
        n_agents, t, *_ = observations.shape
        obs_emb    = self.obs_proj(observations.view(n_agents, t, -1).float())
        action_emb = self.action_emb(actions + 1)  # shift by one due to padding idx
        agent_emb  = self.agent_emb(
            torch.cat([torch.arange(0, n_agents, 1).view(-1, 1)] * t, 1)
        )
        x_t = torch.cat((obs_emb, action_emb), -1) \
            if not self.use_agent_embedding else torch.cat((obs_emb, agent_emb, action_emb), -1)

        mixed_x_t   = self.mix(x_t)
        output_p, _ = self.gru_actor(input=mixed_x_t,  hx=hidden_actor.swapaxes(1, 0))
        output_c, _ = self.gru_critic(input=mixed_x_t, hx=hidden_critic.swapaxes(1, 0))

        logits = self.action_head(output_p)
        critic = self.critic_head(output_c).squeeze(-1)
        return dict(logits=logits, critic=critic, hidden_actor=output_p, hidden_critic=output_c)


class NormalizedLinear(nn.Linear):
    def __init__(self, in_features: int, out_features: int,
                 device=None, dtype=None, trainable_magnitude=False):
        super(NormalizedLinear, self).__init__(in_features, out_features, False, device, dtype)
        self.d_sqrt = in_features**0.5
        self.trainable_magnitude = trainable_magnitude
        self.scale = nn.Parameter(torch.tensor([1.]), requires_grad=trainable_magnitude)

    def forward(self, input):
        normalized_input  = F.normalize(input, dim=-1, p=2, eps=1e-5)
        normalized_weight = F.normalize(self.weight, dim=-1, p=2, eps=1e-5)
        return F.linear(normalized_input, normalized_weight) * self.d_sqrt * self.scale


class L2Norm(nn.Module):
    def __init__(self, in_features, trainable_magnitude=False):
        super(L2Norm, self).__init__()
        self.d_sqrt = in_features**0.5
        self.scale = nn.Parameter(torch.tensor([1.]), requires_grad=trainable_magnitude)

    def forward(self, x):
        return F.normalize(x, dim=-1, p=2, eps=1e-5) * self.d_sqrt * self.scale
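
Editor's note: a quick smoke test of the recurrent network's shape contract (hedged sketch; the sizes mirror example_config.yaml, the inputs are random):

import torch
from algorithms.marl.networks import RecurrentAC

net = RecurrentAC(observation_size=(3, 5, 5), n_actions=9, obs_emb_size=96,
                  action_emb_size=16, hidden_size_actor=64, hidden_size_critic=64,
                  n_agents=2, use_agent_embedding=False)
obs     = torch.randn(2, 1, 3, 5, 5)                  # agents x time x obs dims
actions = torch.full((2, 1), -1, dtype=torch.long)    # -1 bootstrap action maps to padding idx 0
h_a = torch.cat([net.init_hidden_actor()]  * 2, 0)    # agents x layers x hidden
h_c = torch.cat([net.init_hidden_critic()] * 2, 0)
out = net(obs, actions, h_a, h_c)
print(out['logits'].shape, out['critic'].shape)       # (2, 1, 9) and (2, 1)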
algorithms/marl/seac.py (new file, 55 lines)
@@ -0,0 +1,55 @@
import torch
from torch.distributions import Categorical
from algorithms.marl.iac import LoopIAC
from algorithms.marl.memory import MARLActorCriticMemory


class LoopSEAC(LoopIAC):
    def __init__(self, cfg):
        super(LoopSEAC, self).__init__(cfg)

    def actor_critic(self, tm, networks, gamma, entropy_coef, vf_coef, **kwargs):
        obs, actions, done, reward = tm.observation, tm.action, tm.done, tm.reward
        outputs = [net(obs, actions, tm.hidden_actor, tm.hidden_critic) for net in networks]

        with torch.no_grad():  # no_grad instead of inference_mode: these log-probs feed into autograd ops below
            true_action_logp = torch.stack([
                torch.log_softmax(out['logits'][ag_i, :-1], -1)
                    .gather(index=actions[ag_i, 1:, None], dim=-1)
                for ag_i, out in enumerate(outputs)
            ], 0).squeeze()

        losses = []

        for ag_i, out in enumerate(outputs):
            logits = out['logits'][:, :-1]  # the last step is only needed for v_{t+1}
            critic = out['critic']

            entropy_loss = Categorical(logits=logits[ag_i]).entropy().mean()
            advantages = self.compute_advantages(critic, reward, done, gamma)

            # policy loss
            log_ap = torch.log_softmax(logits, -1)
            log_ap = torch.gather(log_ap, dim=-1, index=actions[:, 1:].unsqueeze(-1)).squeeze()

            # importance weights
            iw = (log_ap - true_action_logp).exp().detach()

            a2c_loss = (-iw * log_ap * advantages.detach()).mean(-1)

            value_loss = (iw * advantages.pow(2)).mean(-1)  # per agent

            # weighted loss
            loss = (a2c_loss + vf_coef * value_loss - entropy_coef * entropy_loss).mean()
            losses.append(loss)

        return losses

    def learn(self, tms: MARLActorCriticMemory, **kwargs):
        losses = self.actor_critic(tms, self.net, **self.cfg['algorithm'], **kwargs)
        for ag_i, loss in enumerate(losses):
            self.optimizer[ag_i].zero_grad()
            loss.backward()
            torch.nn.utils.clip_grad_norm_(self.net[ag_i].parameters(), 0.5)
            self.optimizer[ag_i].step()
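
Editor's note (my gloss of the loop above, following the shared-experience actor-critic idea): agent i also learns from agent k's transitions, off-policy-corrected by the likelihood ratio

\lambda_{ik} = \frac{\pi_i(a_k \mid o_k)}{\pi_k(a_k \mid o_k)}, \qquad
\mathcal{L}_i = -\sum_k \lambda_{ik} \operatorname{sg}[A_k] \log \pi_i(a_k \mid o_k)
  + c_v \sum_k \lambda_{ik} A_k^2 - c_e \, \mathcal{H}[\pi_i]

which is what iw, a2c_loss and value_loss compute; lambda is detached, so the ratio only re-weights and is not differentiated through.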
algorithms/marl/snac.py (new file, 32 lines)
@@ -0,0 +1,32 @@
from algorithms.marl.base_ac import BaseActorCritic
import torch
from torch.distributions import Categorical
from pathlib import Path


class LoopSNAC(BaseActorCritic):
    def __init__(self, cfg):
        super().__init__(cfg)

    def load_state_dict(self, path: Path):
        path2weights = list(path.glob('*.pt'))
        assert len(path2weights) == 1, f'Expected a single set of weights but got {len(path2weights)}'
        self.net.load_state_dict(torch.load(path2weights[0]))

    def init_hidden(self):
        hidden_actor  = self.net.init_hidden_actor()
        hidden_critic = self.net.init_hidden_critic()
        return dict(hidden_actor=torch.cat([hidden_actor]   * self.n_agents, 0),
                    hidden_critic=torch.cat([hidden_critic] * self.n_agents, 0)
                    )

    def get_actions(self, out):
        actions = Categorical(logits=out['logits']).sample().squeeze()
        return actions

    def forward(self, observations, actions, hidden_actor, hidden_critic):
        out = self.net(self._as_torch(observations).unsqueeze(1),
                       self._as_torch(actions).unsqueeze(1),
                       hidden_actor, hidden_critic
                       )
        return out

algorithms/q_learner.py (deleted)

@@ -1,127 +0,0 @@
from typing import Union
import gym
import torch
import torch.nn as nn
import numpy as np
from collections import deque
from pathlib import Path
import yaml
from algorithms.common import BaseLearner, BaseBuffer, soft_update, Experience


class QLearner(BaseLearner):
    def __init__(self, q_net, target_q_net, env, buffer_size=int(1e5), target_update=3000, eps_end=0.05, n_agents=1,
                 gamma=0.99, train_every=('step', 4), n_grad_steps=1, tau=1.0, max_grad_norm=10, weight_decay=1e-2,
                 exploration_fraction=0.2, batch_size=64, lr=1e-4, reg_weight=0.0, eps_start=1):
        super(QLearner, self).__init__(env, n_agents, train_every, n_grad_steps)
        self.q_net = q_net
        self.target_q_net = target_q_net
        self.target_q_net.eval()
        # soft_update(self.q_net, self.target_q_net, tau=1.0)
        self.buffer = BaseBuffer(buffer_size)  # deque(maxlen=...) needs an int, so no float default
        self.target_update = target_update
        self.eps = eps_start
        self.eps_start = eps_start
        self.eps_end = eps_end
        self.exploration_fraction = exploration_fraction
        self.batch_size = batch_size
        self.gamma = gamma
        self.tau = tau
        self.reg_weight = reg_weight
        self.weight_decay = weight_decay
        self.lr = lr
        self.optimizer = torch.optim.AdamW(self.q_net.parameters(), lr=self.lr, weight_decay=self.weight_decay)
        self.max_grad_norm = max_grad_norm
        self.running_reward = deque(maxlen=5)
        self.running_loss = deque(maxlen=5)
        self.n_updates = 0

    def save(self, path):
        path = Path(path)  # no-op if already an instance of Path
        path.mkdir(parents=True, exist_ok=True)
        hparams = {k: v for k, v in self.__dict__.items() if not (isinstance(v, BaseBuffer) or
                                                                  isinstance(v, torch.optim.Optimizer) or
                                                                  isinstance(v, gym.Env) or
                                                                  isinstance(v, nn.Module))
                   }
        hparams.update({'class': self.__class__.__name__})
        with (path / 'hparams.yaml').open('w') as outfile:
            yaml.dump(hparams, outfile)
        torch.save(self.q_net, path / 'q_net.pt')

    def anneal_eps(self, step, n_steps):
        fraction = min(float(step) / int(self.exploration_fraction * n_steps), 1.0)
        self.eps = self.eps_start + fraction * (self.eps_end - self.eps_start)  # was hard-coded to anneal from 1, ignoring eps_start

    def get_action(self, obs) -> Union[int, np.ndarray]:
        o = torch.from_numpy(obs).unsqueeze(0) if self.n_agents <= 1 else torch.from_numpy(obs)
        if np.random.rand() > self.eps:
            action = self.q_net.act(o.float())
        else:
            action = np.array([self.env.action_space.sample() for _ in range(self.n_agents)])
        return action

    def on_new_experience(self, experience):
        self.buffer.add(experience)

    def on_step_end(self, n_steps):
        self.anneal_eps(self.step, n_steps)
        if self.step % self.target_update == 0:
            print('UPDATE')
            soft_update(self.q_net, self.target_q_net, tau=self.tau)

    def _training_routine(self, obs, next_obs, action):
        current_q_values = self.q_net(obs)
        current_q_values = torch.gather(current_q_values, dim=-1, index=action)
        next_q_values_raw = self.target_q_net(next_obs).max(dim=-1)[0].reshape(-1, 1).detach()
        return current_q_values, next_q_values_raw

    def _backprop_loss(self, loss):
        # log loss
        self.running_loss.append(loss.item())
        # optimize the model
        self.optimizer.zero_grad()
        loss.backward()
        torch.nn.utils.clip_grad_norm_(self.q_net.parameters(), self.max_grad_norm)
        self.optimizer.step()

    def train(self):
        if len(self.buffer) < self.batch_size: return
        for _ in range(self.n_grad_steps):
            experience = self.buffer.sample(self.batch_size, cer=self.train_every[-1])
            pred_q, target_q_raw = self._training_routine(experience.observation,
                                                          experience.next_observation,
                                                          experience.action)
            target_q = experience.reward + (1 - experience.done) * self.gamma * target_q_raw
            loss = torch.mean(self.reg_weight * pred_q + torch.pow(pred_q - target_q, 2))
            self._backprop_loss(loss)


if __name__ == '__main__':
    from environments.factory.factory_dirt import DirtFactory, DirtProperties, MovementProperties
    from algorithms.common import BaseDDQN, BaseICM
    from algorithms.m_q_learner import MQLearner, MQICMLearner
    from algorithms.vdn_learner import VDNLearner

    N_AGENTS = 1

    with Path('../environments/factory/env_default_param.yaml').open('r') as f:
        env_kwargs = yaml.load(f, Loader=yaml.FullLoader)

    env = DirtFactory(**env_kwargs)
    obs_shape = np.prod(env.observation_space.shape)
    n_actions = env.action_space.n

    dqn, target_dqn = BaseDDQN(backbone_dims=[obs_shape, 128, 128], advantage_dims=[128, n_actions], value_dims=[128, 1], activation='leaky_relu'),\
                      BaseDDQN(backbone_dims=[obs_shape, 128, 128], advantage_dims=[128, n_actions], value_dims=[128, 1], activation='leaky_relu')

    icm = BaseICM(backbone_dims=[obs_shape, 64, 32], head_dims=[2*32, 64, n_actions])

    learner = MQICMLearner(dqn, target_dqn, env, 50000, icm=icm,
                           target_update=5000, lr=0.0007, gamma=0.99, n_agents=N_AGENTS, tau=0.95, max_grad_norm=10,
                           train_every=('step', 4), eps_end=0.025, n_grad_steps=1, reg_weight=0.1, exploration_fraction=0.25,
                           batch_size=64, weight_decay=1e-3
                           )
    # learner.save(Path(__file__).parent / 'test' / 'testexperiment1337')
    learner.learn(100000)

(deleted file defining RegDQN)

@@ -1,52 +0,0 @@
import numpy as np
import torch
import stable_baselines3 as sb3
from stable_baselines3.common import logger


class RegDQN(sb3.dqn.DQN):
    def __init__(self, *args, reg_weight=0.1, **kwargs):
        super().__init__(*args, **kwargs)
        self.reg_weight = reg_weight

    def train(self, gradient_steps: int, batch_size: int = 100) -> None:
        # Update learning rate according to schedule
        self._update_learning_rate(self.policy.optimizer)

        losses = []
        for _ in range(gradient_steps):
            # Sample replay buffer
            replay_data = self.replay_buffer.sample(batch_size, env=self._vec_normalize_env)

            with torch.no_grad():
                # Compute the next Q-values using the target network
                next_q_values = self.q_net_target(replay_data.next_observations)
                # Follow greedy policy: use the one with the highest value
                next_q_values, _ = next_q_values.max(dim=1)
                # Avoid potential broadcast issue
                next_q_values = next_q_values.reshape(-1, 1)
                # 1-step TD target
                target_q_values = replay_data.rewards + (1 - replay_data.dones) * self.gamma * next_q_values

            # Get current Q-value estimates
            current_q_values = self.q_net(replay_data.observations)

            # Retrieve the Q-values for the actions from the replay buffer
            current_q_values = torch.gather(current_q_values, dim=1, index=replay_data.actions.long())

            delta = current_q_values - target_q_values
            loss = torch.mean(self.reg_weight * current_q_values + torch.pow(delta, 2))
            losses.append(loss.item())

            # Optimize the policy
            self.policy.optimizer.zero_grad()
            loss.backward()
            # Clip gradient norm
            torch.nn.utils.clip_grad_norm_(self.policy.parameters(), self.max_grad_norm)
            self.policy.optimizer.step()

        # Increase update counter
        self._n_updates += gradient_steps

        logger.record("train/n_updates", self._n_updates, exclude="tensorboard")
        logger.record("train/loss", np.mean(losses))
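
Editor's note (my reading of the loss above): the only change relative to stock sb3 DQN is a soft penalty on the predicted Q-values,

\mathcal{L} = \mathbb{E}\big[\eta \, q_\theta(s, a) + (q_\theta(s, a) - \hat q)^2\big],
\qquad \eta = \texttt{reg\_weight}

the same eta * q term that QLearner.train() and MQLearner.train() apply above.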

algorithms/utils.py

@@ -3,14 +3,51 @@ import torch
import numpy as np
import yaml
from pathlib import Path
from salina import instantiate_class
from salina import TAgent
from salina.agents.gyma import (
    AutoResetGymAgent,
    _torch_type,
    _format_frame,
    _torch_cat_dict
)


def load_class(classname):
    from importlib import import_module
    module_path, class_name = classname.rsplit(".", 1)
    module = import_module(module_path)
    c = getattr(module, class_name)
    return c


def instantiate_class(arguments):
    from importlib import import_module

    d = dict(arguments)
    classname = d["classname"]
    del d["classname"]
    module_path, class_name = classname.rsplit(".", 1)
    module = import_module(module_path)
    c = getattr(module, class_name)
    return c(**d)


def get_class(arguments):
    from importlib import import_module

    if isinstance(arguments, dict):
        classname = arguments["classname"]
        module_path, class_name = classname.rsplit(".", 1)
        module = import_module(module_path)
        c = getattr(module, class_name)
        return c
    else:
        classname = arguments.classname
        module_path, class_name = classname.rsplit(".", 1)
        module = import_module(module_path)
        c = getattr(module, class_name)
        return c


def get_arguments(arguments):
    d = dict(arguments)
    if "classname" in d:
        del d["classname"]
    return d


def load_yaml_file(path: Path):
@@ -21,90 +58,29 @@ def load_yaml_file(path: Path):

| def add_env_props(cfg): | ||||
|     env = instantiate_class(cfg['env'].copy()) | ||||
|     cfg['agent'].update(dict(observation_size=env.observation_space.shape, | ||||
|     cfg['agent'].update(dict(observation_size=list(env.observation_space.shape), | ||||
|                              n_actions=env.action_space.n)) | ||||
|  | ||||
|  | ||||
| class Checkpointer(object): | ||||
|     def __init__(self, experiment_name, root, config, total_steps, n_checkpoints): | ||||
|         self.path = root / experiment_name | ||||
|         self.checkpoint_indices = list(np.linspace(1, total_steps, n_checkpoints, dtype=int) - 1) | ||||
|         self.__current_checkpoint = 0 | ||||
|         self.__current_step = 0 | ||||
|         self.path.mkdir(exist_ok=True, parents=True) | ||||
|         with (self.path / 'config.yaml').open('w') as outfile: | ||||
|             yaml.dump(config, outfile, default_flow_style=False) | ||||
|  | ||||
|     def save_experiment(self, name: str, model): | ||||
|         cpt_path = self.path / f'checkpoint_{self.__current_checkpoint}' | ||||
|         cpt_path.mkdir(exist_ok=True, parents=True) | ||||
|         torch.save(model.state_dict(), cpt_path / f'{name}.pt') | ||||
|  | ||||
| AGENT_PREFIX = 'agent#' | ||||
| REWARD       =  'reward' | ||||
| CUMU_REWARD  = 'cumulated_reward' | ||||
| OBS          = 'env_obs' | ||||
| SEP          = '_' | ||||
| ACTION       = 'action' | ||||
|  | ||||
|  | ||||
| def access_str(agent_i, name, prefix=''): | ||||
|     return f'{prefix}{AGENT_PREFIX}{agent_i}{SEP}{name}' | ||||
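| # e.g. access_str(0, REWARD)             -> 'agent#0_reward' | ||||
| #      access_str(1, OBS, prefix='env/') -> 'env/agent#1_env_obs' | ||||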
|  | ||||
|  | ||||
| class AutoResetGymMultiAgent(AutoResetGymAgent): | ||||
|     def __init__(self, *args, **kwargs): | ||||
|         super(AutoResetGymMultiAgent, self).__init__(*args, **kwargs) | ||||
|  | ||||
|     def per_agent_values(self, name, values): | ||||
|         return {access_str(agent_i, name): value | ||||
|                 for agent_i, value in zip(range(self.n_agents), values)} | ||||
|  | ||||
|     def _initialize_envs(self, n): | ||||
|         super()._initialize_envs(n) | ||||
|         n_agents_list = [self.envs[i].unwrapped.n_agents for i in range(n)] | ||||
|         assert all(n_agents == n_agents_list[0] for n_agents in n_agents_list), \ | ||||
|             'All envs must have the same number of agents.' | ||||
|         self.n_agents = n_agents_list[0] | ||||
|  | ||||
|     def _reset(self, k, save_render): | ||||
|         ret = super()._reset(k, save_render) | ||||
|         obs = ret['env_obs'].squeeze() | ||||
|         self.cumulated_reward[k] = [0.0]*self.n_agents | ||||
|         obs      = self.per_agent_values(OBS,  [_format_frame(obs[i]) for i in range(self.n_agents)]) | ||||
|         cumu_rew = self.per_agent_values(CUMU_REWARD, torch.zeros(self.n_agents, 1).float().unbind()) | ||||
|         rewards  = self.per_agent_values(REWARD,      torch.zeros(self.n_agents, 1).float().unbind()) | ||||
|         ret.update(cumu_rew) | ||||
|         ret.update(rewards) | ||||
|         ret.update(obs) | ||||
|         for remove in ['env_obs', 'cumulated_reward', 'reward']: | ||||
|             del ret[remove] | ||||
|         return ret | ||||
|  | ||||
|     def _step(self, k, action, save_render): | ||||
|         self.timestep[k] += 1 | ||||
|         env = self.envs[k] | ||||
|         if len(action.size()) == 0: | ||||
|             action = action.item() | ||||
|             assert isinstance(action, int) | ||||
|         else: | ||||
|             action = np.array(action.tolist()) | ||||
|         o, r, d, _ = env.step(action) | ||||
|         self.cumulated_reward[k] = [x+y for x, y in zip(r, self.cumulated_reward[k])] | ||||
|         observation = self.per_agent_values(OBS, [_format_frame(o[i]) for i in range(self.n_agents)]) | ||||
|         if d: | ||||
|             self.is_running[k] = False | ||||
|         if save_render: | ||||
|             image = env.render(mode="image").unsqueeze(0) | ||||
|             observation["rendering"] = image | ||||
|         rewards           = self.per_agent_values(REWARD, torch.tensor(r).float().view(-1, 1).unbind()) | ||||
|         cumulated_rewards = self.per_agent_values(CUMU_REWARD, torch.tensor(self.cumulated_reward[k]).float().view(-1, 1).unbind()) | ||||
|         ret = { | ||||
|             **observation, | ||||
|             **rewards, | ||||
|             **cumulated_rewards, | ||||
|             "done": torch.tensor([d]), | ||||
|             "initial_state": torch.tensor([False]), | ||||
|             "timestep": torch.tensor([self.timestep[k]]) | ||||
|         } | ||||
|         return _torch_type(ret) | ||||
|  | ||||
|  | ||||
| class CombineActionsAgent(TAgent): | ||||
|     def __init__(self): | ||||
|         super().__init__() | ||||
|         self.pattern = fr'^{AGENT_PREFIX}\d{SEP}{ACTION}$' | ||||
|  | ||||
|     def forward(self, t, **kwargs): | ||||
|         keys = list(self.workspace.keys()) | ||||
|         action_keys = sorted([k for k in keys if bool(re.match(self.pattern, k))]) | ||||
|         actions = torch.cat([self.get((k, t)) for k in action_keys], 0) | ||||
|         actions = actions if len(action_keys) <= 1 else actions.unsqueeze(0) | ||||
|         self.set(('action', t), actions) | ||||
|     def step(self, to_save): | ||||
|         if self.__current_step in self.checkpoint_indices: | ||||
|             print(f'Checkpointing #{self.__current_checkpoint}') | ||||
|             for name, model in to_save: | ||||
|                 self.save_experiment(name, model) | ||||
|             self.__current_checkpoint += 1 | ||||
|         self.__current_step += 1 | ||||
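# Aside (not part of this commit): intended Checkpointer usage, sketched.
# The experiment name, model, and step count are illustrative assumptions.
#
#   checkpointer = Checkpointer('example_config#0', Path('studies'), cfg,
#                               total_steps=1_000_000, n_checkpoints=250)
#   for _ in range(1_000_000):
#       ...  # one training step
#       checkpointer.step([('policy', policy_net)])
#
# step() saves each model's state_dict whenever its internal step counter
# hits one of the n_checkpoints indices spread evenly over total_steps.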
| @@ -1,55 +0,0 @@ | ||||
| from typing import Union | ||||
| import torch | ||||
| import numpy as np | ||||
| import pandas as pd | ||||
| from algorithms.q_learner import QLearner | ||||
|  | ||||
|  | ||||
| class VDNLearner(QLearner): | ||||
|     def __init__(self, *args, **kwargs): | ||||
|         super(VDNLearner, self).__init__(*args, **kwargs) | ||||
|         assert self.n_agents >= 2, 'VDN requires more than one agent, use QLearner instead' | ||||
|  | ||||
|     def get_action(self, obs) -> Union[int, np.ndarray]: | ||||
|         o = torch.from_numpy(obs).unsqueeze(0) if self.n_agents <= 1 else torch.from_numpy(obs) | ||||
|         eps = np.random.rand(self.n_agents) | ||||
|         greedy = eps > self.eps | ||||
|         agent_actions = None | ||||
|         actions = [] | ||||
|         for i in range(self.n_agents): | ||||
|             if greedy[i]: | ||||
|                 if agent_actions is None: agent_actions = self.q_net.act(o.float()) | ||||
|                 action = agent_actions[i] | ||||
|             else: | ||||
|                 action = self.env.action_space.sample() | ||||
|             actions.append(action) | ||||
|         return np.array(actions) | ||||
|  | ||||
|     def train(self): | ||||
|         if len(self.buffer) < self.batch_size: return | ||||
|         for _ in range(self.n_grad_steps): | ||||
|             experience = self.buffer.sample(self.batch_size, cer=self.train_every_n_steps) | ||||
|             pred_q, target_q_raw = torch.zeros((self.batch_size, 1)), torch.zeros((self.batch_size, 1)) | ||||
|             for agent_i in range(self.n_agents): | ||||
|                 q_values, next_q_values_raw = self._training_routine(experience.observation[:, agent_i], | ||||
|                                                                      experience.next_observation[:, agent_i], | ||||
|                                                                      experience.action[:, agent_i].unsqueeze(-1)) | ||||
|                 pred_q += q_values | ||||
|                 target_q_raw += next_q_values_raw | ||||
|             target_q = experience.reward + (1 - experience.done) * self.gamma * target_q_raw | ||||
|             loss = torch.mean(self.reg_weight * pred_q + torch.pow(pred_q - target_q, 2)) | ||||
|             self._backprop_loss(loss) | ||||
|  | ||||
|     def evaluate(self, n_episodes=100, render=False): | ||||
|         with torch.no_grad(): | ||||
|             data = [] | ||||
|             for eval_i in range(n_episodes): | ||||
|                 obs, done = self.env.reset(), False | ||||
|                 while not done: | ||||
|                     action = self.get_action(obs) | ||||
|                     next_obs, reward, done, info = self.env.step(action) | ||||
|                     if render: self.env.render() | ||||
|                     obs = next_obs  # advance to the next observation | ||||
|                     info.update({'reward': reward, 'eval_episode': eval_i}) | ||||
|                     data.append(info) | ||||
|         return pd.DataFrame(data).fillna(0) | ||||
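# Aside (not part of this commit): the value decomposition at the heart of
# the train() above, reduced to a minimal sketch with random stand-in
# tensors (shapes assumed).
import torch

n_agents, batch, gamma = 2, 8, 0.99
per_agent_q      = [torch.rand(batch, 1, requires_grad=True) for _ in range(n_agents)]
per_agent_next_q = [torch.rand(batch, 1) for _ in range(n_agents)]
reward, done = torch.rand(batch, 1), torch.zeros(batch, 1)

q_tot      = sum(per_agent_q)        # VDN: Q_tot(s, a) = sum_i Q_i(s_i, a_i)
next_q_tot = sum(per_agent_next_q)
target = reward + (1 - done) * gamma * next_q_tot
loss = torch.mean((q_tot - target) ** 2)
loss.backward()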
| @@ -1,22 +1,25 @@ | ||||
| def make(env_name, pomdp_r=2, max_steps=400, stack_n_frames=3, n_agents=1,  individual_rewards=False): | ||||
| def make(env_name, pomdp_r=2, max_steps=400, stack_n_frames=3, n_agents=1, individual_rewards=False): | ||||
|     import yaml | ||||
|     from pathlib import Path | ||||
|     from environments.factory.combined_factories import DirtItemFactory | ||||
|     from environments.factory.factory_item import ItemFactory, ItemProperties | ||||
|     from environments.factory.factory_dirt import DirtProperties, DirtFactory | ||||
|     from environments.utility_classes import MovementProperties, ObservationProperties, AgentRenderOptions | ||||
|     from environments.factory.factory_dirt import DirtProperties, DirtFactory, RewardsDirt | ||||
|     from environments.utility_classes import AgentRenderOptions | ||||
|  | ||||
|     with (Path(__file__).parent / 'levels' / 'parameters' / f'{env_name}.yaml').open('r') as stream: | ||||
|         dictionary = yaml.load(stream, Loader=yaml.FullLoader) | ||||
|  | ||||
|     obs_props = ObservationProperties(render_agents=AgentRenderOptions.COMBINED, | ||||
|                                       frames_to_stack=stack_n_frames, pomdp_r=pomdp_r) | ||||
|     obs_props = dict(render_agents=AgentRenderOptions.COMBINED, | ||||
|                      pomdp_r=pomdp_r, | ||||
|                      indicate_door_area=True, | ||||
|                      show_global_position_info=False, | ||||
|                      frames_to_stack=stack_n_frames) | ||||
|  | ||||
|     factory_kwargs = dict(n_agents=n_agents, individual_rewards=individual_rewards, | ||||
|                           max_steps=max_steps, obs_prop=obs_props, | ||||
|                           mv_prop=MovementProperties(**dictionary['movement_props']), | ||||
|                           dirt_prop=DirtProperties(**dictionary['dirt_props']), | ||||
|                           record_episodes=False, verbose=False, **dictionary['factory_props'] | ||||
|     factory_kwargs = dict(**dictionary, | ||||
|                           n_agents=n_agents, | ||||
|                           individual_rewards=individual_rewards, | ||||
|                           max_steps=max_steps, | ||||
|                           obs_prop=obs_props, | ||||
|                           verbose=False, | ||||
|                           ) | ||||
|  | ||||
|     return DirtFactory(**factory_kwargs).__enter__() | ||||
|   | ||||
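# Aside (not part of this commit): calling the builder above. 'DirtyFactory-v0'
# is the env_name used elsewhere in this commit; the import path follows the
# classname 'environments.factory.make' from the study configs.
#   from environments.factory import make
#   env = make('DirtyFactory-v0', n_agents=2)
#   obs = env.reset()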
| @@ -1,8 +1,12 @@ | ||||
| movement_props: | ||||
| parse_doors:                True | ||||
| doors_have_area:            True | ||||
| done_at_collision:          False | ||||
| level_name:                 "rooms" | ||||
| mv_prop: | ||||
|     allow_diagonal_movement:    True | ||||
|     allow_square_movement:      True | ||||
|     allow_no_op:                False | ||||
| dirt_props: | ||||
| dirt_prop: | ||||
|     initial_dirt_ratio:         0.35 | ||||
|     initial_dirt_spawn_r_var:   0.1 | ||||
|     clean_amount:               0.34 | ||||
| @@ -12,8 +16,15 @@ dirt_props: | ||||
|     spawn_frequency:            0 | ||||
|     max_spawn_ratio:            0.05 | ||||
|     dirt_smear_amount:          0.0 | ||||
|     agent_can_interact:         True | ||||
| factory_props: | ||||
|     parse_doors:                True | ||||
|     level_name:                 "rooms" | ||||
|     doors_have_area:            False | ||||
|     done_when_clean:            True | ||||
| rewards_base: | ||||
|     MOVEMENTS_VALID:    0 | ||||
|     MOVEMENTS_FAIL:     0 | ||||
|     NOOP:               0 | ||||
|     USE_DOOR_VALID:     0 | ||||
|     USE_DOOR_FAIL:      0 | ||||
|     COLLISION:          0 | ||||
| rewards_dirt: | ||||
|     CLEAN_UP_VALID:       1 | ||||
|     CLEAN_UP_FAIL:        0 | ||||
|     CLEAN_UP_LAST_PIECE:  5 | ||||
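# Aside (not part of this commit): make() loads this file and splats it
# straight into the factory, roughly DirtFactory(**dictionary, n_agents=...,
# obs_prop=..., ...), so every top-level key here (parse_doors, mv_prop,
# dirt_prop, rewards_base, rewards_dirt, ...) must name a DirtFactory
# constructor argument.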
| @@ -6,7 +6,7 @@ matplotlib>=3.4.1 | ||||
| stable-baselines3>=1.0 | ||||
| pygame>=2.1.0 | ||||
| gym>=0.18.0 | ||||
| networkx>=2.6.1 | ||||
| networkx>=2.6.3 | ||||
| simplejson>=3.17.5 | ||||
| PyYAML>=6.0 | ||||
| git+https://github.com/facebookresearch/salina.git@main#egg=salina | ||||
| einops | ||||
							
								
								
									
studies/normalization_study.py (new file, 24 lines)
							| @@ -0,0 +1,24 @@ | ||||
| from algorithms.utils import Checkpointer | ||||
| from pathlib import Path | ||||
| from algorithms.utils import load_yaml_file, add_env_props, instantiate_class, load_class | ||||
| from algorithms.marl import LoopSNAC, LoopIAC, LoopSEAC | ||||
|  | ||||
|  | ||||
| #study_root = Path(__file__).parent / 'curious_study' | ||||
| study_root = Path('/Users/romue/PycharmProjects/EDYS/algorithms/marl') | ||||
|  | ||||
| for i in range(0, 5): | ||||
|     for name in ['example_config']: | ||||
|         cfg = load_yaml_file(study_root / f'{name}.yaml') | ||||
|         add_env_props(cfg) | ||||
|  | ||||
|         env = instantiate_class(cfg['env']) | ||||
|         net = instantiate_class(cfg['agent']) | ||||
|         max_steps = cfg['algorithm']['max_steps'] | ||||
|         n_steps = cfg['algorithm']['n_steps'] | ||||
|  | ||||
|         checkpointer = Checkpointer(f'{name}#{i}', study_root, cfg, max_steps, 250) | ||||
|  | ||||
|         loop = load_class(cfg['method'])(cfg) | ||||
|         df = loop.train_loop(checkpointer) | ||||
|  | ||||
							
								
								
									
studies/playground_file.py (new file, 32 lines)
							| @@ -0,0 +1,32 @@ | ||||
| import numpy as np | ||||
| import pandas as pd | ||||
| from pathlib import Path | ||||
| import matplotlib.pyplot as plt | ||||
| import seaborn as sns | ||||
|  | ||||
| study_root = Path(__file__).parent / 'entropy_study' | ||||
| names_all = ['basic_gru', 'layernorm_gru', 'spectralnorm_gru', 'nonorm_gru'] | ||||
| names_only_1 = ['L2OnlyAh_gru', 'L2OnlyChAh_gru', 'L2OnlyMix_gru', 'basic_gru'] | ||||
| names_only_2 = ['L2NoCh_gru', 'L2NoAh_gru', 'nomix_gru', 'basic_gru'] | ||||
|  | ||||
| names = names_only_2 | ||||
| #names = ['nonorm_gru'] | ||||
| # /Users/romue/PycharmProjects/EDYS/studies/normalization_study/basic_gru#3 | ||||
| csvs = [] | ||||
| for name in ['basic_gru', 'nonorm_gru', 'spectralnorm_gru']: | ||||
|     for run in range(0, 1): | ||||
|         try: | ||||
|             df = pd.read_csv(study_root / f'{name}#{run}' / 'results.csv') | ||||
|             df = df[df.agent == 'sum'] | ||||
|             df = df.groupby(['checkpoint', 'run']).mean().reset_index() | ||||
|             df['method'] = name | ||||
|             df['run_'] = run | ||||
|  | ||||
|             df.reward = df.reward.rolling(15).mean() | ||||
|             csvs.append(df) | ||||
|         except Exception as e: | ||||
|             print(f'skipped {run}\t {name}') | ||||
|  | ||||
| csvs = pd.concat(csvs).rename(columns={"checkpoint": "steps*2e3", "B": "c"}) | ||||
| sns.lineplot(data=csvs, x='steps*2e3', y='reward', hue='method', palette='husl', ci='sd', linewidth=1.8) | ||||
| plt.savefig('entropy.png') | ||||
| @@ -1,139 +0,0 @@ | ||||
| from salina.agents.gyma import AutoResetGymAgent | ||||
| from salina.agents import Agents, TemporalAgent | ||||
| from salina.rl.functional import _index, gae | ||||
| import torch | ||||
| import torch.nn as nn | ||||
| from torch.distributions import Categorical | ||||
| from salina import TAgent, Workspace, get_arguments, get_class, instantiate_class | ||||
| from pathlib import Path | ||||
| import numpy as np | ||||
| from tqdm import tqdm | ||||
| import time | ||||
| from algorithms.utils import ( | ||||
|     add_env_props, | ||||
|     load_yaml_file, | ||||
|     CombineActionsAgent, | ||||
|     AutoResetGymMultiAgent, | ||||
|     access_str, | ||||
|     AGENT_PREFIX, REWARD, CUMU_REWARD, OBS, SEP | ||||
| ) | ||||
|  | ||||
|  | ||||
| class A2CAgent(TAgent): | ||||
|     def __init__(self, observation_size, hidden_size, n_actions, agent_id): | ||||
|         super().__init__() | ||||
|         observation_size = np.prod(observation_size) | ||||
|         print(observation_size) | ||||
|         self.agent_id = agent_id | ||||
|         self.model = nn.Sequential( | ||||
|             nn.Flatten(), | ||||
|             nn.Linear(observation_size, hidden_size), | ||||
|             nn.ELU(), | ||||
|             nn.Linear(hidden_size, hidden_size), | ||||
|             nn.ELU(), | ||||
|             nn.Linear(hidden_size, hidden_size), | ||||
|             nn.ELU() | ||||
|         ) | ||||
|         self.action_head = nn.Linear(hidden_size, n_actions) | ||||
|         self.critic_head = nn.Linear(hidden_size, 1) | ||||
|  | ||||
|     def get_obs(self, t): | ||||
|         observation = self.get((f'env/{access_str(self.agent_id, OBS)}', t)) | ||||
|         return observation | ||||
|  | ||||
|     def forward(self, t, stochastic, **kwargs): | ||||
|         observation = self.get_obs(t) | ||||
|         features = self.model(observation) | ||||
|         scores = self.action_head(features) | ||||
|         probs = torch.softmax(scores, dim=-1) | ||||
|         critic = self.critic_head(features).squeeze(-1) | ||||
|         if stochastic: | ||||
|             action = torch.distributions.Categorical(probs).sample() | ||||
|         else: | ||||
|             action = probs.argmax(1) | ||||
|         self.set((f'{access_str(self.agent_id, "action")}', t), action) | ||||
|         self.set((f'{access_str(self.agent_id, "action_probs")}', t), probs) | ||||
|         self.set((f'{access_str(self.agent_id, "critic")}', t), critic) | ||||
|  | ||||
|  | ||||
| if __name__ == '__main__': | ||||
|     # Setup workspace | ||||
|     uid = time.time() | ||||
|     workspace = Workspace() | ||||
|     n_agents = 2 | ||||
|  | ||||
|     # load config | ||||
|     cfg = load_yaml_file(Path(__file__).parent / 'sat_mad.yaml') | ||||
|     add_env_props(cfg) | ||||
|     cfg['env'].update({'n_agents': n_agents}) | ||||
|  | ||||
|     # instantiate agent and env | ||||
|     env_agent = AutoResetGymMultiAgent( | ||||
|         get_class(cfg['env']), | ||||
|         get_arguments(cfg['env']), | ||||
|         n_envs=1 | ||||
|     ) | ||||
|  | ||||
|     a2c_agents = [instantiate_class({**cfg['agent'], | ||||
|                                      'agent_id': agent_id}) | ||||
|                   for agent_id in range(n_agents)] | ||||
|  | ||||
|     # combine agents | ||||
|     acquisition_agent = TemporalAgent(Agents(env_agent, *a2c_agents, CombineActionsAgent())) | ||||
|     acquisition_agent.seed(69) | ||||
|  | ||||
|     # optimizers & other parameters | ||||
|     cfg_optim = cfg['algorithm']['optimizer'] | ||||
|     optimizers = [get_class(cfg_optim)(a2c_agent.parameters(), **get_arguments(cfg_optim)) | ||||
|                   for a2c_agent in a2c_agents] | ||||
|     n_timesteps = cfg['algorithm']['n_timesteps'] | ||||
|  | ||||
|     # Decision making loop | ||||
|     best = -float('inf') | ||||
|     with tqdm(range(int(cfg['algorithm']['max_epochs'] / n_timesteps))) as pbar: | ||||
|         for epoch in pbar: | ||||
|             workspace.zero_grad() | ||||
|             if epoch > 0: | ||||
|                 workspace.copy_n_last_steps(1) | ||||
|                 acquisition_agent(workspace, t=1, n_steps=n_timesteps-1, stochastic=True) | ||||
|             else: | ||||
|                 acquisition_agent(workspace, t=0, n_steps=n_timesteps,  stochastic=True) | ||||
|  | ||||
|             for agent_id in range(n_agents): | ||||
|                 critic, done, action_probs, reward, action = workspace[ | ||||
|                     access_str(agent_id, 'critic'), | ||||
|                     "env/done", | ||||
|                     access_str(agent_id, 'action_probs'), | ||||
|                     access_str(agent_id, 'reward', 'env/'), | ||||
|                     access_str(agent_id, 'action') | ||||
|                 ] | ||||
|                 td = gae(critic, reward, done, 0.98, 0.25) | ||||
|                 td_error = td ** 2 | ||||
|                 critic_loss = td_error.mean() | ||||
|                 entropy_loss = Categorical(action_probs).entropy().mean() | ||||
|                 action_logp = _index(action_probs, action).log() | ||||
|                 a2c_loss = action_logp[:-1] * td.detach() | ||||
|                 a2c_loss = a2c_loss.mean() | ||||
|                 loss = ( | ||||
|                     -0.001 * entropy_loss | ||||
|                     + 1.0 * critic_loss | ||||
|                     - 0.1 * a2c_loss | ||||
|                 ) | ||||
|                 optimizer = optimizers[agent_id] | ||||
|                 optimizer.zero_grad() | ||||
|                 loss.backward() | ||||
|                 #torch.nn.utils.clip_grad_norm_(a2c_agents[agent_id].parameters(), .5) | ||||
|                 optimizer.step() | ||||
|  | ||||
|                 # Compute the cumulated reward on final_state | ||||
|                 rews = '' | ||||
|                 for agent_i in range(n_agents): | ||||
|                     creward = workspace['env/'+access_str(agent_i, CUMU_REWARD)] | ||||
|                     creward = creward[done] | ||||
|                     if creward.size()[0] > 0: | ||||
|                         rews += f'{AGENT_PREFIX}{agent_i}: {creward.mean().item():.2f}  |  ' | ||||
|                         """if cum_r > best: | ||||
|                             torch.save(a2c_agent.state_dict(), Path(__file__).parent / f'agent_{uid}.pt') | ||||
|                             best = cum_r""" | ||||
|                         pbar.set_description(rews, refresh=True) | ||||
|  | ||||
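# Aside (not part of this commit): a standard GAE recursion, equivalent in
# spirit to the salina.rl.functional.gae call above; exact salina semantics
# may differ. Shapes assumed: values (T+1,), rewards/dones (T,).
import torch

def gae_sketch(values, rewards, dones, gamma=0.98, lam=0.25):
    T = rewards.shape[0]
    adv, last = torch.zeros(T), 0.0
    for t in reversed(range(T)):
        nonterminal = 1.0 - dones[t].float()
        # delta_t = r_t + gamma * V(s_{t+1}) - V(s_t)
        delta = rewards[t] + gamma * values[t + 1] * nonterminal - values[t]
        last = delta + gamma * lam * nonterminal * last
        adv[t] = last
    return adv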
| @@ -1,27 +0,0 @@ | ||||
| agent: | ||||
|   classname:        studies.sat_mad.A2CAgent | ||||
|   observation_size: 4*5*5 | ||||
|   hidden_size:      128 | ||||
|   n_actions:        10 | ||||
|  | ||||
| env: | ||||
|   classname:          environments.factory.make | ||||
|   env_name:           "DirtyFactory-v0" | ||||
|   n_agents:           1 | ||||
|   pomdp_r:            2 | ||||
|   max_steps:          400 | ||||
|   stack_n_frames:     3 | ||||
|   individual_rewards: True | ||||
|  | ||||
| algorithm: | ||||
|   max_epochs:             1000000 | ||||
|   n_envs:                 1 | ||||
|   n_timesteps:            10 | ||||
|   discount_factor:        0.99 | ||||
|   entropy_coef:           0.01 | ||||
|   critic_coef:            1.0 | ||||
|   gae:                    0.25 | ||||
|   optimizer: | ||||
|     classname:            torch.optim.Adam | ||||
|     lr:                   0.0003 | ||||
|     weight_decay:         0.0 | ||||
							
								
								
									
studies/viz_policy.py (new file, 34 lines)
							| @@ -0,0 +1,34 @@ | ||||
| import pandas as pd | ||||
| from algorithms.marl import LoopSNAC, LoopIAC, LoopSEAC | ||||
| from pathlib import Path | ||||
| from algorithms.utils import load_yaml_file | ||||
| from tqdm import trange | ||||
| study = 'curious_study' | ||||
| study_root = Path(__file__).parent / study | ||||
|  | ||||
| #['L2NoAh_gru', 'L2NoCh_gru', 'nomix_gru']: | ||||
| render = True | ||||
| eval_eps = 3 | ||||
| for run in range(0, 5): | ||||
|     for name in ['basic_gru']:#['L2OnlyAh_gru', 'L2OnlyChAh_gru', 'L2OnlyMix_gru']: #['layernorm_gru', 'basic_gru', 'nonorm_gru', 'spectralnorm_gru']: | ||||
|         cfg = load_yaml_file(Path(__file__).parent / study / f'{name}.yaml') | ||||
|         p_root = Path(study_root / f'{name}#{run}') | ||||
|         dfs = [] | ||||
|         for i in trange(500): | ||||
|             path = p_root / f'checkpoint_{i}' | ||||
|  | ||||
|             snac = LoopSEAC(cfg) | ||||
|             snac.load_state_dict(path) | ||||
|             snac.eval() | ||||
|  | ||||
|             df = snac.eval_loop(render=render, n_episodes=eval_eps) | ||||
|             df['checkpoint'] = i | ||||
|             dfs.append(df) | ||||
|  | ||||
|         results = pd.concat(dfs) | ||||
|         results['run'] = run | ||||
|         results.to_csv(p_root / 'results.csv', index=False) | ||||
|  | ||||
| #sns.lineplot(data=results, x='checkpoint', y='reward', hue='agent', palette='husl') | ||||
|  | ||||
| #plt.savefig(f'{experiment_name}.png') | ||||
| @@ -1,39 +0,0 @@ | ||||
| from salina.agents import Agents, TemporalAgent | ||||
| import torch | ||||
| from salina import Workspace, get_arguments, get_class, instantiate_class | ||||
| from pathlib import Path | ||||
| from salina.agents.gyma import GymAgent | ||||
| import time | ||||
| from algorithms.utils import load_yaml_file, add_env_props | ||||
|  | ||||
| if __name__ == '__main__': | ||||
|     # Setup workspace | ||||
|     uid = time.time() | ||||
|     workspace = Workspace() | ||||
|     weights = Path('/Users/romue/PycharmProjects/EDYS/studies/agent_1636994369.145843.pt') | ||||
|  | ||||
|     cfg = load_yaml_file(Path(__file__).parent / 'sat_mad.yaml') | ||||
|     add_env_props(cfg) | ||||
|     cfg['env'].update({'n_agents': 2}) | ||||
|  | ||||
|     # instantiate agent and env | ||||
|     env_agent = GymAgent( | ||||
|         get_class(cfg['env']), | ||||
|         get_arguments(cfg['env']), | ||||
|         n_envs=1 | ||||
|     ) | ||||
|  | ||||
|     agents = [] | ||||
|     for _ in range(2): | ||||
|         a2c_agent = instantiate_class(cfg['agent']) | ||||
|         if weights: | ||||
|             a2c_agent.load_state_dict(torch.load(weights)) | ||||
|         agents.append(a2c_agent) | ||||
|  | ||||
|     # combine agents | ||||
|     acquisition_agent = TemporalAgent(Agents(env_agent, *agents)) | ||||
|     acquisition_agent.seed(42) | ||||
|  | ||||
|     acquisition_agent(workspace, t=0, n_steps=400, stochastic=False, save_render=True) | ||||
|  | ||||
|  | ||||