Steffen Illium
2023-06-21 11:28:36 +02:00
parent 95c85bfedd
commit d11b1a8172
133 changed files with 225 additions and 225 deletions

View File

@@ -0,0 +1 @@
import os, sys; sys.path.append(os.path.dirname(os.path.realpath(__file__)))

View File

@@ -0,0 +1 @@
from marl_factory_grid.algorithms.marl.memory import MARLActorCriticMemory

View File

@@ -0,0 +1,221 @@
import torch
from typing import Union, List, Dict
import numpy as np
from torch.distributions import Categorical
from marl_factory_grid.algorithms.marl.memory import MARLActorCriticMemory
from marl_factory_grid.algorithms.utils import add_env_props, instantiate_class
from pathlib import Path
import pandas as pd
from collections import deque
class Names:
REWARD = 'reward'
DONE = 'done'
ACTION = 'action'
OBSERVATION = 'observation'
LOGITS = 'logits'
HIDDEN_ACTOR = 'hidden_actor'
HIDDEN_CRITIC = 'hidden_critic'
AGENT = 'agent'
ENV = 'environment'
N_AGENTS = 'n_agents'
ALGORITHM = 'algorithm'
MAX_STEPS = 'max_steps'
N_STEPS = 'n_steps'
BUFFER_SIZE = 'buffer_size'
CRITIC = 'critic'
BATCH_SIZE = 'batch_size'
N_ACTIONS = 'n_actions'
nms = Names
ListOrTensor = Union[List, torch.Tensor]
class BaseActorCritic:
def __init__(self, cfg):
add_env_props(cfg)
self.__training = True
self.cfg = cfg
self.n_agents = cfg[nms.ENV][nms.N_AGENTS]
self.reset_memory_after_epoch = True
self.setup()
def setup(self):
self.net = instantiate_class(self.cfg[nms.AGENT])
self.optimizer = torch.optim.RMSprop(self.net.parameters(), lr=3e-4, eps=1e-5)
@classmethod
def _as_torch(cls, x):
if isinstance(x, np.ndarray):
return torch.from_numpy(x)
elif isinstance(x, List):
return torch.tensor(x)
elif isinstance(x, (int, float)):
return torch.tensor([x])
return x
def train(self):
self.__training = True
networks = [self.net] if not isinstance(self.net, List) else self.net
for net in networks:
net.train()
def eval(self):
self.__training = False
networks = [self.net] if not isinstance(self.net, List) else self.net
for net in networks:
net.eval()
def load_state_dict(self, path: Path):
pass
def get_actions(self, out) -> ListOrTensor:
actions = [Categorical(logits=logits).sample().item() for logits in out[nms.LOGITS]]
return actions
def init_hidden(self) -> Dict[str, ListOrTensor]:
pass
def forward(self,
observations: ListOrTensor,
actions: ListOrTensor,
hidden_actor: ListOrTensor,
hidden_critic: ListOrTensor
) -> Dict[str, ListOrTensor]:
pass
@torch.no_grad()
def train_loop(self, checkpointer=None):
env = instantiate_class(self.cfg[nms.ENV])
n_steps, max_steps = [self.cfg[nms.ALGORITHM][k] for k in [nms.N_STEPS, nms.MAX_STEPS]]
tm = MARLActorCriticMemory(self.n_agents, self.cfg[nms.ALGORITHM].get(nms.BUFFER_SIZE, n_steps))
global_steps, episode, df_results = 0, 0, []
reward_queue = deque(maxlen=2000)
while global_steps < max_steps:
obs = env.reset()
last_hiddens = self.init_hidden()
last_action, reward = [-1] * self.n_agents, [0.] * self.n_agents
done, rew_log = [False] * self.n_agents, 0
if self.reset_memory_after_epoch:
tm.reset()
tm.add(observation=obs, action=last_action,
logits=torch.zeros(self.n_agents, 1, self.cfg[nms.AGENT][nms.N_ACTIONS]),
values=torch.zeros(self.n_agents, 1), reward=reward, done=done, **last_hiddens)
while not all(done):
out = self.forward(obs, last_action, **last_hiddens)
action = self.get_actions(out)
next_obs, reward, done, info = env.step(action)
done = [done] * self.n_agents if isinstance(done, bool) else done
last_hiddens = dict(hidden_actor=out[nms.HIDDEN_ACTOR],
hidden_critic=out[nms.HIDDEN_CRITIC])
tm.add(observation=obs, action=action, reward=reward, done=done,
logits=out.get(nms.LOGITS, None), values=out.get(nms.CRITIC, None),
**last_hiddens)
obs = next_obs
last_action = action
if (global_steps+1) % n_steps == 0 or all(done):
with torch.inference_mode(False):
self.learn(tm)
global_steps += 1
rew_log += sum(reward)
reward_queue.extend(reward)
if checkpointer is not None:
checkpointer.step([
(f'agent#{i}', agent)
for i, agent in enumerate([self.net] if not isinstance(self.net, List) else self.net)
])
if global_steps >= max_steps:
break
print(f'reward at episode: {episode} = {rew_log}')
episode += 1
df_results.append([episode, rew_log, *reward])
df_results = pd.DataFrame(df_results, columns=['steps', 'reward', *[f'agent#{i}' for i in range(self.n_agents)]])
if checkpointer is not None:
df_results.to_csv(checkpointer.path / 'results.csv', index=False)
return df_results
@torch.inference_mode(True)
def eval_loop(self, n_episodes, render=False):
env = instantiate_class(self.cfg[nms.ENV])
episode, results = 0, []
while episode < n_episodes:
obs = env.reset()
last_hiddens = self.init_hidden()
last_action, reward = [-1] * self.n_agents, [0.] * self.n_agents
done, rew_log, eps_rew = [False] * self.n_agents, 0, torch.zeros(self.n_agents)
while not all(done):
if render: env.render()
out = self.forward(obs, last_action, **last_hiddens)
action = self.get_actions(out)
next_obs, reward, done, info = env.step(action)
if isinstance(done, bool): done = [done] * obs.shape[0]
obs = next_obs
last_action = action
last_hiddens = dict(hidden_actor=out.get(nms.HIDDEN_ACTOR, None),
hidden_critic=out.get(nms.HIDDEN_CRITIC, None)
)
eps_rew += torch.tensor(reward)
results.append(eps_rew.tolist() + [sum(eps_rew).item()] + [episode])
episode += 1
agent_columns = [f'agent#{i}' for i in range(self.cfg['environment']['n_agents'])]
results = pd.DataFrame(results, columns=agent_columns + ['sum', 'episode'])
results = pd.melt(results, id_vars=['episode'], value_vars=agent_columns + ['sum'], value_name='reward', var_name='agent')
return results
@staticmethod
def compute_advantages(critic, reward, done, gamma, gae_coef=0.0):
tds = (reward + gamma * (1.0 - done) * critic[:, 1:].detach()) - critic[:, :-1]
if gae_coef <= 0:
return tds
gae = torch.zeros_like(tds[:, -1])
gaes = []
for t in range(tds.shape[1]-1, -1, -1):
gae = tds[:, t] + gamma * gae_coef * (1.0 - done[:, t]) * gae
gaes.insert(0, gae)
gaes = torch.stack(gaes, dim=1)
return gaes
def actor_critic(self, tm, network, gamma, entropy_coef, vf_coef, gae_coef=0.0, **kwargs):
obs, actions, done, reward = tm.observation, tm.action, tm.done[:, 1:], tm.reward[:, 1:]
out = network(obs, actions, tm.hidden_actor[:, 0], tm.hidden_critic[:, 0])
logits = out[nms.LOGITS][:, :-1] # last one only needed for v_{t+1}
critic = out[nms.CRITIC]
entropy_loss = Categorical(logits=logits).entropy().mean(-1)
advantages = self.compute_advantages(critic, reward, done, gamma, gae_coef)
value_loss = advantages.pow(2).mean(-1) # n_agent
# policy loss
log_ap = torch.log_softmax(logits, -1)
log_ap = torch.gather(log_ap, dim=-1, index=actions[:, 1:].unsqueeze(-1)).squeeze()
a2c_loss = -(advantages.detach() * log_ap).mean(-1)
# weighted loss
loss = a2c_loss + vf_coef*value_loss - entropy_coef * entropy_loss
return loss.mean()
def learn(self, tm: MARLActorCriticMemory, **kwargs):
loss = self.actor_critic(tm, self.net, **self.cfg[nms.ALGORITHM], **kwargs)
# remove next_obs, will be added in next iter
self.optimizer.zero_grad()
loss.backward()
torch.nn.utils.clip_grad_norm_(self.net.parameters(), 0.5)
self.optimizer.step()
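
A minimal, self-contained sketch of the advantage computation above on toy tensors (agent-first, time-second layout; all numbers are illustrative). It reproduces the TD-error and GAE recursion from compute_advantages outside the class:

import torch

# One agent, 3 steps; critic holds V(s_0)..V(s_3), i.e. one extra value for v_{t+1}.
critic = torch.tensor([[0.5, 0.4, 0.3, 0.2]])   # 1 x (time + 1)
reward = torch.tensor([[1.0, 0.0, 1.0]])        # 1 x time
done   = torch.tensor([[0.0, 0.0, 1.0]])        # 1 x time
gamma, gae_coef = 0.99, 0.5

# TD errors: delta_t = r_t + gamma * (1 - d_t) * V(s_{t+1}) - V(s_t)
tds = (reward + gamma * (1.0 - done) * critic[:, 1:]) - critic[:, :-1]

# GAE: A_t = delta_t + gamma * lambda * (1 - d_t) * A_{t+1}, accumulated backwards in time
gae, gaes = torch.zeros_like(tds[:, -1]), []
for t in range(tds.shape[1] - 1, -1, -1):
    gae = tds[:, t] + gamma * gae_coef * (1.0 - done[:, t]) * gae
    gaes.insert(0, gae)
print(torch.stack(gaes, dim=1))   # 1 x time advantage estimates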

View File

@@ -0,0 +1,24 @@
agent:
classname: algorithms.marl.networks.RecurrentAC
n_agents: 2
obs_emb_size: 96
action_emb_size: 16
hidden_size_actor: 64
hidden_size_critic: 64
use_agent_embedding: False
env:
classname: environments.factory.make
env_name: "DirtyFactory-v0"
n_agents: 2
max_steps: 250
pomdp_r: 2
stack_n_frames: 0
individual_rewards: True
method: algorithms.marl.LoopSEAC
algorithm:
gamma: 0.99
entropy_coef: 0.01
vf_coef: 0.5
n_steps: 5
max_steps: 1000000
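
A hedged sketch of how a config like the one above might drive a run, assuming the dotted paths in the YAML resolve on sys.path and using the helpers from marl_factory_grid.algorithms.utils shown later in this diff; the file name example_config.yaml is illustrative:

from pathlib import Path
from marl_factory_grid.algorithms.utils import load_yaml_file, load_class

cfg = load_yaml_file(Path('example_config.yaml'))   # illustrative path
loop_cls = load_class(cfg['method'])                # e.g. algorithms.marl.LoopSEAC
study = loop_cls(cfg)                               # __init__ runs add_env_props() and setup()
results = study.train_loop()                        # pandas DataFrame of per-episode rewards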

View File

@@ -0,0 +1,57 @@
import torch
from marl_factory_grid.algorithms.marl.base_ac import BaseActorCritic, nms
from marl_factory_grid.algorithms.utils import instantiate_class
from pathlib import Path
from natsort import natsorted
from marl_factory_grid.algorithms.marl.memory import MARLActorCriticMemory
class LoopIAC(BaseActorCritic):
def __init__(self, cfg):
super(LoopIAC, self).__init__(cfg)
def setup(self):
self.net = [
instantiate_class(self.cfg[nms.AGENT]) for _ in range(self.n_agents)
]
self.optimizer = [
torch.optim.RMSprop(self.net[ag_i].parameters(), lr=3e-4, eps=1e-5) for ag_i in range(self.n_agents)
]
def load_state_dict(self, path: Path):
paths = natsorted(list(path.glob('*.pt')))
for path, net in zip(paths, self.net):
net.load_state_dict(torch.load(path))
@staticmethod
def merge_dicts(ds): # todo could be recursive for more than 1 hierarchy
d = {}
for k in ds[0].keys():
d[k] = [d_i[k] for d_i in ds]
return d
def init_hidden(self):
ha = [net.init_hidden_actor() for net in self.net]
hc = [net.init_hidden_critic() for net in self.net]
return dict(hidden_actor=ha, hidden_critic=hc)
def forward(self, observations, actions, hidden_actor, hidden_critic):
outputs = [
net(
self._as_torch(observations[ag_i]).unsqueeze(0).unsqueeze(0), # agent x time
self._as_torch(actions[ag_i]).unsqueeze(0),
hidden_actor[ag_i],
hidden_critic[ag_i]
) for ag_i, net in enumerate(self.net)
]
return self.merge_dicts(outputs)
def learn(self, tms: MARLActorCriticMemory, **kwargs):
for ag_i in range(self.n_agents):
tm, net = tms(ag_i), self.net[ag_i]
loss = self.actor_critic(tm, net, **self.cfg[nms.ALGORITHM], **kwargs)
self.optimizer[ag_i].zero_grad()
loss.backward()
torch.nn.utils.clip_grad_norm_(net.parameters(), 0.5)
self.optimizer[ag_i].step()
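
A tiny illustration (toy values) of what merge_dicts produces when the per-agent forward outputs above are combined into one dict of lists:

from marl_factory_grid.algorithms.marl.iac import LoopIAC

per_agent_outputs = [
    {'logits': 'logits_agent0', 'critic': 'value_agent0'},
    {'logits': 'logits_agent1', 'critic': 'value_agent1'},
]
merged = LoopIAC.merge_dicts(per_agent_outputs)
print(merged)   # {'logits': ['logits_agent0', 'logits_agent1'], 'critic': ['value_agent0', 'value_agent1']}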

View File

@@ -0,0 +1,66 @@
from marl_factory_grid.algorithms.marl.base_ac import Names as nms
from marl_factory_grid.algorithms.marl.snac import LoopSNAC
from marl_factory_grid.algorithms.marl.memory import MARLActorCriticMemory
import torch
from torch.distributions import Categorical
from marl_factory_grid.algorithms.utils import instantiate_class
class LoopMAPPO(LoopSNAC):
def __init__(self, *args, **kwargs):
super(LoopMAPPO, self).__init__(*args, **kwargs)
self.reset_memory_after_epoch = False
def setup(self):
self.net = instantiate_class(self.cfg[nms.AGENT])
self.optimizer = torch.optim.Adam(self.net.parameters(), lr=3e-4, eps=1e-5)
def learn(self, tm: MARLActorCriticMemory, **kwargs):
if len(tm) >= self.cfg['algorithm']['buffer_size']:
# only learn when buffer is full
for batch_i in range(self.cfg['algorithm']['n_updates']):
batch = tm.chunk_dataloader(chunk_len=self.cfg['algorithm']['n_steps'],
k=self.cfg['algorithm']['batch_size'])
loss = self.mappo(batch, self.net, **self.cfg[nms.ALGORITHM], **kwargs)
self.optimizer.zero_grad()
loss.backward()
torch.nn.utils.clip_grad_norm_(self.net.parameters(), 0.5)
self.optimizer.step()
def monte_carlo_returns(self, rewards, done, gamma):
rewards_ = []
discounted_reward = torch.zeros_like(rewards[:, -1])
for t in range(rewards.shape[1]-1, -1, -1):
discounted_reward = rewards[:, t] + (gamma * (1.0 - done[:, t]) * discounted_reward)
rewards_.insert(0, discounted_reward)
rewards_ = torch.stack(rewards_, dim=1)
return rewards_
def mappo(self, batch, network, gamma, entropy_coef, vf_coef, clip_range, **kwargs):
out = network(batch[nms.OBSERVATION], batch[nms.ACTION], batch[nms.HIDDEN_ACTOR], batch[nms.HIDDEN_CRITIC])
logits = out[nms.LOGITS][:, :-1] # last one only needed for v_{t+1}
old_log_probs = torch.log_softmax(batch[nms.LOGITS], -1)
old_log_probs = torch.gather(old_log_probs, index=batch[nms.ACTION][:, 1:].unsqueeze(-1), dim=-1).squeeze()
# monte carlo returns
mc_returns = self.monte_carlo_returns(batch[nms.REWARD], batch[nms.DONE], gamma)
mc_returns = (mc_returns - mc_returns.mean()) / (mc_returns.std() + 1e-8) #todo: norm across agent ok?
advantages = mc_returns - out[nms.CRITIC][:, :-1]
# policy loss
log_ap = torch.log_softmax(logits, -1)
log_ap = torch.gather(log_ap, dim=-1, index=batch[nms.ACTION][:, 1:].unsqueeze(-1)).squeeze()
ratio = (log_ap - old_log_probs).exp()
surr1 = ratio * advantages.detach()
surr2 = torch.clamp(ratio, 1 - clip_range, 1 + clip_range) * advantages.detach()
policy_loss = -torch.min(surr1, surr2).mean(-1)
# entropy & value loss
entropy_loss = Categorical(logits=logits).entropy().mean(-1)
value_loss = advantages.pow(2).mean(-1) # n_agent
# weighted loss
loss = policy_loss + vf_coef*value_loss - entropy_coef * entropy_loss
return loss.mean()
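
A small worked example of the discounted Monte-Carlo return used above, on toy rewards with gamma = 0.99 and an episode boundary marked by done:

import torch

rewards = torch.tensor([[1.0, 0.0, 2.0]])   # 1 agent x 3 steps
done    = torch.tensor([[0.0, 0.0, 1.0]])
gamma = 0.99

returns, discounted = [], torch.zeros_like(rewards[:, -1])
for t in range(rewards.shape[1] - 1, -1, -1):
    discounted = rewards[:, t] + gamma * (1.0 - done[:, t]) * discounted
    returns.insert(0, discounted)
print(torch.stack(returns, dim=1))
# tensor([[2.9602, 1.9800, 2.0000]]) -> R_2 = 2, R_1 = 0.99 * 2, R_0 = 1 + 0.99 * 1.98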

View File

@@ -0,0 +1,221 @@
import numpy as np
from collections import deque
import torch
from typing import Union
from torch import Tensor
from torch.utils.data import Dataset, ConcatDataset
import random
class ActorCriticMemory(object):
def __init__(self, capacity=10):
self.capacity = capacity
self.reset()
def reset(self):
self.__actions = LazyTensorFiFoQueue(maxlen=self.capacity+1)
self.__hidden_actor = LazyTensorFiFoQueue(maxlen=self.capacity+1)
self.__hidden_critic = LazyTensorFiFoQueue(maxlen=self.capacity+1)
self.__states = LazyTensorFiFoQueue(maxlen=self.capacity+1)
self.__rewards = LazyTensorFiFoQueue(maxlen=self.capacity+1)
self.__dones = LazyTensorFiFoQueue(maxlen=self.capacity+1)
self.__logits = LazyTensorFiFoQueue(maxlen=self.capacity+1)
self.__values = LazyTensorFiFoQueue(maxlen=self.capacity+1)
def __len__(self):
return len(self.__rewards) - 1
@property
def observation(self, sls=slice(0, None)): # add time dimension through stacking
return self.__states[sls].unsqueeze(0) # 1 x time x hidden dim
@property
def hidden_actor(self, sls=slice(0, None)): # 1 x n_layers x dim
return self.__hidden_actor[sls].unsqueeze(0) # 1 x time x n_layers x dim
@property
def hidden_critic(self, sls=slice(0, None)): # 1 x n_layers x dim
return self.__hidden_critic[sls].unsqueeze(0) # 1 x time x n_layers x dim
@property
def reward(self, sls=slice(0, None)):
return self.__rewards[sls].squeeze().unsqueeze(0) # 1 x time
@property
def action(self, sls=slice(0, None)):
return self.__actions[sls].long().squeeze().unsqueeze(0) # 1 x time
@property
def done(self, sls=slice(0, None)):
return self.__dones[sls].float().squeeze().unsqueeze(0) # 1 x time
@property
def logits(self, sls=slice(0, None)): # assumes a trailing 1 for time dimension - common when using output from NN
return self.__logits[sls].squeeze().unsqueeze(0) # 1 x time x actions
@property
def values(self, sls=slice(0, None)):
return self.__values[sls].squeeze().unsqueeze(0) # 1 x time x actions
def add_observation(self, state: Union[Tensor, np.ndarray]):
self.__states.append(state if isinstance(state, Tensor) else torch.from_numpy(state))
def add_hidden_actor(self, hidden: Tensor):
# layers x hidden dim
self.__hidden_actor.append(hidden)
def add_hidden_critic(self, hidden: Tensor):
# layers x hidden dim
self.__hidden_critic.append(hidden)
def add_action(self, action: Union[int, Tensor]):
if not isinstance(action, Tensor):
action = torch.tensor(action)
self.__actions.append(action)
def add_reward(self, reward: Union[float, Tensor]):
if not isinstance(reward, Tensor):
reward = torch.tensor(reward)
self.__rewards.append(reward)
def add_done(self, done: bool):
if not isinstance(done, Tensor):
done = torch.tensor(done)
self.__dones.append(done)
def add_logits(self, logits: Tensor):
self.__logits.append(logits)
def add_values(self, values: Tensor):
self.__values.append(values)
def add(self, **kwargs):
for k, v in kwargs.items():
func = getattr(ActorCriticMemory, f'add_{k}')
func(self, v)
class MARLActorCriticMemory(object):
def __init__(self, n_agents, capacity):
self.n_agents = n_agents
self.memories = [
ActorCriticMemory(capacity) for _ in range(n_agents)
]
def __call__(self, agent_i):
return self.memories[agent_i]
def __len__(self):
return len(self.memories[0]) # todo add assertion check!
def reset(self):
for mem in self.memories:
mem.reset()
def add(self, **kwargs):
for agent_i in range(self.n_agents):
for k, v in kwargs.items():
func = getattr(ActorCriticMemory, f'add_{k}')
func(self.memories[agent_i], v[agent_i])
def __getattr__(self, attr):
all_attrs = [getattr(mem, attr) for mem in self.memories]
return torch.cat(all_attrs, 0) # agent x time ...
def chunk_dataloader(self, chunk_len, k):
datasets = [ExperienceChunks(mem, chunk_len, k) for mem in self.memories]
dataset = ConcatDataset(datasets)
data = [dataset[i] for i in range(len(dataset))]
data = custom_collate_fn(data)
return data
def custom_collate_fn(batch):
elem = batch[0]
return {key: torch.cat([d[key] for d in batch], dim=0) for key in elem}
class ExperienceChunks(Dataset):
def __init__(self, memory, chunk_len, k):
assert chunk_len <= len(memory), 'chunk_len cannot be longer than the size of the memory'
self.memory = memory
self.chunk_len = chunk_len
self.k = k
@property
def whitelist(self):
whitelist = torch.ones(len(self.memory) - self.chunk_len)
for d in self.memory.done.squeeze().nonzero().flatten():
whitelist[max((0, d-self.chunk_len-1)):d+2] = 0
whitelist[0] = 0
return whitelist.tolist()
def sample(self, start=1):
cl = self.chunk_len
sample = dict(observation=self.memory.observation[:, start:start+cl+1],
action=self.memory.action[:, start-1:start+cl],
hidden_actor=self.memory.hidden_actor[:, start-1],
hidden_critic=self.memory.hidden_critic[:, start-1],
reward=self.memory.reward[:, start:start + cl],
done=self.memory.done[:, start:start + cl],
logits=self.memory.logits[:, start:start + cl],
values=self.memory.values[:, start:start + cl])
return sample
def __len__(self):
return self.k
def __getitem__(self, i):
idx = random.choices(range(0, len(self.memory) - self.chunk_len), weights=self.whitelist, k=1)
return self.sample(idx[0])
class LazyTensorFiFoQueue:
def __init__(self, maxlen):
self.maxlen = maxlen
self.reset()
def reset(self):
self.__lazy_queue = deque(maxlen=self.maxlen)
self.shape = None
self.queue = None
def shape_init(self, tensor: Tensor):
self.shape = torch.Size([self.maxlen, *tensor.shape])
def build_tensor_queue(self):
if len(self.__lazy_queue) > 0:
block = torch.stack(list(self.__lazy_queue), dim=0)
l = block.shape[0]
if self.queue is None:
self.queue = block
elif self.true_len() <= self.maxlen:
self.queue = torch.cat((self.queue, block), dim=0)
else:
self.queue = torch.cat((self.queue[l:], block), dim=0)
self.__lazy_queue.clear()
def append(self, data):
if self.shape is None:
self.shape_init(data)
self.__lazy_queue.append(data)
if len(self.__lazy_queue) >= self.maxlen:
self.build_tensor_queue()
def true_len(self):
return len(self.__lazy_queue) + (0 if self.queue is None else self.queue.shape[0])
def __len__(self):
return min((self.true_len(), self.maxlen))
def __str__(self):
return f'LazyTensorFiFoQueue\tmaxlen: {self.maxlen}, shape: {self.shape}, ' \
f'len: {len(self)}, true_len: {self.true_len()}, elements in lazy queue: {len(self.__lazy_queue)}'
def __getitem__(self, item_or_slice):
self.build_tensor_queue()
return self.queue[item_or_slice]
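
A short usage sketch of LazyTensorFiFoQueue, showing that appended tensors are stacked lazily and the queue can then be indexed like a single time-major tensor:

import torch
from marl_factory_grid.algorithms.marl.memory import LazyTensorFiFoQueue

queue = LazyTensorFiFoQueue(maxlen=4)
for step in range(6):                    # push more elements than maxlen
    queue.append(torch.full((2,), float(step)))
print(len(queue), queue.shape)           # 4 torch.Size([4, 2])
print(queue[:])                          # the 4 most recently appended tensors, stacked along dim 0
print(queue[-1])                         # the most recent element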

View File

@@ -0,0 +1,104 @@
import torch
import torch.nn as nn
import numpy as np
import torch.nn.functional as F
from torch.nn.utils import spectral_norm
class RecurrentAC(nn.Module):
def __init__(self, observation_size, n_actions, obs_emb_size,
action_emb_size, hidden_size_actor, hidden_size_critic,
n_agents, use_agent_embedding=True):
super(RecurrentAC, self).__init__()
observation_size = np.prod(observation_size)
self.n_layers = 1
self.n_actions = n_actions
self.use_agent_embedding = use_agent_embedding
self.hidden_size_actor = hidden_size_actor
self.hidden_size_critic = hidden_size_critic
self.action_emb_size = action_emb_size
self.obs_proj = nn.Linear(observation_size, obs_emb_size)
self.action_emb = nn.Embedding(n_actions+1, action_emb_size, padding_idx=0)
self.agent_emb = nn.Embedding(n_agents, action_emb_size)
mix_in_size = obs_emb_size+action_emb_size if not use_agent_embedding else obs_emb_size+n_agents*action_emb_size
self.mix = nn.Sequential(nn.Tanh(),
nn.Linear(mix_in_size, obs_emb_size),
nn.Tanh(),
nn.Linear(obs_emb_size, obs_emb_size)
)
self.gru_actor = nn.GRU(obs_emb_size, hidden_size_actor, batch_first=True, num_layers=self.n_layers)
self.gru_critic = nn.GRU(obs_emb_size, hidden_size_critic, batch_first=True, num_layers=self.n_layers)
self.action_head = nn.Sequential(
nn.Linear(hidden_size_actor, hidden_size_actor),
nn.Tanh(),
nn.Linear(hidden_size_actor, n_actions)
)
# spectral_norm(nn.Linear(hidden_size_actor, hidden_size_actor)),
self.critic_head = nn.Sequential(
nn.Linear(hidden_size_critic, hidden_size_critic),
nn.Tanh(),
nn.Linear(hidden_size_critic, 1)
)
#self.action_head[-1].weight.data.uniform_(-3e-3, 3e-3)
#self.action_head[-1].bias.data.uniform_(-3e-3, 3e-3)
def init_hidden_actor(self):
return torch.zeros(1, self.n_layers, self.hidden_size_actor)
def init_hidden_critic(self):
return torch.zeros(1, self.n_layers, self.hidden_size_critic)
def forward(self, observations, actions, hidden_actor=None, hidden_critic=None):
n_agents, t, *_ = observations.shape
obs_emb = self.obs_proj(observations.view(n_agents, t, -1).float())
action_emb = self.action_emb(actions+1) # shift by one due to padding idx
if not self.use_agent_embedding:
x_t = torch.cat((obs_emb, action_emb), -1)
else:
agent_emb = self.agent_emb(
torch.cat([torch.arange(0, n_agents, 1).view(-1, 1)] * t, 1)
)
x_t = torch.cat((obs_emb, agent_emb, action_emb), -1)
mixed_x_t = self.mix(x_t)
output_p, _ = self.gru_actor(input=mixed_x_t, hx=hidden_actor.swapaxes(1, 0))
output_c, _ = self.gru_critic(input=mixed_x_t, hx=hidden_critic.swapaxes(1, 0))
logits = self.action_head(output_p)
critic = self.critic_head(output_c).squeeze(-1)
return dict(logits=logits, critic=critic, hidden_actor=output_p, hidden_critic=output_c)
class RecurrentACL2(RecurrentAC):
def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
self.action_head = nn.Sequential(
nn.Linear(self.hidden_size_actor, self.hidden_size_actor),
nn.Tanh(),
NormalizedLinear(self.hidden_size_actor, self.n_actions, trainable_magnitude=True)
)
class NormalizedLinear(nn.Linear):
def __init__(self, in_features: int, out_features: int,
device=None, dtype=None, trainable_magnitude=False):
super(NormalizedLinear, self).__init__(in_features, out_features, False, device, dtype)
self.d_sqrt = in_features**0.5
self.trainable_magnitude = trainable_magnitude
self.scale = nn.Parameter(torch.tensor([1.]), requires_grad=trainable_magnitude)
def forward(self, input):
normalized_input = F.normalize(input, dim=-1, p=2, eps=1e-5)
normalized_weight = F.normalize(self.weight, dim=-1, p=2, eps=1e-5)
return F.linear(normalized_input, normalized_weight) * self.d_sqrt * self.scale
class L2Norm(nn.Module):
def __init__(self, in_features, trainable_magnitude=False):
super(L2Norm, self).__init__()
self.d_sqrt = in_features**0.5
self.scale = nn.Parameter(torch.tensor([1.]), requires_grad=trainable_magnitude)
def forward(self, x):
return F.normalize(x, dim=-1, p=2, eps=1e-5) * self.d_sqrt * self.scale
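
A shape-level sketch of one RecurrentAC forward pass with toy dimensions, assuming the module lives at marl_factory_grid.algorithms.marl.networks; it follows the agent-first, time-second convention used by the training loops:

import torch
from marl_factory_grid.algorithms.marl.networks import RecurrentAC

n_agents, t, obs_size, n_actions = 2, 5, (3, 5, 5), 10
net = RecurrentAC(observation_size=obs_size, n_actions=n_actions,
                  obs_emb_size=96, action_emb_size=16,
                  hidden_size_actor=64, hidden_size_critic=64,
                  n_agents=n_agents, use_agent_embedding=False)

obs = torch.randn(n_agents, t, *obs_size)               # agents x time x observation
actions = torch.randint(-1, n_actions, (n_agents, t))   # -1 is the "no previous action" padding value
h_a = torch.cat([net.init_hidden_actor()] * n_agents, 0)    # agents x n_layers x hidden
h_c = torch.cat([net.init_hidden_critic()] * n_agents, 0)

out = net(obs, actions, h_a, h_c)
print(out['logits'].shape, out['critic'].shape)   # torch.Size([2, 5, 10]) torch.Size([2, 5])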

View File

@@ -0,0 +1,56 @@
import torch
from torch.distributions import Categorical
from marl_factory_grid.algorithms.marl.iac import LoopIAC
from marl_factory_grid.algorithms.marl.base_ac import nms
from marl_factory_grid.algorithms.marl.memory import MARLActorCriticMemory
class LoopSEAC(LoopIAC):
def __init__(self, cfg):
super(LoopSEAC, self).__init__(cfg)
def actor_critic(self, tm, networks, gamma, entropy_coef, vf_coef, gae_coef=0.0, **kwargs):
obs, actions, done, reward = tm.observation, tm.action, tm.done[:, 1:], tm.reward[:, 1:]
outputs = [net(obs, actions, tm.hidden_actor[:, 0], tm.hidden_critic[:, 0]) for net in networks]
with torch.inference_mode(True):
true_action_logp = torch.stack([
torch.log_softmax(out[nms.LOGITS][ag_i, :-1], -1)
.gather(index=actions[ag_i, 1:, None], dim=-1)
for ag_i, out in enumerate(outputs)
], 0).squeeze()
losses = []
for ag_i, out in enumerate(outputs):
logits = out[nms.LOGITS][:, :-1] # last one only needed for v_{t+1}
critic = out[nms.CRITIC]
entropy_loss = Categorical(logits=logits[ag_i]).entropy().mean()
advantages = self.compute_advantages(critic, reward, done, gamma, gae_coef)
# policy loss
log_ap = torch.log_softmax(logits, -1)
log_ap = torch.gather(log_ap, dim=-1, index=actions[:, 1:].unsqueeze(-1)).squeeze()
# importance weights
iw = (log_ap - true_action_logp).exp().detach() # importance_weights
a2c_loss = (-iw*log_ap * advantages.detach()).mean(-1)
value_loss = (iw*advantages.pow(2)).mean(-1) # n_agent
# weighted loss
loss = (a2c_loss + vf_coef*value_loss - entropy_coef * entropy_loss).mean()
losses.append(loss)
return losses
def learn(self, tms: MARLActorCriticMemory, **kwargs):
losses = self.actor_critic(tms, self.net, **self.cfg[nms.ALGORITHM], **kwargs)
for ag_i, loss in enumerate(losses):
self.optimizer[ag_i].zero_grad()
loss.backward()
torch.nn.utils.clip_grad_norm_(self.net[ag_i].parameters(), 0.5)
self.optimizer[ag_i].step()
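
A toy illustration of the off-policy importance weight SEAC applies when one agent learns from another agent's trajectory (the probabilities are made up):

import torch

log_p_learner = torch.log(torch.tensor([0.60, 0.20]))   # log-prob of the taken actions under the learning agent's policy
log_p_actor = torch.log(torch.tensor([0.50, 0.40]))     # log-prob under the policy that actually acted
iw = (log_p_learner - log_p_actor).exp().detach()       # importance weights, as in actor_critic() above
print(iw)   # tensor([1.2000, 0.5000])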

View File

@@ -0,0 +1,33 @@
from marl_factory_grid.algorithms.marl.base_ac import BaseActorCritic
from marl_factory_grid.algorithms.marl.base_ac import nms
import torch
from torch.distributions import Categorical
from pathlib import Path
class LoopSNAC(BaseActorCritic):
def __init__(self, cfg):
super().__init__(cfg)
def load_state_dict(self, path: Path):
path2weights = list(path.glob('*.pt'))
assert len(path2weights) == 1, f'Expected a single set of weights but got {len(path2weights)}'
self.net.load_state_dict(torch.load(path2weights[0]))
def init_hidden(self):
hidden_actor = self.net.init_hidden_actor()
hidden_critic = self.net.init_hidden_critic()
return dict(hidden_actor=torch.cat([hidden_actor] * self.n_agents, 0),
hidden_critic=torch.cat([hidden_critic] * self.n_agents, 0)
)
def get_actions(self, out):
actions = Categorical(logits=out[nms.LOGITS]).sample().squeeze()
return actions
def forward(self, observations, actions, hidden_actor, hidden_critic):
out = self.net(self._as_torch(observations).unsqueeze(1),
self._as_torch(actions).unsqueeze(1),
hidden_actor, hidden_critic
)
return out
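
For contrast with the per-agent sampling in BaseActorCritic.get_actions, a toy sketch of how LoopSNAC.get_actions samples one action per agent from the shared network's stacked logits:

import torch
from torch.distributions import Categorical

logits = torch.randn(2, 1, 4)                       # agents x time x actions (toy values)
actions = Categorical(logits=logits).sample().squeeze()
print(actions.shape)                                # torch.Size([2]) -> one action per agent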

View File

@@ -0,0 +1,130 @@
import itertools
from random import choice
import numpy as np
import networkx as nx
from networkx.algorithms.approximation import traveling_salesman as tsp
from marl_factory_grid.modules.doors import constants as do
from marl_factory_grid.environment import constants as c
from marl_factory_grid.utils.helpers import MOVEMAP
from abc import abstractmethod, ABC
future_planning = 7
def points_to_graph(coordiniates_or_tiles, allow_euclidean_connections=True, allow_manhattan_connections=True):
"""
Given a set of coordinates, this function constructs an undirected graph by connecting adjacent points.
There are three combinations of settings:
Allow all neighbors: Distance(a, b) <= sqrt(2)
Allow only manhattan: Distance(a, b) == 1
Allow only euclidean: Distance(a, b) == sqrt(2)
:param coordiniates_or_tiles: A set of coordinates.
:type coordiniates_or_tiles: Tiles
:param allow_euclidean_connections: Whether to regard diagonally adjacent cells as neighbors
:type: bool
:param allow_manhattan_connections: Whether to regard directly adjacent cells as neighbors
:type: bool
:return: A graph with nodes that are connected as specified by the parameters.
:rtype: nx.Graph
"""
assert allow_euclidean_connections or allow_manhattan_connections
if hasattr(coordiniates_or_tiles, 'positions'):
coordiniates_or_tiles = coordiniates_or_tiles.positions
possible_connections = itertools.combinations(coordiniates_or_tiles, 2)
graph = nx.Graph()
for a, b in possible_connections:
diff = np.linalg.norm(np.asarray(a)-np.asarray(b))
if allow_manhattan_connections and allow_euclidean_connections and diff <= np.sqrt(2):
graph.add_edge(a, b)
elif not allow_manhattan_connections and allow_euclidean_connections and diff == np.sqrt(2):
graph.add_edge(a, b)
elif allow_manhattan_connections and not allow_euclidean_connections and diff == 1:
graph.add_edge(a, b)
return graph
class TSPBaseAgent(ABC):
def __init__(self, state, agent_i, static_problem: bool = True):
self.static_problem = static_problem
self.local_optimization = True
self._env = state
self.state = self._env.state[c.AGENT][agent_i]
self._floortile_graph = points_to_graph(self._env[c.FLOOR].positions)
self._static_route = None
@abstractmethod
def predict(self, *_, **__) -> int:
return 0
def _use_door_or_move(self, door, target):
if door.is_closed:
# Translate the action_object to an integer to have the same output as any other model
action = do.ACTION_DOOR_USE
else:
action = self._predict_move(target)
return action
def calculate_tsp_route(self, target_identifier):
positions = [x for x in self._env.state[target_identifier].positions if x != c.VALUE_NO_POS]
if self.local_optimization:
nodes = \
[self.state.pos] + \
[x for x in positions if max(abs(np.subtract(x, self.state.pos))) < 3]
try:
while len(nodes) < 7:
nodes += [next(x for x in positions if x not in nodes)]
except StopIteration:
nodes = [self.state.pos] + positions
else:
nodes = [self.state.pos] + positions
route = tsp.traveling_salesman_problem(self._floortile_graph,
nodes=nodes, cycle=True, method=tsp.greedy_tsp)
return route
def _door_is_close(self):
try:
return next(y for x in self.state.tile.neighboring_floor for y in x.guests if do.DOOR in y.name)
except StopIteration:
return None
def _has_targets(self, target_identifier):
return bool(len([x for x in self._env.state[target_identifier] if x.pos != c.VALUE_NO_POS]) >= 1)
def _predict_move(self, target_identifier):
if self._has_targets(target_identifier):
if self.static_problem:
if not self._static_route:
self._static_route = self.calculate_tsp_route(target_identifier)
else:
pass
next_pos = self._static_route.pop(0)
while next_pos == self.state.pos:
next_pos = self._static_route.pop(0)
else:
if not self._static_route:
self._static_route = self.calculate_tsp_route(target_identifier)[:7]
next_pos = self._static_route.pop(0)
while next_pos == self.state.pos:
next_pos = self._static_route.pop(0)
diff = np.subtract(next_pos, self.state.pos)
# Retrieve the action based on the position difference (i.e.: what do I have to do to get there?)
try:
action = next(action for action, pos_diff in MOVEMAP.items() if np.all(diff == pos_diff))
except StopIteration:
print(f'diff: {diff}')
print('This Should not happen!')
action = choice(self.state.actions).name
else:
action = choice(self.state.actions).name
# noinspection PyUnboundLocalVariable
return action
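
A small usage sketch of points_to_graph on a hand-made 2x2 block of coordinates, showing the effect of the connection settings described in its docstring:

from marl_factory_grid.algorithms.static.TSP_base_agent import points_to_graph

coords = [(1, 1), (1, 2), (2, 1), (2, 2)]
g_all = points_to_graph(coords)                                          # manhattan and diagonal neighbors
g_manhattan = points_to_graph(coords, allow_euclidean_connections=False)
print(g_all.number_of_edges(), g_manhattan.number_of_edges())            # 6 4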

View File

@@ -0,0 +1,27 @@
from marl_factory_grid.algorithms.static.TSP_base_agent import TSPBaseAgent
from marl_factory_grid.modules.clean_up import constants as di
future_planning = 7
class TSPDirtAgent(TSPBaseAgent):
def __init__(self, *args, **kwargs):
super(TSPDirtAgent, self).__init__(*args, **kwargs)
def predict(self, *_, **__):
if self._env.state[di.DIRT].by_pos(self.state.pos) is not None:
# Translate the action_object to an integer to have the same output as any other model
action = di.CLEAN_UP
elif door := self._door_is_close():
action = self._use_door_or_move(door, di.DIRT)
else:
action = self._predict_move(di.DIRT)
# Translate the action_object to an integer to have the same output as any other model
try:
action_obj = next(action_i for action_i, a in enumerate(self.state.actions) if a.name == action)
except (StopIteration, UnboundLocalError):
print('Will not happen')
raise EnvironmentError
return action_obj

View File

@@ -0,0 +1,59 @@
import numpy as np
from marl_factory_grid.algorithms.static.TSP_base_agent import TSPBaseAgent
from marl_factory_grid.modules.items import constants as i
future_planning = 7
inventory_size = 3
MODE_GET = 'Mode_Get'
MODE_BRING = 'Mode_Bring'
class TSPItemAgent(TSPBaseAgent):
def __init__(self, *args, mode=MODE_GET, **kwargs):
super(TSPItemAgent, self).__init__(*args, **kwargs)
self.mode = mode
def predict(self, *_, **__):
if self._env.state[i.ITEM].by_pos(self.state.pos) is not None:
# Translate the action_object to an integer to have the same output as any other model
action = i.ITEM_ACTION
elif self._env.state[i.DROP_OFF].by_pos(self.state.pos) is not None:
# Translate the action_object to an integer to have the same output as any other model
action = i.ITEM_ACTION
elif door := self._door_is_close():
action = self._use_door_or_move(door, i.DROP_OFF if self.mode == MODE_BRING else i.ITEM)
else:
action = self._choose()
# Translate the action_object to an integer to have the same output as any other model
try:
action_obj = next(action_i for action_i, a in enumerate(self.state.actions) if a.name == action)
except (StopIteration, UnboundLocalError):
print('Will not happen')
raise EnvironmentError
# noinspection PyUnboundLocalVariable
if self.mode == MODE_BRING and len(self._env[i.INVENTORY].by_entity(self.state)):
pass
elif self.mode == MODE_BRING and not len(self._env[i.INVENTORY].by_entity(self.state)):
self.mode = MODE_GET
elif self.mode == MODE_GET and len(self._env[i.INVENTORY].by_entity(self.state)) > inventory_size:
self.mode = MODE_BRING
else:
pass
return action_obj
def _choose(self):
target = i.DROP_OFF if self.mode == MODE_BRING else i.ITEM
if len(self._env.state[i.ITEM]) >= 1:
action = self._predict_move(target)
elif len(self._env[i.INVENTORY].by_entity(self.state)):
self.mode = MODE_BRING
action = self._predict_move(target)
else:
action = int(np.random.randint(self._env.action_space.n))
# noinspection PyUnboundLocalVariable
return action

View File

@@ -0,0 +1,32 @@
from marl_factory_grid.algorithms.static.TSP_base_agent import TSPBaseAgent
from marl_factory_grid.modules.destinations import constants as d
from marl_factory_grid.modules.doors import constants as do
future_planning = 7
class TSPTargetAgent(TSPBaseAgent):
def __init__(self, *args, **kwargs):
super(TSPTargetAgent, self).__init__(*args, **kwargs)
def _handle_doors(self):
try:
return next(y for x in self.state.tile.neighboring_floor for y in x.guests if do.DOOR in y.name)
except StopIteration:
return None
def predict(self, *_, **__):
if door := self._door_is_close():
action = self._use_door_or_move(door, d.DESTINATION)
else:
action = self._predict_move(d.DESTINATION)
# Translate the action_object to an integer to have the same output as any other model
try:
action_obj = next(action_i for action_i, a in enumerate(self.state.actions) if a.name == action)
except (StopIteration, UnboundLocalError):
print('Will not happen')
return action_obj

View File

@@ -0,0 +1,15 @@
from random import randint
from marl_factory_grid.algorithms.static.TSP_base_agent import TSPBaseAgent
future_planning = 7
class TSPRandomAgent(TSPBaseAgent):
def __init__(self, n_actions, *args, **kwargs):
super(TSPRandomAgent, self).__init__(*args, **kwargs)
self.n_action = n_actions
def predict(self, *_, **__):
return randint(0, self.n_action - 1)

View File

@@ -0,0 +1,85 @@
import torch
import numpy as np
import yaml
from pathlib import Path
def load_class(classname):
from importlib import import_module
module_path, class_name = classname.rsplit(".", 1)
module = import_module(module_path)
c = getattr(module, class_name)
return c
def instantiate_class(arguments):
from importlib import import_module
d = dict(arguments)
classname = d["classname"]
del d["classname"]
module_path, class_name = classname.rsplit(".", 1)
module = import_module(module_path)
c = getattr(module, class_name)
return c(**d)
def get_class(arguments):
from importlib import import_module
if isinstance(arguments, dict):
classname = arguments["classname"]
module_path, class_name = classname.rsplit(".", 1)
module = import_module(module_path)
c = getattr(module, class_name)
return c
else:
classname = arguments.classname
module_path, class_name = classname.rsplit(".", 1)
module = import_module(module_path)
c = getattr(module, class_name)
return c
def get_arguments(arguments):
from importlib import import_module
d = dict(arguments)
if "classname" in d:
del d["classname"]
return d
def load_yaml_file(path: Path):
with path.open() as stream:
cfg = yaml.load(stream, Loader=yaml.FullLoader)
return cfg
def add_env_props(cfg):
env = instantiate_class(cfg['environment'].copy())
cfg['agent'].update(dict(observation_size=list(env.observation_space.shape),
n_actions=env.action_space.n))
class Checkpointer(object):
def __init__(self, experiment_name, root, config, total_steps, n_checkpoints):
self.path = root / experiment_name
self.checkpoint_indices = list(np.linspace(1, total_steps, n_checkpoints, dtype=int) - 1)
self.__current_checkpoint = 0
self.__current_step = 0
self.path.mkdir(exist_ok=True, parents=True)
with (self.path / 'config.yaml').open('w') as outfile:
yaml.dump(config, outfile, default_flow_style=False)
def save_experiment(self, name: str, model):
cpt_path = self.path / f'checkpoint_{self.__current_checkpoint}'
cpt_path.mkdir(exist_ok=True, parents=True)
torch.save(model.state_dict(), cpt_path / f'{name}.pt')
def step(self, to_save):
if self.__current_step in self.checkpoint_indices:
print(f'Checkpointing #{self.__current_checkpoint}')
for name, model in to_save:
self.save_experiment(name, model)
self.__current_checkpoint += 1
self.__current_step += 1
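
A brief sketch of the reflection helpers and the Checkpointer above, using a standard torch class so the dotted path resolves anywhere; the experiment name and root directory are illustrative:

from pathlib import Path
from marl_factory_grid.algorithms.utils import instantiate_class, Checkpointer

# instantiate_class: 'classname' is a dotted import path, the remaining keys are constructor kwargs.
layer = instantiate_class({'classname': 'torch.nn.Linear', 'in_features': 4, 'out_features': 2})
print(type(layer))   # <class 'torch.nn.modules.linear.Linear'>

# Checkpointer: writes config.yaml on creation, then saves state dicts at evenly spaced steps.
cpt = Checkpointer('demo_run', Path('./studies'), config={'note': 'illustrative'},
                   total_steps=100, n_checkpoints=2)
for _ in range(100):
    cpt.step([('agent#0', layer)])   # checkpoints at step 0 and step 99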