Reset tsp route caching + renamed and moved configs + removed unnecessary files

2025-07-11 23:42:40 +02:00 · 2024-05-24 16:12:05 +02:00
parent 98113ea849
commit c8336e8f78
144 changed files with 86 additions and 8056 deletions
--- a/marl_factory_grid/algorithms/marl/init.py
+++ b/marl_factory_grid/algorithms/marl/init.py
@ -1 +1 @@
-from marl_factory_grid.algorithms.marl.memory import MARLActorCriticMemory
+
--- a/marl_factory_grid/algorithms/marl/a2c_dirt.py
+++ b/marl_factory_grid/algorithms/marl/a2c_dirt.py
@ -11,7 +11,6 @@ import numpy as np
 from torch.distributions import Categorical

 from marl_factory_grid.algorithms.marl.base_a2c import PolicyGradient, cumulate_discount
-from marl_factory_grid.algorithms.marl.memory import MARLActorCriticMemory
 from marl_factory_grid.algorithms.utils import add_env_props, instantiate_class
 from pathlib import Path
 from collections import deque
--- a/marl_factory_grid/algorithms/marl/base_a2c.py
+++ b/marl_factory_grid/algorithms/marl/base_a2c.py
@ -2,8 +2,6 @@ import numpy as np; import torch as th; import scipy as sp;
 from collections import deque
 from torch import nn

-# RLLab Magic for calculating the discounted return G(t) = R(t) + gamma * R(t-1)
-# cf. https://github.com/rll/rllab/blob/ba78e4c16dc492982e648f117875b22af3965579/rllab/misc/special.py#L107
 cumulate_discount = lambda x, gamma: sp.signal.lfilter([1], [1, - gamma], x[::-1], axis=0)[::-1]

 class Net(th.nn.Module):
--- a/marl_factory_grid/algorithms/marl/base_ac.py
+++ b/marl_factory_grid/algorithms/marl/base_ac.py
@ -1,242 +0,0 @@
-import torch
-from typing import Union, List, Dict
-import numpy as np
-from torch.distributions import Categorical
-from marl_factory_grid.algorithms.marl.memory import MARLActorCriticMemory
-from marl_factory_grid.algorithms.utils import add_env_props, instantiate_class
-from pathlib import Path
-import pandas as pd
-from collections import deque
-
-
-class Names:
-    REWARD          = 'reward'
-    DONE            = 'done'
-    ACTION          = 'action'
-    OBSERVATION     = 'observation'
-    LOGITS          = 'logits'
-    HIDDEN_ACTOR    = 'hidden_actor'
-    HIDDEN_CRITIC   = 'hidden_critic'
-    AGENT           = 'agent'
-    ENV             = 'env'
-    ENV_NAME        = 'env_name'
-    N_AGENTS        = 'n_agents'
-    ALGORITHM       = 'algorithm'
-    MAX_STEPS       = 'max_steps'
-    N_STEPS         = 'n_steps'
-    BUFFER_SIZE     = 'buffer_size'
-    CRITIC          = 'critic'
-    BATCH_SIZE      = 'bnatch_size'
-    N_ACTIONS       = 'n_actions'
-    TRAIN_RENDER    = 'train_render'
-    EVAL_RENDER     = 'eval_render'
-
-
-nms = Names
-ListOrTensor = Union[List, torch.Tensor]
-
-
-class BaseActorCritic:
-    def __init__(self, cfg):
-        self.factory = add_env_props(cfg)
-        self.__training = True
-        self.cfg = cfg
-        self.n_agents = cfg[nms.AGENT][nms.N_AGENTS]
-        self.reset_memory_after_epoch = True
-        self.setup()
-
-    def setup(self):
-        self.net = instantiate_class(self.cfg[nms.AGENT])
-        self.optimizer = torch.optim.RMSprop(self.net.parameters(), lr=3e-4, eps=1e-5)
-
-    @classmethod
-    def _as_torch(cls, x):
-        if isinstance(x, np.ndarray):
-            return torch.from_numpy(x)
-        elif isinstance(x, List):
-            return torch.tensor(x)
-        elif isinstance(x, (int, float)):
-            return torch.tensor([x])
-        return x
-
-    def train(self):
-        self.__training = False
-        networks = [self.net] if not isinstance(self.net, List) else self.net
-        for net in networks:
-            net.train()
-
-    def eval(self):
-        self.__training = False
-        networks = [self.net] if not isinstance(self.net, List) else self.net
-        for net in networks:
-            net.eval()
-
-    def load_state_dict(self, path: Path):
-        pass
-
-    def get_actions(self, out) -> ListOrTensor:
-        actions = [Categorical(logits=logits).sample().item() for logits in out[nms.LOGITS]]
-        return actions
-
-    def init_hidden(self) -> Dict[str, ListOrTensor]:
-        pass
-
-    def forward(self,
-                observations:  ListOrTensor,
-                actions:       ListOrTensor,
-                hidden_actor:  ListOrTensor,
-                hidden_critic: ListOrTensor
-                ) -> Dict[str, ListOrTensor]:
-        pass
-
-    @torch.no_grad()
-    def train_loop(self, checkpointer=None):
-        env = self.factory
-        if self.cfg[nms.ENV][nms.TRAIN_RENDER]:
-            env.render()
-        n_steps, max_steps = [self.cfg[nms.ALGORITHM][k] for k in [nms.N_STEPS, nms.MAX_STEPS]]
-        tm = MARLActorCriticMemory(self.n_agents, self.cfg[nms.ALGORITHM].get(nms.BUFFER_SIZE, n_steps))
-        global_steps, episode, df_results = 0, 0, []
-        reward_queue = deque(maxlen=2000)
-
-        while global_steps < max_steps:
-            obs = env.reset()
-            obs = list(obs.values())
-            last_hiddens        = self.init_hidden()
-            last_action, reward = [-1] * self.n_agents, [0.] * self.n_agents
-            done, rew_log       = [False] * self.n_agents, 0
-
-            if self.reset_memory_after_epoch:
-                tm.reset()
-
-            tm.add(observation=obs, action=last_action,
-                   logits=torch.zeros(self.n_agents, 1, self.cfg[nms.AGENT][nms.N_ACTIONS]),
-                   values=torch.zeros(self.n_agents, 1), reward=reward, done=done, **last_hiddens)
-
-            while not all(done):
-                out = self.forward(obs, last_action, **last_hiddens)
-                action = self.get_actions(out)
-                _, next_obs, reward, done, info = env.step(action)
-                done = [done] * self.n_agents if isinstance(done, bool) else done
-
-                if self.cfg[nms.ENV][nms.TRAIN_RENDER]:
-                    env.render()
-
-                last_hiddens = dict(hidden_actor=out[nms.HIDDEN_ACTOR],
-                                    hidden_critic=out[nms.HIDDEN_CRITIC])
-
-                logits = torch.stack([tensor.squeeze(0) for tensor in out.get(nms.LOGITS, None)], dim=0)
-                values = torch.stack([tensor.squeeze(0) for tensor in out.get(nms.CRITIC, None)], dim=0)
-
-                tm.add(observation=obs, action=action, reward=reward, done=done,
-                       logits=logits, values=values,
-                       **last_hiddens)
-
-                obs = next_obs
-                last_action = action
-
-                if (global_steps+1) % n_steps == 0 or all(done):
-                    with torch.inference_mode(False):
-                        self.learn(tm)
-
-                global_steps += 1
-                rew_log += sum(reward)
-                reward_queue.extend(reward)
-
-                if checkpointer is not None:
-                    checkpointer.step([
-                        (f'agent#{i}', agent)
-                        for i, agent in enumerate([self.net] if not isinstance(self.net, List) else self.net)
-                    ])
-
-                if global_steps >= max_steps:
-                    break
-            if global_steps%100 == 0:
-                print(f'reward at episode: {episode} = {rew_log}')
-            episode += 1
-            df_results.append([episode, rew_log, *reward])
-        df_results = pd.DataFrame(df_results,
-                                  columns=['steps', 'reward', *[f'agent#{i}' for i in range(self.n_agents)]]
-                                  )
-        if checkpointer is not None:
-            df_results.to_csv(checkpointer.path / 'results.csv', index=False)
-        return df_results
-
-    @torch.inference_mode(True)
-    def eval_loop(self, n_episodes, render=False):
-        env = self.factory
-        if self.cfg[nms.ENV][nms.EVAL_RENDER]:
-            env.render()
-        episode, results = 0, []
-        while episode < n_episodes:
-            obs = env.reset()
-            obs = list(obs.values())
-            last_hiddens           = self.init_hidden()
-            last_action, reward    = [-1] * self.n_agents, [0.] * self.n_agents
-            done, rew_log, eps_rew = [False] * self.n_agents, 0, torch.zeros(self.n_agents)
-            while not all(done):
-                out    = self.forward(obs, last_action, **last_hiddens)
-                action = self.get_actions(out)
-                _, next_obs, reward, done, info = env.step(action)
-
-                if self.cfg[nms.ENV][nms.EVAL_RENDER]:
-                    env.render()
-
-                if isinstance(done, bool):
-                    done = [done] * obs[0].shape[0]
-                obs = next_obs
-                last_action = action
-                last_hiddens = dict(hidden_actor=out.get(nms.HIDDEN_ACTOR,   None),
-                                    hidden_critic=out.get(nms.HIDDEN_CRITIC, None)
-                                    )
-                eps_rew += torch.tensor(reward)
-            results.append(eps_rew.tolist() + [sum(eps_rew).item()] + [episode])
-            episode += 1
-        agent_columns = [f'agent#{i}' for i in range(self.cfg[nms.ENV][nms.N_AGENTS])]
-        results = pd.DataFrame(results, columns=agent_columns + ['sum', 'episode'])
-        results = pd.melt(results, id_vars=['episode'], value_vars=agent_columns + ['sum'],
-                          value_name='reward', var_name='agent')
-        return results
-
-    @staticmethod
-    def compute_advantages(critic, reward, done, gamma, gae_coef=0.0):
-        tds = (reward + gamma * (1.0 - done) * critic[:, 1:].detach()) - critic[:, :-1]
-
-        if gae_coef <= 0:
-            return tds
-
-        gae = torch.zeros_like(tds[:, -1])
-        gaes = []
-        for t in range(tds.shape[1]-1, -1, -1):
-            gae = tds[:, t] + gamma * gae_coef * (1.0 - done[:, t]) * gae
-            gaes.insert(0, gae)
-        gaes = torch.stack(gaes, dim=1)
-        return gaes
-
-    def actor_critic(self, tm, network, gamma, entropy_coef, vf_coef, gae_coef=0.0, **kwargs):
-        obs, actions, done, reward = tm.observation, tm.action, tm.done[:, 1:], tm.reward[:, 1:]
-
-        out = network(obs, actions, tm.hidden_actor[:, 0].squeeze(0), tm.hidden_critic[:, 0].squeeze(0))
-        logits = out[nms.LOGITS][:, :-1]  # last one only needed for v_{t+1}
-        critic = out[nms.CRITIC]
-
-        entropy_loss = Categorical(logits=logits).entropy().mean(-1)
-        advantages = self.compute_advantages(critic, reward, done, gamma, gae_coef)
-        value_loss = advantages.pow(2).mean(-1)  # n_agent
-
-        # policy loss
-        log_ap = torch.log_softmax(logits, -1)
-        log_ap = torch.gather(log_ap, dim=-1, index=actions[:, 1:].unsqueeze(-1)).squeeze()
-        a2c_loss = -(advantages.detach() * log_ap).mean(-1)
-        # weighted loss
-        loss = a2c_loss + vf_coef*value_loss - entropy_coef * entropy_loss
-        return loss.mean()
-
-    def learn(self, tm: MARLActorCriticMemory, **kwargs):
-        loss = self.actor_critic(tm, self.net, **self.cfg[nms.ALGORITHM], **kwargs)
-        # remove next_obs, will be added in next iter
-        self.optimizer.zero_grad()
-        loss.backward()
-        torch.nn.utils.clip_grad_norm_(self.net.parameters(), 0.5)
-        self.optimizer.step()
-
--- a/marl_factory_grid/algorithms/marl/configs/environment_changes
+++ b/marl_factory_grid/algorithms/marl/configs/environment_changes
@ -1,8 +0,0 @@
-marl_factory_grid>environment>rules.py#SpawnEntity.on_reset()
-marl_factory_grid>environment>rewards.py
-marl_factory_grid>modules>clean_up>groups.py#DirtPiles.trigger_spawn()
-marl_factory_grid>environment>rules.py#AgentSpawnRule
-marl_factory_grid>utils>states.py#GameState.__init__()
-marl_factory_grid>environment>factory.py>Factory#render
-marl_factory_grid>environment>factory.py>Factory#set_recorder
-marl_factory_grid>utils>renderer.py>Renderer#render
--- a/marl_factory_grid/algorithms/marl/iac.py
+++ b/marl_factory_grid/algorithms/marl/iac.py
@ -1,57 +0,0 @@
-import torch
-from marl_factory_grid.algorithms.marl.base_ac import BaseActorCritic, nms
-from marl_factory_grid.algorithms.utils import instantiate_class
-from pathlib import Path
-from natsort import natsorted
-from marl_factory_grid.algorithms.marl.memory import MARLActorCriticMemory
-
-
-class LoopIAC(BaseActorCritic):
-
-    def __init__(self, cfg):
-        super(LoopIAC, self).__init__(cfg)
-
-    def setup(self):
-        self.net = [
-            instantiate_class(self.cfg[nms.AGENT]) for _ in range(self.n_agents)
-        ]
-        self.optimizer = [
-            torch.optim.RMSprop(self.net[ag_i].parameters(), lr=3e-4, eps=1e-5) for ag_i in range(self.n_agents)
-        ]
-
-    def load_state_dict(self, path: Path):
-        paths = natsorted(list(path.glob('*.pt')))
-        for path, net in zip(paths, self.net):
-            net.load_state_dict(torch.load(path))
-
-    @staticmethod
-    def merge_dicts(ds):  # todo could be recursive for more than 1 hierarchy
-        d = {}
-        for k in ds[0].keys():
-            d[k] = [d[k] for d in ds]
-        return d
-
-    def init_hidden(self):
-        ha  = [net.init_hidden_actor()  for net in self.net]
-        hc  = [net.init_hidden_critic() for net in self.net]
-        return dict(hidden_actor=ha, hidden_critic=hc)
-
-    def forward(self, observations, actions, hidden_actor, hidden_critic):
-        outputs = [
-            net(
-                self._as_torch(observations[ag_i]).unsqueeze(0).unsqueeze(0),  # agent x time
-                self._as_torch(actions[ag_i]).unsqueeze(0),
-                hidden_actor[ag_i],
-                hidden_critic[ag_i]
-                ) for ag_i, net in enumerate(self.net)
-        ]
-        return self.merge_dicts(outputs)
-
-    def learn(self, tms: MARLActorCriticMemory, **kwargs):
-        for ag_i in range(self.n_agents):
-            tm, net = tms(ag_i), self.net[ag_i]
-            loss = self.actor_critic(tm, net, **self.cfg[nms.ALGORITHM], **kwargs)
-            self.optimizer[ag_i].zero_grad()
-            loss.backward()
-            torch.nn.utils.clip_grad_norm_(net.parameters(), 0.5)
-            self.optimizer[ag_i].step()
--- a/marl_factory_grid/algorithms/marl/mappo.py
+++ b/marl_factory_grid/algorithms/marl/mappo.py
@ -1,66 +0,0 @@
-from marl_factory_grid.algorithms.marl.base_ac import Names as nms
-from marl_factory_grid.algorithms.marl.snac import LoopSNAC
-from marl_factory_grid.algorithms.marl.memory import MARLActorCriticMemory
-import torch
-from torch.distributions import Categorical
-from marl_factory_grid.algorithms.utils import instantiate_class
-
-
-class LoopMAPPO(LoopSNAC):
-    def __init__(self, *args, **kwargs):
-        super(LoopMAPPO, self).__init__(*args, **kwargs)
-        self.reset_memory_after_epoch = False
-
-    def setup(self):
-        self.net = instantiate_class(self.cfg[nms.AGENT])
-        self.optimizer = torch.optim.Adam(self.net.parameters(), lr=3e-4, eps=1e-5)
-
-    def learn(self, tm: MARLActorCriticMemory, **kwargs):
-        if len(tm) >= self.cfg['algorithm']['buffer_size']:
-            # only learn when buffer is full
-            for batch_i in range(self.cfg['algorithm']['n_updates']):
-                batch = tm.chunk_dataloader(chunk_len=self.cfg['algorithm']['n_steps'],
-                                            k=self.cfg['algorithm']['batch_size'])
-                loss = self.mappo(batch, self.net, **self.cfg[nms.ALGORITHM], **kwargs)
-                self.optimizer.zero_grad()
-                loss.backward()
-                torch.nn.utils.clip_grad_norm_(self.net.parameters(), 0.5)
-                self.optimizer.step()
-
-    def monte_carlo_returns(self, rewards, done, gamma):
-        rewards_ = []
-        discounted_reward = torch.zeros_like(rewards[:, -1])
-        for t in range(rewards.shape[1]-1, -1, -1):
-            discounted_reward = rewards[:, t] + (gamma * (1.0 - done[:, t]) * discounted_reward)
-            rewards_.insert(0, discounted_reward)
-        rewards_ = torch.stack(rewards_, dim=1)
-        return rewards_
-
-    def mappo(self, batch, network, gamma, entropy_coef, vf_coef, clip_range, **__):
-        out = network(batch[nms.OBSERVATION], batch[nms.ACTION], batch[nms.HIDDEN_ACTOR], batch[nms.HIDDEN_CRITIC])
-        logits = out[nms.LOGITS][:, :-1]  # last one only needed for v_{t+1}
-
-        old_log_probs = torch.log_softmax(batch[nms.LOGITS], -1)
-        old_log_probs = torch.gather(old_log_probs, index=batch[nms.ACTION][:, 1:].unsqueeze(-1), dim=-1).squeeze()
-
-        # monte carlo returns
-        mc_returns = self.monte_carlo_returns(batch[nms.REWARD], batch[nms.DONE], gamma)
-        mc_returns = (mc_returns - mc_returns.mean()) / (mc_returns.std() + 1e-8)  # todo: norm across agent ok?
-        advantages = mc_returns - out[nms.CRITIC][:, :-1]
-
-        # policy loss
-        log_ap = torch.log_softmax(logits, -1)
-        log_ap = torch.gather(log_ap, dim=-1, index=batch[nms.ACTION][:, 1:].unsqueeze(-1)).squeeze()
-        ratio = (log_ap - old_log_probs).exp()
-        surr1 = ratio * advantages.detach()
-        surr2 = torch.clamp(ratio, 1 - clip_range, 1 + clip_range) * advantages.detach()
-        policy_loss = -torch.min(surr1, surr2).mean(-1)
-
-        # entropy & value loss
-        entropy_loss = Categorical(logits=logits).entropy().mean(-1)
-        value_loss = advantages.pow(2).mean(-1)  # n_agent
-
-        # weighted loss
-        loss = policy_loss + vf_coef*value_loss - entropy_coef * entropy_loss
-
-        return loss.mean()
--- a/marl_factory_grid/algorithms/marl/memory.py
+++ b/marl_factory_grid/algorithms/marl/memory.py
@ -1,221 +0,0 @@
-import numpy as np
-from collections import deque
-import torch
-from typing import Union
-from torch import Tensor
-from torch.utils.data import Dataset, ConcatDataset
-import random
-
-
-class ActorCriticMemory(object):
-    def __init__(self, capacity=10):
-        self.capacity = capacity
-        self.reset()
-
-    def reset(self):
-        self.__actions        = LazyTensorFiFoQueue(maxlen=self.capacity+1)
-        self.__hidden_actor   = LazyTensorFiFoQueue(maxlen=self.capacity+1)
-        self.__hidden_critic  = LazyTensorFiFoQueue(maxlen=self.capacity+1)
-        self.__states         = LazyTensorFiFoQueue(maxlen=self.capacity+1)
-        self.__rewards        = LazyTensorFiFoQueue(maxlen=self.capacity+1)
-        self.__dones          = LazyTensorFiFoQueue(maxlen=self.capacity+1)
-        self.__logits         = LazyTensorFiFoQueue(maxlen=self.capacity+1)
-        self.__values         = LazyTensorFiFoQueue(maxlen=self.capacity+1)
-
-    def __len__(self):
-        return len(self.__rewards) - 1
-
-    @property
-    def observation(self, sls=slice(0, None)):  # add time dimension through stacking
-        return self.__states[sls].unsqueeze(0)      # 1 x time x hidden dim
-
-    @property
-    def hidden_actor(self,  sls=slice(0, None)):  # 1 x n_layers x dim
-        return self.__hidden_actor[sls].unsqueeze(0)    # 1 x time x n_layers x dim
-
-    @property
-    def hidden_critic(self, sls=slice(0, None)):  # 1 x n_layers x dim
-        return self.__hidden_critic[sls].unsqueeze(0)    # 1 x time x n_layers x dim
-
-    @property
-    def reward(self, sls=slice(0, None)):
-        return self.__rewards[sls].squeeze().unsqueeze(0)  # 1 x time
-
-    @property
-    def action(self, sls=slice(0, None)):
-        return self.__actions[sls].long().squeeze().unsqueeze(0)  # 1 x time
-
-    @property
-    def done(self, sls=slice(0, None)):
-        return self.__dones[sls].float().squeeze().unsqueeze(0)  # 1 x time
-
-    @property
-    def logits(self, sls=slice(0, None)):  # assumes a trailing 1 for time dimension - common when using output from NN
-        return self.__logits[sls].squeeze().unsqueeze(0)  # 1 x time x actions
-
-    @property
-    def values(self, sls=slice(0, None)):
-        return self.__values[sls].squeeze().unsqueeze(0)  # 1 x time x actions
-
-    def add_observation(self, state:  Union[Tensor, np.ndarray]):
-        self.__states.append(state    if isinstance(state, Tensor) else torch.from_numpy(state))
-
-    def add_hidden_actor(self, hidden: Tensor):
-        # layers x hidden dim
-        self.__hidden_actor.append(hidden)
-
-    def add_hidden_critic(self, hidden: Tensor):
-        # layers x hidden dim
-        self.__hidden_critic.append(hidden)
-
-    def add_action(self, action: Union[int, Tensor]):
-        if not isinstance(action, Tensor):
-            action = torch.tensor(action)
-        self.__actions.append(action)
-
-    def add_reward(self, reward: Union[float, Tensor]):
-        if not isinstance(reward, Tensor):
-            reward = torch.tensor(reward)
-        self.__rewards.append(reward)
-
-    def add_done(self, done:   bool):
-        if not isinstance(done, Tensor):
-            done = torch.tensor(done)
-        self.__dones.append(done)
-
-    def add_logits(self, logits: Tensor):
-        self.__logits.append(logits)
-
-    def add_values(self, values: Tensor):
-        self.__values.append(values)
-
-    def add(self, **kwargs):
-        for k, v in kwargs.items():
-            func = getattr(ActorCriticMemory, f'add_{k}')
-            func(self, v)
-
-
-class MARLActorCriticMemory(object):
-    def __init__(self, n_agents, capacity):
-        self.n_agents = n_agents
-        self.memories = [
-            ActorCriticMemory(capacity) for _ in range(n_agents)
-        ]
-
-    def __call__(self, agent_i):
-        return self.memories[agent_i]
-
-    def __len__(self):
-        return len(self.memories[0])  # todo add assertion check!
-
-    def reset(self):
-        for mem in self.memories:
-            mem.reset()
-
-    def add(self, **kwargs):
-        for agent_i in range(self.n_agents):
-            for k, v in kwargs.items():
-                func = getattr(ActorCriticMemory, f'add_{k}')
-                func(self.memories[agent_i], v[agent_i])
-
-    def __getattr__(self, attr):
-        all_attrs = [getattr(mem, attr) for mem in self.memories]
-        return torch.cat(all_attrs, 0)  # agent x time ...
-
-    def chunk_dataloader(self, chunk_len, k):
-        datasets = [ExperienceChunks(mem, chunk_len, k) for mem in self.memories]
-        dataset = ConcatDataset(datasets)
-        data = [dataset[i] for i in range(len(dataset))]
-        data = custom_collate_fn(data)
-        return data
-
-
-def custom_collate_fn(batch):
-    elem = batch[0]
-    return {key: torch.cat([d[key] for d in batch], dim=0) for key in elem}
-
-
-class ExperienceChunks(Dataset):
-    def __init__(self, memory, chunk_len, k):
-        assert chunk_len <= len(memory), 'chunk_len cannot be longer than the size of the memory'
-        self.memory = memory
-        self.chunk_len = chunk_len
-        self.k = k
-
-    @property
-    def whitelist(self):
-        whitelist = torch.ones(len(self.memory) - self.chunk_len)
-        for d in self.memory.done.squeeze().nonzero().flatten():
-            whitelist[max((0, d-self.chunk_len-1)):d+2] = 0
-        whitelist[0] = 0
-        return whitelist.tolist()
-
-    def sample(self, start=1):
-        cl = self.chunk_len
-        sample = dict(observation=self.memory.observation[:, start:start+cl+1],
-                      action=self.memory.action[:, start-1:start+cl],
-                      hidden_actor=self.memory.hidden_actor[:, start-1],
-                      hidden_critic=self.memory.hidden_critic[:, start-1],
-                      reward=self.memory.reward[:, start:start + cl],
-                      done=self.memory.done[:, start:start + cl],
-                      logits=self.memory.logits[:, start:start + cl],
-                      values=self.memory.values[:, start:start + cl])
-        return sample
-
-    def __len__(self):
-        return self.k
-
-    def __getitem__(self, i):
-        idx = random.choices(range(0, len(self.memory) - self.chunk_len), weights=self.whitelist, k=1)
-        return self.sample(idx[0])
-
-
-class LazyTensorFiFoQueue:
-    def __init__(self, maxlen):
-        self.maxlen = maxlen
-        self.reset()
-
-    def reset(self):
-        self.__lazy_queue = deque(maxlen=self.maxlen)
-        self.shape = None
-        self.queue = None
-
-    def shape_init(self, tensor: Tensor):
-        self.shape = torch.Size([self.maxlen, *tensor.shape])
-
-    def build_tensor_queue(self):
-        if len(self.__lazy_queue) > 0:
-            block = torch.stack(list(self.__lazy_queue), dim=0)
-            l = block.shape[0]
-            if self.queue is None:
-                self.queue = block
-            elif self.true_len() <= self.maxlen:
-                self.queue = torch.cat((self.queue, block),  dim=0)
-            else:
-                self.queue = torch.cat((self.queue[l:], block),  dim=0)
-            self.__lazy_queue.clear()
-
-    def append(self, data):
-        if self.shape is None:
-            self.shape_init(data)
-        self.__lazy_queue.append(data)
-        if len(self.__lazy_queue) >= self.maxlen:
-            self.build_tensor_queue()
-
-    def true_len(self):
-        return len(self.__lazy_queue) + (0 if self.queue is None else self.queue.shape[0])
-
-    def __len__(self):
-        return min((self.true_len(), self.maxlen))
-
-    def __str__(self):
-        return f'LazyTensorFiFoQueue\tmaxlen: {self.maxlen}, shape: {self.shape}, ' \
-               f'len: {len(self)}, true_len: {self.true_len()}, elements in lazy queue: {len(self.__lazy_queue)}'
-
-    def __getitem__(self, item_or_slice):
-        self.build_tensor_queue()
-        return self.queue[item_or_slice]
-
-
-
-
--- a/marl_factory_grid/algorithms/marl/configs/MultiAgentConfigs/dirt_quadrant_config.yaml
+++ b/marl_factory_grid/algorithms/marl/configs/MultiAgentConfigs/dirt_quadrant_config.yaml
@ -7,8 +7,8 @@ agent:
  hidden_size_critic:  64
  use_agent_embedding: False
 env:
-  classname:          marl_factory_grid.configs.custom
-  env_name:           "custom/MultiAgentConfigs/dirt_quadrant_train_config"
+  classname:          marl_factory_grid.environment.configs.marl_eval
+  env_name:           "marl_eval/dirt_quadrant_eval_config"
  n_agents:           2
  max_steps:          250
  pomdp_r:            2
--- a/marl_factory_grid/algorithms/marl/configs/MultiAgentConfigs/two_rooms_one_door_modified_config.yaml
+++ b/marl_factory_grid/algorithms/marl/configs/MultiAgentConfigs/two_rooms_one_door_modified_config.yaml
@ -7,8 +7,8 @@ agent:
  hidden_size_critic:  64
  use_agent_embedding: False
 env:
-  classname:          marl_factory_grid.configs.custom
-  env_name:           "custom/two_rooms_one_door_modified_train_config"
+  classname:          marl_factory_grid.environment.configs.marl_eval
+  env_name:           "marl_eval/two_rooms_eval_config"
  n_agents:           2
  max_steps:          250
  pomdp_r:            2
--- a/marl_factory_grid/algorithms/marl/networks.py
+++ b/marl_factory_grid/algorithms/marl/networks.py
@ -1,103 +0,0 @@
-import numpy as np
-import torch
-import torch.nn as nn
-import torch.nn.functional as F
-
-
-class RecurrentAC(nn.Module):
-    def __init__(self, observation_size, n_actions, obs_emb_size,
-                 action_emb_size, hidden_size_actor, hidden_size_critic,
-                 n_agents, use_agent_embedding=True):
-        super(RecurrentAC, self).__init__()
-        observation_size = np.prod(observation_size)
-        self.n_layers = 1
-        self.n_actions = n_actions
-        self.use_agent_embedding = use_agent_embedding
-        self.hidden_size_actor = hidden_size_actor
-        self.hidden_size_critic = hidden_size_critic
-        self.action_emb_size    = action_emb_size
-        self.obs_proj   = nn.Linear(observation_size, obs_emb_size)
-        self.action_emb =  nn.Embedding(n_actions+1, action_emb_size, padding_idx=0)
-        self.agent_emb  =  nn.Embedding(n_agents, action_emb_size)
-        mix_in_size = obs_emb_size+action_emb_size if not use_agent_embedding else obs_emb_size+n_agents*action_emb_size
-        self.mix = nn.Sequential(nn.Tanh(),
-                                 nn.Linear(mix_in_size, obs_emb_size),
-                                 nn.Tanh(),
-                                 nn.Linear(obs_emb_size, obs_emb_size)
-                                 )
-        self.gru_actor   = nn.GRU(obs_emb_size, hidden_size_actor,  batch_first=True, num_layers=self.n_layers)
-        self.gru_critic  = nn.GRU(obs_emb_size, hidden_size_critic, batch_first=True, num_layers=self.n_layers)
-        self.action_head = nn.Sequential(
-            nn.Linear(hidden_size_actor, hidden_size_actor),
-            nn.Tanh(),
-            nn.Linear(hidden_size_actor, n_actions)
-        )
-        #            spectral_norm(nn.Linear(hidden_size_actor, hidden_size_actor)),
-        self.critic_head = nn.Sequential(
-            nn.Linear(hidden_size_critic, hidden_size_critic),
-            nn.Tanh(),
-            nn.Linear(hidden_size_critic, 1)
-        )
-        #self.action_head[-1].weight.data.uniform_(-3e-3, 3e-3)
-        #self.action_head[-1].bias.data.uniform_(-3e-3, 3e-3)
-
-    def init_hidden_actor(self):
-        return torch.zeros(1, self.n_layers, self.hidden_size_actor)
-
-    def init_hidden_critic(self):
-        return torch.zeros(1, self.n_layers, self.hidden_size_critic)
-
-    def forward(self, observations, actions, hidden_actor=None, hidden_critic=None):
-        n_agents, t, *_ = observations.shape
-        obs_emb    = self.obs_proj(observations.view(n_agents, t, -1).float())
-        action_emb = self.action_emb(actions+1)  # shift by one due to padding idx
-
-        if not self.use_agent_embedding:
-            x_t = torch.cat((obs_emb, action_emb), -1)
-        else:
-            agent_emb = self.agent_emb(
-                torch.cat([torch.arange(0, n_agents, 1).view(-1, 1)] * t, 1)
-            )
-            x_t = torch.cat((obs_emb, agent_emb, action_emb), -1)
-
-        mixed_x_t   = self.mix(x_t)
-        output_p, _ = self.gru_actor(input=mixed_x_t,  hx=hidden_actor.swapaxes(1, 0))
-        output_c, _ = self.gru_critic(input=mixed_x_t, hx=hidden_critic.swapaxes(1, 0))
-
-        logits = self.action_head(output_p)
-        critic = self.critic_head(output_c).squeeze(-1)
-        return dict(logits=logits, critic=critic, hidden_actor=output_p, hidden_critic=output_c)
-
-
-class RecurrentACL2(RecurrentAC):
-    def __init__(self, *args, **kwargs):
-        super().__init__(*args, **kwargs)
-        self.action_head = nn.Sequential(
-            nn.Linear(self.hidden_size_actor, self.hidden_size_actor),
-            nn.Tanh(),
-            NormalizedLinear(self.hidden_size_actor, self.n_actions, trainable_magnitude=True)
-        )
-
-
-class NormalizedLinear(nn.Linear):
-    def __init__(self, in_features: int, out_features: int,
-                 device=None, dtype=None, trainable_magnitude=False):
-        super(NormalizedLinear, self).__init__(in_features, out_features, False, device, dtype)
-        self.d_sqrt = in_features**0.5
-        self.trainable_magnitude = trainable_magnitude
-        self.scale = nn.Parameter(torch.tensor([1.]), requires_grad=trainable_magnitude)
-
-    def forward(self, in_array):
-        normalized_input = F.normalize(in_array, dim=-1, p=2, eps=1e-5)
-        normalized_weight = F.normalize(self.weight, dim=-1, p=2, eps=1e-5)
-        return F.linear(normalized_input, normalized_weight) * self.d_sqrt * self.scale
-
-
-class L2Norm(nn.Module):
-    def __init__(self, in_features, trainable_magnitude=False):
-        super(L2Norm, self).__init__()
-        self.d_sqrt = in_features**0.5
-        self.scale = nn.Parameter(torch.tensor([1.]), requires_grad=trainable_magnitude)
-
-    def forward(self, x):
-        return F.normalize(x, dim=-1, p=2, eps=1e-5) * self.d_sqrt * self.scale
--- a/marl_factory_grid/algorithms/marl/seac.py
+++ b/marl_factory_grid/algorithms/marl/seac.py
@ -1,55 +0,0 @@
-import torch
-from torch.distributions import Categorical
-from marl_factory_grid.algorithms.marl.iac import LoopIAC
-from marl_factory_grid.algorithms.marl.base_ac import nms
-from marl_factory_grid.algorithms.marl.memory import MARLActorCriticMemory
-
-
-class LoopSEAC(LoopIAC):
-    def __init__(self, cfg):
-        super(LoopSEAC, self).__init__(cfg)
-
-    def actor_critic(self, tm, networks, gamma, entropy_coef, vf_coef, gae_coef=0.0, **kwargs):
-        obs, actions, done, reward = tm.observation, tm.action, tm.done[:, 1:], tm.reward[:, 1:]
-        outputs = [net(obs, actions, tm.hidden_actor[:, 0], tm.hidden_critic[:, 0]) for net in networks]
-
-        with torch.inference_mode(True):
-            true_action_logp = torch.stack([
-                torch.log_softmax(out[nms.LOGITS][ag_i, :-1], -1)
-                .gather(index=actions[ag_i, 1:, None], dim=-1)
-                for ag_i, out in enumerate(outputs)
-            ], 0).squeeze()
-
-        losses = []
-
-        for ag_i, out in enumerate(outputs):
-            logits = out[nms.LOGITS][:, :-1]  # last one only needed for v_{t+1}
-            critic = out[nms.CRITIC]
-
-            entropy_loss = Categorical(logits=logits[ag_i]).entropy().mean()
-            advantages = self.compute_advantages(critic, reward, done, gamma, gae_coef)
-
-            # policy loss
-            log_ap = torch.log_softmax(logits, -1)
-            log_ap = torch.gather(log_ap, dim=-1, index=actions[:, 1:].unsqueeze(-1)).squeeze()
-
-            # importance weights
-            iw = (log_ap - true_action_logp).exp().detach()  # importance_weights
-
-            a2c_loss = (-iw*log_ap * advantages.detach()).mean(-1)
-
-            value_loss = (iw*advantages.pow(2)).mean(-1)  # n_agent
-
-            # weighted loss
-            loss = (a2c_loss + vf_coef*value_loss - entropy_coef * entropy_loss).mean()
-            losses.append(loss)
-
-        return losses
-
-    def learn(self, tms: MARLActorCriticMemory, **kwargs):
-        losses = self.actor_critic(tms, self.net, **self.cfg[nms.ALGORITHM], **kwargs)
-        for ag_i, loss in enumerate(losses):
-            self.optimizer[ag_i].zero_grad()
-            loss.backward()
-            torch.nn.utils.clip_grad_norm_(self.net[ag_i].parameters(), 0.5)
-            self.optimizer[ag_i].step()
--- a/marl_factory_grid/algorithms/marl/single_agent_configs/dirt_quadrant_config.yaml
+++ b/marl_factory_grid/algorithms/marl/single_agent_configs/dirt_quadrant_config.yaml
@ -7,8 +7,8 @@ agent:
  hidden_size_critic:  64
  use_agent_embedding: False
 env:
-  classname:          marl_factory_grid.configs.custom
-  env_name:           "custom/dirt_quadrant_train_config"
+  classname:          marl_factory_grid.environment.configs.rl
+  env_name:           "rl/dirt_quadrant_train_config"
  n_agents:           1
  max_steps:          250
  pomdp_r:            2
--- a/marl_factory_grid/algorithms/marl/configs/two_rooms_one_door_modified_config.yaml
+++ b/marl_factory_grid/algorithms/marl/configs/two_rooms_one_door_modified_config.yaml
@ -7,8 +7,8 @@ agent:
  hidden_size_critic:  64
  use_agent_embedding: False
 env:
-  classname:          marl_factory_grid.configs.custom
-  env_name:           "custom/two_rooms_one_door_modified_train_config"
+  classname:          marl_factory_grid.environment.configs.rl
+  env_name:           "rl/two_rooms_train_config"
  n_agents:           1
  max_steps:          250
  pomdp_r:            2
--- a/marl_factory_grid/algorithms/marl/snac.py
+++ b/marl_factory_grid/algorithms/marl/snac.py
@ -1,33 +0,0 @@
-from marl_factory_grid.algorithms.marl.base_ac import BaseActorCritic
-from marl_factory_grid.algorithms.marl.base_ac import nms
-import torch
-from torch.distributions import Categorical
-from pathlib import Path
-
-
-class LoopSNAC(BaseActorCritic):
-    def __init__(self, cfg):
-        super().__init__(cfg)
-
-    def load_state_dict(self, path: Path):
-        path2weights = list(path.glob('*.pt'))
-        assert len(path2weights) == 1, f'Expected a single set of weights but got {len(path2weights)}'
-        self.net.load_state_dict(torch.load(path2weights[0]))
-
-    def init_hidden(self):
-        hidden_actor = self.net.init_hidden_actor()
-        hidden_critic = self.net.init_hidden_critic()
-        return dict(hidden_actor=torch.cat([hidden_actor]   * self.n_agents,  0),
-                    hidden_critic=torch.cat([hidden_critic] * self.n_agents,  0)
-                    )
-
-    def get_actions(self, out):
-        actions = Categorical(logits=out[nms.LOGITS]).sample().squeeze()
-        return actions
-
-    def forward(self, observations, actions, hidden_actor, hidden_critic):
-        out = self.net(self._as_torch(observations).unsqueeze(1),
-                       self._as_torch(actions).unsqueeze(1),
-                       hidden_actor, hidden_critic
-                       )
-        return out
--- a/marl_factory_grid/algorithms/static/TSP_base_agent.py
+++ b/marl_factory_grid/algorithms/static/TSP_base_agent.py
@ -37,7 +37,6 @@ class TSPBaseAgent(ABC):
        self._position_graph = self.generate_pos_graph()
        self._static_route = None
        self.cached_route = None
-        self.fallback_action = None
        self.action_list = []

    @abstractmethod
@ -50,46 +49,6 @@ class TSPBaseAgent(ABC):
        """
        return 0

-    def calculate_tsp_route(self, target_identifier):
-        """
-        Calculate the TSP route to reach a target.
-
-        :param target_identifier: Identifier of the target entity
-        :type target_identifier: str
-
-        :return: TSP route
-        :rtype: List[int]
-        """
-        target_positions = [x for x in self._env.state[target_identifier].positions if x != c.VALUE_NO_POS]
-
-        # if there are cached routes, search for one matching the current and target position
-        if self._env.state.route_cache and (
-                route := self._env.state.get_cached_route(self.state.pos, target_positions)) is not None:
-            # print(f"Retrieved cached route: {route}")
-            return route
-        # if none are found, calculate tsp route and cache it
-        else:
-            start_time = time.time()
-            if self.local_optimization:
-                nodes = \
-                    [self.state.pos] + \
-                    [x for x in target_positions if max(abs(np.subtract(x, self.state.pos))) < 3]
-                try:
-                    while len(nodes) < 7:
-                        nodes += [next(x for x in target_positions if x not in nodes)]
-                except StopIteration:
-                    nodes = [self.state.pos] + target_positions
-
-            else:
-                nodes = [self.state.pos] + target_positions
-
-            route = tsp.traveling_salesman_problem(self._position_graph,
-                                                   nodes=nodes, cycle=True, method=tsp.greedy_tsp)
-            duration = time.time() - start_time
-            print("TSP calculation took {:.2f} seconds to execute".format(duration))
-            self._env.state.cache_route(route)
-            return route
-
    def _use_door_or_move(self, door, target):
        """
        Helper method to decide whether to use a door or move towards a target.
@ -108,6 +67,47 @@ class TSPBaseAgent(ABC):
            action = self._predict_move(target)
        return action

+    def calculate_tsp_route(self, target_identifier):
+        """
+        Calculate the TSP route to reach a target.
+
+        :param target_identifier: Identifier of the target entity
+        :type target_identifier: str
+
+        :return: TSP route
+        :rtype: List[int]
+        """
+        start_time = time.time()
+
+        if self.cached_route is not None:
+            #print(f" Used cached route: {self.cached_route}")
+            return copy.deepcopy(self.cached_route)
+
+        else:
+            positions = [x for x in self._env.state[target_identifier].positions if x != c.VALUE_NO_POS]
+            if self.local_optimization:
+                nodes = \
+                    [self.state.pos] + \
+                    [x for x in positions if max(abs(np.subtract(x, self.state.pos))) < 3]
+                try:
+                    while len(nodes) < 7:
+                        nodes += [next(x for x in positions if x not in nodes)]
+                except StopIteration:
+                    nodes = [self.state.pos] + positions
+
+            else:
+                nodes = [self.state.pos] + positions
+
+            route = tsp.traveling_salesman_problem(self._position_graph,
+                                                   nodes=nodes, cycle=True, method=tsp.greedy_tsp)
+            self.cached_route = copy.deepcopy(route)
+            #print(f"Cached route: {self.cached_route}")
+
+        end_time = time.time()
+        duration = end_time - start_time
+        #print("TSP calculation took {:.2f} seconds to execute".format(duration))
+        return route
+
    def _door_is_close(self, state):
        """
        Check if a door is close to the agent's position.
@ -173,11 +173,8 @@ class TSPBaseAgent(ABC):
                action = next(action for action, pos_diff in MOVEMAP.items() if
                              np.all(diff == pos_diff) and action in allowed_directions)
            except StopIteration:
-                print(f"No valid action found for pos diff: {diff}. Using fallback action: {self.fallback_action}.")
-                if self.fallback_action and any(self.fallback_action == action.name for action in self.state.actions):
-                    action = self.fallback_action
-                else:
-                    action = choice(self.state.actions).name
+                print(f"No valid action found for pos diff: {diff}. Using fallback action.")
+                action = choice(self.state.actions).name
        else:
            action = choice(self.state.actions).name
        # noinspection PyUnboundLocalVariable
--- a/marl_factory_grid/algorithms/static/TSP_item_agent.py
+++ b/marl_factory_grid/algorithms/static/TSP_item_agent.py
@ -1,76 +0,0 @@
-import numpy as np
-
-from marl_factory_grid.algorithms.static.TSP_base_agent import TSPBaseAgent
-
-from marl_factory_grid.modules.items import constants as i
-from marl_factory_grid.environment import constants as c
-
-future_planning = 7
-inventory_size  = 3
-
-MODE_GET        = 'Mode_Get'
-MODE_BRING      = 'Mode_Bring'
-
-
-class TSPItemAgent(TSPBaseAgent):
-
-    def __init__(self, *args, mode=MODE_GET, **kwargs):
-        """
-        Initializes a TSPItemAgent that colects items in the environment, stores them in his inventory and drops them off
-        at a drop-off location.
-
-        :param mode: Mode of the agent, either MODE_GET or MODE_BRING.
-        """
-        super(TSPItemAgent, self).__init__(*args, **kwargs)
-        self.mode = mode
-        self.fallback_action = c.NOOP
-
-    def predict(self, *_, **__):
-        item_at_position = self._env.state[i.ITEM].by_pos(self.state.pos)
-        dropoff_at_position = self._env.state[i.DROP_OFF].by_pos(self.state.pos)
-        if item_at_position:
-            # Translate the action_object to an integer to have the same output as any other model
-            action = i.ITEM_ACTION
-        elif dropoff_at_position:
-            # Translate the action_object to an integer to have the same output as any other model
-            action = i.ITEM_ACTION
-        elif door := self._door_is_close(self._env.state):
-            action = self._use_door_or_move(door, i.DROP_OFF if self.mode == MODE_BRING else i.ITEM)
-        else:
-            action = self._choose()
-        self.action_list.append(action)
-        # Translate the action_object to an integer to have the same output as any other model
-        try:
-            action_obj = next(action_i for action_i, a in enumerate(self.state.actions) if a.name == action)
-        except (StopIteration, UnboundLocalError):
-            print('Will not happen')
-            raise EnvironmentError
-        # noinspection PyUnboundLocalVariable
-        if self.mode == MODE_BRING and len(self._env[i.INVENTORY].by_entity(self.state)):
-            pass
-        elif self.mode == MODE_BRING and not len(self._env[i.INVENTORY].by_entity(self.state)):
-            self.mode = MODE_GET
-        elif self.mode == MODE_GET and len(self._env[i.INVENTORY].by_entity(self.state)) > inventory_size:
-            self.mode = MODE_BRING
-        else:
-            pass
-        return action_obj
-
-    def _choose(self):
-        """
-        Internal Usage. Chooses the action based on the agent's mode and the environment state.
-
-        :return: Chosen action.
-        :rtype: int
-        """
-        target = i.DROP_OFF if self.mode == MODE_BRING else i.ITEM
-        if len(self._env.state[i.ITEM]) >= 1:
-            action = self._predict_move(target)
-
-        elif len(self._env[i.INVENTORY].by_entity(self.state)):
-            self.mode = MODE_BRING
-            action = self._predict_move(target)
-        else:
-            action = int(np.random.randint(self._env.action_space.n))
-        # noinspection PyUnboundLocalVariable
-        return action
--- a/marl_factory_grid/algorithms/static/random_agent.py
+++ b/marl_factory_grid/algorithms/static/random_agent.py
@ -1,27 +0,0 @@
-from random import randint
-
-from marl_factory_grid.algorithms.static.TSP_base_agent import TSPBaseAgent
-
-future_planning = 7
-
-
-class TSPRandomAgent(TSPBaseAgent):
-
-    def __init__(self, n_actions, *args, **kwargs):
-        """
-        Initializes a TSPRandomAgent that performs random actions from within his action space.
-
-        :param n_actions: Number of possible actions.
-        :type n_actions: int
-        """
-        super(TSPRandomAgent, self).__init__(*args, **kwargs)
-        self.n_action = n_actions
-
-    def predict(self, *_, **__):
-        """
-        Predicts the next action randomly.
-
-        :return: Predicted action.
-        :rtype: int
-        """
-        return randint(0, self.n_action - 1)
--- a/marl_factory_grid/algorithms/utils.py
+++ b/marl_factory_grid/algorithms/utils.py
@ -58,7 +58,7 @@ def load_yaml_file(path: Path):

 def add_env_props(cfg):
    # Path to config File
-    env_path = Path(f'../marl_factory_grid/configs/{cfg["env"]["env_name"]}.yaml')
+    env_path = Path(f'../marl_factory_grid/environment/configs/{cfg["env"]["env_name"]}.yaml')

    # Env Init
    factory = Factory(env_path)
				`@ -1 +1 @@`
				`from marl_factory_grid.algorithms.marl.memory import MARLActorCriticMemory`