State of repo for ISOLA paper

2025-12-14 03:00:37 +01:00 · 2024-10-25 17:24:11 +02:00
parent 95749d8238
commit e37b23c20c
120 changed files with 1487 additions and 6439 deletions
--- a/marl_factory_grid/__init__.py
+++ b/marl_factory_grid/__init__.py
@@ -1,4 +1,3 @@
-from .quickstart import init
 from marl_factory_grid.environment.factory import Factory
 """
 Main module of the 'rl-factory-grid'-environment.
--- a/marl_factory_grid/algorithms/agent_models/PolicyNet_model_parameters_coin_quadrant.pth
+++ b/marl_factory_grid/algorithms/agent_models/PolicyNet_model_parameters_coin_quadrant.pth
--- a/marl_factory_grid/algorithms/agent_models/PolicyNet_model_parameters_two_rooms_agent1.pth
+++ b/marl_factory_grid/algorithms/agent_models/PolicyNet_model_parameters_two_rooms_agent1.pth
--- a/marl_factory_grid/algorithms/agent_models/PolicyNet_model_parameters_two_rooms_agent2.pth
+++ b/marl_factory_grid/algorithms/agent_models/PolicyNet_model_parameters_two_rooms_agent2.pth
--- a/marl_factory_grid/algorithms/agent_models/ValueNet_model_parameters_coin_quadrant.pth
+++ b/marl_factory_grid/algorithms/agent_models/ValueNet_model_parameters_coin_quadrant.pth
--- a/marl_factory_grid/algorithms/agent_models/ValueNet_model_parameters_two_rooms_agent1.pth
+++ b/marl_factory_grid/algorithms/agent_models/ValueNet_model_parameters_two_rooms_agent1.pth
--- a/marl_factory_grid/algorithms/agent_models/ValueNet_model_parameters_two_rooms_agent2.pth
+++ b/marl_factory_grid/algorithms/agent_models/ValueNet_model_parameters_two_rooms_agent2.pth
--- a/marl_factory_grid/algorithms/marl/RL_runner.py
+++ b/marl_factory_grid/algorithms/marl/RL_runner.py
@@ -0,0 +1,108 @@
+"""Entry points for re-running the single-agent trainings and for evaluating trained RL agents."""
+from pathlib import Path
+from marl_factory_grid.algorithms.marl.a2c_coin import A2C
+from marl_factory_grid.algorithms.marl.utils import get_algorithms_marl_path
+from marl_factory_grid.algorithms.utils import load_yaml_file
+
+# NOTE(review): this directory is relative to the current working directory, whereas multi_agent_eval()
+# resolves its config via get_algorithms_marl_path(). Consider unifying both.
+_SINGLE_AGENT_CONFIG_DIR = './marl_factory_grid/algorithms/marl/single_agent_configs'
+
+
+def _load_single_agent_config(config_name, phase):
+    """ Load '<config_name>_<phase>_config.yaml' (phase is "train" or "eval") from the single-agent configs """
+    return load_yaml_file(Path(f'{_SINGLE_AGENT_CONFIG_DIR}/{config_name}_{phase}_config.yaml'))
+
+
+def _train_then_eval(train_cfg, eval_cfg, config_name):
+    """ Train a new A2C instance with train_cfg and afterwards evaluate it for one episode """
+    print("Training phase")
+    agent = A2C(train_cfg=train_cfg, eval_cfg=eval_cfg, mode="train")
+    agent.train_loop()
+    print("Evaluation phase")
+    agent.eval_loop(config_name, n_episodes=1)
+
+
+def _load_agents_and_eval(eval_cfg, config_name, runs):
+    """ Create an A2C instance in eval mode, load trained agents and evaluate them for one episode """
+    # No train_cfg is passed here: A2C.__init__ only reads it in train mode
+    agent = A2C(eval_cfg=eval_cfg, mode="eval")
+    print("Evaluation phase")
+    agent.load_agents(config_name, runs)
+    agent.eval_loop(config_name, 1)
+
+
+####### Training routines ######
+def rerun_coin_quadrant_agent1_training():
+    """ Retrain the single agent of the coin_quadrant scenario and evaluate it afterwards """
+    train_cfg = _load_single_agent_config("coin_quadrant", "train")
+    eval_cfg = _load_single_agent_config("coin_quadrant", "eval")
+    _train_then_eval(train_cfg, eval_cfg, "coin_quadrant")
+
+
+def two_rooms_training(max_steps, agent_name):
+    """
+    Train one agent of the two_rooms scenario and evaluate it afterwards.
+
+    :param max_steps: Intended number of training steps. NOTE(review): currently unused, because the override
+                      below is commented out and the value from the train config applies. Confirm the intent.
+    :param agent_name: Name of the agent, e.g. "agent1". Selects the agent-specific environment configs.
+    """
+    train_cfg = _load_single_agent_config("two_rooms", "train")
+    eval_cfg = _load_single_agent_config("two_rooms", "eval")
+
+    # train_cfg["algorithm"]["max_steps"] = max_steps
+    train_cfg["env"]["env_name"] = f"marl/single_agent_configs/two_rooms_{agent_name}_train_config"
+    eval_cfg["env"]["env_name"] = f"marl/single_agent_configs/two_rooms_{agent_name}_eval_config"
+    _train_then_eval(train_cfg, eval_cfg, "two_rooms")
+
+
+def rerun_two_rooms_agent1_training():
+    """ Retrain agent1 of the two_rooms scenario """
+    two_rooms_training(max_steps=190000, agent_name="agent1")
+
+
+def rerun_two_rooms_agent2_training():
+    """ Retrain agent2 of the two_rooms scenario """
+    two_rooms_training(max_steps=260000, agent_name="agent2")
+
+
+####### Eval routines ########
+def single_agent_eval(config_name, run_folder_name):
+    """
+    Evaluate a single trained agent for one episode.
+
+    :param config_name: Name of the scenario, e.g. "coin_quadrant" or "two_rooms"
+    :param run_folder_name: Name of the run folder in study_out/ from which the agent's networks are loaded
+    """
+    eval_cfg = _load_single_agent_config(config_name, "eval")
+    _load_agents_and_eval(eval_cfg, config_name, [run_folder_name])
+
+
+def multi_agent_eval(config_name, runs, emergent_phenomenon=False):
+    """
+    Evaluate trained agents for one episode in the multi-agent environment.
+
+    :param config_name: Name of the scenario, e.g. "coin_quadrant" or "two_rooms"
+    :param runs: Names of the runs in study_out/ to load, one per agent. An empty list selects the default agents
+    :param emergent_phenomenon: If True, the eval config with the "_emergent" suffix is loaded
+    """
+    suffix = "_emergent" if emergent_phenomenon else ""
+    eval_cfg_path = Path(f'{get_algorithms_marl_path()}/multi_agent_configs/{config_name}_eval_config{suffix}.yaml')
+    _load_agents_and_eval(load_yaml_file(eval_cfg_path), config_name, runs)
+
+
+def coin_quadrant_multi_agent_rl_eval(emergent_phenomenon):
+    """ Evaluate the coin_quadrant multi-agent scenario with the default agents """
+    # An empty list for runs indicates that the default agents in algorithms/agent_models should be used.
+    # If you want to use different agents that were obtained by running the training with a different seed, you
+    # can load them by inserting the names of the runs in study_out/ into the runs list, e.g. ["run1", "run2"].
+    multi_agent_eval("coin_quadrant", [], emergent_phenomenon)
+
+
+def two_rooms_multi_agent_rl_eval(emergent_phenomenon):
+    """ Evaluate the two_rooms multi-agent scenario with the default agents """
+    # An empty list for runs indicates that the default agents in algorithms/agent_models should be used.
+    # If you want to use different agents that were obtained by running the training with a different seed, you
+    # can load them by inserting the names of the runs in study_out/ into the runs list, e.g. ["run1", "run2"].
+    multi_agent_eval("two_rooms", [], emergent_phenomenon)
--- a/marl_factory_grid/algorithms/marl/__init__.py
+++ b/marl_factory_grid/algorithms/marl/__init__.py
@@ -0,0 +1 @@
+
--- a/marl_factory_grid/algorithms/marl/a2c_coin.py
+++ b/marl_factory_grid/algorithms/marl/a2c_coin.py
@@ -1,53 +1,66 @@
 import os
+import pickle
 import torch
 from typing import Union, List
 import numpy as np
 from tqdm import tqdm

-from marl_factory_grid.algorithms.rl.base_a2c import PolicyGradient
-from marl_factory_grid.algorithms.rl.constants import Names
-from marl_factory_grid.algorithms.rl.utils import transform_observations, _as_torch, is_door_close, \
+from marl_factory_grid.algorithms.marl.base_a2c import PolicyGradient, cumulate_discount
+from marl_factory_grid.algorithms.marl.constants import Names
+from marl_factory_grid.algorithms.marl.utils import transform_observations, _as_torch, is_door_close, \
    get_coin_piles_positions, update_target_pile, update_ordered_coin_piles, get_all_collected_coin_piles, \
    distribute_indices, set_agents_spawnpoints, get_ordered_coin_piles, handle_finished_episode, save_configs, \
-    save_agent_models, get_all_observations, get_agents_positions
-from marl_factory_grid.algorithms.utils import add_env_props
-from marl_factory_grid.utils.plotting.plot_single_runs import plot_action_maps, plot_reward_development, \
-    create_info_maps
+    save_agent_models, get_all_observations, get_agents_positions, has_low_change_phase_started, significant_deviation, \
+    get_agent_models_path
+
+from marl_factory_grid.algorithms.utils import add_env_props, get_study_out_path
+from marl_factory_grid.utils.plotting.plot_single_runs import plot_action_maps, plot_return_development, \
+    create_info_maps, plot_return_development_change

 nms = Names
 ListOrTensor = Union[List, torch.Tensor]


 class A2C:
-    def __init__(self, train_cfg, eval_cfg):
-        self.results_path = None
-        self.agents = None
-        self.act_dim = None
-        self.obs_dim = None
-        self.factory = add_env_props(train_cfg)
+    def __init__(self, train_cfg=None, eval_cfg=None, mode="train"):  # eval_cfg is required in every mode
+        self.mode = mode
+        if mode == nms.TRAIN:  # NOTE(review): setup() compares self.mode with the literal "train" -- keep in sync
+            self.train_factory = add_env_props(train_cfg)  # train environment is only created in train mode
+            self.train_cfg = train_cfg
+            self.n_agents = train_cfg[nms.ENV][nms.N_AGENTS]
+        else:  # any other mode: train_cfg is not read and the train_cfg/train_factory attributes stay unset
+            self.n_agents = eval_cfg[nms.ENV][nms.N_AGENTS]
        self.eval_factory = add_env_props(eval_cfg)
-        self.__training = True
-        self.train_cfg = train_cfg
        self.eval_cfg = eval_cfg
-        self.cfg = train_cfg
-        self.n_agents = train_cfg[nms.ENV][nms.N_AGENTS]
        self.setup()
-        self.reward_development = []
        self.action_probabilities = {agent_idx: [] for agent_idx in range(self.n_agents)}

    def setup(self):
        """ Initialize agents and create entry for run results according to configuration """
+        if self.mode == "train":
+            self.cfg = self.train_cfg
+            self.factory = self.train_factory
+            self.gamma = self.cfg[nms.ALGORITHM][nms.GAMMA]
+        else:
+            self.cfg = self.eval_cfg
+            self.factory = self.eval_factory
+            self.gamma = 0.99
+
+        seed = self.cfg[nms.ALGORITHM][nms.SEED]
+        print("Algorithm Seed: ", seed)
+        if seed == -1:
+            seed = np.random.choice(range(1000))
+            print("Algorithm seed is -1. Pick random seed: ", seed)
+
        self.obs_dim = 2 + 2 * len(get_coin_piles_positions(self.factory)) if self.cfg[nms.ALGORITHM][
                                                                                  nms.PILE_OBSERVABILITY] == nms.ALL else 4
        self.act_dim = 4  # The 4 movement directions
-        self.agents = [PolicyGradient(self.factory, agent_id=i, obs_dim=self.obs_dim, act_dim=self.act_dim) for i in
+        self.agents = [PolicyGradient(self.factory, seed=seed, gamma=self.gamma, agent_id=i, obs_dim=self.obs_dim, act_dim=self.act_dim) for i in
                       range(self.n_agents)]

        if self.cfg[nms.ENV][nms.SAVE_AND_LOG]:
            # Define study_out_path and check if it exists
-            base_dir = os.path.dirname(os.path.abspath(__file__))  # Directory of the script
-            study_out_path = os.path.join(base_dir, '../../../study_out')
-            study_out_path = os.path.abspath(study_out_path)
+            study_out_path = get_study_out_path()

            if not os.path.exists(study_out_path):
                raise FileNotFoundError(f"The directory {study_out_path} does not exist.")
@@ -62,56 +75,86 @@ class A2C:
            # Save settings in results folder
            save_configs(self.results_path, self.cfg, self.factory.conf, self.eval_factory.conf)

-    def set_cfg(self, eval=False):
-        if eval:
-            self.cfg = self.eval_cfg
-        else:
-            self.cfg = self.train_cfg
-
-    def load_agents(self, runs_list):
+    def load_agents(self, config_name, runs_list):
        """ Initialize networks with parameters of already trained agents """
-        for idx, run in enumerate(runs_list):
-            run_path = f"./study_out/{run}"
-            self.agents[idx].pi.load_model_parameters(f"{run_path}/PolicyNet_model_parameters.pth")
-            self.agents[idx].vf.load_model_parameters(f"{run_path}/ValueNet_model_parameters.pth")
+        if not runs_list:  # None or empty list (checked without calling len(None)): use the default agent models
+            if config_name == "coin_quadrant":
+                for idx in range(self.n_agents):
+                    self.agents[idx].pi.load_model_parameters(f"{get_agent_models_path()}/PolicyNet_model_parameters_coin_quadrant.pth")
+                    self.agents[idx].vf.load_model_parameters(f"{get_agent_models_path()}/ValueNet_model_parameters_coin_quadrant.pth")
+            elif config_name == "two_rooms":
+                for idx in range(self.n_agents):
+                    self.agents[idx].pi.load_model_parameters(f"{get_agent_models_path()}/PolicyNet_model_parameters_two_rooms_agent{idx+1}.pth")
+                    self.agents[idx].vf.load_model_parameters(f"{get_agent_models_path()}/ValueNet_model_parameters_two_rooms_agent{idx+1}.pth")
+            else:
+                raise ValueError(f"No default agent models exist for config '{config_name}'")  # abort, don't run untrained
+        else:
+            for idx, run in enumerate(runs_list):  # the idx-th run provides the networks of the idx-th agent
+                run_path = f"./study_out/{run}"
+                self.agents[idx].pi.load_model_parameters(f"{run_path}/PolicyNet_model_parameters.pth")
+                self.agents[idx].vf.load_model_parameters(f"{run_path}/ValueNet_model_parameters.pth")

    @torch.no_grad()
    def train_loop(self):
        """ Function for training agents """
        env = self.factory
-        n_steps, max_steps = [self.cfg[nms.ALGORITHM][k] for k in [nms.N_STEPS, nms.MAX_STEPS]]
+        n_steps, max_steps = [self.train_cfg[nms.ALGORITHM][k] for k in [nms.N_STEPS, nms.MAX_STEPS]]
        global_steps, episode = 0, 0
-        indices = distribute_indices(env, self.cfg, self.n_agents)
+        indices = distribute_indices(env, self.train_cfg, self.n_agents)
        coin_piles_positions = get_coin_piles_positions(env)
        target_pile = [partition[0] for partition in
                       indices]  # list of pointers that point to the current target pile for each agent
        collected_coin_piles = [{pos: False for pos in coin_piles_positions} for _ in range(self.n_agents)]
+        low_change_phase_start_episode = -1
+        episode_rewards_development = []
+        return_change_development = []

        pbar = tqdm(total=max_steps)
-        while global_steps < max_steps:
+        loop_condition = True if self.train_cfg[nms.ALGORITHM][nms.EARLY_STOPPING] else global_steps < max_steps
+        while loop_condition:
            _ = env.reset()
-            if self.cfg[nms.ENV][nms.TRAIN_RENDER]:
+            if self.train_cfg[nms.ENV][nms.TRAIN_RENDER]:
                env.render()
            set_agents_spawnpoints(env, self.n_agents)
-            ordered_coin_piles = get_ordered_coin_piles(env, collected_coin_piles, self.cfg, self.n_agents)
+            ordered_coin_piles = get_ordered_coin_piles(env, collected_coin_piles, self.train_cfg, self.n_agents)
            # Reset current target pile at episode begin if all piles have to be collected in one episode
-            if self.cfg[nms.ALGORITHM][nms.PILE_ALL_DONE] == nms.ALL:
+            if self.train_cfg[nms.ALGORITHM][nms.PILE_ALL_DONE] == nms.ALL:
                target_pile = [partition[0] for partition in indices]
                collected_coin_piles = [{pos: False for pos in coin_piles_positions} for _ in range(self.n_agents)]
+            episode_rewards_development.append([])

            # Supply each agent with its local observation
-            obs = transform_observations(env, ordered_coin_piles, target_pile, self.cfg, self.n_agents)
-            done, rew_log = [False] * self.n_agents, 0
+            obs = transform_observations(env, ordered_coin_piles, target_pile, self.train_cfg, self.n_agents)
+            done, ep_return = [False] * self.n_agents, 0
+
+            if self.train_cfg[nms.ALGORITHM][nms.EARLY_STOPPING]:
+                if len(return_change_development) > self.train_cfg[nms.ALGORITHM][
+                    nms.LAST_N_EPISODES] and low_change_phase_start_episode == -1 and has_low_change_phase_started(
+                        return_change_development, self.train_cfg[nms.ALGORITHM][nms.LAST_N_EPISODES],
+                        self.train_cfg[nms.ALGORITHM][nms.MEAN_TARGET_CHANGE]):
+                    low_change_phase_start_episode = len(return_change_development)
+                    print(low_change_phase_start_episode)
+
+                # Check if requirements for early stopping are met
+                if low_change_phase_start_episode != -1 and significant_deviation(return_change_development, low_change_phase_start_episode):
+                    print(f"Early Stopping in Episode: {global_steps} because of significant deviation.")
+                    break
+                if low_change_phase_start_episode != -1 and (len(return_change_development) - low_change_phase_start_episode) >= 1000:
+                    print(f"Early Stopping in Episode: {global_steps} because of episode time limit")
+                    break
+                if low_change_phase_start_episode != -1 and global_steps >= max_steps:
+                    print(f"Early Stopping in Episode: {global_steps} because of global steps time limit")
+                    break

            while not all(done):
                action = self.use_door_or_move(env, obs, collected_coin_piles) \
                    if nms.DOORS in env.state.entities.keys() else self.get_actions(obs)
                _, next_obs, reward, done, info = env.step(action)
-                next_obs = transform_observations(env, ordered_coin_piles, target_pile, self.cfg, self.n_agents)
+                next_obs = transform_observations(env, ordered_coin_piles, target_pile, self.train_cfg, self.n_agents)

                # Handle case where agent is on field with coin
                reward, done = self.handle_coin(env, collected_coin_piles, ordered_coin_piles, target_pile, indices,
-                                                reward, done)
+                                                reward, done, self.train_cfg)

                if n_steps != 0 and (global_steps + 1) % n_steps == 0: done = True

@@ -122,50 +165,67 @@ class A2C:
                        agent._episode[-1] = (next_obs[ag_i], action[ag_i], reward[ag_i], agent._episode[-1][-1])

                # Visualize state update
-                if self.cfg[nms.ENV][nms.TRAIN_RENDER]: env.render()
+                if self.train_cfg[nms.ENV][nms.TRAIN_RENDER]: env.render()

                obs = next_obs

-                if all(done): handle_finished_episode(obs, self.agents, self.cfg)
-
                global_steps += 1
-                rew_log += sum(reward)
+                episode_rewards_development[-1].extend(reward)

-                if global_steps >= max_steps: break
+                if all(done):
+                    handle_finished_episode(obs, self.agents, self.train_cfg)
+                    break

-            self.reward_development.append(rew_log)
+            if global_steps >= max_steps: break
+
+            return_change_development.append(
+                sum(episode_rewards_development[-1]) - sum(episode_rewards_development[-2])
+                if len(episode_rewards_development) > 1 else 0.0)
            episode += 1
            pbar.update(global_steps - pbar.n)

        pbar.close()
-        if self.cfg[nms.ENV][nms.SAVE_AND_LOG]:
-            plot_reward_development(self.reward_development, self.results_path)
-            create_info_maps(env, get_all_observations(env, self.cfg, self.n_agents),
+        if self.train_cfg[nms.ENV][nms.SAVE_AND_LOG]:
+            return_development = [np.sum(rewards) for rewards in episode_rewards_development]
+            discounted_return_development = [np.sum([reward * pow(self.gamma, i) for i, reward in enumerate(ep_rewards)]) for ep_rewards in episode_rewards_development]
+            plot_return_development(return_development, self.results_path)
+            plot_return_development(discounted_return_development, self.results_path, discounted=True)
+            plot_return_development_change(return_change_development, self.results_path)
+            create_info_maps(env, get_all_observations(env, self.train_cfg, self.n_agents),
                             get_coin_piles_positions(env), self.results_path, self.agents, self.act_dim, self)
+            metrics_data = {"episode_rewards_development": episode_rewards_development,
+                            "return_development": return_development,
+                            "discounted_return_development": discounted_return_development,
+                            "return_change_development": return_change_development}
+            with open(f"{self.results_path}/metrics", "wb") as pickle_file:
+                pickle.dump(metrics_data, pickle_file)
            save_agent_models(self.results_path, self.agents)
            plot_action_maps(env, [self], self.results_path)

    @torch.inference_mode(True)
-    def eval_loop(self, n_episodes):
+    def eval_loop(self, config_name, n_episodes):
        """ Function for performing inference """
        env = self.eval_factory
-        self.set_cfg(eval=True)
        episode, results = 0, []
        coin_piles_positions = get_coin_piles_positions(env)
-        indices = distribute_indices(env, self.cfg, self.n_agents)
+        if config_name == "coin_quadrant": print("Coin Piles positions", coin_piles_positions)
+        indices = distribute_indices(env, self.eval_cfg, self.n_agents)
        target_pile = [partition[0] for partition in
                       indices]  # list of pointers that point to the current target pile for each agent
-        if self.cfg[nms.ALGORITHM][nms.PILE_ALL_DONE] == nms.DISTRIBUTED:
+        if self.eval_cfg[nms.ALGORITHM][nms.PILE_ALL_DONE] == nms.DISTRIBUTED:
            collected_coin_piles = [{coin_piles_positions[idx]: False for idx in indices[i]} for i in
                                  range(self.n_agents)]
-        else: collected_coin_piles = [{pos: False for pos in coin_piles_positions} for _ in range(self.n_agents)]
+        else:
+            collected_coin_piles = [{pos: False for pos in coin_piles_positions} for _ in range(self.n_agents)]
+
+        collected_coin_piles_per_step = []

        while episode < n_episodes:
            _ = env.reset()
            set_agents_spawnpoints(env, self.n_agents)
-            if self.cfg[nms.ENV][nms.EVAL_RENDER]:
+            if self.eval_cfg[nms.ENV][nms.EVAL_RENDER]:
                # Don't render auxiliary piles
-                if self.cfg[nms.ALGORITHM][nms.AUXILIARY_PILES]:
+                if self.eval_cfg[nms.ALGORITHM][nms.AUXILIARY_PILES]:
                    auxiliary_piles = [pile for idx, pile in enumerate(env.state.entities[nms.COIN_PILES]) if
                                       idx % 2 == 0]
                    for pile in auxiliary_piles:
@@ -174,19 +234,23 @@ class A2C:
                env._renderer.fps = 5  # Slow down agent movement

            # Reset current target pile at episode begin if all piles have to be collected in one episode
-            if self.cfg[nms.ALGORITHM][nms.PILE_ALL_DONE] in [nms.ALL, nms.DISTRIBUTED, nms.SHARED]:
+            if self.eval_cfg[nms.ALGORITHM][nms.PILE_ALL_DONE] in [nms.ALL, nms.DISTRIBUTED, nms.SHARED]:
                target_pile = [partition[0] for partition in indices]
-                if self.cfg[nms.ALGORITHM][nms.PILE_ALL_DONE] == nms.DISTRIBUTED:
+                if self.eval_cfg[nms.ALGORITHM][nms.PILE_ALL_DONE] == nms.DISTRIBUTED:
                    collected_coin_piles = [{coin_piles_positions[idx]: False for idx in indices[i]} for i in
                                          range(self.n_agents)]
-                else: collected_coin_piles = [{pos: False for pos in coin_piles_positions} for _ in range(self.n_agents)]
+                else:
+                    collected_coin_piles = [{pos: False for pos in coin_piles_positions} for _ in range(self.n_agents)]

-            ordered_coin_piles = get_ordered_coin_piles(env, collected_coin_piles, self.cfg, self.n_agents)
+            ordered_coin_piles = get_ordered_coin_piles(env, collected_coin_piles, self.eval_cfg, self.n_agents)

            # Supply each agent with its local observation
-            obs = transform_observations(env, ordered_coin_piles, target_pile, self.cfg, self.n_agents)
+            obs = transform_observations(env, ordered_coin_piles, target_pile, self.eval_cfg, self.n_agents)
            done, rew_log, eps_rew = [False] * self.n_agents, 0, torch.zeros(self.n_agents)

+            collected_coin_piles_per_step.append([])
+
+            ep_steps = 0
            while not all(done):
                action = self.use_door_or_move(env, obs, collected_coin_piles, det=True) \
                    if nms.DOORS in env.state.entities.keys() else self.execute_policy(obs, env,
@@ -195,20 +259,44 @@ class A2C:

                # Handle case where agent is on field with coin
                reward, done = self.handle_coin(env, collected_coin_piles, ordered_coin_piles, target_pile, indices,
-                                                reward, done)
+                                                reward, done, self.eval_cfg)
+
+                ordered_coin_piles = get_ordered_coin_piles(env, collected_coin_piles, self.eval_cfg, self.n_agents)

                # Get transformed next_obs that might have been updated because of handle_coin
-                next_obs = transform_observations(env, ordered_coin_piles, target_pile, self.cfg, self.n_agents)
+                next_obs = transform_observations(env, ordered_coin_piles, target_pile, self.eval_cfg, self.n_agents)

                done = [done] * self.n_agents if isinstance(done, bool) else done

-                if self.cfg[nms.ENV][nms.EVAL_RENDER]: env.render()
+                if self.eval_cfg[nms.ENV][nms.EVAL_RENDER]: env.render()

                obs = next_obs

-            episode += 1
+                # Count the overall number of cleaned coin piles in each step
+                collected_piles = 0
+                for dict in collected_coin_piles:
+                    for value in dict.values():
+                        if value:
+                            collected_piles += 1
+                collected_coin_piles_per_step[-1].append(collected_piles)

-    # -------------------------------------- HELPER FUNCTIONS ------------------------------------------------- #
+                ep_steps += 1
+
+            episode += 1
+            print("Number of environment steps:", ep_steps)
+            if config_name == "coin_quadrant":
+                print("Collected coins per step:", collected_coin_piles_per_step)
+            else:
+                # For the RL agent, we encode the flags internally as coins as well.
+                # Also, we have to subtract the auxiliary pile in the emergence prevention mechanism case
+                print("Reached flags per step:", [[max(0, coin_pile - 1) for coin_pile in ele] for ele in collected_coin_piles_per_step])
+
+        if self.eval_cfg[nms.ENV][nms.SAVE_AND_LOG]:
+            metrics_data = {"collected_coin_piles_per_step": collected_coin_piles_per_step}
+            with open(f"{self.results_path}/metrics", "wb") as pickle_file:
+                pickle.dump(metrics_data, pickle_file)
+
+    ########## Helper functions ########

    def get_actions(self, observations) -> ListOrTensor:
        """ Given local observations, get actions for both agents """
@@ -247,14 +335,18 @@ class A2C:
                            a.name == nms.USE_DOOR))
                        # Don't include action in agent experience
                    else:
-                        if det: action.append(int(agent.pi(agent_obs, det=True)[0]))
-                        else: action.append(int(agent.step(agent_obs)))
+                        if det:
+                            action.append(int(agent.pi(agent_obs, det=True)[0]))
+                        else:
+                            action.append(int(agent.step(agent_obs)))
                else:
-                    if det: action.append(int(agent.pi(agent_obs, det=True)[0]))
-                    else: action.append(int(agent.step(agent_obs)))
+                    if det:
+                        action.append(int(agent.pi(agent_obs, det=True)[0]))
+                    else:
+                        action.append(int(agent.step(agent_obs)))
        return action

-    def handle_coin(self, env, collected_coin_piles, ordered_coin_piles, target_pile, indices, reward, done):
+    def handle_coin(self, env, collected_coin_piles, ordered_coin_piles, target_pile, indices, reward, done, cfg):
        """ Check if agent moved on field with coin. If that is the case collect coin automatically """
        agents_positions = get_agents_positions(env, self.n_agents)
        coin_piles_positions = get_coin_piles_positions(env)
@@ -269,10 +361,10 @@ class A2C:
                            reward[idx] += 50
                            collected_coin_piles[idx][pos] = True
                            # Set pointer to next coin pile
-                            update_target_pile(env, idx, target_pile, indices, self.cfg)
+                            update_target_pile(env, idx, target_pile, indices, cfg)
                            update_ordered_coin_piles(idx, collected_coin_piles, ordered_coin_piles, env,
-                                                      self.cfg, self.n_agents)
-                            if self.cfg[nms.ALGORITHM][nms.PILE_ALL_DONE] == nms.SINGLE:
+                                                      cfg, self.n_agents)
+                            if cfg[nms.ALGORITHM][nms.PILE_ALL_DONE] == nms.SINGLE:
                                done = True
                                if all(collected_coin_piles[idx].values()):
                                    # Reset collected_coin_piles indicator
@@ -285,11 +377,15 @@ class A2C:
                    # Indicate that renderer can hide coin pile
                    coin_at_position = env.state[nms.COIN_PILES].by_pos(pos)
                    coin_at_position[0].set_new_amount(0)
+                    """
+                    coin_at_position = env.state[nms.COIN_PILES].by_pos(pos)[0]
+                    env.state[nms.COIN_PILES].delete_env_object(coin_at_position)
+                    """

-            if self.cfg[nms.ALGORITHM][nms.PILE_ALL_DONE] in [nms.ALL, nms.DISTRIBUTED]:
+            if cfg[nms.ALGORITHM][nms.PILE_ALL_DONE] in [nms.ALL, nms.DISTRIBUTED]:
                if all([all(collected_coin_piles[i].values()) for i in range(self.n_agents)]):
                    done = True
-            elif self.cfg[nms.ALGORITHM][nms.PILE_ALL_DONE] == nms.SHARED:
+            elif cfg[nms.ALGORITHM][nms.PILE_ALL_DONE] == nms.SHARED:
                # End episode if both agents together have collected all coin piles
                if all(get_all_collected_coin_piles(coin_piles_positions, collected_coin_piles, self.n_agents).values()):
                    done = True
--- a/marl_factory_grid/algorithms/marl/a2c_dirt.py
+++ b/marl_factory_grid/algorithms/marl/a2c_dirt.py
@@ -1,755 +0,0 @@
-import copy
-import os
-import random
-
-import imageio # requires ffmpeg install on operating system and imageio-ffmpeg package for python
-from scipy import signal
-import matplotlib.pyplot as plt
-import torch
-from typing import Union, List, Dict
-import numpy as np
-from torch.distributions import Categorical
-
-from marl_factory_grid.algorithms.marl.base_a2c import PolicyGradient, cumulate_discount
-from marl_factory_grid.algorithms.marl.memory import MARLActorCriticMemory
-from marl_factory_grid.algorithms.utils import add_env_props, instantiate_class
-from pathlib import Path
-from collections import deque
-
-from marl_factory_grid.environment.actions import Noop
-from marl_factory_grid.modules import Clean, DoorUse
-from marl_factory_grid.utils.plotting.plot_single_runs import plot_action_maps
-
-
-class Names:
-    REWARD          = 'reward'
-    DONE            = 'done'
-    ACTION          = 'action'
-    OBSERVATION     = 'observation'
-    LOGITS          = 'logits'
-    HIDDEN_ACTOR    = 'hidden_actor'
-    HIDDEN_CRITIC   = 'hidden_critic'
-    AGENT           = 'agent'
-    ENV             = 'env'
-    ENV_NAME        = 'env_name'
-    N_AGENTS        = 'n_agents'
-    ALGORITHM       = 'algorithm'
-    MAX_STEPS       = 'max_steps'
-    N_STEPS         = 'n_steps'
-    BUFFER_SIZE     = 'buffer_size'
-    CRITIC          = 'critic'
-    BATCH_SIZE      = 'bnatch_size'
-    N_ACTIONS       = 'n_actions'
-    TRAIN_RENDER    = 'train_render'
-    EVAL_RENDER     = 'eval_render'
-
-
-nms = Names
-ListOrTensor = Union[List, torch.Tensor]
-
-
-class A2C:
-    def __init__(self, train_cfg, eval_cfg):
-        self.factory = add_env_props(train_cfg)
-        self.eval_factory = add_env_props(eval_cfg)
-        self.__training = True
-        self.train_cfg = train_cfg
-        self.eval_cfg = eval_cfg
-        self.cfg = train_cfg
-        self.n_agents = train_cfg[nms.AGENT][nms.N_AGENTS]
-        self.setup()
-        self.reward_development = []
-        self.action_probabilities = {agent_idx:[] for agent_idx in range(self.n_agents)}
-
-    def setup(self):
-        dirt_piles_positions = [self.factory.state.entities['DirtPiles'][pile_idx].pos for pile_idx in
-                                range(len(self.factory.state.entities['DirtPiles']))]
-        if self.cfg[nms.ALGORITHM]["pile-observability"] == "all":
-            obs_dim = 2 + 2*len(dirt_piles_positions)
-        else:
-            obs_dim = 4
-        self.obs_dim = obs_dim
-        self.act_dim = 4
-        # act_dim=4, because we want the agent to only learn a routing problem
-        self.agents = [PolicyGradient(self.factory, agent_id=i, obs_dim=obs_dim, act_dim=self.act_dim) for i in range(self.n_agents)]
-        if self.cfg[nms.ENV]["save_and_log"]:
-            # Create results folder
-            runs = os.listdir("../study_out/")
-            run_numbers = [int(run[3:]) for run in runs if run[:3] == "run"]
-            next_run_number = max(run_numbers)+1 if run_numbers else 0
-            self.results_path = f"../study_out/run{next_run_number}"
-            os.mkdir(self.results_path)
-            # Save settings in results folder
-            self.save_configs()
-            if self.cfg[nms.ENV]["record"]:
-                self.recorder = imageio.get_writer(f'{self.results_path}/pygame_recording.mp4', fps=5)
-
-    def set_cfg(self, eval=False):
-        if eval:
-            self.cfg = self.eval_cfg
-        else:
-            self.cfg = self.train_cfg
-
-    @classmethod
-    def _as_torch(cls, x):
-        if isinstance(x, np.ndarray):
-            return torch.from_numpy(x)
-        elif isinstance(x, List):
-            return torch.tensor(x)
-        elif isinstance(x, (int, float)):
-            return torch.tensor([x])
-        return x
-
-    def get_actions(self, observations) -> ListOrTensor:
-        # Given an observation, get actions for both agents
-        actions = [agent.step(self._as_torch(observations[ag_i]).view(-1).to(torch.float32)) for ag_i, agent in enumerate(self.agents)]
-        return actions
-
-    def execute_policy(self, observations, env, cleaned_dirt_piles) -> ListOrTensor:
-        # Use deterministic policy for inference
-        actions = [agent.policy(self._as_torch(observations[ag_i]).view(-1).to(torch.float32)) for ag_i, agent in enumerate(self.agents)]
-        for agent_idx in range(self.n_agents):
-            if all(cleaned_dirt_piles[agent_idx].values()):
-                actions[agent_idx] = np.array(next(action_i for action_i, a in enumerate(env.state["Agent"][agent_idx].actions) if a.name == "Noop"))
-        return actions
-
-    def transform_observations(self, env, ordered_dirt_piles, target_pile):
-        """ Assumes that agent has observations -DirtPiles and -Self """
-        agent_positions = [env.state.moving_entites[agent_idx].pos for agent_idx in range(self.n_agents)]
-        if self.cfg[nms.ALGORITHM]["pile-observability"] == "all":
-            trans_obs = [torch.zeros(2+2*len(ordered_dirt_piles[0])) for _ in range(len(agent_positions))]
-        else:
-            # Only show current target pile
-            trans_obs = [torch.zeros(4) for _ in range(len(agent_positions))]
-        for i, pos in enumerate(agent_positions):
-            agent_x, agent_y = pos[0], pos[1]
-            trans_obs[i][0] = agent_x
-            trans_obs[i][1] = agent_y
-            idx = 2
-            if self.cfg[nms.ALGORITHM]["pile-observability"] == "all":
-                for pile_pos in ordered_dirt_piles[i]:
-                    trans_obs[i][idx] = pile_pos[0]
-                    trans_obs[i][idx + 1] = pile_pos[1]
-                    idx += 2
-            else:
-                trans_obs[i][2] = ordered_dirt_piles[i][target_pile[i]][0]
-                trans_obs[i][3] = ordered_dirt_piles[i][target_pile[i]][1]
-        return trans_obs
-
-    def get_all_observations(self, env):
-        dirt_piles_positions = [env.state.entities['DirtPiles'][pile_idx].pos for pile_idx in
-                                range(len(env.state.entities['DirtPiles']))]
-        if self.cfg[nms.ALGORITHM]["pile-observability"] == "all":
-            obs = [torch.zeros(2 + 2 * len(dirt_piles_positions))]
-            observations = [[]]
-            # Fill in pile positions
-            idx = 2
-            for pile_pos in dirt_piles_positions:
-                obs[0][idx] = pile_pos[0]
-                obs[0][idx + 1] = pile_pos[1]
-                idx += 2
-        else:
-            # Have multiple observation layers of the map for each dirt pile one
-            obs = [torch.zeros(4) for _ in range(self.n_agents) for _ in dirt_piles_positions]
-            observations = [[] for _ in dirt_piles_positions]
-            for idx, pile_pos in enumerate(dirt_piles_positions):
-                obs[idx][2] = pile_pos[0]
-                obs[idx][3] = pile_pos[1]
-        valid_agent_positions = env.state.entities.floorlist
-        #observations_shape = (max(t[0] for t in valid_agent_positions) + 2, max(t[1] for t in valid_agent_positions) + 2)
-        for idx, pos in enumerate(valid_agent_positions):
-            for obs_layer in range(len(obs)):
-                observation = copy.deepcopy(obs[obs_layer])
-                observation[0] = pos[0]
-                observation[1] = pos[1]
-                observations[obs_layer].append(observation)
-
-        return observations
-
-    def get_dirt_piles_positions(self, env):
-        return [env.state.entities['DirtPiles'][pile_idx].pos for pile_idx in range(len(env.state.entities['DirtPiles']))]
-
-    def get_ordered_dirt_piles(self, env, cleaned_dirt_piles, target_pile):
-        """ Each agent can have it's individual pile order """
-        ordered_dirt_piles = [[] for _ in range(self.n_agents)]
-        dirt_pile_positions = self.get_dirt_piles_positions(env)
-        agent_positions = [env.state.moving_entites[agent_idx].pos for agent_idx in range(self.n_agents)]
-        for agent_idx in range(self.n_agents):
-            if self.cfg[nms.ALGORITHM]["pile-order"] in ["fixed", "agents"]:
-                ordered_dirt_piles[agent_idx] = dirt_pile_positions
-            elif self.cfg[nms.ALGORITHM]["pile-order"] == "random":
-                ordered_dirt_piles[agent_idx] = dirt_pile_positions
-                random.shuffle(ordered_dirt_piles)
-            elif self.cfg[nms.ALGORITHM]["pile-order"] == "none":
-                ordered_dirt_piles[agent_idx] = None
-            elif self.cfg[nms.ALGORITHM]["pile-order"] in ["smart", "dynamic"]:
-                # Calculate distances for remaining unvisited dirt piles
-                remaining_target_piles = [pos for pos, value in cleaned_dirt_piles[agent_idx].items() if not value]
-                pile_distances = {pos:0 for pos in remaining_target_piles}
-                agent_pos = agent_positions[agent_idx]
-                for pos in remaining_target_piles:
-                    pile_distances[pos] = np.abs(agent_pos[0] - pos[0]) + np.abs(agent_pos[1] - pos[1])
-
-                if self.cfg[nms.ALGORITHM]["pile-order"] == "smart":
-                    # Check if there is an agent in line with any of the remaining dirt piles
-                    for pile_pos in remaining_target_piles:
-                        for other_pos in agent_positions:
-                            if other_pos != agent_pos:
-                                if agent_pos[0] == other_pos[0] == pile_pos[0] or agent_pos[1] == other_pos[1] == pile_pos[1]:
-                                    # Get the line between the agent and the goal
-                                    path = self.bresenham(agent_pos[0], agent_pos[1], pile_pos[0], pile_pos[1])
-
-                                    # Check if the entity lies on the path between the agent and the goal
-                                    if other_pos in path:
-                                        pile_distances[pile_pos] += np.abs(agent_pos[0] - other_pos[0]) + np.abs(agent_pos[1] - other_pos[1])
-
-                sorted_pile_distances = dict(sorted(pile_distances.items(), key=lambda item: item[1]))
-                # Insert already visited dirt piles
-                ordered_dirt_piles[agent_idx] = [pos for pos in dirt_pile_positions if pos not in remaining_target_piles]
-                # Fill up with sorted positions
-                for pos in sorted_pile_distances.keys():
-                    ordered_dirt_piles[agent_idx].append(pos)
-
-            else:
-                print("Not a valid pile order option.")
-                exit()
-
-        return ordered_dirt_piles
-
-    def bresenham(self, x0, y0, x1, y1):
-        """Bresenham's line algorithm to get the coordinates of a line between two points."""
-        dx = np.abs(x1 - x0)
-        dy = np.abs(y1 - y0)
-        sx = 1 if x0 < x1 else -1
-        sy = 1 if y0 < y1 else -1
-        err = dx - dy
-
-        coordinates = []
-        while True:
-            coordinates.append((x0, y0))
-            if x0 == x1 and y0 == y1:
-                break
-            e2 = 2 * err
-            if e2 > -dy:
-                err -= dy
-                x0 += sx
-            if e2 < dx:
-                err += dx
-                y0 += sy
-        return coordinates
-
-    def update_ordered_dirt_piles(self, agent_idx, cleaned_dirt_piles, ordered_dirt_piles, env, target_pile):
-        # Only update ordered_dirt_pile for agent that reached its target pile
-        updated_ordered_dirt_piles = self.get_ordered_dirt_piles(env, cleaned_dirt_piles, target_pile)
-        for i in range(len(ordered_dirt_piles[agent_idx])):
-            ordered_dirt_piles[agent_idx][i] = updated_ordered_dirt_piles[agent_idx][i]
-
-    def distribute_indices(self, env):
-        indices = []
-        n_dirt_piles = len(self.get_dirt_piles_positions(env))
-        if n_dirt_piles == 1 or self.cfg[nms.ALGORITHM]["pile-order"] in ["fixed", "random", "none", "dynamic", "smart"]:
-            indices = [[0] for _ in range(self.n_agents)]
-        else:
-            base_count = n_dirt_piles // self.n_agents
-            remainder = n_dirt_piles % self.n_agents
-
-            start_index = 0
-            for i in range(self.n_agents):
-                # Add an extra index to the first 'remainder' objects
-                end_index = start_index + base_count + (1 if i < remainder else 0)
-                indices.append(list(range(start_index, end_index)))
-                start_index = end_index
-
-            # Static form: auxiliary pile, primary pile, auxiliary pile, ...
-            # -> Starting with index 0 even piles are auxiliary piles, odd piles are primary piles
-            if self.cfg[nms.ALGORITHM]["auxiliary_piles"] and "Doors" in env.state.entities.keys():
-                door_positions = [door.pos for door in env.state.entities["Doors"]]
-                agent_positions = [env.state.moving_entites[agent_idx].pos for agent_idx in range(self.n_agents)]
-                distances = {door_pos:[] for door_pos in door_positions}
-
-                # Calculate distance of every agent to every door
-                for door_pos in door_positions:
-                    for agent_pos in agent_positions:
-                        distances[door_pos].append(np.abs(door_pos[0] - agent_pos[0]) + np.abs(door_pos[1] - agent_pos[1]))
-
-                def duplicate_indices(lst, item):
-                    return [i for i, x in enumerate(lst) if x == item]
-
-                # Get agent indices of agents with same distance to door
-                affected_agents = {door_pos:{} for door_pos in door_positions}
-                for door_pos in distances.keys():
-                    dist = distances[door_pos]
-                    dist_set = set(dist)
-                    for d in dist_set:
-                        affected_agents[door_pos][str(d)] = duplicate_indices(dist, d)
-
-                # TODO: Make generic for multiple doors
-                updated_indices = []
-                if len(affected_agents[door_positions[0]]) == 0:
-                    # Remove auxiliary piles for all agents
-                    updated_indices = [[ele for ele in lst if ele % 2 != 0] for lst in indices]
-                else:
-                    for distance, agent_indices in affected_agents[door_positions[0]].items():
-                        # Pick random agent to keep auxiliary pile and remove it for all others
-                        #selected_agent = np.random.choice(agent_indices)
-                        selected_agent = 0
-                        for agent_idx in agent_indices:
-                            if agent_idx == selected_agent:
-                                updated_indices.append(indices[agent_idx])
-                            else:
-                                updated_indices.append([ele for ele in indices[agent_idx] if ele % 2 != 0])
-
-                indices = updated_indices
-
-        return indices
-
-    def update_target_pile(self, env, agent_idx, target_pile, indices):
-        if self.cfg[nms.ALGORITHM]["pile-order"] in ["fixed", "random", "none", "dynamic", "smart"]:
-            if target_pile[agent_idx] + 1 < len(self.get_dirt_piles_positions(env)):
-                target_pile[agent_idx] += 1
-            else:
-                target_pile[agent_idx] = 0
-        else:
-            if target_pile[agent_idx] + 1 in indices[agent_idx]:
-                target_pile[agent_idx] += 1
-
-    def door_is_close(self, env, agent_idx):
-        neighbourhood = [y for x in env.state.entities.neighboring_positions(env.state["Agent"][agent_idx].pos)
-                        for y in env.state.entities.pos_dict[x] if "Door" in y.name]
-        if neighbourhood:
-            return neighbourhood[0]
-
-    def use_door_or_move(self, env, obs, cleaned_dirt_piles, target_pile, det=False):
-        action = []
-        for agent_idx, agent in enumerate(self.agents):
-            agent_obs = self._as_torch((obs)[agent_idx]).view(-1).to(torch.float32)
-            # If agent already reached its target
-            if all(cleaned_dirt_piles[agent_idx].values()):
-                action.append(next(action_i for action_i, a in enumerate(env.state["Agent"][agent_idx].actions) if a.name == "Noop"))
-                if not det:
-                    # Include agent experience entry manually
-                    agent._episode.append((None, None, None, agent.vf(agent_obs)))
-            else:
-                if door := self.door_is_close(env, agent_idx):
-                    if door.is_closed:
-                        action.append(next(action_i for action_i, a in enumerate(env.state["Agent"][agent_idx].actions) if a.name == "use_door"))
-                        # Don't include action in agent experience
-                    else:
-                        if det:
-                            action.append(int(agent.pi(agent_obs, det=True)[0]))
-                        else:
-                            action.append(int(agent.step(agent_obs)))
-                else:
-                    if det:
-                        action.append(int(agent.pi(agent_obs, det=True)[0]))
-                    else:
-                        action.append(int(agent.step(agent_obs)))
-        return action
-
-    def reward_distance(self, env, obs, target_pile, reward):
-        agent_positions = [env.state.moving_entites[agent_idx].pos for agent_idx in range(self.n_agents)]
-        # Give a negative reward for every step that keeps agent from getting closer to currently selected target pile/ closest pile
-        for idx, pos in enumerate(agent_positions):
-            last_pos = (int(obs[idx][0]), int(obs[idx][1].item()))
-            target_pile_pos = self.get_dirt_piles_positions(env)[target_pile[idx]]
-            last_distance = np.abs(target_pile_pos[0] - last_pos[0]) + np.abs(target_pile_pos[1] - last_pos[1])
-            new_distance = np.abs(target_pile_pos[0] - pos[0]) + np.abs(target_pile_pos[1] - pos[1])
-            if new_distance >= last_distance:
-                reward[idx] -= 0.05  # 0.05
-        return reward
-
-    def punish_entering_same_field(self, next_obs, passed_fields, reward):
-        # Give a high negative reward if agent enters same field twice
-        for idx in range(self.n_agents):
-            if (next_obs[idx][0], next_obs[idx][1]) in passed_fields[idx]:
-                reward[idx] += -0.1
-            else:
-                passed_fields[idx].append((next_obs[idx][0], next_obs[idx][1]))
-
-
-    def handle_dirt_quadrant_observation_bugs(self, obs, env):
-        try:
-            # Check that dirt position and amount are still correct
-            assert np.where(obs[0][0] == 0.5)[0][0] == 1 and np.where(obs[0][0] == 0.5)[0][0] == 1
-        except:
-            print("Missing dirt pile")
-            # Manually place dirt on defined position
-            obs[0][0][1][1] = 0.5
-        try:
-            # Check that self still returns a valid agent position on the map
-            assert np.where(obs[0][1] == 1)[0][0] and np.where(obs[0][1] == 1)[1][0]
-        except:
-            # Place agent manually in obs object on last known position
-            x, y = env.state.moving_entites[0].pos[0], env.state.moving_entites[0].pos[1]
-            obs[0][1][x][y] = 1
-            print("Missing agent position")
-
-    def get_all_cleaned_dirt_piles(self, dirt_piles_positions, cleaned_dirt_piles):
-        meta_cleaned_dirt_piles = {pos: False for pos in dirt_piles_positions}
-        for agent_idx in range(self.n_agents):
-            for (pos, cleaned) in cleaned_dirt_piles[agent_idx].items():
-                if cleaned:
-                    meta_cleaned_dirt_piles[pos] = True
-        return meta_cleaned_dirt_piles
-
-    def handle_dirt(self, env, cleaned_dirt_piles, ordered_dirt_piles, target_pile, indices, reward, done):
-        # Check if agent moved on field with dirt. If that is the case collect dirt automatically
-        agent_positions = [env.state.moving_entites[agent_idx].pos for agent_idx in range(self.n_agents)]
-        dirt_piles_positions = self.get_dirt_piles_positions(env)
-        if any([True for pos in agent_positions if pos in dirt_piles_positions]):
-            # Do Noop for agent that does not collect dirt
-            """action = [np.array(5), np.array(5)]
-
-            # Execute real step in environment
-            for idx, pos in enumerate(agent_positions):
-                if pos in cleaned_dirt_piles[idx].keys() and not cleaned_dirt_piles[idx][pos]:
-                    action[idx] = np.array(4)
-                    # Collect dirt
-                    _, next_obs, reward, done, info = env.step(action)
-                    cleaned_dirt_piles[idx][pos] = True
-                    break"""
-
-            # Only simulate collecting the dirt
-            for idx, pos in enumerate(agent_positions):
-                if pos in cleaned_dirt_piles[idx].keys() and not cleaned_dirt_piles[idx][pos]:
-                    # print(env.state.entities["Agent"][idx], pos, idx, target_pile, ordered_dirt_piles)
-                    # If dirt piles should be cleaned in a specific order
-                    if ordered_dirt_piles[idx]:
-                        if pos == ordered_dirt_piles[idx][target_pile[idx]]:
-                            reward[idx] += 50  # 1
-                            cleaned_dirt_piles[idx][pos] = True
-                            # Set pointer to next dirt pile
-                            self.update_target_pile(env, idx, target_pile, indices)
-                            self.update_ordered_dirt_piles(idx, cleaned_dirt_piles, ordered_dirt_piles, env, target_pile)
-                            if self.cfg[nms.ALGORITHM]["pile_all_done"] == "single":
-                                done = True
-                                if all(cleaned_dirt_piles[idx].values()):
-                                    # Reset cleaned_dirt_piles indicator
-                                    for pos in dirt_piles_positions:
-                                        cleaned_dirt_piles[idx][pos] = False
-                    else:
-                        reward[idx] += 50  # 1
-                        cleaned_dirt_piles[idx][pos] = True
-
-            if self.cfg[nms.ALGORITHM]["pile_all_done"] in ["all", "distributed"]:
-                if all([all(cleaned_dirt_piles[i].values()) for i in range(self.n_agents)]):
-                    done = True
-            elif self.cfg[nms.ALGORITHM]["pile_all_done"] == "shared":
-                # End episode if both agents together have cleaned all dirt piles
-                if all(self.get_all_cleaned_dirt_piles(dirt_piles_positions, cleaned_dirt_piles).values()):
-                    done = True
-
-        return reward, done
-
-    def handle_finished_episode(self, obs):
-        with torch.inference_mode(False):
-            for ag_i, agent in enumerate(self.agents):
-                # Get states, actions, rewards and values from rollout buffer
-                data = agent.finish_episode()
-                # Chunk episode data, such that there will be no memory failure for very long episodes
-                chunks = self.split_into_chunks(data)
-                for (s, a, R, V) in chunks:
-                    # Calculate discounted return and advantage
-                    G = cumulate_discount(R, self.cfg[nms.ALGORITHM]["gamma"])
-                    if self.cfg[nms.ALGORITHM]["advantage"] == "Reinforce":
-                        A = G
-                    elif self.cfg[nms.ALGORITHM]["advantage"] == "Advantage-AC":
-                        A = G - V  # Actor-Critic Advantages
-                    elif self.cfg[nms.ALGORITHM]["advantage"] == "TD-Advantage-AC":
-                        with torch.no_grad():
-                            A = R + self.cfg[nms.ALGORITHM]["gamma"] * np.append(V[1:], agent.vf(
-                                self._as_torch(obs[ag_i]).view(-1).to(
-                                    torch.float32)).numpy()) - V  # TD Actor-Critic Advantages
-                    else:
-                        print("Not a valid advantage option.")
-                        exit()
-
-                    rollout = (torch.tensor(x.copy()).to(torch.float32) for x in (s, a, G, A))
-                    # Update policy and value net of agent with experience from rollout buffer
-                    agent.train(*rollout)
-
-    def split_into_chunks(self, data_tuple):
-        result = [data_tuple]
-        chunk_size = self.cfg[nms.ALGORITHM]["chunk-episode"]
-        if chunk_size > 0:
-            # Get the maximum length of the lists in the tuple to handle different lengths
-            max_length = max(len(lst) for lst in data_tuple)
-
-            # Prepare a list to store the result
-            result = []
-
-            # Split each list into chunks and add them to the result
-            for i in range(0, max_length, chunk_size):
-                # Create a sublist containing the ith chunk from each list
-                sublist = [lst[i:i + chunk_size] for lst in data_tuple if i < len(lst)]
-                result.append(sublist)
-
-        return result
-
-    def set_agent_spawnpoint(self, env):
-        for agent_idx in range(self.n_agents):
-            agent_name = list(env.state.agents_conf.keys())[agent_idx]
-            current_pos_pointer = env.state.agents_conf[agent_name]["pos_pointer"]
-            # Making the reset dependent on the number of spawnpoints and not the number of dirtpiles allows
-            # for having multiple subsequent spawnpoints with the same target pile
-            if current_pos_pointer == len(env.state.agents_conf[agent_name]['positions']) - 1:
-                env.state.agents_conf[agent_name]["pos_pointer"] = 0
-            else:
-                env.state.agents_conf[agent_name]["pos_pointer"] += 1
-
-    @torch.no_grad()
-    def train_loop(self):
-        env = self.factory
-        n_steps, max_steps = [self.cfg[nms.ALGORITHM][k] for k in [nms.N_STEPS, nms.MAX_STEPS]]
-        global_steps, episode = 0, 0
-        indices = self.distribute_indices(env)
-        dirt_piles_positions = self.get_dirt_piles_positions(env)
-        used_actions = {i:0 for i in range(len(env.state.entities["Agent"][0]._actions))} # Assume both agents have the same actions
-        target_pile = [partition[0] for partition in indices]  # pointer that points to the target pile for each agent. (point to same pile, point to different piles)
-        cleaned_dirt_piles = [{pos: False for pos in dirt_piles_positions} for _ in range(self.n_agents)] # Have own dictionary for each agent
-
-        while global_steps < max_steps:
-            print(global_steps)
-            obs = env.reset() # !!!!!!!!Commented seems to work better? Only if a fixed spawnpoint is given
-            if self.cfg[nms.ENV][nms.TRAIN_RENDER]:
-                env.render()
-            self.set_agent_spawnpoint(env)
-            ordered_dirt_piles = self.get_ordered_dirt_piles(env, cleaned_dirt_piles, target_pile)
-            # Reset current target pile at episode begin if all piles have to be cleaned in one episode
-            if self.cfg[nms.ALGORITHM]["pile_all_done"] == "all":
-                target_pile = [partition[0] for partition in indices]
-                cleaned_dirt_piles = [{pos: False for pos in dirt_piles_positions} for _ in range(self.n_agents)]
-            """passed_fields = [[] for _ in range(self.n_agents)]"""
-
-            """obs = list(obs.values())"""
-            obs = self.transform_observations(env, ordered_dirt_piles, target_pile)
-            done, rew_log       = [False] * self.n_agents, 0
-
-            print("Agents spawnpoints:", [env.state.moving_entites[agent_idx].pos for agent_idx in range(self.n_agents)])
-            print("Agents target piles:", target_pile)
-            print("Agents initial observation:", obs)
-            print("Agents cleaned dirt piles:", cleaned_dirt_piles)
-
-            # Add Clean and Noop actions to agent actions so that they can be executed when the agent comes on a dirpile
-            """for i in range(self.n_agents):
-                self.factory.state['Agent'][i].actions.extend([Clean(), Noop()])"""
-
-            while not all(done):
-                # 0="North", 1="East", 2="South", 3="West", 4="Clean", 5="Noop"
-                action = self.use_door_or_move(env, obs, cleaned_dirt_piles, target_pile) \
-                    if "Doors" in env.state.entities.keys() else self.get_actions(obs)
-                used_actions[int(action[0])] += 1
-                _, next_obs, reward, done, info = env.step(action)
-                if done:
-                    print("DoneAtMaxStepsReached:", len(self.agents[0]._episode))
-                next_obs = self.transform_observations(env, ordered_dirt_piles, target_pile)
-
-                # Add small negative reward if agent has moved away from the target_pile
-                # reward = self.reward_distance(env, obs, target_pile, reward)
-
-                # Check and handle if agent is on field with dirt. This method can change the observation for the next step.
-                # If pile_all_done is "single", the episode ends if agents reached its target pile and the new episode begins
-                # with the updated observation. The observation that is saved to the rollout buffer, which resulted in reaching
-                # the target pile should not be updated before saving. Thus, the self.transform_observations call must happen
-                # before this method is called.
-                reward, done = self.handle_dirt(env, cleaned_dirt_piles, ordered_dirt_piles, target_pile, indices, reward, done)
-
-                if n_steps != 0 and (global_steps + 1) % n_steps == 0:
-                    print("max_steps reached")
-                    done = True
-
-                done = [done] * self.n_agents if isinstance(done, bool) else done
-                for ag_i, agent in enumerate(self.agents):
-                    # For forced actions like door opening, we have to call the step function with this action, but
-                    # since we are not allowed to exceed the dimensions range, we can't log the corresponding step info.
-                    if action[ag_i] in range(self.act_dim):
-                        # Add agent results into respective rollout buffers
-                        agent._episode[-1] = (next_obs[ag_i], action[ag_i], reward[ag_i], agent._episode[-1][-1])
-
-                if self.cfg[nms.ENV][nms.TRAIN_RENDER]:
-                    env.render()
-
-                obs = next_obs
-
-                if all(done): self.handle_finished_episode(obs)
-
-                global_steps += 1
-                rew_log += sum(reward)
-
-                if global_steps >= max_steps:
-                    break
-
-            print(f'reward at episode: {episode} = {rew_log}')
-            self.reward_development.append(rew_log)
-            episode += 1
-
-        self.plot_reward_development()
-        if self.cfg[nms.ENV]["save_and_log"]:
-            self.create_info_maps(env, used_actions)
-            self.save_agent_models()
-            plot_action_maps(env, [self], self.results_path)
-
-    @torch.inference_mode(True)
-    def eval_loop(self, n_episodes, render=False):
-        env = self.eval_factory
-        self.set_cfg(eval=True)
-        episode, results = 0, []
-        dirt_piles_positions = self.get_dirt_piles_positions(env)
-        indices = self.distribute_indices(env)
-        target_pile = [partition[0] for partition in indices]  # pointer that points to the target pile for each agent. (point to same pile, point to different piles)
-        if self.cfg[nms.ALGORITHM]["pile_all_done"] == "distributed":
-            cleaned_dirt_piles = [{dirt_piles_positions[idx]: False for idx in indices[i]} for i in range(self.n_agents)]
-        else:
-            cleaned_dirt_piles = [{pos: False for pos in dirt_piles_positions} for _ in range(self.n_agents)]
-
-        while episode < n_episodes:
-            obs = env.reset()
-            if self.cfg[nms.ENV][nms.EVAL_RENDER]:
-                if self.cfg[nms.ENV]["save_and_log"] and self.cfg[nms.ENV]["record"]:
-                    env.set_recorder(self.recorder)
-                env.render()
-                env._renderer.fps = 5
-            self.set_agent_spawnpoint(env)
-            """obs = list(obs.values())"""
-            # Reset current target pile at episode begin if all piles have to be cleaned in one episode
-            if self.cfg[nms.ALGORITHM]["pile_all_done"] in ["all", "distributed", "shared"]:
-                target_pile = [partition[0] for partition in indices]
-                if self.cfg[nms.ALGORITHM]["pile_all_done"] == "distributed":
-                    cleaned_dirt_piles = [{dirt_piles_positions[idx]: False for idx in indices[i]} for i in range(self.n_agents)]
-                else:
-                    cleaned_dirt_piles = [{pos: False for pos in dirt_piles_positions} for _ in range(self.n_agents)]
-
-            ordered_dirt_piles = self.get_ordered_dirt_piles(env, cleaned_dirt_piles, target_pile)
-
-            obs = self.transform_observations(env, ordered_dirt_piles, target_pile)
-            done, rew_log, eps_rew = [False] * self.n_agents, 0, torch.zeros(self.n_agents)
-
-            # Add Clean and Noop actions to agent actions so that they can be executed when the agent comes on a dirpile
-            """for i in range(self.n_agents):
-                self.factory.state['Agent'][i].actions.extend([Clean(), Noop()])"""
-
-            while not all(done):
-                action = self.use_door_or_move(env, obs, cleaned_dirt_piles, target_pile, det=True) \
-                    if "Doors" in env.state.entities.keys() else self.execute_policy(obs, env, cleaned_dirt_piles) # zero exploration
-                _, next_obs, reward, done, info = env.step(action) # Note that this call seems to flip the lists in indices
-                if done:
-                    print("DoneAtMaxStepsReached:", len(self.agents[0]._episode))
-
-                # Add small negative reward if agent has moved away from the target_pile
-                # reward = self.reward_distance(env, obs, target_pile, reward)
-
-                # Check and handle if agent is on field with dirt
-                reward, done = self.handle_dirt(env, cleaned_dirt_piles, ordered_dirt_piles, target_pile, indices, reward, done)
-
-                # Get transformed next_obs that might have been updated because of self.handle_dirt.
-                # For eval, where pile_all_done is "all", it's mandatory that the potential change of the target pile
-                # in the observation, caused by self.handle_dirt, is already considered when the next action is calculated.
-                next_obs = self.transform_observations(env, ordered_dirt_piles, target_pile)
-
-                done = [done] * self.n_agents if isinstance(done, bool) else done
-
-                if self.cfg[nms.ENV][nms.EVAL_RENDER]:
-                    env.render()
-
-                obs = next_obs
-
-            episode += 1
-
-        # Properly finalize the video file
-        if self.cfg[nms.ENV]["save_and_log"] and self.cfg[nms.ENV]["record"]:
-            self.recorder.close()
-
-    def plot_reward_development(self):
-        smoothed_data = np.convolve(self.reward_development, np.ones(10) / 10, mode='valid')
-        plt.plot(smoothed_data)
-        plt.ylim([-10, max(smoothed_data) + 20])
-        plt.title('Smoothed Reward Development')
-        plt.xlabel('Episode')
-        plt.ylabel('Reward')
-        if self.cfg[nms.ENV]["save_and_log"]:
-            plt.savefig(f"{self.results_path}/smoothed_reward_development.png")
-        plt.show()
-
-    def save_configs(self):
-        with open(f"{self.results_path}/MARL_config.txt", "w") as txt_file:
-            txt_file.write(str(self.cfg))
-        with open(f"{self.results_path}/train_env_config.txt", "w") as txt_file:
-            txt_file.write(str(self.factory.conf))
-        with open(f"{self.results_path}/eval_env_config.txt", "w") as txt_file:
-            txt_file.write(str(self.eval_factory.conf))
-
-    def save_agent_models(self):
-        for idx, agent in enumerate(self.agents):
-            agent_name = list(self.factory.state.agents_conf.keys())[idx]
-            agent.pi.save_model_parameters(self.results_path, agent_name)
-            agent.vf.save_model_parameters(self.results_path, agent_name)
-
-    def load_agents(self, runs_list):
-        for idx, run in enumerate(runs_list):
-            run_path = f"../study_out/{run}"
-            agent_name = list(self.eval_factory.state.agents_conf.keys())[idx]
-            self.agents[idx].pi.load_model_parameters(f"{run_path}/{agent_name}_PolicyNet_model_parameters.pth")
-            self.agents[idx].vf.load_model_parameters(f"{run_path}/{agent_name}_ValueNet_model_parameters.pth")
-
-    def create_info_maps(self, env, used_actions):
-        # Create value map
-        all_valid_observations = self.get_all_observations(env)
-        dirt_piles_positions = self.get_dirt_piles_positions(env)
-        with open(f"{self.results_path}/info_maps.txt", "w") as txt_file:
-            for obs_layer, pos in enumerate(dirt_piles_positions):
-                observations_shape = (
-                max(t[0] for t in env.state.entities.floorlist) + 2, max(t[1] for t in env.state.entities.floorlist) + 2)
-                value_maps = [np.zeros(observations_shape) for _ in self.agents]
-                likeliest_action = [np.full(observations_shape, np.NaN) for _ in self.agents]
-                action_probabilities = [np.zeros((observations_shape[0], observations_shape[1], self.act_dim)) for
-                                        _ in self.agents]
-                for obs in all_valid_observations[obs_layer]:
-                    """obs = self._as_torch(obs).view(-1).to(torch.float32)"""
-                    for idx, agent in enumerate(self.agents):
-                        """indices = np.where(obs[1] == 1) # Get agent position on grid (1 indicates the position)
-                        x, y = indices[0][0], indices[1][0]"""
-                        x, y = int(obs[0]), int(obs[1])
-                        try:
-                            value_maps[idx][x][y] = agent.vf(obs)
-                            probs = agent.pi.distribution(obs).probs
-                            likeliest_action[idx][x][y] = torch.argmax(probs)  # get the likeliest action at the current agent position
-                            action_probabilities[idx][x][y] = probs
-                        except:
-                            pass
-
-                txt_file.write("=======Value Maps=======\n")
-                print("=======Value Maps=======")
-                for agent_idx, vmap in enumerate(value_maps):
-                    txt_file.write(f"Value map of agent {agent_idx} for target pile {pos}:\n")
-                    print(f"Value map of agent {agent_idx} for target pile {pos}:")
-                    vmap = self._as_torch(vmap).round(decimals=4)
-                    max_digits = max(len(str(vmap.max().item())), len(str(vmap.min().item())))
-                    for idx, row in enumerate(vmap):
-                        txt_file.write(' '.join(f" {elem:>{max_digits + 1}}" for elem in row.tolist()))
-                        txt_file.write("\n")
-                        print(' '.join(f" {elem:>{max_digits + 1}}" for elem in row.tolist()))
-                txt_file.write("\n")
-                txt_file.write("=======Likeliest Action=======\n")
-                print("=======Likeliest Action=======")
-                for agent_idx, amap in enumerate(likeliest_action):
-                    txt_file.write(f"Likeliest action map of agent {agent_idx} for target pile {pos}:\n")
-                    print(f"Likeliest action map of agent {agent_idx} for target pile {pos}:")
-                    txt_file.write(np.array2string(amap))
-                    print(amap)
-                txt_file.write("\n")
-                txt_file.write("=======Action Probabilities=======\n")
-                print("=======Action Probabilities=======")
-                for agent_idx, pmap in enumerate(action_probabilities):
-                    self.action_probabilities[agent_idx].append(pmap)
-                    txt_file.write(f"Action probability map of agent {agent_idx} for target pile {pos}:\n")
-                    print(f"Action probability map of agent {agent_idx} for target pile {pos}:")
-                    for d in range(pmap.shape[0]):
-                        row = '['
-                        for r in range(pmap.shape[1]):
-                            row += "[" + ', '.join(f"{x:7.4f}" for x in pmap[d, r]) + "]"
-                        txt_file.write(row + "]")
-                        txt_file.write("\n")
-                        print(row + "]")
-                txt_file.write(f"Used actions: {used_actions}\n")
-                print("Used actions:", used_actions)
-
--- a/marl_factory_grid/algorithms/marl/base_a2c.py
+++ b/marl_factory_grid/algorithms/marl/base_a2c.py
@@ -2,8 +2,6 @@ import numpy as np; import torch as th; import scipy as sp;
 from collections import deque
 from torch import nn

-# RLLab Magic for calculating the discounted return G(t) = R(t) + gamma * R(t-1)
-# cf. https://github.com/rll/rllab/blob/ba78e4c16dc492982e648f117875b22af3965579/rllab/misc/special.py#L107
 cumulate_discount = lambda x, gamma: sp.signal.lfilter([1], [1, - gamma], x[::-1], axis=0)[::-1]

 class Net(th.nn.Module):
@@ -21,11 +19,11 @@ class Net(th.nn.Module):
        if module.bias is not None:
          nn.init.uniform_(module.bias, a=-0.1, b=0.1)

-  def save_model(self, path, agent_name):
-    th.save(self.net, f"{path}/{agent_name}_{self.__class__.__name__}_model.pth")
+  def save_model(self, path):
+    th.save(self.net, f"{path}/{self.__class__.__name__}_model.pth")

-  def save_model_parameters(self, path, agent_name):
-    th.save(self.net.state_dict(), f"{path}/{agent_name}_{self.__class__.__name__}_model_parameters.pth")
+  def save_model_parameters(self, path):
+    th.save(self.net.state_dict(), f"{path}/{self.__class__.__name__}_model_parameters.pth")

  def load_model_parameters(self, path):
    self.net.load_state_dict(th.load(path))
--- a/marl_factory_grid/algorithms/marl/constants.py
+++ b/marl_factory_grid/algorithms/marl/constants.py
@@ -1,3 +1,4 @@
+
 class Names:
    ENV             = 'env'
    ENV_NAME = 'env_name'
@@ -35,3 +36,8 @@ class Names:
    SINGLE = 'single'
    DISTRIBUTED = 'distributed'
    SHARED = 'shared'
+    EARLY_STOPPING = 'early_stopping'
+    TRAIN = 'train'
+    SEED = 'seed'
+    LAST_N_EPISODES = 'last_n_episodes'
+    MEAN_TARGET_CHANGE = 'mean_target_change'
--- a/marl_factory_grid/algorithms/marl/multi_agent_configs/coin_quadrant_eval_config.yaml
+++ b/marl_factory_grid/algorithms/marl/multi_agent_configs/coin_quadrant_eval_config.yaml
@@ -0,0 +1,12 @@
+env:
+  classname:          marl_factory_grid.configs.marl.multi_agent_configs
+  env_name:           "marl/multi_agent_configs/coin_quadrant_eval_config"
+  n_agents:           2 # Number of agents in the environment
+  eval_render:        True # If inference should be graphically visualized
+  save_and_log:       False # If configurations and potential logging files should be saved
+algorithm:
+  seed:               42 # Picks seed to make random parts of algorithm reproducible. -1 for random seed
+  pile-order:         "smart" # Triggers implementation of our emergence prevention mechanism. Agents consider distance to other agent
+  pile-observability: "single" # Agents can only perceive one coin pile at any given time step
+  pile_all_done:      "shared" # Indicates that agents don't have to collect the same coin piles
+  auxiliary_piles:    False # Coin quadrant does not use this option
--- a/marl_factory_grid/algorithms/marl/multi_agent_configs/coin_quadrant_eval_config_emergent.yaml
+++ b/marl_factory_grid/algorithms/marl/multi_agent_configs/coin_quadrant_eval_config_emergent.yaml
@@ -0,0 +1,13 @@
+# Configuration that shows emergent behavior in our coin-quadrant environment
+env:
+  classname:          marl_factory_grid.configs.marl.multi_agent_configs
+  env_name:           "marl/multi_agent_configs/coin_quadrant_eval_config"
+  n_agents:           2 # Number of agents in the environment
+  eval_render:        True # If inference should be graphically visualized
+  save_and_log:       False # If configurations and potential logging files should be saved
+algorithm:
+  seed:               42 # Picks seed to make random parts of algorithm reproducible. -1 for random seed
+  pile-order:         "dynamic" # Agents only decide on next target pile based on the distance to the respective piles
+  pile-observability: "single" # Agents can only perceive one coin pile at any given time step
+  pile_all_done:      "shared" # Indicates that agents don't have to collect the same coin piles
+  auxiliary_piles:    False # Coin quadrant does not use this option
--- a/marl_factory_grid/algorithms/marl/multi_agent_configs/two_rooms_eval_config.yaml
+++ b/marl_factory_grid/algorithms/marl/multi_agent_configs/two_rooms_eval_config.yaml
@@ -0,0 +1,16 @@
+env:
+  classname:          marl_factory_grid.configs.marl.multi_agent_configs
+  env_name:           "marl/multi_agent_configs/two_rooms_eval_config"
+  n_agents:           2 # Number of agents in the environment
+  eval_render:        True # If inference should be graphically visualized
+  save_and_log:       False # If configurations and potential logging files should be saved
+algorithm:
+  seed:               42 # Picks seed to make random parts of algorithm reproducible. -1 for random seed
+  # Piles (=encoded flags) are evenly distributed among the two agents and have to be collected in the order defined
+  # by the environment config (cf. coords_or_quantity)
+  pile-order:         "agents"
+  pile-observability: "single" # Agents can only perceive one dirt pile at any given time step
+  pile_all_done:      "distributed" # Indicates that agents must clean their specifically assigned dirt piles
+  auxiliary_piles:    True # Allows agents to go to an auxiliary pile
+
+
--- a/marl_factory_grid/algorithms/marl/multi_agent_configs/two_rooms_eval_config_emergent.yaml
+++ b/marl_factory_grid/algorithms/marl/multi_agent_configs/two_rooms_eval_config_emergent.yaml
@@ -0,0 +1,17 @@
+# Configuration that shows emergent behavior in our two-rooms environment
+env:
+  classname:          marl_factory_grid.configs.marl.multi_agent_configs
+  env_name:           "marl/multi_agent_configs/two_rooms_eval_config_emergent"
+  n_agents:           2 # Number of agents in the environment
+  eval_render:        True # If inference should be graphically visualized
+  save_and_log:       False # If configurations and potential logging files should be saved
+algorithm:
+  seed:               42 # Picks seed to make random parts of algorithm reproducible. -1 for random seed
+  # Piles (=encoded flags) are evenly distributed among the two agents and have to be collected in the order defined
+  # by the environment config (cf. coords_or_quantity)
+  pile-order:         "agents"
+  pile-observability: "single" # Agents can only perceive one dirt pile at any given time step
+  pile_all_done:      "distributed" # Indicates that agents must clean their specifically assigned dirt piles
+  auxiliary_piles:    False # Shows emergent behavior
+
+
--- a/marl_factory_grid/algorithms/marl/single_agent_configs/coin_quadrant_eval_config.yaml
+++ b/marl_factory_grid/algorithms/marl/single_agent_configs/coin_quadrant_eval_config.yaml
@@ -0,0 +1,13 @@
+env:
+  classname:          marl_factory_grid.configs.marl.single_agent_configs
+  env_name:           "marl/single_agent_configs/coin_quadrant_agent1_eval_config"
+  n_agents:           1 # Number of agents in the environment
+  eval_render:        True # If inference should be graphically visualized
+  save_and_log:       False # If configurations and potential logging files should be saved
+algorithm:
+  seed:               42 # Picks seed to make random parts of algorithm reproducible. -1 for random seed
+  pile-order:         "fixed" # Clean coin piles in a fixed order specified by the environment config (cf. coords_or_quantity)
+  pile-observability: "single" # Agent can only perceive one coin pile at any given time step
+  pile_all_done:      "all" # During inference the episode ends only when all coin piles are cleaned
+  auxiliary_piles:    False # Coin quadrant does not use this option
+
--- a/marl_factory_grid/algorithms/marl/single_agent_configs/coin_quadrant_train_config.yaml
+++ b/marl_factory_grid/algorithms/marl/single_agent_configs/coin_quadrant_train_config.yaml
@@ -0,0 +1,21 @@
+env:
+  classname:          marl_factory_grid.configs.marl.single_agent_configs
+  env_name:           "marl/single_agent_configs/coin_quadrant_agent1_train_config"
+  n_agents:           1 # Number of agents in the environment
+  train_render:       False # If training should be graphically visualized
+  save_and_log:       True # If configurations and potential logging files should be saved
+algorithm:
+  seed:               9 # Picks seed to make random parts of algorithm reproducible. -1 for random seed
+  gamma:              0.99 # The gamma value that is used as discounting factor
+  n_steps:            0 # How much experience should be sampled at most until the next value- and policy-net updates are performed. (0 = Monte Carlo)
+  chunk-episode:      20000 # For update, splits very large episodes into batches of approximately equal size. (0 = update networks with full episode at once)
+  max_steps:          400000 # Number of training steps used for agent1 (=agent2)
+  early_stopping:     True # If the early stopping functionality should be used
+  last_n_episodes:    100 # Number of most recent episodes over which the mean absolute return change is computed to detect the start of a low-change phase
+  mean_target_change: 2.0 # Threshold: a low-change phase has begun once the mean absolute return change over the last n episodes falls below this value
+  advantage:          "Advantage-AC" # Defines the used actor critic model
+  pile-order:         "fixed" # Clean coin piles in a fixed order specified by the environment config (cf. coords_or_quantity)
+  pile-observability: "single" # Agent can only perceive one coin pile at any given time step
+  pile_all_done:      "single" # Episode ends when the current target pile is cleaned
+  auxiliary_piles:    False # Coin quadrant does not use this option
+
--- a/marl_factory_grid/algorithms/marl/single_agent_configs/two_rooms_eval_config.yaml
+++ b/marl_factory_grid/algorithms/marl/single_agent_configs/two_rooms_eval_config.yaml
@@ -0,0 +1,14 @@
+env:
+  classname:          marl_factory_grid.configs.marl.single_agent_configs
+  env_name:           "marl/single_agent_configs/two_rooms_agent2_eval_config"
+  n_agents:           1 # Number of agents in the environment
+  eval_render:        True # If inference should be graphically visualized
+  save_and_log:       False # If configurations and potential logging files should be saved
+algorithm:
+  seed:               42 # Picks seed to make random parts of algorithm reproducible. -1 for random seed
+  pile-order:         "fixed" # Clean coin piles (=encoded flags) in a fixed order specified by the environment config (cf. coords_or_quantity)
+  pile-observability: "single" # Agent can only perceive one coin pile at any given time step
+  pile_all_done:      "all" # During inference the episode ends only when all coin piles are cleaned
+  auxiliary_piles:    False # Auxiliary piles are only differentiated from regular target piles during marl eval
+
+
--- a/marl_factory_grid/algorithms/marl/single_agent_configs/two_rooms_train_config.yaml
+++ b/marl_factory_grid/algorithms/marl/single_agent_configs/two_rooms_train_config.yaml
@@ -0,0 +1,22 @@
+env:
+  classname:          marl_factory_grid.configs.marl.single_agent_configs
+  env_name:           "marl/single_agent_configs/two_rooms_agent2_train_config"
+  n_agents:           1 # Number of agents in the environment
+  train_render:       False # If training should be graphically visualized
+  save_and_log:       True # If configurations and potential logging files should be saved
+algorithm:
+  seed:               9 # Picks seed to make random parts of algorithm reproducible. -1 for random seed
+  gamma:              0.99 # The gamma value that is used as discounting factor
+  n_steps:            0 # How much experience should be sampled at most until the next value- and policy-net updates are performed. (0 = Monte Carlo)
+  chunk-episode:      20000 # For update, splits very large episodes into batches of approximately equal size. (0 = update networks with full episode at once)
+  max_steps:          300000 # Number of training steps used to train the agent. Here, only a placeholder value
+  early_stopping:     True # If the early stopping functionality should be used
+  last_n_episodes:    100 # Number of most recent episodes over which the mean absolute return change is computed to detect the start of a low-change phase
+  mean_target_change: 2.0 # Threshold: a low-change phase has begun once the mean absolute return change over the last n episodes falls below this value
+  advantage:          "Advantage-AC" # Defines the used actor critic model
+  pile-order:         "fixed" # Clean coin piles (=encoded flags) in a fixed order specified by the environment config (cf. coords_or_quantity)
+  pile-observability: "single" # Agent can only perceive one coin pile at any given time step
+  pile_all_done:      "single" # Episode ends when the current target pile is cleaned
+  auxiliary_piles:    False # Auxiliary piles are only differentiated from regular target piles during marl eval
+
+
--- a/marl_factory_grid/algorithms/marl/utils.py
+++ b/marl_factory_grid/algorithms/marl/utils.py
@@ -1,11 +1,14 @@
 import copy
+import os
+from pathlib import Path
 from typing import List
 import numpy as np
+import pandas as pd
 import torch

-from marl_factory_grid.algorithms.rl.constants import Names as nms
+from marl_factory_grid.algorithms.marl.constants import Names as nms

-from marl_factory_grid.algorithms.rl.base_a2c import cumulate_discount
+from marl_factory_grid.algorithms.marl.base_a2c import cumulate_discount


 def _as_torch(x):
@@ -187,7 +190,7 @@ def distribute_indices(env, cfg, n_agents):
        # -> Starting with index 0 even piles are auxiliary piles, odd piles are primary piles
        if cfg[nms.ALGORITHM][nms.AUXILIARY_PILES] and nms.DOORS in env.state.entities.keys():
            door_positions = [door.pos for door in env.state.entities[nms.DOORS]]
-            distances = {door_pos: [] for door_pos in door_positions}
+            distances = {door_pos:[] for door_pos in door_positions}

            # Calculate distance of every agent to every door
            for door_pos in door_positions:
@@ -198,7 +201,7 @@ def distribute_indices(env, cfg, n_agents):
                return [i for i, x in enumerate(lst) if x == item]

            # Get agent indices of agents with same distance to door
-            affected_agents = {door_pos: {} for door_pos in door_positions}
+            affected_agents = {door_pos:{} for door_pos in door_positions}
            for door_pos in distances.keys():
                dist = distances[door_pos]
                dist_set = set(dist)
@@ -206,22 +209,20 @@ def distribute_indices(env, cfg, n_agents):
                    affected_agents[door_pos][str(d)] = duplicate_indices(dist, d)

            updated_indices = []
-
-            for door_pos, agent_distances in affected_agents.items():
-                if len(agent_distances) == 0:
-                    # Remove auxiliary piles for all agents
-                    # (In config, we defined every pile with an even numbered index to be an auxiliary pile)
-                    updated_indices = [[ele for ele in lst if ele % 2 != 0] for lst in indices]
-                else:
-                    for distance, agent_indices in agent_distances.items():
-                        # For each distance group, pick one random agent to keep the auxiliary pile
-                        # selected_agent = np.random.choice(agent_indices)
-                        selected_agent = 0
-                        for agent_idx in agent_indices:
-                            if agent_idx == selected_agent:
-                                updated_indices.append(indices[agent_idx])
-                            else:
-                                updated_indices.append([ele for ele in indices[agent_idx] if ele % 2 != 0])
+            if len(affected_agents[door_positions[0]]) == 0:
+                # Remove auxiliary piles for all agents
+                # (In config, we defined every pile with an even numbered index to be an auxiliary pile)
+                updated_indices = [[ele for ele in lst if ele % 2 != 0] for lst in indices]
+            else:
+                for distance, agent_indices in affected_agents[door_positions[0]].items():
+                    # Keep the auxiliary pile for one agent (currently always agent 0; the random choice below is disabled) and remove it for all others
+                    #selected_agent = np.random.choice(agent_indices)
+                    selected_agent = 0
+                    for agent_idx in agent_indices:
+                        if agent_idx == selected_agent:
+                            updated_indices.append(indices[agent_idx])
+                        else:
+                            updated_indices.append([ele for ele in indices[agent_idx] if ele % 2 != 0])

            indices = updated_indices

@@ -335,3 +336,42 @@ def save_agent_models(results_path, agents):
    for idx, agent in enumerate(agents):
        agent.pi.save_model_parameters(results_path)
        agent.vf.save_model_parameters(results_path)
+
+
+def has_low_change_phase_started(return_change_development, last_n_episodes, mean_target_change):
+    """ Checks if training has reached a phase with only marginal average change """
+    if np.mean(np.abs(return_change_development[-last_n_episodes:])) < mean_target_change:
+        print("Low change phase started.")
+        return True
+    return False
+
+
+def significant_deviation(return_change_development, low_change_phase_start_episode):
+    """ Determines if a significant return deviation has occurred in the last episode """
+    return_change_development = return_change_development[low_change_phase_start_episode:]
+
+    df = pd.DataFrame({'Episode': range(len(return_change_development)), 'DeltaReturn': return_change_development})
+    df['Difference'] = df['DeltaReturn'].diff().abs()
+
+    # Only the most extreme changes (those that are greater than 99.99% of all changes) will be considered significant
+    threshold = df['Difference'].quantile(0.9999)
+
+    # Identify significant changes
+    significant_changes = df[df['Difference'] > threshold]
+    print("Threshold: ", threshold, "Significant changes: ", significant_changes)
+
+    if len(significant_changes["Episode"]) > 0:
+        return True
+    return False
+
+
+def get_algorithms_marl_path():
+    return Path(Path(__file__).parent)
+
+
+def get_configs_marl_path():
+    return Path(os.path.join(Path(__file__).parent.parent.parent, "configs"))
+
+
+def get_agent_models_path():
+    return Path(os.path.join(Path(__file__).parent.parent, "agent_models"))
--- a/marl_factory_grid/algorithms/rl/init.py
+++ b/marl_factory_grid/algorithms/rl/init.py
@@ -1 +0,0 @@
-from marl_factory_grid.algorithms.rl.memory import MARLActorCriticMemory
--- a/marl_factory_grid/algorithms/rl/base_a2c.py
+++ b/marl_factory_grid/algorithms/rl/base_a2c.py
@@ -1,112 +0,0 @@
-import numpy as np
-import torch as th
-import scipy as sp
-from collections import deque
-from torch import nn
-
-cumulate_discount = lambda x, gamma: sp.signal.lfilter([1], [1, - gamma], x[::-1], axis=0)[::-1]
-
-
-class Net(th.nn.Module):
-    def __init__(self, shape, activation, lr):
-        super().__init__()
-        self.net = th.nn.Sequential(*[layer
-                                      for io, a in zip(zip(shape[:-1], shape[1:]),
-                                                       [activation] * (len(shape) - 2) + [th.nn.Identity])
-                                      for layer in [th.nn.Linear(*io), a()]])
-        self.optimizer = th.optim.Adam(self.net.parameters(), lr=lr)
-
-        # Initialize weights uniformly, so that for the policy net all actions have approximately the same
-        # probability in the beginning
-        for module in self.modules():
-            if isinstance(module, nn.Linear):
-                nn.init.uniform_(module.weight, a=-0.1, b=0.1)
-                if module.bias is not None:
-                    nn.init.uniform_(module.bias, a=-0.1, b=0.1)
-
-    def save_model(self, path):
-        th.save(self.net, f"{path}/{self.__class__.__name__}_model.pth")
-
-    def save_model_parameters(self, path):
-        th.save(self.net.state_dict(), f"{path}/{self.__class__.__name__}_model_parameters.pth")
-
-    def load_model_parameters(self, path):
-        self.net.load_state_dict(th.load(path))
-        self.net.eval()
-
-
-class ValueNet(Net):
-    def __init__(self, obs_dim, hidden_sizes=[64, 64], activation=th.nn.ReLU, lr=1e-3):
-        super().__init__([obs_dim] + hidden_sizes + [1], activation, lr)
-
-    def forward(self, obs): return self.net(obs)
-
-    def loss(self, states, returns): return ((returns - self(states)) ** 2).mean()
-
-
-class PolicyNet(Net):
-    def __init__(self, obs_dim, act_dim, hidden_sizes=[64, 64], activation=th.nn.Tanh, lr=3e-4):
-        super().__init__([obs_dim] + hidden_sizes + [act_dim], activation, lr)
-        self.distribution = lambda obs: th.distributions.Categorical(logits=self.net(obs))
-
-    def forward(self, obs, act=None, det=False):
-        """Given an observation: Returns policy distribution and probablilty for a given action
-          or Returns a sampled action and its corresponding probablilty"""
-        pi = self.distribution(obs)
-        if act is not None: return pi, pi.log_prob(act)
-        act = self.net(obs).argmax() if det else pi.sample()  # sample from the learned distribution
-        return act, pi.log_prob(act)
-
-    def loss(self, states, actions, advantages):
-        _, logp = self.forward(states, actions)
-        loss = -(logp * advantages).mean()
-        return loss
-
-
-class PolicyGradient:
-    """ Autonomous agent using vanilla policy gradient. """
-
-    def __init__(self, env, seed=42, gamma=0.99, agent_id=0, act_dim=None, obs_dim=None):
-        self.env = env
-        self.gamma = gamma                                  # Setup env and discount
-        th.manual_seed(seed)
-        np.random.seed(seed)                                # Seed Torch, numpy and gym
-        # Keep track of previous rewards and performed steps to calcule the mean Return metric
-        self._episode, self.ep_returns, self.num_steps = [], deque(maxlen=100), 0
-        # Get observation and action shapes
-        if not obs_dim:
-            obs_size = env.observation_space.shape if len(env.state.entities.by_name("Agents")) == 1 \
-                else env.observation_space[agent_id].shape  # Single agent case vs. multi-agent case
-            obs_dim = np.prod(obs_size)
-        if not act_dim:
-            act_dim = env.action_space[agent_id].n
-        self.vf = ValueNet(obs_dim)                         # Setup Value Network (Critic)
-        self.pi = PolicyNet(obs_dim, act_dim)               # Setup Policy Network (Actor)
-
-    def step(self, obs):
-        """ Given an observation, get action and probs from policy and values from critic"""
-        with th.no_grad():
-            (a, _), v = self.pi(obs), self.vf(obs)
-        self._episode.append((None, None, None, v))
-        return a.numpy()
-
-    def policy(self, obs, det=True):
-        return self.pi(obs, det=det)[0].numpy()
-
-    def finish_episode(self):
-        """Process self._episode & reset self.env, Returns (s,a,G,V)-Tuple and new inital state"""
-        s, a, r, v = (np.array(e) for e in zip(*self._episode))  # Get trajectories from rollout
-        self.ep_returns.append(sum(r))
-        self._episode = []                  # Add episode return to buffer & reset
-        return s, a, r, v                   # state, action, Return, Value Tensors
-
-    def train(self, states, actions, returns, advantages):  # Update policy weights
-        self.pi.optimizer.zero_grad()
-        self.vf.optimizer.zero_grad()       # Reset optimizer
-        states = states.flatten(1, -1)      # Reduce dimensionality to rollout_dim x input_dim
-        policy_loss = self.pi.loss(states, actions, advantages)  # Calculate Policy loss
-        policy_loss.backward()
-        self.pi.optimizer.step()            # Apply Policy loss
-        value_loss = self.vf.loss(states, returns)  # Calculate Value loss
-        value_loss.backward()
-        self.vf.optimizer.step()            # Apply Value loss
--- a/marl_factory_grid/algorithms/rl/base_ac.py
+++ b/marl_factory_grid/algorithms/rl/base_ac.py
@@ -1,242 +0,0 @@
-import torch
-from typing import Union, List, Dict
-import numpy as np
-from torch.distributions import Categorical
-from marl_factory_grid.algorithms.rl.memory import MARLActorCriticMemory
-from marl_factory_grid.algorithms.utils import add_env_props, instantiate_class
-from pathlib import Path
-import pandas as pd
-from collections import deque
-
-
-class Names:
-    REWARD          = 'reward'
-    DONE            = 'done'
-    ACTION          = 'action'
-    OBSERVATION     = 'observation'
-    LOGITS          = 'logits'
-    HIDDEN_ACTOR    = 'hidden_actor'
-    HIDDEN_CRITIC   = 'hidden_critic'
-    AGENT           = 'agent'
-    ENV             = 'env'
-    ENV_NAME        = 'env_name'
-    N_AGENTS        = 'n_agents'
-    ALGORITHM       = 'algorithm'
-    MAX_STEPS       = 'max_steps'
-    N_STEPS         = 'n_steps'
-    BUFFER_SIZE     = 'buffer_size'
-    CRITIC          = 'critic'
-    BATCH_SIZE      = 'bnatch_size'
-    N_ACTIONS       = 'n_actions'
-    TRAIN_RENDER    = 'train_render'
-    EVAL_RENDER     = 'eval_render'
-
-
-nms = Names
-ListOrTensor = Union[List, torch.Tensor]
-
-
-class BaseActorCritic:
-    def __init__(self, cfg):
-        self.factory = add_env_props(cfg)
-        self.__training = True
-        self.cfg = cfg
-        self.n_agents = cfg[nms.AGENT][nms.N_AGENTS]
-        self.reset_memory_after_epoch = True
-        self.setup()
-
-    def setup(self):
-        self.net = instantiate_class(self.cfg[nms.AGENT])
-        self.optimizer = torch.optim.RMSprop(self.net.parameters(), lr=3e-4, eps=1e-5)
-
-    @classmethod
-    def _as_torch(cls, x):
-        if isinstance(x, np.ndarray):
-            return torch.from_numpy(x)
-        elif isinstance(x, List):
-            return torch.tensor(x)
-        elif isinstance(x, (int, float)):
-            return torch.tensor([x])
-        return x
-
-    def train(self):
-        self.__training = False
-        networks = [self.net] if not isinstance(self.net, List) else self.net
-        for net in networks:
-            net.train()
-
-    def eval(self):
-        self.__training = False
-        networks = [self.net] if not isinstance(self.net, List) else self.net
-        for net in networks:
-            net.eval()
-
-    def load_state_dict(self, path: Path):
-        pass
-
-    def get_actions(self, out) -> ListOrTensor:
-        actions = [Categorical(logits=logits).sample().item() for logits in out[nms.LOGITS]]
-        return actions
-
-    def init_hidden(self) -> Dict[str, ListOrTensor]:
-        pass
-
-    def forward(self,
-                observations:  ListOrTensor,
-                actions:       ListOrTensor,
-                hidden_actor:  ListOrTensor,
-                hidden_critic: ListOrTensor
-                ) -> Dict[str, ListOrTensor]:
-        pass
-
-    @torch.no_grad()
-    def train_loop(self, checkpointer=None):
-        env = self.factory
-        if self.cfg[nms.ENV][nms.TRAIN_RENDER]:
-            env.render()
-        n_steps, max_steps = [self.cfg[nms.ALGORITHM][k] for k in [nms.N_STEPS, nms.MAX_STEPS]]
-        tm = MARLActorCriticMemory(self.n_agents, self.cfg[nms.ALGORITHM].get(nms.BUFFER_SIZE, n_steps))
-        global_steps, episode, df_results = 0, 0, []
-        reward_queue = deque(maxlen=2000)
-
-        while global_steps < max_steps:
-            obs = env.reset()
-            obs = list(obs.values())
-            last_hiddens        = self.init_hidden()
-            last_action, reward = [-1] * self.n_agents, [0.] * self.n_agents
-            done, rew_log       = [False] * self.n_agents, 0
-
-            if self.reset_memory_after_epoch:
-                tm.reset()
-
-            tm.add(observation=obs, action=last_action,
-                   logits=torch.zeros(self.n_agents, 1, self.cfg[nms.AGENT][nms.N_ACTIONS]),
-                   values=torch.zeros(self.n_agents, 1), reward=reward, done=done, **last_hiddens)
-
-            while not all(done):
-                out = self.forward(obs, last_action, **last_hiddens)
-                action = self.get_actions(out)
-                _, next_obs, reward, done, info = env.step(action)
-                done = [done] * self.n_agents if isinstance(done, bool) else done
-
-                if self.cfg[nms.ENV][nms.TRAIN_RENDER]:
-                    env.render()
-
-                last_hiddens = dict(hidden_actor=out[nms.HIDDEN_ACTOR],
-                                    hidden_critic=out[nms.HIDDEN_CRITIC])
-
-                logits = torch.stack([tensor.squeeze(0) for tensor in out.get(nms.LOGITS, None)], dim=0)
-                values = torch.stack([tensor.squeeze(0) for tensor in out.get(nms.CRITIC, None)], dim=0)
-
-                tm.add(observation=obs, action=action, reward=reward, done=done,
-                       logits=logits, values=values,
-                       **last_hiddens)
-
-                obs = next_obs
-                last_action = action
-
-                if (global_steps+1) % n_steps == 0 or all(done):
-                    with torch.inference_mode(False):
-                        self.learn(tm)
-
-                global_steps += 1
-                rew_log += sum(reward)
-                reward_queue.extend(reward)
-
-                if checkpointer is not None:
-                    checkpointer.step([
-                        (f'agent#{i}', agent)
-                        for i, agent in enumerate([self.net] if not isinstance(self.net, List) else self.net)
-                    ])
-
-                if global_steps >= max_steps:
-                    break
-            if global_steps%100 == 0:
-                print(f'reward at episode: {episode} = {rew_log}')
-            episode += 1
-            df_results.append([episode, rew_log, *reward])
-        df_results = pd.DataFrame(df_results,
-                                  columns=['steps', 'reward', *[f'agent#{i}' for i in range(self.n_agents)]]
-                                  )
-        if checkpointer is not None:
-            df_results.to_csv(checkpointer.path / 'results.csv', index=False)
-        return df_results
-
-    @torch.inference_mode(True)
-    def eval_loop(self, n_episodes, render=False):
-        env = self.factory
-        if self.cfg[nms.ENV][nms.EVAL_RENDER]:
-            env.render()
-        episode, results = 0, []
-        while episode < n_episodes:
-            obs = env.reset()
-            obs = list(obs.values())
-            last_hiddens           = self.init_hidden()
-            last_action, reward    = [-1] * self.n_agents, [0.] * self.n_agents
-            done, rew_log, eps_rew = [False] * self.n_agents, 0, torch.zeros(self.n_agents)
-            while not all(done):
-                out    = self.forward(obs, last_action, **last_hiddens)
-                action = self.get_actions(out)
-                _, next_obs, reward, done, info = env.step(action)
-
-                if self.cfg[nms.ENV][nms.EVAL_RENDER]:
-                    env.render()
-
-                if isinstance(done, bool):
-                    done = [done] * obs[0].shape[0]
-                obs = next_obs
-                last_action = action
-                last_hiddens = dict(hidden_actor=out.get(nms.HIDDEN_ACTOR,   None),
-                                    hidden_critic=out.get(nms.HIDDEN_CRITIC, None)
-                                    )
-                eps_rew += torch.tensor(reward)
-            results.append(eps_rew.tolist() + [sum(eps_rew).item()] + [episode])
-            episode += 1
-        agent_columns = [f'agent#{i}' for i in range(self.cfg[nms.ENV][nms.N_AGENTS])]
-        results = pd.DataFrame(results, columns=agent_columns + ['sum', 'episode'])
-        results = pd.melt(results, id_vars=['episode'], value_vars=agent_columns + ['sum'],
-                          value_name='reward', var_name='agent')
-        return results
-
-    @staticmethod
-    def compute_advantages(critic, reward, done, gamma, gae_coef=0.0):
-        tds = (reward + gamma * (1.0 - done) * critic[:, 1:].detach()) - critic[:, :-1]
-
-        if gae_coef <= 0:
-            return tds
-
-        gae = torch.zeros_like(tds[:, -1])
-        gaes = []
-        for t in range(tds.shape[1]-1, -1, -1):
-            gae = tds[:, t] + gamma * gae_coef * (1.0 - done[:, t]) * gae
-            gaes.insert(0, gae)
-        gaes = torch.stack(gaes, dim=1)
-        return gaes
-
-    def actor_critic(self, tm, network, gamma, entropy_coef, vf_coef, gae_coef=0.0, **kwargs):
-        obs, actions, done, reward = tm.observation, tm.action, tm.done[:, 1:], tm.reward[:, 1:]
-
-        out = network(obs, actions, tm.hidden_actor[:, 0].squeeze(0), tm.hidden_critic[:, 0].squeeze(0))
-        logits = out[nms.LOGITS][:, :-1]  # last one only needed for v_{t+1}
-        critic = out[nms.CRITIC]
-
-        entropy_loss = Categorical(logits=logits).entropy().mean(-1)
-        advantages = self.compute_advantages(critic, reward, done, gamma, gae_coef)
-        value_loss = advantages.pow(2).mean(-1)  # n_agent
-
-        # policy loss
-        log_ap = torch.log_softmax(logits, -1)
-        log_ap = torch.gather(log_ap, dim=-1, index=actions[:, 1:].unsqueeze(-1)).squeeze()
-        a2c_loss = -(advantages.detach() * log_ap).mean(-1)
-        # weighted loss
-        loss = a2c_loss + vf_coef*value_loss - entropy_coef * entropy_loss
-        return loss.mean()
-
-    def learn(self, tm: MARLActorCriticMemory, **kwargs):
-        loss = self.actor_critic(tm, self.net, **self.cfg[nms.ALGORITHM], **kwargs)
-        # remove next_obs, will be added in next iter
-        self.optimizer.zero_grad()
-        loss.backward()
-        torch.nn.utils.clip_grad_norm_(self.net.parameters(), 0.5)
-        self.optimizer.step()
-
--- a/marl_factory_grid/algorithms/rl/configs/MultiAgentConfigs/dirt_quadrant_config.yaml
+++ b/marl_factory_grid/algorithms/rl/configs/MultiAgentConfigs/dirt_quadrant_config.yaml
@@ -1,34 +0,0 @@
-agent:
-  classname:           marl_factory_grid.algorithms.rl.networks.RecurrentAC
-  n_agents:            2
-  obs_emb_size:        96
-  action_emb_size:     16
-  hidden_size_actor:   64
-  hidden_size_critic:  64
-  use_agent_embedding: False
-env:
-  classname:          marl_factory_grid.configs.custom
-  env_name:           "custom/MultiAgentConfigs/dirt_quadrant_train_config"
-  n_agents:           2
-  max_steps:          250
-  pomdp_r:            2
-  stack_n_frames:     0
-  individual_rewards: True
-  train_render:       False
-  eval_render:        True
-  save_and_log:       True
-  record:             False
-method:               marl_factory_grid.algorithms.rl.LoopSEAC
-algorithm:
-  gamma:              0.99
-  entropy_coef:       0.01
-  vf_coef:            0.05
-  n_steps:            0 # How much experience should be sampled at most (n-TD) until the next value and policy update is performed. Default 0: MC
-  max_steps:          200000
-  advantage:          "Advantage-AC" # Options: "Advantage-AC", "TD-Advantage-AC", "Reinforce"
-  pile-order:         "dynamic" # Use "dynamic" to see emergent phenomenon and "smart" to prevent it
-  pile-observability: "single" # Options: "single", "all"
-  pile_all_done:      "shared" # Options: "single", "all" ("single" for training, "all" for eval), "shared"
-  auxiliary_piles:    False # Option that is only considered when pile-order = "agents"
-  chunk-episode:      20000 # Chunk size. (0 = update networks with full episode at once)
-
--- a/marl_factory_grid/algorithms/rl/configs/MultiAgentConfigs/two_rooms_one_door_modified_config.yaml
+++ b/marl_factory_grid/algorithms/rl/configs/MultiAgentConfigs/two_rooms_one_door_modified_config.yaml
@@ -1,35 +0,0 @@
-agent:
-  classname:           marl_factory_grid.algorithms.rl.networks.RecurrentAC
-  n_agents:            2
-  obs_emb_size:        96
-  action_emb_size:     16
-  hidden_size_actor:   64
-  hidden_size_critic:  64
-  use_agent_embedding: False
-env:
-  classname:          marl_factory_grid.configs.custom
-  env_name:           "custom/two_rooms_one_door_modified_train_config"
-  n_agents:           2
-  max_steps:          250
-  pomdp_r:            2
-  stack_n_frames:     0
-  individual_rewards: True
-  train_render:       False
-  eval_render:        True
-  save_and_log:       True
-  record:             False
-method:               marl_factory_grid.algorithms.rl.LoopSEAC
-algorithm:
-  gamma:              0.99
-  entropy_coef:       0.01
-  vf_coef:            0.05
-  n_steps:            0 # How much experience should be sampled at most (n-TD) until the next value and policy update is performed. Default 0: MC
-  max_steps:          260000
-  advantage:          "Advantage-AC" # Options: "Advantage-AC", "TD-Advantage-AC", "Reinforce"
-  pile-order:         "agents" # Options: "fixed", "random", "none", "agents", "dynamic", "smart" (Use "fixed", "random" and "none" for single agent training and the other for multi agent inference)
-  pile-observability: "single" # Options: "single", "all"
-  pile_all_done:      "distributed" # Options: "single", "all" ("single" for training, "all" and "distributed" for eval)
-  auxiliary_piles:    True # Use True to see emergent phenomenon and False to prevent it
-  chunk-episode:      20000 # Chunk size. (0 = update networks with full episode at once)
-
-
--- a/marl_factory_grid/algorithms/rl/configs/dirt_quadrant_config.yaml
+++ b/marl_factory_grid/algorithms/rl/configs/dirt_quadrant_config.yaml
@@ -1,34 +0,0 @@
-agent:
-  classname:           marl_factory_grid.algorithms.rl.networks.RecurrentAC
-  n_agents:            1
-  obs_emb_size:        96
-  action_emb_size:     16
-  hidden_size_actor:   64
-  hidden_size_critic:  64
-  use_agent_embedding: False
-env:
-  classname:          marl_factory_grid.configs.custom
-  env_name:           "custom/dirt_quadrant_train_config"
-  n_agents:           1
-  max_steps:          250
-  pomdp_r:            2
-  stack_n_frames:     0
-  individual_rewards: True
-  train_render:       False
-  eval_render:        True
-  save_and_log:       True
-  record:             False
-method:               marl_factory_grid.algorithms.rl.LoopSEAC
-algorithm:
-  gamma:              0.99
-  entropy_coef:       0.01
-  vf_coef:            0.05
-  n_steps:            0 # How much experience should be sampled at most (n-TD) until the next value and policy update is performed. Default 0: MC
-  max_steps:          240000
-  advantage:          "Advantage-AC" # Options: "Advantage-AC", "TD-Advantage-AC", "Reinforce"
-  pile-order:         "fixed" # Options: "fixed", "random", "none", "agents", "dynamic", "smart" (Use "fixed", "random" and "none" for single agent training and the other for multi agent inference)
-  pile-observability: "single" # Options: "single", "all"
-  pile_all_done:      "single" # Options: "single", "all" ("single" for training, "all" for eval)
-  auxiliary_piles:    False # Option that is only considered when pile-order = "agents"
-  chunk-episode:      20000 # Chunk size. (0 = update networks with full episode at once)
-
--- a/marl_factory_grid/algorithms/rl/configs/environment_changes
+++ b/marl_factory_grid/algorithms/rl/configs/environment_changes
@@ -1,8 +0,0 @@
-marl_factory_grid>environment>rules.py#SpawnEntity.on_reset()
-marl_factory_grid>environment>rewards.py
-marl_factory_grid>modules>clean_up>groups.py#DirtPiles.trigger_spawn()
-marl_factory_grid>environment>rules.py#AgentSpawnRule
-marl_factory_grid>utils>states.py#GameState.__init__()
-marl_factory_grid>environment>factory.py>Factory#render
-marl_factory_grid>environment>factory.py>Factory#set_recorder
-marl_factory_grid>utils>renderer.py>Renderer#render
--- a/marl_factory_grid/algorithms/rl/configs/two_rooms_one_door_modified_config.yaml
+++ b/marl_factory_grid/algorithms/rl/configs/two_rooms_one_door_modified_config.yaml
@@ -1,35 +0,0 @@
-agent:
-  classname:           marl_factory_grid.algorithms.rl.networks.RecurrentAC
-  n_agents:            1
-  obs_emb_size:        96
-  action_emb_size:     16
-  hidden_size_actor:   64
-  hidden_size_critic:  64
-  use_agent_embedding: False
-env:
-  classname:          marl_factory_grid.configs.custom
-  env_name:           "custom/two_rooms_one_door_modified_train_config"
-  n_agents:           1
-  max_steps:          250
-  pomdp_r:            2
-  stack_n_frames:     0
-  individual_rewards: True
-  train_render:       False
-  eval_render:        True
-  save_and_log:       False
-  record:             False
-method:               marl_factory_grid.algorithms.rl.LoopSEAC
-algorithm:
-  gamma:              0.99
-  entropy_coef:       0.01
-  vf_coef:            0.05
-  n_steps:            0 # How much experience should be sampled at most (n-TD) until the next value and policy update is performed. Default 0: MC
-  max_steps:          260000
-  advantage:          "Advantage-AC" # Options: "Advantage-AC", "TD-Advantage-AC", "Reinforce"
-  pile-order:         "fixed" # Options: "fixed", "random", "none", "agents", "dynamic", "smart" (Use "fixed", "random" and "none" for single agent training and the other for multi agent inference)
-  pile-observability: "single" # Options: "single", "all"
-  pile_all_done:      "single" # Options: "single", "all" ("single" for training, "all" for eval)
-  auxiliary_piles:    False # Option that is only considered when pile-order = "agents"
-  chunk-episode:      20000 # Chunk size. (0 = update networks with full episode at once)
-
-
--- a/marl_factory_grid/algorithms/rl/iac.py
+++ b/marl_factory_grid/algorithms/rl/iac.py
@@ -1,57 +0,0 @@
-import torch
-from marl_factory_grid.algorithms.rl.base_ac import BaseActorCritic, nms
-from marl_factory_grid.algorithms.utils import instantiate_class
-from pathlib import Path
-from natsort import natsorted
-from marl_factory_grid.algorithms.rl.memory import MARLActorCriticMemory
-
-
-class LoopIAC(BaseActorCritic):
-
-    def __init__(self, cfg):
-        super(LoopIAC, self).__init__(cfg)
-
-    def setup(self):
-        self.net = [
-            instantiate_class(self.cfg[nms.AGENT]) for _ in range(self.n_agents)
-        ]
-        self.optimizer = [
-            torch.optim.RMSprop(self.net[ag_i].parameters(), lr=3e-4, eps=1e-5) for ag_i in range(self.n_agents)
-        ]
-
-    def load_state_dict(self, path: Path):
-        paths = natsorted(list(path.glob('*.pt')))
-        for path, net in zip(paths, self.net):
-            net.load_state_dict(torch.load(path))
-
-    @staticmethod
-    def merge_dicts(ds):  # todo could be recursive for more than 1 hierarchy
-        d = {}
-        for k in ds[0].keys():
-            d[k] = [d[k] for d in ds]
-        return d
-
-    def init_hidden(self):
-        ha  = [net.init_hidden_actor()  for net in self.net]
-        hc  = [net.init_hidden_critic() for net in self.net]
-        return dict(hidden_actor=ha, hidden_critic=hc)
-
-    def forward(self, observations, actions, hidden_actor, hidden_critic):
-        outputs = [
-            net(
-                self._as_torch(observations[ag_i]).unsqueeze(0).unsqueeze(0),  # agent x time
-                self._as_torch(actions[ag_i]).unsqueeze(0),
-                hidden_actor[ag_i],
-                hidden_critic[ag_i]
-                ) for ag_i, net in enumerate(self.net)
-        ]
-        return self.merge_dicts(outputs)
-
-    def learn(self, tms: MARLActorCriticMemory, **kwargs):
-        for ag_i in range(self.n_agents):
-            tm, net = tms(ag_i), self.net[ag_i]
-            loss = self.actor_critic(tm, net, **self.cfg[nms.ALGORITHM], **kwargs)
-            self.optimizer[ag_i].zero_grad()
-            loss.backward()
-            torch.nn.utils.clip_grad_norm_(net.parameters(), 0.5)
-            self.optimizer[ag_i].step()
--- a/marl_factory_grid/algorithms/rl/mappo.py
+++ b/marl_factory_grid/algorithms/rl/mappo.py
@@ -1,66 +0,0 @@
-from marl_factory_grid.algorithms.rl.base_ac import Names as nms
-from marl_factory_grid.algorithms.rl.snac import LoopSNAC
-from marl_factory_grid.algorithms.rl.memory import MARLActorCriticMemory
-import torch
-from torch.distributions import Categorical
-from marl_factory_grid.algorithms.utils import instantiate_class
-
-
-class LoopMAPPO(LoopSNAC):
-    def __init__(self, *args, **kwargs):
-        super(LoopMAPPO, self).__init__(*args, **kwargs)
-        self.reset_memory_after_epoch = False
-
-    def setup(self):
-        self.net = instantiate_class(self.cfg[nms.AGENT])
-        self.optimizer = torch.optim.Adam(self.net.parameters(), lr=3e-4, eps=1e-5)
-
-    def learn(self, tm: MARLActorCriticMemory, **kwargs):
-        if len(tm) >= self.cfg['algorithm']['buffer_size']:
-            # only learn when buffer is full
-            for batch_i in range(self.cfg['algorithm']['n_updates']):
-                batch = tm.chunk_dataloader(chunk_len=self.cfg['algorithm']['n_steps'],
-                                            k=self.cfg['algorithm']['batch_size'])
-                loss = self.mappo(batch, self.net, **self.cfg[nms.ALGORITHM], **kwargs)
-                self.optimizer.zero_grad()
-                loss.backward()
-                torch.nn.utils.clip_grad_norm_(self.net.parameters(), 0.5)
-                self.optimizer.step()
-
-    def monte_carlo_returns(self, rewards, done, gamma):
-        rewards_ = []
-        discounted_reward = torch.zeros_like(rewards[:, -1])
-        for t in range(rewards.shape[1]-1, -1, -1):
-            discounted_reward = rewards[:, t] + (gamma * (1.0 - done[:, t]) * discounted_reward)
-            rewards_.insert(0, discounted_reward)
-        rewards_ = torch.stack(rewards_, dim=1)
-        return rewards_
-
-    def mappo(self, batch, network, gamma, entropy_coef, vf_coef, clip_range, **__):
-        out = network(batch[nms.OBSERVATION], batch[nms.ACTION], batch[nms.HIDDEN_ACTOR], batch[nms.HIDDEN_CRITIC])
-        logits = out[nms.LOGITS][:, :-1]  # last one only needed for v_{t+1}
-
-        old_log_probs = torch.log_softmax(batch[nms.LOGITS], -1)
-        old_log_probs = torch.gather(old_log_probs, index=batch[nms.ACTION][:, 1:].unsqueeze(-1), dim=-1).squeeze()
-
-        # monte carlo returns
-        mc_returns = self.monte_carlo_returns(batch[nms.REWARD], batch[nms.DONE], gamma)
-        mc_returns = (mc_returns - mc_returns.mean()) / (mc_returns.std() + 1e-8)  # todo: norm across agent ok?
-        advantages = mc_returns - out[nms.CRITIC][:, :-1]
-
-        # policy loss
-        log_ap = torch.log_softmax(logits, -1)
-        log_ap = torch.gather(log_ap, dim=-1, index=batch[nms.ACTION][:, 1:].unsqueeze(-1)).squeeze()
-        ratio = (log_ap - old_log_probs).exp()
-        surr1 = ratio * advantages.detach()
-        surr2 = torch.clamp(ratio, 1 - clip_range, 1 + clip_range) * advantages.detach()
-        policy_loss = -torch.min(surr1, surr2).mean(-1)
-
-        # entropy & value loss
-        entropy_loss = Categorical(logits=logits).entropy().mean(-1)
-        value_loss = advantages.pow(2).mean(-1)  # n_agent
-
-        # weighted loss
-        loss = policy_loss + vf_coef*value_loss - entropy_coef * entropy_loss
-
-        return loss.mean()
--- a/marl_factory_grid/algorithms/rl/memory.py
+++ b/marl_factory_grid/algorithms/rl/memory.py
@@ -1,221 +0,0 @@
-import numpy as np
-from collections import deque
-import torch
-from typing import Union
-from torch import Tensor
-from torch.utils.data import Dataset, ConcatDataset
-import random
-
-
-class ActorCriticMemory(object):
-    def __init__(self, capacity=10):
-        self.capacity = capacity
-        self.reset()
-
-    def reset(self):
-        self.__actions        = LazyTensorFiFoQueue(maxlen=self.capacity+1)
-        self.__hidden_actor   = LazyTensorFiFoQueue(maxlen=self.capacity+1)
-        self.__hidden_critic  = LazyTensorFiFoQueue(maxlen=self.capacity+1)
-        self.__states         = LazyTensorFiFoQueue(maxlen=self.capacity+1)
-        self.__rewards        = LazyTensorFiFoQueue(maxlen=self.capacity+1)
-        self.__dones          = LazyTensorFiFoQueue(maxlen=self.capacity+1)
-        self.__logits         = LazyTensorFiFoQueue(maxlen=self.capacity+1)
-        self.__values         = LazyTensorFiFoQueue(maxlen=self.capacity+1)
-
-    def __len__(self):
-        return len(self.__rewards) - 1
-
-    @property
-    def observation(self, sls=slice(0, None)):  # add time dimension through stacking
-        return self.__states[sls].unsqueeze(0)      # 1 x time x hidden dim
-
-    @property
-    def hidden_actor(self,  sls=slice(0, None)):  # 1 x n_layers x dim
-        return self.__hidden_actor[sls].unsqueeze(0)    # 1 x time x n_layers x dim
-
-    @property
-    def hidden_critic(self, sls=slice(0, None)):  # 1 x n_layers x dim
-        return self.__hidden_critic[sls].unsqueeze(0)    # 1 x time x n_layers x dim
-
-    @property
-    def reward(self, sls=slice(0, None)):
-        return self.__rewards[sls].squeeze().unsqueeze(0)  # 1 x time
-
-    @property
-    def action(self, sls=slice(0, None)):
-        return self.__actions[sls].long().squeeze().unsqueeze(0)  # 1 x time
-
-    @property
-    def done(self, sls=slice(0, None)):
-        return self.__dones[sls].float().squeeze().unsqueeze(0)  # 1 x time
-
-    @property
-    def logits(self, sls=slice(0, None)):  # assumes a trailing 1 for time dimension - common when using output from NN
-        return self.__logits[sls].squeeze().unsqueeze(0)  # 1 x time x actions
-
-    @property
-    def values(self, sls=slice(0, None)):
-        return self.__values[sls].squeeze().unsqueeze(0)  # 1 x time x actions
-
-    def add_observation(self, state:  Union[Tensor, np.ndarray]):
-        self.__states.append(state    if isinstance(state, Tensor) else torch.from_numpy(state))
-
-    def add_hidden_actor(self, hidden: Tensor):
-        # layers x hidden dim
-        self.__hidden_actor.append(hidden)
-
-    def add_hidden_critic(self, hidden: Tensor):
-        # layers x hidden dim
-        self.__hidden_critic.append(hidden)
-
-    def add_action(self, action: Union[int, Tensor]):
-        if not isinstance(action, Tensor):
-            action = torch.tensor(action)
-        self.__actions.append(action)
-
-    def add_reward(self, reward: Union[float, Tensor]):
-        if not isinstance(reward, Tensor):
-            reward = torch.tensor(reward)
-        self.__rewards.append(reward)
-
-    def add_done(self, done:   bool):
-        if not isinstance(done, Tensor):
-            done = torch.tensor(done)
-        self.__dones.append(done)
-
-    def add_logits(self, logits: Tensor):
-        self.__logits.append(logits)
-
-    def add_values(self, values: Tensor):
-        self.__values.append(values)
-
-    def add(self, **kwargs):
-        for k, v in kwargs.items():
-            func = getattr(ActorCriticMemory, f'add_{k}')
-            func(self, v)
-
-
-class MARLActorCriticMemory(object):
-    def __init__(self, n_agents, capacity):
-        self.n_agents = n_agents
-        self.memories = [
-            ActorCriticMemory(capacity) for _ in range(n_agents)
-        ]
-
-    def __call__(self, agent_i):
-        return self.memories[agent_i]
-
-    def __len__(self):
-        return len(self.memories[0])  # todo add assertion check!
-
-    def reset(self):
-        for mem in self.memories:
-            mem.reset()
-
-    def add(self, **kwargs):
-        for agent_i in range(self.n_agents):
-            for k, v in kwargs.items():
-                func = getattr(ActorCriticMemory, f'add_{k}')
-                func(self.memories[agent_i], v[agent_i])
-
-    def __getattr__(self, attr):
-        all_attrs = [getattr(mem, attr) for mem in self.memories]
-        return torch.cat(all_attrs, 0)  # agent x time ...
-
-    def chunk_dataloader(self, chunk_len, k):
-        datasets = [ExperienceChunks(mem, chunk_len, k) for mem in self.memories]
-        dataset = ConcatDataset(datasets)
-        data = [dataset[i] for i in range(len(dataset))]
-        data = custom_collate_fn(data)
-        return data
-
-
-def custom_collate_fn(batch):
-    elem = batch[0]
-    return {key: torch.cat([d[key] for d in batch], dim=0) for key in elem}
-
-
-class ExperienceChunks(Dataset):
-    def __init__(self, memory, chunk_len, k):
-        assert chunk_len <= len(memory), 'chunk_len cannot be longer than the size of the memory'
-        self.memory = memory
-        self.chunk_len = chunk_len
-        self.k = k
-
-    @property
-    def whitelist(self):
-        whitelist = torch.ones(len(self.memory) - self.chunk_len)
-        for d in self.memory.done.squeeze().nonzero().flatten():
-            whitelist[max((0, d-self.chunk_len-1)):d+2] = 0
-        whitelist[0] = 0
-        return whitelist.tolist()
-
-    def sample(self, start=1):
-        cl = self.chunk_len
-        sample = dict(observation=self.memory.observation[:, start:start+cl+1],
-                      action=self.memory.action[:, start-1:start+cl],
-                      hidden_actor=self.memory.hidden_actor[:, start-1],
-                      hidden_critic=self.memory.hidden_critic[:, start-1],
-                      reward=self.memory.reward[:, start:start + cl],
-                      done=self.memory.done[:, start:start + cl],
-                      logits=self.memory.logits[:, start:start + cl],
-                      values=self.memory.values[:, start:start + cl])
-        return sample
-
-    def __len__(self):
-        return self.k
-
-    def __getitem__(self, i):
-        idx = random.choices(range(0, len(self.memory) - self.chunk_len), weights=self.whitelist, k=1)
-        return self.sample(idx[0])
-
-
-class LazyTensorFiFoQueue:
-    def __init__(self, maxlen):
-        self.maxlen = maxlen
-        self.reset()
-
-    def reset(self):
-        self.__lazy_queue = deque(maxlen=self.maxlen)
-        self.shape = None
-        self.queue = None
-
-    def shape_init(self, tensor: Tensor):
-        self.shape = torch.Size([self.maxlen, *tensor.shape])
-
-    def build_tensor_queue(self):
-        if len(self.__lazy_queue) > 0:
-            block = torch.stack(list(self.__lazy_queue), dim=0)
-            l = block.shape[0]
-            if self.queue is None:
-                self.queue = block
-            elif self.true_len() <= self.maxlen:
-                self.queue = torch.cat((self.queue, block),  dim=0)
-            else:
-                self.queue = torch.cat((self.queue[l:], block),  dim=0)
-            self.__lazy_queue.clear()
-
-    def append(self, data):
-        if self.shape is None:
-            self.shape_init(data)
-        self.__lazy_queue.append(data)
-        if len(self.__lazy_queue) >= self.maxlen:
-            self.build_tensor_queue()
-
-    def true_len(self):
-        return len(self.__lazy_queue) + (0 if self.queue is None else self.queue.shape[0])
-
-    def __len__(self):
-        return min((self.true_len(), self.maxlen))
-
-    def __str__(self):
-        return f'LazyTensorFiFoQueue\tmaxlen: {self.maxlen}, shape: {self.shape}, ' \
-               f'len: {len(self)}, true_len: {self.true_len()}, elements in lazy queue: {len(self.__lazy_queue)}'
-
-    def __getitem__(self, item_or_slice):
-        self.build_tensor_queue()
-        return self.queue[item_or_slice]
-
-
-
-
--- a/marl_factory_grid/algorithms/rl/networks.py
+++ b/marl_factory_grid/algorithms/rl/networks.py
@@ -1,103 +0,0 @@
-import numpy as np
-import torch
-import torch.nn as nn
-import torch.nn.functional as F
-
-
-class RecurrentAC(nn.Module):
-    def __init__(self, observation_size, n_actions, obs_emb_size,
-                 action_emb_size, hidden_size_actor, hidden_size_critic,
-                 n_agents, use_agent_embedding=True):
-        super(RecurrentAC, self).__init__()
-        observation_size = np.prod(observation_size)
-        self.n_layers = 1
-        self.n_actions = n_actions
-        self.use_agent_embedding = use_agent_embedding
-        self.hidden_size_actor = hidden_size_actor
-        self.hidden_size_critic = hidden_size_critic
-        self.action_emb_size    = action_emb_size
-        self.obs_proj   = nn.Linear(observation_size, obs_emb_size)
-        self.action_emb =  nn.Embedding(n_actions+1, action_emb_size, padding_idx=0)
-        self.agent_emb  =  nn.Embedding(n_agents, action_emb_size)
-        mix_in_size = obs_emb_size+action_emb_size if not use_agent_embedding else obs_emb_size+n_agents*action_emb_size
-        self.mix = nn.Sequential(nn.Tanh(),
-                                 nn.Linear(mix_in_size, obs_emb_size),
-                                 nn.Tanh(),
-                                 nn.Linear(obs_emb_size, obs_emb_size)
-                                 )
-        self.gru_actor   = nn.GRU(obs_emb_size, hidden_size_actor,  batch_first=True, num_layers=self.n_layers)
-        self.gru_critic  = nn.GRU(obs_emb_size, hidden_size_critic, batch_first=True, num_layers=self.n_layers)
-        self.action_head = nn.Sequential(
-            nn.Linear(hidden_size_actor, hidden_size_actor),
-            nn.Tanh(),
-            nn.Linear(hidden_size_actor, n_actions)
-        )
-        #            spectral_norm(nn.Linear(hidden_size_actor, hidden_size_actor)),
-        self.critic_head = nn.Sequential(
-            nn.Linear(hidden_size_critic, hidden_size_critic),
-            nn.Tanh(),
-            nn.Linear(hidden_size_critic, 1)
-        )
-        #self.action_head[-1].weight.data.uniform_(-3e-3, 3e-3)
-        #self.action_head[-1].bias.data.uniform_(-3e-3, 3e-3)
-
-    def init_hidden_actor(self):
-        return torch.zeros(1, self.n_layers, self.hidden_size_actor)
-
-    def init_hidden_critic(self):
-        return torch.zeros(1, self.n_layers, self.hidden_size_critic)
-
-    def forward(self, observations, actions, hidden_actor=None, hidden_critic=None):
-        n_agents, t, *_ = observations.shape
-        obs_emb    = self.obs_proj(observations.view(n_agents, t, -1).float())
-        action_emb = self.action_emb(actions+1)  # shift by one due to padding idx
-
-        if not self.use_agent_embedding:
-            x_t = torch.cat((obs_emb, action_emb), -1)
-        else:
-            agent_emb = self.agent_emb(
-                torch.cat([torch.arange(0, n_agents, 1).view(-1, 1)] * t, 1)
-            )
-            x_t = torch.cat((obs_emb, agent_emb, action_emb), -1)
-
-        mixed_x_t   = self.mix(x_t)
-        output_p, _ = self.gru_actor(input=mixed_x_t,  hx=hidden_actor.swapaxes(1, 0))
-        output_c, _ = self.gru_critic(input=mixed_x_t, hx=hidden_critic.swapaxes(1, 0))
-
-        logits = self.action_head(output_p)
-        critic = self.critic_head(output_c).squeeze(-1)
-        return dict(logits=logits, critic=critic, hidden_actor=output_p, hidden_critic=output_c)
-
-
-class RecurrentACL2(RecurrentAC):
-    def __init__(self, *args, **kwargs):
-        super().__init__(*args, **kwargs)
-        self.action_head = nn.Sequential(
-            nn.Linear(self.hidden_size_actor, self.hidden_size_actor),
-            nn.Tanh(),
-            NormalizedLinear(self.hidden_size_actor, self.n_actions, trainable_magnitude=True)
-        )
-
-
-class NormalizedLinear(nn.Linear):
-    def __init__(self, in_features: int, out_features: int,
-                 device=None, dtype=None, trainable_magnitude=False):
-        super(NormalizedLinear, self).__init__(in_features, out_features, False, device, dtype)
-        self.d_sqrt = in_features**0.5
-        self.trainable_magnitude = trainable_magnitude
-        self.scale = nn.Parameter(torch.tensor([1.]), requires_grad=trainable_magnitude)
-
-    def forward(self, in_array):
-        normalized_input = F.normalize(in_array, dim=-1, p=2, eps=1e-5)
-        normalized_weight = F.normalize(self.weight, dim=-1, p=2, eps=1e-5)
-        return F.linear(normalized_input, normalized_weight) * self.d_sqrt * self.scale
-
-
-class L2Norm(nn.Module):
-    def __init__(self, in_features, trainable_magnitude=False):
-        super(L2Norm, self).__init__()
-        self.d_sqrt = in_features**0.5
-        self.scale = nn.Parameter(torch.tensor([1.]), requires_grad=trainable_magnitude)
-
-    def forward(self, x):
-        return F.normalize(x, dim=-1, p=2, eps=1e-5) * self.d_sqrt * self.scale
--- a/marl_factory_grid/algorithms/rl/seac.py
+++ b/marl_factory_grid/algorithms/rl/seac.py
@@ -1,55 +0,0 @@
-import torch
-from torch.distributions import Categorical
-from marl_factory_grid.algorithms.rl.iac import LoopIAC
-from marl_factory_grid.algorithms.rl.base_ac import nms
-from marl_factory_grid.algorithms.rl.memory import MARLActorCriticMemory
-
-
-class LoopSEAC(LoopIAC):
-    def __init__(self, cfg):
-        super(LoopSEAC, self).__init__(cfg)
-
-    def actor_critic(self, tm, networks, gamma, entropy_coef, vf_coef, gae_coef=0.0, **kwargs):
-        obs, actions, done, reward = tm.observation, tm.action, tm.done[:, 1:], tm.reward[:, 1:]
-        outputs = [net(obs, actions, tm.hidden_actor[:, 0], tm.hidden_critic[:, 0]) for net in networks]
-
-        with torch.inference_mode(True):
-            true_action_logp = torch.stack([
-                torch.log_softmax(out[nms.LOGITS][ag_i, :-1], -1)
-                .gather(index=actions[ag_i, 1:, None], dim=-1)
-                for ag_i, out in enumerate(outputs)
-            ], 0).squeeze()
-
-        losses = []
-
-        for ag_i, out in enumerate(outputs):
-            logits = out[nms.LOGITS][:, :-1]  # last one only needed for v_{t+1}
-            critic = out[nms.CRITIC]
-
-            entropy_loss = Categorical(logits=logits[ag_i]).entropy().mean()
-            advantages = self.compute_advantages(critic, reward, done, gamma, gae_coef)
-
-            # policy loss
-            log_ap = torch.log_softmax(logits, -1)
-            log_ap = torch.gather(log_ap, dim=-1, index=actions[:, 1:].unsqueeze(-1)).squeeze()
-
-            # importance weights
-            iw = (log_ap - true_action_logp).exp().detach()  # importance_weights
-
-            a2c_loss = (-iw*log_ap * advantages.detach()).mean(-1)
-
-            value_loss = (iw*advantages.pow(2)).mean(-1)  # n_agent
-
-            # weighted loss
-            loss = (a2c_loss + vf_coef*value_loss - entropy_coef * entropy_loss).mean()
-            losses.append(loss)
-
-        return losses
-
-    def learn(self, tms: MARLActorCriticMemory, **kwargs):
-        losses = self.actor_critic(tms, self.net, **self.cfg[nms.ALGORITHM], **kwargs)
-        for ag_i, loss in enumerate(losses):
-            self.optimizer[ag_i].zero_grad()
-            loss.backward()
-            torch.nn.utils.clip_grad_norm_(self.net[ag_i].parameters(), 0.5)
-            self.optimizer[ag_i].step()
--- a/marl_factory_grid/algorithms/rl/snac.py
+++ b/marl_factory_grid/algorithms/rl/snac.py
@@ -1,33 +0,0 @@
-from marl_factory_grid.algorithms.rl.base_ac import BaseActorCritic
-from marl_factory_grid.algorithms.rl.base_ac import nms
-import torch
-from torch.distributions import Categorical
-from pathlib import Path
-
-
-class LoopSNAC(BaseActorCritic):
-    def __init__(self, cfg):
-        super().__init__(cfg)
-
-    def load_state_dict(self, path: Path):
-        path2weights = list(path.glob('*.pt'))
-        assert len(path2weights) == 1, f'Expected a single set of weights but got {len(path2weights)}'
-        self.net.load_state_dict(torch.load(path2weights[0]))
-
-    def init_hidden(self):
-        hidden_actor = self.net.init_hidden_actor()
-        hidden_critic = self.net.init_hidden_critic()
-        return dict(hidden_actor=torch.cat([hidden_actor]   * self.n_agents,  0),
-                    hidden_critic=torch.cat([hidden_critic] * self.n_agents,  0)
-                    )
-
-    def get_actions(self, out):
-        actions = Categorical(logits=out[nms.LOGITS]).sample().squeeze()
-        return actions
-
-    def forward(self, observations, actions, hidden_actor, hidden_critic):
-        out = self.net(self._as_torch(observations).unsqueeze(1),
-                       self._as_torch(actions).unsqueeze(1),
-                       hidden_actor, hidden_critic
-                       )
-        return out
--- a/marl_factory_grid/algorithms/static/TSP_base_agent.py
+++ b/marl_factory_grid/algorithms/static/TSP_base_agent.py
@@ -33,6 +33,7 @@ class TSPBaseAgent(ABC):
        self.local_optimization = True
        self._env = state
        self.state = self._env.state[c.AGENT][agent_i]
+        self.spawn_position = np.array(self.state.pos)
        self._position_graph = self.generate_pos_graph()
        self._static_route = None
        self.cached_route = None
@@ -79,7 +80,7 @@ class TSPBaseAgent(ABC):
        start_time = time.time()

        if self.cached_route is not None:
-            print(f" Used cached route: {self.cached_route}")
+            #print(f" Used cached route: {self.cached_route}")
            return copy.deepcopy(self.cached_route)

        else:
@@ -89,7 +90,7 @@ class TSPBaseAgent(ABC):
                    [self.state.pos] + \
                    [x for x in positions if max(abs(np.subtract(x, self.state.pos))) < 3]
                try:
-                    while len(nodes) < 7:
+                    while len(nodes) < 13:
                        nodes += [next(x for x in positions if x not in nodes)]
                except StopIteration:
                    nodes = [self.state.pos] + positions
@@ -100,11 +101,11 @@ class TSPBaseAgent(ABC):
            route = tsp.traveling_salesman_problem(self._position_graph,
                                                   nodes=nodes, cycle=True, method=tsp.greedy_tsp)
            self.cached_route = copy.deepcopy(route)
-            print(f"Cached route: {self.cached_route}")
+            #print(f"Cached route: {self.cached_route}")

        end_time = time.time()
        duration = end_time - start_time
-        print("TSP calculation took {:.2f} seconds to execute".format(duration))
+        #print("TSP calculation took {:.2f} seconds to execute".format(duration))
        return route

    def _door_is_close(self, state):
--- a/marl_factory_grid/algorithms/static/TSP_runner.py
+++ b/marl_factory_grid/algorithms/static/TSP_runner.py
@@ -0,0 +1,96 @@
+import os
+import pickle
+from pathlib import Path
+
+from tqdm import trange
+
+from marl_factory_grid import Factory
+from marl_factory_grid.algorithms.static.contortions import get_coin_quadrant_tsp_agents, get_two_rooms_tsp_agents
+
+
+def coin_quadrant_multi_agent_tsp_eval(emergent_phenomenon):  # Evaluate the TSP agents on the "coin_quadrant" config
+    run_tsp_setting("coin_quadrant", emergent_phenomenon, log=False)  # default n_episodes=1; log=False skips pickling the metrics
+
+
+def two_rooms_multi_agent_tsp_eval(emergent_phenomenon):  # Evaluate the TSP agents on the "two_rooms" config
+    run_tsp_setting("two_rooms", emergent_phenomenon, log=False)  # default n_episodes=1; log=False skips pickling the metrics
+
+
+def run_tsp_setting(config_name, emergent_phenomenon, n_episodes=1, log=False):  # Roll out the TSP agents for config_name, print per-step metrics and, if log is set, pickle them
+    # Render the environment after each reset and after every step (currently always enabled)
+    render = True
+
+    # Path to the environment config file, relative to the current working directory
+    path = Path(f'./marl_factory_grid/configs/tsp/{config_name}.yaml')
+
+    # Create the next free results folder ./study_out/tsp_run<N>; os.listdir raises if ./study_out/ does not exist yet
+    runs = os.listdir("./study_out/")
+    run_numbers = [int(run[7:]) for run in runs if run[:7] == "tsp_run"]
+    next_run_number = max(run_numbers) + 1 if run_numbers else 0
+    results_path = f"./study_out/tsp_run{next_run_number}"
+    os.mkdir(results_path)
+
+    # Initialise the environment and store its configuration next to the results
+    factory = Factory(path)
+
+    with open(f"{results_path}/env_config.txt", "w") as txt_file:
+        txt_file.write(str(factory.conf))
+
+    still_existing_coin_piles = []  # one list per episode: number of remaining coin piles after each step
+    reached_flags = []  # one list per episode: number of reached destinations after each step
+
+    for episode in trange(n_episodes):
+        _ = factory.reset()
+        still_existing_coin_piles.append([])
+        reached_flags.append([])
+        done = False
+        if render:
+            factory.render()
+            factory._renderer.fps = 5
+        if config_name == "coin_quadrant":
+            agents = get_coin_quadrant_tsp_agents(emergent_phenomenon, factory)
+        elif config_name == "two_rooms":
+            agents = get_two_rooms_tsp_agents(emergent_phenomenon, factory)
+        else:
+            print("Config name does not exist. Abort...")
+            break
+        ep_steps = 0
+        while not done:
+            a = [x.predict() for x in agents]
+            # Terminate as soon as all coin piles are collected, so that a TSP episode ends under the same condition
+            # as an episode of the RL agent
+            if 'CoinPiles' in list(factory.state.entities.keys()) and factory.state.entities['CoinPiles'].global_amount == 0.0:
+                break
+            obs_type, _, _, done, info = factory.step(a)
+            if 'CoinPiles' in list(factory.state.entities.keys()):
+                still_existing_coin_piles[-1].append(len(factory.state.entities['CoinPiles']))
+            if 'Destinations' in list(factory.state.entities.keys()):
+                reached_flags[-1].append(sum([1 for ele in [x.was_reached() for x in factory.state['Destinations']] if ele]))
+            ep_steps += 1
+            if render:
+                factory.render()
+            if done:
+                break
+
+        collected_coin_piles_per_step = []  # rebuilt from all recorded episodes: max remaining minus remaining at each step
+        if 'CoinPiles' in list(factory.state.entities.keys()):
+            for ep in still_existing_coin_piles:
+                collected_coin_piles_per_step.append([max(ep)-ep[idx] for idx, value in enumerate(ep)])
+            # For the latest episode, drop the first entry and append a final entry in which all coin piles are collected
+            del collected_coin_piles_per_step[-1][0]
+            collected_coin_piles_per_step[-1].append(max(still_existing_coin_piles[-1]))
+
+        # Report the metrics gathered so far
+        print("Number of environment steps:", ep_steps)
+        if 'CoinPiles' in list(factory.state.entities.keys()):
+            print("Collected coins per step:", collected_coin_piles_per_step)
+        if 'Destinations' in list(factory.state.entities.keys()):
+            print("Reached flags per step:", reached_flags)
+
+        if log:  # NOTE(review): metrics_data is unbound if neither entity type exists; if both exist, only reached_flags is stored
+            if 'CoinPiles' in list(factory.state.entities.keys()):
+                metrics_data = {"collected_coin_piles_per_step": collected_coin_piles_per_step}
+            if 'Destinations' in list(factory.state.entities.keys()):
+                metrics_data = {"reached_flags": reached_flags}
+            with open(f"{results_path}/metrics", "wb") as pickle_file:  # same file is rewritten on every episode
+                pickle.dump(metrics_data, pickle_file)
--- a/marl_factory_grid/algorithms/static/contortions.py
+++ b/marl_factory_grid/algorithms/static/contortions.py
@@ -0,0 +1,55 @@
+import numpy as np
+from marl_factory_grid.algorithms.static.TSP_coin_agent import TSPCoinAgent
+from marl_factory_grid.algorithms.static.TSP_target_agent import TSPTargetAgent
+
+
+def get_coin_quadrant_tsp_agents(emergent_phenomenon, factory):  # Build two TSPCoinAgents; unless emergent_phenomenon is truthy, re-weight their position graphs
+    agents = [TSPCoinAgent(factory, 0), TSPCoinAgent(factory, 1)]
+    if not emergent_phenomenon:
+        edge_costs = {}  # maps "<source cell>-<target cell>" (tuple reprs) to a traversal cost
+        # Add costs for horizontal edges (same i, neighbouring j)
+        for i in range(1, 10):
+            for j in range(1, 9):
+                # Both traversal directions stay in row i and therefore get the same cost
+                edge_costs[f"{(i, j)}-{i, j + 1}"] = 0.55 + (i - 1) * 0.05
+                edge_costs[f"{i, j + 1}-{(i, j)}"] = 0.55 + (i - 1) * 0.05
+
+        # Add costs for vertical edges (neighbouring i, same j)
+        for i in range(1, 9):
+            for j in range(1, 10):
+                # Each direction is priced by the row being entered: 0.55 + (target_i - 1) * 0.05
+                edge_costs[f"{(i, j)}-{i + 1, j}"] = 0.55 + (i) * 0.05
+                edge_costs[f"{i + 1, j}-{(i, j)}"] = 0.55 + (i - 1) * 0.05
+
+
+        for agent in agents:  # overwrite every edge weight of the agent's position graph with the table above
+            for u, v, weight in agent._position_graph.edges(data='weight'):
+                agent._position_graph[u][v]['weight'] = edge_costs[f"{u}-{v}"]  # KeyError for edges outside the ranges above; NOTE(review): only the (u, v) orientation yielded by edges() is looked up - confirm for undirected graphs
+
+
+    return agents
+
+
+def get_two_rooms_tsp_agents(emergent_phenomenon, factory):  # Build two TSPTargetAgents; unless emergent_phenomenon is truthy, re-weight their position graphs
+    agents = [TSPTargetAgent(factory, 0), TSPTargetAgent(factory, 1)]
+    if not emergent_phenomenon:
+        edge_costs = {}  # maps "<source cell>-<target cell>" (tuple reprs) to a traversal cost
+        # Add costs for horizontal edges (same i, neighbouring j)
+        for i in range(1, 6):
+            for j in range(1, 13):
+                # Each direction is priced by the cell being entered: |5 / target_i * cbrt(target_j / 4 - 1) - 1|
+                edge_costs[f"{(i, j)}-{i, j + 1}"] = np.abs(5/i*np.cbrt(((j+1)/4 - 1)) - 1)
+                edge_costs[f"{i, j + 1}-{(i, j)}"] = np.abs(5/i*np.cbrt((j/4 - 1)) - 1)
+
+        # Add costs for vertical edges (neighbouring i, same j)
+        for i in range(1, 5):
+            for j in range(1, 14):
+                # Same formula as above, evaluated at the entered cell (row i + 1 going forward, row i going back)
+                edge_costs[f"{(i, j)}-{i + 1, j}"] = np.abs(5/(i+1)*np.cbrt((j/4 - 1)) - 1)
+                edge_costs[f"{i + 1, j}-{(i, j)}"] = np.abs(5/i*np.cbrt((j/4 - 1)) - 1)
+
+
+        for agent in agents:  # overwrite every edge weight of the agent's position graph with the table above
+            for u, v, weight in agent._position_graph.edges(data='weight'):
+                agent._position_graph[u][v]['weight'] = edge_costs[f"{u}-{v}"]  # KeyError for edges outside the ranges above; NOTE(review): only the (u, v) orientation yielded by edges() is looked up - confirm for undirected graphs
+    return agents
--- a/marl_factory_grid/algorithms/utils.py
+++ b/marl_factory_grid/algorithms/utils.py
@@ -1,9 +1,11 @@
+import os
 from pathlib import Path

 import numpy as np
 import yaml

 from marl_factory_grid import Factory
+from marl_factory_grid.algorithms.marl.utils import get_configs_marl_path


 def load_class(classname):
@@ -43,6 +45,10 @@ def get_class(arguments):
        return c


+def get_study_out_path():  # Path of the "study_out" directory two levels above the directory containing this module
+    return Path(os.path.join(Path(__file__).parent.parent.parent, "study_out"))
+
+
 def get_arguments(arguments):
    d = dict(arguments)
    if "classname" in d:
@@ -58,19 +64,13 @@ def load_yaml_file(path: Path):

 def add_env_props(cfg):
    # Path to config File
-    env_path = Path(f'../marl_factory_grid/configs/{cfg["env"]["env_name"]}.yaml')
+    env_path = Path(f'{get_configs_marl_path()}/{cfg["env"]["env_name"]}.yaml')
+    print(cfg)

    # Env Init
    factory = Factory(env_path)
    _ = factory.reset()

-    # Agent Init
-    if len(factory.state.moving_entites) == 1: # Single agent setting
-        observation_size = list(factory.observation_space.shape)
-    else: # Multi-agent setting
-        observation_size = list(factory.observation_space[0].shape)
-    cfg['agent'].update(dict(observation_size=observation_size, n_actions=factory.action_space[0].n))
-
    return factory


--- a/marl_factory_grid/configs/custom/dirt_quadrant_eval_config.yaml
+++ b/marl_factory_grid/configs/custom/dirt_quadrant_eval_config.yaml
@@ -1,78 +0,0 @@
-General:
-  # RNG-seed to sample the same "random" numbers every time, to make the different runs comparable.
-  env_seed: 69
-  # Individual vs global rewards
-  individual_rewards: true
-  # The level.txt file to load from marl_factory_grid/levels
-  level_name: quadrant
-  # Radius of Partially observable Markov decision process
-  pomdp_r: 0 # default 3
-  # Print all messages and events
-  verbose: false
-  # Run tests
-  tests: false
-
-# In the "clean and bring" Scenario one agent aims to pick up all items and drop them at drop-off locations while all
-# other agents aim to collect coin piles.
-Agents:
-  # The collect coin agents
-  #Sigmund:
-    #Actions:
-      #- Move4
-      #- Noop
-    #Observations:
-      #- CoinPiles
-      #- Self
-    #Positions:
-      #- (9,1)
-      #- (1,1)
-      #- (2,4)
-      #- (4,7)
-      #- (7,9)
-      #- (2,4)
-      #- (4,7)
-      #- (7,9)
-      #- (9,9)
-      #- (9,1)
-  Wolfgang:
-    Actions:
-      - Move4
-    Observations:
-      - CoinPiles
-      - Self
-    Positions:
-      - (9,5)
-      #- (1,1)
-      #- (2,4)
-      #- (4,7)
-      #- (7,9)
-      #- (2,4)
-      #- (4,7)
-      #- (7,9)
-      #- (9,9)
-      #- (9,5)
-
-Entities:
-  CoinPiles:
-    coords_or_quantity: (1, 1), (2,4), (4,7), (7,9), (9,9) #(9,9), (7,9), (4,7), (2,4), (1, 1) #(1, 1), (2,4), (4,7), (7,9), (9,9) # (4,7), (2,4), (1, 1) # (1, 1), (2,4), (4,7), (7,9), (9,9) # (1, 1), (1,2), (1,3), (2,4), (2,5), (3,6), (4,7), (5,8), (6,8), (7,9), (8,9), (9,9)
-    initial_amount: 0.5 # <1 to ensure that the robot which first attempts to collect this field, can collect the coin in one action
-    collect_amount: 1
-    coin_spawn_r_var: 0
-    max_global_amount: 12
-    max_local_amount: 1
-
-# Rules section specifies the rules governing the dynamics of the environment.
-Rules:
-
-  # Utilities
-  # This rule defines the collision mechanic, introduces a related DoneCondition and lets you specify rewards.
-  # Can be omitted/ignored if you do not want to take care of collisions at all.
-  WatchCollisions:
-    done_at_collisions: false
-
-  # Done Conditions
-  # Define the conditions for the environment to stop. Either success or a fail conditions.
-  # The environment stops when all coins are collected
-  DoneOnAllCoinsCollected:
-  #DoneAtMaxStepsReached:
-    #max_steps: 200
--- a/marl_factory_grid/configs/custom/two_rooms_one_door_modified_eval_config.yaml
+++ b/marl_factory_grid/configs/custom/two_rooms_one_door_modified_eval_config.yaml
@@ -1,62 +0,0 @@
-General:
-  env_seed: 69
-  # Individual vs global rewards
-  individual_rewards: true
-  # The level.txt file to load from marl_factory_grid/levels
-  level_name: two_rooms_modified
-  # View Radius; 0 = full observatbility
-  pomdp_r: 0
-  # Print all messages and events
-  verbose: false
-  # Run tests
-  tests: false
-
-# In "two rooms one door" scenario 2 agents spawn in 2 different rooms that are connected by a single door. Their aim
-# is to reach the destination in the room they didn't spawn in leading to a conflict at the door.
-Agents:
-  #Sigmund:
-    #Actions:
-      #- Move4
-      #- DoorUse
-    #Observations:
-      #- CoinPiles
-      #- Self
-    #Positions:
-      #- (3,1)
-      #- (2,1)
-  Wolfgang:
-    Actions:
-      - Move4
-      - DoorUse
-    Observations:
-      - CoinPiles
-      - Self
-    Positions:
-      - (3,13)
-      - (2,13)
-
-Entities:
-  CoinPiles:
-    coords_or_quantity: (2,13), (3,2) # (2,1), (3,12)
-    initial_amount: 0.5 # <1 to ensure that the robot which first attempts to collect this field, can collect the coin in one action
-    collect_amount: 1
-    coin_spawn_r_var: 0
-    max_global_amount: 12
-    max_local_amount: 1
-
-  Doors: { }
-
-Rules:
-  # Environment Dynamics
-  #DoorAutoClose:
-    #close_frequency: 10
-
-  # Utilities
-  # This rule defines the collision mechanic, introduces a related DoneCondition and lets you specify rewards.
-  WatchCollisions:
-    done_at_collisions: false
-
-  # Done Conditions
-  #DoneOnAllDirtCleaned:
-  DoneAtMaxStepsReached:
-    max_steps: 50
--- a/marl_factory_grid/configs/custom/two_rooms_one_door_modified_train_config.yaml
+++ b/marl_factory_grid/configs/custom/two_rooms_one_door_modified_train_config.yaml
@@ -1,75 +0,0 @@
-General:
-  env_seed: 69
-  # Individual vs global rewards
-  individual_rewards: true
-  # The level.txt file to load from marl_factory_grid/levels
-  level_name: two_rooms_modified
-  # View Radius; 0 = full observatbility
-  pomdp_r: 0
-  # Print all messages and events
-  verbose: false
-  # Run tests
-  tests: false
-
-# In "two rooms one door" scenario 2 agents spawn in 2 different rooms that are connected by a single door. Their aim
-# is to reach the destination in the room they didn't spawn in leading to a conflict at the door.
-Agents:
-  #Sigmund:
-    #Actions:
-      #- Move4
-    #Observations:
-      #- CoinPiles
-      #- Self
-    #Positions:
-      #- (3,1)
-      #- (1,1)
-      #- (3,1)
-      #- (5,1)
-      #- (3,1)
-      #- (1,8)
-      #- (3,1)
-      #- (5,8)
-  Wolfgang:
-    Actions:
-      - Move4
-    Observations:
-      - CoinPiles
-      - Self
-    Positions:
-      - (3,13)
-      - (2,13)
-      - (1,13)
-      - (3,13)
-      - (1,8)
-      - (2,6)
-      - (3,10)
-      - (4,6)
-
-Entities:
-  CoinPiles:
-    coords_or_quantity: (2,13), (3,2) # (2,1), (3,12)
-    initial_amount: 0.5 # <1 to ensure that the robot which first attempts to collect this field, can collect the coin in one action
-    collect_amount: 1
-    coin_spawn_r_var: 0
-    max_global_amount: 12
-    max_local_amount: 1
-
-  #Doors: { }
-
-Rules:
-  # Environment Dynamics
-  #DoorAutoClose:
-    #close_frequency: 10
-
-  # Utilities
-  # This rule defines the collision mechanic, introduces a related DoneCondition and lets you specify rewards.
-  WatchCollisions:
-    done_at_collisions: false
-
-  # Done Conditions
-  DoneOnAllCoinsCollected:
-  #DoneAtMaxStepsReached:
-    #max_steps: 100
-
-  AgentSpawnRule:
-    spawn_rule: "order"
--- a/marl_factory_grid/configs/default_config.yaml
+++ b/marl_factory_grid/configs/default_config.yaml
@@ -26,6 +26,28 @@ Agents:
      - Noop
      - Charge
      - Clean
+      - DestAction
+      - DoorUse
+      - ItemAction
+      - Move8
+    Observations:
+      - Combined:
+          - Other
+          - Walls
+      - GlobalPosition
+      - Battery
+      - ChargePods
+      - DirtPiles
+      - Destinations
+      - Doors
+      - Items
+      - Inventory
+      - DropOffLocations
+      - Maintainers
+  Herbert:
+    Actions:
+      - Noop
+      - Charge
      - Collect
      - DestAction
      - DoorUse
@@ -39,7 +61,6 @@ Agents:
      - Battery
      - ChargePods
      - CoinPiles
-      - DirtPiles
      - Destinations
      - Doors
      - Items
@@ -62,10 +83,10 @@ Entities:
  # CoinPiles: Entities that can be collected by an agent.
  CoinPiles:
    coords_or_quantity: 10
-    initial_amount: 2
+    initial_amount: 1
    collect_amount: 1
    coin_spawn_r_var: 0.1
-    max_global_amount: 20
+    max_global_amount: 10
    max_local_amount: 5

  # Destinations: Entities representing target locations for agents.
--- a/marl_factory_grid/configs/custom/MultiAgentConfigs/dirt_quadrant_train_config.yaml
+++ b/marl_factory_grid/configs/custom/MultiAgentConfigs/dirt_quadrant_train_config.yaml
@@ -5,60 +5,47 @@ General:
  individual_rewards: true
  # The level.txt file to load from marl_factory_grid/levels
  level_name: quadrant
-  # Radius of Partially observable Markov decision process
-  pomdp_r: 0 # default 3
+  # View Radius
+  pomdp_r: 0 # Use custom partial observability setting
  # Print all messages and events
  verbose: false
  # Run tests
  tests: false

-# In the "collect and bring" Scenario one agent aims to pick up all items and drop them at drop-off locations while all
-# other agents aim to collect coin piles.
+# Define Agents, their actions, observations and spawnpoints
 Agents:
-  # The collect coin agents
-  Sigmund:
+  # The coin-collecting agents
+  Agent1:
    Actions:
      - Move4
-      #- Collect
-      #- Noop
+      - Noop
    Observations:
      - CoinPiles
      - Self
    Positions:
      - (9,1)
-      - (4,5)
-      - (1,1)
-      - (4,5)
-      - (9,1)
-      - (9,9)
-  Wolfgang:
+  Agent2:
    Actions:
      - Move4
-      #- Collect
-      #- Noop
+      - Noop
    Observations:
      - CoinPiles
      - Self
    Positions:
      - (9,5)
-      - (4,5)
-      - (1,1)
-      - (4,5)
-      - (9,5)
-      - (9,9)

 Entities:
  CoinPiles:
-    coords_or_quantity: (9,9), (1,1), (4,5)  # (4,7), (2,4), (1, 1) #(1, 1), (2,4), (4,7), (7,9), (9,9) # (1, 1), (1,2), (1,3), (2,4), (2,5), (3,6), (4,7), (5,8), (6,8), (7,9), (8,9), (9,9)
-    initial_amount: 0.5 # <1 to ensure that the robot which first attempts to collect this field, can collect the coin in one action
-    collect_amount: 1
+    coords_or_quantity: (9,9), (7,9), (4,7), (2,4), (1, 1)
+    initial_amount: 0.5
+    clean_amount: 1
    coin_spawn_r_var: 0
    max_global_amount: 12
    max_local_amount: 1
+    randomize: False # If coins should spawn at random positions instead of the positions defined above

 # Rules section specifies the rules governing the dynamics of the environment.
 Rules:
-
  # Utilities
  # This rule defines the collision mechanic, introduces a related DoneCondition and lets you specify rewards.
  # Can be omitted/ignored if you do not want to take care of collisions at all.
@@ -67,7 +54,5 @@ Rules:

  # Done Conditions
  # Define the conditions for the environment to stop. Either success or a fail conditions.
-  # The environment stops when all coins are collected
+  # The environment stops once all coins have been collected
  DoneOnAllCoinsCollected:
-  #DoneAtMaxStepsReached: # An episode should last for at most max_steps steps
-    #max_steps: 100
--- a/marl_factory_grid/configs/custom/MultiAgentConfigs/two_rooms_one_door_modified_eval_config.yaml
+++ b/marl_factory_grid/configs/custom/MultiAgentConfigs/two_rooms_one_door_modified_eval_config.yaml
@@ -1,20 +1,20 @@
 General:
+  # RNG-seed to sample the same "random" numbers every time, to make the different runs comparable.
  env_seed: 69
  # Individual vs global rewards
  individual_rewards: true
  # The level.txt file to load from marl_factory_grid/levels
-  level_name: two_rooms_modified
-  # View Radius; 0 = full observatbility
-  pomdp_r: 0
+  level_name: two_rooms_small
+  # View Radius
+  pomdp_r: 0 # Use custom partial observability setting
  # Print all messages and events
  verbose: false
  # Run tests
  tests: false

-# In "two rooms one door" scenario 2 agents spawn in 2 different rooms that are connected by a single door. Their aim
-# is to reach the destination in the room they didn't spawn in leading to a conflict at the door.
+# Define Agents, their actions, observations and spawnpoints
 Agents:
-  Sigmund:
+  Agent1:
    Actions:
      - Move4
      - DoorUse
@@ -24,7 +24,7 @@ Agents:
      - Self
    Positions:
      - (3,1)
-  Wolfgang:
+  Agent2:
    Actions:
      - Move4
      - DoorUse
@@ -36,10 +36,11 @@ Agents:
      - (3,13)

 Entities:
+  # For the RL agents, the flags are modeled as coin piles for greater flexibility
  CoinPiles:
    coords_or_quantity: (2,1), (3,12), (2,13), (3,2) # Static form: auxiliary pile, primary pile, auxiliary pile, ...
-    initial_amount: 0.5 # <1 to ensure that the robot which first attempts to collect this field, can collect the coin in one action
-    collect_amount: 1
+    initial_amount: 0.5
+    clean_amount: 1
    coin_spawn_r_var: 0
    max_global_amount: 12
    max_local_amount: 1
@@ -47,16 +48,13 @@ Entities:
  Doors: { }

 Rules:
-  # Environment Dynamics
-  #DoorAutoClose:
-    #close_frequency: 10
-
  # Utilities
  # This rule defines the collision mechanic, introduces a related DoneCondition and lets you specify rewards.
  WatchCollisions:
    done_at_collisions: false

  # Done Conditions
-  #DoneOnAllDirtCleaned:
+  # Define the conditions for the environment to stop. These can be success or failure conditions.
+  # Environment execution stops after 30 steps
  DoneAtMaxStepsReached:
-    max_steps: 50
+    max_steps: 30
--- a/marl_factory_grid/configs/custom/MultiAgentConfigs/two_rooms_one_door_modified_eval_config_emergent.yaml
+++ b/marl_factory_grid/configs/custom/MultiAgentConfigs/two_rooms_one_door_modified_eval_config_emergent.yaml
@@ -1,20 +1,20 @@
 General:
+  # RNG-seed to sample the same "random" numbers every time, to make the different runs comparable.
  env_seed: 69
  # Individual vs global rewards
  individual_rewards: true
  # The level.txt file to load from marl_factory_grid/levels
-  level_name: two_rooms_modified
-  # View Radius; 0 = full observatbility
-  pomdp_r: 0
+  level_name: two_rooms_small
+  # View Radius
+  pomdp_r: 0 # Use custom partial observability setting
  # Print all messages and events
  verbose: false
  # Run tests
  tests: false

-# In "two rooms one door" scenario 2 agents spawn in 2 different rooms that are connected by a single door. Their aim
-# is to reach the destination in the room they didn't spawn in leading to a conflict at the door.
+# Define Agents, their actions, observations and spawnpoints
 Agents:
-  Sigmund:
+  Agent1:
    Actions:
      - Move4
      - DoorUse
@@ -24,7 +24,7 @@ Agents:
      - Self
    Positions:
      - (3,1)
-  Wolfgang:
+  Agent2:
    Actions:
      - Move4
      - DoorUse
@@ -36,10 +36,11 @@ Agents:
      - (3,13)

 Entities:
+  # For RL-agent we model the flags as coin piles to be more flexible
  CoinPiles:
-    coords_or_quantity: (3,12), (3,2) # Static form: auxiliary pile, primary pile, auxiliary pile, ...
-    initial_amount: 0.5 # <1 to ensure that the robot which first attempts to collect this field, can collect the coin in one action
-    collect_amount: 1
+    coords_or_quantity: (3,12), (3,2) # Locations of flags
+    initial_amount: 0.5
+    clean_amount: 1
    coin_spawn_r_var: 0
    max_global_amount: 12
    max_local_amount: 1
@@ -47,16 +48,13 @@ Entities:
  Doors: { }

 Rules:
-  # Environment Dynamics
-  #DoorAutoClose:
-    #close_frequency: 10
-
  # Utilities
  # This rule defines the collision mechanic, introduces a related DoneCondition and lets you specify rewards.
  WatchCollisions:
    done_at_collisions: false

  # Done Conditions
-  #DoneOnAllDirtCleaned:
+  # Define the conditions for the environment to stop: either success or failure conditions.
+  # Environment execution stops after 30 steps
  DoneAtMaxStepsReached:
    max_steps: 30
--- a/marl_factory_grid/configs/marl/single_agent_configs/coin_quadrant_agent1_eval_config.yaml
+++ b/marl_factory_grid/configs/marl/single_agent_configs/coin_quadrant_agent1_eval_config.yaml
@@ -0,0 +1,48 @@
+General:
+  # RNG-seed to sample the same "random" numbers every time, to make the different runs comparable.
+  env_seed: 69
+  # Individual vs global rewards
+  individual_rewards: true
+  # The level.txt file to load from marl_factory_grid/levels
+  level_name: quadrant
+  # View Radius
+  pomdp_r: 0 # Use custom partial observability setting
+  # Print all messages and events
+  verbose: false
+  # Run tests
+  tests: false
+
+# Define Agents, their actions, observations and spawnpoints
+Agents:
+  # The coin-collecting agent
+  Agent1:
+    Actions:
+      - Move4
+      - Noop
+    Observations:
+      - CoinPiles
+      - Self
+    Positions:
+      - (9,1)
+
+Entities:
+  CoinPiles:
+    coords_or_quantity: (1, 1), (2,4), (4,7), (7,9), (9,9) # Locations of coin piles
+    initial_amount: 0.5
+    clean_amount: 1
+    coin_spawn_r_var: 0
+    max_global_amount: 12
+    max_local_amount: 1
+
+# Rules section specifies the rules governing the dynamics of the environment.
+Rules:
+  # Utilities
+  # This rule defines the collision mechanic, introduces a related DoneCondition and lets you specify rewards.
+  # Can be omitted/ignored if you do not want to take care of collisions at all.
+  WatchCollisions:
+    done_at_collisions: false
+
+  # Done Conditions
+  # Define the conditions for the environment to stop: either success or failure conditions.
+  # The environment stops when all coins are collected
+  DoneOnAllCoinsCollected:
--- a/marl_factory_grid/configs/marl/single_agent_configs/coin_quadrant_agent1_train_config.yaml
+++ b/marl_factory_grid/configs/marl/single_agent_configs/coin_quadrant_agent1_train_config.yaml
@@ -5,69 +5,45 @@ General:
  individual_rewards: true
  # The level.txt file to load from marl_factory_grid/levels
  level_name: quadrant
-  # Radius of Partially observable Markov decision process
-  pomdp_r: 0 # default 3
+  # View Radius
+  pomdp_r: 0 # Use custom partial observability setting
  # Print all messages and events
  verbose: false
  # Run tests
  tests: false

-# In the "collect and bring" Scenario one agent aims to pick up all items and drop them at drop-off locations while all
-# other agents aim to collect coin piles.
+# Define Agents, their actions, observations and spawnpoints
 Agents:
  # The clean agents
-  #Sigmund:
-    #Actions:
-      #- Move4
-    #Observations:
-      #- CoinPiles
-      #- Self
-    #Positions:
-      #- (9,1)
-      #- (1,1)
-      #- (2,4)
-      #- (4,7)
-      #- (6,8)
-      #- (7,9)
-      #- (2,4)
-      #- (4,7)
-      #- (6,8)
-      #- (7,9)
-      #- (9,9)
-      #- (9,1)
-  Wolfgang:
+  Agent1:
    Actions:
      - Move4
    Observations:
      - CoinPiles
      - Self
-    Positions:
-      - (9,5)
+    Positions: # Each spawnpoint is mapped to one coin pile looping over coords_or_quantity (see below)
+      - (9,1)
      - (1,1)
      - (2,4)
      - (4,7)
-      - (6,8)
      - (7,9)
      - (2,4)
      - (4,7)
-      - (6,8)
      - (7,9)
      - (9,9)
-      - (9,5)
-
+      - (9,1)

 Entities:
  CoinPiles:
-    coords_or_quantity: (1, 1), (2,4), (4,7), (6,8), (7,9), (9,9)  # (4,7), (2,4), (1, 1) #(1, 1), (2,4), (4,7), (7,9), (9,9) # (1, 1), (1,2), (1,3), (2,4), (2,5), (3,6), (4,7), (5,8), (6,8), (7,9), (8,9), (9,9)
-    initial_amount: 0.5 # <1 to ensure that the robot which first attempts to collect this field, can collect the coin in one action
-    collect_amount: 1
+    coords_or_quantity: (1, 1), (2,4), (4,7), (7,9), (9,9) # Locations of coin piles
+    initial_amount: 0.5
+    clean_amount: 1
    coin_spawn_r_var: 0
    max_global_amount: 12
    max_local_amount: 1

 # Rules section specifies the rules governing the dynamics of the environment.
 Rules:
-
  # Utilities
  # This rule defines the collision mechanic, introduces a related DoneCondition and lets you specify rewards.
  # Can be omitted/ignored if you do not want to take care of collisions at all.
@@ -76,10 +52,8 @@ Rules:

  # Done Conditions
  # Define the conditions for the environment to stop. Either success or a fail conditions.
-  # The environment stops when all coins are collected
+  # The environment stops when all coins are collected
  DoneOnAllCoinsCollected:
-  #DoneAtMaxStepsReached: # An episode should last for at most max_steps steps
-    #max_steps: 1000

  # Define how agents spawn.
  # Options: "random" (Spawn agent at a random position from the list of defined positions)
--- a/marl_factory_grid/configs/marl/single_agent_configs/two_rooms_agent1_eval_config.yaml
+++ b/marl_factory_grid/configs/marl/single_agent_configs/two_rooms_agent1_eval_config.yaml
@@ -0,0 +1,50 @@
+General:
+  # RNG-seed to sample the same "random" numbers every time, to make the different runs comparable.
+  env_seed: 69
+  # Individual vs global rewards
+  individual_rewards: true
+  # The level.txt file to load from marl_factory_grid/levels
+  level_name: two_rooms_small
+  # View Radius
+  pomdp_r: 0 # Use custom partial observability setting
+  # Print all messages and events
+  verbose: false
+  # Run tests
+  tests: false
+
+# Define Agents, their actions, observations and spawnpoints
+Agents:
+  Agent1:
+    Actions:
+      - Move4
+      - DoorUse
+    Observations:
+      - CoinPiles
+      - Self
+    Positions: # Each spawnpoint is mapped to one coin pile looping over coords_or_quantity (see below)
+      - (3,1)
+      - (2,1) # spawnpoint only required if agent1 should go to its auxiliary pile
+
+Entities:
+  CoinPiles:
+    coords_or_quantity: (2,1), (3,12) # Locations of coin piles
+    initial_amount: 0.5
+    clean_amount: 1
+    coin_spawn_r_var: 0
+    max_global_amount: 12
+    max_local_amount: 1
+
+  Doors: { }
+
+# Rules section specifies the rules governing the dynamics of the environment.
+Rules:
+  # Utilities
+  # This rule defines the collision mechanic, introduces a related DoneCondition and lets you specify rewards.
+  WatchCollisions:
+    done_at_collisions: false
+
+  # Done Conditions
+  # Define the conditions for the environment to stop: either success or failure conditions.
+  # Environment execution stops after 30 steps
+  DoneAtMaxStepsReached:
+    max_steps: 30
--- a/marl_factory_grid/configs/marl/single_agent_configs/two_rooms_agent1_train_config.yaml
+++ b/marl_factory_grid/configs/marl/single_agent_configs/two_rooms_agent1_train_config.yaml
@@ -0,0 +1,55 @@
+General:
+  # RNG-seed to sample the same "random" numbers every time, to make the different runs comparable.
+  env_seed: 69
+  # Individual vs global rewards
+  individual_rewards: true
+  # The level.txt file to load from marl_factory_grid/levels
+  level_name: two_rooms_small
+  # View Radius
+  pomdp_r: 0 # Use custom partial observability setting
+  # Print all messages and events
+  verbose: false
+  # Run tests
+  tests: false
+
+# Define Agents, their actions, observations and spawnpoints
+Agents:
+  Agent1:
+    Actions:
+      - Move4
+    Observations:
+      - CoinPiles
+      - Self
+    Positions: # Each spawnpoint is mapped to one coin pile looping over coords_or_quantity (see below)
+      - (5,1)
+      - (2,1)
+      - (1,1)
+
+Entities:
+  CoinPiles:
+    coords_or_quantity: (3,12) # Locations of coin piles
+    initial_amount: 0.5
+    clean_amount: 1
+    coin_spawn_r_var: 0
+    max_global_amount: 12
+    max_local_amount: 1
+
+  #Doors: { }  # We leave out the door during training
+
+Rules:
+  # Utilities
+  # This rule defines the collision mechanic, introduces a related DoneCondition and lets you specify rewards.
+  WatchCollisions:
+    done_at_collisions: false
+
+  # Done Conditions
+  # Define the conditions for the environment to stop: either success or failure conditions.
+  # The environment stops when all coins are collected
+  DoneOnAllCoinsCollected:
+
+  # Define how agents spawn.
+  # Options: "random" (Spawn agent at a random position from the list of defined positions)
+  # "first" (Always spawn agent at first position regardless of the other provided positions)
+  # "order" (Loop through agent positions)
+  AgentSpawnRule:
+    spawn_rule: "order"
--- a/marl_factory_grid/configs/marl/single_agent_configs/two_rooms_agent2_eval_config.yaml
+++ b/marl_factory_grid/configs/marl/single_agent_configs/two_rooms_agent2_eval_config.yaml
@@ -0,0 +1,49 @@
+General:
+  # RNG-seed to sample the same "random" numbers every time, to make the different runs comparable.
+  env_seed: 69
+  # Individual vs global rewards
+  individual_rewards: true
+  # The level.txt file to load from marl_factory_grid/levels
+  level_name: two_rooms_small
+  # View Radius
+  pomdp_r: 0 # Use custom partial observability setting
+  # Print all messages and events
+  verbose: false
+  # Run tests
+  tests: false
+
+# Define Agents, their actions, observations and spawnpoints
+Agents:
+  Agent2:
+    Actions:
+      - Move4
+      - DoorUse
+    Observations:
+      - CoinPiles
+      - Self
+    Positions: # Each spawnpoint is mapped to one coin pile looping over coords_or_quantity (see below)
+      - (3,13)
+
+Entities:
+  CoinPiles:
+    coords_or_quantity: (3,2)  # Locations of coin piles
+    initial_amount: 0.5
+    clean_amount: 1
+    coin_spawn_r_var: 0
+    max_global_amount: 12
+    max_local_amount: 1
+
+  Doors: { }
+
+# Rules section specifies the rules governing the dynamics of the environment.
+Rules:
+  # Utilities
+  # This rule defines the collision mechanic, introduces a related DoneCondition and lets you specify rewards.
+  WatchCollisions:
+    done_at_collisions: false
+
+  # Done Conditions
+  # Define the conditions for the environment to stop: either success or failure conditions.
+  # Environment execution stops after 30 steps
+  DoneAtMaxStepsReached:
+    max_steps: 30
--- a/marl_factory_grid/configs/marl/single_agent_configs/two_rooms_agent2_train_config.yaml
+++ b/marl_factory_grid/configs/marl/single_agent_configs/two_rooms_agent2_train_config.yaml
@@ -0,0 +1,54 @@
+General:
+  # RNG-seed to sample the same "random" numbers every time, to make the different runs comparable.
+  env_seed: 69
+  # Individual vs global rewards
+  individual_rewards: true
+  # The level.txt file to load from marl_factory_grid/levels
+  level_name: two_rooms_small
+  # View Radius
+  pomdp_r: 0 # Use custom partial observability setting
+  # Print all messages and events
+  verbose: false
+  # Run tests
+  tests: false
+
+# Define Agents, their actions, observations and spawnpoints
+Agents:
+  Agent2:
+    Actions:
+      - Move4
+    Observations:
+      - CoinPiles
+      - Self
+    Positions: # Each spawnpoint is mapped to one coin pile looping over coords_or_quantity (see below)
+      - (3,13)
+
+Entities:
+  CoinPiles:
+    coords_or_quantity: (3,2)  # Locations of coin piles
+    initial_amount: 0.5
+    clean_amount: 1
+    coin_spawn_r_var: 0
+    max_global_amount: 12
+    max_local_amount: 1
+
+  #Doors: { } # We leave out the door during training
+
+# Rules section specifies the rules governing the dynamics of the environment.
+Rules:
+  # Utilities
+  # This rule defines the collision mechanic, introduces a related DoneCondition and lets you specify rewards.
+  WatchCollisions:
+    done_at_collisions: false
+
+  # Done Conditions
+  # Define the conditions for the environment to stop: either success or failure conditions.
+  # The environment stops when all coins are collected
+  DoneOnAllCoinsCollected:
+
+  # Defines how agents spawn.
+  # Options: "random" (Spawn agent at a random position from the list of defined positions)
+  # "first" (Always spawn agent at first position regardless of the other provided positions)
+  # "order" (Loop through agent positions)
+  AgentSpawnRule:
+    spawn_rule: "order"
--- a/marl_factory_grid/configs/test_config.yaml
+++ b/marl_factory_grid/configs/test_config.yaml
@@ -18,28 +18,28 @@ Agents:
 #      - Doors
 #      - Maintainers
 #    Clones: 0
-#  Item test agent:
-#    Actions:
-#      - Noop
-#      - Charge
-#      - DestAction
-#      - DoorUse
-#      - ItemAction
-#      - Move8
-#    Observations:
-#      - Combined:
-#          - Other
-#          - Walls
-#      - GlobalPosition
-#      - Battery
-#      - ChargePods
-#      - Destinations
-#      - Doors
-#      - Items
-#      - Inventory
-#      - DropOffLocations
-#      - Maintainers
-#    Clones: 0
+  Item test agent:
+    Actions:
+      - Noop
+      - Charge
+      - DestAction
+      - DoorUse
+      - ItemAction
+      - Move8
+    Observations:
+      - Combined:
+          - Other
+          - Walls
+      - GlobalPosition
+      - Battery
+      - ChargePods
+      - Destinations
+      - Doors
+      - Items
+      - Inventory
+      - DropOffLocations
+      - Maintainers
+    Clones: 0
 #  Target test agent:
 #    Actions:
 #      - Noop
@@ -56,25 +56,25 @@ Agents:
 #      - Doors
 #      - Maintainers
 #    Clones: 1
-  Coin test agent:
-    Actions:
-      - Noop
-      - Charge
-      - Collect
-      - DoorUse
-      - Move8
-    Observations:
-      - Combined:
-          - Other
-          - Walls
-      - GlobalPosition
-      - Battery
-      - ChargePods
-      - CoinPiles
-      - Destinations
-      - Doors
-      - Maintainers
-    Clones: 1
+#  Coin test agent:
+#    Actions:
+#      - Noop
+#      - Charge
+#      - Collect
+#      - DoorUse
+#      - Move8
+#    Observations:
+#      - Combined:
+#          - Other
+#          - Walls
+#      - GlobalPosition
+#      - Battery
+#      - ChargePods
+#      - CoinPiles
+#      - Destinations
+#      - Doors
+#      - Maintainers
+#    Clones: 1

 Entities:

@@ -93,7 +93,7 @@ Entities:
 #    dirt_spawn_r_var: 0.1
 #    max_global_amount: 20
 #    max_local_amount: 5
-  CoinPiles:
+  DirtPiles:
    coords_or_quantity: 10
    initial_amount: 2
    collect_amount: 1
@@ -134,7 +134,7 @@ Rules:
  #    respawn_freq: 15
  RespawnItems:
    respawn_freq: 15
-  RespawnCoins:
+  RespawnDirt:
    respawn_freq: 15

  # Utilities
--- a/marl_factory_grid/configs/custom/MultiAgentConfigs/dirt_quadrant_eval_config.yaml
+++ b/marl_factory_grid/configs/custom/MultiAgentConfigs/dirt_quadrant_eval_config.yaml
@@ -5,31 +5,34 @@ General:
  individual_rewards: true
  # The level.txt file to load from marl_factory_grid/levels
  level_name: quadrant
-  # Radius of Partially observable Markov decision process
-  pomdp_r: 0 # default 3
+  # View Radius
+  pomdp_r: 0 # Use custom partial observability setting
  # Print all messages and events
  verbose: false
  # Run tests
  tests: false

-# In the "clean and bring" Scenario one agent aims to pick up all items and drop them at drop-off locations while all
-# other agents aim to clean dirt piles.
+# Define Agents, their actions, observations and spawnpoints
 Agents:
-  # The coin collect agents
-  Sigmund:
+  # The coin-collecting agents
+  Agent1:
    Actions:
      - Move4
+      - Collect
      - Noop
    Observations:
+      - Walls
      - CoinPiles
      - Self
    Positions:
      - (9,1)
-  Wolfgang:
+  Agent2:
    Actions:
      - Move4
+      - Collect
      - Noop
    Observations:
+      - Walls
      - CoinPiles
      - Self
    Positions:
@@ -37,12 +40,13 @@ Agents:

 Entities:
  CoinPiles:
-    coords_or_quantity: (9,9), (7,9), (4,7), (2,4), (1, 1) # (4,7), (2,4), (1, 1) # (1, 1), (2,4), (4,7), (7,9), (9,9) # (1, 1), (1,2), (1,3), (2,4), (2,5), (3,6), (4,7), (5,8), (6,8), (7,9), (8,9), (9,9)
-    initial_amount: 0.5 # <1 to ensure that the robot which first attempts to collect this field, can collect the coin in one action
-    collect_amount: 1
+    coords_or_quantity: (1, 1), (2,4), (4,7), (7,9), (9,9)
+    initial_amount: 0.5
+    clean_amount: 1
    coin_spawn_r_var: 0
    max_global_amount: 12
    max_local_amount: 1
+    randomize: False # If True, coins spawn only at a random subset of the positions defined above instead of at all of them

 # Rules section specifies the rules governing the dynamics of the environment.
 Rules:
@@ -55,7 +59,5 @@ Rules:

  # Done Conditions
  # Define the conditions for the environment to stop. Either success or a fail conditions.
-  # The environment stops when all coins are collected
+  # The environment stops when all coins are collected
  DoneOnAllCoinsCollected:
-  #DoneAtMaxStepsReached:
-    #max_steps: 200
--- a/marl_factory_grid/configs/two_rooms_one_door_modified.yaml
+++ b/marl_factory_grid/configs/two_rooms_one_door_modified.yaml
@@ -1,40 +1,38 @@
 General:
+  # RNG-seed to sample the same "random" numbers every time, to make the different runs comparable.
  env_seed: 69
  # Individual vs global rewards
  individual_rewards: true
  # The level.txt file to load from marl_factory_grid/levels
-  level_name: two_rooms_modified
-  # View Radius; 0 = full observatbility
-  pomdp_r: 0
+  level_name: two_rooms_small
+  # View Radius
+  pomdp_r: 0 # Use custom partial observability setting
  # Print all messages and events
  verbose: false
  # Run tests
  tests: false

-# In "two rooms one door" scenario 2 agents spawn in 2 different rooms that are connected by a single door. Their aim
-# is to reach the destination in the room they didn't spawn in leading to a conflict at the door.
+# Define Agents, their actions, observations and spawnpoints
 Agents:
-  Wolfgang:
+  Agent1:
    Actions:
      - Move4
      - Noop
-      - DestAction
+      - DestAction # Action that is performed when the destination is reached
      - DoorUse
    Observations:
      - Walls
-      - Other
      - Doors
      - Destination
    Positions:
-      - (3,1) # Agent spawnpoint
-  Sigmund:
+      - (3,1)
+  Agent2:
    Actions:
      - Move4
      - Noop
      - DestAction
      - DoorUse
    Observations:
-      - Other
      - Walls
      - Destination
      - Doors
@@ -45,10 +43,11 @@ Entities:
  Destinations:
    spawnrule:
      SpawnDestinationsPerAgent:
+        # Target coordinates
        coords_or_quantity:
-          Wolfgang:
-            - (3,12) # Target coordinates
-          Sigmund:
+          Agent1:
+            - (3,12)
+          Agent2:
            - (3,2)

  Doors: { }
@@ -68,10 +67,12 @@ Rules:
  AssignGlobalPositions: { }

  DoneAtDestinationReach:
-    reward_at_done: 1
+    reward_at_done: 50
    # We want to give rewards only, when all targets have been reached.
    condition: "all"

  # Done Conditions
+  # Define the conditions for the environment to stop: either success or failure conditions.
+  # Environment execution stops after 30 steps
  DoneAtMaxStepsReached:
-    max_steps: 50
+    max_steps: 30
--- a/marl_factory_grid/environment/factory.py
+++ b/marl_factory_grid/environment/factory.py
@@ -1,3 +1,4 @@
+import copy
 import shutil

 from collections import defaultdict
@@ -100,7 +101,7 @@ class Factory(gym.Env):

        parsed_entities = self.conf.load_entities()
        self.map = LevelParser(self.level_filepath, parsed_entities, self.conf.pomdp_r)
-        self.levels_that_require_masking = ['two_rooms']
+        self.levels_that_require_masking = ['two_rooms_small']

        # Init for later usage:
        # noinspection PyTypeChecker
@@ -274,10 +275,15 @@ class Factory(gym.Env):
            global Renderer
            self._renderer = Renderer(self.map.level_shape, view_radius=self.conf.pomdp_r, fps=10)

-        render_entities = self.state.entities.render()
+        # Remove potential Nones from entities
+        render_entities_full = self.state.entities.render()

        # Hide entities where certain conditions are met (e.g., amount <= 0 for DirtPiles)
-        render_entities = self.filter_entities(render_entities)
+        maintain_indices = self.filter_entities(self.state.entities)
+        if maintain_indices:
+            render_entities = [render_entity for idx, render_entity in enumerate(render_entities_full) if idx in maintain_indices]
+        else:
+            render_entities = render_entities_full

        # Mask entities based on dynamic conditions instead of hardcoding level-specific logic
        if self.conf['General']['level_name'] in self.levels_that_require_masking:
@@ -291,18 +297,18 @@ class Factory(gym.Env):

    def filter_entities(self, entities):
        """ Generalized method to filter out entities that shouldn't be rendered. """
-        if 'DirtPiles' in self.state.entities.keys():
-            entities = [entity for entity in entities if not (entity.name == 'DirtPiles' and entity.amount <= 0)]
-        return entities
+        if 'CoinPiles' in self.state.entities.keys():
+            all_entities = [item for sublist in [[e for e in entity] for entity in entities] for item in sublist]
+            return [idx for idx, entity in enumerate(all_entities) if not ('CoinPile' in entity.name and entity.amount <= 0)]

    def mask_entities(self, entities):
        """ Generalized method to mask entities based on dynamic conditions. """
        for entity in entities:
            if entity.name == 'CoinPiles':
-                # entity.name = 'Destinations'
-                # entity.value = 1
-                entity.mask = 'Destinations'
-                entity.mask_value = 1
+                entity.name = 'Destinations'
+                entity.value = 1
+                #entity.mask = 'Destinations'
+                #entity.mask_value = 1
        return entities

    def set_recorder(self, recorder):
--- a/marl_factory_grid/levels/two_rooms_modified.txt
+++ b/marl_factory_grid/levels/two_rooms_modified.txt
--- a/marl_factory_grid/modules/coins/entitites.py
+++ b/marl_factory_grid/modules/coins/entitites.py
@@ -43,4 +43,4 @@ class CoinPile(Entity):
        return state_dict

    def render(self):
-        return RenderEntity(d.COIN, self.pos, min(0.15 + self.amount, 1.5), 'scale')
+        return RenderEntity(d.COIN, self.pos, min(0 + self.amount, 1.5), 'scale')
--- a/marl_factory_grid/modules/coins/groups.py
+++ b/marl_factory_grid/modules/coins/groups.py
@@ -1,4 +1,6 @@
 import ast
+import random
+
 from marl_factory_grid.environment import constants as c
 from marl_factory_grid.environment.groups.collection import Collection
 from marl_factory_grid.modules.coins.entitites import CoinPile
@@ -30,12 +32,12 @@ class CoinPiles(Collection):
        """
        Internal Usage
        """
-        return sum([dirt.amount for dirt in self])
+        return sum([coin.amount for coin in self])

    def __init__(self, *args, max_local_amount=5, collect_amount=1, max_global_amount: int = 20, coords_or_quantity=10,
-                 initial_amount=2, amount_var=0.2, n_var=0.2, **kwargs):
+                 initial_amount=2, amount_var=0.2, n_var=0.2, randomize=False, randomization_seed=0, **kwargs):
        """
-        A Collection of dirt piles that triggers their spawn.
+        A Collection of coin piles that triggers their spawn.

        :param max_local_amount: The maximum amount of coins allowed in a single pile at one position.
        :type max_local_amount: int
@@ -67,6 +69,8 @@ class CoinPiles(Collection):
        self.max_local_amount = max_local_amount
        self.coords_or_quantity = coords_or_quantity
        self.initial_amount = initial_amount
+        self.randomize = randomize
+        self.randomized_selection = None

    def trigger_spawn(self, state, coords_or_quantity=0, amount=0, ignore_blocking=False) -> [Result]:
        if ignore_blocking:
@@ -85,7 +89,17 @@ class CoinPiles(Collection):
            else:
                n_new = [pos for pos in coords_or_quantity]

-        amounts = [amount if amount else (self.initial_amount ) # removed rng amount
+        if self.randomize:
+            if not self.randomized_selection:
+                n_new_prime = []
+                for n in n_new:
+                    if random.random() < 0.5:
+                        n_new_prime.append(n)
+                n_new = n_new_prime
+                self.randomized_selection = n_new
+            else:
+                n_new = self.randomized_selection
+        amounts = [amount if amount else (self.initial_amount)  # removed rng amount
                   for _ in range(len(n_new))]

        spawn_counter = 0
--- a/marl_factory_grid/quickstart.py
+++ b/marl_factory_grid/quickstart.py
@@ -1,19 +0,0 @@
-import os
-import shutil
-from pathlib import Path
-
-from marl_factory_grid.utils.tools import ConfigExplainer
-
-
-def init():
-    print('Retrieving available options...')
-    ce = ConfigExplainer()
-    cwd = Path(os.getcwd())
-    ce.save_all(cwd / 'full_config.yaml')
-    template_path = Path(__file__).parent / 'modules' / '_template'
-    print(f'Available config options saved to: {(cwd / "full_config.yaml").resolve()}')
-    print('-----------------------------')
-    print(f'Copying Templates....')
-    shutil.copytree(template_path, cwd)
-    print(f'Templates copied to {cwd}"/"{template_path.name}')
-    print(':wave:')
--- a/marl_factory_grid/utils/plotting/action_assets/target_coin.png
+++ b/marl_factory_grid/utils/plotting/action_assets/target_coin.png
--- a/marl_factory_grid/utils/plotting/plot_single_runs.py
+++ b/marl_factory_grid/utils/plotting/plot_single_runs.py
@@ -9,16 +9,16 @@ import numpy as np
 import pandas as pd
 import torch
 from matplotlib import pyplot as plt
+import scipy.stats as stats

-from marl_factory_grid.algorithms.rl.utils import _as_torch
+from marl_factory_grid.algorithms.marl.utils import _as_torch
 from marl_factory_grid.utils.helpers import IGNORED_DF_COLUMNS
 from marl_factory_grid.utils.plotting.plotting_utils import prepare_plot

 from marl_factory_grid.utils.renderer import Renderer
 from marl_factory_grid.utils.utility_classes import RenderEntity

-from marl_factory_grid.modules.clean_up import constants as d
-
+from marl_factory_grid.modules.coins import constants as c

 def plot_single_run(run_path: Union[str, PathLike], use_tex: bool = False, column_keys=None,
                    file_key: str = 'monitor', file_ext: str = 'pkl'):
@@ -72,7 +72,6 @@ def plot_single_run(run_path: Union[str, PathLike], use_tex: bool = False, colum
    prepare_plot(run_path.parent / f'{run_path.parent.name}_monitor_lineplot.png', df_melted, use_tex=use_tex)
    print('Plotting done.')

-
 def plot_routes(factory, agents):
    """
    Creates a plot of the agents' actions on the level map by creating a Renderer and Render Entities that hold the
@@ -134,7 +133,7 @@ def plot_action_maps(factory, agents, result_path):
        'red_arrow': os.path.join(base_dir, 'utils', 'plotting', 'action_assets', 'red_arrow.png'),
        'grey_arrow': os.path.join(base_dir, 'utils', 'plotting', 'action_assets', 'grey_arrow.png'),
        'wall': os.path.join(base_dir, 'environment', 'assets', 'wall.png'),
-        'target_dirt': os.path.join(base_dir, 'utils', 'plotting', 'action_assets', 'target_dirt.png'),
+        'target_coin': os.path.join(base_dir, 'utils', 'plotting', 'action_assets', 'target_coin.png'),
        'spawn_pos': os.path.join(base_dir, 'utils', 'plotting', 'action_assets', 'spawn_pos.png')
    }
    renderer = Renderer(factory.map.level_shape, cell_size=80, custom_assets_path=assets_path)
@@ -149,20 +148,25 @@ def plot_action_maps(factory, agents, result_path):

                wall_entities = [RenderEntity(name='wall', probability=0, pos=np.array(pos)) for pos in wall_positions]
                action_entities = list(wall_entities)
-                target_dirt_pos = factory.state.entities[d.DIRT][action_map_index].pos
+                target_coin_pos = factory.state.entities[c.COIN][action_map_index].pos
                action_entities.append(
-                    RenderEntity(name='target_dirt', probability=0, pos=swap_coordinates(target_dirt_pos)))
+                    RenderEntity(name='target_coin', probability=0, pos=swap_coordinates(target_coin_pos)))

-                # Render all spawnpoints assigned to current target dirt pile
+                # Render all spawnpoints assigned to current target coin pile
                spawnpoints = list(factory.state.agents_conf.values())[agent_index]['positions']
-                all_target_dirts = []
-                if 'DirtPiles' in factory.conf['Entities']:
-                    tuples = ast.literal_eval(factory.conf['Entities']['DirtPiles']['coords_or_quantity'])
+                all_target_coins = []
+                if 'CoinPiles' in factory.conf['Entities']:
+                    tuples = ast.literal_eval(factory.conf['Entities']['CoinPiles']['coords_or_quantity'])
                    for t in tuples:
-                        all_target_dirts.append(t)
+                        all_target_coins.append(t)
+
+                if isinstance(all_target_coins[0], int):
+                    temp = all_target_coins
+                    all_target_coins = [tuple(temp)]
+
                assigned_spawn_positions = []
-                for j in range(len(spawnpoints) // len(all_target_dirts)):
-                    assigned_spawn_positions.append(spawnpoints[j * len(all_target_dirts) + all_target_dirts.index(target_dirt_pos)])
+                for j in range(len(spawnpoints) // len(all_target_coins)):
+                    assigned_spawn_positions.append(spawnpoints[j * len(all_target_coins) + all_target_coins.index(target_coin_pos)])
                for spawn_pos in assigned_spawn_positions:
                    action_entities.append(RenderEntity(name='spawn_pos', probability=0, pos=swap_coordinates(spawn_pos)))

@@ -258,73 +262,241 @@ direction_mapping = {
 }


-def plot_reward_development(reward_development, results_path):
-    smoothed_data = np.convolve(reward_development, np.ones(10) / 10, mode='valid')
+def plot_return_development(return_development, results_path, discounted=False):  # plot, save and show the smoothed returns; `discounted` only switches title, y-label and file name
+    smoothed_data = np.convolve(return_development, np.ones(10) / 10, mode='valid')  # moving average with a window of 10; 'valid' keeps fully overlapping windows only
    plt.plot(smoothed_data)
    plt.ylim([-10, max(smoothed_data) + 20])
-    plt.title('Smoothed Reward Development')
+    plt.title('Smoothed Return Development' if not discounted else 'Smoothed Discounted Return Development')
    plt.xlabel('Episode')
-    plt.ylabel('Reward')
-    plt.savefig(f"{results_path}/smoothed_reward_development.png")
+    plt.ylabel('Return' if not discounted else "Discounted Return")
+    plt.savefig(f"{results_path}/smoothed_return_development.png"
+                if not discounted else f"{results_path}/smoothed_discounted_return_development.png")
+    plt.show()
+
+def plot_return_development_change(return_change_development, results_path):  # plot, save and show the given return deltas
+    plt.plot(return_change_development)  # x axis is the position in the sequence (labelled 'Episode' below)
+    plt.title('Return Change Development')
+    plt.xlabel('Episode')
+    plt.ylabel('Delta Return')
+    plt.savefig(f"{results_path}/return_change_development.png")  # file is written before the figure is shown
    plt.show()


-def plot_collected_coins_per_step():
-    # Observed behaviour for multi-agent setting consisting of run0 and run0
-    cleaned_dirt_per_step_emergent = [0, 0, 0, 1, 1, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 4, 4, 4, 4, 5]
-    cleaned_dirt_per_step = [0, 0, 0, 1, 1, 2, 2, 3, 3, 3, 4, 5] # RL and TSP
+def mean_confidence_interval(data, confidence=0.95):
+    """Column-wise NaN-aware mean with a normal-approximation CI: returns (mean, lower, upper)."""
+    a = np.array(data)
+    n = np.sum(~np.isnan(a), axis=0)  # number of non-NaN samples per column
+    mean, se = np.nanmean(a, axis=0), np.nanstd(a, axis=0) / np.sqrt(n)
+    h = se * stats.norm.ppf((1 + confidence) / 2)  # two-sided z-score now follows `confidence` (was fixed at 1.96)
+    return mean, mean - h, mean + h
+
+def load_metrics(file_path, key):
+    """Unpickle the object stored at *file_path* and return element 0 of its *key* entry."""
+    with open(file_path, "rb") as fh:  # NOTE: unpickling can execute arbitrary code - only load trusted files
+        return pickle.load(fh)[key][0]
+
+def pad_runs(runs):
+    """Right-pad every run with NaN up to the longest run's length; returns a list of float arrays."""
+    longest = max(map(len, runs))
+    return [np.pad(np.array(r, dtype=float), (0, longest - len(r)), constant_values=np.nan) for r in runs]
+
+def get_reached_flags_metrics(runs):
+    """Mean index at which the values 1 and 2 first appear in the runs, with 95% t-interval margins.
+
+    A run that never contains 1 (or 2) is left out of that value's statistics. Presumably each
+    run lists the number of flags reached per step - confirm against the producer of the metrics.
+
+    Returns (mean_steps, flags_reached, error_bars); each list starts with a 0 baseline entry.
+    """
+    confidence_level = 0.95
+
+    # First position of the value 1 / 2 in every run that contains it
+    first_hits = {1: [], 2: []}
+    for run in runs:
+        for flag, hits in first_hits.items():
+            if flag in run:
+                hits.append(run.index(flag))
+
+    # Console output of the collected indices
+    print(first_hits[1])
+    print(first_hits[2])
+
+    def _mean_and_margin(steps):
+        # Sample std (ddof=1) and Student-t critical value with n - 1 degrees of freedom
+        n = len(steps)
+        t_critical = stats.t.ppf((1 + confidence_level) / 2, n - 1)
+        margin = t_critical * (np.std(steps, ddof=1) / np.sqrt(n))
+        return np.mean(steps), margin
+
+    mean_flag1, margin_flag1 = _mean_and_margin(first_hits[1])
+    mean_flag2, margin_flag2 = _mean_and_margin(first_hits[2])
+
+    # Prepend the baseline point: 0 flags at step 0 with no error
+    mean_steps = [0, mean_flag1, mean_flag2]
+    flags_reached = [0, 1, 2]
+    error_bars = [0, margin_flag1, margin_flag2]
+    return mean_steps, flags_reached, error_bars
+
+
+def plot_collected_coins_per_step(rl_runs_names, tsp_runs_names, results_path):  # step plot of mean collected coins (RL, TSP) with CI bands plus a fixed 'Emergent' series; saved as PDF under results_path
+    # Observed behaviour for multi-agent setting consisting of run0 and run0
+    collected_coins_per_step_emergent = [0, 0, 0, 1, 1, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 4, 4, 4, 4, 5]
+
+    # Load RL and TSP data from multiple runs
+    rl_runs = [load_metrics(results_path + f"/{rl_run}/metrics", "cleaned_dirt_piles_per_step") for rl_run in rl_runs_names]
+    # NOTE(review): both loaders read the "cleaned_dirt_piles_per_step" key, which keeps the older dirt naming - confirm it matches the stored metrics
+    tsp_runs = [load_metrics(results_path + f"/{tsp_run}/metrics", "cleaned_dirt_piles_per_step") for tsp_run in tsp_runs_names]
+
+    # Pad runs to handle heterogeneous lengths
+    rl_runs = pad_runs(rl_runs)
+    tsp_runs = pad_runs(tsp_runs)
+
+    # Calculate mean and confidence intervals
+    mean_rl, lower_rl, upper_rl = mean_confidence_interval(rl_runs)
+    mean_tsp, lower_tsp, upper_tsp = mean_confidence_interval(tsp_runs)
+
+    # Plot the mean and confidence intervals
+    plt.fill_between(range(1, len(mean_rl) + 1), lower_rl, upper_rl, color='green', alpha=0.2)
+    plt.step(range(1, len(mean_rl) + 1), mean_rl, color='green', linewidth=3, label='Prevented (RL)')
+
+    plt.fill_between(range(1, len(mean_tsp) + 1), lower_tsp, upper_tsp, color='darkorange', alpha=0.2)
+    plt.step(range(1, len(mean_tsp) + 1), mean_tsp, linestyle='dotted', color='darkorange', linewidth=3, label='Prevented (TSP)')
+
+    plt.step(range(1, len(collected_coins_per_step_emergent) + 1), collected_coins_per_step_emergent, linestyle='--', color='darkred', linewidth=3, label='Emergent')

-    plt.step(range(1, len(cleaned_dirt_per_step) + 1), cleaned_dirt_per_step, color='green', linewidth=3, label='Prevented (RL)')
-    plt.step(range(1, len(cleaned_dirt_per_step_emergent) + 1), cleaned_dirt_per_step_emergent, linestyle='--', color='darkred', linewidth=3, label='Emergent')
-    plt.step(range(1, len(cleaned_dirt_per_step) + 1), cleaned_dirt_per_step, linestyle='dotted', color='darkorange', linewidth=3, label='Prevented (TSP)')
    plt.xlabel("Environment step", fontsize=20)
    plt.ylabel("Collected Coins", fontsize=20)
-    yint = range(min(cleaned_dirt_per_step), max(cleaned_dirt_per_step) + 1)
-    plt.yticks(yint, fontsize=17)
-    plt.xticks(range(1, len(cleaned_dirt_per_step_emergent) + 1), fontsize=17)
+    plt.xticks(range(1, len(collected_coins_per_step_emergent) + 1), fontsize=17)
+    plt.yticks(fontsize=17)
+    # Hide every x tick label except each 5th one
    frame1 = plt.gca()
-    # Only display every 5th tick label
    for idx, xlabel_i in enumerate(frame1.axes.get_xticklabels()):
        if (idx + 1) % 5 != 0:
            xlabel_i.set_visible(False)
            xlabel_i.set_fontsize(0.0)
-    # Change order of labels in legend
+    # Reorder the legend entries: handles 0, 2, 1
    handles, labels = frame1.get_legend_handles_labels()
    order = [0, 2, 1]
    plt.legend([handles[idx] for idx in order], [labels[idx] for idx in order], prop={'size': 20})
+
    fig = plt.gcf()
    fig.set_size_inches(8, 7)
-    plt.savefig("../study_out/number_of_collected_coins.pdf")
+    plt.savefig(f"{results_path}/number_of_collected_coins.pdf")
    plt.show()


-def plot_reached_flags_per_step():
-    # Observed behaviour for multi-agent setting consisting of runs 1 + 2
-    reached_flags_per_step_emergent = [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
-    reached_flags_per_step_RL = [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2]
-    reached_flags_per_step_TSP = [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2]
+def plot_reached_flags_per_step(rl_runs_names, tsp_runs_names, results_path):  # step plot of mean reached flags (RL, TSP) with CI bands plus an all-zero 'Emergent' series; saved as PDF under results_path
+    reached_flags_per_step_emergent = [0] * 32  # Adjust based on your data length
+
+    # Load RL and TSP data from multiple runs
+    rl_runs = [load_metrics(results_path + f"/{rl_run}/metrics", "cleaned_dirt_piles_per_step") for rl_run in rl_runs_names]
+    rl_runs = [[pile - 1 for pile in run] for run in rl_runs]  # Subtract the auxiliary pile
+    # NOTE(review): RL runs are read from "cleaned_dirt_piles_per_step" (minus one), TSP runs from "reached_flags" - confirm this asymmetry is intended
+    tsp_runs = [load_metrics(results_path + f"/{tsp_run}/metrics", "reached_flags") for tsp_run in tsp_runs_names]
+
+    # Pad runs to handle heterogeneous lengths
+    rl_runs = pad_runs(rl_runs)
+    tsp_runs = pad_runs(tsp_runs)
+
+    # Calculate mean and confidence intervals
+    mean_rl, lower_rl, upper_rl = mean_confidence_interval(rl_runs)
+    mean_tsp, lower_tsp, upper_tsp = mean_confidence_interval(tsp_runs)
+
+    # Plot the mean and confidence intervals
+    plt.fill_between(range(1, len(mean_rl) + 1), lower_rl, upper_rl, color='green', alpha=0.2)
+    plt.step(range(1, len(mean_rl) + 1), mean_rl, color='green', linewidth=3, label='Prevented (RL)')
+
+    plt.fill_between(range(1, len(mean_tsp) + 1), lower_tsp, upper_tsp, color='darkorange', alpha=0.2)
+    plt.step(range(1, len(mean_tsp) + 1), mean_tsp, linestyle='dotted', color='darkorange', linewidth=3, label='Prevented (TSP)')
+
+    plt.step(range(1, len(reached_flags_per_step_emergent) + 1), reached_flags_per_step_emergent, linestyle='--', color='darkred', linewidth=3, label='Emergent')

-    plt.step(range(1, len(reached_flags_per_step_RL) + 1), reached_flags_per_step_RL, color='green', linewidth=3, label='Prevented (RL)')
-    plt.step(range(1, len(reached_flags_per_step_emergent) + 1), reached_flags_per_step_emergent,  linestyle='--', color='darkred', linewidth=3, label='Emergent')
-    plt.step(range(1, len(reached_flags_per_step_TSP) + 1), reached_flags_per_step_TSP, linestyle='dotted', color='darkorange', linewidth=3, label='Prevented (TSP)')
    plt.xlabel("Environment step", fontsize=20)
    plt.ylabel("Reached Flags", fontsize=20)
-    yint = range(min(reached_flags_per_step_RL), max(reached_flags_per_step_RL) + 1)
-    plt.yticks(yint, fontsize=17)
    plt.xticks(range(1, len(reached_flags_per_step_emergent) + 1), fontsize=17)
+    plt.yticks(fontsize=17)
+    # Hide every x tick label except each 5th one
    frame1 = plt.gca()
-    # Only display every 5th tick label
    for idx, xlabel_i in enumerate(frame1.axes.get_xticklabels()):
        if (idx + 1) % 5 != 0:
            xlabel_i.set_visible(False)
            xlabel_i.set_fontsize(0.0)
-    # Change order of labels in legend
+    # Reorder the legend entries: handles 0, 2, 1
    handles, labels = frame1.get_legend_handles_labels()
    order = [0, 2, 1]
    plt.legend([handles[idx] for idx in order], [labels[idx] for idx in order], prop={'size': 20})
+
    fig = plt.gcf()
    fig.set_size_inches(8, 7)
-    plt.savefig("../study_out/number_of_reached_flags.pdf")
+    plt.savefig(f"{results_path}/number_of_reached_flags.pdf")
+    plt.show()
+
+
+def plot_performance_distribution_on_coin_quadrant(dirt_quadrant, results_path, grid=False):
+    """Box-plot the four distributions in `dirt_quadrant` side by side, then save and show the figure.
+
+    `dirt_quadrant` is indexed with "RL_emergence", "RL_prevented", "TSP_emergence" and
+    "TSP_prevented". The PDF is written below `results_path`; its name gets a "_grid" suffix
+    when `grid` is truthy. The rcParams assigned here are global and are not restored.
+    """
+    plt.rcParams["figure.autolayout"] = True
+    plt.rcParams["axes.edgecolor"] = "black"
+    plt.rcParams["axes.linewidth"] = 5.0
+    plt.figure(figsize=(18, 13))
+
+    rl_color, tsp_color = '#5D3A9B', '#E66100'
+    box_positions = [1, 2.5, 4, 5.5]
+    thick_line = dict(linestyle='-', linewidth=4)  # shared by boxes, whiskers and caps
+    keys = ("RL_emergence", "RL_prevented", "TSP_emergence", "TSP_prevented")
+    samples = [dirt_quadrant[key] for key in keys]
+
+    bp = plt.boxplot(
+        samples, patch_artist=True, widths=0.6, positions=box_positions,
+        boxprops=dict(thick_line), whiskerprops=dict(thick_line), capprops=dict(thick_line),
+        flierprops=dict(marker='o', markersize=14, markeredgewidth=4, linestyle='none'),
+        medianprops=dict(linestyle='-', linewidth=4, color='#40B0A6'),
+        meanprops=dict(linestyle='-.', linewidth=4, color='purple'),
+        meanline=True, showmeans=False)
+
+    # Both RL boxes share one face colour, both TSP boxes the other
+    for patch, face in zip(bp['boxes'], [rl_color, rl_color, tsp_color, tsp_color]):
+        patch.set_facecolor(face)
+
+    plt.tick_params(width=5, length=10)
+    tick_labels = ['Emergent \n (RL)', 'Prevented \n (RL)', 'Emergent \n (TSP)', 'Prevented \n (TSP)']
+    plt.xticks(box_positions, labels=tick_labels, fontsize=50)
+    plt.yticks(fontsize=50)
+    plt.ylabel('No. environment steps', fontsize=50)
+    plt.xlabel("Agent Types", fontsize=50)
+    plt.grid(grid)
+    plt.tight_layout()
+
+    plt.savefig(f"{results_path}/number_of_collected_coins_distribution{'_grid' if grid else ''}.pdf")
+    plt.show()
+
+def plot_reached_flags_per_step_with_error(mean_steps_RL_prevented, error_bars_RL_prevented,
+                                           mean_steps_TSP_prevented, error_bars_TSP_prevented, flags_reached,
+                                           results_path, grid=False):
+    """Plot reached flags against mean step with horizontal error bars for RL and TSP; save as PDF."""
+    plt.rcParams["figure.autolayout"] = True  # rcParams are assigned globally and not restored
+    plt.rcParams["axes.edgecolor"] = "black"
+    plt.rcParams["axes.linewidth"] = 5.0
+    plt.figure(figsize=(18, 13))
+    # Constant zero line over x = 0..29, labelled 'Emergent'
+    plt.plot(range(30), [0] * 30, color='gray', linestyle='--', linewidth=7, label='Emergent')
+    shared = dict(fmt='-o', ecolor='r', capsize=10, capthick=5, markersize=20, linewidth=7)
+    plt.errorbar(mean_steps_RL_prevented, flags_reached, xerr=error_bars_RL_prevented,
+                 label='Prevented (RL) + CI', color='#5D3A9B', **shared)
+    plt.errorbar(mean_steps_TSP_prevented, flags_reached, xerr=error_bars_TSP_prevented,
+                 label='Prevented (TSP) + CI', color='#E66100', **shared)
+    plt.tick_params(width=5, length=10)
+    plt.xticks(fontsize=50)
+    plt.yticks(flags_reached, fontsize=50)
+    plt.xlabel("Avg. environment step", fontsize=50)
+    plt.ylabel('Reached flags', fontsize=50)
+    plt.legend(fontsize=45, loc='best', bbox_to_anchor=(0.38, 0.38))
+    plt.grid(grid)
+    plt.savefig(f"{results_path}/number_of_reached_flags{'_grid' if grid else ''}.pdf")
    plt.show()
				`@@ -1 +0,0 @@`
				`from marl_factory_grid.algorithms.rl.memory import MARLActorCriticMemory`