added changes from code submission branch and coin entity

This commit is contained in:
Chanumask
2024-09-06 11:01:42 +02:00
parent 33e40deecf
commit 5476f617c6
42 changed files with 1429 additions and 68 deletions

View File

@@ -10,7 +10,7 @@ build-job: # This job runs in the build stage, which runs first.
variables:
TWINE_USERNAME: $USER_NAME
TWINE_PASSWORD: $API_KEY
TWINE_REPOSITORY: marl-factory-grid
TWINE_REPOSITORY: rl-factory-grid
image: python:slim
script:

View File

@@ -67,7 +67,7 @@ Existing modules include a variety of functionalities within the environment:
- [Agents](marl_factory_grid/algorithms) implement either static strategies or learning algorithms based on the specific
configuration.
- Their action set includes opening [door entities](marl_factory_grid/modules/doors/entitites.py), cleaning
- Their action set includes opening [door entities](marl_factory_grid/modules/doors/entitites.py), collecting [coins](marl_factory_grid/modules/coins/entitites.py), cleaning
[dirt](marl_factory_grid/modules/clean_up/entitites.py), picking
up [items](marl_factory_grid/modules/items/entitites.py) and
delivering them to designated drop-off locations.

README_submission.md Normal file
View File

@@ -0,0 +1,77 @@
# About EDYS
## Tackling emergent dysfunctions (EDYs) in cooperation with Fraunhofer-IKS.
Collaborating with Fraunhofer-IKS, this project is dedicated to investigating Emergent Dysfunctions (EDYs) within
multi-agent environments. In multi-agent reinforcement learning (MARL), a population of agents learns by interacting
with each other in a shared environment, adapting their behavior based on the feedback they receive from the environment
and from the actions of other agents.
In this context, emergent behavior describes spontaneous behavior that results from interactions among agents and
environmental stimuli rather than from explicit programming. This promotes natural, adaptable behavior, increases system
unpredictability for dynamic learning, enables diverse strategies, and encourages collective intelligence for complex
problem-solving. However, the complex dynamics of the environment also give rise to emergent dysfunctions: unexpected
issues arising from agent interactions. This research aims to enhance our understanding of EDYs and their impact on
multi-agent systems.
### Project Objectives:
- Create an environment that provokes emergent dysfunctions.
- This is achieved by creating a high level of background noise in the domain, where various entities perform
diverse tasks, resulting in a deliberately chaotic dynamic.
- The goal is to observe and analyze naturally occurring emergent dysfunctions within the complexity generated in
this dynamic environment.
- Observational Framework:
- The project introduces an environment that is designed to capture dysfunctions as they naturally occur.
- The environment allows for continuous monitoring of agent behaviors, actions, and interactions.
- Tracking emergent dysfunctions in real-time provides valuable data for analysis and understanding.
- Compatibility:
- The framework allows learning entities from different manufacturers and projects with varying representations
of actions and observations to interact seamlessly within the environment.
## Setup
Install this environment using `pip install marl-factory-grid`. For more information refer
to ['installation'](docs/source/installation.rst).
## Usage
The environment is configured to automatically load necessary objects, including entities, rules, and assets, based on your requirements.
You can utilize existing configurations to replicate the experiments from [this paper](PAPER).
- Preconfigured Studies:
The studies folder contains predefined studies that can be used to replicate the experiments.
These studies provide a structured way to validate and analyze the outcomes observed in different scenarios.
- Creating your own scenarios:
If you want to use the environment with custom entities, rules or levels refer to the [complete repository]().
Existing modules include a variety of functionalities within the environment:
- [Agents](marl_factory_grid/algorithms) implement either static strategies or learning algorithms based on the specific
configuration.
- Their action set includes opening [door entities](marl_factory_grid/modules/doors/entitites.py), collecting [coins](marl_factory_grid/modules/coins/entitites.py), cleaning
[dirt](marl_factory_grid/modules/clean_up/entitites.py), picking
up [items](marl_factory_grid/modules/items/entitites.py) and
delivering them to designated drop-off locations.
- Agents are equipped with a [battery](marl_factory_grid/modules/batteries/entitites.py) that gradually depletes over
time if not charged at a chargepod.
- The [maintainer](marl_factory_grid/modules/maintenance/entities.py) aims to
repair [machines](marl_factory_grid/modules/machines/entitites.py) that lose health over time.
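To get started with one of the provided configurations, the environment can be driven like a standard Gym environment. The sketch below is a minimal example only: the config path is a placeholder, and the random-action sampling merely assumes the Gym-style `action_space`/`step` interface exposed by `Factory`.

```python
from marl_factory_grid.environment.factory import Factory

# Placeholder path; point it at one of the provided 'conf.yaml' files.
factory = Factory("path/to/conf.yaml")

for episode in range(2):
    _ = factory.reset()
    for step in range(100):
        # Random actions purely for illustration; replace with trained or TSP agents.
        action = factory.action_space.sample()
        _, obs, reward, done, info = factory.step(action)
        factory.render()  # optional, requires a display
        if all(done) if isinstance(done, list) else done:
            break
```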
## Limitations
The provided code and documentation are tailored for replicating and validating experiments as described in the paper.
Modifications to the environment, such as adding new entities, creating additional rules, or customizing behavior beyond the provided scope, are not supported in this release.
If you are interested in accessing the complete project, including features not covered in this release, refer to the [full repository](LINK FULL REPO).
For further details on running the experiments, please consult the relevant documentation provided in the studies folder.

View File

@@ -6,7 +6,7 @@
# -- Project information -----------------------------------------------------
# https://www.sphinx-doc.org/en/master/usage/configuration.html#project-information
project = 'marl-factory-grid'
project = 'rl-factory-grid'
copyright = '2023, Steffen Illium, Robert Mueller, Joel Friedrich'
author = 'Steffen Illium, Robert Mueller, Joel Friedrich'
release = '2.5.0'

View File

@@ -1,7 +1,7 @@
from .quickstart import init
from marl_factory_grid.environment.factory import Factory
"""
Main module of the 'marl-factory-grid'-environment.
Main module of the 'rl-factory-grid'-environment.
Configure the :class:.Factory with any 'conf.yaml' file.
Examples can be found in :module:.levels .
"""

View File

@@ -1 +0,0 @@
from marl_factory_grid.algorithms.marl.memory import MARLActorCriticMemory

View File

@@ -0,0 +1 @@
from marl_factory_grid.algorithms.rl.memory import MARLActorCriticMemory

View File

@@ -0,0 +1,297 @@
import os
import torch
from typing import Union, List
import numpy as np
from tqdm import tqdm
from marl_factory_grid.algorithms.rl.base_a2c import PolicyGradient
from marl_factory_grid.algorithms.rl.constants import Names
from marl_factory_grid.algorithms.rl.utils import transform_observations, _as_torch, is_door_close, \
get_coin_piles_positions, update_target_pile, update_ordered_coin_piles, get_all_collected_coin_piles, \
distribute_indices, set_agents_spawnpoints, get_ordered_coin_piles, handle_finished_episode, save_configs, \
save_agent_models, get_all_observations, get_agents_positions
from marl_factory_grid.algorithms.utils import add_env_props
from marl_factory_grid.utils.plotting.plot_single_runs import plot_action_maps, plot_reward_development, \
create_info_maps
nms = Names
ListOrTensor = Union[List, torch.Tensor]
class A2C:
def __init__(self, train_cfg, eval_cfg):
self.results_path = None
self.agents = None
self.act_dim = None
self.obs_dim = None
self.factory = add_env_props(train_cfg)
self.eval_factory = add_env_props(eval_cfg)
self.__training = True
self.train_cfg = train_cfg
self.eval_cfg = eval_cfg
self.cfg = train_cfg
self.n_agents = train_cfg[nms.ENV][nms.N_AGENTS]
self.setup()
self.reward_development = []
self.action_probabilities = {agent_idx: [] for agent_idx in range(self.n_agents)}
def setup(self):
""" Initialize agents and create entry for run results according to configuration """
self.obs_dim = 2 + 2 * len(get_coin_piles_positions(self.factory)) if self.cfg[nms.ALGORITHM][
nms.PILE_OBSERVABILITY] == nms.ALL else 4
self.act_dim = 4 # The 4 movement directions
self.agents = [PolicyGradient(self.factory, agent_id=i, obs_dim=self.obs_dim, act_dim=self.act_dim) for i in
range(self.n_agents)]
if self.cfg[nms.ENV][nms.SAVE_AND_LOG]:
# Define study_out_path and check if it exists
base_dir = os.path.dirname(os.path.abspath(__file__)) # Directory of the script
study_out_path = os.path.join(base_dir, '../../../study_out')
study_out_path = os.path.abspath(study_out_path)
if not os.path.exists(study_out_path):
raise FileNotFoundError(f"The directory {study_out_path} does not exist.")
# Create results folder
runs = os.listdir(study_out_path)
run_numbers = [int(run[3:]) for run in runs if run[:3] == "run"]
next_run_number = max(run_numbers) + 1 if run_numbers else 0
self.results_path = os.path.join(study_out_path, f"run{next_run_number}")
os.mkdir(self.results_path)
# Save settings in results folder
save_configs(self.results_path, self.cfg, self.factory.conf, self.eval_factory.conf)
def set_cfg(self, eval=False):
if eval:
self.cfg = self.eval_cfg
else:
self.cfg = self.train_cfg
def load_agents(self, runs_list):
""" Initialize networks with parameters of already trained agents """
for idx, run in enumerate(runs_list):
run_path = f"./study_out/{run}"
self.agents[idx].pi.load_model_parameters(f"{run_path}/PolicyNet_model_parameters.pth")
self.agents[idx].vf.load_model_parameters(f"{run_path}/ValueNet_model_parameters.pth")
@torch.no_grad()
def train_loop(self):
""" Function for training agents """
env = self.factory
n_steps, max_steps = [self.cfg[nms.ALGORITHM][k] for k in [nms.N_STEPS, nms.MAX_STEPS]]
global_steps, episode = 0, 0
indices = distribute_indices(env, self.cfg, self.n_agents)
coin_piles_positions = get_coin_piles_positions(env)
target_pile = [partition[0] for partition in
indices] # list of pointers that point to the current target pile for each agent
collected_coin_piles = [{pos: False for pos in coin_piles_positions} for _ in range(self.n_agents)]
pbar = tqdm(total=max_steps)
while global_steps < max_steps:
_ = env.reset()
if self.cfg[nms.ENV][nms.TRAIN_RENDER]:
env.render()
set_agents_spawnpoints(env, self.n_agents)
ordered_coin_piles = get_ordered_coin_piles(env, collected_coin_piles, self.cfg, self.n_agents)
# Reset current target pile at episode begin if all piles have to be collected in one episode
if self.cfg[nms.ALGORITHM][nms.PILE_ALL_DONE] == nms.ALL:
target_pile = [partition[0] for partition in indices]
collected_coin_piles = [{pos: False for pos in coin_piles_positions} for _ in range(self.n_agents)]
# Supply each agent with its local observation
obs = transform_observations(env, ordered_coin_piles, target_pile, self.cfg, self.n_agents)
done, rew_log = [False] * self.n_agents, 0
while not all(done):
action = self.use_door_or_move(env, obs, collected_coin_piles) \
if nms.DOORS in env.state.entities.keys() else self.get_actions(obs)
_, next_obs, reward, done, info = env.step(action)
next_obs = transform_observations(env, ordered_coin_piles, target_pile, self.cfg, self.n_agents)
# Handle case where agent is on field with coin
reward, done = self.handle_coin(env, collected_coin_piles, ordered_coin_piles, target_pile, indices,
reward, done)
if n_steps != 0 and (global_steps + 1) % n_steps == 0: done = True
done = [done] * self.n_agents if isinstance(done, bool) else done
for ag_i, agent in enumerate(self.agents):
if action[ag_i] in range(self.act_dim):
# Add agent results into respective rollout buffers
agent._episode[-1] = (next_obs[ag_i], action[ag_i], reward[ag_i], agent._episode[-1][-1])
# Visualize state update
if self.cfg[nms.ENV][nms.TRAIN_RENDER]: env.render()
obs = next_obs
if all(done): handle_finished_episode(obs, self.agents, self.cfg)
global_steps += 1
rew_log += sum(reward)
if global_steps >= max_steps: break
self.reward_development.append(rew_log)
episode += 1
pbar.update(global_steps - pbar.n)
pbar.close()
if self.cfg[nms.ENV][nms.SAVE_AND_LOG]:
plot_reward_development(self.reward_development, self.results_path)
create_info_maps(env, get_all_observations(env, self.cfg, self.n_agents),
get_coin_piles_positions(env), self.results_path, self.agents, self.act_dim, self)
save_agent_models(self.results_path, self.agents)
plot_action_maps(env, [self], self.results_path)
@torch.inference_mode(True)
def eval_loop(self, n_episodes):
""" Function for performing inference """
env = self.eval_factory
self.set_cfg(eval=True)
episode, results = 0, []
coin_piles_positions = get_coin_piles_positions(env)
indices = distribute_indices(env, self.cfg, self.n_agents)
target_pile = [partition[0] for partition in
indices] # list of pointers that point to the current target pile for each agent
if self.cfg[nms.ALGORITHM][nms.PILE_ALL_DONE] == nms.DISTRIBUTED:
collected_coin_piles = [{coin_piles_positions[idx]: False for idx in indices[i]} for i in
range(self.n_agents)]
else: collected_coin_piles = [{pos: False for pos in coin_piles_positions} for _ in range(self.n_agents)]
while episode < n_episodes:
_ = env.reset()
set_agents_spawnpoints(env, self.n_agents)
if self.cfg[nms.ENV][nms.EVAL_RENDER]:
# Don't render auxiliary piles
if self.cfg[nms.ALGORITHM][nms.AUXILIARY_PILES]:
auxiliary_piles = [pile for idx, pile in enumerate(env.state.entities[nms.COIN_PILES]) if
idx % 2 == 0]
for pile in auxiliary_piles:
pile.set_new_amount(0)
env.render()
env._renderer.fps = 5 # Slow down agent movement
# Reset current target pile at episode begin if all piles have to be collected in one episode
if self.cfg[nms.ALGORITHM][nms.PILE_ALL_DONE] in [nms.ALL, nms.DISTRIBUTED, nms.SHARED]:
target_pile = [partition[0] for partition in indices]
if self.cfg[nms.ALGORITHM][nms.PILE_ALL_DONE] == nms.DISTRIBUTED:
collected_coin_piles = [{coin_piles_positions[idx]: False for idx in indices[i]} for i in
range(self.n_agents)]
else: collected_coin_piles = [{pos: False for pos in coin_piles_positions} for _ in range(self.n_agents)]
ordered_coin_piles = get_ordered_coin_piles(env, collected_coin_piles, self.cfg, self.n_agents)
# Supply each agent with its local observation
obs = transform_observations(env, ordered_coin_piles, target_pile, self.cfg, self.n_agents)
done, rew_log, eps_rew = [False] * self.n_agents, 0, torch.zeros(self.n_agents)
while not all(done):
action = self.use_door_or_move(env, obs, collected_coin_piles, det=True) \
if nms.DOORS in env.state.entities.keys() else self.execute_policy(obs, env,
collected_coin_piles) # zero exploration
_, next_obs, reward, done, info = env.step(action)
# Handle case where agent is on field with coin
reward, done = self.handle_coin(env, collected_coin_piles, ordered_coin_piles, target_pile, indices,
reward, done)
# Get transformed next_obs that might have been updated because of handle_coin
next_obs = transform_observations(env, ordered_coin_piles, target_pile, self.cfg, self.n_agents)
done = [done] * self.n_agents if isinstance(done, bool) else done
if self.cfg[nms.ENV][nms.EVAL_RENDER]: env.render()
obs = next_obs
episode += 1
# -------------------------------------- HELPER FUNCTIONS ------------------------------------------------- #
def get_actions(self, observations) -> ListOrTensor:
""" Given local observations, get actions for both agents """
actions = [agent.step(_as_torch(observations[ag_i]).view(-1).to(torch.float32)) for ag_i, agent in
enumerate(self.agents)]
return actions
def execute_policy(self, observations, env, collected_coin_piles) -> ListOrTensor:
""" Execute agent policies deterministically for inference """
actions = [agent.policy(_as_torch(observations[ag_i]).view(-1).to(torch.float32)) for ag_i, agent in
enumerate(self.agents)]
for agent_idx in range(self.n_agents):
if all(collected_coin_piles[agent_idx].values()):
actions[agent_idx] = np.array(next(
action_i for action_i, a in enumerate(env.state[nms.AGENT][agent_idx].actions) if
a.name == nms.NOOP))
return actions
def use_door_or_move(self, env, obs, collected_coin_piles, det=False):
""" Function that handles automatic actions like door opening and forced Noop"""
action = []
for agent_idx, agent in enumerate(self.agents):
agent_obs = _as_torch((obs)[agent_idx]).view(-1).to(torch.float32)
# Use Noop operation if agent already reached its target. (Only relevant for two-rooms setting)
if all(collected_coin_piles[agent_idx].values()):
action.append(next(action_i for action_i, a in enumerate(env.state[nms.AGENT][agent_idx].actions) if
a.name == nms.NOOP))
if not det:
# Include agent experience entry manually
agent._episode.append((None, None, None, agent.vf(agent_obs)))
else:
if door := is_door_close(env, agent_idx):
if door.is_closed:
action.append(next(
action_i for action_i, a in enumerate(env.state[nms.AGENT][agent_idx].actions) if
a.name == nms.USE_DOOR))
# Don't include action in agent experience
else:
if det: action.append(int(agent.pi(agent_obs, det=True)[0]))
else: action.append(int(agent.step(agent_obs)))
else:
if det: action.append(int(agent.pi(agent_obs, det=True)[0]))
else: action.append(int(agent.step(agent_obs)))
return action
def handle_coin(self, env, collected_coin_piles, ordered_coin_piles, target_pile, indices, reward, done):
""" Check if agent moved on field with coin. If that is the case collect coin automatically """
agents_positions = get_agents_positions(env, self.n_agents)
coin_piles_positions = get_coin_piles_positions(env)
if any([True for pos in agents_positions if pos in coin_piles_positions]):
# Only simulate collecting the coin
for idx, pos in enumerate(agents_positions):
if pos in collected_coin_piles[idx].keys() and not collected_coin_piles[idx][pos]:
# If coin piles should be collected in a specific order
if ordered_coin_piles[idx]:
if pos == ordered_coin_piles[idx][target_pile[idx]]:
reward[idx] += 50
collected_coin_piles[idx][pos] = True
# Set pointer to next coin pile
update_target_pile(env, idx, target_pile, indices, self.cfg)
update_ordered_coin_piles(idx, collected_coin_piles, ordered_coin_piles, env,
self.cfg, self.n_agents)
if self.cfg[nms.ALGORITHM][nms.PILE_ALL_DONE] == nms.SINGLE:
done = True
if all(collected_coin_piles[idx].values()):
# Reset collected_coin_piles indicator
for pos in coin_piles_positions:
collected_coin_piles[idx][pos] = False
else:
reward[idx] += 50
collected_coin_piles[idx][pos] = True
# Indicate that renderer can hide coin pile
coin_at_position = env.state[nms.COIN_PILES].by_pos(pos)
coin_at_position[0].set_new_amount(0)
if self.cfg[nms.ALGORITHM][nms.PILE_ALL_DONE] in [nms.ALL, nms.DISTRIBUTED]:
if all([all(collected_coin_piles[i].values()) for i in range(self.n_agents)]):
done = True
elif self.cfg[nms.ALGORITHM][nms.PILE_ALL_DONE] == nms.SHARED:
# End episode if both agents together have collected all coin piles
if all(get_all_collected_coin_piles(coin_piles_positions, collected_coin_piles, self.n_agents).values()):
done = True
return reward, done

View File

@@ -0,0 +1,112 @@
import numpy as np
import torch as th
import scipy as sp
from collections import deque
from torch import nn
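# Discounted cumulative sum: for each step t, G_t = r_t + gamma * G_{t+1}, computed by running an IIR filter over the reversed reward sequence.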
cumulate_discount = lambda x, gamma: sp.signal.lfilter([1], [1, - gamma], x[::-1], axis=0)[::-1]
class Net(th.nn.Module):
def __init__(self, shape, activation, lr):
super().__init__()
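# Build an MLP from consecutive entries of 'shape': one Linear layer per pair, followed by 'activation' on hidden layers and Identity on the output layer.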
self.net = th.nn.Sequential(*[layer
for io, a in zip(zip(shape[:-1], shape[1:]),
[activation] * (len(shape) - 2) + [th.nn.Identity])
for layer in [th.nn.Linear(*io), a()]])
self.optimizer = th.optim.Adam(self.net.parameters(), lr=lr)
# Initialize weights uniformly, so that for the policy net all actions have approximately the same
# probability in the beginning
for module in self.modules():
if isinstance(module, nn.Linear):
nn.init.uniform_(module.weight, a=-0.1, b=0.1)
if module.bias is not None:
nn.init.uniform_(module.bias, a=-0.1, b=0.1)
def save_model(self, path):
th.save(self.net, f"{path}/{self.__class__.__name__}_model.pth")
def save_model_parameters(self, path):
th.save(self.net.state_dict(), f"{path}/{self.__class__.__name__}_model_parameters.pth")
def load_model_parameters(self, path):
self.net.load_state_dict(th.load(path))
self.net.eval()
class ValueNet(Net):
def __init__(self, obs_dim, hidden_sizes=[64, 64], activation=th.nn.ReLU, lr=1e-3):
super().__init__([obs_dim] + hidden_sizes + [1], activation, lr)
def forward(self, obs): return self.net(obs)
def loss(self, states, returns): return ((returns - self(states)) ** 2).mean()
class PolicyNet(Net):
def __init__(self, obs_dim, act_dim, hidden_sizes=[64, 64], activation=th.nn.Tanh, lr=3e-4):
super().__init__([obs_dim] + hidden_sizes + [act_dim], activation, lr)
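# The policy is a categorical distribution over the discrete actions, parameterized by the network's logits.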
self.distribution = lambda obs: th.distributions.Categorical(logits=self.net(obs))
def forward(self, obs, act=None, det=False):
"""Given an observation: Returns policy distribution and probablilty for a given action
or Returns a sampled action and its corresponding probablilty"""
pi = self.distribution(obs)
if act is not None: return pi, pi.log_prob(act)
act = self.net(obs).argmax() if det else pi.sample() # sample from the learned distribution
return act, pi.log_prob(act)
def loss(self, states, actions, advantages):
_, logp = self.forward(states, actions)
loss = -(logp * advantages).mean()
return loss
class PolicyGradient:
""" Autonomous agent using vanilla policy gradient. """
def __init__(self, env, seed=42, gamma=0.99, agent_id=0, act_dim=None, obs_dim=None):
self.env = env
self.gamma = gamma # Setup env and discount
th.manual_seed(seed)
np.random.seed(seed) # Seed Torch, numpy and gym
# Keep track of previous rewards and performed steps to calculate the mean Return metric
self._episode, self.ep_returns, self.num_steps = [], deque(maxlen=100), 0
# Get observation and action shapes
if not obs_dim:
obs_size = env.observation_space.shape if len(env.state.entities.by_name("Agents")) == 1 \
else env.observation_space[agent_id].shape # Single agent case vs. multi-agent case
obs_dim = np.prod(obs_size)
if not act_dim:
act_dim = env.action_space[agent_id].n
self.vf = ValueNet(obs_dim) # Setup Value Network (Critic)
self.pi = PolicyNet(obs_dim, act_dim) # Setup Policy Network (Actor)
def step(self, obs):
""" Given an observation, get action and probs from policy and values from critic"""
with th.no_grad():
(a, _), v = self.pi(obs), self.vf(obs)
self._episode.append((None, None, None, v))
return a.numpy()
def policy(self, obs, det=True):
return self.pi(obs, det=det)[0].numpy()
def finish_episode(self):
"""Process self._episode & reset self.env, Returns (s,a,G,V)-Tuple and new inital state"""
s, a, r, v = (np.array(e) for e in zip(*self._episode)) # Get trajectories from rollout
self.ep_returns.append(sum(r))
self._episode = [] # Add episode return to buffer & reset
return s, a, r, v # state, action, Return, Value Tensors
def train(self, states, actions, returns, advantages): # Update policy weights
self.pi.optimizer.zero_grad()
self.vf.optimizer.zero_grad() # Reset optimizer
states = states.flatten(1, -1) # Reduce dimensionality to rollout_dim x input_dim
policy_loss = self.pi.loss(states, actions, advantages) # Calculate Policy loss
policy_loss.backward()
self.pi.optimizer.step() # Apply Policy loss
value_loss = self.vf.loss(states, returns) # Calculate Value loss
value_loss.backward()
self.vf.optimizer.step() # Apply Value loss

View File

@@ -2,7 +2,7 @@ import torch
from typing import Union, List, Dict
import numpy as np
from torch.distributions import Categorical
from marl_factory_grid.algorithms.marl.memory import MARLActorCriticMemory
from marl_factory_grid.algorithms.rl.memory import MARLActorCriticMemory
from marl_factory_grid.algorithms.utils import add_env_props, instantiate_class
from pathlib import Path
import pandas as pd

View File

@@ -1,5 +1,5 @@
agent:
classname: marl_factory_grid.algorithms.marl.networks.RecurrentAC
classname: marl_factory_grid.algorithms.rl.networks.RecurrentAC
n_agents: 2
obs_emb_size: 96
action_emb_size: 16
@@ -18,7 +18,7 @@ env:
eval_render: True
save_and_log: True
record: False
method: marl_factory_grid.algorithms.marl.LoopSEAC
method: marl_factory_grid.algorithms.rl.LoopSEAC
algorithm:
gamma: 0.99
entropy_coef: 0.01

View File

@@ -1,5 +1,5 @@
agent:
classname: marl_factory_grid.algorithms.marl.networks.RecurrentAC
classname: marl_factory_grid.algorithms.rl.networks.RecurrentAC
n_agents: 2
obs_emb_size: 96
action_emb_size: 16
@@ -18,7 +18,7 @@ env:
eval_render: True
save_and_log: True
record: False
method: marl_factory_grid.algorithms.marl.LoopSEAC
method: marl_factory_grid.algorithms.rl.LoopSEAC
algorithm:
gamma: 0.99
entropy_coef: 0.01

View File

@@ -1,5 +1,5 @@
agent:
classname: marl_factory_grid.algorithms.marl.networks.RecurrentAC
classname: marl_factory_grid.algorithms.rl.networks.RecurrentAC
n_agents: 1
obs_emb_size: 96
action_emb_size: 16
@@ -18,7 +18,7 @@ env:
eval_render: True
save_and_log: True
record: False
method: marl_factory_grid.algorithms.marl.LoopSEAC
method: marl_factory_grid.algorithms.rl.LoopSEAC
algorithm:
gamma: 0.99
entropy_coef: 0.01

View File

@@ -1,5 +1,5 @@
agent:
classname: marl_factory_grid.algorithms.marl.networks.RecurrentAC
classname: marl_factory_grid.algorithms.rl.networks.RecurrentAC
n_agents: 1
obs_emb_size: 96
action_emb_size: 16
@@ -18,7 +18,7 @@ env:
eval_render: True
save_and_log: False
record: False
method: marl_factory_grid.algorithms.marl.LoopSEAC
method: marl_factory_grid.algorithms.rl.LoopSEAC
algorithm:
gamma: 0.99
entropy_coef: 0.01

View File

@@ -0,0 +1,37 @@
class Names:
ENV = 'env'
ENV_NAME = 'env_name'
N_AGENTS = 'n_agents'
ALGORITHM = 'algorithm'
MAX_STEPS = 'max_steps'
N_STEPS = 'n_steps'
TRAIN_RENDER = 'train_render'
EVAL_RENDER = 'eval_render'
AGENT = 'Agent'
PILE_OBSERVABILITY = 'pile-observability'
PILE_ORDER = 'pile-order'
ALL = 'all'
FIXED = 'fixed'
AGENTS = 'agents'
DYNAMIC = 'dynamic'
SMART = 'smart'
DIRT_PILES = 'DirtPiles'
COIN_PILES = 'CoinPiles'
AUXILIARY_PILES = "auxiliary_piles"
DOORS = 'Doors'
DOOR = 'Door'
GAMMA = 'gamma'
ADVANTAGE = 'advantage'
REINFORCE = 'reinforce'
ADVANTAGE_AC = "Advantage-AC"
TD_ADVANTAGE_AC = "TD-Advantage-AC"
CHUNK_EPISODE = 'chunk-episode'
POS_POINTER = 'pos_pointer'
POSITIONS = 'positions'
SAVE_AND_LOG = 'save_and_log'
NOOP = 'Noop'
USE_DOOR = 'use_door'
PILE_ALL_DONE = 'pile_all_done'
SINGLE = 'single'
DISTRIBUTED = 'distributed'
SHARED = 'shared'

View File

@@ -1,9 +1,9 @@
import torch
from marl_factory_grid.algorithms.marl.base_ac import BaseActorCritic, nms
from marl_factory_grid.algorithms.rl.base_ac import BaseActorCritic, nms
from marl_factory_grid.algorithms.utils import instantiate_class
from pathlib import Path
from natsort import natsorted
from marl_factory_grid.algorithms.marl.memory import MARLActorCriticMemory
from marl_factory_grid.algorithms.rl.memory import MARLActorCriticMemory
class LoopIAC(BaseActorCritic):

View File

@@ -1,6 +1,6 @@
from marl_factory_grid.algorithms.marl.base_ac import Names as nms
from marl_factory_grid.algorithms.marl.snac import LoopSNAC
from marl_factory_grid.algorithms.marl.memory import MARLActorCriticMemory
from marl_factory_grid.algorithms.rl.base_ac import Names as nms
from marl_factory_grid.algorithms.rl.snac import LoopSNAC
from marl_factory_grid.algorithms.rl.memory import MARLActorCriticMemory
import torch
from torch.distributions import Categorical
from marl_factory_grid.algorithms.utils import instantiate_class

View File

@@ -1,8 +1,8 @@
import torch
from torch.distributions import Categorical
from marl_factory_grid.algorithms.marl.iac import LoopIAC
from marl_factory_grid.algorithms.marl.base_ac import nms
from marl_factory_grid.algorithms.marl.memory import MARLActorCriticMemory
from marl_factory_grid.algorithms.rl.iac import LoopIAC
from marl_factory_grid.algorithms.rl.base_ac import nms
from marl_factory_grid.algorithms.rl.memory import MARLActorCriticMemory
class LoopSEAC(LoopIAC):

View File

@@ -1,5 +1,5 @@
from marl_factory_grid.algorithms.marl.base_ac import BaseActorCritic
from marl_factory_grid.algorithms.marl.base_ac import nms
from marl_factory_grid.algorithms.rl.base_ac import BaseActorCritic
from marl_factory_grid.algorithms.rl.base_ac import nms
import torch
from torch.distributions import Categorical
from pathlib import Path

View File

@@ -0,0 +1,337 @@
import copy
from typing import List
import numpy as np
import torch
from marl_factory_grid.algorithms.rl.constants import Names as nms
from marl_factory_grid.algorithms.rl.base_a2c import cumulate_discount
def _as_torch(x):
""" Helper function to convert different list types to a torch tensor """
if isinstance(x, np.ndarray):
return torch.from_numpy(x)
elif isinstance(x, List):
return torch.tensor(x)
elif isinstance(x, (int, float)):
return torch.tensor([x])
return x
def transform_observations(env, ordered_coins, target_coin, cfg, n_agents):
""" Function that extracts local observations from global state
Requires that agents have observations -CoinPiles and -Self (cf. environment configs) """
agents_positions = get_agents_positions(env, n_agents)
coin_observability_is_all = cfg[nms.ALGORITHM][nms.PILE_OBSERVABILITY] == nms.ALL
if coin_observability_is_all:
trans_obs = [torch.zeros(2 + 2 * len(ordered_coins[0])) for _ in range(len(agents_positions))]
else:
# Only show current target pile
trans_obs = [torch.zeros(4) for _ in range(len(agents_positions))]
for i, pos in enumerate(agents_positions):
agent_x, agent_y = pos[0], pos[1]
trans_obs[i][0] = agent_x
trans_obs[i][1] = agent_y
idx = 2
if coin_observability_is_all:
for coin_pos in ordered_coins[i]:
trans_obs[i][idx] = coin_pos[0]
trans_obs[i][idx + 1] = coin_pos[1]
idx += 2
else:
trans_obs[i][2] = ordered_coins[i][target_coin[i]][0]
trans_obs[i][3] = ordered_coins[i][target_coin[i]][1]
return trans_obs
def get_all_observations(env, cfg, n_agents):
""" Helper function that returns all possible agent observations """
coins_positions = [env.state.entities[nms.COIN_PILES][pile_idx].pos for pile_idx in
range(len(env.state.entities[nms.COIN_PILES]))]
if cfg[nms.ALGORITHM][nms.PILE_OBSERVABILITY] == nms.ALL:
obs = [torch.zeros(2 + 2 * len(coins_positions))]
observations = [[]]
# Fill in pile positions
idx = 2
for pile_pos in coins_positions:
obs[0][idx] = pile_pos[0]
obs[0][idx + 1] = pile_pos[1]
idx += 2
else:
# Have multiple observation layers of the map for each coin pile one
obs = [torch.zeros(4) for _ in range(n_agents) for _ in coins_positions]
observations = [[] for _ in coins_positions]
for idx, pile_pos in enumerate(coins_positions):
obs[idx][2] = pile_pos[0]
obs[idx][3] = pile_pos[1]
valid_agent_positions = env.state.entities.floorlist
for idx, pos in enumerate(valid_agent_positions):
for obs_layer in range(len(obs)):
observation = copy.deepcopy(obs[obs_layer])
observation[0] = pos[0]
observation[1] = pos[1]
observations[obs_layer].append(observation)
return observations
def get_coin_piles_positions(env):
""" Get positions of coin piles on the map """
return [env.state.entities[nms.COIN_PILES][pile_idx].pos for pile_idx in
range(len(env.state.entities[nms.COIN_PILES]))]
def get_agents_positions(env, n_agents):
""" Get positions of agents on the map """
return [env.state.moving_entites[agent_idx].pos for agent_idx in range(n_agents)]
def get_ordered_coin_piles(env, collected_coins, cfg, n_agents):
""" This function determines in which order the agents should collect the coin piles
Each agent can have its individual pile order """
ordered_coin_piles = [[] for _ in range(n_agents)]
coin_piles_positions = get_coin_piles_positions(env)
agents_positions = get_agents_positions(env, n_agents)
for agent_idx in range(n_agents):
if cfg[nms.ALGORITHM][nms.PILE_ORDER] in [nms.FIXED, nms.AGENTS]:
ordered_coin_piles[agent_idx] = coin_piles_positions
elif cfg[nms.ALGORITHM][nms.PILE_ORDER] in [nms.SMART, nms.DYNAMIC]:
# Calculate distances for remaining unvisited coin piles
remaining_target_piles = [pos for pos, value in collected_coins[agent_idx].items() if not value]
pile_distances = {pos: 0 for pos in remaining_target_piles}
agent_pos = agents_positions[agent_idx]
for pos in remaining_target_piles:
pile_distances[pos] = np.abs(agent_pos[0] - pos[0]) + np.abs(agent_pos[1] - pos[1])
if cfg[nms.ALGORITHM][nms.PILE_ORDER] == nms.SMART:
# Check if there is an agent on the direct path to any of the remaining coin piles
for pile_pos in remaining_target_piles:
for other_pos in agents_positions:
if other_pos != agent_pos:
if agent_pos[0] == other_pos[0] == pile_pos[0] or agent_pos[1] == other_pos[1] == pile_pos[
1]:
# Get the line between the agent and the target
path = bresenham(agent_pos[0], agent_pos[1], pile_pos[0], pile_pos[1])
# Check if the entity lies on the path between the agent and the target
if other_pos in path:
pile_distances[pile_pos] += np.abs(agent_pos[0] - other_pos[0]) + np.abs(
agent_pos[1] - other_pos[1])
sorted_pile_distances = dict(sorted(pile_distances.items(), key=lambda item: item[1]))
# Insert already visited coin piles
ordered_coin_piles[agent_idx] = [pos for pos in coin_piles_positions if pos not in remaining_target_piles]
# Fill up with sorted positions
for pos in sorted_pile_distances.keys():
ordered_coin_piles[agent_idx].append(pos)
else:
print("Not a valid pile order option.")
exit()
return ordered_coin_piles
def bresenham(x0, y0, x1, y1):
"""Bresenham's line algorithm to get the coordinates of a line between two points."""
dx = np.abs(x1 - x0)
dy = np.abs(y1 - y0)
sx = 1 if x0 < x1 else -1
sy = 1 if y0 < y1 else -1
err = dx - dy
coordinates = []
while True:
coordinates.append((x0, y0))
if x0 == x1 and y0 == y1:
break
e2 = 2 * err
if e2 > -dy:
err -= dy
x0 += sx
if e2 < dx:
err += dx
y0 += sy
return coordinates
def update_ordered_coin_piles(agent_idx, collected_coin_piles, ordered_coin_piles, env, cfg, n_agents):
""" Update the order of the remaining coin piles """
# Only update ordered_coin_pile for agent that reached its target pile
updated_ordered_coin_piles = get_ordered_coin_piles(env, collected_coin_piles, cfg, n_agents)
for i in range(len(ordered_coin_piles[agent_idx])):
ordered_coin_piles[agent_idx][i] = updated_ordered_coin_piles[agent_idx][i]
def distribute_indices(env, cfg, n_agents):
""" Distribute coin piles evenly among the agents """
indices = []
n_coin_piles = len(get_coin_piles_positions(env))
agents_positions = get_agents_positions(env, n_agents)
if n_coin_piles == 1 or cfg[nms.ALGORITHM][nms.PILE_ORDER] in [nms.FIXED, nms.DYNAMIC, nms.SMART]:
indices = [[0] for _ in range(n_agents)]
else:
base_count = n_coin_piles // n_agents
remainder = n_coin_piles % n_agents
start_index = 0
for i in range(n_agents):
# Add an extra index to the first 'remainder' objects
end_index = start_index + base_count + (1 if i < remainder else 0)
indices.append(list(range(start_index, end_index)))
start_index = end_index
# Static form: auxiliary pile, primary pile, auxiliary pile, ...
# -> Starting with index 0 even piles are auxiliary piles, odd piles are primary piles
if cfg[nms.ALGORITHM][nms.AUXILIARY_PILES] and nms.DOORS in env.state.entities.keys():
door_positions = [door.pos for door in env.state.entities[nms.DOORS]]
distances = {door_pos: [] for door_pos in door_positions}
# Calculate distance of every agent to every door
for door_pos in door_positions:
for agent_pos in agents_positions:
distances[door_pos].append(np.abs(door_pos[0] - agent_pos[0]) + np.abs(door_pos[1] - agent_pos[1]))
def duplicate_indices(lst, item):
return [i for i, x in enumerate(lst) if x == item]
# Get agent indices of agents with same distance to door
affected_agents = {door_pos: {} for door_pos in door_positions}
for door_pos in distances.keys():
dist = distances[door_pos]
dist_set = set(dist)
for d in dist_set:
affected_agents[door_pos][str(d)] = duplicate_indices(dist, d)
updated_indices = []
for door_pos, agent_distances in affected_agents.items():
if len(agent_distances) == 0:
# Remove auxiliary piles for all agents
# (In config, we defined every pile with an even numbered index to be an auxiliary pile)
updated_indices = [[ele for ele in lst if ele % 2 != 0] for lst in indices]
else:
for distance, agent_indices in agent_distances.items():
# For each distance group, pick one random agent to keep the auxiliary pile
# selected_agent = np.random.choice(agent_indices)
selected_agent = 0
for agent_idx in agent_indices:
if agent_idx == selected_agent:
updated_indices.append(indices[agent_idx])
else:
updated_indices.append([ele for ele in indices[agent_idx] if ele % 2 != 0])
indices = updated_indices
return indices
def update_target_pile(env, agent_idx, target_pile, indices, cfg):
""" Get the next target pile for a given agent """
if cfg[nms.ALGORITHM][nms.PILE_ORDER] in [nms.FIXED, nms.DYNAMIC, nms.SMART]:
if target_pile[agent_idx] + 1 < len(get_coin_piles_positions(env)):
target_pile[agent_idx] += 1
else:
target_pile[agent_idx] = 0
else:
if target_pile[agent_idx] + 1 in indices[agent_idx]:
target_pile[agent_idx] += 1
def is_door_close(env, agent_idx):
""" Checks whether the agent is close to a door """
neighbourhood = [y for x in env.state.entities.neighboring_positions(env.state[nms.AGENT][agent_idx].pos)
for y in env.state.entities.pos_dict[x] if nms.DOOR in y.name]
if neighbourhood:
return neighbourhood[0]
def get_all_collected_coin_piles(coin_piles_positions, collected_coin_piles, n_agents):
""" Returns all coin piles collected by any agent """
meta_collected_coin_piles = {pos: False for pos in coin_piles_positions}
for agent_idx in range(n_agents):
for (pos, collected) in collected_coin_piles[agent_idx].items():
if collected:
meta_collected_coin_piles[pos] = True
return meta_collected_coin_piles
def handle_finished_episode(obs, agents, cfg):
""" Finish up episode, calculate advantages and perform policy net and value net updates"""
with torch.inference_mode(False):
for ag_i, agent in enumerate(agents):
# Get states, actions, rewards and values from rollout buffer
data = agent.finish_episode()
# Chunk episode data, such that there will be no memory failure for very long episodes
chunks = split_into_chunks(data, cfg)
for (s, a, R, V) in chunks:
# Calculate discounted return and advantage
G = cumulate_discount(R, cfg[nms.ALGORITHM][nms.GAMMA])
if cfg[nms.ALGORITHM][nms.ADVANTAGE] == nms.REINFORCE:
A = G
elif cfg[nms.ALGORITHM][nms.ADVANTAGE] == nms.ADVANTAGE_AC:
A = G - V # Actor-Critic Advantages
elif cfg[nms.ALGORITHM][nms.ADVANTAGE] == nms.TD_ADVANTAGE_AC:
with torch.no_grad():
A = R + cfg[nms.ALGORITHM][nms.GAMMA] * np.append(V[1:], agent.vf(
_as_torch(obs[ag_i]).view(-1).to(
torch.float32)).numpy()) - V # TD Actor-Critic Advantages
else:
print("Not a valid advantage option.")
exit()
rollout = (torch.tensor(x.copy()).to(torch.float32) for x in (s, a, G, A))
# Update policy and value net of agent with experience from rollout buffer
agent.train(*rollout)
def split_into_chunks(data_tuple, cfg):
""" Chunks episode data into approximately equal sized chunks to prevent system memory failure from overload """
result = [data_tuple]
chunk_size = cfg[nms.ALGORITHM][nms.CHUNK_EPISODE]
if chunk_size > 0:
# Get the maximum length of the lists in the tuple to handle different lengths
max_length = max(len(lst) for lst in data_tuple)
# Prepare a list to store the result
result = []
# Split each list into chunks and add them to the result
for i in range(0, max_length, chunk_size):
# Create a sublist containing the ith chunk from each list
sublist = [lst[i:i + chunk_size] for lst in data_tuple if i < len(lst)]
result.append(sublist)
return result
def set_agents_spawnpoints(env, n_agents):
""" Tell environment where the agents should spawn in the next episode """
for agent_idx in range(n_agents):
agent_name = list(env.state.agents_conf.keys())[agent_idx]
current_pos_pointer = env.state.agents_conf[agent_name][nms.POS_POINTER]
# Making the reset dependent on the number of spawnpoints and not the number of coinpiles allows
# for having multiple subsequent spawnpoints with the same target pile
if current_pos_pointer == len(env.state.agents_conf[agent_name][nms.POSITIONS]) - 1:
env.state.agents_conf[agent_name][nms.POS_POINTER] = 0
else:
env.state.agents_conf[agent_name][nms.POS_POINTER] += 1
def save_configs(results_path, cfg, factory_conf, eval_factory_conf):
""" Save configurations for logging purposes """
with open(f"{results_path}/MARL_config.txt", "w") as txt_file:
txt_file.write(str(cfg))
with open(f"{results_path}/train_env_config.txt", "w") as txt_file:
txt_file.write(str(factory_conf))
with open(f"{results_path}/eval_env_config.txt", "w") as txt_file:
txt_file.write(str(eval_factory_conf))
def save_agent_models(results_path, agents):
""" Save model parameters after training """
for idx, agent in enumerate(agents):
agent.pi.save_model_parameters(results_path)
agent.vf.save_model_parameters(results_path)

View File

@@ -0,0 +1,40 @@
from marl_factory_grid.algorithms.static.TSP_base_agent import TSPBaseAgent
from marl_factory_grid.modules.coins import constants as c
from marl_factory_grid.environment import constants as e
future_planning = 7
class TSPCoinAgent(TSPBaseAgent):
def __init__(self, *args, **kwargs):
"""
Initializes a TSPCoinAgent that aims to collect coins in the environment.
"""
super(TSPCoinAgent, self).__init__(*args, **kwargs)
self.fallback_action = e.NOOP
def predict(self, *_, **__):
"""
Predicts the next action based on the presence of coins in the environment.
:return: Predicted action.
:rtype: int
"""
coin_at_position = self._env.state[c.COIN].by_pos(self.state.pos)
if coin_at_position:
# A coin pile is at the agent's current position, so collect it
action = c.COLLECT
elif door := self._door_is_close(self._env.state):
action = self._use_door_or_move(door, c.COIN)
else:
action = self._predict_move(c.COIN)
self.action_list.append(action)
# Translate the action_object to an integer to have the same output as any other model
try:
action_obj = next(action_i for action_i, a in enumerate(self.state.actions) if a.name == action)
except (StopIteration, UnboundLocalError):
print('Will not happen')
raise EnvironmentError
return action_obj

View File

@@ -40,10 +40,27 @@ Agents:
# - DropOffLocations
# - Maintainers
# Clones: 0
Target test agent:
# Target test agent:
# Actions:
# - Noop
# - Charge
# - DoorUse
# - Move8
# Observations:
# - Combined:
# - Other
# - Walls
# - GlobalPosition
# - Battery
# - Destinations
# - Doors
# - Maintainers
# Clones: 1
Coin test agent:
Actions:
- Noop
- Charge
- Collect
- DoorUse
- Move8
Observations:
@@ -52,6 +69,8 @@ Agents:
- Walls
- GlobalPosition
- Battery
- ChargePods
- CoinPiles
- Destinations
- Doors
- Maintainers
@@ -67,11 +86,18 @@ Entities:
Destinations:
coords_or_quantity: 1
spawn_mode: GROUPED
DirtPiles:
# DirtPiles:
# coords_or_quantity: 10
# initial_amount: 2
# clean_amount: 1
# dirt_spawn_r_var: 0.1
# max_global_amount: 20
# max_local_amount: 5
CoinPiles:
coords_or_quantity: 10
initial_amount: 2
clean_amount: 1
dirt_spawn_r_var: 0.1
collect_amount: 1
coin_spawn_r_var: 0.1
max_global_amount: 20
max_local_amount: 5
Doors:
@@ -90,24 +116,26 @@ Entities:
General:
env_seed: 69
individual_rewards: true
level_name: quadrant
level_name: two_rooms
pomdp_r: 3
verbose: false
tests: false
Rules:
# Environment Dynamics
EntitiesSmearDirtOnMove:
smear_ratio: 0.2
# EntitiesSmearDirtOnMove:
# smear_ratio: 0.2
DoorAutoClose:
close_frequency: 10
MoveMaintainers:
# Respawn Stuff
RespawnDirt:
respawn_freq: 15
# RespawnDirt:
# respawn_freq: 15
RespawnItems:
respawn_freq: 15
RespawnCoins:
respawn_freq: 15
# Utilities
WatchCollisions:

View File

@@ -81,7 +81,7 @@ class Factory(gym.Env):
def __init__(self, config_file: Union[str, PathLike], custom_modules_path: Union[None, PathLike] = None,
custom_level_path: Union[None, PathLike] = None):
"""
Initializes the marl-factory-grid as Gym environment.
Initializes the rl-factory-grid as Gym environment.
:param config_file: Path to the configuration file.
:type config_file: Union[str, PathLike]
@@ -271,15 +271,37 @@ class Factory(gym.Env):
if not self._renderer: # lazy init
from marl_factory_grid.utils.renderer import Renderer
global Renderer
self._renderer = Renderer(self.map.level_shape, view_radius=self.conf.pomdp_r, fps=10)
self._renderer = Renderer(self.map.level_shape, view_radius=self.conf.pomdp_r, fps=10)
render_entities = self.state.entities.render()
# Hide entities where certain conditions are met (e.g., amount <= 0 for DirtPiles)
render_entities = self.filter_entities(render_entities)
# Mask entities based on dynamic conditions instead of hardcoding level-specific logic
if self.conf['General']['level_name'] == 'two_rooms':
render_entities = self.mask_entities(render_entities)
if self.conf.pomdp_r:
for render_entity in render_entities:
if render_entity.name == c.AGENT:
render_entity.aux = self.obs_builder.curr_lightmaps[render_entity.real_name]
return self._renderer.render(render_entities, self._recorder)
def filter_entities(self, entities):
""" Generalized method to filter out entities that shouldn't be rendered. """
if 'DirtPiles' in self.state.entities.keys():
entities = [entity for entity in entities if not (entity.name == 'DirtPiles' and entity.amount <= 0)]
return entities
def mask_entities(self, entities):
""" Generalized method to mask entities based on dynamic conditions. """
for entity in entities:
if entity.name == 'CoinPiles':
entity.mask = 'Destinations'
entity.mask_value = 1
return entities
def set_recorder(self, recorder):
self._recorder = recorder
@@ -298,7 +320,7 @@ class Factory(gym.Env):
summary.update({entity_group.name.lower(): entity_group.summarize_states()})
# TODO Section End ########
for key in list(summary.keys()):
if key not in ['step', 'walls', 'doors', 'agents', 'items', 'dirtPiles', 'batteries']:
if key not in ['step', 'walls', 'doors', 'agents', 'items', 'dirtPiles', 'batteries', 'coinPiles']:
del summary[key]
return summary

View File

@@ -168,14 +168,25 @@ class SpawnEntity(Rule):
return results
def _get_position(spawn_rule, positions, empty_positions, positions_pointer):
"""
Internal usage, selects positions based on rule.
"""
if spawn_rule and spawn_rule == "random":
position = random.choice(([x for x in positions if x in empty_positions]))
elif spawn_rule and spawn_rule == "order":
position = ([x for x in positions if x in empty_positions])[positions_pointer]
else:
position = h.get_first([x for x in positions if x in empty_positions])
return position
class SpawnAgents(Rule):
def __init__(self):
"""
TODO
:return:
Finds suitable spawn positions according to the given spawn rule, creates agents with these positions and adds
them to state.agents.
"""
super().__init__()
pass
@@ -183,8 +194,9 @@ class SpawnAgents(Rule):
def on_reset(self, state):
spawn_rule = None
for rule in state.rules.rules:
if isinstance(rule, marl_factory_grid.environment.rules.AgentSpawnRule):
if isinstance(rule, AgentSpawnRule):
spawn_rule = rule.spawn_rule
break
if not hasattr(state, 'agent_spawn_positions'):
state.agent_spawn_positions = []
@@ -200,7 +212,7 @@ class SpawnAgents(Rule):
other = agent_conf['other'].copy()
positions_pointer = agent_conf['pos_pointer']
if position := self._get_position(spawn_rule, positions, empty_positions, positions_pointer):
if position := _get_position(spawn_rule, positions, empty_positions, positions_pointer):
assert state.check_pos_validity(position), 'smth went wrong....'
agents.add_item(Agent(actions, observations, position, str_ident=agent_name, **other))
state.agent_spawn_positions.append(position)
@@ -213,21 +225,13 @@ class SpawnAgents(Rule):
state.agent_spawn_positions.append(chosen_position)
return []
def _get_position(self, spawn_rule, positions, empty_positions, positions_pointer):
if spawn_rule and spawn_rule == "random":
position = random.choice(([x for x in positions if x in empty_positions]))
elif spawn_rule and spawn_rule == "order":
position = ([x for x in positions if x in empty_positions])[positions_pointer]
else:
position = h.get_first([x for x in positions if x in empty_positions])
return position
class AgentSpawnRule(Rule):
def __init__(self, spawn_rule):
self.spawn_rule = spawn_rule
super().__init__()
class DoneAtMaxStepsReached(Rule):
def __init__(self, max_steps: int = 500):

View File

@@ -1,4 +1,5 @@
import ast
import random
from marl_factory_grid.environment import constants as c
from marl_factory_grid.environment.groups.collection import Collection
from marl_factory_grid.modules.clean_up.entitites import DirtPile
@@ -33,7 +34,7 @@ class DirtPiles(Collection):
return sum([dirt.amount for dirt in self])
def __init__(self, *args, max_local_amount=5, clean_amount=1, max_global_amount: int = 20, coords_or_quantity=10,
initial_amount=2, amount_var=0.2, n_var=0.2, **kwargs):
initial_amount=2, amount_var=0.2, n_var=0.2, randomize=False, randomization_seed=0, **kwargs):
"""
A Collection of dirt piles that triggers their spawn.
@@ -67,6 +68,8 @@ class DirtPiles(Collection):
self.max_local_amount = max_local_amount
self.coords_or_quantity = coords_or_quantity
self.initial_amount = initial_amount
self.randomize = randomize
self.randomized_selection = None
def trigger_spawn(self, state, coords_or_quantity=0, amount=0, ignore_blocking=False) -> [Result]:
if ignore_blocking:
@@ -85,7 +88,17 @@ class DirtPiles(Collection):
else:
n_new = [pos for pos in coords_or_quantity]
amounts = [amount if amount else (self.initial_amount ) # removed rng amount
if self.randomize:
if not self.randomized_selection:
n_new_prime = []
for n in n_new:
if random.random() < 0.5:
n_new_prime.append(n)
n_new = n_new_prime
self.randomized_selection = n_new
else:
n_new = self.randomized_selection
amounts = [amount if amount else self.initial_amount # removed rng amount
for _ in range(len(n_new))]
spawn_counter = 0

View File

@@ -0,0 +1,4 @@
from .actions import Collect
from .entitites import CoinPile
from .groups import CoinPiles
from .rules import DoneOnAllCoinsCollected

View File

@@ -0,0 +1,36 @@
from typing import Union
from marl_factory_grid.environment.actions import Action
from marl_factory_grid.utils.results import ActionResult
from marl_factory_grid.modules.coins import constants as d
from marl_factory_grid.environment import constants as c
class Collect(Action):
def __init__(self):
"""
Attempts to reduce the coin amount at the entity's position. Fails if no coin is found at the agent's position.
"""
super().__init__(d.COLLECT, d.REWARD_COLLECT_VALID, d.REWARD_COLLECT_FAIL)
def do(self, entity, state) -> Union[None, ActionResult]:
if coin_pile := next((x for x in state.entities.pos_dict[entity.pos] if "coin" in x.name.lower()), None):
new_coin_pile_amount = coin_pile.amount - state[d.COIN].collect_amount
if new_coin_pile_amount <= 0:
state[d.COIN].delete_env_object(coin_pile)
else:
coin_pile.set_new_amount(max(new_coin_pile_amount, c.VALUE_FREE_CELL))
valid = c.VALID
print_str = f'{entity.name} did just collect some coins at {entity.pos}.'
state.print(print_str)
else:
valid = c.NOT_VALID
print_str = f'{entity.name} just tried to collect some coins at {entity.pos}, but failed.'
state.print(print_str)
return self.get_result(valid, entity)

Binary file not shown.


View File

@@ -0,0 +1,11 @@
COIN = 'CoinPiles'
COLLECT = 'do_collect_action'
COLLECT_VALID = 'collect_valid'
COLLECT_FAIL = 'collect_fail'
COLLECT_ALL = 'all_collected'
REWARD_COLLECT_VALID: float = 0.5
REWARD_COLLECT_FAIL: float = -0.1
REWARD_COLLECT_ALL: float = 4.5

View File

@@ -0,0 +1,46 @@
from marl_factory_grid.environment.entity.entity import Entity
from marl_factory_grid.utils.utility_classes import RenderEntity
from marl_factory_grid.modules.coins import constants as d
class CoinPile(Entity):
@property
def amount(self):
"""
Internal Usage
"""
return self._amount
@property
def encoding(self):
return self._amount
def __init__(self, *args, amount=2, max_local_amount=5, **kwargs):
"""
Represents a pile of coins at a specific position in the environment that agents can interact with. Agents can
collect the coin pile or, depending on activated rules, interact with it in different ways.
:param amount: The amount of coins in the pile.
:type amount: float
:param max_local_amount: The maximum amount of coins allowed in a single pile at one position.
:type max_local_amount: float
"""
super(CoinPile, self).__init__(*args, **kwargs)
self._amount = amount
self.max_local_amount = max_local_amount
def set_new_amount(self, amount):
"""
Internal Usage
"""
self._amount = min(amount, self.max_local_amount)
def summarize_state(self):
state_dict = super().summarize_state()
state_dict.update(amount=float(self.amount))
return state_dict
def render(self):
return RenderEntity(d.COIN, self.pos, min(0.15 + self.amount, 1.5), 'scale')

View File

@@ -0,0 +1,108 @@
import ast
from marl_factory_grid.environment import constants as c
from marl_factory_grid.environment.groups.collection import Collection
from marl_factory_grid.modules.coins.entitites import CoinPile
from marl_factory_grid.utils.results import Result
from marl_factory_grid.utils import helpers as h
class CoinPiles(Collection):
_entity = CoinPile
@property
def var_is_blocking_light(self):
return False
@property
def var_can_collide(self):
return False
@property
def var_can_move(self):
return False
@property
def var_has_position(self):
return True
@property
def global_amount(self) -> float:
"""
Internal Usage
"""
return sum([coin.amount for coin in self])
def __init__(self, *args, max_local_amount=5, collect_amount=1, max_global_amount: int = 20, coords_or_quantity=10,
initial_amount=2, amount_var=0.2, n_var=0.2, **kwargs):
"""
A Collection of coin piles that triggers their spawn.
:param max_local_amount: The maximum amount of coins allowed in a single pile at one position.
:type max_local_amount: int
:param collect_amount: The amount of coins removed by a single collect action.
:type collect_amount: int
:param max_global_amount: The maximum total amount of coins allowed in the environment.
:type max_global_amount: int
:param coords_or_quantity: Determines whether to use coordinates or quantity when triggering coin pile spawn.
:type coords_or_quantity: Union[Tuple[int, int], int]
:param initial_amount: The initial amount of coin in each newly spawned pile.
:type initial_amount: int
:param amount_var: The variability in the initial amount of coin in each pile.
:type amount_var: float
:param n_var: The variability in the number of new coin piles spawned.
:type n_var: float
"""
super(CoinPiles, self).__init__(*args, **kwargs)
self.amount_var = amount_var
self.n_var = n_var
self.collect_amount = collect_amount
self.max_global_amount = max_global_amount
self.max_local_amount = max_local_amount
self.coords_or_quantity = coords_or_quantity
self.initial_amount = initial_amount
def trigger_spawn(self, state, coords_or_quantity=0, amount=0, ignore_blocking=False) -> [Result]:
if ignore_blocking:
print("##########################################")
print("Blocking should not be ignored for this Entity")
print("Exiting....")
exit()
coords_or_quantity = coords_or_quantity if coords_or_quantity else self.coords_or_quantity
if isinstance(coords_or_quantity, int):
n_new = int(abs(coords_or_quantity + (state.rng.uniform(-self.n_var, self.n_var))))
n_new = state.get_n_random_free_positions(n_new)
else:
coords_or_quantity = ast.literal_eval(coords_or_quantity)
if isinstance(coords_or_quantity[0], int):
n_new = [coords_or_quantity]
else:
n_new = [pos for pos in coords_or_quantity]
amounts = [amount if amount else (self.initial_amount ) # removed rng amount
for _ in range(len(n_new))]
spawn_counter = 0
for idx, (pos, a) in enumerate(zip(n_new, amounts)):
if not self.global_amount > self.max_global_amount:
if coin := self.by_pos(pos):
coin = h.get_first(coin)
new_value = coin.amount + a
coin.set_new_amount(new_value)
else:
super().spawn([pos], amount=a)
spawn_counter += 1
else:
return Result(identifier=f'{self.name}_spawn', validity=c.NOT_VALID, value=spawn_counter)
return Result(identifier=f'{self.name}_spawn', validity=c.VALID, value=spawn_counter)
def __repr__(self):
s = super(CoinPiles, self).__repr__()
return f'{s[:-1]}, {self.global_amount}]'

View File

@@ -0,0 +1,59 @@
from marl_factory_grid.modules.coins import constants as d
from marl_factory_grid.environment import constants as c
from marl_factory_grid.environment.rules import Rule
from marl_factory_grid.utils.helpers import is_move
from marl_factory_grid.utils.results import TickResult
from marl_factory_grid.utils.results import DoneResult
class DoneOnAllCoinsCollected(Rule):
def __init__(self, reward: float = d.REWARD_COLLECT_ALL):
"""
Defines a 'Done'-condition which triggers when there are no more 'Coins' in the environment.
:type reward: float
:parameter reward: Given reward when condition triggers.
"""
super().__init__()
self.reward = reward
def on_check_done(self, state) -> [DoneResult]:
if len(state[d.COIN]) == 0 and state.curr_step:
return [DoneResult(validity=c.VALID, identifier=self.name, reward=self.reward)]
return []
class RespawnCoins(Rule):
def __init__(self, respawn_freq: int = 15, respawn_n: int = 5, respawn_amount: float = 1.0):
"""
Defines the spawn pattern of initial and additional 'Coin'-entities.
First chooses positions, then tries to spawn coins until 'respawn_n' or the maximum global amount is reached.
If a position already holds coins, the pile is topped up to min(max_local_amount, amount).
:type respawn_freq: int
:parameter respawn_freq: At which frequency should this Rule try to spawn new 'Coins'?
:type respawn_n: int
:parameter respawn_n: How many respawn positions are considered.
:type respawn_amount: float
:parameter respawn_amount: Defines how much coin 'amount' is placed every 'respawn_freq' ticks.
"""
super().__init__()
self.respawn_n = respawn_n
self.respawn_amount = respawn_amount
self.respawn_freq = respawn_freq
self._next_coin_spawn = respawn_freq
def tick_step(self, state):
collection = state[d.COIN]
if self._next_coin_spawn < 0:
result = [] # No CoinPile Spawn
elif not self._next_coin_spawn:
result = [collection.trigger_spawn(state, coords_or_quantity=self.respawn_n, amount=self.respawn_amount)]
self._next_coin_spawn = self.respawn_freq
else:
self._next_coin_spawn -= 1
result = []
return result
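As a quick illustration of the countdown in `tick_step`, the following self-contained sketch reproduces only the timing behaviour. It is not part of the commit, and the `SpawnClock` class is a made-up stand-in for the rule's internal counter.

class SpawnClock:
    """Mimics the RespawnCoins countdown: a negative counter disables spawning,
    zero triggers a spawn and resets, any other value just counts down."""

    def __init__(self, respawn_freq=15):
        self.respawn_freq = respawn_freq
        self._next_spawn = respawn_freq

    def tick(self):
        if self._next_spawn < 0:
            return False  # spawning disabled
        if self._next_spawn == 0:
            self._next_spawn = self.respawn_freq
            return True   # trigger_spawn would be called on this tick
        self._next_spawn -= 1
        return False

clock = SpawnClock(respawn_freq=3)
print([clock.tick() for _ in range(8)])
# [False, False, False, True, False, False, False, True]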

View File

@@ -7,7 +7,10 @@ from typing import Union
import numpy as np
import pandas as pd
import torch
from matplotlib import pyplot as plt
from marl_factory_grid.algorithms.rl.utils import _as_torch
from marl_factory_grid.utils.helpers import IGNORED_DF_COLUMNS
from marl_factory_grid.utils.plotting.plotting_utils import prepare_plot
@@ -253,3 +256,125 @@ direction_mapping = {
'south_east': (1, 1),
'south_west': (-1, 1)
}
def plot_reward_development(reward_development, results_path):
smoothed_data = np.convolve(reward_development, np.ones(10) / 10, mode='valid')
plt.plot(smoothed_data)
plt.ylim([-10, max(smoothed_data) + 20])
plt.title('Smoothed Reward Development')
plt.xlabel('Episode')
plt.ylabel('Reward')
plt.savefig(f"{results_path}/smoothed_reward_development.png")
plt.show()
def plot_collected_coins_per_step():
# Observed behaviour for multi-agent setting consisting of run0 and run0
collected_coins_per_step_emergent = [0, 0, 0, 1, 1, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 4, 4, 4, 4, 5]
collected_coins_per_step = [0, 0, 0, 1, 1, 2, 2, 3, 3, 3, 4, 5]  # RL and TSP
plt.step(range(1, len(collected_coins_per_step) + 1), collected_coins_per_step, color='green', linewidth=3, label='Prevented (RL)')
plt.step(range(1, len(collected_coins_per_step_emergent) + 1), collected_coins_per_step_emergent, linestyle='--', color='darkred', linewidth=3, label='Emergent')
plt.step(range(1, len(collected_coins_per_step) + 1), collected_coins_per_step, linestyle='dotted', color='darkorange', linewidth=3, label='Prevented (TSP)')
plt.xlabel("Environment step", fontsize=20)
plt.ylabel("Collected Coins", fontsize=20)
yint = range(min(collected_coins_per_step), max(collected_coins_per_step) + 1)
plt.yticks(yint, fontsize=17)
plt.xticks(range(1, len(collected_coins_per_step_emergent) + 1), fontsize=17)
frame1 = plt.gca()
# Only display every 5th tick label
for idx, xlabel_i in enumerate(frame1.axes.get_xticklabels()):
if (idx + 1) % 5 != 0:
xlabel_i.set_visible(False)
xlabel_i.set_fontsize(0.0)
# Change order of labels in legend
handles, labels = frame1.get_legend_handles_labels()
order = [0, 2, 1]
plt.legend([handles[idx] for idx in order], [labels[idx] for idx in order], prop={'size': 20})
fig = plt.gcf()
fig.set_size_inches(8, 7)
plt.savefig("../study_out/number_of_collected_coins.pdf")
plt.show()
def plot_reached_flags_per_step():
# Observed behaviour for multi-agent setting consisting of runs 1 + 2
reached_flags_per_step_emergent = [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
reached_flags_per_step_RL = [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2]
reached_flags_per_step_TSP = [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2]
plt.step(range(1, len(reached_flags_per_step_RL) + 1), reached_flags_per_step_RL, color='green', linewidth=3, label='Prevented (RL)')
plt.step(range(1, len(reached_flags_per_step_emergent) + 1), reached_flags_per_step_emergent, linestyle='--', color='darkred', linewidth=3, label='Emergent')
plt.step(range(1, len(reached_flags_per_step_TSP) + 1), reached_flags_per_step_TSP, linestyle='dotted', color='darkorange', linewidth=3, label='Prevented (TSP)')
plt.xlabel("Environment step", fontsize=20)
plt.ylabel("Reached Flags", fontsize=20)
yint = range(min(reached_flags_per_step_RL), max(reached_flags_per_step_RL) + 1)
plt.yticks(yint, fontsize=17)
plt.xticks(range(1, len(reached_flags_per_step_emergent) + 1), fontsize=17)
frame1 = plt.gca()
# Only display every 5th tick label
for idx, xlabel_i in enumerate(frame1.axes.get_xticklabels()):
if (idx + 1) % 5 != 0:
xlabel_i.set_visible(False)
xlabel_i.set_fontsize(0.0)
# Change order of labels in legend
handles, labels = frame1.get_legend_handles_labels()
order = [0, 2, 1]
plt.legend([handles[idx] for idx in order], [labels[idx] for idx in order], prop={'size': 20})
fig = plt.gcf()
fig.set_size_inches(8, 7)
plt.savefig("../study_out/number_of_reached_flags.pdf")
plt.show()
def create_info_maps(env, all_valid_observations, dirt_piles_positions, results_path, agents, act_dim,
a2c_instance):
# Create value map
with open(f"{results_path}/info_maps.txt", "w") as txt_file:
for obs_layer, pos in enumerate(dirt_piles_positions):
observations_shape = (
max(t[0] for t in env.state.entities.floorlist) + 2,
max(t[1] for t in env.state.entities.floorlist) + 2)
value_maps = [np.zeros(observations_shape) for _ in agents]
likeliest_action = [np.full(observations_shape, np.nan) for _ in agents]
action_probabilities = [np.zeros((observations_shape[0], observations_shape[1], act_dim)) for
_ in agents]
for obs in all_valid_observations[obs_layer]:
for idx, agent in enumerate(agents):
x, y = int(obs[0]), int(obs[1])
try:
value_maps[idx][x][y] = agent.vf(obs)
probs = agent.pi.distribution(obs).probs
likeliest_action[idx][x][y] = torch.argmax(
probs) # get the likeliest action at the current agent position
action_probabilities[idx][x][y] = probs
except Exception:  # skip observations that fail evaluation or fall outside the map
pass
txt_file.write("=======Value Maps=======\n")
for agent_idx, vmap in enumerate(value_maps):
txt_file.write(f"Value map of agent {agent_idx} for target pile {pos}:\n")
vmap = _as_torch(vmap).round(decimals=4)
max_digits = max(len(str(vmap.max().item())), len(str(vmap.min().item())))
for idx, row in enumerate(vmap):
txt_file.write(' '.join(f" {elem:>{max_digits + 1}}" for elem in row.tolist()))
txt_file.write("\n")
txt_file.write("\n")
txt_file.write("=======Likeliest Action=======\n")
for agent_idx, amap in enumerate(likeliest_action):
txt_file.write(f"Likeliest action map of agent {agent_idx} for target pile {pos}:\n")
txt_file.write(np.array2string(amap))
txt_file.write("\n")
txt_file.write("=======Action Probabilities=======\n")
for agent_idx, pmap in enumerate(action_probabilities):
a2c_instance.action_probabilities[agent_idx].append(pmap)
txt_file.write(f"Action probability map of agent {agent_idx} for target pile {pos}:\n")
for d in range(pmap.shape[0]):
row = '['
for r in range(pmap.shape[1]):
row += "[" + ', '.join(f"{x:7.4f}" for x in pmap[d, r]) + "]"
txt_file.write(row + "]")
txt_file.write("\n")
return action_probabilities
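For orientation, a small standalone example of how the `observations_shape` used above is derived from the environment's floor-tile list. The snippet is not part of the commit and the `floorlist` values are made up.

import numpy as np

# Hypothetical floor tiles of a 3x4 room as (x, y) pairs, starting at 1
# because the outer walls occupy row/column 0.
floorlist = [(x, y) for x in range(1, 4) for y in range(1, 5)]

# +2 mirrors the sizing in create_info_maps (presumably leaving room for the far walls).
observations_shape = (max(t[0] for t in floorlist) + 2,
                      max(t[1] for t in floorlist) + 2)
value_map = np.zeros(observations_shape)

print(observations_shape)  # (5, 6)
print(value_map.shape)     # (5, 6)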

View File

@@ -348,7 +348,6 @@ class Renderer:
self.save_counter += 1
full_path = os.path.join(out_dir, unique_filename)
pygame.image.save(self.screen, full_path)
print(f"Image saved as {unique_filename}")
if __name__ == '__main__':

View File

@@ -118,9 +118,8 @@ class Gamestate(object):
self._floortile_graph = None
self.tests = StepTests(*tests)
# Pointer that defines current spawn points of agents
for agent in self.agents_conf:
self.agents_conf[agent]["pos_pointer"] = 0
# Initialize position pointers for agents
self._initialize_position_pointers()
def reset(self):
self.curr_step = 0
@@ -138,6 +137,11 @@ class Gamestate(object):
def __repr__(self):
return f'{self.__class__.__name__}({len(self.entities)} Entities @ Step {self.curr_step})'
def _initialize_position_pointers(self):
""" Initialize the position pointers for each agent in the configuration."""
for agent in self.agents_conf:
self.agents_conf[agent]["pos_pointer"] = 0
@property
def random_free_position(self) -> (int, int):
"""

View File

@@ -1,10 +1,11 @@
import copy
from pathlib import Path
from marl_factory_grid.algorithms.marl.a2c_dirt import A2C
from marl_factory_grid.algorithms.rl.a2c_coin import A2C
from marl_factory_grid.algorithms.utils import load_yaml_file
def single_agent_training(config_name):
cfg_path = Path(f'../marl_factory_grid/algorithms/marl/configs/{config_name}_config.yaml')
cfg_path = Path(f'../marl_factory_grid/algorithms/rl/configs/{config_name}_config.yaml')
train_cfg = load_yaml_file(cfg_path)
# Use environment config with fixed spawnpoints for eval
@@ -21,7 +22,7 @@ def single_agent_training(config_name):
def single_agent_eval(config_name, run):
cfg_path = Path(f'../marl_factory_grid/algorithms/marl/configs/{config_name}_config.yaml')
cfg_path = Path(f'../marl_factory_grid/algorithms/rl/configs/{config_name}_config.yaml')
train_cfg = load_yaml_file(cfg_path)
# Use environment config with fixed spawnpoints for eval
@@ -34,7 +35,7 @@ def single_agent_eval(config_name, run):
def multi_agent_eval(config_name, runs, emergent_phenomenon=False):
cfg_path = Path(f'../marl_factory_grid/algorithms/marl/configs/MultiAgentConfigs/{config_name}_config.yaml')
cfg_path = Path(f'../marl_factory_grid/algorithms/rl/configs/MultiAgentConfigs/{config_name}_config.yaml')
train_cfg = load_yaml_file(cfg_path)
# Use environment config with fixed spawnpoints for eval
@@ -85,12 +86,14 @@ def two_rooms_one_door_modified_single_agent_eval(agent_name):
def dirt_quadrant_5_multi_agent_eval(emergent_phenomenon):
multi_agent_eval("dirt_quadrant", ["run4", "run5"], emergent_phenomenon)
def dirt_quadrant_5_multi_agent_ctde_eval(emergent_phenomenon): # run7 == run4
def dirt_quadrant_5_multi_agent_ctde_eval(emergent_phenomenon): # run7 == run4
multi_agent_eval("dirt_quadrant", ["run4", "run7"], emergent_phenomenon)
def two_rooms_one_door_modified_multi_agent_eval(emergent_phenomenon):
multi_agent_eval("two_rooms_one_door_modified", ["run2", "run3"], emergent_phenomenon)
if __name__ == '__main__':
dirt_quadrant_5_multi_agent_ctde_eval(True)
dirt_quadrant_5_multi_agent_ctde_eval(True)

View File

@@ -2,7 +2,7 @@ from marl_factory_grid.algorithms.utils import Checkpointer
from pathlib import Path
from marl_factory_grid.algorithms.utils import load_yaml_file, add_env_props, instantiate_class, load_class
# from algorithms.marl import LoopSNAC, LoopIAC, LoopSEAC
# from algorithms.rl import LoopSNAC, LoopIAC, LoopSEAC
for i in range(0, 5):

View File

@@ -5,7 +5,7 @@ from algorithms.utils import load_yaml_file
from tqdm import trange
study = 'example_config#0'
#study_root = Path(__file__).parent / study
study_root = Path('/Users/romue/PycharmProjects/EDYS/algorithms/marl/')
study_root = Path('/Users/romue/PycharmProjects/EDYS/algorithms/rl/')
#['L2NoAh_gru', 'L2NoCh_gru', 'nomix_gru']:
render = True

View File

@@ -3,6 +3,7 @@ from pprint import pprint
from tqdm import trange
from marl_factory_grid.algorithms.static.TSP_coin_agent import TSPCoinAgent
from marl_factory_grid.algorithms.static.TSP_dirt_agent import TSPDirtAgent
from marl_factory_grid.algorithms.static.TSP_item_agent import TSPItemAgent
from marl_factory_grid.algorithms.static.TSP_target_agent import TSPTargetAgent
@@ -30,7 +31,7 @@ if __name__ == '__main__':
factory.render()
action_spaces = factory.action_space
# agents = [TSPDirtAgent(factory, 0), TSPItemAgent(factory, 1), TSPTargetAgent(factory, 2)]
agents = [TSPTargetAgent(factory, 0), TSPTargetAgent(factory, 1)]
agents = [TSPCoinAgent(factory, 0)]
while not done:
a = [x.predict() for x in agents]
obs_type, _, _, done, info = factory.step(a)
@@ -39,5 +40,3 @@ if __name__ == '__main__':
if done:
print(f'Episode {episode} done...')
break
plot_routes(factory, agents)