From 3c54d04f9f9569c14a5b2b99fd8ff45f3284e565 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Julian=20Sch=C3=B6nberger?= Date: Mon, 6 May 2024 12:33:37 +0200 Subject: [PATCH] All relevant functional code for A2C Dirt Quadrant setting with small changes to the environment + Different configs for single agent and multiagent settings --- .gitignore | 1 + marl_factory_grid/algorithms/marl/a2c_dirt.py | 456 +++++++++++++----- marl_factory_grid/algorithms/marl/base_a2c.py | 25 +- .../dirt_quadrant_config.yaml | 32 ++ .../marl/configs/dirt_quadrant_config.yaml | 10 +- .../dirt_quadrant_eval_config.yaml | 71 +++ .../dirt_quadrant_train_config.yaml} | 50 +- ...nt.yaml => dirt_quadrant_eval_config.yaml} | 32 +- .../custom/dirt_quadrant_train_config.yaml | 85 ++++ marl_factory_grid/environment/rewards.py | 8 +- marl_factory_grid/environment/rules.py | 25 +- marl_factory_grid/utils/states.py | 4 + studies/marl_adapted.py | 27 +- 13 files changed, 652 insertions(+), 174 deletions(-) create mode 100644 marl_factory_grid/algorithms/marl/configs/MultiAgentConfigs/dirt_quadrant_config.yaml create mode 100644 marl_factory_grid/configs/custom/MultiAgentConfigs/dirt_quadrant_eval_config.yaml rename marl_factory_grid/configs/custom/{dirt_quadrant_random_pos.yaml => MultiAgentConfigs/dirt_quadrant_train_config.yaml} (77%) rename marl_factory_grid/configs/custom/{dirt_quadrant.yaml => dirt_quadrant_eval_config.yaml} (85%) create mode 100644 marl_factory_grid/configs/custom/dirt_quadrant_train_config.yaml diff --git a/.gitignore b/.gitignore index d699fd6..2501e90 100644 --- a/.gitignore +++ b/.gitignore @@ -701,3 +701,4 @@ $RECYCLE.BIN/ # End of https://www.toptal.com/developers/gitignore/api/linux,unity,macos,python,windows,pycharm,notepadpp,visualstudiocode,latex /studies/e_1/ /studies/curious_study/ +/study_out/ diff --git a/marl_factory_grid/algorithms/marl/a2c_dirt.py b/marl_factory_grid/algorithms/marl/a2c_dirt.py index 168547d..5cdd054 100644 --- a/marl_factory_grid/algorithms/marl/a2c_dirt.py +++ b/marl_factory_grid/algorithms/marl/a2c_dirt.py @@ -1,4 +1,5 @@ import copy +import os import random from scipy import signal @@ -61,9 +62,22 @@ class A2C: # act_dim=6 for dirt_quadrant dirt_piles_positions = [self.factory.state.entities['DirtPiles'][pile_idx].pos for pile_idx in range(len(self.factory.state.entities['DirtPiles']))] - obs_dim = 2 + 2*len(dirt_piles_positions) + if self.cfg[nms.ALGORITHM]["pile-observability"] == "all": + obs_dim = 2 + 2*len(dirt_piles_positions) + else: + obs_dim = 4 self.agents = [PolicyGradient(self.factory, agent_id=i, obs_dim=obs_dim) for i in range(self.n_agents)] + # self.agents[0].pi.load_model_parameters("/Users/julian/Coding/Projects/PyCharmProjects/EDYS/study_out/run5/Wolfgang_PolicyNet_model_parameters.pth") self.doors_exist = "Doors" in self.factory.state.entities.keys() + if self.cfg[nms.ENV]["save_and_log"]: + # Create results folder + runs = os.listdir("../study_out/") + run_numbers = [int(run[3:]) for run in runs if run[:3] == "run"] + next_run_number = max(run_numbers)+1 if run_numbers else 0 + self.results_path = f"../study_out/run{next_run_number}" + os.mkdir(self.results_path) + # Save settings in results folder + self.save_configs() @classmethod def _as_torch(cls, x): @@ -80,62 +94,149 @@ class A2C: actions = [agent.step(self._as_torch(observations[ag_i]).view(-1).to(torch.float32)) for ag_i, agent in enumerate(self.agents)] return actions - def execute_policy(self, observations) -> ListOrTensor: + def execute_policy(self, observations, env, cleaned_dirt_piles) -> 
ListOrTensor: # Use deterministic policy for inference actions = [agent.policy(self._as_torch(observations[ag_i]).view(-1).to(torch.float32)) for ag_i, agent in enumerate(self.agents)] + for agent_idx in range(self.n_agents): + if all(cleaned_dirt_piles[agent_idx].values()): + actions[agent_idx] = np.array(next(action_i for action_i, a in enumerate(env.state["Agent"][agent_idx].actions) if a.name == "Noop")) return actions - def transform_observations(self, env): + def transform_observations(self, env, ordered_dirt_piles, target_pile): """ Assumes that agent has observations -DirtPiles and -Self """ agent_positions = [env.state.moving_entites[agent_idx].pos for agent_idx in range(self.n_agents)] - dirt_piles_positions = [env.state.entities['DirtPiles'][pile_idx].pos for pile_idx in range(len(env.state.entities['DirtPiles']))] - trans_obs = [torch.zeros(2+2*len(dirt_piles_positions)) for _ in range(len(agent_positions))] + if self.cfg[nms.ALGORITHM]["pile-observability"] == "all": + trans_obs = [torch.zeros(2+2*len(ordered_dirt_piles[0])) for _ in range(len(agent_positions))] + else: + # Only show current target pile + trans_obs = [torch.zeros(4) for _ in range(len(agent_positions))] for i, pos in enumerate(agent_positions): agent_x, agent_y = pos[0], pos[1] trans_obs[i][0] = agent_x trans_obs[i][1] = agent_y idx = 2 - for pos in dirt_piles_positions: - trans_obs[i][idx] = pos[0] - trans_obs[i][idx + 1] = pos[1] - idx += 2 + if self.cfg[nms.ALGORITHM]["pile-observability"] == "all": + for pile_pos in ordered_dirt_piles[i]: + trans_obs[i][idx] = pile_pos[0] + trans_obs[i][idx + 1] = pile_pos[1] + idx += 2 + else: + trans_obs[i][2] = ordered_dirt_piles[i][target_pile[i]][0] + trans_obs[i][3] = ordered_dirt_piles[i][target_pile[i]][1] return trans_obs def get_all_observations(self, env): - first_trans_obs = self.transform_observations(env)[0] + dirt_piles_positions = [env.state.entities['DirtPiles'][pile_idx].pos for pile_idx in + range(len(env.state.entities['DirtPiles']))] + if self.cfg[nms.ALGORITHM]["pile-observability"] == "all": + obs = [torch.zeros(2 + 2 * len(dirt_piles_positions))] + observations = [[]] + # Fill in pile positions + idx = 2 + for pile_pos in dirt_piles_positions: + obs[0][idx] = pile_pos[0] + obs[0][idx + 1] = pile_pos[1] + idx += 2 + else: + # Have multiple observation layers of the map for each dirt pile one + obs = [torch.zeros(4) for _ in range(self.n_agents) for _ in dirt_piles_positions] + observations = [[] for _ in dirt_piles_positions] + for idx, pile_pos in enumerate(dirt_piles_positions): + obs[idx][2] = pile_pos[0] + obs[idx][3] = pile_pos[1] valid_agent_positions = env.state.entities.floorlist #observations_shape = (max(t[0] for t in valid_agent_positions) + 2, max(t[1] for t in valid_agent_positions) + 2) - observations = [] for idx, pos in enumerate(valid_agent_positions): - obs = copy.deepcopy(first_trans_obs) - obs[0] = pos[0] - obs[1] = pos[1] - observations.append(obs) + for obs_layer in range(len(obs)): + observation = copy.deepcopy(obs[obs_layer]) + observation[0] = pos[0] + observation[1] = pos[1] + observations[obs_layer].append(observation) return observations def get_dirt_piles_positions(self, env): return [env.state.entities['DirtPiles'][pile_idx].pos for pile_idx in range(len(env.state.entities['DirtPiles']))] - def get_ordered_dirt_piles(self, env): - ordered_dirt_piles = [] - if self.cfg[nms.ALGORITHM]["pile-order"] in ["fixed", "agents"]: - ordered_dirt_piles = self.get_dirt_piles_positions(env) - elif self.cfg[nms.ALGORITHM]["pile-order"] 
== "random": - ordered_dirt_piles = self.get_dirt_piles_positions(env) - random.shuffle(ordered_dirt_piles) - elif self.cfg[nms.ALGORITHM]["pile-order"] == "none": - ordered_dirt_piles = None - else: - print("Not a valid pile order option.") - exit() + def get_ordered_dirt_piles(self, env, cleaned_dirt_piles, target_pile): + """ Each agent can have it's individual pile order """ + ordered_dirt_piles = [[] for _ in range(self.n_agents)] + dirt_pile_positions = self.get_dirt_piles_positions(env) + agent_positions = [env.state.moving_entites[agent_idx].pos for agent_idx in range(self.n_agents)] + for agent_idx in range(self.n_agents): + if self.cfg[nms.ALGORITHM]["pile-order"] in ["fixed", "agents"]: + ordered_dirt_piles[agent_idx] = dirt_pile_positions + elif self.cfg[nms.ALGORITHM]["pile-order"] == "random": + ordered_dirt_piles[agent_idx] = dirt_pile_positions + random.shuffle(ordered_dirt_piles) + elif self.cfg[nms.ALGORITHM]["pile-order"] == "none": + ordered_dirt_piles[agent_idx] = None + elif self.cfg[nms.ALGORITHM]["pile-order"] in ["smart", "dynamic"]: + # Calculate distances for remaining unvisited dirt piles + remaining_target_piles = [pos for pos, value in cleaned_dirt_piles[agent_idx].items() if not value] + pile_distances = {pos:0 for pos in remaining_target_piles} + agent_pos = agent_positions[agent_idx] + for pos in remaining_target_piles: + pile_distances[pos] = np.abs(agent_pos[0] - pos[0]) + np.abs(agent_pos[1] - pos[1]) + + if self.cfg[nms.ALGORITHM]["pile-order"] == "smart": + # Check if there is an agent in line with any of the remaining dirt piles + for pile_pos in remaining_target_piles: + for other_pos in agent_positions: + if other_pos != agent_pos: + if agent_pos[0] == other_pos[0] == pile_pos[0] or agent_pos[1] == other_pos[1] == pile_pos[1]: + # Get the line between the agent and the goal + path = self.bresenham(agent_pos[0], agent_pos[1], pile_pos[0], pile_pos[1]) + + # Check if the entity lies on the path between the agent and the goal + if other_pos in path: + pile_distances[pile_pos] += np.abs(agent_pos[0] - other_pos[0]) + np.abs(agent_pos[1] - other_pos[1]) + + sorted_pile_distances = dict(sorted(pile_distances.items(), key=lambda item: item[1])) + # Insert already visited dirt piles + ordered_dirt_piles[agent_idx] = [pos for pos in dirt_pile_positions if pos not in remaining_target_piles] + # Fill up with sorted positions + for pos in sorted_pile_distances.keys(): + ordered_dirt_piles[agent_idx].append(pos) + + else: + print("Not a valid pile order option.") + exit() return ordered_dirt_piles + def bresenham(self, x0, y0, x1, y1): + """Bresenham's line algorithm to get the coordinates of a line between two points.""" + dx = np.abs(x1 - x0) + dy = np.abs(y1 - y0) + sx = 1 if x0 < x1 else -1 + sy = 1 if y0 < y1 else -1 + err = dx - dy + + coordinates = [] + while True: + coordinates.append((x0, y0)) + if x0 == x1 and y0 == y1: + break + e2 = 2 * err + if e2 > -dy: + err -= dy + x0 += sx + if e2 < dx: + err += dx + y0 += sy + return coordinates + + def update_ordered_dirt_piles(self, agent_idx, cleaned_dirt_piles, ordered_dirt_piles, env, target_pile): + # Only update ordered_dirt_pile for agent that reached its target pile + updated_ordered_dirt_piles = self.get_ordered_dirt_piles(env, cleaned_dirt_piles, target_pile) + for i in range(len(ordered_dirt_piles[agent_idx])): + ordered_dirt_piles[agent_idx][i] = updated_ordered_dirt_piles[agent_idx][i] + def distribute_indices(self, env): indices = [] n_dirt_piles = len(self.get_dirt_piles_positions(env)) - if 
n_dirt_piles == 1 or self.cfg[nms.ALGORITHM]["pile-order"] in ["fixed", "random", "none"]: + if n_dirt_piles == 1 or self.cfg[nms.ALGORITHM]["pile-order"] in ["fixed", "random", "none", "dynamic", "smart"]: indices = [[0] for _ in range(self.n_agents)] else: base_count = n_dirt_piles // self.n_agents @@ -152,8 +253,14 @@ class A2C: def update_target_pile(self, env, agent_idx, target_pile): indices = self.distribute_indices(env) - if target_pile[agent_idx] + 1 in indices[agent_idx]: - target_pile[agent_idx] += 1 + if self.cfg[nms.ALGORITHM]["pile-order"] in ["fixed", "random", "none", "dynamic", "smart"]: + if target_pile[agent_idx] + 1 < len(self.get_dirt_piles_positions(env)): + target_pile[agent_idx] += 1 + else: + target_pile[agent_idx] = 0 + else: + if target_pile[agent_idx] + 1 in indices[agent_idx]: + target_pile[agent_idx] += 1 def door_is_close(self, env, agent_idx): neighbourhood = [y for x in env.state.entities.neighboring_positions(env.state["Agent"][agent_idx].pos) @@ -166,7 +273,7 @@ class A2C: for agent_idx, agent in enumerate(self.agents): agent_obs = self._as_torch((obs)[agent_idx]).view(-1).to(torch.float32) # If agent already reached its target - if list(cleaned_dirt_piles.values())[target_pile[agent_idx]]: + if all(cleaned_dirt_piles[agent_idx].values()): action.append(next(action_i for action_i, a in enumerate(env.state["Agent"][agent_idx].actions) if a.name == "Noop")) if not det: # Include agent experience entry manually @@ -238,32 +345,40 @@ class A2C: # Execute real step in environment for idx, pos in enumerate(agent_positions): - if pos in cleaned_dirt_piles.keys() and not cleaned_dirt_piles[pos]: + if pos in cleaned_dirt_piles[idx].keys() and not cleaned_dirt_piles[idx][pos]: action[idx] = np.array(4) # Collect dirt _, next_obs, reward, done, info = env.step(action) - cleaned_dirt_piles[pos] = True + cleaned_dirt_piles[idx][pos] = True break""" # Only simulate collecting the dirt for idx, pos in enumerate(agent_positions): - if pos in self.get_dirt_piles_positions(env) and not cleaned_dirt_piles[pos]: + if pos in self.get_dirt_piles_positions(env) and not cleaned_dirt_piles[idx][pos]: # print(env.state.entities["Agent"][idx], pos, idx, target_pile, ordered_dirt_piles) # If dirt piles should be cleaned in a specific order - if ordered_dirt_piles: - if pos == ordered_dirt_piles[target_pile[idx]]: - reward[idx] += 1 # 1 - cleaned_dirt_piles[pos] = True + if ordered_dirt_piles[idx]: + if pos == ordered_dirt_piles[idx][target_pile[idx]]: + reward[idx] += 50 # 1 + cleaned_dirt_piles[idx][pos] = True # Set pointer to next dirt pile self.update_target_pile(env, idx, target_pile) + self.update_ordered_dirt_piles(idx, cleaned_dirt_piles, ordered_dirt_piles, env, target_pile) + if self.cfg[nms.ALGORITHM]["pile_all_done"] == "single": + done = True + if all(cleaned_dirt_piles[idx].values()): + # Reset cleaned_dirt_piles indicator + for pos in dirt_piles_positions: + cleaned_dirt_piles[idx][pos] = False break else: - reward[idx] += 1 # 1 - cleaned_dirt_piles[pos] = True + reward[idx] += 50 # 1 + cleaned_dirt_piles[idx][pos] = True break - if all(cleaned_dirt_piles.values()): - done = True + if self.cfg[nms.ALGORITHM]["pile_all_done"] == "all": + if all([all(cleaned_dirt_piles[i].values()) for i in range(self.n_agents)]): + done = True return reward, done @@ -271,26 +386,57 @@ class A2C: with torch.inference_mode(False): for ag_i, agent in enumerate(self.agents): # Get states, actions, rewards and values from rollout buffer - (s, a, R, V) = agent.finish_episode() - # Calculate 
discounted return and advantage - G = cumulate_discount(R, self.cfg[nms.ALGORITHM]["gamma"]) - if self.cfg[nms.ALGORITHM]["advantage"] == "Reinforce": - A = G - elif self.cfg[nms.ALGORITHM]["advantage"] == "Advantage-AC": - A = G - V # Actor-Critic Advantages - elif self.cfg[nms.ALGORITHM]["advantage"] == "TD-Advantage-AC": - with torch.no_grad(): - A = R + self.cfg[nms.ALGORITHM]["gamma"] * np.append(V[1:], agent.vf( - self._as_torch(obs[ag_i]).view(-1).to( - torch.float32)).numpy()) - V # TD Actor-Critic Advantages - else: - print("Not a valid advantage option.") - exit() + data = agent.finish_episode() + # Chunk episode data, such that there will be no memory failure for very long episodes + chunks = self.split_into_chunks(data) + for (s, a, R, V) in chunks: + # Calculate discounted return and advantage + G = cumulate_discount(R, self.cfg[nms.ALGORITHM]["gamma"]) + if self.cfg[nms.ALGORITHM]["advantage"] == "Reinforce": + A = G + elif self.cfg[nms.ALGORITHM]["advantage"] == "Advantage-AC": + A = G - V # Actor-Critic Advantages + elif self.cfg[nms.ALGORITHM]["advantage"] == "TD-Advantage-AC": + with torch.no_grad(): + A = R + self.cfg[nms.ALGORITHM]["gamma"] * np.append(V[1:], agent.vf( + self._as_torch(obs[ag_i]).view(-1).to( + torch.float32)).numpy()) - V # TD Actor-Critic Advantages + else: + print("Not a valid advantage option.") + exit() - rollout = (torch.tensor(x.copy()).to(torch.float32) for x in (s, a, G, A)) - # Update policy and value net of agent with experience from rollout buffer - agent.train(*rollout) + rollout = (torch.tensor(x.copy()).to(torch.float32) for x in (s, a, G, A)) + # Update policy and value net of agent with experience from rollout buffer + agent.train(*rollout) + def split_into_chunks(self, data_tuple): + result = [data_tuple] + chunk_size = self.cfg[nms.ALGORITHM]["chunk-episode"] + if chunk_size > 0: + # Get the maximum length of the lists in the tuple to handle different lengths + max_length = max(len(lst) for lst in data_tuple) + + # Prepare a list to store the result + result = [] + + # Split each list into chunks and add them to the result + for i in range(0, max_length, chunk_size): + # Create a sublist containing the ith chunk from each list + sublist = [lst[i:i + chunk_size] for lst in data_tuple if i < len(lst)] + result.append(sublist) + + return result + + def set_agent_spawnpoint(self, env): + for agent_idx in range(self.n_agents): + agent_name = list(env.state.agents_conf.keys())[agent_idx] + current_pos_pointer = env.state.agents_conf[agent_name]["pos_pointer"] + # Making the reset dependent on the number of spawnpoints and not the number of dirtpiles allows + # for having multiple subsequent spawnpoints with the same target pile + if current_pos_pointer == len(env.state.agents_conf[agent_name]['positions']) - 1: + env.state.agents_conf[agent_name]["pos_pointer"] = 0 + else: + env.state.agents_conf[agent_name]["pos_pointer"] += 1 @torch.no_grad() def train_loop(self): @@ -301,19 +447,28 @@ class A2C: global_steps, episode = 0, 0 dirt_piles_positions = self.get_dirt_piles_positions(env) used_actions = {i:0 for i in range(len(env.state.entities["Agent"][0]._actions))} # Assume both agents have the same actions + target_pile = [partition[0] for partition in self.distribute_indices(env)] # pointer that points to the target pile for each agent. 
(point to same pile, point to different piles) + cleaned_dirt_piles = [{pos: False for pos in dirt_piles_positions} for _ in range(self.n_agents)] # Each agent has its own dictionary while global_steps < max_steps: print(global_steps) obs = env.reset() # !!!!!!!!Commented seems to work better? Only if a fixed spawnpoint is given - print([env.state.moving_entites[agent_idx].pos for agent_idx in range(self.n_agents)]) + self.set_agent_spawnpoint(env) + ordered_dirt_piles = self.get_ordered_dirt_piles(env, cleaned_dirt_piles, target_pile) + # Reset current target pile at episode begin if all piles have to be cleaned in one episode + if self.cfg[nms.ALGORITHM]["pile_all_done"] == "all": + target_pile = [partition[0] for partition in self.distribute_indices(env)] + cleaned_dirt_piles = [{pos: False for pos in dirt_piles_positions} for _ in range(self.n_agents)] + """passed_fields = [[] for _ in range(self.n_agents)]""" + """obs = list(obs.values())""" - obs = self.transform_observations(env) + obs = self.transform_observations(env, ordered_dirt_piles, target_pile) done, rew_log = [False] * self.n_agents, 0 - cleaned_dirt_piles = {pos: False for pos in dirt_piles_positions} - ordered_dirt_piles = self.get_ordered_dirt_piles(env) - target_pile = [partition[0] for partition in self.distribute_indices(env)] # pointer that points to the target pile for each agent. (point to same pile, point to different piles) - """passed_fields = [[] for _ in range(self.n_agents)]""" + print("Agents spawnpoints:", [env.state.moving_entites[agent_idx].pos for agent_idx in range(self.n_agents)]) + print("Agents target piles:", target_pile) + print("Agents initial observation:", obs) + print("Agents cleaned dirt piles:", cleaned_dirt_piles) # Add Clean and Noop actions to agent actions so that they can be executed when the agent comes on a dirpile """for i in range(self.n_agents): @@ -326,12 +481,16 @@ class A2C: _, next_obs, reward, done, info = env.step(action) if done: print("DoneAtMaxStepsReached:", len(self.agents[0]._episode)) - next_obs = self.transform_observations(env) + next_obs = self.transform_observations(env, ordered_dirt_piles, target_pile) # Add small negative reward if agent has moved away from the target_pile - reward = self.reward_distance(env, obs, target_pile, reward) + # reward = self.reward_distance(env, obs, target_pile, reward) - # Check and handle if agent is on field with dirt + # Check and handle if the agent is on a field with dirt. This method can change the observation for the next step. + # If pile_all_done is "single", the episode ends once an agent reaches its target pile and the new episode begins + # with the updated observation. The observation saved to the rollout buffer, i.e. the one that led to reaching + # the target pile, should not be updated before saving. Thus, the self.transform_observations call must happen + # before this method is called. 
reward, done = self.handle_dirt(env, cleaned_dirt_piles, ordered_dirt_piles, target_pile, reward, done) if n_steps != 0 and (global_steps + 1) % n_steps == 0: @@ -361,45 +520,11 @@ class A2C: self.reward_development.append(rew_log) episode += 1 - # Create value map - observations_shape = (max(t[0] for t in env.state.entities.floorlist) + 2, max(t[1] for t in env.state.entities.floorlist) + 2) - value_maps = [np.zeros(observations_shape) for _ in self.agents] - likeliest_action = [np.full(observations_shape, np.NaN) for _ in self.agents] - action_probabilities = [np.zeros((observations_shape[0],observations_shape[1], env.action_space[0].n)) for _ in self.agents] - for obs in self.get_all_observations(env): - """obs = self._as_torch(obs).view(-1).to(torch.float32)""" - for idx, agent in enumerate(self.agents): - """indices = np.where(obs[1] == 1) # Get agent position on grid (1 indicates the position) - x, y = indices[0][0], indices[1][0]""" - x, y = int(obs[0]), int(obs[1]) - try: - value_maps[idx][x][y] = agent.vf(obs) - probs = agent.pi.distribution(obs).probs - likeliest_action[idx][x][y] = torch.argmax(probs) # get the likeliest action at the current agent position - action_probabilities[idx][x][y] = probs - except: - pass + self.plot_reward_development() + if self.cfg[nms.ENV]["save_and_log"]: + self.create_info_maps(env, used_actions, target_pile) + self.save_agent_models() - print("=======Value Maps=======") - for agent_idx, vmap in enumerate(value_maps): - print(f"Value map of agent {agent_idx}:") - vmap = self._as_torch(vmap).round(decimals=4) - max_digits = max(len(str(vmap.max().item())), len(str(vmap.min().item()))) - for idx, row in enumerate(vmap): - print(' '.join(f" {elem:>{max_digits+1}}" for elem in row.tolist())) - print("=======Likeliest Action=======") - for agent_idx, amap in enumerate(likeliest_action): - print(f"Likeliest action map of agent {agent_idx}:") - print(amap) - print("=======Action Probabilities=======") - for agent_idx, pmap in enumerate(action_probabilities): - print(f"Action probability map of agent {agent_idx}:") - for d in range(pmap.shape[0]): - row = '[' - for r in range(pmap.shape[1]): - row += "[" + ', '.join(f"{x:7.4f}" for x in pmap[d, r]) + "]" - print(row + "]") - print("Used actions:", used_actions) @torch.inference_mode(True) @@ -409,35 +534,45 @@ class A2C: env.render() episode, results = 0, [] dirt_piles_positions = self.get_dirt_piles_positions(env) + target_pile = [partition[0] for partition in self.distribute_indices(env)] # pointer that points to the target pile for each agent. 
(point to same pile, point to different piles) + cleaned_dirt_piles = [{pos: False for pos in dirt_piles_positions} for _ in range(self.n_agents)] while episode < n_episodes: obs = env.reset() + self.set_agent_spawnpoint(env) """obs = list(obs.values())""" - obs = self.transform_observations(env) - done, rew_log, eps_rew = [False] * self.n_agents, 0, torch.zeros(self.n_agents) + # Reset current target pile at episode begin if all piles have to be cleaned in one episode + if self.cfg[nms.ALGORITHM]["pile_all_done"] == "all": + target_pile = [partition[0] for partition in self.distribute_indices(env)] + cleaned_dirt_piles = [{pos: False for pos in dirt_piles_positions} for _ in range(self.n_agents)] - cleaned_dirt_piles = {pos: False for pos in dirt_piles_positions} - ordered_dirt_piles = self.get_ordered_dirt_piles(env) - target_pile = [partition[0] for partition in self.distribute_indices(env)] + ordered_dirt_piles = self.get_ordered_dirt_piles(env, cleaned_dirt_piles, target_pile) + + obs = self.transform_observations(env, ordered_dirt_piles, target_pile) + done, rew_log, eps_rew = [False] * self.n_agents, 0, torch.zeros(self.n_agents) # Add Clean and Noop actions to agent actions so that they can be executed when the agent comes on a dirpile """for i in range(self.n_agents): self.factory.state['Agent'][i].actions.extend([Clean(), Noop()])""" while not all(done): - action = self.use_door_or_move(env, obs, cleaned_dirt_piles, target_pile, det=True) if self.doors_exist else self.execute_policy(obs) # zero exploration + action = self.use_door_or_move(env, obs, cleaned_dirt_piles, target_pile, det=True) if self.doors_exist else self.execute_policy(obs, env, cleaned_dirt_piles) # zero exploration print(action) _, next_obs, reward, done, info = env.step(action) if done: print("DoneAtMaxStepsReached:", len(self.agents[0]._episode)) - next_obs = self.transform_observations(env) # Add small negative reward if agent has moved away from the target_pile - reward = self.reward_distance(env, obs, target_pile, reward) + # reward = self.reward_distance(env, obs, target_pile, reward) # Check and handle if agent is on field with dirt reward, done = self.handle_dirt(env, cleaned_dirt_piles, ordered_dirt_piles, target_pile, reward, done) + # Get transformed next_obs that might have been updated because of self.handle_dirt. + # For eval, where pile_all_done is "all", it's mandatory that the potential change of the target pile + # in the observation, caused by self.handle_dirt, is already considered when the next action is calculated. 
+ next_obs = self.transform_observations(env, ordered_dirt_piles, target_pile) + done = [done] * self.n_agents if isinstance(done, bool) else done if self.cfg[nms.ENV][nms.EVAL_RENDER]: @@ -448,10 +583,95 @@ class A2C: episode += 1 def plot_reward_development(self): - plt.plot(self.reward_development) - plt.title('Reward development') + smoothed_data = np.convolve(self.reward_development, np.ones(10) / 10, mode='valid') + plt.plot(smoothed_data) + plt.ylim([-10, max(smoothed_data) + 20]) + plt.title('Smoothed Reward Development') plt.xlabel('Episode') plt.ylabel('Reward') - plt.savefig("/Users/julian/Coding/Projects/PyCharmProjects/EDYS/study_out/two_rooms_one_door_modified_runs/reward_development.png") + if self.cfg[nms.ENV]["save_and_log"]: + plt.savefig(f"{self.results_path}/smoothed_reward_development.png") plt.show() + def save_configs(self): + with open(f"{self.results_path}/MARL_config.txt", "w") as txt_file: + txt_file.write(str(self.cfg)) + with open(f"{self.results_path}/train_env_config.txt", "w") as txt_file: + txt_file.write(str(self.factory.conf)) + with open(f"{self.results_path}/eval_env_config.txt", "w") as txt_file: + txt_file.write(str(self.eval_factory.conf)) + + def save_agent_models(self): + for idx, agent in enumerate(self.agents): + agent_name = list(self.factory.state.agents_conf.keys())[idx] + agent.pi.save_model_parameters(self.results_path, agent_name) + agent.vf.save_model_parameters(self.results_path, agent_name) + + def load_agents(self, runs_list): + for idx, run in enumerate(runs_list): + run_path = f"../study_out/{run}" + agent_name = list(self.eval_factory.state.agents_conf.keys())[idx] + self.agents[idx].pi.load_model_parameters(f"{run_path}/{agent_name}_PolicyNet_model_parameters.pth") + self.agents[idx].vf.load_model_parameters(f"{run_path}/{agent_name}_ValueNet_model_parameters.pth") + + def create_info_maps(self, env, used_actions, target_pile): + # Create value map + all_valid_observations = self.get_all_observations(env) + dirt_piles_positions = self.get_dirt_piles_positions(env) + with open(f"{self.results_path}/info_maps.txt", "w") as txt_file: + for obs_layer, pos in enumerate(dirt_piles_positions): + observations_shape = ( + max(t[0] for t in env.state.entities.floorlist) + 2, max(t[1] for t in env.state.entities.floorlist) + 2) + value_maps = [np.zeros(observations_shape) for _ in self.agents] + likeliest_action = [np.full(observations_shape, np.NaN) for _ in self.agents] + action_probabilities = [np.zeros((observations_shape[0], observations_shape[1], env.action_space[0].n)) for + _ in self.agents] + for obs in all_valid_observations[obs_layer]: + """obs = self._as_torch(obs).view(-1).to(torch.float32)""" + for idx, agent in enumerate(self.agents): + """indices = np.where(obs[1] == 1) # Get agent position on grid (1 indicates the position) + x, y = indices[0][0], indices[1][0]""" + x, y = int(obs[0]), int(obs[1]) + try: + value_maps[idx][x][y] = agent.vf(obs) + probs = agent.pi.distribution(obs).probs + likeliest_action[idx][x][y] = torch.argmax(probs) # get the likeliest action at the current agent position + action_probabilities[idx][x][y] = probs + except: + pass + + txt_file.write("=======Value Maps=======\n") + print("=======Value Maps=======") + for agent_idx, vmap in enumerate(value_maps): + txt_file.write(f"Value map of agent {agent_idx} for target pile {pos}:\n") + print(f"Value map of agent {agent_idx} for target pile {pos}:") + vmap = self._as_torch(vmap).round(decimals=4) + max_digits = max(len(str(vmap.max().item())), 
len(str(vmap.min().item()))) + for idx, row in enumerate(vmap): + txt_file.write(' '.join(f" {elem:>{max_digits + 1}}" for elem in row.tolist())) + txt_file.write("\n") + print(' '.join(f" {elem:>{max_digits + 1}}" for elem in row.tolist())) + txt_file.write("\n") + txt_file.write("=======Likeliest Action=======\n") + print("=======Likeliest Action=======") + for agent_idx, amap in enumerate(likeliest_action): + txt_file.write(f"Likeliest action map of agent {agent_idx} for target pile {pos}:\n") + print(f"Likeliest action map of agent {agent_idx} for target pile {pos}:") + txt_file.write(np.array2string(amap)) + print(amap) + txt_file.write("\n") + txt_file.write("=======Action Probabilities=======\n") + print("=======Action Probabilities=======") + for agent_idx, pmap in enumerate(action_probabilities): + txt_file.write(f"Action probability map of agent {agent_idx} for target pile {pos}:\n") + print(f"Action probability map of agent {agent_idx} for target pile {pos}:") + for d in range(pmap.shape[0]): + row = '[' + for r in range(pmap.shape[1]): + row += "[" + ', '.join(f"{x:7.4f}" for x in pmap[d, r]) + "]" + txt_file.write(row + "]") + txt_file.write("\n") + print(row + "]") + txt_file.write(f"Used actions: {used_actions}\n") + print("Used actions:", used_actions) + diff --git a/marl_factory_grid/algorithms/marl/base_a2c.py b/marl_factory_grid/algorithms/marl/base_a2c.py index 263c275..085493d 100644 --- a/marl_factory_grid/algorithms/marl/base_a2c.py +++ b/marl_factory_grid/algorithms/marl/base_a2c.py @@ -1,6 +1,6 @@ -import numpy as np; import torch as th; import scipy as sp; import gym -import os; from collections import deque; import matplotlib.pyplot as plt -from tqdm import tqdm +import numpy as np; import torch as th; import scipy as sp; +from collections import deque +from torch import nn # RLLab Magic for calculating the discounted return G(t) = R(t) + gamma * R(t-1) # cf. 
https://github.com/rll/rllab/blob/ba78e4c16dc492982e648f117875b22af3965579/rllab/misc/special.py#L107 @@ -14,8 +14,25 @@ class Net(th.nn.Module): for layer in [th.nn.Linear(*io), a()]]) self.optimizer = th.optim.Adam(self.net.parameters(), lr=lr) + # Initialize weights uniformly, so that for the policy net all actions have approximately the same probability in the beginning + for module in self.modules(): + if isinstance(module, nn.Linear): + nn.init.uniform_(module.weight, a=-0.1, b=0.1) + if module.bias is not None: + nn.init.uniform_(module.bias, a=-0.1, b=0.1) + + def save_model(self, path, agent_name): + th.save(self.net, f"{path}/{agent_name}_{self.__class__.__name__}_model.pth") + + def save_model_parameters(self, path, agent_name): + th.save(self.net.state_dict(), f"{path}/{agent_name}_{self.__class__.__name__}_model_parameters.pth") + + def load_model_parameters(self, path): + self.net.load_state_dict(th.load(path)) + self.net.eval() + class ValueNet(Net): - def __init__(self, obs_dim, hidden_sizes=[64,64], activation=th.nn.Tanh, lr=1e-3): + def __init__(self, obs_dim, hidden_sizes=[64,64], activation=th.nn.ReLU, lr=1e-3): super().__init__([obs_dim] + hidden_sizes + [1], activation, lr) def forward(self, obs): return self.net(obs) def loss(self, states, returns): return ((returns - self(states))**2).mean() diff --git a/marl_factory_grid/algorithms/marl/configs/MultiAgentConfigs/dirt_quadrant_config.yaml b/marl_factory_grid/algorithms/marl/configs/MultiAgentConfigs/dirt_quadrant_config.yaml new file mode 100644 index 0000000..cfb85c6 --- /dev/null +++ b/marl_factory_grid/algorithms/marl/configs/MultiAgentConfigs/dirt_quadrant_config.yaml @@ -0,0 +1,32 @@ +agent: + classname: marl_factory_grid.algorithms.marl.networks.RecurrentAC + n_agents: 2 + obs_emb_size: 96 + action_emb_size: 16 + hidden_size_actor: 64 + hidden_size_critic: 64 + use_agent_embedding: False +env: + classname: marl_factory_grid.configs.custom + env_name: "custom/MultiAgentConfigs/dirt_quadrant_train_config" + n_agents: 2 + max_steps: 250 + pomdp_r: 2 + stack_n_frames: 0 + individual_rewards: True + train_render: False + eval_render: True + save_and_log: True +method: marl_factory_grid.algorithms.marl.LoopSEAC +algorithm: + gamma: 0.99 + entropy_coef: 0.01 + vf_coef: 0.05 + n_steps: 0 # How much experience should be sampled at most (n-TD) until the next value and policy update is performed. Default 0: MC + max_steps: 200000 + advantage: "Advantage-AC" # Options: "Advantage-AC", "TD-Advantage-AC", "Reinforce" + pile-order: "dynamic" # Options: "fixed", "random", "none", "agents", "dynamic", "smart" (Use "fixed", "random" and "none" for single agent training and the other for multi agent inference) + pile-observability: "single" # Options: "single", "all" + pile_all_done: "all" # Options: "single", "all" ("single" for training, "all" for eval) + chunk-episode: 20000 # Chunk size. 
(0 = update networks with full episode at once) + diff --git a/marl_factory_grid/algorithms/marl/configs/dirt_quadrant_config.yaml b/marl_factory_grid/algorithms/marl/configs/dirt_quadrant_config.yaml index 6668f55..e814d20 100644 --- a/marl_factory_grid/algorithms/marl/configs/dirt_quadrant_config.yaml +++ b/marl_factory_grid/algorithms/marl/configs/dirt_quadrant_config.yaml @@ -8,7 +8,7 @@ agent: use_agent_embedding: False env: classname: marl_factory_grid.configs.custom - env_name: "custom/dirt_quadrant_random_pos" + env_name: "custom/dirt_quadrant_train_config" n_agents: 1 max_steps: 250 pomdp_r: 2 @@ -16,13 +16,17 @@ env: individual_rewards: True train_render: False eval_render: True + save_and_log: False method: marl_factory_grid.algorithms.marl.LoopSEAC algorithm: gamma: 0.99 entropy_coef: 0.01 vf_coef: 0.05 n_steps: 0 # How much experience should be sampled at most (n-TD) until the next value and policy update is performed. Default 0: MC - max_steps: 80000 + max_steps: 270000 advantage: "Advantage-AC" # Options: "Advantage-AC", "TD-Advantage-AC", "Reinforce" - pile-order: "fixed" # Options: "fixed", "random", "none", "agents" + pile-order: "fixed" # Options: "fixed", "random", "none", "agents", "dynamic", "smart" (Use "fixed", "random" and "none" for single agent training and the other for multi agent inference) + pile-observability: "single" # Options: "single", "all" + pile_all_done: "single" # Options: "single", "all" ("single" for training, "all" for eval) + chunk-episode: 20000 # Chunk size. (0 = update networks with full episode at once) diff --git a/marl_factory_grid/configs/custom/MultiAgentConfigs/dirt_quadrant_eval_config.yaml b/marl_factory_grid/configs/custom/MultiAgentConfigs/dirt_quadrant_eval_config.yaml new file mode 100644 index 0000000..7692cbe --- /dev/null +++ b/marl_factory_grid/configs/custom/MultiAgentConfigs/dirt_quadrant_eval_config.yaml @@ -0,0 +1,71 @@ +General: + # RNG-seed to sample the same "random" numbers every time, to make the different runs comparable. + env_seed: 69 + # Individual vs global rewards + individual_rewards: true + # The level.txt file to load from marl_factory_grid/levels + level_name: quadrant + # Radius of Partially observable Markov decision process + pomdp_r: 0 # default 3 + # Print all messages and events + verbose: false + # Run tests + tests: false + +# In the "clean and bring" Scenario one agent aims to pick up all items and drop them at drop-off locations while all +# other agents aim to clean dirt piles. +Agents: + # The clean agents + Sigmund: + Actions: + - Move4 + #- Clean + - Noop + Observations: + # - Walls + # - Other + - DirtPiles + - Self + Positions: + - (9,1) + #- (9,9) + #- (4,5) + Wolfgang: + Actions: + - Move4 + #- Clean + - Noop + Observations: + # - Walls + # - Other + - DirtPiles + - Self + Positions: + - (9,5) + #- (9,9) + #- (4,5) + +Entities: + DirtPiles: + coords_or_quantity: (9,9), (4,5), (1,1) # (4,7), (2,4), (1, 1) # (1, 1), (2,4), (4,7), (7,9), (9,9) # (1, 1), (1,2), (1,3), (2,4), (2,5), (3,6), (4,7), (5,8), (6,8), (7,9), (8,9), (9,9) + initial_amount: 0.5 # <1 to ensure that the robot which first attempts to clean this field, can remove the dirt in one action + clean_amount: 1 + dirt_spawn_r_var: 0 + max_global_amount: 12 + max_local_amount: 1 + +# Rules section specifies the rules governing the dynamics of the environment. +Rules: + + # Utilities + # This rule defines the collision mechanic, introduces a related DoneCondition and lets you specify rewards. 
+ # Can be omitted/ignored if you do not want to take care of collisions at all. + WatchCollisions: + done_at_collisions: false + + # Done Conditions + # Define the conditions for the environment to stop. Either success or a fail conditions. + # The environment stops when all dirt is cleaned + DoneOnAllDirtCleaned: + #DoneAtMaxStepsReached: + #max_steps: 200 diff --git a/marl_factory_grid/configs/custom/dirt_quadrant_random_pos.yaml b/marl_factory_grid/configs/custom/MultiAgentConfigs/dirt_quadrant_train_config.yaml similarity index 77% rename from marl_factory_grid/configs/custom/dirt_quadrant_random_pos.yaml rename to marl_factory_grid/configs/custom/MultiAgentConfigs/dirt_quadrant_train_config.yaml index b619a7b..25436af 100644 --- a/marl_factory_grid/configs/custom/dirt_quadrant_random_pos.yaml +++ b/marl_factory_grid/configs/custom/MultiAgentConfigs/dirt_quadrant_train_config.yaml @@ -16,6 +16,23 @@ General: # other agents aim to clean dirt piles. Agents: # The clean agents + Sigmund: + Actions: + - Move4 + #- Clean + #- Noop + Observations: + # - Walls + # - Other + - DirtPiles + - Self + Positions: + - (9,1) + - (4,5) + - (1,1) + - (4,5) + - (9,1) + - (9,9) Wolfgang: Actions: - Move4 @@ -26,32 +43,17 @@ Agents: # - Other - DirtPiles - Self - #Positions: - #- (9,1) - #- (9,2) - #- (9,3) - #- (9,4) - #- (9,5) - #- (9,6) - #- (9,7) - #- (9,8) - #- (9,9) - #Reiner: - #Actions: - #- Move4 - #- Clean - #- Noop - #Observations: - # - Walls - # - Other - #- DirtPiles - #- Self - #Positions: - #- (9,8) # (9, 4) + Positions: + - (9,5) + - (4,5) + - (1,1) + - (4,5) + - (9,5) + - (9,9) Entities: DirtPiles: - coords_or_quantity: (1,1) # (4,7), (2,4), (1, 1) #(1, 1), (2,4), (4,7), (7,9), (9,9) # (1, 1), (1,2), (1,3), (2,4), (2,5), (3,6), (4,7), (5,8), (6,8), (7,9), (8,9), (9,9) + coords_or_quantity: (9,9), (1,1), (4,5) # (4,7), (2,4), (1, 1) #(1, 1), (2,4), (4,7), (7,9), (9,9) # (1, 1), (1,2), (1,3), (2,4), (2,5), (3,6), (4,7), (5,8), (6,8), (7,9), (8,9), (9,9) initial_amount: 0.5 # <1 to ensure that the robot which first attempts to clean this field, can remove the dirt in one action clean_amount: 1 dirt_spawn_r_var: 0 @@ -72,4 +74,4 @@ Rules: # The environment stops when all dirt is cleaned DoneOnAllDirtCleaned: #DoneAtMaxStepsReached: # An episode should last for at most max_steps steps - #max_steps: 1000 + #max_steps: 100 diff --git a/marl_factory_grid/configs/custom/dirt_quadrant.yaml b/marl_factory_grid/configs/custom/dirt_quadrant_eval_config.yaml similarity index 85% rename from marl_factory_grid/configs/custom/dirt_quadrant.yaml rename to marl_factory_grid/configs/custom/dirt_quadrant_eval_config.yaml index 49b27ed..4052195 100644 --- a/marl_factory_grid/configs/custom/dirt_quadrant.yaml +++ b/marl_factory_grid/configs/custom/dirt_quadrant_eval_config.yaml @@ -16,6 +16,20 @@ General: # other agents aim to clean dirt piles. 
Agents: # The clean agents + #Sigmund: + #Actions: + #- Move4 + #- Clean + #- Noop + #Observations: + # - Walls + # - Other + #- DirtPiles + #- Self + #Positions: + #- (9,1) + #- (9,9) + #- (4,5) Wolfgang: Actions: - Move4 @@ -27,23 +41,13 @@ Agents: - DirtPiles - Self Positions: - - (9,1) - #Reiner: - #Actions: - #- Move4 - #- Clean - #- Noop - #Observations: - # - Walls - # - Other - #- DirtPiles - #- Self - #Positions: - #- (9,8) # (9, 4) + - (9,5) + #- (9,9) + #- (4,5) Entities: DirtPiles: - coords_or_quantity: (1,1) # (4,7), (2,4), (1, 1) # (1, 1), (2,4), (4,7), (7,9), (9,9) # (1, 1), (1,2), (1,3), (2,4), (2,5), (3,6), (4,7), (5,8), (6,8), (7,9), (8,9), (9,9) + coords_or_quantity: (9,9), (4,5), (1,1) # (4,7), (2,4), (1, 1) # (1, 1), (2,4), (4,7), (7,9), (9,9) # (1, 1), (1,2), (1,3), (2,4), (2,5), (3,6), (4,7), (5,8), (6,8), (7,9), (8,9), (9,9) initial_amount: 0.5 # <1 to ensure that the robot which first attempts to clean this field, can remove the dirt in one action clean_amount: 1 dirt_spawn_r_var: 0 diff --git a/marl_factory_grid/configs/custom/dirt_quadrant_train_config.yaml b/marl_factory_grid/configs/custom/dirt_quadrant_train_config.yaml new file mode 100644 index 0000000..7340484 --- /dev/null +++ b/marl_factory_grid/configs/custom/dirt_quadrant_train_config.yaml @@ -0,0 +1,85 @@ +General: + # RNG-seed to sample the same "random" numbers every time, to make the different runs comparable. + env_seed: 69 + # Individual vs global rewards + individual_rewards: true + # The level.txt file to load from marl_factory_grid/levels + level_name: quadrant + # Radius of Partially observable Markov decision process + pomdp_r: 0 # default 3 + # Print all messages and events + verbose: false + # Run tests + tests: false + +# In the "clean and bring" Scenario one agent aims to pick up all items and drop them at drop-off locations while all +# other agents aim to clean dirt piles. +Agents: + # The clean agents + #Sigmund: + #Actions: + #- Move4 + #- Clean + #- Noop + #Observations: + # - Walls + # - Other + #- DirtPiles + #- Self + #Positions: + #- (9,1) + #- (4,5) + #- (1,1) + #- (4,5) + #- (9,1) + #- (9,9) + Wolfgang: + Actions: + - Move4 + #- Clean + #- Noop + Observations: + # - Walls + # - Other + - DirtPiles + - Self + Positions: + - (9,5) + - (4,5) + - (1,1) + - (4,5) + - (9,5) + - (9,9) + + +Entities: + DirtPiles: + coords_or_quantity: (9,9), (1,1), (4,5) # (4,7), (2,4), (1, 1) #(1, 1), (2,4), (4,7), (7,9), (9,9) # (1, 1), (1,2), (1,3), (2,4), (2,5), (3,6), (4,7), (5,8), (6,8), (7,9), (8,9), (9,9) + initial_amount: 0.5 # <1 to ensure that the robot which first attempts to clean this field, can remove the dirt in one action + clean_amount: 1 + dirt_spawn_r_var: 0 + max_global_amount: 12 + max_local_amount: 1 + +# Rules section specifies the rules governing the dynamics of the environment. +Rules: + + # Utilities + # This rule defines the collision mechanic, introduces a related DoneCondition and lets you specify rewards. + # Can be omitted/ignored if you do not want to take care of collisions at all. + WatchCollisions: + done_at_collisions: false + + # Done Conditions + # Define the conditions for the environment to stop. Either success or a fail conditions. + # The environment stops when all dirt is cleaned + DoneOnAllDirtCleaned: + #DoneAtMaxStepsReached: # An episode should last for at most max_steps steps + #max_steps: 1000 + + # Define how agents spawn. 
+ # Options: "random" (Spawn agent at a random position from the list of defined positions) + # "first" (Always spawn agent at first position regardless of the other provided positions) + # "order" (Loop through agent positions) + AgentSpawnRule: + spawn_rule: "order" diff --git a/marl_factory_grid/environment/rewards.py b/marl_factory_grid/environment/rewards.py index 8605e25..e17269c 100644 --- a/marl_factory_grid/environment/rewards.py +++ b/marl_factory_grid/environment/rewards.py @@ -1,5 +1,5 @@ -MOVEMENTS_VALID: float = -0.01 # default: -0.001 -MOVEMENTS_FAIL: float = -0.1 # default: -0.05 -NOOP: float = -0.01 -COLLISION: float = -0.5 +MOVEMENTS_VALID: float = -1 # default: -0.001 +MOVEMENTS_FAIL: float = -1 # default: -0.05 +NOOP: float = -1 +COLLISION: float = -1 COLLISION_DONE: float = -1 diff --git a/marl_factory_grid/environment/rules.py b/marl_factory_grid/environment/rules.py index c3669f1..306dd3e 100644 --- a/marl_factory_grid/environment/rules.py +++ b/marl_factory_grid/environment/rules.py @@ -5,6 +5,7 @@ from typing import List, Collection import numpy as np +import marl_factory_grid from marl_factory_grid.environment import rewards as r, constants as c from marl_factory_grid.environment.entity.agent import Agent from marl_factory_grid.utils import helpers as h @@ -180,6 +181,11 @@ class SpawnAgents(Rule): pass def on_reset(self, state): + spawn_rule = None + for rule in state.rules.rules: + if isinstance(rule, marl_factory_grid.environment.rules.AgentSpawnRule): + spawn_rule = rule.spawn_rule + agents = state[c.AGENT] for agent_name, agent_conf in state.agents_conf.items(): empty_positions = state.entities.empty_positions @@ -187,10 +193,9 @@ class SpawnAgents(Rule): observations = agent_conf['observations'].copy() positions = agent_conf['positions'].copy() other = agent_conf['other'].copy() + positions_pointer = agent_conf['pos_pointer'] - # Spawn agent on random position if multiple spawn points are provided - func = random.choice if len(positions) else h.get_first - if position := func([x for x in positions if x in empty_positions]): + if position := self._get_position(spawn_rule, positions, empty_positions, positions_pointer): assert state.check_pos_validity(position), 'smth went wrong....' 
agents.add_item(Agent(actions, observations, position, str_ident=agent_name, **other)) elif positions: @@ -200,6 +205,20 @@ class SpawnAgents(Rule): agents.add_item(Agent(actions, observations, empty_positions.pop(), str_ident=agent_name, **other)) return [] + def _get_position(self, spawn_rule, positions, empty_positions, positions_pointer): + if spawn_rule and spawn_rule == "random": + position = random.choice(([x for x in positions if x in empty_positions])) + elif spawn_rule and spawn_rule == "order": + position = ([x for x in positions if x in empty_positions])[positions_pointer] + else: + position = h.get_first([x for x in positions if x in empty_positions]) + + return position + +class AgentSpawnRule(Rule): + def __init__(self, spawn_rule): + self.spawn_rule = spawn_rule + super().__init__() class DoneAtMaxStepsReached(Rule): diff --git a/marl_factory_grid/utils/states.py b/marl_factory_grid/utils/states.py index 1452ab7..0c9e965 100644 --- a/marl_factory_grid/utils/states.py +++ b/marl_factory_grid/utils/states.py @@ -118,6 +118,10 @@ class Gamestate(object): self._floortile_graph = None self.tests = StepTests(*tests) + # Pointer that defines current spawn points of agents + for agent in self.agents_conf: + self.agents_conf[agent]["pos_pointer"] = 0 + def reset(self): self.curr_step = 0 self.curr_actions = None diff --git a/studies/marl_adapted.py b/studies/marl_adapted.py index 74f66a5..ce3b549 100644 --- a/studies/marl_adapted.py +++ b/studies/marl_adapted.py @@ -3,17 +3,36 @@ from pathlib import Path from marl_factory_grid.algorithms.marl.a2c_dirt import A2C from marl_factory_grid.algorithms.utils import load_yaml_file -if __name__ == '__main__': +def dirt_quadrant_single_agent_training(): cfg_path = Path('../marl_factory_grid/algorithms/marl/configs/dirt_quadrant_config.yaml') train_cfg = load_yaml_file(cfg_path) # Use environment config with fixed spawnpoints for eval eval_cfg = copy.deepcopy(train_cfg) - eval_cfg["env"]["env_name"] = "custom/dirt_quadrant" # Options: two_rooms_one_door_modified, dirt_quadrant + eval_cfg["env"]["env_name"] = "custom/dirt_quadrant_eval_config" print("Training phase") agent = A2C(train_cfg, eval_cfg) agent.train_loop() - agent.plot_reward_development() print("Evaluation phase") - agent.eval_loop(10) \ No newline at end of file + # Have consecutive episode for eval in single agent case + train_cfg["algorithm"]["pile_all_done"] = "all" + # agent.load_agents(["run0", "run1"]) + agent.eval_loop(10) + + +def dirt_quadrant_multi_agent_eval(): + cfg_path = Path('../marl_factory_grid/algorithms/marl/configs/MultiAgentConfigs/dirt_quadrant_config.yaml') + + train_cfg = load_yaml_file(cfg_path) + # Use environment config with fixed spawnpoints for eval + eval_cfg = copy.deepcopy(train_cfg) + eval_cfg["env"]["env_name"] = "custom/MultiAgentConfigs/dirt_quadrant_eval_config" + agent = A2C(train_cfg, eval_cfg) + print("Evaluation phase") + agent.load_agents(["run0", "run1"]) + agent.eval_loop(10) + + +if __name__ == '__main__': + dirt_quadrant_single_agent_training() \ No newline at end of file
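The "smart" pile order introduced in get_ordered_dirt_piles above ranks an agent's remaining piles by Manhattan distance and penalizes piles whose direct line to the agent is occupied by another agent, detected with the new bresenham helper. Below is a minimal standalone sketch of that heuristic, not the patched class itself; smart_order and its simplified blocking check are illustrative assumptions.

# Sketch of the "smart" ordering heuristic, assuming (x, y) grid positions.
# The patched implementation additionally requires the blocking agent to share a
# row or column with the agent and the pile before applying the penalty.

def bresenham(x0, y0, x1, y1):
    """Grid cells on the straight line between (x0, y0) and (x1, y1)."""
    dx, dy = abs(x1 - x0), abs(y1 - y0)
    sx = 1 if x0 < x1 else -1
    sy = 1 if y0 < y1 else -1
    err, cells = dx - dy, []
    while True:
        cells.append((x0, y0))
        if (x0, y0) == (x1, y1):
            break
        e2 = 2 * err
        if e2 > -dy:
            err -= dy
            x0 += sx
        if e2 < dx:
            err += dx
            y0 += sy
    return cells


def smart_order(agent_pos, other_agents, remaining_piles):
    """Sort remaining piles by Manhattan distance plus a penalty for blocked direct paths."""
    def cost(pile):
        dist = abs(agent_pos[0] - pile[0]) + abs(agent_pos[1] - pile[1])
        for other in other_agents:
            if other != agent_pos and other in bresenham(*agent_pos, *pile):
                # Another agent sits on the direct line to this pile: add its distance as a detour penalty
                dist += abs(agent_pos[0] - other[0]) + abs(agent_pos[1] - other[1])
        return dist
    return sorted(remaining_piles, key=cost)


if __name__ == "__main__":
    # Without the penalty, (9, 9) and (1, 1) are equally far from (9, 1);
    # the teammate at (9, 5) pushes (9, 9) to the back of the order.
    print(smart_order((9, 1), [(9, 5)], [(9, 9), (4, 5), (1, 1)]))  # [(1, 1), (4, 5), (9, 9)]

Manhattan distance matches the 4-connected Move4 action set used in the configs, and the blocking penalty approximates the detour an agent needs around a teammate standing on the direct path.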