Updated pomdp_r comment + Added some additional comments + Restructured experiment calling + Added Readme and requirements.txt

Julian Schönberger
2024-05-27 18:23:11 +02:00
parent 41a1ec0a5b
commit a0852e805a
23 changed files with 327 additions and 369 deletions

View File

@@ -0,0 +1,73 @@
from pathlib import Path

from marl_factory_grid.algorithms.rl.a2c_dirt import A2C
from marl_factory_grid.algorithms.utils import load_yaml_file


####### Training routines ######
def rerun_dirt_quadrant_agent1_training():
    train_cfg_path = Path(f'./marl_factory_grid/algorithms/rl/single_agent_configs/dirt_quadrant_train_config.yaml')
    eval_cfg_path = Path(f'./marl_factory_grid/algorithms/rl/single_agent_configs/dirt_quadrant_eval_config.yaml')
    train_cfg = load_yaml_file(train_cfg_path)
    eval_cfg = load_yaml_file(eval_cfg_path)
    print("Training phase")
    agent = A2C(train_cfg, eval_cfg)
    agent.train_loop()
    print("Evaluation phase")
    agent.eval_loop(n_episodes=1)


def two_rooms_training(max_steps, agent_name):
    train_cfg_path = Path(f'./marl_factory_grid/algorithms/rl/single_agent_configs/two_rooms_train_config.yaml')
    eval_cfg_path = Path(f'./marl_factory_grid/algorithms/rl/single_agent_configs/two_rooms_eval_config.yaml')
    train_cfg = load_yaml_file(train_cfg_path)
    eval_cfg = load_yaml_file(eval_cfg_path)
    train_cfg["algorithm"]["max_steps"] = max_steps
    train_cfg["env"]["env_name"] = f"rl/two_rooms_{agent_name}_train_config"
    eval_cfg["env"]["env_name"] = f"rl/two_rooms_{agent_name}_eval_config"
    print("Training phase")
    agent = A2C(train_cfg, eval_cfg)
    agent.train_loop()
    print("Evaluation phase")
    agent.eval_loop(n_episodes=1)


def rerun_two_rooms_agent1_training():
    two_rooms_training(max_steps=190000, agent_name="agent1")


def rerun_two_rooms_agent2_training():
    two_rooms_training(max_steps=260000, agent_name="agent2")


####### Eval routines ########
def single_agent_eval(config_name, run_folder_name):
    eval_cfg_path = Path(f'./marl_factory_grid/algorithms/rl/single_agent_configs/{config_name}_eval_config.yaml')
    train_cfg = eval_cfg = load_yaml_file(eval_cfg_path)
    # A value for train_cfg is required, but the train environment won't be used
    agent = A2C(train_cfg=train_cfg, eval_cfg=eval_cfg)
    print("Evaluation phase")
    agent.load_agents([run_folder_name])
    agent.eval_loop(1)


def multi_agent_eval(config_name, runs, emergent_phenomenon=False):
    eval_cfg_path = Path(f'./marl_factory_grid/algorithms/rl/multi_agent_configs/{config_name}' +
                         f'_eval_config{"_emergent" if emergent_phenomenon else ""}.yaml')
    eval_cfg = load_yaml_file(eval_cfg_path)
    # A value for train_cfg is required, but the train environment won't be used
    agent = A2C(train_cfg=eval_cfg, eval_cfg=eval_cfg)
    print("Evaluation phase")
    agent.load_agents(runs)
    agent.eval_loop(1)


def dirt_quadrant_multi_agent_rl_eval(emergent_phenomenon):
    multi_agent_eval("dirt_quadrant", ["run0", "run0"], emergent_phenomenon)


def two_rooms_multi_agent_rl_eval(emergent_phenomenon):
    multi_agent_eval("two_rooms", ["run1", "run2"], emergent_phenomenon)

View File

@@ -2,13 +2,14 @@ import os
import torch
from typing import Union, List
import numpy as np
from tqdm import tqdm
from marl_factory_grid.algorithms.rl.base_a2c import PolicyGradient, cumulate_discount
from marl_factory_grid.algorithms.rl.constants import Names
from marl_factory_grid.algorithms.rl.utils import transform_observations, _as_torch, door_is_close, \
from marl_factory_grid.algorithms.rl.utils import transform_observations, _as_torch, is_door_close, \
get_dirt_piles_positions, update_target_pile, update_ordered_dirt_piles, get_all_cleaned_dirt_piles, \
distribute_indices, set_agent_spawnpoint, get_ordered_dirt_piles, handle_finished_episode, save_configs, \
save_agent_models, get_all_observations
distribute_indices, set_agents_spawnpoints, get_ordered_dirt_piles, handle_finished_episode, save_configs, \
save_agent_models, get_all_observations, get_agents_positions
from marl_factory_grid.algorithms.utils import add_env_props
from marl_factory_grid.utils.plotting.plot_single_runs import plot_action_maps, plot_reward_development, \
create_info_maps
@@ -28,93 +29,88 @@ class A2C:
self.n_agents = train_cfg[nms.ENV][nms.N_AGENTS]
self.setup()
self.reward_development = []
self.action_probabilities = {agent_idx:[] for agent_idx in range(self.n_agents)}
self.action_probabilities = {agent_idx: [] for agent_idx in range(self.n_agents)}
def setup(self):
dirt_piles_positions = [self.factory.state.entities[nms.DIRT_PILES][pile_idx].pos for pile_idx in
range(len(self.factory.state.entities[nms.DIRT_PILES]))]
self.obs_dim = 2 + 2*len(dirt_piles_positions) if self.cfg[nms.ALGORITHM][nms.PILE_OBSERVABILITY] == nms.ALL else 4
""" Initialize agents and create entry for run results according to configuration """
self.obs_dim = 2 + 2 * len(get_dirt_piles_positions(self.factory)) if self.cfg[nms.ALGORITHM][
nms.PILE_OBSERVABILITY] == nms.ALL else 4
self.act_dim = 4 # The 4 movement directions
self.agents = [PolicyGradient(self.factory, agent_id=i, obs_dim=self.obs_dim, act_dim=self.act_dim) for i in range(self.n_agents)]
self.agents = [PolicyGradient(self.factory, agent_id=i, obs_dim=self.obs_dim, act_dim=self.act_dim) for i in
range(self.n_agents)]
if self.cfg[nms.ENV][nms.SAVE_AND_LOG]:
# Create results folder
runs = os.listdir("../study_out/")
runs = os.listdir("./study_out/")
run_numbers = [int(run[3:]) for run in runs if run[:3] == "run"]
next_run_number = max(run_numbers)+1 if run_numbers else 0
self.results_path = f"../study_out/run{next_run_number}"
next_run_number = max(run_numbers) + 1 if run_numbers else 0
self.results_path = f"./study_out/run{next_run_number}"
os.mkdir(self.results_path)
# Save settings in results folder
save_configs(self.results_path, self.cfg, self.factory.conf, self.eval_factory.conf)
def set_cfg(self, eval=False):
""" Set the mode of the current configuration """
if eval:
self.cfg = self.eval_cfg
else:
self.cfg = self.train_cfg
def load_agents(self, runs_list):
""" Initialize networks with parameters of already trained agents """
for idx, run in enumerate(runs_list):
run_path = f"../study_out/{run}"
run_path = f"./study_out/{run}"
self.agents[idx].pi.load_model_parameters(f"{run_path}/PolicyNet_model_parameters.pth")
self.agents[idx].vf.load_model_parameters(f"{run_path}/ValueNet_model_parameters.pth")
@torch.no_grad()
def train_loop(self):
""" Function for training agents """
env = self.factory
n_steps, max_steps = [self.cfg[nms.ALGORITHM][k] for k in [nms.N_STEPS, nms.MAX_STEPS]]
global_steps, episode = 0, 0
indices = distribute_indices(env, self.cfg, self.n_agents)
dirt_piles_positions = get_dirt_piles_positions(env)
used_actions = {i:0 for i in range(len(env.state.entities[nms.AGENT][0]._actions))} # Assume both agents have the same actions
target_pile = [partition[0] for partition in indices] # pointer that points to the target pile for each agent. (point to same pile, point to different piles)
cleaned_dirt_piles = [{pos: False for pos in dirt_piles_positions} for _ in range(self.n_agents)] # Have own dictionary for each agent
target_pile = [partition[0] for partition in
indices] # list of pointers that point to the current target pile for each agent
cleaned_dirt_piles = [{pos: False for pos in dirt_piles_positions} for _ in range(self.n_agents)]
pbar = tqdm(total=max_steps)
while global_steps < max_steps:
print(global_steps)
obs = env.reset()
_ = env.reset()
if self.cfg[nms.ENV][nms.TRAIN_RENDER]:
env.render()
set_agent_spawnpoint(env, self.n_agents)
set_agents_spawnpoints(env, self.n_agents)
ordered_dirt_piles = get_ordered_dirt_piles(env, cleaned_dirt_piles, self.cfg, self.n_agents)
# Reset current target pile at episode begin if all piles have to be cleaned in one episode
if self.cfg[nms.ALGORITHM][nms.PILE_ALL_DONE] == nms.ALL:
target_pile = [partition[0] for partition in indices]
cleaned_dirt_piles = [{pos: False for pos in dirt_piles_positions} for _ in range(self.n_agents)]
# Supply each agent with its local observation
obs = transform_observations(env, ordered_dirt_piles, target_pile, self.cfg, self.n_agents)
done, rew_log = [False] * self.n_agents, 0
print("Agents spawnpoints:", [env.state.moving_entites[agent_idx].pos for agent_idx in range(self.n_agents)])
print("Agents target piles:", target_pile)
print("Agents initial observation:", obs)
print("Agents cleaned dirt piles:", cleaned_dirt_piles)
done, rew_log = [False] * self.n_agents, 0
while not all(done):
# 0="North", 1="East", 2="South", 3="West", 4="Clean", 5="Noop"
action = self.use_door_or_move(env, obs, cleaned_dirt_piles) \
if nms.DOORS in env.state.entities.keys() else self.get_actions(obs)
used_actions[int(action[0])] += 1
_, next_obs, reward, done, info = env.step(action)
if done:
print("DoneAtMaxStepsReached:", len(self.agents[0]._episode))
next_obs = transform_observations(env, ordered_dirt_piles, target_pile, self.cfg, self.n_agents)
reward, done = self.handle_dirt(env, cleaned_dirt_piles, ordered_dirt_piles, target_pile, indices, reward, done)
# Handle case where agent is on field with dirt
reward, done = self.handle_dirt(env, cleaned_dirt_piles, ordered_dirt_piles, target_pile, indices,
reward, done)
if n_steps != 0 and (global_steps + 1) % n_steps == 0:
print("max_steps reached")
done = True
if n_steps != 0 and (global_steps + 1) % n_steps == 0: done = True
done = [done] * self.n_agents if isinstance(done, bool) else done
for ag_i, agent in enumerate(self.agents):
# For forced actions like door opening, we still have to call the step function with this action, but
# since the action index lies outside the policy's action space (act_dim), we can't log the corresponding step info.
if action[ag_i] in range(self.act_dim):
# Add agent results into respective rollout buffers
agent._episode[-1] = (next_obs[ag_i], action[ag_i], reward[ag_i], agent._episode[-1][-1])
if self.cfg[nms.ENV][nms.TRAIN_RENDER]:
env.render()
# Visualize state update
if self.cfg[nms.ENV][nms.TRAIN_RENDER]: env.render()
obs = next_obs
@@ -123,97 +119,93 @@ class A2C:
global_steps += 1
rew_log += sum(reward)
if global_steps >= max_steps:
break
if global_steps >= max_steps: break
print(f'reward at episode: {episode} = {rew_log}')
self.reward_development.append(rew_log)
episode += 1
pbar.update(global_steps - pbar.n)
plot_reward_development(self.reward_development, self.cfg, self.results_path)
pbar.close()
if self.cfg[nms.ENV][nms.SAVE_AND_LOG]:
create_info_maps(env, used_actions, get_all_observations(env, self.cfg, self.n_agents),
plot_reward_development(self.reward_development, self.results_path)
create_info_maps(env, get_all_observations(env, self.cfg, self.n_agents),
get_dirt_piles_positions(env), self.results_path, self.agents, self.act_dim, self)
save_agent_models(self.results_path, self.agents)
plot_action_maps(env, [self], self.results_path)
@torch.inference_mode(True)
def eval_loop(self, n_episodes, render=False):
def eval_loop(self, n_episodes):
""" Function for performing inference """
env = self.eval_factory
self.set_cfg(eval=True)
episode, results = 0, []
dirt_piles_positions = get_dirt_piles_positions(env)
indices = distribute_indices(env, self.cfg, self.n_agents)
target_pile = [partition[0] for partition in indices] # pointer that points to the target pile for each agent. (point to same pile/ point to different piles)
target_pile = [partition[0] for partition in
indices] # list of pointers that point to the current target pile for each agent
if self.cfg[nms.ALGORITHM][nms.PILE_ALL_DONE] == nms.DISTRIBUTED:
cleaned_dirt_piles = [{dirt_piles_positions[idx]: False for idx in indices[i]} for i in range(self.n_agents)]
else:
cleaned_dirt_piles = [{pos: False for pos in dirt_piles_positions} for _ in range(self.n_agents)]
cleaned_dirt_piles = [{dirt_piles_positions[idx]: False for idx in indices[i]} for i in
range(self.n_agents)]
else: cleaned_dirt_piles = [{pos: False for pos in dirt_piles_positions} for _ in range(self.n_agents)]
while episode < n_episodes:
obs = env.reset()
set_agent_spawnpoint(env, self.n_agents)
_ = env.reset()
set_agents_spawnpoints(env, self.n_agents)
if self.cfg[nms.ENV][nms.EVAL_RENDER]:
# Don't render auxiliary piles
if self.cfg[nms.ALGORITHM][nms.AUXILIARY_PILES]:
# Don't render auxiliary piles
auxiliary_piles = [pile for idx, pile in enumerate(env.state.entities[nms.DIRT_PILES]) if idx % 2 == 0]
auxiliary_piles = [pile for idx, pile in enumerate(env.state.entities[nms.DIRT_PILES]) if
idx % 2 == 0]
for pile in auxiliary_piles:
pile.set_new_amount(0)
env.render()
env._renderer.fps = 5 # Slow down agent movement
env._renderer.fps = 5 # Slow down agent movement
# Reset current target pile at episode begin if all piles have to be cleaned in one episode
if self.cfg[nms.ALGORITHM][nms.PILE_ALL_DONE] in [nms.ALL, nms.DISTRIBUTED, nms.SHARED]:
target_pile = [partition[0] for partition in indices]
if self.cfg[nms.ALGORITHM][nms.PILE_ALL_DONE] == nms.DISTRIBUTED:
cleaned_dirt_piles = [{dirt_piles_positions[idx]: False for idx in indices[i]} for i in range(self.n_agents)]
else:
cleaned_dirt_piles = [{pos: False for pos in dirt_piles_positions} for _ in range(self.n_agents)]
cleaned_dirt_piles = [{dirt_piles_positions[idx]: False for idx in indices[i]} for i in
range(self.n_agents)]
else: cleaned_dirt_piles = [{pos: False for pos in dirt_piles_positions} for _ in range(self.n_agents)]
ordered_dirt_piles = get_ordered_dirt_piles(env, cleaned_dirt_piles, self.cfg, self.n_agents)
# Supply each agent with its local observation
obs = transform_observations(env, ordered_dirt_piles, target_pile, self.cfg, self.n_agents)
done, rew_log, eps_rew = [False] * self.n_agents, 0, torch.zeros(self.n_agents)
while not all(done):
action = self.use_door_or_move(env, obs, cleaned_dirt_piles, det=True) \
if nms.DOORS in env.state.entities.keys() else self.execute_policy(obs, env, cleaned_dirt_piles) # zero exploration
_, next_obs, reward, done, info = env.step(action) # Note that this call seems to flip the lists in indices
if done:
print("DoneAtMaxStepsReached:", len(self.agents[0]._episode))
if nms.DOORS in env.state.entities.keys() else self.execute_policy(obs, env,
cleaned_dirt_piles) # zero exploration
_, next_obs, reward, done, info = env.step(action)
# Add small negative reward if agent has moved away from the target_pile
# reward = self.reward_distance(env, obs, target_pile, reward)
# Handle case where agent is on field with dirt
reward, done = self.handle_dirt(env, cleaned_dirt_piles, ordered_dirt_piles, target_pile, indices,
reward, done)
# Check and handle if agent is on field with dirt
reward, done = self.handle_dirt(env, cleaned_dirt_piles, ordered_dirt_piles, target_pile, indices, reward, done)
# Get transformed next_obs that might have been updated because of self.handle_dirt.
# For eval, where pile_all_done is "all", the potential change of the target pile caused by self.handle_dirt
# must already be reflected in the observation when the next action is calculated.
# Get transformed next_obs that might have been updated because of handle_dirt
next_obs = transform_observations(env, ordered_dirt_piles, target_pile, self.cfg, self.n_agents)
done = [done] * self.n_agents if isinstance(done, bool) else done
if self.cfg[nms.ENV][nms.EVAL_RENDER]:
env.render()
if self.cfg[nms.ENV][nms.EVAL_RENDER]: env.render()
obs = next_obs
episode += 1
########## Helper functions ########
def get_actions(self, observations) -> ListOrTensor:
# Given an observation, get actions for both agents
""" Given local observations, get actions for both agents """
actions = [agent.step(_as_torch(observations[ag_i]).view(-1).to(torch.float32)) for ag_i, agent in
enumerate(self.agents)]
return actions
def execute_policy(self, observations, env, cleaned_dirt_piles) -> ListOrTensor:
# Use deterministic policy for inference
""" Execute agent policies deterministically for inference """
actions = [agent.policy(_as_torch(observations[ag_i]).view(-1).to(torch.float32)) for ag_i, agent in
enumerate(self.agents)]
for agent_idx in range(self.n_agents):
@@ -224,10 +216,11 @@ class A2C:
return actions
def use_door_or_move(self, env, obs, cleaned_dirt_piles, det=False):
""" Function that handles automatic actions like door opening and forced Noop"""
action = []
for agent_idx, agent in enumerate(self.agents):
agent_obs = _as_torch((obs)[agent_idx]).view(-1).to(torch.float32)
# If agent already reached its target
# Use Noop operation if agent already reached its target. (Only relevant for two-rooms setting)
if all(cleaned_dirt_piles[agent_idx].values()):
action.append(next(action_i for action_i, a in enumerate(env.state[nms.AGENT][agent_idx].actions) if
a.name == nms.NOOP))
@@ -235,37 +228,33 @@ class A2C:
# Include agent experience entry manually
agent._episode.append((None, None, None, agent.vf(agent_obs)))
else:
if door := door_is_close(env, agent_idx):
if door := is_door_close(env, agent_idx):
if door.is_closed:
action.append(next(
action_i for action_i, a in enumerate(env.state[nms.AGENT][agent_idx].actions) if
a.name == nms.USE_DOOR))
# Don't include action in agent experience
else:
if det:
action.append(int(agent.pi(agent_obs, det=True)[0]))
else:
action.append(int(agent.step(agent_obs)))
if det: action.append(int(agent.pi(agent_obs, det=True)[0]))
else: action.append(int(agent.step(agent_obs)))
else:
if det:
action.append(int(agent.pi(agent_obs, det=True)[0]))
else:
action.append(int(agent.step(agent_obs)))
if det: action.append(int(agent.pi(agent_obs, det=True)[0]))
else: action.append(int(agent.step(agent_obs)))
return action
def handle_dirt(self, env, cleaned_dirt_piles, ordered_dirt_piles, target_pile, indices, reward, done):
# Check if agent moved on field with dirt. If that is the case collect dirt automatically
agent_positions = [env.state.moving_entites[agent_idx].pos for agent_idx in range(self.n_agents)]
""" Check if agent moved on field with dirt. If that is the case collect dirt automatically """
agents_positions = get_agents_positions(env, self.n_agents)
dirt_piles_positions = get_dirt_piles_positions(env)
if any([True for pos in agent_positions if pos in dirt_piles_positions]):
if any([True for pos in agents_positions if pos in dirt_piles_positions]):
# Only simulate collecting the dirt
for idx, pos in enumerate(agent_positions):
for idx, pos in enumerate(agents_positions):
if pos in cleaned_dirt_piles[idx].keys() and not cleaned_dirt_piles[idx][pos]:
# If dirt piles should be cleaned in a specific order
if ordered_dirt_piles[idx]:
if pos == ordered_dirt_piles[idx][target_pile[idx]]:
reward[idx] += 50 # 1
reward[idx] += 50
cleaned_dirt_piles[idx][pos] = True
# Set pointer to next dirt pile
update_target_pile(env, idx, target_pile, indices, self.cfg)
@@ -278,7 +267,7 @@ class A2C:
for pos in dirt_piles_positions:
cleaned_dirt_piles[idx][pos] = False
else:
reward[idx] += 50 # 1
reward[idx] += 50
cleaned_dirt_piles[idx][pos] = True
# Indicate that renderer can hide dirt pile
@@ -294,4 +283,3 @@ class A2C:
done = True
return reward, done
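
To make the bookkeeping that train_loop, eval_loop and handle_dirt share easier to follow, here is a minimal, self-contained sketch of the same pointer mechanics (the pile coordinates, the two-agent split and the simplified pointer update are invented for illustration, not taken from the repository):

# Illustrative sketch of the per-agent bookkeeping used above; all values are made up.
dirt_piles_positions = [(1, 1), (4, 2), (7, 5), (2, 8)]
indices = [[0, 1], [2, 3]]                              # distribute_indices: piles 0/1 -> agent 0, piles 2/3 -> agent 1
target_pile = [partition[0] for partition in indices]   # one pointer per agent, here [0, 2]
cleaned_dirt_piles = [{pos: False for pos in dirt_piles_positions} for _ in range(2)]

# Agent 0 reaches its current target pile (1, 1): mark it as cleaned and, similar to update_target_pile,
# advance the pointer to the next pile of its partition (simplified here).
cleaned_dirt_piles[0][(1, 1)] = True
if target_pile[0] + 1 in indices[0]:
    target_pile[0] += 1                                 # agent 0 now targets pile index 1, i.e. (4, 2)

# Simplified end-of-episode check: every entry in every agent's dictionary is marked as cleaned.
done = all(all(cleaned.values()) for cleaned in cleaned_dirt_piles)
print(target_pile, done)                                # -> [1, 2] False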

View File

@@ -10,6 +10,7 @@ from marl_factory_grid.algorithms.rl.constants import Names
nms = Names
def _as_torch(x):
""" Helper function to convert different list types to a torch tensor """
if isinstance(x, np.ndarray):
return torch.from_numpy(x)
elif isinstance(x, List):
@@ -20,15 +21,16 @@ def _as_torch(x):
def transform_observations(env, ordered_dirt_piles, target_pile, cfg, n_agents):
""" Requires that agent has observations -DirtPiles and -Self """
agent_positions = [env.state.moving_entites[agent_idx].pos for agent_idx in range(n_agents)]
""" Function that extracts local observations from global state
Requires that agents have observations -DirtPiles and -Self (cf. environment configs) """
agents_positions = get_agents_positions(env, n_agents)
pile_observability_is_all = cfg[nms.ALGORITHM][nms.PILE_OBSERVABILITY] == nms.ALL
if pile_observability_is_all:
trans_obs = [torch.zeros(2+2*len(ordered_dirt_piles[0])) for _ in range(len(agent_positions))]
trans_obs = [torch.zeros(2+2*len(ordered_dirt_piles[0])) for _ in range(len(agents_positions))]
else:
# Only show current target pile
trans_obs = [torch.zeros(4) for _ in range(len(agent_positions))]
for i, pos in enumerate(agent_positions):
trans_obs = [torch.zeros(4) for _ in range(len(agents_positions))]
for i, pos in enumerate(agents_positions):
agent_x, agent_y = pos[0], pos[1]
trans_obs[i][0] = agent_x
trans_obs[i][1] = agent_y
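
For illustration (coordinates invented; the remaining entries are filled in by the elided rest of this function), the flat observation vector built here looks roughly like this:

import torch

# One agent at (4, 4), dirt piles at (2, 3) and (5, 1); purely illustrative values.
# pile_observability == "all": obs_dim = 2 + 2 * n_piles -> [agent_x, agent_y, pile1_x, pile1_y, pile2_x, pile2_y]
obs_all = torch.tensor([4., 4., 2., 3., 5., 1.])
# otherwise only the current target pile is appended: obs_dim = 4 -> [agent_x, agent_y, target_x, target_y]
obs_single = torch.tensor([4., 4., 5., 1.])
print(obs_all.shape, obs_single.shape)  # torch.Size([6]) torch.Size([4])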
@@ -45,6 +47,7 @@ def transform_observations(env, ordered_dirt_piles, target_pile, cfg, n_agents):
def get_all_observations(env, cfg, n_agents):
""" Helper function that returns all possible agent observations """
dirt_piles_positions = [env.state.entities[nms.DIRT_PILES][pile_idx].pos for pile_idx in
range(len(env.state.entities[nms.DIRT_PILES]))]
if cfg[nms.ALGORITHM][nms.PILE_OBSERVABILITY] == nms.ALL:
@@ -76,41 +79,48 @@ def get_all_observations(env, cfg, n_agents):
def get_dirt_piles_positions(env):
""" Get positions of dirt piles on the map """
return [env.state.entities[nms.DIRT_PILES][pile_idx].pos for pile_idx in range(len(env.state.entities[nms.DIRT_PILES]))]
def get_agents_positions(env, n_agents):
""" Get positions of agents on the map """
return [env.state.moving_entites[agent_idx].pos for agent_idx in range(n_agents)]
def get_ordered_dirt_piles(env, cleaned_dirt_piles, cfg, n_agents):
""" Each agent can have its individual pile order """
""" This function determines in which order the agents should clean the dirt piles
Each agent can have its individual pile order """
ordered_dirt_piles = [[] for _ in range(n_agents)]
dirt_pile_positions = get_dirt_piles_positions(env)
agent_positions = [env.state.moving_entites[agent_idx].pos for agent_idx in range(n_agents)]
dirt_piles_positions = get_dirt_piles_positions(env)
agents_positions = get_agents_positions(env, n_agents)
for agent_idx in range(n_agents):
if cfg[nms.ALGORITHM][nms.PILE_ORDER] in [nms.FIXED, nms.AGENTS]:
ordered_dirt_piles[agent_idx] = dirt_pile_positions
ordered_dirt_piles[agent_idx] = dirt_piles_positions
elif cfg[nms.ALGORITHM][nms.PILE_ORDER] in [nms.SMART, nms.DYNAMIC]:
# Calculate distances for remaining unvisited dirt piles
remaining_target_piles = [pos for pos, value in cleaned_dirt_piles[agent_idx].items() if not value]
pile_distances = {pos:0 for pos in remaining_target_piles}
agent_pos = agent_positions[agent_idx]
agent_pos = agents_positions[agent_idx]
for pos in remaining_target_piles:
pile_distances[pos] = np.abs(agent_pos[0] - pos[0]) + np.abs(agent_pos[1] - pos[1])
if cfg[nms.ALGORITHM][nms.PILE_ORDER] == nms.SMART:
# Check if there is an agent in line with any of the remaining dirt piles
# Check if there is an agent on the direct path to any of the remaining dirt piles
for pile_pos in remaining_target_piles:
for other_pos in agent_positions:
for other_pos in agents_positions:
if other_pos != agent_pos:
if agent_pos[0] == other_pos[0] == pile_pos[0] or agent_pos[1] == other_pos[1] == pile_pos[1]:
# Get the line between the agent and the goal
# Get the line between the agent and the target
path = bresenham(agent_pos[0], agent_pos[1], pile_pos[0], pile_pos[1])
# Check if the entity lies on the path between the agent and the goal
# Check if the entity lies on the path between the agent and the target
if other_pos in path:
pile_distances[pile_pos] += np.abs(agent_pos[0] - other_pos[0]) + np.abs(agent_pos[1] - other_pos[1])
sorted_pile_distances = dict(sorted(pile_distances.items(), key=lambda item: item[1]))
# Insert already visited dirt piles
ordered_dirt_piles[agent_idx] = [pos for pos in dirt_pile_positions if pos not in remaining_target_piles]
ordered_dirt_piles[agent_idx] = [pos for pos in dirt_piles_positions if pos not in remaining_target_piles]
# Fill up with sorted positions
for pos in sorted_pile_distances.keys():
ordered_dirt_piles[agent_idx].append(pos)
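
The SMART branch above relies on bresenham(x0, y0, x1, y1), defined further down in this file outside the visible hunks, to return the grid cells on the straight line between an agent and a pile. A generic sketch of such a helper and of the blocking check, for illustration only; the repository's implementation may differ in details:

def bresenham_sketch(x0, y0, x1, y1):
    """ Generic integer Bresenham line: all grid cells from (x0, y0) to (x1, y1), inclusive. """
    cells = []
    dx, dy = abs(x1 - x0), -abs(y1 - y0)
    sx = 1 if x0 < x1 else -1
    sy = 1 if y0 < y1 else -1
    err = dx + dy
    while True:
        cells.append((x0, y0))
        if x0 == x1 and y0 == y1:
            break
        e2 = 2 * err
        if e2 >= dy:
            err += dy
            x0 += sx
        if e2 <= dx:
            err += dx
            y0 += sy
    return cells


# Blocking check as in the SMART ordering: another agent at (3, 2) sits on the straight path
# from the agent at (1, 2) to the pile at (6, 2), so this pile's distance gets penalised.
path = bresenham_sketch(1, 2, 6, 2)
print((3, 2) in path)  # True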
@@ -145,6 +155,7 @@ def bresenham(x0, y0, x1, y1):
def update_ordered_dirt_piles(agent_idx, cleaned_dirt_piles, ordered_dirt_piles, env, cfg, n_agents):
""" Update the order of the remaining dirt piles """
# Only update ordered_dirt_pile for agent that reached its target pile
updated_ordered_dirt_piles = get_ordered_dirt_piles(env, cleaned_dirt_piles, cfg, n_agents)
for i in range(len(ordered_dirt_piles[agent_idx])):
@@ -152,8 +163,10 @@ def update_ordered_dirt_piles(agent_idx, cleaned_dirt_piles, ordered_dirt_piles,
def distribute_indices(env, cfg, n_agents):
""" Distribute dirt piles evenly among the agents """
indices = []
n_dirt_piles = len(get_dirt_piles_positions(env))
agents_positions = get_agents_positions(env, n_agents)
if n_dirt_piles == 1 or cfg[nms.ALGORITHM][nms.PILE_ORDER] in [nms.FIXED, nms.DYNAMIC, nms.SMART]:
indices = [[0] for _ in range(n_agents)]
else:
@@ -171,12 +184,11 @@ def distribute_indices(env, cfg, n_agents):
# -> Starting with index 0 even piles are auxiliary piles, odd piles are primary piles
if cfg[nms.ALGORITHM][nms.AUXILIARY_PILES] and nms.DOORS in env.state.entities.keys():
door_positions = [door.pos for door in env.state.entities[nms.DOORS]]
agent_positions = [env.state.moving_entites[agent_idx].pos for agent_idx in range(n_agents)]
distances = {door_pos:[] for door_pos in door_positions}
# Calculate distance of every agent to every door
for door_pos in door_positions:
for agent_pos in agent_positions:
for agent_pos in agents_positions:
distances[door_pos].append(np.abs(door_pos[0] - agent_pos[0]) + np.abs(door_pos[1] - agent_pos[1]))
def duplicate_indices(lst, item):
@@ -213,6 +225,7 @@ def distribute_indices(env, cfg, n_agents):
def update_target_pile(env, agent_idx, target_pile, indices, cfg):
""" Get the next target pile for a given agent """
if cfg[nms.ALGORITHM][nms.PILE_ORDER] in [nms.FIXED, nms.DYNAMIC, nms.SMART]:
if target_pile[agent_idx] + 1 < len(get_dirt_piles_positions(env)):
target_pile[agent_idx] += 1
@@ -223,7 +236,8 @@ def update_target_pile(env, agent_idx, target_pile, indices, cfg):
target_pile[agent_idx] += 1
def door_is_close(env, agent_idx):
def is_door_close(env, agent_idx):
""" Checks whether the agent is close to a door """
neighbourhood = [y for x in env.state.entities.neighboring_positions(env.state[nms.AGENT][agent_idx].pos)
for y in env.state.entities.pos_dict[x] if nms.DOOR in y.name]
if neighbourhood:
@@ -231,6 +245,7 @@ def door_is_close(env, agent_idx):
def get_all_cleaned_dirt_piles(dirt_piles_positions, cleaned_dirt_piles, n_agents):
""" Returns all dirt piles cleaned by any agent """
meta_cleaned_dirt_piles = {pos: False for pos in dirt_piles_positions}
for agent_idx in range(n_agents):
for (pos, cleaned) in cleaned_dirt_piles[agent_idx].items():
@@ -240,6 +255,7 @@ def get_all_cleaned_dirt_piles(dirt_piles_positions, cleaned_dirt_piles, n_agent
def handle_finished_episode(obs, agents, cfg):
""" Finish up episode, calculate advantages and perform policy net and value net updates"""
with torch.inference_mode(False):
for ag_i, agent in enumerate(agents):
# Get states, actions, rewards and values from rollout buffer
@@ -268,6 +284,7 @@ def handle_finished_episode(obs, agents, cfg):
def split_into_chunks(data_tuple, cfg):
""" Chunks episode data into approximately equal sized chunks to prevent system memory failure from overload """
result = [data_tuple]
chunk_size = cfg[nms.ALGORITHM][nms.CHUNK_EPISODE]
if chunk_size > 0:
@@ -286,7 +303,8 @@ def split_into_chunks(data_tuple, cfg):
return result
def set_agent_spawnpoint(env, n_agents):
def set_agents_spawnpoints(env, n_agents):
""" Tell environment where the agents should spawn in the next episode """
for agent_idx in range(n_agents):
agent_name = list(env.state.agents_conf.keys())[agent_idx]
current_pos_pointer = env.state.agents_conf[agent_name][nms.POS_POINTER]
@@ -299,6 +317,7 @@ def set_agent_spawnpoint(env, n_agents):
def save_configs(results_path, cfg, factory_conf, eval_factory_conf):
""" Save configurations for logging purposes """
with open(f"{results_path}/MARL_config.txt", "w") as txt_file:
txt_file.write(str(cfg))
with open(f"{results_path}/train_env_config.txt", "w") as txt_file:
@@ -308,6 +327,7 @@ def save_configs(results_path, cfg, factory_conf, eval_factory_conf):
def save_agent_models(results_path, agents):
""" Save model parameters after training """
for idx, agent in enumerate(agents):
agent.pi.save_model_parameters(results_path)
agent.vf.save_model_parameters(results_path)
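
For readers unfamiliar with the update performed in handle_finished_episode, a minimal, generic A2C-style advantage computation is sketched below (the rewards, values and discount factor are invented; the repository's cumulate_discount and the exact update may differ):

import numpy as np


def discounted_returns(rewards, gamma):
    """ Generic stand-in for a cumulative discount: G_t = r_t + gamma * G_(t+1). """
    returns = np.zeros(len(rewards))
    running = 0.0
    for t in reversed(range(len(rewards))):
        running = rewards[t] + gamma * running
        returns[t] = running
    return returns


# One short episode from a rollout buffer (values are made up).
rewards = np.array([0.0, 0.0, 50.0])   # agent reaches its target pile in the last step
values = np.array([10.0, 20.0, 45.0])  # value net predictions for the visited states
gamma = 0.99

returns = discounted_returns(rewards, gamma)  # [49.005, 49.5, 50.0]
advantages = returns - values                 # policy net is pushed towards actions with positive advantage
value_targets = returns                       # value net regresses towards the discounted returns
print(advantages)                             # advantages: 39.005, 29.5, 5.0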

View File

@@ -0,0 +1,61 @@
import os
from pathlib import Path

from tqdm import trange

from marl_factory_grid import Factory
from marl_factory_grid.algorithms.tsp.contortions import get_dirt_quadrant_tsp_agents, get_two_rooms_tsp_agents


def dirt_quadrant_multi_agent_tsp_eval(emergent_phenomenon):
    run_tsp_setting("dirt_quadrant", emergent_phenomenon)


def two_rooms_multi_agent_tsp_eval(emergent_phenomenon):
    run_tsp_setting("two_rooms", emergent_phenomenon)


def run_tsp_setting(config_name, emergent_phenomenon, n_episodes=1):
    # Render at each step?
    render = True
    # Path to config File
    path = Path(f'./marl_factory_grid/environment/configs/tsp/{config_name}.yaml')
    # Create results folder
    runs = os.listdir("./study_out/")
    run_numbers = [int(run[7:]) for run in runs if run[:7] == "tsp_run"]
    next_run_number = max(run_numbers) + 1 if run_numbers else 0
    results_path = f"./study_out/tsp_run{next_run_number}"
    os.mkdir(results_path)
    # Env Init
    factory = Factory(path)
    with open(f"{results_path}/env_config.txt", "w") as txt_file:
        txt_file.write(str(factory.conf))
    for episode in trange(n_episodes):
        _ = factory.reset()
        done = False
        if render:
            factory.render()
            factory._renderer.fps = 5
        if config_name == "dirt_quadrant":
            agents = get_dirt_quadrant_tsp_agents(emergent_phenomenon, factory)
        elif config_name == "two_rooms":
            agents = get_two_rooms_tsp_agents(emergent_phenomenon, factory)
        else:
            print("Config name does not exist. Abort...")
            break
        while not done:
            a = [x.predict() for x in agents]
            # Terminate as soon as all dirt piles are collected, so that the termination condition of the
            # TSP agents matches that of the RL agents
            if 'DirtPiles' in list(factory.state.entities.keys()) and factory.state.entities['DirtPiles'].global_amount == 0.0:
                break
            obs_type, _, _, done, info = factory.step(a)
            if render:
                factory.render()
            if done:
                break

View File

@@ -0,0 +1,55 @@
import numpy as np

from marl_factory_grid.algorithms.tsp.TSP_dirt_agent import TSPDirtAgent
from marl_factory_grid.algorithms.tsp.TSP_target_agent import TSPTargetAgent


def get_dirt_quadrant_tsp_agents(emergent_phenomenon, factory):
    agents = [TSPDirtAgent(factory, 0), TSPDirtAgent(factory, 1)]
    if not emergent_phenomenon:
        edge_costs = {}
        # Add costs for horizontal edges
        for i in range(1, 10):
            for j in range(1, 9):
                # Add costs for both traversal directions
                edge_costs[f"{(i, j)}-{i, j + 1}"] = 0.55 + (i - 1) * 0.05
                edge_costs[f"{i, j + 1}-{(i, j)}"] = 0.55 + (i - 1) * 0.05
        # Add costs for vertical edges
        for i in range(1, 9):
            for j in range(1, 10):
                # Add costs for both traversal directions
                edge_costs[f"{(i, j)}-{i + 1, j}"] = 0.55 + (i) * 0.05
                edge_costs[f"{i + 1, j}-{(i, j)}"] = 0.55 + (i - 1) * 0.05
        for agent in agents:
            for u, v, weight in agent._position_graph.edges(data='weight'):
                agent._position_graph[u][v]['weight'] = edge_costs[f"{u}-{v}"]
    return agents


def get_two_rooms_tsp_agents(emergent_phenomenon, factory):
    agents = [TSPTargetAgent(factory, 0), TSPTargetAgent(factory, 1)]
    if not emergent_phenomenon:
        edge_costs = {}
        # Add costs for horizontal edges
        for i in range(1, 6):
            for j in range(1, 13):
                # Add costs for both traversal directions
                edge_costs[f"{(i, j)}-{i, j + 1}"] = np.abs(5/i*np.cbrt(((j+1)/4 - 1)) - 1)
                edge_costs[f"{i, j + 1}-{(i, j)}"] = np.abs(5/i*np.cbrt((j/4 - 1)) - 1)
        # Add costs for vertical edges
        for i in range(1, 5):
            for j in range(1, 14):
                # Add costs for both traversal directions
                edge_costs[f"{(i, j)}-{i + 1, j}"] = np.abs(5/(i+1)*np.cbrt((j/4 - 1)) - 1)
                edge_costs[f"{i + 1, j}-{(i, j)}"] = np.abs(5/i*np.cbrt((j/4 - 1)) - 1)
        for agent in agents:
            for u, v, weight in agent._position_graph.edges(data='weight'):
                agent._position_graph[u][v]['weight'] = edge_costs[f"{u}-{v}"]
    return agents

View File

@@ -58,7 +58,7 @@ def load_yaml_file(path: Path):
def add_env_props(cfg):
# Path to config File
env_path = Path(f'../marl_factory_grid/environment/configs/{cfg["env"]["env_name"]}.yaml')
env_path = Path(f'./marl_factory_grid/environment/configs/{cfg["env"]["env_name"]}.yaml')
# Env Init
factory = Factory(env_path)