Mirror of https://github.com/illiumst/marl-factory-grid.git (synced 2025-07-08 02:21:36 +02:00)
Commit: Code cleaning part 2
@ -1,34 +0,0 @@
agent:
  classname: marl_factory_grid.algorithms.marl.networks.RecurrentAC
  n_agents: 2
  obs_emb_size: 96
  action_emb_size: 16
  hidden_size_actor: 64
  hidden_size_critic: 64
  use_agent_embedding: False
env:
  classname: marl_factory_grid.environment.configs.marl_eval
  env_name: "marl_eval/dirt_quadrant_eval_config"
  n_agents: 2
  max_steps: 250
  pomdp_r: 2
  stack_n_frames: 0
  individual_rewards: True
  train_render: False
  eval_render: True
  save_and_log: True
  record: False
method: marl_factory_grid.algorithms.marl.LoopSEAC
algorithm:
  gamma: 0.99
  entropy_coef: 0.01
  vf_coef: 0.05
  n_steps: 0 # How much experience should be sampled at most (n-TD) until the next value and policy update is performed. Default 0: MC
  max_steps: 200000
  advantage: "Advantage-AC" # Options: "Advantage-AC", "TD-Advantage-AC", "Reinforce"
  pile-order: "dynamic" # Use "dynamic" to see emergent phenomenon and "smart" to prevent it
  pile-observability: "single" # Options: "single", "all"
  pile_all_done: "shared" # Options: "single", "all" ("single" for training, "all" for eval), "shared"
  auxiliary_piles: False # Option that is only considered when pile-order = "agents"
  chunk-episode: 20000 # Chunk size. (0 = update networks with full episode at once)
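The n_steps option switches between full-episode Monte-Carlo returns (the default, 0) and bootstrapped n-step TD targets. A minimal sketch of the two update targets, assuming plain NumPy arrays for rewards and critic values (the repo's cumulate_discount helper is not shown here):

# Illustrative sketch only; names and shapes are assumptions, not the repo's exact API.
import numpy as np

def discounted_returns(rewards, gamma=0.99):
    """Monte-Carlo returns, i.e. what n_steps: 0 falls back to."""
    returns = np.zeros(len(rewards))
    running = 0.0
    for t in reversed(range(len(rewards))):
        running = rewards[t] + gamma * running
        returns[t] = running
    return returns

def n_step_targets(rewards, values, gamma=0.99, n=5):
    """n-step TD targets used when n_steps > 0 (bootstraps from the critic)."""
    targets = np.zeros(len(rewards))
    for t in range(len(rewards)):
        end = min(t + n, len(rewards))
        g = sum(gamma ** (k - t) * rewards[k] for k in range(t, end))
        if end < len(rewards):
            g += gamma ** (end - t) * values[end]
        targets[t] = g
    return targets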
@ -1,35 +0,0 @@
agent:
  classname: marl_factory_grid.algorithms.marl.networks.RecurrentAC
  n_agents: 2
  obs_emb_size: 96
  action_emb_size: 16
  hidden_size_actor: 64
  hidden_size_critic: 64
  use_agent_embedding: False
env:
  classname: marl_factory_grid.environment.configs.marl_eval
  env_name: "marl_eval/two_rooms_eval_config"
  n_agents: 2
  max_steps: 250
  pomdp_r: 2
  stack_n_frames: 0
  individual_rewards: True
  train_render: False
  eval_render: True
  save_and_log: True
  record: False
method: marl_factory_grid.algorithms.marl.LoopSEAC
algorithm:
  gamma: 0.99
  entropy_coef: 0.01
  vf_coef: 0.05
  n_steps: 0 # How much experience should be sampled at most (n-TD) until the next value and policy update is performed. Default 0: MC
  max_steps: 260000
  advantage: "Advantage-AC" # Options: "Advantage-AC", "TD-Advantage-AC", "Reinforce"
  pile-order: "agents" # Options: "fixed", "random", "none", "agents", "dynamic", "smart" (Use "fixed", "random" and "none" for single agent training and the other for multi agent inference)
  pile-observability: "single" # Options: "single", "all"
  pile_all_done: "distributed" # Options: "single", "all" ("single" for training, "all" and "distributed" for eval)
  auxiliary_piles: True # Use True to see emergent phenomenon and False to prevent it
  chunk-episode: 20000 # Chunk size. (0 = update networks with full episode at once)
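The advantage option selects how the policy-gradient weight is computed. A rough sketch of the three variants listed in the comment, with array names assumed for illustration:

# Illustrative sketch only; variable names are assumptions, not the repo's exact code.
import numpy as np

def policy_weight(advantage_kind, returns, values, rewards, gamma=0.99):
    if advantage_kind == "Reinforce":
        return returns                                  # weight log-probs by the raw discounted return
    if advantage_kind == "Advantage-AC":
        return returns - values                         # Monte-Carlo advantage: return minus critic value
    if advantage_kind == "TD-Advantage-AC":
        next_values = np.append(values[1:], 0.0)
        return rewards + gamma * next_values - values   # one-step TD error
    raise ValueError(advantage_kind)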
@ -1,44 +1,23 @@
import copy
import os
import random

import imageio  # requires ffmpeg install on operating system and imageio-ffmpeg package for python
from scipy import signal
import matplotlib.pyplot as plt
import torch
from typing import Union, List, Dict
from typing import Union, List
import numpy as np
from torch.distributions import Categorical

from marl_factory_grid.algorithms.marl.base_a2c import PolicyGradient, cumulate_discount
from marl_factory_grid.algorithms.utils import add_env_props, instantiate_class
from pathlib import Path
from collections import deque

from marl_factory_grid.environment.actions import Noop
from marl_factory_grid.modules import Clean, DoorUse
from marl_factory_grid.algorithms.rl.base_a2c import PolicyGradient, cumulate_discount
from marl_factory_grid.algorithms.utils import add_env_props
from marl_factory_grid.utils.plotting.plot_single_runs import plot_action_maps


class Names:
REWARD = 'reward'
DONE = 'done'
ACTION = 'action'
OBSERVATION = 'observation'
LOGITS = 'logits'
HIDDEN_ACTOR = 'hidden_actor'
HIDDEN_CRITIC = 'hidden_critic'
AGENT = 'agent'
ENV = 'env'
ENV_NAME = 'env_name'
N_AGENTS = 'n_agents'
ALGORITHM = 'algorithm'
MAX_STEPS = 'max_steps'
N_STEPS = 'n_steps'
BUFFER_SIZE = 'buffer_size'
CRITIC = 'critic'
BATCH_SIZE = 'bnatch_size'
N_ACTIONS = 'n_actions'
TRAIN_RENDER = 'train_render'
EVAL_RENDER = 'eval_render'

@ -55,7 +34,7 @@ class A2C:
self.train_cfg = train_cfg
self.eval_cfg = eval_cfg
self.cfg = train_cfg
self.n_agents = train_cfg[nms.AGENT][nms.N_AGENTS]
self.n_agents = train_cfg[nms.ENV][nms.N_AGENTS]
self.setup()
self.reward_development = []
self.action_probabilities = {agent_idx:[] for agent_idx in range(self.n_agents)}

@ -80,8 +59,6 @@ class A2C:
os.mkdir(self.results_path)
# Save settings in results folder
self.save_configs()
if self.cfg[nms.ENV]["record"]:
self.recorder = imageio.get_writer(f'{self.results_path}/pygame_recording.mp4', fps=5)

def set_cfg(self, eval=False):
if eval:

@ -610,8 +587,6 @@ class A2C:
obs = env.reset()
self.set_agent_spawnpoint(env)
if self.cfg[nms.ENV][nms.EVAL_RENDER]:
if self.cfg[nms.ENV]["save_and_log"] and self.cfg[nms.ENV]["record"]:
env.set_recorder(self.recorder)
if self.cfg[nms.ALGORITHM]["auxiliary_piles"]:
# Don't render auxiliary piles
auxiliary_piles = [pile for idx, pile in enumerate(env.state.entities['DirtPiles']) if idx % 2 == 0]

@ -664,10 +639,6 @@ class A2C:

episode += 1

# Properly finalize the video file
if self.cfg[nms.ENV]["save_and_log"] and self.cfg[nms.ENV]["record"]:
self.recorder.close()

def plot_reward_development(self):
smoothed_data = np.convolve(self.reward_development, np.ones(10) / 10, mode='valid')
plt.plot(smoothed_data)

@ -689,16 +660,14 @@ class A2C:

def save_agent_models(self):
for idx, agent in enumerate(self.agents):
agent_name = list(self.factory.state.agents_conf.keys())[idx]
agent.pi.save_model_parameters(self.results_path, agent_name)
agent.vf.save_model_parameters(self.results_path, agent_name)
agent.pi.save_model_parameters(self.results_path)
agent.vf.save_model_parameters(self.results_path)

def load_agents(self, runs_list):
for idx, run in enumerate(runs_list):
run_path = f"../study_out/{run}"
agent_name = list(self.eval_factory.state.agents_conf.keys())[idx]
self.agents[idx].pi.load_model_parameters(f"{run_path}/{agent_name}_PolicyNet_model_parameters.pth")
self.agents[idx].vf.load_model_parameters(f"{run_path}/{agent_name}_ValueNet_model_parameters.pth")
self.agents[idx].pi.load_model_parameters(f"{run_path}/PolicyNet_model_parameters.pth")
self.agents[idx].vf.load_model_parameters(f"{run_path}/ValueNet_model_parameters.pth")

def create_info_maps(self, env, used_actions):
# Create value map
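The reward curve above is smoothed with a simple moving average before plotting. A minimal stand-alone sketch of that step, assuming a plain array of episode rewards:

# Sketch of the moving-average smoothing used for the reward plot (window size 10).
import numpy as np
import matplotlib.pyplot as plt

rewards = np.random.rand(100)       # placeholder for self.reward_development
window = np.ones(10) / 10           # uniform 10-episode window
smoothed = np.convolve(rewards, window, mode='valid')
plt.plot(smoothed)
plt.xlabel("Episode")
plt.ylabel("Smoothed reward")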
@ -19,11 +19,11 @@ class Net(th.nn.Module):
if module.bias is not None:
nn.init.uniform_(module.bias, a=-0.1, b=0.1)

def save_model(self, path, agent_name):
th.save(self.net, f"{path}/{agent_name}_{self.__class__.__name__}_model.pth")
def save_model(self, path):
th.save(self.net, f"{path}/{self.__class__.__name__}_model.pth")

def save_model_parameters(self, path, agent_name):
th.save(self.net.state_dict(), f"{path}/{agent_name}_{self.__class__.__name__}_model_parameters.pth")
def save_model_parameters(self, path):
th.save(self.net.state_dict(), f"{path}/{self.__class__.__name__}_model_parameters.pth")

def load_model_parameters(self, path):
self.net.load_state_dict(th.load(path))
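For reference, a hedged round trip of the renamed save/load helpers above; the run folder and the stand-in network are placeholders, not the repo's classes:

# Minimal round trip of the parameter save/load shown above.
import os
import torch as th
import torch.nn as nn

path = "../study_out/run0"                  # placeholder results folder
os.makedirs(path, exist_ok=True)
net = nn.Linear(4, 2)                       # stand-in for the actor/critic network
th.save(net.state_dict(), f"{path}/PolicyNet_model_parameters.pth")
net.load_state_dict(th.load(f"{path}/PolicyNet_model_parameters.pth"))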
@ -0,0 +1,11 @@
env:
  classname: marl_factory_grid.environment.configs.marl_eval
  env_name: "marl_eval/dirt_quadrant_eval_config"
  n_agents: 2
  eval_render: True
  save_and_log: False
algorithm:
  pile-order: "smart" # Use "dynamic" to see emergent phenomenon and "smart" to prevent it
  pile-observability: "single" # Options: "single", "all"
  pile_all_done: "shared" # Options: "single", "all" ("single" for training, "all" for eval), "shared"
  auxiliary_piles: False # Dirt quadrant does not use this option
@ -0,0 +1,11 @@
env:
  classname: marl_factory_grid.environment.configs.marl_eval
  env_name: "marl_eval/dirt_quadrant_eval_config"
  n_agents: 2
  eval_render: True
  save_and_log: False
algorithm:
  pile-order: "dynamic" # Use "dynamic" to see emergent phenomenon and "smart" to prevent it
  pile-observability: "single" # Options: "single", "all"
  pile_all_done: "shared" # Options: "single", "all" ("single" for training, "all" for eval), "shared"
  auxiliary_piles: False # Dirt quadrant does not use this option
@ -0,0 +1,13 @@
env:
  classname: marl_factory_grid.environment.configs.marl_eval
  env_name: "marl_eval/two_rooms_eval_config"
  n_agents: 2
  eval_render: True
  save_and_log: False
algorithm:
  pile-order: "agents" # Options: "fixed", "random", "none", "agents", "dynamic", "smart" (Use "fixed", "random" and "none" for single agent training and the other for multi agent inference)
  pile-observability: "single" # Options: "single", "all"
  pile_all_done: "distributed" # Options: "single", "all" ("single" for training, "all" and "distributed" for eval)
  auxiliary_piles: True # Use True to see emergent phenomenon and False to prevent it
@ -0,0 +1,13 @@
env:
  classname: marl_factory_grid.environment.configs.marl_eval
  env_name: "marl_eval/two_rooms_eval_config_emergent"
  n_agents: 2
  eval_render: True
  save_and_log: False
algorithm:
  pile-order: "agents" # Options: "fixed", "random", "none", "agents", "dynamic", "smart" (Use "fixed", "random" and "none" for single agent training and the other for multi agent inference)
  pile-observability: "single" # Options: "single", "all"
  pile_all_done: "distributed" # Options: "single", "all" ("single" for training, "all" and "distributed" for eval)
  auxiliary_piles: False # Use True to see emergent phenomenon and False to prevent it
@ -0,0 +1,12 @@
env:
  classname: marl_factory_grid.environment.configs.rl
  env_name: "rl/dirt_quadrant_agent1_eval_config"
  n_agents: 1
  eval_render: True
  save_and_log: False
algorithm:
  pile-order: "fixed" # Options: "fixed", "random", "none", "agents", "dynamic", "smart" (Use "fixed", "random" and "none" for single agent training and the other for multi agent inference)
  pile-observability: "single" # Options: "single", "all"
  pile_all_done: "all"
  auxiliary_piles: False # Dirt quadrant does not use this option
@ -1,34 +1,17 @@
agent:
  classname: marl_factory_grid.algorithms.marl.networks.RecurrentAC
  n_agents: 1
  obs_emb_size: 96
  action_emb_size: 16
  hidden_size_actor: 64
  hidden_size_critic: 64
  use_agent_embedding: False
env:
  classname: marl_factory_grid.environment.configs.rl
  env_name: "rl/dirt_quadrant_train_config"
  env_name: "rl/dirt_quadrant_agent1_train_config"
  n_agents: 1
  max_steps: 250
  pomdp_r: 2
  stack_n_frames: 0
  individual_rewards: True
  train_render: False
  eval_render: True
  save_and_log: True
  record: False
method: marl_factory_grid.algorithms.marl.LoopSEAC
algorithm:
  gamma: 0.99
  entropy_coef: 0.01
  vf_coef: 0.05
  n_steps: 0 # How much experience should be sampled at most (n-TD) until the next value and policy update is performed. Default 0: MC
  max_steps: 240000
  chunk-episode: 20000 # Chunk size. (0 = update networks with full episode at once)
  max_steps: 140000
  advantage: "Advantage-AC" # Options: "Advantage-AC", "TD-Advantage-AC", "Reinforce"
  pile-order: "fixed" # Options: "fixed", "random", "none", "agents", "dynamic", "smart" (Use "fixed", "random" and "none" for single agent training and the other for multi agent inference)
  pile-observability: "single" # Options: "single", "all"
  pile_all_done: "single" # Options: "single", "all" ("single" for training, "all" for eval)
  auxiliary_piles: False # Option that is only considered when pile-order = "agents"
  chunk-episode: 20000 # Chunk size. (0 = update networks with full episode at once)
  auxiliary_piles: False # Dirt quadrant does not use this option
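chunk-episode caps how many transitions are pushed through the networks per update; long episodes are split into fixed-size chunks instead of one full-episode pass. A rough sketch of that splitting, with a flat transition buffer assumed for illustration:

# Illustrative sketch; the buffer layout and update call are assumptions, not the repo's code.
def iter_chunks(transitions, chunk_size=20000):
    if chunk_size <= 0:                      # 0 = update networks with the full episode at once
        yield transitions
        return
    for start in range(0, len(transitions), chunk_size):
        yield transitions[start:start + chunk_size]

# usage: for chunk in iter_chunks(episode_buffer): update_networks(chunk)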
@ -0,0 +1,13 @@
env:
  classname: marl_factory_grid.environment.configs.rl
  env_name: "rl/two_rooms_eval_config"
  n_agents: 1
  eval_render: True
  save_and_log: False
algorithm:
  pile-order: "fixed" # Options: "fixed", "random", "none", "agents", "dynamic", "smart" (Use "fixed", "random" and "none" for single agent training and the other for multi agent inference)
  pile-observability: "single" # Options: "single", "all"
  pile_all_done: "all" # Options: "single", "all" ("single" for training, "all" for eval)
  auxiliary_piles: False # Auxiliary piles are only used during marl eval
@ -1,35 +1,17 @@
agent:
  classname: marl_factory_grid.algorithms.marl.networks.RecurrentAC
  n_agents: 1
  obs_emb_size: 96
  action_emb_size: 16
  hidden_size_actor: 64
  hidden_size_critic: 64
  use_agent_embedding: False
env:
  classname: marl_factory_grid.environment.configs.rl
  env_name: "rl/two_rooms_train_config"
  n_agents: 1
  max_steps: 250
  pomdp_r: 2
  stack_n_frames: 0
  individual_rewards: True
  train_render: False
  eval_render: True
  save_and_log: False
  record: False
method: marl_factory_grid.algorithms.marl.LoopSEAC
  save_and_log: True
algorithm:
  gamma: 0.99
  entropy_coef: 0.01
  vf_coef: 0.05
  n_steps: 0 # How much experience should be sampled at most (n-TD) until the next value and policy update is performed. Default 0: MC
  chunk-episode: 20000 # Chunk size. (0 = update networks with full episode at once)
  max_steps: 260000
  advantage: "Advantage-AC" # Options: "Advantage-AC", "TD-Advantage-AC", "Reinforce"
  pile-order: "fixed" # Options: "fixed", "random", "none", "agents", "dynamic", "smart" (Use "fixed", "random" and "none" for single agent training and the other for multi agent inference)
  pile-observability: "single" # Options: "single", "all"
  pile_all_done: "single" # Options: "single", "all" ("single" for training, "all" for eval)
  auxiliary_piles: False # Option that is only considered when pile-order = "agents"
  chunk-episode: 20000 # Chunk size. (0 = update networks with full episode at once)
  auxiliary_piles: False # Auxiliary piles are only used during marl eval
@ -6,7 +6,7 @@ from networkx.algorithms.approximation import traveling_salesman as tsp
import time
import copy

from marl_factory_grid.algorithms.static.utils import points_to_graph
from marl_factory_grid.algorithms.tsp.utils import points_to_graph
from marl_factory_grid.modules.doors import constants as do
from marl_factory_grid.environment import constants as c
from marl_factory_grid.utils.helpers import MOVEMAP
@ -1,4 +1,4 @@
from marl_factory_grid.algorithms.static.TSP_base_agent import TSPBaseAgent
from marl_factory_grid.algorithms.tsp.TSP_base_agent import TSPBaseAgent

from marl_factory_grid.modules.clean_up import constants as di
from marl_factory_grid.environment import constants as c
@ -1,4 +1,4 @@
from marl_factory_grid.algorithms.static.TSP_base_agent import TSPBaseAgent
from marl_factory_grid.algorithms.tsp.TSP_base_agent import TSPBaseAgent

from marl_factory_grid.modules.destinations import constants as d
from marl_factory_grid.modules.doors import constants as do
@ -64,13 +64,6 @@ def add_env_props(cfg):
factory = Factory(env_path)
_ = factory.reset()

# Agent Init
if len(factory.state.moving_entites) == 1:  # Single agent setting
observation_size = list(factory.observation_space.shape)
else:  # Multi-agent setting
observation_size = list(factory.observation_space[0].shape)
cfg['agent'].update(dict(observation_size=observation_size, n_actions=factory.action_space[0].n))

return factory
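The removed block distinguished single- and multi-agent observation spaces: with one moving entity the env exposes a single space, with several it exposes one space per agent and only the first is read. A standalone sketch of that distinction, assuming gym-style spaces purely for illustration:

# Toy stand-in, assuming gym-style spaces (the repo indexes observation_space[0] directly).
from gym import spaces

single = spaces.Box(low=0.0, high=1.0, shape=(4, 5, 5))
multi = spaces.Tuple([spaces.Box(low=0.0, high=1.0, shape=(4, 5, 5)) for _ in range(2)])

obs_size_single = list(single.shape)           # single-agent setting
obs_size_multi = list(multi.spaces[0].shape)   # multi-agent setting: take the first agent's space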
@ -5,18 +5,17 @@ General:
  individual_rewards: true
  # The level.txt file to load from marl_factory_grid/levels
  level_name: quadrant
  # Radius of Partially observable Markov decision process
  pomdp_r: 0 # default 3
  # View Radius
  pomdp_r: 0 # 0 = full observability
  # Print all messages and events
  verbose: false
  # Run tests
  tests: false

# In the "clean and bring" Scenario one agent aims to pick up all items and drop them at drop-off locations while all
# other agents aim to clean dirt piles.
# Define Agents, their actions, observations and spawnpoints
Agents:
  # The clean agents
  Sigmund:
  Agent1:
    Actions:
      - Move4
      - Noop

@ -25,7 +24,7 @@ Agents:
      - Self
    Positions:
      - (9,1)
  Wolfgang:
  Agent2:
    Actions:
      - Move4
      - Noop

@ -37,8 +36,8 @@ Agents:

Entities:
  DirtPiles:
    coords_or_quantity: (9,9), (7,9), (4,7), (2,4), (1, 1) # (4,7), (2,4), (1, 1) # (1, 1), (2,4), (4,7), (7,9), (9,9) # (1, 1), (1,2), (1,3), (2,4), (2,5), (3,6), (4,7), (5,8), (6,8), (7,9), (8,9), (9,9)
    initial_amount: 0.5 # <1 to ensure that the robot which first attempts to clean this field, can remove the dirt in one action
    coords_or_quantity: (9,9), (7,9), (4,7), (2,4), (1, 1)
    initial_amount: 0.5
    clean_amount: 1
    dirt_spawn_r_var: 0
    max_global_amount: 12

@ -46,7 +45,6 @@ Entities:

# Rules section specifies the rules governing the dynamics of the environment.
Rules:

  # Utilities
  # This rule defines the collision mechanic, introduces a related DoneCondition and lets you specify rewards.
  # Can be omitted/ignored if you do not want to take care of collisions at all.

@ -57,5 +55,3 @@ Rules:
  # Define the conditions for the environment to stop. Either success or a fail conditions.
  # The environment stops when all dirt is cleaned
  DoneOnAllDirtCleaned:
  #DoneAtMaxStepsReached:
  #max_steps: 200
@ -1,20 +1,20 @@
General:
  # RNG-seed to sample the same "random" numbers every time, to make the different runs comparable.
  env_seed: 69
  # Individual vs global rewards
  individual_rewards: true
  # The level.txt file to load from marl_factory_grid/levels
  level_name: two_rooms
  # View Radius; 0 = full observatbility
  pomdp_r: 0
  # View Radius
  pomdp_r: 0 # 0 = full observability
  # Print all messages and events
  verbose: false
  # Run tests
  tests: false

# In "two rooms one door" scenario 2 agents spawn in 2 different rooms that are connected by a single door. Their aim
# is to reach the destination in the room they didn't spawn in leading to a conflict at the door.
# Define Agents, their actions, observations and spawnpoints
Agents:
  Sigmund:
  Agent1:
    Actions:
      - Move4
      - DoorUse

@ -24,7 +24,7 @@ Agents:
      - Self
    Positions:
      - (3,1)
  Wolfgang:
  Agent2:
    Actions:
      - Move4
      - DoorUse

@ -36,9 +36,10 @@ Agents:
      - (3,13)

Entities:
  # For RL-agent we model the flags as dirt piles to be more flexible
  DirtPiles:
    coords_or_quantity: (2,1), (3,12), (2,13), (3,2) # Static form: auxiliary pile, primary pile, auxiliary pile, ...
    initial_amount: 0.5 # <1 to ensure that the robot which first attempts to clean this field, can remove the dirt in one action
    initial_amount: 0.5
    clean_amount: 1
    dirt_spawn_r_var: 0
    max_global_amount: 12

@ -47,16 +48,13 @@ Entities:
  Doors: { }

Rules:
  # Environment Dynamics
  #DoorAutoClose:
  #close_frequency: 10

  # Utilities
  # This rule defines the collision mechanic, introduces a related DoneCondition and lets you specify rewards.
  WatchCollisions:
    done_at_collisions: false

  # Done Conditions
  #DoneOnAllDirtCleaned:
  # Define the conditions for the environment to stop. Either success or a fail conditions.
  # Environment execution stops after 30 steps
  DoneAtMaxStepsReached:
    max_steps: 50
    max_steps: 30
@ -1,20 +1,20 @@
General:
  # RNG-seed to sample the same "random" numbers every time, to make the different runs comparable.
  env_seed: 69
  # Individual vs global rewards
  individual_rewards: true
  # The level.txt file to load from marl_factory_grid/levels
  level_name: two_rooms
  # View Radius; 0 = full observatbility
  pomdp_r: 0
  # View Radius
  pomdp_r: 0 # 0 = full observability
  # Print all messages and events
  verbose: false
  # Run tests
  tests: false

# In "two rooms one door" scenario 2 agents spawn in 2 different rooms that are connected by a single door. Their aim
# is to reach the destination in the room they didn't spawn in leading to a conflict at the door.
# Define Agents, their actions, observations and spawnpoints
Agents:
  Sigmund:
  Agent1:
    Actions:
      - Move4
      - DoorUse

@ -24,7 +24,7 @@ Agents:
      - Self
    Positions:
      - (3,1)
  Wolfgang:
  Agent2:
    Actions:
      - Move4
      - DoorUse

@ -36,9 +36,10 @@ Agents:
      - (3,13)

Entities:
  # For RL-agent we model the flags as dirt piles to be more flexible
  DirtPiles:
    coords_or_quantity: (3,12), (3,2) # Static form: auxiliary pile, primary pile, auxiliary pile, ...
    initial_amount: 0.5 # <1 to ensure that the robot which first attempts to clean this field, can remove the dirt in one action
    coords_or_quantity: (3,12), (3,2) # Locations of flags
    initial_amount: 0.5
    clean_amount: 1
    dirt_spawn_r_var: 0
    max_global_amount: 12

@ -47,16 +48,13 @@ Entities:
  Doors: { }

Rules:
  # Environment Dynamics
  #DoorAutoClose:
  #close_frequency: 10

  # Utilities
  # This rule defines the collision mechanic, introduces a related DoneCondition and lets you specify rewards.
  WatchCollisions:
    done_at_collisions: false

  # Done Conditions
  #DoneOnAllDirtCleaned:
  # Define the conditions for the environment to stop. Either success or a fail conditions
  # Environment execution stops after 30 steps
  DoneAtMaxStepsReached:
    max_steps: 30
@ -0,0 +1,48 @@
General:
  # RNG-seed to sample the same "random" numbers every time, to make the different runs comparable.
  env_seed: 69
  # Individual vs global rewards
  individual_rewards: true
  # The level.txt file to load from marl_factory_grid/levels
  level_name: quadrant
  # View Radius
  pomdp_r: 0 # 0 = full observability
  # Print all messages and events
  verbose: false
  # Run tests
  tests: false

# Define Agents, their actions, observations and spawnpoints
Agents:
  # The clean agents
  Agent1:
    Actions:
      - Move4
      - Noop
    Observations:
      - DirtPiles
      - Self
    Positions:
      - (9,1)

Entities:
  DirtPiles:
    coords_or_quantity: (1, 1), (2,4), (4,7), (7,9), (9,9) # Locations of dirt piles
    initial_amount: 0.5
    clean_amount: 1
    dirt_spawn_r_var: 0
    max_global_amount: 12
    max_local_amount: 1

# Rules section specifies the rules governing the dynamics of the environment.
Rules:
  # Utilities
  # This rule defines the collision mechanic, introduces a related DoneCondition and lets you specify rewards.
  # Can be omitted/ignored if you do not want to take care of collisions at all.
  WatchCollisions:
    done_at_collisions: false

  # Done Conditions
  # Define the conditions for the environment to stop. Either success or a fail conditions.
  # The environment stops when all dirt is cleaned
  DoneOnAllDirtCleaned:
@ -5,61 +5,38 @@ General:
  individual_rewards: true
  # The level.txt file to load from marl_factory_grid/levels
  level_name: quadrant
  # Radius of Partially observable Markov decision process
  pomdp_r: 0 # default 3
  # View Radius
  pomdp_r: 0 # 0 = full observability
  # Print all messages and events
  verbose: false
  # Run tests
  tests: false

# In the "clean and bring" Scenario one agent aims to pick up all items and drop them at drop-off locations while all
# other agents aim to clean dirt piles.
# Define Agents, their actions, observations and spawnpoints
Agents:
  # The clean agents
  #Sigmund:
    #Actions:
      #- Move4
    #Observations:
      #- DirtPiles
      #- Self
    #Positions:
      #- (9,1)
      #- (1,1)
      #- (2,4)
      #- (4,7)
      #- (6,8)
      #- (7,9)
      #- (2,4)
      #- (4,7)
      #- (6,8)
      #- (7,9)
      #- (9,9)
      #- (9,1)
  Wolfgang:
  Agent1:
    Actions:
      - Move4
    Observations:
      - DirtPiles
      - Self
    Positions:
      - (9,5)
      - (9,1)
      - (1,1)
      - (2,4)
      - (4,7)
      - (6,8)
      - (7,9)
      - (2,4)
      - (4,7)
      - (6,8)
      - (7,9)
      - (9,9)
      - (9,5)

      - (9,1)

Entities:
  DirtPiles:
    coords_or_quantity: (1, 1), (2,4), (4,7), (6,8), (7,9), (9,9) # (4,7), (2,4), (1, 1) #(1, 1), (2,4), (4,7), (7,9), (9,9) # (1, 1), (1,2), (1,3), (2,4), (2,5), (3,6), (4,7), (5,8), (6,8), (7,9), (8,9), (9,9)
    initial_amount: 0.5 # <1 to ensure that the robot which first attempts to clean this field, can remove the dirt in one action
    coords_or_quantity: (1, 1), (2,4), (4,7), (7,9), (9,9) # Locations of dirt piles
    initial_amount: 0.5
    clean_amount: 1
    dirt_spawn_r_var: 0
    max_global_amount: 12

@ -67,7 +44,6 @@ Entities:

# Rules section specifies the rules governing the dynamics of the environment.
Rules:

  # Utilities
  # This rule defines the collision mechanic, introduces a related DoneCondition and lets you specify rewards.
  # Can be omitted/ignored if you do not want to take care of collisions at all.

@ -78,8 +54,6 @@ Rules:
  # Define the conditions for the environment to stop. Either success or a fail conditions.
  # The environment stops when all dirt is cleaned
  DoneOnAllDirtCleaned:
  #DoneAtMaxStepsReached: # An episode should last for at most max_steps steps
  #max_steps: 1000

  # Define how agents spawn.
  # Options: "random" (Spawn agent at a random position from the list of defined positions)
@ -1,78 +0,0 @@
General:
  # RNG-seed to sample the same "random" numbers every time, to make the different runs comparable.
  env_seed: 69
  # Individual vs global rewards
  individual_rewards: true
  # The level.txt file to load from marl_factory_grid/levels
  level_name: quadrant
  # Radius of Partially observable Markov decision process
  pomdp_r: 0 # default 3
  # Print all messages and events
  verbose: false
  # Run tests
  tests: false

# In the "clean and bring" Scenario one agent aims to pick up all items and drop them at drop-off locations while all
# other agents aim to clean dirt piles.
Agents:
  # The clean agents
  #Sigmund:
    #Actions:
      #- Move4
      #- Noop
    #Observations:
      #- DirtPiles
      #- Self
    #Positions:
      #- (9,1)
      #- (1,1)
      #- (2,4)
      #- (4,7)
      #- (7,9)
      #- (2,4)
      #- (4,7)
      #- (7,9)
      #- (9,9)
      #- (9,1)
  Wolfgang:
    Actions:
      - Move4
    Observations:
      - DirtPiles
      - Self
    Positions:
      - (9,5)
      #- (1,1)
      #- (2,4)
      #- (4,7)
      #- (7,9)
      #- (2,4)
      #- (4,7)
      #- (7,9)
      #- (9,9)
      #- (9,5)

Entities:
  DirtPiles:
    coords_or_quantity: (1, 1), (2,4), (4,7), (7,9), (9,9) #(9,9), (7,9), (4,7), (2,4), (1, 1) #(1, 1), (2,4), (4,7), (7,9), (9,9) # (4,7), (2,4), (1, 1) # (1, 1), (2,4), (4,7), (7,9), (9,9) # (1, 1), (1,2), (1,3), (2,4), (2,5), (3,6), (4,7), (5,8), (6,8), (7,9), (8,9), (9,9)
    initial_amount: 0.5 # <1 to ensure that the robot which first attempts to clean this field, can remove the dirt in one action
    clean_amount: 1
    dirt_spawn_r_var: 0
    max_global_amount: 12
    max_local_amount: 1

# Rules section specifies the rules governing the dynamics of the environment.
Rules:

  # Utilities
  # This rule defines the collision mechanic, introduces a related DoneCondition and lets you specify rewards.
  # Can be omitted/ignored if you do not want to take care of collisions at all.
  WatchCollisions:
    done_at_collisions: false

  # Done Conditions
  # Define the conditions for the environment to stop. Either success or a fail conditions.
  # The environment stops when all dirt is cleaned
  DoneOnAllDirtCleaned:
  #DoneAtMaxStepsReached:
  #max_steps: 200
@ -0,0 +1,50 @@
General:
  # RNG-seed to sample the same "random" numbers every time, to make the different runs comparable.
  env_seed: 69
  # Individual vs global rewards
  individual_rewards: true
  # The level.txt file to load from marl_factory_grid/levels
  level_name: two_rooms
  # View Radius
  pomdp_r: 0 # 0 = full observability
  # Print all messages and events
  verbose: false
  # Run tests
  tests: false

# Define Agents, their actions, observations and spawnpoints
Agents:
  Agent1:
    Actions:
      - Move4
      - DoorUse
    Observations:
      - DirtPiles
      - Self
    Positions:
      - (3,1)
      - (2,1)

Entities:
  DirtPiles:
    coords_or_quantity: (2,1), (3,12) # Locations of dirt piles
    initial_amount: 0.5
    clean_amount: 1
    dirt_spawn_r_var: 0
    max_global_amount: 12
    max_local_amount: 1

  Doors: { }

# Rules section specifies the rules governing the dynamics of the environment.
Rules:
  # Utilities
  # This rule defines the collision mechanic, introduces a related DoneCondition and lets you specify rewards.
  WatchCollisions:
    done_at_collisions: false

  # Done Conditions
  # Define the conditions for the environment to stop. Either success or a fail conditions
  # Environment execution stops after 30 steps
  DoneAtMaxStepsReached:
    max_steps: 30
@ -0,0 +1,60 @@
General:
  # RNG-seed to sample the same "random" numbers every time, to make the different runs comparable.
  env_seed: 69
  # Individual vs global rewards
  individual_rewards: true
  # The level.txt file to load from marl_factory_grid/levels
  level_name: two_rooms
  # View Radius
  pomdp_r: 0 # 0 = full observability
  # Print all messages and events
  verbose: false
  # Run tests
  tests: false

# Define Agents, their actions, observations and spawnpoints
Agents:
  Agent1:
    Actions:
      - Move4
    Observations:
      - DirtPiles
      - Self
    Positions:
      - (3,1)
      - (1,1)
      - (3,1)
      - (5,1)
      - (3,1)
      - (1,8)
      - (3,1)
      - (5,8)

Entities:
  DirtPiles:
    coords_or_quantity: (2,1), (3,12) # Locations of dirt piles
    initial_amount: 0.5
    clean_amount: 1
    dirt_spawn_r_var: 0
    max_global_amount: 12
    max_local_amount: 1

  #Doors: { } # We leave out the door during training

Rules:
  # Utilities
  # This rule defines the collision mechanic, introduces a related DoneCondition and lets you specify rewards.
  WatchCollisions:
    done_at_collisions: false

  # Done Conditions
  # Define the conditions for the environment to stop. Either success or a fail conditions
  # The environment stops when all dirt is cleaned
  DoneOnAllDirtCleaned:

  # Define how agents spawn.
  # Options: "random" (Spawn agent at a random position from the list of defined positions)
  #          "first" (Always spawn agent at first position regardless of the other provided positions)
  #          "order" (Loop through agent positions)
  AgentSpawnRule:
    spawn_rule: "order"
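The spawn options above either fix or cycle the agent's start position between episodes. A toy re-implementation of the three choices, for illustration only (not the repo's AgentSpawnRule):

# Illustrative sketch of the spawn_rule options; position handling is assumed.
import random

def pick_spawn(positions, rule, episode):
    if rule == "first":
        return positions[0]                          # always the first listed position
    if rule == "random":
        return random.choice(positions)              # any of the listed positions
    if rule == "order":
        return positions[episode % len(positions)]   # loop through the list episode by episode
    raise ValueError(rule)

# e.g. pick_spawn([(3, 1), (1, 1), (3, 1), (5, 1)], "order", episode=2) -> (3, 1)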
@ -1,30 +1,20 @@
General:
  # RNG-seed to sample the same "random" numbers every time, to make the different runs comparable.
  env_seed: 69
  # Individual vs global rewards
  individual_rewards: true
  # The level.txt file to load from marl_factory_grid/levels
  level_name: two_rooms
  # View Radius; 0 = full observatbility
  pomdp_r: 0
  # View Radius
  pomdp_r: 0 # 0 = full observability
  # Print all messages and events
  verbose: false
  # Run tests
  tests: false

# In "two rooms one door" scenario 2 agents spawn in 2 different rooms that are connected by a single door. Their aim
# is to reach the destination in the room they didn't spawn in leading to a conflict at the door.
# Define Agents, their actions, observations and spawnpoints
Agents:
  #Sigmund:
    #Actions:
      #- Move4
      #- DoorUse
    #Observations:
      #- DirtPiles
      #- Self
    #Positions:
      #- (3,1)
      #- (2,1)
  Wolfgang:
  Agent2:
    Actions:
      - Move4
      - DoorUse

@ -37,8 +27,8 @@ Agents:

Entities:
  DirtPiles:
    coords_or_quantity: (2,13), (3,2) # (2,1), (3,12)
    initial_amount: 0.5 # <1 to ensure that the robot which first attempts to clean this field, can remove the dirt in one action
    coords_or_quantity: (2,13), (3,2) # Locations of dirt piles
    initial_amount: 0.5
    clean_amount: 1
    dirt_spawn_r_var: 0
    max_global_amount: 12

@ -46,17 +36,15 @@ Entities:

  Doors: { }

# Rules section specifies the rules governing the dynamics of the environment.
Rules:
  # Environment Dynamics
  #DoorAutoClose:
  #close_frequency: 10

  # Utilities
  # This rule defines the collision mechanic, introduces a related DoneCondition and lets you specify rewards.
  WatchCollisions:
    done_at_collisions: false

  # Done Conditions
  #DoneOnAllDirtCleaned:
  # Define the conditions for the environment to stop. Either success or a fail conditions
  # Environment execution stops after 30 steps
  DoneAtMaxStepsReached:
    max_steps: 50
    max_steps: 30
@ -1,35 +1,20 @@
General:
  # RNG-seed to sample the same "random" numbers every time, to make the different runs comparable.
  env_seed: 69
  # Individual vs global rewards
  individual_rewards: true
  # The level.txt file to load from marl_factory_grid/levels
  level_name: two_rooms
  # View Radius; 0 = full observatbility
  pomdp_r: 0
  # View Radius
  pomdp_r: 0 # 0 = full observability
  # Print all messages and events
  verbose: false
  # Run tests
  tests: false

# In "two rooms one door" scenario 2 agents spawn in 2 different rooms that are connected by a single door. Their aim
# is to reach the destination in the room they didn't spawn in leading to a conflict at the door.
# Define Agents, their actions, observations and spawnpoints
Agents:
  #Sigmund:
    #Actions:
      #- Move4
    #Observations:
      #- DirtPiles
      #- Self
    #Positions:
      #- (3,1)
      #- (1,1)
      #- (3,1)
      #- (5,1)
      #- (3,1)
      #- (1,8)
      #- (3,1)
      #- (5,8)
  Wolfgang:
  Agent2:
    Actions:
      - Move4
    Observations:

@ -47,29 +32,30 @@ Agents:

Entities:
  DirtPiles:
    coords_or_quantity: (2,13), (3,2) # (2,1), (3,12)
    initial_amount: 0.5 # <1 to ensure that the robot which first attempts to clean this field, can remove the dirt in one action
    coords_or_quantity: (2,13), (3,2) # Locations of dirt piles
    initial_amount: 0.5
    clean_amount: 1
    dirt_spawn_r_var: 0
    max_global_amount: 12
    max_local_amount: 1

  #Doors: { }
  #Doors: { } # We leave out the door during training

# Rules section specifies the rules governing the dynamics of the environment.
Rules:
  # Environment Dynamics
  #DoorAutoClose:
  #close_frequency: 10

  # Utilities
  # This rule defines the collision mechanic, introduces a related DoneCondition and lets you specify rewards.
  WatchCollisions:
    done_at_collisions: false

  # Done Conditions
  # Define the conditions for the environment to stop. Either success or a fail conditions
  # The environment stops when all dirt is cleaned
  DoneOnAllDirtCleaned:
  #DoneAtMaxStepsReached:
  #max_steps: 100

  # Defines how agents spawn.
  # Options: "random" (Spawn agent at a random position from the list of defined positions)
  #          "first" (Always spawn agent at first position regardless of the other provided positions)
  #          "order" (Loop through agent positions)
  AgentSpawnRule:
    spawn_rule: "order"
@ -5,37 +5,34 @@ General:
  individual_rewards: true
  # The level.txt file to load from marl_factory_grid/levels
  level_name: quadrant
  # Radius of Partially observable Markov decision process
  pomdp_r: 0 # default 3
  # View Radius
  pomdp_r: 0 # 0 = full observability
  # Print all messages and events
  verbose: false
  # Run tests
  tests: false

# In the "clean and bring" Scenario one agent aims to pick up all items and drop them at drop-off locations while all
# other agents aim to clean dirt piles.
# Define Agents, their actions, observations and spawnpoints
Agents:
  # The clean agents
  Wolfgang:
  Agent1:
    Actions:
      - Move4
      - Clean
      - Noop
    Observations:
      - Walls
      - Other
      - DirtPiles
      - Self
    Positions:
      - (9,1)
  Reiner:
  Agent2:
    Actions:
      - Move4
      - Clean
      - Noop
    Observations:
      - Walls
      - Other
      - DirtPiles
      - Self
    Positions:

@ -44,7 +41,7 @@ Agents:
Entities:
  DirtPiles:
    coords_or_quantity: (1, 1), (2,4), (4,7), (7,9), (9,9)
    initial_amount: 0.5 # <1 to ensure that the robot which first attempts to clean this field, can remove the dirt in one action
    initial_amount: 0.5
    clean_amount: 1
    dirt_spawn_r_var: 0
    max_global_amount: 12

@ -63,5 +60,3 @@ Rules:
  # Define the conditions for the environment to stop. Either success or a fail conditions.
  # The environment stops when all dirt is cleaned
  DoneOnAllDirtCleaned:
  DoneAtMaxStepsReached:
    max_steps: 200
@ -1,40 +1,38 @@
General:
  # RNG-seed to sample the same "random" numbers every time, to make the different runs comparable.
  env_seed: 69
  # Individual vs global rewards
  individual_rewards: true
  # The level.txt file to load from marl_factory_grid/levels
  level_name: two_rooms
  # View Radius; 0 = full observatbility
  pomdp_r: 0
  # View Radius
  pomdp_r: 0 # 0 = full observability
  # Print all messages and events
  verbose: false
  # Run tests
  tests: false

# In "two rooms one door" scenario 2 agents spawn in 2 different rooms that are connected by a single door. Their aim
# is to reach the destination in the room they didn't spawn in leading to a conflict at the door.
# Define Agents, their actions, observations and spawnpoints
Agents:
  Wolfgang:
  Agent1:
    Actions:
      - Move4
      - Noop
      - DestAction
      - DestAction # Action that is performed when the destination is reached
      - DoorUse
    Observations:
      - Walls
      - Other
      - Doors
      - Destination
    Positions:
      - (3,1) # Agent spawnpoint
  Sigmund:
      - (3,1)
  Agent2:
    Actions:
      - Move4
      - Noop
      - DestAction
      - DoorUse
    Observations:
      - Other
      - Walls
      - Destination
      - Doors

@ -45,10 +43,11 @@ Entities:
  Destinations:
    spawnrule:
      SpawnDestinationsPerAgent:
        # Target coordinates
        coords_or_quantity:
          Wolfgang:
            - (3,12) # Target coordinates
          Sigmund:
          Agent1:
            - (3,12)
          Agent2:
            - (3,2)

  Doors: { }

@ -68,10 +67,12 @@ Rules:
  AssignGlobalPositions: { }

  DoneAtDestinationReach:
    reward_at_done: 1
    reward_at_done: 50
    # We want to give rewards only, when all targets have been reached.
    condition: "all"

  # Done Conditions
  # Define the conditions for the environment to stop. Either success or a fail conditions
  # Environment execution stops after 30 steps
  DoneAtMaxStepsReached:
    max_steps: 50
    max_steps: 30
@ -293,9 +293,6 @@ class Factory(gym.Env):
render_entity.aux = self.obs_builder.curr_lightmaps[render_entity.real_name]
return self._renderer.render(render_entities)

def set_recorder(self, recorder):
self._recorder = recorder

def summarize_header(self):
header = {'rec_step': self.state.curr_step}
for entity_group in (x for x in self.state if x.name in ['Walls', 'DropOffLocations', 'ChargePods']):
@ -3,7 +3,7 @@ from typing import List, Tuple

import numpy as np

from marl_factory_grid.algorithms.static.utils import points_to_graph
from marl_factory_grid.algorithms.tsp.utils import points_to_graph
from marl_factory_grid.environment import constants as c
from marl_factory_grid.environment.entity.entity import Entity
from marl_factory_grid.environment.rules import Rule, SpawnAgents
@ -1,93 +0,0 @@
import copy
from pathlib import Path
from marl_factory_grid.algorithms.marl.a2c_dirt import A2C
from marl_factory_grid.algorithms.utils import load_yaml_file

def single_agent_training(config_name):
cfg_path = Path(f'../marl_factory_grid/algorithms/marl/single_agent_configs/{config_name}_config.yaml')

train_cfg = load_yaml_file(cfg_path)
# Use environment config with fixed spawnpoints for eval
eval_cfg = copy.deepcopy(train_cfg)
eval_cfg["env"]["env_name"] = f"rl/{config_name}_eval_config"

print("Training phase")
agent = A2C(train_cfg, eval_cfg)
agent.train_loop()
print("Evaluation phase")
# Have consecutive episode for eval in single agent case
train_cfg["algorithm"]["pile_all_done"] = "all"
agent.eval_loop(10)


def single_agent_eval(config_name, run):
cfg_path = Path(f'../marl_factory_grid/algorithms/marl/single_agent_configs/{config_name}_config.yaml')

train_cfg = load_yaml_file(cfg_path)
# Use environment config with fixed spawnpoints for eval
eval_cfg = copy.deepcopy(train_cfg)
eval_cfg["env"]["env_name"] = f"rl/{config_name}_eval_config"
agent = A2C(train_cfg, eval_cfg)
print("Evaluation phase")
agent.load_agents(run)
agent.eval_loop(1)


def multi_agent_eval(config_name, runs, emergent_phenomenon=False):
cfg_path = Path(f'../marl_factory_grid/algorithms/marl/multi_agent_configs/{config_name}_config.yaml')

eval_cfg = load_yaml_file(cfg_path)
# Sanity setting of required attributes and configs
if config_name == "two_rooms":
if emergent_phenomenon:
eval_cfg["env"]["env_name"] = f"marl_eval/{config_name}_eval_config_emergent"
eval_cfg["algorithm"]["auxiliary_piles"] = False
else:
eval_cfg["algorithm"]["auxiliary_piles"] = True
elif config_name == "dirt_quadrant":
if emergent_phenomenon:
eval_cfg["algorithm"]["pile-order"] = "dynamic"
else:
eval_cfg["algorithm"]["pile-order"] = "smart"
agent = A2C(train_cfg=eval_cfg, eval_cfg=eval_cfg)
print("Evaluation phase")
agent.load_agents(runs)
agent.eval_loop(1)


def dirt_quadrant_single_agent_training():
single_agent_training("dirt_quadrant")


def two_rooms_one_door_modified_single_agent_training():
single_agent_training("two_rooms")


def dirt_quadrant_single_agent_eval(agent_name):
if agent_name == "Sigmund":
run = "run0"
elif agent_name == "Wolfgang":
run = "run1"
single_agent_eval("dirt_quadrant", [run])


def two_rooms_one_door_modified_single_agent_eval(agent_name):
if agent_name == "Sigmund":
run = "run2"
elif agent_name == "Wolfgang":
run = "run3"
single_agent_eval("two_rooms", [run])


def dirt_quadrant_5_multi_agent_eval(emergent_phenomenon):
multi_agent_eval("dirt_quadrant", ["run4", "run5"], emergent_phenomenon)

def dirt_quadrant_5_multi_agent_ctde_eval(emergent_phenomenon): # run7 == run4
multi_agent_eval("dirt_quadrant", ["run4", "run7"], emergent_phenomenon)

def two_rooms_one_door_modified_multi_agent_eval(emergent_phenomenon):
multi_agent_eval("two_rooms", ["run2", "run3"], emergent_phenomenon)


if __name__ == '__main__':
two_rooms_one_door_modified_multi_agent_eval(False)
studies/rl_runs.py  (new file, 75 lines)
@ -0,0 +1,75 @@
from pathlib import Path
from marl_factory_grid.algorithms.rl.a2c_dirt import A2C
from marl_factory_grid.algorithms.utils import load_yaml_file


def dirt_quadrant_agent1_training():
train_cfg_path = Path(f'../marl_factory_grid/algorithms/rl/single_agent_configs/dirt_quadrant_train_config.yaml')
eval_cfg_path = Path(f'../marl_factory_grid/algorithms/rl/single_agent_configs/dirt_quadrant_eval_config.yaml')
train_cfg = load_yaml_file(train_cfg_path)
eval_cfg = load_yaml_file(eval_cfg_path)

print("Training phase")
agent = A2C(train_cfg, eval_cfg)
agent.train_loop()
print("Evaluation phase")
agent.eval_loop(n_episodes=1)


def two_rooms_training(max_steps, agent_name):
train_cfg_path = Path(f'../marl_factory_grid/algorithms/rl/single_agent_configs/two_rooms_train_config.yaml')
eval_cfg_path = Path(f'../marl_factory_grid/algorithms/rl/single_agent_configs/two_rooms_eval_config.yaml')
train_cfg = load_yaml_file(train_cfg_path)
eval_cfg = load_yaml_file(eval_cfg_path)

train_cfg["algorithm"]["max_steps"] = max_steps
train_cfg["env"]["env_name"] = f"rl/two_rooms_{agent_name}_train_config"
eval_cfg["env"]["env_name"] = f"rl/two_rooms_{agent_name}_eval_config"
print("Training phase")
agent = A2C(train_cfg, eval_cfg)
agent.train_loop()
print("Evaluation phase")
agent.eval_loop(n_episodes=1)


def two_rooms_agent1_training():
two_rooms_training(max_steps=190000, agent_name="agent1")


def two_rooms_agent2_training():
two_rooms_training(max_steps=260000, agent_name="agent2")


def single_agent_eval(config_name, run_folder_name):
eval_cfg_path = Path(f'../marl_factory_grid/algorithms/rl/single_agent_configs/{config_name}_eval_config.yaml')
train_cfg = eval_cfg = load_yaml_file(eval_cfg_path)

# A value for train_cfg is required, but the train environment won't be used
agent = A2C(train_cfg=train_cfg, eval_cfg=eval_cfg)
print("Evaluation phase")
agent.load_agents([run_folder_name])
agent.eval_loop(1)


def multi_agent_eval(config_name, runs, emergent_phenomenon=False):
eval_cfg_path = Path(f'../marl_factory_grid/algorithms/rl/multi_agent_configs/{config_name}' +
f'_eval_config{"_emergent" if emergent_phenomenon else ""}.yaml')
eval_cfg = load_yaml_file(eval_cfg_path)

# A value for train_cfg is required, but the train environment won't be used
agent = A2C(train_cfg=eval_cfg, eval_cfg=eval_cfg)
print("Evaluation phase")
agent.load_agents(runs)
agent.eval_loop(1)


def dirt_quadrant_multi_agent_ctde_eval(emergent_phenomenon):
multi_agent_eval("dirt_quadrant", ["run0", "run0"], emergent_phenomenon)


def two_rooms_multi_agent_eval(emergent_phenomenon):
multi_agent_eval("two_rooms", ["run1", "run2"], emergent_phenomenon)


if __name__ == '__main__':
dirt_quadrant_agent1_training()
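For orientation, a hedged sketch of how these entry points are typically driven; load_yaml_file is assumed here to be a thin YAML loader, and the run folder names are placeholders:

# Assumed shape of the config loader; the repo's actual helper may differ.
import yaml
from pathlib import Path

def load_yaml_file(path: Path):
    with path.open() as stream:
        return yaml.safe_load(stream)

# Typical sequence: train a single agent, then evaluate stored run folders.
# dirt_quadrant_agent1_training()
# single_agent_eval("dirt_quadrant", "run0")                    # "run0" is a placeholder folder
# dirt_quadrant_multi_agent_ctde_eval(emergent_phenomenon=False)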
@ -4,10 +4,11 @@ from pathlib import Path

from tqdm import trange

from marl_factory_grid.algorithms.static.TSP_dirt_agent import TSPDirtAgent
from marl_factory_grid.algorithms.static.TSP_target_agent import TSPTargetAgent
from marl_factory_grid.algorithms.tsp.TSP_dirt_agent import TSPDirtAgent
from marl_factory_grid.algorithms.tsp.TSP_target_agent import TSPTargetAgent
from marl_factory_grid.environment.factory import Factory


def get_dirt_quadrant_tsp_agents(emergent_phenomenon, factory):
agents = [TSPDirtAgent(factory, 0), TSPDirtAgent(factory, 1)]
if not emergent_phenomenon:

@ -31,13 +32,11 @@ def get_dirt_quadrant_tsp_agents(emergent_phenomenon, factory):
for u, v, weight in agent._position_graph.edges(data='weight'):
agent._position_graph[u][v]['weight'] = edge_costs[f"{u}-{v}"]

"""for u, v, weight in agent._position_graph.edges(data='weight'):
print(f"Edge ({u}-{v}) has weight: {weight}")"""

return agents


def get_two_rooms_one_door_modified_tsp_agents(emergent_phenomenon, factory):
def get_two_rooms_tsp_agents(emergent_phenomenon, factory):
agents = [TSPTargetAgent(factory, 0), TSPTargetAgent(factory, 1)]
if not emergent_phenomenon:
print(emergent_phenomenon)

@ -45,6 +44,7 @@ def get_two_rooms_one_door_modified_tsp_agents(emergent_phenomenon, factory):
agent._position_graph[(3, 1)][(3, 2)]['weight'] = 4
return agents


def run_tsp_setting(config_name, emergent_phenomenon):
# Render at each step?
render = True

@ -74,7 +74,7 @@ def run_tsp_setting(config_name, emergent_phenomenon):
if config_name == "dirt_quadrant":
agents = get_dirt_quadrant_tsp_agents(emergent_phenomenon, factory)
elif config_name == "two_rooms":
agents = get_two_rooms_one_door_modified_tsp_agents(emergent_phenomenon, factory)
agents = get_two_rooms_tsp_agents(emergent_phenomenon, factory)
else:
print("Config name does not exist. Abort...")
break

@ -95,7 +95,7 @@ def dirt_quadrant_multi_agent_tsp(emergent_phenomenon):
run_tsp_setting("dirt_quadrant", emergent_phenomenon)


def two_rooms_one_door_modified_multi_agent_tsp(emergent_phenomenon):
def two_rooms_multi_agent_tsp(emergent_phenomenon):
run_tsp_setting("two_rooms", emergent_phenomenon)