Mirror of https://github.com/illiumst/marl-factory-grid.git (synced 2025-12-14 03:00:37 +01:00)

Commit: Code cleaning part 2
@@ -1,34 +0,0 @@
-agent:
-  classname: marl_factory_grid.algorithms.marl.networks.RecurrentAC
-  n_agents: 2
-  obs_emb_size: 96
-  action_emb_size: 16
-  hidden_size_actor: 64
-  hidden_size_critic: 64
-  use_agent_embedding: False
-env:
-  classname: marl_factory_grid.environment.configs.marl_eval
-  env_name: "marl_eval/dirt_quadrant_eval_config"
-  n_agents: 2
-  max_steps: 250
-  pomdp_r: 2
-  stack_n_frames: 0
-  individual_rewards: True
-  train_render: False
-  eval_render: True
-  save_and_log: True
-  record: False
-method: marl_factory_grid.algorithms.marl.LoopSEAC
-algorithm:
-  gamma: 0.99
-  entropy_coef: 0.01
-  vf_coef: 0.05
-  n_steps: 0 # How much experience should be sampled at most (n-TD) until the next value and policy update is performed. Default 0: MC
-  max_steps: 200000
-  advantage: "Advantage-AC" # Options: "Advantage-AC", "TD-Advantage-AC", "Reinforce"
-  pile-order: "dynamic" # Use "dynamic" to see emergent phenomenon and "smart" to prevent it
-  pile-observability: "single" # Options: "single", "all"
-  pile_all_done: "shared" # Options: "single", "all" ("single" for training, "all" for eval), "shared"
-  auxiliary_piles: False # Option that is only considered when pile-order = "agents"
-  chunk-episode: 20000 # Chunk size. (0 = update networks with full episode at once)
-
@@ -1,35 +0,0 @@
-agent:
-  classname: marl_factory_grid.algorithms.marl.networks.RecurrentAC
-  n_agents: 2
-  obs_emb_size: 96
-  action_emb_size: 16
-  hidden_size_actor: 64
-  hidden_size_critic: 64
-  use_agent_embedding: False
-env:
-  classname: marl_factory_grid.environment.configs.marl_eval
-  env_name: "marl_eval/two_rooms_eval_config"
-  n_agents: 2
-  max_steps: 250
-  pomdp_r: 2
-  stack_n_frames: 0
-  individual_rewards: True
-  train_render: False
-  eval_render: True
-  save_and_log: True
-  record: False
-method: marl_factory_grid.algorithms.marl.LoopSEAC
-algorithm:
-  gamma: 0.99
-  entropy_coef: 0.01
-  vf_coef: 0.05
-  n_steps: 0 # How much experience should be sampled at most (n-TD) until the next value and policy update is performed. Default 0: MC
-  max_steps: 260000
-  advantage: "Advantage-AC" # Options: "Advantage-AC", "TD-Advantage-AC", "Reinforce"
-  pile-order: "agents" # Options: "fixed", "random", "none", "agents", "dynamic", "smart" (Use "fixed", "random" and "none" for single agent training and the other for multi agent inference)
-  pile-observability: "single" # Options: "single", "all"
-  pile_all_done: "distributed" # Options: "single", "all" ("single" for training, "all" and "distributed" for eval)
-  auxiliary_piles: True # Use True to see emergent phenomenon and False to prevent it
-  chunk-episode: 20000 # Chunk size. (0 = update networks with full episode at once)
-
-
@@ -1,44 +1,23 @@
 import copy
 import os
 import random

-import imageio # requires ffmpeg install on operating system and imageio-ffmpeg package for python
-from scipy import signal
 import matplotlib.pyplot as plt
 import torch
-from typing import Union, List, Dict
+from typing import Union, List
 import numpy as np
-from torch.distributions import Categorical
-
-from marl_factory_grid.algorithms.marl.base_a2c import PolicyGradient, cumulate_discount
-from marl_factory_grid.algorithms.utils import add_env_props, instantiate_class
-from pathlib import Path
-from collections import deque
-
-from marl_factory_grid.environment.actions import Noop
-from marl_factory_grid.modules import Clean, DoorUse
+from marl_factory_grid.algorithms.rl.base_a2c import PolicyGradient, cumulate_discount
+from marl_factory_grid.algorithms.utils import add_env_props
 from marl_factory_grid.utils.plotting.plot_single_runs import plot_action_maps


 class Names:
-    REWARD = 'reward'
-    DONE = 'done'
-    ACTION = 'action'
-    OBSERVATION = 'observation'
-    LOGITS = 'logits'
-    HIDDEN_ACTOR = 'hidden_actor'
-    HIDDEN_CRITIC = 'hidden_critic'
-    AGENT = 'agent'
     ENV = 'env'
     ENV_NAME = 'env_name'
     N_AGENTS = 'n_agents'
     ALGORITHM = 'algorithm'
     MAX_STEPS = 'max_steps'
     N_STEPS = 'n_steps'
-    BUFFER_SIZE = 'buffer_size'
-    CRITIC = 'critic'
-    BATCH_SIZE = 'bnatch_size'
-    N_ACTIONS = 'n_actions'
     TRAIN_RENDER = 'train_render'
     EVAL_RENDER = 'eval_render'
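The loop imported above turns reward sequences into discounted returns via cumulate_discount from base_a2c; with n_steps: 0 in the configs this amounts to full Monte-Carlo returns. A minimal sketch of such a helper in plain NumPy, assuming only a 1-D reward array (the repository's own implementation may differ):

import numpy as np

def cumulate_discount(rewards, gamma=0.99):
    # Backward pass over the reward sequence: G_t = r_t + gamma * G_{t+1}
    returns = np.zeros_like(rewards, dtype=float)
    running = 0.0
    for t in reversed(range(len(rewards))):
        running = rewards[t] + gamma * running
        returns[t] = running
    return returns

# Example with three rewards and gamma = 0.99 -> 0.9801, 0.99, 1.0
print(cumulate_discount(np.array([0.0, 0.0, 1.0])))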
@@ -55,7 +34,7 @@ class A2C:
         self.train_cfg = train_cfg
         self.eval_cfg = eval_cfg
         self.cfg = train_cfg
-        self.n_agents = train_cfg[nms.AGENT][nms.N_AGENTS]
+        self.n_agents = train_cfg[nms.ENV][nms.N_AGENTS]
         self.setup()
         self.reward_development = []
         self.action_probabilities = {agent_idx:[] for agent_idx in range(self.n_agents)}
@@ -80,8 +59,6 @@ class A2C:
             os.mkdir(self.results_path)
             # Save settings in results folder
             self.save_configs()
-            if self.cfg[nms.ENV]["record"]:
-                self.recorder = imageio.get_writer(f'{self.results_path}/pygame_recording.mp4', fps=5)

     def set_cfg(self, eval=False):
         if eval:
@@ -610,8 +587,6 @@ class A2C:
             obs = env.reset()
             self.set_agent_spawnpoint(env)
             if self.cfg[nms.ENV][nms.EVAL_RENDER]:
-                if self.cfg[nms.ENV]["save_and_log"] and self.cfg[nms.ENV]["record"]:
-                    env.set_recorder(self.recorder)
                 if self.cfg[nms.ALGORITHM]["auxiliary_piles"]:
                     # Don't render auxiliary piles
                     auxiliary_piles = [pile for idx, pile in enumerate(env.state.entities['DirtPiles']) if idx % 2 == 0]
@@ -664,10 +639,6 @@ class A2C:

             episode += 1

-        # Properly finalize the video file
-        if self.cfg[nms.ENV]["save_and_log"] and self.cfg[nms.ENV]["record"]:
-            self.recorder.close()
-
     def plot_reward_development(self):
         smoothed_data = np.convolve(self.reward_development, np.ones(10) / 10, mode='valid')
         plt.plot(smoothed_data)
@@ -689,16 +660,14 @@ class A2C:

     def save_agent_models(self):
         for idx, agent in enumerate(self.agents):
-            agent_name = list(self.factory.state.agents_conf.keys())[idx]
-            agent.pi.save_model_parameters(self.results_path, agent_name)
-            agent.vf.save_model_parameters(self.results_path, agent_name)
+            agent.pi.save_model_parameters(self.results_path)
+            agent.vf.save_model_parameters(self.results_path)

     def load_agents(self, runs_list):
         for idx, run in enumerate(runs_list):
             run_path = f"../study_out/{run}"
-            agent_name = list(self.eval_factory.state.agents_conf.keys())[idx]
-            self.agents[idx].pi.load_model_parameters(f"{run_path}/{agent_name}_PolicyNet_model_parameters.pth")
-            self.agents[idx].vf.load_model_parameters(f"{run_path}/{agent_name}_ValueNet_model_parameters.pth")
+            self.agents[idx].pi.load_model_parameters(f"{run_path}/PolicyNet_model_parameters.pth")
+            self.agents[idx].vf.load_model_parameters(f"{run_path}/ValueNet_model_parameters.pth")

     def create_info_maps(self, env, used_actions):
         # Create value map
@@ -19,11 +19,11 @@ class Net(th.nn.Module):
             if module.bias is not None:
                 nn.init.uniform_(module.bias, a=-0.1, b=0.1)

-    def save_model(self, path, agent_name):
-        th.save(self.net, f"{path}/{agent_name}_{self.__class__.__name__}_model.pth")
+    def save_model(self, path):
+        th.save(self.net, f"{path}/{self.__class__.__name__}_model.pth")

-    def save_model_parameters(self, path, agent_name):
-        th.save(self.net.state_dict(), f"{path}/{agent_name}_{self.__class__.__name__}_model_parameters.pth")
+    def save_model_parameters(self, path):
+        th.save(self.net.state_dict(), f"{path}/{self.__class__.__name__}_model_parameters.pth")

     def load_model_parameters(self, path):
         self.net.load_state_dict(th.load(path))
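A minimal, self-contained sketch of the checkpoint convention implied by the new signatures above: one file per network class, named <ClassName>_model_parameters.pth inside a results folder. The PolicyNet below is an illustrative stand-in, not the repository's actual RecurrentAC architecture:

import torch as th
import torch.nn as nn

class PolicyNet(nn.Module):
    # Stand-in architecture; the real networks live in the repository's networks module.
    def __init__(self, obs_size=4, n_actions=2):
        super().__init__()
        self.net = nn.Sequential(nn.Linear(obs_size, 64), nn.ReLU(), nn.Linear(64, n_actions))

    def save_model_parameters(self, path):
        # One file per network class, mirroring the diff above.
        th.save(self.net.state_dict(), f"{path}/{self.__class__.__name__}_model_parameters.pth")

    def load_model_parameters(self, path):
        self.net.load_state_dict(th.load(path))

model = PolicyNet()
model.save_model_parameters(".")  # writes ./PolicyNet_model_parameters.pth
model.load_model_parameters("./PolicyNet_model_parameters.pth")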
@@ -0,0 +1,11 @@
+env:
+  classname: marl_factory_grid.environment.configs.marl_eval
+  env_name: "marl_eval/dirt_quadrant_eval_config"
+  n_agents: 2
+  eval_render: True
+  save_and_log: False
+algorithm:
+  pile-order: "smart" # Use "dynamic" to see emergent phenomenon and "smart" to prevent it
+  pile-observability: "single" # Options: "single", "all"
+  pile_all_done: "shared" # Options: "single", "all" ("single" for training, "all" for eval), "shared"
+  auxiliary_piles: False # Dirt quadrant does not use this option
@@ -0,0 +1,11 @@
+env:
+  classname: marl_factory_grid.environment.configs.marl_eval
+  env_name: "marl_eval/dirt_quadrant_eval_config"
+  n_agents: 2
+  eval_render: True
+  save_and_log: False
+algorithm:
+  pile-order: "dynamic" # Use "dynamic" to see emergent phenomenon and "smart" to prevent it
+  pile-observability: "single" # Options: "single", "all"
+  pile_all_done: "shared" # Options: "single", "all" ("single" for training, "all" for eval), "shared"
+  auxiliary_piles: False # Dirt quadrant does not use this option
@@ -0,0 +1,13 @@
+env:
+  classname: marl_factory_grid.environment.configs.marl_eval
+  env_name: "marl_eval/two_rooms_eval_config"
+  n_agents: 2
+  eval_render: True
+  save_and_log: False
+algorithm:
+  pile-order: "agents" # Options: "fixed", "random", "none", "agents", "dynamic", "smart" (Use "fixed", "random" and "none" for single agent training and the other for multi agent inference)
+  pile-observability: "single" # Options: "single", "all"
+  pile_all_done: "distributed" # Options: "single", "all" ("single" for training, "all" and "distributed" for eval)
+  auxiliary_piles: True # Use True to see emergent phenomenon and False to prevent it
+
+
@@ -0,0 +1,13 @@
+env:
+  classname: marl_factory_grid.environment.configs.marl_eval
+  env_name: "marl_eval/two_rooms_eval_config_emergent"
+  n_agents: 2
+  eval_render: True
+  save_and_log: False
+algorithm:
+  pile-order: "agents" # Options: "fixed", "random", "none", "agents", "dynamic", "smart" (Use "fixed", "random" and "none" for single agent training and the other for multi agent inference)
+  pile-observability: "single" # Options: "single", "all"
+  pile_all_done: "distributed" # Options: "single", "all" ("single" for training, "all" and "distributed" for eval)
+  auxiliary_piles: False # Use True to see emergent phenomenon and False to prevent it
+
+
@@ -0,0 +1,12 @@
+env:
+  classname: marl_factory_grid.environment.configs.rl
+  env_name: "rl/dirt_quadrant_agent1_eval_config"
+  n_agents: 1
+  eval_render: True
+  save_and_log: False
+algorithm:
+  pile-order: "fixed" # Options: "fixed", "random", "none", "agents", "dynamic", "smart" (Use "fixed", "random" and "none" for single agent training and the other for multi agent inference)
+  pile-observability: "single" # Options: "single", "all"
+  pile_all_done: "all" #
+  auxiliary_piles: False # Dirt quadrant does not use this option
+
@@ -1,34 +1,17 @@
-agent:
-  classname: marl_factory_grid.algorithms.marl.networks.RecurrentAC
-  n_agents: 1
-  obs_emb_size: 96
-  action_emb_size: 16
-  hidden_size_actor: 64
-  hidden_size_critic: 64
-  use_agent_embedding: False
 env:
   classname: marl_factory_grid.environment.configs.rl
-  env_name: "rl/dirt_quadrant_train_config"
+  env_name: "rl/dirt_quadrant_agent1_train_config"
   n_agents: 1
-  max_steps: 250
-  pomdp_r: 2
-  stack_n_frames: 0
-  individual_rewards: True
   train_render: False
-  eval_render: True
   save_and_log: True
-  record: False
-method: marl_factory_grid.algorithms.marl.LoopSEAC
 algorithm:
   gamma: 0.99
-  entropy_coef: 0.01
-  vf_coef: 0.05
   n_steps: 0 # How much experience should be sampled at most (n-TD) until the next value and policy update is performed. Default 0: MC
-  max_steps: 240000
+  chunk-episode: 20000 # Chunk size. (0 = update networks with full episode at once)
+  max_steps: 140000
   advantage: "Advantage-AC" # Options: "Advantage-AC", "TD-Advantage-AC", "Reinforce"
   pile-order: "fixed" # Options: "fixed", "random", "none", "agents", "dynamic", "smart" (Use "fixed", "random" and "none" for single agent training and the other for multi agent inference)
   pile-observability: "single" # Options: "single", "all"
   pile_all_done: "single" # Options: "single", "all" ("single" for training, "all" for eval)
-  auxiliary_piles: False # Option that is only considered when pile-order = "agents"
-  chunk-episode: 20000 # Chunk size. (0 = update networks with full episode at once)
+  auxiliary_piles: False # Dirt quadrant does not use this option
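chunk-episode caps how many stored transitions go into a single network update, with 0 meaning the full episode is used at once. A small illustrative helper, assuming transitions are kept in a flat list (not taken from the repository):

def iter_chunks(transitions, chunk_size):
    # Yield consecutive slices of at most chunk_size transitions; the whole list if chunk_size == 0.
    if chunk_size <= 0:
        yield transitions
        return
    for start in range(0, len(transitions), chunk_size):
        yield transitions[start:start + chunk_size]

# Example: a 50000-step episode with chunk size 20000 -> slices of 20000, 20000, 10000
episode = list(range(50000))
print([len(chunk) for chunk in iter_chunks(episode, 20000)])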
@@ -0,0 +1,13 @@
+env:
+  classname: marl_factory_grid.environment.configs.rl
+  env_name: "rl/two_rooms_eval_config"
+  n_agents: 1
+  eval_render: True
+  save_and_log: False
+algorithm:
+  pile-order: "fixed" # Options: "fixed", "random", "none", "agents", "dynamic", "smart" (Use "fixed", "random" and "none" for single agent training and the other for multi agent inference)
+  pile-observability: "single" # Options: "single", "all"
+  pile_all_done: "all" # Options: "single", "all" ("single" for training, "all" for eval)
+  auxiliary_piles: False # Auxiliary piles are only used during marl eval
+
+
@@ -1,35 +1,17 @@
-agent:
-  classname: marl_factory_grid.algorithms.marl.networks.RecurrentAC
-  n_agents: 1
-  obs_emb_size: 96
-  action_emb_size: 16
-  hidden_size_actor: 64
-  hidden_size_critic: 64
-  use_agent_embedding: False
 env:
   classname: marl_factory_grid.environment.configs.rl
-  env_name: "rl/two_rooms_train_config"
   n_agents: 1
-  max_steps: 250
-  pomdp_r: 2
-  stack_n_frames: 0
-  individual_rewards: True
   train_render: False
-  eval_render: True
-  save_and_log: False
-  record: False
-method: marl_factory_grid.algorithms.marl.LoopSEAC
+  save_and_log: True
 algorithm:
   gamma: 0.99
-  entropy_coef: 0.01
-  vf_coef: 0.05
   n_steps: 0 # How much experience should be sampled at most (n-TD) until the next value and policy update is performed. Default 0: MC
+  chunk-episode: 20000 # Chunk size. (0 = update networks with full episode at once)
   max_steps: 260000
   advantage: "Advantage-AC" # Options: "Advantage-AC", "TD-Advantage-AC", "Reinforce"
   pile-order: "fixed" # Options: "fixed", "random", "none", "agents", "dynamic", "smart" (Use "fixed", "random" and "none" for single agent training and the other for multi agent inference)
   pile-observability: "single" # Options: "single", "all"
   pile_all_done: "single" # Options: "single", "all" ("single" for training, "all" for eval)
-  auxiliary_piles: False # Option that is only considered when pile-order = "agents"
-  chunk-episode: 20000 # Chunk size. (0 = update networks with full episode at once)
+  auxiliary_piles: False # Auxiliary piles are only used during marl eval
@@ -6,7 +6,7 @@ from networkx.algorithms.approximation import traveling_salesman as tsp
 import time
 import copy

-from marl_factory_grid.algorithms.static.utils import points_to_graph
+from marl_factory_grid.algorithms.tsp.utils import points_to_graph
 from marl_factory_grid.modules.doors import constants as do
 from marl_factory_grid.environment import constants as c
 from marl_factory_grid.utils.helpers import MOVEMAP
@@ -1,4 +1,4 @@
-from marl_factory_grid.algorithms.static.TSP_base_agent import TSPBaseAgent
+from marl_factory_grid.algorithms.tsp.TSP_base_agent import TSPBaseAgent

 from marl_factory_grid.modules.clean_up import constants as di
 from marl_factory_grid.environment import constants as c
@@ -1,4 +1,4 @@
-from marl_factory_grid.algorithms.static.TSP_base_agent import TSPBaseAgent
+from marl_factory_grid.algorithms.tsp.TSP_base_agent import TSPBaseAgent

 from marl_factory_grid.modules.destinations import constants as d
 from marl_factory_grid.modules.doors import constants as do
@@ -64,13 +64,6 @@ def add_env_props(cfg):
     factory = Factory(env_path)
     _ = factory.reset()

-    # Agent Init
-    if len(factory.state.moving_entites) == 1: # Single agent setting
-        observation_size = list(factory.observation_space.shape)
-    else: # Multi-agent setting
-        observation_size = list(factory.observation_space[0].shape)
-    cfg['agent'].update(dict(observation_size=observation_size, n_actions=factory.action_space[0].n))
-
     return factory

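The block removed above read the observation shape differently for single- and multi-agent factories: a single agent exposes one observation space, while several agents expose one space per agent, indexed first. A small sketch of that distinction, using gymnasium-style Box spaces as hypothetical stand-ins for Factory.observation_space:

from gymnasium import spaces

# Hypothetical observation spaces; in the repository they come from Factory.observation_space.
single_agent_space = spaces.Box(low=0.0, high=1.0, shape=(4, 5, 5))
multi_agent_spaces = (spaces.Box(low=0.0, high=1.0, shape=(4, 5, 5)),
                      spaces.Box(low=0.0, high=1.0, shape=(4, 5, 5)))

def observation_size(space, n_agents):
    # Mirror the removed branch: index into the per-agent tuple only in the multi-agent case.
    return list(space.shape) if n_agents == 1 else list(space[0].shape)

print(observation_size(single_agent_space, 1))   # [4, 5, 5]
print(observation_size(multi_agent_spaces, 2))   # [4, 5, 5]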
@@ -5,18 +5,17 @@ General:
   individual_rewards: true
   # The level.txt file to load from marl_factory_grid/levels
   level_name: quadrant
-  # Radius of Partially observable Markov decision process
-  pomdp_r: 0 # default 3
+  # View Radius
+  pomdp_r: 0 # 0 = full observability
   # Print all messages and events
   verbose: false
   # Run tests
   tests: false

-# In the "clean and bring" Scenario one agent aims to pick up all items and drop them at drop-off locations while all
-# other agents aim to clean dirt piles.
+# Define Agents, their actions, observations and spawnpoints
 Agents:
   # The clean agents
-  Sigmund:
+  Agent1:
     Actions:
       - Move4
       - Noop
@@ -25,7 +24,7 @@ Agents:
       - Self
     Positions:
       - (9,1)
-  Wolfgang:
+  Agent2:
     Actions:
       - Move4
       - Noop
@@ -37,8 +36,8 @@ Agents:

 Entities:
   DirtPiles:
-    coords_or_quantity: (9,9), (7,9), (4,7), (2,4), (1, 1) # (4,7), (2,4), (1, 1) # (1, 1), (2,4), (4,7), (7,9), (9,9) # (1, 1), (1,2), (1,3), (2,4), (2,5), (3,6), (4,7), (5,8), (6,8), (7,9), (8,9), (9,9)
-    initial_amount: 0.5 # <1 to ensure that the robot which first attempts to clean this field, can remove the dirt in one action
+    coords_or_quantity: (9,9), (7,9), (4,7), (2,4), (1, 1)
+    initial_amount: 0.5
     clean_amount: 1
     dirt_spawn_r_var: 0
     max_global_amount: 12
@@ -46,7 +45,6 @@ Entities:

 # Rules section specifies the rules governing the dynamics of the environment.
 Rules:
-
   # Utilities
   # This rule defines the collision mechanic, introduces a related DoneCondition and lets you specify rewards.
   # Can be omitted/ignored if you do not want to take care of collisions at all.
@@ -57,5 +55,3 @@ Rules:
   # Define the conditions for the environment to stop. Either success or a fail conditions.
   # The environment stops when all dirt is cleaned
   DoneOnAllDirtCleaned:
-  #DoneAtMaxStepsReached:
-    #max_steps: 200
@@ -1,20 +1,20 @@
 General:
+  # RNG-seed to sample the same "random" numbers every time, to make the different runs comparable.
   env_seed: 69
   # Individual vs global rewards
   individual_rewards: true
   # The level.txt file to load from marl_factory_grid/levels
   level_name: two_rooms
-  # View Radius; 0 = full observatbility
-  pomdp_r: 0
+  # View Radius
+  pomdp_r: 0 # 0 = full observability
   # Print all messages and events
   verbose: false
   # Run tests
   tests: false

-# In "two rooms one door" scenario 2 agents spawn in 2 different rooms that are connected by a single door. Their aim
-# is to reach the destination in the room they didn't spawn in leading to a conflict at the door.
+# Define Agents, their actions, observations and spawnpoints
 Agents:
-  Sigmund:
+  Agent1:
     Actions:
       - Move4
       - DoorUse
@@ -24,7 +24,7 @@ Agents:
       - Self
     Positions:
       - (3,1)
-  Wolfgang:
+  Agent2:
     Actions:
       - Move4
       - DoorUse
@@ -36,9 +36,10 @@ Agents:
       - (3,13)

 Entities:
+  # For RL-agent we model the flags as dirt piles to be more flexible
   DirtPiles:
     coords_or_quantity: (2,1), (3,12), (2,13), (3,2) # Static form: auxiliary pile, primary pile, auxiliary pile, ...
-    initial_amount: 0.5 # <1 to ensure that the robot which first attempts to clean this field, can remove the dirt in one action
+    initial_amount: 0.5
     clean_amount: 1
     dirt_spawn_r_var: 0
     max_global_amount: 12
@@ -47,16 +48,13 @@ Entities:
   Doors: { }

 Rules:
-  # Environment Dynamics
-  #DoorAutoClose:
-    #close_frequency: 10
-
   # Utilities
   # This rule defines the collision mechanic, introduces a related DoneCondition and lets you specify rewards.
   WatchCollisions:
     done_at_collisions: false

   # Done Conditions
-  #DoneOnAllDirtCleaned:
+  # Define the conditions for the environment to stop. Either success or a fail conditions.
+  # Environment execution stops after 30 steps
   DoneAtMaxStepsReached:
-    max_steps: 50
+    max_steps: 30
@@ -1,20 +1,20 @@
 General:
+  # RNG-seed to sample the same "random" numbers every time, to make the different runs comparable.
   env_seed: 69
   # Individual vs global rewards
   individual_rewards: true
   # The level.txt file to load from marl_factory_grid/levels
   level_name: two_rooms
-  # View Radius; 0 = full observatbility
-  pomdp_r: 0
+  # View Radius
+  pomdp_r: 0 # 0 = full observability
   # Print all messages and events
   verbose: false
   # Run tests
   tests: false

-# In "two rooms one door" scenario 2 agents spawn in 2 different rooms that are connected by a single door. Their aim
-# is to reach the destination in the room they didn't spawn in leading to a conflict at the door.
+# Define Agents, their actions, observations and spawnpoints
 Agents:
-  Sigmund:
+  Agent1:
     Actions:
       - Move4
       - DoorUse
@@ -24,7 +24,7 @@ Agents:
       - Self
     Positions:
       - (3,1)
-  Wolfgang:
+  Agent2:
     Actions:
       - Move4
       - DoorUse
@@ -36,9 +36,10 @@ Agents:
       - (3,13)

 Entities:
+  # For RL-agent we model the flags as dirt piles to be more flexible
   DirtPiles:
-    coords_or_quantity: (3,12), (3,2) # Static form: auxiliary pile, primary pile, auxiliary pile, ...
-    initial_amount: 0.5 # <1 to ensure that the robot which first attempts to clean this field, can remove the dirt in one action
+    coords_or_quantity: (3,12), (3,2) # Locations of flags
+    initial_amount: 0.5
     clean_amount: 1
     dirt_spawn_r_var: 0
     max_global_amount: 12
@@ -47,16 +48,13 @@ Entities:
   Doors: { }

 Rules:
-  # Environment Dynamics
-  #DoorAutoClose:
-    #close_frequency: 10
-
   # Utilities
   # This rule defines the collision mechanic, introduces a related DoneCondition and lets you specify rewards.
   WatchCollisions:
     done_at_collisions: false

   # Done Conditions
-  #DoneOnAllDirtCleaned:
+  # Define the conditions for the environment to stop. Either success or a fail conditions
+  # Environment execution stops after 30 steps
   DoneAtMaxStepsReached:
     max_steps: 30
@@ -0,0 +1,48 @@
+General:
+  # RNG-seed to sample the same "random" numbers every time, to make the different runs comparable.
+  env_seed: 69
+  # Individual vs global rewards
+  individual_rewards: true
+  # The level.txt file to load from marl_factory_grid/levels
+  level_name: quadrant
+  # View Radius
+  pomdp_r: 0 # 0 = full observability
+  # Print all messages and events
+  verbose: false
+  # Run tests
+  tests: false
+
+# Define Agents, their actions, observations and spawnpoints
+Agents:
+  # The clean agents
+  Agent1:
+    Actions:
+      - Move4
+      - Noop
+    Observations:
+      - DirtPiles
+      - Self
+    Positions:
+      - (9,1)
+
+Entities:
+  DirtPiles:
+    coords_or_quantity: (1, 1), (2,4), (4,7), (7,9), (9,9) # Locations of dirt piles
+    initial_amount: 0.5
+    clean_amount: 1
+    dirt_spawn_r_var: 0
+    max_global_amount: 12
+    max_local_amount: 1
+
+# Rules section specifies the rules governing the dynamics of the environment.
+Rules:
+  # Utilities
+  # This rule defines the collision mechanic, introduces a related DoneCondition and lets you specify rewards.
+  # Can be omitted/ignored if you do not want to take care of collisions at all.
+  WatchCollisions:
+    done_at_collisions: false
+
+  # Done Conditions
+  # Define the conditions for the environment to stop. Either success or a fail conditions.
+  # The environment stops when all dirt is cleaned
+  DoneOnAllDirtCleaned:
@@ -5,61 +5,38 @@ General:
   individual_rewards: true
   # The level.txt file to load from marl_factory_grid/levels
   level_name: quadrant
-  # Radius of Partially observable Markov decision process
-  pomdp_r: 0 # default 3
+  # View Radius
+  pomdp_r: 0 # 0 = full observability
   # Print all messages and events
   verbose: false
   # Run tests
   tests: false

-# In the "clean and bring" Scenario one agent aims to pick up all items and drop them at drop-off locations while all
-# other agents aim to clean dirt piles.
+# Define Agents, their actions, observations and spawnpoints
 Agents:
   # The clean agents
-  #Sigmund:
-    #Actions:
-      #- Move4
-    #Observations:
-      #- DirtPiles
-      #- Self
-    #Positions:
-      #- (9,1)
-      #- (1,1)
-      #- (2,4)
-      #- (4,7)
-      #- (6,8)
-      #- (7,9)
-      #- (2,4)
-      #- (4,7)
-      #- (6,8)
-      #- (7,9)
-      #- (9,9)
-      #- (9,1)
-  Wolfgang:
+  Agent1:
     Actions:
       - Move4
     Observations:
      - DirtPiles
      - Self
    Positions:
-      - (9,5)
+      - (9,1)
      - (1,1)
      - (2,4)
      - (4,7)
-      - (6,8)
      - (7,9)
      - (2,4)
      - (4,7)
-      - (6,8)
      - (7,9)
      - (9,9)
-      - (9,5)
+      - (9,1)

 Entities:
   DirtPiles:
-    coords_or_quantity: (1, 1), (2,4), (4,7), (6,8), (7,9), (9,9) # (4,7), (2,4), (1, 1) #(1, 1), (2,4), (4,7), (7,9), (9,9) # (1, 1), (1,2), (1,3), (2,4), (2,5), (3,6), (4,7), (5,8), (6,8), (7,9), (8,9), (9,9)
-    initial_amount: 0.5 # <1 to ensure that the robot which first attempts to clean this field, can remove the dirt in one action
+    coords_or_quantity: (1, 1), (2,4), (4,7), (7,9), (9,9) # Locations of dirt piles
+    initial_amount: 0.5
     clean_amount: 1
     dirt_spawn_r_var: 0
     max_global_amount: 12
@@ -67,7 +44,6 @@ Entities:

 # Rules section specifies the rules governing the dynamics of the environment.
 Rules:
-
   # Utilities
   # This rule defines the collision mechanic, introduces a related DoneCondition and lets you specify rewards.
   # Can be omitted/ignored if you do not want to take care of collisions at all.
@@ -78,8 +54,6 @@ Rules:
   # Define the conditions for the environment to stop. Either success or a fail conditions.
   # The environment stops when all dirt is cleaned
   DoneOnAllDirtCleaned:
-  #DoneAtMaxStepsReached: # An episode should last for at most max_steps steps
-    #max_steps: 1000

   # Define how agents spawn.
   # Options: "random" (Spawn agent at a random position from the list of defined positions)
@@ -1,78 +0,0 @@
-General:
-  # RNG-seed to sample the same "random" numbers every time, to make the different runs comparable.
-  env_seed: 69
-  # Individual vs global rewards
-  individual_rewards: true
-  # The level.txt file to load from marl_factory_grid/levels
-  level_name: quadrant
-  # Radius of Partially observable Markov decision process
-  pomdp_r: 0 # default 3
-  # Print all messages and events
-  verbose: false
-  # Run tests
-  tests: false
-
-# In the "clean and bring" Scenario one agent aims to pick up all items and drop them at drop-off locations while all
-# other agents aim to clean dirt piles.
-Agents:
-  # The clean agents
-  #Sigmund:
-    #Actions:
-      #- Move4
-      #- Noop
-    #Observations:
-      #- DirtPiles
-      #- Self
-    #Positions:
-      #- (9,1)
-      #- (1,1)
-      #- (2,4)
-      #- (4,7)
-      #- (7,9)
-      #- (2,4)
-      #- (4,7)
-      #- (7,9)
-      #- (9,9)
-      #- (9,1)
-  Wolfgang:
-    Actions:
-      - Move4
-    Observations:
-      - DirtPiles
-      - Self
-    Positions:
-      - (9,5)
-      #- (1,1)
-      #- (2,4)
-      #- (4,7)
-      #- (7,9)
-      #- (2,4)
-      #- (4,7)
-      #- (7,9)
-      #- (9,9)
-      #- (9,5)
-
-Entities:
-  DirtPiles:
-    coords_or_quantity: (1, 1), (2,4), (4,7), (7,9), (9,9) #(9,9), (7,9), (4,7), (2,4), (1, 1) #(1, 1), (2,4), (4,7), (7,9), (9,9) # (4,7), (2,4), (1, 1) # (1, 1), (2,4), (4,7), (7,9), (9,9) # (1, 1), (1,2), (1,3), (2,4), (2,5), (3,6), (4,7), (5,8), (6,8), (7,9), (8,9), (9,9)
-    initial_amount: 0.5 # <1 to ensure that the robot which first attempts to clean this field, can remove the dirt in one action
-    clean_amount: 1
-    dirt_spawn_r_var: 0
-    max_global_amount: 12
-    max_local_amount: 1
-
-# Rules section specifies the rules governing the dynamics of the environment.
-Rules:
-
-  # Utilities
-  # This rule defines the collision mechanic, introduces a related DoneCondition and lets you specify rewards.
-  # Can be omitted/ignored if you do not want to take care of collisions at all.
-  WatchCollisions:
-    done_at_collisions: false
-
-  # Done Conditions
-  # Define the conditions for the environment to stop. Either success or a fail conditions.
-  # The environment stops when all dirt is cleaned
-  DoneOnAllDirtCleaned:
-  #DoneAtMaxStepsReached:
-    #max_steps: 200
@@ -0,0 +1,50 @@
+General:
+  # RNG-seed to sample the same "random" numbers every time, to make the different runs comparable.
+  env_seed: 69
+  # Individual vs global rewards
+  individual_rewards: true
+  # The level.txt file to load from marl_factory_grid/levels
+  level_name: two_rooms
+  # View Radius
+  pomdp_r: 0 # 0 = full observability
+  # Print all messages and events
+  verbose: false
+  # Run tests
+  tests: false
+
+# Define Agents, their actions, observations and spawnpoints
+Agents:
+  Agent1:
+    Actions:
+      - Move4
+      - DoorUse
+    Observations:
+      - DirtPiles
+      - Self
+    Positions:
+      - (3,1)
+      - (2,1)
+
+Entities:
+  DirtPiles:
+    coords_or_quantity: (2,1), (3,12) # Locations of dirt piles
+    initial_amount: 0.5
+    clean_amount: 1
+    dirt_spawn_r_var: 0
+    max_global_amount: 12
+    max_local_amount: 1
+
+  Doors: { }
+
+# Rules section specifies the rules governing the dynamics of the environment.
+Rules:
+  # Utilities
+  # This rule defines the collision mechanic, introduces a related DoneCondition and lets you specify rewards.
+  WatchCollisions:
+    done_at_collisions: false
+
+  # Done Conditions
+  # Define the conditions for the environment to stop. Either success or a fail conditions
+  # Environment execution stops after 30 steps
+  DoneAtMaxStepsReached:
+    max_steps: 30
@@ -0,0 +1,60 @@
+General:
+  # RNG-seed to sample the same "random" numbers every time, to make the different runs comparable.
+  env_seed: 69
+  # Individual vs global rewards
+  individual_rewards: true
+  # The level.txt file to load from marl_factory_grid/levels
+  level_name: two_rooms
+  # View Radius
+  pomdp_r: 0 # 0 = full observability
+  # Print all messages and events
+  verbose: false
+  # Run tests
+  tests: false
+
+# Define Agents, their actions, observations and spawnpoints
+Agents:
+  Agent1:
+    Actions:
+      - Move4
+    Observations:
+      - DirtPiles
+      - Self
+    Positions:
+      - (3,1)
+      - (1,1)
+      - (3,1)
+      - (5,1)
+      - (3,1)
+      - (1,8)
+      - (3,1)
+      - (5,8)
+
+Entities:
+  DirtPiles:
+    coords_or_quantity: (2,1), (3,12) # Locations of dirt piles
+    initial_amount: 0.5
+    clean_amount: 1
+    dirt_spawn_r_var: 0
+    max_global_amount: 12
+    max_local_amount: 1
+
+  #Doors: { } # We leave out the door during training
+
+Rules:
+  # Utilities
+  # This rule defines the collision mechanic, introduces a related DoneCondition and lets you specify rewards.
+  WatchCollisions:
+    done_at_collisions: false
+
+  # Done Conditions
+  # Define the conditions for the environment to stop. Either success or a fail conditions
+  # The environment stops when all dirt is cleaned
+  DoneOnAllDirtCleaned:
+
+  # Define how agents spawn.
+  # Options: "random" (Spawn agent at a random position from the list of defined positions)
+  #          "first" (Always spawn agent at first position regardless of the other provided positions)
+  #          "order" (Loop through agent positions)
+  AgentSpawnRule:
+    spawn_rule: "order"
@@ -1,30 +1,20 @@
 General:
+  # RNG-seed to sample the same "random" numbers every time, to make the different runs comparable.
   env_seed: 69
   # Individual vs global rewards
   individual_rewards: true
   # The level.txt file to load from marl_factory_grid/levels
   level_name: two_rooms
-  # View Radius; 0 = full observatbility
-  pomdp_r: 0
+  # View Radius
+  pomdp_r: 0 # 0 = full observability
   # Print all messages and events
   verbose: false
   # Run tests
   tests: false

-# In "two rooms one door" scenario 2 agents spawn in 2 different rooms that are connected by a single door. Their aim
-# is to reach the destination in the room they didn't spawn in leading to a conflict at the door.
+# Define Agents, their actions, observations and spawnpoints
 Agents:
-  #Sigmund:
-    #Actions:
-      #- Move4
-      #- DoorUse
-    #Observations:
-      #- DirtPiles
-      #- Self
-    #Positions:
-      #- (3,1)
-      #- (2,1)
-  Wolfgang:
+  Agent2:
     Actions:
       - Move4
       - DoorUse
@@ -37,8 +27,8 @@ Agents:

 Entities:
   DirtPiles:
-    coords_or_quantity: (2,13), (3,2) # (2,1), (3,12)
-    initial_amount: 0.5 # <1 to ensure that the robot which first attempts to clean this field, can remove the dirt in one action
+    coords_or_quantity: (2,13), (3,2) # Locations of dirt piles
+    initial_amount: 0.5
     clean_amount: 1
     dirt_spawn_r_var: 0
     max_global_amount: 12
@@ -46,17 +36,15 @@ Entities:

   Doors: { }

+# Rules section specifies the rules governing the dynamics of the environment.
 Rules:
-  # Environment Dynamics
-  #DoorAutoClose:
-    #close_frequency: 10
-
   # Utilities
   # This rule defines the collision mechanic, introduces a related DoneCondition and lets you specify rewards.
   WatchCollisions:
     done_at_collisions: false

   # Done Conditions
-  #DoneOnAllDirtCleaned:
+  # Define the conditions for the environment to stop. Either success or a fail conditions
+  # Environment execution stops after 30 steps
   DoneAtMaxStepsReached:
-    max_steps: 50
+    max_steps: 30
@@ -1,35 +1,20 @@
 General:
+  # RNG-seed to sample the same "random" numbers every time, to make the different runs comparable.
   env_seed: 69
   # Individual vs global rewards
   individual_rewards: true
   # The level.txt file to load from marl_factory_grid/levels
   level_name: two_rooms
-  # View Radius; 0 = full observatbility
-  pomdp_r: 0
+  # View Radius
+  pomdp_r: 0 # 0 = full observability
   # Print all messages and events
   verbose: false
   # Run tests
   tests: false

-# In "two rooms one door" scenario 2 agents spawn in 2 different rooms that are connected by a single door. Their aim
-# is to reach the destination in the room they didn't spawn in leading to a conflict at the door.
+# Define Agents, their actions, observations and spawnpoints
 Agents:
-  #Sigmund:
-    #Actions:
-      #- Move4
-    #Observations:
-      #- DirtPiles
-      #- Self
-    #Positions:
-      #- (3,1)
-      #- (1,1)
-      #- (3,1)
-      #- (5,1)
-      #- (3,1)
-      #- (1,8)
-      #- (3,1)
-      #- (5,8)
-  Wolfgang:
+  Agent2:
     Actions:
       - Move4
     Observations:
@@ -47,29 +32,30 @@ Agents:

 Entities:
   DirtPiles:
-    coords_or_quantity: (2,13), (3,2) # (2,1), (3,12)
-    initial_amount: 0.5 # <1 to ensure that the robot which first attempts to clean this field, can remove the dirt in one action
+    coords_or_quantity: (2,13), (3,2) # Locations of dirt piles
+    initial_amount: 0.5
     clean_amount: 1
     dirt_spawn_r_var: 0
     max_global_amount: 12
     max_local_amount: 1

-  #Doors: { }
+  #Doors: { } # We leave out the door during training

+# Rules section specifies the rules governing the dynamics of the environment.
 Rules:
-  # Environment Dynamics
-  #DoorAutoClose:
-    #close_frequency: 10
-
   # Utilities
   # This rule defines the collision mechanic, introduces a related DoneCondition and lets you specify rewards.
   WatchCollisions:
     done_at_collisions: false

   # Done Conditions
+  # Define the conditions for the environment to stop. Either success or a fail conditions
+  # The environment stops when all dirt is cleaned
   DoneOnAllDirtCleaned:
-  #DoneAtMaxStepsReached:
-    #max_steps: 100

+  # Defines how agents spawn.
+  # Options: "random" (Spawn agent at a random position from the list of defined positions)
+  #          "first" (Always spawn agent at first position regardless of the other provided positions)
+  #          "order" (Loop through agent positions)
   AgentSpawnRule:
     spawn_rule: "order"
@@ -5,37 +5,34 @@ General:
   individual_rewards: true
   # The level.txt file to load from marl_factory_grid/levels
   level_name: quadrant
-  # Radius of Partially observable Markov decision process
-  pomdp_r: 0 # default 3
+  # View Radius
+  pomdp_r: 0 # 0 = full observability
   # Print all messages and events
   verbose: false
   # Run tests
   tests: false

-# In the "clean and bring" Scenario one agent aims to pick up all items and drop them at drop-off locations while all
-# other agents aim to clean dirt piles.
+# Define Agents, their actions, observations and spawnpoints
 Agents:
   # The clean agents
-  Wolfgang:
+  Agent1:
     Actions:
       - Move4
       - Clean
       - Noop
     Observations:
       - Walls
-      - Other
       - DirtPiles
       - Self
     Positions:
       - (9,1)
-  Reiner:
+  Agent2:
     Actions:
       - Move4
       - Clean
       - Noop
     Observations:
       - Walls
-      - Other
       - DirtPiles
       - Self
     Positions:
@@ -44,7 +41,7 @@ Agents:
 Entities:
   DirtPiles:
     coords_or_quantity: (1, 1), (2,4), (4,7), (7,9), (9,9)
-    initial_amount: 0.5 # <1 to ensure that the robot which first attempts to clean this field, can remove the dirt in one action
+    initial_amount: 0.5
     clean_amount: 1
     dirt_spawn_r_var: 0
     max_global_amount: 12
@@ -63,5 +60,3 @@ Rules:
   # Define the conditions for the environment to stop. Either success or a fail conditions.
   # The environment stops when all dirt is cleaned
   DoneOnAllDirtCleaned:
-  DoneAtMaxStepsReached:
-    max_steps: 200
@@ -1,40 +1,38 @@
 General:
+  # RNG-seed to sample the same "random" numbers every time, to make the different runs comparable.
   env_seed: 69
   # Individual vs global rewards
   individual_rewards: true
   # The level.txt file to load from marl_factory_grid/levels
   level_name: two_rooms
-  # View Radius; 0 = full observatbility
-  pomdp_r: 0
+  # View Radius
+  pomdp_r: 0 # 0 = full observability
   # Print all messages and events
   verbose: false
   # Run tests
   tests: false

-# In "two rooms one door" scenario 2 agents spawn in 2 different rooms that are connected by a single door. Their aim
-# is to reach the destination in the room they didn't spawn in leading to a conflict at the door.
+# Define Agents, their actions, observations and spawnpoints
 Agents:
-  Wolfgang:
+  Agent1:
     Actions:
       - Move4
       - Noop
-      - DestAction
+      - DestAction # Action that is performed when the destination is reached
      - DoorUse
    Observations:
      - Walls
-      - Other
      - Doors
      - Destination
    Positions:
-      - (3,1) # Agent spawnpoint
+      - (3,1)
-  Sigmund:
+  Agent2:
    Actions:
      - Move4
      - Noop
      - DestAction
      - DoorUse
    Observations:
-      - Other
      - Walls
      - Destination
      - Doors
@@ -45,10 +43,11 @@ Entities:
   Destinations:
     spawnrule:
       SpawnDestinationsPerAgent:
+        # Target coordinates
         coords_or_quantity:
-          Wolfgang:
+          Agent1:
-            - (3,12) # Target coordinates
+            - (3,12)
-          Sigmund:
+          Agent2:
             - (3,2)

   Doors: { }
@@ -68,10 +67,12 @@ Rules:
   AssignGlobalPositions: { }

   DoneAtDestinationReach:
-    reward_at_done: 1
+    reward_at_done: 50
     # We want to give rewards only, when all targets have been reached.
     condition: "all"

   # Done Conditions
+  # Define the conditions for the environment to stop. Either success or a fail conditions
+  # Environment execution stops after 30 steps
   DoneAtMaxStepsReached:
-    max_steps: 50
+    max_steps: 30
@@ -293,9 +293,6 @@ class Factory(gym.Env):
                 render_entity.aux = self.obs_builder.curr_lightmaps[render_entity.real_name]
         return self._renderer.render(render_entities)

-    def set_recorder(self, recorder):
-        self._recorder = recorder
-
     def summarize_header(self):
         header = {'rec_step': self.state.curr_step}
         for entity_group in (x for x in self.state if x.name in ['Walls', 'DropOffLocations', 'ChargePods']):
@@ -3,7 +3,7 @@ from typing import List, Tuple
 
 import numpy as np
 
-from marl_factory_grid.algorithms.static.utils import points_to_graph
+from marl_factory_grid.algorithms.tsp.utils import points_to_graph
 from marl_factory_grid.environment import constants as c
 from marl_factory_grid.environment.entity.entity import Entity
 from marl_factory_grid.environment.rules import Rule, SpawnAgents
@@ -1,93 +0,0 @@
-import copy
-
-from pathlib import Path
-
-from marl_factory_grid.algorithms.marl.a2c_dirt import A2C
-from marl_factory_grid.algorithms.utils import load_yaml_file
-
-
-def single_agent_training(config_name):
-    cfg_path = Path(f'../marl_factory_grid/algorithms/marl/single_agent_configs/{config_name}_config.yaml')
-
-    train_cfg = load_yaml_file(cfg_path)
-    # Use environment config with fixed spawnpoints for eval
-    eval_cfg = copy.deepcopy(train_cfg)
-    eval_cfg["env"]["env_name"] = f"rl/{config_name}_eval_config"
-
-    print("Training phase")
-    agent = A2C(train_cfg, eval_cfg)
-    agent.train_loop()
-    print("Evaluation phase")
-    # Have consecutive episode for eval in single agent case
-    train_cfg["algorithm"]["pile_all_done"] = "all"
-    agent.eval_loop(10)
-
-
-def single_agent_eval(config_name, run):
-    cfg_path = Path(f'../marl_factory_grid/algorithms/marl/single_agent_configs/{config_name}_config.yaml')
-
-    train_cfg = load_yaml_file(cfg_path)
-    # Use environment config with fixed spawnpoints for eval
-    eval_cfg = copy.deepcopy(train_cfg)
-    eval_cfg["env"]["env_name"] = f"rl/{config_name}_eval_config"
-    agent = A2C(train_cfg, eval_cfg)
-    print("Evaluation phase")
-    agent.load_agents(run)
-    agent.eval_loop(1)
-
-
-def multi_agent_eval(config_name, runs, emergent_phenomenon=False):
-    cfg_path = Path(f'../marl_factory_grid/algorithms/marl/multi_agent_configs/{config_name}_config.yaml')
-
-    eval_cfg = load_yaml_file(cfg_path)
-    # Sanity setting of required attributes and configs
-    if config_name == "two_rooms":
-        if emergent_phenomenon:
-            eval_cfg["env"]["env_name"] = f"marl_eval/{config_name}_eval_config_emergent"
-            eval_cfg["algorithm"]["auxiliary_piles"] = False
-        else:
-            eval_cfg["algorithm"]["auxiliary_piles"] = True
-    elif config_name == "dirt_quadrant":
-        if emergent_phenomenon:
-            eval_cfg["algorithm"]["pile-order"] = "dynamic"
-        else:
-            eval_cfg["algorithm"]["pile-order"] = "smart"
-    agent = A2C(train_cfg=eval_cfg, eval_cfg=eval_cfg)
-    print("Evaluation phase")
-    agent.load_agents(runs)
-    agent.eval_loop(1)
-
-
-def dirt_quadrant_single_agent_training():
-    single_agent_training("dirt_quadrant")
-
-
-def two_rooms_one_door_modified_single_agent_training():
-    single_agent_training("two_rooms")
-
-
-def dirt_quadrant_single_agent_eval(agent_name):
-    if agent_name == "Sigmund":
-        run = "run0"
-    elif agent_name == "Wolfgang":
-        run = "run1"
-    single_agent_eval("dirt_quadrant", [run])
-
-
-def two_rooms_one_door_modified_single_agent_eval(agent_name):
-    if agent_name == "Sigmund":
-        run = "run2"
-    elif agent_name == "Wolfgang":
-        run = "run3"
-    single_agent_eval("two_rooms", [run])
-
-
-def dirt_quadrant_5_multi_agent_eval(emergent_phenomenon):
-    multi_agent_eval("dirt_quadrant", ["run4", "run5"], emergent_phenomenon)
-
-def dirt_quadrant_5_multi_agent_ctde_eval(emergent_phenomenon):  # run7 == run4
-    multi_agent_eval("dirt_quadrant", ["run4", "run7"], emergent_phenomenon)
-
-def two_rooms_one_door_modified_multi_agent_eval(emergent_phenomenon):
-    multi_agent_eval("two_rooms", ["run2", "run3"], emergent_phenomenon)
-
-
-if __name__ == '__main__':
-    two_rooms_one_door_modified_multi_agent_eval(False)
 75  studies/rl_runs.py  Normal file
@@ -0,0 +1,75 @@
+from pathlib import Path
+
+from marl_factory_grid.algorithms.rl.a2c_dirt import A2C
+from marl_factory_grid.algorithms.utils import load_yaml_file
+
+
+def dirt_quadrant_agent1_training():
+    train_cfg_path = Path(f'../marl_factory_grid/algorithms/rl/single_agent_configs/dirt_quadrant_train_config.yaml')
+    eval_cfg_path = Path(f'../marl_factory_grid/algorithms/rl/single_agent_configs/dirt_quadrant_eval_config.yaml')
+    train_cfg = load_yaml_file(train_cfg_path)
+    eval_cfg = load_yaml_file(eval_cfg_path)
+
+    print("Training phase")
+    agent = A2C(train_cfg, eval_cfg)
+    agent.train_loop()
+    print("Evaluation phase")
+    agent.eval_loop(n_episodes=1)
+
+
+def two_rooms_training(max_steps, agent_name):
+    train_cfg_path = Path(f'../marl_factory_grid/algorithms/rl/single_agent_configs/two_rooms_train_config.yaml')
+    eval_cfg_path = Path(f'../marl_factory_grid/algorithms/rl/single_agent_configs/two_rooms_eval_config.yaml')
+    train_cfg = load_yaml_file(train_cfg_path)
+    eval_cfg = load_yaml_file(eval_cfg_path)
+
+    train_cfg["algorithm"]["max_steps"] = max_steps
+    train_cfg["env"]["env_name"] = f"rl/two_rooms_{agent_name}_train_config"
+    eval_cfg["env"]["env_name"] = f"rl/two_rooms_{agent_name}_eval_config"
+    print("Training phase")
+    agent = A2C(train_cfg, eval_cfg)
+    agent.train_loop()
+    print("Evaluation phase")
+    agent.eval_loop(n_episodes=1)
+
+
+def two_rooms_agent1_training():
+    two_rooms_training(max_steps=190000, agent_name="agent1")
+
+
+def two_rooms_agent2_training():
+    two_rooms_training(max_steps=260000, agent_name="agent2")
+
+
+def single_agent_eval(config_name, run_folder_name):
+    eval_cfg_path = Path(f'../marl_factory_grid/algorithms/rl/single_agent_configs/{config_name}_eval_config.yaml')
+    train_cfg = eval_cfg = load_yaml_file(eval_cfg_path)
+
+    # A value for train_cfg is required, but the train environment won't be used
+    agent = A2C(train_cfg=train_cfg, eval_cfg=eval_cfg)
+    print("Evaluation phase")
+    agent.load_agents([run_folder_name])
+    agent.eval_loop(1)
+
+
+def multi_agent_eval(config_name, runs, emergent_phenomenon=False):
+    eval_cfg_path = Path(f'../marl_factory_grid/algorithms/rl/multi_agent_configs/{config_name}' +
+                         f'_eval_config{"_emergent" if emergent_phenomenon else ""}.yaml')
+    eval_cfg = load_yaml_file(eval_cfg_path)
+
+    # A value for train_cfg is required, but the train environment won't be used
+    agent = A2C(train_cfg=eval_cfg, eval_cfg=eval_cfg)
+    print("Evaluation phase")
+    agent.load_agents(runs)
+    agent.eval_loop(1)
+
+
+def dirt_quadrant_multi_agent_ctde_eval(emergent_phenomenon):
+    multi_agent_eval("dirt_quadrant", ["run0", "run0"], emergent_phenomenon)
+
+
+def two_rooms_multi_agent_eval(emergent_phenomenon):
+    multi_agent_eval("two_rooms", ["run1", "run2"], emergent_phenomenon)
+
+
+if __name__ == '__main__':
+    dirt_quadrant_agent1_training()
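For orientation, a minimal sketch of how these study entry points might be chained from another script. The bare import path and the assumption that the run folders ("run1", "run2") already exist are mine, not part of this commit:

    # Hypothetical driver; assumes studies/ is on the import path and that the
    # referenced run folders were produced by earlier training runs.
    from rl_runs import (two_rooms_agent1_training, two_rooms_agent2_training,
                         two_rooms_multi_agent_eval)

    two_rooms_agent1_training()                              # single-agent training, 190000 steps
    two_rooms_agent2_training()                              # single-agent training, 260000 steps
    two_rooms_multi_agent_eval(emergent_phenomenon=False)    # joint eval from run1 + run2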
@@ -4,10 +4,11 @@ from pathlib import Path
 
 from tqdm import trange
 
-from marl_factory_grid.algorithms.static.TSP_dirt_agent import TSPDirtAgent
-from marl_factory_grid.algorithms.static.TSP_target_agent import TSPTargetAgent
+from marl_factory_grid.algorithms.tsp.TSP_dirt_agent import TSPDirtAgent
+from marl_factory_grid.algorithms.tsp.TSP_target_agent import TSPTargetAgent
 from marl_factory_grid.environment.factory import Factory
 
+
 def get_dirt_quadrant_tsp_agents(emergent_phenomenon, factory):
     agents = [TSPDirtAgent(factory, 0), TSPDirtAgent(factory, 1)]
     if not emergent_phenomenon:
@@ -31,13 +32,11 @@ def get_dirt_quadrant_tsp_agents(emergent_phenomenon, factory):
             for u, v, weight in agent._position_graph.edges(data='weight'):
                 agent._position_graph[u][v]['weight'] = edge_costs[f"{u}-{v}"]
 
-            """for u, v, weight in agent._position_graph.edges(data='weight'):
-                print(f"Edge ({u}-{v}) has weight: {weight}")"""
 
     return agents
 
 
-def get_two_rooms_one_door_modified_tsp_agents(emergent_phenomenon, factory):
+def get_two_rooms_tsp_agents(emergent_phenomenon, factory):
     agents = [TSPTargetAgent(factory, 0), TSPTargetAgent(factory, 1)]
     if not emergent_phenomenon:
         print(emergent_phenomenon)
@@ -45,6 +44,7 @@ def get_two_rooms_one_door_modified_tsp_agents(emergent_phenomenon, factory):
             agent._position_graph[(3, 1)][(3, 2)]['weight'] = 4
     return agents
 
+
 def run_tsp_setting(config_name, emergent_phenomenon):
     # Render at each step?
     render = True
@@ -74,7 +74,7 @@ def run_tsp_setting(config_name, emergent_phenomenon):
         if config_name == "dirt_quadrant":
             agents = get_dirt_quadrant_tsp_agents(emergent_phenomenon, factory)
         elif config_name == "two_rooms":
-            agents = get_two_rooms_one_door_modified_tsp_agents(emergent_phenomenon, factory)
+            agents = get_two_rooms_tsp_agents(emergent_phenomenon, factory)
         else:
             print("Config name does not exist. Abort...")
             break
@@ -95,7 +95,7 @@ def dirt_quadrant_multi_agent_tsp(emergent_phenomenon):
     run_tsp_setting("dirt_quadrant", emergent_phenomenon)
 
 
-def two_rooms_one_door_modified_multi_agent_tsp(emergent_phenomenon):
+def two_rooms_multi_agent_tsp(emergent_phenomenon):
     run_tsp_setting("two_rooms", emergent_phenomenon)
 
 
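The weight overrides shown in the TSP hunks above (the edge_costs lookup and the fixed weight of 4 on the (3,1)-(3,2) edge) are plain networkx edge-attribute writes. A minimal, self-contained sketch of the pattern, using a toy graph and made-up costs rather than the study's actual position graph:

    import networkx as nx

    # Toy position graph standing in for agent._position_graph.
    g = nx.Graph()
    g.add_edge((3, 1), (3, 2), weight=1)
    g.add_edge((3, 1), (4, 1), weight=1)
    g.add_edge((4, 1), (3, 2), weight=1)

    # Raising an edge weight steers shortest-path / TSP routing away from that edge.
    g[(3, 1)][(3, 2)]['weight'] = 4
    print(nx.shortest_path(g, (3, 1), (3, 2), weight='weight'))  # now routes via (4, 1)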