Mirror of https://github.com/illiumst/marl-factory-grid.git
Reworked configurations
@@ -4,7 +4,9 @@ env:
  n_agents: 2 # Number of agents in the environment
  eval_render: True # If inference should be graphically visualized
  save_and_log: False # If configurations and potential logging files should be saved
  wandb_log: False # If metrics for evaluation steps should be logged with weights&biases
algorithm:
  seed: 42 # Picks seed to make random parts of algorithm reproducible. -1 for random seed
  pile-order: "smart" # Triggers implementation of our emergence prevention mechanism. Agents consider distance to other agent
  pile-observability: "single" # Agents can only perceive one dirt pile at any given time step
  pile_all_done: "shared" # Indicates that agents don't have to collect the same dirt piles
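A minimal sketch of what a distance-aware choice such as pile-order: "smart" could look like, assuming grid positions and a Manhattan metric; the function names and tie-breaking are illustrative assumptions, not the repository's implementation:

def manhattan(a, b):
    # Grid distance between two (row, col) positions
    return abs(a[0] - b[0]) + abs(a[1] - b[1])

def smart_next_pile(own_pos, other_positions, piles):
    # Prefer piles that no other agent is strictly closer to, so two
    # agents do not converge on the same target; fall back to the
    # globally nearest pile if every pile is contested.
    uncontested = [p for p in piles
                   if all(manhattan(own_pos, p) <= manhattan(o, p)
                          for o in other_positions)]
    pool = uncontested or piles
    return min(pool, key=lambda p: manhattan(own_pos, p))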
@@ -5,7 +5,9 @@ env:
  n_agents: 2 # Number of agents in the environment
  eval_render: True # If inference should be graphically visualized
  save_and_log: False # If configurations and potential logging files should be saved
  wandb_log: False # If metrics for evaluation steps should be logged with weights&biases
algorithm:
  seed: 42 # Picks seed to make random parts of algorithm reproducible. -1 for random seed
  pile-order: "dynamic" # Agents only decide on next target pile based on the distance to the respective piles
  pile-observability: "single" # Agents can only perceive one dirt pile at any given time step
  pile_all_done: "shared" # Indicates that agents don't have to collect the same dirt piles
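By contrast, the "dynamic" order as commented above reduces to targeting the currently nearest remaining pile, without regard for the other agents; a one-line sketch under the same illustrative assumptions:

def dynamic_next_pile(own_pos, piles):
    # Greedy choice: nearest remaining pile by Manhattan distance
    return min(piles, key=lambda p: abs(own_pos[0] - p[0]) + abs(own_pos[1] - p[1]))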
@@ -3,8 +3,10 @@ env:
  env_name: "marl_eval/two_rooms_eval_config"
  n_agents: 2 # Number of agents in the environment
  eval_render: True # If inference should be graphically visualized
-  save_and_log: False # If configurations and potential logging files should be saved
+  save_and_log: True # If configurations and potential logging files should be saved
  wandb_log: False # If metrics for evaluation steps should be logged with weights&biases
algorithm:
  seed: 42 # Picks seed to make random parts of algorithm reproducible. -1 for random seed
  # Piles (=encoded flags) are evenly distributed among the two agents and have to be collected in the order defined
  # by the environment config (cf. coords_or_quantity)
  pile-order: "agents"
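A sketch of how the "agents" order could distribute the configured piles, assuming a simple round-robin split that preserves the ordering from coords_or_quantity; the helper name is hypothetical:

def assign_piles(piles, n_agents):
    # Even round-robin split; each agent keeps the configured ordering
    return [piles[i::n_agents] for i in range(n_agents)]

# assign_piles([(1, 1), (4, 2), (7, 3), (2, 8)], 2)
# -> [[(1, 1), (7, 3)], [(4, 2), (2, 8)]]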
@@ -5,7 +5,9 @@ env:
  n_agents: 2 # Number of agents in the environment
  eval_render: True # If inference should be graphically visualized
  save_and_log: False # If configurations and potential logging files should be saved
  wandb_log: False # If metrics for evaluation steps should be logged with weights&biases
algorithm:
  seed: 42 # Picks seed to make random parts of algorithm reproducible. -1 for random seed
  # Piles (=encoded flags) are evenly distributed among the two agents and have to be collected in the order defined
  # by the environment config (cf. coords_or_quantity)
  pile-order: "agents"
@@ -4,7 +4,9 @@ env:
  n_agents: 1 # Number of agents in the environment
  eval_render: True # If inference should be graphically visualized
  save_and_log: False # If configurations and potential logging files should be saved
  wandb_log: False # If metrics for evaluation steps should be logged with weights&biases
algorithm:
  seed: 42 # Picks seed to make random parts of algorithm reproducible. -1 for random seed
  pile-order: "fixed" # Clean dirt piles in a fixed order specified by the environment config (cf. coords_or_quantity)
  pile-observability: "single" # Agent can only perceive one dirt pile at any given time step
  pile_all_done: "all" # During inference the episode ends only when all dirt piles are cleaned
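Taken together, pile-order: "fixed", pile-observability: "single", and pile_all_done: "all" suggest a loop like the following sketch: the agent always sees exactly one target, the first uncleaned pile in the configured order, and the episode may only end once none remain (names are assumptions for illustration):

def current_target(ordered_piles, cleaned):
    # First uncleaned pile in the order given by coords_or_quantity;
    # None signals that all piles are cleaned and the episode may end.
    for pile in ordered_piles:
        if pile not in cleaned:
            return pile
    return None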
@@ -4,11 +4,16 @@ env:
  n_agents: 1 # Number of agents in the environment
  train_render: False # If training should be graphically visualized
  save_and_log: True # If configurations and potential logging files should be saved
  wandb_log: True # If metrics for training steps should be logged with weights&biases
algorithm:
  seed: 9 # Picks seed to make random parts of algorithm reproducible. -1 for random seed
  gamma: 0.99 # The gamma value that is used as discounting factor
  n_steps: 0 # How much experience should be sampled at most until the next value- and policy-net updates are performed. (0 = Monte Carlo)
  chunk-episode: 20000 # For update, splits very large episodes in batches of approximately equal size. (0 = update networks with full episode at once)
-  max_steps: 140000 # Number of training steps used for agent1 (=agent2)
+  max_steps: 400000 # Number of training steps used for agent1 (=agent2)
  early_stopping: True # If the early stopping functionality should be used
  last_n_episodes: 100 # To determine if low change phase has begun, the last n episodes are checked if the mean target change is reached
  mean_target_change: 2.0 # What should be the accepted fluctuation for determining if a low change phase has begun
  advantage: "Advantage-AC" # Defines the used actor critic model
  pile-order: "fixed" # Clean dirt piles in a fixed order specified by the environment config (cf. coords_or_quantity)
  pile-observability: "single" # Agent can only perceive one dirt pile at any given time step
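The three early-stopping options suggest a rule along these lines; a sketch that assumes the tracked quantity is a per-episode scalar and the statistic is the mean absolute change, neither of which is confirmed by the diff:

def should_stop_early(history, last_n_episodes=100, mean_target_change=2.0):
    # Stop once the metric fluctuates less than mean_target_change on
    # average over the last last_n_episodes episodes (low-change phase).
    if len(history) < last_n_episodes + 1:
        return False
    recent = history[-(last_n_episodes + 1):]
    changes = [abs(b - a) for a, b in zip(recent, recent[1:])]
    return sum(changes) / len(changes) <= mean_target_change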
@@ -1,10 +1,12 @@
env:
  classname: marl_factory_grid.environment.configs.rl
-  env_name: "rl/two_rooms_eval_config"
+  env_name: "rl/two_rooms_agent2_eval_config"
  n_agents: 1 # Number of agents in the environment
  eval_render: True # If inference should be graphically visualized
  save_and_log: False # If configurations and potential logging files should be saved
  wandb_log: False # If metrics for evaluation steps should be logged with weights&biases
algorithm:
  seed: 42 # Picks seed to make random parts of algorithm reproducible. -1 for random seed
  pile-order: "fixed" # Clean dirt piles (=encoded flags) in a fixed order specified by the environment config (cf. coords_or_quantity)
  pile-observability: "single" # Agent can only perceive one dirt pile at any given time step
  pile_all_done: "all" # During inference the episode ends only when all dirt piles are cleaned
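For reference, reading such a config is plain YAML; a minimal sketch with PyYAML, where the file name is hypothetical and only the key layout mirrors the hunk above:

import yaml

with open("two_rooms_agent2_eval_config.yaml") as f:
    cfg = yaml.safe_load(f)

print(cfg["env"]["n_agents"])           # 1
print(cfg["algorithm"]["pile-order"])   # "fixed"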
@@ -3,11 +3,16 @@ env:
  n_agents: 1 # Number of agents in the environment
  train_render: False # If training should be graphically visualized
  save_and_log: True # If configurations and potential logging files should be saved
  wandb_log: True # If metrics for training steps should be logged with weights&biases
algorithm:
  seed: 9 # Picks seed to make random parts of algorithm reproducible. -1 for random seed
  gamma: 0.99 # The gamma value that is used as discounting factor
  n_steps: 0 # How much experience should be sampled at most until the next value- and policy-net updates are performed. (0 = Monte Carlo)
  chunk-episode: 20000 # For update, splits very large episodes in batches of approximately equal size. (0 = update networks with full episode at once)
-  max_steps: 260000 # Number of training steps used to train the agent. Here, only a placeholder value
+  max_steps: 300000 # Number of training steps used to train the agent. Here, only a placeholder value
  early_stopping: True # If the early stopping functionality should be used
  last_n_episodes: 100 # To determine if low change phase has begun, the last n episodes are checked if the mean target change is reached
  mean_target_change: 2.0 # What should be the accepted fluctuation for determining if a low change phase has begun
  advantage: "Advantage-AC" # Defines the used actor critic model
  pile-order: "fixed" # Clean dirt piles (=encoded flags) in a fixed order specified by the environment config (cf. coords_or_quantity)
  pile-observability: "single" # Agent can only perceive one dirt pile at any given time step
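Finally, chunk-episode: 20000 implies splitting an overly long episode into batches of roughly equal size before each network update; a sketch of that splitting, with names chosen for illustration:

def chunk_episode(transitions, chunk_size=20000):
    # chunk_size 0 means: update the networks with the full episode at once
    if chunk_size == 0 or not transitions:
        return [transitions]
    n_chunks = -(-len(transitions) // chunk_size)   # ceil division
    per_chunk = -(-len(transitions) // n_chunks)    # approximately equal sizes
    return [transitions[i:i + per_chunk]
            for i in range(0, len(transitions), per_chunk)]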