All relevant functional code for the A2C Dirt Quadrant setting, with small changes to the environment + different configs for single-agent and multi-agent settings

Julian Schönberger
2024-05-06 12:33:37 +02:00
parent 55026eda12
commit 3c54d04f9f
13 changed files with 652 additions and 174 deletions

View File

@@ -0,0 +1,32 @@
agent:
classname: marl_factory_grid.algorithms.marl.networks.RecurrentAC
n_agents: 2
obs_emb_size: 96
action_emb_size: 16
hidden_size_actor: 64
hidden_size_critic: 64
use_agent_embedding: False
env:
classname: marl_factory_grid.configs.custom
env_name: "custom/MultiAgentConfigs/dirt_quadrant_train_config"
n_agents: 2
max_steps: 250
pomdp_r: 2
stack_n_frames: 0
individual_rewards: True
train_render: False
eval_render: True
save_and_log: True
method: marl_factory_grid.algorithms.marl.LoopSEAC
algorithm:
gamma: 0.99
entropy_coef: 0.01
vf_coef: 0.05
n_steps: 0 # Maximum number of steps to sample (n-step TD) before the next value and policy update. Default 0: Monte Carlo (full-episode returns)
max_steps: 200000
advantage: "Advantage-AC" # Options: "Advantage-AC", "TD-Advantage-AC", "Reinforce"
pile-order: "dynamic" # Options: "fixed", "random", "none", "agents", "dynamic", "smart" (Use "fixed", "random" and "none" for single agent training and the other for multi agent inference)
pile-observability: "single" # Options: "single", "all"
pile_all_done: "all" # Options: "single", "all" ("single" for training, "all" for eval)
chunk-episode: 20000 # Chunk size in steps (0 = update networks with the full episode at once)
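For orientation, the sketch below shows how a config like the one above might be loaded and its dotted classname strings resolved to importable classes. This is a minimal sketch, not code from this commit: it assumes PyYAML is available, and the filename and the load_class helper are hypothetical.

import importlib

import yaml

def load_class(dotted_path):
    # Resolve a dotted path such as
    # "marl_factory_grid.algorithms.marl.networks.RecurrentAC" to a class.
    module_path, _, class_name = dotted_path.rpartition(".")
    return getattr(importlib.import_module(module_path), class_name)

# Hypothetical filename for the multi-agent config shown above.
with open("dirt_quadrant_multi_agent_config.yaml") as f:
    cfg = yaml.safe_load(f)

agent_cls = load_class(cfg["agent"]["classname"])  # e.g. RecurrentAC
gamma = cfg["algorithm"]["gamma"]  # 0.99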

View File

@@ -8,7 +8,7 @@ agent:
use_agent_embedding: False
env:
classname: marl_factory_grid.configs.custom
env_name: "custom/dirt_quadrant_random_pos"
env_name: "custom/dirt_quadrant_train_config"
n_agents: 1
max_steps: 250
pomdp_r: 2
@@ -16,13 +16,17 @@ env:
individual_rewards: True
train_render: False
eval_render: True
save_and_log: False
method: marl_factory_grid.algorithms.marl.LoopSEAC
algorithm:
gamma: 0.99
entropy_coef: 0.01
vf_coef: 0.05
n_steps: 0 # Maximum number of steps to sample (n-step TD) before the next value and policy update. Default 0: Monte Carlo (full-episode returns)
-max_steps: 80000
+max_steps: 270000
advantage: "Advantage-AC" # Options: "Advantage-AC", "TD-Advantage-AC", "Reinforce"
pile-order: "fixed" # Options: "fixed", "random", "none", "agents"
pile-order: "fixed" # Options: "fixed", "random", "none", "agents", "dynamic", "smart" (Use "fixed", "random" and "none" for single agent training and the other for multi agent inference)
pile-observability: "single" # Options: "single", "all"
pile_all_done: "single" # Options: "single", "all" ("single" for training, "all" for eval)
chunk-episode: 20000 # Chunk size in steps (0 = update networks with the full episode at once)
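The n_steps and advantage settings above follow the standard actor-critic formulation: n_steps: 0 computes full Monte Carlo returns over the episode, while n_steps > 0 truncates after n rewards and bootstraps from the critic's value estimate. The sketch below illustrates these semantics with the standard formulas; it is not the repository's implementation.

def discounted_returns(rewards, values, gamma=0.99, n_steps=0):
    # rewards[t] is the reward at step t, values[t] the critic's V(s_t).
    T = len(rewards)
    returns = [0.0] * T
    if n_steps == 0:  # Monte Carlo: discounted sum to the end of the episode
        g = 0.0
        for t in reversed(range(T)):
            g = rewards[t] + gamma * g
            returns[t] = g
    else:  # n-step TD: truncate after n rewards, bootstrap with V(s_{t+n})
        for t in range(T):
            horizon = min(t + n_steps, T)
            g = sum(gamma ** (k - t) * rewards[k] for k in range(t, horizon))
            if horizon < T:  # bootstrap unless the episode already ended
                g += gamma ** (horizon - t) * values[horizon]
            returns[t] = g
    return returns

# "Advantage-AC": A(s_t, a_t) = G_t - V(s_t)
rewards, values = [0.0, 0.0, 1.0], [0.2, 0.5, 0.9]
returns = discounted_returns(rewards, values, gamma=0.99, n_steps=0)
advantages = [g - v for g, v in zip(returns, values)]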