mirror of https://github.com/illiumst/marl-factory-grid.git (synced 2025-12-14 03:00:37 +01:00)
added changes from code submission branch and coin entity
@@ -0,0 +1,34 @@
agent:
  classname: marl_factory_grid.algorithms.rl.networks.RecurrentAC
  n_agents: 2
  obs_emb_size: 96
  action_emb_size: 16
  hidden_size_actor: 64
  hidden_size_critic: 64
  use_agent_embedding: False
env:
  classname: marl_factory_grid.configs.custom
  env_name: "custom/MultiAgentConfigs/dirt_quadrant_train_config"
  n_agents: 2
  max_steps: 250
  pomdp_r: 2
  stack_n_frames: 0
  individual_rewards: True
  train_render: False
  eval_render: True
  save_and_log: True
  record: False
method: marl_factory_grid.algorithms.rl.LoopSEAC
algorithm:
  gamma: 0.99
  entropy_coef: 0.01
  vf_coef: 0.05
  n_steps: 0 # Maximum amount of experience sampled (n-step TD) before the next value and policy update. Default 0: full-episode Monte Carlo
  max_steps: 200000
  advantage: "Advantage-AC" # Options: "Advantage-AC", "TD-Advantage-AC", "Reinforce"
  pile-order: "dynamic" # Use "dynamic" to observe the emergent phenomenon and "smart" to prevent it
  pile-observability: "single" # Options: "single", "all"
  pile_all_done: "shared" # Options: "single", "all" ("single" for training, "all" for eval), "shared"
  auxiliary_piles: False # Only considered when pile-order = "agents"
  chunk-episode: 20000 # Chunk size (0 = update networks with the full episode at once)
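The hunk above is a plain YAML document. A minimal sketch of loading it and reading the nested sections, assuming the file is saved locally as dirt_quadrant_marl_config.yaml (an illustrative name, not the repository's path) and that PyYAML is installed:

import yaml  # PyYAML; assumed to be available

# Illustrative file name only -- not the path used in the repository.
with open("dirt_quadrant_marl_config.yaml") as f:
    cfg = yaml.safe_load(f)

agent_cfg, env_cfg, algo_cfg = cfg["agent"], cfg["env"], cfg["algorithm"]
assert agent_cfg["n_agents"] == env_cfg["n_agents"], "agent and env sections should agree"
print(cfg["method"])                              # marl_factory_grid.algorithms.rl.LoopSEAC
print(algo_cfg["gamma"], algo_cfg["pile-order"])  # 0.99 dynamic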
@@ -0,0 +1,35 @@
agent:
  classname: marl_factory_grid.algorithms.rl.networks.RecurrentAC
  n_agents: 2
  obs_emb_size: 96
  action_emb_size: 16
  hidden_size_actor: 64
  hidden_size_critic: 64
  use_agent_embedding: False
env:
  classname: marl_factory_grid.configs.custom
  env_name: "custom/two_rooms_one_door_modified_train_config"
  n_agents: 2
  max_steps: 250
  pomdp_r: 2
  stack_n_frames: 0
  individual_rewards: True
  train_render: False
  eval_render: True
  save_and_log: True
  record: False
method: marl_factory_grid.algorithms.rl.LoopSEAC
algorithm:
  gamma: 0.99
  entropy_coef: 0.01
  vf_coef: 0.05
  n_steps: 0 # Maximum amount of experience sampled (n-step TD) before the next value and policy update. Default 0: full-episode Monte Carlo
  max_steps: 260000
  advantage: "Advantage-AC" # Options: "Advantage-AC", "TD-Advantage-AC", "Reinforce"
  pile-order: "agents" # Options: "fixed", "random", "none", "agents", "dynamic", "smart" (use "fixed", "random", and "none" for single-agent training and the others for multi-agent inference)
  pile-observability: "single" # Options: "single", "all"
  pile_all_done: "distributed" # Options: "single", "all" ("single" for training, "all" and "distributed" for eval)
  auxiliary_piles: True # Use True to observe the emergent phenomenon and False to prevent it
  chunk-episode: 20000 # Chunk size (0 = update networks with the full episode at once)
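The comments above suggest that the same file is reused between training and evaluation by flipping a few switches (pile_all_done, pile-order, render flags). A hedged sketch of a helper that derives such an eval dictionary from a loaded train config; the function name and the exact overrides are assumptions for illustration, not the repository's API:

import copy

def derive_eval_config(train_cfg: dict) -> dict:
    """Hypothetical helper: copy a loaded train config and flip the switches
    the YAML comments mark as eval-oriented ("single" for training,
    "all"/"distributed" for eval)."""
    cfg = copy.deepcopy(train_cfg)
    cfg["env"]["train_render"] = False
    cfg["env"]["eval_render"] = True
    cfg["algorithm"]["pile_all_done"] = "distributed"
    cfg["algorithm"]["pile-order"] = "agents"
    return cfg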
@@ -0,0 +1,34 @@
agent:
  classname: marl_factory_grid.algorithms.rl.networks.RecurrentAC
  n_agents: 1
  obs_emb_size: 96
  action_emb_size: 16
  hidden_size_actor: 64
  hidden_size_critic: 64
  use_agent_embedding: False
env:
  classname: marl_factory_grid.configs.custom
  env_name: "custom/dirt_quadrant_train_config"
  n_agents: 1
  max_steps: 250
  pomdp_r: 2
  stack_n_frames: 0
  individual_rewards: True
  train_render: False
  eval_render: True
  save_and_log: True
  record: False
method: marl_factory_grid.algorithms.rl.LoopSEAC
algorithm:
  gamma: 0.99
  entropy_coef: 0.01
  vf_coef: 0.05
  n_steps: 0 # Maximum amount of experience sampled (n-step TD) before the next value and policy update. Default 0: full-episode Monte Carlo
  max_steps: 240000
  advantage: "Advantage-AC" # Options: "Advantage-AC", "TD-Advantage-AC", "Reinforce"
  pile-order: "fixed" # Options: "fixed", "random", "none", "agents", "dynamic", "smart" (use "fixed", "random", and "none" for single-agent training and the others for multi-agent inference)
  pile-observability: "single" # Options: "single", "all"
  pile_all_done: "single" # Options: "single", "all" ("single" for training, "all" for eval)
  auxiliary_piles: False # Only considered when pile-order = "agents"
  chunk-episode: 20000 # Chunk size (0 = update networks with the full episode at once)
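The n_steps: 0 comment above means value and policy updates fall back to full Monte Carlo returns. A small worked example of that discounting under gamma: 0.99, in plain NumPy rather than the repository's implementation:

import numpy as np

def monte_carlo_returns(rewards, gamma=0.99):
    """Discounted return G_t = r_t + gamma * G_{t+1}, computed backwards
    over a finished episode (what n_steps: 0 falls back to)."""
    returns = np.zeros(len(rewards))
    g = 0.0
    for t in reversed(range(len(rewards))):
        g = rewards[t] + gamma * g
        returns[t] = g
    return returns

print(monte_carlo_returns([0.0, 0.0, 1.0]))  # [0.9801 0.99   1.    ]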
@@ -0,0 +1,8 @@
marl_factory_grid>environment>rules.py#SpawnEntity.on_reset()
marl_factory_grid>environment>rewards.py
marl_factory_grid>modules>clean_up>groups.py#DirtPiles.trigger_spawn()
marl_factory_grid>environment>rules.py#AgentSpawnRule
marl_factory_grid>utils>states.py#GameState.__init__()
marl_factory_grid>environment>factory.py>Factory#render
marl_factory_grid>environment>factory.py>Factory#set_recorder
marl_factory_grid>utils>renderer.py>Renderer#render
@@ -0,0 +1,35 @@
agent:
  classname: marl_factory_grid.algorithms.rl.networks.RecurrentAC
  n_agents: 1
  obs_emb_size: 96
  action_emb_size: 16
  hidden_size_actor: 64
  hidden_size_critic: 64
  use_agent_embedding: False
env:
  classname: marl_factory_grid.configs.custom
  env_name: "custom/two_rooms_one_door_modified_train_config"
  n_agents: 1
  max_steps: 250
  pomdp_r: 2
  stack_n_frames: 0
  individual_rewards: True
  train_render: False
  eval_render: True
  save_and_log: False
  record: False
method: marl_factory_grid.algorithms.rl.LoopSEAC
algorithm:
  gamma: 0.99
  entropy_coef: 0.01
  vf_coef: 0.05
  n_steps: 0 # Maximum amount of experience sampled (n-step TD) before the next value and policy update. Default 0: full-episode Monte Carlo
  max_steps: 260000
  advantage: "Advantage-AC" # Options: "Advantage-AC", "TD-Advantage-AC", "Reinforce"
  pile-order: "fixed" # Options: "fixed", "random", "none", "agents", "dynamic", "smart" (use "fixed", "random", and "none" for single-agent training and the others for multi-agent inference)
  pile-observability: "single" # Options: "single", "all"
  pile_all_done: "single" # Options: "single", "all" ("single" for training, "all" for eval)
  auxiliary_piles: False # Only considered when pile-order = "agents"
  chunk-episode: 20000 # Chunk size (0 = update networks with the full episode at once)
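chunk-episode: 20000 caps how many transitions of an episode are pushed through the networks per update (0 = whole episode at once). A generic sketch of that chunking, not the repository's training loop:

def iter_chunks(transitions, chunk_size=20000):
    """Yield consecutive slices of an episode; chunk_size=0 yields it whole,
    mirroring the chunk-episode semantics described in the config comment."""
    if chunk_size <= 0:
        yield transitions
        return
    for start in range(0, len(transitions), chunk_size):
        yield transitions[start:start + chunk_size]

# Example: a 250-step episode with chunk size 100 -> updates on 100, 100, 50 steps.
for chunk in iter_chunks(list(range(250)), chunk_size=100):
    print(len(chunk))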