All relevant functional code for the A2C Dirt Quadrant setting, with small changes to the environment + different configs for single-agent and multi-agent settings

Julian Schönberger
2024-05-06 12:33:37 +02:00
parent 55026eda12
commit 3c54d04f9f
13 changed files with 652 additions and 174 deletions

View File

@@ -0,0 +1,32 @@
agent:
classname: marl_factory_grid.algorithms.marl.networks.RecurrentAC
n_agents: 2
obs_emb_size: 96
action_emb_size: 16
hidden_size_actor: 64
hidden_size_critic: 64
use_agent_embedding: False
env:
classname: marl_factory_grid.configs.custom
env_name: "custom/MultiAgentConfigs/dirt_quadrant_train_config"
n_agents: 2
max_steps: 250
pomdp_r: 2
stack_n_frames: 0
individual_rewards: True
train_render: False
eval_render: True
save_and_log: True
method: marl_factory_grid.algorithms.marl.LoopSEAC
algorithm:
gamma: 0.99
entropy_coef: 0.01
vf_coef: 0.05
n_steps: 0 # Maximum number of steps to sample (n-step TD) before the next value and policy update. Default 0: Monte Carlo (full-episode returns)
max_steps: 200000
advantage: "Advantage-AC" # Options: "Advantage-AC", "TD-Advantage-AC", "Reinforce"
pile-order: "dynamic" # Options: "fixed", "random", "none", "agents", "dynamic", "smart" (Use "fixed", "random" and "none" for single agent training and the other for multi agent inference)
pile-observability: "single" # Options: "single", "all"
pile_all_done: "all" # Options: "single", "all" ("single" for training, "all" for eval)
chunk-episode: 20000 # Chunk size in steps (0 = update networks with the full episode at once)
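For orientation, the sketch below shows how a config like the one above might be loaded and its dotted classname strings resolved to importable classes. This is a minimal sketch, not code from this commit: it assumes PyYAML is available, and the filename and the load_class helper are hypothetical.

import importlib

import yaml

def load_class(dotted_path):
    # Resolve a dotted path such as
    # "marl_factory_grid.algorithms.marl.networks.RecurrentAC" to a class.
    module_path, _, class_name = dotted_path.rpartition(".")
    return getattr(importlib.import_module(module_path), class_name)

# Hypothetical filename for the multi-agent config shown above.
with open("dirt_quadrant_multi_agent_config.yaml") as f:
    cfg = yaml.safe_load(f)

agent_cls = load_class(cfg["agent"]["classname"])  # e.g. RecurrentAC
gamma = cfg["algorithm"]["gamma"]  # 0.99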

View File

@@ -8,7 +8,7 @@ agent:
use_agent_embedding: False
env:
classname: marl_factory_grid.configs.custom
env_name: "custom/dirt_quadrant_random_pos"
env_name: "custom/dirt_quadrant_train_config"
n_agents: 1
max_steps: 250
pomdp_r: 2
@@ -16,13 +16,17 @@ env:
individual_rewards: True
train_render: False
eval_render: True
save_and_log: False
method: marl_factory_grid.algorithms.marl.LoopSEAC
algorithm:
gamma: 0.99
entropy_coef: 0.01
vf_coef: 0.05
n_steps: 0 # Maximum number of steps to sample (n-step TD) before the next value and policy update. Default 0: Monte Carlo (full-episode returns)
-max_steps: 80000
+max_steps: 270000
advantage: "Advantage-AC" # Options: "Advantage-AC", "TD-Advantage-AC", "Reinforce"
pile-order: "fixed" # Options: "fixed", "random", "none", "agents"
pile-order: "fixed" # Options: "fixed", "random", "none", "agents", "dynamic", "smart" (Use "fixed", "random" and "none" for single agent training and the other for multi agent inference)
pile-observability: "single" # Options: "single", "all"
pile_all_done: "single" # Options: "single", "all" ("single" for training, "all" for eval)
chunk-episode: 20000 # Chunk size in steps (0 = update networks with the full episode at once)
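The n_steps and advantage settings above follow the standard actor-critic formulation: n_steps: 0 computes full Monte Carlo returns over the episode, while n_steps > 0 truncates after n rewards and bootstraps from the critic's value estimate. The sketch below illustrates these semantics with the standard formulas; it is not the repository's implementation.

def discounted_returns(rewards, values, gamma=0.99, n_steps=0):
    # rewards[t] is the reward at step t, values[t] the critic's V(s_t).
    T = len(rewards)
    returns = [0.0] * T
    if n_steps == 0:  # Monte Carlo: discounted sum to the end of the episode
        g = 0.0
        for t in reversed(range(T)):
            g = rewards[t] + gamma * g
            returns[t] = g
    else:  # n-step TD: truncate after n rewards, bootstrap with V(s_{t+n})
        for t in range(T):
            horizon = min(t + n_steps, T)
            g = sum(gamma ** (k - t) * rewards[k] for k in range(t, horizon))
            if horizon < T:  # bootstrap unless the episode already ended
                g += gamma ** (horizon - t) * values[horizon]
            returns[t] = g
    return returns

# "Advantage-AC": A(s_t, a_t) = G_t - V(s_t)
rewards, values = [0.0, 0.0, 1.0], [0.2, 0.5, 0.9]
returns = discounted_returns(rewards, values, gamma=0.99, n_steps=0)
advantages = [g - v for g, v in zip(returns, values)]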