mirror of https://github.com/illiumst/marl-factory-grid.git (synced 2025-12-14 03:00:37 +01:00)
added changes from code submission branch and coin entity
@@ -0,0 +1,34 @@
agent:
  classname: marl_factory_grid.algorithms.rl.networks.RecurrentAC
  n_agents: 2
  obs_emb_size: 96
  action_emb_size: 16
  hidden_size_actor: 64
  hidden_size_critic: 64
  use_agent_embedding: False
env:
  classname: marl_factory_grid.configs.custom
  env_name: "custom/MultiAgentConfigs/dirt_quadrant_train_config"
  n_agents: 2
  max_steps: 250
  pomdp_r: 2
  stack_n_frames: 0
  individual_rewards: True
  train_render: False
  eval_render: True
  save_and_log: True
  record: False
method: marl_factory_grid.algorithms.rl.LoopSEAC
algorithm:
  gamma: 0.99
  entropy_coef: 0.01
  vf_coef: 0.05
  n_steps: 0 # Maximum amount of experience sampled (n-step TD) before the next value and policy update. Default 0: full-episode Monte Carlo
  max_steps: 200000
  advantage: "Advantage-AC" # Options: "Advantage-AC", "TD-Advantage-AC", "Reinforce"
  pile-order: "dynamic" # Use "dynamic" to observe the emergent phenomenon and "smart" to prevent it
  pile-observability: "single" # Options: "single", "all"
  pile_all_done: "shared" # Options: "single", "all" ("single" for training, "all" for eval), "shared"
  auxiliary_piles: False # Only considered when pile-order = "agents"
  chunk-episode: 20000 # Chunk size (0 = update networks with the full episode at once)
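The hunk above is a plain YAML document. A minimal sketch of loading it and reading the nested sections, assuming the file is saved locally as dirt_quadrant_marl_config.yaml (an illustrative name, not the repository's path) and that PyYAML is installed:

import yaml  # PyYAML; assumed to be available

# Illustrative file name only -- not the path used in the repository.
with open("dirt_quadrant_marl_config.yaml") as f:
    cfg = yaml.safe_load(f)

agent_cfg, env_cfg, algo_cfg = cfg["agent"], cfg["env"], cfg["algorithm"]
assert agent_cfg["n_agents"] == env_cfg["n_agents"], "agent and env sections should agree"
print(cfg["method"])                              # marl_factory_grid.algorithms.rl.LoopSEAC
print(algo_cfg["gamma"], algo_cfg["pile-order"])  # 0.99 dynamic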
@@ -0,0 +1,35 @@
agent:
  classname: marl_factory_grid.algorithms.rl.networks.RecurrentAC
  n_agents: 2
  obs_emb_size: 96
  action_emb_size: 16
  hidden_size_actor: 64
  hidden_size_critic: 64
  use_agent_embedding: False
env:
  classname: marl_factory_grid.configs.custom
  env_name: "custom/two_rooms_one_door_modified_train_config"
  n_agents: 2
  max_steps: 250
  pomdp_r: 2
  stack_n_frames: 0
  individual_rewards: True
  train_render: False
  eval_render: True
  save_and_log: True
  record: False
method: marl_factory_grid.algorithms.rl.LoopSEAC
algorithm:
  gamma: 0.99
  entropy_coef: 0.01
  vf_coef: 0.05
  n_steps: 0 # Maximum amount of experience sampled (n-step TD) before the next value and policy update. Default 0: full-episode Monte Carlo
  max_steps: 260000
  advantage: "Advantage-AC" # Options: "Advantage-AC", "TD-Advantage-AC", "Reinforce"
  pile-order: "agents" # Options: "fixed", "random", "none", "agents", "dynamic", "smart" (use "fixed", "random", and "none" for single-agent training and the others for multi-agent inference)
  pile-observability: "single" # Options: "single", "all"
  pile_all_done: "distributed" # Options: "single", "all" ("single" for training, "all" and "distributed" for eval)
  auxiliary_piles: True # Use True to observe the emergent phenomenon and False to prevent it
  chunk-episode: 20000 # Chunk size (0 = update networks with the full episode at once)
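The comments above suggest that the same file is reused between training and evaluation by flipping a few switches (pile_all_done, pile-order, render flags). A hedged sketch of a helper that derives such an eval dictionary from a loaded train config; the function name and the exact overrides are assumptions for illustration, not the repository's API:

import copy

def derive_eval_config(train_cfg: dict) -> dict:
    """Hypothetical helper: copy a loaded train config and flip the switches
    the YAML comments mark as eval-oriented ("single" for training,
    "all"/"distributed" for eval)."""
    cfg = copy.deepcopy(train_cfg)
    cfg["env"]["train_render"] = False
    cfg["env"]["eval_render"] = True
    cfg["algorithm"]["pile_all_done"] = "distributed"
    cfg["algorithm"]["pile-order"] = "agents"
    return cfg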
@@ -0,0 +1,34 @@
agent:
  classname: marl_factory_grid.algorithms.rl.networks.RecurrentAC
  n_agents: 1
  obs_emb_size: 96
  action_emb_size: 16
  hidden_size_actor: 64
  hidden_size_critic: 64
  use_agent_embedding: False
env:
  classname: marl_factory_grid.configs.custom
  env_name: "custom/dirt_quadrant_train_config"
  n_agents: 1
  max_steps: 250
  pomdp_r: 2
  stack_n_frames: 0
  individual_rewards: True
  train_render: False
  eval_render: True
  save_and_log: True
  record: False
method: marl_factory_grid.algorithms.rl.LoopSEAC
algorithm:
  gamma: 0.99
  entropy_coef: 0.01
  vf_coef: 0.05
  n_steps: 0 # Maximum amount of experience sampled (n-step TD) before the next value and policy update. Default 0: full-episode Monte Carlo
  max_steps: 240000
  advantage: "Advantage-AC" # Options: "Advantage-AC", "TD-Advantage-AC", "Reinforce"
  pile-order: "fixed" # Options: "fixed", "random", "none", "agents", "dynamic", "smart" (use "fixed", "random", and "none" for single-agent training and the others for multi-agent inference)
  pile-observability: "single" # Options: "single", "all"
  pile_all_done: "single" # Options: "single", "all" ("single" for training, "all" for eval)
  auxiliary_piles: False # Only considered when pile-order = "agents"
  chunk-episode: 20000 # Chunk size (0 = update networks with the full episode at once)
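The n_steps: 0 comment above means value and policy updates fall back to full Monte Carlo returns. A small worked example of that discounting under gamma: 0.99, in plain NumPy rather than the repository's implementation:

import numpy as np

def monte_carlo_returns(rewards, gamma=0.99):
    """Discounted return G_t = r_t + gamma * G_{t+1}, computed backwards
    over a finished episode (what n_steps: 0 falls back to)."""
    returns = np.zeros(len(rewards))
    g = 0.0
    for t in reversed(range(len(rewards))):
        g = rewards[t] + gamma * g
        returns[t] = g
    return returns

print(monte_carlo_returns([0.0, 0.0, 1.0]))  # [0.9801 0.99   1.    ]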
@@ -0,0 +1,8 @@
marl_factory_grid>environment>rules.py#SpawnEntity.on_reset()
marl_factory_grid>environment>rewards.py
marl_factory_grid>modules>clean_up>groups.py#DirtPiles.trigger_spawn()
marl_factory_grid>environment>rules.py#AgentSpawnRule
marl_factory_grid>utils>states.py#GameState.__init__()
marl_factory_grid>environment>factory.py>Factory#render
marl_factory_grid>environment>factory.py>Factory#set_recorder
marl_factory_grid>utils>renderer.py>Renderer#render
@@ -0,0 +1,35 @@
agent:
  classname: marl_factory_grid.algorithms.rl.networks.RecurrentAC
  n_agents: 1
  obs_emb_size: 96
  action_emb_size: 16
  hidden_size_actor: 64
  hidden_size_critic: 64
  use_agent_embedding: False
env:
  classname: marl_factory_grid.configs.custom
  env_name: "custom/two_rooms_one_door_modified_train_config"
  n_agents: 1
  max_steps: 250
  pomdp_r: 2
  stack_n_frames: 0
  individual_rewards: True
  train_render: False
  eval_render: True
  save_and_log: False
  record: False
method: marl_factory_grid.algorithms.rl.LoopSEAC
algorithm:
  gamma: 0.99
  entropy_coef: 0.01
  vf_coef: 0.05
  n_steps: 0 # Maximum amount of experience sampled (n-step TD) before the next value and policy update. Default 0: full-episode Monte Carlo
  max_steps: 260000
  advantage: "Advantage-AC" # Options: "Advantage-AC", "TD-Advantage-AC", "Reinforce"
  pile-order: "fixed" # Options: "fixed", "random", "none", "agents", "dynamic", "smart" (use "fixed", "random", and "none" for single-agent training and the others for multi-agent inference)
  pile-observability: "single" # Options: "single", "all"
  pile_all_done: "single" # Options: "single", "all" ("single" for training, "all" for eval)
  auxiliary_piles: False # Only considered when pile-order = "agents"
  chunk-episode: 20000 # Chunk size (0 = update networks with the full episode at once)
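chunk-episode: 20000 caps how many transitions of an episode are pushed through the networks per update (0 = whole episode at once). A generic sketch of that chunking, not the repository's training loop:

def iter_chunks(transitions, chunk_size=20000):
    """Yield consecutive slices of an episode; chunk_size=0 yields it whole,
    mirroring the chunk-episode semantics described in the config comment."""
    if chunk_size <= 0:
        yield transitions
        return
    for start in range(0, len(transitions), chunk_size):
        yield transitions[start:start + chunk_size]

# Example: a 250-step episode with chunk size 100 -> updates on 100, 100, 50 steps.
for chunk in iter_chunks(list(range(250)), chunk_size=100):
    print(len(chunk))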