Mirror of https://github.com/illiumst/marl-factory-grid.git (synced 2025-07-11 23:42:40 +02:00)
Code cleaning part 2
@@ -1,34 +0,0 @@
agent:
  classname: marl_factory_grid.algorithms.marl.networks.RecurrentAC
  n_agents: 2
  obs_emb_size: 96
  action_emb_size: 16
  hidden_size_actor: 64
  hidden_size_critic: 64
  use_agent_embedding: False
env:
  classname: marl_factory_grid.environment.configs.marl_eval
  env_name: "marl_eval/dirt_quadrant_eval_config"
  n_agents: 2
  max_steps: 250
  pomdp_r: 2
  stack_n_frames: 0
  individual_rewards: True
  train_render: False
  eval_render: True
  save_and_log: True
  record: False
method: marl_factory_grid.algorithms.marl.LoopSEAC
algorithm:
  gamma: 0.99
  entropy_coef: 0.01
  vf_coef: 0.05
  n_steps: 0 # How much experience should be sampled at most (n-TD) until the next value and policy update is performed. Default 0: MC
  max_steps: 200000
  advantage: "Advantage-AC" # Options: "Advantage-AC", "TD-Advantage-AC", "Reinforce"
  pile-order: "dynamic" # Use "dynamic" to see emergent phenomenon and "smart" to prevent it
  pile-observability: "single" # Options: "single", "all"
  pile_all_done: "shared" # Options: "single", "all" ("single" for training, "all" for eval), "shared"
  auxiliary_piles: False # Option that is only considered when pile-order = "agents"
  chunk-episode: 20000 # Chunk size. (0 = update networks with full episode at once)
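For reference: with n_steps: 0 the config above trains on full Monte Carlo returns discounted by gamma. The sketch below is not part of this commit; it shows one way a cumulate_discount helper of the kind imported from base_a2c further down can be written, and the repo's actual implementation may differ.

import numpy as np
from scipy import signal

def cumulate_discount(rewards, gamma=0.99):
    """Discounted return G_t for every step of one episode (the Monte Carlo target).

    G_t = r_t + gamma * G_{t+1}; the lfilter call evaluates this recursion in one
    vectorized pass over the reversed reward sequence.
    """
    return signal.lfilter([1], [1, -gamma], rewards[::-1])[::-1]

# Three-step episode with a single terminal reward and gamma = 0.99:
print(cumulate_discount(np.array([0.0, 0.0, 1.0])))   # -> [0.9801 0.99 1.]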
@@ -1,35 +0,0 @@
agent:
  classname: marl_factory_grid.algorithms.marl.networks.RecurrentAC
  n_agents: 2
  obs_emb_size: 96
  action_emb_size: 16
  hidden_size_actor: 64
  hidden_size_critic: 64
  use_agent_embedding: False
env:
  classname: marl_factory_grid.environment.configs.marl_eval
  env_name: "marl_eval/two_rooms_eval_config"
  n_agents: 2
  max_steps: 250
  pomdp_r: 2
  stack_n_frames: 0
  individual_rewards: True
  train_render: False
  eval_render: True
  save_and_log: True
  record: False
method: marl_factory_grid.algorithms.marl.LoopSEAC
algorithm:
  gamma: 0.99
  entropy_coef: 0.01
  vf_coef: 0.05
  n_steps: 0 # How much experience should be sampled at most (n-TD) until the next value and policy update is performed. Default 0: MC
  max_steps: 260000
  advantage: "Advantage-AC" # Options: "Advantage-AC", "TD-Advantage-AC", "Reinforce"
  pile-order: "agents" # Options: "fixed", "random", "none", "agents", "dynamic", "smart" (Use "fixed", "random" and "none" for single agent training and the other for multi agent inference)
  pile-observability: "single" # Options: "single", "all"
  pile_all_done: "distributed" # Options: "single", "all" ("single" for training, "all" and "distributed" for eval)
  auxiliary_piles: True # Use True to see emergent phenomenon and False to prevent it
  chunk-episode: 20000 # Chunk size. (0 = update networks with full episode at once)
@@ -1,44 +1,23 @@
import copy
import os
import random

import imageio  # requires ffmpeg install on operating system and imageio-ffmpeg package for python
from scipy import signal
import matplotlib.pyplot as plt
import torch
from typing import Union, List, Dict
from typing import Union, List
import numpy as np
from torch.distributions import Categorical

from marl_factory_grid.algorithms.marl.base_a2c import PolicyGradient, cumulate_discount
from marl_factory_grid.algorithms.utils import add_env_props, instantiate_class
from pathlib import Path
from collections import deque

from marl_factory_grid.environment.actions import Noop
from marl_factory_grid.modules import Clean, DoorUse
from marl_factory_grid.algorithms.rl.base_a2c import PolicyGradient, cumulate_discount
from marl_factory_grid.algorithms.utils import add_env_props
from marl_factory_grid.utils.plotting.plot_single_runs import plot_action_maps


class Names:
    REWARD = 'reward'
    DONE = 'done'
    ACTION = 'action'
    OBSERVATION = 'observation'
    LOGITS = 'logits'
    HIDDEN_ACTOR = 'hidden_actor'
    HIDDEN_CRITIC = 'hidden_critic'
    AGENT = 'agent'
    ENV = 'env'
    ENV_NAME = 'env_name'
    N_AGENTS = 'n_agents'
    ALGORITHM = 'algorithm'
    MAX_STEPS = 'max_steps'
    N_STEPS = 'n_steps'
    BUFFER_SIZE = 'buffer_size'
    CRITIC = 'critic'
    BATCH_SIZE = 'bnatch_size'
    N_ACTIONS = 'n_actions'
    TRAIN_RENDER = 'train_render'
    EVAL_RENDER = 'eval_render'
@@ -55,7 +34,7 @@ class A2C:
        self.train_cfg = train_cfg
        self.eval_cfg = eval_cfg
        self.cfg = train_cfg
        self.n_agents = train_cfg[nms.AGENT][nms.N_AGENTS]
        self.n_agents = train_cfg[nms.ENV][nms.N_AGENTS]
        self.setup()
        self.reward_development = []
        self.action_probabilities = {agent_idx: [] for agent_idx in range(self.n_agents)}
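For reference, the one-line fix above reads n_agents from the env section of the config instead of the agent section. The snippet below is not part of this commit; it sketches how the Names constants index a YAML config loaded as nested dicts, and the config path is hypothetical.

import yaml

class Names:                     # mirrors the constants in the hunk above
    ENV = 'env'
    N_AGENTS = 'n_agents'

nms = Names

# Hypothetical config path; the repo's actual file layout may differ.
with open('configs/dirt_quadrant_train_config.yaml') as f:
    train_cfg = yaml.safe_load(f)

# n_agents lives in the env: section, not under agent:, which is exactly
# the one-line correction this hunk makes.
n_agents = train_cfg[nms.ENV][nms.N_AGENTS]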
@@ -80,8 +59,6 @@ class A2C:
            os.mkdir(self.results_path)
            # Save settings in results folder
            self.save_configs()
            if self.cfg[nms.ENV]["record"]:
                self.recorder = imageio.get_writer(f'{self.results_path}/pygame_recording.mp4', fps=5)

    def set_cfg(self, eval=False):
        if eval:
@@ -610,8 +587,6 @@ class A2C:
        obs = env.reset()
        self.set_agent_spawnpoint(env)
        if self.cfg[nms.ENV][nms.EVAL_RENDER]:
            if self.cfg[nms.ENV]["save_and_log"] and self.cfg[nms.ENV]["record"]:
                env.set_recorder(self.recorder)
            if self.cfg[nms.ALGORITHM]["auxiliary_piles"]:
                # Don't render auxiliary piles
                auxiliary_piles = [pile for idx, pile in enumerate(env.state.entities['DirtPiles']) if idx % 2 == 0]
@@ -664,10 +639,6 @@ class A2C:

            episode += 1

        # Properly finalize the video file
        if self.cfg[nms.ENV]["save_and_log"] and self.cfg[nms.ENV]["record"]:
            self.recorder.close()

    def plot_reward_development(self):
        smoothed_data = np.convolve(self.reward_development, np.ones(10) / 10, mode='valid')
        plt.plot(smoothed_data)
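The np.convolve call above is a 10-episode moving average of the reward curve. A quick check with stand-in numbers (not part of this commit):

import numpy as np

rewards = np.arange(12, dtype=float)      # stand-in per-episode rewards 0..11
window = np.ones(10) / 10                 # uniform 10-episode window
smoothed = np.convolve(rewards, window, mode='valid')
print(smoothed)                           # [4.5 5.5 6.5], length = 12 - 10 + 1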
@@ -689,16 +660,14 @@ class A2C:

    def save_agent_models(self):
        for idx, agent in enumerate(self.agents):
            agent_name = list(self.factory.state.agents_conf.keys())[idx]
            agent.pi.save_model_parameters(self.results_path, agent_name)
            agent.vf.save_model_parameters(self.results_path, agent_name)
            agent.pi.save_model_parameters(self.results_path)
            agent.vf.save_model_parameters(self.results_path)

    def load_agents(self, runs_list):
        for idx, run in enumerate(runs_list):
            run_path = f"../study_out/{run}"
            agent_name = list(self.eval_factory.state.agents_conf.keys())[idx]
            self.agents[idx].pi.load_model_parameters(f"{run_path}/{agent_name}_PolicyNet_model_parameters.pth")
            self.agents[idx].vf.load_model_parameters(f"{run_path}/{agent_name}_ValueNet_model_parameters.pth")
            self.agents[idx].pi.load_model_parameters(f"{run_path}/PolicyNet_model_parameters.pth")
            self.agents[idx].vf.load_model_parameters(f"{run_path}/ValueNet_model_parameters.pth")

    def create_info_maps(self, env, used_actions):
        # Create value map
@@ -19,11 +19,11 @@ class Net(th.nn.Module):
            if module.bias is not None:
                nn.init.uniform_(module.bias, a=-0.1, b=0.1)

    def save_model(self, path, agent_name):
        th.save(self.net, f"{path}/{agent_name}_{self.__class__.__name__}_model.pth")
    def save_model(self, path):
        th.save(self.net, f"{path}/{self.__class__.__name__}_model.pth")

    def save_model_parameters(self, path, agent_name):
        th.save(self.net.state_dict(), f"{path}/{agent_name}_{self.__class__.__name__}_model_parameters.pth")
    def save_model_parameters(self, path):
        th.save(self.net.state_dict(), f"{path}/{self.__class__.__name__}_model_parameters.pth")

    def load_model_parameters(self, path):
        self.net.load_state_dict(th.load(path))
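With this change the checkpoint file name carries only the class name, so per-agent checkpoints are distinguished by directory rather than by an agent-name prefix. A self-contained sketch of the new convention (not part of this commit; the stand-in network is hypothetical):

import torch as th
import torch.nn as nn

class PolicyNet(nn.Module):
    """Minimal stand-in for the real networks in algorithms/marl/networks.py."""
    def __init__(self, obs_size=4, n_actions=5):
        super().__init__()
        self.net = nn.Sequential(nn.Linear(obs_size, 64), nn.ReLU(), nn.Linear(64, n_actions))

    def save_model_parameters(self, path):
        # File name now carries only the class name, so each agent needs its own
        # results directory instead of an agent-name prefix.
        th.save(self.net.state_dict(), f"{path}/{self.__class__.__name__}_model_parameters.pth")

    def load_model_parameters(self, path):
        self.net.load_state_dict(th.load(path))

pi = PolicyNet()
pi.save_model_parameters(".")                                 # -> ./PolicyNet_model_parameters.pth
pi.load_model_parameters("./PolicyNet_model_parameters.pth")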
@@ -0,0 +1,11 @@
env:
  classname: marl_factory_grid.environment.configs.marl_eval
  env_name: "marl_eval/dirt_quadrant_eval_config"
  n_agents: 2
  eval_render: True
  save_and_log: False
algorithm:
  pile-order: "smart" # Use "dynamic" to see emergent phenomenon and "smart" to prevent it
  pile-observability: "single" # Options: "single", "all"
  pile_all_done: "shared" # Options: "single", "all" ("single" for training, "all" for eval), "shared"
  auxiliary_piles: False # Dirt quadrant does not use this option
@@ -0,0 +1,11 @@
env:
  classname: marl_factory_grid.environment.configs.marl_eval
  env_name: "marl_eval/dirt_quadrant_eval_config"
  n_agents: 2
  eval_render: True
  save_and_log: False
algorithm:
  pile-order: "dynamic" # Use "dynamic" to see emergent phenomenon and "smart" to prevent it
  pile-observability: "single" # Options: "single", "all"
  pile_all_done: "shared" # Options: "single", "all" ("single" for training, "all" for eval), "shared"
  auxiliary_piles: False # Dirt quadrant does not use this option
@@ -0,0 +1,13 @@
env:
  classname: marl_factory_grid.environment.configs.marl_eval
  env_name: "marl_eval/two_rooms_eval_config"
  n_agents: 2
  eval_render: True
  save_and_log: False
algorithm:
  pile-order: "agents" # Options: "fixed", "random", "none", "agents", "dynamic", "smart" (Use "fixed", "random" and "none" for single agent training and the other for multi agent inference)
  pile-observability: "single" # Options: "single", "all"
  pile_all_done: "distributed" # Options: "single", "all" ("single" for training, "all" and "distributed" for eval)
  auxiliary_piles: True # Use True to see emergent phenomenon and False to prevent it
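The small eval configs added here only override a handful of keys, so they presumably get merged over a full training config at load time. A hedged sketch of such a recursive merge (not part of this commit; the project's actual loading logic may differ):

def deep_update(base: dict, override: dict) -> dict:
    """Recursively overlay override onto a copy of base, leaving base untouched."""
    merged = dict(base)
    for key, value in override.items():
        if isinstance(value, dict) and isinstance(merged.get(key), dict):
            merged[key] = deep_update(merged[key], value)
        else:
            merged[key] = value
    return merged

train_cfg = {"env": {"n_agents": 2, "train_render": False}, "algorithm": {"pile-order": "fixed"}}
eval_override = {"env": {"eval_render": True}, "algorithm": {"pile-order": "agents"}}
print(deep_update(train_cfg, eval_override))
# {'env': {'n_agents': 2, 'train_render': False, 'eval_render': True},
#  'algorithm': {'pile-order': 'agents'}}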
@@ -0,0 +1,13 @@
env:
  classname: marl_factory_grid.environment.configs.marl_eval
  env_name: "marl_eval/two_rooms_eval_config_emergent"
  n_agents: 2
  eval_render: True
  save_and_log: False
algorithm:
  pile-order: "agents" # Options: "fixed", "random", "none", "agents", "dynamic", "smart" (Use "fixed", "random" and "none" for single agent training and the other for multi agent inference)
  pile-observability: "single" # Options: "single", "all"
  pile_all_done: "distributed" # Options: "single", "all" ("single" for training, "all" and "distributed" for eval)
  auxiliary_piles: False # Use True to see emergent phenomenon and False to prevent it
@@ -0,0 +1,12 @@
env:
  classname: marl_factory_grid.environment.configs.rl
  env_name: "rl/dirt_quadrant_agent1_eval_config"
  n_agents: 1
  eval_render: True
  save_and_log: False
algorithm:
  pile-order: "fixed" # Options: "fixed", "random", "none", "agents", "dynamic", "smart" (Use "fixed", "random" and "none" for single agent training and the other for multi agent inference)
  pile-observability: "single" # Options: "single", "all"
  pile_all_done: "all"
  auxiliary_piles: False # Dirt quadrant does not use this option
@@ -1,34 +1,17 @@
agent:
  classname: marl_factory_grid.algorithms.marl.networks.RecurrentAC
  n_agents: 1
  obs_emb_size: 96
  action_emb_size: 16
  hidden_size_actor: 64
  hidden_size_critic: 64
  use_agent_embedding: False
env:
  classname: marl_factory_grid.environment.configs.rl
  env_name: "rl/dirt_quadrant_train_config"
  env_name: "rl/dirt_quadrant_agent1_train_config"
  n_agents: 1
  max_steps: 250
  pomdp_r: 2
  stack_n_frames: 0
  individual_rewards: True
  train_render: False
  eval_render: True
  save_and_log: True
  record: False
method: marl_factory_grid.algorithms.marl.LoopSEAC
algorithm:
  gamma: 0.99
  entropy_coef: 0.01
  vf_coef: 0.05
  n_steps: 0 # How much experience should be sampled at most (n-TD) until the next value and policy update is performed. Default 0: MC
  max_steps: 240000
  chunk-episode: 20000 # Chunk size. (0 = update networks with full episode at once)
  max_steps: 140000
  advantage: "Advantage-AC" # Options: "Advantage-AC", "TD-Advantage-AC", "Reinforce"
  pile-order: "fixed" # Options: "fixed", "random", "none", "agents", "dynamic", "smart" (Use "fixed", "random" and "none" for single agent training and the other for multi agent inference)
  pile-observability: "single" # Options: "single", "all"
  pile_all_done: "single" # Options: "single", "all" ("single" for training, "all" for eval)
  auxiliary_piles: False # Option that is only considered when pile-order = "agents"
  chunk-episode: 20000 # Chunk size. (0 = update networks with full episode at once)
  auxiliary_piles: False # Dirt quadrant does not use this option
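For reference, the advantage key above selects how the policy-gradient weights are computed. The sketch below is not part of this commit and the repo's base_a2c may differ in details; it only contrasts the three named options.

import numpy as np

def policy_gradient_weights(rewards, values, gamma=0.99, mode="Advantage-AC"):
    """Hedged sketch of the three advantage options named in the config above.

    values are critic estimates V(s_t) for one episode, rewards the per-step rewards.
    """
    rewards = np.asarray(rewards, dtype=float)
    values = np.asarray(values, dtype=float)
    returns = np.array([np.sum(rewards[t:] * gamma ** np.arange(len(rewards) - t))
                        for t in range(len(rewards))])
    if mode == "Reinforce":            # plain discounted return, no baseline
        return returns
    if mode == "Advantage-AC":         # Monte Carlo return minus value baseline
        return returns - values
    if mode == "TD-Advantage-AC":      # one-step TD error as advantage estimate
        next_values = np.append(values[1:], 0.0)
        return rewards + gamma * next_values - values
    raise ValueError(f"unknown advantage mode: {mode}")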
@@ -0,0 +1,13 @@
env:
  classname: marl_factory_grid.environment.configs.rl
  env_name: "rl/two_rooms_eval_config"
  n_agents: 1
  eval_render: True
  save_and_log: False
algorithm:
  pile-order: "fixed" # Options: "fixed", "random", "none", "agents", "dynamic", "smart" (Use "fixed", "random" and "none" for single agent training and the other for multi agent inference)
  pile-observability: "single" # Options: "single", "all"
  pile_all_done: "all" # Options: "single", "all" ("single" for training, "all" for eval)
  auxiliary_piles: False # Auxiliary piles are only used during marl eval
@@ -1,35 +1,17 @@
agent:
  classname: marl_factory_grid.algorithms.marl.networks.RecurrentAC
  n_agents: 1
  obs_emb_size: 96
  action_emb_size: 16
  hidden_size_actor: 64
  hidden_size_critic: 64
  use_agent_embedding: False
env:
  classname: marl_factory_grid.environment.configs.rl
  env_name: "rl/two_rooms_train_config"
  n_agents: 1
  max_steps: 250
  pomdp_r: 2
  stack_n_frames: 0
  individual_rewards: True
  train_render: False
  eval_render: True
  save_and_log: False
  record: False
method: marl_factory_grid.algorithms.marl.LoopSEAC
  save_and_log: True
algorithm:
  gamma: 0.99
  entropy_coef: 0.01
  vf_coef: 0.05
  n_steps: 0 # How much experience should be sampled at most (n-TD) until the next value and policy update is performed. Default 0: MC
  chunk-episode: 20000 # Chunk size. (0 = update networks with full episode at once)
  max_steps: 260000
  advantage: "Advantage-AC" # Options: "Advantage-AC", "TD-Advantage-AC", "Reinforce"
  pile-order: "fixed" # Options: "fixed", "random", "none", "agents", "dynamic", "smart" (Use "fixed", "random" and "none" for single agent training and the other for multi agent inference)
  pile-observability: "single" # Options: "single", "all"
  pile_all_done: "single" # Options: "single", "all" ("single" for training, "all" for eval)
  auxiliary_piles: False # Option that is only considered when pile-order = "agents"
  chunk-episode: 20000 # Chunk size. (0 = update networks with full episode at once)
  auxiliary_piles: False # Auxiliary piles are only used during marl eval
@@ -6,7 +6,7 @@ from networkx.algorithms.approximation import traveling_salesman as tsp
import time
import copy

from marl_factory_grid.algorithms.static.utils import points_to_graph
from marl_factory_grid.algorithms.tsp.utils import points_to_graph
from marl_factory_grid.modules.doors import constants as do
from marl_factory_grid.environment import constants as c
from marl_factory_grid.utils.helpers import MOVEMAP
@@ -1,4 +1,4 @@
from marl_factory_grid.algorithms.static.TSP_base_agent import TSPBaseAgent
from marl_factory_grid.algorithms.tsp.TSP_base_agent import TSPBaseAgent

from marl_factory_grid.modules.clean_up import constants as di
from marl_factory_grid.environment import constants as c
@@ -1,4 +1,4 @@
from marl_factory_grid.algorithms.static.TSP_base_agent import TSPBaseAgent
from marl_factory_grid.algorithms.tsp.TSP_base_agent import TSPBaseAgent

from marl_factory_grid.modules.destinations import constants as d
from marl_factory_grid.modules.doors import constants as do
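The three hunks above move the TSP helpers from algorithms.static to algorithms.tsp. The snippet below is not part of this commit; it is a rough illustration of what a points_to_graph helper plus networkx's TSP approximation can look like on grid coordinates, and its signature and details are assumptions.

import networkx as nx
from networkx.algorithms.approximation import traveling_salesman as tsp

def points_to_graph(coordinates, allow_diagonal_movement=False):
    """Connect grid cells that are one step apart (illustrative only)."""
    graph = nx.Graph()
    graph.add_nodes_from(coordinates)
    for (x1, y1) in coordinates:
        for (x2, y2) in coordinates:
            manhattan = abs(x1 - x2) + abs(y1 - y2)
            diagonal = max(abs(x1 - x2), abs(y1 - y2)) == 1 and (x1, y1) != (x2, y2)
            if manhattan == 1 or (allow_diagonal_movement and diagonal):
                graph.add_edge((x1, y1), (x2, y2), weight=1)
    return graph

# Approximate visiting order over a handful of free floor cells.
cells = [(0, 0), (0, 1), (1, 1), (1, 2)]
route = tsp.traveling_salesman_problem(points_to_graph(cells), cycle=False)
print(route)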
@@ -64,13 +64,6 @@ def add_env_props(cfg):
    factory = Factory(env_path)
    _ = factory.reset()

    # Agent Init
    if len(factory.state.moving_entites) == 1:  # Single agent setting
        observation_size = list(factory.observation_space.shape)
    else:  # Multi-agent setting
        observation_size = list(factory.observation_space[0].shape)
    cfg['agent'].update(dict(observation_size=observation_size, n_actions=factory.action_space[0].n))

    return factory
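Usage sketch for the trimmed add_env_props above (not part of this commit; the YAML path is hypothetical): the function resets the environment and injects observation_size and n_actions into cfg['agent'], which the actor and critic networks are then sized from.

import yaml
from marl_factory_grid.algorithms.utils import add_env_props

with open('dirt_quadrant_train_config.yaml') as f:   # hypothetical file name
    cfg = yaml.safe_load(f)

factory = add_env_props(cfg)                    # resets the env and fills cfg['agent']
obs_size = cfg['agent']['observation_size']     # injected by add_env_props
n_actions = cfg['agent']['n_actions']           # used to size actor/critic networks
print(obs_size, n_actions)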