Code cleaning part 2

Julian Schönberger
2024-05-24 23:56:00 +02:00
parent 6e6ce9dc5d
commit 81f0f6e209
36 changed files with 421 additions and 495 deletions

View File

@@ -1,34 +0,0 @@
agent:
classname: marl_factory_grid.algorithms.marl.networks.RecurrentAC
n_agents: 2
obs_emb_size: 96
action_emb_size: 16
hidden_size_actor: 64
hidden_size_critic: 64
use_agent_embedding: False
env:
classname: marl_factory_grid.environment.configs.marl_eval
env_name: "marl_eval/dirt_quadrant_eval_config"
n_agents: 2
max_steps: 250
pomdp_r: 2
stack_n_frames: 0
individual_rewards: True
train_render: False
eval_render: True
save_and_log: True
record: False
method: marl_factory_grid.algorithms.marl.LoopSEAC
algorithm:
gamma: 0.99
entropy_coef: 0.01
vf_coef: 0.05
n_steps: 0 # Maximum number of steps (n-step TD) to sample before the next value and policy update. Default 0: Monte Carlo (full episode)
max_steps: 200000
advantage: "Advantage-AC" # Options: "Advantage-AC", "TD-Advantage-AC", "Reinforce"
pile-order: "dynamic" # Use "dynamic" to see emergent phenomenon and "smart" to prevent it
pile-observability: "single" # Options: "single", "all"
pile_all_done: "shared" # Options: "single", "all" ("single" for training, "all" for eval), "shared"
auxiliary_piles: False # Option that is only considered when pile-order = "agents"
chunk-episode: 20000 # Chunk size. (0 = update networks with full episode at once)
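For reference, these configs are plain YAML split into agent, env, and algorithm blocks; a minimal, self-contained sketch of reading the algorithm options with PyYAML (the embedded snippet is a trimmed copy of the block above):

import yaml

# Trimmed copy of the algorithm block above, embedded so the example runs stand-alone.
cfg_text = """
algorithm:
  gamma: 0.99
  pile-order: "dynamic"
  pile-observability: "single"
  pile_all_done: "shared"
  auxiliary_piles: False
"""
cfg = yaml.safe_load(cfg_text)
assert cfg["algorithm"]["pile-order"] == "dynamic"
assert cfg["algorithm"]["auxiliary_piles"] is False  # YAML parses an unquoted False as a bool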

View File

@@ -1,35 +0,0 @@
agent:
classname: marl_factory_grid.algorithms.marl.networks.RecurrentAC
n_agents: 2
obs_emb_size: 96
action_emb_size: 16
hidden_size_actor: 64
hidden_size_critic: 64
use_agent_embedding: False
env:
classname: marl_factory_grid.environment.configs.marl_eval
env_name: "marl_eval/two_rooms_eval_config"
n_agents: 2
max_steps: 250
pomdp_r: 2
stack_n_frames: 0
individual_rewards: True
train_render: False
eval_render: True
save_and_log: True
record: False
method: marl_factory_grid.algorithms.marl.LoopSEAC
algorithm:
gamma: 0.99
entropy_coef: 0.01
vf_coef: 0.05
n_steps: 0 # Maximum number of steps (n-step TD) to sample before the next value and policy update. Default 0: Monte Carlo (full episode)
max_steps: 260000
advantage: "Advantage-AC" # Options: "Advantage-AC", "TD-Advantage-AC", "Reinforce"
pile-order: "agents" # Options: "fixed", "random", "none", "agents", "dynamic", "smart" (Use "fixed", "random" and "none" for single agent training and the other for multi agent inference)
pile-observability: "single" # Options: "single", "all"
pile_all_done: "distributed" # Options: "single", "all" ("single" for training, "all" and "distributed" for eval)
auxiliary_piles: True # Use True to see the emergent phenomenon and False to prevent it
chunk-episode: 20000 # Chunk size. (0 = update networks with full episode at once)

View File

@@ -1,44 +1,23 @@
import copy
import os
import random
import imageio # requires an ffmpeg installation on the operating system and the imageio-ffmpeg package for Python
from scipy import signal
import matplotlib.pyplot as plt
import torch
from typing import Union, List, Dict
from typing import Union, List
import numpy as np
from torch.distributions import Categorical
from marl_factory_grid.algorithms.marl.base_a2c import PolicyGradient, cumulate_discount
from marl_factory_grid.algorithms.utils import add_env_props, instantiate_class
from pathlib import Path
from collections import deque
from marl_factory_grid.environment.actions import Noop
from marl_factory_grid.modules import Clean, DoorUse
from marl_factory_grid.algorithms.rl.base_a2c import PolicyGradient, cumulate_discount
from marl_factory_grid.algorithms.utils import add_env_props
from marl_factory_grid.utils.plotting.plot_single_runs import plot_action_maps
class Names:
REWARD = 'reward'
DONE = 'done'
ACTION = 'action'
OBSERVATION = 'observation'
LOGITS = 'logits'
HIDDEN_ACTOR = 'hidden_actor'
HIDDEN_CRITIC = 'hidden_critic'
AGENT = 'agent'
ENV = 'env'
ENV_NAME = 'env_name'
N_AGENTS = 'n_agents'
ALGORITHM = 'algorithm'
MAX_STEPS = 'max_steps'
N_STEPS = 'n_steps'
BUFFER_SIZE = 'buffer_size'
CRITIC = 'critic'
BATCH_SIZE = 'batch_size'
N_ACTIONS = 'n_actions'
TRAIN_RENDER = 'train_render'
EVAL_RENDER = 'eval_render'
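The training loop pulls cumulate_discount in next to scipy.signal; a common way to implement such a discounted-return helper is a reverse-time filter, shown here as an illustrative sketch rather than the repository's exact code:

import numpy as np
from scipy import signal

def discounted_returns(rewards, gamma=0.99):
    # Reverse-time IIR filter: G_t = r_t + gamma * G_{t+1}
    return signal.lfilter([1.0], [1.0, -gamma], rewards[::-1], axis=0)[::-1]

print(discounted_returns(np.array([0.0, 0.0, 1.0])))  # -> approximately [0.9801, 0.99, 1.0]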
@@ -55,7 +34,7 @@ class A2C:
self.train_cfg = train_cfg
self.eval_cfg = eval_cfg
self.cfg = train_cfg
self.n_agents = train_cfg[nms.AGENT][nms.N_AGENTS]
self.n_agents = train_cfg[nms.ENV][nms.N_AGENTS]
self.setup()
self.reward_development = []
self.action_probabilities = {agent_idx:[] for agent_idx in range(self.n_agents)}
@@ -80,8 +59,6 @@ class A2C:
os.mkdir(self.results_path)
# Save settings in results folder
self.save_configs()
if self.cfg[nms.ENV]["record"]:
self.recorder = imageio.get_writer(f'{self.results_path}/pygame_recording.mp4', fps=5)
def set_cfg(self, eval=False):
if eval:
@@ -610,8 +587,6 @@ class A2C:
obs = env.reset()
self.set_agent_spawnpoint(env)
if self.cfg[nms.ENV][nms.EVAL_RENDER]:
if self.cfg[nms.ENV]["save_and_log"] and self.cfg[nms.ENV]["record"]:
env.set_recorder(self.recorder)
if self.cfg[nms.ALGORITHM]["auxiliary_piles"]:
# Don't render auxiliary piles
auxiliary_piles = [pile for idx, pile in enumerate(env.state.entities['DirtPiles']) if idx % 2 == 0]
@@ -664,10 +639,6 @@ class A2C:
episode += 1
# Properly finalize the video file
if self.cfg[nms.ENV]["save_and_log"] and self.cfg[nms.ENV]["record"]:
self.recorder.close()
def plot_reward_development(self):
smoothed_data = np.convolve(self.reward_development, np.ones(10) / 10, mode='valid')
plt.plot(smoothed_data)
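The recording hooks removed above wrapped imageio's ffmpeg writer; for reference, the general pattern looks like the sketch below (the frames are synthetic placeholders, not actual pygame renders):

import numpy as np
import imageio  # mp4 output needs an ffmpeg binary plus the imageio-ffmpeg package

writer = imageio.get_writer("pygame_recording.mp4", fps=5)
for _ in range(10):
    frame = np.random.randint(0, 255, (64, 64, 3), dtype=np.uint8)  # placeholder RGB frame
    writer.append_data(frame)
writer.close()  # finalize the video file, as the removed cleanup code did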
@@ -689,16 +660,14 @@ class A2C:
def save_agent_models(self):
for idx, agent in enumerate(self.agents):
agent_name = list(self.factory.state.agents_conf.keys())[idx]
agent.pi.save_model_parameters(self.results_path, agent_name)
agent.vf.save_model_parameters(self.results_path, agent_name)
agent.pi.save_model_parameters(self.results_path)
agent.vf.save_model_parameters(self.results_path)
def load_agents(self, runs_list):
for idx, run in enumerate(runs_list):
run_path = f"../study_out/{run}"
agent_name = list(self.eval_factory.state.agents_conf.keys())[idx]
self.agents[idx].pi.load_model_parameters(f"{run_path}/{agent_name}_PolicyNet_model_parameters.pth")
self.agents[idx].vf.load_model_parameters(f"{run_path}/{agent_name}_ValueNet_model_parameters.pth")
self.agents[idx].pi.load_model_parameters(f"{run_path}/PolicyNet_model_parameters.pth")
self.agents[idx].vf.load_model_parameters(f"{run_path}/ValueNet_model_parameters.pth")
def create_info_maps(self, env, used_actions):
# Create value map

View File

@@ -19,11 +19,11 @@ class Net(th.nn.Module):
if module.bias is not None:
nn.init.uniform_(module.bias, a=-0.1, b=0.1)
def save_model(self, path, agent_name):
th.save(self.net, f"{path}/{agent_name}_{self.__class__.__name__}_model.pth")
def save_model(self, path):
th.save(self.net, f"{path}/{self.__class__.__name__}_model.pth")
def save_model_parameters(self, path, agent_name):
th.save(self.net.state_dict(), f"{path}/{agent_name}_{self.__class__.__name__}_model_parameters.pth")
def save_model_parameters(self, path):
th.save(self.net.state_dict(), f"{path}/{self.__class__.__name__}_model_parameters.pth")
def load_model_parameters(self, path):
self.net.load_state_dict(th.load(path))
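With the agent name dropped from the file names, checkpoints are keyed by the network class alone; a self-contained round-trip sketch (TinyNet is a stand-in for the repository's Net, not its actual definition):

import tempfile
import torch as th
import torch.nn as nn

class TinyNet(nn.Module):
    # Stand-in module: only the saving scheme matters here.
    def __init__(self):
        super().__init__()
        self.net = nn.Sequential(nn.Linear(4, 8), nn.ReLU(), nn.Linear(8, 2))

    def save_model_parameters(self, path):
        # File name derives from the class name only, mirroring the change above.
        th.save(self.net.state_dict(), f"{path}/{self.__class__.__name__}_model_parameters.pth")

    def load_model_parameters(self, path):
        self.net.load_state_dict(th.load(path))

with tempfile.TemporaryDirectory() as tmp:
    model = TinyNet()
    model.save_model_parameters(tmp)
    model.load_model_parameters(f"{tmp}/TinyNet_model_parameters.pth")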

View File

@@ -0,0 +1,11 @@
env:
classname: marl_factory_grid.environment.configs.marl_eval
env_name: "marl_eval/dirt_quadrant_eval_config"
n_agents: 2
eval_render: True
save_and_log: False
algorithm:
pile-order: "smart" # Use "dynamic" to see emergent phenomenon and "smart" to prevent it
pile-observability: "single" # Options: "single", "all"
pile_all_done: "shared" # Options: "single", "all" ("single" for training, "all" for eval), "shared"
auxiliary_piles: False # Dirt quadrant does not use this option

View File

@@ -0,0 +1,11 @@
env:
classname: marl_factory_grid.environment.configs.marl_eval
env_name: "marl_eval/dirt_quadrant_eval_config"
n_agents: 2
eval_render: True
save_and_log: False
algorithm:
pile-order: "dynamic" # Use "dynamic" to see emergent phenomenon and "smart" to prevent it
pile-observability: "single" # Options: "single", "all"
pile_all_done: "shared" # Options: "single", "all" ("single" for training, "all" for eval), "shared"
auxiliary_piles: False # Dirt quadrant does not use this option

View File

@@ -0,0 +1,13 @@
env:
classname: marl_factory_grid.environment.configs.marl_eval
env_name: "marl_eval/two_rooms_eval_config"
n_agents: 2
eval_render: True
save_and_log: False
algorithm:
pile-order: "agents" # Options: "fixed", "random", "none", "agents", "dynamic", "smart" (Use "fixed", "random" and "none" for single agent training and the other for multi agent inference)
pile-observability: "single" # Options: "single", "all"
pile_all_done: "distributed" # Options: "single", "all" ("single" for training, "all" and "distributed" for eval)
auxiliary_piles: True # Use True to see the emergent phenomenon and False to prevent it

View File

@@ -0,0 +1,13 @@
env:
classname: marl_factory_grid.environment.configs.marl_eval
env_name: "marl_eval/two_rooms_eval_config_emergent"
n_agents: 2
eval_render: True
save_and_log: False
algorithm:
pile-order: "agents" # Options: "fixed", "random", "none", "agents", "dynamic", "smart" (Use "fixed", "random" and "none" for single agent training and the other for multi agent inference)
pile-observability: "single" # Options: "single", "all"
pile_all_done: "distributed" # Options: "single", "all" ("single" for training, "all" and "distributed" for eval)
auxiliary_piles: False # Use True to see the emergent phenomenon and False to prevent it

View File

@@ -0,0 +1,12 @@
env:
classname: marl_factory_grid.environment.configs.rl
env_name: "rl/dirt_quadrant_agent1_eval_config"
n_agents: 1
eval_render: True
save_and_log: False
algorithm:
pile-order: "fixed" # Options: "fixed", "random", "none", "agents", "dynamic", "smart" (Use "fixed", "random" and "none" for single agent training and the other for multi agent inference)
pile-observability: "single" # Options: "single", "all"
pile_all_done: "all" #
auxiliary_piles: False # Dirt quadrant does not use this option

View File

@@ -1,34 +1,17 @@
agent:
classname: marl_factory_grid.algorithms.marl.networks.RecurrentAC
n_agents: 1
obs_emb_size: 96
action_emb_size: 16
hidden_size_actor: 64
hidden_size_critic: 64
use_agent_embedding: False
env:
classname: marl_factory_grid.environment.configs.rl
env_name: "rl/dirt_quadrant_train_config"
env_name: "rl/dirt_quadrant_agent1_train_config"
n_agents: 1
max_steps: 250
pomdp_r: 2
stack_n_frames: 0
individual_rewards: True
train_render: False
eval_render: True
save_and_log: True
record: False
method: marl_factory_grid.algorithms.marl.LoopSEAC
algorithm:
gamma: 0.99
entropy_coef: 0.01
vf_coef: 0.05
n_steps: 0 # Maximum number of steps (n-step TD) to sample before the next value and policy update. Default 0: Monte Carlo (full episode)
max_steps: 240000
chunk-episode: 20000 # Chunk size. (0 = update networks with full episode at once)
max_steps: 140000
advantage: "Advantage-AC" # Options: "Advantage-AC", "TD-Advantage-AC", "Reinforce"
pile-order: "fixed" # Options: "fixed", "random", "none", "agents", "dynamic", "smart" (Use "fixed", "random" and "none" for single agent training and the other for multi agent inference)
pile-observability: "single" # Options: "single", "all"
pile_all_done: "single" # Options: "single", "all" ("single" for training, "all" for eval)
auxiliary_piles: False # Option that is only considered when pile-order = "agents"
chunk-episode: 20000 # Chunk size. (0 = update networks with full episode at once)
auxiliary_piles: False # Dirt quadrant does not use this option
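The advantage option above selects the policy-gradient target; under the usual textbook definitions (the exact computation lives in base_a2c and may differ in detail), the three choices can be sketched as:

import numpy as np

gamma = 0.99
rewards = np.array([0.0, 0.0, 1.0])        # toy three-step trajectory
values = np.array([0.4, 0.6, 0.9, 0.0])    # V(s_0..s_2) plus a bootstrap value of 0

# Discounted returns G_t = r_t + gamma * G_{t+1}
returns = np.zeros_like(rewards)
running = 0.0
for t in reversed(range(len(rewards))):
    running = rewards[t] + gamma * running
    returns[t] = running

reinforce_target = returns                                  # "Reinforce"
advantage_ac = returns - values[:-1]                        # "Advantage-AC"
td_advantage = rewards + gamma * values[1:] - values[:-1]   # "TD-Advantage-AC"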

View File

@@ -0,0 +1,13 @@
env:
classname: marl_factory_grid.environment.configs.rl
env_name: "rl/two_rooms_eval_config"
n_agents: 1
eval_render: True
save_and_log: False
algorithm:
pile-order: "fixed" # Options: "fixed", "random", "none", "agents", "dynamic", "smart" (Use "fixed", "random" and "none" for single agent training and the other for multi agent inference)
pile-observability: "single" # Options: "single", "all"
pile_all_done: "all" # Options: "single", "all" ("single" for training, "all" for eval)
auxiliary_piles: False # Auxiliary piles are only used during marl eval

View File

@@ -1,35 +1,17 @@
agent:
classname: marl_factory_grid.algorithms.marl.networks.RecurrentAC
n_agents: 1
obs_emb_size: 96
action_emb_size: 16
hidden_size_actor: 64
hidden_size_critic: 64
use_agent_embedding: False
env:
classname: marl_factory_grid.environment.configs.rl
env_name: "rl/two_rooms_train_config"
n_agents: 1
max_steps: 250
pomdp_r: 2
stack_n_frames: 0
individual_rewards: True
train_render: False
eval_render: True
save_and_log: False
record: False
method: marl_factory_grid.algorithms.marl.LoopSEAC
save_and_log: True
algorithm:
gamma: 0.99
entropy_coef: 0.01
vf_coef: 0.05
n_steps: 0 # Maximum number of steps (n-step TD) to sample before the next value and policy update. Default 0: Monte Carlo (full episode)
chunk-episode: 20000 # Chunk size. (0 = update networks with full episode at once)
max_steps: 260000
advantage: "Advantage-AC" # Options: "Advantage-AC", "TD-Advantage-AC", "Reinforce"
pile-order: "fixed" # Options: "fixed", "random", "none", "agents", "dynamic", "smart" (Use "fixed", "random" and "none" for single agent training and the other for multi agent inference)
pile-observability: "single" # Options: "single", "all"
pile_all_done: "single" # Options: "single", "all" ("single" for training, "all" for eval)
auxiliary_piles: False # Option that is only considered when pile-order = "agents"
chunk-episode: 20000 # Chunk size. (0 = update networks with full episode at once)
auxiliary_piles: False # Auxiliary piles are only used during marl eval

View File

@@ -6,7 +6,7 @@ from networkx.algorithms.approximation import traveling_salesman as tsp
import time
import copy
from marl_factory_grid.algorithms.static.utils import points_to_graph
from marl_factory_grid.algorithms.tsp.utils import points_to_graph
from marl_factory_grid.modules.doors import constants as do
from marl_factory_grid.environment import constants as c
from marl_factory_grid.utils.helpers import MOVEMAP
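The renamed tsp package pairs a traversability graph (points_to_graph) with networkx's TSP approximation; a toy sketch on an invented 3x3 grid, not the factory's actual floor plan:

import networkx as nx
from networkx.algorithms.approximation import traveling_salesman as tsp

graph = nx.grid_2d_graph(3, 3)                              # stand-in for a points_to_graph result
route = tsp.traveling_salesman_problem(graph, cycle=False)  # approximate visiting order
print(route)  # a walk that visits every node at least once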

View File

@@ -1,4 +1,4 @@
from marl_factory_grid.algorithms.static.TSP_base_agent import TSPBaseAgent
from marl_factory_grid.algorithms.tsp.TSP_base_agent import TSPBaseAgent
from marl_factory_grid.modules.clean_up import constants as di
from marl_factory_grid.environment import constants as c

View File

@@ -1,4 +1,4 @@
from marl_factory_grid.algorithms.static.TSP_base_agent import TSPBaseAgent
from marl_factory_grid.algorithms.tsp.TSP_base_agent import TSPBaseAgent
from marl_factory_grid.modules.destinations import constants as d
from marl_factory_grid.modules.doors import constants as do

View File

@@ -64,13 +64,6 @@ def add_env_props(cfg):
factory = Factory(env_path)
_ = factory.reset()
# Agent Init
if len(factory.state.moving_entites) == 1: # Single agent setting
observation_size = list(factory.observation_space.shape)
else: # Multi-agent setting
observation_size = list(factory.observation_space[0].shape)
cfg['agent'].update(dict(observation_size=observation_size, n_actions=factory.action_space[0].n))
return factory
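The agent-init block above derives the observation size differently for single- and multi-agent setups (one space versus an indexable collection of per-agent spaces); a minimal stand-alone illustration with invented shapes:

class _Box:
    # Hypothetical stand-in for a Gym-style Box space; only .shape is used here.
    def __init__(self, shape):
        self.shape = shape

single_agent_space = _Box((3, 5, 5))        # factory.observation_space, one agent
multi_agent_spaces = [_Box((3, 5, 5))] * 2  # factory.observation_space, two agents

print(list(single_agent_space.shape))     # -> [3, 5, 5]
print(list(multi_agent_spaces[0].shape))  # -> [3, 5, 5]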