From 086a92192962d7750447d9d60f0e4604b30aa310 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Julian=20Sch=C3=B6nberger?=
Date: Wed, 27 Mar 2024 17:04:14 +0100
Subject: [PATCH] Adapt base_ac.py and utils.py to be compatible with
 refactored environment

---
 marl_factory_grid/algorithms/marl/base_ac.py  | 46 +++++++++++++------
 .../algorithms/marl/example_config.yaml       | 12 +++--
 marl_factory_grid/algorithms/utils.py         | 16 +++++--
 studies/marl_adapted.py                       | 15 ++++++
 4 files changed, 66 insertions(+), 23 deletions(-)
 create mode 100644 studies/marl_adapted.py

diff --git a/marl_factory_grid/algorithms/marl/base_ac.py b/marl_factory_grid/algorithms/marl/base_ac.py
index 8b64262..0c15250 100644
--- a/marl_factory_grid/algorithms/marl/base_ac.py
+++ b/marl_factory_grid/algorithms/marl/base_ac.py
@@ -18,7 +18,8 @@ class Names:
     HIDDEN_ACTOR = 'hidden_actor'
     HIDDEN_CRITIC = 'hidden_critic'
     AGENT = 'agent'
-    ENV = 'environment'
+    ENV = 'env'
+    ENV_NAME = 'env_name'
     N_AGENTS = 'n_agents'
     ALGORITHM = 'algorithm'
     MAX_STEPS = 'max_steps'
@@ -27,6 +28,8 @@ class Names:
     CRITIC = 'critic'
     BATCH_SIZE = 'bnatch_size'
     N_ACTIONS = 'n_actions'
+    TRAIN_RENDER = 'train_render'
+    EVAL_RENDER = 'eval_render'
 
 
 nms = Names
@@ -35,10 +38,10 @@ ListOrTensor = Union[List, torch.Tensor]
 
 class BaseActorCritic:
     def __init__(self, cfg):
-        add_env_props(cfg)
+        self.factory = add_env_props(cfg)
         self.__training = True
         self.cfg = cfg
-        self.n_agents = cfg[nms.ENV][nms.N_AGENTS]
+        self.n_agents = cfg[nms.AGENT][nms.N_AGENTS]
         self.reset_memory_after_epoch = True
         self.setup()
 
@@ -88,7 +91,9 @@ class BaseActorCritic:
 
     @torch.no_grad()
     def train_loop(self, checkpointer=None):
-        env = instantiate_class(self.cfg[nms.ENV])
+        env = self.factory
+        if self.cfg[nms.ENV][nms.TRAIN_RENDER]:
+            env.render()
         n_steps, max_steps = [self.cfg[nms.ALGORITHM][k] for k in [nms.N_STEPS, nms.MAX_STEPS]]
         tm = MARLActorCriticMemory(self.n_agents, self.cfg[nms.ALGORITHM].get(nms.BUFFER_SIZE, n_steps))
         global_steps, episode, df_results = 0, 0, []
@@ -96,6 +101,7 @@ class BaseActorCritic:
 
         while global_steps < max_steps:
             obs = env.reset()
+            obs = list(obs.values())
             last_hiddens = self.init_hidden()
             last_action, reward = [-1] * self.n_agents, [0.] * self.n_agents
             done, rew_log = [False] * self.n_agents, 0
@@ -110,14 +116,20 @@ class BaseActorCritic:
             while not all(done):
                 out = self.forward(obs, last_action, **last_hiddens)
                 action = self.get_actions(out)
-                next_obs, reward, done, info = env.step(action)
+                _, next_obs, reward, done, info = env.step(action)
                 done = [done] * self.n_agents if isinstance(done, bool) else done
+                if self.cfg[nms.ENV][nms.TRAIN_RENDER]:
+                    env.render()
+
 
                 last_hiddens = dict(hidden_actor=out[nms.HIDDEN_ACTOR],
                                     hidden_critic=out[nms.HIDDEN_CRITIC])
 
+                logits = torch.stack([tensor.squeeze(0) for tensor in out.get(nms.LOGITS, None)], dim=0)
+                values = torch.stack([tensor.squeeze(0) for tensor in out.get(nms.CRITIC, None)], dim=0)
+
                 tm.add(observation=obs, action=action, reward=reward, done=done,
-                       logits=out.get(nms.LOGITS, None), values=out.get(nms.CRITIC, None),
+                       logits=logits, values=values,
                        **last_hiddens)
 
                 obs = next_obs
@@ -139,7 +151,8 @@ class BaseActorCritic:
 
                 if global_steps >= max_steps:
                     break
-            print(f'reward at episode: {episode} = {rew_log}')
+            if global_steps%100 == 0:
+                print(f'reward at episode: {episode} = {rew_log}')
             episode += 1
             df_results.append([episode, rew_log, *reward])
         df_results = pd.DataFrame(df_results,
@@ -151,23 +164,26 @@ class BaseActorCritic:
 
     @torch.inference_mode(True)
    def eval_loop(self, n_episodes, render=False):
-        env = instantiate_class(self.cfg[nms.ENV])
+        env = self.factory
+        if self.cfg[nms.ENV][nms.EVAL_RENDER]:
+            env.render()
         episode, results = 0, []
         while episode < n_episodes:
             obs = env.reset()
+            obs = list(obs.values())
             last_hiddens = self.init_hidden()
             last_action, reward = [-1] * self.n_agents, [0.] * self.n_agents
             done, rew_log, eps_rew = [False] * self.n_agents, 0, torch.zeros(self.n_agents)
             while not all(done):
-                if render:
-                    env.render()
-
                 out = self.forward(obs, last_action, **last_hiddens)
                 action = self.get_actions(out)
-                next_obs, reward, done, info = env.step(action)
+                _, next_obs, reward, done, info = env.step(action)
+
+                if self.cfg[nms.ENV][nms.EVAL_RENDER]:
+                    env.render()
 
                 if isinstance(done, bool):
-                    done = [done] * obs.shape[0]
+                    done = [done] * obs[0].shape[0]
                 obs = next_obs
                 last_action = action
                 last_hiddens = dict(hidden_actor=out.get(nms.HIDDEN_ACTOR, None),
@@ -176,7 +192,7 @@ class BaseActorCritic:
                 eps_rew += torch.tensor(reward)
             results.append(eps_rew.tolist() + [sum(eps_rew).item()] + [episode])
             episode += 1
-        agent_columns = [f'agent#{i}' for i in range(self.cfg['environment']['n_agents'])]
+        agent_columns = [f'agent#{i}' for i in range(self.cfg[nms.ENV][nms.N_AGENTS])]
         results = pd.DataFrame(results, columns=agent_columns + ['sum', 'episode'])
         results = pd.melt(results, id_vars=['episode'], value_vars=agent_columns + ['sum'], value_name='reward',
                           var_name='agent')
@@ -200,7 +216,7 @@ class BaseActorCritic:
     def actor_critic(self, tm, network, gamma, entropy_coef, vf_coef, gae_coef=0.0, **kwargs):
         obs, actions, done, reward = tm.observation, tm.action, tm.done[:, 1:], tm.reward[:, 1:]
 
-        out = network(obs, actions, tm.hidden_actor[:, 0], tm.hidden_critic[:, 0])
+        out = network(obs, actions, tm.hidden_actor[:, 0].squeeze(0), tm.hidden_critic[:, 0].squeeze(0))
         logits = out[nms.LOGITS][:, :-1]  # last one only needed for v_{t+1}
         critic = out[nms.CRITIC]
 
diff --git a/marl_factory_grid/algorithms/marl/example_config.yaml b/marl_factory_grid/algorithms/marl/example_config.yaml
index b8a7bd3..62782b3 100644
--- a/marl_factory_grid/algorithms/marl/example_config.yaml
+++ b/marl_factory_grid/algorithms/marl/example_config.yaml
@@ -1,5 +1,5 @@
 agent:
-  classname: algorithms.marl.networks.RecurrentAC
+  classname: marl_factory_grid.algorithms.marl.networks.RecurrentAC
   n_agents: 2
   obs_emb_size: 96
   action_emb_size: 16
@@ -7,18 +7,20 @@ agent:
   hidden_size_critic: 64
   use_agent_embedding: False
 env:
-  classname: environments.factory.make
-  env_name: "DirtyFactory-v0"
+  classname: marl_factory_grid.configs
+  env_name: "simple_crossing"
   n_agents: 2
   max_steps: 250
   pomdp_r: 2
   stack_n_frames: 0
   individual_rewards: True
-method: algorithms.marl.LoopSEAC
+  train_render: True
+  eval_render: True
+method: marl_factory_grid.algorithms.marl.LoopSEAC
 algorithm:
   gamma: 0.99
   entropy_coef: 0.01
   vf_coef: 0.5
   n_steps: 5
-  max_steps: 1000000
+  max_steps: 10000
 
diff --git a/marl_factory_grid/algorithms/utils.py b/marl_factory_grid/algorithms/utils.py
index fa9415c..b472cba 100644
--- a/marl_factory_grid/algorithms/utils.py
+++ b/marl_factory_grid/algorithms/utils.py
@@ -3,6 +3,8 @@ from pathlib import Path
 import numpy as np
 import yaml
 
+from marl_factory_grid import Factory
+
 
 def load_class(classname):
     from importlib import import_module
@@ -55,9 +57,17 @@ def load_yaml_file(path: Path):
 
 
 def add_env_props(cfg):
-    env = instantiate_class(cfg['environment'].copy())
-    cfg['agent'].update(dict(observation_size=list(env.observation_space.shape),
-                             n_actions=env.action_space.n))
+    # Path to config File
+    env_path = Path(f'../marl_factory_grid/configs/{cfg["env"]["env_name"]}.yaml')
+
+    # Env Init
+    factory = Factory(env_path)
+    _ = factory.reset()
+
+    # Agent Init
+    cfg['agent'].update(dict(observation_size=list(factory.observation_space[0].shape),
+                             n_actions=factory.action_space[0].n))
+    return factory
 
 
 class Checkpointer(object):
diff --git a/studies/marl_adapted.py b/studies/marl_adapted.py
new file mode 100644
index 0000000..b88746a
--- /dev/null
+++ b/studies/marl_adapted.py
@@ -0,0 +1,15 @@
+from pathlib import Path
+
+from marl_factory_grid.algorithms.marl.iac import LoopIAC
+from marl_factory_grid.algorithms.utils import load_yaml_file
+
+if __name__ == '__main__':
+    cfg_path = Path('../marl_factory_grid/algorithms/marl/example_config.yaml')
+
+    cfg = load_yaml_file(cfg_path)
+
+    print("Training phase")
+    agent = LoopIAC(cfg)
+    agent.train_loop()
+    print("Evaluation phase")
+    agent.eval_loop(10)
\ No newline at end of file
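
Usage sketch for reviewers, not part of the patch to apply: a minimal standalone loop exercising the refactored Factory conventions the adapted loops rely on, i.e. dict-valued observations from reset(), a five-tuple return from step(), and per-agent observation_space/action_space as read by add_env_props. The simple_crossing name and the relative config path mirror example_config.yaml and utils.py above; the random policy and the assumption that step() accepts a plain list of integer actions are illustrative only.

    from pathlib import Path
    import random

    from marl_factory_grid import Factory

    # Relative path as assumed in example_config.yaml / add_env_props; adjust to your checkout.
    env_path = Path('../marl_factory_grid/configs/simple_crossing.yaml')
    factory = Factory(env_path)

    obs = list(factory.reset().values())        # reset() returns a dict keyed by agent
    n_agents = len(obs)
    n_actions = factory.action_space[0].n       # per-agent spaces, as read by add_env_props
    obs_shape = factory.observation_space[0].shape
    print('per-agent observation shape:', obs_shape, '| actions per agent:', n_actions)

    done = [False]
    while not all(done):
        # Placeholder random policy; the real loops use the actor network instead.
        actions = [random.randrange(n_actions) for _ in range(n_agents)]
        _, next_obs, reward, done, info = factory.step(actions)   # five-tuple; first element unused here
        done = [done] * n_agents if isinstance(done, bool) else done
        obs = next_obs
    print('final step reward:', reward)

With these conventions in place, LoopIAC in studies/marl_adapted.py only needs the obs = list(obs.values()) conversion after reset() and the extra leading element of the step() tuple, which is exactly what the base_ac.py hunks above add.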