From 086a92192962d7750447d9d60f0e4604b30aa310 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Julian=20Sch=C3=B6nberger?=
Date: Wed, 27 Mar 2024 17:04:14 +0100
Subject: [PATCH] Adapt base_ac.py and utils.py to be compatible with
 refactored environment

---
 marl_factory_grid/algorithms/marl/base_ac.py  | 46 +++++++++++++------
 .../algorithms/marl/example_config.yaml       | 12 +++--
 marl_factory_grid/algorithms/utils.py         | 16 +++++--
 studies/marl_adapted.py                       | 15 ++++++
 4 files changed, 66 insertions(+), 23 deletions(-)
 create mode 100644 studies/marl_adapted.py

diff --git a/marl_factory_grid/algorithms/marl/base_ac.py b/marl_factory_grid/algorithms/marl/base_ac.py
index 8b64262..0c15250 100644
--- a/marl_factory_grid/algorithms/marl/base_ac.py
+++ b/marl_factory_grid/algorithms/marl/base_ac.py
@@ -18,7 +18,8 @@ class Names:
     HIDDEN_ACTOR = 'hidden_actor'
     HIDDEN_CRITIC = 'hidden_critic'
     AGENT = 'agent'
-    ENV = 'environment'
+    ENV = 'env'
+    ENV_NAME = 'env_name'
     N_AGENTS = 'n_agents'
     ALGORITHM = 'algorithm'
     MAX_STEPS = 'max_steps'
@@ -27,6 +28,8 @@ class Names:
     CRITIC = 'critic'
     BATCH_SIZE = 'bnatch_size'
     N_ACTIONS = 'n_actions'
+    TRAIN_RENDER = 'train_render'
+    EVAL_RENDER = 'eval_render'
 
 
 nms = Names
@@ -35,10 +38,10 @@ ListOrTensor = Union[List, torch.Tensor]
 
 class BaseActorCritic:
     def __init__(self, cfg):
-        add_env_props(cfg)
+        self.factory = add_env_props(cfg)
         self.__training = True
         self.cfg = cfg
-        self.n_agents = cfg[nms.ENV][nms.N_AGENTS]
+        self.n_agents = cfg[nms.AGENT][nms.N_AGENTS]
         self.reset_memory_after_epoch = True
         self.setup()
 
@@ -88,7 +91,9 @@ class BaseActorCritic:
 
     @torch.no_grad()
     def train_loop(self, checkpointer=None):
-        env = instantiate_class(self.cfg[nms.ENV])
+        env = self.factory
+        if self.cfg[nms.ENV][nms.TRAIN_RENDER]:
+            env.render()
         n_steps, max_steps = [self.cfg[nms.ALGORITHM][k] for k in [nms.N_STEPS, nms.MAX_STEPS]]
         tm = MARLActorCriticMemory(self.n_agents, self.cfg[nms.ALGORITHM].get(nms.BUFFER_SIZE, n_steps))
         global_steps, episode, df_results = 0, 0, []
@@ -96,6 +101,7 @@ class BaseActorCritic:
 
         while global_steps < max_steps:
             obs = env.reset()
+            obs = list(obs.values())
             last_hiddens = self.init_hidden()
             last_action, reward = [-1] * self.n_agents, [0.] * self.n_agents
             done, rew_log = [False] * self.n_agents, 0
@@ -110,14 +116,20 @@ class BaseActorCritic:
             while not all(done):
                 out = self.forward(obs, last_action, **last_hiddens)
                 action = self.get_actions(out)
-                next_obs, reward, done, info = env.step(action)
+                _, next_obs, reward, done, info = env.step(action)
                 done = [done] * self.n_agents if isinstance(done, bool) else done
+                if self.cfg[nms.ENV][nms.TRAIN_RENDER]:
+                    env.render()
+
 
                 last_hiddens = dict(hidden_actor=out[nms.HIDDEN_ACTOR],
                                     hidden_critic=out[nms.HIDDEN_CRITIC])
 
+                logits = torch.stack([tensor.squeeze(0) for tensor in out.get(nms.LOGITS, None)], dim=0)
+                values = torch.stack([tensor.squeeze(0) for tensor in out.get(nms.CRITIC, None)], dim=0)
+
                 tm.add(observation=obs, action=action, reward=reward, done=done,
-                       logits=out.get(nms.LOGITS, None), values=out.get(nms.CRITIC, None),
+                       logits=logits, values=values,
                        **last_hiddens)
 
                 obs = next_obs
@@ -139,7 +151,8 @@ class BaseActorCritic:
 
                 if global_steps >= max_steps:
                     break
-            print(f'reward at episode: {episode} = {rew_log}')
+            if global_steps%100 == 0:
+                print(f'reward at episode: {episode} = {rew_log}')
             episode += 1
             df_results.append([episode, rew_log, *reward])
         df_results = pd.DataFrame(df_results,
@@ -151,23 +164,26 @@ class BaseActorCritic:
 
     @torch.inference_mode(True)
    def eval_loop(self, n_episodes, render=False):
-        env = instantiate_class(self.cfg[nms.ENV])
+        env = self.factory
+        if self.cfg[nms.ENV][nms.EVAL_RENDER]:
+            env.render()
         episode, results = 0, []
         while episode < n_episodes:
             obs = env.reset()
+            obs = list(obs.values())
             last_hiddens = self.init_hidden()
             last_action, reward = [-1] * self.n_agents, [0.] * self.n_agents
             done, rew_log, eps_rew = [False] * self.n_agents, 0, torch.zeros(self.n_agents)
             while not all(done):
-                if render:
-                    env.render()
-
                 out = self.forward(obs, last_action, **last_hiddens)
                 action = self.get_actions(out)
-                next_obs, reward, done, info = env.step(action)
+                _, next_obs, reward, done, info = env.step(action)
+
+                if self.cfg[nms.ENV][nms.EVAL_RENDER]:
+                    env.render()
 
                 if isinstance(done, bool):
-                    done = [done] * obs.shape[0]
+                    done = [done] * obs[0].shape[0]
                 obs = next_obs
                 last_action = action
                 last_hiddens = dict(hidden_actor=out.get(nms.HIDDEN_ACTOR, None),
@@ -176,7 +192,7 @@ class BaseActorCritic:
                 eps_rew += torch.tensor(reward)
             results.append(eps_rew.tolist() + [sum(eps_rew).item()] + [episode])
             episode += 1
-        agent_columns = [f'agent#{i}' for i in range(self.cfg['environment']['n_agents'])]
+        agent_columns = [f'agent#{i}' for i in range(self.cfg[nms.ENV][nms.N_AGENTS])]
         results = pd.DataFrame(results, columns=agent_columns + ['sum', 'episode'])
         results = pd.melt(results, id_vars=['episode'], value_vars=agent_columns + ['sum'], value_name='reward',
                           var_name='agent')
@@ -200,7 +216,7 @@ class BaseActorCritic:
     def actor_critic(self, tm, network, gamma, entropy_coef, vf_coef, gae_coef=0.0, **kwargs):
         obs, actions, done, reward = tm.observation, tm.action, tm.done[:, 1:], tm.reward[:, 1:]
 
-        out = network(obs, actions, tm.hidden_actor[:, 0], tm.hidden_critic[:, 0])
+        out = network(obs, actions, tm.hidden_actor[:, 0].squeeze(0), tm.hidden_critic[:, 0].squeeze(0))
         logits = out[nms.LOGITS][:, :-1]  # last one only needed for v_{t+1}
         critic = out[nms.CRITIC]
 
diff --git a/marl_factory_grid/algorithms/marl/example_config.yaml b/marl_factory_grid/algorithms/marl/example_config.yaml
index b8a7bd3..62782b3 100644
--- a/marl_factory_grid/algorithms/marl/example_config.yaml
+++ b/marl_factory_grid/algorithms/marl/example_config.yaml
@@ -1,5 +1,5 @@
 agent:
-  classname: algorithms.marl.networks.RecurrentAC
+  classname: marl_factory_grid.algorithms.marl.networks.RecurrentAC
   n_agents: 2
   obs_emb_size: 96
   action_emb_size: 16
@@ -7,18 +7,20 @@ agent:
   hidden_size_critic: 64
   use_agent_embedding: False
 env:
-  classname: environments.factory.make
-  env_name: "DirtyFactory-v0"
+  classname: marl_factory_grid.configs
+  env_name: "simple_crossing"
   n_agents: 2
   max_steps: 250
   pomdp_r: 2
   stack_n_frames: 0
   individual_rewards: True
-method: algorithms.marl.LoopSEAC
+  train_render: True
+  eval_render: True
+method: marl_factory_grid.algorithms.marl.LoopSEAC
 algorithm:
   gamma: 0.99
   entropy_coef: 0.01
   vf_coef: 0.5
   n_steps: 5
-  max_steps: 1000000
+  max_steps: 10000
 
diff --git a/marl_factory_grid/algorithms/utils.py b/marl_factory_grid/algorithms/utils.py
index fa9415c..b472cba 100644
--- a/marl_factory_grid/algorithms/utils.py
+++ b/marl_factory_grid/algorithms/utils.py
@@ -3,6 +3,8 @@ from pathlib import Path
 import numpy as np
 import yaml
 
+from marl_factory_grid import Factory
+
 
 def load_class(classname):
     from importlib import import_module
@@ -55,9 +57,17 @@ def load_yaml_file(path: Path):
 
 
 def add_env_props(cfg):
-    env = instantiate_class(cfg['environment'].copy())
-    cfg['agent'].update(dict(observation_size=list(env.observation_space.shape),
-                             n_actions=env.action_space.n))
+    # Path to config File
+    env_path = Path(f'../marl_factory_grid/configs/{cfg["env"]["env_name"]}.yaml')
+
+    # Env Init
+    factory = Factory(env_path)
+    _ = factory.reset()
+
+    # Agent Init
+    cfg['agent'].update(dict(observation_size=list(factory.observation_space[0].shape),
+                             n_actions=factory.action_space[0].n))
+    return factory
 
 
 class Checkpointer(object):
diff --git a/studies/marl_adapted.py b/studies/marl_adapted.py
new file mode 100644
index 0000000..b88746a
--- /dev/null
+++ b/studies/marl_adapted.py
@@ -0,0 +1,15 @@
+from pathlib import Path
+
+from marl_factory_grid.algorithms.marl.iac import LoopIAC
+from marl_factory_grid.algorithms.utils import load_yaml_file
+
+if __name__ == '__main__':
+    cfg_path = Path('../marl_factory_grid/algorithms/marl/example_config.yaml')
+
+    cfg = load_yaml_file(cfg_path)
+
+    print("Training phase")
+    agent = LoopIAC(cfg)
+    agent.train_loop()
+    print("Evaluation phase")
+    agent.eval_loop(10)
\ No newline at end of file
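
Usage sketch for reviewers, not part of the patch to apply: a minimal standalone loop exercising the refactored Factory conventions the adapted loops rely on, i.e. dict-valued observations from reset(), a five-tuple return from step(), and per-agent observation_space/action_space as read by add_env_props. The simple_crossing name and the relative config path mirror example_config.yaml and utils.py above; the random policy and the assumption that step() accepts a plain list of integer actions are illustrative only.

    from pathlib import Path
    import random

    from marl_factory_grid import Factory

    # Relative path as assumed in example_config.yaml / add_env_props; adjust to your checkout.
    env_path = Path('../marl_factory_grid/configs/simple_crossing.yaml')
    factory = Factory(env_path)

    obs = list(factory.reset().values())        # reset() returns a dict keyed by agent
    n_agents = len(obs)
    n_actions = factory.action_space[0].n       # per-agent spaces, as read by add_env_props
    obs_shape = factory.observation_space[0].shape
    print('per-agent observation shape:', obs_shape, '| actions per agent:', n_actions)

    done = [False]
    while not all(done):
        # Placeholder random policy; the real loops use the actor network instead.
        actions = [random.randrange(n_actions) for _ in range(n_agents)]
        _, next_obs, reward, done, info = factory.step(actions)   # five-tuple; first element unused here
        done = [done] * n_agents if isinstance(done, bool) else done
        obs = next_obs
    print('final step reward:', reward)

With these conventions in place, LoopIAC in studies/marl_adapted.py only needs the obs = list(obs.values()) conversion after reset() and the extra leading element of the step() tuple, which is exactly what the base_ac.py hunks above add.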