From 5c15bb2ddf024c7d21dc7d4bde856f168325bfd4 Mon Sep 17 00:00:00 2001
From: Robert Müller <robert.mueller1990@googlemail.com>
Date: Tue, 23 Nov 2021 14:03:52 +0100
Subject: [PATCH 1/2] Add MarlFrameStack and salina multi-agent env wrapper

---
 algorithms/utils.py                       | 67 ++++++++++++++++++++++-
 environments/factory/__init__.py          |  5 +-
 environments/factory/base/base_factory.py |  5 +-
 environments/utility_classes.py           | 14 ++++-
 studies/sat_mad.py                        | 24 +++++---
 studies/sat_mad.yaml                      | 17 +++---
 6 files changed, 109 insertions(+), 23 deletions(-)
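Annotation (below the three-dash line, so git am keeps it out of the commit message and the diff): the AutoResetGymMultiAgent added below writes one workspace entry per agent instead of a single stacked tensor, using an 'agent#<i>_<name>' key scheme. A minimal sketch of that layout follows; the tensor shapes are made up for illustration only.

    import torch

    AGENT_PREFIX, SEP = 'agent#', '_'

    def prefix(agent_id, name):
        # mirrors AutoResetGymMultiAgent.prefix() in this patch
        return f'{AGENT_PREFIX}{agent_id}{SEP}{name}'

    n_agents = 2
    per_agent_entries = {
        **{prefix(i, 'env_obs'): torch.zeros(3, 5, 5) for i in range(n_agents)},
        **{prefix(i, 'reward'): torch.zeros(1) for i in range(n_agents)},
        **{prefix(i, 'cumulated_reward'): torch.zeros(1) for i in range(n_agents)},
    }
    print(sorted(per_agent_entries))
    # ['agent#0_cumulated_reward', 'agent#0_env_obs', 'agent#0_reward',
    #  'agent#1_cumulated_reward', 'agent#1_env_obs', 'agent#1_reward']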

diff --git a/algorithms/utils.py b/algorithms/utils.py
index bf3e2cb..cd3fbd3 100644
--- a/algorithms/utils.py
+++ b/algorithms/utils.py
@@ -1,9 +1,11 @@
 import re
 import torch
+import numpy as np
 import yaml
 from pathlib import Path
 from salina import instantiate_class
 from salina import TAgent
+from salina.agents.gyma import AutoResetGymAgent, _torch_type, _format_frame
 
 
 def load_yaml_file(path: Path):
@@ -27,4 +29,67 @@ class CombineActionsAgent(TAgent):
         keys = list(self.workspace.keys())
         action_keys = sorted([k for k in keys if bool(re.match(self.pattern, k))])
         actions = torch.cat([self.get((k, t)) for k in action_keys], 0)
-        self.set((f'action', t), actions.unsqueeze(0))
+        actions = actions if len(action_keys) <= 1 else actions.unsqueeze(0)
+        self.set((f'action', t), actions)
+
+
+class AutoResetGymMultiAgent(AutoResetGymAgent):
+    AGENT_PREFIX = 'agent#'
+    REWARD       =  'reward'
+    CUMU_REWARD  = 'cumulated_reward'
+    SEP          = '_'
+
+    def __init__(self, *args, n_agents, **kwargs):
+        super(AutoResetGymMultiAgent, self).__init__(*args, **kwargs)
+        self.n_agents = n_agents
+
+    def prefix(self, agent_id, name):
+        return f'{self.AGENT_PREFIX}{agent_id}{self.SEP}{name}'
+
+    def _reset(self, k, save_render):
+        ret = super()._reset(k, save_render)
+        self.cumulated_reward[k] = [0.0]*self.n_agents
+        del ret['cumulated_reward']
+        cumu_rew = {self.prefix(agent_i, self.CUMU_REWARD): torch.zeros(1).float()
+                    for agent_i in range(self.n_agents)}
+        rewards  = {self.prefix(agent_i, self.REWARD)     : torch.zeros(1).float()
+                    for agent_i in range(self.n_agents)}
+        ret.update(cumu_rew)
+        ret.update(rewards)
+        return ret
+
+    def _step(self, k, action, save_render):
+        self.timestep[k] += 1
+        env = self.envs[k]
+        if len(action.size()) == 0:
+            action = action.item()
+            assert isinstance(action, int)
+        else:
+            action = np.array(action.tolist())
+        o, r, d, _ = env.step(action)
+        self.cumulated_reward[k] = [x+y for x, y in zip(r, self.cumulated_reward[k])]
+        print(o.shape)
+        observation = _format_frame(o)
+        if isinstance(observation, torch.Tensor):
+            print(observation.shape)
+            observation = {self.prefix(agent_i, 'env_obs'): observation[agent_i]
+                           for agent_i in range(self.n_agents)}
+            print(observation)
+        else:
+            assert isinstance(observation, dict)
+        if d:
+            self.is_running[k] = False
+
+        if save_render:
+            image = env.render(mode="image").unsqueeze(0)
+            observation["rendering"] = image
+        ret = {
+            **observation,
+            "done": torch.tensor([d]),
+            "initial_state": torch.tensor([False]),
+            "reward": torch.tensor(r).float(),
+            "timestep": torch.tensor([self.timestep[k]]),
+            "cumulated_reward": torch.tensor(self.cumulated_reward[k]).float(),
+        }
+        return _torch_type(ret)
+
diff --git a/environments/factory/__init__.py b/environments/factory/__init__.py
index dca0135..23346f9 100644
--- a/environments/factory/__init__.py
+++ b/environments/factory/__init__.py
@@ -1,4 +1,4 @@
-def make(env_name, n_agents=1, pomdp_r=2, max_steps=400, stack_n_frames=3):
+def make(env_name, pomdp_r=2, max_steps=400, stack_n_frames=3, n_agents=1, individual_rewards=False):
     import yaml
     from pathlib import Path
     from environments.factory.combined_factories import DirtItemFactory
@@ -12,7 +12,8 @@ def make(env_name, n_agents=1, pomdp_r=2, max_steps=400, stack_n_frames=3):
     obs_props = ObservationProperties(render_agents=AgentRenderOptions.COMBINED,
                                       frames_to_stack=stack_n_frames, pomdp_r=pomdp_r)
 
-    factory_kwargs = dict(n_agents=n_agents, max_steps=max_steps, obs_prop=obs_props,
+    factory_kwargs = dict(n_agents=n_agents, individual_rewards=individual_rewards,
+                          max_steps=max_steps, obs_prop=obs_props,
                           mv_prop=MovementProperties(**dictionary['movement_props']),
                           dirt_prop=DirtProperties(**dictionary['dirt_props']),
                           record_episodes=False, verbose=False, **dictionary['factory_props']
diff --git a/environments/factory/base/base_factory.py b/environments/factory/base/base_factory.py
index bb0c121..753e09a 100644
--- a/environments/factory/base/base_factory.py
+++ b/environments/factory/base/base_factory.py
@@ -15,12 +15,11 @@ from environments.helpers import Constants as c, Constants
 from environments import helpers as h
 from environments.factory.base.objects import Agent, Tile, Action
 from environments.factory.base.registers import Actions, Entities, Agents, Doors, FloorTiles, WallTiles, PlaceHolders
-from environments.utility_classes import MovementProperties, ObservationProperties
+from environments.utility_classes import MovementProperties, ObservationProperties, MarlFrameStack
 from environments.utility_classes import AgentRenderOptions as a_obs
 
 import simplejson
 
-
 REC_TAC = 'rec_'
 
 
@@ -57,7 +56,7 @@ class BaseFactory(gym.Env):
 
     def __enter__(self):
         return self if self.obs_prop.frames_to_stack == 0 else \
-            FrameStack(self, self.obs_prop.frames_to_stack)
+            MarlFrameStack(FrameStack(self, self.obs_prop.frames_to_stack))
 
     def __exit__(self, exc_type, exc_val, exc_tb):
         self.close()
diff --git a/environments/utility_classes.py b/environments/utility_classes.py
index 3069d6d..cdcfd56 100644
--- a/environments/utility_classes.py
+++ b/environments/utility_classes.py
@@ -1,5 +1,6 @@
-from enum import Enum
 from typing import NamedTuple, Union
+import gym
+from gym.wrappers.frame_stack import FrameStack
 
 
 class AgentRenderOptions(object):
@@ -22,3 +23,14 @@ class ObservationProperties(NamedTuple):
     cast_shadows = True
     frames_to_stack: int = 0
     pomdp_r: int = 0
+
+
+class MarlFrameStack(gym.ObservationWrapper):
+    def __init__(self, env):
+        super().__init__(env)
+
+    def observation(self, observation):
+        if isinstance(self.env, FrameStack) and self.env.unwrapped.n_agents > 1:
+            return observation[0:].swapaxes(0, 1)
+        return observation
+
diff --git a/studies/sat_mad.py b/studies/sat_mad.py
index 35ddda5..4380c7d 100644
--- a/studies/sat_mad.py
+++ b/studies/sat_mad.py
@@ -9,7 +9,12 @@ from pathlib import Path
 import numpy as np
 from tqdm import tqdm
 import time
-from algorithms.utils import add_env_props, load_yaml_file, CombineActionsAgent
+from algorithms.utils import (
+    add_env_props,
+    load_yaml_file,
+    CombineActionsAgent,
+    AutoResetGymMultiAgent
+)
 
 
 class A2CAgent(TAgent):
@@ -32,8 +37,8 @@ class A2CAgent(TAgent):
 
     def get_obs(self, t):
         observation = self.get(("env/env_obs", t))
+        print(observation.shape)
         if self.marl:
-            observation = observation.permute(2, 0, 1, 3, 4, 5)
             observation = observation[self.agent_id]
         return observation
 
@@ -57,7 +62,7 @@ if __name__ == '__main__':
     # Setup workspace
     uid = time.time()
     workspace = Workspace()
-    n_agents = 1
+    n_agents = 2
 
     # load config
     cfg = load_yaml_file(Path(__file__).parent / 'sat_mad.yaml')
@@ -65,10 +70,11 @@ if __name__ == '__main__':
     cfg['env'].update({'n_agents': n_agents})
 
     # instantiate agent and env
-    env_agent = AutoResetGymAgent(
+    env_agent = AutoResetGymMultiAgent(
         get_class(cfg['env']),
         get_arguments(cfg['env']),
-        n_envs=1
+        n_envs=1,
+        n_agents=n_agents
     )
 
     a2c_agents = [instantiate_class({**cfg['agent'],
@@ -103,7 +109,8 @@ if __name__ == '__main__':
                     f'agent{agent_id}_action_probs', "env/reward",
                     f"agent{agent_id}_action"
                 ]
-                td = gae(critic, reward, done, 0.99, 0.3)
+                reward = reward[agent_id]
+                td = gae(critic, reward, done, 0.98, 0.25)
                 td_error = td ** 2
                 critic_loss = td_error.mean()
                 entropy_loss = Categorical(action_probs).entropy().mean()
@@ -118,11 +125,12 @@ if __name__ == '__main__':
                 optimizer = optimizers[agent_id]
                 optimizer.zero_grad()
                 loss.backward()
-                #torch.nn.utils.clip_grad_norm_(a2c_agents[agent_id].parameters(), 2)
+                #torch.nn.utils.clip_grad_norm_(a2c_agents[agent_id].parameters(), .5)
                 optimizer.step()
 
                 # Compute the cumulated reward on final_state
-                creward = workspace["env/cumulated_reward"]
+                creward = workspace["env/cumulated_reward"]#[agent_id].unsqueeze(-1)
+                print(creward.shape, done.shape)
                 creward = creward[done]
                 if creward.size()[0] > 0:
                     cum_r = creward.mean().item()
diff --git a/studies/sat_mad.yaml b/studies/sat_mad.yaml
index 5f7ea46..7e7ca71 100644
--- a/studies/sat_mad.yaml
+++ b/studies/sat_mad.yaml
@@ -5,21 +5,22 @@ agent:
   n_actions:        10
 
 env:
-  classname:      environments.factory.make
-  env_name:       "DirtyFactory-v0"
-  n_agents:       1
-  pomdp_r:        2
-  max_steps:      400
-  stack_n_frames: 3
+  classname:          environments.factory.make
+  env_name:           "DirtyFactory-v0"
+  n_agents:           1
+  pomdp_r:            2
+  max_steps:          400
+  stack_n_frames:     3
+  individual_rewards: True
 
 algorithm:
   max_epochs:             1000000
   n_envs:                 1
-  n_timesteps:            16
+  n_timesteps:            10
   discount_factor:        0.99
   entropy_coef:           0.01
   critic_coef:            1.0
-  gae:                    0.3
+  gae:                    0.25
   optimizer:
     classname:            torch.optim.Adam
     lr:                   0.0003
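Trailer note (annotation, not part of the diff hunks above): gym's FrameStack prepends a frame axis, so with more than one agent the stacked observation arrives as (n_frames, n_agents, ...); the MarlFrameStack wrapper added in environments/utility_classes.py swaps the first two axes to hand each agent its own frame stack. A shape-only sketch with made-up sizes, ignoring any extra channel dimensions the factory env may add:

    import numpy as np

    n_frames, n_agents, h, w = 3, 2, 5, 5            # illustrative sizes only
    stacked = np.zeros((n_frames, n_agents, h, w))   # what FrameStack hands over
    per_agent = stacked.swapaxes(0, 1)               # what MarlFrameStack returns
    assert per_agent.shape == (n_agents, n_frames, h, w)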

From c1c7909925e1aa7850039a97a4c800c7a3ea87d5 Mon Sep 17 00:00:00 2001
From: Robert Müller <robert.mueller1990@googlemail.com>
Date: Tue, 23 Nov 2021 17:02:35 +0100
Subject: [PATCH 2/2] Get multi-agent A2C training running

---
 algorithms/utils.py | 91 ++++++++++++++++++++++++++-------------------
 studies/sat_mad.py  | 54 +++++++++++++--------------
 2 files changed, 79 insertions(+), 66 deletions(-)
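Annotation (below the three-dash line, ignored when applying the patch): this patch lifts the key naming into module-level constants plus an access_str() helper, which also takes an optional prefix for reading the env agent's outputs from the workspace (they live under 'env/', as studies/sat_mad.py shows). For reference, copied from the patch with toy assertions:

    AGENT_PREFIX, SEP = 'agent#', '_'

    def access_str(agent_i, name, prefix=''):
        return f'{prefix}{AGENT_PREFIX}{agent_i}{SEP}{name}'

    assert access_str(0, 'action') == 'agent#0_action'
    assert access_str(1, 'reward', prefix='env/') == 'env/agent#1_reward'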

diff --git a/algorithms/utils.py b/algorithms/utils.py
index cd3fbd3..d72046a 100644
--- a/algorithms/utils.py
+++ b/algorithms/utils.py
@@ -5,7 +5,12 @@ import yaml
 from pathlib import Path
 from salina import instantiate_class
 from salina import TAgent
-from salina.agents.gyma import AutoResetGymAgent, _torch_type, _format_frame
+from salina.agents.gyma import (
+    AutoResetGymAgent,
+    _torch_type,
+    _format_frame,
+    _torch_cat_dict
+)
 
 
 def load_yaml_file(path: Path):
@@ -20,42 +25,47 @@ def add_env_props(cfg):
                              n_actions=env.action_space.n))
 
 
-class CombineActionsAgent(TAgent):
-    def __init__(self, pattern=r'^agent\d_action$'):
-        super().__init__()
-        self.pattern = pattern
 
-    def forward(self, t, **kwargs):
-        keys = list(self.workspace.keys())
-        action_keys = sorted([k for k in keys if bool(re.match(self.pattern, k))])
-        actions = torch.cat([self.get((k, t)) for k in action_keys], 0)
-        actions = actions if len(action_keys) <= 1 else actions.unsqueeze(0)
-        self.set((f'action', t), actions)
+
+AGENT_PREFIX = 'agent#'
+REWARD       =  'reward'
+CUMU_REWARD  = 'cumulated_reward'
+OBS          = 'env_obs'
+SEP          = '_'
+ACTION       = 'action'
+
+
+def access_str(agent_i, name, prefix=''):
+    return f'{prefix}{AGENT_PREFIX}{agent_i}{SEP}{name}'
 
 
 class AutoResetGymMultiAgent(AutoResetGymAgent):
-    AGENT_PREFIX = 'agent#'
-    REWARD       =  'reward'
-    CUMU_REWARD  = 'cumulated_reward'
-    SEP          = '_'
-
-    def __init__(self, *args, n_agents, **kwargs):
+    def __init__(self, *args, **kwargs):
         super(AutoResetGymMultiAgent, self).__init__(*args, **kwargs)
-        self.n_agents = n_agents
 
-    def prefix(self, agent_id, name):
-        return f'{self.AGENT_PREFIX}{agent_id}{self.SEP}{name}'
+    def per_agent_values(self, name, values):
+        return {access_str(agent_i, name): value
+                for agent_i, value in zip(range(self.n_agents), values)}
+
+    def _initialize_envs(self, n):
+        super()._initialize_envs(n)
+        n_agents_list = [self.envs[i].unwrapped.n_agents for i in range(n)]
+        assert all(n_agents == n_agents_list[0] for n_agents in n_agents_list), \
+            'All envs must have the same number of agents.'
+        self.n_agents = n_agents_list[0]
 
     def _reset(self, k, save_render):
         ret = super()._reset(k, save_render)
+        obs = ret['env_obs'].squeeze()
         self.cumulated_reward[k] = [0.0]*self.n_agents
-        del ret['cumulated_reward']
-        cumu_rew = {self.prefix(agent_i, self.CUMU_REWARD): torch.zeros(1).float()
-                    for agent_i in range(self.n_agents)}
-        rewards  = {self.prefix(agent_i, self.REWARD)     : torch.zeros(1).float()
-                    for agent_i in range(self.n_agents)}
+        obs      = self.per_agent_values(OBS,  [_format_frame(obs[i]) for i in range(self.n_agents)])
+        cumu_rew = self.per_agent_values(CUMU_REWARD, torch.zeros(self.n_agents, 1).float().unbind())
+        rewards  = self.per_agent_values(REWARD,      torch.zeros(self.n_agents, 1).float().unbind())
         ret.update(cumu_rew)
         ret.update(rewards)
+        ret.update(obs)
+        for remove in ['env_obs', 'cumulated_reward', 'reward']:
+            del ret[remove]
         return ret
 
     def _step(self, k, action, save_render):
@@ -68,28 +78,33 @@ class AutoResetGymMultiAgent(AutoResetGymAgent):
             action = np.array(action.tolist())
         o, r, d, _ = env.step(action)
         self.cumulated_reward[k] = [x+y for x, y in zip(r, self.cumulated_reward[k])]
-        print(o.shape)
-        observation = _format_frame(o)
-        if isinstance(observation, torch.Tensor):
-            print(observation.shape)
-            observation = {self.prefix(agent_i, 'env_obs'): observation[agent_i]
-                           for agent_i in range(self.n_agents)}
-            print(observation)
-        else:
-            assert isinstance(observation, dict)
+        observation = self.per_agent_values(OBS, [_format_frame(o[i]) for i in range(self.n_agents)])
         if d:
             self.is_running[k] = False
-
         if save_render:
             image = env.render(mode="image").unsqueeze(0)
             observation["rendering"] = image
+        rewards           = self.per_agent_values(REWARD, torch.tensor(r).float().view(-1, 1).unbind())
+        cumulated_rewards = self.per_agent_values(CUMU_REWARD, torch.tensor(self.cumulated_reward[k]).float().view(-1, 1).unbind())
         ret = {
             **observation,
+            **rewards,
+            **cumulated_rewards,
             "done": torch.tensor([d]),
             "initial_state": torch.tensor([False]),
-            "reward": torch.tensor(r).float(),
-            "timestep": torch.tensor([self.timestep[k]]),
-            "cumulated_reward": torch.tensor(self.cumulated_reward[k]).float(),
+            "timestep": torch.tensor([self.timestep[k]])
         }
         return _torch_type(ret)
 
+
+class CombineActionsAgent(TAgent):
+    def __init__(self):
+        super().__init__()
+        self.pattern = fr'^{AGENT_PREFIX}\d{SEP}{ACTION}$'
+
+    def forward(self, t, **kwargs):
+        keys = list(self.workspace.keys())
+        action_keys = sorted([k for k in keys if bool(re.match(self.pattern, k))])
+        actions = torch.cat([self.get((k, t)) for k in action_keys], 0)
+        actions = actions if len(action_keys) <= 1 else actions.unsqueeze(0)
+        self.set((f'action', t), actions)
diff --git a/studies/sat_mad.py b/studies/sat_mad.py
index 4380c7d..6bf3e22 100644
--- a/studies/sat_mad.py
+++ b/studies/sat_mad.py
@@ -13,16 +13,18 @@ from algorithms.utils import (
     add_env_props,
     load_yaml_file,
     CombineActionsAgent,
-    AutoResetGymMultiAgent
+    AutoResetGymMultiAgent,
+    access_str,
+    AGENT_PREFIX, REWARD, CUMU_REWARD, OBS, SEP
 )
 
 
 class A2CAgent(TAgent):
-    def __init__(self, observation_size, hidden_size, n_actions, agent_id=-1, marl=False):
+    def __init__(self, observation_size, hidden_size, n_actions, agent_id):
         super().__init__()
         observation_size = np.prod(observation_size)
+        print(observation_size)
         self.agent_id = agent_id
-        self.marl = marl
         self.model = nn.Sequential(
             nn.Flatten(),
             nn.Linear(observation_size, hidden_size),
@@ -36,10 +38,7 @@ class A2CAgent(TAgent):
         self.critic_head = nn.Linear(hidden_size, 1)
 
     def get_obs(self, t):
-        observation = self.get(("env/env_obs", t))
-        print(observation.shape)
-        if self.marl:
-            observation = observation[self.agent_id]
+        observation = self.get((f'env/{access_str(self.agent_id, OBS)}', t))
         return observation
 
     def forward(self, t, stochastic, **kwargs):
@@ -52,10 +51,9 @@ class A2CAgent(TAgent):
             action = torch.distributions.Categorical(probs).sample()
         else:
             action = probs.argmax(1)
-        agent_str = f'agent{self.agent_id}_'
-        self.set((f'{agent_str}action', t), action)
-        self.set((f'{agent_str}action_probs', t), probs)
-        self.set((f'{agent_str}critic', t), critic)
+        self.set((f'{access_str(self.agent_id, "action")}', t), action)
+        self.set((f'{access_str(self.agent_id, "action_probs")}', t), probs)
+        self.set((f'{access_str(self.agent_id, "critic")}', t), critic)
 
 
 if __name__ == '__main__':
@@ -73,13 +71,11 @@ if __name__ == '__main__':
     env_agent = AutoResetGymMultiAgent(
         get_class(cfg['env']),
         get_arguments(cfg['env']),
-        n_envs=1,
-        n_agents=n_agents
+        n_envs=1
     )
 
     a2c_agents = [instantiate_class({**cfg['agent'],
-                                     'agent_id': agent_id,
-                                     'marl':     n_agents > 1})
+                                     'agent_id': agent_id})
                   for agent_id in range(n_agents)]
 
     # combine agents
@@ -105,11 +101,12 @@ if __name__ == '__main__':
 
             for agent_id in range(n_agents):
                 critic, done, action_probs, reward, action = workspace[
-                    f"agent{agent_id}_critic", "env/done",
-                    f'agent{agent_id}_action_probs', "env/reward",
-                    f"agent{agent_id}_action"
+                    access_str(agent_id, 'critic'),
+                    "env/done",
+                    access_str(agent_id, 'action_probs'),
+                    access_str(agent_id, 'reward', 'env/'),
+                    access_str(agent_id, 'action')
                 ]
-                reward = reward[agent_id]
                 td = gae(critic, reward, done, 0.98, 0.25)
                 td_error = td ** 2
                 critic_loss = td_error.mean()
@@ -129,13 +126,14 @@ if __name__ == '__main__':
                 optimizer.step()
 
                 # Compute the cumulated reward on final_state
-                creward = workspace["env/cumulated_reward"]#[agent_id].unsqueeze(-1)
-                print(creward.shape, done.shape)
-                creward = creward[done]
-                if creward.size()[0] > 0:
-                    cum_r = creward.mean().item()
-                    if cum_r > best:
-                    #    torch.save(a2c_agent.state_dict(), Path(__file__).parent / f'agent_{uid}.pt')
-                        best = cum_r
-                    pbar.set_description(f"Cum. r: {cum_r:.2f}, Best r. so far: {best:.2f}", refresh=True)
+                rews = ''
+                for agent_i in range(n_agents):
+                    creward = workspace['env/'+access_str(agent_i, CUMU_REWARD)]
+                    creward = creward[done]
+                    if creward.size()[0] > 0:
+                        rews += f'{AGENT_PREFIX}{agent_i}: {creward.mean().item():.2f}  |  '
+                        """if cum_r > best:
+                            torch.save(a2c_agent.state_dict(), Path(__file__).parent / f'agent_{uid}.pt')
+                            best = cum_r"""
+                        pbar.set_description(rews, refresh=True)
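Trailer note (annotation, not part of the diff): CombineActionsAgent now derives its key pattern from the shared constants, so it picks up only the per-agent action entries and concatenates them into the single 'action' entry the gym env agent reads. A quick, self-contained sanity check of that pattern against toy keys:

    import re

    AGENT_PREFIX, SEP, ACTION = 'agent#', '_', 'action'
    pattern = fr'^{AGENT_PREFIX}\d{SEP}{ACTION}$'   # same pattern CombineActionsAgent builds

    keys = ['agent#0_action', 'agent#1_action', 'agent#0_action_probs', 'env/done']
    action_keys = sorted(k for k in keys if re.match(pattern, k))
    assert action_keys == ['agent#0_action', 'agent#1_action']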