added running marl a2c

2021-11-23 17:02:35 +01:00
parent 5c15bb2ddf
commit c1c7909925
2 changed files with 79 additions and 66 deletions
--- a/algorithms/utils.py
+++ b/algorithms/utils.py
@ -5,7 +5,12 @@ import yaml
 from pathlib import Path
 from salina import instantiate_class
 from salina import TAgent
-from salina.agents.gyma import AutoResetGymAgent, _torch_type, _format_frame
+from salina.agents.gyma import (
+    AutoResetGymAgent,
+    _torch_type,
+    _format_frame,
+    _torch_cat_dict
+)


 def load_yaml_file(path: Path):
@ -20,42 +25,47 @@ def add_env_props(cfg):
                             n_actions=env.action_space.n))


-class CombineActionsAgent(TAgent):
-    def __init__(self, pattern=r'^agent\d_action$'):
-        super().__init__()
-        self.pattern = pattern

-    def forward(self, t, **kwargs):
-        keys = list(self.workspace.keys())
-        action_keys = sorted([k for k in keys if bool(re.match(self.pattern, k))])
-        actions = torch.cat([self.get((k, t)) for k in action_keys], 0)
-        actions = actions if len(action_keys) <= 1 else actions.unsqueeze(0)
-        self.set((f'action', t), actions)
+
+# Naming vocabulary for per-agent entries. access_str() joins these pieces
+# into keys of the form '<prefix>agent#<i>_<name>', e.g. 'agent#0_reward'.
+AGENT_PREFIX = 'agent#'
+REWARD       =  'reward'
+CUMU_REWARD  = 'cumulated_reward'
+OBS          = 'env_obs'
+SEP          = '_'
+ACTION       = 'action'
+
+
+def access_str(agent_i, name, prefix=''):
+    """Build the key under which agent ``agent_i`` stores ``name``.
+
+    The result is ``prefix`` followed by AGENT_PREFIX, the agent index, SEP
+    and ``name``; ``access_str(0, 'reward')`` gives ``'agent#0_reward'``.
+    """
+    agent_tag = f'{AGENT_PREFIX}{agent_i}'
+    return f'{prefix}{agent_tag}{SEP}{name}'


 class AutoResetGymMultiAgent(AutoResetGymAgent):
-    AGENT_PREFIX = 'agent#'
-    REWARD       =  'reward'
-    CUMU_REWARD  = 'cumulated_reward'
-    SEP          = '_'
-
-    def __init__(self, *args, n_agents, **kwargs):
+    def __init__(self, *args, **kwargs):
+        # Every argument is forwarded to AutoResetGymAgent unchanged.
+        # self.n_agents is assigned in _initialize_envs(), not here.
        super(AutoResetGymMultiAgent, self).__init__(*args, **kwargs)
-        self.n_agents = n_agents

-    def prefix(self, agent_id, name):
-        return f'{self.AGENT_PREFIX}{agent_id}{self.SEP}{name}'
+    def per_agent_values(self, name, values):
+        """Map each agent's key for ``name`` to its entry of ``values``.
+
+        Agent indices 0..n_agents-1 are paired with ``values`` in order.
+        Pairing stops at the shorter of the two, so surplus values are dropped.
+        """
+        keyed = {}
+        for agent_i, value in zip(range(self.n_agents), values):
+            keyed[access_str(agent_i, name)] = value
+        return keyed
+
+    def _initialize_envs(self, n):
+        """Run the parent's environment setup and record the shared agent count.
+
+        Reads ``unwrapped.n_agents`` from each of the first ``n`` entries of
+        ``self.envs``, asserts they all agree, and stores the common value in
+        ``self.n_agents``.
+        """
+        super()._initialize_envs(n)
+        agent_counts = []
+        for env_idx in range(n):
+            agent_counts.append(self.envs[env_idx].unwrapped.n_agents)
+        mismatched = [count for count in agent_counts if count != agent_counts[0]]
+        assert not mismatched, 'All envs must have the same number of agents.'
+        self.n_agents = agent_counts[0]

    def _reset(self, k, save_render):
+        # Start from the parent's reset output, then swap its shared 'env_obs',
+        # 'cumulated_reward' and 'reward' entries for one entry per agent
+        # (keys come from per_agent_values / access_str).
        ret = super()._reset(k, save_render)
+        # NOTE(review): squeeze() drops every size-1 dimension. Presumably it
+        # is meant to remove a leading batch axis so that obs[i] is agent i's
+        # observation -- confirm it cannot also collapse the agent axis when
+        # n_agents == 1.
+        obs = ret['env_obs'].squeeze()
        self.cumulated_reward[k] = [0.0]*self.n_agents
-        del ret['cumulated_reward']
-        cumu_rew = {self.prefix(agent_i, self.CUMU_REWARD): torch.zeros(1).float()
-                    for agent_i in range(self.n_agents)}
-        rewards  = {self.prefix(agent_i, self.REWARD)     : torch.zeros(1).float()
-                    for agent_i in range(self.n_agents)}
+        # zeros(n_agents, 1).unbind() yields one shape-(1,) zero tensor per agent.
+        obs      = self.per_agent_values(OBS,  [_format_frame(obs[i]) for i in range(self.n_agents)])
+        cumu_rew = self.per_agent_values(CUMU_REWARD, torch.zeros(self.n_agents, 1).float().unbind())
+        rewards  = self.per_agent_values(REWARD,      torch.zeros(self.n_agents, 1).float().unbind())
        ret.update(cumu_rew)
        ret.update(rewards)
+        ret.update(obs)
+        # Drop the parent's un-prefixed entries; only per-agent keys remain.
+        for remove in ['env_obs', 'cumulated_reward', 'reward']:
+            del ret[remove]
        return ret

    def _step(self, k, action, save_render):
@ -68,28 +78,33 @@ class AutoResetGymMultiAgent(AutoResetGymAgent):
            action = np.array(action.tolist())
        o, r, d, _ = env.step(action)
        self.cumulated_reward[k] = [x+y for x, y in zip(r, self.cumulated_reward[k])]
-        print(o.shape)
-        observation = _format_frame(o)
-        if isinstance(observation, torch.Tensor):
-            print(observation.shape)
-            observation = {self.prefix(agent_i, 'env_obs'): observation[agent_i]
-                           for agent_i in range(self.n_agents)}
-            print(observation)
-        else:
-            assert isinstance(observation, dict)
+        observation = self.per_agent_values(OBS, [_format_frame(o[i]) for i in range(self.n_agents)])
        if d:
            self.is_running[k] = False
-
        if save_render:
            image = env.render(mode="image").unsqueeze(0)
            observation["rendering"] = image
+        rewards           = self.per_agent_values(REWARD, torch.tensor(r).float().view(-1, 1).unbind())
+        cumulated_rewards = self.per_agent_values(CUMU_REWARD, torch.tensor(self.cumulated_reward[k]).float().view(-1, 1).unbind())
        ret = {
            **observation,
+            **rewards,
+            **cumulated_rewards,
            "done": torch.tensor([d]),
            "initial_state": torch.tensor([False]),
-            "reward": torch.tensor(r).float(),
-            "timestep": torch.tensor([self.timestep[k]]),
-            "cumulated_reward": torch.tensor(self.cumulated_reward[k]).float(),
+            "timestep": torch.tensor([self.timestep[k]])
        }
        return _torch_type(ret)

+
+class CombineActionsAgent(TAgent):
+    """Merge the per-agent action variables into the joint 'action' variable.
+
+    Workspace keys of the form ``agent#<i>_action`` are collected for the
+    requested timestep, ordered by agent index and concatenated.
+    """
+
+    def __init__(self):
+        super().__init__()
+        # re.escape keeps the literal key pieces literal inside the regex.
+        # \d+ (rather than \d) also matches agent indices of 10 and above;
+        # the group captures the index so keys can be ordered numerically.
+        self.pattern = fr'^{re.escape(AGENT_PREFIX)}(\d+){re.escape(SEP)}{re.escape(ACTION)}$'
+
+    def forward(self, t, **kwargs):
+        """Write the combined per-agent actions of timestep ``t`` to 'action'.
+
+        Args:
+            t: timestep whose per-agent actions are read and combined.
+            **kwargs: accepted for interface compatibility; unused here.
+        """
+        indexed_keys = []
+        for key in self.workspace.keys():
+            match = re.match(self.pattern, key)
+            if match:
+                indexed_keys.append((int(match.group(1)), key))
+        # Order by numeric agent index: plain string sorting would place
+        # 'agent#10_action' before 'agent#2_action'.
+        action_keys = [key for _, key in sorted(indexed_keys)]
+        actions = torch.cat([self.get((k, t)) for k in action_keys], 0)
+        # With more than one agent the concatenated tensor gets an extra
+        # leading axis of size 1; a single agent's tensor is passed through.
+        actions = actions if len(action_keys) <= 1 else actions.unsqueeze(0)
+        self.set((ACTION, t), actions)