diff --git a/algorithms/utils.py b/algorithms/utils.py
index bf3e2cb..cd3fbd3 100644
--- a/algorithms/utils.py
+++ b/algorithms/utils.py
@@ -1,9 +1,11 @@
 import re
 import torch
+import numpy as np
 import yaml
 from pathlib import Path
 from salina import instantiate_class
 from salina import TAgent
+from salina.agents.gyma import AutoResetGymAgent, _torch_type, _format_frame
 
 
 def load_yaml_file(path: Path):
@@ -27,4 +29,67 @@ class CombineActionsAgent(TAgent):
         keys = list(self.workspace.keys())
         action_keys = sorted([k for k in keys if bool(re.match(self.pattern, k))])
         actions = torch.cat([self.get((k, t)) for k in action_keys], 0)
-        self.set((f'action', t), actions.unsqueeze(0))
+        actions = actions if len(action_keys) <= 1 else actions.unsqueeze(0)
+        self.set((f'action', t), actions)
+
+
+class AutoResetGymMultiAgent(AutoResetGymAgent):
+    AGENT_PREFIX = 'agent#'
+    REWARD = 'reward'
+    CUMU_REWARD = 'cumulated_reward'
+    SEP = '_'
+
+    def __init__(self, *args, n_agents, **kwargs):
+        super(AutoResetGymMultiAgent, self).__init__(*args, **kwargs)
+        self.n_agents = n_agents
+
+    def prefix(self, agent_id, name):
+        return f'{self.AGENT_PREFIX}{agent_id}{self.SEP}{name}'
+
+    def _reset(self, k, save_render):
+        ret = super()._reset(k, save_render)
+        self.cumulated_reward[k] = [0.0]*self.n_agents
+        del ret['cumulated_reward']
+        cumu_rew = {self.prefix(agent_i, self.CUMU_REWARD): torch.zeros(1).float()
+                    for agent_i in range(self.n_agents)}
+        rewards = {self.prefix(agent_i, self.REWARD): torch.zeros(1).float()
+                   for agent_i in range(self.n_agents)}
+        ret.update(cumu_rew)
+        ret.update(rewards)
+        return ret
+
+    def _step(self, k, action, save_render):
+        self.timestep[k] += 1
+        env = self.envs[k]
+        if len(action.size()) == 0:
+            action = action.item()
+            assert isinstance(action, int)
+        else:
+            action = np.array(action.tolist())
+        o, r, d, _ = env.step(action)
+        self.cumulated_reward[k] = [x+y for x, y in zip(r, self.cumulated_reward[k])]
+        print(o.shape)
+        observation = _format_frame(o)
+        if isinstance(observation, torch.Tensor):
+            print(observation.shape)
+            observation = {self.prefix(agent_i, 'env_obs'): observation[agent_i]
+                           for agent_i in range(self.n_agents)}
+            print(observation)
+        else:
+            assert isinstance(observation, dict)
+        if d:
+            self.is_running[k] = False
+
+        if save_render:
+            image = env.render(mode="image").unsqueeze(0)
+            observation["rendering"] = image
+        ret = {
+            **observation,
+            "done": torch.tensor([d]),
+            "initial_state": torch.tensor([False]),
+            "reward": torch.tensor(r).float(),
+            "timestep": torch.tensor([self.timestep[k]]),
+            "cumulated_reward": torch.tensor(self.cumulated_reward[k]).float(),
+        }
+        return _torch_type(ret)
+
diff --git a/environments/factory/__init__.py b/environments/factory/__init__.py
index dca0135..23346f9 100644
--- a/environments/factory/__init__.py
+++ b/environments/factory/__init__.py
@@ -1,4 +1,4 @@
-def make(env_name, n_agents=1, pomdp_r=2, max_steps=400, stack_n_frames=3):
+def make(env_name, pomdp_r=2, max_steps=400, stack_n_frames=3, n_agents=1, individual_rewards=False):
     import yaml
     from pathlib import Path
     from environments.factory.combined_factories import DirtItemFactory
@@ -12,7 +12,8 @@ def make(env_name, n_agents=1, pomdp_r=2, max_steps=400, stack_n_frames=3):
     obs_props = ObservationProperties(render_agents=AgentRenderOptions.COMBINED,
                                       frames_to_stack=stack_n_frames,
                                       pomdp_r=pomdp_r)
-    factory_kwargs = dict(n_agents=n_agents, max_steps=max_steps, obs_prop=obs_props,
+    factory_kwargs = dict(n_agents=n_agents, individual_rewards=individual_rewards,
+                          max_steps=max_steps, obs_prop=obs_props,
                           mv_prop=MovementProperties(**dictionary['movement_props']),
                           dirt_prop=DirtProperties(**dictionary['dirt_props']),
                           record_episodes=False, verbose=False, **dictionary['factory_props']
diff --git a/environments/factory/base/base_factory.py b/environments/factory/base/base_factory.py
index bb0c121..753e09a 100644
--- a/environments/factory/base/base_factory.py
+++ b/environments/factory/base/base_factory.py
@@ -15,12 +15,11 @@ from environments.helpers import Constants as c, Constants
 from environments import helpers as h
 from environments.factory.base.objects import Agent, Tile, Action
 from environments.factory.base.registers import Actions, Entities, Agents, Doors, FloorTiles, WallTiles, PlaceHolders
-from environments.utility_classes import MovementProperties, ObservationProperties
+from environments.utility_classes import MovementProperties, ObservationProperties, MarlFrameStack
 from environments.utility_classes import AgentRenderOptions as a_obs
 
 
 import simplejson
 
 
-
 REC_TAC = 'rec_'
@@ -57,7 +56,7 @@ class BaseFactory(gym.Env):
 
     def __enter__(self):
         return self if self.obs_prop.frames_to_stack == 0 else \
-            FrameStack(self, self.obs_prop.frames_to_stack)
+            MarlFrameStack(FrameStack(self, self.obs_prop.frames_to_stack))
 
     def __exit__(self, exc_type, exc_val, exc_tb):
         self.close()
diff --git a/environments/utility_classes.py b/environments/utility_classes.py
index 3069d6d..cdcfd56 100644
--- a/environments/utility_classes.py
+++ b/environments/utility_classes.py
@@ -1,5 +1,6 @@
-from enum import Enum
 from typing import NamedTuple, Union
+import gym
+from gym.wrappers.frame_stack import FrameStack
 
 
 class AgentRenderOptions(object):
@@ -22,3 +23,14 @@ class ObservationProperties(NamedTuple):
     cast_shadows = True
     frames_to_stack: int = 0
     pomdp_r: int = 0
+
+
+class MarlFrameStack(gym.ObservationWrapper):
+    def __init__(self, env):
+        super().__init__(env)
+
+    def observation(self, observation):
+        if isinstance(self.env, FrameStack) and self.env.unwrapped.n_agents > 1:
+            return observation[0:].swapaxes(0, 1)
+        return observation
+
diff --git a/studies/sat_mad.py b/studies/sat_mad.py
index 35ddda5..4380c7d 100644
--- a/studies/sat_mad.py
+++ b/studies/sat_mad.py
@@ -9,7 +9,12 @@ from pathlib import Path
 import numpy as np
 from tqdm import tqdm
 import time
-from algorithms.utils import add_env_props, load_yaml_file, CombineActionsAgent
+from algorithms.utils import (
+    add_env_props,
+    load_yaml_file,
+    CombineActionsAgent,
+    AutoResetGymMultiAgent
+)
 
 
 class A2CAgent(TAgent):
@@ -32,8 +37,8 @@ class A2CAgent(TAgent):
 
     def get_obs(self, t):
         observation = self.get(("env/env_obs", t))
+        print(observation.shape)
         if self.marl:
-            observation = observation.permute(2, 0, 1, 3, 4, 5)
             observation = observation[self.agent_id]
         return observation
 
@@ -57,7 +62,7 @@ if __name__ == '__main__':
     # Setup workspace
    uid = time.time()
     workspace = Workspace()
-    n_agents = 1
+    n_agents = 2
 
     # load config
     cfg = load_yaml_file(Path(__file__).parent / 'sat_mad.yaml')
@@ -65,10 +70,11 @@
     cfg['env'].update({'n_agents': n_agents})
 
     # instantiate agent and env
-    env_agent = AutoResetGymAgent(
+    env_agent = AutoResetGymMultiAgent(
         get_class(cfg['env']),
         get_arguments(cfg['env']),
-        n_envs=1
+        n_envs=1,
+        n_agents=n_agents
     )
 
     a2c_agents = [instantiate_class({**cfg['agent'],
@@ -103,7 +109,8 @@
                 f'agent{agent_id}_action_probs', "env/reward",
                 f"agent{agent_id}_action"
             ]
-            td = gae(critic, reward, done, 0.99, 0.3)
+            reward = reward[agent_id]
+            td = gae(critic, reward, done, 0.98, 0.25)
             td_error = td ** 2
             critic_loss = td_error.mean()
             entropy_loss = Categorical(action_probs).entropy().mean()
@@ -118,11 +125,12 @@
             optimizer = optimizers[agent_id]
             optimizer.zero_grad()
             loss.backward()
-            #torch.nn.utils.clip_grad_norm_(a2c_agents[agent_id].parameters(), 2)
+            #torch.nn.utils.clip_grad_norm_(a2c_agents[agent_id].parameters(), .5)
             optimizer.step()
 
             # Compute the cumulated reward on final_state
-            creward = workspace["env/cumulated_reward"]
+            creward = workspace["env/cumulated_reward"]#[agent_id].unsqueeze(-1)
+            print(creward.shape, done.shape)
             creward = creward[done]
             if creward.size()[0] > 0:
                 cum_r = creward.mean().item()
diff --git a/studies/sat_mad.yaml b/studies/sat_mad.yaml
index 5f7ea46..7e7ca71 100644
--- a/studies/sat_mad.yaml
+++ b/studies/sat_mad.yaml
@@ -5,21 +5,22 @@ agent:
   n_actions: 10
 
 env:
-  classname: environments.factory.make
-  env_name: "DirtyFactory-v0"
-  n_agents: 1
-  pomdp_r: 2
-  max_steps: 400
-  stack_n_frames: 3
+  classname: environments.factory.make
+  env_name: "DirtyFactory-v0"
+  n_agents: 1
+  pomdp_r: 2
+  max_steps: 400
+  stack_n_frames: 3
+  individual_rewards: True
 
 algorithm:
   max_epochs: 1000000
   n_envs: 1
-  n_timesteps: 16
+  n_timesteps: 10
   discount_factor: 0.99
   entropy_coef: 0.01
   critic_coef: 1.0
-  gae: 0.3
+  gae: 0.25
   optimizer:
     classname: torch.optim.Adam
     lr: 0.0003
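
Usage sketch (appended for review, not part of the patch): a minimal, illustrative wiring of the pieces this diff touches, assuming the repository layout above and the salina helpers (get_class, get_arguments) already used in studies/sat_mad.py. The cfg_env dict below is a hypothetical stand-in that mirrors the env section of studies/sat_mad.yaml.

# Illustrative only -- mirrors studies/sat_mad.py and studies/sat_mad.yaml after this change.
from salina import get_class, get_arguments
from algorithms.utils import AutoResetGymMultiAgent
from environments.factory import make

n_agents = 2

# Entering the factory now yields MarlFrameStack(FrameStack(env, 3)); the wrapper
# swaps the first two observation axes when n_agents > 1, which, together with the
# per-agent splitting below, is presumably why permute(2, 0, 1, 3, 4, 5) could be
# dropped from A2CAgent.get_obs.
with make('DirtyFactory-v0', n_agents=n_agents, individual_rewards=True,
          pomdp_r=2, max_steps=400, stack_n_frames=3) as env:
    pass  # env is the MarlFrameStack-wrapped factory here

# Same env, driven through salina: one gym agent that splits observations per
# agent ('agent#<i>_env_obs') and tracks one cumulated reward per agent.
cfg_env = dict(classname='environments.factory.make',
               env_name='DirtyFactory-v0',
               n_agents=n_agents,
               pomdp_r=2,
               max_steps=400,
               stack_n_frames=3,
               individual_rewards=True)
env_agent = AutoResetGymMultiAgent(
    get_class(cfg_env),
    get_arguments(cfg_env),
    n_envs=1,
    n_agents=n_agents,
)

Note that AutoResetGymMultiAgent._step zips the returned reward with a per-agent list, so it expects an iterable of per-agent rewards; that is why the yaml now sets individual_rewards: True.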