mirror of https://github.com/illiumst/marl-factory-grid.git (synced 2025-10-31 04:37:25 +01:00)

	added MarlFrameStack and salina stuff
@@ -1,9 +1,11 @@
import re
import torch
import numpy as np
import yaml
from pathlib import Path
from salina import instantiate_class
from salina import TAgent
from salina.agents.gyma import AutoResetGymAgent, _torch_type, _format_frame


def load_yaml_file(path: Path):
@@ -27,4 +29,67 @@ class CombineActionsAgent(TAgent):
        keys = list(self.workspace.keys())
        action_keys = sorted([k for k in keys if bool(re.match(self.pattern, k))])
        actions = torch.cat([self.get((k, t)) for k in action_keys], 0)
        self.set((f'action', t), actions.unsqueeze(0))
        actions = actions if len(action_keys) <= 1 else actions.unsqueeze(0)
        self.set((f'action', t), actions)


class AutoResetGymMultiAgent(AutoResetGymAgent):
    AGENT_PREFIX = 'agent#'
    REWARD       =  'reward'
    CUMU_REWARD  = 'cumulated_reward'
    SEP          = '_'

    def __init__(self, *args, n_agents, **kwargs):
        super(AutoResetGymMultiAgent, self).__init__(*args, **kwargs)
        self.n_agents = n_agents

    def prefix(self, agent_id, name):
        return f'{self.AGENT_PREFIX}{agent_id}{self.SEP}{name}'

    def _reset(self, k, save_render):
        ret = super()._reset(k, save_render)
        self.cumulated_reward[k] = [0.0]*self.n_agents
        del ret['cumulated_reward']
        cumu_rew = {self.prefix(agent_i, self.CUMU_REWARD): torch.zeros(1).float()
                    for agent_i in range(self.n_agents)}
        rewards  = {self.prefix(agent_i, self.REWARD)     : torch.zeros(1).float()
                    for agent_i in range(self.n_agents)}
        ret.update(cumu_rew)
        ret.update(rewards)
        return ret

    def _step(self, k, action, save_render):
        self.timestep[k] += 1
        env = self.envs[k]
        if len(action.size()) == 0:
            action = action.item()
            assert isinstance(action, int)
        else:
            action = np.array(action.tolist())
        o, r, d, _ = env.step(action)
        self.cumulated_reward[k] = [x+y for x, y in zip(r, self.cumulated_reward[k])]
        print(o.shape)
        observation = _format_frame(o)
        if isinstance(observation, torch.Tensor):
            print(observation.shape)
            observation = {self.prefix(agent_i, 'env_obs'): observation[agent_i]
                           for agent_i in range(self.n_agents)}
            print(observation)
        else:
            assert isinstance(observation, dict)
        if d:
            self.is_running[k] = False

        if save_render:
            image = env.render(mode="image").unsqueeze(0)
            observation["rendering"] = image
        ret = {
            **observation,
            "done": torch.tensor([d]),
            "initial_state": torch.tensor([False]),
            "reward": torch.tensor(r).float(),
            "timestep": torch.tensor([self.timestep[k]]),
            "cumulated_reward": torch.tensor(self.cumulated_reward[k]).float(),
        }
        return _torch_type(ret)

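For orientation, here is a small standalone sketch of the key scheme used by AutoResetGymMultiAgent above: per-agent workspace entries are built with an 'agent#<id>_<name>' prefix. The agent count and reward values below are invented for illustration; only the naming rule mirrors the diff.

import torch

AGENT_PREFIX, SEP = 'agent#', '_'

def prefix(agent_id, name):
    # same naming rule as AutoResetGymMultiAgent.prefix, e.g. 'agent#0_reward'
    return f'{AGENT_PREFIX}{agent_id}{SEP}{name}'

n_agents = 2
joint_reward = [0.5, -1.0]  # hypothetical per-agent rewards returned by env.step
per_agent = {prefix(i, 'reward'): torch.tensor([joint_reward[i]]).float()
             for i in range(n_agents)}
print(sorted(per_agent))  # ['agent#0_reward', 'agent#1_reward']
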
@@ -1,4 +1,4 @@
def make(env_name, n_agents=1, pomdp_r=2, max_steps=400, stack_n_frames=3):
def make(env_name, pomdp_r=2, max_steps=400, stack_n_frames=3, n_agents=1,  individual_rewards=False):
    import yaml
    from pathlib import Path
    from environments.factory.combined_factories import DirtItemFactory
@@ -12,7 +12,8 @@ def make(env_name, n_agents=1, pomdp_r=2, max_steps=400, stack_n_frames=3):
    obs_props = ObservationProperties(render_agents=AgentRenderOptions.COMBINED,
                                      frames_to_stack=stack_n_frames, pomdp_r=pomdp_r)

    factory_kwargs = dict(n_agents=n_agents, max_steps=max_steps, obs_prop=obs_props,
    factory_kwargs = dict(n_agents=n_agents, individual_rewards=individual_rewards,
                          max_steps=max_steps, obs_prop=obs_props,
                          mv_prop=MovementProperties(**dictionary['movement_props']),
                          dirt_prop=DirtProperties(**dictionary['dirt_props']),
                          record_episodes=False, verbose=False, **dictionary['factory_props']

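A hedged usage sketch of the widened make signature: the call below only mirrors the keyword names visible in the diff, and it assumes make returns a ready-to-use gym-style environment with the project's config files in place. The env name and values are examples, not project defaults.

from environments.factory import make

# hypothetical call; keyword names follow the new signature above
env = make('DirtyFactory-v0', pomdp_r=2, max_steps=400,
           stack_n_frames=3, n_agents=2, individual_rewards=True)
obs = env.reset()
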
@@ -15,12 +15,11 @@ from environments.helpers import Constants as c, Constants
from environments import helpers as h
from environments.factory.base.objects import Agent, Tile, Action
from environments.factory.base.registers import Actions, Entities, Agents, Doors, FloorTiles, WallTiles, PlaceHolders
from environments.utility_classes import MovementProperties, ObservationProperties
from environments.utility_classes import MovementProperties, ObservationProperties, MarlFrameStack
from environments.utility_classes import AgentRenderOptions as a_obs

import simplejson


REC_TAC = 'rec_'


@@ -57,7 +56,7 @@ class BaseFactory(gym.Env):

    def __enter__(self):
        return self if self.obs_prop.frames_to_stack == 0 else \
            FrameStack(self, self.obs_prop.frames_to_stack)
            MarlFrameStack(FrameStack(self, self.obs_prop.frames_to_stack))

    def __exit__(self, exc_type, exc_val, exc_tb):
        self.close()

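The __enter__ change above means entering the factory as a context manager with frame stacking enabled now yields MarlFrameStack(FrameStack(factory, n)) instead of a bare FrameStack. The pattern in isolation, with toy stand-in classes (all names here are invented, only the control flow matches the diff):

class FrameStackToy:
    def __init__(self, env, n):
        self.env, self.n = env, n

class MarlWrapToy:
    def __init__(self, env):
        self.env = env

class FactoryToy:
    frames_to_stack = 3
    def __enter__(self):
        # mirrors the new BaseFactory.__enter__: wrapper around the frame stack
        return self if self.frames_to_stack == 0 else \
            MarlWrapToy(FrameStackToy(self, self.frames_to_stack))
    def __exit__(self, exc_type, exc_val, exc_tb):
        pass

with FactoryToy() as env:
    print(type(env).__name__, type(env.env).__name__)  # MarlWrapToy FrameStackToy
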
@@ -1,5 +1,6 @@
from enum import Enum
from typing import NamedTuple, Union
import gym
from gym.wrappers.frame_stack import FrameStack


class AgentRenderOptions(object):
@@ -22,3 +23,14 @@ class ObservationProperties(NamedTuple):
    cast_shadows = True
    frames_to_stack: int = 0
    pomdp_r: int = 0


class MarlFrameStack(gym.ObservationWrapper):
    def __init__(self, env):
        super().__init__(env)

    def observation(self, observation):
        if isinstance(self.env, FrameStack) and self.env.unwrapped.n_agents > 1:
            return observation[0:].swapaxes(0, 1)
        return observation

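A small numpy illustration of the axis swap MarlFrameStack performs, under the assumption that the stacked multi-agent observation is laid out as (n_frames, n_agents, *obs_shape); the concrete sizes are invented.

import numpy as np

n_frames, n_agents = 3, 2
stacked = np.zeros((n_frames, n_agents, 1, 5, 5))  # hypothetical frame-stacked multi-agent obs
per_agent_first = stacked[0:].swapaxes(0, 1)       # same slicing and swap as observation() above
print(per_agent_first.shape)                       # (2, 3, 1, 5, 5): agent axis comes first
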
@@ -9,7 +9,12 @@ from pathlib import Path
import numpy as np
from tqdm import tqdm
import time
from algorithms.utils import add_env_props, load_yaml_file, CombineActionsAgent
from algorithms.utils import (
    add_env_props,
    load_yaml_file,
    CombineActionsAgent,
    AutoResetGymMultiAgent
)


class A2CAgent(TAgent):
@@ -32,8 +37,8 @@ class A2CAgent(TAgent):

    def get_obs(self, t):
        observation = self.get(("env/env_obs", t))
        print(observation.shape)
        if self.marl:
            observation = observation.permute(2, 0, 1, 3, 4, 5)
            observation = observation[self.agent_id]
        return observation

@@ -57,7 +62,7 @@ if __name__ == '__main__':
    # Setup workspace
    uid = time.time()
    workspace = Workspace()
    n_agents = 1
    n_agents = 2

    # load config
    cfg = load_yaml_file(Path(__file__).parent / 'sat_mad.yaml')
@@ -65,10 +70,11 @@ if __name__ == '__main__':
    cfg['env'].update({'n_agents': n_agents})

    # instantiate agent and env
    env_agent = AutoResetGymAgent(
    env_agent = AutoResetGymMultiAgent(
        get_class(cfg['env']),
        get_arguments(cfg['env']),
        n_envs=1
        n_envs=1,
        n_agents=n_agents
    )

    a2c_agents = [instantiate_class({**cfg['agent'],
@@ -103,7 +109,8 @@ if __name__ == '__main__':
                    f'agent{agent_id}_action_probs', "env/reward",
                    f"agent{agent_id}_action"
                ]
                td = gae(critic, reward, done, 0.99, 0.3)
                reward = reward[agent_id]
                td = gae(critic, reward, done, 0.98, 0.25)
                td_error = td ** 2
                critic_loss = td_error.mean()
                entropy_loss = Categorical(action_probs).entropy().mean()
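
For reference, a textbook sketch of the generalized advantage estimation that the gae(critic, reward, done, 0.98, 0.25) call above stands for; this is not salina's implementation, and the tensor layout (a single trajectory, values of length T+1, dones marking a terminal next state) is an assumption.

import torch

def gae_sketch(values, rewards, dones, discount=0.98, gae_lambda=0.25):
    # values: (T+1,), rewards/dones: (T,); returns advantages of shape (T,)
    T = rewards.shape[0]
    advantages = torch.zeros(T)
    last_adv = torch.zeros(())
    for t in reversed(range(T)):
        not_done = 1.0 - dones[t].float()
        delta = rewards[t] + discount * values[t + 1] * not_done - values[t]
        last_adv = delta + discount * gae_lambda * not_done * last_adv
        advantages[t] = last_adv
    return advantages

adv = gae_sketch(torch.rand(11), torch.rand(10), torch.zeros(10))
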
@@ -118,11 +125,12 @@ if __name__ == '__main__':
                optimizer = optimizers[agent_id]
                optimizer.zero_grad()
                loss.backward()
                #torch.nn.utils.clip_grad_norm_(a2c_agents[agent_id].parameters(), 2)
                #torch.nn.utils.clip_grad_norm_(a2c_agents[agent_id].parameters(), .5)
                optimizer.step()

                # Compute the cumulated reward on final_state
                creward = workspace["env/cumulated_reward"]
                creward = workspace["env/cumulated_reward"]#[agent_id].unsqueeze(-1)
                print(creward.shape, done.shape)
                creward = creward[done]
                if creward.size()[0] > 0:
                    cum_r = creward.mean().item()

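The last hunk gathers cumulated rewards only at terminal steps; a minimal sketch of that boolean masking with made-up tensors, assuming creward and done share a (time, n_envs) layout.

import torch

creward = torch.tensor([[0.5], [1.0], [1.5]])       # hypothetical cumulated rewards, shape (T, n_envs)
done    = torch.tensor([[False], [False], [True]])  # the episode terminates at the last step
final_rewards = creward[done]                       # keeps only entries where done is True
if final_rewards.size()[0] > 0:
    print(final_rewards.mean().item())              # 1.5
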
@@ -5,21 +5,22 @@ agent:
  n_actions:        10

env:
  classname:      environments.factory.make
  env_name:       "DirtyFactory-v0"
  n_agents:       1
  pomdp_r:        2
  max_steps:      400
  stack_n_frames: 3
  classname:          environments.factory.make
  env_name:           "DirtyFactory-v0"
  n_agents:           1
  pomdp_r:            2
  max_steps:          400
  stack_n_frames:     3
  individual_rewards: True

algorithm:
  max_epochs:             1000000
  n_envs:                 1
  n_timesteps:            16
  n_timesteps:            10
  discount_factor:        0.99
  entropy_coef:           0.01
  critic_coef:            1.0
  gae:                    0.3
  gae:                    0.25
  optimizer:
    classname:            torch.optim.Adam
    lr:                   0.0003

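A hedged sketch of how the env block above typically becomes keyword arguments for make; yaml.safe_load is standard PyYAML usage, while the file path and the step of dropping 'classname' are assumptions about how the training script consumes the config.

import yaml
from pathlib import Path

cfg = yaml.safe_load(Path('sat_mad.yaml').read_text())  # hypothetical path next to the script
env_kwargs = {k: v for k, v in cfg['env'].items() if k != 'classname'}
print(env_kwargs['individual_rewards'], env_kwargs['stack_n_frames'])  # True 3
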