added MarlFrameStack and salina stuff

Robert Müller 2021-11-23 14:03:52 +01:00
parent 59484f49c9
commit 5c15bb2ddf
6 changed files with 109 additions and 23 deletions

View File

@@ -1,9 +1,11 @@
import re
import torch
import numpy as np
import yaml
from pathlib import Path
from salina import instantiate_class
from salina import TAgent
from salina.agents.gyma import AutoResetGymAgent, _torch_type, _format_frame
def load_yaml_file(path: Path):
@@ -27,4 +29,67 @@ class CombineActionsAgent(TAgent):
        keys = list(self.workspace.keys())
        action_keys = sorted([k for k in keys if bool(re.match(self.pattern, k))])
        actions = torch.cat([self.get((k, t)) for k in action_keys], 0)
        self.set((f'action', t), actions.unsqueeze(0))
        actions = actions if len(action_keys) <= 1 else actions.unsqueeze(0)
        self.set((f'action', t), actions)


class AutoResetGymMultiAgent(AutoResetGymAgent):
    AGENT_PREFIX = 'agent#'
    REWARD = 'reward'
    CUMU_REWARD = 'cumulated_reward'
    SEP = '_'

    def __init__(self, *args, n_agents, **kwargs):
        super(AutoResetGymMultiAgent, self).__init__(*args, **kwargs)
        self.n_agents = n_agents

    def prefix(self, agent_id, name):
        return f'{self.AGENT_PREFIX}{agent_id}{self.SEP}{name}'

    def _reset(self, k, save_render):
        ret = super()._reset(k, save_render)
        self.cumulated_reward[k] = [0.0]*self.n_agents
        del ret['cumulated_reward']
        cumu_rew = {self.prefix(agent_i, self.CUMU_REWARD): torch.zeros(1).float()
                    for agent_i in range(self.n_agents)}
        rewards = {self.prefix(agent_i, self.REWARD): torch.zeros(1).float()
                   for agent_i in range(self.n_agents)}
        ret.update(cumu_rew)
        ret.update(rewards)
        return ret

    def _step(self, k, action, save_render):
        self.timestep[k] += 1
        env = self.envs[k]
        if len(action.size()) == 0:
            action = action.item()
            assert isinstance(action, int)
        else:
            action = np.array(action.tolist())
        o, r, d, _ = env.step(action)
        self.cumulated_reward[k] = [x+y for x, y in zip(r, self.cumulated_reward[k])]
        print(o.shape)
        observation = _format_frame(o)
        if isinstance(observation, torch.Tensor):
            print(observation.shape)
            observation = {self.prefix(agent_i, 'env_obs'): observation[agent_i]
                           for agent_i in range(self.n_agents)}
            print(observation)
        else:
            assert isinstance(observation, dict)
        if d:
            self.is_running[k] = False
        if save_render:
            image = env.render(mode="image").unsqueeze(0)
            observation["rendering"] = image
        ret = {
            **observation,
            "done": torch.tensor([d]),
            "initial_state": torch.tensor([False]),
            "reward": torch.tensor(r).float(),
            "timestep": torch.tensor([self.timestep[k]]),
            "cumulated_reward": torch.tensor(self.cumulated_reward[k]).float(),
        }
        return _torch_type(ret)
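Aside (not part of this commit): the wrapper above publishes one reward and one cumulated reward per agent under keys built by prefix(), i.e. 'agent#<i>_<name>'. A minimal, self-contained sketch of that naming scheme; per_agent_entries is a hypothetical helper, not code from the repository:

import torch

AGENT_PREFIX, SEP = 'agent#', '_'

def per_agent_entries(name, values):
    # Split one joint per-step vector into 'agent#<i>_<name>' keyed tensors,
    # mirroring how _reset/_step above lay out per-agent variables.
    return {f'{AGENT_PREFIX}{i}{SEP}{name}': torch.tensor([v]).float()
            for i, v in enumerate(values)}

# e.g. two agents -> {'agent#0_reward': tensor([0.5000]), 'agent#1_reward': tensor([-1.])}
print(per_agent_entries('reward', [0.5, -1.0]))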

View File

@@ -1,4 +1,4 @@
def make(env_name, n_agents=1, pomdp_r=2, max_steps=400, stack_n_frames=3):
def make(env_name, pomdp_r=2, max_steps=400, stack_n_frames=3, n_agents=1, individual_rewards=False):
    import yaml
    from pathlib import Path
    from environments.factory.combined_factories import DirtItemFactory
@@ -12,7 +12,8 @@ def make(env_name, n_agents=1, pomdp_r=2, max_steps=400, stack_n_frames=3):
    obs_props = ObservationProperties(render_agents=AgentRenderOptions.COMBINED,
                                      frames_to_stack=stack_n_frames, pomdp_r=pomdp_r)
    factory_kwargs = dict(n_agents=n_agents, max_steps=max_steps, obs_prop=obs_props,
    factory_kwargs = dict(n_agents=n_agents, individual_rewards=individual_rewards,
                          max_steps=max_steps, obs_prop=obs_props,
                          mv_prop=MovementProperties(**dictionary['movement_props']),
                          dirt_prop=DirtProperties(**dictionary['dirt_props']),
                          record_episodes=False, verbose=False, **dictionary['factory_props']
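Aside (not part of this commit): with the extended signature, individual_rewards has to be threaded through by whatever builds the factory. A hypothetical direct call, assuming the repository's packages are importable; the keyword values mirror the sat_mad.yaml entries shown further down, nothing here is new API:

from environments.factory import make

# 'DirtyFactory-v0' and the keyword values are taken from the config in this commit.
factory = make('DirtyFactory-v0', pomdp_r=2, max_steps=400,
               stack_n_frames=3, n_agents=2, individual_rewards=True)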

View File

@@ -15,12 +15,11 @@ from environments.helpers import Constants as c, Constants
from environments import helpers as h
from environments.factory.base.objects import Agent, Tile, Action
from environments.factory.base.registers import Actions, Entities, Agents, Doors, FloorTiles, WallTiles, PlaceHolders
from environments.utility_classes import MovementProperties, ObservationProperties
from environments.utility_classes import MovementProperties, ObservationProperties, MarlFrameStack
from environments.utility_classes import AgentRenderOptions as a_obs
import simplejson
REC_TAC = 'rec_'
@@ -57,7 +56,7 @@ class BaseFactory(gym.Env):
    def __enter__(self):
        return self if self.obs_prop.frames_to_stack == 0 else \
            FrameStack(self, self.obs_prop.frames_to_stack)
            MarlFrameStack(FrameStack(self, self.obs_prop.frames_to_stack))

    def __exit__(self, exc_type, exc_val, exc_tb):
        self.close()
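Aside (not part of this commit): the change above only affects the wrapping order when the factory is used as a context manager, with gym's FrameStack innermost and MarlFrameStack outermost. A small sketch of the equivalent manual wrapping; wrap_for_marl is a hypothetical helper, the two imports resolve as in this commit:

from gym.wrappers.frame_stack import FrameStack
from environments.utility_classes import MarlFrameStack

def wrap_for_marl(factory, frames_to_stack):
    # frames_to_stack == 0 leaves the env untouched, matching __enter__ above.
    if frames_to_stack == 0:
        return factory
    return MarlFrameStack(FrameStack(factory, frames_to_stack))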

View File

@@ -1,5 +1,6 @@
from enum import Enum
from typing import NamedTuple, Union
import gym
from gym.wrappers.frame_stack import FrameStack
class AgentRenderOptions(object):
@@ -22,3 +23,14 @@ class ObservationProperties(NamedTuple):
    cast_shadows = True
    frames_to_stack: int = 0
    pomdp_r: int = 0


class MarlFrameStack(gym.ObservationWrapper):
    def __init__(self, env):
        super().__init__(env)

    def observation(self, observation):
        if isinstance(self.env, FrameStack) and self.env.unwrapped.n_agents > 1:
            return observation[0:].swapaxes(0, 1)
        return observation
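Aside (not part of this commit): a numpy illustration of the swapaxes call above, assuming FrameStack delivers observations laid out as (n_frames, n_agents, *obs_shape); the wrapper then hands each agent its own frame stack:

import numpy as np

stacked = np.zeros((3, 2, 5, 5))      # 3 stacked frames, 2 agents, 5x5 observations
per_agent = stacked.swapaxes(0, 1)    # -> (2, 3, 5, 5): agent axis first
assert per_agent.shape == (2, 3, 5, 5)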

View File

@@ -9,7 +9,12 @@ from pathlib import Path
import numpy as np
from tqdm import tqdm
import time
from algorithms.utils import add_env_props, load_yaml_file, CombineActionsAgent
from algorithms.utils import (
    add_env_props,
    load_yaml_file,
    CombineActionsAgent,
    AutoResetGymMultiAgent
)
class A2CAgent(TAgent):
@@ -32,8 +37,8 @@ class A2CAgent(TAgent):
    def get_obs(self, t):
        observation = self.get(("env/env_obs", t))
        print(observation.shape)
        if self.marl:
            observation = observation.permute(2, 0, 1, 3, 4, 5)
            observation = observation[self.agent_id]
        return observation
@@ -57,7 +62,7 @@ if __name__ == '__main__':
    # Setup workspace
    uid = time.time()
    workspace = Workspace()
    n_agents = 1
    n_agents = 2

    # load config
    cfg = load_yaml_file(Path(__file__).parent / 'sat_mad.yaml')
@@ -65,10 +70,11 @@
    cfg['env'].update({'n_agents': n_agents})

    # instantiate agent and env
    env_agent = AutoResetGymAgent(
    env_agent = AutoResetGymMultiAgent(
        get_class(cfg['env']),
        get_arguments(cfg['env']),
        n_envs=1
        n_envs=1,
        n_agents=n_agents
    )

    a2c_agents = [instantiate_class({**cfg['agent'],
@@ -103,7 +109,8 @@
                f'agent{agent_id}_action_probs', "env/reward",
                f"agent{agent_id}_action"
            ]
            td = gae(critic, reward, done, 0.99, 0.3)
            reward = reward[agent_id]
            td = gae(critic, reward, done, 0.98, 0.25)
            td_error = td ** 2
            critic_loss = td_error.mean()
            entropy_loss = Categorical(action_probs).entropy().mean()
@@ -118,11 +125,12 @@
            optimizer = optimizers[agent_id]
            optimizer.zero_grad()
            loss.backward()
            #torch.nn.utils.clip_grad_norm_(a2c_agents[agent_id].parameters(), 2)
            #torch.nn.utils.clip_grad_norm_(a2c_agents[agent_id].parameters(), .5)
            optimizer.step()

            # Compute the cumulated reward on final_state
            creward = workspace["env/cumulated_reward"]
            creward = workspace["env/cumulated_reward"]#[agent_id].unsqueeze(-1)
            print(creward.shape, done.shape)
            creward = creward[done]
            if creward.size()[0] > 0:
                cum_r = creward.mean().item()
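Aside (not part of this commit): the gae(critic, reward, done, 0.98, 0.25) call above is salina's generalized advantage estimation; its exact signature is salina's and is not reproduced here. A standalone sketch of the computation, assuming (T, n_envs) tensors and that the last two arguments are the discount and the GAE lambda:

import torch

def gae_sketch(values, rewards, dones, discount=0.98, lam=0.25):
    # dones is a bool tensor; rewards[t + 1] belongs to the transition t -> t + 1.
    T = rewards.size(0)
    advantages = torch.zeros_like(rewards)
    running = torch.zeros_like(rewards[0])
    for t in reversed(range(T - 1)):
        not_done = (~dones[t + 1]).float()
        delta = rewards[t + 1] + discount * values[t + 1] * not_done - values[t]
        running = delta + discount * lam * not_done * running
        advantages[t] = running
    return advantages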

View File

@@ -5,21 +5,22 @@ agent:
  n_actions: 10

env:
  classname: environments.factory.make
  env_name: "DirtyFactory-v0"
  n_agents: 1
  pomdp_r: 2
  max_steps: 400
  stack_n_frames: 3
  classname: environments.factory.make
  env_name: "DirtyFactory-v0"
  n_agents: 1
  pomdp_r: 2
  max_steps: 400
  stack_n_frames: 3
  individual_rewards: True

algorithm:
  max_epochs: 1000000
  n_envs: 1
  n_timesteps: 16
  n_timesteps: 10
  discount_factor: 0.99
  entropy_coef: 0.01
  critic_coef: 1.0
  gae: 0.3
  gae: 0.25
  optimizer:
    classname: torch.optim.Adam
    lr: 0.0003
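Aside (not part of this commit): salina resolves each classname entry above by dotted import, so env.classname points at environments.factory.make and the remaining keys become keyword arguments. A minimal loading sketch, assuming salina's get_class/get_arguments helpers (already used in the training script above) and the sat_mad.yaml path from this commit:

from pathlib import Path
import yaml
from salina import get_class, get_arguments

cfg = yaml.safe_load((Path(__file__).parent / 'sat_mad.yaml').read_text())
cfg['env'].update({'n_agents': 2})             # same override as in the training script
env_fn = get_class(cfg['env'])                 # -> environments.factory.make
factory = env_fn(**get_arguments(cfg['env']))  # kwargs: env_name, pomdp_r, ..., individual_rewards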