Add MarlFrameStack observation wrapper and salina multi-agent support

Robert Müller 2021-11-23 14:03:52 +01:00
parent 59484f49c9
commit 5c15bb2ddf
6 changed files with 109 additions and 23 deletions

View File

@@ -1,9 +1,11 @@
 import re
 import torch
+import numpy as np
 import yaml
 from pathlib import Path
 from salina import instantiate_class
 from salina import TAgent
+from salina.agents.gyma import AutoResetGymAgent, _torch_type, _format_frame


 def load_yaml_file(path: Path):
@@ -27,4 +29,67 @@ class CombineActionsAgent(TAgent):
         keys = list(self.workspace.keys())
         action_keys = sorted([k for k in keys if bool(re.match(self.pattern, k))])
         actions = torch.cat([self.get((k, t)) for k in action_keys], 0)
-        self.set((f'action', t), actions.unsqueeze(0))
+        actions = actions if len(action_keys) <= 1 else actions.unsqueeze(0)
+        self.set((f'action', t), actions)
+
+
+class AutoResetGymMultiAgent(AutoResetGymAgent):
+    AGENT_PREFIX = 'agent#'
+    REWARD = 'reward'
+    CUMU_REWARD = 'cumulated_reward'
+    SEP = '_'
+
+    def __init__(self, *args, n_agents, **kwargs):
+        super(AutoResetGymMultiAgent, self).__init__(*args, **kwargs)
+        self.n_agents = n_agents
+
+    def prefix(self, agent_id, name):
+        return f'{self.AGENT_PREFIX}{agent_id}{self.SEP}{name}'
+
+    def _reset(self, k, save_render):
+        ret = super()._reset(k, save_render)
+        self.cumulated_reward[k] = [0.0] * self.n_agents
+        del ret['cumulated_reward']
+        cumu_rew = {self.prefix(agent_i, self.CUMU_REWARD): torch.zeros(1).float()
+                    for agent_i in range(self.n_agents)}
+        rewards = {self.prefix(agent_i, self.REWARD): torch.zeros(1).float()
+                   for agent_i in range(self.n_agents)}
+        ret.update(cumu_rew)
+        ret.update(rewards)
+        return ret
+
+    def _step(self, k, action, save_render):
+        self.timestep[k] += 1
+        env = self.envs[k]
+        if len(action.size()) == 0:
+            action = action.item()
+            assert isinstance(action, int)
+        else:
+            action = np.array(action.tolist())
+        o, r, d, _ = env.step(action)
+        self.cumulated_reward[k] = [x + y for x, y in zip(r, self.cumulated_reward[k])]
+        print(o.shape)
+        observation = _format_frame(o)
+        if isinstance(observation, torch.Tensor):
+            print(observation.shape)
+            observation = {self.prefix(agent_i, 'env_obs'): observation[agent_i]
+                           for agent_i in range(self.n_agents)}
+            print(observation)
+        else:
+            assert isinstance(observation, dict)
+        if d:
+            self.is_running[k] = False
+        if save_render:
+            image = env.render(mode="image").unsqueeze(0)
+            observation["rendering"] = image
+        ret = {
+            **observation,
+            "done": torch.tensor([d]),
+            "initial_state": torch.tensor([False]),
+            "reward": torch.tensor(r).float(),
+            "timestep": torch.tensor([self.timestep[k]]),
+            "cumulated_reward": torch.tensor(self.cumulated_reward[k]).float(),
+        }
+        return _torch_type(ret)
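Note: a minimal sketch of the per-agent key scheme the new AutoResetGymMultiAgent writes into the salina workspace; the two-agent setup below is an assumption for illustration, not part of the commit.

    import torch

    # Mirrors prefix(): AGENT_PREFIX + agent_id + SEP + name -> 'agent#0_reward', ...
    def prefix(agent_id, name, agent_prefix='agent#', sep='_'):
        return f'{agent_prefix}{agent_id}{sep}{name}'

    n_agents = 2  # assumed
    reset_entries = {prefix(i, 'cumulated_reward'): torch.zeros(1).float() for i in range(n_agents)}
    reset_entries.update({prefix(i, 'reward'): torch.zeros(1).float() for i in range(n_agents)})
    print(sorted(reset_entries))
    # ['agent#0_cumulated_reward', 'agent#0_reward', 'agent#1_cumulated_reward', 'agent#1_reward']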

View File

@@ -1,4 +1,4 @@
-def make(env_name, n_agents=1, pomdp_r=2, max_steps=400, stack_n_frames=3):
+def make(env_name, pomdp_r=2, max_steps=400, stack_n_frames=3, n_agents=1, individual_rewards=False):
     import yaml
     from pathlib import Path
     from environments.factory.combined_factories import DirtItemFactory
@@ -12,7 +12,8 @@ def make(env_name, n_agents=1, pomdp_r=2, max_steps=400, stack_n_frames=3):
     obs_props = ObservationProperties(render_agents=AgentRenderOptions.COMBINED,
                                       frames_to_stack=stack_n_frames, pomdp_r=pomdp_r)
-    factory_kwargs = dict(n_agents=n_agents, max_steps=max_steps, obs_prop=obs_props,
+    factory_kwargs = dict(n_agents=n_agents, individual_rewards=individual_rewards,
+                          max_steps=max_steps, obs_prop=obs_props,
                           mv_prop=MovementProperties(**dictionary['movement_props']),
                           dirt_prop=DirtProperties(**dictionary['dirt_props']),
                           record_episodes=False, verbose=False, **dictionary['factory_props']
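Note: a hedged usage example of the updated make() signature. The import path follows the classname entry in the YAML config below (environments.factory.make), and the argument values are assumptions.

    # Hypothetical call; individual_rewards is the keyword added in this commit.
    from environments.factory import make

    env = make('DirtyFactory-v0', pomdp_r=2, max_steps=400, stack_n_frames=3,
               n_agents=2, individual_rewards=True)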

View File

@@ -15,12 +15,11 @@ from environments.helpers import Constants as c, Constants
 from environments import helpers as h
 from environments.factory.base.objects import Agent, Tile, Action
 from environments.factory.base.registers import Actions, Entities, Agents, Doors, FloorTiles, WallTiles, PlaceHolders
-from environments.utility_classes import MovementProperties, ObservationProperties
+from environments.utility_classes import MovementProperties, ObservationProperties, MarlFrameStack
 from environments.utility_classes import AgentRenderOptions as a_obs

 import simplejson

 REC_TAC = 'rec_'
@@ -57,7 +56,7 @@ class BaseFactory(gym.Env):

     def __enter__(self):
         return self if self.obs_prop.frames_to_stack == 0 else \
-            FrameStack(self, self.obs_prop.frames_to_stack)
+            MarlFrameStack(FrameStack(self, self.obs_prop.frames_to_stack))

     def __exit__(self, exc_type, exc_val, exc_tb):
         self.close()

View File

@@ -1,5 +1,6 @@
-from enum import Enum
 from typing import NamedTuple, Union
+import gym
+from gym.wrappers.frame_stack import FrameStack


 class AgentRenderOptions(object):
@@ -22,3 +23,14 @@ class ObservationProperties(NamedTuple):
     cast_shadows = True
     frames_to_stack: int = 0
     pomdp_r: int = 0
+
+
+class MarlFrameStack(gym.ObservationWrapper):
+    def __init__(self, env):
+        super().__init__(env)
+
+    def observation(self, observation):
+        if isinstance(self.env, FrameStack) and self.env.unwrapped.n_agents > 1:
+            return observation[0:].swapaxes(0, 1)
+        return observation
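Note: a rough shape sketch of what MarlFrameStack does on top of gym's FrameStack; in BaseFactory.__enter__ above the two are composed as MarlFrameStack(FrameStack(env, frames_to_stack)). The concrete sizes and the (stack, n_agents, h, w) layout are assumptions for illustration.

    import numpy as np

    stack, n_agents, h, w = 3, 2, 5, 5            # assumed sizes
    stacked = np.zeros((stack, n_agents, h, w))   # layout FrameStack is assumed to emit per step
    marl_view = stacked.swapaxes(0, 1)            # what observation() returns when n_agents > 1
    print(marl_view.shape)                        # (2, 3, 5, 5): agent axis first, then the frame stack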

View File

@@ -9,7 +9,12 @@ from pathlib import Path
 import numpy as np
 from tqdm import tqdm
 import time
-from algorithms.utils import add_env_props, load_yaml_file, CombineActionsAgent
+from algorithms.utils import (
+    add_env_props,
+    load_yaml_file,
+    CombineActionsAgent,
+    AutoResetGymMultiAgent
+)


 class A2CAgent(TAgent):
@@ -32,8 +37,8 @@ class A2CAgent(TAgent):

     def get_obs(self, t):
         observation = self.get(("env/env_obs", t))
-        print(observation.shape)
         if self.marl:
+            observation = observation.permute(2, 0, 1, 3, 4, 5)
             observation = observation[self.agent_id]
         return observation
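Note: a shape sketch of the permute in get_obs, under the assumption that the stacked observation arrives from the workspace as (n_envs, stack, n_agents, channels, height, width); all sizes are illustrative.

    import torch

    n_envs, stack, n_agents, c, h, w = 1, 3, 2, 1, 5, 5   # assumed layout and sizes
    obs = torch.zeros(n_envs, stack, n_agents, c, h, w)
    per_agent = obs.permute(2, 0, 1, 3, 4, 5)              # move the agent axis to the front
    agent_obs = per_agent[0]                               # what get_obs returns for agent_id == 0
    print(per_agent.shape, agent_obs.shape)                # [2, 1, 3, 1, 5, 5] and [1, 3, 1, 5, 5]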
@@ -57,7 +62,7 @@ if __name__ == '__main__':
     # Setup workspace
     uid = time.time()
     workspace = Workspace()
-    n_agents = 1
+    n_agents = 2

     # load config
     cfg = load_yaml_file(Path(__file__).parent / 'sat_mad.yaml')
@@ -65,10 +70,11 @@ if __name__ == '__main__':
     cfg['env'].update({'n_agents': n_agents})

     # instantiate agent and env
-    env_agent = AutoResetGymAgent(
+    env_agent = AutoResetGymMultiAgent(
         get_class(cfg['env']),
         get_arguments(cfg['env']),
-        n_envs=1
+        n_envs=1,
+        n_agents=n_agents
     )

     a2c_agents = [instantiate_class({**cfg['agent'],
a2c_agents = [instantiate_class({**cfg['agent'], a2c_agents = [instantiate_class({**cfg['agent'],
@@ -103,7 +109,8 @@ if __name__ == '__main__':
                 f'agent{agent_id}_action_probs', "env/reward",
                 f"agent{agent_id}_action"
             ]
-            td = gae(critic, reward, done, 0.99, 0.3)
+            reward = reward[agent_id]
+            td = gae(critic, reward, done, 0.98, 0.25)
             td_error = td ** 2
             critic_loss = td_error.mean()
             entropy_loss = Categorical(action_probs).entropy().mean()
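Note: the last two gae() arguments are the discount factor and the GAE lambda (0.99/0.3 before, 0.98/0.25 after this commit). Below is a generic sketch of the recursion those two parameters control, not necessarily salina's implementation.

    import torch

    def gae_sketch(values, rewards, dones, discount=0.98, lam=0.25):
        # delta_t = r_t + discount * V_{t+1} * (1 - done_t) - V_t
        # A_t     = delta_t + discount * lam * (1 - done_t) * A_{t+1}
        T = rewards.shape[0]
        advantages = torch.zeros_like(rewards)
        next_adv = torch.zeros_like(rewards[0])
        for t in reversed(range(T)):
            not_done = 1.0 - dones[t].float()
            next_value = values[t + 1] if t + 1 < T else torch.zeros_like(values[t])
            delta = rewards[t] + discount * next_value * not_done - values[t]
            next_adv = delta + discount * lam * not_done * next_adv
            advantages[t] = next_adv
        return advantages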
@@ -118,11 +125,12 @@ if __name__ == '__main__':
             optimizer = optimizers[agent_id]
             optimizer.zero_grad()
             loss.backward()
-            #torch.nn.utils.clip_grad_norm_(a2c_agents[agent_id].parameters(), 2)
+            #torch.nn.utils.clip_grad_norm_(a2c_agents[agent_id].parameters(), .5)
             optimizer.step()

         # Compute the cumulated reward on final_state
-        creward = workspace["env/cumulated_reward"]
+        creward = workspace["env/cumulated_reward"]#[agent_id].unsqueeze(-1)
+        print(creward.shape, done.shape)
         creward = creward[done]
         if creward.size()[0] > 0:
             cum_r = creward.mean().item()
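Note: a tiny sketch of the boolean masking used for the logged episode return, with made-up values; creward and done are assumed to share a (time, n_envs) layout in the workspace.

    import torch

    creward = torch.tensor([[0.0], [1.5], [3.0]])
    done = torch.tensor([[False], [False], [True]])
    finished = creward[done]               # cumulated rewards at terminal steps only
    if finished.size()[0] > 0:
        print(finished.mean().item())      # 3.0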

View File

@@ -5,21 +5,22 @@ agent:
   n_actions: 10

 env:
   classname: environments.factory.make
   env_name: "DirtyFactory-v0"
   n_agents: 1
   pomdp_r: 2
   max_steps: 400
   stack_n_frames: 3
+  individual_rewards: True

 algorithm:
   max_epochs: 1000000
   n_envs: 1
-  n_timesteps: 16
+  n_timesteps: 10
   discount_factor: 0.99
   entropy_coef: 0.01
   critic_coef: 1.0
-  gae: 0.3
+  gae: 0.25

 optimizer:
   classname: torch.optim.Adam
   lr: 0.0003
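Note: a short sketch of how this config is consumed by the training script above; get_class and get_arguments are the salina helpers the script already uses, and the n_agents override mirrors the diff. That get_arguments returns everything except the classname entry is an assumption here.

    from pathlib import Path
    import yaml
    from salina import get_arguments, get_class

    cfg = yaml.safe_load(Path('sat_mad.yaml').read_text())
    cfg['env'].update({'n_agents': 2})      # overridden at runtime, as in the script
    env_class = get_class(cfg['env'])       # resolves environments.factory.make
    env_kwargs = get_arguments(cfg['env'])  # remaining keys passed to make()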