diff --git a/environments/factory/base_factory.py b/environments/factory/base_factory.py
index 0fa06ae..db05acd 100644
--- a/environments/factory/base_factory.py
+++ b/environments/factory/base_factory.py
@@ -7,20 +7,15 @@
 import numpy as np
 from gym import spaces
 import yaml
+from gym.wrappers import FrameStack

 from environments import helpers as h
-from environments.utility_classes import Actions, StateSlice, AgentState, MovementProperties
+from environments.utility_classes import Actions, StateSlice, AgentState, MovementProperties, Zones


 # noinspection PyAttributeOutsideInit
 class BaseFactory(gym.Env):

-    def __setattr__(self, key, value):
-        if isinstance(value, dict):
-            super(BaseFactory, self).__setattr__(key, Namespace(**value))
-        else:
-            super(BaseFactory, self).__setattr__(key, value)
-
     @property
     def action_space(self):
         return spaces.Discrete(self._actions.n)
@@ -40,11 +35,20 @@ class BaseFactory(gym.Env):
     def movement_actions(self):
         return self._actions.movement_actions

+    def __enter__(self):
+        return self if self.frames_to_stack == 0 else FrameStack(self, self.frames_to_stack)
+
+    def __exit__(self, exc_type, exc_val, exc_tb):
+        self.close()
+
     def __init__(self, level_name='simple', n_agents=1, max_steps=int(5e2), pomdp_radius: Union[None, int] = 0,
                  movement_properties: MovementProperties = MovementProperties(),
-                 combin_agent_slices_in_obs: bool = False,
+                 combin_agent_slices_in_obs: bool = False, frames_to_stack=0,
                  omit_agent_slice_in_obs=False, **kwargs):
-        assert combin_agent_slices_in_obs != omit_agent_slice_in_obs, 'Both options are exclusive'
+        assert (combin_agent_slices_in_obs != omit_agent_slice_in_obs) or \
+               (not combin_agent_slices_in_obs and not omit_agent_slice_in_obs), \
+            'Both options are exclusive'
+        assert frames_to_stack != 1 and frames_to_stack >= 0, "'frames_to_stack' cannot be negative or 1."

         self.movement_properties = movement_properties
         self.level_name = level_name
@@ -54,17 +58,19 @@ class BaseFactory(gym.Env):
         self.pomdp_radius = pomdp_radius
         self.combin_agent_slices_in_obs = combin_agent_slices_in_obs
         self.omit_agent_slice_in_obs = omit_agent_slice_in_obs
+        self.frames_to_stack = frames_to_stack
         self.done_at_collision = False

         _actions = Actions(self.movement_properties)
         self._actions = _actions + self.additional_actions
-        self._level = h.one_hot_level(
-            h.parse_level(Path(__file__).parent / h.LEVELS_DIR / f'{self.level_name}.txt')
-        )
+        level_filepath = Path(__file__).parent / h.LEVELS_DIR / f'{self.level_name}.txt'
+        parsed_level = h.parse_level(level_filepath)
+        self._level = h.one_hot_level(parsed_level)
         self._state_slices = StateSlice(n_agents)
         if 'additional_slices' in kwargs:
             self._state_slices.register_additional_items(kwargs.get('additional_slices'))
+        self._zones = Zones(parsed_level)

         self.reset()

     @property
@@ -259,7 +265,7 @@ class BaseFactory(gym.Env):
         # d = {key: val._asdict() if hasattr(val, '_asdict') else val for key, val in self.__dict__.items()
         d = {key: val for key, val in self.__dict__.items() if not key.startswith('_') and not key.startswith('__')}
         filepath.parent.mkdir(parents=True, exist_ok=True)
-
+        super(BaseFactory, self).save_params()
         with filepath.open('w') as f:
             yaml.dump(d, f)
             # pickle.dump(d, f, protocol=pickle.HIGHEST_PROTOCOL)
diff --git a/environments/factory/levels/rooms.txt b/environments/factory/levels/rooms.txt
index 83d2e9c..43e8193 100644
--- a/environments/factory/levels/rooms.txt
+++ b/environments/factory/levels/rooms.txt
@@ -1,13 +1,13 @@
 ###############
-#------#------#
-#---#--#------#
-#--------#----#
-#------#------#
-#------#------#
-###-#######-###
-#----##-------#
-#-----#----#--#
-#-------------#
-#-----#-------#
-#-----#-------#
+#333x33#444444#
+#333#33#444444#
+#333333xx#4444#
+#333333#444444#
+#333333#444444#
+###x#######x###
+#1111##2222222#
+#11111#2222#22#
+#11111x2222222#
+#11111#2222222#
+#11111#2222222#
 ###############
\ No newline at end of file
diff --git a/environments/factory/simple_factory.py b/environments/factory/simple_factory.py
index 21b45ab..22e7a4b 100644
--- a/environments/factory/simple_factory.py
+++ b/environments/factory/simple_factory.py
@@ -8,7 +8,7 @@
 from environments.factory.base_factory import BaseFactory
 from environments import helpers as h
 from environments.factory.renderer import Renderer, Entity
-from environments.utility_classes import AgentState, MovementProperties
+from environments.utility_classes import AgentState, MovementProperties, Register

 DIRT_INDEX = -1
 CLEAN_UP_ACTION = 'clean_up'
@@ -39,8 +39,8 @@ class SimpleFactory(BaseFactory):
         self.dirt_properties = dirt_properties
         self.verbose = verbose
         self.max_dirt = 20
-        super(SimpleFactory, self).__init__(*args, additional_slices='dirt', **kwargs)
         self._renderer = None  # expensive - don't use it when not required !
+        super(SimpleFactory, self).__init__(*args, additional_slices='dirt', **kwargs)

     def render(self):
@@ -79,7 +79,6 @@ class SimpleFactory(BaseFactory):
             for x, y in free_for_dirt[:n_dirt_tiles]:
                 new_value = self._state[DIRT_INDEX, x, y] + self.dirt_properties.gain_amount
                 self._state[DIRT_INDEX, x, y] = max(new_value, self.dirt_properties.max_local_amount)
-
         else:
             pass

@@ -126,10 +125,11 @@ class SimpleFactory(BaseFactory):
         return obs

     def calculate_reward(self, agent_states: List[AgentState]) -> (int, dict):
-        # TODO: What reward to use?
+        info_dict = dict()
         current_dirt_amount = self._state[DIRT_INDEX].sum()
         dirty_tiles = np.argwhere(self._state[DIRT_INDEX] != h.IS_FREE_CELL).shape[0]
-        info_dict = dict()
+        info_dict.update(dirt_amount=current_dirt_amount)
+        info_dict.update(dirty_tile_count=dirty_tiles)

         try:
             # penalty = current_dirt_amount
@@ -156,14 +156,13 @@ class SimpleFactory(BaseFactory):
                     reward -= 0.01
                     self.print(f'Agent {agent_state.i} just tried to clean up some dirt '
                                f'at {agent_state.pos}, but was unsucsessfull.')
-                    info_dict.update(failed_cleanup_attempt=1)
+                    info_dict.update({f'agent_{agent_state.i}_failed_action': 1})

             elif self._actions.is_moving_action(agent_state.action):
                 if agent_state.action_valid:
                     # info_dict.update(movement=1)
                     reward -= 0.00
                 else:
-                    # info_dict.update(collision=1)
                     # self.print('collision')
                     reward -= 0.01
@@ -172,10 +171,9 @@ class SimpleFactory(BaseFactory):
                 reward -= 0.00

             for entity in list_of_collisions:
+                entity = 'agent' if 'agent' in entity else entity
                 info_dict.update({f'agent_{agent_state.i}_vs_{entity}': 1})

-        info_dict.update(dirt_amount=current_dirt_amount)
-        info_dict.update(dirty_tile_count=dirty_tiles)
         self.print(f"reward is {reward}")
         # Potential based rewards ->
         # track the last reward , minus the current reward = potential
@@ -191,8 +189,8 @@ if __name__ == '__main__':
     move_props = MovementProperties(allow_diagonal_movement=True, allow_square_movement=True)
     dirt_props = DirtProperties()
-    factory = SimpleFactory(movement_properties=move_props, dirt_properties=dirt_props, n_agents=2,
-                            combin_agent_slices_in_obs=True, omit_agent_slice_in_obs=False)
+    factory = SimpleFactory(movement_properties=move_props, dirt_properties=dirt_props, n_agents=10,
+                            combin_agent_slices_in_obs=True, omit_agent_slice_in_obs=False, level_name='rooms')

     # dirt_props = DirtProperties()
     # move_props = MovementProperties(allow_diagonal_movement=False, allow_no_op=False)
diff --git a/environments/helpers.py b/environments/helpers.py
index 64670a3..3f0b2a3 100644
--- a/environments/helpers.py
+++ b/environments/helpers.py
@@ -6,6 +6,7 @@

 # Constants
 WALL = '#'
+DANGER_ZONE = 'x'
 LEVELS_DIR = 'levels'
 LEVEL_IDX = 0
 AGENT_START_IDX = 1
diff --git a/environments/utility_classes.py b/environments/utility_classes.py
index b49609c..bc7be60 100644
--- a/environments/utility_classes.py
+++ b/environments/utility_classes.py
@@ -1,6 +1,8 @@
 from typing import Union, List, NamedTuple
 import numpy as np

+from environments import helpers as h
+

 class MovementProperties(NamedTuple):
     allow_square_movement: bool = True
@@ -123,5 +125,38 @@ class StateSlice(Register):

     def __init__(self, n_agents: int):
         super(StateSlice, self).__init__()
-        offset = 1
+        offset = 1  # AGENT_START_IDX
         self.register_additional_items(['level', *[f'agent#{i}' for i in range(offset, n_agents+offset)]])
+
+
+class Zones(Register):
+
+    @property
+    def danger_zone(self):
+        return self._zone_slices[self.by_name(h.DANGER_ZONE)]
+
+    @property
+    def accounting_zones(self):
+        return [self[idx] for idx, name in self.items() if name != h.DANGER_ZONE]
+
+    def __init__(self, parsed_level):
+        super(Zones, self).__init__()
+        slices = list()
+        self._accounting_zones = list()
+        self._danger_zones = list()
+        for symbol in np.unique(parsed_level):
+            if symbol == h.WALL:
+                continue
+            elif symbol == h.DANGER_ZONE:
+                self + symbol
+                slices.append(h.one_hot_level(parsed_level, symbol))
+                self._danger_zones.append(symbol)
+            else:
+                self + symbol
+                slices.append(h.one_hot_level(parsed_level, symbol))
+                self._accounting_zones.append(symbol)
+
+        self._zone_slices = np.stack(slices)
+
+    def __getitem__(self, item):
+        return self._zone_slices[item]
diff --git a/main.py b/main.py
index 63b7588..937bd51 100644
--- a/main.py
+++ b/main.py
@@ -93,7 +93,7 @@ if __name__ == '__main__':
     # from sb3_contrib import QRDQN

     dirt_props = DirtProperties(clean_amount=3, gain_amount=0.2, max_global_amount=30,
-                                max_local_amount=5, spawn_frequency=3)
+                                max_local_amount=5, spawn_frequency=1, max_spawn_ratio=0.05)
     move_props = MovementProperties(allow_diagonal_movement=True,
                                     allow_square_movement=True,
                                     allow_no_op=False)
@@ -104,31 +104,29 @@ if __name__ == '__main__':

     for modeL_type in [PPO, A2C]:  # , RegDQN, DQN]:
         for seed in range(3):
-            env = SimpleFactory(n_agents=1, dirt_properties=dirt_props, pomdp_radius=3, max_steps=400,
-                                movement_properties=move_props, level_name='rooms',
-                                omit_agent_slice_in_obs=True)
+            with SimpleFactory(n_agents=1, dirt_properties=dirt_props, pomdp_radius=2, max_steps=400,
+                               movement_properties=move_props, level_name='rooms', frames_to_stack=4,
+                               omit_agent_slice_in_obs=False, combin_agent_slices_in_obs=True) as env:

-            # env = FrameStack(env, 4)
+                kwargs = dict(ent_coef=0.01) if isinstance(modeL_type, (PPO, A2C)) else {}
+                model = modeL_type("MlpPolicy", env, verbose=1, seed=seed, device='cpu', **kwargs)

-            kwargs = dict(ent_coef=0.01) if isinstance(modeL_type, (PPO, A2C)) else {}
-            model = modeL_type("MlpPolicy", env, verbose=1, seed=seed, device='cpu', **kwargs)
+                out_path = Path('debug_out') / f'{model.__class__.__name__}_{time_stamp}'

-            out_path = Path('debug_out') / f'{model.__class__.__name__}_{time_stamp}'
+                # identifier = f'{seed}_{model.__class__.__name__}_{time_stamp}'
+                identifier = f'{seed}_{model.__class__.__name__}_{time_stamp}'
+                out_path /= identifier

-            # identifier = f'{seed}_{model.__class__.__name__}_{time_stamp}'
-            identifier = f'{seed}_{model.__class__.__name__}_{time_stamp}'
-            out_path /= identifier
+                callbacks = CallbackList(
+                    [MonitorCallback(filepath=out_path / f'monitor_{identifier}.pick', plotting=False)]
+                )

-            callbacks = CallbackList(
-                [MonitorCallback(filepath=out_path / f'monitor_{identifier}.pick', plotting=False)]
-            )
+                model.learn(total_timesteps=int(1e5), callback=callbacks)

-            model.learn(total_timesteps=int(1e5), callback=callbacks)
-
-            save_path = out_path / f'model_{identifier}.zip'
-            save_path.parent.mkdir(parents=True, exist_ok=True)
-            model.save(save_path)
-            env.save_params(out_path.parent / f'env_{model.__class__.__name__}_{time_stamp}.yaml')
+                save_path = out_path / f'model_{identifier}.zip'
+                save_path.parent.mkdir(parents=True, exist_ok=True)
+                model.save(save_path)
+                env.save_params(out_path.parent / f'env_{model.__class__.__name__}_{time_stamp}.yaml')

     if out_path:
         combine_runs(out_path.parent)
diff --git a/main_test.py b/main_test.py
index 5113b68..0925487 100644
--- a/main_test.py
+++ b/main_test.py
@@ -3,13 +3,14 @@
 import warnings
 from pathlib import Path

 import yaml
+from gym.wrappers import FrameStack
 from natsort import natsorted
 from stable_baselines3.common.callbacks import CallbackList
 from stable_baselines3 import PPO, DQN, A2C

 # our imports
-from environments.factory.simple_factory import SimpleFactory
+from environments.factory.simple_factory import SimpleFactory, DirtProperties
 from environments.logging.monitor import MonitorCallback
 from algorithms.reg_dqn import RegDQN
 from main import compare_runs, combine_runs
@@ -28,7 +29,7 @@ if __name__ == '__main__':
     # rewards += [total reward]
     # boxplot total rewards

-    run_id = '1623078961'
+    run_id = '1623241962'
     model_name = 'PPO'

     # -----------------------
@@ -45,9 +46,13 @@ for seed in range(3):
         with (model_path / f'env_{model_path.name}.yaml').open('r') as f:
             env_kwargs = yaml.load(f, Loader=yaml.FullLoader)
-        env_kwargs.update(n_agents=2)
+        dirt_props = DirtProperties(clean_amount=3, gain_amount=0.2, max_global_amount=30,
+                                    max_local_amount=3, spawn_frequency=1, max_spawn_ratio=0.05)
+        env_kwargs.update(n_agents=1, dirt_properties=dirt_props)
         env = SimpleFactory(**env_kwargs)
+        env = FrameStack(env, 4)
+
         exp_out_path = model_path / 'exp'
         callbacks = CallbackList(
             [MonitorCallback(filepath=exp_out_path / f'future_exp_name', plotting=True)]
         )
@@ -58,13 +63,19 @@
         for epoch in range(100):
             observations = env.reset()
             if render:
-                env.render()
+                if isinstance(env, FrameStack):
+                    env.env.render()
+                else:
+                    env.render()
             done_bool = False
             r = 0
             while not done_bool:
-                actions = [model.predict(obs, deterministic=False)[0] for obs in observations]
+                if env.n_agents > 1:
+                    actions = [model.predict(obs, deterministic=False)[0] for obs in observations]
+                else:
+                    actions = model.predict(observations, deterministic=False)[0]

-                obs, r, done_bool, info_obj = env.step(actions)
+                observations, r, done_bool, info_obj = env.step(actions)
                 if render:
                     env.render()
                 if done_bool:
diff --git a/reload_agent.py b/reload_agent.py
index b5a8607..58f3cce 100644
--- a/reload_agent.py
+++ b/reload_agent.py
@@ -21,14 +21,14 @@ if __name__ == '__main__':

     with (model_path / f'env_{model_name}.yaml').open('r') as f:
         env_kwargs = yaml.load(f, Loader=yaml.FullLoader)
-    env = SimpleFactory(level_name='rooms', **env_kwargs)
+    with SimpleFactory(level_name='rooms', **env_kwargs) as env:

-    # Edit THIS:
-    model_files = list(natsorted((model_path / f'{run_id}_{model_name}').rglob('model_*.zip')))
-    this_model = model_files[0]
+        # Edit THIS:
+        model_files = list(natsorted((model_path / f'{run_id}_{model_name}').rglob('model_*.zip')))
+        this_model = model_files[0]
+
+        model = PPO.load(this_model)
+        evaluation_result = evaluate_policy(model, env, n_eval_episodes=100, deterministic=False, render=True)
+        print(evaluation_result)

-    model = PPO.load(this_model)
-    evaluation_result = evaluate_policy(model, env, n_eval_episodes=100, deterministic=False, render=True)
-    print(evaluation_result)
-    env.close()
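
Note (not part of the patch): a minimal usage sketch of the context-manager entry point that BaseFactory gains above, mirroring the call pattern in main.py. With frames_to_stack=4, __enter__ hands back the environment wrapped in gym.wrappers.FrameStack; with frames_to_stack=0 it hands back the bare environment, and __exit__ closes the factory when the block is left. The keyword arguments shown are taken from this diff; everything else is illustrative.

    from environments.factory.simple_factory import SimpleFactory, DirtProperties
    from environments.utility_classes import MovementProperties

    dirt_props = DirtProperties()  # default dirt settings
    move_props = MovementProperties(allow_diagonal_movement=True, allow_square_movement=True)

    # frames_to_stack=4 -> __enter__ returns FrameStack(self, 4); frames_to_stack=0 would return the env itself.
    with SimpleFactory(n_agents=1, dirt_properties=dirt_props, movement_properties=move_props,
                       level_name='rooms', frames_to_stack=4,
                       combin_agent_slices_in_obs=True) as env:
        observations = env.reset()  # observation now stacks the last 4 frames
        obs, reward, done, info = env.step(env.action_space.sample())
    # leaving the block triggers __exit__, which calls close() on the underlying factory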