diff --git a/.gitignore b/.gitignore index 76048de..4994491 100644 --- a/.gitignore +++ b/.gitignore @@ -700,4 +700,5 @@ $RECYCLE.BIN/ # Windows shortcuts *.lnk -# End of https://www.toptal.com/developers/gitignore/api/linux,unity,macos,python,windows,pycharm,notepadpp,visualstudiocode,latex \ No newline at end of file +# End of https://www.toptal.com/developers/gitignore/api/linux,unity,macos,python,windows,pycharm,notepadpp,visualstudiocode,latex +/studies/e_1/ diff --git a/__init__.py b/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/algorithms/__init__.py b/algorithms/__init__.py index e69de29..0980070 100644 --- a/algorithms/__init__.py +++ b/algorithms/__init__.py @@ -0,0 +1 @@ +import os, sys; sys.path.append(os.path.dirname(os.path.realpath(__file__))) diff --git a/environments/factory/base/base_factory.py b/environments/factory/base/base_factory.py index 97cff7a..f0cdf16 100644 --- a/environments/factory/base/base_factory.py +++ b/environments/factory/base/base_factory.py @@ -13,8 +13,8 @@ from environments.factory.base.shadow_casting import Map from environments.factory.renderer import Renderer, RenderEntity from environments.helpers import Constants as c, Constants from environments import helpers as h -from environments.factory.base.objects import Agent, Tile, Action, Wall -from environments.factory.base.registers import Actions, Entities, Agents, Doors, FloorTiles, WallTiles +from environments.factory.base.objects import Agent, Tile, Action +from environments.factory.base.registers import Actions, Entities, Agents, Doors, FloorTiles, WallTiles, PlaceHolders from environments.utility_classes import MovementProperties import simplejson @@ -58,7 +58,7 @@ class BaseFactory(gym.Env): def __init__(self, level_name='simple', n_agents=1, max_steps=int(5e2), pomdp_r: Union[None, int] = 0, movement_properties: MovementProperties = MovementProperties(), parse_doors=False, combin_agent_obs: bool = False, frames_to_stack=0, record_episodes=False, - omit_agent_in_obs=False, done_at_collision=False, cast_shadows=True, + omit_agent_in_obs=False, done_at_collision=False, cast_shadows=True, additional_agent_placeholder=None, verbose=False, doors_have_area=True, env_seed=time.time_ns(), **kwargs): assert frames_to_stack != 1 and frames_to_stack >= 0, "'frames_to_stack' cannot be negative or 1." if kwargs: @@ -74,6 +74,7 @@ class BaseFactory(gym.Env): self.level_name = level_name self._level_shape = None self.verbose = verbose + self.additional_agent_placeholder = additional_agent_placeholder self._renderer = None # expensive - don't use it when not required ! 
self._entities = Entities() @@ -141,6 +142,14 @@ class BaseFactory(gym.Env): individual_slices=not self.combin_agent_obs) entities.update({c.AGENT: agents}) + if self.additional_agent_placeholder is not None: + + # Empty observation slices, filled with either 0, 1, or N(0, 1) + placeholder = PlaceHolders.from_tiles([self._NO_POS_TILE], self._level_shape, + fill_value=self.additional_agent_placeholder) + + entities.update({c.AGENT_PLACEHOLDER: placeholder}) + # All entities self._entities = Entities() self._entities.register_additional_items(entities) @@ -155,10 +164,12 @@ class BaseFactory(gym.Env): def _init_obs_cube(self): arrays = self._entities.observable_arrays + # FIXME: Move logic to Register if self.omit_agent_in_obs and self.n_agents == 1: del arrays[c.AGENT] - elif self.omit_agent_in_obs: - arrays[c.AGENT] = np.delete(arrays[c.AGENT], 0, axis=0) + # This does not seem to be necessary, because this case is already handled by the Agents register class + # elif self.omit_agent_in_obs: + # arrays[c.AGENT] = np.delete(arrays[c.AGENT], 0, axis=0) obs_cube_z = sum([a.shape[0] if not self[key].is_per_agent else 1 for key, a in arrays.items()]) self._obs_cube = np.zeros((obs_cube_z, *self._level_shape), dtype=np.float32) @@ -273,6 +284,7 @@ class BaseFactory(gym.Env): agent_pos_is_omitted = False agent_omit_idx = None if self.omit_agent_in_obs and self.n_agents == 1: + # There is only a single agent and we want to omit the agent obs, so just remove the array. del state_array_dict[c.AGENT] elif self.omit_agent_in_obs and self.combin_agent_obs and self.n_agents > 1: state_array_dict[c.AGENT][0, agent.x, agent.y] -= agent.encoding @@ -295,6 +307,9 @@ class BaseFactory(gym.Env): for array_idx in range(array.shape[0]): self._obs_cube[running_idx: running_idx+z] = array[[x for x in range(array.shape[0]) if x != agent_omit_idx]] + elif key == c.AGENT and self.omit_agent_in_obs and self.combin_agent_obs: + z = 1 + self._obs_cube[running_idx: running_idx + z] = array else: z = array.shape[0] self._obs_cube[running_idx: running_idx+z] = array @@ -499,12 +514,8 @@ class BaseFactory(gym.Env): def _summarize_state(self): summary = {f'{REC_TAC}step': self._steps} - if self._steps == 0: - summary.update({f'{REC_TAC}{self[c.WALLS].name}': {self[c.WALLS].summarize_states()}, - 'FactoryName': self.__class__.__name__}) for entity_group in self._entities: - if not isinstance(entity_group, WallTiles): - summary.update({f'{REC_TAC}{entity_group.name}': entity_group.summarize_states()}) + summary.update({f'{REC_TAC}{entity_group.name}': entity_group.summarize_states(n_steps=self._steps)}) return summary def print(self, string): diff --git a/environments/factory/base/objects.py b/environments/factory/base/objects.py index 6863d27..e470799 100644 --- a/environments/factory/base/objects.py +++ b/environments/factory/base/objects.py @@ -93,11 +93,11 @@ class Entity(Object): return self._tile def __init__(self, tile, **kwargs): - super(Entity, self).__init__(**kwargs) + super().__init__(**kwargs) self._tile = tile tile.enter(self) - def summarize_state(self) -> dict: + def summarize_state(self, **_) -> dict: return dict(name=str(self.name), x=int(self.x), y=int(self.y), tile=str(self.tile.name), can_collide=bool(self.can_collide)) @@ -125,7 +125,7 @@ class MoveableEntity(Entity): return last_x-curr_x, last_y-curr_y def __init__(self, *args, **kwargs): - super(MoveableEntity, self).__init__(*args, **kwargs) + super().__init__(*args, **kwargs) self._last_tile = None def move(self, next_tile): @@ -143,11 +143,34 @@ class 
MoveableEntity(Entity): class Action(Object): def __init__(self, *args, **kwargs): - super(Action, self).__init__(*args, **kwargs) + super().__init__(*args, **kwargs) class PlaceHolder(MoveableEntity): - pass + + def __init__(self, *args, fill_value=0, **kwargs): + super().__init__(*args, **kwargs) + self._fill_value = fill_value + + @property + def last_tile(self): + return self.tile + + @property + def direction_of_view(self): + return self.pos + + @property + def can_collide(self): + return False + + @property + def encoding(self): + return c.NO_POS.value[0] + + @property + def name(self): + return "PlaceHolder" class Tile(Object): @@ -203,8 +226,8 @@ class Tile(Object): def __repr__(self): return f'{self.name}(@{self.pos})' - def summarize_state(self): - return dict(name=self.name, x=self.x, y=self.y) + def summarize_state(self, **_): + return dict(name=self.name, x=int(self.x), y=int(self.y)) class Wall(Tile): @@ -254,8 +277,8 @@ class Door(Entity): if not closed_on_init: self._open() - def summarize_state(self): - state_dict = super().summarize_state() + def summarize_state(self, **kwargs): + state_dict = super().summarize_state(**kwargs) state_dict.update(state=str(self.str_state), time_to_close=int(self.time_to_close)) return state_dict @@ -315,7 +338,7 @@ class Agent(MoveableEntity): self.temp_action = None self.temp_light_map = None - def summarize_state(self): - state_dict = super().summarize_state() + def summarize_state(self, **kwargs): + state_dict = super().summarize_state(**kwargs) state_dict.update(valid=bool(self.temp_valid), action=str(self.temp_action)) return state_dict diff --git a/environments/factory/base/registers.py b/environments/factory/base/registers.py index 12cbf6a..ef1bcd2 100644 --- a/environments/factory/base/registers.py +++ b/environments/factory/base/registers.py @@ -81,8 +81,8 @@ class ObjectRegister(Register): if self.individual_slices: self._array = np.concatenate((self._array, np.zeros((1, *self._array.shape[1:])))) - def summarize_states(self): - return [val.summarize_state() for val in self.values()] + def summarize_states(self, n_steps=None): + return [val.summarize_state(n_steps=n_steps) for val in self.values()] class EntityObjectRegister(ObjectRegister, ABC): @@ -156,23 +156,25 @@ class MovingEntityObjectRegister(EntityObjectRegister, ABC): del self[name] -class PlaceHolderRegister(MovingEntityObjectRegister): +class PlaceHolders(MovingEntityObjectRegister): _accepted_objects = PlaceHolder + def __init__(self, *args, fill_value: Union[str, int] = 0, **kwargs): + super().__init__(*args, **kwargs) + self.fill_value = fill_value + # noinspection DuplicatedCode def as_array(self): - self._array[:] = c.FREE_CELL.value - # noinspection PyTupleAssignmentBalance - for z, x, y, v in zip(range(len(self)), *zip(*[x.pos for x in self]), [x.encoding for x in self]): - if self.individual_slices: - self._array[z, x, y] += v - else: - self._array[0, x, y] += v + if isinstance(self.fill_value, int): + self._array[:] = self.fill_value + elif self.fill_value == "normal": + self._array = np.random.normal(size=self._array.shape) + if self.individual_slices: return self._array else: - return self._array.sum(axis=0, keepdims=True) + return self._array[None, 0] class Entities(Register): @@ -243,6 +245,12 @@ class WallTiles(EntityObjectRegister): def from_tiles(cls, tiles, *args, **kwargs): raise RuntimeError() + def summarize_states(self, n_steps=None): + if n_steps == h.STEPS_START: + return super(WallTiles, self).summarize_states(n_steps=n_steps) + else: + return 
{} + class FloorTiles(WallTiles): @@ -272,6 +280,10 @@ class FloorTiles(WallTiles): def from_tiles(cls, tiles, *args, **kwargs): raise RuntimeError() + def summarize_states(self, n_steps=None): + # Do not summarize + return {} + class Agents(MovingEntityObjectRegister): diff --git a/environments/factory/env_item_default_param.json b/environments/factory/env_item_default_param.json new file mode 100644 index 0000000..c42544f --- /dev/null +++ b/environments/factory/env_item_default_param.json @@ -0,0 +1,29 @@ +{ + "item_properties": { + "n_items": 5, + "spawn_frequency": 10, + "n_drop_off_locations": 5, + "max_dropoff_storage_size": 0, + "max_agent_inventory_capacity": 5, + "agent_can_interact": true + }, + "env_seed": 2, + "movement_properties": { + "allow_square_movement": true, + "allow_diagonal_movement": true, + "allow_no_op": false + }, + "level_name": "rooms", + "verbose": false, + "n_agents": 1, + "max_steps": 400, + "pomdp_r": 2, + "combin_agent_obs": true, + "omit_agent_in_obs": true, + "cast_shadows": true, + "frames_to_stack": 3, + "done_at_collision": false, + "record_episodes": false, + "parse_doors": false, + "doors_have_area": false +} \ No newline at end of file diff --git a/environments/factory/factory_dirt.py b/environments/factory/factory_dirt.py index acce0e3..fdecb01 100644 --- a/environments/factory/factory_dirt.py +++ b/environments/factory/factory_dirt.py @@ -51,8 +51,8 @@ class Dirt(Entity): def set_new_amount(self, amount): self._amount = amount - def summarize_state(self): - state_dict = super().summarize_state() + def summarize_state(self, **kwargs): + state_dict = super().summarize_state(**kwargs) state_dict.update(amount=float(self.amount)) return state_dict diff --git a/environments/factory/factory_dirt_item.py b/environments/factory/factory_dirt_item.py index 7ef10d0..077f07c 100644 --- a/environments/factory/factory_dirt_item.py +++ b/environments/factory/factory_dirt_item.py @@ -1,7 +1,9 @@ import random +from pathlib import Path from environments.factory.factory_dirt import DirtFactory, DirtProperties from environments.factory.factory_item import ItemFactory, ItemProperties +from environments.logging.recorder import RecorderCallback from environments.utility_classes import MovementProperties @@ -12,40 +14,44 @@ class DirtItemFactory(ItemFactory, DirtFactory): if __name__ == '__main__': + with RecorderCallback(filepath=Path('debug_out') / f'recorder_xxxx.json', occupation_map=False, + trajectory_map=False) as recorder: - dirt_props = DirtProperties(clean_amount=2, gain_amount=0.1, max_global_amount=20, - max_local_amount=1, spawn_frequency=3, max_spawn_ratio=0.05, - dirt_smear_amount=0.0, agent_can_interact=True) - item_props = ItemProperties(n_items=5, agent_can_interact=True) - move_props = MovementProperties(allow_diagonal_movement=True, - allow_square_movement=True, - allow_no_op=False) + dirt_props = DirtProperties(clean_amount=2, gain_amount=0.1, max_global_amount=20, + max_local_amount=1, spawn_frequency=3, max_spawn_ratio=0.05, + dirt_smear_amount=0.0, agent_can_interact=True) + item_props = ItemProperties(n_items=5, agent_can_interact=True) + move_props = MovementProperties(allow_diagonal_movement=True, + allow_square_movement=True, + allow_no_op=False) - render = True + render = False - factory = DirtItemFactory(n_agents=1, done_at_collision=False, frames_to_stack=0, - level_name='rooms', max_steps=400, combin_agent_obs=True, - omit_agent_in_obs=True, parse_doors=True, pomdp_r=3, - record_episodes=True, verbose=True, cast_shadows=True, - 
movement_properties=move_props, dirt_properties=dirt_props - ) + factory = DirtItemFactory(n_agents=1, done_at_collision=False, frames_to_stack=0, + level_name='rooms', max_steps=200, combin_agent_obs=True, + omit_agent_in_obs=True, parse_doors=True, pomdp_r=3, + record_episodes=True, verbose=False, cast_shadows=True, + movement_properties=move_props, dirt_properties=dirt_props + ) - # noinspection DuplicatedCode - n_actions = factory.action_space.n - 1 - _ = factory.observation_space + # noinspection DuplicatedCode + n_actions = factory.action_space.n - 1 + _ = factory.observation_space - for epoch in range(4): - random_actions = [[random.randint(0, n_actions) for _ - in range(factory.n_agents)] for _ - in range(factory.max_steps + 1)] - env_state = factory.reset() - r = 0 - for agent_i_action in random_actions: - env_state, step_r, done_bool, info_obj = factory.step(agent_i_action) - r += step_r - if render: - factory.render() - if done_bool: - break - print(f'Factory run {epoch} done, reward is:\n {r}') - pass + for epoch in range(4): + random_actions = [[random.randint(0, n_actions) for _ + in range(factory.n_agents)] for _ + in range(factory.max_steps + 1)] + env_state = factory.reset() + r = 0 + for agent_i_action in random_actions: + env_state, step_r, done_bool, info_obj = factory.step(agent_i_action) + recorder.read_info(0, info_obj) + r += step_r + if render: + factory.render() + if done_bool: + recorder.read_done(0, done_bool) + break + print(f'Factory run {epoch} done, reward is:\n {r}') + pass diff --git a/environments/factory/factory_item.py b/environments/factory/factory_item.py index f93c2c7..f280c40 100644 --- a/environments/factory/factory_item.py +++ b/environments/factory/factory_item.py @@ -109,8 +109,10 @@ class Inventory(UserList): def belongs_to_entity(self, entity): return self.agent == entity - def summarize_state(self): - return {val.name: val.summarize_state() for val in self} + def summarize_state(self, **kwargs): + attr_dict = {key: str(val) for key, val in self.__dict__.items() if not key.startswith('_') and key != 'data'} + attr_dict.update({val.name: val.summarize_state(**kwargs) for val in self}) + return attr_dict class Inventories(ObjectRegister): @@ -176,6 +178,10 @@ class DropOffLocation(Entity): def is_full(self): return False if not self.storage.maxlen else self.storage.maxlen == len(self.storage) + def summarize_state(self, n_steps=None) -> dict: + if n_steps == h.STEPS_START: + return super().summarize_state(n_steps=n_steps) + class DropOffLocations(EntityObjectRegister): diff --git a/environments/helpers.py b/environments/helpers.py index 4622a41..64a0c87 100644 --- a/environments/helpers.py +++ b/environments/helpers.py @@ -7,7 +7,10 @@ from pathlib import Path from stable_baselines3 import PPO, DQN, A2C +MODEL_MAP = dict(PPO=PPO, DQN=DQN, A2C=A2C) + LEVELS_DIR = 'levels' +STEPS_START = 1 TO_BE_AVERAGED = ['dirt_amount', 'dirty_tiles'] IGNORED_DF_COLUMNS = ['Episode', 'Run', 'train_step', 'step', 'index', 'dirt_amount', @@ -16,34 +19,35 @@ IGNORED_DF_COLUMNS = ['Episode', 'Run', 'train_step', 'step', 'index', 'dirt_amo # Constants class Constants(Enum): - WALL = '#' - WALLS = 'Walls' - FLOOR = 'Floor' - DOOR = 'D' - DANGER_ZONE = 'x' - LEVEL = 'Level' - AGENT = 'Agent' - FREE_CELL = 0 - OCCUPIED_CELL = 1 - SHADOWED_CELL = -1 - NO_POS = (-9999, -9999) + WALL = '#' + WALLS = 'Walls' + FLOOR = 'Floor' + DOOR = 'D' + DANGER_ZONE = 'x' + LEVEL = 'Level' + AGENT = 'Agent' + AGENT_PLACEHOLDER = 'AGENT_PLACEHOLDER' + FREE_CELL = 0 + OCCUPIED_CELL = 1 + 
SHADOWED_CELL = -1 + NO_POS = (-9999, -9999) - DOORS = 'Doors' - CLOSED_DOOR = 'closed' - OPEN_DOOR = 'open' + DOORS = 'Doors' + CLOSED_DOOR = 'closed' + OPEN_DOOR = 'open' - ACTION = 'action' - COLLISIONS = 'collision' - VALID = 'valid' - NOT_VALID = 'not_valid' + ACTION = 'action' + COLLISIONS = 'collision' + VALID = 'valid' + NOT_VALID = 'not_valid' # Dirt Env - DIRT = 'Dirt' + DIRT = 'Dirt' # Item Env - ITEM = 'Item' - INVENTORY = 'Inventory' - DROP_OFF = 'Drop_Off' + ITEM = 'Item' + INVENTORY = 'Inventory' + DROP_OFF = 'Drop_Off' def __bool__(self): if 'not_' in self.value: @@ -144,8 +148,6 @@ def asset_str(agent): return c.AGENT.value, 'idle' -model_map = dict(PPO=PPO, DQN=DQN, A2C=A2C) - if __name__ == '__main__': parsed_level = parse_level(Path(__file__).parent / 'factory' / 'levels' / 'simple.txt') y = one_hot_level(parsed_level) diff --git a/environments/logging/monitor.py b/environments/logging/monitor.py index b4b85fe..4b25d9c 100644 --- a/environments/logging/monitor.py +++ b/environments/logging/monitor.py @@ -6,7 +6,7 @@ from typing import List, Dict from stable_baselines3.common.callbacks import BaseCallback from environments.helpers import IGNORED_DF_COLUMNS -from environments.logging.plotting import prepare_plot + import pandas as pd @@ -14,85 +14,76 @@ class MonitorCallback(BaseCallback): ext = 'png' - def __init__(self, filepath=Path('debug_out/monitor.pick'), plotting=True): + def __init__(self, filepath=Path('debug_out/monitor.pick')): super(MonitorCallback, self).__init__() self.filepath = Path(filepath) self._monitor_df = pd.DataFrame() self._monitor_dicts = defaultdict(dict) - self.plotting = plotting self.started = False self.closed = False def __enter__(self): - self._on_training_start() + self.start() + return self def __exit__(self, exc_type, exc_val, exc_tb): - self._on_training_end() + self.stop() def _on_training_start(self) -> None: if self.started: pass else: - self.filepath.parent.mkdir(exist_ok=True, parents=True) - self.started = True + self.start() pass def _on_training_end(self) -> None: if self.closed: pass else: - # self.out_file.unlink(missing_ok=True) - with self.filepath.open('wb') as f: - pickle.dump(self._monitor_df.reset_index(), f, protocol=pickle.HIGHEST_PROTOCOL) - if self.plotting: - print('Monitor files were dumped to disk, now plotting....') - - # %% Load MonitorList from Disk - with self.filepath.open('rb') as f: - monitor_list = pickle.load(f) - df = None - for m_idx, monitor in enumerate(monitor_list): - monitor['episode'] = m_idx - if df is None: - df = pd.DataFrame(columns=monitor.columns) - for _, row in monitor.iterrows(): - df.loc[df.shape[0]] = row - if df is None: # The env exited premature, we catch it. 
- self.closed = True - return - for column in list(df.columns): - if column != 'episode': - df[f'{column}_roll'] = df[column].rolling(window=50).mean() - # result.tail() - prepare_plot(filepath=self.filepath, results_df=df.filter(regex=(".+_roll"))) - print('Plotting done.') - self.closed = True + self.stop() def _on_step(self, alt_infos: List[Dict] = None, alt_dones: List[bool] = None) -> bool: - infos = alt_infos or self.locals.get('infos', []) - if alt_dones is not None: - dones = alt_dones - elif self.locals.get('dones', None) is not None: - dones =self.locals.get('dones', None) - elif self.locals.get('done', None) is not None: - dones = self.locals.get('done', [None]) - else: - dones = [] + if self.started: + for env_idx, info in enumerate(self.locals.get('infos', [])): + self.read_info(env_idx, info) - for env_idx, (info, done) in enumerate(zip(infos, dones)): - self._monitor_dicts[env_idx][len(self._monitor_dicts[env_idx])] = {key: val for key, val in info.items() - if key not in ['terminal_observation', 'episode'] - and not key.startswith('rec_')} - if done: - env_monitor_df = pd.DataFrame.from_dict(self._monitor_dicts[env_idx], orient='index') - self._monitor_dicts[env_idx] = dict() - columns = [col for col in env_monitor_df.columns if col not in IGNORED_DF_COLUMNS] - env_monitor_df = env_monitor_df.aggregate( - {col: 'mean' if col.endswith('ount') else 'sum' for col in columns} - ) - env_monitor_df['episode'] = len(self._monitor_df) - self._monitor_df = self._monitor_df.append([env_monitor_df]) - else: - pass + for env_idx, done in list( + enumerate(self.locals.get('dones', []))) + list(enumerate(self.locals.get('done', []))): + self.read_done(env_idx, done) + else: + pass return True + def read_info(self, env_idx, info: dict): + self._monitor_dicts[env_idx][len(self._monitor_dicts[env_idx])] = { + key: val for key, val in info.items() if + key not in ['terminal_observation', 'episode'] and not key.startswith('rec_')} + return + + def read_done(self, env_idx, done): + if done: + env_monitor_df = pd.DataFrame.from_dict(self._monitor_dicts[env_idx], orient='index') + self._monitor_dicts[env_idx] = dict() + columns = [col for col in env_monitor_df.columns if col not in IGNORED_DF_COLUMNS] + env_monitor_df = env_monitor_df.aggregate( + {col: 'mean' if col.endswith('ount') else 'sum' for col in columns} + ) + env_monitor_df['episode'] = len(self._monitor_df) + self._monitor_df = self._monitor_df.append([env_monitor_df]) + else: + pass + return + + def stop(self): + # self.out_file.unlink(missing_ok=True) + with self.filepath.open('wb') as f: + pickle.dump(self._monitor_df.reset_index(), f, protocol=pickle.HIGHEST_PROTOCOL) + self.closed = True + + def start(self): + if self.started: + pass + else: + self.filepath.parent.mkdir(exist_ok=True, parents=True) + self.started = True + pass diff --git a/environments/logging/recorder.py b/environments/logging/recorder.py index beb31fc..bca4a8a 100644 --- a/environments/logging/recorder.py +++ b/environments/logging/recorder.py @@ -3,11 +3,10 @@ from collections import defaultdict from pathlib import Path from typing import Union -import pandas as pd +import simplejson from stable_baselines3.common.callbacks import BaseCallback from environments.factory.base.base_factory import REC_TAC -from environments.helpers import IGNORED_DF_COLUMNS # noinspection PyAttributeOutsideInit @@ -18,8 +17,8 @@ class RecorderCallback(BaseCallback): self.trajectory_map = trajectory_map self.occupation_map = occupation_map self.filepath = Path(filepath) - 
self._recorder_dict = defaultdict(dict) - self._recorder_json_list = list() + self._recorder_dict = defaultdict(list) + self._recorder_out_list = list() self.do_record: bool self.started = False self.closed = False @@ -27,15 +26,15 @@ class RecorderCallback(BaseCallback): def read_info(self, env_idx, info: dict): if info_dict := {key.replace(REC_TAC, ''): val for key, val in info.items() if key.startswith(f'{REC_TAC}')}: info_dict.update(episode=(self.num_timesteps + env_idx)) - self._recorder_dict[env_idx][len(self._recorder_dict[env_idx])] = info_dict + self._recorder_dict[env_idx].append(info_dict) else: pass return def read_done(self, env_idx, done): if done: - self._recorder_json_list.append(json.dumps(self._recorder_dict[env_idx])) - self._recorder_dict[env_idx] = dict() + self._recorder_out_list.append({'steps': self._recorder_dict[env_idx]}) + self._recorder_dict[env_idx] = list() else: pass @@ -51,8 +50,11 @@ class RecorderCallback(BaseCallback): if self.do_record and self.started: # self.out_file.unlink(missing_ok=True) with self.filepath.open('w') as f: - json_list = self._recorder_json_list - json.dump(json_list, f, indent=4) + out_dict = {'episodes': self._recorder_out_list} + try: + simplejson.dump(out_dict, f, indent=4) + except TypeError: + print('Recorder output could not be serialized to JSON (TypeError).') if self.occupation_map: print('Recorder files were dumped to disk, now plotting the occupation map...') diff --git a/main.py b/main.py index 5c6867a..4f54ab6 100644 --- a/main.py +++ b/main.py @@ -1,101 +1,27 @@ -import pickle import warnings -from typing import Union, List -from os import PathLike + from pathlib import Path import time -import pandas as pd - from stable_baselines3.common.callbacks import CallbackList from stable_baselines3.common.vec_env import SubprocVecEnv from environments.factory.factory_dirt_item import DirtItemFactory from environments.factory.factory_item import ItemFactory, ItemProperties from environments.factory.factory_dirt import DirtProperties, DirtFactory -from environments.helpers import IGNORED_DF_COLUMNS from environments.logging.monitor import MonitorCallback -from environments.logging.plotting import prepare_plot from environments.logging.recorder import RecorderCallback from environments.utility_classes import MovementProperties +from plotting.compare_runs import compare_seed_runs, compare_model_runs warnings.filterwarnings('ignore', category=FutureWarning) warnings.filterwarnings('ignore', category=UserWarning) -def combine_runs(run_path: Union[str, PathLike]): - run_path = Path(run_path) - df_list = list() - for run, monitor_file in enumerate(run_path.rglob('monitor_*.pick')): - with monitor_file.open('rb') as f: - monitor_df = pickle.load(f) - - monitor_df['run'] = run - monitor_df = monitor_df.fillna(0) - df_list.append(monitor_df) - - df = pd.concat(df_list, ignore_index=True) - df = df.fillna(0).rename(columns={'episode': 'Episode', 'run': 'Run'}).sort_values(['Run', 'Episode']) - columns = [col for col in df.columns if col not in IGNORED_DF_COLUMNS] - - roll_n = 50 - - non_overlapp_window = df.groupby(['Run', 'Episode']).rolling(roll_n, min_periods=1).mean() - - df_melted = non_overlapp_window[columns].reset_index().melt(id_vars=['Episode', 'Run'], - value_vars=columns, var_name="Measurement", - value_name="Score") - - if df_melted['Episode'].max() > 800: - skip_n = round(df_melted['Episode'].max() * 0.02) - df_melted = df_melted[df_melted['Episode'] % skip_n == 0] - - prepare_plot(run_path / f'{run_path.name}_monitor_lineplot.png', df_melted) - print('Plotting done.') - - -def 
compare_runs(run_path: Path, run_identifier: int, parameter: Union[str, List[str]]): - run_path = Path(run_path) - df_list = list() - parameter = [parameter] if isinstance(parameter, str) else parameter - for path in run_path.iterdir(): - if path.is_dir() and str(run_identifier) in path.name: - for run, monitor_file in enumerate(path.rglob('monitor_*.pick')): - with monitor_file.open('rb') as f: - monitor_df = pickle.load(f) - - monitor_df['run'] = run - monitor_df['model'] = path.name.split('_')[0] - monitor_df = monitor_df.fillna(0) - df_list.append(monitor_df) - - df = pd.concat(df_list, ignore_index=True) - df = df.fillna(0).rename(columns={'episode': 'Episode', 'run': 'Run', 'model': 'Model'}) - columns = [col for col in df.columns if col in parameter] - - last_episode_to_report = min(df.groupby(['Model'])['Episode'].max()) - df = df[df['Episode'] < last_episode_to_report] - - roll_n = 40 - non_overlapp_window = df.groupby(['Model', 'Run', 'Episode']).rolling(roll_n, min_periods=1).mean() - - df_melted = non_overlapp_window[columns].reset_index().melt(id_vars=['Episode', 'Run', 'Model'], - value_vars=columns, var_name="Measurement", - value_name="Score") - - if df_melted['Episode'].max() > 100: - skip_n = round(df_melted['Episode'].max() * 0.02) - df_melted = df_melted[df_melted['Episode'] % skip_n == 0] - - style = 'Measurement' if len(columns) > 1 else None - prepare_plot(run_path / f'{run_identifier}_compare_{parameter}.png', df_melted, hue='Model', style=style) - print('Plotting done.') - - def make_env(env_kwargs_dict): def _init(): - with ItemFactory(**env_kwargs_dict) as init_env: + with DirtFactory(**env_kwargs_dict) as init_env: return init_env return _init @@ -110,17 +36,19 @@ if __name__ == '__main__': # exit() from stable_baselines3 import PPO, DQN, A2C - from algorithms.reg_dqn import RegDQN + # from algorithms.reg_dqn import RegDQN # from sb3_contrib import QRDQN dirt_props = DirtProperties(clean_amount=2, gain_amount=0.1, max_global_amount=20, - max_local_amount=1, spawn_frequency=3, max_spawn_ratio=0.05, + max_local_amount=1, spawn_frequency=16, max_spawn_ratio=0.05, dirt_smear_amount=0.0, agent_can_interact=True) - item_props = ItemProperties(n_items=5, agent_can_interact=True) + item_props = ItemProperties(n_items=10, agent_can_interact=True, + spawn_frequency=30, n_drop_off_locations=2, + max_agent_inventory_capacity=15) move_props = MovementProperties(allow_diagonal_movement=True, allow_square_movement=True, allow_no_op=False) - train_steps = 8e5 + train_steps = 5e6 time_stamp = int(time.time()) out_path = None @@ -128,18 +56,18 @@ if __name__ == '__main__': for modeL_type in [A2C, PPO, DQN]: # ,RegDQN, QRDQN]: for seed in range(3): env_kwargs = dict(n_agents=1, - item_properties=item_props, - # dirt_properties=dirt_props, + # item_properties=item_props, + dirt_properties=dirt_props, movement_properties=move_props, - pomdp_r=2, max_steps=400, parse_doors=False, - level_name='rooms', frames_to_stack=3, + pomdp_r=2, max_steps=1000, parse_doors=False, + level_name='rooms', frames_to_stack=4, omit_agent_in_obs=True, combin_agent_obs=True, record_episodes=False, cast_shadows=True, doors_have_area=False, env_seed=seed, verbose=False, ) if modeL_type.__name__ in ["PPO", "A2C"]: kwargs = dict(ent_coef=0.01) - env = SubprocVecEnv([make_env(env_kwargs) for _ in range(1)], start_method="spawn") + env = SubprocVecEnv([make_env(env_kwargs) for _ in range(10)], start_method="spawn") elif modeL_type.__name__ in ["RegDQN", "DQN", "QRDQN"]: env = make_env(env_kwargs)() kwargs = 
dict(buffer_size=50000, @@ -161,7 +89,7 @@ if __name__ == '__main__': out_path /= identifier callbacks = CallbackList( - [MonitorCallback(filepath=out_path / f'monitor_{identifier}.pick', plotting=False), + [MonitorCallback(filepath=out_path / f'monitor_{identifier}.pick'), RecorderCallback(filepath=out_path / f'recorder_{identifier}.json', occupation_map=False, trajectory_map=False )] @@ -172,7 +100,7 @@ if __name__ == '__main__': save_path = out_path / f'model_{identifier}.zip' save_path.parent.mkdir(parents=True, exist_ok=True) model.save(save_path) - param_path = out_path.parent / f'env_{model.__class__.__name__}_{time_stamp}.yaml' + param_path = out_path.parent / f'env_{model.__class__.__name__}_{time_stamp}.json' try: env.env_method('save_params', param_path) except AttributeError: @@ -181,7 +109,7 @@ if __name__ == '__main__': print("Model Group Done.. Plotting...") if out_path: - combine_runs(out_path.parent) + compare_seed_runs(out_path.parent) print("All Models Done... Evaluating") if out_path: - compare_runs(Path('debug_out'), time_stamp, 'step_reward') + compare_model_runs(Path('debug_out'), time_stamp, 'step_reward') diff --git a/main_test.py b/main_test.py index 0940285..ed502ef 100644 --- a/main_test.py +++ b/main_test.py @@ -13,7 +13,7 @@ from stable_baselines3 import PPO, DQN, A2C from environments.factory.factory_dirt import DirtFactory, DirtProperties from environments.logging.monitor import MonitorCallback from algorithms.reg_dqn import RegDQN -from main import compare_runs, combine_runs +from main import compare_model_runs, compare_seed_runs warnings.filterwarnings('ignore', category=FutureWarning) warnings.filterwarnings('ignore', category=UserWarning) @@ -55,7 +55,7 @@ if __name__ == '__main__': exp_out_path = model_path / 'exp' callbacks = CallbackList( - [MonitorCallback(filepath=exp_out_path / f'future_exp_name', plotting=True)] + [MonitorCallback(filepath=exp_out_path / f'future_exp_name')] ) n_actions = env.action_space.n @@ -83,4 +83,4 @@ if __name__ == '__main__': print(f'Factory run {epoch} done, reward is:\n {r}') if out_path: - combine_runs(out_path.parent) + compare_seed_runs(out_path.parent) diff --git a/plotting/__init__.py b/plotting/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/plotting/compare_runs.py b/plotting/compare_runs.py new file mode 100644 index 0000000..3af498b --- /dev/null +++ b/plotting/compare_runs.py @@ -0,0 +1,155 @@ +import pickle +import re +from os import PathLike +from pathlib import Path +from typing import Union, List + +import pandas as pd + +from environments.helpers import IGNORED_DF_COLUMNS, MODEL_MAP +from plotting.plotting import prepare_plot + + +def compare_seed_runs(run_path: Union[str, PathLike]): + run_path = Path(run_path) + df_list = list() + for run, monitor_file in enumerate(run_path.rglob('monitor*.pick')): + with monitor_file.open('rb') as f: + monitor_df = pickle.load(f) + + monitor_df['run'] = run + monitor_df = monitor_df.fillna(0) + df_list.append(monitor_df) + + df = pd.concat(df_list, ignore_index=True) + df = df.fillna(0).rename(columns={'episode': 'Episode', 'run': 'Run'}).sort_values(['Run', 'Episode']) + columns = [col for col in df.columns if col not in IGNORED_DF_COLUMNS] + + roll_n = 50 + + non_overlapp_window = df.groupby(['Run', 'Episode']).rolling(roll_n, min_periods=1).mean() + + df_melted = non_overlapp_window[columns].reset_index().melt(id_vars=['Episode', 'Run'], + value_vars=columns, var_name="Measurement", + value_name="Score") + + if df_melted['Episode'].max() > 800: 
+ skip_n = round(df_melted['Episode'].max() * 0.02) + df_melted = df_melted[df_melted['Episode'] % skip_n == 0] + + prepare_plot(run_path / f'{run_path.name}_monitor_lineplot.png', df_melted) + print('Plotting done.') + + +def compare_model_runs(run_path: Path, run_identifier: Union[str, int], parameter: Union[str, List[str]]): + run_path = Path(run_path) + df_list = list() + parameter = [parameter] if isinstance(parameter, str) else parameter + for path in run_path.iterdir(): + if path.is_dir() and str(run_identifier) in path.name: + for run, monitor_file in enumerate(path.rglob('monitor*.pick')): + with monitor_file.open('rb') as f: + monitor_df = pickle.load(f) + + monitor_df['run'] = run + monitor_df['model'] = next((x for x in path.name.split('_') if x in MODEL_MAP.keys())) + monitor_df = monitor_df.fillna(0) + df_list.append(monitor_df) + + df = pd.concat(df_list, ignore_index=True) + df = df.fillna(0).rename(columns={'episode': 'Episode', 'run': 'Run', 'model': 'Model'}) + columns = [col for col in df.columns if col in parameter] + + last_episode_to_report = min(df.groupby(['Model'])['Episode'].max()) + df = df[df['Episode'] < last_episode_to_report] + + roll_n = 40 + non_overlapp_window = df.groupby(['Model', 'Run', 'Episode']).rolling(roll_n, min_periods=1).mean() + + df_melted = non_overlapp_window[columns].reset_index().melt(id_vars=['Episode', 'Run', 'Model'], + value_vars=columns, var_name="Measurement", + value_name="Score") + + if df_melted['Episode'].max() > 80: + skip_n = round(df_melted['Episode'].max() * 0.02) + df_melted = df_melted[df_melted['Episode'] % skip_n == 0] + + style = 'Measurement' if len(columns) > 1 else None + prepare_plot(run_path / f'{run_identifier}_compare_{parameter}.png', df_melted, hue='Model', style=style) + print('Plotting done.') + + +def compare_all_parameter_runs(run_root_path: Path, parameter: Union[str, List[str]], + param_names: Union[List[str], None] = None, str_to_ignore=''): + run_root_path = Path(run_root_path) + df_list = list() + parameter = [parameter] if isinstance(parameter, str) else parameter + for monitor_idx, monitor_file in enumerate(run_root_path.rglob('monitor*.pick')): + with monitor_file.open('rb') as f: + monitor_df = pickle.load(f) + + parameters = [x.name for x in monitor_file.parents if x.parent not in run_root_path.parents] + if str_to_ignore: + parameters = [re.sub(f'_*({str_to_ignore})', '', param) for param in parameters] + + if monitor_idx == 0: + if param_names is not None: + if len(param_names) < len(parameters): + # FIXME: Missing Seed Detection, see below @111 + param_names = [next(param_names) if param not in MODEL_MAP.keys() else 'Model' for param in parameters] + elif len(param_names) == len(parameters): + pass + else: + raise ValueError + else: + param_names = [] + for param_idx, param in enumerate(parameters): + dtype = None + if param in MODEL_MAP.keys(): + param_name = 'Model' + elif '_' in param: + param_split = param.split('_') + if len(param_split) == 2 and any(split in MODEL_MAP.keys() for split in param_split): + # Extract the seed + param = int(next(x for x in param_split if x not in MODEL_MAP)) + param_name = 'Seed' + dtype = int + else: + param_name = f'param_{param_idx}' + else: + param_name = f'param_{param_idx}' + dtype = dtype if dtype is not None else str + monitor_df[param_name] = str(param) + monitor_df[param_name] = monitor_df[param_name].astype(dtype) + if monitor_idx == 0: + param_names.append(param_name) + + monitor_df = monitor_df.fillna(0) + df_list.append(monitor_df) + + df = 
pd.concat(df_list, ignore_index=True) + df = df.fillna(0).rename(columns={'episode': 'Episode'}).sort_values(['Episode']) + + for param_name in param_names: + df[param_name] = df[param_name].astype(str) + columns = [col for col in df.columns if col in parameter] + + last_episode_to_report = min(df.groupby(['Model'])['Episode'].max()) + df = df[df['Episode'] < last_episode_to_report] + + if df['Episode'].max() > 80: + skip_n = round(df['Episode'].max() * 0.02) + df = df[df['Episode'] % skip_n == 0] + combinations = [x for x in param_names if x not in ['Model', 'Seed']] + df['Parameter Combination'] = df[combinations].apply(lambda row: '_'.join(row.values.astype(str)), axis=1) + df.drop(columns=combinations, inplace=True) + + # non_overlapp_window = df.groupby(param_names).sum() + + df_melted = df.reset_index().melt(id_vars=['Parameter Combination', 'Episode'], + value_vars=columns, var_name="Measurement", + value_name="Score") + + style = 'Measurement' if len(columns) > 1 else None + prepare_plot(run_root_path / f'compare_{parameter}.png', df_melted, hue='Parameter Combination', style=style) + print('Plotting done.') diff --git a/environments/logging/plotting.py b/plotting/plotting.py similarity index 100% rename from environments/logging/plotting.py rename to plotting/plotting.py diff --git a/reload_agent.py b/reload_agent.py index 44c9464..dcd26b0 100644 --- a/reload_agent.py +++ b/reload_agent.py @@ -3,10 +3,10 @@ from pathlib import Path import yaml from natsort import natsorted -from stable_baselines3.common.evaluation import evaluate_policy +from environments import helpers as h -from environments.factory.factory_dirt import DirtProperties, DirtFactory -from environments.factory.factory_item import ItemProperties, ItemFactory +from environments.factory.factory_dirt_item import DirtItemFactory +from environments.logging.recorder import RecorderCallback warnings.filterwarnings('ignore', category=FutureWarning) warnings.filterwarnings('ignore', category=UserWarning) @@ -14,27 +14,35 @@ warnings.filterwarnings('ignore', category=UserWarning) if __name__ == '__main__': - model_name = 'DQN_1631092016' + model_name = 'PPO_1631187073' run_id = 0 seed = 69 - out_path = Path(__file__).parent / 'debug_out' + out_path = Path(__file__).parent / 'study_out' / 'e_1_1631709932'/ 'no_obs' / 'itemdirt'/'A2C_1631709932' / '0_A2C_1631709932' model_path = out_path / model_name - with (model_path / f'env_{model_name}.yaml').open('r') as f: + with (out_path / f'env_params.json').open('r') as f: env_kwargs = yaml.load(f, Loader=yaml.FullLoader) - env_kwargs.update(verbose=True, env_seed=seed) - if False: - env_kwargs.update(dirt_properties=DirtProperties(clean_amount=1, gain_amount=0.1, max_global_amount=20, - max_local_amount=1, spawn_frequency=5, max_spawn_ratio=0.05, - dirt_smear_amount=0.5), - combin_agent_slices_in_obs=True, omit_agent_slice_in_obs=True) - with ItemFactory(**env_kwargs) as env: + env_kwargs.update(verbose=False, env_seed=seed, record_episodes=True) - # Edit THIS: - env.seed(seed) - model_files = list(natsorted((model_path / f'{run_id}_{model_name}').rglob('model_*.zip'))) - this_model = model_files[0] - model_cls = next(val for key, val in model_map.items() if key in model_name) - model = model_cls.load(this_model) - evaluation_result = evaluate_policy(model, env, n_eval_episodes=100, deterministic=False, render=True) - print(evaluation_result) + this_model = out_path / 'model.zip' + + model_cls = next(val for key, val in h.MODEL_MAP.items() if key in model_name) + model = 
model_cls.load(this_model) + + with RecorderCallback(filepath=Path() / 'recorder_out.json') as recorder: + # Init Env + with DirtItemFactory(**env_kwargs) as env: + # Evaluation loop over n episodes + for episode in range(5): + obs = env.reset() + rew, done_bool = 0, False + while not done_bool: + action = model.predict(obs, deterministic=False)[0] + env_state, step_r, done_bool, info_obj = env.step(action[0]) + recorder.read_info(0, info_obj) + rew += step_r + if done_bool: + recorder.read_done(0, done_bool) + break + print(f'Factory run {episode} done, reward is:\n {rew}') + print('all done') diff --git a/setup.py b/setup.py index f87f5c1..ac693c5 100644 --- a/setup.py +++ b/setup.py @@ -1 +1,7 @@ -# TODO \ No newline at end of file +# setup.py +from setuptools import setup, find_packages + +setup( + name='F_IKS', + packages=find_packages() +) diff --git a/studies/e_1.py b/studies/e_1.py index 58d06fb..9f8eaff 100644 --- a/studies/e_1.py +++ b/studies/e_1.py @@ -1,130 +1,234 @@ -import itertools -import random +import sys from pathlib import Path +try: + # noinspection PyUnboundLocalVariable + if __package__ is None: + DIR = Path(__file__).resolve().parent + sys.path.insert(0, str(DIR.parent)) + __package__ = DIR.name + else: + DIR = None +except NameError: + DIR = None + pass + +import time + + import simplejson -from stable_baselines3 import DQN, PPO, A2C +from stable_baselines3.common.vec_env import SubprocVecEnv +from environments import helpers as h from environments.factory.factory_dirt import DirtProperties, DirtFactory +from environments.factory.factory_dirt_item import DirtItemFactory from environments.factory.factory_item import ItemProperties, ItemFactory +from environments.logging.monitor import MonitorCallback +from environments.utility_classes import MovementProperties +from plotting.compare_runs import compare_seed_runs, compare_model_runs, compare_all_parameter_runs -if __name__ == '__main__': - """ - In this studie, we want to explore the macro behaviour of multi agents which are trained on the same task, - but never saw each other in training. - Those agents learned - - - We start with training a single policy on a single task (dirt cleanup / item pickup). - Then multiple agent equipped with the same policy are deployed in the same environment. - - There are further distinctions to be made: - - 1. No Observation - ['no_obs']: - - Agent do not see each other but their consequences of their combined actions - - Agents can collide - - 2. Observation in seperate slice - [['seperate_0'], ['seperate_1'], ['seperate_N']]: - - Agents see other entitys on a seperate slice - - This slice has been filled with $0 | 1 | \mathbb{N}(0, 1)$ - -- Depending ob the fill value, agents will react diffently - -> TODO: Test this! - - 3. Observation in level slice - ['in_lvl_obs']: - - This tells the agent to treat other agents as obstacle. - - However, the state space is altered since moving obstacles are not part the original agent observation. - - We are out of distribution. - """ +# Define a global study save path +start_time = 1631709932 # int(time.time()) +study_root_path = (Path('..') if not DIR else Path()) / 'study_out' / f'{Path(__file__).stem}_{start_time}' + """ +In this study, we want to explore the macro behaviour of multiple agents which are trained on the same task, +but never saw each other in training.
+Those agents learned to perform their task in isolation. -def bundle_model(model_class): - if model_class.__class__.__name__ in ["PPO", "A2C"]: - kwargs = dict(ent_coef=0.01) - elif model_class.__class__.__name__ in ["RegDQN", "DQN", "QRDQN"]: - kwargs = dict(buffer_size=50000, - learning_starts=64, - batch_size=64, - target_update_interval=5000, - exploration_fraction=0.25, - exploration_final_eps=0.025 - ) - return lambda: model_class(kwargs) +We start with training a single policy on a single task (dirt cleanup / item pickup). +Then multiple agents equipped with the same policy are deployed in the same environment. + +There are further distinctions to be made: + +1. No Observation - ['no_obs']: +- Agents do not see each other, only the consequences of their combined actions +- Agents can collide + +2. Observation in separate slice - [['seperate_0'], ['seperate_1'], ['seperate_N']]: +- Agents see other entities on a separate slice +- This slice has been filled with $0 \mid 1 \mid \mathcal{N}(0, 1)$ +-- Depending on the fill value, agents will react differently + -> TODO: Test this! + +3. Observation in level slice - ['in_lvl_obs']: +- This tells the agent to treat other agents as obstacles. +- However, the state space is altered since moving obstacles are not part of the original agent observation. +- We are out of distribution. +""" + + +def policy_model_kwargs(): + return dict(ent_coef=0.01) + + +def dqn_model_kwargs(): + return dict(buffer_size=50000, + learning_starts=64, + batch_size=64, + target_update_interval=5000, + exploration_fraction=0.25, + exploration_final_eps=0.025 + ) + + +def encapsule_env_factory(env_fctry, env_kwrgs): + + def _init(): + with env_fctry(**env_kwrgs) as init_env: + return init_env + + return _init if __name__ == '__main__': - # Define a global studi save path - study_root_path = Path(Path(__file__).stem) / 'out' + train_steps = 5e5 - # TODO: Define Global Env Parameters - factory_kwargs = { - - - } - - # TODO: Define global model parameters - - - # TODO: Define parameters for both envs - dirt_props = DirtProperties() - item_props = ItemProperties() + # Define Global Env Parameters + # Define properties object parameters + move_props = MovementProperties(allow_diagonal_movement=True, + allow_square_movement=True, + allow_no_op=False) + dirt_props = DirtProperties(clean_amount=2, gain_amount=0.1, max_global_amount=20, + max_local_amount=1, spawn_frequency=15, max_spawn_ratio=0.05, + dirt_smear_amount=0.0, agent_can_interact=True) + item_props = ItemProperties(n_items=10, agent_can_interact=True, + spawn_frequency=30, n_drop_off_locations=2, + max_agent_inventory_capacity=15) + factory_kwargs = dict(n_agents=1, + pomdp_r=2, max_steps=400, parse_doors=False, + level_name='rooms', frames_to_stack=3, + omit_agent_in_obs=True, combin_agent_obs=True, record_episodes=False, + cast_shadows=True, doors_have_area=False, verbose=False, + movement_properties=move_props + ) # Bundle both environments with global kwargs and parameters - env_bundles = [lambda: ('dirt', DirtFactory(factory_kwargs, dirt_properties=dirt_props)), - lambda: ('item', ItemFactory(factory_kwargs, item_properties=item_props))] + env_map = {'dirt': (DirtFactory, dict(dirt_properties=dirt_props, **factory_kwargs)), + 'item': (ItemFactory, dict(item_properties=item_props, **factory_kwargs)), + 'itemdirt': (DirtItemFactory, dict(dirt_properties=dirt_props, item_properties=item_props, + **factory_kwargs))} + env_names = list(env_map.keys()) # Define parameter versions according with #1,2[1,0,N],3 - observation_modes = ['no_obs', 'seperate_0', 
'seperate_1', 'seperate_N', 'in_lvl_obs'] - - # Define RL-Models - model_bundles = [bundle_model(model) for model in [A2C, PPO, DQN]] - - # Zip parameters, parameter versions, Env Classes and models - combinations = itertools.product(model_bundles, env_bundles) + observation_modes = { + # Fill-value = 0 + 'seperate_0': dict(additional_env_kwargs=dict(additional_agent_placeholder=0)), + # Fill-value = 1 + 'seperate_1': dict(additional_env_kwargs=dict(additional_agent_placeholder=1)), + # Fill-value = N(0, 1) + 'seperate_N': dict(additional_env_kwargs=dict(additional_agent_placeholder='N')), + # Further Adjustments are done post-training + 'in_lvl_obs': dict(post_training_kwargs=dict(other_agent_obs='in_lvl')), + # No further adjustment needed + 'no_obs': None + } # Train starts here ############################################################ - # Build Major Loop - for model, (env_identifier, env_bundle) in combinations: - for observation_mode in observation_modes: - # TODO: Create an identifier, which is unique for every combination and easy to read in filesystem - identifier = f'{model.name}_{observation_mode}_{env_identifier}' - # Train each combination per seed - for seed in range(3): - # TODO: Output folder - # TODO: Monitor Init - # TODO: Env Init - # TODO: Model Init - # TODO: Model train - # TODO: Model save - pass - # TODO: Seed Compare Plot + # Build Major Loop parameters, parameter versions, Env Classes and models + if False: + for observation_mode in observation_modes.keys(): + for env_name in env_names: + for model_cls in h.MODEL_MAP.values(): + # Create an identifier, which is unique for every combination and easy to read in filesystem + identifier = f'{model_cls.__name__}_{start_time}' + # Train each combination per seed + combination_path = study_root_path / observation_mode / env_name / identifier + env_class, env_kwargs = env_map[env_name] + # Retrieve and set the observation mode specific env parameters + if observation_mode_kwargs := observation_modes.get(observation_mode, None): + if additional_env_kwargs := observation_mode_kwargs.get("additional_env_kwargs", None): + env_kwargs.update(additional_env_kwargs) + for seed in range(5): + env_kwargs.update(env_seed=seed) + # Output folder + seed_path = combination_path / f'{str(seed)}_{identifier}' + seed_path.mkdir(parents=True, exist_ok=True) + + # Monitor Init + callbacks = [MonitorCallback(seed_path / 'monitor.pick')] + + # Env Init & Model kwargs definition + if model_cls.__name__ in ["PPO", "A2C"]: + env = env_class(**env_kwargs) + + # env = SubprocVecEnv([encapsule_env_factory(env_class, env_kwargs) for _ in range(1)], + # start_method="spawn") + model_kwargs = policy_model_kwargs() + + elif model_cls.__name__ in ["RegDQN", "DQN", "QRDQN"]: + env = env_class(**env_kwargs) + model_kwargs = dqn_model_kwargs() + + else: + raise NameError(f'The model "{model_cls.__name__}" has the wrong name.') + + param_path = seed_path / f'env_params.json' + try: + env.env_method('save_params', param_path) + except AttributeError: + env.save_params(param_path) + + # Model Init + model = model_cls("MlpPolicy", env, verbose=1, seed=seed, device='cpu', **model_kwargs) + + # Model train + model.learn(total_timesteps=int(train_steps), callback=callbacks) + + # Model save + save_path = seed_path / f'model.zip' + model.save(save_path) + pass + # Compare performance runs, for each seed within a model + compare_seed_runs(combination_path) + # Compare performance runs, for each model + # FIXME: Check THIS!!!!
+ compare_model_runs(study_root_path / observation_mode / env_name, f'{start_time}', 'step_reward') # Train ends here ############################################################ # Evaluation starts here ##################################################### # Iterate Observation Modes - for observation_mode in observation_modes: - # TODO: For trained policy in study_root_path / identifier - for policy_group in (x for x in study_root_path.iterdir() if x.is_dir()): - # TODO: Pick random seed or iterate over available seeds - policy_seed = next((y for y in study_root_path.iterdir() if y.is_dir())) - # TODO: retrieve model class - # TODO: Load both agents - models = [] - # TODO: Evaluation Loop for i in range(100) Episodes - for episode in range(100): - with next(policy_seed.glob('*.yaml')).open('r') as f: - env_kwargs = simplejson.load(f) - # TODO: Monitor Init - env = None # TODO: Init Env - for step in range(400): - random_actions = [[random.randint(0, env.n_actions) for _ in range(len(models))] for _ in range(200)] - env_state = env.reset() - rew = 0 - for agent_i_action in random_actions: - env_state, step_r, done_bool, info_obj = env.step(agent_i_action) - rew += step_r - if done_bool: - break - print(f'Factory run {episode} done, reward is:\n {rew}') - # TODO: Plotting - pass + for observation_mode in observation_modes: + obs_mode_path = next(x for x in study_root_path.iterdir() if x.is_dir() and x.name == observation_mode) + # For trained policy in study_root_path / identifier + for env_path in [x for x in obs_mode_path.iterdir() if x.is_dir()]: + for policy_path in [x for x in env_path.iterdir() if x.is_dir()]: + # TODO: Pick random seed or iterate over available seeds + # First seed path version + # seed_path = next((y for y in policy_path.iterdir() if y.is_dir())) + # Iteration + for seed_path in (y for y in policy_path.iterdir() if y.is_dir()): + # retrieve model class + for model_cls in (val for key, val in h.MODEL_MAP.items() if key in policy_path.name): + # Load both agents + models = [model_cls.load(seed_path / 'model.zip') for _ in range(2)] + # Load old env kwargs + with next(seed_path.glob('*.json')).open('r') as f: + env_kwargs = simplejson.load(f) + env_kwargs.update(n_agents=2, additional_agent_placeholder=None, + **(observation_modes[observation_mode] or {}).get('post_training_kwargs', {})) + + # Monitor Init + with MonitorCallback(filepath=seed_path / f'e_1_monitor.pick') as monitor: + # Init Env + env = env_map[env_path.name][0](**env_kwargs) + # Evaluation loop over n episodes + for episode in range(50): + obs = env.reset() + rew, done_bool = 0, False + while not done_bool: + actions = [model.predict(obs[i], deterministic=False)[0] + for i, model in enumerate(models)] + env_state, step_r, done_bool, info_obj = env.step(actions) + monitor.read_info(0, info_obj) + rew += step_r + if done_bool: + monitor.read_done(0, done_bool) + break + print(f'Factory run {episode} done, reward is:\n {rew}') + # Eval monitor outputs are automatically stored by the monitor object + + # TODO: Plotting + pass
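
The central mechanism this patch introduces, training against an empty placeholder slice and later substituting a real agent into it, can be summarised in a short sketch. This is illustrative only, distilled from the `additional_agent_placeholder` constructor argument in base_factory.py and the e_1.py study config; the property values, `pomdp_r`, and level choice below are assumptions, not part of the patch:

# Hedged sketch, not part of the patch.
from environments.factory.factory_dirt import DirtFactory, DirtProperties
from environments.utility_classes import MovementProperties

move_props = MovementProperties(allow_square_movement=True, allow_diagonal_movement=True)

# Training env: one agent plus one placeholder slice. The fill value
# (0, 1 or 'N', as in the study's observation_modes) is forwarded to the
# PlaceHolders register, which fills the otherwise unused agent slice.
train_env = DirtFactory(n_agents=1, additional_agent_placeholder='N',
                        dirt_properties=DirtProperties(), movement_properties=move_props,
                        pomdp_r=2, level_name='rooms')

# Evaluation env: two agents, placeholder disabled -- the second agent now
# occupies the slice the policy saw during training (cf. the e_1.py eval loop).
eval_env = DirtFactory(n_agents=2, additional_agent_placeholder=None,
                       dirt_properties=DirtProperties(), movement_properties=move_props,
                       pomdp_r=2, level_name='rooms')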
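The reworked MonitorCallback can also be driven by hand outside of stable-baselines3 training, since __enter__ now returns the instance and read_info/read_done are public. A minimal sketch under stated assumptions: the output path is made up, and passing a bare sampled action for a single agent follows reload_agent.py rather than any documented contract:

from pathlib import Path
from environments.factory.factory_dirt import DirtFactory, DirtProperties
from environments.logging.monitor import MonitorCallback

with MonitorCallback(filepath=Path('debug_out') / 'monitor.pick') as monitor:
    with DirtFactory(n_agents=1, dirt_properties=DirtProperties()) as env:
        _ = env.reset()
        done = False
        while not done:
            # scalar action for a single agent, as in reload_agent.py
            _, reward, done, info = env.step(env.action_space.sample())
            monitor.read_info(0, info)
        monitor.read_done(0, done)  # aggregates the episode into the monitor DataFrame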
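Likewise, RecorderCallback.stop() now dumps one nested document, {'episodes': [{'steps': [...]}, ...]}, via simplejson instead of a list of pre-serialized JSON strings. A small consumer sketch (the file path is an assumption; the callback writes to whatever `filepath` it was given):

import simplejson
from pathlib import Path

with (Path('debug_out') / 'recorder_out.json').open() as f:
    record = simplejson.load(f)

for idx, episode in enumerate(record['episodes']):
    # Each step entry holds the 'rec_'-prefixed info keys with the prefix stripped.
    print(f'episode {idx}: {len(episode["steps"])} recorded steps')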