diff --git a/environments/factory/base/base_factory.py b/environments/factory/base/base_factory.py
index bb0c121..889b949 100644
--- a/environments/factory/base/base_factory.py
+++ b/environments/factory/base/base_factory.py
@@ -65,7 +65,7 @@ class BaseFactory(gym.Env):
     def __init__(self, level_name='simple', n_agents=1, max_steps=int(5e2),
                  mv_prop: MovementProperties = MovementProperties(),
                  obs_prop: ObservationProperties = ObservationProperties(),
-                 parse_doors=False, record_episodes=False, done_at_collision=False,
+                 parse_doors=False, done_at_collision=False,
                  verbose=False, doors_have_area=True, env_seed=time.time_ns(), individual_rewards=False,
                  **kwargs):
@@ -97,7 +97,7 @@ class BaseFactory(gym.Env):
         self._pomdp_r = self.obs_prop.pomdp_r

         self.done_at_collision = done_at_collision
-        self.record_episodes = record_episodes
+        self._record_episodes = False
         self.parse_doors = parse_doors
         self.doors_have_area = doors_have_area
         self.individual_rewards = individual_rewards
@@ -249,7 +249,7 @@ class BaseFactory(gym.Env):
         if self._steps >= self.max_steps:
             done = True
         info.update(step_reward=reward, step=self._steps)
-        if self.record_episodes:
+        if self._record_episodes:
             info.update(self._summarize_state())

         # Post step Hook for later use
@@ -280,7 +280,7 @@ class BaseFactory(gym.Env):
         if self.n_agents == 1:
             obs = self._build_per_agent_obs(self[c.AGENT][0], state_array_dict)
         elif self.n_agents >= 2:
-            obs = np.stack([self._build_per_agent_obs(agent, state_array_dict) for agent in self[c.AGENT]])
+            obs = np.stack(self._build_per_agent_obs(agent, state_array_dict) for agent in self[c.AGENT])
         else:
             raise ValueError('n_agents cannot be smaller than 1!!')
         return obs
@@ -290,9 +290,6 @@ class BaseFactory(gym.Env):
         agent_omit_idx = None

         if self.obs_prop.omit_agent_self and self.n_agents == 1:
-            # There is only a single agent and we want to omit the agent obs, so just remove the array.
-            # del state_array_dict[c.AGENT]
-            # Not Needed any more,
             pass
         elif self.obs_prop.omit_agent_self and self.obs_prop.render_agents in [a_obs.COMBINED, ] and self.n_agents > 1:
             state_array_dict[c.AGENT][0, agent.x, agent.y] -= agent.encoding
@@ -439,7 +436,7 @@ class BaseFactory(gym.Env):
         tiles_with_collisions = list()
         for tile in self[c.FLOOR]:
             if tile.is_occupied():
-                guests = [guest for guest in tile.guests if guest.can_collide]
+                guests = tile.guests_that_can_collide
                 if len(guests) >= 2:
                     tiles_with_collisions.append(tile)
         return tiles_with_collisions
@@ -521,7 +518,7 @@ class BaseFactory(gym.Env):
                 per_agent_info_dict[agent.name].update(no_op=1)
                 # per_agent_reward -= 0.00

-            # Monitor Notes
+            # EnvMonitor Notes
             if agent.temp_valid:
                 per_agent_info_dict[agent.name].update(valid_action=1)
                 per_agent_info_dict[agent.name].update({f'{agent.name}_valid_action': 1})
diff --git a/environments/factory/base/objects.py b/environments/factory/base/objects.py
index adec715..7bf92e1 100644
--- a/environments/factory/base/objects.py
+++ b/environments/factory/base/objects.py
@@ -209,7 +209,7 @@ class Tile(Object):
         return not len(self._guests)

     def is_occupied(self):
-        return len(self._guests)
+        return bool(len(self._guests))

     def enter(self, guest):
         if guest.name not in self._guests:
diff --git a/environments/factory/factory_dirt.py b/environments/factory/factory_dirt.py
index 675b39b..6e0a7a2 100644
--- a/environments/factory/factory_dirt.py
+++ b/environments/factory/factory_dirt.py
@@ -28,7 +28,7 @@ class DirtProperties(NamedTuple):
     max_global_amount: int = 20          # Max dirt amount in the whole environment.
     dirt_smear_amount: float = 0.2       # Agents smear dirt, when not cleaning up in place.
     agent_can_interact: bool = True      # Whether the agents can interact with the dirt in this environment.
-    done_when_clean = True
+    done_when_clean: bool = True


 class Dirt(Entity):
@@ -228,14 +228,14 @@ class DirtFactory(BaseFactory):
         dirt = [dirt.amount for dirt in self[c.DIRT]]
         current_dirt_amount = sum(dirt)
         dirty_tile_count = len(dirt)
-        if dirty_tile_count:
-            dirt_distribution_score = entropy(softmax(np.asarray(dirt)) / dirty_tile_count)
-        else:
-            dirt_distribution_score = 0
+        # if dirty_tile_count:
+        #     dirt_distribution_score = entropy(softmax(np.asarray(dirt)) / dirty_tile_count)
+        #else:
+        #     dirt_distribution_score = 0

         info_dict.update(dirt_amount=current_dirt_amount)
         info_dict.update(dirty_tile_count=dirty_tile_count)
-        info_dict.update(dirt_distribution_score=dirt_distribution_score)
+        # info_dict.update(dirt_distribution_score=dirt_distribution_score)

         if agent.temp_action == CLEAN_UP_ACTION:
             if agent.temp_valid:
diff --git a/environments/logging/monitor.py b/environments/logging/envmonitor.py
similarity index 52%
rename from environments/logging/monitor.py
rename to environments/logging/envmonitor.py
index 4b25d9c..42e11e0 100644
--- a/environments/logging/monitor.py
+++ b/environments/logging/envmonitor.py
@@ -1,7 +1,7 @@
 import pickle
 from collections import defaultdict
 from pathlib import Path
-from typing import List, Dict
+from typing import List, Dict, Union

 from stable_baselines3.common.callbacks import BaseCallback

@@ -10,57 +10,50 @@
 from environments.helpers import IGNORED_DF_COLUMNS

 import pandas as pd


-class MonitorCallback(BaseCallback):
+class EnvMonitor(BaseCallback):

     ext = 'png'

-    def __init__(self, filepath=Path('debug_out/monitor.pick')):
-        super(MonitorCallback, self).__init__()
-        self.filepath = Path(filepath)
+    def __init__(self, env):
+        super(EnvMonitor, self).__init__()
+        self.unwrapped = env
         self._monitor_df = pd.DataFrame()
         self._monitor_dicts = defaultdict(dict)
-        self.started = False
-        self.closed = False

-    def __enter__(self):
-        self.start()
-        return self
+    def __getattr__(self, item):
+        return getattr(self.unwrapped, item)

-    def __exit__(self, exc_type, exc_val, exc_tb):
-        self.stop()
+    def step(self, action):
+        obs, reward, done, info = self.unwrapped.step(action)
+        self._read_info(0, info)
+        self._read_done(0, done)
+        return obs, reward, done, info
+
+    def reset(self):
+        return self.unwrapped.reset()

     def _on_training_start(self) -> None:
-        if self.started:
-            pass
-        else:
-            self.start()
         pass

     def _on_training_end(self) -> None:
-        if self.closed:
-            pass
-        else:
-            self.stop()
+        pass

     def _on_step(self, alt_infos: List[Dict] = None, alt_dones: List[bool] = None) -> bool:
-        if self.started:
-            for env_idx, info in enumerate(self.locals.get('infos', [])):
-                self.read_info(env_idx, info)
+        for env_idx, info in enumerate(self.locals.get('infos', [])):
+            self._read_info(env_idx, info)

-            for env_idx, done in list(
-                    enumerate(self.locals.get('dones', []))) + list(enumerate(self.locals.get('done', []))):
-                self.read_done(env_idx, done)
-        else:
-            pass
+        for env_idx, done in list(
+                enumerate(self.locals.get('dones', []))) + list(enumerate(self.locals.get('done', []))):
+            self._read_done(env_idx, done)
         return True

-    def read_info(self, env_idx, info: dict):
+    def _read_info(self, env_idx, info: dict):
         self._monitor_dicts[env_idx][len(self._monitor_dicts[env_idx])] = {
             key: val for key, val in info.items() if
             key not in ['terminal_observation', 'episode'] and not key.startswith('rec_')}
return - def read_done(self, env_idx, done): + def _read_done(self, env_idx, done): if done: env_monitor_df = pd.DataFrame.from_dict(self._monitor_dicts[env_idx], orient='index') self._monitor_dicts[env_idx] = dict() @@ -74,16 +67,8 @@ class MonitorCallback(BaseCallback): pass return - def stop(self): - # self.out_file.unlink(missing_ok=True) - with self.filepath.open('wb') as f: + def save_run(self, filepath: Union[Path, str]): + filepath = Path(filepath) + filepath.parent.mkdir(exist_ok=True, parents=True) + with filepath.open('wb') as f: pickle.dump(self._monitor_df.reset_index(), f, protocol=pickle.HIGHEST_PROTOCOL) - self.closed = True - - def start(self): - if self.started: - pass - else: - self.filepath.parent.mkdir(exist_ok=True, parents=True) - self.started = True - pass diff --git a/environments/logging/recorder.py b/environments/logging/recorder.py index 61f8391..38569ca 100644 --- a/environments/logging/recorder.py +++ b/environments/logging/recorder.py @@ -1,4 +1,3 @@ -import json from collections import defaultdict from pathlib import Path from typing import Union @@ -11,22 +10,13 @@ from stable_baselines3.common.callbacks import BaseCallback from environments.factory.base.base_factory import REC_TAC -# noinspection PyAttributeOutsideInit -from environments.helpers import Constants as c +class EnvRecorder(BaseCallback): - -class RecorderCallback(BaseCallback): - - def __init__(self, filepath: Union[str, Path], occupation_map: bool = False, trajectory_map: bool = False, - entities='all'): - super(RecorderCallback, self).__init__() - self.trajectory_map = trajectory_map - self.occupation_map = occupation_map - self.filepath = Path(filepath) + def __init__(self, env, entities='all'): + super(EnvRecorder, self).__init__() + self.unwrapped = env self._recorder_dict = defaultdict(list) self._recorder_out_list = list() - self._env_params = None - self.do_record: bool if isinstance(entities, str): if entities.lower() == 'all': self._entities = None @@ -37,10 +27,18 @@ class RecorderCallback(BaseCallback): self.started = False self.closed = False - def read_params(self, params): - self._env_params = params + def __getattr__(self, item): + return getattr(self.unwrapped, item) - def read_info(self, env_idx, info: dict): + def reset(self): + self.unwrapped._record_episodes = True + return self.unwrapped.reset() + + def _on_training_start(self) -> None: + self.unwrapped._record_episodes = True + pass + + def _read_info(self, env_idx, info: dict): if info_dict := {key.replace(REC_TAC, ''): val for key, val in info.items() if key.startswith(f'{REC_TAC}')}: if self._entities: info_dict = {k: v for k, v in info_dict.items() if k in self._entities} @@ -51,7 +49,7 @@ class RecorderCallback(BaseCallback): pass return - def read_done(self, env_idx, done): + def _read_done(self, env_idx, done): if done: self._recorder_out_list.append({'steps': self._recorder_dict[env_idx], 'episode': len(self._recorder_out_list)}) @@ -59,77 +57,46 @@ class RecorderCallback(BaseCallback): else: pass - def start(self, force=False): - if (hasattr(self.training_env, 'record_episodes') and self.training_env.record_episodes) or force: - self.do_record = True - self.filepath.parent.mkdir(exist_ok=True, parents=True) - self.started = True - else: - self.do_record = False + def save_records(self, filepath: Union[Path, str], save_occupation_map=False, save_trajectory_map=False): + filepath = Path(filepath) + filepath.parent.mkdir(exist_ok=True, parents=True) + # self.out_file.unlink(missing_ok=True) + with 
+            out_dict = {'episodes': self._recorder_out_list, 'header': self.unwrapped.params}
+            try:
+                simplejson.dump(out_dict, f, indent=4)
+            except TypeError:
+                print('Shit')

-    def stop(self):
-        if self.do_record and self.started:
-            # self.out_file.unlink(missing_ok=True)
-            with self.filepath.open('w') as f:
-                out_dict = {'episodes': self._recorder_out_list, 'header': self._env_params}
-                try:
-                    simplejson.dump(out_dict, f, indent=4)
-                except TypeError:
-                    print('Shit')
+        if save_occupation_map:
+            a = np.zeros((15, 15))
+            for episode in out_dict['episodes']:
+                df = pd.DataFrame([y for x in episode['steps'] for y in x['Agents']])

-            if self.occupation_map:
-                a = np.zeros((15, 15))
-                for episode in out_dict['episodes']:
-                    df = pd.DataFrame([y for x in episode['steps'] for y in x['Agents']])
+                b = list(df[['x', 'y']].to_records(index=False))

-                    b = list(df[['x', 'y']].to_records(index=False))
+                np.add.at(a, tuple(zip(*b)), 1)

-                    np.add.at(a, tuple(zip(*b)), 1)
+            # a = np.rot90(a)
+            import seaborn as sns
+            from matplotlib import pyplot as plt
+            hm = sns.heatmap(data=a)
+            hm.set_title('Very Nice Heatmap')
+            plt.show()

-                # a = np.rot90(a)
-                import seaborn as sns
-                from matplotlib import pyplot as plt
-                hm = sns.heatmap(data=a)
-                hm.set_title('Very Nice Heatmap')
-                plt.show()
-
-            if self.trajectory_map:
-                print('Recorder files were dumped to disk, now plotting the occupation map...')
-
-            self.closed = True
-            self.started = False
-        else:
-            pass
+        if save_trajectory_map:
+            raise NotImplementedError('This has not yet been implemented.')

     def _on_step(self) -> bool:
-        if self.do_record and self.started:
-            for env_idx, info in enumerate(self.locals.get('infos', [])):
-                self.read_info(env_idx, info)
+        for env_idx, info in enumerate(self.locals.get('infos', [])):
+            self._read_info(env_idx, info)
+
+        dones = list(enumerate(self.locals.get('dones', [])))
+        dones.extend(list(enumerate(self.locals.get('done', []))))
+        for env_idx, done in dones:
+            self._read_done(env_idx, done)

-        for env_idx, done in list(
-                enumerate(self.locals.get('dones', []))) + list(
-                enumerate(self.locals.get('done', []))):
-            self.read_done(env_idx, done)
-        else:
-            pass
         return True

-    def __enter__(self):
-        self.start(force=True)
-        return self
-
-    def __exit__(self, exc_type, exc_val, exc_tb):
-        self.stop()
-
-    def _on_training_start(self) -> None:
-        if self.started:
-            pass
-        else:
-            self.start()
-        pass
-
     def _on_training_end(self) -> None:
-        if self.closed:
-            pass
-        else:
-            self.stop()
+        pass
diff --git a/environments/logging/training.py b/environments/logging/training.py
deleted file mode 100644
index d467981..0000000
--- a/environments/logging/training.py
+++ /dev/null
@@ -1,54 +0,0 @@
-from collections import defaultdict
-from pathlib import Path
-
-import numpy as np
-import pandas as pd
-from stable_baselines3.common.callbacks import BaseCallback
-
-from environments.logging.plotting import prepare_plot
-
-
-class TraningMonitor(BaseCallback):
-
-    def __init__(self, filepath, flush_interval=None):
-        super(TraningMonitor, self).__init__()
-        self.values = defaultdict(dict)
-        self.rewards = defaultdict(lambda: 0)
-
-        self.filepath = Path(filepath)
-        self.flush_interval = flush_interval
-        self.next_flush: int
-        pass
-
-    def _on_training_start(self) -> None:
-        self.flush_interval = self.flush_interval or (self.locals['total_timesteps'] * 0.1)
-        self.next_flush = self.flush_interval
-
-    def _flush(self):
-        df = pd.DataFrame.from_dict(self.values, orient='index')
-        if not self.filepath.exists():
-            df.to_csv(self.filepath, mode='wb', header=True)
-        else:
-            df.to_csv(self.filepath, mode='a', header=False)
-
-    def _on_step(self) -> bool:
-        for idx, done in np.ndenumerate(self.locals.get('dones', [])):
-            idx = idx[0]
-            # self.values[self.num_timesteps].update(**{f'reward_env_{idx}': self.locals['rewards'][idx]})
-            self.rewards[idx] += self.locals['rewards'][idx]
-            if done:
-                self.values[self.num_timesteps].update(**{f'acc_epispde_r_env_{idx}': self.rewards[idx]})
-                self.rewards[idx] = 0
-
-        if self.num_timesteps >= self.next_flush and self.values:
-            self._flush()
-            self.values = defaultdict(dict)
-
-            self.next_flush += self.flush_interval
-        return True
-
-    def on_training_end(self) -> None:
-        self._flush()
-        self.values = defaultdict(dict)
-        # prepare_plot()
-
diff --git a/environments/utility_classes.py b/environments/utility_classes.py
index 3069d6d..5ca2caf 100644
--- a/environments/utility_classes.py
+++ b/environments/utility_classes.py
@@ -1,4 +1,3 @@
-from enum import Enum
 from typing import NamedTuple, Union

diff --git a/plotting/plotting.py b/plotting/plotting.py
index 4f0fba0..eb76881 100644
--- a/plotting/plotting.py
+++ b/plotting/plotting.py
@@ -39,7 +39,9 @@ def prepare_plt(df, hue, style, hue_order):
     plt.close('all')
     sns.set(rc={'text.usetex': False}, style='whitegrid')
     lineplot = sns.lineplot(data=df, x='Episode', y='Score', hue=hue, style=style,
-                            ci=95, palette=PALETTE, hue_order=hue_order)
+                            ci=95, palette=PALETTE, hue_order=hue_order, )
+    plt.legend(bbox_to_anchor=(1.02, 1), loc='upper left', borderaxespad=0)
+    plt.tight_layout()
     # lineplot.set_title(f'{sorted(list(df["Measurement"].unique()))}')
     return lineplot

diff --git a/reload_agent.py b/reload_agent.py
index f569447..ac66fc1 100644
--- a/reload_agent.py
+++ b/reload_agent.py
@@ -8,7 +8,7 @@ from environments import helpers as h
 from environments.helpers import Constants as c
 from environments.factory.factory_dirt import DirtFactory
 from environments.factory.combined_factories import DirtItemFactory
-from environments.logging.recorder import RecorderCallback
+from environments.logging.recorder import EnvRecorder

 warnings.filterwarnings('ignore', category=FutureWarning)
 warnings.filterwarnings('ignore', category=UserWarning)
@@ -16,14 +16,13 @@ warnings.filterwarnings('ignore', category=UserWarning)

 if __name__ == '__main__':

-    model_name = 'A2C_ItsDirt'
-    run_id = 0
-    determin = True
-    render=False
+    determin = False
+    render = True
     record = True
     seed = 67
-    n_agents = 1
-    out_path = Path('study_out/e_1_Now_with_doors/no_obs/dirt/A2C_Now_with_doors/0_A2C_Now_with_doors')
+    n_agents = 2
+    out_path = Path('study_out/e_1_obs_stack_3_gae_0.25_n_steps_16/seperate_N/dirt/A2C_obs_stack_3_gae_0.25_n_steps_16/0_A2C_obs_stack_3_gae_0.25_n_steps_16')
+    out_path_2 = Path('study_out/e_1_obs_stack_3_gae_0.25_n_steps_16/seperate_N/dirt/A2C_obs_stack_3_gae_0.25_n_steps_16/1_A2C_obs_stack_3_gae_0.25_n_steps_16')
     model_path = out_path

     with (out_path / f'env_params.json').open('r') as f:
@@ -33,42 +32,35 @@ if __name__ == '__main__':
         env_kwargs['dirt_prop']['max_spawn_amount'] = gain_amount
         del env_kwargs['dirt_prop']['gain_amount']

-    env_kwargs.update(record_episodes=record)
+    env_kwargs.update(record_episodes=record, done_at_collision=True)

     this_model = out_path / 'model.zip'
+    other_model = out_path / 'model.zip'

-    model_cls = next(val for key, val in h.MODEL_MAP.items() if key in model_name)
-    models = [model_cls.load(this_model) for _ in range(n_agents)]
+    model_cls = next(val for key, val in h.MODEL_MAP.items() if key in out_path.parent.name)
+    models = [model_cls.load(this_model), model_cls.load(other_model)]

-    with RecorderCallback(filepath=Path() / 'recorder_out_DQN.json', occupation_map=True,
-                          entities=['Agents']) as recorder:
-        # Init Env
-        with DirtFactory(**env_kwargs) as env:
-            obs_shape = env.observation_space.shape
-            # Evaluation Loop for i in range(n Episodes)
-            recorder.read_params(env.params)
-            for episode in range(200):
-                env_state = env.reset()
-                rew, done_bool = 0, False
-                while not done_bool:
-                    if n_agents > 1:
-                        actions = [model.predict(
-                            np.stack([env_state[i][j] for i in range(env_state.shape[0])]),
-                            deterministic=determin)[0] for j, model in enumerate(models)]
-                    else:
-                        actions = models[0].predict(env_state, deterministic=determin)[0]
-                    if False:
-                        if any([agent.pos in [door.pos for door in env.unwrapped[c.DOORS]]
-                                for agent in env.unwrapped[c.AGENT]]):
-                            print('On Door')
-                    env_state, step_r, done_bool, info_obj = env.step(actions)
+    # Init Env
+    with DirtFactory(**env_kwargs) as env:
+        env = EnvRecorder(env)
+        obs_shape = env.observation_space.shape
+        # Evaluation Loop for i in range(n Episodes)
+        for episode in range(50):
+            env_state = env.reset()
+            rew, done_bool = 0, False
+            while not done_bool:
+                if n_agents > 1:
+                    actions = [model.predict(
+                        np.stack([env_state[i][j] for i in range(env_state.shape[0])]),
+                        deterministic=determin)[0] for j, model in enumerate(models)]
+                else:
+                    actions = models[0].predict(env_state, deterministic=determin)[0]
+                env_state, step_r, done_bool, info_obj = env.step(actions)

-                    recorder.read_info(0, info_obj)
-                    rew += step_r
-                    if render:
-                        env.render()
-                    if done_bool:
-                        recorder.read_done(0, done_bool)
-                        break
+                rew += step_r
+                if render:
+                    env.render()
+                if done_bool:
+                    break
             print(f'Factory run {episode} done, reward is:\n    {rew}')
     print('all done')
diff --git a/studies/e_1.py b/studies/e_1.py
index e880f59..62ed7d4 100644
--- a/studies/e_1.py
+++ b/studies/e_1.py
@@ -18,7 +18,6 @@ except NameError:

 import time
-
 import simplejson

 from stable_baselines3.common.vec_env import SubprocVecEnv
@@ -26,13 +25,17 @@
 from environments import helpers as h
 from environments.factory.factory_dirt import DirtProperties, DirtFactory
 from environments.factory.combined_factories import DirtItemFactory
 from environments.factory.factory_item import ItemProperties, ItemFactory
-from environments.logging.monitor import MonitorCallback
+from environments.logging.envmonitor import EnvMonitor
 from environments.utility_classes import MovementProperties, ObservationProperties, AgentRenderOptions
 import pickle
 from plotting.compare_runs import compare_seed_runs, compare_model_runs, compare_all_parameter_runs
 import pandas as pd
 import seaborn as sns
+import multiprocessing as mp
+
+# mp.set_start_method("spawn")
+

 """
 In this studie, we want to explore the macro behaviour of multi agents
 which are trained on the same task, but never saw each other in training.
@@ -69,9 +72,10 @@
 n_agents = 4
 ood_monitor_file = f'e_1_{n_agents}_agents'
 baseline_monitor_file = 'e_1_baseline'

+from stable_baselines3 import A2C


 def policy_model_kwargs():
-    return dict()
+    return dict(gae_lambda=0.25, n_steps=16, max_grad_norm=0, use_rms_prop=False)


 def dqn_model_kwargs():
@@ -102,27 +106,23 @@ def load_model_run_baseline(seed_path, env_to_run):
     with next(seed_path.glob('*.json')).open('r') as f:
         env_kwargs = simplejson.load(f)
     env_kwargs.update(done_at_collision=True)
-    # Monitor Init
-    with MonitorCallback(filepath=seed_path / f'{baseline_monitor_file}.pick') as monitor:
-        # Init Env
-        with env_to_run(**env_kwargs) as env_factory:
-            # Evaluation Loop for i in range(n Episodes)
-            for episode in range(100):
-                env_state = env_factory.reset()
-                rew, done_bool = 0, False
-                while not done_bool:
-                    action = model.predict(env_state, deterministic=True)[0]
-                    env_state, step_r, done_bool, info_obj = env_factory.step(action)
-                    monitor.read_info(0, info_obj)
-                    rew += step_r
-                    if done_bool:
-                        monitor.read_done(0, done_bool)
-                        break
-                print(f'Factory run {episode} done, reward is:\n    {rew}')
-    # Eval monitor outputs are automatically stored by the monitor object
-    # del model, env_kwargs, env_factory
-    # import gc
-    # gc.collect()
+    # Init Env
+    with env_to_run(**env_kwargs) as env_factory:
+        monitored_env_factory = EnvMonitor(env_factory)
+
+        # Evaluation Loop for i in range(n Episodes)
+        for episode in range(100):
+            env_state = monitored_env_factory.reset()
+            rew, done_bool = 0, False
+            while not done_bool:
+                action = model.predict(env_state, deterministic=True)[0]
+                env_state, step_r, done_bool, info_obj = monitored_env_factory.step(action)
+                rew += step_r
+                if done_bool:
+                    break
+            print(f'Factory run {episode} done, reward is:\n    {rew}')
+    monitored_env_factory.save_run(filepath=seed_path / f'{ood_monitor_file}.pick')
+


 def load_model_run_study(seed_path, env_to_run, additional_kwargs_dict):
@@ -138,33 +138,31 @@ def load_model_run_study(seed_path, env_to_run, additional_kwargs_dict):
                       n_agents=n_agents, done_at_collision=True,
                       **additional_kwargs_dict.get('post_training_kwargs', {}))
-    # Monitor Init
-    with MonitorCallback(filepath=seed_path / f'{ood_monitor_file}.pick') as monitor:
-        # Init Env
-        with env_to_run(**env_kwargs) as env_factory:
-            # Evaluation Loop for i in range(n Episodes)
-            for episode in range(50):
-                env_state = env_factory.reset()
-                rew, done_bool = 0, False
-                while not done_bool:
-                    try:
-                        actions = [model.predict(
-                            np.stack([env_state[i][j] for i in range(env_state.shape[0])]),
-                            deterministic=True)[0] for j, model in enumerate(models)]
-                    except ValueError as e:
-                        print(e)
-                        print('Env_Kwargs are:\n')
-                        print(env_kwargs)
-                        print('Path is:\n')
-                        print(seed_path)
-                        exit()
-                    env_state, step_r, done_bool, info_obj = env_factory.step(actions)
-                    monitor.read_info(0, info_obj)
-                    rew += step_r
-                    if done_bool:
-                        monitor.read_done(0, done_bool)
-                        break
-                print(f'Factory run {episode} done, reward is:\n    {rew}')
+    # Init Env
+    with env_to_run(**env_kwargs) as env_factory:
+        monitored_factory_env = EnvMonitor(env_factory)
+        # Evaluation Loop for i in range(n Episodes)
+        for episode in range(50):
+            env_state = monitored_factory_env.reset()
+            rew, done_bool = 0, False
+            while not done_bool:
+                try:
+                    actions = [model.predict(
+                        np.stack([env_state[i][j] for i in range(env_state.shape[0])]),
+                        deterministic=True)[0] for j, model in enumerate(models)]
+                except ValueError as e:
+                    print(e)
+                    print('Env_Kwargs are:\n')
+                    print(env_kwargs)
+                    print('Path is:\n')
+                    print(seed_path)
+                    exit()
+                env_state, step_r, done_bool, info_obj = monitored_factory_env.step(actions)
+                rew += step_r
+                if done_bool:
+                    break
+            print(f'Factory run {episode} done, reward is:\n    {rew}')
+        monitored_factory_env.save_run(filepath=seed_path / f'{ood_monitor_file}.pick')
     # Eval monitor outputs are automatically stored by the monitor object
     del models, env_kwargs, env_factory
     import gc
@@ -174,27 +172,25 @@ def start_mp_study_run(envs_map, policies_path):
     paths = list(y for y in policies_path.iterdir() if y.is_dir() and not (y / f'{ood_monitor_file}.pick').exists())
     if paths:
-        import multiprocessing as mp
-        pool = mp.Pool(mp.cpu_count())
-        print("Starting MP with: ", pool._processes, " Processes")
-        _ = pool.starmap(load_model_run_study,
-                         it.product(paths,
-                                    (envs_map[policies_path.parent.name][0],),
-                                    (observation_modes[policies_path.parent.parent.name],))
-                         )
+        with mp.get_context("spawn").Pool(mp.cpu_count()) as pool:
+            print("Starting MP with: ", pool._processes, " Processes")
+            _ = pool.starmap(load_model_run_study,
+                             it.product(paths,
                                        (envs_map[policies_path.parent.name][0],),
+                                        (observation_modes[policies_path.parent.parent.name],))
+                             )


 def start_mp_baseline_run(envs_map, policies_path):
     paths = list(y for y in policies_path.iterdir() if y.is_dir() and not (y / f'{baseline_monitor_file}.pick').exists())
     if paths:
-        import multiprocessing as mp
-        pool = mp.Pool(mp.cpu_count())
-        print("Starting MP with: ", pool._processes, " Processes")
-        _ = pool.starmap(load_model_run_baseline,
-                         it.product(paths,
-                                    (envs_map[policies_path.parent.name][0],))
-                         )
+        with mp.get_context("spawn").Pool(mp.cpu_count()) as pool:
+            print("Starting MP with: ", pool._processes, " Processes")
+            _ = pool.starmap(load_model_run_baseline,
+                             it.product(paths,
+                                        (envs_map[policies_path.parent.name][0],))
+                             )


 if __name__ == '__main__':
@@ -206,9 +202,10 @@
     train_steps = 5e6
     n_seeds = 3
+    frames_to_stack = 3

     # Define a global studi save path
-    start_time = 'exploring_obs_stack'  # int(time.time())
+    start_time = 'obs_stack_3_gae_0.25_n_steps_16'  # int(time.time())
     study_root_path = Path(__file__).parent.parent / 'study_out' / f'{Path(__file__).stem}_{start_time}'

     # Define Global Env Parameters
@@ -216,7 +213,7 @@ if __name__ == '__main__':
     obs_props = ObservationProperties(render_agents=AgentRenderOptions.NOT,
                                       omit_agent_self=True,
                                       additional_agent_placeholder=None,
-                                      frames_to_stack=6,
+                                      frames_to_stack=frames_to_stack,
                                       pomdp_r=2
                                       )
     move_props = MovementProperties(allow_diagonal_movement=True,
@@ -234,7 +231,8 @@ if __name__ == '__main__':
                              level_name='rooms', record_episodes=False, doors_have_area=True,
                              verbose=False,
                              mv_prop=move_props,
-                             obs_prop=obs_props
+                             obs_prop=obs_props,
+                             done_at_collision=True
                              )

     # Bundle both environments with global kwargs and parameters
@@ -250,44 +248,45 @@ if __name__ == '__main__':
     # Define parameter versions according with #1,2[1,0,N],3
     observation_modes = {}
-    observation_modes.update({
-        'seperate_1': dict(
-            post_training_kwargs=
-                dict(obs_prop=ObservationProperties(
-                    render_agents=AgentRenderOptions.COMBINED,
-                    additional_agent_placeholder=None,
-                    omit_agent_self=True,
-                    frames_to_stack=3,
-                    pomdp_r=2)
-            ),
-            additional_env_kwargs=
-                dict(obs_prop=ObservationProperties(
-                    render_agents=AgentRenderOptions.NOT,
-                    additional_agent_placeholder=1,
-                    omit_agent_self=True,
-                    frames_to_stack=3,
-                    pomdp_r=2)
-            )
-        )})
-    observation_modes.update({
-        'seperate_0': dict(
-            post_training_kwargs=
-                dict(obs_prop=ObservationProperties(
-                    render_agents=AgentRenderOptions.COMBINED,
-                    additional_agent_placeholder=None,
-                    omit_agent_self=True,
-                    frames_to_stack=3,
-                    pomdp_r=2)
-            ),
-            additional_env_kwargs=
-                dict(obs_prop=ObservationProperties(
-                    render_agents=AgentRenderOptions.NOT,
-                    additional_agent_placeholder=0,
-                    omit_agent_self=True,
-                    frames_to_stack=3,
-                    pomdp_r=2)
-            )
-        )})
+    if False:
+        observation_modes.update({
+            'seperate_1': dict(
+                post_training_kwargs=
+                    dict(obs_prop=ObservationProperties(
+                        render_agents=AgentRenderOptions.COMBINED,
+                        additional_agent_placeholder=None,
+                        omit_agent_self=True,
+                        frames_to_stack=frames_to_stack,
+                        pomdp_r=2)
+                ),
+                additional_env_kwargs=
+                    dict(obs_prop=ObservationProperties(
+                        render_agents=AgentRenderOptions.NOT,
+                        additional_agent_placeholder=1,
+                        omit_agent_self=True,
+                        frames_to_stack=frames_to_stack,
+                        pomdp_r=2)
+                )
+            )})
+        observation_modes.update({
+            'seperate_0': dict(
+                post_training_kwargs=
+                    dict(obs_prop=ObservationProperties(
+                        render_agents=AgentRenderOptions.COMBINED,
+                        additional_agent_placeholder=None,
+                        omit_agent_self=True,
+                        frames_to_stack=frames_to_stack,
+                        pomdp_r=2)
+                ),
+                additional_env_kwargs=
+                    dict(obs_prop=ObservationProperties(
+                        render_agents=AgentRenderOptions.NOT,
+                        additional_agent_placeholder=0,
+                        omit_agent_self=True,
+                        frames_to_stack=frames_to_stack,
+                        pomdp_r=2)
+                )
+            )})
     observation_modes.update({
         'seperate_N': dict(
             post_training_kwargs=
@@ -295,7 +294,7 @@ if __name__ == '__main__':
                 render_agents=AgentRenderOptions.COMBINED,
                 additional_agent_placeholder=None,
                 omit_agent_self=True,
-                frames_to_stack=3,
+                frames_to_stack=frames_to_stack,
                 pomdp_r=2)
             ),
             additional_env_kwargs=
@@ -303,7 +302,7 @@ if __name__ == '__main__':
                 render_agents=AgentRenderOptions.NOT,
                 additional_agent_placeholder='N',
                 omit_agent_self=True,
-                frames_to_stack=3,
+                frames_to_stack=frames_to_stack,
                 pomdp_r=2)
             )
         )})
@@ -314,7 +313,7 @@ if __name__ == '__main__':
                 render_agents=AgentRenderOptions.LEVEL,
                 omit_agent_self=True,
                 additional_agent_placeholder=None,
-                frames_to_stack=3,
+                frames_to_stack=frames_to_stack,
                 pomdp_r=2)
             )
         )})
@@ -326,7 +325,7 @@ if __name__ == '__main__':
                 render_agents=AgentRenderOptions.NOT,
                 additional_agent_placeholder=None,
                 omit_agent_self=True,
-                frames_to_stack=3,
+                frames_to_stack=frames_to_stack,
                 pomdp_r=2)
             )
         )
@@ -355,9 +354,6 @@ if __name__ == '__main__':
                 continue
             seed_path.mkdir(parents=True, exist_ok=True)

-            # Monitor Init
-            callbacks = [MonitorCallback(seed_path / 'monitor.pick')]
-
             # Env Init & Model kwargs definition
             if model_cls.__name__ in ["PPO", "A2C"]:
                 # env_factory = env_class(**env_kwargs)
@@ -378,6 +374,9 @@ if __name__ == '__main__':
                 except AttributeError:
                     env_factory.save_params(param_path)

+                # EnvMonitor Init
+                callbacks = [EnvMonitor(env_factory)]
+
                 # Model Init
                 model = model_cls("MlpPolicy", env_factory,
                                   verbose=1, seed=seed, device='cpu',
@@ -390,6 +389,9 @@ if __name__ == '__main__':
                 save_path = seed_path / f'model.zip'
                 model.save(save_path)

+                # Monitor Save
+                callbacks[0].save_run(seed_path / 'monitor.pick')
+
                 # Better be save then sorry: Clean up!
                 del env_factory, model
                 import gc
@@ -500,13 +502,14 @@ if __name__ == '__main__':
         df['failed_cleanup'] = df.loc[:, df.columns.str.contains("]_failed_dirt_cleanup")].sum(1)
         df['coll_lvl'] = df.loc[:, df.columns.str.contains("]_vs_LEVEL")].sum(1)
         df['coll_agent'] = df.loc[:, df.columns.str.contains("]_vs_Agent")].sum(1) / 2
-        # df['collisions'] = df['coll_lvl'] + df['coll_agent']
+        # df['`collis`ions'] = df['coll_lvl'] + df['coll_agent']
         value_vars = ['pick_up', 'drop_off', 'failed_item_action', 'failed_cleanup', 'coll_lvl', 'coll_agent',
                       'dirt_cleaned']

         df_grouped = df.groupby(id_cols + ['seed']
-                                ).agg({key: 'sum' if "Agent" in key else 'mean' for key in df.columns
+                                # 'sum' if "agent" in key else 'mean'
+                                ).agg({key: 'sum' for key in df.columns
                                        if key not in (id_cols + ['seed'])})
         df_melted = df_grouped.reset_index().melt(id_vars=id_cols,
                                                   value_vars=value_vars,  # 'step_reward',
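
Note (not part of the patch): the evaluation scripts above drive the renamed EnvMonitor wrapper by hand instead of the old MonitorCallback context manager. The following is a minimal, hedged usage sketch of that pattern, assembled only from the method names visible in this diff (EnvMonitor.step/reset/save_run, the forwarding __getattr__, and the DirtFactory context manager); module paths and the DirtFactory keyword arguments are assumptions taken from the diff and may differ from the actual repository.

# Hypothetical usage sketch of the EnvMonitor wrapper introduced in this patch.
from pathlib import Path

from environments.factory.factory_dirt import DirtFactory, DirtProperties  # paths as used in studies/e_1.py
from environments.logging.envmonitor import EnvMonitor

if __name__ == '__main__':
    # DirtFactory supports the context-manager protocol ("with ... as env_factory") in the diff above.
    with DirtFactory(n_agents=1, dirt_prop=DirtProperties()) as env_factory:
        env = EnvMonitor(env_factory)  # unknown attributes (action_space, render, ...) forward via __getattr__
        for episode in range(5):
            obs = env.reset()
            done = False
            while not done:
                # EnvMonitor.step() records each step's info dict and flushes the episode
                # into an internal DataFrame once done is True.
                obs, reward, done, info = env.step(env.action_space.sample())
        # save_run() creates parent directories and pickles the collected DataFrame,
        # replacing the old MonitorCallback.stop() behaviour.
        env.save_run(Path('debug_out') / 'monitor.pick')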