diff --git a/environments/factory/base/base_factory.py b/environments/factory/base/base_factory.py
index b7b515d..6270d52 100644
--- a/environments/factory/base/base_factory.py
+++ b/environments/factory/base/base_factory.py
@@ -35,7 +35,7 @@ class BaseFactory(gym.Env):
 
     @property
     def named_action_space(self):
-        return {x.identifier.value: idx for idx, x in enumerate(self._actions.values())}
+        return {x.identifier: idx for idx, x in enumerate(self._actions.values())}
 
     @property
     def observation_space(self):
@@ -287,7 +287,7 @@ class BaseFactory(gym.Env):
             doors.tick_doors()
 
         # Finalize
-        reward, reward_info = self.build_reward_result()
+        reward, reward_info = self.build_reward_result(rewards)
         info.update(reward_info)
 
         if self._steps >= self.max_steps:
@@ -313,8 +313,8 @@ class BaseFactory(gym.Env):
             if door is not None:
                 door.use()
                 valid = c.VALID
-                self.print(f'{agent.name} just used a door {door.name}')
-                info_dict = {f'{agent.name}_door_use_{door.name}': 1}
+                self.print(f'{agent.name} just used a {door.name} at {door.pos}')
+                info_dict = {f'{agent.name}_door_use': 1}
             # When he doesn't...
             else:
                 valid = c.NOT_VALID
@@ -478,8 +478,7 @@ class BaseFactory(gym.Env):
         return oobs
 
     def get_all_tiles_with_collisions(self) -> List[Tile]:
-        tiles = [x.tile for y in self._entities for x in y if
-                 y.can_collide and not isinstance(y, WallTiles) and x.can_collide and len(x.tile.guests) > 1]
+        tiles = [x for x in self[c.FLOOR] if len(x.guests_that_can_collide) > 1]
         if False:
             tiles_with_collisions = list()
             for tile in self[c.FLOOR]:
@@ -503,11 +502,11 @@ class BaseFactory(gym.Env):
             else:
                 valid = c.NOT_VALID
                 self.print(f'{agent.name} just hit the wall at {agent.pos}.')
-                info_dict.update({f'{agent.pos}_wall_collide': 1})
+                info_dict.update({f'{agent.name}_wall_collide': 1})
         else:
             # Agent seems to be trying to Leave the level
             self.print(f'{agent.name} tried to leave the level {agent.pos}.')
-            info_dict.update({f'{agent.pos}_wall_collide': 1})
+            info_dict.update({f'{agent.name}_wall_collide': 1})
         reward_value = r.MOVEMENTS_VALID if valid else r.MOVEMENTS_FAIL
         reward = {'value': reward_value, 'reason': action.identifier, 'info': info_dict}
         return valid, reward
@@ -554,7 +553,7 @@ class BaseFactory(gym.Env):
     def additional_per_agent_rewards(self, agent) -> List[dict]:
         return []
 
-    def build_reward_result(self) -> (int, dict):
+    def build_reward_result(self, global_env_rewards: list) -> (int, dict):
        # Returns: Reward, Info
        info = defaultdict(lambda: 0.0)
 
@@ -584,12 +583,14 @@ class BaseFactory(gym.Env):
         combined_info_dict = dict(combined_info_dict)
         combined_info_dict.update(info)
 
+        global_reward_sum = sum(global_env_rewards)
         if self.individual_rewards:
             self.print(f"rewards are {comb_rewards}")
             reward = list(comb_rewards.values())
+            reward = [x + global_reward_sum for x in reward]
             return reward, combined_info_dict
         else:
-            reward = sum(comb_rewards.values())
+            reward = sum(comb_rewards.values()) + global_reward_sum
             self.print(f"reward is {reward}")
             return reward, combined_info_dict
 
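Note on the reward change above: build_reward_result() now receives the step-global rewards collected during the step and folds their sum into every agent's reward (or into the single scalar reward). A minimal sketch of that aggregation, assuming comb_rewards stands in for the per-agent reward dict the method builds internally:

    # Sketch only, not the project's code; names mirror those used in build_reward_result().
    def aggregate_rewards(comb_rewards: dict, global_env_rewards: list, individual_rewards: bool):
        global_reward_sum = sum(global_env_rewards)
        if individual_rewards:
            # one reward per agent, each shifted by the shared global component
            return [r + global_reward_sum for r in comb_rewards.values()]
        # single scalar reward for the whole environment
        return sum(comb_rewards.values()) + global_reward_sum
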
diff --git a/environments/factory/factory_dirt.py b/environments/factory/factory_dirt.py
index 4f08073..f5c8d2a 100644
--- a/environments/factory/factory_dirt.py
+++ b/environments/factory/factory_dirt.py
@@ -268,7 +268,7 @@ class DirtFactory(BaseFactory):
 
 if __name__ == '__main__':
     from environments.utility_classes import AgentRenderOptions as aro
-    render = False
+    render = True
 
     dirt_props = DirtProperties(
         initial_dirt_ratio=0.35,
@@ -293,11 +293,11 @@ if __name__ == '__main__':
 
     global_timings = []
    for i in range(10):
-        factory = DirtFactory(n_agents=2, done_at_collision=False,
+        factory = DirtFactory(n_agents=4, done_at_collision=False,
                               level_name='rooms', max_steps=1000,
                               doors_have_area=False,
                               obs_prop=obs_props, parse_doors=True,
-                              verbose=False,
+                              verbose=True,
                               mv_prop=move_props, dirt_prop=dirt_props,
                               # inject_agents=[TSPDirtAgent],
                               )
@@ -307,6 +307,7 @@ if __name__ == '__main__':
         _ = factory.observation_space
         obs_space = factory.observation_space
         obs_space_named = factory.named_observation_space
+        action_space_named = factory.named_action_space
         times = []
         for epoch in range(10):
             start_time = time.time()
diff --git a/environments/helpers.py b/environments/helpers.py
index 28e1665..d90849d 100644
--- a/environments/helpers.py
+++ b/environments/helpers.py
@@ -78,12 +78,12 @@ class EnvActions:
 
 
 class Rewards:
-    MOVEMENTS_VALID = -0.001
-    MOVEMENTS_FAIL = -0.001
-    NOOP = -0.1
-    USE_DOOR_VALID = -0.001
-    USE_DOOR_FAIL = -0.001
-    COLLISION = -1
+    MOVEMENTS_VALID = -0.01
+    MOVEMENTS_FAIL = -0.1
+    NOOP = -0.01
+    USE_DOOR_VALID = -0.01
+    USE_DOOR_FAIL = -0.1
+    COLLISION = -0.5
 
 
 m = EnvActions
@@ -120,7 +120,7 @@ class ObservationTranslator:
 
     def translate_observation(self, agent_idx: int, obs: np.ndarray):
         target_obs_space = self._per_agent_named_obs_space[agent_idx]
-        translation = [idx_space_dict['explained_idxs'] for name, idx_space_dict in target_obs_space.items()]
+        translation = [idx_space_dict for name, idx_space_dict in target_obs_space.items()]
         flat_translation = [x for y in translation for x in y]
         return np.take(obs, flat_translation, axis=1 if obs.ndim == 4 else 0)
 
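Note on the ObservationTranslator change above: the per-agent named observation space is now read as a mapping from each observation name directly to a list of channel indices (previously each entry was a dict carrying an 'explained_idxs' field). A small sketch of how translate_observation() then gathers channels; the layout and index values here are made up for illustration:

    import numpy as np

    named_obs_space = {'Walls': [0], 'Dirt': [1], 'Agents': [2, 3]}  # hypothetical name -> channel indices
    obs = np.random.rand(4, 5, 5)                                    # channels x height x width

    # Flatten the index lists and pick the channels in that order, as done for a single (non-batched) observation.
    flat_translation = [idx for idxs in named_obs_space.values() for idx in idxs]
    translated = np.take(obs, flat_translation, axis=0)
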
diff --git a/reload_agent.py b/reload_agent.py
index fcb38ef..45b5e88 100644
--- a/reload_agent.py
+++ b/reload_agent.py
@@ -22,6 +22,7 @@ if __name__ == '__main__':
     record = False
     seed = 67
     n_agents = 1
+    # out_path = Path('study_out/e_1_new_reward/no_obs/dirt/A2C_new_reward/0_A2C_new_reward')
     out_path = Path('study_out/single_run_with_export/dirt')
     model_path = out_path
 
@@ -49,7 +50,7 @@ if __name__ == '__main__':
         rew, done_bool = 0, False
         while not done_bool:
             if n_agents > 1:
-                actions = [model.predict(env_state[model_idx], deterministic=True)[0]
+                actions = [model.predict(env_state[model_idx], deterministic=determin)[0]
                            for model_idx, model in enumerate(models)]
             else:
                 actions = models[0].predict(env_state, deterministic=determin)[0]
@@ -58,8 +59,6 @@ if __name__ == '__main__':
             rew += step_r
             if render:
                 env.render()
-            if not env.unwrapped.unwrapped[c.AGENT][0].temp_valid:
-                print('Invalid ACtions')
             if done_bool:
                 break
         print(f'Factory run {episode} done, reward is:\n {rew}')
diff --git a/studies/e_1.py b/studies/e_1.py
index a875008..ffdfe82 100644
--- a/studies/e_1.py
+++ b/studies/e_1.py
@@ -1,7 +1,6 @@
 import sys
 from pathlib import Path
 from matplotlib import pyplot as plt
-import numpy as np
 import itertools as it
 
 try:
@@ -16,8 +15,6 @@ except NameError:
     DIR = None
     pass
 
-import time
-
 import simplejson
 
 from stable_baselines3.common.vec_env import SubprocVecEnv
@@ -28,14 +25,12 @@ from environments.factory.factory_item import ItemProperties, ItemFactory
 from environments.logging.envmonitor import EnvMonitor
 from environments.utility_classes import MovementProperties, ObservationProperties, AgentRenderOptions
 import pickle
-from plotting.compare_runs import compare_seed_runs, compare_model_runs, compare_all_parameter_runs
+from plotting.compare_runs import compare_seed_runs, compare_model_runs
 import pandas as pd
 import seaborn as sns
 import multiprocessing as mp
 
-# mp.set_start_method("spawn")
-
 """
 In this studie, we want to explore the macro behaviour of multi agents
 which are trained on the same task, but never saw each other in training.
@@ -72,10 +67,9 @@ n_agents = 4
 ood_monitor_file = f'e_1_{n_agents}_agents'
 baseline_monitor_file = 'e_1_baseline'
 
-from stable_baselines3 import A2C
 
 def policy_model_kwargs():
-    return dict() # gae_lambda=0.25, n_steps=16, max_grad_norm=0.25, use_rms_prop=True)
+    return dict()  # gae_lambda=0.25, n_steps=16, max_grad_norm=0.25, use_rms_prop=True)
 
 
 def dqn_model_kwargs():
@@ -198,7 +192,7 @@ if __name__ == '__main__':
     ood_run = True
     plotting = True
 
-    train_steps = 1e7
+    train_steps = 1e6
     n_seeds = 3
     frames_to_stack = 3
 
@@ -222,7 +216,7 @@ if __name__ == '__main__':
                                     max_spawn_amount=0.1, max_global_amount=20,
                                     max_local_amount=1, spawn_frequency=0, max_spawn_ratio=0.05,
                                     dirt_smear_amount=0.0, agent_can_interact=True)
-    item_props = ItemProperties(n_items=10, agent_can_interact=True,
+    item_props = ItemProperties(n_items=10,
                                 spawn_frequency=30, n_drop_off_locations=2,
                                 max_agent_inventory_capacity=15)
     factory_kwargs = dict(n_agents=1, max_steps=400, parse_doors=True,
diff --git a/studies/single_run_with_export.py b/studies/single_run_with_export.py
index 4f50491..62a8f04 100644
--- a/studies/single_run_with_export.py
+++ b/studies/single_run_with_export.py
@@ -1,6 +1,8 @@
 import sys
 from pathlib import Path
 
+from stable_baselines3.common.vec_env import SubprocVecEnv
+
 try:
     # noinspection PyUnboundLocalVariable
     if __package__ is None:
@@ -44,7 +46,7 @@ def load_model_run_baseline(policy_path, env_to_run):
     # Load both agents
     model = model_cls.load(policy_path / 'model.zip', device='cpu')
     # Load old env kwargs
-    with next(policy_path.glob('*.json')).open('r') as f:
+    with next(policy_path.glob('*params.json')).open('r') as f:
         env_kwargs = simplejson.load(f)
     env_kwargs.update(done_at_collision=True)
     # Init Env
@@ -103,8 +105,8 @@ def load_model_run_combined(root_path, env_to_run, env_kwargs):
                 if done_bool:
                     break
             print(f'Factory run {episode} done, reward is:\n {rew}')
-    recorded_env_factory.save_run(filepath=policy_path / f'monitor.pick')
-    recorded_env_factory.save_records(filepath=policy_path / f'recorder.json')
+    recorded_env_factory.save_run(filepath=root_path / f'monitor.pick')
+    recorded_env_factory.save_records(filepath=root_path / f'recorder.json')
 
 
 if __name__ == '__main__':
@@ -113,12 +115,15 @@ if __name__ == '__main__':
     individual_run = True
     combined_run = True
 
-    train_steps = 2e6
+    train_steps = 2e5
     frames_to_stack = 3
 
     # Define a global studi save path
     study_root_path = Path(__file__).parent.parent / 'study_out' / f'{Path(__file__).stem}'
 
+    def policy_model_kwargs():
+        return dict(learning_rate=0.0003, n_steps=10, gamma=0.95, gae_lambda=0.0, ent_coef=0.01, vf_coef=0.5)
+
     # Define Global Env Parameters
     # Define properties object parameters
     obs_props = ObservationProperties(render_agents=AgentRenderOptions.NOT,
@@ -138,11 +143,11 @@ if __name__ == '__main__':
                                max_agent_inventory_capacity=15)
     dest_props = DestProperties(n_dests=4, spawn_mode=DestModeOptions.GROUPED, spawn_frequency=1)
     factory_kwargs = dict(n_agents=1, max_steps=400, parse_doors=True,
-                          level_name='rooms', doors_have_area=True,
+                          level_name='rooms', doors_have_area=False,
                           verbose=False,
                           mv_prop=move_props,
                           obs_prop=obs_props,
-                          done_at_collision=False
+                          done_at_collision=True
                           )
 
     # Bundle both environments with global kwargs and parameters
@@ -172,33 +177,42 @@ if __name__ == '__main__':
                 continue
            combination_path.mkdir(parents=True, exist_ok=True)
 
-            with env_class(**env_kwargs) as env_factory:
-                param_path = combination_path / f'env_params.json'
+            env_factory = SubprocVecEnv([encapsule_env_factory(env_class, env_kwargs)
+                                         for _ in range(6)], start_method="spawn")
+
+            param_path = combination_path / f'env_params.json'
+            try:
+                env_factory.env_method('save_params', param_path)
+            except AttributeError:
                 env_factory.save_params(param_path)
 
-                # EnvMonitor Init
-                callbacks = [EnvMonitor(env_factory)]
+            # EnvMonitor Init
+            callbacks = [EnvMonitor(env_factory)]
 
-                # Model Init
-                model = model_cls("MlpPolicy", env_factory,
-                                  verbose=1, seed=69, device='cpu')
+            # Model Init
+            model = model_cls("MlpPolicy", env_factory, **policy_model_kwargs(),
+                              verbose=1, seed=69, device='cpu')
 
-                # Model train
-                model.learn(total_timesteps=int(train_steps), callback=callbacks)
+            # Model train
+            model.learn(total_timesteps=int(train_steps), callback=callbacks)
 
-                # Model save
+            # Model save
+            try:
                 model.named_action_space = env_factory.unwrapped.named_action_space
                 model.named_observation_space = env_factory.unwrapped.named_observation_space
-                save_path = combination_path / f'model.zip'
-                model.save(save_path)
+            except AttributeError:
+                model.named_action_space = env_factory.get_attr("named_action_space")[0]
+                model.named_observation_space = env_factory.get_attr("named_observation_space")[0]
+            save_path = combination_path / f'model.zip'
+            model.save(save_path)
 
-                # Monitor Save
-                callbacks[0].save_run(combination_path / 'monitor.pick')
+            # Monitor Save
+            callbacks[0].save_run(combination_path / 'monitor.pick')
 
-                # Better be save then sorry: Clean up!
-                del env_factory, model
-                import gc
-                gc.collect()
+            # Better be save then sorry: Clean up!
+            del env_factory, model
+            import gc
+            gc.collect()
 
     # Train ends here ############################################################
 
@@ -213,7 +227,7 @@ if __name__ == '__main__':
 
         # for policy_path in (y for y in policy_path.iterdir() if y.is_dir()):
        #     load_model_run_baseline(policy_path)
-        print('Start Individual Training')
+        print('Done Individual Recording')
 
    # Then iterate over every model and monitor "ood behavior" - "is it ood?"
    if combined_run:
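Note on the training-loop change above: the single context-managed factory is replaced by a SubprocVecEnv over six workers, and the loop now duck-types between the two env flavours, using the VecEnv API (env_method/get_attr) where it exists and falling back to direct attribute access otherwise. A condensed sketch of that pattern; only the names that also appear in the diff are assumed to exist:

    # Sketch only: 'env_factory' is either a plain factory env or a SubprocVecEnv wrapping several.
    def save_env_params(env_factory, param_path):
        try:
            env_factory.env_method('save_params', param_path)   # VecEnv: call on every worker
        except AttributeError:
            env_factory.save_params(param_path)                 # plain env: call directly

    def read_named_spaces(env_factory):
        try:
            return (env_factory.unwrapped.named_action_space,
                    env_factory.unwrapped.named_observation_space)
        except AttributeError:
            # VecEnv: fetch the attribute from the first worker
            return (env_factory.get_attr("named_action_space")[0],
                    env_factory.get_attr("named_observation_space")[0])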