From 4c21a0af7cc8927d8489c7bd873fbb1a4049d70d Mon Sep 17 00:00:00 2001 From: Steffen Illium Date: Wed, 8 Sep 2021 16:24:14 +0200 Subject: [PATCH] study e_1 corpus --- environments/factory/base/base_factory.py | 2 +- environments/factory/base/objects.py | 152 +++++++++++----------- environments/factory/base/registers.py | 26 +++- environments/factory/factory_item.py | 6 +- environments/helpers.py | 4 + main.py | 5 +- reload_agent.py | 8 +- studies/e_1.py | 130 ++++++++++++++++++ 8 files changed, 246 insertions(+), 87 deletions(-) create mode 100644 studies/e_1.py diff --git a/environments/factory/base/base_factory.py b/environments/factory/base/base_factory.py index 6545a64..97cff7a 100644 --- a/environments/factory/base/base_factory.py +++ b/environments/factory/base/base_factory.py @@ -195,7 +195,7 @@ class BaseFactory(gym.Env): for action, agent in zip(actions, self[c.AGENT]): agent.clear_temp_state() action_obj = self._actions[int(action)] - self.print(f'Action #{action} has been resolved to: {action_obj}') + # self.print(f'Action #{action} has been resolved to: {action_obj}') if h.MovingAction.is_member(action_obj): valid = self._move_or_colide(agent, action_obj) elif h.EnvActions.NOOP == agent.temp_action: diff --git a/environments/factory/base/objects.py b/environments/factory/base/objects.py index 5963e18..6863d27 100644 --- a/environments/factory/base/objects.py +++ b/environments/factory/base/objects.py @@ -66,12 +66,90 @@ class Object: return other.name == self.name +class Entity(Object): + + @property + def can_collide(self): + return True + + @property + def encoding(self): + return c.OCCUPIED_CELL.value + + @property + def x(self): + return self.pos[0] + + @property + def y(self): + return self.pos[1] + + @property + def pos(self): + return self._tile.pos + + @property + def tile(self): + return self._tile + + def __init__(self, tile, **kwargs): + super(Entity, self).__init__(**kwargs) + self._tile = tile + tile.enter(self) + + def summarize_state(self) -> dict: + return dict(name=str(self.name), x=int(self.x), y=int(self.y), + tile=str(self.tile.name), can_collide=bool(self.can_collide)) + + def __repr__(self): + return f'{self.name}(@{self.pos})' + + +class MoveableEntity(Entity): + + @property + def last_tile(self): + return self._last_tile + + @property + def last_pos(self): + if self._last_tile: + return self._last_tile.pos + else: + return c.NO_POS + + @property + def direction_of_view(self): + last_x, last_y = self.last_pos + curr_x, curr_y = self.pos + return last_x-curr_x, last_y-curr_y + + def __init__(self, *args, **kwargs): + super(MoveableEntity, self).__init__(*args, **kwargs) + self._last_tile = None + + def move(self, next_tile): + curr_tile = self.tile + if curr_tile != next_tile: + next_tile.enter(self) + curr_tile.leave(self) + self._tile = next_tile + self._last_tile = curr_tile + return True + else: + return False + + class Action(Object): def __init__(self, *args, **kwargs): super(Action, self).__init__(*args, **kwargs) +class PlaceHolder(MoveableEntity): + pass + + class Tile(Object): @property @@ -133,45 +211,6 @@ class Wall(Tile): pass -class Entity(Object): - - @property - def can_collide(self): - return True - - @property - def encoding(self): - return c.OCCUPIED_CELL.value - - @property - def x(self): - return self.pos[0] - - @property - def y(self): - return self.pos[1] - - @property - def pos(self): - return self._tile.pos - - @property - def tile(self): - return self._tile - - def __init__(self, tile: Tile, **kwargs): - super(Entity, 
self).__init__(**kwargs)
-        self._tile = tile
-        tile.enter(self)
-
-    def summarize_state(self) -> dict:
-        return dict(name=str(self.name), x=int(self.x), y=int(self.y),
-                    tile=str(self.tile.name), can_collide=bool(self.can_collide))
-
-    def __repr__(self):
-        return f'{self.name}(@{self.pos})'
-
-
 class Door(Entity):
 
     @property
@@ -261,41 +300,6 @@ class Door(Entity):
         return False
 
 
-class MoveableEntity(Entity):
-
-    @property
-    def last_tile(self):
-        return self._last_tile
-
-    @property
-    def last_pos(self):
-        if self._last_tile:
-            return self._last_tile.pos
-        else:
-            return c.NO_POS
-
-    @property
-    def direction_of_view(self):
-        last_x, last_y = self.last_pos
-        curr_x, curr_y = self.pos
-        return last_x-curr_x, last_y-curr_y
-
-    def __init__(self, *args, **kwargs):
-        super(MoveableEntity, self).__init__(*args, **kwargs)
-        self._last_tile = None
-
-    def move(self, next_tile):
-        curr_tile = self.tile
-        if curr_tile != next_tile:
-            next_tile.enter(self)
-            curr_tile.leave(self)
-            self._tile = next_tile
-            self._last_tile = curr_tile
-            return True
-        else:
-            return False
-
-
 class Agent(MoveableEntity):
 
     def __init__(self, *args, **kwargs):
diff --git a/environments/factory/base/registers.py b/environments/factory/base/registers.py
index 3d06fb7..12cbf6a 100644
--- a/environments/factory/base/registers.py
+++ b/environments/factory/base/registers.py
@@ -4,7 +4,7 @@ from typing import List, Union, Dict
 
 import numpy as np
 
-from environments.factory.base.objects import Entity, Tile, Agent, Door, Action, Wall, Object
+from environments.factory.base.objects import Entity, Tile, Agent, Door, Action, Wall, Object, PlaceHolder
 from environments.utility_classes import MovementProperties
 from environments import helpers as h
 from environments.helpers import Constants as c
@@ -156,6 +156,25 @@ class MovingEntityObjectRegister(EntityObjectRegister, ABC):
         del self[name]
 
 
+class PlaceHolderRegister(MovingEntityObjectRegister):
+
+    _accepted_objects = PlaceHolder
+
+    # noinspection DuplicatedCode
+    def as_array(self):
+        self._array[:] = c.FREE_CELL.value
+        # noinspection PyTupleAssignmentBalance
+        for z, x, y, v in zip(range(len(self)), *zip(*[x.pos for x in self]), [x.encoding for x in self]):
+            if self.individual_slices:
+                self._array[z, x, y] += v
+            else:
+                self._array[0, x, y] += v
+        if self.individual_slices:
+            return self._array
+        else:
+            return self._array.sum(axis=0, keepdims=True)
+
+
 class Entities(Register):
 
     _accepted_objects = EntityObjectRegister
@@ -256,6 +275,9 @@ class FloorTiles(WallTiles):
 
 class Agents(MovingEntityObjectRegister):
 
+    _accepted_objects = Agent
+
+    # noinspection DuplicatedCode
     def as_array(self):
         self._array[:] = c.FREE_CELL.value
         # noinspection PyTupleAssignmentBalance
@@ -269,8 +291,6 @@ class Agents(MovingEntityObjectRegister):
         else:
             return self._array.sum(axis=0, keepdims=True)
 
-    _accepted_objects = Agent
-
     @property
     def positions(self):
         return [agent.pos for agent in self]
diff --git a/environments/factory/factory_item.py b/environments/factory/factory_item.py
index c04a15a..f93c2c7 100644
--- a/environments/factory/factory_item.py
+++ b/environments/factory/factory_item.py
@@ -311,15 +311,17 @@ class ItemFactory(BaseFactory):
         reward, info_dict = super().calculate_additional_reward(agent)
         if h.EnvActions.ITEM_ACTION == agent.temp_action:
             if agent.temp_valid:
-                if self[c.DROP_OFF].by_pos(agent.pos):
+                if drop_off := self[c.DROP_OFF].by_pos(agent.pos):
                     info_dict.update({f'{agent.name}_item_dropoff': 1})
-
+                    self.print(f'{agent.name} just dropped off an item at {drop_off.pos}.')
                    reward += 0.5
                 else:
                     info_dict.update({f'{agent.name}_item_pickup': 1})
+                    self.print(f'{agent.name} just picked up an item at {agent.pos}.')
                     reward += 0.1
             else:
                 info_dict.update({f'{agent.name}_failed_item_action': 1})
+                self.print(f'{agent.name} just tried to pick up an item at {agent.pos}, but failed.')
                 reward -= 0.1
         return reward, info_dict
 
diff --git a/environments/helpers.py b/environments/helpers.py
index c9127b9..4622a41 100644
--- a/environments/helpers.py
+++ b/environments/helpers.py
@@ -5,6 +5,8 @@ from typing import Tuple, Union
 import numpy as np
 from pathlib import Path
 
+from stable_baselines3 import PPO, DQN, A2C
+
 LEVELS_DIR = 'levels'
 
 TO_BE_AVERAGED = ['dirt_amount', 'dirty_tiles']
@@ -142,6 +144,8 @@ def asset_str(agent):
         return c.AGENT.value, 'idle'
 
 
+model_map = dict(PPO=PPO, DQN=DQN, A2C=A2C)
+
 if __name__ == '__main__':
     parsed_level = parse_level(Path(__file__).parent / 'factory' / 'levels' / 'simple.txt')
     y = one_hot_level(parsed_level)
diff --git a/main.py b/main.py
index 013d7e1..5c6867a 100644
--- a/main.py
+++ b/main.py
@@ -139,7 +139,7 @@ if __name__ == '__main__':
 
         if modeL_type.__name__ in ["PPO", "A2C"]:
             kwargs = dict(ent_coef=0.01)
-            env = SubprocVecEnv([make_env(env_kwargs) for _ in range(10)], start_method="spawn")
+            env = SubprocVecEnv([make_env(env_kwargs) for _ in range(1)], start_method="spawn")
         elif modeL_type.__name__ in ["RegDQN", "DQN", "QRDQN"]:
             env = make_env(env_kwargs)()
             kwargs = dict(buffer_size=50000,
@@ -147,7 +147,8 @@ if __name__ == '__main__':
                           batch_size=64,
                           target_update_interval=5000,
                           exploration_fraction=0.25,
-                          exploration_final_eps=0.025)
+                          exploration_final_eps=0.025
+                          )
         else:
             raise NameError(f'The model "{modeL_type.__name__}" has the wrong name.')
 
diff --git a/reload_agent.py b/reload_agent.py
index 47e9d25..44c9464 100644
--- a/reload_agent.py
+++ b/reload_agent.py
@@ -3,7 +3,6 @@ from pathlib import Path
 
 import yaml
 from natsort import natsorted
-from stable_baselines3 import PPO, DQN, A2C
 from stable_baselines3.common.evaluation import evaluate_policy
 
 from environments.factory.factory_dirt import DirtProperties, DirtFactory
@@ -12,13 +11,12 @@ from environments.factory.factory_item import ItemProperties, ItemFactory
 warnings.filterwarnings('ignore', category=FutureWarning)
 warnings.filterwarnings('ignore', category=UserWarning)
 
-model_map = dict(PPO=PPO, DQN=DQN, A2C=A2C)
 
 if __name__ == '__main__':
 
-    model_name = 'PPO_1631029150'
+    model_name = 'DQN_1631092016'
     run_id = 0
-    seed=69
+    seed = 69
     out_path = Path(__file__).parent / 'debug_out'
     model_path = out_path / model_name
 
@@ -38,5 +36,5 @@ if __name__ == '__main__':
         this_model = model_files[0]
         model_cls = next(val for key, val in model_map.items() if key in model_name)
         model = model_cls.load(this_model)
-    evaluation_result = evaluate_policy(model, env, n_eval_episodes=100, deterministic=True, render=True)
+    evaluation_result = evaluate_policy(model, env, n_eval_episodes=100, deterministic=False, render=True)
     print(evaluation_result)
diff --git a/studies/e_1.py b/studies/e_1.py
new file mode 100644
index 0000000..58d06fb
--- /dev/null
+++ b/studies/e_1.py
@@ -0,0 +1,130 @@
+import itertools
+import random
+from pathlib import Path
+
+import yaml
+from stable_baselines3 import DQN, PPO, A2C
+
+from environments.factory.factory_dirt import DirtProperties, DirtFactory
+from environments.factory.factory_item import ItemProperties, ItemFactory
+
+if __name__ == '__main__':
+    """
+    In this study, we want to explore the macro behaviour of multiple agents that are trained on the same task,
+    but never saw each other during training.
+    Each agent learned its behaviour in isolation, without ever observing another agent.
+
+    We start with training a single policy on a single task (dirt cleanup / item pickup).
+    Then multiple agents equipped with the same policy are deployed in the same environment.
+
+    There are further distinctions to be made:
+
+    1. No Observation - ['no_obs']:
+    - Agents do not see each other, only the consequences of their combined actions
+    - Agents can collide
+
+    2. Observation in separate slice - [['separate_0'], ['separate_1'], ['separate_N']]:
+    - Agents see other entities on a separate slice
+    - This slice has been filled with $0 | 1 | \mathcal{N}(0, 1)$
+    -- Depending on the fill value, agents will react differently
+    -> TODO: Test this!
+
+    3. Observation in level slice - ['in_lvl_obs']:
+    - This tells the agent to treat other agents as obstacles.
+    - However, the state space is altered since moving obstacles are not part of the original agent observation.
+    - We are out of distribution.
+    """
+
+
+def bundle_model(model_class):
+    if model_class.__name__ in ["PPO", "A2C"]:
+        kwargs = dict(ent_coef=0.01)
+    elif model_class.__name__ in ["RegDQN", "DQN", "QRDQN"]:
+        kwargs = dict(buffer_size=50000,
+                      learning_starts=64,
+                      batch_size=64,
+                      target_update_interval=5000,
+                      exploration_fraction=0.25,
+                      exploration_final_eps=0.025
+                      )
+    else:
+        raise NameError(f'The model "{model_class.__name__}" has the wrong name.')
+    # Return the model name together with a deferred constructor.
+    return model_class.__name__, lambda: model_class(**kwargs)
+
+
+if __name__ == '__main__':
+    # Define a global study save path
+    study_root_path = Path(Path(__file__).stem) / 'out'
+
+    # TODO: Define Global Env Parameters
+    factory_kwargs = {
+
+
+    }
+
+    # TODO: Define global model parameters
+
+
+    # TODO: Define parameters for both envs
+    dirt_props = DirtProperties()
+    item_props = ItemProperties()
+
+    # Bundle both environments with global kwargs and parameters
+    env_bundles = [('dirt', lambda: DirtFactory(dirt_properties=dirt_props, **factory_kwargs)),
+                   ('item', lambda: ItemFactory(item_properties=item_props, **factory_kwargs))]
+
+    # Define parameter versions according to #1, #2 [0, 1, N], #3
+    observation_modes = ['no_obs', 'separate_0', 'separate_1', 'separate_N', 'in_lvl_obs']
+
+    # Define RL-Models
+    model_bundles = [bundle_model(model) for model in [A2C, PPO, DQN]]
+
+    # Cross product of models and environments (observation modes are iterated below)
+    combinations = itertools.product(model_bundles, env_bundles)
+
+    # Train starts here ############################################################
+    # Build Major Loop
+    for (model_name, model_bundle), (env_identifier, env_bundle) in combinations:
+        for observation_mode in observation_modes:
+            # TODO: Create an identifier, which is unique for every combination and easy to read in filesystem
+            identifier = f'{model_name}_{observation_mode}_{env_identifier}'
+            # Train each combination per seed
+            for seed in range(3):
+                # TODO: Output folder
+                # TODO: Monitor Init
+                # TODO: Env Init
+                # TODO: Model Init
+                # TODO: Model train
+                # TODO: Model save
+                pass
+            # TODO: Seed Compare Plot
+    # Train ends here ############################################################
+
+    # Evaluation starts here #####################################################
+    # Iterate Observation Modes
+    for observation_mode in observation_modes:
+        # TODO: For trained policy in study_root_path / identifier
+        for policy_group in (x for x in study_root_path.iterdir() if x.is_dir()):
+            # TODO: Pick random seed or iterate over available seeds
+            policy_seed = next((y for y in policy_group.iterdir() if y.is_dir()))
+            # TODO: retrieve model class
+            # TODO: Load both agents
+            models = []
+            # TODO: Evaluation loop over 100 episodes
+            for episode in range(100):
+                with next(policy_seed.glob('*.yaml')).open('r') as f:
+                    env_kwargs = yaml.safe_load(f)
+                # TODO: Monitor Init
+                env = None  # TODO: Init Env
+                env_state = env.reset()
+                rew = 0
+                for step in range(400):
+                    # Placeholder rollout: sample one random action per agent until the trained models are wired in.
+                    random_actions = [random.randrange(env.n_actions) for _ in models]
+                    env_state, step_r, done_bool, info_obj = env.step(random_actions)
+                    rew += step_r
+                    if done_bool:
+                        break
+                print(f'Factory run {episode} done, reward is:\n {rew}')
+            # TODO: Plotting
+
+    pass
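
For observation mode #2 above, one plausible reading is that during single-agent training the slice reserved for other agents (cf. the new PlaceHolder entity and PlaceHolderRegister in this patch) is filled with a constant or with Gaussian noise, and only at deployment does it carry the other agents' occupied cells. A minimal sketch of that idea, assuming 2D grid slices and an occupied-cell encoding of 1; the function name and its arguments are illustrative and not part of the patch:

import numpy as np


def other_agents_slice(level_shape, observation_mode, other_agent_positions=(), rng=None):
    # Build the extra observation slice that stands in for "other agents".
    # 'no_obs' and 'in_lvl_obs' add no separate slice at all.
    if observation_mode in ('no_obs', 'in_lvl_obs'):
        return None
    rng = rng if rng is not None else np.random.default_rng()
    if observation_mode == 'separate_0':
        slice_ = np.zeros(level_shape)             # background filled with 0
    elif observation_mode == 'separate_1':
        slice_ = np.ones(level_shape)              # background filled with 1
    elif observation_mode == 'separate_N':
        slice_ = rng.standard_normal(level_shape)  # background filled with N(0, 1) noise
    else:
        raise ValueError(f'Unknown observation mode: {observation_mode}')
    # At deployment time, the other agents' positions are written on top of the fill value.
    for x, y in other_agent_positions:
        slice_[x, y] = 1  # assumption: OCCUPIED_CELL encodes as 1
    return slice_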
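
Once the model-loading and env-init TODOs are resolved, the deployment described in the docstring (multiple agents sharing one trained single-agent policy) could look roughly like the sketch below. It reuses model_map from environments/helpers.py (added in this patch); that env.reset() yields one observation per agent and that env.step() accepts a list of per-agent actions are assumptions carried over from the placeholder loop above, not confirmed by the patch:

from pathlib import Path

from environments.helpers import model_map  # PPO / DQN / A2C lookup added in this patch


def deploy_shared_policy(env, model_file: Path, episodes: int = 100, max_steps: int = 400):
    # Load the single-agent policy once; every agent in the env acts with the same weights.
    model_cls = next(val for key, val in model_map.items() if key in model_file.name)
    model = model_cls.load(model_file)
    for episode in range(episodes):
        env_state = env.reset()
        rew = 0
        for _ in range(max_steps):
            # Assumption: env_state stacks one observation per agent along the first axis.
            actions = [model.predict(agent_obs, deterministic=False)[0] for agent_obs in env_state]
            env_state, step_r, done_bool, info_obj = env.step(actions)
            rew += step_r
            if done_bool:
                break
        print(f'Factory run {episode} done, reward is:\n {rew}')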