study e_1 corpus

Steffen Illium 2021-09-08 16:24:14 +02:00
parent b09055d95d
commit 4c21a0af7c
8 changed files with 246 additions and 87 deletions

View File

@@ -195,7 +195,7 @@ class BaseFactory(gym.Env):
for action, agent in zip(actions, self[c.AGENT]):
agent.clear_temp_state()
action_obj = self._actions[int(action)]
- self.print(f'Action #{action} has been resolved to: {action_obj}')
+ # self.print(f'Action #{action} has been resolved to: {action_obj}')
if h.MovingAction.is_member(action_obj):
valid = self._move_or_colide(agent, action_obj)
elif h.EnvActions.NOOP == agent.temp_action:

View File

@@ -66,12 +66,90 @@ class Object:
return other.name == self.name
class Entity(Object):
@property
def can_collide(self):
return True
@property
def encoding(self):
return c.OCCUPIED_CELL.value
@property
def x(self):
return self.pos[0]
@property
def y(self):
return self.pos[1]
@property
def pos(self):
return self._tile.pos
@property
def tile(self):
return self._tile
def __init__(self, tile, **kwargs):
super(Entity, self).__init__(**kwargs)
self._tile = tile
tile.enter(self)
def summarize_state(self) -> dict:
return dict(name=str(self.name), x=int(self.x), y=int(self.y),
tile=str(self.tile.name), can_collide=bool(self.can_collide))
def __repr__(self):
return f'{self.name}(@{self.pos})'
class MoveableEntity(Entity):
@property
def last_tile(self):
return self._last_tile
@property
def last_pos(self):
if self._last_tile:
return self._last_tile.pos
else:
return c.NO_POS
@property
def direction_of_view(self):
last_x, last_y = self.last_pos
curr_x, curr_y = self.pos
return last_x-curr_x, last_y-curr_y
def __init__(self, *args, **kwargs):
super(MoveableEntity, self).__init__(*args, **kwargs)
self._last_tile = None
def move(self, next_tile):
curr_tile = self.tile
if curr_tile != next_tile:
next_tile.enter(self)
curr_tile.leave(self)
self._tile = next_tile
self._last_tile = curr_tile
return True
else:
return False
class Action(Object):
def __init__(self, *args, **kwargs):
super(Action, self).__init__(*args, **kwargs)
class PlaceHolder(MoveableEntity):
pass
class Tile(Object):
@property
@@ -133,45 +211,6 @@ class Wall(Tile):
pass
class Entity(Object):
@property
def can_collide(self):
return True
@property
def encoding(self):
return c.OCCUPIED_CELL.value
@property
def x(self):
return self.pos[0]
@property
def y(self):
return self.pos[1]
@property
def pos(self):
return self._tile.pos
@property
def tile(self):
return self._tile
def __init__(self, tile: Tile, **kwargs):
super(Entity, self).__init__(**kwargs)
self._tile = tile
tile.enter(self)
def summarize_state(self) -> dict:
return dict(name=str(self.name), x=int(self.x), y=int(self.y),
tile=str(self.tile.name), can_collide=bool(self.can_collide))
def __repr__(self):
return f'{self.name}(@{self.pos})'
class Door(Entity):
@property
@@ -261,41 +300,6 @@ class Door(Entity):
return False
class MoveableEntity(Entity):
@property
def last_tile(self):
return self._last_tile
@property
def last_pos(self):
if self._last_tile:
return self._last_tile.pos
else:
return c.NO_POS
@property
def direction_of_view(self):
last_x, last_y = self.last_pos
curr_x, curr_y = self.pos
return last_x-curr_x, last_y-curr_y
def __init__(self, *args, **kwargs):
super(MoveableEntity, self).__init__(*args, **kwargs)
self._last_tile = None
def move(self, next_tile):
curr_tile = self.tile
if curr_tile != next_tile:
next_tile.enter(self)
curr_tile.leave(self)
self._tile = next_tile
self._last_tile = curr_tile
return True
else:
return False
class Agent(MoveableEntity):
def __init__(self, *args, **kwargs):

View File

@@ -4,7 +4,7 @@ from typing import List, Union, Dict
import numpy as np
- from environments.factory.base.objects import Entity, Tile, Agent, Door, Action, Wall, Object
+ from environments.factory.base.objects import Entity, Tile, Agent, Door, Action, Wall, Object, PlaceHolder
from environments.utility_classes import MovementProperties
from environments import helpers as h
from environments.helpers import Constants as c
@@ -156,6 +156,25 @@ class MovingEntityObjectRegister(EntityObjectRegister, ABC):
del self[name]
class PlaceHolderRegister(MovingEntityObjectRegister):
_accepted_objects = PlaceHolder
# noinspection DuplicatedCode
def as_array(self):
self._array[:] = c.FREE_CELL.value
# noinspection PyTupleAssignmentBalance
for z, x, y, v in zip(range(len(self)), *zip(*[x.pos for x in self]), [x.encoding for x in self]):
if self.individual_slices:
self._array[z, x, y] += v
else:
self._array[0, x, y] += v
if self.individual_slices:
return self._array
else:
return self._array.sum(axis=0, keepdims=True)
class Entities(Register):
_accepted_objects = EntityObjectRegister
@@ -256,6 +275,9 @@ class FloorTiles(WallTiles):
class Agents(MovingEntityObjectRegister):
+ _accepted_objects = Agent
+ # noinspection DuplicatedCode
def as_array(self):
self._array[:] = c.FREE_CELL.value
# noinspection PyTupleAssignmentBalance
@@ -269,8 +291,6 @@ class Agents(MovingEntityObjectRegister):
else:
return self._array.sum(axis=0, keepdims=True)
- _accepted_objects = Agent
@property
def positions(self):
return [agent.pos for agent in self]

View File

@@ -311,15 +311,17 @@ class ItemFactory(BaseFactory):
reward, info_dict = super().calculate_additional_reward(agent)
if h.EnvActions.ITEM_ACTION == agent.temp_action:
if agent.temp_valid:
- if self[c.DROP_OFF].by_pos(agent.pos):
+ if drop_off := self[c.DROP_OFF].by_pos(agent.pos):
info_dict.update({f'{agent.name}_item_dropoff': 1})
+ self.print(f'{agent.name} just dropped off an item at {drop_off.pos}.')
reward += 0.5
else:
info_dict.update({f'{agent.name}_item_pickup': 1})
+ self.print(f'{agent.name} just picked up an item at {agent.pos}')
reward += 0.1
else:
info_dict.update({f'{agent.name}_failed_item_action': 1})
+ self.print(f'{agent.name} just tried to pick up an item at {agent.pos}, but failed.')
reward -= 0.1
return reward, info_dict

View File

@@ -5,6 +5,8 @@ from typing import Tuple, Union
import numpy as np
from pathlib import Path
+ from stable_baselines3 import PPO, DQN, A2C
LEVELS_DIR = 'levels'
TO_BE_AVERAGED = ['dirt_amount', 'dirty_tiles']
@@ -142,6 +144,8 @@ def asset_str(agent):
return c.AGENT.value, 'idle'
+ model_map = dict(PPO=PPO, DQN=DQN, A2C=A2C)
if __name__ == '__main__':
parsed_level = parse_level(Path(__file__).parent / 'factory' / 'levels' / 'simple.txt')
y = one_hot_level(parsed_level)

View File

@@ -139,7 +139,7 @@ if __name__ == '__main__':
if modeL_type.__name__ in ["PPO", "A2C"]:
kwargs = dict(ent_coef=0.01)
- env = SubprocVecEnv([make_env(env_kwargs) for _ in range(10)], start_method="spawn")
+ env = SubprocVecEnv([make_env(env_kwargs) for _ in range(1)], start_method="spawn")
elif modeL_type.__name__ in ["RegDQN", "DQN", "QRDQN"]:
env = make_env(env_kwargs)()
kwargs = dict(buffer_size=50000,
@@ -147,7 +147,8 @@ if __name__ == '__main__':
batch_size=64,
target_update_interval=5000,
exploration_fraction=0.25,
- exploration_final_eps=0.025)
+ exploration_final_eps=0.025
+ )
else:
raise NameError(f'The model "{modeL_type.__name__}" has the wrong name.')

View File

@@ -3,7 +3,6 @@ from pathlib import Path
import yaml
from natsort import natsorted
- from stable_baselines3 import PPO, DQN, A2C
from stable_baselines3.common.evaluation import evaluate_policy
from environments.factory.factory_dirt import DirtProperties, DirtFactory
@@ -12,13 +11,12 @@ from environments.factory.factory_item import ItemProperties, ItemFactory
warnings.filterwarnings('ignore', category=FutureWarning)
warnings.filterwarnings('ignore', category=UserWarning)
- model_map = dict(PPO=PPO, DQN=DQN, A2C=A2C)
if __name__ == '__main__':
- model_name = 'PPO_1631029150'
+ model_name = 'DQN_1631092016'
run_id = 0
- seed=69
+ seed = 69
out_path = Path(__file__).parent / 'debug_out'
model_path = out_path / model_name
@@ -38,5 +36,5 @@ if __name__ == '__main__':
this_model = model_files[0]
model_cls = next(val for key, val in model_map.items() if key in model_name)
model = model_cls.load(this_model)
- evaluation_result = evaluate_policy(model, env, n_eval_episodes=100, deterministic=True, render=True)
+ evaluation_result = evaluate_policy(model, env, n_eval_episodes=100, deterministic=False, render=True)
print(evaluation_result)

studies/e_1.py (new file, 130 lines added)
View File

@@ -0,0 +1,130 @@
import itertools
import random
from pathlib import Path
import simplejson
from stable_baselines3 import DQN, PPO, A2C
from environments.factory.factory_dirt import DirtProperties, DirtFactory
from environments.factory.factory_item import ItemProperties, ItemFactory
if __name__ == '__main__':
"""
In this studie, we want to explore the macro behaviour of multi agents which are trained on the same task,
but never saw each other in training.
Those agents learned
We start with training a single policy on a single task (dirt cleanup / item pickup).
Then multiple agent equipped with the same policy are deployed in the same environment.
There are further distinctions to be made:
1. No Observation - ['no_obs']:
- Agent do not see each other but their consequences of their combined actions
- Agents can collide
2. Observation in seperate slice - [['seperate_0'], ['seperate_1'], ['seperate_N']]:
- Agents see other entitys on a seperate slice
- This slice has been filled with $0 | 1 | \mathbb{N}(0, 1)$
-- Depending ob the fill value, agents will react diffently
-> TODO: Test this!
3. Observation in level slice - ['in_lvl_obs']:
- This tells the agent to treat other agents as obstacle.
- However, the state space is altered since moving obstacles are not part the original agent observation.
- We are out of distribution.
"""
def bundle_model(model_class):
if model_class.__name__ in ["PPO", "A2C"]:
kwargs = dict(ent_coef=0.01)
elif model_class.__name__ in ["RegDQN", "DQN", "QRDQN"]:
kwargs = dict(buffer_size=50000,
learning_starts=64,
batch_size=64,
target_update_interval=5000,
exploration_fraction=0.25,
exploration_final_eps=0.025
)
return lambda: model_class(**kwargs)
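# Hypothetical usage of the bundle above (not part of this commit): bundle_model(DQN)
# returns a zero-argument factory, so model = bundle_model(DQN)() would build the model.
# A real stable-baselines3 call additionally needs a policy and an env,
# e.g. DQN('MlpPolicy', env, **kwargs), which the TODOs below still have to wire in.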
if __name__ == '__main__':
# Define a global study save path
study_root_path = Path(Path(__file__).stem) / 'out'
# TODO: Define Global Env Parameters
factory_kwargs = {
}
# TODO: Define global model parameters
# TODO: Define parameters for both envs
dirt_props = DirtProperties()
item_props = ItemProperties()
# Bundle both environments with global kwargs and parameters
env_bundles = [lambda: ('dirt', DirtFactory(**factory_kwargs, dirt_properties=dirt_props)),
lambda: ('item', ItemFactory(**factory_kwargs, item_properties=item_props))]
# Define parameter versions according to #1, 2[1,0,N], 3
observation_modes = ['no_obs', 'seperate_0', 'seperate_1', 'seperate_N', 'in_lvl_obs']
# Define RL-Models
model_bundles = [bundle_model(model) for model in [A2C, PPO, DQN]]
# Zip parameters, parameter versions, Env Classes and models
combinations = itertools.product(model_bundles, env_bundles)
# Train starts here ############################################################
# Build Major Loop
for model, (env_identifier, env_bundle) in combinations:
for observation_mode in observation_modes:
# TODO: Create an identifier, which is unique for every combination and easy to read in filesystem
identifier = f'{model.name}_{observation_mode}_{env_identifier}'
# Train each combination per seed
for seed in range(3):
# TODO: Output folder
# TODO: Monitor Init
# TODO: Env Init
# TODO: Model Init
# TODO: Model train
# TODO: Model save
pass
# TODO: Seed Compare Plot
# Train ends here ############################################################
# Evaluation starts here #####################################################
# Iterate Observation Modes
for observation_mode in observation_modes:
# TODO: For trained policy in study_root_path / identifier
for policy_group in (x for x in study_root_path.iterdir() if x.is_dir()):
# TODO: Pick random seed or iterate over available seeds
policy_seed = next((y for y in policy_group.iterdir() if y.is_dir()))
# TODO: retrieve model class
# TODO: Load both agents
models = []
# TODO: Evaluation Loop for i in range(100) Episodes
for episode in range(100):
with next(policy_seed.glob('*.yaml')).open('r') as f:
env_kwargs = simplejson.load(f)
# TODO: Monitor Init
env = None # TODO: Init Env
for step in range(400):
random_actions = [[random.randint(0, env.n_actions - 1) for _ in range(len(models))] for _ in range(200)]
env_state = env.reset()
rew = 0
for agent_i_action in random_actions:
env_state, step_r, done_bool, info_obj = env.step(agent_i_action)
rew += step_r
if done_bool:
break
print(f'Factory run {episode} done, reward is:\n {rew}')
# TODO: Plotting
pass
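For reference, a minimal sketch of how the evaluation loop above might look once the TODOs are filled in and trained models are actually loaded. It assumes one stable-baselines3 model per agent, a per-agent indexable observation, and the multi-agent step API used in the scaffold (one action per agent per step); all names are placeholders.

def evaluate_episode(env, models, max_steps=400):
    # One evaluation episode: every agent acts according to its own trained policy.
    env_state = env.reset()
    episode_reward = 0
    for _ in range(max_steps):
        actions = [model.predict(env_state[i], deterministic=True)[0]
                   for i, model in enumerate(models)]
        env_state, step_r, done_bool, info_obj = env.step(actions)
        episode_reward += step_r
        if done_bool:
            break
    return episode_reward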