study e_1 corpus
parent b09055d95d
commit 4c21a0af7c
@@ -195,7 +195,7 @@ class BaseFactory(gym.Env):
        for action, agent in zip(actions, self[c.AGENT]):
            agent.clear_temp_state()
            action_obj = self._actions[int(action)]
            self.print(f'Action #{action} has been resolved to: {action_obj}')
            # self.print(f'Action #{action} has been resolved to: {action_obj}')
            if h.MovingAction.is_member(action_obj):
                valid = self._move_or_colide(agent, action_obj)
            elif h.EnvActions.NOOP == agent.temp_action:
@@ -66,12 +66,90 @@ class Object:
        return other.name == self.name


class Entity(Object):

    @property
    def can_collide(self):
        return True

    @property
    def encoding(self):
        return c.OCCUPIED_CELL.value

    @property
    def x(self):
        return self.pos[0]

    @property
    def y(self):
        return self.pos[1]

    @property
    def pos(self):
        return self._tile.pos

    @property
    def tile(self):
        return self._tile

    def __init__(self, tile, **kwargs):
        super(Entity, self).__init__(**kwargs)
        self._tile = tile
        tile.enter(self)

    def summarize_state(self) -> dict:
        return dict(name=str(self.name), x=int(self.x), y=int(self.y),
                    tile=str(self.tile.name), can_collide=bool(self.can_collide))

    def __repr__(self):
        return f'{self.name}(@{self.pos})'


class MoveableEntity(Entity):

    @property
    def last_tile(self):
        return self._last_tile

    @property
    def last_pos(self):
        if self._last_tile:
            return self._last_tile.pos
        else:
            return c.NO_POS

    @property
    def direction_of_view(self):
        last_x, last_y = self.last_pos
        curr_x, curr_y = self.pos
        return last_x-curr_x, last_y-curr_y

    def __init__(self, *args, **kwargs):
        super(MoveableEntity, self).__init__(*args, **kwargs)
        self._last_tile = None

    def move(self, next_tile):
        curr_tile = self.tile
        if curr_tile != next_tile:
            next_tile.enter(self)
            curr_tile.leave(self)
            self._tile = next_tile
            self._last_tile = curr_tile
            return True
        else:
            return False


class Action(Object):

    def __init__(self, *args, **kwargs):
        super(Action, self).__init__(*args, **kwargs)


class PlaceHolder(MoveableEntity):
    pass


class Tile(Object):

    @property
@@ -133,45 +211,6 @@ class Wall(Tile):
    pass


class Entity(Object):

    @property
    def can_collide(self):
        return True

    @property
    def encoding(self):
        return c.OCCUPIED_CELL.value

    @property
    def x(self):
        return self.pos[0]

    @property
    def y(self):
        return self.pos[1]

    @property
    def pos(self):
        return self._tile.pos

    @property
    def tile(self):
        return self._tile

    def __init__(self, tile: Tile, **kwargs):
        super(Entity, self).__init__(**kwargs)
        self._tile = tile
        tile.enter(self)

    def summarize_state(self) -> dict:
        return dict(name=str(self.name), x=int(self.x), y=int(self.y),
                    tile=str(self.tile.name), can_collide=bool(self.can_collide))

    def __repr__(self):
        return f'{self.name}(@{self.pos})'


class Door(Entity):

    @property
@@ -261,41 +300,6 @@ class Door(Entity):
        return False


class MoveableEntity(Entity):

    @property
    def last_tile(self):
        return self._last_tile

    @property
    def last_pos(self):
        if self._last_tile:
            return self._last_tile.pos
        else:
            return c.NO_POS

    @property
    def direction_of_view(self):
        last_x, last_y = self.last_pos
        curr_x, curr_y = self.pos
        return last_x-curr_x, last_y-curr_y

    def __init__(self, *args, **kwargs):
        super(MoveableEntity, self).__init__(*args, **kwargs)
        self._last_tile = None

    def move(self, next_tile):
        curr_tile = self.tile
        if curr_tile != next_tile:
            next_tile.enter(self)
            curr_tile.leave(self)
            self._tile = next_tile
            self._last_tile = curr_tile
            return True
        else:
            return False


class Agent(MoveableEntity):

    def __init__(self, *args, **kwargs):
@@ -4,7 +4,7 @@ from typing import List, Union, Dict

import numpy as np

from environments.factory.base.objects import Entity, Tile, Agent, Door, Action, Wall, Object
from environments.factory.base.objects import Entity, Tile, Agent, Door, Action, Wall, Object, PlaceHolder
from environments.utility_classes import MovementProperties
from environments import helpers as h
from environments.helpers import Constants as c
@@ -156,6 +156,25 @@ class MovingEntityObjectRegister(EntityObjectRegister, ABC):
        del self[name]


class PlaceHolderRegister(MovingEntityObjectRegister):

    _accepted_objects = PlaceHolder

    # noinspection DuplicatedCode
    def as_array(self):
        self._array[:] = c.FREE_CELL.value
        # noinspection PyTupleAssignmentBalance
        for z, x, y, v in zip(range(len(self)), *zip(*[x.pos for x in self]), [x.encoding for x in self]):
            if self.individual_slices:
                self._array[z, x, y] += v
            else:
                self._array[0, x, y] += v
        if self.individual_slices:
            return self._array
        else:
            return self._array.sum(axis=0, keepdims=True)


class Entities(Register):

    _accepted_objects = EntityObjectRegister
@@ -256,6 +275,9 @@ class FloorTiles(WallTiles):


class Agents(MovingEntityObjectRegister):

    _accepted_objects = Agent

    # noinspection DuplicatedCode
    def as_array(self):
        self._array[:] = c.FREE_CELL.value
        # noinspection PyTupleAssignmentBalance
@@ -269,8 +291,6 @@ class Agents(MovingEntityObjectRegister):
        else:
            return self._array.sum(axis=0, keepdims=True)

    _accepted_objects = Agent

    @property
    def positions(self):
        return [agent.pos for agent in self]
@@ -311,15 +311,17 @@ class ItemFactory(BaseFactory):
        reward, info_dict = super().calculate_additional_reward(agent)
        if h.EnvActions.ITEM_ACTION == agent.temp_action:
            if agent.temp_valid:
                if self[c.DROP_OFF].by_pos(agent.pos):
                if drop_off := self[c.DROP_OFF].by_pos(agent.pos):
                    info_dict.update({f'{agent.name}_item_dropoff': 1})

                    self.print(f'{agent.name} just dropped off an item at {drop_off.pos}.')
                    reward += 0.5
                else:
                    info_dict.update({f'{agent.name}_item_pickup': 1})
                    self.print(f'{agent.name} just picked up an item at {agent.pos}')
                    reward += 0.1
            else:
                info_dict.update({f'{agent.name}_failed_item_action': 1})
                self.print(f'{agent.name} just tried to pick up an item at {agent.pos}, but failed.')
                reward -= 0.1
        return reward, info_dict
@@ -5,6 +5,8 @@ from typing import Tuple, Union
import numpy as np
from pathlib import Path

from stable_baselines3 import PPO, DQN, A2C

LEVELS_DIR = 'levels'

TO_BE_AVERAGED = ['dirt_amount', 'dirty_tiles']
@@ -142,6 +144,8 @@ def asset_str(agent):
    return c.AGENT.value, 'idle'


model_map = dict(PPO=PPO, DQN=DQN, A2C=A2C)

if __name__ == '__main__':
    parsed_level = parse_level(Path(__file__).parent / 'factory' / 'levels' / 'simple.txt')
    y = one_hot_level(parsed_level)
main.py (5 changed lines)

@@ -139,7 +139,7 @@ if __name__ == '__main__':

    if modeL_type.__name__ in ["PPO", "A2C"]:
        kwargs = dict(ent_coef=0.01)
        env = SubprocVecEnv([make_env(env_kwargs) for _ in range(10)], start_method="spawn")
        env = SubprocVecEnv([make_env(env_kwargs) for _ in range(1)], start_method="spawn")
    elif modeL_type.__name__ in ["RegDQN", "DQN", "QRDQN"]:
        env = make_env(env_kwargs)()
        kwargs = dict(buffer_size=50000,
@@ -147,7 +147,8 @@ if __name__ == '__main__':
                      batch_size=64,
                      target_update_interval=5000,
                      exploration_fraction=0.25,
                      exploration_final_eps=0.025)
                      exploration_final_eps=0.025
                      )
    else:
        raise NameError(f'The model "{modeL_type.__name__}" has the wrong name.')
@@ -3,7 +3,6 @@ from pathlib import Path

import yaml
from natsort import natsorted
from stable_baselines3 import PPO, DQN, A2C
from stable_baselines3.common.evaluation import evaluate_policy

from environments.factory.factory_dirt import DirtProperties, DirtFactory
@@ -12,11 +11,10 @@ from environments.factory.factory_item import ItemProperties, ItemFactory
warnings.filterwarnings('ignore', category=FutureWarning)
warnings.filterwarnings('ignore', category=UserWarning)

model_map = dict(PPO=PPO, DQN=DQN, A2C=A2C)

if __name__ == '__main__':

    model_name = 'PPO_1631029150'
    model_name = 'DQN_1631092016'
    run_id = 0
    seed = 69
    out_path = Path(__file__).parent / 'debug_out'
@@ -38,5 +36,5 @@ if __name__ == '__main__':
    this_model = model_files[0]
    model_cls = next(val for key, val in model_map.items() if key in model_name)
    model = model_cls.load(this_model)
    evaluation_result = evaluate_policy(model, env, n_eval_episodes=100, deterministic=True, render=True)
    evaluation_result = evaluate_policy(model, env, n_eval_episodes=100, deterministic=False, render=True)
    print(evaluation_result)
studies/e_1.py (new file, 130 lines)

@@ -0,0 +1,130 @@
import itertools
import random
from pathlib import Path

import simplejson
from stable_baselines3 import DQN, PPO, A2C

from environments.factory.factory_dirt import DirtProperties, DirtFactory
from environments.factory.factory_item import ItemProperties, ItemFactory

if __name__ == '__main__':
"""
|
||||
In this studie, we want to explore the macro behaviour of multi agents which are trained on the same task,
|
||||
but never saw each other in training.
|
||||
Those agents learned
|
||||
|
||||
|
||||
We start with training a single policy on a single task (dirt cleanup / item pickup).
|
||||
Then multiple agent equipped with the same policy are deployed in the same environment.
|
||||
|
||||
There are further distinctions to be made:
|
||||
|
||||
1. No Observation - ['no_obs']:
|
||||
- Agent do not see each other but their consequences of their combined actions
|
||||
- Agents can collide
|
||||
|
||||
2. Observation in seperate slice - [['seperate_0'], ['seperate_1'], ['seperate_N']]:
|
||||
- Agents see other entitys on a seperate slice
|
||||
- This slice has been filled with $0 | 1 | \mathbb{N}(0, 1)$
|
||||
-- Depending ob the fill value, agents will react diffently
|
||||
-> TODO: Test this!
|
||||
|
||||
3. Observation in level slice - ['in_lvl_obs']:
|
||||
- This tells the agent to treat other agents as obstacle.
|
||||
- However, the state space is altered since moving obstacles are not part the original agent observation.
|
||||
- We are out of distribution.
|
||||
"""
|
||||
|
||||
|
||||
def bundle_model(model_class):
    # algorithm-specific hyper-parameters, selected by the class name of the model
    if model_class.__name__ in ["PPO", "A2C"]:
        kwargs = dict(ent_coef=0.01)
    elif model_class.__name__ in ["RegDQN", "DQN", "QRDQN"]:
        kwargs = dict(buffer_size=50000,
                      learning_starts=64,
                      batch_size=64,
                      target_update_interval=5000,
                      exploration_fraction=0.25,
                      exploration_final_eps=0.025
                      )
    return lambda: model_class(**kwargs)

if __name__ == '__main__':
    # Define a global study save path
    study_root_path = Path(Path(__file__).stem) / 'out'

    # TODO: Define Global Env Parameters
    factory_kwargs = {

    }

    # TODO: Define global model parameters

    # TODO: Define parameters for both envs
    dirt_props = DirtProperties()
    item_props = ItemProperties()

    # Bundle both environments with global kwargs and parameters
    env_bundles = [lambda: ('dirt', DirtFactory(factory_kwargs, dirt_properties=dirt_props)),
                   lambda: ('item', ItemFactory(factory_kwargs, item_properties=item_props))]

    # Define parameter versions according to #1,2[1,0,N],3
    observation_modes = ['no_obs', 'seperate_0', 'seperate_1', 'seperate_N', 'in_lvl_obs']

    # Define RL-Models
    model_bundles = [bundle_model(model) for model in [A2C, PPO, DQN]]

    # Zip parameters, parameter versions, Env Classes and models
    combinations = itertools.product(model_bundles, env_bundles)

    # Train starts here ############################################################
    # Build Major Loop
    for model, (env_identifier, env_bundle) in combinations:
        for observation_mode in observation_modes:
            # TODO: Create an identifier, which is unique for every combination and easy to read in filesystem
            identifier = f'{model.name}_{observation_mode}_{env_identifier}'
            # Train each combination per seed
            for seed in range(3):
                # TODO: Output folder
                # TODO: Monitor Init
                # TODO: Env Init
                # TODO: Model Init
                # TODO: Model train
                # TODO: Model save
                pass
            # TODO: Seed Compare Plot
    # Train ends here ############################################################

    # Evaluation starts here #####################################################
    # Iterate Observation Modes
    for observation_mode in observation_modes:
        # TODO: For trained policy in study_root_path / identifier
        for policy_group in (x for x in study_root_path.iterdir() if x.is_dir()):
            # TODO: Pick random seed or iterate over available seeds
            policy_seed = next((y for y in study_root_path.iterdir() if y.is_dir()))
            # TODO: retrieve model class
            # TODO: Load both agents
            models = []
            # TODO: Evaluation Loop for i in range(100) Episodes
            for episode in range(100):
                with next(policy_seed.glob('*.yaml')).open('r') as f:
                    env_kwargs = simplejson.load(f)
                # TODO: Monitor Init
                env = None  # TODO: Init Env
                for step in range(400):
                    random_actions = [[random.randint(0, env.n_actions) for _ in range(len(models))] for _ in range(200)]
                    env_state = env.reset()
                    rew = 0
                    for agent_i_action in random_actions:
                        env_state, step_r, done_bool, info_obj = env.step(agent_i_action)
                        rew += step_r
                        if done_bool:
                            break
                    print(f'Factory run {episode} done, reward is:\n {rew}')
                # TODO: Plotting

    pass
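
The three observation modes named in the docstring of studies/e_1.py are only listed in this commit, not implemented. As a minimal illustration of what they could mean for a single agent's observation stack, here is a sketch; the function name build_agent_observation, its arguments, and the slice layout are assumptions by the editor, not part of the repository.

import numpy as np


def build_agent_observation(level_slice: np.ndarray,
                            other_agent_positions: list,
                            mode: str = 'no_obs') -> np.ndarray:
    """Hypothetical sketch of the observation modes described in studies/e_1.py."""
    h, w = level_slice.shape
    if mode == 'no_obs':
        # 1. Other agents are simply not observable; only the level slice is returned.
        return level_slice[None, ...]
    elif mode.startswith('seperate'):
        # 2. Other agents live on an extra slice whose background fill is
        #    0, 1, or samples from N(0, 1), depending on the variant.
        fill = {'seperate_0': 0.0, 'seperate_1': 1.0}.get(mode)
        extra = (np.random.randn(h, w) if fill is None
                 else np.full((h, w), fill))
        for x, y in other_agent_positions:
            extra[x, y] = 1.0  # mark cells occupied by other agents
        return np.stack([level_slice, extra])
    elif mode == 'in_lvl_obs':
        # 3. Other agents are written into the level slice itself,
        #    so they look like (moving) obstacles to the policy.
        in_lvl = level_slice.copy()
        for x, y in other_agent_positions:
            in_lvl[x, y] = 1.0
        return in_lvl[None, ...]
    raise ValueError(f'Unknown observation mode: {mode}')

A policy trained on the plain single-agent observation would then be evaluated against each of these stacks, which is the comparison the TODO scaffold above iterates over.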