From 4c21a0af7cc8927d8489c7bd873fbb1a4049d70d Mon Sep 17 00:00:00 2001 From: Steffen Illium Date: Wed, 8 Sep 2021 16:24:14 +0200 Subject: [PATCH] study e_1 corpus --- environments/factory/base/base_factory.py | 2 +- environments/factory/base/objects.py | 152 +++++++++++----------- environments/factory/base/registers.py | 26 +++- environments/factory/factory_item.py | 6 +- environments/helpers.py | 4 + main.py | 5 +- reload_agent.py | 8 +- studies/e_1.py | 130 ++++++++++++++++++ 8 files changed, 246 insertions(+), 87 deletions(-) create mode 100644 studies/e_1.py diff --git a/environments/factory/base/base_factory.py b/environments/factory/base/base_factory.py index 6545a64..97cff7a 100644 --- a/environments/factory/base/base_factory.py +++ b/environments/factory/base/base_factory.py @@ -195,7 +195,7 @@ class BaseFactory(gym.Env): for action, agent in zip(actions, self[c.AGENT]): agent.clear_temp_state() action_obj = self._actions[int(action)] - self.print(f'Action #{action} has been resolved to: {action_obj}') + # self.print(f'Action #{action} has been resolved to: {action_obj}') if h.MovingAction.is_member(action_obj): valid = self._move_or_colide(agent, action_obj) elif h.EnvActions.NOOP == agent.temp_action: diff --git a/environments/factory/base/objects.py b/environments/factory/base/objects.py index 5963e18..6863d27 100644 --- a/environments/factory/base/objects.py +++ b/environments/factory/base/objects.py @@ -66,12 +66,90 @@ class Object: return other.name == self.name +class Entity(Object): + + @property + def can_collide(self): + return True + + @property + def encoding(self): + return c.OCCUPIED_CELL.value + + @property + def x(self): + return self.pos[0] + + @property + def y(self): + return self.pos[1] + + @property + def pos(self): + return self._tile.pos + + @property + def tile(self): + return self._tile + + def __init__(self, tile, **kwargs): + super(Entity, self).__init__(**kwargs) + self._tile = tile + tile.enter(self) + + def summarize_state(self) -> dict: + return dict(name=str(self.name), x=int(self.x), y=int(self.y), + tile=str(self.tile.name), can_collide=bool(self.can_collide)) + + def __repr__(self): + return f'{self.name}(@{self.pos})' + + +class MoveableEntity(Entity): + + @property + def last_tile(self): + return self._last_tile + + @property + def last_pos(self): + if self._last_tile: + return self._last_tile.pos + else: + return c.NO_POS + + @property + def direction_of_view(self): + last_x, last_y = self.last_pos + curr_x, curr_y = self.pos + return last_x-curr_x, last_y-curr_y + + def __init__(self, *args, **kwargs): + super(MoveableEntity, self).__init__(*args, **kwargs) + self._last_tile = None + + def move(self, next_tile): + curr_tile = self.tile + if curr_tile != next_tile: + next_tile.enter(self) + curr_tile.leave(self) + self._tile = next_tile + self._last_tile = curr_tile + return True + else: + return False + + class Action(Object): def __init__(self, *args, **kwargs): super(Action, self).__init__(*args, **kwargs) +class PlaceHolder(MoveableEntity): + pass + + class Tile(Object): @property @@ -133,45 +211,6 @@ class Wall(Tile): pass -class Entity(Object): - - @property - def can_collide(self): - return True - - @property - def encoding(self): - return c.OCCUPIED_CELL.value - - @property - def x(self): - return self.pos[0] - - @property - def y(self): - return self.pos[1] - - @property - def pos(self): - return self._tile.pos - - @property - def tile(self): - return self._tile - - def __init__(self, tile: Tile, **kwargs): - super(Entity, 
self).__init__(**kwargs)
-        self._tile = tile
-        tile.enter(self)
-
-    def summarize_state(self) -> dict:
-        return dict(name=str(self.name), x=int(self.x), y=int(self.y),
-                    tile=str(self.tile.name), can_collide=bool(self.can_collide))
-
-    def __repr__(self):
-        return f'{self.name}(@{self.pos})'
-
-
 class Door(Entity):
 
     @property
@@ -261,41 +300,6 @@ class Door(Entity):
         return False
 
 
-class MoveableEntity(Entity):
-
-    @property
-    def last_tile(self):
-        return self._last_tile
-
-    @property
-    def last_pos(self):
-        if self._last_tile:
-            return self._last_tile.pos
-        else:
-            return c.NO_POS
-
-    @property
-    def direction_of_view(self):
-        last_x, last_y = self.last_pos
-        curr_x, curr_y = self.pos
-        return last_x-curr_x, last_y-curr_y
-
-    def __init__(self, *args, **kwargs):
-        super(MoveableEntity, self).__init__(*args, **kwargs)
-        self._last_tile = None
-
-    def move(self, next_tile):
-        curr_tile = self.tile
-        if curr_tile != next_tile:
-            next_tile.enter(self)
-            curr_tile.leave(self)
-            self._tile = next_tile
-            self._last_tile = curr_tile
-            return True
-        else:
-            return False
-
-
 class Agent(MoveableEntity):
 
     def __init__(self, *args, **kwargs):
diff --git a/environments/factory/base/registers.py b/environments/factory/base/registers.py
index 3d06fb7..12cbf6a 100644
--- a/environments/factory/base/registers.py
+++ b/environments/factory/base/registers.py
@@ -4,7 +4,7 @@ from typing import List, Union, Dict
 
 import numpy as np
 
-from environments.factory.base.objects import Entity, Tile, Agent, Door, Action, Wall, Object
+from environments.factory.base.objects import Entity, Tile, Agent, Door, Action, Wall, Object, PlaceHolder
 from environments.utility_classes import MovementProperties
 from environments import helpers as h
 from environments.helpers import Constants as c
@@ -156,6 +156,25 @@ class MovingEntityObjectRegister(EntityObjectRegister, ABC):
         del self[name]
 
 
+class PlaceHolderRegister(MovingEntityObjectRegister):
+
+    _accepted_objects = PlaceHolder
+
+    # noinspection DuplicatedCode
+    def as_array(self):
+        self._array[:] = c.FREE_CELL.value
+        # noinspection PyTupleAssignmentBalance
+        for z, x, y, v in zip(range(len(self)), *zip(*[x.pos for x in self]), [x.encoding for x in self]):
+            if self.individual_slices:
+                self._array[z, x, y] += v
+            else:
+                self._array[0, x, y] += v
+        if self.individual_slices:
+            return self._array
+        else:
+            return self._array.sum(axis=0, keepdims=True)
+
+
 class Entities(Register):
 
     _accepted_objects = EntityObjectRegister
@@ -256,6 +275,9 @@ class FloorTiles(WallTiles):
 
 class Agents(MovingEntityObjectRegister):
 
+    _accepted_objects = Agent
+
+    # noinspection DuplicatedCode
     def as_array(self):
         self._array[:] = c.FREE_CELL.value
         # noinspection PyTupleAssignmentBalance
@@ -269,8 +291,6 @@ class Agents(MovingEntityObjectRegister):
         else:
             return self._array.sum(axis=0, keepdims=True)
 
-    _accepted_objects = Agent
-
     @property
     def positions(self):
         return [agent.pos for agent in self]
diff --git a/environments/factory/factory_item.py b/environments/factory/factory_item.py
index c04a15a..f93c2c7 100644
--- a/environments/factory/factory_item.py
+++ b/environments/factory/factory_item.py
@@ -311,15 +311,17 @@ class ItemFactory(BaseFactory):
         reward, info_dict = super().calculate_additional_reward(agent)
         if h.EnvActions.ITEM_ACTION == agent.temp_action:
             if agent.temp_valid:
-                if self[c.DROP_OFF].by_pos(agent.pos):
+                if drop_off := self[c.DROP_OFF].by_pos(agent.pos):
                     info_dict.update({f'{agent.name}_item_dropoff': 1})
-
+                    self.print(f'{agent.name} just dropped off an item at {drop_off.pos}.')
                    reward += 0.5
                 else:
                     info_dict.update({f'{agent.name}_item_pickup': 1})
+                    self.print(f'{agent.name} just picked up an item at {agent.pos}.')
                     reward += 0.1
             else:
                 info_dict.update({f'{agent.name}_failed_item_action': 1})
+                self.print(f'{agent.name} just tried to pick up an item at {agent.pos}, but failed.')
                 reward -= 0.1
         return reward, info_dict
 
diff --git a/environments/helpers.py b/environments/helpers.py
index c9127b9..4622a41 100644
--- a/environments/helpers.py
+++ b/environments/helpers.py
@@ -5,6 +5,8 @@ from typing import Tuple, Union
 import numpy as np
 from pathlib import Path
 
+from stable_baselines3 import PPO, DQN, A2C
+
 LEVELS_DIR = 'levels'
 
 TO_BE_AVERAGED = ['dirt_amount', 'dirty_tiles']
@@ -142,6 +144,8 @@ def asset_str(agent):
         return c.AGENT.value, 'idle'
 
 
+model_map = dict(PPO=PPO, DQN=DQN, A2C=A2C)
+
 if __name__ == '__main__':
     parsed_level = parse_level(Path(__file__).parent / 'factory' / 'levels' / 'simple.txt')
     y = one_hot_level(parsed_level)
diff --git a/main.py b/main.py
index 013d7e1..5c6867a 100644
--- a/main.py
+++ b/main.py
@@ -139,7 +139,7 @@ if __name__ == '__main__':
 
         if modeL_type.__name__ in ["PPO", "A2C"]:
             kwargs = dict(ent_coef=0.01)
-            env = SubprocVecEnv([make_env(env_kwargs) for _ in range(10)], start_method="spawn")
+            env = SubprocVecEnv([make_env(env_kwargs) for _ in range(1)], start_method="spawn")
         elif modeL_type.__name__ in ["RegDQN", "DQN", "QRDQN"]:
             env = make_env(env_kwargs)()
             kwargs = dict(buffer_size=50000,
@@ -147,7 +147,8 @@ if __name__ == '__main__':
                           batch_size=64,
                           target_update_interval=5000,
                           exploration_fraction=0.25,
-                          exploration_final_eps=0.025)
+                          exploration_final_eps=0.025
+                          )
         else:
             raise NameError(f'The model "{modeL_type.__name__}" has the wrong name.')
 
diff --git a/reload_agent.py b/reload_agent.py
index 47e9d25..44c9464 100644
--- a/reload_agent.py
+++ b/reload_agent.py
@@ -3,7 +3,6 @@ from pathlib import Path
 
 import yaml
 from natsort import natsorted
-from stable_baselines3 import PPO, DQN, A2C
 from stable_baselines3.common.evaluation import evaluate_policy
 
 from environments.factory.factory_dirt import DirtProperties, DirtFactory
@@ -12,13 +11,12 @@ from environments.factory.factory_item import ItemProperties, ItemFactory
 warnings.filterwarnings('ignore', category=FutureWarning)
 warnings.filterwarnings('ignore', category=UserWarning)
 
-model_map = dict(PPO=PPO, DQN=DQN, A2C=A2C)
 
 if __name__ == '__main__':
 
-    model_name = 'PPO_1631029150'
+    model_name = 'DQN_1631092016'
     run_id = 0
-    seed=69
+    seed = 69
     out_path = Path(__file__).parent / 'debug_out'
     model_path = out_path / model_name
 
@@ -38,5 +36,5 @@ if __name__ == '__main__':
         this_model = model_files[0]
         model_cls = next(val for key, val in model_map.items() if key in model_name)
         model = model_cls.load(this_model)
-    evaluation_result = evaluate_policy(model, env, n_eval_episodes=100, deterministic=True, render=True)
+    evaluation_result = evaluate_policy(model, env, n_eval_episodes=100, deterministic=False, render=True)
     print(evaluation_result)
diff --git a/studies/e_1.py b/studies/e_1.py
new file mode 100644
index 0000000..58d06fb
--- /dev/null
+++ b/studies/e_1.py
@@ -0,0 +1,130 @@
+import itertools
+import random
+from pathlib import Path
+
+import yaml
+from stable_baselines3 import DQN, PPO, A2C
+
+from environments.factory.factory_dirt import DirtProperties, DirtFactory
+from environments.factory.factory_item import ItemProperties, ItemFactory
+
+if __name__ == '__main__':
+    """
+    In this study, we want to explore the macro behaviour of multiple agents that are trained on the same task,
+    but never saw each other during training.
+    Each agent learned its behaviour in isolation, without ever observing another agent.
+
+    We start with training a single policy on a single task (dirt cleanup / item pickup).
+    Then multiple agents equipped with the same policy are deployed in the same environment.
+
+    There are further distinctions to be made:
+
+    1. No Observation - ['no_obs']:
+    - Agents do not see each other, only the consequences of their combined actions
+    - Agents can collide
+
+    2. Observation in separate slice - [['separate_0'], ['separate_1'], ['separate_N']]:
+    - Agents see other entities on a separate slice
+    - This slice has been filled with $0 | 1 | \mathcal{N}(0, 1)$
+    -- Depending on the fill value, agents will react differently
+    -> TODO: Test this!
+
+    3. Observation in level slice - ['in_lvl_obs']:
+    - This tells the agent to treat other agents as obstacles.
+    - However, the state space is altered since moving obstacles are not part of the original agent observation.
+    - We are out of distribution.
+    """
+
+
+def bundle_model(model_class):
+    if model_class.__name__ in ["PPO", "A2C"]:
+        kwargs = dict(ent_coef=0.01)
+    elif model_class.__name__ in ["RegDQN", "DQN", "QRDQN"]:
+        kwargs = dict(buffer_size=50000,
+                      learning_starts=64,
+                      batch_size=64,
+                      target_update_interval=5000,
+                      exploration_fraction=0.25,
+                      exploration_final_eps=0.025
+                      )
+    else:
+        raise NameError(f'The model "{model_class.__name__}" has the wrong name.')
+    # Return the model name together with a deferred constructor.
+    return model_class.__name__, lambda: model_class(**kwargs)
+
+
+if __name__ == '__main__':
+    # Define a global study save path
+    study_root_path = Path(Path(__file__).stem) / 'out'
+
+    # TODO: Define Global Env Parameters
+    factory_kwargs = {
+
+
+    }
+
+    # TODO: Define global model parameters
+
+
+    # TODO: Define parameters for both envs
+    dirt_props = DirtProperties()
+    item_props = ItemProperties()
+
+    # Bundle both environments with global kwargs and parameters
+    env_bundles = [('dirt', lambda: DirtFactory(dirt_properties=dirt_props, **factory_kwargs)),
+                   ('item', lambda: ItemFactory(item_properties=item_props, **factory_kwargs))]
+
+    # Define parameter versions according to #1, #2 [0, 1, N], #3
+    observation_modes = ['no_obs', 'separate_0', 'separate_1', 'separate_N', 'in_lvl_obs']
+
+    # Define RL-Models
+    model_bundles = [bundle_model(model) for model in [A2C, PPO, DQN]]
+
+    # Cross product of models and environments (observation modes are iterated below)
+    combinations = itertools.product(model_bundles, env_bundles)
+
+    # Train starts here ############################################################
+    # Build Major Loop
+    for (model_name, model_bundle), (env_identifier, env_bundle) in combinations:
+        for observation_mode in observation_modes:
+            # TODO: Create an identifier, which is unique for every combination and easy to read in filesystem
+            identifier = f'{model_name}_{observation_mode}_{env_identifier}'
+            # Train each combination per seed
+            for seed in range(3):
+                # TODO: Output folder
+                # TODO: Monitor Init
+                # TODO: Env Init
+                # TODO: Model Init
+                # TODO: Model train
+                # TODO: Model save
+                pass
+            # TODO: Seed Compare Plot
+    # Train ends here ############################################################
+
+    # Evaluation starts here #####################################################
+    # Iterate Observation Modes
+    for observation_mode in observation_modes:
+        # TODO: For trained policy in study_root_path / identifier
+        for policy_group in (x for x in study_root_path.iterdir() if x.is_dir()):
+            # TODO: Pick random seed or iterate over available seeds
+            policy_seed = next((y for y in policy_group.iterdir() if y.is_dir()))
+            # TODO: retrieve model class
+            # TODO: Load both agents
+            models = []
+            # TODO: Evaluation loop over 100 episodes
+            for episode in range(100):
+                with next(policy_seed.glob('*.yaml')).open('r') as f:
+                    env_kwargs = yaml.safe_load(f)
+                # TODO: Monitor Init
+                env = None  # TODO: Init Env
+                env_state = env.reset()
+                rew = 0
+                for step in range(400):
+                    # Placeholder rollout: sample one random action per agent until the trained models are wired in.
+                    random_actions = [random.randrange(env.n_actions) for _ in models]
+                    env_state, step_r, done_bool, info_obj = env.step(random_actions)
+                    rew += step_r
+                    if done_bool:
+                        break
+                print(f'Factory run {episode} done, reward is:\n {rew}')
+            # TODO: Plotting
+
+    pass
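
For observation mode #2 above, one plausible reading is that during single-agent training the slice reserved for other agents (cf. the new PlaceHolder entity and PlaceHolderRegister in this patch) is filled with a constant or with Gaussian noise, and only at deployment does it carry the other agents' occupied cells. A minimal sketch of that idea, assuming 2D grid slices and an occupied-cell encoding of 1; the function name and its arguments are illustrative and not part of the patch:

import numpy as np


def other_agents_slice(level_shape, observation_mode, other_agent_positions=(), rng=None):
    # Build the extra observation slice that stands in for "other agents".
    # 'no_obs' and 'in_lvl_obs' add no separate slice at all.
    if observation_mode in ('no_obs', 'in_lvl_obs'):
        return None
    rng = rng if rng is not None else np.random.default_rng()
    if observation_mode == 'separate_0':
        slice_ = np.zeros(level_shape)             # background filled with 0
    elif observation_mode == 'separate_1':
        slice_ = np.ones(level_shape)              # background filled with 1
    elif observation_mode == 'separate_N':
        slice_ = rng.standard_normal(level_shape)  # background filled with N(0, 1) noise
    else:
        raise ValueError(f'Unknown observation mode: {observation_mode}')
    # At deployment time, the other agents' positions are written on top of the fill value.
    for x, y in other_agent_positions:
        slice_[x, y] = 1  # assumption: OCCUPIED_CELL encodes as 1
    return slice_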
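
Once the model-loading and env-init TODOs are resolved, the deployment described in the docstring (multiple agents sharing one trained single-agent policy) could look roughly like the sketch below. It reuses model_map from environments/helpers.py (added in this patch); that env.reset() yields one observation per agent and that env.step() accepts a list of per-agent actions are assumptions carried over from the placeholder loop above, not confirmed by the patch:

from pathlib import Path

from environments.helpers import model_map  # PPO / DQN / A2C lookup added in this patch


def deploy_shared_policy(env, model_file: Path, episodes: int = 100, max_steps: int = 400):
    # Load the single-agent policy once; every agent in the env acts with the same weights.
    model_cls = next(val for key, val in model_map.items() if key in model_file.name)
    model = model_cls.load(model_file)
    for episode in range(episodes):
        env_state = env.reset()
        rew = 0
        for _ in range(max_steps):
            # Assumption: env_state stacks one observation per agent along the first axis.
            actions = [model.predict(agent_obs, deterministic=False)[0] for agent_obs in env_state]
            env_state, step_r, done_bool, info_obj = env.step(actions)
            rew += step_r
            if done_bool:
                break
        print(f'Factory run {episode} done, reward is:\n {rew}')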