Stable Baseline Running

2026-07-15 15:21:50 +02:00 · 2021-05-19 16:50:42 +02:00
parent 575eec9ee6
commit b979a47b6f
4 changed files with 147 additions and 197 deletions
@@ -1,3 +1,4 @@
+import abc
 from typing import List, Union, Iterable

 import gym
@@ -61,17 +62,14 @@ class BaseFactory(gym.Env):
        self.allow_horizontal_movement = True
        self.allow_no_OP = True
        self._monitor_list = list()
-        self._registered_actions = self.movement_actions + int(self.allow_no_OP)
+        self._registered_actions = self.movement_actions + int(self.allow_no_OP) + self.register_additional_actions()
        self.level = h.one_hot_level(
            h.parse_level(Path(__file__).parent / h.LEVELS_DIR / f'{level}.txt')
        )
        self.slice_strings = {0: 'level', **{i: f'agent#{i}' for i in range(1, self.n_agents+1)}}
        self.reset()

-    def __init_subclass__(cls):
-        print(cls)
-
-    def register_additional_actions(self):
+    def register_additional_actions(self) -> int:
        raise NotImplementedError('Please register additional actions ')

    def reset(self) -> (np.ndarray, int, bool, dict):
@@ -111,6 +109,8 @@ class BaseFactory(gym.Env):
            agent_i_state = AgentState(agent_i, action)
            if self._is_moving_action(action):
                pos, valid = self.move_or_colide(agent_i, action)
+            elif self._is_no_op(action):
+                pos, valid = self.agent_i_position(agent_i), True
            else:
                pos, valid = self.additional_actions(agent_i, action)
            # Update state accordingly
@@ -129,10 +129,10 @@ class BaseFactory(gym.Env):
        return self.state, self.cumulative_reward, self.done, info

    def _is_moving_action(self, action):
-        if action < self.movement_actions:
-            return True
-        else:
-            return False
+        return action < self.movement_actions
+
+    def _is_no_op(self, action):  # True only if no-op is enabled and action is the index right after the movement actions
+        return self.allow_no_OP and (action - self.movement_actions) == 0

    def check_all_collisions(self, agent_states: List[AgentState], collisions: int) -> np.ndarray:
        collision_vecs = np.zeros((len(agent_states), collisions))  # n_agents x n_slices
@@ -1,49 +1,157 @@
+from collections import OrderedDict
+from dataclasses import dataclass
+from typing import List
+import random
+
 import numpy as np
-from environments.factory.base_factory import BaseFactory, FactoryMonitor
+
+from environments.factory.base_factory import BaseFactory, AgentState
+from environments import helpers as h
+
+from environments.factory.renderer import Renderer
+from environments.factory.renderer import Entity
+from environments.logging.monitor import MonitorCallback
+
+DIRT_INDEX = -1
+
+
+@dataclass
+class DirtProperties:
+    clean_amount: float = 0.25     # dirt removed from a tile by one successful clean-up
+    max_spawn_ratio: float = 0.1   # upper bound of the random share of eligible tiles that gain dirt per spawn
+    gain_amount: float = 0.1       # dirt added to each tile selected during a spawn
+    spawn_frequency: int = 5       # value the spawn countdown is reset to after dirt was spawned


 class SimpleFactory(BaseFactory):
-    def __init__(self, *args, max_dirt=5, **kwargs):
-        self.max_dirt = max_dirt
+
+    def register_additional_actions(self) -> int:  # count of extra actions on top of movement/no-op (presumably the clean-up action)
+        return 1
+
+    def _is_clean_up_action(self, action):  # clean-up is encoded as the last index of the action space
+        return self.action_space.n - 1 == action
+
+    def __init__(self, *args, dirt_properties: DirtProperties, **kwargs):
+        self._dirt_properties = dirt_properties  # set before super().__init__(): the base constructor appears to call reset(), which reads it
        super(SimpleFactory, self).__init__(*args, **kwargs)
        self.slice_strings.update({self.state.shape[0]-1: 'dirt'})
+        self.renderer = None  # expensive - dont use it when not required !

-    def spawn_dirt(self):
-        free_for_dirt = self.free_cells
-        for x, y in free_for_dirt[:self.max_dirt]:  # randomly distribute dirt across the grid
-            self.state[-1, x, y] = 1
+    def render(self):  # draws dirt, walls and agents through a Renderer that is created on first use
+        if not self.renderer:  # lazy init
+            height, width = self.state.shape[1:]
+            self.renderer = Renderer(width, height, view_radius=2)

-    def reset(self):
-        state, r, done, _ = super().reset()
+        dirt      = [Entity('dirt', [x, y], min(0.15+self.state[DIRT_INDEX, x, y], 1.5), 'scale')
+                     for x, y in np.argwhere(self.state[DIRT_INDEX] > h.IS_FREE_CELL)]
+        walls     = [Entity('wall', pos) for pos in np.argwhere(self.state[h.LEVEL_IDX] > h.IS_FREE_CELL)]
+
+        def asset_str(agent):
+            cols = ' '.join([self.slice_strings[j] for j in agent.collisions])
+            if 'agent' in cols:
+                return 'agent_collision'
+            elif not agent.action_valid or 'level' in cols or 'agent' in cols:
+                return f'agent{agent.i + 1}violation'
+            elif self._is_clean_up_action(agent.action):
+                return f'agent{agent.i + 1}valid'
+            else:
+                return f'agent{agent.i + 1}'
+
+        agents = {f'agent{i+1}': [Entity(asset_str(agent), agent.pos)]
+                  for i, agent in enumerate(self.agent_states)}
+        self.renderer.render(OrderedDict(dirt=dirt, wall=walls, **agents))
+
+    def spawn_dirt(self) -> None:  # adds gain_amount of dirt to a randomly sized selection of eligible cells
+        free_for_dirt = self.free_cells(excluded_slices=DIRT_INDEX)
+
+        # randomly distribute dirt across the grid
+        n_dirt_tiles = int(random.uniform(0, self._dirt_properties.max_spawn_ratio) * len(free_for_dirt))
+        for x, y in free_for_dirt[:n_dirt_tiles]:  # NOTE(review): takes the first n cells; positions are random only if free_cells() shuffles - confirm
+            self.state[DIRT_INDEX, x, y] += self._dirt_properties.gain_amount
+
+    def clean_up(self, pos: (int, int)) -> ((int, int), bool):  # removes up to clean_amount dirt at pos; returns (pos, success)
+        new_dirt_amount = self.state[DIRT_INDEX][pos] - self._dirt_properties.clean_amount
+        cleanup_was_sucessfull: bool
+        if self.state[DIRT_INDEX][pos] == h.IS_FREE_CELL:  # nothing to clean on this tile
+            cleanup_was_sucessfull = False
+            return pos, cleanup_was_sucessfull
+        else:
+            cleanup_was_sucessfull = True
+            self.state[DIRT_INDEX][pos] = max(new_dirt_amount, h.IS_FREE_CELL)  # never drop below the free-cell value
+            return pos, cleanup_was_sucessfull
+
+    def step(self, actions):  # runs the base step, then spawns dirt whenever the countdown has reached zero
+        _, _, _, info = super(SimpleFactory, self).step(actions)
+        if not self.next_dirt_spawn:  # countdown expired
+            self.spawn_dirt()
+            self.next_dirt_spawn = self._dirt_properties.spawn_frequency
+        else:
+            self.next_dirt_spawn -= 1
+        return self.state, self.cumulative_reward, self.done, info  # re-read attributes instead of reusing the base return values
+
+    def additional_actions(self, agent_i: int, action: int) -> ((int, int), bool):  # performs the clean-up action; returns (agent position, validity)
+        if not self._is_moving_action(action):  # guard against misrouted movement actions (was an int-vs-bool `!=` comparison)
+            if self._is_clean_up_action(action):
+                agent_i_pos = self.agent_i_position(agent_i)
+                _, valid = self.clean_up(agent_i_pos)
+                if valid:
+                    print(f'Agent {agent_i} did just clean up some dirt at {agent_i_pos}.')
+                    self.monitor.add('dirt_cleaned', self._dirt_properties.clean_amount)
+                else:
+                    print(f'Agent {agent_i} just tried to clean up some dirt at {agent_i_pos}, but was unsucsessfull.')
+                    self.monitor.add('failed_cleanup_attempt', 1)
+                return agent_i_pos, valid
+            else:
+                raise RuntimeError('This should not happen!!!')
+        else:
+            raise RuntimeError('This should not happen!!!')
+
+    def reset(self) -> np.ndarray:  # returns only the state array (the method ends with `return self.state`), not a 4-tuple
+        _ = super().reset()  # state, reward, done, info ... =
        dirt_slice = np.zeros((1, *self.state.shape[1:]))
        self.state = np.concatenate((self.state, dirt_slice))  # dirt is now the last slice
        self.spawn_dirt()
-        # Always: This should return state
+        self.next_dirt_spawn = self._dirt_properties.spawn_frequency
        return self.state

-    def calculate_reward(self, agent_states):
+    def calculate_reward(self, agent_states: List[AgentState]) -> (int, dict):
+        # TODO: What reward to use?
+        current_dirt_amount = self.state[DIRT_INDEX].sum()
+        dirty_tiles = np.count_nonzero(self.state[DIRT_INDEX])  # len(np.nonzero(...)) counted array axes, not dirty tiles
+
+        # numpy float division by zero gives inf/nan (with a warning) rather than ZeroDivisionError, so check explicitly
+        this_step_reward = 0
+        if current_dirt_amount != 0:
+            this_step_reward = -(dirty_tiles / current_dirt_amount)
+
        for agent_state in agent_states:
            collisions = agent_state.collisions
-            entities = [self.slice_strings[entity] for entity in collisions]
-            if entities:
-                for entity in entities:
-                    self.monitor.add(f'agent_{agent_state.i}_collision_{entity}', 1)
-                print(f't = {self.steps}\tAgent {agent_state.i} has collisions with '
-                      f'{entities}')
-        return 0, {}
+            print(f't = {self.steps}\tAgent {agent_state.i} has collisions with '
+                  f'{[self.slice_strings[entity] for entity in collisions if entity != self.string_slices["dirt"]]}')
+            if self._is_clean_up_action(agent_state.action) and agent_state.action_valid:
+                this_step_reward += 1
+
+            for entity in collisions:
+                if entity != self.string_slices["dirt"]:
+                    self.monitor.add(f'agent_{agent_state.i}_vs_{self.slice_strings[entity]}', 1)
+        self.monitor.set('dirt_amount', current_dirt_amount)
+        self.monitor.set('dirty_tiles', dirty_tiles)
+        return this_step_reward, {}


 if __name__ == '__main__':
-    import random
-    factory = SimpleFactory(n_agents=1, max_dirt=8)
-    monitor_list = list()
-    for epoch in range(5):
-        random_actions = [random.randint(0, 7) for _ in range(200)]
-        state, r, done, _ = factory.reset()
-        for action in random_actions:
-            state, r, done, info = factory.step(action)
-        monitor_list.append(factory.monitor)
-
-        print(f'Factory run done, reward is:\n    {r}')
-        print(f'There have been the following collisions: \n {dict(factory.monitor)}')
+    render = True

+    dirt_props = DirtProperties()
+    factory = SimpleFactory(n_agents=2, dirt_properties=dirt_props)
+    with MonitorCallback(factory):
+        for epoch in range(100):
+            random_actions = [(random.randint(0, 8), random.randint(0, 8)) for _ in range(200)]
+            env_state = factory.reset()  # reset() returns only the state; reward/done/info come from step() below
+            for agent_i_action in random_actions:
+                env_state, reward, done_bool, info_obj = factory.step(agent_i_action)
+                if render:
+                    factory.render()
+                if done_bool:
+                    break
+            print(f'Factory run {epoch} done, reward is:\n    {reward}')
@@ -1,158 +0,0 @@
-from collections import OrderedDict
-from dataclasses import dataclass
-from typing import List
-import random
-
-import numpy as np
-
-from environments.factory.base_factory import BaseFactory, AgentState
-from environments import helpers as h
-
-from environments.factory.renderer import Renderer
-from environments.factory.renderer import Entity
-from environments.logging.monitor import MonitorCallback
-
-DIRT_INDEX = -1
-
-
-@dataclass
-class DirtProperties:
-    clean_amount = 0.25
-    max_spawn_ratio = 0.1
-    gain_amount = 0.1
-    spawn_frequency = 5
-
-
-class GettingDirty(BaseFactory):
-
-    def register_additional_actions(self):
-        self._registered_actions += 1
-        return True
-
-    def _is_clean_up_action(self, action):
-        return self.action_space.n - 1 == action
-
-    def __init__(self, *args, dirt_properties: DirtProperties, **kwargs):
-        self._dirt_properties = dirt_properties
-        super(GettingDirty, self).__init__(*args, **kwargs)
-        self.slice_strings.update({self.state.shape[0]-1: 'dirt'})
-        self.renderer = None  # expensive - dont use it when not required !
-
-    def render(self):
-        if not self.renderer:  # lazy init
-            height, width = self.state.shape[1:]
-            self.renderer = Renderer(width, height, view_radius=2)
-
-        dirt      = [Entity('dirt', [x, y], min(0.15+self.state[DIRT_INDEX, x, y], 1.5), 'scale')
-                     for x, y in np.argwhere(self.state[DIRT_INDEX] > h.IS_FREE_CELL)]
-        walls     = [Entity('wall', pos) for pos in np.argwhere(self.state[h.LEVEL_IDX] > h.IS_FREE_CELL)]
-
-        def asset_str(agent):
-            cols = ' '.join([self.slice_strings[j] for j in agent.collisions])
-            if 'agent' in cols:
-                return 'agent_collision'
-            elif not agent.action_valid or 'level' in cols or 'agent' in cols:
-                return f'agent{agent.i + 1}violation'
-            elif self._is_clean_up_action(agent.action):
-                return f'agent{agent.i + 1}valid'
-            else:
-                return f'agent{agent.i + 1}'
-
-        agents = {f'agent{i+1}': [Entity(asset_str(agent), agent.pos)]
-                  for i, agent in enumerate(self.agent_states)}
-        self.renderer.render(OrderedDict(dirt=dirt, wall=walls, **agents))
-
-    def spawn_dirt(self) -> None:
-        free_for_dirt = self.free_cells(excluded_slices=DIRT_INDEX)
-
-        # randomly distribute dirt across the grid
-        n_dirt_tiles = int(random.uniform(0, self._dirt_properties.max_spawn_ratio) * len(free_for_dirt))
-        for x, y in free_for_dirt[:n_dirt_tiles]:
-            self.state[DIRT_INDEX, x, y] += self._dirt_properties.gain_amount
-
-    def clean_up(self, pos: (int, int)) -> ((int, int), bool):
-        new_dirt_amount = self.state[DIRT_INDEX][pos] - self._dirt_properties.clean_amount
-        cleanup_was_sucessfull: bool
-        if self.state[DIRT_INDEX][pos] == h.IS_FREE_CELL:
-            cleanup_was_sucessfull = False
-            return pos, cleanup_was_sucessfull
-        else:
-            cleanup_was_sucessfull = True
-            self.state[DIRT_INDEX][pos] = max(new_dirt_amount, h.IS_FREE_CELL)
-            return pos, cleanup_was_sucessfull
-
-    def step(self, actions):
-        _, _, _, info = super(GettingDirty, self).step(actions)
-        if not self.next_dirt_spawn:
-            self.spawn_dirt()
-            self.next_dirt_spawn = self._dirt_properties.spawn_frequency
-        else:
-            self.next_dirt_spawn -= 1
-        return self.state, self.cumulative_reward, self.done, info
-
-    def additional_actions(self, agent_i: int, action: int) -> ((int, int), bool):
-        if action != self._is_moving_action(action):
-            if self._is_clean_up_action(action):
-                agent_i_pos = self.agent_i_position(agent_i)
-                _, valid = self.clean_up(agent_i_pos)
-                if valid:
-                    print(f'Agent {agent_i} did just clean up some dirt at {agent_i_pos}.')
-                    self.monitor.add('dirt_cleaned', self._dirt_properties.clean_amount)
-                else:
-                    print(f'Agent {agent_i} just tried to clean up some dirt at {agent_i_pos}, but was unsucsessfull.')
-                    self.monitor.add('failed_cleanup_attempt', 1)
-                return agent_i_pos, valid
-            else:
-                raise RuntimeError('This should not happen!!!')
-        else:
-            raise RuntimeError('This should not happen!!!')
-
-    def reset(self) -> (np.ndarray, int, bool, dict):
-        _ = super().reset()  # state, reward, done, info ... =
-        dirt_slice = np.zeros((1, *self.state.shape[1:]))
-        self.state = np.concatenate((self.state, dirt_slice))  # dirt is now the last slice
-        self.spawn_dirt()
-        self.next_dirt_spawn = self._dirt_properties.spawn_frequency
-        return self.state
-
-    def calculate_reward(self, agent_states: List[AgentState]) -> (int, dict):
-        # TODO: What reward to use?
-        current_dirt_amount = self.state[DIRT_INDEX].sum()
-        dirty_tiles = len(np.nonzero(self.state[DIRT_INDEX]))
-
-        try:
-            this_step_reward = -(dirty_tiles / current_dirt_amount)
-        except ZeroDivisionError:
-            this_step_reward = 0
-
-        for agent_state in agent_states:
-            collisions = agent_state.collisions
-            print(f't = {self.steps}\tAgent {agent_state.i} has collisions with '
-                  f'{[self.slice_strings[entity] for entity in collisions if entity != self.string_slices["dirt"]]}')
-            if self._is_clean_up_action(agent_state.action) and agent_state.action_valid:
-                this_step_reward += 1
-
-            for entity in collisions:
-                if entity != self.string_slices["dirt"]:
-                    self.monitor.add(f'agent_{agent_state.i}_vs_{self.slice_strings[entity]}', 1)
-        self.monitor.set('dirt_amount', current_dirt_amount)
-        self.monitor.set('dirty_tiles', dirty_tiles)
-        return this_step_reward, {}
-
-
-if __name__ == '__main__':
-    render = True
-
-    dirt_props = DirtProperties()
-    factory = GettingDirty(n_agents=2, dirt_properties=dirt_props)
-    with MonitorCallback(factory):
-        for epoch in range(100):
-            random_actions = [(random.randint(0, 8), random.randint(0, 8)) for _ in range(200)]
-            env_state, reward, done_bool, _ = factory.reset()
-            for agent_i_action in random_actions:
-                env_state, reward, done_bool, info_obj = factory.step(agent_i_action)
-                if render:
-                    factory.render()
-                if done_bool:
-                    break
-            print(f'Factory run {epoch} done, reward is:\n    {reward}')