From b979a47b6fd4bcb337a4170958eb1ca440a2b2c3 Mon Sep 17 00:00:00 2001
From: steffen-illium
Date: Wed, 19 May 2021 16:50:42 +0200
Subject: [PATCH] Stable Baseline Running

---
 environments/factory/_factory_monitor.py     |   0
 environments/factory/base_factory.py         |  18 +-
 environments/factory/simple_factory.py       | 168 ++++++++++++++----
 .../factory/simple_factory_getting_dirty.py  | 158 ----------------
 4 files changed, 147 insertions(+), 197 deletions(-)
 delete mode 100644 environments/factory/_factory_monitor.py
 delete mode 100644 environments/factory/simple_factory_getting_dirty.py

diff --git a/environments/factory/_factory_monitor.py b/environments/factory/_factory_monitor.py
deleted file mode 100644
index e69de29..0000000
diff --git a/environments/factory/base_factory.py b/environments/factory/base_factory.py
index 89c7e8a..6d8414a 100644
--- a/environments/factory/base_factory.py
+++ b/environments/factory/base_factory.py
@@ -1,3 +1,4 @@
+import abc
 from typing import List, Union, Iterable
 
 import gym
@@ -61,17 +62,14 @@ class BaseFactory(gym.Env):
         self.allow_horizontal_movement = True
         self.allow_no_OP = True
         self._monitor_list = list()
-        self._registered_actions = self.movement_actions + int(self.allow_no_OP)
+        self._registered_actions = self.movement_actions + int(self.allow_no_OP) + self.register_additional_actions()
         self.level = h.one_hot_level(
             h.parse_level(Path(__file__).parent / h.LEVELS_DIR / f'{level}.txt')
         )
         self.slice_strings = {0: 'level', **{i: f'agent#{i}' for i in range(1, self.n_agents+1)}}
         self.reset()
 
-    def __init_subclass__(cls):
-        print(cls)
-
-    def register_additional_actions(self):
+    def register_additional_actions(self) -> int:
         raise NotImplementedError('Please register additional actions ')
 
     def reset(self) -> (np.ndarray, int, bool, dict):
@@ -111,6 +109,8 @@ class BaseFactory(gym.Env):
         agent_i_state = AgentState(agent_i, action)
         if self._is_moving_action(action):
             pos, valid = self.move_or_colide(agent_i, action)
+        elif self._is_no_op(action):
+            pos, valid = self.agent_i_position(agent_i), True
         else:
             pos, valid = self.additional_actions(agent_i, action)
         # Update state accordingly
@@ -129,10 +129,10 @@
         return self.state, self.cumulative_reward, self.done, info
 
     def _is_moving_action(self, action):
-        if action < self.movement_actions:
-            return True
-        else:
-            return False
+        return action < self.movement_actions
+
+    def _is_no_op(self, action):
+        return self.allow_no_OP and (action - self.movement_actions) == 0
 
     def check_all_collisions(self, agent_states: List[AgentState], collisions: int) -> np.ndarray:
         collision_vecs = np.zeros((len(agent_states), collisions))  # n_agents x n_slices
diff --git a/environments/factory/simple_factory.py b/environments/factory/simple_factory.py
index 26bdfec..3ad70ea 100644
--- a/environments/factory/simple_factory.py
+++ b/environments/factory/simple_factory.py
@@ -1,49 +1,157 @@
+from collections import OrderedDict
+from dataclasses import dataclass
+from typing import List
+import random
+
 import numpy as np
-from environments.factory.base_factory import BaseFactory, FactoryMonitor
+
+from environments.factory.base_factory import BaseFactory, AgentState
+from environments import helpers as h
+
+from environments.factory.renderer import Renderer
+from environments.factory.renderer import Entity
+from environments.logging.monitor import MonitorCallback
+
+DIRT_INDEX = -1
+
+
+@dataclass
+class DirtProperties:
+    clean_amount: float = 0.25    # dirt removed from a cell per clean-up action
+    max_spawn_ratio: float = 0.1  # upper bound on the fraction of free cells that receive dirt per spawn
+    gain_amount: float = 0.1      # dirt added to a cell per spawn
+    spawn_frequency: int = 5      # steps between dirt spawns
 
 
 class SimpleFactory(BaseFactory):
-    def __init__(self, *args, max_dirt=5, **kwargs):
-        self.max_dirt = max_dirt
+
+    def register_additional_actions(self) -> int:
+        # a single additional action: clean up the current cell
+        return 1
+
+    def _is_clean_up_action(self, action):
+        return self.action_space.n - 1 == action
+
+    def __init__(self, *args, dirt_properties: DirtProperties, **kwargs):
+        self._dirt_properties = dirt_properties
         super(SimpleFactory, self).__init__(*args, **kwargs)
         self.slice_strings.update({self.state.shape[0]-1: 'dirt'})
+        self.renderer = None  # expensive - don't use it when not required!
 
-    def spawn_dirt(self):
-        free_for_dirt = self.free_cells
-        for x, y in free_for_dirt[:self.max_dirt]:  # randomly distribute dirt across the grid
-            self.state[-1, x, y] = 1
+    def render(self):
+        if not self.renderer:  # lazy init
+            height, width = self.state.shape[1:]
+            self.renderer = Renderer(width, height, view_radius=2)
 
-    def reset(self):
-        state, r, done, _ = super().reset()
+        dirt = [Entity('dirt', [x, y], min(0.15+self.state[DIRT_INDEX, x, y], 1.5), 'scale')
+                for x, y in np.argwhere(self.state[DIRT_INDEX] > h.IS_FREE_CELL)]
+        walls = [Entity('wall', pos) for pos in np.argwhere(self.state[h.LEVEL_IDX] > h.IS_FREE_CELL)]
+
+        def asset_str(agent):
+            cols = ' '.join([self.slice_strings[j] for j in agent.collisions])
+            if 'agent' in cols:
+                return 'agent_collision'
+            elif not agent.action_valid or 'level' in cols:
+                return f'agent{agent.i + 1}violation'
+            elif self._is_clean_up_action(agent.action):
+                return f'agent{agent.i + 1}valid'
+            else:
+                return f'agent{agent.i + 1}'
+
+        agents = {f'agent{i+1}': [Entity(asset_str(agent), agent.pos)]
+                  for i, agent in enumerate(self.agent_states)}
+        self.renderer.render(OrderedDict(dirt=dirt, wall=walls, **agents))
+
+    def spawn_dirt(self) -> None:
+        free_for_dirt = self.free_cells(excluded_slices=DIRT_INDEX)
+
+        # randomly distribute dirt across the grid
+        n_dirt_tiles = int(random.uniform(0, self._dirt_properties.max_spawn_ratio) * len(free_for_dirt))
+        for x, y in free_for_dirt[:n_dirt_tiles]:
+            self.state[DIRT_INDEX, x, y] += self._dirt_properties.gain_amount
+
+    def clean_up(self, pos: (int, int)) -> ((int, int), bool):
+        new_dirt_amount = self.state[DIRT_INDEX][pos] - self._dirt_properties.clean_amount
+        cleanup_was_successful: bool
+        if self.state[DIRT_INDEX][pos] == h.IS_FREE_CELL:
+            cleanup_was_successful = False
+            return pos, cleanup_was_successful
+        else:
+            cleanup_was_successful = True
+            self.state[DIRT_INDEX][pos] = max(new_dirt_amount, h.IS_FREE_CELL)
+            return pos, cleanup_was_successful
+
+    def step(self, actions):
+        _, _, _, info = super(SimpleFactory, self).step(actions)
+        if not self.next_dirt_spawn:
+            self.spawn_dirt()
+            self.next_dirt_spawn = self._dirt_properties.spawn_frequency
+        else:
+            self.next_dirt_spawn -= 1
+        return self.state, self.cumulative_reward, self.done, info
+
+    def additional_actions(self, agent_i: int, action: int) -> ((int, int), bool):
+        if not self._is_moving_action(action):
+            if self._is_clean_up_action(action):
+                agent_i_pos = self.agent_i_position(agent_i)
+                _, valid = self.clean_up(agent_i_pos)
+                if valid:
+                    print(f'Agent {agent_i} did just clean up some dirt at {agent_i_pos}.')
+                    self.monitor.add('dirt_cleaned', self._dirt_properties.clean_amount)
+                else:
+                    print(f'Agent {agent_i} just tried to clean up some dirt at {agent_i_pos}, but was unsuccessful.')
+                    self.monitor.add('failed_cleanup_attempt', 1)
+                return agent_i_pos, valid
+            else:
+                raise RuntimeError(f'additional_actions received an unregistered action: {action}')
+        else:
+            raise RuntimeError('Movement actions are handled by the base class, not by additional_actions.')
+
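+    # Hedged sketch of the assumed action-index layout (the helper below is
+    # illustrative, its name is not from the original code): BaseFactory
+    # registers movement actions first, then the optional no-OP, then the
+    # single clean-up action registered above, so clean-up sits at index
+    # action_space.n - 1. A tiny naming helper for debugging/logging:
+    def _action_name(self, action: int) -> str:
+        if self._is_moving_action(action):
+            return 'move'
+        if self._is_no_op(action):
+            return 'no_op'
+        return 'clean_up' if self._is_clean_up_action(action) else 'unknown'
+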
+    def reset(self) -> (np.ndarray, int, bool, dict):
+        _ = super().reset()  # discard state, reward, done, info
         dirt_slice = np.zeros((1, *self.state.shape[1:]))
         self.state = np.concatenate((self.state, dirt_slice))  # dirt is now the last slice
         self.spawn_dirt()
-        # Always: This should return state
+        self.next_dirt_spawn = self._dirt_properties.spawn_frequency
         return self.state
 
-    def calculate_reward(self, agent_states):
+    def calculate_reward(self, agent_states: List[AgentState]) -> (int, dict):
+        # TODO: What reward to use?
+        current_dirt_amount = self.state[DIRT_INDEX].sum()
+        dirty_tiles = np.count_nonzero(self.state[DIRT_INDEX])  # number of dirty cells, not np.nonzero's tuple length
+
+        # NOTE: current_dirt_amount is a numpy scalar, so a bare division by it
+        # would yield inf/nan with a warning instead of raising
+        # ZeroDivisionError -- guard explicitly.
+        if current_dirt_amount > 0:
+            this_step_reward = -(dirty_tiles / current_dirt_amount)
+        else:
+            this_step_reward = 0
+
         for agent_state in agent_states:
             collisions = agent_state.collisions
-            entities = [self.slice_strings[entity] for entity in collisions]
-            if entities:
-                for entity in entities:
-                    self.monitor.add(f'agent_{agent_state.i}_collision_{entity}', 1)
-                print(f't = {self.steps}\tAgent {agent_state.i} has collisions with '
-                      f'{entities}')
-        return 0, {}
+            print(f't = {self.steps}\tAgent {agent_state.i} has collisions with '
+                  f'{[self.slice_strings[entity] for entity in collisions if entity != self.string_slices["dirt"]]}')
+            if self._is_clean_up_action(agent_state.action) and agent_state.action_valid:
+                this_step_reward += 1
+
+            for entity in collisions:
+                if entity != self.string_slices["dirt"]:
+                    self.monitor.add(f'agent_{agent_state.i}_vs_{self.slice_strings[entity]}', 1)
+        self.monitor.set('dirt_amount', current_dirt_amount)
+        self.monitor.set('dirty_tiles', dirty_tiles)
+        return this_step_reward, {}
 
 
 if __name__ == '__main__':
-    import random
-    factory = SimpleFactory(n_agents=1, max_dirt=8)
-    monitor_list = list()
-    for epoch in range(5):
-        random_actions = [random.randint(0, 7) for _ in range(200)]
-        state, r, done, _ = factory.reset()
-        for action in random_actions:
-            state, r, done, info = factory.step(action)
-        monitor_list.append(factory.monitor)
-
-        print(f'Factory run done, reward is:\n {r}')
-    print(f'There have been the following collisions: \n {dict(factory.monitor)}')
+    render = True
+
+    dirt_props = DirtProperties()
+    factory = SimpleFactory(n_agents=2, dirt_properties=dirt_props)
+    with MonitorCallback(factory):
+        for epoch in range(100):
+            random_actions = [(random.randint(0, 8), random.randint(0, 8)) for _ in range(200)]
+            env_state, reward, done_bool, _ = factory.reset()
+            for agent_i_action in random_actions:
+                env_state, reward, done_bool, info_obj = factory.step(agent_i_action)
+                if render:
+                    factory.render()
+                if done_bool:
+                    break
+            print(f'Factory run {epoch} done, reward is:\n {reward}')
diff --git a/environments/factory/simple_factory_getting_dirty.py b/environments/factory/simple_factory_getting_dirty.py
deleted file mode 100644
index 533522e..0000000
--- a/environments/factory/simple_factory_getting_dirty.py
+++ /dev/null
@@ -1,158 +0,0 @@
-from collections import OrderedDict
-from dataclasses import dataclass
-from typing import List
-import random
-
-import numpy as np
-
-from environments.factory.base_factory import BaseFactory, AgentState
-from environments import helpers as h
-
-from environments.factory.renderer import Renderer
-from environments.factory.renderer import Entity
-from environments.logging.monitor import MonitorCallback
-
-DIRT_INDEX = -1
-
-
-@dataclass
-class DirtProperties:
-    clean_amount = 0.25
-    max_spawn_ratio = 0.1
-    gain_amount = 0.1
-    spawn_frequency = 5
-
-
-class GettingDirty(BaseFactory):
-
-    def register_additional_actions(self):
-        self._registered_actions += 1
-        return True
-
-    def _is_clean_up_action(self, action):
-        return self.action_space.n - 1 == action
-
-    def __init__(self, *args, dirt_properties: DirtProperties, **kwargs):
-        self._dirt_properties = dirt_properties
-        super(GettingDirty, self).__init__(*args, **kwargs)
-        self.slice_strings.update({self.state.shape[0]-1: 'dirt'})
-        self.renderer = None  # expensive - dont use it when not required !
-
-    def render(self):
-        if not self.renderer:  # lazy init
-            height, width = self.state.shape[1:]
-            self.renderer = Renderer(width, height, view_radius=2)
-
-        dirt = [Entity('dirt', [x, y], min(0.15+self.state[DIRT_INDEX, x, y], 1.5), 'scale')
-                for x, y in np.argwhere(self.state[DIRT_INDEX] > h.IS_FREE_CELL)]
-        walls = [Entity('wall', pos) for pos in np.argwhere(self.state[h.LEVEL_IDX] > h.IS_FREE_CELL)]
-
-        def asset_str(agent):
-            cols = ' '.join([self.slice_strings[j] for j in agent.collisions])
-            if 'agent' in cols:
-                return 'agent_collision'
-            elif not agent.action_valid or 'level' in cols or 'agent' in cols:
-                return f'agent{agent.i + 1}violation'
-            elif self._is_clean_up_action(agent.action):
-                return f'agent{agent.i + 1}valid'
-            else:
-                return f'agent{agent.i + 1}'
-
-        agents = {f'agent{i+1}': [Entity(asset_str(agent), agent.pos)]
-                  for i, agent in enumerate(self.agent_states)}
-        self.renderer.render(OrderedDict(dirt=dirt, wall=walls, **agents))
-
-    def spawn_dirt(self) -> None:
-        free_for_dirt = self.free_cells(excluded_slices=DIRT_INDEX)
-
-        # randomly distribute dirt across the grid
-        n_dirt_tiles = int(random.uniform(0, self._dirt_properties.max_spawn_ratio) * len(free_for_dirt))
-        for x, y in free_for_dirt[:n_dirt_tiles]:
-            self.state[DIRT_INDEX, x, y] += self._dirt_properties.gain_amount
-
-    def clean_up(self, pos: (int, int)) -> ((int, int), bool):
-        new_dirt_amount = self.state[DIRT_INDEX][pos] - self._dirt_properties.clean_amount
-        cleanup_was_sucessfull: bool
-        if self.state[DIRT_INDEX][pos] == h.IS_FREE_CELL:
-            cleanup_was_sucessfull = False
-            return pos, cleanup_was_sucessfull
-        else:
-            cleanup_was_sucessfull = True
-            self.state[DIRT_INDEX][pos] = max(new_dirt_amount, h.IS_FREE_CELL)
-            return pos, cleanup_was_sucessfull
-
-    def step(self, actions):
-        _, _, _, info = super(GettingDirty, self).step(actions)
-        if not self.next_dirt_spawn:
-            self.spawn_dirt()
-            self.next_dirt_spawn = self._dirt_properties.spawn_frequency
-        else:
-            self.next_dirt_spawn -= 1
-        return self.state, self.cumulative_reward, self.done, info
-
-    def additional_actions(self, agent_i: int, action: int) -> ((int, int), bool):
-        if action != self._is_moving_action(action):
-            if self._is_clean_up_action(action):
-                agent_i_pos = self.agent_i_position(agent_i)
-                _, valid = self.clean_up(agent_i_pos)
-                if valid:
-                    print(f'Agent {agent_i} did just clean up some dirt at {agent_i_pos}.')
-                    self.monitor.add('dirt_cleaned', self._dirt_properties.clean_amount)
-                else:
-                    print(f'Agent {agent_i} just tried to clean up some dirt at {agent_i_pos}, but was unsucsessfull.')
-                    self.monitor.add('failed_cleanup_attempt', 1)
-                return agent_i_pos, valid
-            else:
-                raise RuntimeError('This should not happen!!!')
-        else:
-            raise RuntimeError('This should not happen!!!')
-
-    def reset(self) -> (np.ndarray, int, bool, dict):
-        _ = super().reset()  # state, reward, done, info ... =
-        dirt_slice = np.zeros((1, *self.state.shape[1:]))
-        self.state = np.concatenate((self.state, dirt_slice))  # dirt is now the last slice
-        self.spawn_dirt()
-        self.next_dirt_spawn = self._dirt_properties.spawn_frequency
-        return self.state
-
-    def calculate_reward(self, agent_states: List[AgentState]) -> (int, dict):
-        # TODO: What reward to use?
-        current_dirt_amount = self.state[DIRT_INDEX].sum()
-        dirty_tiles = len(np.nonzero(self.state[DIRT_INDEX]))
-
-        try:
-            this_step_reward = -(dirty_tiles / current_dirt_amount)
-        except ZeroDivisionError:
-            this_step_reward = 0
-
-        for agent_state in agent_states:
-            collisions = agent_state.collisions
-            print(f't = {self.steps}\tAgent {agent_state.i} has collisions with '
-                  f'{[self.slice_strings[entity] for entity in collisions if entity != self.string_slices["dirt"]]}')
-            if self._is_clean_up_action(agent_state.action) and agent_state.action_valid:
-                this_step_reward += 1
-
-            for entity in collisions:
-                if entity != self.string_slices["dirt"]:
-                    self.monitor.add(f'agent_{agent_state.i}_vs_{self.slice_strings[entity]}', 1)
-        self.monitor.set('dirt_amount', current_dirt_amount)
-        self.monitor.set('dirty_tiles', dirty_tiles)
-        return this_step_reward, {}
-
-
-if __name__ == '__main__':
-    render = True
-
-    dirt_props = DirtProperties()
-    factory = GettingDirty(n_agents=2, dirt_properties=dirt_props)
-    with MonitorCallback(factory):
-        for epoch in range(100):
-            random_actions = [(random.randint(0, 8), random.randint(0, 8)) for _ in range(200)]
-            env_state, reward, done_bool, _ = factory.reset()
-            for agent_i_action in random_actions:
-                env_state, reward, done_bool, info_obj = factory.step(agent_i_action)
-                if render:
-                    factory.render()
-                if done_bool:
-                    break
-            print(f'Factory run {epoch} done, reward is:\n {reward}')
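
Usage sketch (hedged): the subject line says "Stable Baseline Running", so the intended
consumer is presumably Stable Baselines. A minimal training loop, assuming
stable-baselines3, a single-agent configuration, and that SimpleFactory exposes valid
gym observation/action spaces; none of this is guaranteed by the patch itself, and
'MlpPolicy' plus the timestep budget are illustrative choices:

    from stable_baselines3 import PPO

    from environments.factory.simple_factory import DirtProperties, SimpleFactory

    # single agent, default dirt dynamics (assumed SB3-compatible setup)
    env = SimpleFactory(n_agents=1, dirt_properties=DirtProperties())
    model = PPO('MlpPolicy', env, verbose=1)
    model.learn(total_timesteps=10_000)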