rewards and monitors

2021-05-20 14:27:46 +02:00
parent 3114cdffc3
commit 962f914ff9
3 changed files with 17 additions and 18 deletions
--- a/environments/factory/simple_factory.py
+++ b/environments/factory/simple_factory.py
@@ -120,38 +120,38 @@ class SimpleFactory(BaseFactory):

        try:
            # penalty = current_dirt_amount
-            penalty = 0
+            reward = 0
        except (ZeroDivisionError, RuntimeWarning):
-            penalty = 0
-        inforcements = 0
+            reward = 0
+
        for agent_state in agent_states:
            cols = agent_state.collisions
            self.print(f't = {self.steps}\tAgent {agent_state.i} has collisions with '
                       f'{[self.slice_strings[entity] for entity in cols if entity != self.string_slices["dirt"]]}')
            if self._is_clean_up_action(agent_state.action):
                if agent_state.action_valid:
-                    inforcements += 10
+                    reward += 2
                    self.print(f'Agent {agent_state.i} did just clean up some dirt at {agent_state.pos}.')
                    self.monitor.add('dirt_cleaned', self._dirt_properties.clean_amount)
                else:
                    self.print(f'Agent {agent_state.i} just tried to clean up some dirt '
                               f'at {agent_state.pos}, but was unsucsessfull.')
                    self.monitor.add('failed_cleanup_attempt', 1)
+                    reward -= 0.05
            elif self._is_moving_action(agent_state.action):
                if not agent_state.action_valid:
-                    penalty += 10
+                    reward -= 0.1
                else:
-                    inforcements += 1
+                    reward += 0

            for entity in cols:
                if entity != self.string_slices["dirt"]:
                    self.monitor.add(f'agent_{agent_state.i}_vs_{self.slice_strings[entity]}', 1)

-        this_step_reward = max(0, inforcements-penalty)
        self.monitor.set('dirt_amount', current_dirt_amount)
        self.monitor.set('dirty_tiles', dirty_tiles)
-        self.print(f"reward is {this_step_reward}")
-        return this_step_reward, {}
+        self.print(f"reward is {reward}")
+        return reward, {}

    def print(self, string):
        if self.verbose:
@@ -166,7 +166,7 @@ if __name__ == '__main__':
    with MonitorCallback(factory):
        for epoch in range(100):
            random_actions = [(random.randint(0, 8), random.randint(0, 8)) for _ in range(200)]
-            env_state, reward, done_bool, _ = factory.reset()
+            env_state, this_reward, done_bool, _ = factory.reset()
            for agent_i_action in random_actions:
                env_state, reward, done_bool, info_obj = factory.step(agent_i_action)
                if render: