Mirror of https://github.com/illiumst/marl-factory-grid.git, synced 2025-07-11 23:42:40 +02:00
Added shared piles option to dirt_quadrant eval + Changed dirt_quadrant layout and adapted configs
@@ -383,6 +383,14 @@ class A2C:
                     obs[0][1][x][y] = 1
                     print("Missing agent position")

+    def get_all_cleaned_dirt_piles(self, dirt_piles_positions, cleaned_dirt_piles):
+        meta_cleaned_dirt_piles = {pos: False for pos in dirt_piles_positions}
+        for agent_idx in range(self.n_agents):
+            for (pos, cleaned) in cleaned_dirt_piles[agent_idx].items():
+                if cleaned:
+                    meta_cleaned_dirt_piles[pos] = True
+        return meta_cleaned_dirt_piles
+
     def handle_dirt(self, env, cleaned_dirt_piles, ordered_dirt_piles, target_pile, indices, reward, done):
         # Check if agent moved on field with dirt. If that is the case collect dirt automatically
         agent_positions = [env.state.moving_entites[agent_idx].pos for agent_idx in range(self.n_agents)]
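The new get_all_cleaned_dirt_piles helper merges the per-agent cleaned-pile dictionaries into a single view in which a pile counts as cleaned if any agent has cleaned it. A minimal standalone sketch of that merge semantics follows; the pile positions and dictionary shapes are hypothetical, inferred from the diff rather than taken from the repository:

    def get_all_cleaned_dirt_piles(dirt_piles_positions, cleaned_dirt_piles, n_agents):
        # Every pile starts out as "not cleaned".
        meta_cleaned_dirt_piles = {pos: False for pos in dirt_piles_positions}
        # A pile counts as cleaned as soon as ANY agent has cleaned it.
        for agent_idx in range(n_agents):
            for pos, cleaned in cleaned_dirt_piles[agent_idx].items():
                if cleaned:
                    meta_cleaned_dirt_piles[pos] = True
        return meta_cleaned_dirt_piles

    # Example: two agents, three piles; each agent has cleaned a different pile.
    piles = [(1, 1), (2, 3), (4, 0)]
    per_agent = [
        {(1, 1): True, (2, 3): False, (4, 0): False},   # agent 0
        {(1, 1): False, (2, 3): True, (4, 0): False},   # agent 1
    ]
    merged = get_all_cleaned_dirt_piles(piles, per_agent, n_agents=2)
    assert merged == {(1, 1): True, (2, 3): True, (4, 0): False}
    assert not all(merged.values())  # (4, 0) is still dirty, so the episode continues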
@@ -427,12 +435,7 @@ class A2C:
                     done = True
             elif self.cfg[nms.ALGORITHM]["pile_all_done"] == "shared":
                 # End episode if both agents together have cleaned all dirt piles
-                meta_cleaned_dirt_piles = {pos: False for pos in dirt_piles_positions}
-                for agent_idx in range(self.n_agents):
-                    for (pos, cleaned) in cleaned_dirt_piles[agent_idx].items():
-                        if cleaned:
-                            meta_cleaned_dirt_piles[pos] = True
-                if all(meta_cleaned_dirt_piles.values()):
+                if all(self.get_all_cleaned_dirt_piles(dirt_piles_positions, cleaned_dirt_piles).values()):
                     done = True

         return reward, done
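With the helper in place, this hunk collapses the six inlined merge lines into a single call, so the "shared" termination mode reads as one condition: end the episode once the agents together have cleaned every pile. A hedged sketch of that check, reusing the sketch helper and example data above; only the pile_all_done key and the "shared" option appear in the diff, and the flat cfg layout here is an assumption (the repo goes through self.cfg[nms.ALGORITHM]):

    # Assumed cfg layout for illustration only.
    cfg = {"algorithm": {"pile_all_done": "shared"}}

    done = False
    if cfg["algorithm"]["pile_all_done"] == "shared":
        # Episode ends once the union of both agents' cleaned piles covers all piles.
        merged = get_all_cleaned_dirt_piles(piles, per_agent, n_agents=2)
        if all(merged.values()):
            done = True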
@@ -24,7 +24,7 @@ algorithm:
   entropy_coef: 0.01
   vf_coef: 0.05
   n_steps: 0 # How much experience should be sampled at most (n-TD) until the next value and policy update is performed. Default 0: MC
-  max_steps: 270000
+  max_steps: 240000
   advantage: "Advantage-AC" # Options: "Advantage-AC", "TD-Advantage-AC", "Reinforce"
   pile-order: "fixed" # Options: "fixed", "random", "none", "agents", "dynamic", "smart" (Use "fixed", "random" and "none" for single agent training and the other for multi agent inference)
   pile-observability: "single" # Options: "single", "all"
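For completeness, a sketch of how the adjusted algorithm section might be read at runtime with PyYAML. The file name and exact nesting are assumptions; only the keys and values shown in the hunk come from the diff:

    import yaml

    # Hypothetical path; the actual config file name is not shown in this diff.
    with open("dirt_quadrant_config.yaml") as f:
        cfg = yaml.safe_load(f)

    algo = cfg["algorithm"]
    print(algo["max_steps"])           # 240000 after this commit (was 270000)
    print(algo["pile-order"])          # "fixed", used for single-agent training
    print(algo["pile-observability"])  # "single" (vs. "all")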