Mirror of https://github.com/illiumst/marl-factory-grid.git, synced 2026-01-15 23:41:39 +01:00
Included method to tackle emergence in two_rooms_one_door_modified + better access to the different settings in marl_adapted + added and modified many config files
@@ -13,9 +13,7 @@ from marl_factory_grid.algorithms.marl.base_a2c import PolicyGradient, cumulate_
 from marl_factory_grid.algorithms.marl.memory import MARLActorCriticMemory
 from marl_factory_grid.algorithms.utils import add_env_props, instantiate_class
 from pathlib import Path
-import pandas as pd
 from collections import deque
-from stable_baselines3 import PPO

 from marl_factory_grid.environment.actions import Noop
 from marl_factory_grid.modules import Clean, DoorUse
@@ -53,22 +51,25 @@ class A2C:
         self.factory = add_env_props(train_cfg)
         self.eval_factory = add_env_props(eval_cfg)
         self.__training = True
+        self.train_cfg = train_cfg
+        self.eval_cfg = eval_cfg
         self.cfg = train_cfg
         self.n_agents = train_cfg[nms.AGENT][nms.N_AGENTS]
         self.setup()
         self.reward_development = []
+        self.action_probabilities = {agent_idx:[] for agent_idx in range(self.n_agents)}

     def setup(self):
-        # act_dim=6 for dirt_quadrant
         dirt_piles_positions = [self.factory.state.entities['DirtPiles'][pile_idx].pos for pile_idx in
                                 range(len(self.factory.state.entities['DirtPiles']))]
         if self.cfg[nms.ALGORITHM]["pile-observability"] == "all":
             obs_dim = 2 + 2*len(dirt_piles_positions)
         else:
             obs_dim = 4
-        self.agents = [PolicyGradient(self.factory, agent_id=i, obs_dim=obs_dim) for i in range(self.n_agents)]
-        # self.agents[0].pi.load_model_parameters("/Users/julian/Coding/Projects/PyCharmProjects/EDYS/study_out/run5/Wolfgang_PolicyNet_model_parameters.pth")
-        self.doors_exist = "Doors" in self.factory.state.entities.keys()
+        self.obs_dim = obs_dim
+        self.act_dim = 4
+        # act_dim=4, because we want the agent to only learn a routing problem
+        self.agents = [PolicyGradient(self.factory, agent_id=i, obs_dim=obs_dim, act_dim=self.act_dim) for i in range(self.n_agents)]
         if self.cfg[nms.ENV]["save_and_log"]:
             # Create results folder
             runs = os.listdir("../study_out/")
@@ -79,6 +80,12 @@ class A2C:
             # Save settings in results folder
             self.save_configs()

+    def set_cfg(self, eval=False):
+        if eval:
+            self.cfg = self.eval_cfg
+        else:
+            self.cfg = self.train_cfg
+
     @classmethod
     def _as_torch(cls, x):
         if isinstance(x, np.ndarray):
@@ -249,10 +256,50 @@ class A2C:
             indices.append(list(range(start_index, end_index)))
             start_index = end_index

+        # Static form: auxiliary pile, primary pile, auxiliary pile, ...
+        # -> Starting with index 0 even piles are auxiliary piles, odd piles are primary piles
+        if self.cfg[nms.ALGORITHM]["auxiliary_piles"] and "Doors" in env.state.entities.keys():
+            door_positions = [door.pos for door in env.state.entities["Doors"]]
+            agent_positions = [env.state.moving_entites[agent_idx].pos for agent_idx in range(self.n_agents)]
+            distances = {door_pos:[] for door_pos in door_positions}
+
+            # Calculate distance of every agent to every door
+            for door_pos in door_positions:
+                for agent_pos in agent_positions:
+                    distances[door_pos].append(np.abs(door_pos[0] - agent_pos[0]) + np.abs(door_pos[1] - agent_pos[1]))
+
+            def duplicate_indices(lst, item):
+                return [i for i, x in enumerate(lst) if x == item]
+
+            # Get agent indices of agents with same distance to door
+            affected_agents = {door_pos:{} for door_pos in door_positions}
+            for door_pos in distances.keys():
+                dist = distances[door_pos]
+                dist_set = set(dist)
+                for d in dist_set:
+                    affected_agents[door_pos][str(d)] = duplicate_indices(dist, d)
+
+            # TODO: Make generic for multiple doors
+            updated_indices = []
+            if len(affected_agents[door_positions[0]]) == 0:
+                # Remove auxiliary piles for all agents
+                updated_indices = [[ele for ele in lst if ele % 2 != 0] for lst in indices]
+            else:
+                for distance, agent_indices in affected_agents[door_positions[0]].items():
+                    # Pick random agent to keep auxiliary pile and remove it for all others
+                    #selected_agent = np.random.choice(agent_indices)
+                    selected_agent = 0
+                    for agent_idx in agent_indices:
+                        if agent_idx == selected_agent:
+                            updated_indices.append(indices[agent_idx])
+                        else:
+                            updated_indices.append([ele for ele in indices[agent_idx] if ele % 2 != 0])
+
+            indices = updated_indices
+
         return indices

-    def update_target_pile(self, env, agent_idx, target_pile):
-        indices = self.distribute_indices(env)
+    def update_target_pile(self, env, agent_idx, target_pile, indices):
         if self.cfg[nms.ALGORITHM]["pile-order"] in ["fixed", "random", "none", "dynamic", "smart"]:
             if target_pile[agent_idx] + 1 < len(self.get_dirt_piles_positions(env)):
                 target_pile[agent_idx] += 1
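To make the auxiliary-pile handling above easier to follow, here is a small standalone sketch with hypothetical toy values (not part of the commit): even pile indices are treated as auxiliary piles, and every agent except the selected one keeps only its primary, odd-indexed piles.

    # Toy illustration of the pile-index filtering above (hypothetical values)
    indices = [[0, 1], [2, 3]]   # agent 0 -> piles 0 (auxiliary) and 1 (primary); agent 1 -> piles 2 and 3
    selected_agent = 0           # the one agent allowed to keep its auxiliary pile
    updated_indices = [lst if i == selected_agent else [ele for ele in lst if ele % 2 != 0]
                       for i, lst in enumerate(indices)]
    print(updated_indices)       # [[0, 1], [3]]
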
@@ -282,9 +329,7 @@ class A2C:
             if door := self.door_is_close(env, agent_idx):
                 if door.is_closed:
                     action.append(next(action_i for action_i, a in enumerate(env.state["Agent"][agent_idx].actions) if a.name == "use_door"))
-                    if not det:
-                        # Include agent experience entry manually
-                        agent._episode.append((None, None, None, agent.vf(agent_obs)))
+                    # Don't include action in agent experience
                 else:
                     if det:
                         action.append(int(agent.pi(agent_obs, det=True)[0]))
@@ -335,7 +380,7 @@ class A2C:
                     obs[0][1][x][y] = 1
                 print("Missing agent position")

-    def handle_dirt(self, env, cleaned_dirt_piles, ordered_dirt_piles, target_pile, reward, done):
+    def handle_dirt(self, env, cleaned_dirt_piles, ordered_dirt_piles, target_pile, indices, reward, done):
         # Check if agent moved on field with dirt. If that is the case collect dirt automatically
         agent_positions = [env.state.moving_entites[agent_idx].pos for agent_idx in range(self.n_agents)]
         dirt_piles_positions = self.get_dirt_piles_positions(env)
@@ -354,7 +399,7 @@ class A2C:

         # Only simulate collecting the dirt
         for idx, pos in enumerate(agent_positions):
-            if pos in self.get_dirt_piles_positions(env) and not cleaned_dirt_piles[idx][pos]:
+            if pos in cleaned_dirt_piles[idx].keys() and not cleaned_dirt_piles[idx][pos]:
                 # print(env.state.entities["Agent"][idx], pos, idx, target_pile, ordered_dirt_piles)
                 # If dirt piles should be cleaned in a specific order
                 if ordered_dirt_piles[idx]:
@@ -362,7 +407,7 @@ class A2C:
                     reward[idx] += 50 # 1
                     cleaned_dirt_piles[idx][pos] = True
                     # Set pointer to next dirt pile
-                    self.update_target_pile(env, idx, target_pile)
+                    self.update_target_pile(env, idx, target_pile, indices)
                     self.update_ordered_dirt_piles(idx, cleaned_dirt_piles, ordered_dirt_piles, env, target_pile)
                     if self.cfg[nms.ALGORITHM]["pile_all_done"] == "single":
                         done = True
@@ -370,13 +415,11 @@ class A2C:
                         # Reset cleaned_dirt_piles indicator
                         for pos in dirt_piles_positions:
                             cleaned_dirt_piles[idx][pos] = False
-                    break
                 else:
                     reward[idx] += 50 # 1
                     cleaned_dirt_piles[idx][pos] = True
-                    break

-        if self.cfg[nms.ALGORITHM]["pile_all_done"] == "all":
+        if self.cfg[nms.ALGORITHM]["pile_all_done"] in ["all", "distributed"]:
             if all([all(cleaned_dirt_piles[i].values()) for i in range(self.n_agents)]):
                 done = True

@@ -445,9 +488,10 @@ class A2C:
             env.render()
         n_steps, max_steps = [self.cfg[nms.ALGORITHM][k] for k in [nms.N_STEPS, nms.MAX_STEPS]]
         global_steps, episode = 0, 0
+        indices = self.distribute_indices(env)
         dirt_piles_positions = self.get_dirt_piles_positions(env)
         used_actions = {i:0 for i in range(len(env.state.entities["Agent"][0]._actions))} # Assume both agents have the same actions
-        target_pile = [partition[0] for partition in self.distribute_indices(env)] # pointer that points to the target pile for each agent. (point to same pile, point to different piles)
+        target_pile = [partition[0] for partition in indices] # pointer that points to the target pile for each agent. (point to same pile, point to different piles)
         cleaned_dirt_piles = [{pos: False for pos in dirt_piles_positions} for _ in range(self.n_agents)] # Have own dictionary for each agent

         while global_steps < max_steps:
@@ -457,7 +501,7 @@ class A2C:
             ordered_dirt_piles = self.get_ordered_dirt_piles(env, cleaned_dirt_piles, target_pile)
             # Reset current target pile at episode begin if all piles have to be cleaned in one episode
             if self.cfg[nms.ALGORITHM]["pile_all_done"] == "all":
-                target_pile = [partition[0] for partition in self.distribute_indices(env)]
+                target_pile = [partition[0] for partition in indices]
                 cleaned_dirt_piles = [{pos: False for pos in dirt_piles_positions} for _ in range(self.n_agents)]
             """passed_fields = [[] for _ in range(self.n_agents)]"""

@@ -476,7 +520,8 @@ class A2C:

             while not all(done):
                 # 0="North", 1="East", 2="South", 3="West", 4="Clean", 5="Noop"
-                action = self.use_door_or_move(env, obs, cleaned_dirt_piles, target_pile) if self.doors_exist else self.get_actions(obs)
+                action = self.use_door_or_move(env, obs, cleaned_dirt_piles, target_pile) \
+                    if "Doors" in env.state.entities.keys() else self.get_actions(obs)
                 used_actions[int(action[0])] += 1
                 _, next_obs, reward, done, info = env.step(action)
                 if done:
@@ -491,7 +536,7 @@ class A2C:
                 # with the updated observation. The observation that is saved to the rollout buffer, which resulted in reaching
                 # the target pile should not be updated before saving. Thus, the self.transform_observations call must happen
                 # before this method is called.
-                reward, done = self.handle_dirt(env, cleaned_dirt_piles, ordered_dirt_piles, target_pile, reward, done)
+                reward, done = self.handle_dirt(env, cleaned_dirt_piles, ordered_dirt_piles, target_pile, indices, reward, done)

                 if n_steps != 0 and (global_steps + 1) % n_steps == 0:
                     print("max_steps reached")
@@ -499,9 +544,11 @@ class A2C:

                 done = [done] * self.n_agents if isinstance(done, bool) else done
                 for ag_i, agent in enumerate(self.agents):
-                    # Add agent results into respective rollout buffers
-                    agent._episode[-1] = (next_obs[ag_i], action[ag_i], reward[ag_i], agent._episode[-1][-1])
+                    # For forced actions like door opening, we have to call the step function with this action, but
+                    # since we are not allowed to exceed the dimensions range, we can't log the corresponding step info.
+                    if action[ag_i] in range(self.act_dim):
+                        # Add agent results into respective rollout buffers
+                        agent._episode[-1] = (next_obs[ag_i], action[ag_i], reward[ag_i], agent._episode[-1][-1])

                 if self.cfg[nms.ENV][nms.TRAIN_RENDER]:
                     env.render()
@@ -522,7 +569,7 @@ class A2C:

         self.plot_reward_development()
         if self.cfg[nms.ENV]["save_and_log"]:
-            self.create_info_maps(env, used_actions, target_pile)
+            self.create_info_maps(env, used_actions)
             self.save_agent_models()


@@ -530,21 +577,29 @@ class A2C:
     @torch.inference_mode(True)
     def eval_loop(self, n_episodes, render=False):
         env = self.eval_factory
+        self.set_cfg(eval=True)
         if self.cfg[nms.ENV][nms.EVAL_RENDER]:
             env.render()
         episode, results = 0, []
         dirt_piles_positions = self.get_dirt_piles_positions(env)
-        target_pile = [partition[0] for partition in self.distribute_indices(env)] # pointer that points to the target pile for each agent. (point to same pile, point to different piles)
-        cleaned_dirt_piles = [{pos: False for pos in dirt_piles_positions} for _ in range(self.n_agents)]
+        indices = self.distribute_indices(env)
+        target_pile = [partition[0] for partition in indices] # pointer that points to the target pile for each agent. (point to same pile, point to different piles)
+        if self.cfg[nms.ALGORITHM]["pile_all_done"] == "distributed":
+            cleaned_dirt_piles = [{dirt_piles_positions[idx]: False for idx in indices[i]} for i in range(self.n_agents)]
+        else:
+            cleaned_dirt_piles = [{pos: False for pos in dirt_piles_positions} for _ in range(self.n_agents)]

         while episode < n_episodes:
             obs = env.reset()
             self.set_agent_spawnpoint(env)
             """obs = list(obs.values())"""
             # Reset current target pile at episode begin if all piles have to be cleaned in one episode
-            if self.cfg[nms.ALGORITHM]["pile_all_done"] == "all":
-                target_pile = [partition[0] for partition in self.distribute_indices(env)]
-                cleaned_dirt_piles = [{pos: False for pos in dirt_piles_positions} for _ in range(self.n_agents)]
+            if self.cfg[nms.ALGORITHM]["pile_all_done"] in ["all", "distributed"]:
+                target_pile = [partition[0] for partition in indices]
+                if self.cfg[nms.ALGORITHM]["pile_all_done"] == "distributed":
+                    cleaned_dirt_piles = [{dirt_piles_positions[idx]: False for idx in indices[i]} for i in range(self.n_agents)]
+                else:
+                    cleaned_dirt_piles = [{pos: False for pos in dirt_piles_positions} for _ in range(self.n_agents)]

             ordered_dirt_piles = self.get_ordered_dirt_piles(env, cleaned_dirt_piles, target_pile)

@@ -556,9 +611,9 @@ class A2C:
             self.factory.state['Agent'][i].actions.extend([Clean(), Noop()])"""

             while not all(done):
-                action = self.use_door_or_move(env, obs, cleaned_dirt_piles, target_pile, det=True) if self.doors_exist else self.execute_policy(obs, env, cleaned_dirt_piles) # zero exploration
-                print(action)
-                _, next_obs, reward, done, info = env.step(action)
+                action = self.use_door_or_move(env, obs, cleaned_dirt_piles, target_pile, det=True) \
+                    if "Doors" in env.state.entities.keys() else self.execute_policy(obs, env, cleaned_dirt_piles) # zero exploration
+                _, next_obs, reward, done, info = env.step(action) # Note that this call seems to flip the lists in indices
                 if done:
                     print("DoneAtMaxStepsReached:", len(self.agents[0]._episode))

@@ -566,7 +621,7 @@ class A2C:
                 # reward = self.reward_distance(env, obs, target_pile, reward)

                 # Check and handle if agent is on field with dirt
-                reward, done = self.handle_dirt(env, cleaned_dirt_piles, ordered_dirt_piles, target_pile, reward, done)
+                reward, done = self.handle_dirt(env, cleaned_dirt_piles, ordered_dirt_piles, target_pile, indices, reward, done)

                 # Get transformed next_obs that might have been updated because of self.handle_dirt.
                 # For eval, where pile_all_done is "all", it's mandatory that the potential change of the target pile
@@ -614,7 +669,7 @@ class A2C:
             self.agents[idx].pi.load_model_parameters(f"{run_path}/{agent_name}_PolicyNet_model_parameters.pth")
             self.agents[idx].vf.load_model_parameters(f"{run_path}/{agent_name}_ValueNet_model_parameters.pth")

-    def create_info_maps(self, env, used_actions, target_pile):
+    def create_info_maps(self, env, used_actions):
         # Create value map
         all_valid_observations = self.get_all_observations(env)
         dirt_piles_positions = self.get_dirt_piles_positions(env)
@@ -624,7 +679,7 @@ class A2C:
                               max(t[0] for t in env.state.entities.floorlist) + 2, max(t[1] for t in env.state.entities.floorlist) + 2)
         value_maps = [np.zeros(observations_shape) for _ in self.agents]
         likeliest_action = [np.full(observations_shape, np.NaN) for _ in self.agents]
-        action_probabilities = [np.zeros((observations_shape[0], observations_shape[1], env.action_space[0].n)) for
+        action_probabilities = [np.zeros((observations_shape[0], observations_shape[1], self.act_dim)) for
                                 _ in self.agents]
         for obs in all_valid_observations[obs_layer]:
             """obs = self._as_torch(obs).view(-1).to(torch.float32)"""
@@ -663,6 +718,7 @@ class A2C:
             txt_file.write("=======Action Probabilities=======\n")
             print("=======Action Probabilities=======")
             for agent_idx, pmap in enumerate(action_probabilities):
+                self.action_probabilities[agent_idx].append(pmap)
                 txt_file.write(f"Action probability map of agent {agent_idx} for target pile {pos}:\n")
                 print(f"Action probability map of agent {agent_idx} for target pile {pos}:")
                 for d in range(pmap.shape[0]):
@@ -25,8 +25,9 @@ algorithm:
   n_steps: 0 # How much experience should be sampled at most (n-TD) until the next value and policy update is performed. Default 0: MC
   max_steps: 200000
   advantage: "Advantage-AC" # Options: "Advantage-AC", "TD-Advantage-AC", "Reinforce"
-  pile-order: "dynamic" # Options: "fixed", "random", "none", "agents", "dynamic", "smart" (Use "fixed", "random" and "none" for single agent training and the other for multi agent inference)
+  pile-order: "dynamic" # Use "dynamic" to see emergent phenomenon and "smart" to prevent it
   pile-observability: "single" # Options: "single", "all"
   pile_all_done: "all" # Options: "single", "all" ("single" for training, "all" for eval)
+  auxiliary_piles: False # Option that is only considered when pile-order = "agents"
   chunk-episode: 20000 # Chunk size. (0 = update networks with full episode at once)

@@ -0,0 +1,34 @@
+agent:
+  classname: marl_factory_grid.algorithms.marl.networks.RecurrentAC
+  n_agents: 2
+  obs_emb_size: 96
+  action_emb_size: 16
+  hidden_size_actor: 64
+  hidden_size_critic: 64
+  use_agent_embedding: False
+env:
+  classname: marl_factory_grid.configs.custom
+  env_name: "custom/two_rooms_one_door_modified_train_config"
+  n_agents: 2
+  max_steps: 250
+  pomdp_r: 2
+  stack_n_frames: 0
+  individual_rewards: True
+  train_render: False
+  eval_render: True
+  save_and_log: False
+method: marl_factory_grid.algorithms.marl.LoopSEAC
+algorithm:
+  gamma: 0.99
+  entropy_coef: 0.01
+  vf_coef: 0.05
+  n_steps: 0 # How much experience should be sampled at most (n-TD) until the next value and policy update is performed. Default 0: MC
+  max_steps: 260000
+  advantage: "Advantage-AC" # Options: "Advantage-AC", "TD-Advantage-AC", "Reinforce"
+  pile-order: "agents" # Options: "fixed", "random", "none", "agents", "dynamic", "smart" (Use "fixed", "random" and "none" for single agent training and the other for multi agent inference)
+  pile-observability: "single" # Options: "single", "all"
+  pile_all_done: "distributed" # Options: "single", "all" ("single" for training, "all" and "distributed" for eval)
+  auxiliary_piles: True # Use True to see emergent phenomenon and False to prevent it
+  chunk-episode: 20000 # Chunk size. (0 = update networks with full episode at once)
+
+
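For orientation, a minimal usage sketch for a config like the new file above; the path, run names, and eval config name are assumptions borrowed from the main-script helpers later in this commit, not something the config file defines itself.

    import copy
    from pathlib import Path

    from marl_factory_grid.algorithms.marl.a2c_dirt import A2C
    from marl_factory_grid.algorithms.utils import load_yaml_file

    # Assumed location, mirroring the MultiAgentConfigs layout used in the main script
    cfg_path = Path('../marl_factory_grid/algorithms/marl/configs/MultiAgentConfigs/two_rooms_one_door_modified_config.yaml')
    train_cfg = load_yaml_file(cfg_path)
    eval_cfg = copy.deepcopy(train_cfg)
    eval_cfg["env"]["env_name"] = "custom/MultiAgentConfigs/two_rooms_one_door_modified_eval_config"
    eval_cfg["algorithm"]["auxiliary_piles"] = True   # False reproduces the emergent blocking behaviour

    agent = A2C(train_cfg, eval_cfg)
    agent.load_agents(["run2", "run3"])               # pre-trained single-agent runs, as in the helpers below
    agent.eval_loop(10)
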
@@ -16,7 +16,7 @@ env:
   individual_rewards: True
   train_render: False
   eval_render: True
-  save_and_log: False
+  save_and_log: True
 method: marl_factory_grid.algorithms.marl.LoopSEAC
 algorithm:
   gamma: 0.99
@@ -28,5 +28,6 @@ algorithm:
   pile-order: "fixed" # Options: "fixed", "random", "none", "agents", "dynamic", "smart" (Use "fixed", "random" and "none" for single agent training and the other for multi agent inference)
   pile-observability: "single" # Options: "single", "all"
   pile_all_done: "single" # Options: "single", "all" ("single" for training, "all" for eval)
+  auxiliary_piles: False # Option that is only considered when pile-order = "agents"
   chunk-episode: 20000 # Chunk size. (0 = update networks with full episode at once)

@@ -1,3 +1,5 @@
 marl_factory_grid>environment>rules.py#SpawnEntity.on_reset()
 marl_factory_grid>environment>rewards.py
 marl_factory_grid>modules>clean_up>groups.py#DirtPiles.trigger_spawn()
+marl_factory_grid>environment>rules.py#AgentSpawnRule
+marl_factory_grid>utils>states.py#GameState.__init__()
@@ -1,6 +1,6 @@
 agent:
   classname: marl_factory_grid.algorithms.marl.networks.RecurrentAC
-  n_agents: 2
+  n_agents: 1
   obs_emb_size: 96
   action_emb_size: 16
   hidden_size_actor: 64
@@ -8,21 +8,27 @@ agent:
   use_agent_embedding: False
 env:
   classname: marl_factory_grid.configs.custom
-  env_name: "custom/two_rooms_one_door_modified_random_pos"
-  n_agents: 2
+  env_name: "custom/two_rooms_one_door_modified_train_config"
+  n_agents: 1
   max_steps: 250
   pomdp_r: 2
   stack_n_frames: 0
   individual_rewards: True
   train_render: False
   eval_render: True
+  save_and_log: False
 method: marl_factory_grid.algorithms.marl.LoopSEAC
 algorithm:
   gamma: 0.99
   entropy_coef: 0.01
   vf_coef: 0.05
   n_steps: 0 # How much experience should be sampled at most (n-TD) until the next value and policy update is performed. Default 0: MC
-  max_steps: 100000
-  advantage: "TD-Advantage-AC" # Options: "Advantage-AC", "TD-Advantage-AC", "Reinforce"
-  pile-order: "agents" # Options: "fixed", "random", "none", "agents"
+  max_steps: 260000
+  advantage: "Advantage-AC" # Options: "Advantage-AC", "TD-Advantage-AC", "Reinforce"
+  pile-order: "fixed" # Options: "fixed", "random", "none", "agents", "dynamic", "smart" (Use "fixed", "random" and "none" for single agent training and the other for multi agent inference)
+  pile-observability: "single" # Options: "single", "all"
+  pile_all_done: "single" # Options: "single", "all" ("single" for training, "all" for eval)
+  auxiliary_piles: False # Option that is only considered when pile-order = "agents"
+  chunk-episode: 20000 # Chunk size. (0 = update networks with full episode at once)

@@ -19,31 +19,21 @@ Agents:
   Sigmund:
     Actions:
       - Move4
-      #- Clean
       - Noop
     Observations:
-      # - Walls
-      # - Other
       - DirtPiles
       - Self
     Positions:
       - (9,1)
-      #- (9,9)
-      #- (4,5)
   Wolfgang:
     Actions:
       - Move4
-      #- Clean
       - Noop
     Observations:
-      # - Walls
-      # - Other
       - DirtPiles
       - Self
     Positions:
       - (9,5)
-      #- (9,9)
-      #- (4,5)

 Entities:
   DirtPiles:
@@ -22,8 +22,6 @@ Agents:
       #- Clean
       #- Noop
     Observations:
-      # - Walls
-      # - Other
       - DirtPiles
       - Self
     Positions:
@@ -39,8 +37,6 @@ Agents:
       #- Clean
       #- Noop
     Observations:
-      # - Walls
-      # - Other
       - DirtPiles
       - Self
     Positions:
@@ -14,40 +14,30 @@ General:
 # In "two rooms one door" scenario 2 agents spawn in 2 different rooms that are connected by a single door. Their aim
 # is to reach the destination in the room they didn't spawn in leading to a conflict at the door.
 Agents:
-  Wolfgang:
-    Actions:
-      - Move8
-      - DoorUse
-      - Noop
-    Observations:
-      - DirtPiles
-      - Self
-    #Positions:
-      #- (1,1)
-      #- (2,1)
-      #- (3,1)
-      #- (4,1)
-      #- (5,1)
-      #- (6,1)
   Sigmund:
     Actions:
-      - Move8
+      - Move4
       - DoorUse
       - Noop
     Observations:
       - DirtPiles
       - Self
-    #Positions:
-      #- (1,13)
-      #- (2,13)
-      #- (3,13)
-      #- (4,13)
-      #- (5,13)
-      #- (6,13)
+    Positions:
+      - (3,1)
+  Wolfgang:
+    Actions:
+      - Move4
+      - DoorUse
+      - Noop
+    Observations:
+      - DirtPiles
+      - Self
+    Positions:
+      - (3,13)

 Entities:
   DirtPiles:
-    coords_or_quantity: (3,12), (3,2) # This order is required, because agent 0 needs to reach (3, 12) and agent 1 (3, 2)
+    coords_or_quantity: (2,1), (3,12), (2,13), (3,2) # Static form: auxiliary pile, primary pile, auxiliary pile, ...
     initial_amount: 0.5 # <1 to ensure that the robot which first attempts to clean this field, can remove the dirt in one action
     clean_amount: 1
     dirt_spawn_r_var: 0
@@ -58,8 +48,8 @@ Entities:

 Rules:
   # Environment Dynamics
-  DoorAutoClose:
-    close_frequency: 10
+  #DoorAutoClose:
+    #close_frequency: 10

   # Utilities
   # This rule defines the collision mechanic, introduces a related DoneCondition and lets you specify rewards.
@@ -14,19 +14,19 @@ General:
 # In "two rooms one door" scenario 2 agents spawn in 2 different rooms that are connected by a single door. Their aim
 # is to reach the destination in the room they didn't spawn in leading to a conflict at the door.
 Agents:
-  Wolfgang:
+  Sigmund:
     Actions:
-      - Move8
+      - Move4
       - DoorUse
       - Noop
     Observations:
       - DirtPiles
       - Self
     Positions:
-      - (3,1) # Agent spawnpoint
-  Sigmund:
+      - (3,1)
+  Wolfgang:
     Actions:
-      - Move8
+      - Move4
       - DoorUse
       - Noop
     Observations:
@@ -37,7 +37,7 @@ Agents:

 Entities:
   DirtPiles:
-    coords_or_quantity: (3,12), (3,2)
+    coords_or_quantity: (3,12), (3,2) # Static form: auxiliary pile, primary pile, auxiliary pile, ...
     initial_amount: 0.5 # <1 to ensure that the robot which first attempts to clean this field, can remove the dirt in one action
     clean_amount: 1
     dirt_spawn_r_var: 0
@@ -48,8 +48,8 @@ Entities:

 Rules:
   # Environment Dynamics
-  DoorAutoClose:
-    close_frequency: 10
+  #DoorAutoClose:
+    #close_frequency: 10

   # Utilities
   # This rule defines the collision mechanic, introduces a related DoneCondition and lets you specify rewards.
@@ -58,5 +58,5 @@ Rules:

   # Done Conditions
   #DoneOnAllDirtCleaned:
-  #DoneAtMaxStepsReached:
-    #max_steps: 100
+  #DoneAtMaxStepsReached: # Maybe required since door blocking will result in infinite loop
+    #max_steps: 1000
@@ -19,11 +19,8 @@ Agents:
   #Sigmund:
     #Actions:
       #- Move4
-      #- Clean
       #- Noop
     #Observations:
-      # - Walls
-      # - Other
       #- DirtPiles
       #- Self
     #Positions:
@@ -33,17 +30,13 @@ Agents:
   Wolfgang:
     Actions:
       - Move4
-      #- Clean
-      #- Noop
     Observations:
-      # - Walls
-      # - Other
       - DirtPiles
       - Self
     Positions:
       - (9,5)
-      #- (9,9)
-      #- (4,5)
+      - (9,9)
+      - (4,5)

 Entities:
   DirtPiles:
@@ -19,11 +19,7 @@ Agents:
   #Sigmund:
     #Actions:
       #- Move4
-      #- Clean
-      #- Noop
     #Observations:
-      # - Walls
-      # - Other
       #- DirtPiles
       #- Self
     #Positions:
@@ -36,11 +32,7 @@ Agents:
   Wolfgang:
     Actions:
       - Move4
-      #- Clean
-      #- Noop
     Observations:
-      # - Walls
-      # - Other
       - DirtPiles
       - Self
     Positions:
@@ -0,0 +1,62 @@
+General:
+  env_seed: 69
+  # Individual vs global rewards
+  individual_rewards: true
+  # The level.txt file to load from marl_factory_grid/levels
+  level_name: two_rooms_modified
+  # View Radius; 0 = full observability
+  pomdp_r: 0
+  # Print all messages and events
+  verbose: false
+  # Run tests
+  tests: false
+
+# In "two rooms one door" scenario 2 agents spawn in 2 different rooms that are connected by a single door. Their aim
+# is to reach the destination in the room they didn't spawn in leading to a conflict at the door.
+Agents:
+  #Sigmund:
+    #Actions:
+      #- Move4
+      #- DoorUse
+    #Observations:
+      #- DirtPiles
+      #- Self
+    #Positions:
+      #- (3,1)
+      #- (2,1)
+  Wolfgang:
+    Actions:
+      - Move4
+      - DoorUse
+    Observations:
+      - DirtPiles
+      - Self
+    Positions:
+      - (3,13)
+      - (2,13)
+
+Entities:
+  DirtPiles:
+    coords_or_quantity: (2,13), (3,2) # (2,1), (3,12)
+    initial_amount: 0.5 # <1 to ensure that the robot which first attempts to clean this field, can remove the dirt in one action
+    clean_amount: 1
+    dirt_spawn_r_var: 0
+    max_global_amount: 12
+    max_local_amount: 1
+
+  Doors: { }
+
+Rules:
+  # Environment Dynamics
+  #DoorAutoClose:
+    #close_frequency: 10
+
+  # Utilities
+  # This rule defines the collision mechanic, introduces a related DoneCondition and lets you specify rewards.
+  WatchCollisions:
+    done_at_collisions: false
+
+  # Done Conditions
+  #DoneOnAllDirtCleaned:
+  #DoneAtMaxStepsReached: # Maybe required since door blocking will result in infinite loop
+    #max_steps: 1000
@@ -0,0 +1,75 @@
+General:
+  env_seed: 69
+  # Individual vs global rewards
+  individual_rewards: true
+  # The level.txt file to load from marl_factory_grid/levels
+  level_name: two_rooms_modified
+  # View Radius; 0 = full observability
+  pomdp_r: 0
+  # Print all messages and events
+  verbose: false
+  # Run tests
+  tests: false
+
+# In "two rooms one door" scenario 2 agents spawn in 2 different rooms that are connected by a single door. Their aim
+# is to reach the destination in the room they didn't spawn in leading to a conflict at the door.
+Agents:
+  #Sigmund:
+    #Actions:
+      #- Move4
+    #Observations:
+      #- DirtPiles
+      #- Self
+    #Positions:
+      #- (3,1)
+      #- (1,1)
+      #- (3,1)
+      #- (5,1)
+      #- (3,1)
+      #- (1,8)
+      #- (3,1)
+      #- (5,8)
+  Wolfgang:
+    Actions:
+      - Move4
+    Observations:
+      - DirtPiles
+      - Self
+    Positions:
+      - (3,13)
+      - (2,13)
+      - (1,13)
+      - (3,13)
+      - (1,8)
+      - (2,6)
+      - (3,10)
+      - (4,6)
+
+Entities:
+  DirtPiles:
+    coords_or_quantity: (2,13), (3,2) # (2,1), (3,12)
+    initial_amount: 0.5 # <1 to ensure that the robot which first attempts to clean this field, can remove the dirt in one action
+    clean_amount: 1
+    dirt_spawn_r_var: 0
+    max_global_amount: 12
+    max_local_amount: 1
+
+  #Doors: { }
+
+Rules:
+  # Environment Dynamics
+  #DoorAutoClose:
+    #close_frequency: 10
+
+  # Utilities
+  # This rule defines the collision mechanic, introduces a related DoneCondition and lets you specify rewards.
+  WatchCollisions:
+    done_at_collisions: false
+
+  # Done Conditions
+  DoneOnAllDirtCleaned:
+  #DoneAtMaxStepsReached:
+    #max_steps: 100
+
+  AgentSpawnRule:
+    spawn_rule: "order"
@@ -3,13 +3,13 @@ from pathlib import Path
 from marl_factory_grid.algorithms.marl.a2c_dirt import A2C
 from marl_factory_grid.algorithms.utils import load_yaml_file

-def dirt_quadrant_single_agent_training():
-    cfg_path = Path('../marl_factory_grid/algorithms/marl/configs/dirt_quadrant_config.yaml')
+def single_agent_training(config_name):
+    cfg_path = Path(f'../marl_factory_grid/algorithms/marl/configs/{config_name}_config.yaml')

     train_cfg = load_yaml_file(cfg_path)
     # Use environment config with fixed spawnpoints for eval
     eval_cfg = copy.deepcopy(train_cfg)
-    eval_cfg["env"]["env_name"] = "custom/dirt_quadrant_eval_config"
+    eval_cfg["env"]["env_name"] = f"custom/{config_name}_eval_config"

     print("Training phase")
     agent = A2C(train_cfg, eval_cfg)
@@ -17,22 +17,81 @@ def dirt_quadrant_single_agent_training():
     print("Evaluation phase")
     # Have consecutive episode for eval in single agent case
     train_cfg["algorithm"]["pile_all_done"] = "all"
-    # agent.load_agents(["run0", "run1"])
     agent.eval_loop(10)
+    print(agent.action_probabilities)


-def dirt_quadrant_multi_agent_eval():
-    cfg_path = Path('../marl_factory_grid/algorithms/marl/configs/MultiAgentConfigs/dirt_quadrant_config.yaml')
+def single_agent_eval(config_name, run):
+    cfg_path = Path(f'../marl_factory_grid/algorithms/marl/configs/{config_name}_config.yaml')

     train_cfg = load_yaml_file(cfg_path)
     # Use environment config with fixed spawnpoints for eval
     eval_cfg = copy.deepcopy(train_cfg)
-    eval_cfg["env"]["env_name"] = "custom/MultiAgentConfigs/dirt_quadrant_eval_config"
+    eval_cfg["env"]["env_name"] = f"custom/{config_name}_eval_config"
     agent = A2C(train_cfg, eval_cfg)
     print("Evaluation phase")
-    agent.load_agents(["run0", "run1"])
+    agent.load_agents(run)
     agent.eval_loop(10)


+def multi_agent_eval(config_name, runs, emergent_phenomenon=False):
+    cfg_path = Path(f'../marl_factory_grid/algorithms/marl/configs/MultiAgentConfigs/{config_name}_config.yaml')
+
+    train_cfg = load_yaml_file(cfg_path)
+    # Use environment config with fixed spawnpoints for eval
+    eval_cfg = copy.deepcopy(train_cfg)
+    eval_cfg["env"]["env_name"] = f"custom/MultiAgentConfigs/{config_name}_eval_config"
+    # Sanity setting of required attributes and configs
+    if config_name == "two_rooms_one_door_modified":
+        if emergent_phenomenon:
+            eval_cfg["env"]["env_name"] = f"custom/MultiAgentConfigs/{config_name}_eval_config_emergent"
+            eval_cfg["algorithm"]["auxiliary_piles"] = False
+        else:
+            eval_cfg["algorithm"]["auxiliary_piles"] = True
+    elif config_name == "dirt_quadrant":
+        if emergent_phenomenon:
+            eval_cfg["algorithm"]["pile-order"] = "dynamic"
+        else:
+            eval_cfg["algorithm"]["pile-order"] = "smart"
+    agent = A2C(train_cfg, eval_cfg)
+    print("Evaluation phase")
+    agent.load_agents(runs)
+    agent.eval_loop(10)
+
+
+def dirt_quadrant_single_agent_training():
+    single_agent_training("dirt_quadrant")
+
+
+def two_rooms_one_door_modified_single_agent_training():
+    single_agent_training("two_rooms_one_door_modified")
+
+
+def dirt_quadrant_single_agent_eval(agent_name):
+    if agent_name == "Sigmund":
+        run = "run0"
+    elif agent_name == "Wolfgang":
+        run = "run4"
+    single_agent_eval("dirt_quadrant", [run])
+
+
+def two_rooms_one_door_modified_single_agent_eval(agent_name):
+    if agent_name == "Sigmund":
+        run = "run2"
+    elif agent_name == "Wolfgang":
+        run = "run3"
+    single_agent_eval("two_rooms_one_door_modified", [run])
+
+
+def dirt_quadrant_multi_agent_eval(emergent_phenomenon):
+    multi_agent_eval("dirt_quadrant", ["run0", "run1"], emergent_phenomenon)
+
+
+def two_rooms_one_door_modified_multi_agent_eval(emergent_phenomenon):
+    multi_agent_eval("two_rooms_one_door_modified", ["run2", "run3"], emergent_phenomenon)
+
+
 if __name__ == '__main__':
     dirt_quadrant_single_agent_training()
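As a closing usage note, the new wrappers make the emergence comparison a one-line switch; a hypothetical driver (assuming the referenced run folders exist in study_out) could look like this:

    # Reproduce the emergent door-blocking behaviour ...
    two_rooms_one_door_modified_multi_agent_eval(emergent_phenomenon=True)
    # ... and compare against the auxiliary-pile mitigation
    two_rooms_one_door_modified_multi_agent_eval(emergent_phenomenon=False)
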