Updated pomdp_r comment + Added some additional comments + Restructured experiment calling + Added Readme and requirements.txt

Julian Schönberger
2024-05-27 18:23:11 +02:00
parent 41a1ec0a5b
commit a0852e805a
23 changed files with 327 additions and 369 deletions

View File

@@ -0,0 +1,73 @@
from pathlib import Path

from marl_factory_grid.algorithms.rl.a2c_dirt import A2C
from marl_factory_grid.algorithms.utils import load_yaml_file


####### Training routines ######
def rerun_dirt_quadrant_agent1_training():
    train_cfg_path = Path(f'./marl_factory_grid/algorithms/rl/single_agent_configs/dirt_quadrant_train_config.yaml')
    eval_cfg_path = Path(f'./marl_factory_grid/algorithms/rl/single_agent_configs/dirt_quadrant_eval_config.yaml')
    train_cfg = load_yaml_file(train_cfg_path)
    eval_cfg = load_yaml_file(eval_cfg_path)
    print("Training phase")
    agent = A2C(train_cfg, eval_cfg)
    agent.train_loop()
    print("Evaluation phase")
    agent.eval_loop(n_episodes=1)


def two_rooms_training(max_steps, agent_name):
    train_cfg_path = Path(f'./marl_factory_grid/algorithms/rl/single_agent_configs/two_rooms_train_config.yaml')
    eval_cfg_path = Path(f'./marl_factory_grid/algorithms/rl/single_agent_configs/two_rooms_eval_config.yaml')
    train_cfg = load_yaml_file(train_cfg_path)
    eval_cfg = load_yaml_file(eval_cfg_path)
    train_cfg["algorithm"]["max_steps"] = max_steps
    train_cfg["env"]["env_name"] = f"rl/two_rooms_{agent_name}_train_config"
    eval_cfg["env"]["env_name"] = f"rl/two_rooms_{agent_name}_eval_config"
    print("Training phase")
    agent = A2C(train_cfg, eval_cfg)
    agent.train_loop()
    print("Evaluation phase")
    agent.eval_loop(n_episodes=1)


def rerun_two_rooms_agent1_training():
    two_rooms_training(max_steps=190000, agent_name="agent1")


def rerun_two_rooms_agent2_training():
    two_rooms_training(max_steps=260000, agent_name="agent2")


####### Eval routines ########
def single_agent_eval(config_name, run_folder_name):
    eval_cfg_path = Path(f'./marl_factory_grid/algorithms/rl/single_agent_configs/{config_name}_eval_config.yaml')
    train_cfg = eval_cfg = load_yaml_file(eval_cfg_path)
    # A value for train_cfg is required, but the train environment won't be used
    agent = A2C(train_cfg=train_cfg, eval_cfg=eval_cfg)
    print("Evaluation phase")
    agent.load_agents([run_folder_name])
    agent.eval_loop(1)


def multi_agent_eval(config_name, runs, emergent_phenomenon=False):
    eval_cfg_path = Path(f'./marl_factory_grid/algorithms/rl/multi_agent_configs/{config_name}' +
                         f'_eval_config{"_emergent" if emergent_phenomenon else ""}.yaml')
    eval_cfg = load_yaml_file(eval_cfg_path)
    # A value for train_cfg is required, but the train environment won't be used
    agent = A2C(train_cfg=eval_cfg, eval_cfg=eval_cfg)
    print("Evaluation phase")
    agent.load_agents(runs)
    agent.eval_loop(1)


def dirt_quadrant_multi_agent_rl_eval(emergent_phenomenon):
    multi_agent_eval("dirt_quadrant", ["run0", "run0"], emergent_phenomenon)


def two_rooms_multi_agent_rl_eval(emergent_phenomenon):
    multi_agent_eval("two_rooms", ["run1", "run2"], emergent_phenomenon)

View File

@@ -2,13 +2,14 @@ import os
import torch
from typing import Union, List
import numpy as np
from tqdm import tqdm
from marl_factory_grid.algorithms.rl.base_a2c import PolicyGradient, cumulate_discount
from marl_factory_grid.algorithms.rl.constants import Names
from marl_factory_grid.algorithms.rl.utils import transform_observations, _as_torch, door_is_close, \
from marl_factory_grid.algorithms.rl.utils import transform_observations, _as_torch, is_door_close, \
get_dirt_piles_positions, update_target_pile, update_ordered_dirt_piles, get_all_cleaned_dirt_piles, \
distribute_indices, set_agent_spawnpoint, get_ordered_dirt_piles, handle_finished_episode, save_configs, \
save_agent_models, get_all_observations
distribute_indices, set_agents_spawnpoints, get_ordered_dirt_piles, handle_finished_episode, save_configs, \
save_agent_models, get_all_observations, get_agents_positions
from marl_factory_grid.algorithms.utils import add_env_props
from marl_factory_grid.utils.plotting.plot_single_runs import plot_action_maps, plot_reward_development, \
create_info_maps
@@ -28,93 +29,88 @@ class A2C:
self.n_agents = train_cfg[nms.ENV][nms.N_AGENTS]
self.setup()
self.reward_development = []
self.action_probabilities = {agent_idx:[] for agent_idx in range(self.n_agents)}
self.action_probabilities = {agent_idx: [] for agent_idx in range(self.n_agents)}
def setup(self):
dirt_piles_positions = [self.factory.state.entities[nms.DIRT_PILES][pile_idx].pos for pile_idx in
range(len(self.factory.state.entities[nms.DIRT_PILES]))]
self.obs_dim = 2 + 2*len(dirt_piles_positions) if self.cfg[nms.ALGORITHM][nms.PILE_OBSERVABILITY] == nms.ALL else 4
""" Initialize agents and create entry for run results according to configuration """
self.obs_dim = 2 + 2 * len(get_dirt_piles_positions(self.factory)) if self.cfg[nms.ALGORITHM][
nms.PILE_OBSERVABILITY] == nms.ALL else 4
self.act_dim = 4 # The 4 movement directions
self.agents = [PolicyGradient(self.factory, agent_id=i, obs_dim=self.obs_dim, act_dim=self.act_dim) for i in range(self.n_agents)]
self.agents = [PolicyGradient(self.factory, agent_id=i, obs_dim=self.obs_dim, act_dim=self.act_dim) for i in
range(self.n_agents)]
if self.cfg[nms.ENV][nms.SAVE_AND_LOG]:
# Create results folder
runs = os.listdir("../study_out/")
runs = os.listdir("./study_out/")
run_numbers = [int(run[3:]) for run in runs if run[:3] == "run"]
next_run_number = max(run_numbers)+1 if run_numbers else 0
self.results_path = f"../study_out/run{next_run_number}"
next_run_number = max(run_numbers) + 1 if run_numbers else 0
self.results_path = f"./study_out/run{next_run_number}"
os.mkdir(self.results_path)
# Save settings in results folder
save_configs(self.results_path, self.cfg, self.factory.conf, self.eval_factory.conf)
def set_cfg(self, eval=False):
""" Set the mode of the current configuration """
if eval:
self.cfg = self.eval_cfg
else:
self.cfg = self.train_cfg
def load_agents(self, runs_list):
""" Initialize networks with parameters of already trained agents """
for idx, run in enumerate(runs_list):
run_path = f"../study_out/{run}"
run_path = f"./study_out/{run}"
self.agents[idx].pi.load_model_parameters(f"{run_path}/PolicyNet_model_parameters.pth")
self.agents[idx].vf.load_model_parameters(f"{run_path}/ValueNet_model_parameters.pth")
@torch.no_grad()
def train_loop(self):
""" Function for training agents """
env = self.factory
n_steps, max_steps = [self.cfg[nms.ALGORITHM][k] for k in [nms.N_STEPS, nms.MAX_STEPS]]
global_steps, episode = 0, 0
indices = distribute_indices(env, self.cfg, self.n_agents)
dirt_piles_positions = get_dirt_piles_positions(env)
used_actions = {i:0 for i in range(len(env.state.entities[nms.AGENT][0]._actions))} # Assume both agents have the same actions
target_pile = [partition[0] for partition in indices] # pointer that points to the target pile for each agent. (point to same pile, point to different piles)
cleaned_dirt_piles = [{pos: False for pos in dirt_piles_positions} for _ in range(self.n_agents)] # Have own dictionary for each agent
target_pile = [partition[0] for partition in
indices] # list of pointers that point to the current target pile for each agent
cleaned_dirt_piles = [{pos: False for pos in dirt_piles_positions} for _ in range(self.n_agents)]
pbar = tqdm(total=max_steps)
while global_steps < max_steps:
print(global_steps)
obs = env.reset()
_ = env.reset()
if self.cfg[nms.ENV][nms.TRAIN_RENDER]:
env.render()
set_agent_spawnpoint(env, self.n_agents)
set_agents_spawnpoints(env, self.n_agents)
ordered_dirt_piles = get_ordered_dirt_piles(env, cleaned_dirt_piles, self.cfg, self.n_agents)
# Reset current target pile at episode begin if all piles have to be cleaned in one episode
if self.cfg[nms.ALGORITHM][nms.PILE_ALL_DONE] == nms.ALL:
target_pile = [partition[0] for partition in indices]
cleaned_dirt_piles = [{pos: False for pos in dirt_piles_positions} for _ in range(self.n_agents)]
# Supply each agent with its local observation
obs = transform_observations(env, ordered_dirt_piles, target_pile, self.cfg, self.n_agents)
done, rew_log = [False] * self.n_agents, 0
print("Agents spawnpoints:", [env.state.moving_entites[agent_idx].pos for agent_idx in range(self.n_agents)])
print("Agents target piles:", target_pile)
print("Agents initial observation:", obs)
print("Agents cleaned dirt piles:", cleaned_dirt_piles)
done, rew_log = [False] * self.n_agents, 0
while not all(done):
# 0="North", 1="East", 2="South", 3="West", 4="Clean", 5="Noop"
action = self.use_door_or_move(env, obs, cleaned_dirt_piles) \
if nms.DOORS in env.state.entities.keys() else self.get_actions(obs)
used_actions[int(action[0])] += 1
_, next_obs, reward, done, info = env.step(action)
if done:
print("DoneAtMaxStepsReached:", len(self.agents[0]._episode))
next_obs = transform_observations(env, ordered_dirt_piles, target_pile, self.cfg, self.n_agents)
reward, done = self.handle_dirt(env, cleaned_dirt_piles, ordered_dirt_piles, target_pile, indices, reward, done)
# Handle case where agent is on field with dirt
reward, done = self.handle_dirt(env, cleaned_dirt_piles, ordered_dirt_piles, target_pile, indices,
reward, done)
if n_steps != 0 and (global_steps + 1) % n_steps == 0:
print("max_steps reached")
done = True
if n_steps != 0 and (global_steps + 1) % n_steps == 0: done = True
done = [done] * self.n_agents if isinstance(done, bool) else done
for ag_i, agent in enumerate(self.agents):
# For forced actions like door opening, we still have to call the step function with this action, but
# since the action index lies outside the policy's action space (act_dim), we can't log the corresponding step info.
if action[ag_i] in range(self.act_dim):
# Add agent results into respective rollout buffers
agent._episode[-1] = (next_obs[ag_i], action[ag_i], reward[ag_i], agent._episode[-1][-1])
if self.cfg[nms.ENV][nms.TRAIN_RENDER]:
env.render()
# Visualize state update
if self.cfg[nms.ENV][nms.TRAIN_RENDER]: env.render()
obs = next_obs
@@ -123,97 +119,93 @@ class A2C:
global_steps += 1
rew_log += sum(reward)
if global_steps >= max_steps:
break
if global_steps >= max_steps: break
print(f'reward at episode: {episode} = {rew_log}')
self.reward_development.append(rew_log)
episode += 1
pbar.update(global_steps - pbar.n)
plot_reward_development(self.reward_development, self.cfg, self.results_path)
pbar.close()
if self.cfg[nms.ENV][nms.SAVE_AND_LOG]:
create_info_maps(env, used_actions, get_all_observations(env, self.cfg, self.n_agents),
plot_reward_development(self.reward_development, self.results_path)
create_info_maps(env, get_all_observations(env, self.cfg, self.n_agents),
get_dirt_piles_positions(env), self.results_path, self.agents, self.act_dim, self)
save_agent_models(self.results_path, self.agents)
plot_action_maps(env, [self], self.results_path)
@torch.inference_mode(True)
def eval_loop(self, n_episodes, render=False):
def eval_loop(self, n_episodes):
""" Function for performing inference """
env = self.eval_factory
self.set_cfg(eval=True)
episode, results = 0, []
dirt_piles_positions = get_dirt_piles_positions(env)
indices = distribute_indices(env, self.cfg, self.n_agents)
target_pile = [partition[0] for partition in indices] # pointer that points to the target pile for each agent. (point to same pile/ point to different piles)
target_pile = [partition[0] for partition in
indices] # list of pointers that point to the current target pile for each agent
if self.cfg[nms.ALGORITHM][nms.PILE_ALL_DONE] == nms.DISTRIBUTED:
cleaned_dirt_piles = [{dirt_piles_positions[idx]: False for idx in indices[i]} for i in range(self.n_agents)]
else:
cleaned_dirt_piles = [{pos: False for pos in dirt_piles_positions} for _ in range(self.n_agents)]
cleaned_dirt_piles = [{dirt_piles_positions[idx]: False for idx in indices[i]} for i in
range(self.n_agents)]
else: cleaned_dirt_piles = [{pos: False for pos in dirt_piles_positions} for _ in range(self.n_agents)]
while episode < n_episodes:
obs = env.reset()
set_agent_spawnpoint(env, self.n_agents)
_ = env.reset()
set_agents_spawnpoints(env, self.n_agents)
if self.cfg[nms.ENV][nms.EVAL_RENDER]:
# Don't render auxiliary piles
if self.cfg[nms.ALGORITHM][nms.AUXILIARY_PILES]:
# Don't render auxiliary piles
auxiliary_piles = [pile for idx, pile in enumerate(env.state.entities[nms.DIRT_PILES]) if idx % 2 == 0]
auxiliary_piles = [pile for idx, pile in enumerate(env.state.entities[nms.DIRT_PILES]) if
idx % 2 == 0]
for pile in auxiliary_piles:
pile.set_new_amount(0)
env.render()
env._renderer.fps = 5 # Slow down agent movement
env._renderer.fps = 5 # Slow down agent movement
# Reset current target pile at episode begin if all piles have to be cleaned in one episode
if self.cfg[nms.ALGORITHM][nms.PILE_ALL_DONE] in [nms.ALL, nms.DISTRIBUTED, nms.SHARED]:
target_pile = [partition[0] for partition in indices]
if self.cfg[nms.ALGORITHM][nms.PILE_ALL_DONE] == nms.DISTRIBUTED:
cleaned_dirt_piles = [{dirt_piles_positions[idx]: False for idx in indices[i]} for i in range(self.n_agents)]
else:
cleaned_dirt_piles = [{pos: False for pos in dirt_piles_positions} for _ in range(self.n_agents)]
cleaned_dirt_piles = [{dirt_piles_positions[idx]: False for idx in indices[i]} for i in
range(self.n_agents)]
else: cleaned_dirt_piles = [{pos: False for pos in dirt_piles_positions} for _ in range(self.n_agents)]
ordered_dirt_piles = get_ordered_dirt_piles(env, cleaned_dirt_piles, self.cfg, self.n_agents)
# Supply each agent with its local observation
obs = transform_observations(env, ordered_dirt_piles, target_pile, self.cfg, self.n_agents)
done, rew_log, eps_rew = [False] * self.n_agents, 0, torch.zeros(self.n_agents)
while not all(done):
action = self.use_door_or_move(env, obs, cleaned_dirt_piles, det=True) \
if nms.DOORS in env.state.entities.keys() else self.execute_policy(obs, env, cleaned_dirt_piles) # zero exploration
_, next_obs, reward, done, info = env.step(action) # Note that this call seems to flip the lists in indices
if done:
print("DoneAtMaxStepsReached:", len(self.agents[0]._episode))
if nms.DOORS in env.state.entities.keys() else self.execute_policy(obs, env,
cleaned_dirt_piles) # zero exploration
_, next_obs, reward, done, info = env.step(action)
# Add small negative reward if agent has moved away from the target_pile
# reward = self.reward_distance(env, obs, target_pile, reward)
# Handle case where agent is on field with dirt
reward, done = self.handle_dirt(env, cleaned_dirt_piles, ordered_dirt_piles, target_pile, indices,
reward, done)
# Check and handle if agent is on field with dirt
reward, done = self.handle_dirt(env, cleaned_dirt_piles, ordered_dirt_piles, target_pile, indices, reward, done)
# Get transformed next_obs that might have been updated because of self.handle_dirt.
# For eval, where pile_all_done is "all", the potential change of the target pile caused by self.handle_dirt
# must already be reflected in the observation when the next action is calculated.
# Get transformed next_obs that might have been updated because of handle_dirt
next_obs = transform_observations(env, ordered_dirt_piles, target_pile, self.cfg, self.n_agents)
done = [done] * self.n_agents if isinstance(done, bool) else done
if self.cfg[nms.ENV][nms.EVAL_RENDER]:
env.render()
if self.cfg[nms.ENV][nms.EVAL_RENDER]: env.render()
obs = next_obs
episode += 1
########## Helper functions ########
def get_actions(self, observations) -> ListOrTensor:
# Given an observation, get actions for both agents
""" Given local observations, get actions for both agents """
actions = [agent.step(_as_torch(observations[ag_i]).view(-1).to(torch.float32)) for ag_i, agent in
enumerate(self.agents)]
return actions
def execute_policy(self, observations, env, cleaned_dirt_piles) -> ListOrTensor:
# Use deterministic policy for inference
""" Execute agent policies deterministically for inference """
actions = [agent.policy(_as_torch(observations[ag_i]).view(-1).to(torch.float32)) for ag_i, agent in
enumerate(self.agents)]
for agent_idx in range(self.n_agents):
@@ -224,10 +216,11 @@ class A2C:
return actions
def use_door_or_move(self, env, obs, cleaned_dirt_piles, det=False):
""" Function that handles automatic actions like door opening and forced Noop"""
action = []
for agent_idx, agent in enumerate(self.agents):
agent_obs = _as_torch((obs)[agent_idx]).view(-1).to(torch.float32)
# If agent already reached its target
# Use Noop operation if agent already reached its target. (Only relevant for two-rooms setting)
if all(cleaned_dirt_piles[agent_idx].values()):
action.append(next(action_i for action_i, a in enumerate(env.state[nms.AGENT][agent_idx].actions) if
a.name == nms.NOOP))
@@ -235,37 +228,33 @@ class A2C:
# Include agent experience entry manually
agent._episode.append((None, None, None, agent.vf(agent_obs)))
else:
if door := door_is_close(env, agent_idx):
if door := is_door_close(env, agent_idx):
if door.is_closed:
action.append(next(
action_i for action_i, a in enumerate(env.state[nms.AGENT][agent_idx].actions) if
a.name == nms.USE_DOOR))
# Don't include action in agent experience
else:
if det:
action.append(int(agent.pi(agent_obs, det=True)[0]))
else:
action.append(int(agent.step(agent_obs)))
if det: action.append(int(agent.pi(agent_obs, det=True)[0]))
else: action.append(int(agent.step(agent_obs)))
else:
if det:
action.append(int(agent.pi(agent_obs, det=True)[0]))
else:
action.append(int(agent.step(agent_obs)))
if det: action.append(int(agent.pi(agent_obs, det=True)[0]))
else: action.append(int(agent.step(agent_obs)))
return action
def handle_dirt(self, env, cleaned_dirt_piles, ordered_dirt_piles, target_pile, indices, reward, done):
# Check if agent moved on field with dirt. If that is the case collect dirt automatically
agent_positions = [env.state.moving_entites[agent_idx].pos for agent_idx in range(self.n_agents)]
""" Check if agent moved on field with dirt. If that is the case collect dirt automatically """
agents_positions = get_agents_positions(env, self.n_agents)
dirt_piles_positions = get_dirt_piles_positions(env)
if any([True for pos in agent_positions if pos in dirt_piles_positions]):
if any([True for pos in agents_positions if pos in dirt_piles_positions]):
# Only simulate collecting the dirt
for idx, pos in enumerate(agent_positions):
for idx, pos in enumerate(agents_positions):
if pos in cleaned_dirt_piles[idx].keys() and not cleaned_dirt_piles[idx][pos]:
# If dirt piles should be cleaned in a specific order
if ordered_dirt_piles[idx]:
if pos == ordered_dirt_piles[idx][target_pile[idx]]:
reward[idx] += 50 # 1
reward[idx] += 50
cleaned_dirt_piles[idx][pos] = True
# Set pointer to next dirt pile
update_target_pile(env, idx, target_pile, indices, self.cfg)
@@ -278,7 +267,7 @@ class A2C:
for pos in dirt_piles_positions:
cleaned_dirt_piles[idx][pos] = False
else:
reward[idx] += 50 # 1
reward[idx] += 50
cleaned_dirt_piles[idx][pos] = True
# Indicate that renderer can hide dirt pile
@@ -294,4 +283,3 @@ class A2C:
done = True
return reward, done
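
To make the bookkeeping that train_loop, eval_loop and handle_dirt share easier to follow, here is a minimal, self-contained sketch of the same pointer mechanics (the pile coordinates, the two-agent split and the simplified pointer update are invented for illustration, not taken from the repository):

# Illustrative sketch of the per-agent bookkeeping used above; all values are made up.
dirt_piles_positions = [(1, 1), (4, 2), (7, 5), (2, 8)]
indices = [[0, 1], [2, 3]]                              # distribute_indices: piles 0/1 -> agent 0, piles 2/3 -> agent 1
target_pile = [partition[0] for partition in indices]   # one pointer per agent, here [0, 2]
cleaned_dirt_piles = [{pos: False for pos in dirt_piles_positions} for _ in range(2)]

# Agent 0 reaches its current target pile (1, 1): mark it as cleaned and, similar to update_target_pile,
# advance the pointer to the next pile of its partition (simplified here).
cleaned_dirt_piles[0][(1, 1)] = True
if target_pile[0] + 1 in indices[0]:
    target_pile[0] += 1                                 # agent 0 now targets pile index 1, i.e. (4, 2)

# Simplified end-of-episode check: every entry in every agent's dictionary is marked as cleaned.
done = all(all(cleaned.values()) for cleaned in cleaned_dirt_piles)
print(target_pile, done)                                # -> [1, 2] False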

View File

@@ -10,6 +10,7 @@ from marl_factory_grid.algorithms.rl.constants import Names
nms = Names
def _as_torch(x):
""" Helper function to convert different list types to a torch tensor """
if isinstance(x, np.ndarray):
return torch.from_numpy(x)
elif isinstance(x, List):
@@ -20,15 +21,16 @@ def _as_torch(x):
def transform_observations(env, ordered_dirt_piles, target_pile, cfg, n_agents):
""" Requires that agent has observations -DirtPiles and -Self """
agent_positions = [env.state.moving_entites[agent_idx].pos for agent_idx in range(n_agents)]
""" Function that extracts local observations from global state
Requires that agents have observations -DirtPiles and -Self (cf. environment configs) """
agents_positions = get_agents_positions(env, n_agents)
pile_observability_is_all = cfg[nms.ALGORITHM][nms.PILE_OBSERVABILITY] == nms.ALL
if pile_observability_is_all:
trans_obs = [torch.zeros(2+2*len(ordered_dirt_piles[0])) for _ in range(len(agent_positions))]
trans_obs = [torch.zeros(2+2*len(ordered_dirt_piles[0])) for _ in range(len(agents_positions))]
else:
# Only show current target pile
trans_obs = [torch.zeros(4) for _ in range(len(agent_positions))]
for i, pos in enumerate(agent_positions):
trans_obs = [torch.zeros(4) for _ in range(len(agents_positions))]
for i, pos in enumerate(agents_positions):
agent_x, agent_y = pos[0], pos[1]
trans_obs[i][0] = agent_x
trans_obs[i][1] = agent_y
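
For illustration (coordinates invented; the remaining entries are filled in by the elided rest of this function), the flat observation vector built here looks roughly like this:

import torch

# One agent at (4, 4), dirt piles at (2, 3) and (5, 1); purely illustrative values.
# pile_observability == "all": obs_dim = 2 + 2 * n_piles -> [agent_x, agent_y, pile1_x, pile1_y, pile2_x, pile2_y]
obs_all = torch.tensor([4., 4., 2., 3., 5., 1.])
# otherwise only the current target pile is appended: obs_dim = 4 -> [agent_x, agent_y, target_x, target_y]
obs_single = torch.tensor([4., 4., 5., 1.])
print(obs_all.shape, obs_single.shape)  # torch.Size([6]) torch.Size([4])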
@@ -45,6 +47,7 @@ def transform_observations(env, ordered_dirt_piles, target_pile, cfg, n_agents):
def get_all_observations(env, cfg, n_agents):
""" Helper function that returns all possible agent observations """
dirt_piles_positions = [env.state.entities[nms.DIRT_PILES][pile_idx].pos for pile_idx in
range(len(env.state.entities[nms.DIRT_PILES]))]
if cfg[nms.ALGORITHM][nms.PILE_OBSERVABILITY] == nms.ALL:
@@ -76,41 +79,48 @@ def get_all_observations(env, cfg, n_agents):
def get_dirt_piles_positions(env):
""" Get positions of dirt piles on the map """
return [env.state.entities[nms.DIRT_PILES][pile_idx].pos for pile_idx in range(len(env.state.entities[nms.DIRT_PILES]))]
def get_agents_positions(env, n_agents):
""" Get positions of agents on the map """
return [env.state.moving_entites[agent_idx].pos for agent_idx in range(n_agents)]
def get_ordered_dirt_piles(env, cleaned_dirt_piles, cfg, n_agents):
""" Each agent can have its individual pile order """
""" This function determines in which order the agents should clean the dirt piles
Each agent can have its individual pile order """
ordered_dirt_piles = [[] for _ in range(n_agents)]
dirt_pile_positions = get_dirt_piles_positions(env)
agent_positions = [env.state.moving_entites[agent_idx].pos for agent_idx in range(n_agents)]
dirt_piles_positions = get_dirt_piles_positions(env)
agents_positions = get_agents_positions(env, n_agents)
for agent_idx in range(n_agents):
if cfg[nms.ALGORITHM][nms.PILE_ORDER] in [nms.FIXED, nms.AGENTS]:
ordered_dirt_piles[agent_idx] = dirt_pile_positions
ordered_dirt_piles[agent_idx] = dirt_piles_positions
elif cfg[nms.ALGORITHM][nms.PILE_ORDER] in [nms.SMART, nms.DYNAMIC]:
# Calculate distances for remaining unvisited dirt piles
remaining_target_piles = [pos for pos, value in cleaned_dirt_piles[agent_idx].items() if not value]
pile_distances = {pos:0 for pos in remaining_target_piles}
agent_pos = agent_positions[agent_idx]
agent_pos = agents_positions[agent_idx]
for pos in remaining_target_piles:
pile_distances[pos] = np.abs(agent_pos[0] - pos[0]) + np.abs(agent_pos[1] - pos[1])
if cfg[nms.ALGORITHM][nms.PILE_ORDER] == nms.SMART:
# Check if there is an agent in line with any of the remaining dirt piles
# Check if there is an agent on the direct path to any of the remaining dirt piles
for pile_pos in remaining_target_piles:
for other_pos in agent_positions:
for other_pos in agents_positions:
if other_pos != agent_pos:
if agent_pos[0] == other_pos[0] == pile_pos[0] or agent_pos[1] == other_pos[1] == pile_pos[1]:
# Get the line between the agent and the goal
# Get the line between the agent and the target
path = bresenham(agent_pos[0], agent_pos[1], pile_pos[0], pile_pos[1])
# Check if the entity lies on the path between the agent and the goal
# Check if the entity lies on the path between the agent and the target
if other_pos in path:
pile_distances[pile_pos] += np.abs(agent_pos[0] - other_pos[0]) + np.abs(agent_pos[1] - other_pos[1])
sorted_pile_distances = dict(sorted(pile_distances.items(), key=lambda item: item[1]))
# Insert already visited dirt piles
ordered_dirt_piles[agent_idx] = [pos for pos in dirt_pile_positions if pos not in remaining_target_piles]
ordered_dirt_piles[agent_idx] = [pos for pos in dirt_piles_positions if pos not in remaining_target_piles]
# Fill up with sorted positions
for pos in sorted_pile_distances.keys():
ordered_dirt_piles[agent_idx].append(pos)
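
The SMART branch above relies on bresenham(x0, y0, x1, y1), defined further down in this file outside the visible hunks, to return the grid cells on the straight line between an agent and a pile. A generic sketch of such a helper and of the blocking check, for illustration only; the repository's implementation may differ in details:

def bresenham_sketch(x0, y0, x1, y1):
    """ Generic integer Bresenham line: all grid cells from (x0, y0) to (x1, y1), inclusive. """
    cells = []
    dx, dy = abs(x1 - x0), -abs(y1 - y0)
    sx = 1 if x0 < x1 else -1
    sy = 1 if y0 < y1 else -1
    err = dx + dy
    while True:
        cells.append((x0, y0))
        if x0 == x1 and y0 == y1:
            break
        e2 = 2 * err
        if e2 >= dy:
            err += dy
            x0 += sx
        if e2 <= dx:
            err += dx
            y0 += sy
    return cells


# Blocking check as in the SMART ordering: another agent at (3, 2) sits on the straight path
# from the agent at (1, 2) to the pile at (6, 2), so this pile's distance gets penalised.
path = bresenham_sketch(1, 2, 6, 2)
print((3, 2) in path)  # True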
@@ -145,6 +155,7 @@ def bresenham(x0, y0, x1, y1):
def update_ordered_dirt_piles(agent_idx, cleaned_dirt_piles, ordered_dirt_piles, env, cfg, n_agents):
""" Update the order of the remaining dirt piles """
# Only update ordered_dirt_pile for agent that reached its target pile
updated_ordered_dirt_piles = get_ordered_dirt_piles(env, cleaned_dirt_piles, cfg, n_agents)
for i in range(len(ordered_dirt_piles[agent_idx])):
@@ -152,8 +163,10 @@ def update_ordered_dirt_piles(agent_idx, cleaned_dirt_piles, ordered_dirt_piles,
def distribute_indices(env, cfg, n_agents):
""" Distribute dirt piles evenly among the agents """
indices = []
n_dirt_piles = len(get_dirt_piles_positions(env))
agents_positions = get_agents_positions(env, n_agents)
if n_dirt_piles == 1 or cfg[nms.ALGORITHM][nms.PILE_ORDER] in [nms.FIXED, nms.DYNAMIC, nms.SMART]:
indices = [[0] for _ in range(n_agents)]
else:
@@ -171,12 +184,11 @@ def distribute_indices(env, cfg, n_agents):
# -> Starting with index 0 even piles are auxiliary piles, odd piles are primary piles
if cfg[nms.ALGORITHM][nms.AUXILIARY_PILES] and nms.DOORS in env.state.entities.keys():
door_positions = [door.pos for door in env.state.entities[nms.DOORS]]
agent_positions = [env.state.moving_entites[agent_idx].pos for agent_idx in range(n_agents)]
distances = {door_pos:[] for door_pos in door_positions}
# Calculate distance of every agent to every door
for door_pos in door_positions:
for agent_pos in agent_positions:
for agent_pos in agents_positions:
distances[door_pos].append(np.abs(door_pos[0] - agent_pos[0]) + np.abs(door_pos[1] - agent_pos[1]))
def duplicate_indices(lst, item):
@@ -213,6 +225,7 @@ def distribute_indices(env, cfg, n_agents):
def update_target_pile(env, agent_idx, target_pile, indices, cfg):
""" Get the next target pile for a given agent """
if cfg[nms.ALGORITHM][nms.PILE_ORDER] in [nms.FIXED, nms.DYNAMIC, nms.SMART]:
if target_pile[agent_idx] + 1 < len(get_dirt_piles_positions(env)):
target_pile[agent_idx] += 1
@@ -223,7 +236,8 @@ def update_target_pile(env, agent_idx, target_pile, indices, cfg):
target_pile[agent_idx] += 1
def door_is_close(env, agent_idx):
def is_door_close(env, agent_idx):
""" Checks whether the agent is close to a door """
neighbourhood = [y for x in env.state.entities.neighboring_positions(env.state[nms.AGENT][agent_idx].pos)
for y in env.state.entities.pos_dict[x] if nms.DOOR in y.name]
if neighbourhood:
@@ -231,6 +245,7 @@ def door_is_close(env, agent_idx):
def get_all_cleaned_dirt_piles(dirt_piles_positions, cleaned_dirt_piles, n_agents):
""" Returns all dirt piles cleaned by any agent """
meta_cleaned_dirt_piles = {pos: False for pos in dirt_piles_positions}
for agent_idx in range(n_agents):
for (pos, cleaned) in cleaned_dirt_piles[agent_idx].items():
@@ -240,6 +255,7 @@ def get_all_cleaned_dirt_piles(dirt_piles_positions, cleaned_dirt_piles, n_agent
def handle_finished_episode(obs, agents, cfg):
""" Finish up episode, calculate advantages and perform policy net and value net updates"""
with torch.inference_mode(False):
for ag_i, agent in enumerate(agents):
# Get states, actions, rewards and values from rollout buffer
@@ -268,6 +284,7 @@ def handle_finished_episode(obs, agents, cfg):
def split_into_chunks(data_tuple, cfg):
""" Chunks episode data into approximately equal sized chunks to prevent system memory failure from overload """
result = [data_tuple]
chunk_size = cfg[nms.ALGORITHM][nms.CHUNK_EPISODE]
if chunk_size > 0:
@@ -286,7 +303,8 @@ def split_into_chunks(data_tuple, cfg):
return result
def set_agent_spawnpoint(env, n_agents):
def set_agents_spawnpoints(env, n_agents):
""" Tell environment where the agents should spawn in the next episode """
for agent_idx in range(n_agents):
agent_name = list(env.state.agents_conf.keys())[agent_idx]
current_pos_pointer = env.state.agents_conf[agent_name][nms.POS_POINTER]
@@ -299,6 +317,7 @@ def set_agent_spawnpoint(env, n_agents):
def save_configs(results_path, cfg, factory_conf, eval_factory_conf):
""" Save configurations for logging purposes """
with open(f"{results_path}/MARL_config.txt", "w") as txt_file:
txt_file.write(str(cfg))
with open(f"{results_path}/train_env_config.txt", "w") as txt_file:
@@ -308,6 +327,7 @@ def save_configs(results_path, cfg, factory_conf, eval_factory_conf):
def save_agent_models(results_path, agents):
""" Save model parameters after training """
for idx, agent in enumerate(agents):
agent.pi.save_model_parameters(results_path)
agent.vf.save_model_parameters(results_path)
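
For readers unfamiliar with the update performed in handle_finished_episode, a minimal, generic A2C-style advantage computation is sketched below (the rewards, values and discount factor are invented; the repository's cumulate_discount and the exact update may differ):

import numpy as np


def discounted_returns(rewards, gamma):
    """ Generic stand-in for a cumulative discount: G_t = r_t + gamma * G_(t+1). """
    returns = np.zeros(len(rewards))
    running = 0.0
    for t in reversed(range(len(rewards))):
        running = rewards[t] + gamma * running
        returns[t] = running
    return returns


# One short episode from a rollout buffer (values are made up).
rewards = np.array([0.0, 0.0, 50.0])   # agent reaches its target pile in the last step
values = np.array([10.0, 20.0, 45.0])  # value net predictions for the visited states
gamma = 0.99

returns = discounted_returns(rewards, gamma)  # [49.005, 49.5, 50.0]
advantages = returns - values                 # policy net is pushed towards actions with positive advantage
value_targets = returns                       # value net regresses towards the discounted returns
print(advantages)                             # advantages: 39.005, 29.5, 5.0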

View File

@@ -0,0 +1,61 @@
import os
from pathlib import Path

from tqdm import trange

from marl_factory_grid import Factory
from marl_factory_grid.algorithms.tsp.contortions import get_dirt_quadrant_tsp_agents, get_two_rooms_tsp_agents


def dirt_quadrant_multi_agent_tsp_eval(emergent_phenomenon):
    run_tsp_setting("dirt_quadrant", emergent_phenomenon)


def two_rooms_multi_agent_tsp_eval(emergent_phenomenon):
    run_tsp_setting("two_rooms", emergent_phenomenon)


def run_tsp_setting(config_name, emergent_phenomenon, n_episodes=1):
    # Render at each step?
    render = True
    # Path to config File
    path = Path(f'./marl_factory_grid/environment/configs/tsp/{config_name}.yaml')
    # Create results folder
    runs = os.listdir("./study_out/")
    run_numbers = [int(run[7:]) for run in runs if run[:7] == "tsp_run"]
    next_run_number = max(run_numbers) + 1 if run_numbers else 0
    results_path = f"./study_out/tsp_run{next_run_number}"
    os.mkdir(results_path)
    # Env Init
    factory = Factory(path)
    with open(f"{results_path}/env_config.txt", "w") as txt_file:
        txt_file.write(str(factory.conf))
    for episode in trange(n_episodes):
        _ = factory.reset()
        done = False
        if render:
            factory.render()
            factory._renderer.fps = 5
        if config_name == "dirt_quadrant":
            agents = get_dirt_quadrant_tsp_agents(emergent_phenomenon, factory)
        elif config_name == "two_rooms":
            agents = get_two_rooms_tsp_agents(emergent_phenomenon, factory)
        else:
            print("Config name does not exist. Abort...")
            break
        while not done:
            a = [x.predict() for x in agents]
            # Terminate as soon as all dirt piles are collected, so that the termination condition of the
            # TSP agents matches that of the RL agents
            if 'DirtPiles' in list(factory.state.entities.keys()) and factory.state.entities['DirtPiles'].global_amount == 0.0:
                break
            obs_type, _, _, done, info = factory.step(a)
            if render:
                factory.render()
            if done:
                break

View File

@@ -0,0 +1,55 @@
import numpy as np

from marl_factory_grid.algorithms.tsp.TSP_dirt_agent import TSPDirtAgent
from marl_factory_grid.algorithms.tsp.TSP_target_agent import TSPTargetAgent


def get_dirt_quadrant_tsp_agents(emergent_phenomenon, factory):
    agents = [TSPDirtAgent(factory, 0), TSPDirtAgent(factory, 1)]
    if not emergent_phenomenon:
        edge_costs = {}
        # Add costs for horizontal edges
        for i in range(1, 10):
            for j in range(1, 9):
                # Add costs for both traversal directions
                edge_costs[f"{(i, j)}-{i, j + 1}"] = 0.55 + (i - 1) * 0.05
                edge_costs[f"{i, j + 1}-{(i, j)}"] = 0.55 + (i - 1) * 0.05
        # Add costs for vertical edges
        for i in range(1, 9):
            for j in range(1, 10):
                # Add costs for both traversal directions
                edge_costs[f"{(i, j)}-{i + 1, j}"] = 0.55 + (i) * 0.05
                edge_costs[f"{i + 1, j}-{(i, j)}"] = 0.55 + (i - 1) * 0.05
        for agent in agents:
            for u, v, weight in agent._position_graph.edges(data='weight'):
                agent._position_graph[u][v]['weight'] = edge_costs[f"{u}-{v}"]
    return agents


def get_two_rooms_tsp_agents(emergent_phenomenon, factory):
    agents = [TSPTargetAgent(factory, 0), TSPTargetAgent(factory, 1)]
    if not emergent_phenomenon:
        edge_costs = {}
        # Add costs for horizontal edges
        for i in range(1, 6):
            for j in range(1, 13):
                # Add costs for both traversal directions
                edge_costs[f"{(i, j)}-{i, j + 1}"] = np.abs(5/i*np.cbrt(((j+1)/4 - 1)) - 1)
                edge_costs[f"{i, j + 1}-{(i, j)}"] = np.abs(5/i*np.cbrt((j/4 - 1)) - 1)
        # Add costs for vertical edges
        for i in range(1, 5):
            for j in range(1, 14):
                # Add costs for both traversal directions
                edge_costs[f"{(i, j)}-{i + 1, j}"] = np.abs(5/(i+1)*np.cbrt((j/4 - 1)) - 1)
                edge_costs[f"{i + 1, j}-{(i, j)}"] = np.abs(5/i*np.cbrt((j/4 - 1)) - 1)
        for agent in agents:
            for u, v, weight in agent._position_graph.edges(data='weight'):
                agent._position_graph[u][v]['weight'] = edge_costs[f"{u}-{v}"]
    return agents

View File

@@ -58,7 +58,7 @@ def load_yaml_file(path: Path):
def add_env_props(cfg):
# Path to config File
env_path = Path(f'../marl_factory_grid/environment/configs/{cfg["env"]["env_name"]}.yaml')
env_path = Path(f'./marl_factory_grid/environment/configs/{cfg["env"]["env_name"]}.yaml')
# Env Init
factory = Factory(env_path)