Add all relevant functional code for the A2C Dirt Quadrant setting, with small changes to the environment and separate configs for the single-agent and multi-agent settings

Julian Schönberger
2024-05-06 12:33:37 +02:00
parent 55026eda12
commit 3c54d04f9f
13 changed files with 652 additions and 174 deletions

.gitignore

@@ -701,3 +701,4 @@ $RECYCLE.BIN/
 # End of https://www.toptal.com/developers/gitignore/api/linux,unity,macos,python,windows,pycharm,notepadpp,visualstudiocode,latex
 /studies/e_1/
 /studies/curious_study/
+/study_out/


@@ -1,4 +1,5 @@
 import copy
+import os
 import random
 from scipy import signal
@@ -61,9 +62,22 @@ class A2C:
         # act_dim=6 for dirt_quadrant
         dirt_piles_positions = [self.factory.state.entities['DirtPiles'][pile_idx].pos for pile_idx in
                                 range(len(self.factory.state.entities['DirtPiles']))]
-        obs_dim = 2 + 2*len(dirt_piles_positions)
+        if self.cfg[nms.ALGORITHM]["pile-observability"] == "all":
+            obs_dim = 2 + 2*len(dirt_piles_positions)
+        else:
+            obs_dim = 4
         self.agents = [PolicyGradient(self.factory, agent_id=i, obs_dim=obs_dim) for i in range(self.n_agents)]
+        # self.agents[0].pi.load_model_parameters("/Users/julian/Coding/Projects/PyCharmProjects/EDYS/study_out/run5/Wolfgang_PolicyNet_model_parameters.pth")
         self.doors_exist = "Doors" in self.factory.state.entities.keys()
+        if self.cfg[nms.ENV]["save_and_log"]:
+            # Create results folder
+            runs = os.listdir("../study_out/")
+            run_numbers = [int(run[3:]) for run in runs if run[:3] == "run"]
+            next_run_number = max(run_numbers)+1 if run_numbers else 0
+            self.results_path = f"../study_out/run{next_run_number}"
+            os.mkdir(self.results_path)
+            # Save settings in results folder
+            self.save_configs()

     @classmethod
     def _as_torch(cls, x):
@@ -80,62 +94,149 @@ class A2C:
         actions = [agent.step(self._as_torch(observations[ag_i]).view(-1).to(torch.float32)) for ag_i, agent in enumerate(self.agents)]
         return actions

-    def execute_policy(self, observations) -> ListOrTensor:
+    def execute_policy(self, observations, env, cleaned_dirt_piles) -> ListOrTensor:
         # Use deterministic policy for inference
         actions = [agent.policy(self._as_torch(observations[ag_i]).view(-1).to(torch.float32)) for ag_i, agent in enumerate(self.agents)]
+        for agent_idx in range(self.n_agents):
+            if all(cleaned_dirt_piles[agent_idx].values()):
+                actions[agent_idx] = np.array(next(action_i for action_i, a in enumerate(env.state["Agent"][agent_idx].actions) if a.name == "Noop"))
         return actions

-    def transform_observations(self, env):
+    def transform_observations(self, env, ordered_dirt_piles, target_pile):
         """ Assumes that agent has observations -DirtPiles and -Self """
         agent_positions = [env.state.moving_entites[agent_idx].pos for agent_idx in range(self.n_agents)]
-        dirt_piles_positions = [env.state.entities['DirtPiles'][pile_idx].pos for pile_idx in range(len(env.state.entities['DirtPiles']))]
-        trans_obs = [torch.zeros(2+2*len(dirt_piles_positions)) for _ in range(len(agent_positions))]
+        if self.cfg[nms.ALGORITHM]["pile-observability"] == "all":
+            trans_obs = [torch.zeros(2+2*len(ordered_dirt_piles[0])) for _ in range(len(agent_positions))]
+        else:
+            # Only show current target pile
+            trans_obs = [torch.zeros(4) for _ in range(len(agent_positions))]
         for i, pos in enumerate(agent_positions):
             agent_x, agent_y = pos[0], pos[1]
             trans_obs[i][0] = agent_x
             trans_obs[i][1] = agent_y
             idx = 2
-            for pos in dirt_piles_positions:
-                trans_obs[i][idx] = pos[0]
-                trans_obs[i][idx + 1] = pos[1]
-                idx += 2
+            if self.cfg[nms.ALGORITHM]["pile-observability"] == "all":
+                for pile_pos in ordered_dirt_piles[i]:
+                    trans_obs[i][idx] = pile_pos[0]
+                    trans_obs[i][idx + 1] = pile_pos[1]
+                    idx += 2
+            else:
+                trans_obs[i][2] = ordered_dirt_piles[i][target_pile[i]][0]
+                trans_obs[i][3] = ordered_dirt_piles[i][target_pile[i]][1]
         return trans_obs
     def get_all_observations(self, env):
-        first_trans_obs = self.transform_observations(env)[0]
+        dirt_piles_positions = [env.state.entities['DirtPiles'][pile_idx].pos for pile_idx in
+                                range(len(env.state.entities['DirtPiles']))]
+        if self.cfg[nms.ALGORITHM]["pile-observability"] == "all":
+            obs = [torch.zeros(2 + 2 * len(dirt_piles_positions))]
+            observations = [[]]
+            # Fill in pile positions
+            idx = 2
+            for pile_pos in dirt_piles_positions:
+                obs[0][idx] = pile_pos[0]
+                obs[0][idx + 1] = pile_pos[1]
+                idx += 2
+        else:
+            # Have multiple observation layers of the map for each dirt pile one
+            obs = [torch.zeros(4) for _ in range(self.n_agents) for _ in dirt_piles_positions]
+            observations = [[] for _ in dirt_piles_positions]
+            for idx, pile_pos in enumerate(dirt_piles_positions):
+                obs[idx][2] = pile_pos[0]
+                obs[idx][3] = pile_pos[1]
         valid_agent_positions = env.state.entities.floorlist
         #observations_shape = (max(t[0] for t in valid_agent_positions) + 2, max(t[1] for t in valid_agent_positions) + 2)
-        observations = []
         for idx, pos in enumerate(valid_agent_positions):
-            obs = copy.deepcopy(first_trans_obs)
-            obs[0] = pos[0]
-            obs[1] = pos[1]
-            observations.append(obs)
+            for obs_layer in range(len(obs)):
+                observation = copy.deepcopy(obs[obs_layer])
+                observation[0] = pos[0]
+                observation[1] = pos[1]
+                observations[obs_layer].append(observation)
         return observations

     def get_dirt_piles_positions(self, env):
         return [env.state.entities['DirtPiles'][pile_idx].pos for pile_idx in range(len(env.state.entities['DirtPiles']))]
-    def get_ordered_dirt_piles(self, env):
-        ordered_dirt_piles = []
-        if self.cfg[nms.ALGORITHM]["pile-order"] in ["fixed", "agents"]:
-            ordered_dirt_piles = self.get_dirt_piles_positions(env)
-        elif self.cfg[nms.ALGORITHM]["pile-order"] == "random":
-            ordered_dirt_piles = self.get_dirt_piles_positions(env)
-            random.shuffle(ordered_dirt_piles)
-        elif self.cfg[nms.ALGORITHM]["pile-order"] == "none":
-            ordered_dirt_piles = None
-        else:
-            print("Not a valid pile order option.")
-            exit()
+    def get_ordered_dirt_piles(self, env, cleaned_dirt_piles, target_pile):
+        """ Each agent can have it's individual pile order """
+        ordered_dirt_piles = [[] for _ in range(self.n_agents)]
+        dirt_pile_positions = self.get_dirt_piles_positions(env)
+        agent_positions = [env.state.moving_entites[agent_idx].pos for agent_idx in range(self.n_agents)]
+        for agent_idx in range(self.n_agents):
+            if self.cfg[nms.ALGORITHM]["pile-order"] in ["fixed", "agents"]:
+                ordered_dirt_piles[agent_idx] = dirt_pile_positions
+            elif self.cfg[nms.ALGORITHM]["pile-order"] == "random":
+                ordered_dirt_piles[agent_idx] = dirt_pile_positions
+                random.shuffle(ordered_dirt_piles)
+            elif self.cfg[nms.ALGORITHM]["pile-order"] == "none":
+                ordered_dirt_piles[agent_idx] = None
+            elif self.cfg[nms.ALGORITHM]["pile-order"] in ["smart", "dynamic"]:
+                # Calculate distances for remaining unvisited dirt piles
+                remaining_target_piles = [pos for pos, value in cleaned_dirt_piles[agent_idx].items() if not value]
+                pile_distances = {pos:0 for pos in remaining_target_piles}
+                agent_pos = agent_positions[agent_idx]
+                for pos in remaining_target_piles:
+                    pile_distances[pos] = np.abs(agent_pos[0] - pos[0]) + np.abs(agent_pos[1] - pos[1])
+                if self.cfg[nms.ALGORITHM]["pile-order"] == "smart":
+                    # Check if there is an agent in line with any of the remaining dirt piles
+                    for pile_pos in remaining_target_piles:
+                        for other_pos in agent_positions:
+                            if other_pos != agent_pos:
+                                if agent_pos[0] == other_pos[0] == pile_pos[0] or agent_pos[1] == other_pos[1] == pile_pos[1]:
+                                    # Get the line between the agent and the goal
+                                    path = self.bresenham(agent_pos[0], agent_pos[1], pile_pos[0], pile_pos[1])
+                                    # Check if the entity lies on the path between the agent and the goal
+                                    if other_pos in path:
+                                        pile_distances[pile_pos] += np.abs(agent_pos[0] - other_pos[0]) + np.abs(agent_pos[1] - other_pos[1])
+                sorted_pile_distances = dict(sorted(pile_distances.items(), key=lambda item: item[1]))
+                # Insert already visited dirt piles
+                ordered_dirt_piles[agent_idx] = [pos for pos in dirt_pile_positions if pos not in remaining_target_piles]
+                # Fill up with sorted positions
+                for pos in sorted_pile_distances.keys():
+                    ordered_dirt_piles[agent_idx].append(pos)
+            else:
+                print("Not a valid pile order option.")
+                exit()
         return ordered_dirt_piles
+    def bresenham(self, x0, y0, x1, y1):
+        """Bresenham's line algorithm to get the coordinates of a line between two points."""
+        dx = np.abs(x1 - x0)
+        dy = np.abs(y1 - y0)
+        sx = 1 if x0 < x1 else -1
+        sy = 1 if y0 < y1 else -1
+        err = dx - dy
+        coordinates = []
+        while True:
+            coordinates.append((x0, y0))
+            if x0 == x1 and y0 == y1:
+                break
+            e2 = 2 * err
+            if e2 > -dy:
+                err -= dy
+                x0 += sx
+            if e2 < dx:
+                err += dx
+                y0 += sy
+        return coordinates
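
For intuition, a quick hand-checked example of the helper above (illustrative only, not part of the commit):

    bresenham(0, 0, 3, 2)  # -> [(0, 0), (1, 1), (2, 1), (3, 2)], i.e. the grid cells on the line between the two points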
+    def update_ordered_dirt_piles(self, agent_idx, cleaned_dirt_piles, ordered_dirt_piles, env, target_pile):
+        # Only update ordered_dirt_pile for agent that reached its target pile
+        updated_ordered_dirt_piles = self.get_ordered_dirt_piles(env, cleaned_dirt_piles, target_pile)
+        for i in range(len(ordered_dirt_piles[agent_idx])):
+            ordered_dirt_piles[agent_idx][i] = updated_ordered_dirt_piles[agent_idx][i]
     def distribute_indices(self, env):
         indices = []
         n_dirt_piles = len(self.get_dirt_piles_positions(env))
-        if n_dirt_piles == 1 or self.cfg[nms.ALGORITHM]["pile-order"] in ["fixed", "random", "none"]:
+        if n_dirt_piles == 1 or self.cfg[nms.ALGORITHM]["pile-order"] in ["fixed", "random", "none", "dynamic", "smart"]:
             indices = [[0] for _ in range(self.n_agents)]
         else:
             base_count = n_dirt_piles // self.n_agents
@@ -152,6 +253,12 @@ class A2C:

     def update_target_pile(self, env, agent_idx, target_pile):
         indices = self.distribute_indices(env)
-        if target_pile[agent_idx] + 1 in indices[agent_idx]:
-            target_pile[agent_idx] += 1
+        if self.cfg[nms.ALGORITHM]["pile-order"] in ["fixed", "random", "none", "dynamic", "smart"]:
+            if target_pile[agent_idx] + 1 < len(self.get_dirt_piles_positions(env)):
+                target_pile[agent_idx] += 1
+            else:
+                target_pile[agent_idx] = 0
+        else:
+            if target_pile[agent_idx] + 1 in indices[agent_idx]:
+                target_pile[agent_idx] += 1
@@ -166,7 +273,7 @@ class A2C:
         for agent_idx, agent in enumerate(self.agents):
             agent_obs = self._as_torch((obs)[agent_idx]).view(-1).to(torch.float32)
             # If agent already reached its target
-            if list(cleaned_dirt_piles.values())[target_pile[agent_idx]]:
+            if all(cleaned_dirt_piles[agent_idx].values()):
                 action.append(next(action_i for action_i, a in enumerate(env.state["Agent"][agent_idx].actions) if a.name == "Noop"))
                 if not det:
                     # Include agent experience entry manually
@@ -238,31 +345,39 @@ class A2C:
        # Execute real step in environment
        for idx, pos in enumerate(agent_positions):
-           if pos in cleaned_dirt_piles.keys() and not cleaned_dirt_piles[pos]:
+           if pos in cleaned_dirt_piles[idx].keys() and not cleaned_dirt_piles[idx][pos]:
                action[idx] = np.array(4)
                # Collect dirt
                _, next_obs, reward, done, info = env.step(action)
-               cleaned_dirt_piles[pos] = True
+               cleaned_dirt_piles[idx][pos] = True
                break"""

        # Only simulate collecting the dirt
        for idx, pos in enumerate(agent_positions):
-            if pos in self.get_dirt_piles_positions(env) and not cleaned_dirt_piles[pos]:
+            if pos in self.get_dirt_piles_positions(env) and not cleaned_dirt_piles[idx][pos]:
                # print(env.state.entities["Agent"][idx], pos, idx, target_pile, ordered_dirt_piles)
                # If dirt piles should be cleaned in a specific order
-                if ordered_dirt_piles:
-                    if pos == ordered_dirt_piles[target_pile[idx]]:
-                        reward[idx] += 1 # 1
-                        cleaned_dirt_piles[pos] = True
+                if ordered_dirt_piles[idx]:
+                    if pos == ordered_dirt_piles[idx][target_pile[idx]]:
+                        reward[idx] += 50 # 1
+                        cleaned_dirt_piles[idx][pos] = True
                        # Set pointer to next dirt pile
                        self.update_target_pile(env, idx, target_pile)
+                        self.update_ordered_dirt_piles(idx, cleaned_dirt_piles, ordered_dirt_piles, env, target_pile)
+                        if self.cfg[nms.ALGORITHM]["pile_all_done"] == "single":
+                            done = True
+                            if all(cleaned_dirt_piles[idx].values()):
+                                # Reset cleaned_dirt_piles indicator
+                                for pos in dirt_piles_positions:
+                                    cleaned_dirt_piles[idx][pos] = False
                        break
                else:
-                    reward[idx] += 1 # 1
-                    cleaned_dirt_piles[pos] = True
+                    reward[idx] += 50 # 1
+                    cleaned_dirt_piles[idx][pos] = True
                    break

-        if all(cleaned_dirt_piles.values()):
-            done = True
+        if self.cfg[nms.ALGORITHM]["pile_all_done"] == "all":
+            if all([all(cleaned_dirt_piles[i].values()) for i in range(self.n_agents)]):
+                done = True

        return reward, done
@@ -271,7 +386,10 @@ class A2C:
         with torch.inference_mode(False):
             for ag_i, agent in enumerate(self.agents):
                 # Get states, actions, rewards and values from rollout buffer
-                (s, a, R, V) = agent.finish_episode()
+                data = agent.finish_episode()
+                # Chunk episode data, such that there will be no memory failure for very long episodes
+                chunks = self.split_into_chunks(data)
+                for (s, a, R, V) in chunks:
                     # Calculate discounted return and advantage
                     G = cumulate_discount(R, self.cfg[nms.ALGORITHM]["gamma"])
                     if self.cfg[nms.ALGORITHM]["advantage"] == "Reinforce":
@@ -291,6 +409,34 @@ class A2C:
                     # Update policy and value net of agent with experience from rollout buffer
                     agent.train(*rollout)

+    def split_into_chunks(self, data_tuple):
+        result = [data_tuple]
+        chunk_size = self.cfg[nms.ALGORITHM]["chunk-episode"]
+        if chunk_size > 0:
+            # Get the maximum length of the lists in the tuple to handle different lengths
+            max_length = max(len(lst) for lst in data_tuple)
+            # Prepare a list to store the result
+            result = []
+            # Split each list into chunks and add them to the result
+            for i in range(0, max_length, chunk_size):
+                # Create a sublist containing the ith chunk from each list
+                sublist = [lst[i:i + chunk_size] for lst in data_tuple if i < len(lst)]
+                result.append(sublist)
+        return result
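
Illustrative toy values for the chunking above (not part of the commit); with chunk-episode set to 2:

    data_tuple = ([1, 2, 3, 4, 5], [10, 20, 30, 40, 50])  # e.g. two per-step lists collected over one episode
    # split_into_chunks(data_tuple) then returns
    # [[[1, 2], [10, 20]], [[3, 4], [30, 40]], [[5], [50]]]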
+    def set_agent_spawnpoint(self, env):
+        for agent_idx in range(self.n_agents):
+            agent_name = list(env.state.agents_conf.keys())[agent_idx]
+            current_pos_pointer = env.state.agents_conf[agent_name]["pos_pointer"]
+            # Making the reset dependent on the number of spawnpoints and not the number of dirtpiles allows
+            # for having multiple subsequent spawnpoints with the same target pile
+            if current_pos_pointer == len(env.state.agents_conf[agent_name]['positions']) - 1:
+                env.state.agents_conf[agent_name]["pos_pointer"] = 0
+            else:
+                env.state.agents_conf[agent_name]["pos_pointer"] += 1

     @torch.no_grad()
     def train_loop(self):
@@ -301,19 +447,28 @@ class A2C:
         global_steps, episode = 0, 0
         dirt_piles_positions = self.get_dirt_piles_positions(env)
         used_actions = {i:0 for i in range(len(env.state.entities["Agent"][0]._actions))} # Assume both agents have the same actions
+        target_pile = [partition[0] for partition in self.distribute_indices(env)] # pointer that points to the target pile for each agent. (point to same pile, point to different piles)
+        cleaned_dirt_piles = [{pos: False for pos in dirt_piles_positions} for _ in range(self.n_agents)] # Have own dictionary for each agent

         while global_steps < max_steps:
             print(global_steps)
             obs = env.reset() # !!!!!!!!Commented seems to work better? Only if a fixed spawnpoint is given
-            print([env.state.moving_entites[agent_idx].pos for agent_idx in range(self.n_agents)])
+            self.set_agent_spawnpoint(env)
+            ordered_dirt_piles = self.get_ordered_dirt_piles(env, cleaned_dirt_piles, target_pile)
+            # Reset current target pile at episode begin if all piles have to be cleaned in one episode
+            if self.cfg[nms.ALGORITHM]["pile_all_done"] == "all":
+                target_pile = [partition[0] for partition in self.distribute_indices(env)]
+                cleaned_dirt_piles = [{pos: False for pos in dirt_piles_positions} for _ in range(self.n_agents)]
+            """passed_fields = [[] for _ in range(self.n_agents)]"""
             """obs = list(obs.values())"""
-            obs = self.transform_observations(env)
+            obs = self.transform_observations(env, ordered_dirt_piles, target_pile)
             done, rew_log = [False] * self.n_agents, 0
-            cleaned_dirt_piles = {pos: False for pos in dirt_piles_positions}
-            ordered_dirt_piles = self.get_ordered_dirt_piles(env)
-            target_pile = [partition[0] for partition in self.distribute_indices(env)] # pointer that points to the target pile for each agent. (point to same pile, point to different piles)
-            """passed_fields = [[] for _ in range(self.n_agents)]"""
+            print("Agents spawnpoints:", [env.state.moving_entites[agent_idx].pos for agent_idx in range(self.n_agents)])
+            print("Agents target piles:", target_pile)
+            print("Agents initial observation:", obs)
+            print("Agents cleaned dirt piles:", cleaned_dirt_piles)

             # Add Clean and Noop actions to agent actions so that they can be executed when the agent comes on a dirpile
             """for i in range(self.n_agents):
@@ -326,12 +481,16 @@ class A2C:
                 _, next_obs, reward, done, info = env.step(action)
                 if done:
                     print("DoneAtMaxStepsReached:", len(self.agents[0]._episode))
-                next_obs = self.transform_observations(env)
+                next_obs = self.transform_observations(env, ordered_dirt_piles, target_pile)

                 # Add small negative reward if agent has moved away from the target_pile
-                reward = self.reward_distance(env, obs, target_pile, reward)
+                # reward = self.reward_distance(env, obs, target_pile, reward)

-                # Check and handle if agent is on field with dirt
+                # Check and handle if agent is on field with dirt. This method can change the observation for the next step.
+                # If pile_all_done is "single", the episode ends if agents reached its target pile and the new episode begins
+                # with the updated observation. The observation that is saved to the rollout buffer, which resulted in reaching
+                # the target pile should not be updated before saving. Thus, the self.transform_observations call must happen
+                # before this method is called.
                 reward, done = self.handle_dirt(env, cleaned_dirt_piles, ordered_dirt_piles, target_pile, reward, done)

                 if n_steps != 0 and (global_steps + 1) % n_steps == 0:
@@ -361,12 +520,113 @@ class A2C:
             self.reward_development.append(rew_log)
             episode += 1

+        self.plot_reward_development()
+        if self.cfg[nms.ENV]["save_and_log"]:
+            self.create_info_maps(env, used_actions, target_pile)
+            self.save_agent_models()
+    @torch.inference_mode(True)
+    def eval_loop(self, n_episodes, render=False):
+        env = self.eval_factory
+        if self.cfg[nms.ENV][nms.EVAL_RENDER]:
+            env.render()
+        episode, results = 0, []
+        dirt_piles_positions = self.get_dirt_piles_positions(env)
+        target_pile = [partition[0] for partition in self.distribute_indices(env)] # pointer that points to the target pile for each agent. (point to same pile, point to different piles)
+        cleaned_dirt_piles = [{pos: False for pos in dirt_piles_positions} for _ in range(self.n_agents)]
+
+        while episode < n_episodes:
+            obs = env.reset()
+            self.set_agent_spawnpoint(env)
+            """obs = list(obs.values())"""
+            # Reset current target pile at episode begin if all piles have to be cleaned in one episode
+            if self.cfg[nms.ALGORITHM]["pile_all_done"] == "all":
+                target_pile = [partition[0] for partition in self.distribute_indices(env)]
+                cleaned_dirt_piles = [{pos: False for pos in dirt_piles_positions} for _ in range(self.n_agents)]
+            ordered_dirt_piles = self.get_ordered_dirt_piles(env, cleaned_dirt_piles, target_pile)
+            obs = self.transform_observations(env, ordered_dirt_piles, target_pile)
+            done, rew_log, eps_rew = [False] * self.n_agents, 0, torch.zeros(self.n_agents)
+
+            # Add Clean and Noop actions to agent actions so that they can be executed when the agent comes on a dirpile
+            """for i in range(self.n_agents):
+                self.factory.state['Agent'][i].actions.extend([Clean(), Noop()])"""
+
+            while not all(done):
+                action = self.use_door_or_move(env, obs, cleaned_dirt_piles, target_pile, det=True) if self.doors_exist else self.execute_policy(obs, env, cleaned_dirt_piles) # zero exploration
+                print(action)
+                _, next_obs, reward, done, info = env.step(action)
+                if done:
+                    print("DoneAtMaxStepsReached:", len(self.agents[0]._episode))
+
+                # Add small negative reward if agent has moved away from the target_pile
+                # reward = self.reward_distance(env, obs, target_pile, reward)
+
+                # Check and handle if agent is on field with dirt
+                reward, done = self.handle_dirt(env, cleaned_dirt_piles, ordered_dirt_piles, target_pile, reward, done)
+
+                # Get transformed next_obs that might have been updated because of self.handle_dirt.
+                # For eval, where pile_all_done is "all", it's mandatory that the potential change of the target pile
+                # in the observation, caused by self.handle_dirt, is already considered when the next action is calculated.
+                next_obs = self.transform_observations(env, ordered_dirt_piles, target_pile)
+
+                done = [done] * self.n_agents if isinstance(done, bool) else done
+                if self.cfg[nms.ENV][nms.EVAL_RENDER]:
+                    env.render()
+                obs = next_obs
+
+            episode += 1
+
+    def plot_reward_development(self):
+        smoothed_data = np.convolve(self.reward_development, np.ones(10) / 10, mode='valid')
+        plt.plot(smoothed_data)
+        plt.ylim([-10, max(smoothed_data) + 20])
+        plt.title('Smoothed Reward Development')
+        plt.xlabel('Episode')
+        plt.ylabel('Reward')
+        if self.cfg[nms.ENV]["save_and_log"]:
+            plt.savefig(f"{self.results_path}/smoothed_reward_development.png")
+        plt.show()
+    def save_configs(self):
+        with open(f"{self.results_path}/MARL_config.txt", "w") as txt_file:
+            txt_file.write(str(self.cfg))
+        with open(f"{self.results_path}/train_env_config.txt", "w") as txt_file:
+            txt_file.write(str(self.factory.conf))
+        with open(f"{self.results_path}/eval_env_config.txt", "w") as txt_file:
+            txt_file.write(str(self.eval_factory.conf))
+
+    def save_agent_models(self):
+        for idx, agent in enumerate(self.agents):
+            agent_name = list(self.factory.state.agents_conf.keys())[idx]
+            agent.pi.save_model_parameters(self.results_path, agent_name)
+            agent.vf.save_model_parameters(self.results_path, agent_name)
+
+    def load_agents(self, runs_list):
+        for idx, run in enumerate(runs_list):
+            run_path = f"../study_out/{run}"
+            agent_name = list(self.eval_factory.state.agents_conf.keys())[idx]
+            self.agents[idx].pi.load_model_parameters(f"{run_path}/{agent_name}_PolicyNet_model_parameters.pth")
+            self.agents[idx].vf.load_model_parameters(f"{run_path}/{agent_name}_ValueNet_model_parameters.pth")
+    def create_info_maps(self, env, used_actions, target_pile):
         # Create value map
-        observations_shape = (max(t[0] for t in env.state.entities.floorlist) + 2, max(t[1] for t in env.state.entities.floorlist) + 2)
-        value_maps = [np.zeros(observations_shape) for _ in self.agents]
-        likeliest_action = [np.full(observations_shape, np.NaN) for _ in self.agents]
-        action_probabilities = [np.zeros((observations_shape[0],observations_shape[1], env.action_space[0].n)) for _ in self.agents]
-        for obs in self.get_all_observations(env):
+        all_valid_observations = self.get_all_observations(env)
+        dirt_piles_positions = self.get_dirt_piles_positions(env)
+        with open(f"{self.results_path}/info_maps.txt", "w") as txt_file:
+            for obs_layer, pos in enumerate(dirt_piles_positions):
+                observations_shape = (
+                    max(t[0] for t in env.state.entities.floorlist) + 2, max(t[1] for t in env.state.entities.floorlist) + 2)
+                value_maps = [np.zeros(observations_shape) for _ in self.agents]
+                likeliest_action = [np.full(observations_shape, np.NaN) for _ in self.agents]
+                action_probabilities = [np.zeros((observations_shape[0], observations_shape[1], env.action_space[0].n)) for
+                                        _ in self.agents]
+                for obs in all_valid_observations[obs_layer]:
                     """obs = self._as_torch(obs).view(-1).to(torch.float32)"""
                     for idx, agent in enumerate(self.agents):
                         """indices = np.where(obs[1] == 1) # Get agent position on grid (1 indicates the position)
@@ -380,78 +640,38 @@ class A2C:
                        except:
                            pass

+                txt_file.write("=======Value Maps=======\n")
                 print("=======Value Maps=======")
                 for agent_idx, vmap in enumerate(value_maps):
-                    print(f"Value map of agent {agent_idx}:")
+                    txt_file.write(f"Value map of agent {agent_idx} for target pile {pos}:\n")
+                    print(f"Value map of agent {agent_idx} for target pile {pos}:")
                     vmap = self._as_torch(vmap).round(decimals=4)
                     max_digits = max(len(str(vmap.max().item())), len(str(vmap.min().item())))
                     for idx, row in enumerate(vmap):
+                        txt_file.write(' '.join(f" {elem:>{max_digits + 1}}" for elem in row.tolist()))
+                        txt_file.write("\n")
                         print(' '.join(f" {elem:>{max_digits + 1}}" for elem in row.tolist()))
+                    txt_file.write("\n")
+                txt_file.write("=======Likeliest Action=======\n")
                 print("=======Likeliest Action=======")
                 for agent_idx, amap in enumerate(likeliest_action):
-                    print(f"Likeliest action map of agent {agent_idx}:")
+                    txt_file.write(f"Likeliest action map of agent {agent_idx} for target pile {pos}:\n")
+                    print(f"Likeliest action map of agent {agent_idx} for target pile {pos}:")
+                    txt_file.write(np.array2string(amap))
                     print(amap)
+                    txt_file.write("\n")
+                txt_file.write("=======Action Probabilities=======\n")
                 print("=======Action Probabilities=======")
                 for agent_idx, pmap in enumerate(action_probabilities):
-                    print(f"Action probability map of agent {agent_idx}:")
+                    txt_file.write(f"Action probability map of agent {agent_idx} for target pile {pos}:\n")
+                    print(f"Action probability map of agent {agent_idx} for target pile {pos}:")
                     for d in range(pmap.shape[0]):
                         row = '['
                         for r in range(pmap.shape[1]):
                             row += "[" + ', '.join(f"{x:7.4f}" for x in pmap[d, r]) + "]"
+                        txt_file.write(row + "]")
+                        txt_file.write("\n")
                         print(row + "]")

+            txt_file.write(f"Used actions: {used_actions}\n")
             print("Used actions:", used_actions)
-    @torch.inference_mode(True)
-    def eval_loop(self, n_episodes, render=False):
-        env = self.eval_factory
-        if self.cfg[nms.ENV][nms.EVAL_RENDER]:
-            env.render()
-        episode, results = 0, []
-        dirt_piles_positions = self.get_dirt_piles_positions(env)
-        while episode < n_episodes:
-            obs = env.reset()
-            """obs = list(obs.values())"""
-            obs = self.transform_observations(env)
-            done, rew_log, eps_rew = [False] * self.n_agents, 0, torch.zeros(self.n_agents)
-            cleaned_dirt_piles = {pos: False for pos in dirt_piles_positions}
-            ordered_dirt_piles = self.get_ordered_dirt_piles(env)
-            target_pile = [partition[0] for partition in self.distribute_indices(env)]
-
-            # Add Clean and Noop actions to agent actions so that they can be executed when the agent comes on a dirpile
-            """for i in range(self.n_agents):
-                self.factory.state['Agent'][i].actions.extend([Clean(), Noop()])"""
-
-            while not all(done):
-                action = self.use_door_or_move(env, obs, cleaned_dirt_piles, target_pile, det=True) if self.doors_exist else self.execute_policy(obs) # zero exploration
-                print(action)
-                _, next_obs, reward, done, info = env.step(action)
-                if done:
-                    print("DoneAtMaxStepsReached:", len(self.agents[0]._episode))
-                next_obs = self.transform_observations(env)
-
-                # Add small negative reward if agent has moved away from the target_pile
-                reward = self.reward_distance(env, obs, target_pile, reward)
-
-                # Check and handle if agent is on field with dirt
-                reward, done = self.handle_dirt(env, cleaned_dirt_piles, ordered_dirt_piles, target_pile, reward, done)
-
-                done = [done] * self.n_agents if isinstance(done, bool) else done
-                if self.cfg[nms.ENV][nms.EVAL_RENDER]:
-                    env.render()
-                obs = next_obs
-
-            episode += 1
-
-    def plot_reward_development(self):
-        plt.plot(self.reward_development)
-        plt.title('Reward development')
-        plt.xlabel('Episode')
-        plt.ylabel('Reward')
-        plt.savefig("/Users/julian/Coding/Projects/PyCharmProjects/EDYS/study_out/two_rooms_one_door_modified_runs/reward_development.png")
-        plt.show()


@@ -1,6 +1,6 @@
-import numpy as np; import torch as th; import scipy as sp; import gym
-import os; from collections import deque; import matplotlib.pyplot as plt
-from tqdm import tqdm
+import numpy as np; import torch as th; import scipy as sp;
+from collections import deque
+from torch import nn

 # RLLab Magic for calculating the discounted return G(t) = R(t) + gamma * R(t-1)
 # cf. https://github.com/rll/rllab/blob/ba78e4c16dc492982e648f117875b22af3965579/rllab/misc/special.py#L107
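
The linked rllab trick computes all discounted returns of an episode in one backward filter pass; a minimal sketch of such a helper (illustrative, assuming this module exposes something like the cumulate_discount that a2c_dirt.py calls):

    from scipy import signal
    import numpy as np

    def cumulate_discount(rewards, gamma):
        # G(t) = R(t) + gamma * G(t+1), evaluated backwards over the episode with an IIR filter
        return signal.lfilter([1], [1, -float(gamma)], np.asarray(rewards, dtype=np.float64)[::-1])[::-1]

    # e.g. cumulate_discount([1, 0, 1], 0.9) -> [1.81, 0.9, 1.0]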
@@ -14,8 +14,25 @@ class Net(th.nn.Module):
                                     for layer in [th.nn.Linear(*io), a()]])
         self.optimizer = th.optim.Adam(self.net.parameters(), lr=lr)

+        # Initialize weights uniformly, so that for the policy net all actions have approximately the same probability in the beginning
+        for module in self.modules():
+            if isinstance(module, nn.Linear):
+                nn.init.uniform_(module.weight, a=-0.1, b=0.1)
+                if module.bias is not None:
+                    nn.init.uniform_(module.bias, a=-0.1, b=0.1)
+
+    def save_model(self, path, agent_name):
+        th.save(self.net, f"{path}/{agent_name}_{self.__class__.__name__}_model.pth")
+
+    def save_model_parameters(self, path, agent_name):
+        th.save(self.net.state_dict(), f"{path}/{agent_name}_{self.__class__.__name__}_model_parameters.pth")
+
+    def load_model_parameters(self, path):
+        self.net.load_state_dict(th.load(path))
+        self.net.eval()
+
 class ValueNet(Net):
-    def __init__(self, obs_dim, hidden_sizes=[64,64], activation=th.nn.Tanh, lr=1e-3):
+    def __init__(self, obs_dim, hidden_sizes=[64,64], activation=th.nn.ReLU, lr=1e-3):
         super().__init__([obs_dim] + hidden_sizes + [1], activation, lr)
     def forward(self, obs): return self.net(obs)
     def loss(self, states, returns): return ((returns - self(states))**2).mean()


@@ -0,0 +1,32 @@
+agent:
+  classname: marl_factory_grid.algorithms.marl.networks.RecurrentAC
+  n_agents: 2
+  obs_emb_size: 96
+  action_emb_size: 16
+  hidden_size_actor: 64
+  hidden_size_critic: 64
+  use_agent_embedding: False
+env:
+  classname: marl_factory_grid.configs.custom
+  env_name: "custom/MultiAgentConfigs/dirt_quadrant_train_config"
+  n_agents: 2
+  max_steps: 250
+  pomdp_r: 2
+  stack_n_frames: 0
+  individual_rewards: True
+  train_render: False
+  eval_render: True
+  save_and_log: True
+method: marl_factory_grid.algorithms.marl.LoopSEAC
+algorithm:
+  gamma: 0.99
+  entropy_coef: 0.01
+  vf_coef: 0.05
+  n_steps: 0 # How much experience should be sampled at most (n-TD) until the next value and policy update is performed. Default 0: MC
+  max_steps: 200000
+  advantage: "Advantage-AC" # Options: "Advantage-AC", "TD-Advantage-AC", "Reinforce"
+  pile-order: "dynamic" # Options: "fixed", "random", "none", "agents", "dynamic", "smart" (Use "fixed", "random" and "none" for single agent training and the other for multi agent inference)
+  pile-observability: "single" # Options: "single", "all"
+  pile_all_done: "all" # Options: "single", "all" ("single" for training, "all" for eval)
+  chunk-episode: 20000 # Chunk size. (0 = update networks with full episode at once)
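
A quick sketch of how these keys are consumed (illustrative only; it reuses load_yaml_file and the plain dictionary access pattern from the training script at the end of this commit):

    from pathlib import Path
    from marl_factory_grid.algorithms.utils import load_yaml_file

    cfg = load_yaml_file(Path('../marl_factory_grid/algorithms/marl/configs/MultiAgentConfigs/dirt_quadrant_config.yaml'))
    # A2C reads the algorithm block via plain string keys, e.g. cfg["algorithm"]["pile-order"]
    assert cfg["algorithm"]["pile-order"] in ["fixed", "random", "none", "agents", "dynamic", "smart"]
    assert cfg["algorithm"]["pile_all_done"] in ["single", "all"]
    assert cfg["env"]["n_agents"] == 2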


@@ -8,7 +8,7 @@ agent:
   use_agent_embedding: False
 env:
   classname: marl_factory_grid.configs.custom
-  env_name: "custom/dirt_quadrant_random_pos"
+  env_name: "custom/dirt_quadrant_train_config"
   n_agents: 1
   max_steps: 250
   pomdp_r: 2
@@ -16,13 +16,17 @@ env:
   individual_rewards: True
   train_render: False
   eval_render: True
+  save_and_log: False
 method: marl_factory_grid.algorithms.marl.LoopSEAC
 algorithm:
   gamma: 0.99
   entropy_coef: 0.01
   vf_coef: 0.05
   n_steps: 0 # How much experience should be sampled at most (n-TD) until the next value and policy update is performed. Default 0: MC
-  max_steps: 80000
+  max_steps: 270000
   advantage: "Advantage-AC" # Options: "Advantage-AC", "TD-Advantage-AC", "Reinforce"
-  pile-order: "fixed" # Options: "fixed", "random", "none", "agents"
+  pile-order: "fixed" # Options: "fixed", "random", "none", "agents", "dynamic", "smart" (Use "fixed", "random" and "none" for single agent training and the other for multi agent inference)
+  pile-observability: "single" # Options: "single", "all"
+  pile_all_done: "single" # Options: "single", "all" ("single" for training, "all" for eval)
+  chunk-episode: 20000 # Chunk size. (0 = update networks with full episode at once)


@@ -0,0 +1,71 @@
+General:
+  # RNG-seed to sample the same "random" numbers every time, to make the different runs comparable.
+  env_seed: 69
+  # Individual vs global rewards
+  individual_rewards: true
+  # The level.txt file to load from marl_factory_grid/levels
+  level_name: quadrant
+  # Radius of Partially observable Markov decision process
+  pomdp_r: 0 # default 3
+  # Print all messages and events
+  verbose: false
+  # Run tests
+  tests: false
+
+# In the "clean and bring" Scenario one agent aims to pick up all items and drop them at drop-off locations while all
+# other agents aim to clean dirt piles.
+Agents:
+  # The clean agents
+  Sigmund:
+    Actions:
+      - Move4
+      #- Clean
+      - Noop
+    Observations:
+      # - Walls
+      # - Other
+      - DirtPiles
+      - Self
+    Positions:
+      - (9,1)
+      #- (9,9)
+      #- (4,5)
+  Wolfgang:
+    Actions:
+      - Move4
+      #- Clean
+      - Noop
+    Observations:
+      # - Walls
+      # - Other
+      - DirtPiles
+      - Self
+    Positions:
+      - (9,5)
+      #- (9,9)
+      #- (4,5)
+
+Entities:
+  DirtPiles:
+    coords_or_quantity: (9,9), (4,5), (1,1) # (4,7), (2,4), (1, 1) # (1, 1), (2,4), (4,7), (7,9), (9,9) # (1, 1), (1,2), (1,3), (2,4), (2,5), (3,6), (4,7), (5,8), (6,8), (7,9), (8,9), (9,9)
+    initial_amount: 0.5 # <1 to ensure that the robot which first attempts to clean this field, can remove the dirt in one action
+    clean_amount: 1
+    dirt_spawn_r_var: 0
+    max_global_amount: 12
+    max_local_amount: 1
+
+# Rules section specifies the rules governing the dynamics of the environment.
+Rules:
+  # Utilities
+  # This rule defines the collision mechanic, introduces a related DoneCondition and lets you specify rewards.
+  # Can be omitted/ignored if you do not want to take care of collisions at all.
+  WatchCollisions:
+    done_at_collisions: false
+
+  # Done Conditions
+  # Define the conditions for the environment to stop. Either success or a fail conditions.
+  # The environment stops when all dirt is cleaned
+  DoneOnAllDirtCleaned:
+  #DoneAtMaxStepsReached:
+    #max_steps: 200


@@ -16,6 +16,23 @@ General:
 # other agents aim to clean dirt piles.
 Agents:
   # The clean agents
+  Sigmund:
+    Actions:
+      - Move4
+      #- Clean
+      #- Noop
+    Observations:
+      # - Walls
+      # - Other
+      - DirtPiles
+      - Self
+    Positions:
+      - (9,1)
+      - (4,5)
+      - (1,1)
+      - (4,5)
+      - (9,1)
+      - (9,9)
   Wolfgang:
     Actions:
       - Move4
@@ -26,32 +43,17 @@ Agents:
      # - Other
      - DirtPiles
      - Self
-    #Positions:
-      #- (9,1)
-      #- (9,2)
-      #- (9,3)
-      #- (9,4)
-      #- (9,5)
-      #- (9,6)
-      #- (9,7)
-      #- (9,8)
-      #- (9,9)
-  #Reiner:
-    #Actions:
-      #- Move4
-      #- Clean
-      #- Noop
-    #Observations:
-      # - Walls
-      # - Other
-      #- DirtPiles
-      #- Self
-    #Positions:
-      #- (9,8) # (9, 4)
+    Positions:
+      - (9,5)
+      - (4,5)
+      - (1,1)
+      - (4,5)
+      - (9,5)
+      - (9,9)

 Entities:
   DirtPiles:
-    coords_or_quantity: (1,1) # (4,7), (2,4), (1, 1) #(1, 1), (2,4), (4,7), (7,9), (9,9) # (1, 1), (1,2), (1,3), (2,4), (2,5), (3,6), (4,7), (5,8), (6,8), (7,9), (8,9), (9,9)
+    coords_or_quantity: (9,9), (1,1), (4,5) # (4,7), (2,4), (1, 1) #(1, 1), (2,4), (4,7), (7,9), (9,9) # (1, 1), (1,2), (1,3), (2,4), (2,5), (3,6), (4,7), (5,8), (6,8), (7,9), (8,9), (9,9)
     initial_amount: 0.5 # <1 to ensure that the robot which first attempts to clean this field, can remove the dirt in one action
     clean_amount: 1
     dirt_spawn_r_var: 0
@@ -72,4 +74,4 @@ Rules:
   # The environment stops when all dirt is cleaned
   DoneOnAllDirtCleaned:
   #DoneAtMaxStepsReached: # An episode should last for at most max_steps steps
-    #max_steps: 1000
+    #max_steps: 100


@@ -16,6 +16,20 @@ General:
 # other agents aim to clean dirt piles.
 Agents:
   # The clean agents
+  #Sigmund:
+    #Actions:
+      #- Move4
+      #- Clean
+      #- Noop
+    #Observations:
+      # - Walls
+      # - Other
+      #- DirtPiles
+      #- Self
+    #Positions:
+      #- (9,1)
+      #- (9,9)
+      #- (4,5)
   Wolfgang:
     Actions:
       - Move4
@@ -27,23 +41,13 @@ Agents:
      - DirtPiles
      - Self
     Positions:
-      - (9,1)
-  #Reiner:
-    #Actions:
-      #- Move4
-      #- Clean
-      #- Noop
-    #Observations:
-      # - Walls
-      # - Other
-      #- DirtPiles
-      #- Self
-    #Positions:
-      #- (9,8) # (9, 4)
+      - (9,5)
+      #- (9,9)
+      #- (4,5)

 Entities:
   DirtPiles:
-    coords_or_quantity: (1,1) # (4,7), (2,4), (1, 1) # (1, 1), (2,4), (4,7), (7,9), (9,9) # (1, 1), (1,2), (1,3), (2,4), (2,5), (3,6), (4,7), (5,8), (6,8), (7,9), (8,9), (9,9)
+    coords_or_quantity: (9,9), (4,5), (1,1) # (4,7), (2,4), (1, 1) # (1, 1), (2,4), (4,7), (7,9), (9,9) # (1, 1), (1,2), (1,3), (2,4), (2,5), (3,6), (4,7), (5,8), (6,8), (7,9), (8,9), (9,9)
     initial_amount: 0.5 # <1 to ensure that the robot which first attempts to clean this field, can remove the dirt in one action
     clean_amount: 1
     dirt_spawn_r_var: 0


@@ -0,0 +1,85 @@
+General:
+  # RNG-seed to sample the same "random" numbers every time, to make the different runs comparable.
+  env_seed: 69
+  # Individual vs global rewards
+  individual_rewards: true
+  # The level.txt file to load from marl_factory_grid/levels
+  level_name: quadrant
+  # Radius of Partially observable Markov decision process
+  pomdp_r: 0 # default 3
+  # Print all messages and events
+  verbose: false
+  # Run tests
+  tests: false
+
+# In the "clean and bring" Scenario one agent aims to pick up all items and drop them at drop-off locations while all
+# other agents aim to clean dirt piles.
+Agents:
+  # The clean agents
+  #Sigmund:
+    #Actions:
+      #- Move4
+      #- Clean
+      #- Noop
+    #Observations:
+      # - Walls
+      # - Other
+      #- DirtPiles
+      #- Self
+    #Positions:
+      #- (9,1)
+      #- (4,5)
+      #- (1,1)
+      #- (4,5)
+      #- (9,1)
+      #- (9,9)
+  Wolfgang:
+    Actions:
+      - Move4
+      #- Clean
+      #- Noop
+    Observations:
+      # - Walls
+      # - Other
+      - DirtPiles
+      - Self
+    Positions:
+      - (9,5)
+      - (4,5)
+      - (1,1)
+      - (4,5)
+      - (9,5)
+      - (9,9)
+
+Entities:
+  DirtPiles:
+    coords_or_quantity: (9,9), (1,1), (4,5) # (4,7), (2,4), (1, 1) #(1, 1), (2,4), (4,7), (7,9), (9,9) # (1, 1), (1,2), (1,3), (2,4), (2,5), (3,6), (4,7), (5,8), (6,8), (7,9), (8,9), (9,9)
+    initial_amount: 0.5 # <1 to ensure that the robot which first attempts to clean this field, can remove the dirt in one action
+    clean_amount: 1
+    dirt_spawn_r_var: 0
+    max_global_amount: 12
+    max_local_amount: 1
+
+# Rules section specifies the rules governing the dynamics of the environment.
+Rules:
+  # Utilities
+  # This rule defines the collision mechanic, introduces a related DoneCondition and lets you specify rewards.
+  # Can be omitted/ignored if you do not want to take care of collisions at all.
+  WatchCollisions:
+    done_at_collisions: false
+
+  # Done Conditions
+  # Define the conditions for the environment to stop. Either success or a fail conditions.
+  # The environment stops when all dirt is cleaned
+  DoneOnAllDirtCleaned:
+  #DoneAtMaxStepsReached: # An episode should last for at most max_steps steps
+    #max_steps: 1000
+
+  # Define how agents spawn.
+  # Options: "random" (Spawn agent at a random position from the list of defined positions)
+  #          "first" (Always spawn agent at first position regardless of the other provided positions)
+  #          "order" (Loop through agent positions)
+  AgentSpawnRule:
+    spawn_rule: "order"
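
The three spawn_rule options correspond to the position selection added to SpawnAgents in rules.py further below; a simplified, illustrative sketch of that logic:

    import random

    def pick_spawn_position(spawn_rule, positions, empty_positions, pos_pointer):
        # Simplified sketch of SpawnAgents._get_position from this commit
        candidates = [p for p in positions if p in empty_positions]
        if spawn_rule == "random":
            return random.choice(candidates)           # any free position from the configured list
        if spawn_rule == "order":
            return candidates[pos_pointer]             # cycle through the configured positions via the per-agent pos_pointer
        return candidates[0] if candidates else None   # "first"/default: always the first free position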


@@ -1,5 +1,5 @@
-MOVEMENTS_VALID: float = -0.01 # default: -0.001
-MOVEMENTS_FAIL: float = -0.1 # default: -0.05
-NOOP: float = -0.01
-COLLISION: float = -0.5
+MOVEMENTS_VALID: float = -1 # default: -0.001
+MOVEMENTS_FAIL: float = -1 # default: -0.05
+NOOP: float = -1
+COLLISION: float = -1
 COLLISION_DONE: float = -1


@@ -5,6 +5,7 @@ from typing import List, Collection

 import numpy as np

+import marl_factory_grid
 from marl_factory_grid.environment import rewards as r, constants as c
 from marl_factory_grid.environment.entity.agent import Agent
 from marl_factory_grid.utils import helpers as h
@@ -180,6 +181,11 @@ class SpawnAgents(Rule):
         pass

     def on_reset(self, state):
+        spawn_rule = None
+        for rule in state.rules.rules:
+            if isinstance(rule, marl_factory_grid.environment.rules.AgentSpawnRule):
+                spawn_rule = rule.spawn_rule
+
         agents = state[c.AGENT]
         for agent_name, agent_conf in state.agents_conf.items():
             empty_positions = state.entities.empty_positions
@@ -187,10 +193,9 @@ class SpawnAgents(Rule):
             observations = agent_conf['observations'].copy()
             positions = agent_conf['positions'].copy()
             other = agent_conf['other'].copy()
+            positions_pointer = agent_conf['pos_pointer']

-            # Spawn agent on random position if multiple spawn points are provided
-            func = random.choice if len(positions) else h.get_first
-            if position := func([x for x in positions if x in empty_positions]):
+            if position := self._get_position(spawn_rule, positions, empty_positions, positions_pointer):
                 assert state.check_pos_validity(position), 'smth went wrong....'
                 agents.add_item(Agent(actions, observations, position, str_ident=agent_name, **other))
             elif positions:
@@ -200,6 +205,20 @@ class SpawnAgents(Rule):
                 agents.add_item(Agent(actions, observations, empty_positions.pop(), str_ident=agent_name, **other))
         return []

+    def _get_position(self, spawn_rule, positions, empty_positions, positions_pointer):
+        if spawn_rule and spawn_rule == "random":
+            position = random.choice(([x for x in positions if x in empty_positions]))
+        elif spawn_rule and spawn_rule == "order":
+            position = ([x for x in positions if x in empty_positions])[positions_pointer]
+        else:
+            position = h.get_first([x for x in positions if x in empty_positions])
+        return position
+
+
+class AgentSpawnRule(Rule):
+    def __init__(self, spawn_rule):
+        self.spawn_rule = spawn_rule
+        super().__init__()
+
+
 class DoneAtMaxStepsReached(Rule):


@@ -118,6 +118,10 @@ class Gamestate(object):
         self._floortile_graph = None
         self.tests = StepTests(*tests)

+        # Pointer that defines current spawn points of agents
+        for agent in self.agents_conf:
+            self.agents_conf[agent]["pos_pointer"] = 0
+
     def reset(self):
         self.curr_step = 0
         self.curr_actions = None


@@ -3,17 +3,36 @@ from pathlib import Path
 from marl_factory_grid.algorithms.marl.a2c_dirt import A2C
 from marl_factory_grid.algorithms.utils import load_yaml_file

-if __name__ == '__main__':
+
+def dirt_quadrant_single_agent_training():
     cfg_path = Path('../marl_factory_grid/algorithms/marl/configs/dirt_quadrant_config.yaml')

     train_cfg = load_yaml_file(cfg_path)
     # Use environment config with fixed spawnpoints for eval
     eval_cfg = copy.deepcopy(train_cfg)
-    eval_cfg["env"]["env_name"] = "custom/dirt_quadrant" # Options: two_rooms_one_door_modified, dirt_quadrant
+    eval_cfg["env"]["env_name"] = "custom/dirt_quadrant_eval_config"

     print("Training phase")
     agent = A2C(train_cfg, eval_cfg)
     agent.train_loop()
-    agent.plot_reward_development()
     print("Evaluation phase")
+    # Have consecutive episode for eval in single agent case
+    train_cfg["algorithm"]["pile_all_done"] = "all"
+    # agent.load_agents(["run0", "run1"])
     agent.eval_loop(10)


+def dirt_quadrant_multi_agent_eval():
+    cfg_path = Path('../marl_factory_grid/algorithms/marl/configs/MultiAgentConfigs/dirt_quadrant_config.yaml')
+
+    train_cfg = load_yaml_file(cfg_path)
+    # Use environment config with fixed spawnpoints for eval
+    eval_cfg = copy.deepcopy(train_cfg)
+    eval_cfg["env"]["env_name"] = "custom/MultiAgentConfigs/dirt_quadrant_eval_config"
+
+    agent = A2C(train_cfg, eval_cfg)
+    print("Evaluation phase")
+    agent.load_agents(["run0", "run1"])
+    agent.eval_loop(10)
+
+
+if __name__ == '__main__':
+    dirt_quadrant_single_agent_training()