Mirror of https://github.com/illiumst/marl-factory-grid.git (synced 2025-07-04 16:41:36 +02:00)

All relevant functional code for the A2C dirt-quadrant setting, with small changes to the environment and different configs for the single-agent and multi-agent settings.
.gitignore (vendored)
@@ -701,3 +701,4 @@ $RECYCLE.BIN/
 # End of https://www.toptal.com/developers/gitignore/api/linux,unity,macos,python,windows,pycharm,notepadpp,visualstudiocode,latex
 /studies/e_1/
 /studies/curious_study/
+/study_out/
@@ -1,4 +1,5 @@
 import copy
+import os
 import random
 
 from scipy import signal
@@ -61,9 +62,22 @@ class A2C:
         # act_dim=6 for dirt_quadrant
         dirt_piles_positions = [self.factory.state.entities['DirtPiles'][pile_idx].pos for pile_idx in
                                 range(len(self.factory.state.entities['DirtPiles']))]
-        obs_dim = 2 + 2*len(dirt_piles_positions)
+        if self.cfg[nms.ALGORITHM]["pile-observability"] == "all":
+            obs_dim = 2 + 2*len(dirt_piles_positions)
+        else:
+            obs_dim = 4
         self.agents = [PolicyGradient(self.factory, agent_id=i, obs_dim=obs_dim) for i in range(self.n_agents)]
         # self.agents[0].pi.load_model_parameters("/Users/julian/Coding/Projects/PyCharmProjects/EDYS/study_out/run5/Wolfgang_PolicyNet_model_parameters.pth")
         self.doors_exist = "Doors" in self.factory.state.entities.keys()
+        if self.cfg[nms.ENV]["save_and_log"]:
+            # Create results folder
+            runs = os.listdir("../study_out/")
+            run_numbers = [int(run[3:]) for run in runs if run[:3] == "run"]
+            next_run_number = max(run_numbers)+1 if run_numbers else 0
+            self.results_path = f"../study_out/run{next_run_number}"
+            os.mkdir(self.results_path)
+            # Save settings in results folder
+            self.save_configs()
 
     @classmethod
     def _as_torch(cls, x):
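The observation size chosen here follows directly from the pile-observability setting; a minimal sketch, using the three pile positions that appear in the dirt-quadrant configs of this commit:

# Minimal sketch of the obs_dim logic above; pile positions taken from the configs in this commit.
dirt_piles_positions = [(9, 9), (4, 5), (1, 1)]

def obs_dim_for(pile_observability):
    # "all": agent (x, y) plus (x, y) of every pile; otherwise only the current target pile.
    if pile_observability == "all":
        return 2 + 2 * len(dirt_piles_positions)
    return 4

print(obs_dim_for("all"))     # 8
print(obs_dim_for("single"))  # 4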
@@ -80,62 +94,149 @@ class A2C:
         actions = [agent.step(self._as_torch(observations[ag_i]).view(-1).to(torch.float32)) for ag_i, agent in enumerate(self.agents)]
         return actions
 
-    def execute_policy(self, observations) -> ListOrTensor:
+    def execute_policy(self, observations, env, cleaned_dirt_piles) -> ListOrTensor:
         # Use deterministic policy for inference
         actions = [agent.policy(self._as_torch(observations[ag_i]).view(-1).to(torch.float32)) for ag_i, agent in enumerate(self.agents)]
+        for agent_idx in range(self.n_agents):
+            if all(cleaned_dirt_piles[agent_idx].values()):
+                actions[agent_idx] = np.array(next(action_i for action_i, a in enumerate(env.state["Agent"][agent_idx].actions) if a.name == "Noop"))
         return actions
 
-    def transform_observations(self, env):
+    def transform_observations(self, env, ordered_dirt_piles, target_pile):
         """ Assumes that agent has observations -DirtPiles and -Self """
         agent_positions = [env.state.moving_entites[agent_idx].pos for agent_idx in range(self.n_agents)]
-        dirt_piles_positions = [env.state.entities['DirtPiles'][pile_idx].pos for pile_idx in range(len(env.state.entities['DirtPiles']))]
-        trans_obs = [torch.zeros(2+2*len(dirt_piles_positions)) for _ in range(len(agent_positions))]
+        if self.cfg[nms.ALGORITHM]["pile-observability"] == "all":
+            trans_obs = [torch.zeros(2+2*len(ordered_dirt_piles[0])) for _ in range(len(agent_positions))]
+        else:
+            # Only show current target pile
+            trans_obs = [torch.zeros(4) for _ in range(len(agent_positions))]
         for i, pos in enumerate(agent_positions):
             agent_x, agent_y = pos[0], pos[1]
             trans_obs[i][0] = agent_x
             trans_obs[i][1] = agent_y
             idx = 2
-            for pos in dirt_piles_positions:
-                trans_obs[i][idx] = pos[0]
-                trans_obs[i][idx + 1] = pos[1]
+            if self.cfg[nms.ALGORITHM]["pile-observability"] == "all":
+                for pile_pos in ordered_dirt_piles[i]:
+                    trans_obs[i][idx] = pile_pos[0]
+                    trans_obs[i][idx + 1] = pile_pos[1]
                     idx += 2
+            else:
+                trans_obs[i][2] = ordered_dirt_piles[i][target_pile[i]][0]
+                trans_obs[i][3] = ordered_dirt_piles[i][target_pile[i]][1]
         return trans_obs
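For pile-observability "all", the flat observation this method builds has the layout [agent_x, agent_y, pile1_x, pile1_y, ...]; a minimal sketch with example positions:

import torch

# Sketch: flat observation for one agent under pile-observability "all" (example positions).
agent_pos = (9, 5)
ordered_dirt_piles = [(9, 9), (4, 5)]

obs = torch.zeros(2 + 2 * len(ordered_dirt_piles))
obs[0], obs[1] = agent_pos
for k, (px, py) in enumerate(ordered_dirt_piles):
    obs[2 + 2 * k], obs[3 + 2 * k] = px, py
print(obs)  # tensor([9., 5., 9., 9., 4., 5.])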
 
     def get_all_observations(self, env):
-        first_trans_obs = self.transform_observations(env)[0]
+        dirt_piles_positions = [env.state.entities['DirtPiles'][pile_idx].pos for pile_idx in
+                                range(len(env.state.entities['DirtPiles']))]
+        if self.cfg[nms.ALGORITHM]["pile-observability"] == "all":
+            obs = [torch.zeros(2 + 2 * len(dirt_piles_positions))]
+            observations = [[]]
+            # Fill in pile positions
+            idx = 2
+            for pile_pos in dirt_piles_positions:
+                obs[0][idx] = pile_pos[0]
+                obs[0][idx + 1] = pile_pos[1]
+                idx += 2
+        else:
+            # Have one observation layer of the map for each dirt pile
+            obs = [torch.zeros(4) for _ in range(self.n_agents) for _ in dirt_piles_positions]
+            observations = [[] for _ in dirt_piles_positions]
+            for idx, pile_pos in enumerate(dirt_piles_positions):
+                obs[idx][2] = pile_pos[0]
+                obs[idx][3] = pile_pos[1]
         valid_agent_positions = env.state.entities.floorlist
         #observations_shape = (max(t[0] for t in valid_agent_positions) + 2, max(t[1] for t in valid_agent_positions) + 2)
-        observations = []
         for idx, pos in enumerate(valid_agent_positions):
-            obs = copy.deepcopy(first_trans_obs)
-            obs[0] = pos[0]
-            obs[1] = pos[1]
-            observations.append(obs)
+            for obs_layer in range(len(obs)):
+                observation = copy.deepcopy(obs[obs_layer])
+                observation[0] = pos[0]
+                observation[1] = pos[1]
+                observations[obs_layer].append(observation)
 
         return observations
 
     def get_dirt_piles_positions(self, env):
         return [env.state.entities['DirtPiles'][pile_idx].pos for pile_idx in range(len(env.state.entities['DirtPiles']))]
 
-    def get_ordered_dirt_piles(self, env):
-        ordered_dirt_piles = []
+    def get_ordered_dirt_piles(self, env, cleaned_dirt_piles, target_pile):
+        """ Each agent can have its individual pile order """
+        ordered_dirt_piles = [[] for _ in range(self.n_agents)]
+        dirt_pile_positions = self.get_dirt_piles_positions(env)
+        agent_positions = [env.state.moving_entites[agent_idx].pos for agent_idx in range(self.n_agents)]
+        for agent_idx in range(self.n_agents):
             if self.cfg[nms.ALGORITHM]["pile-order"] in ["fixed", "agents"]:
-                ordered_dirt_piles = self.get_dirt_piles_positions(env)
+                ordered_dirt_piles[agent_idx] = dirt_pile_positions
             elif self.cfg[nms.ALGORITHM]["pile-order"] == "random":
-                ordered_dirt_piles = self.get_dirt_piles_positions(env)
+                ordered_dirt_piles[agent_idx] = dirt_pile_positions
                 random.shuffle(ordered_dirt_piles)
             elif self.cfg[nms.ALGORITHM]["pile-order"] == "none":
-                ordered_dirt_piles = None
+                ordered_dirt_piles[agent_idx] = None
+            elif self.cfg[nms.ALGORITHM]["pile-order"] in ["smart", "dynamic"]:
+                # Calculate distances for remaining unvisited dirt piles
+                remaining_target_piles = [pos for pos, value in cleaned_dirt_piles[agent_idx].items() if not value]
+                pile_distances = {pos: 0 for pos in remaining_target_piles}
+                agent_pos = agent_positions[agent_idx]
+                for pos in remaining_target_piles:
+                    pile_distances[pos] = np.abs(agent_pos[0] - pos[0]) + np.abs(agent_pos[1] - pos[1])
+
+                if self.cfg[nms.ALGORITHM]["pile-order"] == "smart":
+                    # Check if there is an agent in line with any of the remaining dirt piles
+                    for pile_pos in remaining_target_piles:
+                        for other_pos in agent_positions:
+                            if other_pos != agent_pos:
+                                if agent_pos[0] == other_pos[0] == pile_pos[0] or agent_pos[1] == other_pos[1] == pile_pos[1]:
+                                    # Get the line between the agent and the goal
+                                    path = self.bresenham(agent_pos[0], agent_pos[1], pile_pos[0], pile_pos[1])
+
+                                    # Check if the entity lies on the path between the agent and the goal
+                                    if other_pos in path:
+                                        pile_distances[pile_pos] += np.abs(agent_pos[0] - other_pos[0]) + np.abs(agent_pos[1] - other_pos[1])
+
+                sorted_pile_distances = dict(sorted(pile_distances.items(), key=lambda item: item[1]))
+                # Insert already visited dirt piles
+                ordered_dirt_piles[agent_idx] = [pos for pos in dirt_pile_positions if pos not in remaining_target_piles]
+                # Fill up with sorted positions
+                for pos in sorted_pile_distances.keys():
+                    ordered_dirt_piles[agent_idx].append(pos)
+
             else:
                 print("Not a valid pile order option.")
                 exit()
 
         return ordered_dirt_piles
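The "dynamic" branch boils down to a Manhattan-distance sort of the still-uncleaned piles; a minimal sketch with positions from the configs:

import numpy as np

# Sketch of the "dynamic" ordering: sort remaining piles by Manhattan distance from the agent.
agent_pos = (9, 5)
cleaned = {(9, 9): False, (4, 5): False, (1, 1): True}  # (1, 1) already cleaned
remaining = [pos for pos, done in cleaned.items() if not done]
distances = {pos: np.abs(agent_pos[0] - pos[0]) + np.abs(agent_pos[1] - pos[1]) for pos in remaining}
order = sorted(distances, key=distances.get)
print(order)  # [(9, 9), (4, 5)] -> distances 4 vs. 5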
 
+    def bresenham(self, x0, y0, x1, y1):
+        """Bresenham's line algorithm to get the coordinates of a line between two points."""
+        dx = np.abs(x1 - x0)
+        dy = np.abs(y1 - y0)
+        sx = 1 if x0 < x1 else -1
+        sy = 1 if y0 < y1 else -1
+        err = dx - dy
+
+        coordinates = []
+        while True:
+            coordinates.append((x0, y0))
+            if x0 == x1 and y0 == y1:
+                break
+            e2 = 2 * err
+            if e2 > -dy:
+                err -= dy
+                x0 += sx
+            if e2 < dx:
+                err += dx
+                y0 += sy
+        return coordinates
+
+    def update_ordered_dirt_piles(self, agent_idx, cleaned_dirt_piles, ordered_dirt_piles, env, target_pile):
+        # Only update ordered_dirt_piles for the agent that reached its target pile
+        updated_ordered_dirt_piles = self.get_ordered_dirt_piles(env, cleaned_dirt_piles, target_pile)
+        for i in range(len(ordered_dirt_piles[agent_idx])):
+            ordered_dirt_piles[agent_idx][i] = updated_ordered_dirt_piles[agent_idx][i]
 
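The line helper can be exercised on grid positions that appear in the configs; a standalone sketch (module-level copy of the method body, for illustration only):

import numpy as np

def bresenham(x0, y0, x1, y1):
    # Standalone copy of the line-walking helper above, kept identical for illustration.
    dx, dy = np.abs(x1 - x0), np.abs(y1 - y0)
    sx, sy = (1 if x0 < x1 else -1), (1 if y0 < y1 else -1)
    err = dx - dy
    coords = []
    while True:
        coords.append((x0, y0))
        if x0 == x1 and y0 == y1:
            break
        e2 = 2 * err
        if e2 > -dy:
            err -= dy
            x0 += sx
        if e2 < dx:
            err += dx
            y0 += sy
    return coords

# An agent at (9, 1) heading to the pile at (9, 9) with another agent standing at (9, 5):
path = bresenham(9, 1, 9, 9)
print((9, 5) in path)  # True -> the "smart" order adds a detour penalty for this pile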
     def distribute_indices(self, env):
         indices = []
         n_dirt_piles = len(self.get_dirt_piles_positions(env))
-        if n_dirt_piles == 1 or self.cfg[nms.ALGORITHM]["pile-order"] in ["fixed", "random", "none"]:
+        if n_dirt_piles == 1 or self.cfg[nms.ALGORITHM]["pile-order"] in ["fixed", "random", "none", "dynamic", "smart"]:
             indices = [[0] for _ in range(self.n_agents)]
         else:
             base_count = n_dirt_piles // self.n_agents
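A sketch of the index split distribute_indices performs when each agent gets its own subset of piles; only base_count is visible in the hunk, so the remainder handling below is an assumption:

# Hypothetical sketch of splitting pile indices between agents (remainder handling assumed).
n_dirt_piles, n_agents = 3, 2
base_count = n_dirt_piles // n_agents
remainder = n_dirt_piles % n_agents
indices, start = [], 0
for agent_idx in range(n_agents):
    count = base_count + (1 if agent_idx < remainder else 0)
    indices.append(list(range(start, start + count)))
    start += count
print(indices)  # [[0, 1], [2]]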
@@ -152,6 +253,12 @@ class A2C:
 
     def update_target_pile(self, env, agent_idx, target_pile):
         indices = self.distribute_indices(env)
+        if self.cfg[nms.ALGORITHM]["pile-order"] in ["fixed", "random", "none", "dynamic", "smart"]:
+            if target_pile[agent_idx] + 1 < len(self.get_dirt_piles_positions(env)):
+                target_pile[agent_idx] += 1
+            else:
+                target_pile[agent_idx] = 0
+        else:
             if target_pile[agent_idx] + 1 in indices[agent_idx]:
                 target_pile[agent_idx] += 1
 
@@ -166,7 +273,7 @@ class A2C:
         for agent_idx, agent in enumerate(self.agents):
             agent_obs = self._as_torch((obs)[agent_idx]).view(-1).to(torch.float32)
             # If agent already reached its target
-            if list(cleaned_dirt_piles.values())[target_pile[agent_idx]]:
+            if all(cleaned_dirt_piles[agent_idx].values()):
                 action.append(next(action_i for action_i, a in enumerate(env.state["Agent"][agent_idx].actions) if a.name == "Noop"))
                 if not det:
                     # Include agent experience entry manually
@@ -238,31 +345,39 @@ class A2C:
 
         # Execute real step in environment
         for idx, pos in enumerate(agent_positions):
-            if pos in cleaned_dirt_piles.keys() and not cleaned_dirt_piles[pos]:
+            if pos in cleaned_dirt_piles[idx].keys() and not cleaned_dirt_piles[idx][pos]:
                 action[idx] = np.array(4)
                 # Collect dirt
                 _, next_obs, reward, done, info = env.step(action)
-                cleaned_dirt_piles[pos] = True
+                cleaned_dirt_piles[idx][pos] = True
                 break"""
 
         # Only simulate collecting the dirt
         for idx, pos in enumerate(agent_positions):
-            if pos in self.get_dirt_piles_positions(env) and not cleaned_dirt_piles[pos]:
+            if pos in self.get_dirt_piles_positions(env) and not cleaned_dirt_piles[idx][pos]:
                 # print(env.state.entities["Agent"][idx], pos, idx, target_pile, ordered_dirt_piles)
                 # If dirt piles should be cleaned in a specific order
-                if ordered_dirt_piles:
-                    if pos == ordered_dirt_piles[target_pile[idx]]:
-                        reward[idx] += 1 # 1
-                        cleaned_dirt_piles[pos] = True
+                if ordered_dirt_piles[idx]:
+                    if pos == ordered_dirt_piles[idx][target_pile[idx]]:
+                        reward[idx] += 50 # 1
+                        cleaned_dirt_piles[idx][pos] = True
                         # Set pointer to next dirt pile
                         self.update_target_pile(env, idx, target_pile)
+                        self.update_ordered_dirt_piles(idx, cleaned_dirt_piles, ordered_dirt_piles, env, target_pile)
+                        if self.cfg[nms.ALGORITHM]["pile_all_done"] == "single":
+                            done = True
+                            if all(cleaned_dirt_piles[idx].values()):
+                                # Reset cleaned_dirt_piles indicator
+                                for pos in dirt_piles_positions:
+                                    cleaned_dirt_piles[idx][pos] = False
                         break
                 else:
-                    reward[idx] += 1 # 1
-                    cleaned_dirt_piles[pos] = True
+                    reward[idx] += 50 # 1
+                    cleaned_dirt_piles[idx][pos] = True
                     break
 
-        if all(cleaned_dirt_piles.values()):
+        if self.cfg[nms.ALGORITHM]["pile_all_done"] == "all":
+            if all([all(cleaned_dirt_piles[i].values()) for i in range(self.n_agents)]):
                 done = True
 
         return reward, done
@@ -271,7 +386,10 @@ class A2C:
         with torch.inference_mode(False):
             for ag_i, agent in enumerate(self.agents):
                 # Get states, actions, rewards and values from rollout buffer
-                (s, a, R, V) = agent.finish_episode()
+                data = agent.finish_episode()
+                # Chunk episode data, such that there will be no memory failure for very long episodes
+                chunks = self.split_into_chunks(data)
+                for (s, a, R, V) in chunks:
                     # Calculate discounted return and advantage
                     G = cumulate_discount(R, self.cfg[nms.ALGORITHM]["gamma"])
                     if self.cfg[nms.ALGORITHM]["advantage"] == "Reinforce":
@@ -291,6 +409,34 @@ class A2C:
                     # Update policy and value net of agent with experience from rollout buffer
                     agent.train(*rollout)
 
+    def split_into_chunks(self, data_tuple):
+        result = [data_tuple]
+        chunk_size = self.cfg[nms.ALGORITHM]["chunk-episode"]
+        if chunk_size > 0:
+            # Get the maximum length of the lists in the tuple to handle different lengths
+            max_length = max(len(lst) for lst in data_tuple)
+
+            # Prepare a list to store the result
+            result = []
+
+            # Split each list into chunks and add them to the result
+            for i in range(0, max_length, chunk_size):
+                # Create a sublist containing the ith chunk from each list
+                sublist = [lst[i:i + chunk_size] for lst in data_tuple if i < len(lst)]
+                result.append(sublist)
+
+        return result
+
+    def set_agent_spawnpoint(self, env):
+        for agent_idx in range(self.n_agents):
+            agent_name = list(env.state.agents_conf.keys())[agent_idx]
+            current_pos_pointer = env.state.agents_conf[agent_name]["pos_pointer"]
+            # Making the reset dependent on the number of spawnpoints and not the number of dirt piles allows
+            # for having multiple subsequent spawnpoints with the same target pile
+            if current_pos_pointer == len(env.state.agents_conf[agent_name]['positions']) - 1:
+                env.state.agents_conf[agent_name]["pos_pointer"] = 0
+            else:
+                env.state.agents_conf[agent_name]["pos_pointer"] += 1
+
     @torch.no_grad()
     def train_loop(self):
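What split_into_chunks does can be reproduced with plain lists; a sketch with dummy (s, a, R, V) data and a small chunk size:

# Sketch of the episode chunking above: each list in the tuple is cut into slices of chunk_size.
data_tuple = (list(range(5)), list(range(5)), list(range(5)), list(range(5)))  # dummy (s, a, R, V)
chunk_size = 2
max_length = max(len(lst) for lst in data_tuple)
chunks = [[lst[i:i + chunk_size] for lst in data_tuple if i < len(lst)]
          for i in range(0, max_length, chunk_size)]
print(chunks[0])    # [[0, 1], [0, 1], [0, 1], [0, 1]]
print(len(chunks))  # 3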
@ -301,19 +447,28 @@ class A2C:
|
|||||||
global_steps, episode = 0, 0
|
global_steps, episode = 0, 0
|
||||||
dirt_piles_positions = self.get_dirt_piles_positions(env)
|
dirt_piles_positions = self.get_dirt_piles_positions(env)
|
||||||
used_actions = {i:0 for i in range(len(env.state.entities["Agent"][0]._actions))} # Assume both agents have the same actions
|
used_actions = {i:0 for i in range(len(env.state.entities["Agent"][0]._actions))} # Assume both agents have the same actions
|
||||||
|
target_pile = [partition[0] for partition in self.distribute_indices(env)] # pointer that points to the target pile for each agent. (point to same pile, point to different piles)
|
||||||
|
cleaned_dirt_piles = [{pos: False for pos in dirt_piles_positions} for _ in range(self.n_agents)] # Have own dictionary for each agent
|
||||||
|
|
||||||
while global_steps < max_steps:
|
while global_steps < max_steps:
|
||||||
print(global_steps)
|
print(global_steps)
|
||||||
obs = env.reset() # !!!!!!!!Commented seems to work better? Only if a fixed spawnpoint is given
|
obs = env.reset() # !!!!!!!!Commented seems to work better? Only if a fixed spawnpoint is given
|
||||||
print([env.state.moving_entites[agent_idx].pos for agent_idx in range(self.n_agents)])
|
self.set_agent_spawnpoint(env)
|
||||||
|
ordered_dirt_piles = self.get_ordered_dirt_piles(env, cleaned_dirt_piles, target_pile)
|
||||||
|
# Reset current target pile at episode begin if all piles have to be cleaned in one episode
|
||||||
|
if self.cfg[nms.ALGORITHM]["pile_all_done"] == "all":
|
||||||
|
target_pile = [partition[0] for partition in self.distribute_indices(env)]
|
||||||
|
cleaned_dirt_piles = [{pos: False for pos in dirt_piles_positions} for _ in range(self.n_agents)]
|
||||||
|
"""passed_fields = [[] for _ in range(self.n_agents)]"""
|
||||||
|
|
||||||
"""obs = list(obs.values())"""
|
"""obs = list(obs.values())"""
|
||||||
obs = self.transform_observations(env)
|
obs = self.transform_observations(env, ordered_dirt_piles, target_pile)
|
||||||
done, rew_log = [False] * self.n_agents, 0
|
done, rew_log = [False] * self.n_agents, 0
|
||||||
|
|
||||||
cleaned_dirt_piles = {pos: False for pos in dirt_piles_positions}
|
print("Agents spawnpoints:", [env.state.moving_entites[agent_idx].pos for agent_idx in range(self.n_agents)])
|
||||||
ordered_dirt_piles = self.get_ordered_dirt_piles(env)
|
print("Agents target piles:", target_pile)
|
||||||
target_pile = [partition[0] for partition in self.distribute_indices(env)] # pointer that points to the target pile for each agent. (point to same pile, point to different piles)
|
print("Agents initial observation:", obs)
|
||||||
"""passed_fields = [[] for _ in range(self.n_agents)]"""
|
print("Agents cleaned dirt piles:", cleaned_dirt_piles)
|
||||||
|
|
||||||
# Add Clean and Noop actions to agent actions so that they can be executed when the agent comes on a dirpile
|
# Add Clean and Noop actions to agent actions so that they can be executed when the agent comes on a dirpile
|
||||||
"""for i in range(self.n_agents):
|
"""for i in range(self.n_agents):
|
||||||
@@ -326,12 +481,16 @@ class A2C:
                 _, next_obs, reward, done, info = env.step(action)
                 if done:
                     print("DoneAtMaxStepsReached:", len(self.agents[0]._episode))
-                next_obs = self.transform_observations(env)
+                next_obs = self.transform_observations(env, ordered_dirt_piles, target_pile)
 
                 # Add small negative reward if agent has moved away from the target_pile
-                reward = self.reward_distance(env, obs, target_pile, reward)
+                # reward = self.reward_distance(env, obs, target_pile, reward)
 
-                # Check and handle if agent is on field with dirt
+                # Check and handle if agent is on a field with dirt. This method can change the observation for the next step.
+                # If pile_all_done is "single", the episode ends once an agent reaches its target pile and the new episode begins
+                # with the updated observation. The observation that is saved to the rollout buffer, which resulted in reaching
+                # the target pile, should not be updated before saving. Thus, the self.transform_observations call must happen
+                # before this method is called.
                 reward, done = self.handle_dirt(env, cleaned_dirt_piles, ordered_dirt_piles, target_pile, reward, done)
 
                 if n_steps != 0 and (global_steps + 1) % n_steps == 0:
@ -361,12 +520,113 @@ class A2C:
|
|||||||
self.reward_development.append(rew_log)
|
self.reward_development.append(rew_log)
|
||||||
episode += 1
|
episode += 1
|
||||||
|
|
||||||
|
self.plot_reward_development()
|
||||||
|
if self.cfg[nms.ENV]["save_and_log"]:
|
||||||
|
self.create_info_maps(env, used_actions, target_pile)
|
||||||
|
self.save_agent_models()
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
@torch.inference_mode(True)
|
||||||
|
def eval_loop(self, n_episodes, render=False):
|
||||||
|
env = self.eval_factory
|
||||||
|
if self.cfg[nms.ENV][nms.EVAL_RENDER]:
|
||||||
|
env.render()
|
||||||
|
episode, results = 0, []
|
||||||
|
dirt_piles_positions = self.get_dirt_piles_positions(env)
|
||||||
|
target_pile = [partition[0] for partition in self.distribute_indices(env)] # pointer that points to the target pile for each agent. (point to same pile, point to different piles)
|
||||||
|
cleaned_dirt_piles = [{pos: False for pos in dirt_piles_positions} for _ in range(self.n_agents)]
|
||||||
|
|
||||||
|
while episode < n_episodes:
|
||||||
|
obs = env.reset()
|
||||||
|
self.set_agent_spawnpoint(env)
|
||||||
|
"""obs = list(obs.values())"""
|
||||||
|
# Reset current target pile at episode begin if all piles have to be cleaned in one episode
|
||||||
|
if self.cfg[nms.ALGORITHM]["pile_all_done"] == "all":
|
||||||
|
target_pile = [partition[0] for partition in self.distribute_indices(env)]
|
||||||
|
cleaned_dirt_piles = [{pos: False for pos in dirt_piles_positions} for _ in range(self.n_agents)]
|
||||||
|
|
||||||
|
ordered_dirt_piles = self.get_ordered_dirt_piles(env, cleaned_dirt_piles, target_pile)
|
||||||
|
|
||||||
|
obs = self.transform_observations(env, ordered_dirt_piles, target_pile)
|
||||||
|
done, rew_log, eps_rew = [False] * self.n_agents, 0, torch.zeros(self.n_agents)
|
||||||
|
|
||||||
|
# Add Clean and Noop actions to agent actions so that they can be executed when the agent comes on a dirpile
|
||||||
|
"""for i in range(self.n_agents):
|
||||||
|
self.factory.state['Agent'][i].actions.extend([Clean(), Noop()])"""
|
||||||
|
|
||||||
|
while not all(done):
|
||||||
|
action = self.use_door_or_move(env, obs, cleaned_dirt_piles, target_pile, det=True) if self.doors_exist else self.execute_policy(obs, env, cleaned_dirt_piles) # zero exploration
|
||||||
|
print(action)
|
||||||
|
_, next_obs, reward, done, info = env.step(action)
|
||||||
|
if done:
|
||||||
|
print("DoneAtMaxStepsReached:", len(self.agents[0]._episode))
|
||||||
|
|
||||||
|
# Add small negative reward if agent has moved away from the target_pile
|
||||||
|
# reward = self.reward_distance(env, obs, target_pile, reward)
|
||||||
|
|
||||||
|
# Check and handle if agent is on field with dirt
|
||||||
|
reward, done = self.handle_dirt(env, cleaned_dirt_piles, ordered_dirt_piles, target_pile, reward, done)
|
||||||
|
|
||||||
|
# Get transformed next_obs that might have been updated because of self.handle_dirt.
|
||||||
|
# For eval, where pile_all_done is "all", it's mandatory that the potential change of the target pile
|
||||||
|
# in the observation, caused by self.handle_dirt, is already considered when the next action is calculated.
|
||||||
|
next_obs = self.transform_observations(env, ordered_dirt_piles, target_pile)
|
||||||
|
|
||||||
|
done = [done] * self.n_agents if isinstance(done, bool) else done
|
||||||
|
|
||||||
|
if self.cfg[nms.ENV][nms.EVAL_RENDER]:
|
||||||
|
env.render()
|
||||||
|
|
||||||
|
obs = next_obs
|
||||||
|
|
||||||
|
episode += 1
|
||||||
|
|
||||||
|
def plot_reward_development(self):
|
||||||
|
smoothed_data = np.convolve(self.reward_development, np.ones(10) / 10, mode='valid')
|
||||||
|
plt.plot(smoothed_data)
|
||||||
|
plt.ylim([-10, max(smoothed_data) + 20])
|
||||||
|
plt.title('Smoothed Reward Development')
|
||||||
|
plt.xlabel('Episode')
|
||||||
|
plt.ylabel('Reward')
|
||||||
|
if self.cfg[nms.ENV]["save_and_log"]:
|
||||||
|
plt.savefig(f"{self.results_path}/smoothed_reward_development.png")
|
||||||
|
plt.show()
|
||||||
|
|
||||||
|
def save_configs(self):
|
||||||
|
with open(f"{self.results_path}/MARL_config.txt", "w") as txt_file:
|
||||||
|
txt_file.write(str(self.cfg))
|
||||||
|
with open(f"{self.results_path}/train_env_config.txt", "w") as txt_file:
|
||||||
|
txt_file.write(str(self.factory.conf))
|
||||||
|
with open(f"{self.results_path}/eval_env_config.txt", "w") as txt_file:
|
||||||
|
txt_file.write(str(self.eval_factory.conf))
|
||||||
|
|
||||||
|
def save_agent_models(self):
|
||||||
|
for idx, agent in enumerate(self.agents):
|
||||||
|
agent_name = list(self.factory.state.agents_conf.keys())[idx]
|
||||||
|
agent.pi.save_model_parameters(self.results_path, agent_name)
|
||||||
|
agent.vf.save_model_parameters(self.results_path, agent_name)
|
||||||
|
|
||||||
|
def load_agents(self, runs_list):
|
||||||
|
for idx, run in enumerate(runs_list):
|
||||||
|
run_path = f"../study_out/{run}"
|
||||||
|
agent_name = list(self.eval_factory.state.agents_conf.keys())[idx]
|
||||||
|
self.agents[idx].pi.load_model_parameters(f"{run_path}/{agent_name}_PolicyNet_model_parameters.pth")
|
||||||
|
self.agents[idx].vf.load_model_parameters(f"{run_path}/{agent_name}_ValueNet_model_parameters.pth")
|
||||||
|
|
||||||
|
def create_info_maps(self, env, used_actions, target_pile):
|
||||||
# Create value map
|
# Create value map
|
||||||
observations_shape = (max(t[0] for t in env.state.entities.floorlist) + 2, max(t[1] for t in env.state.entities.floorlist) + 2)
|
all_valid_observations = self.get_all_observations(env)
|
||||||
|
dirt_piles_positions = self.get_dirt_piles_positions(env)
|
||||||
|
with open(f"{self.results_path}/info_maps.txt", "w") as txt_file:
|
||||||
|
for obs_layer, pos in enumerate(dirt_piles_positions):
|
||||||
|
observations_shape = (
|
||||||
|
max(t[0] for t in env.state.entities.floorlist) + 2, max(t[1] for t in env.state.entities.floorlist) + 2)
|
||||||
value_maps = [np.zeros(observations_shape) for _ in self.agents]
|
value_maps = [np.zeros(observations_shape) for _ in self.agents]
|
||||||
likeliest_action = [np.full(observations_shape, np.NaN) for _ in self.agents]
|
likeliest_action = [np.full(observations_shape, np.NaN) for _ in self.agents]
|
||||||
action_probabilities = [np.zeros((observations_shape[0],observations_shape[1], env.action_space[0].n)) for _ in self.agents]
|
action_probabilities = [np.zeros((observations_shape[0], observations_shape[1], env.action_space[0].n)) for
|
||||||
for obs in self.get_all_observations(env):
|
_ in self.agents]
|
||||||
|
for obs in all_valid_observations[obs_layer]:
|
||||||
"""obs = self._as_torch(obs).view(-1).to(torch.float32)"""
|
"""obs = self._as_torch(obs).view(-1).to(torch.float32)"""
|
||||||
for idx, agent in enumerate(self.agents):
|
for idx, agent in enumerate(self.agents):
|
||||||
"""indices = np.where(obs[1] == 1) # Get agent position on grid (1 indicates the position)
|
"""indices = np.where(obs[1] == 1) # Get agent position on grid (1 indicates the position)
|
||||||
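The smoothing applied in plot_reward_development is a plain 10-episode moving average; a minimal sketch with dummy rewards:

import numpy as np

rewards = np.arange(20, dtype=float)                             # dummy reward_development
smoothed = np.convolve(rewards, np.ones(10) / 10, mode='valid')  # same call as above
print(len(rewards), len(smoothed))                               # 20 11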
@@ -380,78 +640,38 @@ class A2C:
                         except:
                             pass
 
+                txt_file.write("=======Value Maps=======\n")
                 print("=======Value Maps=======")
                 for agent_idx, vmap in enumerate(value_maps):
-            print(f"Value map of agent {agent_idx}:")
+                    txt_file.write(f"Value map of agent {agent_idx} for target pile {pos}:\n")
+                    print(f"Value map of agent {agent_idx} for target pile {pos}:")
                     vmap = self._as_torch(vmap).round(decimals=4)
                     max_digits = max(len(str(vmap.max().item())), len(str(vmap.min().item())))
                     for idx, row in enumerate(vmap):
+                        txt_file.write(' '.join(f" {elem:>{max_digits + 1}}" for elem in row.tolist()))
+                        txt_file.write("\n")
                         print(' '.join(f" {elem:>{max_digits + 1}}" for elem in row.tolist()))
+                    txt_file.write("\n")
+                txt_file.write("=======Likeliest Action=======\n")
                 print("=======Likeliest Action=======")
                 for agent_idx, amap in enumerate(likeliest_action):
-            print(f"Likeliest action map of agent {agent_idx}:")
+                    txt_file.write(f"Likeliest action map of agent {agent_idx} for target pile {pos}:\n")
+                    print(f"Likeliest action map of agent {agent_idx} for target pile {pos}:")
+                    txt_file.write(np.array2string(amap))
                     print(amap)
+                    txt_file.write("\n")
+                txt_file.write("=======Action Probabilities=======\n")
                 print("=======Action Probabilities=======")
                 for agent_idx, pmap in enumerate(action_probabilities):
-            print(f"Action probability map of agent {agent_idx}:")
+                    txt_file.write(f"Action probability map of agent {agent_idx} for target pile {pos}:\n")
+                    print(f"Action probability map of agent {agent_idx} for target pile {pos}:")
                     for d in range(pmap.shape[0]):
                         row = '['
                         for r in range(pmap.shape[1]):
                             row += "[" + ', '.join(f"{x:7.4f}" for x in pmap[d, r]) + "]"
+                        txt_file.write(row + "]")
+                        txt_file.write("\n")
                         print(row + "]")
+            txt_file.write(f"Used actions: {used_actions}\n")
             print("Used actions:", used_actions)
 
-    @torch.inference_mode(True)
-    def eval_loop(self, n_episodes, render=False):
-        env = self.eval_factory
-        if self.cfg[nms.ENV][nms.EVAL_RENDER]:
-            env.render()
-        episode, results = 0, []
-        dirt_piles_positions = self.get_dirt_piles_positions(env)
-
-        while episode < n_episodes:
-            obs = env.reset()
-            """obs = list(obs.values())"""
-            obs = self.transform_observations(env)
-            done, rew_log, eps_rew = [False] * self.n_agents, 0, torch.zeros(self.n_agents)
-
-            cleaned_dirt_piles = {pos: False for pos in dirt_piles_positions}
-            ordered_dirt_piles = self.get_ordered_dirt_piles(env)
-            target_pile = [partition[0] for partition in self.distribute_indices(env)]
-
-            # Add Clean and Noop actions to agent actions so that they can be executed when the agent comes on a dirpile
-            """for i in range(self.n_agents):
-                self.factory.state['Agent'][i].actions.extend([Clean(), Noop()])"""
-
-            while not all(done):
-                action = self.use_door_or_move(env, obs, cleaned_dirt_piles, target_pile, det=True) if self.doors_exist else self.execute_policy(obs)  # zero exploration
-                print(action)
-                _, next_obs, reward, done, info = env.step(action)
-                if done:
-                    print("DoneAtMaxStepsReached:", len(self.agents[0]._episode))
-                next_obs = self.transform_observations(env)
-
-                # Add small negative reward if agent has moved away from the target_pile
-                reward = self.reward_distance(env, obs, target_pile, reward)
-
-                # Check and handle if agent is on field with dirt
-                reward, done = self.handle_dirt(env, cleaned_dirt_piles, ordered_dirt_piles, target_pile, reward, done)
-
-                done = [done] * self.n_agents if isinstance(done, bool) else done
-
-                if self.cfg[nms.ENV][nms.EVAL_RENDER]:
-                    env.render()
-
-                obs = next_obs
-
-            episode += 1
-
-    def plot_reward_development(self):
-        plt.plot(self.reward_development)
-        plt.title('Reward development')
-        plt.xlabel('Episode')
-        plt.ylabel('Reward')
-        plt.savefig("/Users/julian/Coding/Projects/PyCharmProjects/EDYS/study_out/two_rooms_one_door_modified_runs/reward_development.png")
-        plt.show()
@@ -1,6 +1,6 @@
-import numpy as np; import torch as th; import scipy as sp; import gym
-import os; from collections import deque; import matplotlib.pyplot as plt
-from tqdm import tqdm
+import numpy as np; import torch as th; import scipy as sp;
+from collections import deque
+from torch import nn
 
 # RLLab Magic for calculating the discounted return G(t) = R(t) + gamma * R(t-1)
 # cf. https://github.com/rll/rllab/blob/ba78e4c16dc492982e648f117875b22af3965579/rllab/misc/special.py#L107
@@ -14,8 +14,25 @@ class Net(th.nn.Module):
                                      for layer in [th.nn.Linear(*io), a()]])
         self.optimizer = th.optim.Adam(self.net.parameters(), lr=lr)
 
+        # Initialize weights uniformly, so that for the policy net all actions have approximately the same probability in the beginning
+        for module in self.modules():
+            if isinstance(module, nn.Linear):
+                nn.init.uniform_(module.weight, a=-0.1, b=0.1)
+                if module.bias is not None:
+                    nn.init.uniform_(module.bias, a=-0.1, b=0.1)
+
+    def save_model(self, path, agent_name):
+        th.save(self.net, f"{path}/{agent_name}_{self.__class__.__name__}_model.pth")
+
+    def save_model_parameters(self, path, agent_name):
+        th.save(self.net.state_dict(), f"{path}/{agent_name}_{self.__class__.__name__}_model_parameters.pth")
+
+    def load_model_parameters(self, path):
+        self.net.load_state_dict(th.load(path))
+        self.net.eval()
+
 class ValueNet(Net):
-    def __init__(self, obs_dim, hidden_sizes=[64,64], activation=th.nn.Tanh, lr=1e-3):
+    def __init__(self, obs_dim, hidden_sizes=[64,64], activation=th.nn.ReLU, lr=1e-3):
         super().__init__([obs_dim] + hidden_sizes + [1], activation, lr)
     def forward(self, obs): return self.net(obs)
     def loss(self, states, returns): return ((returns - self(states))**2).mean()
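A usage sketch for the new parameter helpers, assuming Net/ValueNet live in marl_factory_grid.algorithms.marl.networks as the configs suggest; the run directory and agent name below are examples:

import torch as th
from marl_factory_grid.algorithms.marl.networks import ValueNet  # assumed module location

vf = ValueNet(obs_dim=8)  # 2 agent coords + 2 coords for each of 3 piles
vf.save_model_parameters("../study_out/run0", "Wolfgang")  # assumes ../study_out/run0 exists

vf_restored = ValueNet(obs_dim=8)
vf_restored.load_model_parameters("../study_out/run0/Wolfgang_ValueNet_model_parameters.pth")
with th.no_grad():
    print(vf_restored(th.zeros(8)))  # value estimate from the restored network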
@@ -0,0 +1,32 @@
+agent:
+  classname: marl_factory_grid.algorithms.marl.networks.RecurrentAC
+  n_agents: 2
+  obs_emb_size: 96
+  action_emb_size: 16
+  hidden_size_actor: 64
+  hidden_size_critic: 64
+  use_agent_embedding: False
+env:
+  classname: marl_factory_grid.configs.custom
+  env_name: "custom/MultiAgentConfigs/dirt_quadrant_train_config"
+  n_agents: 2
+  max_steps: 250
+  pomdp_r: 2
+  stack_n_frames: 0
+  individual_rewards: True
+  train_render: False
+  eval_render: True
+  save_and_log: True
+method: marl_factory_grid.algorithms.marl.LoopSEAC
+algorithm:
+  gamma: 0.99
+  entropy_coef: 0.01
+  vf_coef: 0.05
+  n_steps: 0  # How much experience should be sampled at most (n-TD) until the next value and policy update is performed. Default 0: MC
+  max_steps: 200000
+  advantage: "Advantage-AC"  # Options: "Advantage-AC", "TD-Advantage-AC", "Reinforce"
+  pile-order: "dynamic"  # Options: "fixed", "random", "none", "agents", "dynamic", "smart" (use "fixed", "random" and "none" for single-agent training and the others for multi-agent inference)
+  pile-observability: "single"  # Options: "single", "all"
+  pile_all_done: "all"  # Options: "single", "all" ("single" for training, "all" for eval)
+  chunk-episode: 20000  # Chunk size (0 = update networks with the full episode at once)
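This config is loaded the same way the updated run script loads the single-agent one; a short sketch using the path referenced there:

from pathlib import Path
from marl_factory_grid.algorithms.utils import load_yaml_file

cfg = load_yaml_file(Path('../marl_factory_grid/algorithms/marl/configs/MultiAgentConfigs/dirt_quadrant_config.yaml'))
print(cfg["algorithm"]["pile-order"])                    # "dynamic"
print(cfg["env"]["n_agents"], cfg["agent"]["n_agents"])  # 2 2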
@@ -8,7 +8,7 @@ agent:
   use_agent_embedding: False
 env:
   classname: marl_factory_grid.configs.custom
-  env_name: "custom/dirt_quadrant_random_pos"
+  env_name: "custom/dirt_quadrant_train_config"
   n_agents: 1
   max_steps: 250
   pomdp_r: 2
@@ -16,13 +16,17 @@ env:
   individual_rewards: True
   train_render: False
   eval_render: True
+  save_and_log: False
 method: marl_factory_grid.algorithms.marl.LoopSEAC
 algorithm:
   gamma: 0.99
   entropy_coef: 0.01
   vf_coef: 0.05
   n_steps: 0  # How much experience should be sampled at most (n-TD) until the next value and policy update is performed. Default 0: MC
-  max_steps: 80000
+  max_steps: 270000
   advantage: "Advantage-AC"  # Options: "Advantage-AC", "TD-Advantage-AC", "Reinforce"
-  pile-order: "fixed"  # Options: "fixed", "random", "none", "agents"
+  pile-order: "fixed"  # Options: "fixed", "random", "none", "agents", "dynamic", "smart" (use "fixed", "random" and "none" for single-agent training and the others for multi-agent inference)
+  pile-observability: "single"  # Options: "single", "all"
+  pile_all_done: "single"  # Options: "single", "all" ("single" for training, "all" for eval)
+  chunk-episode: 20000  # Chunk size (0 = update networks with the full episode at once)
|
|||||||
|
General:
|
||||||
|
# RNG-seed to sample the same "random" numbers every time, to make the different runs comparable.
|
||||||
|
env_seed: 69
|
||||||
|
# Individual vs global rewards
|
||||||
|
individual_rewards: true
|
||||||
|
# The level.txt file to load from marl_factory_grid/levels
|
||||||
|
level_name: quadrant
|
||||||
|
# Radius of Partially observable Markov decision process
|
||||||
|
pomdp_r: 0 # default 3
|
||||||
|
# Print all messages and events
|
||||||
|
verbose: false
|
||||||
|
# Run tests
|
||||||
|
tests: false
|
||||||
|
|
||||||
|
# In the "clean and bring" Scenario one agent aims to pick up all items and drop them at drop-off locations while all
|
||||||
|
# other agents aim to clean dirt piles.
|
||||||
|
Agents:
|
||||||
|
# The clean agents
|
||||||
|
Sigmund:
|
||||||
|
Actions:
|
||||||
|
- Move4
|
||||||
|
#- Clean
|
||||||
|
- Noop
|
||||||
|
Observations:
|
||||||
|
# - Walls
|
||||||
|
# - Other
|
||||||
|
- DirtPiles
|
||||||
|
- Self
|
||||||
|
Positions:
|
||||||
|
- (9,1)
|
||||||
|
#- (9,9)
|
||||||
|
#- (4,5)
|
||||||
|
Wolfgang:
|
||||||
|
Actions:
|
||||||
|
- Move4
|
||||||
|
#- Clean
|
||||||
|
- Noop
|
||||||
|
Observations:
|
||||||
|
# - Walls
|
||||||
|
# - Other
|
||||||
|
- DirtPiles
|
||||||
|
- Self
|
||||||
|
Positions:
|
||||||
|
- (9,5)
|
||||||
|
#- (9,9)
|
||||||
|
#- (4,5)
|
||||||
|
|
||||||
|
Entities:
|
||||||
|
DirtPiles:
|
||||||
|
coords_or_quantity: (9,9), (4,5), (1,1) # (4,7), (2,4), (1, 1) # (1, 1), (2,4), (4,7), (7,9), (9,9) # (1, 1), (1,2), (1,3), (2,4), (2,5), (3,6), (4,7), (5,8), (6,8), (7,9), (8,9), (9,9)
|
||||||
|
initial_amount: 0.5 # <1 to ensure that the robot which first attempts to clean this field, can remove the dirt in one action
|
||||||
|
clean_amount: 1
|
||||||
|
dirt_spawn_r_var: 0
|
||||||
|
max_global_amount: 12
|
||||||
|
max_local_amount: 1
|
||||||
|
|
||||||
|
# Rules section specifies the rules governing the dynamics of the environment.
|
||||||
|
Rules:
|
||||||
|
|
||||||
|
# Utilities
|
||||||
|
# This rule defines the collision mechanic, introduces a related DoneCondition and lets you specify rewards.
|
||||||
|
# Can be omitted/ignored if you do not want to take care of collisions at all.
|
||||||
|
WatchCollisions:
|
||||||
|
done_at_collisions: false
|
||||||
|
|
||||||
|
# Done Conditions
|
||||||
|
# Define the conditions for the environment to stop. Either success or a fail conditions.
|
||||||
|
# The environment stops when all dirt is cleaned
|
||||||
|
DoneOnAllDirtCleaned:
|
||||||
|
#DoneAtMaxStepsReached:
|
||||||
|
#max_steps: 200
|
@@ -16,6 +16,23 @@ General:
 # other agents aim to clean dirt piles.
 Agents:
   # The clean agents
+  Sigmund:
+    Actions:
+      - Move4
+      #- Clean
+      #- Noop
+    Observations:
+      # - Walls
+      # - Other
+      - DirtPiles
+      - Self
+    Positions:
+      - (9,1)
+      - (4,5)
+      - (1,1)
+      - (4,5)
+      - (9,1)
+      - (9,9)
   Wolfgang:
     Actions:
       - Move4
@@ -26,32 +43,17 @@ Agents:
       # - Other
       - DirtPiles
       - Self
-    #Positions:
-      #- (9,1)
-      #- (9,2)
-      #- (9,3)
-      #- (9,4)
-      #- (9,5)
-      #- (9,6)
-      #- (9,7)
-      #- (9,8)
-      #- (9,9)
-  #Reiner:
-    #Actions:
-      #- Move4
-      #- Clean
-      #- Noop
-    #Observations:
-      # - Walls
-      # - Other
-      #- DirtPiles
-      #- Self
-    #Positions:
-      #- (9,8) # (9, 4)
+    Positions:
+      - (9,5)
+      - (4,5)
+      - (1,1)
+      - (4,5)
+      - (9,5)
+      - (9,9)
 
 Entities:
   DirtPiles:
-    coords_or_quantity: (1,1) # (4,7), (2,4), (1, 1) # (1, 1), (2,4), (4,7), (7,9), (9,9) # (1, 1), (1,2), (1,3), (2,4), (2,5), (3,6), (4,7), (5,8), (6,8), (7,9), (8,9), (9,9)
+    coords_or_quantity: (9,9), (1,1), (4,5) # (4,7), (2,4), (1, 1) # (1, 1), (2,4), (4,7), (7,9), (9,9) # (1, 1), (1,2), (1,3), (2,4), (2,5), (3,6), (4,7), (5,8), (6,8), (7,9), (8,9), (9,9)
     initial_amount: 0.5 # <1 to ensure that the robot which first attempts to clean this field can remove the dirt in one action
     clean_amount: 1
     dirt_spawn_r_var: 0
@@ -72,4 +74,4 @@ Rules:
   # The environment stops when all dirt is cleaned
   DoneOnAllDirtCleaned:
   #DoneAtMaxStepsReached: # An episode should last for at most max_steps steps
-    #max_steps: 1000
+    #max_steps: 100
@@ -16,6 +16,20 @@ General:
 # other agents aim to clean dirt piles.
 Agents:
   # The clean agents
+  #Sigmund:
+    #Actions:
+      #- Move4
+      #- Clean
+      #- Noop
+    #Observations:
+      # - Walls
+      # - Other
+      #- DirtPiles
+      #- Self
+    #Positions:
+      #- (9,1)
+      #- (9,9)
+      #- (4,5)
   Wolfgang:
     Actions:
       - Move4
@@ -27,23 +41,13 @@ Agents:
       - DirtPiles
      - Self
     Positions:
-      - (9,1)
-  #Reiner:
-    #Actions:
-      #- Move4
-      #- Clean
-      #- Noop
-    #Observations:
-      # - Walls
-      # - Other
-      #- DirtPiles
-      #- Self
-    #Positions:
-      #- (9,8) # (9, 4)
+      - (9,5)
+      #- (9,9)
+      #- (4,5)
 
 Entities:
   DirtPiles:
-    coords_or_quantity: (1,1) # (4,7), (2,4), (1, 1) # (1, 1), (2,4), (4,7), (7,9), (9,9) # (1, 1), (1,2), (1,3), (2,4), (2,5), (3,6), (4,7), (5,8), (6,8), (7,9), (8,9), (9,9)
+    coords_or_quantity: (9,9), (4,5), (1,1) # (4,7), (2,4), (1, 1) # (1, 1), (2,4), (4,7), (7,9), (9,9) # (1, 1), (1,2), (1,3), (2,4), (2,5), (3,6), (4,7), (5,8), (6,8), (7,9), (8,9), (9,9)
     initial_amount: 0.5 # <1 to ensure that the robot which first attempts to clean this field can remove the dirt in one action
     clean_amount: 1
     dirt_spawn_r_var: 0
@@ -0,0 +1,85 @@
+General:
+  # RNG-seed to sample the same "random" numbers every time, to make the different runs comparable.
+  env_seed: 69
+  # Individual vs global rewards
+  individual_rewards: true
+  # The level.txt file to load from marl_factory_grid/levels
+  level_name: quadrant
+  # Radius of the partially observable Markov decision process
+  pomdp_r: 0 # default 3
+  # Print all messages and events
+  verbose: false
+  # Run tests
+  tests: false
+
+# In the "clean and bring" scenario one agent aims to pick up all items and drop them at drop-off locations while all
+# other agents aim to clean dirt piles.
+Agents:
+  # The clean agents
+  #Sigmund:
+    #Actions:
+      #- Move4
+      #- Clean
+      #- Noop
+    #Observations:
+      # - Walls
+      # - Other
+      #- DirtPiles
+      #- Self
+    #Positions:
+      #- (9,1)
+      #- (4,5)
+      #- (1,1)
+      #- (4,5)
+      #- (9,1)
+      #- (9,9)
+  Wolfgang:
+    Actions:
+      - Move4
+      #- Clean
+      #- Noop
+    Observations:
+      # - Walls
+      # - Other
+      - DirtPiles
+      - Self
+    Positions:
+      - (9,5)
+      - (4,5)
+      - (1,1)
+      - (4,5)
+      - (9,5)
+      - (9,9)
+
+Entities:
+  DirtPiles:
+    coords_or_quantity: (9,9), (1,1), (4,5) # (4,7), (2,4), (1, 1) # (1, 1), (2,4), (4,7), (7,9), (9,9) # (1, 1), (1,2), (1,3), (2,4), (2,5), (3,6), (4,7), (5,8), (6,8), (7,9), (8,9), (9,9)
+    initial_amount: 0.5 # <1 to ensure that the robot which first attempts to clean this field can remove the dirt in one action
+    clean_amount: 1
+    dirt_spawn_r_var: 0
+    max_global_amount: 12
+    max_local_amount: 1
+
+# The Rules section specifies the rules governing the dynamics of the environment.
+Rules:
+  # Utilities
+  # This rule defines the collision mechanic, introduces a related DoneCondition and lets you specify rewards.
+  # Can be omitted/ignored if you do not want to take care of collisions at all.
+  WatchCollisions:
+    done_at_collisions: false
+
+  # Done Conditions
+  # Define the conditions for the environment to stop: either success or fail conditions.
+  # The environment stops when all dirt is cleaned.
+  DoneOnAllDirtCleaned:
+  #DoneAtMaxStepsReached: # An episode should last for at most max_steps steps
+    #max_steps: 1000
+
+  # Define how agents spawn.
+  # Options: "random" (spawn agent at a random position from the list of defined positions)
+  #          "first"  (always spawn agent at the first position regardless of the other provided positions)
+  #          "order"  (loop through agent positions)
+  AgentSpawnRule:
+    spawn_rule: "order"
@@ -1,5 +1,5 @@
-MOVEMENTS_VALID: float = -0.01  # default: -0.001
-MOVEMENTS_FAIL: float = -0.1    # default: -0.05
-NOOP: float = -0.01
-COLLISION: float = -0.5
+MOVEMENTS_VALID: float = -1  # default: -0.001
+MOVEMENTS_FAIL: float = -1   # default: -0.05
+NOOP: float = -1
+COLLISION: float = -1
 COLLISION_DONE: float = -1
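With every step now costing -1 and reaching the target pile rewarded with +50 in handle_dirt, the return of a successful approach stays positive as long as the path is shorter than 50 steps; a back-of-the-envelope sketch with a hypothetical path length:

steps_to_pile = 12                          # hypothetical path length on the quadrant level
episode_return = steps_to_pile * (-1) + 50  # per-step penalty and pile reward from this commit
print(episode_return)                       # 38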
@@ -5,6 +5,7 @@ from typing import List, Collection
 
 import numpy as np
 
+import marl_factory_grid
 from marl_factory_grid.environment import rewards as r, constants as c
 from marl_factory_grid.environment.entity.agent import Agent
 from marl_factory_grid.utils import helpers as h
@@ -180,6 +181,11 @@ class SpawnAgents(Rule):
         pass
 
     def on_reset(self, state):
+        spawn_rule = None
+        for rule in state.rules.rules:
+            if isinstance(rule, marl_factory_grid.environment.rules.AgentSpawnRule):
+                spawn_rule = rule.spawn_rule
+
         agents = state[c.AGENT]
         for agent_name, agent_conf in state.agents_conf.items():
             empty_positions = state.entities.empty_positions
@@ -187,10 +193,9 @@ class SpawnAgents(Rule):
             observations = agent_conf['observations'].copy()
             positions = agent_conf['positions'].copy()
             other = agent_conf['other'].copy()
+            positions_pointer = agent_conf['pos_pointer']
 
-            # Spawn agent on random position if multiple spawn points are provided
-            func = random.choice if len(positions) else h.get_first
-            if position := func([x for x in positions if x in empty_positions]):
+            if position := self._get_position(spawn_rule, positions, empty_positions, positions_pointer):
                 assert state.check_pos_validity(position), 'smth went wrong....'
                 agents.add_item(Agent(actions, observations, position, str_ident=agent_name, **other))
             elif positions:
@@ -200,6 +205,20 @@ class SpawnAgents(Rule):
                 agents.add_item(Agent(actions, observations, empty_positions.pop(), str_ident=agent_name, **other))
         return []
 
+    def _get_position(self, spawn_rule, positions, empty_positions, positions_pointer):
+        if spawn_rule and spawn_rule == "random":
+            position = random.choice(([x for x in positions if x in empty_positions]))
+        elif spawn_rule and spawn_rule == "order":
+            position = ([x for x in positions if x in empty_positions])[positions_pointer]
+        else:
+            position = h.get_first([x for x in positions if x in empty_positions])
+
+        return position
+
+class AgentSpawnRule(Rule):
+    def __init__(self, spawn_rule):
+        self.spawn_rule = spawn_rule
+        super().__init__()
+
 class DoneAtMaxStepsReached(Rule):
 
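Outside the Rule machinery, the three spawn options introduced here behave as in this sketch; the positions come from the configs, and h.get_first is assumed to simply return the first candidate:

import random

def pick_position(spawn_rule, positions, empty_positions, positions_pointer):
    candidates = [x for x in positions if x in empty_positions]
    if spawn_rule == "random":
        return random.choice(candidates)
    if spawn_rule == "order":
        return candidates[positions_pointer]
    return candidates[0]  # default branch, mirroring how h.get_first is used above

positions = [(9, 5), (4, 5), (1, 1)]
print(pick_position("order", positions, set(positions), positions_pointer=1))  # (4, 5)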
@@ -118,6 +118,10 @@ class Gamestate(object):
         self._floortile_graph = None
         self.tests = StepTests(*tests)
 
+        # Pointer that defines current spawn points of agents
+        for agent in self.agents_conf:
+            self.agents_conf[agent]["pos_pointer"] = 0
+
     def reset(self):
         self.curr_step = 0
         self.curr_actions = None
@@ -3,17 +3,36 @@ from pathlib import Path
 from marl_factory_grid.algorithms.marl.a2c_dirt import A2C
 from marl_factory_grid.algorithms.utils import load_yaml_file
 
-if __name__ == '__main__':
+def dirt_quadrant_single_agent_training():
     cfg_path = Path('../marl_factory_grid/algorithms/marl/configs/dirt_quadrant_config.yaml')
+
     train_cfg = load_yaml_file(cfg_path)
     # Use environment config with fixed spawnpoints for eval
     eval_cfg = copy.deepcopy(train_cfg)
-    eval_cfg["env"]["env_name"] = "custom/dirt_quadrant"  # Options: two_rooms_one_door_modified, dirt_quadrant
+    eval_cfg["env"]["env_name"] = "custom/dirt_quadrant_eval_config"
+
     print("Training phase")
     agent = A2C(train_cfg, eval_cfg)
     agent.train_loop()
-    agent.plot_reward_development()
     print("Evaluation phase")
+    # Have consecutive episodes for eval in the single-agent case
+    train_cfg["algorithm"]["pile_all_done"] = "all"
+    # agent.load_agents(["run0", "run1"])
     agent.eval_loop(10)
+
+
+def dirt_quadrant_multi_agent_eval():
+    cfg_path = Path('../marl_factory_grid/algorithms/marl/configs/MultiAgentConfigs/dirt_quadrant_config.yaml')
+
+    train_cfg = load_yaml_file(cfg_path)
+    # Use environment config with fixed spawnpoints for eval
+    eval_cfg = copy.deepcopy(train_cfg)
+    eval_cfg["env"]["env_name"] = "custom/MultiAgentConfigs/dirt_quadrant_eval_config"
+    agent = A2C(train_cfg, eval_cfg)
+    print("Evaluation phase")
+    agent.load_agents(["run0", "run1"])
+    agent.eval_loop(10)
+
+
+if __name__ == '__main__':
+    dirt_quadrant_single_agent_training()
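The two entry points are meant to be combined: train single-agent policies first (each run writes ../study_out/runN), then evaluate them jointly. A sketch of that intended workflow; the run names are examples:

# Hypothetical driver combining the two functions above (run names are examples).
if __name__ == '__main__':
    dirt_quadrant_single_agent_training()   # first agent  -> e.g. ../study_out/run0
    dirt_quadrant_single_agent_training()   # second agent -> e.g. ../study_out/run1
    dirt_quadrant_multi_agent_eval()        # loads ["run0", "run1"] for joint evaluation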