Mirror of https://github.com/illiumst/marl-factory-grid.git (synced 2025-12-06 15:40:37 +01:00)

Commit: added changes from code submission branch and coin entity
@@ -1,7 +1,7 @@
-from .quickstart import init
+from marl_factory_grid.environment.factory import Factory
 """
-Main module of the 'marl-factory-grid'-environment.
+Main module of the 'rl-factory-grid'-environment.
 Configure the :class:.Factory with any 'conf.yaml' file.
 Examples can be found in :module:.levels .
 """
@@ -1 +0,0 @@
-from marl_factory_grid.algorithms.marl.memory import MARLActorCriticMemory

marl_factory_grid/algorithms/rl/__init__.py (new file, 1 line)
@@ -0,0 +1 @@
+from marl_factory_grid.algorithms.rl.memory import MARLActorCriticMemory

marl_factory_grid/algorithms/rl/a2c_coin.py (new file, 297 lines)
@@ -0,0 +1,297 @@
import os
import torch
from typing import Union, List
import numpy as np
from tqdm import tqdm

from marl_factory_grid.algorithms.rl.base_a2c import PolicyGradient
from marl_factory_grid.algorithms.rl.constants import Names
from marl_factory_grid.algorithms.rl.utils import transform_observations, _as_torch, is_door_close, \
    get_coin_piles_positions, update_target_pile, update_ordered_coin_piles, get_all_collected_coin_piles, \
    distribute_indices, set_agents_spawnpoints, get_ordered_coin_piles, handle_finished_episode, save_configs, \
    save_agent_models, get_all_observations, get_agents_positions
from marl_factory_grid.algorithms.utils import add_env_props
from marl_factory_grid.utils.plotting.plot_single_runs import plot_action_maps, plot_reward_development, \
    create_info_maps

nms = Names
ListOrTensor = Union[List, torch.Tensor]


class A2C:
    def __init__(self, train_cfg, eval_cfg):
        self.results_path = None
        self.agents = None
        self.act_dim = None
        self.obs_dim = None
        self.factory = add_env_props(train_cfg)
        self.eval_factory = add_env_props(eval_cfg)
        self.__training = True
        self.train_cfg = train_cfg
        self.eval_cfg = eval_cfg
        self.cfg = train_cfg
        self.n_agents = train_cfg[nms.ENV][nms.N_AGENTS]
        self.setup()
        self.reward_development = []
        self.action_probabilities = {agent_idx: [] for agent_idx in range(self.n_agents)}

    def setup(self):
        """ Initialize agents and create entry for run results according to configuration """
        self.obs_dim = 2 + 2 * len(get_coin_piles_positions(self.factory)) if self.cfg[nms.ALGORITHM][
            nms.PILE_OBSERVABILITY] == nms.ALL else 4
        self.act_dim = 4  # The 4 movement directions
        self.agents = [PolicyGradient(self.factory, agent_id=i, obs_dim=self.obs_dim, act_dim=self.act_dim) for i in
                       range(self.n_agents)]

        if self.cfg[nms.ENV][nms.SAVE_AND_LOG]:
            # Define study_out_path and check if it exists
            base_dir = os.path.dirname(os.path.abspath(__file__))  # Directory of the script
            study_out_path = os.path.join(base_dir, '../../../study_out')
            study_out_path = os.path.abspath(study_out_path)

            if not os.path.exists(study_out_path):
                raise FileNotFoundError(f"The directory {study_out_path} does not exist.")

            # Create results folder
            runs = os.listdir(study_out_path)
            run_numbers = [int(run[3:]) for run in runs if run[:3] == "run"]
            next_run_number = max(run_numbers) + 1 if run_numbers else 0
            self.results_path = os.path.join(study_out_path, f"run{next_run_number}")
            os.mkdir(self.results_path)

            # Save settings in results folder
            save_configs(self.results_path, self.cfg, self.factory.conf, self.eval_factory.conf)

    def set_cfg(self, eval=False):
        if eval:
            self.cfg = self.eval_cfg
        else:
            self.cfg = self.train_cfg

    def load_agents(self, runs_list):
        """ Initialize networks with parameters of already trained agents """
        for idx, run in enumerate(runs_list):
            run_path = f"./study_out/{run}"
            self.agents[idx].pi.load_model_parameters(f"{run_path}/PolicyNet_model_parameters.pth")
            self.agents[idx].vf.load_model_parameters(f"{run_path}/ValueNet_model_parameters.pth")

    @torch.no_grad()
    def train_loop(self):
        """ Function for training agents """
        env = self.factory
        n_steps, max_steps = [self.cfg[nms.ALGORITHM][k] for k in [nms.N_STEPS, nms.MAX_STEPS]]
        global_steps, episode = 0, 0
        indices = distribute_indices(env, self.cfg, self.n_agents)
        coin_piles_positions = get_coin_piles_positions(env)
        target_pile = [partition[0] for partition in
                       indices]  # list of pointers that point to the current target pile for each agent
        collected_coin_piles = [{pos: False for pos in coin_piles_positions} for _ in range(self.n_agents)]

        pbar = tqdm(total=max_steps)
        while global_steps < max_steps:
            _ = env.reset()
            if self.cfg[nms.ENV][nms.TRAIN_RENDER]:
                env.render()
            set_agents_spawnpoints(env, self.n_agents)
            ordered_coin_piles = get_ordered_coin_piles(env, collected_coin_piles, self.cfg, self.n_agents)
            # Reset current target pile at episode begin if all piles have to be collected in one episode
            if self.cfg[nms.ALGORITHM][nms.PILE_ALL_DONE] == nms.ALL:
                target_pile = [partition[0] for partition in indices]
                collected_coin_piles = [{pos: False for pos in coin_piles_positions} for _ in range(self.n_agents)]

            # Supply each agent with its local observation
            obs = transform_observations(env, ordered_coin_piles, target_pile, self.cfg, self.n_agents)
            done, rew_log = [False] * self.n_agents, 0

            while not all(done):
                action = self.use_door_or_move(env, obs, collected_coin_piles) \
                    if nms.DOORS in env.state.entities.keys() else self.get_actions(obs)
                _, next_obs, reward, done, info = env.step(action)
                next_obs = transform_observations(env, ordered_coin_piles, target_pile, self.cfg, self.n_agents)

                # Handle case where agent is on field with coin
                reward, done = self.handle_coin(env, collected_coin_piles, ordered_coin_piles, target_pile, indices,
                                                reward, done)

                if n_steps != 0 and (global_steps + 1) % n_steps == 0: done = True

                done = [done] * self.n_agents if isinstance(done, bool) else done
                for ag_i, agent in enumerate(self.agents):
                    if action[ag_i] in range(self.act_dim):
                        # Add agent results into respective rollout buffers
                        agent._episode[-1] = (next_obs[ag_i], action[ag_i], reward[ag_i], agent._episode[-1][-1])

                # Visualize state update
                if self.cfg[nms.ENV][nms.TRAIN_RENDER]: env.render()

                obs = next_obs

                if all(done): handle_finished_episode(obs, self.agents, self.cfg)

                global_steps += 1
                rew_log += sum(reward)

                if global_steps >= max_steps: break

            self.reward_development.append(rew_log)
            episode += 1
            pbar.update(global_steps - pbar.n)

        pbar.close()
        if self.cfg[nms.ENV][nms.SAVE_AND_LOG]:
            plot_reward_development(self.reward_development, self.results_path)
            create_info_maps(env, get_all_observations(env, self.cfg, self.n_agents),
                             get_coin_piles_positions(env), self.results_path, self.agents, self.act_dim, self)
            save_agent_models(self.results_path, self.agents)
            plot_action_maps(env, [self], self.results_path)

    @torch.inference_mode(True)
    def eval_loop(self, n_episodes):
        """ Function for performing inference """
        env = self.eval_factory
        self.set_cfg(eval=True)
        episode, results = 0, []
        coin_piles_positions = get_coin_piles_positions(env)
        indices = distribute_indices(env, self.cfg, self.n_agents)
        target_pile = [partition[0] for partition in
                       indices]  # list of pointers that point to the current target pile for each agent
        if self.cfg[nms.ALGORITHM][nms.PILE_ALL_DONE] == nms.DISTRIBUTED:
            collected_coin_piles = [{coin_piles_positions[idx]: False for idx in indices[i]} for i in
                                    range(self.n_agents)]
        else: collected_coin_piles = [{pos: False for pos in coin_piles_positions} for _ in range(self.n_agents)]

        while episode < n_episodes:
            _ = env.reset()
            set_agents_spawnpoints(env, self.n_agents)
            if self.cfg[nms.ENV][nms.EVAL_RENDER]:
                # Don't render auxiliary piles
                if self.cfg[nms.ALGORITHM][nms.AUXILIARY_PILES]:
                    auxiliary_piles = [pile for idx, pile in enumerate(env.state.entities[nms.COIN_PILES]) if
                                       idx % 2 == 0]
                    for pile in auxiliary_piles:
                        pile.set_new_amount(0)
                env.render()
                env._renderer.fps = 5  # Slow down agent movement

            # Reset current target pile at episode begin if all piles have to be collected in one episode
            if self.cfg[nms.ALGORITHM][nms.PILE_ALL_DONE] in [nms.ALL, nms.DISTRIBUTED, nms.SHARED]:
                target_pile = [partition[0] for partition in indices]
                if self.cfg[nms.ALGORITHM][nms.PILE_ALL_DONE] == nms.DISTRIBUTED:
                    collected_coin_piles = [{coin_piles_positions[idx]: False for idx in indices[i]} for i in
                                            range(self.n_agents)]
                else: collected_coin_piles = [{pos: False for pos in coin_piles_positions} for _ in range(self.n_agents)]

            ordered_coin_piles = get_ordered_coin_piles(env, collected_coin_piles, self.cfg, self.n_agents)

            # Supply each agent with its local observation
            obs = transform_observations(env, ordered_coin_piles, target_pile, self.cfg, self.n_agents)
            done, rew_log, eps_rew = [False] * self.n_agents, 0, torch.zeros(self.n_agents)

            while not all(done):
                action = self.use_door_or_move(env, obs, collected_coin_piles, det=True) \
                    if nms.DOORS in env.state.entities.keys() else self.execute_policy(obs, env,
                                                                                       collected_coin_piles)  # zero exploration
                _, next_obs, reward, done, info = env.step(action)

                # Handle case where agent is on field with coin
                reward, done = self.handle_coin(env, collected_coin_piles, ordered_coin_piles, target_pile, indices,
                                                reward, done)

                # Get transformed next_obs that might have been updated because of handle_coin
                next_obs = transform_observations(env, ordered_coin_piles, target_pile, self.cfg, self.n_agents)

                done = [done] * self.n_agents if isinstance(done, bool) else done

                if self.cfg[nms.ENV][nms.EVAL_RENDER]: env.render()

                obs = next_obs

            episode += 1

    # -------------------------------------- HELPER FUNCTIONS ------------------------------------------------- #

    def get_actions(self, observations) -> ListOrTensor:
        """ Given local observations, get actions for both agents """
        actions = [agent.step(_as_torch(observations[ag_i]).view(-1).to(torch.float32)) for ag_i, agent in
                   enumerate(self.agents)]
        return actions

    def execute_policy(self, observations, env, collected_coin_piles) -> ListOrTensor:
        """ Execute agent policies deterministically for inference """
        actions = [agent.policy(_as_torch(observations[ag_i]).view(-1).to(torch.float32)) for ag_i, agent in
                   enumerate(self.agents)]
        for agent_idx in range(self.n_agents):
            if all(collected_coin_piles[agent_idx].values()):
                actions[agent_idx] = np.array(next(
                    action_i for action_i, a in enumerate(env.state[nms.AGENT][agent_idx].actions) if
                    a.name == nms.NOOP))
        return actions

    def use_door_or_move(self, env, obs, collected_coin_piles, det=False):
        """ Function that handles automatic actions like door opening and forced Noop """
        action = []
        for agent_idx, agent in enumerate(self.agents):
            agent_obs = _as_torch(obs[agent_idx]).view(-1).to(torch.float32)
            # Use Noop operation if agent already reached its target. (Only relevant for two-rooms setting)
            if all(collected_coin_piles[agent_idx].values()):
                action.append(next(action_i for action_i, a in enumerate(env.state[nms.AGENT][agent_idx].actions) if
                                   a.name == nms.NOOP))
                if not det:
                    # Include agent experience entry manually
                    agent._episode.append((None, None, None, agent.vf(agent_obs)))
            else:
                if door := is_door_close(env, agent_idx):
                    if door.is_closed:
                        action.append(next(
                            action_i for action_i, a in enumerate(env.state[nms.AGENT][agent_idx].actions) if
                            a.name == nms.USE_DOOR))
                        # Don't include action in agent experience
                    else:
                        if det: action.append(int(agent.pi(agent_obs, det=True)[0]))
                        else: action.append(int(agent.step(agent_obs)))
                else:
                    if det: action.append(int(agent.pi(agent_obs, det=True)[0]))
                    else: action.append(int(agent.step(agent_obs)))
        return action

    def handle_coin(self, env, collected_coin_piles, ordered_coin_piles, target_pile, indices, reward, done):
        """ Check if agent moved on field with coin. If that is the case collect coin automatically """
        agents_positions = get_agents_positions(env, self.n_agents)
        coin_piles_positions = get_coin_piles_positions(env)
        if any([True for pos in agents_positions if pos in coin_piles_positions]):
            # Only simulate collecting the coin
            for idx, pos in enumerate(agents_positions):
                if pos in collected_coin_piles[idx].keys() and not collected_coin_piles[idx][pos]:

                    # If coin piles should be collected in a specific order
                    if ordered_coin_piles[idx]:
                        if pos == ordered_coin_piles[idx][target_pile[idx]]:
                            reward[idx] += 50
                            collected_coin_piles[idx][pos] = True
                            # Set pointer to next coin pile
                            update_target_pile(env, idx, target_pile, indices, self.cfg)
                            update_ordered_coin_piles(idx, collected_coin_piles, ordered_coin_piles, env,
                                                      self.cfg, self.n_agents)
                            if self.cfg[nms.ALGORITHM][nms.PILE_ALL_DONE] == nms.SINGLE:
                                done = True
                                if all(collected_coin_piles[idx].values()):
                                    # Reset collected_coin_piles indicator
                                    for pos in coin_piles_positions:
                                        collected_coin_piles[idx][pos] = False
                    else:
                        reward[idx] += 50
                        collected_coin_piles[idx][pos] = True

                    # Indicate that renderer can hide coin pile
                    coin_at_position = env.state[nms.COIN_PILES].by_pos(pos)
                    coin_at_position[0].set_new_amount(0)

        if self.cfg[nms.ALGORITHM][nms.PILE_ALL_DONE] in [nms.ALL, nms.DISTRIBUTED]:
            if all([all(collected_coin_piles[i].values()) for i in range(self.n_agents)]):
                done = True
        elif self.cfg[nms.ALGORITHM][nms.PILE_ALL_DONE] == nms.SHARED:
            # End episode if both agents together have collected all coin piles
            if all(get_all_collected_coin_piles(coin_piles_positions, collected_coin_piles, self.n_agents).values()):
                done = True

        return reward, done
marl_factory_grid/algorithms/rl/base_a2c.py (new file, 112 lines)

@@ -0,0 +1,112 @@
import numpy as np
import torch as th
import scipy as sp
from collections import deque
from torch import nn

cumulate_discount = lambda x, gamma: sp.signal.lfilter([1], [1, - gamma], x[::-1], axis=0)[::-1]


class Net(th.nn.Module):
    def __init__(self, shape, activation, lr):
        super().__init__()
        self.net = th.nn.Sequential(*[layer
                                      for io, a in zip(zip(shape[:-1], shape[1:]),
                                                       [activation] * (len(shape) - 2) + [th.nn.Identity])
                                      for layer in [th.nn.Linear(*io), a()]])
        self.optimizer = th.optim.Adam(self.net.parameters(), lr=lr)

        # Initialize weights uniformly, so that for the policy net all actions have approximately the same
        # probability in the beginning
        for module in self.modules():
            if isinstance(module, nn.Linear):
                nn.init.uniform_(module.weight, a=-0.1, b=0.1)
                if module.bias is not None:
                    nn.init.uniform_(module.bias, a=-0.1, b=0.1)

    def save_model(self, path):
        th.save(self.net, f"{path}/{self.__class__.__name__}_model.pth")

    def save_model_parameters(self, path):
        th.save(self.net.state_dict(), f"{path}/{self.__class__.__name__}_model_parameters.pth")

    def load_model_parameters(self, path):
        self.net.load_state_dict(th.load(path))
        self.net.eval()


class ValueNet(Net):
    def __init__(self, obs_dim, hidden_sizes=[64, 64], activation=th.nn.ReLU, lr=1e-3):
        super().__init__([obs_dim] + hidden_sizes + [1], activation, lr)

    def forward(self, obs): return self.net(obs)

    def loss(self, states, returns): return ((returns - self(states)) ** 2).mean()


class PolicyNet(Net):
    def __init__(self, obs_dim, act_dim, hidden_sizes=[64, 64], activation=th.nn.Tanh, lr=3e-4):
        super().__init__([obs_dim] + hidden_sizes + [act_dim], activation, lr)
        self.distribution = lambda obs: th.distributions.Categorical(logits=self.net(obs))

    def forward(self, obs, act=None, det=False):
        """Given an observation: returns the policy distribution and the probability of a given action,
        or returns a sampled action and its corresponding probability."""
        pi = self.distribution(obs)
        if act is not None: return pi, pi.log_prob(act)
        act = self.net(obs).argmax() if det else pi.sample()  # sample from the learned distribution
        return act, pi.log_prob(act)

    def loss(self, states, actions, advantages):
        _, logp = self.forward(states, actions)
        loss = -(logp * advantages).mean()
        return loss


class PolicyGradient:
    """ Autonomous agent using vanilla policy gradient. """

    def __init__(self, env, seed=42, gamma=0.99, agent_id=0, act_dim=None, obs_dim=None):
        self.env = env
        self.gamma = gamma  # Setup env and discount
        th.manual_seed(seed)
        np.random.seed(seed)  # Seed Torch, numpy and gym
        # Keep track of previous rewards and performed steps to calculate the mean return metric
        self._episode, self.ep_returns, self.num_steps = [], deque(maxlen=100), 0
        # Get observation and action shapes
        if not obs_dim:
            obs_size = env.observation_space.shape if len(env.state.entities.by_name("Agents")) == 1 \
                else env.observation_space[agent_id].shape  # Single-agent case vs. multi-agent case
            obs_dim = np.prod(obs_size)
        if not act_dim:
            act_dim = env.action_space[agent_id].n
        self.vf = ValueNet(obs_dim)  # Setup Value Network (Critic)
        self.pi = PolicyNet(obs_dim, act_dim)  # Setup Policy Network (Actor)

    def step(self, obs):
        """ Given an observation, get action and probs from policy and values from critic """
        with th.no_grad():
            (a, _), v = self.pi(obs), self.vf(obs)
            self._episode.append((None, None, None, v))
        return a.numpy()

    def policy(self, obs, det=True):
        return self.pi(obs, det=det)[0].numpy()

    def finish_episode(self):
        """ Process self._episode & reset self.env. Returns an (s, a, G, V) tuple and the new initial state. """
        s, a, r, v = (np.array(e) for e in zip(*self._episode))  # Get trajectories from rollout
        self.ep_returns.append(sum(r))
        self._episode = []  # Add episode return to buffer & reset
        return s, a, r, v  # state, action, Return, Value tensors

    def train(self, states, actions, returns, advantages):  # Update policy weights
        self.pi.optimizer.zero_grad()
        self.vf.optimizer.zero_grad()  # Reset optimizer
        states = states.flatten(1, -1)  # Reduce dimensionality to rollout_dim x input_dim
        policy_loss = self.pi.loss(states, actions, advantages)  # Calculate policy loss
        policy_loss.backward()
        self.pi.optimizer.step()  # Apply policy loss
        value_loss = self.vf.loss(states, returns)  # Calculate value loss
        value_loss.backward()
        self.vf.optimizer.step()  # Apply value loss
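For orientation, here is a minimal sketch of how a PolicyGradient agent can be driven through one rollout/update cycle (illustrative only and not part of this commit; the dimensions, dummy rewards and the omitted environment are assumptions):

# Hedged sketch: exercises step -> finish_episode -> cumulate_discount -> train
# with made-up dimensions and rewards; not repository code.
import torch as th
from marl_factory_grid.algorithms.rl.base_a2c import PolicyGradient, cumulate_discount

agent = PolicyGradient(None, obs_dim=4, act_dim=4)   # env not needed when dims are given
obs = th.zeros(4)
for _ in range(8):                                   # pretend 8-step rollout
    action = agent.step(obs)                         # samples an action, stores the value estimate
    reward = 1.0                                     # placeholder reward
    agent._episode[-1] = (obs.numpy(), action, reward, agent._episode[-1][-1])
s, a, r, v = agent.finish_episode()
G = cumulate_discount(r, agent.gamma)                # discounted returns
A = G - v.reshape(-1)                                # Advantage-AC style advantages
agent.train(*(th.tensor(x.copy()).to(th.float32) for x in (s, a, G, A)))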
@@ -2,7 +2,7 @@ import torch
 from typing import Union, List, Dict
 import numpy as np
 from torch.distributions import Categorical
-from marl_factory_grid.algorithms.marl.memory import MARLActorCriticMemory
+from marl_factory_grid.algorithms.rl.memory import MARLActorCriticMemory
 from marl_factory_grid.algorithms.utils import add_env_props, instantiate_class
 from pathlib import Path
 import pandas as pd
@@ -1,5 +1,5 @@
 agent:
-  classname: marl_factory_grid.algorithms.marl.networks.RecurrentAC
+  classname: marl_factory_grid.algorithms.rl.networks.RecurrentAC
   n_agents: 2
   obs_emb_size: 96
   action_emb_size: 16
@@ -18,7 +18,7 @@ env:
   eval_render: True
   save_and_log: True
   record: False
-method: marl_factory_grid.algorithms.marl.LoopSEAC
+method: marl_factory_grid.algorithms.rl.LoopSEAC
 algorithm:
   gamma: 0.99
   entropy_coef: 0.01

@@ -1,5 +1,5 @@
 agent:
-  classname: marl_factory_grid.algorithms.marl.networks.RecurrentAC
+  classname: marl_factory_grid.algorithms.rl.networks.RecurrentAC
   n_agents: 2
   obs_emb_size: 96
   action_emb_size: 16
@@ -18,7 +18,7 @@ env:
   eval_render: True
   save_and_log: True
   record: False
-method: marl_factory_grid.algorithms.marl.LoopSEAC
+method: marl_factory_grid.algorithms.rl.LoopSEAC
 algorithm:
   gamma: 0.99
   entropy_coef: 0.01

@@ -1,5 +1,5 @@
 agent:
-  classname: marl_factory_grid.algorithms.marl.networks.RecurrentAC
+  classname: marl_factory_grid.algorithms.rl.networks.RecurrentAC
   n_agents: 1
   obs_emb_size: 96
   action_emb_size: 16
@@ -18,7 +18,7 @@ env:
   eval_render: True
   save_and_log: True
   record: False
-method: marl_factory_grid.algorithms.marl.LoopSEAC
+method: marl_factory_grid.algorithms.rl.LoopSEAC
 algorithm:
   gamma: 0.99
   entropy_coef: 0.01

@@ -1,5 +1,5 @@
 agent:
-  classname: marl_factory_grid.algorithms.marl.networks.RecurrentAC
+  classname: marl_factory_grid.algorithms.rl.networks.RecurrentAC
   n_agents: 1
   obs_emb_size: 96
   action_emb_size: 16
@@ -18,7 +18,7 @@ env:
   eval_render: True
   save_and_log: False
   record: False
-method: marl_factory_grid.algorithms.marl.LoopSEAC
+method: marl_factory_grid.algorithms.rl.LoopSEAC
 algorithm:
   gamma: 0.99
   entropy_coef: 0.01
marl_factory_grid/algorithms/rl/constants.py (new file, 37 lines)

@@ -0,0 +1,37 @@
class Names:
    ENV = 'env'
    ENV_NAME = 'env_name'
    N_AGENTS = 'n_agents'
    ALGORITHM = 'algorithm'
    MAX_STEPS = 'max_steps'
    N_STEPS = 'n_steps'
    TRAIN_RENDER = 'train_render'
    EVAL_RENDER = 'eval_render'
    AGENT = 'Agent'
    PILE_OBSERVABILITY = 'pile-observability'
    PILE_ORDER = 'pile-order'
    ALL = 'all'
    FIXED = 'fixed'
    AGENTS = 'agents'
    DYNAMIC = 'dynamic'
    SMART = 'smart'
    DIRT_PILES = 'DirtPiles'
    COIN_PILES = 'CoinPiles'
    AUXILIARY_PILES = "auxiliary_piles"
    DOORS = 'Doors'
    DOOR = 'Door'
    GAMMA = 'gamma'
    ADVANTAGE = 'advantage'
    REINFORCE = 'reinforce'
    ADVANTAGE_AC = "Advantage-AC"
    TD_ADVANTAGE_AC = "TD-Advantage-AC"
    CHUNK_EPISODE = 'chunk-episode'
    POS_POINTER = 'pos_pointer'
    POSITIONS = 'positions'
    SAVE_AND_LOG = 'save_and_log'
    NOOP = 'Noop'
    USE_DOOR = 'use_door'
    PILE_ALL_DONE = 'pile_all_done'
    SINGLE = 'single'
    DISTRIBUTED = 'distributed'
    SHARED = 'shared'
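The keys above address a nested configuration dictionary. A hypothetical skeleton of such a train_cfg is sketched below (key names follow Names; the values are illustrative guesses, not the defaults shipped with this commit):

# Hypothetical config skeleton (values are assumptions, not repository defaults):
train_cfg = {
    'env': {'env_name': 'coin_quadrant', 'n_agents': 2, 'train_render': False,
            'eval_render': True, 'save_and_log': False},
    'algorithm': {'gamma': 0.99, 'n_steps': 0, 'max_steps': 240000,
                  'advantage': 'Advantage-AC', 'pile-observability': 'all',
                  'pile-order': 'fixed', 'pile_all_done': 'all',
                  'auxiliary_piles': False, 'chunk-episode': 20000},
}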
@@ -1,9 +1,9 @@
 import torch
-from marl_factory_grid.algorithms.marl.base_ac import BaseActorCritic, nms
+from marl_factory_grid.algorithms.rl.base_ac import BaseActorCritic, nms
 from marl_factory_grid.algorithms.utils import instantiate_class
 from pathlib import Path
 from natsort import natsorted
-from marl_factory_grid.algorithms.marl.memory import MARLActorCriticMemory
+from marl_factory_grid.algorithms.rl.memory import MARLActorCriticMemory


 class LoopIAC(BaseActorCritic):

@@ -1,6 +1,6 @@
-from marl_factory_grid.algorithms.marl.base_ac import Names as nms
-from marl_factory_grid.algorithms.marl.snac import LoopSNAC
-from marl_factory_grid.algorithms.marl.memory import MARLActorCriticMemory
+from marl_factory_grid.algorithms.rl.base_ac import Names as nms
+from marl_factory_grid.algorithms.rl.snac import LoopSNAC
+from marl_factory_grid.algorithms.rl.memory import MARLActorCriticMemory
 import torch
 from torch.distributions import Categorical
 from marl_factory_grid.algorithms.utils import instantiate_class

@@ -1,8 +1,8 @@
 import torch
 from torch.distributions import Categorical
-from marl_factory_grid.algorithms.marl.iac import LoopIAC
-from marl_factory_grid.algorithms.marl.base_ac import nms
-from marl_factory_grid.algorithms.marl.memory import MARLActorCriticMemory
+from marl_factory_grid.algorithms.rl.iac import LoopIAC
+from marl_factory_grid.algorithms.rl.base_ac import nms
+from marl_factory_grid.algorithms.rl.memory import MARLActorCriticMemory


 class LoopSEAC(LoopIAC):

@@ -1,5 +1,5 @@
-from marl_factory_grid.algorithms.marl.base_ac import BaseActorCritic
-from marl_factory_grid.algorithms.marl.base_ac import nms
+from marl_factory_grid.algorithms.rl.base_ac import BaseActorCritic
+from marl_factory_grid.algorithms.rl.base_ac import nms
 import torch
 from torch.distributions import Categorical
 from pathlib import Path
marl_factory_grid/algorithms/rl/utils.py (new file, 337 lines)

@@ -0,0 +1,337 @@
import copy
from typing import List
import numpy as np
import torch

from marl_factory_grid.algorithms.rl.constants import Names as nms

from marl_factory_grid.algorithms.rl.base_a2c import cumulate_discount


def _as_torch(x):
    """ Helper function to convert different list types to a torch tensor """
    if isinstance(x, np.ndarray):
        return torch.from_numpy(x)
    elif isinstance(x, List):
        return torch.tensor(x)
    elif isinstance(x, (int, float)):
        return torch.tensor([x])
    return x


def transform_observations(env, ordered_coins, target_coin, cfg, n_agents):
    """ Function that extracts local observations from the global state.
    Requires that agents have the observations CoinPiles and Self (cf. environment configs). """
    agents_positions = get_agents_positions(env, n_agents)
    coin_observability_is_all = cfg[nms.ALGORITHM][nms.PILE_OBSERVABILITY] == nms.ALL
    if coin_observability_is_all:
        trans_obs = [torch.zeros(2 + 2 * len(ordered_coins[0])) for _ in range(len(agents_positions))]
    else:
        # Only show current target pile
        trans_obs = [torch.zeros(4) for _ in range(len(agents_positions))]
    for i, pos in enumerate(agents_positions):
        agent_x, agent_y = pos[0], pos[1]
        trans_obs[i][0] = agent_x
        trans_obs[i][1] = agent_y
        idx = 2
        if coin_observability_is_all:
            for coin_pos in ordered_coins[i]:
                trans_obs[i][idx] = coin_pos[0]
                trans_obs[i][idx + 1] = coin_pos[1]
                idx += 2
        else:
            trans_obs[i][2] = ordered_coins[i][target_coin[i]][0]
            trans_obs[i][3] = ordered_coins[i][target_coin[i]][1]
    return trans_obs


def get_all_observations(env, cfg, n_agents):
    """ Helper function that returns all possible agent observations """
    coins_positions = [env.state.entities[nms.COIN_PILES][pile_idx].pos for pile_idx in
                       range(len(env.state.entities[nms.COIN_PILES]))]
    if cfg[nms.ALGORITHM][nms.PILE_OBSERVABILITY] == nms.ALL:
        obs = [torch.zeros(2 + 2 * len(coins_positions))]
        observations = [[]]
        # Fill in pile positions
        idx = 2
        for pile_pos in coins_positions:
            obs[0][idx] = pile_pos[0]
            obs[0][idx + 1] = pile_pos[1]
            idx += 2
    else:
        # Have multiple observation layers of the map, one for each coin pile
        obs = [torch.zeros(4) for _ in range(n_agents) for _ in coins_positions]
        observations = [[] for _ in coins_positions]
        for idx, pile_pos in enumerate(coins_positions):
            obs[idx][2] = pile_pos[0]
            obs[idx][3] = pile_pos[1]
    valid_agent_positions = env.state.entities.floorlist

    for idx, pos in enumerate(valid_agent_positions):
        for obs_layer in range(len(obs)):
            observation = copy.deepcopy(obs[obs_layer])
            observation[0] = pos[0]
            observation[1] = pos[1]
            observations[obs_layer].append(observation)

    return observations


def get_coin_piles_positions(env):
    """ Get positions of coin piles on the map """
    return [env.state.entities[nms.COIN_PILES][pile_idx].pos for pile_idx in
            range(len(env.state.entities[nms.COIN_PILES]))]


def get_agents_positions(env, n_agents):
    """ Get positions of agents on the map """
    return [env.state.moving_entites[agent_idx].pos for agent_idx in range(n_agents)]


def get_ordered_coin_piles(env, collected_coins, cfg, n_agents):
    """ This function determines in which order the agents should collect the coin piles.
    Each agent can have its individual pile order. """
    ordered_coin_piles = [[] for _ in range(n_agents)]
    coin_piles_positions = get_coin_piles_positions(env)
    agents_positions = get_agents_positions(env, n_agents)
    for agent_idx in range(n_agents):
        if cfg[nms.ALGORITHM][nms.PILE_ORDER] in [nms.FIXED, nms.AGENTS]:
            ordered_coin_piles[agent_idx] = coin_piles_positions
        elif cfg[nms.ALGORITHM][nms.PILE_ORDER] in [nms.SMART, nms.DYNAMIC]:
            # Calculate distances for remaining unvisited coin piles
            remaining_target_piles = [pos for pos, value in collected_coins[agent_idx].items() if not value]
            pile_distances = {pos: 0 for pos in remaining_target_piles}
            agent_pos = agents_positions[agent_idx]
            for pos in remaining_target_piles:
                pile_distances[pos] = np.abs(agent_pos[0] - pos[0]) + np.abs(agent_pos[1] - pos[1])

            if cfg[nms.ALGORITHM][nms.PILE_ORDER] == nms.SMART:
                # Check if there is an agent on the direct path to any of the remaining coin piles
                for pile_pos in remaining_target_piles:
                    for other_pos in agents_positions:
                        if other_pos != agent_pos:
                            if agent_pos[0] == other_pos[0] == pile_pos[0] or agent_pos[1] == other_pos[1] == pile_pos[1]:
                                # Get the line between the agent and the target
                                path = bresenham(agent_pos[0], agent_pos[1], pile_pos[0], pile_pos[1])

                                # Check if the entity lies on the path between the agent and the target
                                if other_pos in path:
                                    pile_distances[pile_pos] += np.abs(agent_pos[0] - other_pos[0]) + np.abs(
                                        agent_pos[1] - other_pos[1])

            sorted_pile_distances = dict(sorted(pile_distances.items(), key=lambda item: item[1]))
            # Insert already visited coin piles
            ordered_coin_piles[agent_idx] = [pos for pos in coin_piles_positions if pos not in remaining_target_piles]
            # Fill up with sorted positions
            for pos in sorted_pile_distances.keys():
                ordered_coin_piles[agent_idx].append(pos)

        else:
            print("Not a valid pile order option.")
            exit()

    return ordered_coin_piles


def bresenham(x0, y0, x1, y1):
    """ Bresenham's line algorithm to get the coordinates of a line between two points. """
    dx = np.abs(x1 - x0)
    dy = np.abs(y1 - y0)
    sx = 1 if x0 < x1 else -1
    sy = 1 if y0 < y1 else -1
    err = dx - dy

    coordinates = []
    while True:
        coordinates.append((x0, y0))
        if x0 == x1 and y0 == y1:
            break
        e2 = 2 * err
        if e2 > -dy:
            err -= dy
            x0 += sx
        if e2 < dx:
            err += dx
            y0 += sy
    return coordinates


def update_ordered_coin_piles(agent_idx, collected_coin_piles, ordered_coin_piles, env, cfg, n_agents):
    """ Update the order of the remaining coin piles """
    # Only update ordered_coin_piles for the agent that reached its target pile
    updated_ordered_coin_piles = get_ordered_coin_piles(env, collected_coin_piles, cfg, n_agents)
    for i in range(len(ordered_coin_piles[agent_idx])):
        ordered_coin_piles[agent_idx][i] = updated_ordered_coin_piles[agent_idx][i]


def distribute_indices(env, cfg, n_agents):
    """ Distribute coin piles evenly among the agents """
    indices = []
    n_coin_piles = len(get_coin_piles_positions(env))
    agents_positions = get_agents_positions(env, n_agents)
    if n_coin_piles == 1 or cfg[nms.ALGORITHM][nms.PILE_ORDER] in [nms.FIXED, nms.DYNAMIC, nms.SMART]:
        indices = [[0] for _ in range(n_agents)]
    else:
        base_count = n_coin_piles // n_agents
        remainder = n_coin_piles % n_agents

        start_index = 0
        for i in range(n_agents):
            # Add an extra index to the first 'remainder' objects
            end_index = start_index + base_count + (1 if i < remainder else 0)
            indices.append(list(range(start_index, end_index)))
            start_index = end_index

        # Static form: auxiliary pile, primary pile, auxiliary pile, ...
        # -> Starting with index 0, even piles are auxiliary piles, odd piles are primary piles
        if cfg[nms.ALGORITHM][nms.AUXILIARY_PILES] and nms.DOORS in env.state.entities.keys():
            door_positions = [door.pos for door in env.state.entities[nms.DOORS]]
            distances = {door_pos: [] for door_pos in door_positions}

            # Calculate distance of every agent to every door
            for door_pos in door_positions:
                for agent_pos in agents_positions:
                    distances[door_pos].append(np.abs(door_pos[0] - agent_pos[0]) + np.abs(door_pos[1] - agent_pos[1]))

            def duplicate_indices(lst, item):
                return [i for i, x in enumerate(lst) if x == item]

            # Get agent indices of agents with same distance to door
            affected_agents = {door_pos: {} for door_pos in door_positions}
            for door_pos in distances.keys():
                dist = distances[door_pos]
                dist_set = set(dist)
                for d in dist_set:
                    affected_agents[door_pos][str(d)] = duplicate_indices(dist, d)

            updated_indices = []

            for door_pos, agent_distances in affected_agents.items():
                if len(agent_distances) == 0:
                    # Remove auxiliary piles for all agents
                    # (In config, we defined every pile with an even numbered index to be an auxiliary pile)
                    updated_indices = [[ele for ele in lst if ele % 2 != 0] for lst in indices]
                else:
                    for distance, agent_indices in agent_distances.items():
                        # For each distance group, pick one random agent to keep the auxiliary pile
                        # selected_agent = np.random.choice(agent_indices)
                        selected_agent = 0
                        for agent_idx in agent_indices:
                            if agent_idx == selected_agent:
                                updated_indices.append(indices[agent_idx])
                            else:
                                updated_indices.append([ele for ele in indices[agent_idx] if ele % 2 != 0])

            indices = updated_indices

    return indices


def update_target_pile(env, agent_idx, target_pile, indices, cfg):
    """ Get the next target pile for a given agent """
    if cfg[nms.ALGORITHM][nms.PILE_ORDER] in [nms.FIXED, nms.DYNAMIC, nms.SMART]:
        if target_pile[agent_idx] + 1 < len(get_coin_piles_positions(env)):
            target_pile[agent_idx] += 1
        else:
            target_pile[agent_idx] = 0
    else:
        if target_pile[agent_idx] + 1 in indices[agent_idx]:
            target_pile[agent_idx] += 1


def is_door_close(env, agent_idx):
    """ Checks whether the agent is close to a door """
    neighbourhood = [y for x in env.state.entities.neighboring_positions(env.state[nms.AGENT][agent_idx].pos)
                     for y in env.state.entities.pos_dict[x] if nms.DOOR in y.name]
    if neighbourhood:
        return neighbourhood[0]


def get_all_collected_coin_piles(coin_piles_positions, collected_coin_piles, n_agents):
    """ Returns all coin piles collected by any agent """
    meta_collected_coin_piles = {pos: False for pos in coin_piles_positions}
    for agent_idx in range(n_agents):
        for (pos, collected) in collected_coin_piles[agent_idx].items():
            if collected:
                meta_collected_coin_piles[pos] = True
    return meta_collected_coin_piles


def handle_finished_episode(obs, agents, cfg):
    """ Finish up episode, calculate advantages and perform policy net and value net updates """
    with torch.inference_mode(False):
        for ag_i, agent in enumerate(agents):
            # Get states, actions, rewards and values from rollout buffer
            data = agent.finish_episode()
            # Chunk episode data, such that there will be no memory failure for very long episodes
            chunks = split_into_chunks(data, cfg)
            for (s, a, R, V) in chunks:
                # Calculate discounted return and advantage
                G = cumulate_discount(R, cfg[nms.ALGORITHM][nms.GAMMA])
                if cfg[nms.ALGORITHM][nms.ADVANTAGE] == nms.REINFORCE:
                    A = G
                elif cfg[nms.ALGORITHM][nms.ADVANTAGE] == nms.ADVANTAGE_AC:
                    A = G - V  # Actor-Critic advantages
                elif cfg[nms.ALGORITHM][nms.ADVANTAGE] == nms.TD_ADVANTAGE_AC:
                    with torch.no_grad():
                        A = R + cfg[nms.ALGORITHM][nms.GAMMA] * np.append(V[1:], agent.vf(
                            _as_torch(obs[ag_i]).view(-1).to(
                                torch.float32)).numpy()) - V  # TD Actor-Critic advantages
                else:
                    print("Not a valid advantage option.")
                    exit()

                rollout = (torch.tensor(x.copy()).to(torch.float32) for x in (s, a, G, A))
                # Update policy and value net of agent with experience from rollout buffer
                agent.train(*rollout)


def split_into_chunks(data_tuple, cfg):
    """ Chunks episode data into approximately equal sized chunks to prevent system memory failure from overload """
    result = [data_tuple]
    chunk_size = cfg[nms.ALGORITHM][nms.CHUNK_EPISODE]
    if chunk_size > 0:
        # Get the maximum length of the lists in the tuple to handle different lengths
        max_length = max(len(lst) for lst in data_tuple)

        # Prepare a list to store the result
        result = []

        # Split each list into chunks and add them to the result
        for i in range(0, max_length, chunk_size):
            # Create a sublist containing the ith chunk from each list
            sublist = [lst[i:i + chunk_size] for lst in data_tuple if i < len(lst)]
            result.append(sublist)

    return result


def set_agents_spawnpoints(env, n_agents):
    """ Tell environment where the agents should spawn in the next episode """
    for agent_idx in range(n_agents):
        agent_name = list(env.state.agents_conf.keys())[agent_idx]
        current_pos_pointer = env.state.agents_conf[agent_name][nms.POS_POINTER]
        # Making the reset dependent on the number of spawnpoints and not the number of coin piles allows
        # for having multiple subsequent spawnpoints with the same target pile
        if current_pos_pointer == len(env.state.agents_conf[agent_name][nms.POSITIONS]) - 1:
            env.state.agents_conf[agent_name][nms.POS_POINTER] = 0
        else:
            env.state.agents_conf[agent_name][nms.POS_POINTER] += 1


def save_configs(results_path, cfg, factory_conf, eval_factory_conf):
    """ Save configurations for logging purposes """
    with open(f"{results_path}/MARL_config.txt", "w") as txt_file:
        txt_file.write(str(cfg))
    with open(f"{results_path}/train_env_config.txt", "w") as txt_file:
        txt_file.write(str(factory_conf))
    with open(f"{results_path}/eval_env_config.txt", "w") as txt_file:
        txt_file.write(str(eval_factory_conf))


def save_agent_models(results_path, agents):
    """ Save model parameters after training """
    for idx, agent in enumerate(agents):
        agent.pi.save_model_parameters(results_path)
        agent.vf.save_model_parameters(results_path)
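As a quick numeric check of the discounted-return helper used in handle_finished_episode (illustrative, not repository code): with gamma = 0.5 and rewards [1, 0, 2], the recursion G_t = r_t + gamma * G_{t+1} gives [1.5, 1.0, 2.0].

import numpy as np
from marl_factory_grid.algorithms.rl.base_a2c import cumulate_discount

print(cumulate_discount(np.array([1., 0., 2.]), 0.5))   # -> [1.5 1.  2. ]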
marl_factory_grid/algorithms/static/TSP_coin_agent.py (new file, 40 lines)

@@ -0,0 +1,40 @@
from marl_factory_grid.algorithms.static.TSP_base_agent import TSPBaseAgent

from marl_factory_grid.modules.coins import constants as c
from marl_factory_grid.environment import constants as e

future_planning = 7


class TSPCoinAgent(TSPBaseAgent):

    def __init__(self, *args, **kwargs):
        """
        Initializes a TSPCoinAgent that aims to collect coins in the environment.
        """
        super(TSPCoinAgent, self).__init__(*args, **kwargs)
        self.fallback_action = e.NOOP

    def predict(self, *_, **__):
        """
        Predicts the next action based on the presence of coins in the environment.

        :return: Predicted action.
        :rtype: int
        """
        coin_at_position = self._env.state[c.COIN].by_pos(self.state.pos)
        if coin_at_position:
            action = c.COLLECT
        elif door := self._door_is_close(self._env.state):
            action = self._use_door_or_move(door, c.COIN)
        else:
            action = self._predict_move(c.COIN)
        self.action_list.append(action)
        # Translate the action_object to an integer to have the same output as any other model
        try:
            action_obj = next(action_i for action_i, a in enumerate(self.state.actions) if a.name == action)
        except (StopIteration, UnboundLocalError):
            print('Will not happen')
            raise EnvironmentError
        return action_obj
@@ -40,10 +40,27 @@ Agents:
  #     - DropOffLocations
  #     - Maintainers
  #   Clones: 0
  Target test agent:
  # Target test agent:
  #   Actions:
  #     - Noop
  #     - Charge
  #     - DoorUse
  #     - Move8
  #   Observations:
  #     - Combined:
  #         - Other
  #         - Walls
  #     - GlobalPosition
  #     - Battery
  #     - Destinations
  #     - Doors
  #     - Maintainers
  #   Clones: 1
  Coin test agent:
    Actions:
      - Noop
      - Charge
      - Collect
      - DoorUse
      - Move8
    Observations:
@@ -52,6 +69,8 @@ Agents:
      - Walls
      - GlobalPosition
      - Battery
      - ChargePods
      - CoinPiles
      - Destinations
      - Doors
      - Maintainers
@@ -67,11 +86,18 @@ Entities:
  Destinations:
    coords_or_quantity: 1
    spawn_mode: GROUPED
  DirtPiles:
  # DirtPiles:
  #   coords_or_quantity: 10
  #   initial_amount: 2
  #   clean_amount: 1
  #   dirt_spawn_r_var: 0.1
  #   max_global_amount: 20
  #   max_local_amount: 5
  CoinPiles:
    coords_or_quantity: 10
    initial_amount: 2
    clean_amount: 1
    dirt_spawn_r_var: 0.1
    collect_amount: 1
    coin_spawn_r_var: 0.1
    max_global_amount: 20
    max_local_amount: 5
  Doors:
@@ -90,24 +116,26 @@ Entities:
General:
  env_seed: 69
  individual_rewards: true
  level_name: quadrant
  level_name: two_rooms
  pomdp_r: 3
  verbose: false
  tests: false

Rules:
  # Environment Dynamics
  EntitiesSmearDirtOnMove:
    smear_ratio: 0.2
  # EntitiesSmearDirtOnMove:
  #   smear_ratio: 0.2
  DoorAutoClose:
    close_frequency: 10
  MoveMaintainers:

  # Respawn Stuff
  RespawnDirt:
    respawn_freq: 15
  # RespawnDirt:
  #   respawn_freq: 15
  RespawnItems:
    respawn_freq: 15
  RespawnCoins:
    respawn_freq: 15

  # Utilities
  WatchCollisions:
@@ -81,7 +81,7 @@ class Factory(gym.Env):
     def __init__(self, config_file: Union[str, PathLike], custom_modules_path: Union[None, PathLike] = None,
                  custom_level_path: Union[None, PathLike] = None):
         """
-        Initializes the marl-factory-grid as Gym environment.
+        Initializes the rl-factory-grid as Gym environment.

         :param config_file: Path to the configuration file.
         :type config_file: Union[str, PathLike]
@@ -271,15 +271,37 @@ class Factory(gym.Env):
         if not self._renderer:  # lazy init
             from marl_factory_grid.utils.renderer import Renderer
             global Renderer
-            self._renderer = Renderer(self.map.level_shape, view_radius=self.conf.pomdp_r, fps=10)
+            self._renderer = Renderer(self.map.level_shape, view_radius=self.conf.pomdp_r, fps=10)

         render_entities = self.state.entities.render()

+        # Hide entities where certain conditions are met (e.g., amount <= 0 for DirtPiles)
+        render_entities = self.filter_entities(render_entities)
+
+        # Mask entities based on dynamic conditions instead of hardcoding level-specific logic
+        if self.conf['General']['level_name'] == 'two_rooms':
+            render_entities = self.mask_entities(render_entities)
+
         if self.conf.pomdp_r:
             for render_entity in render_entities:
                 if render_entity.name == c.AGENT:
                     render_entity.aux = self.obs_builder.curr_lightmaps[render_entity.real_name]
         return self._renderer.render(render_entities, self._recorder)

+    def filter_entities(self, entities):
+        """ Generalized method to filter out entities that shouldn't be rendered. """
+        if 'DirtPiles' in self.state.entities.keys():
+            entities = [entity for entity in entities if not (entity.name == 'DirtPiles' and entity.amount <= 0)]
+        return entities
+
+    def mask_entities(self, entities):
+        """ Generalized method to mask entities based on dynamic conditions. """
+        for entity in entities:
+            if entity.name == 'CoinPiles':
+                entity.mask = 'Destinations'
+                entity.mask_value = 1
+        return entities
+
     def set_recorder(self, recorder):
         self._recorder = recorder

@@ -298,7 +320,7 @@ class Factory(gym.Env):
             summary.update({entity_group.name.lower(): entity_group.summarize_states()})
         # TODO Section End ########
         for key in list(summary.keys()):
-            if key not in ['step', 'walls', 'doors', 'agents', 'items', 'dirtPiles', 'batteries']:
+            if key not in ['step', 'walls', 'doors', 'agents', 'items', 'dirtPiles', 'batteries', 'coinPiles']:
                 del summary[key]
         return summary
@@ -168,14 +168,25 @@ class SpawnEntity(Rule):
         return results


+def _get_position(spawn_rule, positions, empty_positions, positions_pointer):
+    """
+    Internal usage, selects positions based on rule.
+    """
+    if spawn_rule and spawn_rule == "random":
+        position = random.choice(([x for x in positions if x in empty_positions]))
+    elif spawn_rule and spawn_rule == "order":
+        position = ([x for x in positions if x in empty_positions])[positions_pointer]
+    else:
+        position = h.get_first([x for x in positions if x in empty_positions])
+    return position
+
+
 class SpawnAgents(Rule):

     def __init__(self):
         """
-        TODO
-
-        :return:
+        Finds suitable spawn positions according to the given spawn rule, creates agents with these positions and adds
+        them to state.agents.
         """
         super().__init__()
         pass
@@ -183,8 +194,9 @@ class SpawnAgents(Rule):
     def on_reset(self, state):
         spawn_rule = None
         for rule in state.rules.rules:
-            if isinstance(rule, marl_factory_grid.environment.rules.AgentSpawnRule):
+            if isinstance(rule, AgentSpawnRule):
                 spawn_rule = rule.spawn_rule
                 break

         if not hasattr(state, 'agent_spawn_positions'):
             state.agent_spawn_positions = []
@@ -200,7 +212,7 @@ class SpawnAgents(Rule):
             other = agent_conf['other'].copy()
             positions_pointer = agent_conf['pos_pointer']

-            if position := self._get_position(spawn_rule, positions, empty_positions, positions_pointer):
+            if position := _get_position(spawn_rule, positions, empty_positions, positions_pointer):
                 assert state.check_pos_validity(position), 'smth went wrong....'
                 agents.add_item(Agent(actions, observations, position, str_ident=agent_name, **other))
                 state.agent_spawn_positions.append(position)
@@ -213,21 +225,13 @@ class SpawnAgents(Rule):
                 state.agent_spawn_positions.append(chosen_position)
         return []

-    def _get_position(self, spawn_rule, positions, empty_positions, positions_pointer):
-        if spawn_rule and spawn_rule == "random":
-            position = random.choice(([x for x in positions if x in empty_positions]))
-        elif spawn_rule and spawn_rule == "order":
-            position = ([x for x in positions if x in empty_positions])[positions_pointer]
-        else:
-            position = h.get_first([x for x in positions if x in empty_positions])
-
-        return position
-
 class AgentSpawnRule(Rule):
     def __init__(self, spawn_rule):
         self.spawn_rule = spawn_rule
         super().__init__()


 class DoneAtMaxStepsReached(Rule):

     def __init__(self, max_steps: int = 500):
@@ -1,4 +1,5 @@
 import ast
+import random
 from marl_factory_grid.environment import constants as c
 from marl_factory_grid.environment.groups.collection import Collection
 from marl_factory_grid.modules.clean_up.entitites import DirtPile
@@ -33,7 +34,7 @@ class DirtPiles(Collection):
         return sum([dirt.amount for dirt in self])

     def __init__(self, *args, max_local_amount=5, clean_amount=1, max_global_amount: int = 20, coords_or_quantity=10,
-                 initial_amount=2, amount_var=0.2, n_var=0.2, **kwargs):
+                 initial_amount=2, amount_var=0.2, n_var=0.2, randomize=False, randomization_seed=0, **kwargs):
         """
         A Collection of dirt piles that triggers their spawn.

@@ -67,6 +68,8 @@ class DirtPiles(Collection):
         self.max_local_amount = max_local_amount
         self.coords_or_quantity = coords_or_quantity
         self.initial_amount = initial_amount
+        self.randomize = randomize
+        self.randomized_selection = None

     def trigger_spawn(self, state, coords_or_quantity=0, amount=0, ignore_blocking=False) -> [Result]:
         if ignore_blocking:
@@ -85,7 +88,17 @@ class DirtPiles(Collection):
         else:
             n_new = [pos for pos in coords_or_quantity]

-        amounts = [amount if amount else (self.initial_amount)  # removed rng amount
+        if self.randomize:
+            if not self.randomized_selection:
+                n_new_prime = []
+                for n in n_new:
+                    if random.random() < 0.5:
+                        n_new_prime.append(n)
+                n_new = n_new_prime
+                self.randomized_selection = n_new
+            else:
+                n_new = self.randomized_selection
+        amounts = [amount if amount else self.initial_amount  # removed rng amount
                    for _ in range(len(n_new))]

         spawn_counter = 0
marl_factory_grid/modules/coins/__init__.py (new file, 4 lines)

@@ -0,0 +1,4 @@
from .actions import Collect
from .entitites import CoinPile
from .groups import CoinPiles
from .rules import DoneOnAllCoinsCollected

marl_factory_grid/modules/coins/actions.py (new file, 36 lines)

@@ -0,0 +1,36 @@
from typing import Union

from marl_factory_grid.environment.actions import Action
from marl_factory_grid.utils.results import ActionResult

from marl_factory_grid.modules.coins import constants as d

from marl_factory_grid.environment import constants as c


class Collect(Action):

    def __init__(self):
        """
        Attempts to reduce the coin amount at the entity's position. Fails if no coin is found at the agent's position.
        """
        super().__init__(d.COLLECT, d.REWARD_COLLECT_VALID, d.REWARD_COLLECT_FAIL)

    def do(self, entity, state) -> Union[None, ActionResult]:
        if coin_pile := next((x for x in state.entities.pos_dict[entity.pos] if "coin" in x.name.lower()), None):
            new_coin_pile_amount = coin_pile.amount - state[d.COIN].collect_amount

            if new_coin_pile_amount <= 0:
                state[d.COIN].delete_env_object(coin_pile)
            else:
                coin_pile.set_new_amount(max(new_coin_pile_amount, c.VALUE_FREE_CELL))
            valid = c.VALID
            print_str = f'{entity.name} did just collect some coins at {entity.pos}.'
            state.print(print_str)

        else:
            valid = c.NOT_VALID
            print_str = f'{entity.name} just tried to collect some coins at {entity.pos}, but failed.'
            state.print(print_str)

        return self.get_result(valid, entity)
marl_factory_grid/modules/coins/coinpiles.png (new binary file, 101 KiB; image not shown)
marl_factory_grid/modules/coins/constants.py (new file, 11 lines)

@@ -0,0 +1,11 @@
COIN = 'CoinPiles'

COLLECT = 'do_collect_action'

COLLECT_VALID = 'collect_valid'
COLLECT_FAIL = 'collect_fail'
COLLECT_ALL = 'all_collected'

REWARD_COLLECT_VALID: float = 0.5
REWARD_COLLECT_FAIL: float = -0.1
REWARD_COLLECT_ALL: float = 4.5
46  marl_factory_grid/modules/coins/entitites.py  Normal file
@@ -0,0 +1,46 @@
from marl_factory_grid.environment.entity.entity import Entity
from marl_factory_grid.utils.utility_classes import RenderEntity
from marl_factory_grid.modules.coins import constants as d


class CoinPile(Entity):

    @property
    def amount(self):
        """
        Internal Usage
        """
        return self._amount

    @property
    def encoding(self):
        return self._amount

    def __init__(self, *args, amount=2, max_local_amount=5, **kwargs):
        """
        Represents a pile of coins at a specific position in the environment that agents can interact with. Agents can
        collect coins from the pile or, depending on activated rules, interact with it in different ways.

        :param amount: The amount of coins in the pile.
        :type amount: float

        :param max_local_amount: The maximum amount of coins allowed in a single pile at one position.
        :type max_local_amount: float
        """
        super(CoinPile, self).__init__(*args, **kwargs)
        self._amount = amount
        self.max_local_amount = max_local_amount

    def set_new_amount(self, amount):
        """
        Internal Usage
        """
        self._amount = min(amount, self.max_local_amount)

    def summarize_state(self):
        state_dict = super().summarize_state()
        state_dict.update(amount=float(self.amount))
        return state_dict

    def render(self):
        return RenderEntity(d.COIN, self.pos, min(0.15 + self.amount, 1.5), 'scale')
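Two small invariants of the entity above, restated standalone for illustration: top-ups via set_new_amount never exceed max_local_amount, and the render scale grows with the amount up to a fixed cap.

    def clamped_amount(requested: float, max_local_amount: float = 5) -> float:
        # Mirrors CoinPile.set_new_amount
        return min(requested, max_local_amount)

    def render_scale(amount: float) -> float:
        # Mirrors CoinPile.render
        return min(0.15 + amount, 1.5)

    assert clamped_amount(12) == 5   # capped at the local maximum
    assert render_scale(2) == 1.5    # large piles share the same maximum sprite scale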
108  marl_factory_grid/modules/coins/groups.py  Normal file
@@ -0,0 +1,108 @@
import ast

from marl_factory_grid.environment import constants as c
from marl_factory_grid.environment.groups.collection import Collection
from marl_factory_grid.modules.coins.entitites import CoinPile
from marl_factory_grid.utils.results import Result
from marl_factory_grid.utils import helpers as h


class CoinPiles(Collection):
    _entity = CoinPile

    @property
    def var_is_blocking_light(self):
        return False

    @property
    def var_can_collide(self):
        return False

    @property
    def var_can_move(self):
        return False

    @property
    def var_has_position(self):
        return True

    @property
    def global_amount(self) -> float:
        """
        Internal Usage
        """
        return sum(coin.amount for coin in self)

    def __init__(self, *args, max_local_amount=5, collect_amount=1, max_global_amount: int = 20, coords_or_quantity=10,
                 initial_amount=2, amount_var=0.2, n_var=0.2, **kwargs):
        """
        A Collection of coin piles that triggers their spawn.

        :param max_local_amount: The maximum amount of coins allowed in a single pile at one position.
        :type max_local_amount: int

        :param collect_amount: The amount of coins removed by a single collecting action.
        :type collect_amount: int

        :param max_global_amount: The maximum total amount of coins allowed in the environment.
        :type max_global_amount: int

        :param coords_or_quantity: Determines whether to use coordinates or quantity when triggering coin pile spawn.
        :type coords_or_quantity: Union[Tuple[int, int], int]

        :param initial_amount: The initial amount of coins in each newly spawned pile.
        :type initial_amount: int

        :param amount_var: The variability in the initial amount of coins in each pile.
        :type amount_var: float

        :param n_var: The variability in the number of new coin piles spawned.
        :type n_var: float
        """
        super(CoinPiles, self).__init__(*args, **kwargs)
        self.amount_var = amount_var
        self.n_var = n_var
        self.collect_amount = collect_amount
        self.max_global_amount = max_global_amount
        self.max_local_amount = max_local_amount
        self.coords_or_quantity = coords_or_quantity
        self.initial_amount = initial_amount

    def trigger_spawn(self, state, coords_or_quantity=0, amount=0, ignore_blocking=False) -> [Result]:
        if ignore_blocking:
            # Blocking cannot be ignored for this entity type.
            raise ValueError(f'Blocking must not be ignored for {self.__class__.__name__} entities.')
        coords_or_quantity = coords_or_quantity if coords_or_quantity else self.coords_or_quantity
        if isinstance(coords_or_quantity, int):
            n_new = int(abs(coords_or_quantity + state.rng.uniform(-self.n_var, self.n_var)))
            n_new = state.get_n_random_free_positions(n_new)
        else:
            coords_or_quantity = ast.literal_eval(coords_or_quantity)
            if isinstance(coords_or_quantity[0], int):
                n_new = [coords_or_quantity]
            else:
                n_new = [pos for pos in coords_or_quantity]

        amounts = [amount if amount else self.initial_amount  # rng-based amount variation removed
                   for _ in range(len(n_new))]

        spawn_counter = 0
        for idx, (pos, a) in enumerate(zip(n_new, amounts)):
            if not self.global_amount > self.max_global_amount:
                if coin := self.by_pos(pos):
                    coin = h.get_first(coin)
                    new_value = coin.amount + a
                    coin.set_new_amount(new_value)
                else:
                    super().spawn([pos], amount=a)
                    spawn_counter += 1
            else:
                return Result(identifier=f'{self.name}_spawn', validity=c.NOT_VALID, value=spawn_counter)

        return Result(identifier=f'{self.name}_spawn', validity=c.VALID, value=spawn_counter)

    def __repr__(self):
        s = super(CoinPiles, self).__repr__()
        return f'{s[:-1]}, {self.global_amount}]'
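The coords_or_quantity branch in trigger_spawn accepts either an integer (spawn at that many random free positions, jittered by n_var) or a string of fixed coordinates parsed with ast.literal_eval. A standalone restatement of the string parsing, for illustration only:

    import ast

    def parse_spawn_positions(coords_or_quantity: str):
        # String form: either a single "(x, y)" tuple or a tuple of tuples "((x1, y1), (x2, y2), ...)".
        parsed = ast.literal_eval(coords_or_quantity)
        return [parsed] if isinstance(parsed[0], int) else list(parsed)

    assert parse_spawn_positions("(2, 3)") == [(2, 3)]
    assert parse_spawn_positions("((1, 1), (4, 2))") == [(1, 1), (4, 2)]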
59  marl_factory_grid/modules/coins/rules.py  Normal file
@@ -0,0 +1,59 @@
from marl_factory_grid.modules.coins import constants as d
from marl_factory_grid.environment import constants as c

from marl_factory_grid.environment.rules import Rule
from marl_factory_grid.utils.helpers import is_move
from marl_factory_grid.utils.results import TickResult
from marl_factory_grid.utils.results import DoneResult


class DoneOnAllCoinsCollected(Rule):

    def __init__(self, reward: float = d.REWARD_COLLECT_ALL):
        """
        Defines a 'Done'-condition which triggers when there are no more coins in the environment.

        :type reward: float
        :parameter reward: Given reward when condition triggers.
        """
        super().__init__()
        self.reward = reward

    def on_check_done(self, state) -> [DoneResult]:
        if len(state[d.COIN]) == 0 and state.curr_step:
            return [DoneResult(validity=c.VALID, identifier=self.name, reward=self.reward)]
        return []


class RespawnCoins(Rule):

    def __init__(self, respawn_freq: int = 15, respawn_n: int = 5, respawn_amount: float = 1.0):
        """
        Defines the spawn pattern of initial and additional 'CoinPile'-entities.
        First chooses positions, then tries to spawn coins until 'respawn_n' or the maximal global amount is reached.
        If a pile already exists at a position, it is topped up to min(max_local_amount, amount).

        :type respawn_freq: int
        :parameter respawn_freq: How frequently should this Rule try to spawn new coins?
        :type respawn_n: int
        :parameter respawn_n: How many respawn positions are considered.
        :type respawn_amount: float
        :parameter respawn_amount: Defines how much coin 'amount' is placed every 'respawn_freq' ticks.
        """
        super().__init__()
        self.respawn_n = respawn_n
        self.respawn_amount = respawn_amount
        self.respawn_freq = respawn_freq
        self._next_coin_spawn = respawn_freq

    def tick_step(self, state):
        collection = state[d.COIN]
        if self._next_coin_spawn < 0:
            result = []  # No CoinPile spawn
        elif not self._next_coin_spawn:
            result = [collection.trigger_spawn(state, coords_or_quantity=self.respawn_n, amount=self.respawn_amount)]
            self._next_coin_spawn = self.respawn_freq
        else:
            self._next_coin_spawn -= 1
            result = []
        return result
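As written, the countdown in RespawnCoins.tick_step triggers a spawn every respawn_freq + 1 ticks, because the counter is reset to respawn_freq and then counted down through zero; a negative counter (e.g. a negative respawn_freq) disables respawning entirely. A standalone restatement of that schedule, for illustration only:

    def spawn_ticks(respawn_freq: int, n_steps: int) -> list:
        # Returns the environment steps at which trigger_spawn would be called.
        next_spawn, ticks = respawn_freq, []
        for t in range(n_steps):
            if next_spawn < 0:
                continue                    # respawning disabled
            elif next_spawn == 0:
                ticks.append(t)             # spawn happens here
                next_spawn = respawn_freq
            else:
                next_spawn -= 1
        return ticks

    assert spawn_ticks(3, 12) == [3, 7, 11]   # effective period of respawn_freq + 1 = 4 ticks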
@@ -7,7 +7,10 @@ from typing import Union

import numpy as np
import pandas as pd
import torch
from matplotlib import pyplot as plt

from marl_factory_grid.algorithms.rl.utils import _as_torch
from marl_factory_grid.utils.helpers import IGNORED_DF_COLUMNS
from marl_factory_grid.utils.plotting.plotting_utils import prepare_plot

@@ -253,3 +256,125 @@ direction_mapping = {
    'south_east': (1, 1),
    'south_west': (-1, 1)
}

def plot_reward_development(reward_development, results_path):
    smoothed_data = np.convolve(reward_development, np.ones(10) / 10, mode='valid')
    plt.plot(smoothed_data)
    plt.ylim([-10, max(smoothed_data) + 20])
    plt.title('Smoothed Reward Development')
    plt.xlabel('Episode')
    plt.ylabel('Reward')
    plt.savefig(f"{results_path}/smoothed_reward_development.png")
    plt.show()

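The np.convolve call above computes a 10-episode moving average; in 'valid' mode the smoothed series is 9 entries shorter than the raw reward series. A quick, self-contained check of that behaviour:

    import numpy as np

    rewards = np.arange(20, dtype=float)
    smoothed = np.convolve(rewards, np.ones(10) / 10, mode='valid')
    assert len(smoothed) == len(rewards) - 9                   # 'valid' mode drops the incomplete windows
    assert abs(smoothed[0] - rewards[:10].mean()) < 1e-9       # each point is the mean of a 10-episode window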
def plot_collected_coins_per_step():
    # Observed behaviour for a multi-agent setting consisting of run0 and run0
    collected_coins_per_step_emergent = [0, 0, 0, 1, 1, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 4, 4, 4, 4, 5]
    collected_coins_per_step = [0, 0, 0, 1, 1, 2, 2, 3, 3, 3, 4, 5]  # RL and TSP

    plt.step(range(1, len(collected_coins_per_step) + 1), collected_coins_per_step, color='green', linewidth=3, label='Prevented (RL)')
    plt.step(range(1, len(collected_coins_per_step_emergent) + 1), collected_coins_per_step_emergent, linestyle='--', color='darkred', linewidth=3, label='Emergent')
    plt.step(range(1, len(collected_coins_per_step) + 1), collected_coins_per_step, linestyle='dotted', color='darkorange', linewidth=3, label='Prevented (TSP)')
    plt.xlabel("Environment step", fontsize=20)
    plt.ylabel("Collected Coins", fontsize=20)
    yint = range(min(collected_coins_per_step), max(collected_coins_per_step) + 1)
    plt.yticks(yint, fontsize=17)
    plt.xticks(range(1, len(collected_coins_per_step_emergent) + 1), fontsize=17)
    frame1 = plt.gca()
    # Only display every 5th tick label
    for idx, xlabel_i in enumerate(frame1.axes.get_xticklabels()):
        if (idx + 1) % 5 != 0:
            xlabel_i.set_visible(False)
            xlabel_i.set_fontsize(0.0)
    # Change order of labels in legend
    handles, labels = frame1.get_legend_handles_labels()
    order = [0, 2, 1]
    plt.legend([handles[idx] for idx in order], [labels[idx] for idx in order], prop={'size': 20})
    fig = plt.gcf()
    fig.set_size_inches(8, 7)
    plt.savefig("../study_out/number_of_collected_coins.pdf")
    plt.show()

def plot_reached_flags_per_step():
    # Observed behaviour for a multi-agent setting consisting of runs 1 + 2
    reached_flags_per_step_emergent = [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
    reached_flags_per_step_RL = [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2]
    reached_flags_per_step_TSP = [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2]

    plt.step(range(1, len(reached_flags_per_step_RL) + 1), reached_flags_per_step_RL, color='green', linewidth=3, label='Prevented (RL)')
    plt.step(range(1, len(reached_flags_per_step_emergent) + 1), reached_flags_per_step_emergent, linestyle='--', color='darkred', linewidth=3, label='Emergent')
    plt.step(range(1, len(reached_flags_per_step_TSP) + 1), reached_flags_per_step_TSP, linestyle='dotted', color='darkorange', linewidth=3, label='Prevented (TSP)')
    plt.xlabel("Environment step", fontsize=20)
    plt.ylabel("Reached Flags", fontsize=20)
    yint = range(min(reached_flags_per_step_RL), max(reached_flags_per_step_RL) + 1)
    plt.yticks(yint, fontsize=17)
    plt.xticks(range(1, len(reached_flags_per_step_emergent) + 1), fontsize=17)
    frame1 = plt.gca()
    # Only display every 5th tick label
    for idx, xlabel_i in enumerate(frame1.axes.get_xticklabels()):
        if (idx + 1) % 5 != 0:
            xlabel_i.set_visible(False)
            xlabel_i.set_fontsize(0.0)
    # Change order of labels in legend
    handles, labels = frame1.get_legend_handles_labels()
    order = [0, 2, 1]
    plt.legend([handles[idx] for idx in order], [labels[idx] for idx in order], prop={'size': 20})
    fig = plt.gcf()
    fig.set_size_inches(8, 7)
    plt.savefig("../study_out/number_of_reached_flags.pdf")
    plt.show()

def create_info_maps(env, all_valid_observations, dirt_piles_positions, results_path, agents, act_dim,
                     a2c_instance):
    # Create value map
    with open(f"{results_path}/info_maps.txt", "w") as txt_file:
        for obs_layer, pos in enumerate(dirt_piles_positions):
            observations_shape = (
                max(t[0] for t in env.state.entities.floorlist) + 2,
                max(t[1] for t in env.state.entities.floorlist) + 2)
            value_maps = [np.zeros(observations_shape) for _ in agents]
            likeliest_action = [np.full(observations_shape, np.nan) for _ in agents]
            action_probabilities = [np.zeros((observations_shape[0], observations_shape[1], act_dim)) for
                                    _ in agents]
            for obs in all_valid_observations[obs_layer]:
                for idx, agent in enumerate(agents):
                    x, y = int(obs[0]), int(obs[1])
                    try:
                        value_maps[idx][x][y] = agent.vf(obs)
                        probs = agent.pi.distribution(obs).probs
                        # Get the likeliest action at the current agent position
                        likeliest_action[idx][x][y] = torch.argmax(probs)
                        action_probabilities[idx][x][y] = probs
                    except Exception:
                        # Skip positions for which the policy or value network cannot be evaluated
                        pass

            txt_file.write("=======Value Maps=======\n")
            for agent_idx, vmap in enumerate(value_maps):
                txt_file.write(f"Value map of agent {agent_idx} for target pile {pos}:\n")
                vmap = _as_torch(vmap).round(decimals=4)
                max_digits = max(len(str(vmap.max().item())), len(str(vmap.min().item())))
                for idx, row in enumerate(vmap):
                    txt_file.write(' '.join(f" {elem:>{max_digits + 1}}" for elem in row.tolist()))
                    txt_file.write("\n")
                txt_file.write("\n")
            txt_file.write("=======Likeliest Action=======\n")
            for agent_idx, amap in enumerate(likeliest_action):
                txt_file.write(f"Likeliest action map of agent {agent_idx} for target pile {pos}:\n")
                txt_file.write(np.array2string(amap))
                txt_file.write("\n")
            txt_file.write("=======Action Probabilities=======\n")
            for agent_idx, pmap in enumerate(action_probabilities):
                a2c_instance.action_probabilities[agent_idx].append(pmap)
                txt_file.write(f"Action probability map of agent {agent_idx} for target pile {pos}:\n")
                for d in range(pmap.shape[0]):
                    row = '['
                    for r in range(pmap.shape[1]):
                        row += "[" + ', '.join(f"{x:7.4f}" for x in pmap[d, r]) + "]"
                    txt_file.write(row + "]")
                    txt_file.write("\n")

    return action_probabilities

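A small reading aid for the action-probability maps written above (standalone sketch; per the code, each agent gets one array of shape (width, height, act_dim) with act_dim = 4 movement directions):

    import numpy as np

    act_dim = 4
    pmap = np.zeros((5, 5, act_dim))
    pmap[2, 3] = [0.1, 0.6, 0.2, 0.1]             # action distribution at grid position (2, 3)
    assert int(np.argmax(pmap[2, 3])) == 1        # this index is what the likeliest_action map stores
    assert abs(pmap[2, 3].sum() - 1.0) < 1e-9     # a valid distribution sums to one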
@@ -348,7 +348,6 @@ class Renderer:
        self.save_counter += 1
        full_path = os.path.join(out_dir, unique_filename)
        pygame.image.save(self.screen, full_path)
        print(f"Image saved as {unique_filename}")


if __name__ == '__main__':

@@ -118,9 +118,8 @@ class Gamestate(object):
        self._floortile_graph = None
        self.tests = StepTests(*tests)

        # Pointer that defines current spawn points of agents
        for agent in self.agents_conf:
            self.agents_conf[agent]["pos_pointer"] = 0
        # Initialize position pointers for agents
        self._initialize_position_pointers()

    def reset(self):
        self.curr_step = 0
@@ -138,6 +137,11 @@ class Gamestate(object):
    def __repr__(self):
        return f'{self.__class__.__name__}({len(self.entities)} Entitites @ Step {self.curr_step})'

    def _initialize_position_pointers(self):
        """ Initialize the position pointers for each agent in the configuration."""
        for agent in self.agents_conf:
            self.agents_conf[agent]["pos_pointer"] = 0

    @property
    def random_free_position(self) -> (int, int):
        """