added changes from code submission branch and coin entity

This commit is contained in:
Chanumask
2024-09-06 11:01:42 +02:00
parent 33e40deecf
commit 5476f617c6
42 changed files with 1429 additions and 68 deletions

View File

@@ -1 +0,0 @@
from marl_factory_grid.algorithms.marl.memory import MARLActorCriticMemory

View File

@@ -0,0 +1 @@
from marl_factory_grid.algorithms.rl.memory import MARLActorCriticMemory

View File

@@ -0,0 +1,297 @@
import os
import torch
from typing import Union, List
import numpy as np
from tqdm import tqdm
from marl_factory_grid.algorithms.rl.base_a2c import PolicyGradient
from marl_factory_grid.algorithms.rl.constants import Names
from marl_factory_grid.algorithms.rl.utils import transform_observations, _as_torch, is_door_close, \
get_coin_piles_positions, update_target_pile, update_ordered_coin_piles, get_all_collected_coin_piles, \
distribute_indices, set_agents_spawnpoints, get_ordered_coin_piles, handle_finished_episode, save_configs, \
save_agent_models, get_all_observations, get_agents_positions
from marl_factory_grid.algorithms.utils import add_env_props
from marl_factory_grid.utils.plotting.plot_single_runs import plot_action_maps, plot_reward_development, \
create_info_maps
# Short alias for the constants class holding configuration keys and entity/action names.
nms = Names
# Type alias for values that are either a plain Python list or a torch tensor.
ListOrTensor = Union[List, torch.Tensor]
class A2C:
def __init__(self, train_cfg, eval_cfg):
    """ Build the training and evaluation environments and initialize the agents.

    :param train_cfg: Configuration used during training. The number of agents is read
                      from ``train_cfg[nms.ENV][nms.N_AGENTS]``.
    :param eval_cfg: Configuration used during evaluation.
    """
    # Attributes that are populated by setup()
    self.results_path = None
    self.agents = None
    self.act_dim = self.obs_dim = None

    # One environment for training and one for evaluation
    self.factory = add_env_props(train_cfg)
    self.eval_factory = add_env_props(eval_cfg)
    self.__training = True

    # Training configuration is the active one until set_cfg(eval=True) is called
    self.train_cfg, self.eval_cfg = train_cfg, eval_cfg
    self.cfg = train_cfg
    self.n_agents = train_cfg[nms.ENV][nms.N_AGENTS]
    self.setup()

    # Logging containers: summed reward per episode and one (initially empty) list per agent
    self.reward_development = []
    self.action_probabilities = {}
    for agent_idx in range(self.n_agents):
        self.action_probabilities[agent_idx] = []
def setup(self):
    """ Initialize agents and create entry for run results according to configuration.

    Sets ``self.obs_dim``, ``self.act_dim`` and ``self.agents``. If saving/logging is enabled in the
    configuration, a new ``run<N>`` folder is created inside the ``study_out`` directory and the
    configurations are written to it.

    :raises FileNotFoundError: If logging is enabled and the ``study_out`` directory does not exist.
    """
    if self.cfg[nms.ALGORITHM][nms.PILE_OBSERVABILITY] == nms.ALL:
        # Agent position (x, y) followed by the position (x, y) of every coin pile
        self.obs_dim = 2 + 2 * len(get_coin_piles_positions(self.factory))
    else:
        # Agent position (x, y) followed by the position (x, y) of a single coin pile
        self.obs_dim = 4
    self.act_dim = 4  # The 4 movement directions
    self.agents = [PolicyGradient(self.factory, agent_id=i, obs_dim=self.obs_dim, act_dim=self.act_dim)
                   for i in range(self.n_agents)]

    if not self.cfg[nms.ENV][nms.SAVE_AND_LOG]:
        return

    # Define study_out_path (relative to the directory of this script) and check if it exists
    base_dir = os.path.dirname(os.path.abspath(__file__))
    study_out_path = os.path.abspath(os.path.join(base_dir, '../../../study_out'))
    if not os.path.exists(study_out_path):
        raise FileNotFoundError(f"The directory {study_out_path} does not exist.")

    # Create results folder. Only entries of the exact form "run<number>" are considered, so that
    # unrelated entries that merely start with "run" cannot crash the integer conversion.
    run_numbers = [int(run[3:]) for run in os.listdir(study_out_path)
                   if run.startswith("run") and run[3:].isdecimal()]
    next_run_number = max(run_numbers) + 1 if run_numbers else 0
    self.results_path = os.path.join(study_out_path, f"run{next_run_number}")
    os.mkdir(self.results_path)

    # Save settings in results folder
    save_configs(self.results_path, self.cfg, self.factory.conf, self.eval_factory.conf)
def set_cfg(self, eval=False):
    """ Select the active configuration.

    :param eval: If truthy, the evaluation configuration becomes active, otherwise the training one.
    """
    self.cfg = self.eval_cfg if eval else self.train_cfg
def load_agents(self, runs_list):
    """ Initialize networks with parameters of already trained agents.

    :param runs_list: Names of run folders inside ``./study_out``; the i-th entry provides the
                      policy and value net parameters for the i-th agent.
    """
    for agent_idx, run_name in enumerate(runs_list):
        run_path = f"./study_out/{run_name}"
        policy_net = self.agents[agent_idx].pi
        policy_net.load_model_parameters(f"{run_path}/PolicyNet_model_parameters.pth")
        value_net = self.agents[agent_idx].vf
        value_net.load_model_parameters(f"{run_path}/ValueNet_model_parameters.pth")
@torch.no_grad()
def train_loop(self):
""" Function for training agents """
# Runs episodes on the training environment until the configured number of steps is reached and,
# if saving/logging is enabled, plots the results and stores the agent models afterwards.
# NOTE(review): the whole loop runs under torch.no_grad(); the network updates are triggered through
# handle_finished_episode below -- confirm that gradient tracking is re-enabled there.
env = self.factory
# n_steps: interval (in global steps) at which an episode is cut off, 0 disables the cut-off
# max_steps: total number of environment steps to train for
n_steps, max_steps = [self.cfg[nms.ALGORITHM][k] for k in [nms.N_STEPS, nms.MAX_STEPS]]
# NOTE(review): `episode` is only incremented, it is not read anywhere in this method
global_steps, episode = 0, 0
indices = distribute_indices(env, self.cfg, self.n_agents)
coin_piles_positions = get_coin_piles_positions(env)
target_pile = [partition[0] for partition in
indices] # list of pointers that point to the current target pile for each agent
# Per agent: mapping of pile position -> whether that pile has been collected
collected_coin_piles = [{pos: False for pos in coin_piles_positions} for _ in range(self.n_agents)]
pbar = tqdm(total=max_steps)
# Outer loop: one iteration per episode
while global_steps < max_steps:
_ = env.reset()
if self.cfg[nms.ENV][nms.TRAIN_RENDER]:
env.render()
set_agents_spawnpoints(env, self.n_agents)
ordered_coin_piles = get_ordered_coin_piles(env, collected_coin_piles, self.cfg, self.n_agents)
# Reset current target pile at episode begin if all piles have to be collected in one episode
if self.cfg[nms.ALGORITHM][nms.PILE_ALL_DONE] == nms.ALL:
target_pile = [partition[0] for partition in indices]
collected_coin_piles = [{pos: False for pos in coin_piles_positions} for _ in range(self.n_agents)]
# Supply each agent with its local observation
obs = transform_observations(env, ordered_coin_piles, target_pile, self.cfg, self.n_agents)
done, rew_log = [False] * self.n_agents, 0
# Inner loop: one iteration per environment step
while not all(done):
# Automatic door handling is only used if the environment contains door entities
action = self.use_door_or_move(env, obs, collected_coin_piles) \
if nms.DOORS in env.state.entities.keys() else self.get_actions(obs)
# The observation returned by the environment is replaced by the transformed local observation
_, next_obs, reward, done, info = env.step(action)
next_obs = transform_observations(env, ordered_coin_piles, target_pile, self.cfg, self.n_agents)
# Handle case where agent is on field with coin
reward, done = self.handle_coin(env, collected_coin_piles, ordered_coin_piles, target_pile, indices,
reward, done)
# Cut the episode off every n_steps global steps (disabled for n_steps == 0)
if n_steps != 0 and (global_steps + 1) % n_steps == 0: done = True
# Expand a single done flag to one flag per agent
done = [done] * self.n_agents if isinstance(done, bool) else done
for ag_i, agent in enumerate(self.agents):
# Only actions with an index below act_dim are written to the rollout buffer
# (presumably this excludes the automatic Noop/door actions -- TODO confirm)
if action[ag_i] in range(self.act_dim):
# Add agent results into respective rollout buffers
# (overwrites the last buffer entry, keeping the value estimate stored in its last slot)
agent._episode[-1] = (next_obs[ag_i], action[ag_i], reward[ag_i], agent._episode[-1][-1])
# Visualize state update
if self.cfg[nms.ENV][nms.TRAIN_RENDER]: env.render()
obs = next_obs
# Update the agents' networks as soon as the episode has ended for all agents
if all(done): handle_finished_episode(obs, self.agents, self.cfg)
global_steps += 1
rew_log += sum(reward)
if global_steps >= max_steps: break
# Log the summed reward of the finished episode
self.reward_development.append(rew_log)
episode += 1
# Advance the progress bar to the current number of global steps
pbar.update(global_steps - pbar.n)
pbar.close()
if self.cfg[nms.ENV][nms.SAVE_AND_LOG]:
plot_reward_development(self.reward_development, self.results_path)
create_info_maps(env, get_all_observations(env, self.cfg, self.n_agents),
get_coin_piles_positions(env), self.results_path, self.agents, self.act_dim, self)
save_agent_models(self.results_path, self.agents)
plot_action_maps(env, [self], self.results_path)
@torch.inference_mode(True)
def eval_loop(self, n_episodes):
""" Function for performing inference """
# Runs `n_episodes` episodes on the evaluation environment using the evaluation configuration.
# NOTE(review): the method has no return statement; `results`, `rew_log`, `eps_rew` and `info` are
# assigned but never read in this method.
env = self.eval_factory
self.set_cfg(eval=True)
episode, results = 0, []
coin_piles_positions = get_coin_piles_positions(env)
indices = distribute_indices(env, self.cfg, self.n_agents)
target_pile = [partition[0] for partition in
indices] # list of pointers that point to the current target pile for each agent
# Per agent: mapping of pile position -> collected flag. In the distributed setting every agent only
# tracks the piles whose indices were assigned to it.
if self.cfg[nms.ALGORITHM][nms.PILE_ALL_DONE] == nms.DISTRIBUTED:
collected_coin_piles = [{coin_piles_positions[idx]: False for idx in indices[i]} for i in
range(self.n_agents)]
else: collected_coin_piles = [{pos: False for pos in coin_piles_positions} for _ in range(self.n_agents)]
# One iteration per evaluation episode
while episode < n_episodes:
_ = env.reset()
set_agents_spawnpoints(env, self.n_agents)
if self.cfg[nms.ENV][nms.EVAL_RENDER]:
# Don't render auxiliary piles
if self.cfg[nms.ALGORITHM][nms.AUXILIARY_PILES]:
# Piles with an even index are treated as auxiliary piles
auxiliary_piles = [pile for idx, pile in enumerate(env.state.entities[nms.COIN_PILES]) if
idx % 2 == 0]
for pile in auxiliary_piles:
pile.set_new_amount(0)
env.render()
env._renderer.fps = 5 # Slow down agent movement
# Reset current target pile at episode begin if all piles have to be collected in one episode
if self.cfg[nms.ALGORITHM][nms.PILE_ALL_DONE] in [nms.ALL, nms.DISTRIBUTED, nms.SHARED]:
target_pile = [partition[0] for partition in indices]
if self.cfg[nms.ALGORITHM][nms.PILE_ALL_DONE] == nms.DISTRIBUTED:
collected_coin_piles = [{coin_piles_positions[idx]: False for idx in indices[i]} for i in
range(self.n_agents)]
else: collected_coin_piles = [{pos: False for pos in coin_piles_positions} for _ in range(self.n_agents)]
ordered_coin_piles = get_ordered_coin_piles(env, collected_coin_piles, self.cfg, self.n_agents)
# Supply each agent with its local observation
obs = transform_observations(env, ordered_coin_piles, target_pile, self.cfg, self.n_agents)
done, rew_log, eps_rew = [False] * self.n_agents, 0, torch.zeros(self.n_agents)
# One iteration per environment step
while not all(done):
# Automatic door handling is only used if the environment contains door entities
action = self.use_door_or_move(env, obs, collected_coin_piles, det=True) \
if nms.DOORS in env.state.entities.keys() else self.execute_policy(obs, env,
collected_coin_piles) # zero exploration
_, next_obs, reward, done, info = env.step(action)
# Handle case where agent is on field with coin
reward, done = self.handle_coin(env, collected_coin_piles, ordered_coin_piles, target_pile, indices,
reward, done)
# Get transformed next_obs that might have been updated because of handle_coin
next_obs = transform_observations(env, ordered_coin_piles, target_pile, self.cfg, self.n_agents)
# Expand a single done flag to one flag per agent
done = [done] * self.n_agents if isinstance(done, bool) else done
if self.cfg[nms.ENV][nms.EVAL_RENDER]: env.render()
obs = next_obs
episode += 1
# -------------------------------------- HELPER FUNCTIONS ------------------------------------------------- #
def get_actions(self, observations) -> ListOrTensor:
    """ Given local observations, query every agent for its next action.

    :param observations: One local observation per agent, indexed like ``self.agents``.
    :return: List with the result of ``agent.step`` for each agent.
    """
    actions = []
    for ag_i, agent in enumerate(self.agents):
        # Agents expect a flat float32 tensor
        flat_obs = _as_torch(observations[ag_i]).view(-1).to(torch.float32)
        actions.append(agent.step(flat_obs))
    return actions
def execute_policy(self, observations, env, collected_coin_piles) -> ListOrTensor:
    """ Execute agent policies deterministically for inference.

    :param observations: One local observation per agent, indexed like ``self.agents``.
    :param env: Environment whose agents provide the list of available actions.
    :param collected_coin_piles: Per agent, a mapping of pile position to collected flag.
    :return: List with one action per agent.
    """
    actions = []
    for ag_i, agent in enumerate(self.agents):
        flat_obs = _as_torch(observations[ag_i]).view(-1).to(torch.float32)
        actions.append(agent.policy(flat_obs))

    # Agents that already collected all of their piles are forced to perform the Noop action
    for agent_idx in range(self.n_agents):
        if not all(collected_coin_piles[agent_idx].values()):
            continue
        noop_index = next(action_i for action_i, a in enumerate(env.state[nms.AGENT][agent_idx].actions)
                          if a.name == nms.NOOP)
        actions[agent_idx] = np.array(noop_index)
    return actions
def use_door_or_move(self, env, obs, collected_coin_piles, det=False):
    """ Function that handles automatic actions like door opening and forced Noop.

    :param env: Environment the agents act in.
    :param obs: One local observation per agent, indexed like ``self.agents``.
    :param collected_coin_piles: Per agent, a mapping of pile position to collected flag.
    :param det: If True, the policy is evaluated deterministically and no experience is recorded
                for forced Noop actions.
    :return: List with one action index per agent.
    """

    def index_of(agent_idx, action_name):
        # Position of the action called `action_name` in the agent's action list
        return next(action_i for action_i, a in enumerate(env.state[nms.AGENT][agent_idx].actions)
                    if a.name == action_name)

    action = []
    for agent_idx, agent in enumerate(self.agents):
        agent_obs = _as_torch(obs[agent_idx]).view(-1).to(torch.float32)

        # Use Noop operation if agent already reached its target. (Only relevant for two-rooms setting)
        if all(collected_coin_piles[agent_idx].values()):
            action.append(index_of(agent_idx, nms.NOOP))
            if not det:
                # Include agent experience entry manually
                agent._episode.append((None, None, None, agent.vf(agent_obs)))
            continue

        door = is_door_close(env, agent_idx)
        if door and door.is_closed:
            # Open the neighbouring door; this action is not included in the agent experience
            action.append(index_of(agent_idx, nms.USE_DOOR))
        elif det:
            action.append(int(agent.pi(agent_obs, det=True)[0]))
        else:
            action.append(int(agent.step(agent_obs)))
    return action
def handle_coin(self, env, collected_coin_piles, ordered_coin_piles, target_pile, indices, reward, done):
""" Check if agent moved on field with coin. If that is the case collect coin automatically """
# Returns the (possibly updated) `reward` and `done`. Entries of `reward` and `collected_coin_piles`
# are modified in place.
# NOTE(review): indentation is not visible in this view, so the exact nesting of the branches below
# should be confirmed against the original file.
agents_positions = get_agents_positions(env, self.n_agents)
coin_piles_positions = get_coin_piles_positions(env)
# Only do any work if at least one agent stands on a coin pile position
if any([True for pos in agents_positions if pos in coin_piles_positions]):
# Only simulate collecting the coin
for idx, pos in enumerate(agents_positions):
# The pile has to be tracked by this agent and must not have been collected by it yet
if pos in collected_coin_piles[idx].keys() and not collected_coin_piles[idx][pos]:
# If coin piles should be collected in a specific order
if ordered_coin_piles[idx]:
# Only the agent's current target pile counts
if pos == ordered_coin_piles[idx][target_pile[idx]]:
reward[idx] += 50
collected_coin_piles[idx][pos] = True
# Set pointer to next coin pile
update_target_pile(env, idx, target_pile, indices, self.cfg)
update_ordered_coin_piles(idx, collected_coin_piles, ordered_coin_piles, env,
self.cfg, self.n_agents)
# In the single-pile setting the episode ends after one collected pile
if self.cfg[nms.ALGORITHM][nms.PILE_ALL_DONE] == nms.SINGLE:
done = True
if all(collected_coin_piles[idx].values()):
# Reset collected_coin_piles indicator
for pos in coin_piles_positions:
collected_coin_piles[idx][pos] = False
else:
reward[idx] += 50
collected_coin_piles[idx][pos] = True
# Indicate that renderer can hide coin pile
coin_at_position = env.state[nms.COIN_PILES].by_pos(pos)
coin_at_position[0].set_new_amount(0)
# Episode ends once every agent has collected all piles it tracks
if self.cfg[nms.ALGORITHM][nms.PILE_ALL_DONE] in [nms.ALL, nms.DISTRIBUTED]:
if all([all(collected_coin_piles[i].values()) for i in range(self.n_agents)]):
done = True
elif self.cfg[nms.ALGORITHM][nms.PILE_ALL_DONE] == nms.SHARED:
# End episode if both agents together have collected all coin piles
if all(get_all_collected_coin_piles(coin_piles_positions, collected_coin_piles, self.n_agents).values()):
done = True
return reward, done

View File

@@ -0,0 +1,112 @@
import numpy as np
import torch as th
import scipy as sp
from collections import deque
from torch import nn
def cumulate_discount(x, gamma):
    """ Compute the discounted cumulative sum of `x` along its first axis.

    For every index t the result holds ``x[t] + gamma * x[t+1] + gamma**2 * x[t+2] + ...``.

    :param x: Array-like sequence (e.g. the rewards of a rollout), ordered from first to last step.
    :param gamma: Discount factor.
    :return: Array of the same length as `x`. It is a reversed view, so callers that need a
             contiguous array should copy it.
    """
    # Import the submodule explicitly: a bare ``import scipy`` does not guarantee on every SciPy
    # version that ``scipy.signal`` is available as an attribute.
    from scipy.signal import lfilter
    # Filtering the reversed sequence with y[n] = x[n] + gamma * y[n-1] accumulates from the end
    return lfilter([1], [1, - gamma], x[::-1], axis=0)[::-1]
class Net(th.nn.Module):
    """ Fully connected network that owns its Adam optimizer. """

    def __init__(self, shape, activation, lr):
        """
        :param shape: Sequence of layer sizes ``[input, hidden..., output]``.
        :param activation: Module class that is instantiated after every hidden linear layer.
        :param lr: Learning rate of the Adam optimizer.
        """
        super().__init__()
        # Linear layers alternate with activations; the output layer is followed by Identity
        last_layer_idx = len(shape) - 2
        layers = []
        for layer_idx, (n_in, n_out) in enumerate(zip(shape[:-1], shape[1:])):
            layers.append(th.nn.Linear(n_in, n_out))
            if layer_idx == last_layer_idx:
                layers.append(th.nn.Identity())
            else:
                layers.append(activation())
        self.net = th.nn.Sequential(*layers)
        self.optimizer = th.optim.Adam(self.net.parameters(), lr=lr)

        # Initialize weights uniformly, so that for the policy net all actions have approximately the same
        # probability in the beginning
        for module in self.modules():
            if not isinstance(module, nn.Linear):
                continue
            nn.init.uniform_(module.weight, a=-0.1, b=0.1)
            if module.bias is not None:
                nn.init.uniform_(module.bias, a=-0.1, b=0.1)

    def save_model(self, path):
        """ Save the whole network module into the directory `path`. """
        target = f"{path}/{self.__class__.__name__}_model.pth"
        th.save(self.net, target)

    def save_model_parameters(self, path):
        """ Save only the network parameters (state dict) into the directory `path`. """
        target = f"{path}/{self.__class__.__name__}_model_parameters.pth"
        th.save(self.net.state_dict(), target)

    def load_model_parameters(self, path):
        """ Load network parameters from the file `path` and switch the network to eval mode. """
        state_dict = th.load(path)
        self.net.load_state_dict(state_dict)
        self.net.eval()
class ValueNet(Net):
    """ Critic network that maps an observation to a single state value. """

    def __init__(self, obs_dim, hidden_sizes=[64, 64], activation=th.nn.ReLU, lr=1e-3):
        """
        :param obs_dim: Size of the flat observation vector.
        :param hidden_sizes: Sizes of the hidden layers (only read, never modified).
        :param activation: Module class used after every hidden layer.
        :param lr: Learning rate of the optimizer.
        """
        super().__init__([obs_dim] + list(hidden_sizes) + [1], activation, lr)

    def forward(self, obs):
        """ Return the value estimate(s) for `obs`; the last dimension of the output has size 1. """
        return self.net(obs)

    def loss(self, states, returns):
        """ Mean squared error between the predicted values of `states` and `returns`.

        :param states: Batch of observations.
        :param returns: Target returns, one per observation.
        :return: Scalar loss tensor.
        """
        values = self(states)
        # The network output has a trailing dimension of size 1. Subtracting it from a 1-D tensor
        # of returns would broadcast to an (N, N) matrix, so align the shapes first.
        if th.is_tensor(returns) and returns.numel() == values.numel():
            values = values.reshape(returns.shape)
        return ((returns - values) ** 2).mean()
class PolicyNet(Net):
    """ Actor network that maps an observation to a categorical distribution over actions. """

    def __init__(self, obs_dim, act_dim, hidden_sizes=[64, 64], activation=th.nn.Tanh, lr=3e-4):
        """
        :param obs_dim: Size of the flat observation vector.
        :param act_dim: Number of discrete actions.
        :param hidden_sizes: Sizes of the hidden layers (only read, never modified).
        :param activation: Module class used after every hidden layer.
        :param lr: Learning rate of the optimizer.
        """
        super().__init__([obs_dim] + list(hidden_sizes) + [act_dim], activation, lr)
        # The network output is interpreted as unnormalized log-probabilities (logits)
        self.distribution = lambda obs: th.distributions.Categorical(logits=self.net(obs))

    def forward(self, obs, act=None, det=False):
        """ Evaluate the policy for an observation.

        :param obs: Observation tensor (a single observation or a batch of observations).
        :param act: Optional action(s). If given, their log-probability is evaluated.
        :param det: If True (and `act` is None), the most likely action is chosen instead of sampling.
        :return: ``(distribution, log_prob(act))`` if `act` is given, otherwise
                 ``(action, log_prob(action))`` for the chosen action.
        """
        pi = self.distribution(obs)
        if act is not None:
            return pi, pi.log_prob(act)
        if det:
            # Take the argmax over the action dimension only, so that a batch of observations
            # yields one action per observation instead of an index into the flattened logits
            act = self.net(obs).argmax(dim=-1)
        else:
            act = pi.sample()  # sample from the learned distribution
        return act, pi.log_prob(act)

    def loss(self, states, actions, advantages):
        """ Policy gradient loss: negative mean of log-probability times advantage. """
        _, logp = self.forward(states, actions)
        loss = -(logp * advantages).mean()
        return loss
class PolicyGradient:
    """ Autonomous agent using vanilla policy gradient. """

    def __init__(self, env, seed=42, gamma=0.99, agent_id=0, act_dim=None, obs_dim=None):
        """
        :param env: Environment the agent acts in.
        :param seed: Seed applied to torch and numpy.
        :param gamma: Discount factor.
        :param agent_id: Index of the agent, used to look up its observation and action space.
        :param act_dim: Number of actions. Read from the environment's action space if not given.
        :param obs_dim: Size of the flat observation. Derived from the environment's observation
                        space if not given.
        """
        self.env = env
        self.gamma = gamma
        # Seed torch and numpy
        th.manual_seed(seed)
        np.random.seed(seed)

        # Rollout buffer of the running episode, returns of the last 100 episodes and a step counter
        self._episode = []
        self.ep_returns = deque(maxlen=100)
        self.num_steps = 0

        # Get observation and action shapes
        if not obs_dim:
            if len(env.state.entities.by_name("Agents")) == 1:
                # Single agent case
                obs_size = env.observation_space.shape
            else:
                # Multi-agent case
                obs_size = env.observation_space[agent_id].shape
            obs_dim = np.prod(obs_size)
        if not act_dim:
            act_dim = env.action_space[agent_id].n

        self.vf = ValueNet(obs_dim)  # Value network (critic)
        self.pi = PolicyNet(obs_dim, act_dim)  # Policy network (actor)

    def step(self, obs):
        """ Sample an action for `obs` and open a new rollout entry holding the critic's value.

        :return: The sampled action as numpy array.
        """
        with th.no_grad():
            action, _ = self.pi(obs)
            value = self.vf(obs)
        # State, action and reward are filled in later by the training loop
        self._episode.append((None, None, None, value))
        return action.numpy()

    def policy(self, obs, det=True):
        """ Return the policy's action for `obs` as numpy array (most likely action if `det`). """
        chosen_action = self.pi(obs, det=det)[0]
        return chosen_action.numpy()

    def finish_episode(self):
        """ Return the recorded rollout as arrays (states, actions, rewards, values) and clear the buffer.

        The sum of the episode's rewards is appended to ``self.ep_returns``.
        """
        states, actions, rewards, values = (np.array(column) for column in zip(*self._episode))
        self.ep_returns.append(sum(rewards))
        self._episode = []
        return states, actions, rewards, values

    def train(self, states, actions, returns, advantages):
        """ Perform one optimizer step on the policy net followed by one on the value net. """
        # Reset gradients of both optimizers
        self.pi.optimizer.zero_grad()
        self.vf.optimizer.zero_grad()
        # Reduce dimensionality to rollout_dim x input_dim
        states = states.flatten(1, -1)

        actor_loss = self.pi.loss(states, actions, advantages)
        actor_loss.backward()
        self.pi.optimizer.step()

        critic_loss = self.vf.loss(states, returns)
        critic_loss.backward()
        self.vf.optimizer.step()

View File

@@ -2,7 +2,7 @@ import torch
from typing import Union, List, Dict
import numpy as np
from torch.distributions import Categorical
from marl_factory_grid.algorithms.marl.memory import MARLActorCriticMemory
from marl_factory_grid.algorithms.rl.memory import MARLActorCriticMemory
from marl_factory_grid.algorithms.utils import add_env_props, instantiate_class
from pathlib import Path
import pandas as pd

View File

@@ -1,5 +1,5 @@
agent:
classname: marl_factory_grid.algorithms.marl.networks.RecurrentAC
classname: marl_factory_grid.algorithms.rl.networks.RecurrentAC
n_agents: 2
obs_emb_size: 96
action_emb_size: 16
@@ -18,7 +18,7 @@ env:
eval_render: True
save_and_log: True
record: False
method: marl_factory_grid.algorithms.marl.LoopSEAC
method: marl_factory_grid.algorithms.rl.LoopSEAC
algorithm:
gamma: 0.99
entropy_coef: 0.01

View File

@@ -1,5 +1,5 @@
agent:
classname: marl_factory_grid.algorithms.marl.networks.RecurrentAC
classname: marl_factory_grid.algorithms.rl.networks.RecurrentAC
n_agents: 2
obs_emb_size: 96
action_emb_size: 16
@@ -18,7 +18,7 @@ env:
eval_render: True
save_and_log: True
record: False
method: marl_factory_grid.algorithms.marl.LoopSEAC
method: marl_factory_grid.algorithms.rl.LoopSEAC
algorithm:
gamma: 0.99
entropy_coef: 0.01

View File

@@ -1,5 +1,5 @@
agent:
classname: marl_factory_grid.algorithms.marl.networks.RecurrentAC
classname: marl_factory_grid.algorithms.rl.networks.RecurrentAC
n_agents: 1
obs_emb_size: 96
action_emb_size: 16
@@ -18,7 +18,7 @@ env:
eval_render: True
save_and_log: True
record: False
method: marl_factory_grid.algorithms.marl.LoopSEAC
method: marl_factory_grid.algorithms.rl.LoopSEAC
algorithm:
gamma: 0.99
entropy_coef: 0.01

View File

@@ -1,5 +1,5 @@
agent:
classname: marl_factory_grid.algorithms.marl.networks.RecurrentAC
classname: marl_factory_grid.algorithms.rl.networks.RecurrentAC
n_agents: 1
obs_emb_size: 96
action_emb_size: 16
@@ -18,7 +18,7 @@ env:
eval_render: True
save_and_log: False
record: False
method: marl_factory_grid.algorithms.marl.LoopSEAC
method: marl_factory_grid.algorithms.rl.LoopSEAC
algorithm:
gamma: 0.99
entropy_coef: 0.01

View File

@@ -0,0 +1,37 @@
class Names:
    """ String constants for configuration keys, configuration values and environment names. """

    # Top-level configuration sections
    ENV = 'env'
    ALGORITHM = 'algorithm'

    # Keys of the environment section
    ENV_NAME = 'env_name'
    N_AGENTS = 'n_agents'
    TRAIN_RENDER = 'train_render'
    EVAL_RENDER = 'eval_render'
    SAVE_AND_LOG = 'save_and_log'

    # Keys of the algorithm section
    MAX_STEPS = 'max_steps'
    N_STEPS = 'n_steps'
    GAMMA = 'gamma'
    ADVANTAGE = 'advantage'
    CHUNK_EPISODE = 'chunk-episode'
    PILE_OBSERVABILITY = 'pile-observability'
    PILE_ORDER = 'pile-order'
    PILE_ALL_DONE = 'pile_all_done'
    AUXILIARY_PILES = 'auxiliary_piles'

    # Values of the pile related options
    ALL = 'all'
    FIXED = 'fixed'
    AGENTS = 'agents'
    DYNAMIC = 'dynamic'
    SMART = 'smart'
    SINGLE = 'single'
    DISTRIBUTED = 'distributed'
    SHARED = 'shared'

    # Values of the advantage option
    REINFORCE = 'reinforce'
    ADVANTAGE_AC = 'Advantage-AC'
    TD_ADVANTAGE_AC = 'TD-Advantage-AC'

    # Entity names
    AGENT = 'Agent'
    DIRT_PILES = 'DirtPiles'
    COIN_PILES = 'CoinPiles'
    DOORS = 'Doors'
    DOOR = 'Door'

    # Keys of the agents' spawn point configuration
    POS_POINTER = 'pos_pointer'
    POSITIONS = 'positions'

    # Action names
    NOOP = 'Noop'
    USE_DOOR = 'use_door'

View File

@@ -1,9 +1,9 @@
import torch
from marl_factory_grid.algorithms.marl.base_ac import BaseActorCritic, nms
from marl_factory_grid.algorithms.rl.base_ac import BaseActorCritic, nms
from marl_factory_grid.algorithms.utils import instantiate_class
from pathlib import Path
from natsort import natsorted
from marl_factory_grid.algorithms.marl.memory import MARLActorCriticMemory
from marl_factory_grid.algorithms.rl.memory import MARLActorCriticMemory
class LoopIAC(BaseActorCritic):

View File

@@ -1,6 +1,6 @@
from marl_factory_grid.algorithms.marl.base_ac import Names as nms
from marl_factory_grid.algorithms.marl.snac import LoopSNAC
from marl_factory_grid.algorithms.marl.memory import MARLActorCriticMemory
from marl_factory_grid.algorithms.rl.base_ac import Names as nms
from marl_factory_grid.algorithms.rl.snac import LoopSNAC
from marl_factory_grid.algorithms.rl.memory import MARLActorCriticMemory
import torch
from torch.distributions import Categorical
from marl_factory_grid.algorithms.utils import instantiate_class

View File

@@ -1,8 +1,8 @@
import torch
from torch.distributions import Categorical
from marl_factory_grid.algorithms.marl.iac import LoopIAC
from marl_factory_grid.algorithms.marl.base_ac import nms
from marl_factory_grid.algorithms.marl.memory import MARLActorCriticMemory
from marl_factory_grid.algorithms.rl.iac import LoopIAC
from marl_factory_grid.algorithms.rl.base_ac import nms
from marl_factory_grid.algorithms.rl.memory import MARLActorCriticMemory
class LoopSEAC(LoopIAC):

View File

@@ -1,5 +1,5 @@
from marl_factory_grid.algorithms.marl.base_ac import BaseActorCritic
from marl_factory_grid.algorithms.marl.base_ac import nms
from marl_factory_grid.algorithms.rl.base_ac import BaseActorCritic
from marl_factory_grid.algorithms.rl.base_ac import nms
import torch
from torch.distributions import Categorical
from pathlib import Path

View File

@@ -0,0 +1,337 @@
import copy
from typing import List
import numpy as np
import torch
from marl_factory_grid.algorithms.rl.constants import Names as nms
from marl_factory_grid.algorithms.rl.base_a2c import cumulate_discount
def _as_torch(x):
""" Helper function to convert different list types to a torch tensor """
if isinstance(x, np.ndarray):
return torch.from_numpy(x)
elif isinstance(x, List):
return torch.tensor(x)
elif isinstance(x, (int, float)):
return torch.tensor([x])
return x
def transform_observations(env, ordered_coins, target_coin, cfg, n_agents):
    """ Function that extracts local observations from global state
    Requires that agents have observations -CoinPiles and -Self (cf. environment configs)

    :param env: Environment to read the agent positions from.
    :param ordered_coins: Per agent, the list of coin pile positions in collection order.
    :param target_coin: Per agent, the index of its current target pile in `ordered_coins`.
    :param cfg: Configuration providing the pile observability setting.
    :param n_agents: Number of agents.
    :return: One tensor per agent: the agent position followed by either all pile positions or
             only the position of the current target pile.
    """
    agents_positions = get_agents_positions(env, n_agents)
    show_all_piles = cfg[nms.ALGORITHM][nms.PILE_OBSERVABILITY] == nms.ALL

    if show_all_piles:
        trans_obs = [torch.zeros(2 + 2 * len(ordered_coins[0])) for _ in agents_positions]
    else:
        # Only show current target pile
        trans_obs = [torch.zeros(4) for _ in agents_positions]

    for agent_idx, agent_pos in enumerate(agents_positions):
        agent_obs = trans_obs[agent_idx]
        agent_obs[0] = agent_pos[0]
        agent_obs[1] = agent_pos[1]
        if show_all_piles:
            # Pile coordinates are stored pairwise behind the agent position
            for pile_slot, coin_pos in enumerate(ordered_coins[agent_idx]):
                agent_obs[2 + 2 * pile_slot] = coin_pos[0]
                agent_obs[3 + 2 * pile_slot] = coin_pos[1]
        else:
            agent_obs[2] = ordered_coins[agent_idx][target_coin[agent_idx]][0]
            agent_obs[3] = ordered_coins[agent_idx][target_coin[agent_idx]][1]
    return trans_obs
def get_all_observations(env, cfg, n_agents):
    """ Helper function that returns all possible agent observations.

    :param env: Environment providing the coin piles and the list of floor positions.
    :param cfg: Configuration providing the pile observability setting.
    :param n_agents: Number of agents. Kept for interface compatibility; the returned observations
                     do not depend on it.
    :return: List of observation layers. Each layer is a list with one observation tensor per floor
             position. If all piles are observable there is a single layer, otherwise there is one
             layer per coin pile.
    """
    coins_positions = get_coin_piles_positions(env)

    if cfg[nms.ALGORITHM][nms.PILE_OBSERVABILITY] == nms.ALL:
        # Single layer that contains the positions of all piles behind the agent position
        layer = torch.zeros(2 + 2 * len(coins_positions))
        for pile_idx, pile_pos in enumerate(coins_positions):
            layer[2 + 2 * pile_idx] = pile_pos[0]
            layer[3 + 2 * pile_idx] = pile_pos[1]
        obs = [layer]
    else:
        # Have multiple observation layers of the map for each coin pile one.
        # Exactly one template per pile: creating more templates than result lists (as happens when
        # the count is multiplied by the number of agents) leads to an IndexError below.
        obs = []
        for pile_pos in coins_positions:
            layer = torch.zeros(4)
            layer[2] = pile_pos[0]
            layer[3] = pile_pos[1]
            obs.append(layer)

    observations = [[] for _ in obs]
    for pos in env.state.entities.floorlist:
        for obs_layer, template in enumerate(obs):
            # Copy the template so that every floor position gets its own tensor
            observation = template.clone()
            observation[0] = pos[0]
            observation[1] = pos[1]
            observations[obs_layer].append(observation)
    return observations
def get_coin_piles_positions(env):
    """ Get positions of coin piles on the map.

    :return: List with the ``pos`` of every coin pile, in index order.
    """
    positions = []
    for pile_idx in range(len(env.state.entities[nms.COIN_PILES])):
        positions.append(env.state.entities[nms.COIN_PILES][pile_idx].pos)
    return positions
def get_agents_positions(env, n_agents):
    """ Get positions of agents on the map.

    :return: List with the ``pos`` of the first `n_agents` moving entities of the environment state.
    """
    positions = []
    for agent_idx in range(n_agents):
        positions.append(env.state.moving_entites[agent_idx].pos)
    return positions
def get_ordered_coin_piles(env, collected_coins, cfg, n_agents):
""" This function determines in which order the agents should collect the coin piles
Each agent can have its individual pile order """
# Returns a list with one list of pile positions per agent.
# NOTE(review): for an invalid pile order option the process is terminated via exit() after printing
# a message; exit() without argument reports a success status -- confirm that this is intended.
ordered_coin_piles = [[] for _ in range(n_agents)]
coin_piles_positions = get_coin_piles_positions(env)
agents_positions = get_agents_positions(env, n_agents)
for agent_idx in range(n_agents):
if cfg[nms.ALGORITHM][nms.PILE_ORDER] in [nms.FIXED, nms.AGENTS]:
# All agents reference the same position list object in this case
ordered_coin_piles[agent_idx] = coin_piles_positions
elif cfg[nms.ALGORITHM][nms.PILE_ORDER] in [nms.SMART, nms.DYNAMIC]:
# Calculate distances for remaining unvisited coin piles
remaining_target_piles = [pos for pos, value in collected_coins[agent_idx].items() if not value]
pile_distances = {pos: 0 for pos in remaining_target_piles}
agent_pos = agents_positions[agent_idx]
# Manhattan distance between the agent and every remaining pile
for pos in remaining_target_piles:
pile_distances[pos] = np.abs(agent_pos[0] - pos[0]) + np.abs(agent_pos[1] - pos[1])
if cfg[nms.ALGORITHM][nms.PILE_ORDER] == nms.SMART:
# Check if there is an agent on the direct path to any of the remaining coin piles
for pile_pos in remaining_target_piles:
for other_pos in agents_positions:
if other_pos != agent_pos:
# Only consider piles that share a row or column with both agents
if agent_pos[0] == other_pos[0] == pile_pos[0] or agent_pos[1] == other_pos[1] == pile_pos[
1]:
# Get the line between the agent and the target
path = bresenham(agent_pos[0], agent_pos[1], pile_pos[0], pile_pos[1])
# Check if the entity lies on the path between the agent and the target
if other_pos in path:
# Penalize the pile by the distance between the two agents
pile_distances[pile_pos] += np.abs(agent_pos[0] - other_pos[0]) + np.abs(
agent_pos[1] - other_pos[1])
# Sort the remaining piles by ascending (penalized) distance
sorted_pile_distances = dict(sorted(pile_distances.items(), key=lambda item: item[1]))
# Insert already visited coin piles
ordered_coin_piles[agent_idx] = [pos for pos in coin_piles_positions if pos not in remaining_target_piles]
# Fill up with sorted positions
for pos in sorted_pile_distances.keys():
ordered_coin_piles[agent_idx].append(pos)
else:
print("Not a valid pile order option.")
exit()
return ordered_coin_piles
def bresenham(x0, y0, x1, y1):
    """Bresenham's line algorithm to get the coordinates of a line between two points.

    :return: List of ``(x, y)`` tuples starting at ``(x0, y0)`` and ending at ``(x1, y1)``.
    """
    dx, dy = abs(x1 - x0), abs(y1 - y0)
    # Step direction along each axis
    step_x = 1 if x0 < x1 else -1
    step_y = 1 if y0 < y1 else -1
    err = dx - dy

    coordinates = [(x0, y0)]
    while not (x0 == x1 and y0 == y1):
        doubled_err = 2 * err
        if doubled_err > -dy:
            err -= dy
            x0 += step_x
        if doubled_err < dx:
            err += dx
            y0 += step_y
        coordinates.append((x0, y0))
    return coordinates
def update_ordered_coin_piles(agent_idx, collected_coin_piles, ordered_coin_piles, env, cfg, n_agents):
    """ Update the order of the remaining coin piles.

    The pile order is recomputed for all agents, but only the entry of `agent_idx` in
    `ordered_coin_piles` is overwritten (in place).
    """
    # Only update ordered_coin_pile for agent that reached its target pile
    recomputed = get_ordered_coin_piles(env, collected_coin_piles, cfg, n_agents)
    agent_order = ordered_coin_piles[agent_idx]
    for slot in range(len(agent_order)):
        agent_order[slot] = recomputed[agent_idx][slot]
def distribute_indices(env, cfg, n_agents):
""" Distribute coin piles evenly among the agents """
# Returns a list of index lists; in the simple cases below there is one list per agent.
# NOTE(review): indentation is not visible in this view, so the exact nesting of the auxiliary pile
# handling should be confirmed against the original file.
indices = []
n_coin_piles = len(get_coin_piles_positions(env))
agents_positions = get_agents_positions(env, n_agents)
# With a single pile or a fixed/dynamic/smart pile order every agent starts with pile index 0
if n_coin_piles == 1 or cfg[nms.ALGORITHM][nms.PILE_ORDER] in [nms.FIXED, nms.DYNAMIC, nms.SMART]:
indices = [[0] for _ in range(n_agents)]
else:
# Split the pile indices into consecutive, almost equally sized ranges
base_count = n_coin_piles // n_agents
remainder = n_coin_piles % n_agents
start_index = 0
for i in range(n_agents):
# Add an extra index to the first 'remainder' objects
end_index = start_index + base_count + (1 if i < remainder else 0)
indices.append(list(range(start_index, end_index)))
start_index = end_index
# Static form: auxiliary pile, primary pile, auxiliary pile, ...
# -> Starting with index 0 even piles are auxiliary piles, odd piles are primary piles
if cfg[nms.ALGORITHM][nms.AUXILIARY_PILES] and nms.DOORS in env.state.entities.keys():
door_positions = [door.pos for door in env.state.entities[nms.DOORS]]
distances = {door_pos: [] for door_pos in door_positions}
# Calculate distance of every agent to every door
for door_pos in door_positions:
for agent_pos in agents_positions:
# Manhattan distance between door and agent
distances[door_pos].append(np.abs(door_pos[0] - agent_pos[0]) + np.abs(door_pos[1] - agent_pos[1]))
# Helper: all positions in `lst` that hold `item`
def duplicate_indices(lst, item):
return [i for i, x in enumerate(lst) if x == item]
# Get agent indices of agents with same distance to door
affected_agents = {door_pos: {} for door_pos in door_positions}
for door_pos in distances.keys():
dist = distances[door_pos]
dist_set = set(dist)
for d in dist_set:
# Distances are used as string keys
affected_agents[door_pos][str(d)] = duplicate_indices(dist, d)
updated_indices = []
for door_pos, agent_distances in affected_agents.items():
if len(agent_distances) == 0:
# Remove auxiliary piles for all agents
# (In config, we defined every pile with an even numbered index to be an auxiliary pile)
updated_indices = [[ele for ele in lst if ele % 2 != 0] for lst in indices]
else:
for distance, agent_indices in agent_distances.items():
# For each distance group, pick one random agent to keep the auxiliary pile
# selected_agent = np.random.choice(agent_indices)
# NOTE(review): the random choice is disabled, agent 0 is always the selected agent
selected_agent = 0
for agent_idx in agent_indices:
if agent_idx == selected_agent:
updated_indices.append(indices[agent_idx])
else:
# All other agents lose their auxiliary (even numbered) piles
updated_indices.append([ele for ele in indices[agent_idx] if ele % 2 != 0])
indices = updated_indices
return indices
def update_target_pile(env, agent_idx, target_pile, indices, cfg):
    """ Get the next target pile for a given agent.

    `target_pile[agent_idx]` is advanced in place. For the fixed, dynamic and smart pile order the
    pointer wraps around to 0 after the last pile. Otherwise it only advances while the next index
    belongs to the piles assigned to the agent.
    """
    next_pile = target_pile[agent_idx] + 1
    if cfg[nms.ALGORITHM][nms.PILE_ORDER] in [nms.FIXED, nms.DYNAMIC, nms.SMART]:
        if next_pile < len(get_coin_piles_positions(env)):
            target_pile[agent_idx] = next_pile
        else:
            target_pile[agent_idx] = 0
    elif next_pile in indices[agent_idx]:
        target_pile[agent_idx] = next_pile
def is_door_close(env, agent_idx):
    """ Checks whether the agent is close to a door.

    :return: The first entity on a neighbouring position whose name contains the door name,
             or None if there is no such entity.
    """
    agent_pos = env.state[nms.AGENT][agent_idx].pos
    nearby_doors = []
    for neighbour_pos in env.state.entities.neighboring_positions(agent_pos):
        for entity in env.state.entities.pos_dict[neighbour_pos]:
            if nms.DOOR in entity.name:
                nearby_doors.append(entity)
    if nearby_doors:
        return nearby_doors[0]
    return None
def get_all_collected_coin_piles(coin_piles_positions, collected_coin_piles, n_agents):
    """ Returns all coin piles collected by any agent.

    :param coin_piles_positions: Positions of all coin piles.
    :param collected_coin_piles: Per agent, a mapping of pile position to collected flag.
    :param n_agents: Number of agents whose mappings are merged.
    :return: Mapping of pile position to True if at least one agent collected that pile.
    """
    merged = dict.fromkeys(coin_piles_positions, False)
    for agent_piles in collected_coin_piles[:n_agents] if False else \
            (collected_coin_piles[agent_idx] for agent_idx in range(n_agents)):
        for pos, collected in agent_piles.items():
            if collected:
                merged[pos] = True
    return merged
def handle_finished_episode(obs, agents, cfg):
""" Finish up episode, calculate advantages and perform policy net and value net updates"""
# `obs` holds the latest observation per agent; it is only used for the TD advantage below.
# NOTE(review): for an invalid advantage option the process is terminated via exit() after printing
# a message; exit() without argument reports a success status -- confirm that this is intended.
# The update is executed with inference mode switched off
with torch.inference_mode(False):
for ag_i, agent in enumerate(agents):
# Get states, actions, rewards and values from rollout buffer
data = agent.finish_episode()
# Chunk episode data, such that there will be no memory failure for very long episodes
chunks = split_into_chunks(data, cfg)
for (s, a, R, V) in chunks:
# Calculate discounted return and advantage
G = cumulate_discount(R, cfg[nms.ALGORITHM][nms.GAMMA])
if cfg[nms.ALGORITHM][nms.ADVANTAGE] == nms.REINFORCE:
# Plain discounted return is used as advantage
A = G
elif cfg[nms.ALGORITHM][nms.ADVANTAGE] == nms.ADVANTAGE_AC:
A = G - V # Actor-Critic Advantages
elif cfg[nms.ALGORITHM][nms.ADVANTAGE] == nms.TD_ADVANTAGE_AC:
# One-step TD error: the value of the following step is taken from V shifted by one, the
# value after the last step comes from the critic's estimate for the latest observation
with torch.no_grad():
A = R + cfg[nms.ALGORITHM][nms.GAMMA] * np.append(V[1:], agent.vf(
_as_torch(obs[ag_i]).view(-1).to(
torch.float32)).numpy()) - V # TD Actor-Critic Advantages
else:
print("Not a valid advantage option.")
exit()
# The arrays are copied before the conversion to float32 tensors
rollout = (torch.tensor(x.copy()).to(torch.float32) for x in (s, a, G, A))
# Update policy and value net of agent with experience from rollout buffer
agent.train(*rollout)
def split_into_chunks(data_tuple, cfg):
    """ Chunks episode data into approximately equal sized chunks to prevent system memory failure from overload.

    :param data_tuple: Tuple of equally indexed sequences (e.g. states, actions, rewards, values).
    :param cfg: Configuration providing the chunk size; a size that is not greater than 0 disables chunking.
    :return: List of chunks. Without chunking the list contains `data_tuple` as its only element,
             otherwise every chunk is a list with the corresponding slice of each sequence.
    """
    chunk_size = cfg[nms.ALGORITHM][nms.CHUNK_EPISODE]
    if not chunk_size > 0:
        return [data_tuple]

    # Use the longest sequence to handle sequences of different lengths
    longest = max(len(lst) for lst in data_tuple)
    chunks = []
    for start in range(0, longest, chunk_size):
        stop = start + chunk_size
        # Sequences that are already exhausted do not contribute to this chunk
        chunks.append([lst[start:stop] for lst in data_tuple if start < len(lst)])
    return chunks
def set_agents_spawnpoints(env, n_agents):
    """ Tell environment where the agents should spawn in the next episode.

    Advances the position pointer of the first `n_agents` agent configurations by one and wraps
    around to 0 after the last configured spawn position.
    """
    for agent_idx in range(n_agents):
        agent_name = list(env.state.agents_conf.keys())[agent_idx]
        agent_conf = env.state.agents_conf[agent_name]
        current_pos_pointer = agent_conf[nms.POS_POINTER]
        # Making the reset dependent on the number of spawnpoints and not the number of coinpiles allows
        # for having multiple subsequent spawnpoints with the same target pile
        last_pointer = len(agent_conf[nms.POSITIONS]) - 1
        if current_pos_pointer == last_pointer:
            agent_conf[nms.POS_POINTER] = 0
        else:
            agent_conf[nms.POS_POINTER] += 1
def save_configs(results_path, cfg, factory_conf, eval_factory_conf):
    """ Save configurations for logging purposes.

    Writes the string representation of the algorithm configuration and of both environment
    configurations into separate text files inside `results_path`.
    """
    outputs = (
        (f"{results_path}/MARL_config.txt", cfg),
        (f"{results_path}/train_env_config.txt", factory_conf),
        (f"{results_path}/eval_env_config.txt", eval_factory_conf),
    )
    for file_path, content in outputs:
        with open(file_path, "w") as txt_file:
            txt_file.write(str(content))
def save_agent_models(results_path, agents):
    """ Save model parameters after training.

    For every agent the parameters of the policy net and then of the value net are saved to
    `results_path`.
    """
    # NOTE(review): every agent is saved to the same `results_path` -- confirm that the save method
    # produces distinct files per agent.
    for agent in agents:
        agent.pi.save_model_parameters(results_path)
        agent.vf.save_model_parameters(results_path)

View File

@@ -0,0 +1,40 @@
from marl_factory_grid.algorithms.static.TSP_base_agent import TSPBaseAgent
from marl_factory_grid.modules.coins import constants as c
from marl_factory_grid.environment import constants as e
# NOTE(review): module-level constant that is not referenced anywhere in the visible part of this
# file -- presumably a planning horizon used elsewhere; confirm or remove.
future_planning = 7
class TSPCoinAgent(TSPBaseAgent):
    """ TSP based agent that collects coins. """

    def __init__(self, *args, **kwargs):
        """
        Initializes a TSPCoinAgent that aims to collect coins in the environment.
        """
        super().__init__(*args, **kwargs)
        self.fallback_action = e.NOOP

    def predict(self, *_, **__):
        """
        Predicts the next action based on the presence of coins in the environment.

        The agent collects if it stands on a coin. Otherwise it handles a door that is close by
        or moves on towards a coin.

        :return: Index of the predicted action in the agent's action list.
        :rtype: int
        """
        if self._env.state[c.COIN].by_pos(self.state.pos):
            action = c.COLLECT
        else:
            door = self._door_is_close(self._env.state)
            if door:
                action = self._use_door_or_move(door, c.COIN)
            else:
                action = self._predict_move(c.COIN)
        self.action_list.append(action)

        # Translate the action_object to an integer to have the same output as any other model
        try:
            action_obj = next(action_i for action_i, a in enumerate(self.state.actions) if a.name == action)
        except (StopIteration, UnboundLocalError):
            print('Will not happen')
            raise EnvironmentError
        return action_obj