diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml
index 47b5268..7d13a27 100644
--- a/.gitlab-ci.yml
+++ b/.gitlab-ci.yml
@@ -10,7 +10,7 @@ build-job: # This job runs in the build stage, which runs first.
   variables:
     TWINE_USERNAME: $USER_NAME
     TWINE_PASSWORD: $API_KEY
-    TWINE_REPOSITORY: marl-factory-grid
+    TWINE_REPOSITORY: rl-factory-grid
   image: python:slim
   script:
diff --git a/README.md b/README.md
index b93f9eb..45eea00 100644
--- a/README.md
+++ b/README.md
@@ -67,7 +67,7 @@ Existing modules include a variety of functionalities within the environment:
 - [Agents](marl_factory_grid/algorithms) implement either static strategies or learning algorithms based on the specific
   configuration.
-- Their action set includes opening [door entities](marl_factory_grid/modules/doors/entitites.py), cleaning
+- Their action set includes opening [door entities](marl_factory_grid/modules/doors/entitites.py), collecting [coins](marl_factory_grid/modules/coins/entitites.py), cleaning
   [dirt](marl_factory_grid/modules/clean_up/entitites.py), picking
   up [items](marl_factory_grid/modules/items/entitites.py) and
   delivering them to designated drop-off locations.
diff --git a/README_submission.md b/README_submission.md
new file mode 100644
index 0000000..12e6513
--- /dev/null
+++ b/README_submission.md
@@ -0,0 +1,77 @@
+# About EDYS
+
+## Tackling emergent dysfunctions (EDYs) in cooperation with Fraunhofer-IKS
+
+Collaborating with Fraunhofer-IKS, this project is dedicated to investigating Emergent Dysfunctions (EDYs) within
+multi-agent environments. In multi-agent reinforcement learning (MARL), a population of agents learns by interacting
+with one another in a shared environment, adapting their behavior based on the feedback they receive from the
+environment and the actions of other agents.
+
+In this context, emergent behavior describes spontaneous behaviors resulting from interactions among agents and
+environmental stimuli, rather than explicit programming. This promotes natural, adaptable behavior, increases system
+unpredictability for dynamic learning, enables diverse strategies, and encourages collective intelligence for complex
+problem-solving. However, the complex dynamics of the environment also give rise to emergent dysfunctions: unexpected
+issues that arise from agent interactions. This research aims to enhance our understanding of EDYs and their impact on
+multi-agent systems.
+
+### Project Objectives:
+
+- Create an environment that provokes emergent dysfunctions.
+
+  - This is achieved by creating a high level of background noise in the domain, where various entities perform
+    diverse tasks, resulting in a deliberately chaotic dynamic.
+  - The goal is to observe and analyze naturally occurring emergent dysfunctions within the complexity generated by
+    this dynamic environment.
+
+
+- Observational Framework:
+
+  - The project introduces an environment that is designed to capture dysfunctions as they naturally occur.
+  - The environment allows for continuous monitoring of agent behaviors, actions, and interactions.
+  - Tracking emergent dysfunctions in real time provides valuable data for analysis and understanding.
+
+
+- Compatibility
+  - The framework allows learning entities from different manufacturers and projects, with varying representations
+    of actions and observations, to interact seamlessly within the environment.
+
+
+## Setup
+
+Install this environment using `pip install marl-factory-grid`. For more information, refer
+to ['installation'](docs/source/installation.rst).
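For orientation, a minimal rollout sketch against the package's Gym-style `Factory` environment. The config path is illustrative, the reset/step handling mirrors the calls in `marl_factory_grid/algorithms/rl/a2c_coin.py`, and random actions stand in for a trained policy:

```python
import random

from marl_factory_grid.environment.factory import Factory

# Illustrative config; any 'conf.yaml' works, e.g. the packaged marl_factory_grid/configs/test_config.yaml.
factory = Factory("marl_factory_grid/configs/test_config.yaml")

_ = factory.reset()
done = [False]
while not all(done):
    # One action index per agent; swap the random choice for a trained policy in real experiments.
    actions = [random.randrange(len(agent.actions)) for agent in factory.state['Agent']]
    _, next_obs, reward, done, info = factory.step(actions)
    factory.render()
    done = [done] if isinstance(done, bool) else done
```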
+
+## Usage
+
+The environment is configured to automatically load necessary objects, including entities, rules, and assets, based on your requirements.
+You can utilize existing configurations to replicate the experiments from [this paper](PAPER).
+
+- Preconfigured Studies:
+  The studies folder contains predefined studies that can be used to replicate the experiments.
+  These studies provide a structured way to validate and analyze the outcomes observed in different scenarios.
+  - Creating your own scenarios:
+    If you want to use the environment with custom entities, rules, or levels, refer to the [complete repository]().
+
+
+
+Existing modules include a variety of functionalities within the environment:
+
+- [Agents](marl_factory_grid/algorithms) implement either static strategies or learning algorithms based on the specific
+  configuration.
+- Their action set includes opening [door entities](marl_factory_grid/modules/doors/entitites.py), collecting
+  [coins](marl_factory_grid/modules/coins/entitites.py), cleaning [dirt](marl_factory_grid/modules/clean_up/entitites.py), picking
+  up [items](marl_factory_grid/modules/items/entitites.py) and
+  delivering them to designated drop-off locations.
+- Agents are equipped with a [battery](marl_factory_grid/modules/batteries/entitites.py) that gradually depletes over
+  time if not charged at a chargepod.
+- The [maintainer](marl_factory_grid/modules/maintenance/entities.py) aims to
+  repair [machines](marl_factory_grid/modules/machines/entitites.py) that lose health over time.
+
+
+## Limitations
+
+The provided code and documentation are tailored for replicating and validating the experiments described in the paper.
+Modifications to the environment, such as adding new entities, creating additional rules, or customizing behavior beyond the provided scope, are not supported in this release.
+If you are interested in accessing the complete project, including features not covered in this release, refer to the [full repository](LINK FULL REPO).
+
+For further details on running the experiments, please consult the relevant documentation provided in the studies folder.
diff --git a/docs/source/conf.py b/docs/source/conf.py
index 8d0a105..b7fd9ef 100644
--- a/docs/source/conf.py
+++ b/docs/source/conf.py
@@ -6,7 +6,7 @@
 # -- Project information -----------------------------------------------------
 # https://www.sphinx-doc.org/en/master/usage/configuration.html#project-information
-project = 'marl-factory-grid'
+project = 'rl-factory-grid'
 copyright = '2023, Steffen Illium, Robert Mueller, Joel Friedrich'
 author = 'Steffen Illium, Robert Mueller, Joel Friedrich'
 release = '2.5.0'
diff --git a/marl_factory_grid/__init__.py b/marl_factory_grid/__init__.py
index d8f4799..cc4ebeb 100644
--- a/marl_factory_grid/__init__.py
+++ b/marl_factory_grid/__init__.py
@@ -1,7 +1,7 @@
 from .quickstart import init
 from marl_factory_grid.environment.factory import Factory
 """
-Main module of the 'marl-factory-grid'-environment.
+Main module of the 'rl-factory-grid'-environment.
 Configure the :class:.Factory with any 'conf.yaml' file.
 Examples can be found in :module:.levels .
""" diff --git a/marl_factory_grid/algorithms/marl/__init__.py b/marl_factory_grid/algorithms/marl/__init__.py deleted file mode 100644 index a4c30ef..0000000 --- a/marl_factory_grid/algorithms/marl/__init__.py +++ /dev/null @@ -1 +0,0 @@ -from marl_factory_grid.algorithms.marl.memory import MARLActorCriticMemory diff --git a/marl_factory_grid/algorithms/rl/__init__.py b/marl_factory_grid/algorithms/rl/__init__.py new file mode 100644 index 0000000..ecc0a81 --- /dev/null +++ b/marl_factory_grid/algorithms/rl/__init__.py @@ -0,0 +1 @@ +from marl_factory_grid.algorithms.rl.memory import MARLActorCriticMemory diff --git a/marl_factory_grid/algorithms/rl/a2c_coin.py b/marl_factory_grid/algorithms/rl/a2c_coin.py new file mode 100644 index 0000000..f2aa31b --- /dev/null +++ b/marl_factory_grid/algorithms/rl/a2c_coin.py @@ -0,0 +1,297 @@ +import os +import torch +from typing import Union, List +import numpy as np +from tqdm import tqdm + +from marl_factory_grid.algorithms.rl.base_a2c import PolicyGradient +from marl_factory_grid.algorithms.rl.constants import Names +from marl_factory_grid.algorithms.rl.utils import transform_observations, _as_torch, is_door_close, \ + get_coin_piles_positions, update_target_pile, update_ordered_coin_piles, get_all_collected_coin_piles, \ + distribute_indices, set_agents_spawnpoints, get_ordered_coin_piles, handle_finished_episode, save_configs, \ + save_agent_models, get_all_observations, get_agents_positions +from marl_factory_grid.algorithms.utils import add_env_props +from marl_factory_grid.utils.plotting.plot_single_runs import plot_action_maps, plot_reward_development, \ + create_info_maps + +nms = Names +ListOrTensor = Union[List, torch.Tensor] + + +class A2C: + def __init__(self, train_cfg, eval_cfg): + self.results_path = None + self.agents = None + self.act_dim = None + self.obs_dim = None + self.factory = add_env_props(train_cfg) + self.eval_factory = add_env_props(eval_cfg) + self.__training = True + self.train_cfg = train_cfg + self.eval_cfg = eval_cfg + self.cfg = train_cfg + self.n_agents = train_cfg[nms.ENV][nms.N_AGENTS] + self.setup() + self.reward_development = [] + self.action_probabilities = {agent_idx: [] for agent_idx in range(self.n_agents)} + + def setup(self): + """ Initialize agents and create entry for run results according to configuration """ + self.obs_dim = 2 + 2 * len(get_coin_piles_positions(self.factory)) if self.cfg[nms.ALGORITHM][ + nms.PILE_OBSERVABILITY] == nms.ALL else 4 + self.act_dim = 4 # The 4 movement directions + self.agents = [PolicyGradient(self.factory, agent_id=i, obs_dim=self.obs_dim, act_dim=self.act_dim) for i in + range(self.n_agents)] + + if self.cfg[nms.ENV][nms.SAVE_AND_LOG]: + # Define study_out_path and check if it exists + base_dir = os.path.dirname(os.path.abspath(__file__)) # Directory of the script + study_out_path = os.path.join(base_dir, '../../../study_out') + study_out_path = os.path.abspath(study_out_path) + + if not os.path.exists(study_out_path): + raise FileNotFoundError(f"The directory {study_out_path} does not exist.") + + # Create results folder + runs = os.listdir(study_out_path) + run_numbers = [int(run[3:]) for run in runs if run[:3] == "run"] + next_run_number = max(run_numbers) + 1 if run_numbers else 0 + self.results_path = os.path.join(study_out_path, f"run{next_run_number}") + os.mkdir(self.results_path) + + # Save settings in results folder + save_configs(self.results_path, self.cfg, self.factory.conf, self.eval_factory.conf) + + def set_cfg(self, eval=False): + if eval: + self.cfg 
= self.eval_cfg + else: + self.cfg = self.train_cfg + + def load_agents(self, runs_list): + """ Initialize networks with parameters of already trained agents """ + for idx, run in enumerate(runs_list): + run_path = f"./study_out/{run}" + self.agents[idx].pi.load_model_parameters(f"{run_path}/PolicyNet_model_parameters.pth") + self.agents[idx].vf.load_model_parameters(f"{run_path}/ValueNet_model_parameters.pth") + + @torch.no_grad() + def train_loop(self): + """ Function for training agents """ + env = self.factory + n_steps, max_steps = [self.cfg[nms.ALGORITHM][k] for k in [nms.N_STEPS, nms.MAX_STEPS]] + global_steps, episode = 0, 0 + indices = distribute_indices(env, self.cfg, self.n_agents) + coin_piles_positions = get_coin_piles_positions(env) + target_pile = [partition[0] for partition in + indices] # list of pointers that point to the current target pile for each agent + collected_coin_piles = [{pos: False for pos in coin_piles_positions} for _ in range(self.n_agents)] + + pbar = tqdm(total=max_steps) + while global_steps < max_steps: + _ = env.reset() + if self.cfg[nms.ENV][nms.TRAIN_RENDER]: + env.render() + set_agents_spawnpoints(env, self.n_agents) + ordered_coin_piles = get_ordered_coin_piles(env, collected_coin_piles, self.cfg, self.n_agents) + # Reset current target pile at episode begin if all piles have to be collected in one episode + if self.cfg[nms.ALGORITHM][nms.PILE_ALL_DONE] == nms.ALL: + target_pile = [partition[0] for partition in indices] + collected_coin_piles = [{pos: False for pos in coin_piles_positions} for _ in range(self.n_agents)] + + # Supply each agent with its local observation + obs = transform_observations(env, ordered_coin_piles, target_pile, self.cfg, self.n_agents) + done, rew_log = [False] * self.n_agents, 0 + + while not all(done): + action = self.use_door_or_move(env, obs, collected_coin_piles) \ + if nms.DOORS in env.state.entities.keys() else self.get_actions(obs) + _, next_obs, reward, done, info = env.step(action) + next_obs = transform_observations(env, ordered_coin_piles, target_pile, self.cfg, self.n_agents) + + # Handle case where agent is on field with coin + reward, done = self.handle_coin(env, collected_coin_piles, ordered_coin_piles, target_pile, indices, + reward, done) + + if n_steps != 0 and (global_steps + 1) % n_steps == 0: done = True + + done = [done] * self.n_agents if isinstance(done, bool) else done + for ag_i, agent in enumerate(self.agents): + if action[ag_i] in range(self.act_dim): + # Add agent results into respective rollout buffers + agent._episode[-1] = (next_obs[ag_i], action[ag_i], reward[ag_i], agent._episode[-1][-1]) + + # Visualize state update + if self.cfg[nms.ENV][nms.TRAIN_RENDER]: env.render() + + obs = next_obs + + if all(done): handle_finished_episode(obs, self.agents, self.cfg) + + global_steps += 1 + rew_log += sum(reward) + + if global_steps >= max_steps: break + + self.reward_development.append(rew_log) + episode += 1 + pbar.update(global_steps - pbar.n) + + pbar.close() + if self.cfg[nms.ENV][nms.SAVE_AND_LOG]: + plot_reward_development(self.reward_development, self.results_path) + create_info_maps(env, get_all_observations(env, self.cfg, self.n_agents), + get_coin_piles_positions(env), self.results_path, self.agents, self.act_dim, self) + save_agent_models(self.results_path, self.agents) + plot_action_maps(env, [self], self.results_path) + + @torch.inference_mode(True) + def eval_loop(self, n_episodes): + """ Function for performing inference """ + env = self.eval_factory + self.set_cfg(eval=True) + 
episode, results = 0, [] + coin_piles_positions = get_coin_piles_positions(env) + indices = distribute_indices(env, self.cfg, self.n_agents) + target_pile = [partition[0] for partition in + indices] # list of pointers that point to the current target pile for each agent + if self.cfg[nms.ALGORITHM][nms.PILE_ALL_DONE] == nms.DISTRIBUTED: + collected_coin_piles = [{coin_piles_positions[idx]: False for idx in indices[i]} for i in + range(self.n_agents)] + else: collected_coin_piles = [{pos: False for pos in coin_piles_positions} for _ in range(self.n_agents)] + + while episode < n_episodes: + _ = env.reset() + set_agents_spawnpoints(env, self.n_agents) + if self.cfg[nms.ENV][nms.EVAL_RENDER]: + # Don't render auxiliary piles + if self.cfg[nms.ALGORITHM][nms.AUXILIARY_PILES]: + auxiliary_piles = [pile for idx, pile in enumerate(env.state.entities[nms.COIN_PILES]) if + idx % 2 == 0] + for pile in auxiliary_piles: + pile.set_new_amount(0) + env.render() + env._renderer.fps = 5 # Slow down agent movement + + # Reset current target pile at episode begin if all piles have to be collected in one episode + if self.cfg[nms.ALGORITHM][nms.PILE_ALL_DONE] in [nms.ALL, nms.DISTRIBUTED, nms.SHARED]: + target_pile = [partition[0] for partition in indices] + if self.cfg[nms.ALGORITHM][nms.PILE_ALL_DONE] == nms.DISTRIBUTED: + collected_coin_piles = [{coin_piles_positions[idx]: False for idx in indices[i]} for i in + range(self.n_agents)] + else: collected_coin_piles = [{pos: False for pos in coin_piles_positions} for _ in range(self.n_agents)] + + ordered_coin_piles = get_ordered_coin_piles(env, collected_coin_piles, self.cfg, self.n_agents) + + # Supply each agent with its local observation + obs = transform_observations(env, ordered_coin_piles, target_pile, self.cfg, self.n_agents) + done, rew_log, eps_rew = [False] * self.n_agents, 0, torch.zeros(self.n_agents) + + while not all(done): + action = self.use_door_or_move(env, obs, collected_coin_piles, det=True) \ + if nms.DOORS in env.state.entities.keys() else self.execute_policy(obs, env, + collected_coin_piles) # zero exploration + _, next_obs, reward, done, info = env.step(action) + + # Handle case where agent is on field with coin + reward, done = self.handle_coin(env, collected_coin_piles, ordered_coin_piles, target_pile, indices, + reward, done) + + # Get transformed next_obs that might have been updated because of handle_coin + next_obs = transform_observations(env, ordered_coin_piles, target_pile, self.cfg, self.n_agents) + + done = [done] * self.n_agents if isinstance(done, bool) else done + + if self.cfg[nms.ENV][nms.EVAL_RENDER]: env.render() + + obs = next_obs + + episode += 1 + + # -------------------------------------- HELPER FUNCTIONS ------------------------------------------------- # + + def get_actions(self, observations) -> ListOrTensor: + """ Given local observations, get actions for both agents """ + actions = [agent.step(_as_torch(observations[ag_i]).view(-1).to(torch.float32)) for ag_i, agent in + enumerate(self.agents)] + return actions + + def execute_policy(self, observations, env, collected_coin_piles) -> ListOrTensor: + """ Execute agent policies deterministically for inference """ + actions = [agent.policy(_as_torch(observations[ag_i]).view(-1).to(torch.float32)) for ag_i, agent in + enumerate(self.agents)] + for agent_idx in range(self.n_agents): + if all(collected_coin_piles[agent_idx].values()): + actions[agent_idx] = np.array(next( + action_i for action_i, a in enumerate(env.state[nms.AGENT][agent_idx].actions) if + a.name 
== nms.NOOP)) + return actions + + def use_door_or_move(self, env, obs, collected_coin_piles, det=False): + """ Function that handles automatic actions like door opening and forced Noop""" + action = [] + for agent_idx, agent in enumerate(self.agents): + agent_obs = _as_torch((obs)[agent_idx]).view(-1).to(torch.float32) + # Use Noop operation if agent already reached its target. (Only relevant for two-rooms setting) + if all(collected_coin_piles[agent_idx].values()): + action.append(next(action_i for action_i, a in enumerate(env.state[nms.AGENT][agent_idx].actions) if + a.name == nms.NOOP)) + if not det: + # Include agent experience entry manually + agent._episode.append((None, None, None, agent.vf(agent_obs))) + else: + if door := is_door_close(env, agent_idx): + if door.is_closed: + action.append(next( + action_i for action_i, a in enumerate(env.state[nms.AGENT][agent_idx].actions) if + a.name == nms.USE_DOOR)) + # Don't include action in agent experience + else: + if det: action.append(int(agent.pi(agent_obs, det=True)[0])) + else: action.append(int(agent.step(agent_obs))) + else: + if det: action.append(int(agent.pi(agent_obs, det=True)[0])) + else: action.append(int(agent.step(agent_obs))) + return action + + def handle_coin(self, env, collected_coin_piles, ordered_coin_piles, target_pile, indices, reward, done): + """ Check if agent moved on field with coin. If that is the case collect coin automatically """ + agents_positions = get_agents_positions(env, self.n_agents) + coin_piles_positions = get_coin_piles_positions(env) + if any([True for pos in agents_positions if pos in coin_piles_positions]): + # Only simulate collecting the coin + for idx, pos in enumerate(agents_positions): + if pos in collected_coin_piles[idx].keys() and not collected_coin_piles[idx][pos]: + + # If coin piles should be collected in a specific order + if ordered_coin_piles[idx]: + if pos == ordered_coin_piles[idx][target_pile[idx]]: + reward[idx] += 50 + collected_coin_piles[idx][pos] = True + # Set pointer to next coin pile + update_target_pile(env, idx, target_pile, indices, self.cfg) + update_ordered_coin_piles(idx, collected_coin_piles, ordered_coin_piles, env, + self.cfg, self.n_agents) + if self.cfg[nms.ALGORITHM][nms.PILE_ALL_DONE] == nms.SINGLE: + done = True + if all(collected_coin_piles[idx].values()): + # Reset collected_coin_piles indicator + for pos in coin_piles_positions: + collected_coin_piles[idx][pos] = False + else: + reward[idx] += 50 + collected_coin_piles[idx][pos] = True + + # Indicate that renderer can hide coin pile + coin_at_position = env.state[nms.COIN_PILES].by_pos(pos) + coin_at_position[0].set_new_amount(0) + + if self.cfg[nms.ALGORITHM][nms.PILE_ALL_DONE] in [nms.ALL, nms.DISTRIBUTED]: + if all([all(collected_coin_piles[i].values()) for i in range(self.n_agents)]): + done = True + elif self.cfg[nms.ALGORITHM][nms.PILE_ALL_DONE] == nms.SHARED: + # End episode if both agents together have collected all coin piles + if all(get_all_collected_coin_piles(coin_piles_positions, collected_coin_piles, self.n_agents).values()): + done = True + + return reward, done diff --git a/marl_factory_grid/algorithms/rl/base_a2c.py b/marl_factory_grid/algorithms/rl/base_a2c.py new file mode 100644 index 0000000..1406d5f --- /dev/null +++ b/marl_factory_grid/algorithms/rl/base_a2c.py @@ -0,0 +1,112 @@ +import numpy as np +import torch as th +import scipy as sp +from collections import deque +from torch import nn + +cumulate_discount = lambda x, gamma: sp.signal.lfilter([1], [1, - gamma], x[::-1], 
axis=0)[::-1] + + +class Net(th.nn.Module): + def __init__(self, shape, activation, lr): + super().__init__() + self.net = th.nn.Sequential(*[layer + for io, a in zip(zip(shape[:-1], shape[1:]), + [activation] * (len(shape) - 2) + [th.nn.Identity]) + for layer in [th.nn.Linear(*io), a()]]) + self.optimizer = th.optim.Adam(self.net.parameters(), lr=lr) + + # Initialize weights uniformly, so that for the policy net all actions have approximately the same + # probability in the beginning + for module in self.modules(): + if isinstance(module, nn.Linear): + nn.init.uniform_(module.weight, a=-0.1, b=0.1) + if module.bias is not None: + nn.init.uniform_(module.bias, a=-0.1, b=0.1) + + def save_model(self, path): + th.save(self.net, f"{path}/{self.__class__.__name__}_model.pth") + + def save_model_parameters(self, path): + th.save(self.net.state_dict(), f"{path}/{self.__class__.__name__}_model_parameters.pth") + + def load_model_parameters(self, path): + self.net.load_state_dict(th.load(path)) + self.net.eval() + + +class ValueNet(Net): + def __init__(self, obs_dim, hidden_sizes=[64, 64], activation=th.nn.ReLU, lr=1e-3): + super().__init__([obs_dim] + hidden_sizes + [1], activation, lr) + + def forward(self, obs): return self.net(obs) + + def loss(self, states, returns): return ((returns - self(states)) ** 2).mean() + + +class PolicyNet(Net): + def __init__(self, obs_dim, act_dim, hidden_sizes=[64, 64], activation=th.nn.Tanh, lr=3e-4): + super().__init__([obs_dim] + hidden_sizes + [act_dim], activation, lr) + self.distribution = lambda obs: th.distributions.Categorical(logits=self.net(obs)) + + def forward(self, obs, act=None, det=False): + """Given an observation: Returns policy distribution and probablilty for a given action + or Returns a sampled action and its corresponding probablilty""" + pi = self.distribution(obs) + if act is not None: return pi, pi.log_prob(act) + act = self.net(obs).argmax() if det else pi.sample() # sample from the learned distribution + return act, pi.log_prob(act) + + def loss(self, states, actions, advantages): + _, logp = self.forward(states, actions) + loss = -(logp * advantages).mean() + return loss + + +class PolicyGradient: + """ Autonomous agent using vanilla policy gradient. """ + + def __init__(self, env, seed=42, gamma=0.99, agent_id=0, act_dim=None, obs_dim=None): + self.env = env + self.gamma = gamma # Setup env and discount + th.manual_seed(seed) + np.random.seed(seed) # Seed Torch, numpy and gym + # Keep track of previous rewards and performed steps to calcule the mean Return metric + self._episode, self.ep_returns, self.num_steps = [], deque(maxlen=100), 0 + # Get observation and action shapes + if not obs_dim: + obs_size = env.observation_space.shape if len(env.state.entities.by_name("Agents")) == 1 \ + else env.observation_space[agent_id].shape # Single agent case vs. 
multi-agent case + obs_dim = np.prod(obs_size) + if not act_dim: + act_dim = env.action_space[agent_id].n + self.vf = ValueNet(obs_dim) # Setup Value Network (Critic) + self.pi = PolicyNet(obs_dim, act_dim) # Setup Policy Network (Actor) + + def step(self, obs): + """ Given an observation, get action and probs from policy and values from critic""" + with th.no_grad(): + (a, _), v = self.pi(obs), self.vf(obs) + self._episode.append((None, None, None, v)) + return a.numpy() + + def policy(self, obs, det=True): + return self.pi(obs, det=det)[0].numpy() + + def finish_episode(self): + """Process self._episode & reset self.env, Returns (s,a,G,V)-Tuple and new inital state""" + s, a, r, v = (np.array(e) for e in zip(*self._episode)) # Get trajectories from rollout + self.ep_returns.append(sum(r)) + self._episode = [] # Add episode return to buffer & reset + return s, a, r, v # state, action, Return, Value Tensors + + def train(self, states, actions, returns, advantages): # Update policy weights + self.pi.optimizer.zero_grad() + self.vf.optimizer.zero_grad() # Reset optimizer + states = states.flatten(1, -1) # Reduce dimensionality to rollout_dim x input_dim + policy_loss = self.pi.loss(states, actions, advantages) # Calculate Policy loss + policy_loss.backward() + self.pi.optimizer.step() # Apply Policy loss + value_loss = self.vf.loss(states, returns) # Calculate Value loss + value_loss.backward() + self.vf.optimizer.step() # Apply Value loss diff --git a/marl_factory_grid/algorithms/marl/base_ac.py b/marl_factory_grid/algorithms/rl/base_ac.py similarity index 99% rename from marl_factory_grid/algorithms/marl/base_ac.py rename to marl_factory_grid/algorithms/rl/base_ac.py index 0c15250..f1ef3d1 100644 --- a/marl_factory_grid/algorithms/marl/base_ac.py +++ b/marl_factory_grid/algorithms/rl/base_ac.py @@ -2,7 +2,7 @@ import torch from typing import Union, List, Dict import numpy as np from torch.distributions import Categorical -from marl_factory_grid.algorithms.marl.memory import MARLActorCriticMemory +from marl_factory_grid.algorithms.rl.memory import MARLActorCriticMemory from marl_factory_grid.algorithms.utils import add_env_props, instantiate_class from pathlib import Path import pandas as pd diff --git a/marl_factory_grid/algorithms/marl/configs/MultiAgentConfigs/dirt_quadrant_config.yaml b/marl_factory_grid/algorithms/rl/configs/MultiAgentConfigs/dirt_quadrant_config.yaml similarity index 90% rename from marl_factory_grid/algorithms/marl/configs/MultiAgentConfigs/dirt_quadrant_config.yaml rename to marl_factory_grid/algorithms/rl/configs/MultiAgentConfigs/dirt_quadrant_config.yaml index 599b7f4..99b7ea4 100644 --- a/marl_factory_grid/algorithms/marl/configs/MultiAgentConfigs/dirt_quadrant_config.yaml +++ b/marl_factory_grid/algorithms/rl/configs/MultiAgentConfigs/dirt_quadrant_config.yaml @@ -1,5 +1,5 @@ agent: - classname: marl_factory_grid.algorithms.marl.networks.RecurrentAC + classname: marl_factory_grid.algorithms.rl.networks.RecurrentAC n_agents: 2 obs_emb_size: 96 action_emb_size: 16 @@ -18,7 +18,7 @@ env: eval_render: True save_and_log: True record: False -method: marl_factory_grid.algorithms.marl.LoopSEAC +method: marl_factory_grid.algorithms.rl.LoopSEAC algorithm: gamma: 0.99 entropy_coef: 0.01 diff --git a/marl_factory_grid/algorithms/marl/configs/MultiAgentConfigs/two_rooms_one_door_modified_config.yaml b/marl_factory_grid/algorithms/rl/configs/MultiAgentConfigs/two_rooms_one_door_modified_config.yaml similarity index 91% rename from 
marl_factory_grid/algorithms/marl/configs/MultiAgentConfigs/two_rooms_one_door_modified_config.yaml rename to marl_factory_grid/algorithms/rl/configs/MultiAgentConfigs/two_rooms_one_door_modified_config.yaml index 8b8bf13..421a8d1 100644 --- a/marl_factory_grid/algorithms/marl/configs/MultiAgentConfigs/two_rooms_one_door_modified_config.yaml +++ b/marl_factory_grid/algorithms/rl/configs/MultiAgentConfigs/two_rooms_one_door_modified_config.yaml @@ -1,5 +1,5 @@ agent: - classname: marl_factory_grid.algorithms.marl.networks.RecurrentAC + classname: marl_factory_grid.algorithms.rl.networks.RecurrentAC n_agents: 2 obs_emb_size: 96 action_emb_size: 16 @@ -18,7 +18,7 @@ env: eval_render: True save_and_log: True record: False -method: marl_factory_grid.algorithms.marl.LoopSEAC +method: marl_factory_grid.algorithms.rl.LoopSEAC algorithm: gamma: 0.99 entropy_coef: 0.01 diff --git a/marl_factory_grid/algorithms/marl/configs/dirt_quadrant_config.yaml b/marl_factory_grid/algorithms/rl/configs/dirt_quadrant_config.yaml similarity index 90% rename from marl_factory_grid/algorithms/marl/configs/dirt_quadrant_config.yaml rename to marl_factory_grid/algorithms/rl/configs/dirt_quadrant_config.yaml index d254f5e..6c11d02 100644 --- a/marl_factory_grid/algorithms/marl/configs/dirt_quadrant_config.yaml +++ b/marl_factory_grid/algorithms/rl/configs/dirt_quadrant_config.yaml @@ -1,5 +1,5 @@ agent: - classname: marl_factory_grid.algorithms.marl.networks.RecurrentAC + classname: marl_factory_grid.algorithms.rl.networks.RecurrentAC n_agents: 1 obs_emb_size: 96 action_emb_size: 16 @@ -18,7 +18,7 @@ env: eval_render: True save_and_log: True record: False -method: marl_factory_grid.algorithms.marl.LoopSEAC +method: marl_factory_grid.algorithms.rl.LoopSEAC algorithm: gamma: 0.99 entropy_coef: 0.01 diff --git a/marl_factory_grid/algorithms/marl/configs/environment_changes b/marl_factory_grid/algorithms/rl/configs/environment_changes similarity index 100% rename from marl_factory_grid/algorithms/marl/configs/environment_changes rename to marl_factory_grid/algorithms/rl/configs/environment_changes diff --git a/marl_factory_grid/algorithms/marl/configs/two_rooms_one_door_modified_config.yaml b/marl_factory_grid/algorithms/rl/configs/two_rooms_one_door_modified_config.yaml similarity index 90% rename from marl_factory_grid/algorithms/marl/configs/two_rooms_one_door_modified_config.yaml rename to marl_factory_grid/algorithms/rl/configs/two_rooms_one_door_modified_config.yaml index 95ddf07..d28d86d 100644 --- a/marl_factory_grid/algorithms/marl/configs/two_rooms_one_door_modified_config.yaml +++ b/marl_factory_grid/algorithms/rl/configs/two_rooms_one_door_modified_config.yaml @@ -1,5 +1,5 @@ agent: - classname: marl_factory_grid.algorithms.marl.networks.RecurrentAC + classname: marl_factory_grid.algorithms.rl.networks.RecurrentAC n_agents: 1 obs_emb_size: 96 action_emb_size: 16 @@ -18,7 +18,7 @@ env: eval_render: True save_and_log: False record: False -method: marl_factory_grid.algorithms.marl.LoopSEAC +method: marl_factory_grid.algorithms.rl.LoopSEAC algorithm: gamma: 0.99 entropy_coef: 0.01 diff --git a/marl_factory_grid/algorithms/rl/constants.py b/marl_factory_grid/algorithms/rl/constants.py new file mode 100644 index 0000000..fadf2fe --- /dev/null +++ b/marl_factory_grid/algorithms/rl/constants.py @@ -0,0 +1,37 @@ +class Names: + ENV = 'env' + ENV_NAME = 'env_name' + N_AGENTS = 'n_agents' + ALGORITHM = 'algorithm' + MAX_STEPS = 'max_steps' + N_STEPS = 'n_steps' + TRAIN_RENDER = 'train_render' + EVAL_RENDER = 'eval_render' 
+ AGENT = 'Agent' + PILE_OBSERVABILITY = 'pile-observability' + PILE_ORDER = 'pile-order' + ALL = 'all' + FIXED = 'fixed' + AGENTS = 'agents' + DYNAMIC = 'dynamic' + SMART = 'smart' + DIRT_PILES = 'DirtPiles' + COIN_PILES = 'CoinPiles' + AUXILIARY_PILES = "auxiliary_piles" + DOORS = 'Doors' + DOOR = 'Door' + GAMMA = 'gamma' + ADVANTAGE = 'advantage' + REINFORCE = 'reinforce' + ADVANTAGE_AC = "Advantage-AC" + TD_ADVANTAGE_AC = "TD-Advantage-AC" + CHUNK_EPISODE = 'chunk-episode' + POS_POINTER = 'pos_pointer' + POSITIONS = 'positions' + SAVE_AND_LOG = 'save_and_log' + NOOP = 'Noop' + USE_DOOR = 'use_door' + PILE_ALL_DONE = 'pile_all_done' + SINGLE = 'single' + DISTRIBUTED = 'distributed' + SHARED = 'shared' diff --git a/marl_factory_grid/algorithms/marl/iac.py b/marl_factory_grid/algorithms/rl/iac.py similarity index 92% rename from marl_factory_grid/algorithms/marl/iac.py rename to marl_factory_grid/algorithms/rl/iac.py index d2730c8..ea8d4ee 100644 --- a/marl_factory_grid/algorithms/marl/iac.py +++ b/marl_factory_grid/algorithms/rl/iac.py @@ -1,9 +1,9 @@ import torch -from marl_factory_grid.algorithms.marl.base_ac import BaseActorCritic, nms +from marl_factory_grid.algorithms.rl.base_ac import BaseActorCritic, nms from marl_factory_grid.algorithms.utils import instantiate_class from pathlib import Path from natsort import natsorted -from marl_factory_grid.algorithms.marl.memory import MARLActorCriticMemory +from marl_factory_grid.algorithms.rl.memory import MARLActorCriticMemory class LoopIAC(BaseActorCritic): diff --git a/marl_factory_grid/algorithms/marl/mappo.py b/marl_factory_grid/algorithms/rl/mappo.py similarity index 93% rename from marl_factory_grid/algorithms/marl/mappo.py rename to marl_factory_grid/algorithms/rl/mappo.py index e86a394..d40eaf2 100644 --- a/marl_factory_grid/algorithms/marl/mappo.py +++ b/marl_factory_grid/algorithms/rl/mappo.py @@ -1,6 +1,6 @@ -from marl_factory_grid.algorithms.marl.base_ac import Names as nms -from marl_factory_grid.algorithms.marl.snac import LoopSNAC -from marl_factory_grid.algorithms.marl.memory import MARLActorCriticMemory +from marl_factory_grid.algorithms.rl.base_ac import Names as nms +from marl_factory_grid.algorithms.rl.snac import LoopSNAC +from marl_factory_grid.algorithms.rl.memory import MARLActorCriticMemory import torch from torch.distributions import Categorical from marl_factory_grid.algorithms.utils import instantiate_class diff --git a/marl_factory_grid/algorithms/marl/memory.py b/marl_factory_grid/algorithms/rl/memory.py similarity index 100% rename from marl_factory_grid/algorithms/marl/memory.py rename to marl_factory_grid/algorithms/rl/memory.py diff --git a/marl_factory_grid/algorithms/marl/networks.py b/marl_factory_grid/algorithms/rl/networks.py similarity index 100% rename from marl_factory_grid/algorithms/marl/networks.py rename to marl_factory_grid/algorithms/rl/networks.py diff --git a/marl_factory_grid/algorithms/marl/seac.py b/marl_factory_grid/algorithms/rl/seac.py similarity index 91% rename from marl_factory_grid/algorithms/marl/seac.py rename to marl_factory_grid/algorithms/rl/seac.py index 07e8267..d1384e3 100644 --- a/marl_factory_grid/algorithms/marl/seac.py +++ b/marl_factory_grid/algorithms/rl/seac.py @@ -1,8 +1,8 @@ import torch from torch.distributions import Categorical -from marl_factory_grid.algorithms.marl.iac import LoopIAC -from marl_factory_grid.algorithms.marl.base_ac import nms -from marl_factory_grid.algorithms.marl.memory import MARLActorCriticMemory +from marl_factory_grid.algorithms.rl.iac 
import LoopIAC +from marl_factory_grid.algorithms.rl.base_ac import nms +from marl_factory_grid.algorithms.rl.memory import MARLActorCriticMemory class LoopSEAC(LoopIAC): diff --git a/marl_factory_grid/algorithms/marl/snac.py b/marl_factory_grid/algorithms/rl/snac.py similarity index 90% rename from marl_factory_grid/algorithms/marl/snac.py rename to marl_factory_grid/algorithms/rl/snac.py index 11be902..02a9d89 100644 --- a/marl_factory_grid/algorithms/marl/snac.py +++ b/marl_factory_grid/algorithms/rl/snac.py @@ -1,5 +1,5 @@ -from marl_factory_grid.algorithms.marl.base_ac import BaseActorCritic -from marl_factory_grid.algorithms.marl.base_ac import nms +from marl_factory_grid.algorithms.rl.base_ac import BaseActorCritic +from marl_factory_grid.algorithms.rl.base_ac import nms import torch from torch.distributions import Categorical from pathlib import Path diff --git a/marl_factory_grid/algorithms/rl/utils.py b/marl_factory_grid/algorithms/rl/utils.py new file mode 100644 index 0000000..9204f4c --- /dev/null +++ b/marl_factory_grid/algorithms/rl/utils.py @@ -0,0 +1,337 @@ +import copy +from typing import List +import numpy as np +import torch + +from marl_factory_grid.algorithms.rl.constants import Names as nms + +from marl_factory_grid.algorithms.rl.base_a2c import cumulate_discount + + +def _as_torch(x): + """ Helper function to convert different list types to a torch tensor """ + if isinstance(x, np.ndarray): + return torch.from_numpy(x) + elif isinstance(x, List): + return torch.tensor(x) + elif isinstance(x, (int, float)): + return torch.tensor([x]) + return x + + +def transform_observations(env, ordered_coins, target_coin, cfg, n_agents): + """ Function that extracts local observations from global state + Requires that agents have observations -CoinPiles and -Self (cf. 
environment configs) """ + agents_positions = get_agents_positions(env, n_agents) + coin_observability_is_all = cfg[nms.ALGORITHM][nms.PILE_OBSERVABILITY] == nms.ALL + if coin_observability_is_all: + trans_obs = [torch.zeros(2 + 2 * len(ordered_coins[0])) for _ in range(len(agents_positions))] + else: + # Only show current target pile + trans_obs = [torch.zeros(4) for _ in range(len(agents_positions))] + for i, pos in enumerate(agents_positions): + agent_x, agent_y = pos[0], pos[1] + trans_obs[i][0] = agent_x + trans_obs[i][1] = agent_y + idx = 2 + if coin_observability_is_all: + for coin_pos in ordered_coins[i]: + trans_obs[i][idx] = coin_pos[0] + trans_obs[i][idx + 1] = coin_pos[1] + idx += 2 + else: + trans_obs[i][2] = ordered_coins[i][target_coin[i]][0] + trans_obs[i][3] = ordered_coins[i][target_coin[i]][1] + return trans_obs + + +def get_all_observations(env, cfg, n_agents): + """ Helper function that returns all possible agent observations """ + coins_positions = [env.state.entities[nms.COIN_PILES][pile_idx].pos for pile_idx in + range(len(env.state.entities[nms.COIN_PILES]))] + if cfg[nms.ALGORITHM][nms.PILE_OBSERVABILITY] == nms.ALL: + obs = [torch.zeros(2 + 2 * len(coins_positions))] + observations = [[]] + # Fill in pile positions + idx = 2 + for pile_pos in coins_positions: + obs[0][idx] = pile_pos[0] + obs[0][idx + 1] = pile_pos[1] + idx += 2 + else: + # Have multiple observation layers of the map for each coin pile one + obs = [torch.zeros(4) for _ in range(n_agents) for _ in coins_positions] + observations = [[] for _ in coins_positions] + for idx, pile_pos in enumerate(coins_positions): + obs[idx][2] = pile_pos[0] + obs[idx][3] = pile_pos[1] + valid_agent_positions = env.state.entities.floorlist + + for idx, pos in enumerate(valid_agent_positions): + for obs_layer in range(len(obs)): + observation = copy.deepcopy(obs[obs_layer]) + observation[0] = pos[0] + observation[1] = pos[1] + observations[obs_layer].append(observation) + + return observations + + +def get_coin_piles_positions(env): + """ Get positions of coin piles on the map """ + return [env.state.entities[nms.COIN_PILES][pile_idx].pos for pile_idx in + range(len(env.state.entities[nms.COIN_PILES]))] + + +def get_agents_positions(env, n_agents): + """ Get positions of agents on the map """ + return [env.state.moving_entites[agent_idx].pos for agent_idx in range(n_agents)] + + +def get_ordered_coin_piles(env, collected_coins, cfg, n_agents): + """ This function determines in which order the agents should collect the coin piles + Each agent can have its individual pile order """ + ordered_coin_piles = [[] for _ in range(n_agents)] + coin_piles_positions = get_coin_piles_positions(env) + agents_positions = get_agents_positions(env, n_agents) + for agent_idx in range(n_agents): + if cfg[nms.ALGORITHM][nms.PILE_ORDER] in [nms.FIXED, nms.AGENTS]: + ordered_coin_piles[agent_idx] = coin_piles_positions + elif cfg[nms.ALGORITHM][nms.PILE_ORDER] in [nms.SMART, nms.DYNAMIC]: + # Calculate distances for remaining unvisited coin piles + remaining_target_piles = [pos for pos, value in collected_coins[agent_idx].items() if not value] + pile_distances = {pos: 0 for pos in remaining_target_piles} + agent_pos = agents_positions[agent_idx] + for pos in remaining_target_piles: + pile_distances[pos] = np.abs(agent_pos[0] - pos[0]) + np.abs(agent_pos[1] - pos[1]) + + if cfg[nms.ALGORITHM][nms.PILE_ORDER] == nms.SMART: + # Check if there is an agent on the direct path to any of the remaining coin piles + for pile_pos in 
remaining_target_piles: + for other_pos in agents_positions: + if other_pos != agent_pos: + if agent_pos[0] == other_pos[0] == pile_pos[0] or agent_pos[1] == other_pos[1] == pile_pos[ + 1]: + # Get the line between the agent and the target + path = bresenham(agent_pos[0], agent_pos[1], pile_pos[0], pile_pos[1]) + + # Check if the entity lies on the path between the agent and the target + if other_pos in path: + pile_distances[pile_pos] += np.abs(agent_pos[0] - other_pos[0]) + np.abs( + agent_pos[1] - other_pos[1]) + + sorted_pile_distances = dict(sorted(pile_distances.items(), key=lambda item: item[1])) + # Insert already visited coin piles + ordered_coin_piles[agent_idx] = [pos for pos in coin_piles_positions if pos not in remaining_target_piles] + # Fill up with sorted positions + for pos in sorted_pile_distances.keys(): + ordered_coin_piles[agent_idx].append(pos) + + else: + print("Not a valid pile order option.") + exit() + + return ordered_coin_piles + + +def bresenham(x0, y0, x1, y1): + """Bresenham's line algorithm to get the coordinates of a line between two points.""" + dx = np.abs(x1 - x0) + dy = np.abs(y1 - y0) + sx = 1 if x0 < x1 else -1 + sy = 1 if y0 < y1 else -1 + err = dx - dy + + coordinates = [] + while True: + coordinates.append((x0, y0)) + if x0 == x1 and y0 == y1: + break + e2 = 2 * err + if e2 > -dy: + err -= dy + x0 += sx + if e2 < dx: + err += dx + y0 += sy + return coordinates + + +def update_ordered_coin_piles(agent_idx, collected_coin_piles, ordered_coin_piles, env, cfg, n_agents): + """ Update the order of the remaining coin piles """ + # Only update ordered_coin_pile for agent that reached its target pile + updated_ordered_coin_piles = get_ordered_coin_piles(env, collected_coin_piles, cfg, n_agents) + for i in range(len(ordered_coin_piles[agent_idx])): + ordered_coin_piles[agent_idx][i] = updated_ordered_coin_piles[agent_idx][i] + + +def distribute_indices(env, cfg, n_agents): + """ Distribute coin piles evenly among the agents """ + indices = [] + n_coin_piles = len(get_coin_piles_positions(env)) + agents_positions = get_agents_positions(env, n_agents) + if n_coin_piles == 1 or cfg[nms.ALGORITHM][nms.PILE_ORDER] in [nms.FIXED, nms.DYNAMIC, nms.SMART]: + indices = [[0] for _ in range(n_agents)] + else: + base_count = n_coin_piles // n_agents + remainder = n_coin_piles % n_agents + + start_index = 0 + for i in range(n_agents): + # Add an extra index to the first 'remainder' objects + end_index = start_index + base_count + (1 if i < remainder else 0) + indices.append(list(range(start_index, end_index))) + start_index = end_index + + # Static form: auxiliary pile, primary pile, auxiliary pile, ... 
+ # -> Starting with index 0 even piles are auxiliary piles, odd piles are primary piles + if cfg[nms.ALGORITHM][nms.AUXILIARY_PILES] and nms.DOORS in env.state.entities.keys(): + door_positions = [door.pos for door in env.state.entities[nms.DOORS]] + distances = {door_pos: [] for door_pos in door_positions} + + # Calculate distance of every agent to every door + for door_pos in door_positions: + for agent_pos in agents_positions: + distances[door_pos].append(np.abs(door_pos[0] - agent_pos[0]) + np.abs(door_pos[1] - agent_pos[1])) + + def duplicate_indices(lst, item): + return [i for i, x in enumerate(lst) if x == item] + + # Get agent indices of agents with same distance to door + affected_agents = {door_pos: {} for door_pos in door_positions} + for door_pos in distances.keys(): + dist = distances[door_pos] + dist_set = set(dist) + for d in dist_set: + affected_agents[door_pos][str(d)] = duplicate_indices(dist, d) + + updated_indices = [] + + for door_pos, agent_distances in affected_agents.items(): + if len(agent_distances) == 0: + # Remove auxiliary piles for all agents + # (In config, we defined every pile with an even numbered index to be an auxiliary pile) + updated_indices = [[ele for ele in lst if ele % 2 != 0] for lst in indices] + else: + for distance, agent_indices in agent_distances.items(): + # For each distance group, pick one random agent to keep the auxiliary pile + # selected_agent = np.random.choice(agent_indices) + selected_agent = 0 + for agent_idx in agent_indices: + if agent_idx == selected_agent: + updated_indices.append(indices[agent_idx]) + else: + updated_indices.append([ele for ele in indices[agent_idx] if ele % 2 != 0]) + + indices = updated_indices + + return indices + + +def update_target_pile(env, agent_idx, target_pile, indices, cfg): + """ Get the next target pile for a given agent """ + if cfg[nms.ALGORITHM][nms.PILE_ORDER] in [nms.FIXED, nms.DYNAMIC, nms.SMART]: + if target_pile[agent_idx] + 1 < len(get_coin_piles_positions(env)): + target_pile[agent_idx] += 1 + else: + target_pile[agent_idx] = 0 + else: + if target_pile[agent_idx] + 1 in indices[agent_idx]: + target_pile[agent_idx] += 1 + + +def is_door_close(env, agent_idx): + """ Checks whether the agent is close to a door """ + neighbourhood = [y for x in env.state.entities.neighboring_positions(env.state[nms.AGENT][agent_idx].pos) + for y in env.state.entities.pos_dict[x] if nms.DOOR in y.name] + if neighbourhood: + return neighbourhood[0] + + +def get_all_collected_coin_piles(coin_piles_positions, collected_coin_piles, n_agents): + """ Returns all coin piles collected by any agent """ + meta_collected_coin_piles = {pos: False for pos in coin_piles_positions} + for agent_idx in range(n_agents): + for (pos, collected) in collected_coin_piles[agent_idx].items(): + if collected: + meta_collected_coin_piles[pos] = True + return meta_collected_coin_piles + + +def handle_finished_episode(obs, agents, cfg): + """ Finish up episode, calculate advantages and perform policy net and value net updates""" + with torch.inference_mode(False): + for ag_i, agent in enumerate(agents): + # Get states, actions, rewards and values from rollout buffer + data = agent.finish_episode() + # Chunk episode data, such that there will be no memory failure for very long episodes + chunks = split_into_chunks(data, cfg) + for (s, a, R, V) in chunks: + # Calculate discounted return and advantage + G = cumulate_discount(R, cfg[nms.ALGORITHM][nms.GAMMA]) + if cfg[nms.ALGORITHM][nms.ADVANTAGE] == nms.REINFORCE: + A = G + elif 
cfg[nms.ALGORITHM][nms.ADVANTAGE] == nms.ADVANTAGE_AC: + A = G - V # Actor-Critic Advantages + elif cfg[nms.ALGORITHM][nms.ADVANTAGE] == nms.TD_ADVANTAGE_AC: + with torch.no_grad(): + A = R + cfg[nms.ALGORITHM][nms.GAMMA] * np.append(V[1:], agent.vf( + _as_torch(obs[ag_i]).view(-1).to( + torch.float32)).numpy()) - V # TD Actor-Critic Advantages + else: + print("Not a valid advantage option.") + exit() + + rollout = (torch.tensor(x.copy()).to(torch.float32) for x in (s, a, G, A)) + # Update policy and value net of agent with experience from rollout buffer + agent.train(*rollout) + + +def split_into_chunks(data_tuple, cfg): + """ Chunks episode data into approximately equal sized chunks to prevent system memory failure from overload """ + result = [data_tuple] + chunk_size = cfg[nms.ALGORITHM][nms.CHUNK_EPISODE] + if chunk_size > 0: + # Get the maximum length of the lists in the tuple to handle different lengths + max_length = max(len(lst) for lst in data_tuple) + + # Prepare a list to store the result + result = [] + + # Split each list into chunks and add them to the result + for i in range(0, max_length, chunk_size): + # Create a sublist containing the ith chunk from each list + sublist = [lst[i:i + chunk_size] for lst in data_tuple if i < len(lst)] + result.append(sublist) + + return result + + +def set_agents_spawnpoints(env, n_agents): + """ Tell environment where the agents should spawn in the next episode """ + for agent_idx in range(n_agents): + agent_name = list(env.state.agents_conf.keys())[agent_idx] + current_pos_pointer = env.state.agents_conf[agent_name][nms.POS_POINTER] + # Making the reset dependent on the number of spawnpoints and not the number of coinpiles allows + # for having multiple subsequent spawnpoints with the same target pile + if current_pos_pointer == len(env.state.agents_conf[agent_name][nms.POSITIONS]) - 1: + env.state.agents_conf[agent_name][nms.POS_POINTER] = 0 + else: + env.state.agents_conf[agent_name][nms.POS_POINTER] += 1 + + +def save_configs(results_path, cfg, factory_conf, eval_factory_conf): + """ Save configurations for logging purposes """ + with open(f"{results_path}/MARL_config.txt", "w") as txt_file: + txt_file.write(str(cfg)) + with open(f"{results_path}/train_env_config.txt", "w") as txt_file: + txt_file.write(str(factory_conf)) + with open(f"{results_path}/eval_env_config.txt", "w") as txt_file: + txt_file.write(str(eval_factory_conf)) + + +def save_agent_models(results_path, agents): + """ Save model parameters after training """ + for idx, agent in enumerate(agents): + agent.pi.save_model_parameters(results_path) + agent.vf.save_model_parameters(results_path) diff --git a/marl_factory_grid/algorithms/static/TSP_coin_agent.py b/marl_factory_grid/algorithms/static/TSP_coin_agent.py new file mode 100644 index 0000000..fe80a5b --- /dev/null +++ b/marl_factory_grid/algorithms/static/TSP_coin_agent.py @@ -0,0 +1,40 @@ +from marl_factory_grid.algorithms.static.TSP_base_agent import TSPBaseAgent + +from marl_factory_grid.modules.coins import constants as c +from marl_factory_grid.environment import constants as e + +future_planning = 7 + + +class TSPCoinAgent(TSPBaseAgent): + + def __init__(self, *args, **kwargs): + """ + Initializes a TSPCoinAgent that aims to collect coins in the environment. + """ + super(TSPCoinAgent, self).__init__(*args, **kwargs) + self.fallback_action = e.NOOP + + def predict(self, *_, **__): + """ + Predicts the next action based on the presence of coins in the environment. + + :return: Predicted action. 
+ :rtype: int + """ + coin_at_position = self._env.state[c.COIN].by_pos(self.state.pos) + if coin_at_position: + # Translate the action_object to an integer to have the same output as any other model + action = c.COLLECT + elif door := self._door_is_close(self._env.state): + action = self._use_door_or_move(door, c.COIN) + else: + action = self._predict_move(c.COIN) + self.action_list.append(action) + # Translate the action_object to an integer to have the same output as any other model + try: + action_obj = next(action_i for action_i, a in enumerate(self.state.actions) if a.name == action) + except (StopIteration, UnboundLocalError): + print('Will not happen') + raise EnvironmentError + return action_obj diff --git a/marl_factory_grid/configs/test_config.yaml b/marl_factory_grid/configs/test_config.yaml index 606a817..8eea6ef 100644 --- a/marl_factory_grid/configs/test_config.yaml +++ b/marl_factory_grid/configs/test_config.yaml @@ -40,10 +40,27 @@ Agents: # - DropOffLocations # - Maintainers # Clones: 0 - Target test agent: +# Target test agent: +# Actions: +# - Noop +# - Charge +# - DoorUse +# - Move8 +# Observations: +# - Combined: +# - Other +# - Walls +# - GlobalPosition +# - Battery +# - Destinations +# - Doors +# - Maintainers +# Clones: 1 + Coin test agent: Actions: - Noop - Charge + - Collect - DoorUse - Move8 Observations: @@ -52,6 +69,8 @@ Agents: - Walls - GlobalPosition - Battery + - ChargePods + - CoinPiles - Destinations - Doors - Maintainers @@ -67,11 +86,18 @@ Entities: Destinations: coords_or_quantity: 1 spawn_mode: GROUPED - DirtPiles: +# DirtPiles: +# coords_or_quantity: 10 +# initial_amount: 2 +# clean_amount: 1 +# dirt_spawn_r_var: 0.1 +# max_global_amount: 20 +# max_local_amount: 5 + CoinPiles: coords_or_quantity: 10 initial_amount: 2 - clean_amount: 1 - dirt_spawn_r_var: 0.1 + collect_amount: 1 + coin_spawn_r_var: 0.1 max_global_amount: 20 max_local_amount: 5 Doors: @@ -90,24 +116,26 @@ Entities: General: env_seed: 69 individual_rewards: true - level_name: quadrant + level_name: two_rooms pomdp_r: 3 verbose: false tests: false Rules: # Environment Dynamics - EntitiesSmearDirtOnMove: - smear_ratio: 0.2 + # EntitiesSmearDirtOnMove: + # smear_ratio: 0.2 DoorAutoClose: close_frequency: 10 MoveMaintainers: # Respawn Stuff - RespawnDirt: - respawn_freq: 15 + # RespawnDirt: + # respawn_freq: 15 RespawnItems: respawn_freq: 15 + RespawnCoins: + respawn_freq: 15 # Utilities WatchCollisions: diff --git a/marl_factory_grid/environment/factory.py b/marl_factory_grid/environment/factory.py index aa6a268..0b239b4 100644 --- a/marl_factory_grid/environment/factory.py +++ b/marl_factory_grid/environment/factory.py @@ -81,7 +81,7 @@ class Factory(gym.Env): def __init__(self, config_file: Union[str, PathLike], custom_modules_path: Union[None, PathLike] = None, custom_level_path: Union[None, PathLike] = None): """ - Initializes the marl-factory-grid as Gym environment. + Initializes the rl-factory-grid as Gym environment. :param config_file: Path to the configuration file. 
:type config_file: Union[str, PathLike] @@ -271,15 +271,37 @@ class Factory(gym.Env): if not self._renderer: # lazy init from marl_factory_grid.utils.renderer import Renderer global Renderer - self._renderer = Renderer(self.map.level_shape, view_radius=self.conf.pomdp_r, fps=10) + self._renderer = Renderer(self.map.level_shape, view_radius=self.conf.pomdp_r, fps=10) render_entities = self.state.entities.render() + + # Hide entities where certain conditions are met (e.g., amount <= 0 for DirtPiles) + render_entities = self.filter_entities(render_entities) + + # Mask entities based on dynamic conditions instead of hardcoding level-specific logic + if self.conf['General']['level_name'] == 'two_rooms': + render_entities = self.mask_entities(render_entities) + if self.conf.pomdp_r: for render_entity in render_entities: if render_entity.name == c.AGENT: render_entity.aux = self.obs_builder.curr_lightmaps[render_entity.real_name] return self._renderer.render(render_entities, self._recorder) + def filter_entities(self, entities): + """ Generalized method to filter out entities that shouldn't be rendered. """ + if 'DirtPiles' in self.state.entities.keys(): + entities = [entity for entity in entities if not (entity.name == 'DirtPiles' and entity.amount <= 0)] + return entities + + def mask_entities(self, entities): + """ Generalized method to mask entities based on dynamic conditions. """ + for entity in entities: + if entity.name == 'CoinPiles': + entity.mask = 'Destinations' + entity.mask_value = 1 + return entities + def set_recorder(self, recorder): self._recorder = recorder @@ -298,7 +320,7 @@ class Factory(gym.Env): summary.update({entity_group.name.lower(): entity_group.summarize_states()}) # TODO Section End ######## for key in list(summary.keys()): - if key not in ['step', 'walls', 'doors', 'agents', 'items', 'dirtPiles', 'batteries']: + if key not in ['step', 'walls', 'doors', 'agents', 'items', 'dirtPiles', 'batteries', 'coinPiles']: del summary[key] return summary diff --git a/marl_factory_grid/environment/rules.py b/marl_factory_grid/environment/rules.py index 763f16c..91a21f4 100644 --- a/marl_factory_grid/environment/rules.py +++ b/marl_factory_grid/environment/rules.py @@ -168,14 +168,25 @@ class SpawnEntity(Rule): return results +def _get_position(spawn_rule, positions, empty_positions, positions_pointer): + """ + Internal usage, selects positions based on rule. + """ + if spawn_rule and spawn_rule == "random": + position = random.choice(([x for x in positions if x in empty_positions])) + elif spawn_rule and spawn_rule == "order": + position = ([x for x in positions if x in empty_positions])[positions_pointer] + else: + position = h.get_first([x for x in positions if x in empty_positions]) + return position + + class SpawnAgents(Rule): def __init__(self): """ - TODO - - - :return: + Finds suitable spawn positions according to the given spawn rule, creates agents with these positions and adds + them to state.agents. 
""" super().__init__() pass @@ -183,8 +194,9 @@ class SpawnAgents(Rule): def on_reset(self, state): spawn_rule = None for rule in state.rules.rules: - if isinstance(rule, marl_factory_grid.environment.rules.AgentSpawnRule): + if isinstance(rule, AgentSpawnRule): spawn_rule = rule.spawn_rule + break if not hasattr(state, 'agent_spawn_positions'): state.agent_spawn_positions = [] @@ -200,7 +212,7 @@ class SpawnAgents(Rule): other = agent_conf['other'].copy() positions_pointer = agent_conf['pos_pointer'] - if position := self._get_position(spawn_rule, positions, empty_positions, positions_pointer): + if position := _get_position(spawn_rule, positions, empty_positions, positions_pointer): assert state.check_pos_validity(position), 'smth went wrong....' agents.add_item(Agent(actions, observations, position, str_ident=agent_name, **other)) state.agent_spawn_positions.append(position) @@ -213,21 +225,13 @@ class SpawnAgents(Rule): state.agent_spawn_positions.append(chosen_position) return [] - def _get_position(self, spawn_rule, positions, empty_positions, positions_pointer): - if spawn_rule and spawn_rule == "random": - position = random.choice(([x for x in positions if x in empty_positions])) - elif spawn_rule and spawn_rule == "order": - position = ([x for x in positions if x in empty_positions])[positions_pointer] - else: - position = h.get_first([x for x in positions if x in empty_positions]) - - return position class AgentSpawnRule(Rule): def __init__(self, spawn_rule): self.spawn_rule = spawn_rule super().__init__() + class DoneAtMaxStepsReached(Rule): def __init__(self, max_steps: int = 500): diff --git a/marl_factory_grid/modules/clean_up/groups.py b/marl_factory_grid/modules/clean_up/groups.py index 8a99439..f199dcb 100644 --- a/marl_factory_grid/modules/clean_up/groups.py +++ b/marl_factory_grid/modules/clean_up/groups.py @@ -1,4 +1,5 @@ import ast +import random from marl_factory_grid.environment import constants as c from marl_factory_grid.environment.groups.collection import Collection from marl_factory_grid.modules.clean_up.entitites import DirtPile @@ -33,7 +34,7 @@ class DirtPiles(Collection): return sum([dirt.amount for dirt in self]) def __init__(self, *args, max_local_amount=5, clean_amount=1, max_global_amount: int = 20, coords_or_quantity=10, - initial_amount=2, amount_var=0.2, n_var=0.2, **kwargs): + initial_amount=2, amount_var=0.2, n_var=0.2, randomize=False, randomization_seed=0, **kwargs): """ A Collection of dirt piles that triggers their spawn. 
@@ -67,6 +68,8 @@ class DirtPiles(Collection):
         self.max_local_amount = max_local_amount
         self.coords_or_quantity = coords_or_quantity
         self.initial_amount = initial_amount
+        self.randomize = randomize
+        self.randomized_selection = None

     def trigger_spawn(self, state, coords_or_quantity=0, amount=0, ignore_blocking=False) -> [Result]:
         if ignore_blocking:
@@ -85,7 +88,17 @@ class DirtPiles(Collection):
         else:
             n_new = [pos for pos in coords_or_quantity]

-        amounts = [amount if amount else (self.initial_amount ) # removed rng amount
+        if self.randomize:
+            if not self.randomized_selection:
+                n_new_prime = []
+                for n in n_new:
+                    if random.random() < 0.5:
+                        n_new_prime.append(n)
+                n_new = n_new_prime
+                self.randomized_selection = n_new
+            else:
+                n_new = self.randomized_selection
+        amounts = [amount if amount else self.initial_amount # removed rng amount
                    for _ in range(len(n_new))]

         spawn_counter = 0
diff --git a/marl_factory_grid/modules/coins/__init__.py b/marl_factory_grid/modules/coins/__init__.py
new file mode 100644
index 0000000..2b47c1c
--- /dev/null
+++ b/marl_factory_grid/modules/coins/__init__.py
@@ -0,0 +1,4 @@
+from .actions import Collect
+from .entitites import CoinPile
+from .groups import CoinPiles
+from .rules import DoneOnAllCoinsCollected
diff --git a/marl_factory_grid/modules/coins/actions.py b/marl_factory_grid/modules/coins/actions.py
new file mode 100644
index 0000000..42e3eab
--- /dev/null
+++ b/marl_factory_grid/modules/coins/actions.py
@@ -0,0 +1,36 @@
+from typing import Union
+
+from marl_factory_grid.environment.actions import Action
+from marl_factory_grid.utils.results import ActionResult
+
+from marl_factory_grid.modules.coins import constants as d
+
+from marl_factory_grid.environment import constants as c
+
+
+class Collect(Action):
+
+    def __init__(self):
+        """
+        Attempts to reduce the coin amount at the entity's position. Fails if no coin is found at the agent's position.
+        """
+        super().__init__(d.COLLECT, d.REWARD_COLLECT_VALID, d.REWARD_COLLECT_FAIL)
+
+    def do(self, entity, state) -> Union[None, ActionResult]:
+        if coin_pile := next((x for x in state.entities.pos_dict[entity.pos] if "coin" in x.name.lower()), None):
+            new_coin_pile_amount = coin_pile.amount - state[d.COIN].collect_amount
+
+            if new_coin_pile_amount <= 0:
+                state[d.COIN].delete_env_object(coin_pile)
+            else:
+                coin_pile.set_new_amount(max(new_coin_pile_amount, c.VALUE_FREE_CELL))
+            valid = c.VALID
+            print_str = f'{entity.name} just collected some coins at {entity.pos}.'
+            state.print(print_str)
+
+        else:
+            valid = c.NOT_VALID
+            print_str = f'{entity.name} just tried to collect some coins at {entity.pos}, but failed.'
+            state.print(print_str)
+
+        return self.get_result(valid, entity)
diff --git a/marl_factory_grid/modules/coins/coinpiles.png b/marl_factory_grid/modules/coins/coinpiles.png
new file mode 100644
index 0000000..38b084e
Binary files /dev/null and b/marl_factory_grid/modules/coins/coinpiles.png differ
diff --git a/marl_factory_grid/modules/coins/constants.py b/marl_factory_grid/modules/coins/constants.py
new file mode 100644
index 0000000..a1f22a5
--- /dev/null
+++ b/marl_factory_grid/modules/coins/constants.py
@@ -0,0 +1,11 @@
+COIN = 'CoinPiles'
+
+COLLECT = 'do_collect_action'
+
+COLLECT_VALID = 'collect_valid'
+COLLECT_FAIL = 'collect_fail'
+COLLECT_ALL = 'all_collected'
+
+REWARD_COLLECT_VALID: float = 0.5
+REWARD_COLLECT_FAIL: float = -0.1
+REWARD_COLLECT_ALL: float = 4.5
diff --git a/marl_factory_grid/modules/coins/entitites.py b/marl_factory_grid/modules/coins/entitites.py
new file mode 100644
index 0000000..76d242f
--- /dev/null
+++ b/marl_factory_grid/modules/coins/entitites.py
@@ -0,0 +1,46 @@
+from marl_factory_grid.environment.entity.entity import Entity
+from marl_factory_grid.utils.utility_classes import RenderEntity
+from marl_factory_grid.modules.coins import constants as d
+
+
+class CoinPile(Entity):
+
+    @property
+    def amount(self):
+        """
+        Internal Usage
+        """
+        return self._amount
+
+    @property
+    def encoding(self):
+        return self._amount
+
+    def __init__(self, *args, amount=2, max_local_amount=5, **kwargs):
+        """
+        Represents a pile of coins at a specific position in the environment that agents can interact with. Agents can
+        collect coins from the pile or, depending on activated rules, interact with it in different ways.
+
+        :param amount: The amount of coins in the pile.
+        :type amount: float
+
+        :param max_local_amount: The maximum amount of coins allowed in a single pile at one position.
+        :type max_local_amount: float
+        """
+        super(CoinPile, self).__init__(*args, **kwargs)
+        self._amount = amount
+        self.max_local_amount = max_local_amount
+
+    def set_new_amount(self, amount):
+        """
+        Internal Usage
+        """
+        self._amount = min(amount, self.max_local_amount)
+
+    def summarize_state(self):
+        state_dict = super().summarize_state()
+        state_dict.update(amount=float(self.amount))
+        return state_dict
+
+    def render(self):
+        return RenderEntity(d.COIN, self.pos, min(0.15 + self.amount, 1.5), 'scale')
diff --git a/marl_factory_grid/modules/coins/groups.py b/marl_factory_grid/modules/coins/groups.py
new file mode 100644
index 0000000..38d2f36
--- /dev/null
+++ b/marl_factory_grid/modules/coins/groups.py
@@ -0,0 +1,108 @@
+import ast
+from marl_factory_grid.environment import constants as c
+from marl_factory_grid.environment.groups.collection import Collection
+from marl_factory_grid.modules.coins.entitites import CoinPile
+from marl_factory_grid.utils.results import Result
+from marl_factory_grid.utils import helpers as h
+
+
+class CoinPiles(Collection):
+    _entity = CoinPile
+
+    @property
+    def var_is_blocking_light(self):
+        return False
+
+    @property
+    def var_can_collide(self):
+        return False
+
+    @property
+    def var_can_move(self):
+        return False
+
+    @property
+    def var_has_position(self):
+        return True
+
+    @property
+    def global_amount(self) -> float:
+        """
+        Internal Usage
+        """
+        return sum([coin.amount for coin in self])
+
+    def __init__(self, *args, max_local_amount=5, collect_amount=1, max_global_amount: int = 20, coords_or_quantity=10,
+                 initial_amount=2, amount_var=0.2, n_var=0.2, **kwargs):
+        """
+        A Collection of coin piles that triggers their spawn.
+
+        :param max_local_amount: The maximum amount of coins allowed in a single pile at one position.
+        :type max_local_amount: int
+
+        :param collect_amount: The amount of coins removed by a single collecting action.
+        :type collect_amount: int
+
+        :param max_global_amount: The maximum total amount of coins allowed in the environment.
+        :type max_global_amount: int
+
+        :param coords_or_quantity: Determines whether to use coordinates or quantity when triggering coin pile spawn.
+        :type coords_or_quantity: Union[Tuple[int, int], int]
+
+        :param initial_amount: The initial amount of coins in each newly spawned pile.
+        :type initial_amount: int
+
+        :param amount_var: The variability in the initial amount of coins in each pile.
+        :type amount_var: float
+
+        :param n_var: The variability in the number of new coin piles spawned.
+        :type n_var: float
+
+        """
+        super(CoinPiles, self).__init__(*args, **kwargs)
+        self.amount_var = amount_var
+        self.n_var = n_var
+        self.collect_amount = collect_amount
+        self.max_global_amount = max_global_amount
+        self.max_local_amount = max_local_amount
+        self.coords_or_quantity = coords_or_quantity
+        self.initial_amount = initial_amount
+
+    def trigger_spawn(self, state, coords_or_quantity=0, amount=0, ignore_blocking=False) -> [Result]:
+        if ignore_blocking:
+            print("##########################################")
+            print("Blocking should not be ignored for this Entity")
+            print("Exiting....")
+            exit()
+        coords_or_quantity = coords_or_quantity if coords_or_quantity else self.coords_or_quantity
+        if isinstance(coords_or_quantity, int):
+            n_new = int(abs(coords_or_quantity + (state.rng.uniform(-self.n_var, self.n_var))))
+            n_new = state.get_n_random_free_positions(n_new)
+        else:
+            coords_or_quantity = ast.literal_eval(coords_or_quantity)
+            if isinstance(coords_or_quantity[0], int):
+                n_new = [coords_or_quantity]
+            else:
+                n_new = [pos for pos in coords_or_quantity]
+
+        amounts = [amount if amount else (self.initial_amount ) # removed rng amount
+                   for _ in range(len(n_new))]
+
+        spawn_counter = 0
+        for idx, (pos, a) in enumerate(zip(n_new, amounts)):
+            if not self.global_amount > self.max_global_amount:
+                if coin := self.by_pos(pos):
+                    coin = h.get_first(coin)
+                    new_value = coin.amount + a
+                    coin.set_new_amount(new_value)
+                else:
+                    super().spawn([pos], amount=a)
+                    spawn_counter += 1
+            else:
+                return Result(identifier=f'{self.name}_spawn', validity=c.NOT_VALID, value=spawn_counter)
+
+        return Result(identifier=f'{self.name}_spawn', validity=c.VALID, value=spawn_counter)
+
+    def __repr__(self):
+        s = super(CoinPiles, self).__repr__()
+        return f'{s[:-1]}, {self.global_amount}]'
diff --git a/marl_factory_grid/modules/coins/rules.py b/marl_factory_grid/modules/coins/rules.py
new file mode 100644
index 0000000..7122b28
--- /dev/null
+++ b/marl_factory_grid/modules/coins/rules.py
@@ -0,0 +1,59 @@
+from marl_factory_grid.modules.coins import constants as d
+from marl_factory_grid.environment import constants as c
+
+from marl_factory_grid.environment.rules import Rule
+from marl_factory_grid.utils.helpers import is_move
+from marl_factory_grid.utils.results import TickResult
+from marl_factory_grid.utils.results import DoneResult
+
+
+class DoneOnAllCoinsCollected(Rule):
+
+    def __init__(self, reward: float = d.REWARD_COLLECT_ALL):
+        """
+        Defines a 'Done'-condition which triggers when there are no more 'Coins' in the environment.
+
+        :type reward: float
+        :parameter reward: Given reward when condition triggers.
+        """
+        super().__init__()
+        self.reward = reward
+
+    def on_check_done(self, state) -> [DoneResult]:
+        if len(state[d.COIN]) == 0 and state.curr_step:
+            return [DoneResult(validity=c.VALID, identifier=self.name, reward=self.reward)]
+        return []
+
+
+class RespawnCoins(Rule):
+
+    def __init__(self, respawn_freq: int = 15, respawn_n: int = 5, respawn_amount: float = 1.0):
+        """
+        Defines the spawn pattern of initial and additional 'Coin'-entities.
+        First chooses positions, then tries to spawn coins until 'respawn_n' or the maximal global amount is reached.
+        If there is already some, it is topped up to min(max_local_amount, amount).
+
+        :type respawn_freq: int
+        :parameter respawn_freq: At which frequency should this Rule try to spawn new 'Coins'?
+        :type respawn_n: int
+        :parameter respawn_n: How many respawn positions are considered.
+        :type respawn_amount: float
+        :parameter respawn_amount: Defines how much coin 'amount' is placed every 'respawn_freq' ticks.
+        """
+        super().__init__()
+        self.respawn_n = respawn_n
+        self.respawn_amount = respawn_amount
+        self.respawn_freq = respawn_freq
+        self._next_coin_spawn = respawn_freq
+
+    def tick_step(self, state):
+        collection = state[d.COIN]
+        if self._next_coin_spawn < 0:
+            result = []  # No CoinPile Spawn
+        elif not self._next_coin_spawn:
+            result = [collection.trigger_spawn(state, coords_or_quantity=self.respawn_n, amount=self.respawn_amount)]
+            self._next_coin_spawn = self.respawn_freq
+        else:
+            self._next_coin_spawn -= 1
+            result = []
+        return result
diff --git a/marl_factory_grid/utils/plotting/plot_single_runs.py b/marl_factory_grid/utils/plotting/plot_single_runs.py
index 5fbb024..a4dd040 100644
--- a/marl_factory_grid/utils/plotting/plot_single_runs.py
+++ b/marl_factory_grid/utils/plotting/plot_single_runs.py
@@ -7,7 +7,10 @@ from typing import Union

 import numpy as np
 import pandas as pd
+import torch
+from matplotlib import pyplot as plt

+from marl_factory_grid.algorithms.rl.utils import _as_torch
 from marl_factory_grid.utils.helpers import IGNORED_DF_COLUMNS
 from marl_factory_grid.utils.plotting.plotting_utils import prepare_plot

@@ -253,3 +256,125 @@ direction_mapping = {
     'south_east': (1, 1),
     'south_west': (-1, 1)
 }
+
+
+def plot_reward_development(reward_development, results_path):
+    smoothed_data = np.convolve(reward_development, np.ones(10) / 10, mode='valid')
+    plt.plot(smoothed_data)
+    plt.ylim([-10, max(smoothed_data) + 20])
+    plt.title('Smoothed Reward Development')
+    plt.xlabel('Episode')
+    plt.ylabel('Reward')
+    plt.savefig(f"{results_path}/smoothed_reward_development.png")
+    plt.show()
+
+
+def plot_collected_coins_per_step():
+    # Observed behaviour for multi-agent setting consisting of run0 and run0
+    cleaned_dirt_per_step_emergent = [0, 0, 0, 1, 1, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 4, 4, 4, 4, 5]
+    cleaned_dirt_per_step = [0, 0, 0, 1, 1, 2, 2, 3, 3, 3, 4, 5]  # RL and TSP
+
+    plt.step(range(1, len(cleaned_dirt_per_step) + 1), cleaned_dirt_per_step, color='green', linewidth=3, label='Prevented (RL)')
+    plt.step(range(1, len(cleaned_dirt_per_step_emergent) + 1), cleaned_dirt_per_step_emergent, linestyle='--', color='darkred', linewidth=3, label='Emergent')
+    plt.step(range(1, len(cleaned_dirt_per_step) + 1), cleaned_dirt_per_step, linestyle='dotted', color='darkorange', linewidth=3, label='Prevented (TSP)')
+    plt.xlabel("Environment step", fontsize=20)
+    plt.ylabel("Collected Coins", fontsize=20)
+    yint = range(min(cleaned_dirt_per_step), max(cleaned_dirt_per_step) + 1)
+    plt.yticks(yint, fontsize=17)
+    plt.xticks(range(1, 
len(cleaned_dirt_per_step_emergent) + 1), fontsize=17) + frame1 = plt.gca() + # Only display every 5th tick label + for idx, xlabel_i in enumerate(frame1.axes.get_xticklabels()): + if (idx + 1) % 5 != 0: + xlabel_i.set_visible(False) + xlabel_i.set_fontsize(0.0) + # Change order of labels in legend + handles, labels = frame1.get_legend_handles_labels() + order = [0, 2, 1] + plt.legend([handles[idx] for idx in order], [labels[idx] for idx in order], prop={'size': 20}) + fig = plt.gcf() + fig.set_size_inches(8, 7) + plt.savefig("../study_out/number_of_collected_coins.pdf") + plt.show() + + +def plot_reached_flags_per_step(): + # Observed behaviour for multi-agent setting consisting of runs 1 + 2 + reached_flags_per_step_emergent = [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0] + reached_flags_per_step_RL = [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2] + reached_flags_per_step_TSP = [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2] + + plt.step(range(1, len(reached_flags_per_step_RL) + 1), reached_flags_per_step_RL, color='green', linewidth=3, label='Prevented (RL)') + plt.step(range(1, len(reached_flags_per_step_emergent) + 1), reached_flags_per_step_emergent, linestyle='--', color='darkred', linewidth=3, label='Emergent') + plt.step(range(1, len(reached_flags_per_step_TSP) + 1), reached_flags_per_step_TSP, linestyle='dotted', color='darkorange', linewidth=3, label='Prevented (TSP)') + plt.xlabel("Environment step", fontsize=20) + plt.ylabel("Reached Flags", fontsize=20) + yint = range(min(reached_flags_per_step_RL), max(reached_flags_per_step_RL) + 1) + plt.yticks(yint, fontsize=17) + plt.xticks(range(1, len(reached_flags_per_step_emergent) + 1), fontsize=17) + frame1 = plt.gca() + # Only display every 5th tick label + for idx, xlabel_i in enumerate(frame1.axes.get_xticklabels()): + if (idx + 1) % 5 != 0: + xlabel_i.set_visible(False) + xlabel_i.set_fontsize(0.0) + # Change order of labels in legend + handles, labels = frame1.get_legend_handles_labels() + order = [0, 2, 1] + plt.legend([handles[idx] for idx in order], [labels[idx] for idx in order], prop={'size': 20}) + fig = plt.gcf() + fig.set_size_inches(8, 7) + plt.savefig("../study_out/number_of_reached_flags.pdf") + plt.show() + + +def create_info_maps(env, all_valid_observations, dirt_piles_positions, results_path, agents, act_dim, + a2c_instance): + # Create value map + with open(f"{results_path}/info_maps.txt", "w") as txt_file: + for obs_layer, pos in enumerate(dirt_piles_positions): + observations_shape = ( + max(t[0] for t in env.state.entities.floorlist) + 2, + max(t[1] for t in env.state.entities.floorlist) + 2) + value_maps = [np.zeros(observations_shape) for _ in agents] + likeliest_action = [np.full(observations_shape, np.NaN) for _ in agents] + action_probabilities = [np.zeros((observations_shape[0], observations_shape[1], act_dim)) for + _ in agents] + for obs in all_valid_observations[obs_layer]: + for idx, agent in enumerate(agents): + x, y = int(obs[0]), int(obs[1]) + try: + value_maps[idx][x][y] = agent.vf(obs) + probs = agent.pi.distribution(obs).probs + likeliest_action[idx][x][y] = torch.argmax( + probs) # get the likeliest action at the current agent position + action_probabilities[idx][x][y] = probs + except: + pass + + txt_file.write("=======Value Maps=======\n") + for agent_idx, vmap in enumerate(value_maps): + txt_file.write(f"Value map of agent {agent_idx} for target pile {pos}:\n") + vmap = _as_torch(vmap).round(decimals=4) + max_digits = 
max(len(str(vmap.max().item())), len(str(vmap.min().item()))) + for idx, row in enumerate(vmap): + txt_file.write(' '.join(f" {elem:>{max_digits + 1}}" for elem in row.tolist())) + txt_file.write("\n") + txt_file.write("\n") + txt_file.write("=======Likeliest Action=======\n") + for agent_idx, amap in enumerate(likeliest_action): + txt_file.write(f"Likeliest action map of agent {agent_idx} for target pile {pos}:\n") + txt_file.write(np.array2string(amap)) + txt_file.write("\n") + txt_file.write("=======Action Probabilities=======\n") + for agent_idx, pmap in enumerate(action_probabilities): + a2c_instance.action_probabilities[agent_idx].append(pmap) + txt_file.write(f"Action probability map of agent {agent_idx} for target pile {pos}:\n") + for d in range(pmap.shape[0]): + row = '[' + for r in range(pmap.shape[1]): + row += "[" + ', '.join(f"{x:7.4f}" for x in pmap[d, r]) + "]" + txt_file.write(row + "]") + txt_file.write("\n") + + return action_probabilities diff --git a/marl_factory_grid/utils/renderer.py b/marl_factory_grid/utils/renderer.py index f5b9ea1..3982dc4 100644 --- a/marl_factory_grid/utils/renderer.py +++ b/marl_factory_grid/utils/renderer.py @@ -348,7 +348,6 @@ class Renderer: self.save_counter += 1 full_path = os.path.join(out_dir, unique_filename) pygame.image.save(self.screen, full_path) - print(f"Image saved as {unique_filename}") if __name__ == '__main__': diff --git a/marl_factory_grid/utils/states.py b/marl_factory_grid/utils/states.py index 0c9e965..a0a9030 100644 --- a/marl_factory_grid/utils/states.py +++ b/marl_factory_grid/utils/states.py @@ -118,9 +118,8 @@ class Gamestate(object): self._floortile_graph = None self.tests = StepTests(*tests) - # Pointer that defines current spawn points of agents - for agent in self.agents_conf: - self.agents_conf[agent]["pos_pointer"] = 0 + # Initialize position pointers for agents + self._initialize_position_pointers() def reset(self): self.curr_step = 0 @@ -138,6 +137,11 @@ class Gamestate(object): def __repr__(self): return f'{self.__class__.__name__}({len(self.entities)} Entitites @ Step {self.curr_step})' + def _initialize_position_pointers(self): + """ Initialize the position pointers for each agent in the configuration.""" + for agent in self.agents_conf: + self.agents_conf[agent]["pos_pointer"] = 0 + @property def random_free_position(self) -> (int, int): """ diff --git a/studies/marl_adapted.py b/studies/marl_adapted.py index 0f03a38..d5b0026 100644 --- a/studies/marl_adapted.py +++ b/studies/marl_adapted.py @@ -1,10 +1,11 @@ import copy from pathlib import Path -from marl_factory_grid.algorithms.marl.a2c_dirt import A2C +from marl_factory_grid.algorithms.rl.a2c_coin import A2C from marl_factory_grid.algorithms.utils import load_yaml_file + def single_agent_training(config_name): - cfg_path = Path(f'../marl_factory_grid/algorithms/marl/configs/{config_name}_config.yaml') + cfg_path = Path(f'../marl_factory_grid/algorithms/rl/configs/{config_name}_config.yaml') train_cfg = load_yaml_file(cfg_path) # Use environment config with fixed spawnpoints for eval @@ -21,7 +22,7 @@ def single_agent_training(config_name): def single_agent_eval(config_name, run): - cfg_path = Path(f'../marl_factory_grid/algorithms/marl/configs/{config_name}_config.yaml') + cfg_path = Path(f'../marl_factory_grid/algorithms/rl/configs/{config_name}_config.yaml') train_cfg = load_yaml_file(cfg_path) # Use environment config with fixed spawnpoints for eval @@ -34,7 +35,7 @@ def single_agent_eval(config_name, run): def multi_agent_eval(config_name, runs, 
emergent_phenomenon=False): - cfg_path = Path(f'../marl_factory_grid/algorithms/marl/configs/MultiAgentConfigs/{config_name}_config.yaml') + cfg_path = Path(f'../marl_factory_grid/algorithms/rl/configs/MultiAgentConfigs/{config_name}_config.yaml') train_cfg = load_yaml_file(cfg_path) # Use environment config with fixed spawnpoints for eval @@ -85,12 +86,14 @@ def two_rooms_one_door_modified_single_agent_eval(agent_name): def dirt_quadrant_5_multi_agent_eval(emergent_phenomenon): multi_agent_eval("dirt_quadrant", ["run4", "run5"], emergent_phenomenon) -def dirt_quadrant_5_multi_agent_ctde_eval(emergent_phenomenon): # run7 == run4 + +def dirt_quadrant_5_multi_agent_ctde_eval(emergent_phenomenon): # run7 == run4 multi_agent_eval("dirt_quadrant", ["run4", "run7"], emergent_phenomenon) + def two_rooms_one_door_modified_multi_agent_eval(emergent_phenomenon): multi_agent_eval("two_rooms_one_door_modified", ["run2", "run3"], emergent_phenomenon) if __name__ == '__main__': - dirt_quadrant_5_multi_agent_ctde_eval(True) \ No newline at end of file + dirt_quadrant_5_multi_agent_ctde_eval(True) diff --git a/studies/normalization_study.py b/studies/normalization_study.py index dd48c60..7d2c9da 100644 --- a/studies/normalization_study.py +++ b/studies/normalization_study.py @@ -2,7 +2,7 @@ from marl_factory_grid.algorithms.utils import Checkpointer from pathlib import Path from marl_factory_grid.algorithms.utils import load_yaml_file, add_env_props, instantiate_class, load_class -# from algorithms.marl import LoopSNAC, LoopIAC, LoopSEAC +# from algorithms.rl import LoopSNAC, LoopIAC, LoopSEAC for i in range(0, 5): diff --git a/studies/viz_policy.py b/studies/viz_policy.py index a3efd3e..b128872 100644 --- a/studies/viz_policy.py +++ b/studies/viz_policy.py @@ -5,7 +5,7 @@ from algorithms.utils import load_yaml_file from tqdm import trange study = 'example_config#0' #study_root = Path(__file__).parent / study -study_root = Path('/Users/romue/PycharmProjects/EDYS/algorithms/marl/') +study_root = Path('/Users/romue/PycharmProjects/EDYS/algorithms/rl/') #['L2NoAh_gru', 'L2NoCh_gru', 'nomix_gru']: render = True diff --git a/test_run.py b/test_run.py index ee2b25b..a7ef76f 100644 --- a/test_run.py +++ b/test_run.py @@ -3,6 +3,7 @@ from pprint import pprint from tqdm import trange +from marl_factory_grid.algorithms.static.TSP_coin_agent import TSPCoinAgent from marl_factory_grid.algorithms.static.TSP_dirt_agent import TSPDirtAgent from marl_factory_grid.algorithms.static.TSP_item_agent import TSPItemAgent from marl_factory_grid.algorithms.static.TSP_target_agent import TSPTargetAgent @@ -30,7 +31,7 @@ if __name__ == '__main__': factory.render() action_spaces = factory.action_space # agents = [TSPDirtAgent(factory, 0), TSPItemAgent(factory, 1), TSPTargetAgent(factory, 2)] - agents = [TSPTargetAgent(factory, 0), TSPTargetAgent(factory, 1)] + agents = [TSPCoinAgent(factory, 0)] while not done: a = [x.predict() for x in agents] obs_type, _, _, done, info = factory.step(a) @@ -39,5 +40,3 @@ if __name__ == '__main__': if done: print(f'Episode {episode} done...') break - - plot_routes(factory, agents)
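
For orientation, the snippet below is a minimal, standalone sketch of the coin bookkeeping that the new Collect action (marl_factory_grid/modules/coins/actions.py) performs with the rewards defined in marl_factory_grid/modules/coins/constants.py: a pile shrinks by collect_amount per valid action and is deleted once empty, while an attempt on a cell without coins fails. The helper name collect and its tuple return value are illustrative only and not part of the package API; the real action operates on the Gamestate and the CoinPiles collection shown in the patch above. Once every pile has been removed, DoneOnAllCoinsCollected ends the episode with REWARD_COLLECT_ALL.

```python
# Illustrative sketch only, not the package API: one collect attempt on a single pile.
REWARD_COLLECT_VALID = 0.5   # mirrors marl_factory_grid/modules/coins/constants.py
REWARD_COLLECT_FAIL = -0.1

def collect(pile_amount: float, collect_amount: float = 1.0) -> tuple[float, float]:
    """Return (remaining_amount, reward) for one collect attempt."""
    if pile_amount <= 0:                                # no coins at the agent's position -> action fails
        return 0.0, REWARD_COLLECT_FAIL
    remaining = max(pile_amount - collect_amount, 0.0)  # the real pile entity is deleted once it reaches 0
    return remaining, REWARD_COLLECT_VALID

assert collect(2.0) == (1.0, REWARD_COLLECT_VALID)
assert collect(0.0) == (0.0, REWARD_COLLECT_FAIL)
```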