mirror of
https://github.com/illiumst/marl-factory-grid.git
synced 2025-07-11 23:42:40 +02:00
Reset tsp route caching + renamed and moved configs + removed unnecessary files
This commit is contained in:
@ -1 +1 @@
|
||||
from marl_factory_grid.algorithms.marl.memory import MARLActorCriticMemory
|
||||
|
||||
|
@ -11,7 +11,6 @@ import numpy as np
|
||||
from torch.distributions import Categorical
|
||||
|
||||
from marl_factory_grid.algorithms.marl.base_a2c import PolicyGradient, cumulate_discount
|
||||
from marl_factory_grid.algorithms.marl.memory import MARLActorCriticMemory
|
||||
from marl_factory_grid.algorithms.utils import add_env_props, instantiate_class
|
||||
from pathlib import Path
|
||||
from collections import deque
|
||||
|
@ -2,8 +2,6 @@ import numpy as np; import torch as th; import scipy as sp;
|
||||
from collections import deque
|
||||
from torch import nn
|
||||
|
||||
# RLLab Magic for calculating the discounted return G(t) = R(t) + gamma * R(t-1)
|
||||
# cf. https://github.com/rll/rllab/blob/ba78e4c16dc492982e648f117875b22af3965579/rllab/misc/special.py#L107
|
||||
cumulate_discount = lambda x, gamma: sp.signal.lfilter([1], [1, - gamma], x[::-1], axis=0)[::-1]
|
||||
|
||||
class Net(th.nn.Module):
|
||||
|
@ -1,242 +0,0 @@
|
||||
import torch
|
||||
from typing import Union, List, Dict
|
||||
import numpy as np
|
||||
from torch.distributions import Categorical
|
||||
from marl_factory_grid.algorithms.marl.memory import MARLActorCriticMemory
|
||||
from marl_factory_grid.algorithms.utils import add_env_props, instantiate_class
|
||||
from pathlib import Path
|
||||
import pandas as pd
|
||||
from collections import deque
|
||||
|
||||
|
||||
class Names:
|
||||
REWARD = 'reward'
|
||||
DONE = 'done'
|
||||
ACTION = 'action'
|
||||
OBSERVATION = 'observation'
|
||||
LOGITS = 'logits'
|
||||
HIDDEN_ACTOR = 'hidden_actor'
|
||||
HIDDEN_CRITIC = 'hidden_critic'
|
||||
AGENT = 'agent'
|
||||
ENV = 'env'
|
||||
ENV_NAME = 'env_name'
|
||||
N_AGENTS = 'n_agents'
|
||||
ALGORITHM = 'algorithm'
|
||||
MAX_STEPS = 'max_steps'
|
||||
N_STEPS = 'n_steps'
|
||||
BUFFER_SIZE = 'buffer_size'
|
||||
CRITIC = 'critic'
|
||||
BATCH_SIZE = 'bnatch_size'
|
||||
N_ACTIONS = 'n_actions'
|
||||
TRAIN_RENDER = 'train_render'
|
||||
EVAL_RENDER = 'eval_render'
|
||||
|
||||
|
||||
nms = Names
|
||||
ListOrTensor = Union[List, torch.Tensor]
|
||||
|
||||
|
||||
class BaseActorCritic:
|
||||
def __init__(self, cfg):
|
||||
self.factory = add_env_props(cfg)
|
||||
self.__training = True
|
||||
self.cfg = cfg
|
||||
self.n_agents = cfg[nms.AGENT][nms.N_AGENTS]
|
||||
self.reset_memory_after_epoch = True
|
||||
self.setup()
|
||||
|
||||
def setup(self):
|
||||
self.net = instantiate_class(self.cfg[nms.AGENT])
|
||||
self.optimizer = torch.optim.RMSprop(self.net.parameters(), lr=3e-4, eps=1e-5)
|
||||
|
||||
@classmethod
|
||||
def _as_torch(cls, x):
|
||||
if isinstance(x, np.ndarray):
|
||||
return torch.from_numpy(x)
|
||||
elif isinstance(x, List):
|
||||
return torch.tensor(x)
|
||||
elif isinstance(x, (int, float)):
|
||||
return torch.tensor([x])
|
||||
return x
|
||||
|
||||
def train(self):
|
||||
self.__training = False
|
||||
networks = [self.net] if not isinstance(self.net, List) else self.net
|
||||
for net in networks:
|
||||
net.train()
|
||||
|
||||
def eval(self):
|
||||
self.__training = False
|
||||
networks = [self.net] if not isinstance(self.net, List) else self.net
|
||||
for net in networks:
|
||||
net.eval()
|
||||
|
||||
def load_state_dict(self, path: Path):
|
||||
pass
|
||||
|
||||
def get_actions(self, out) -> ListOrTensor:
|
||||
actions = [Categorical(logits=logits).sample().item() for logits in out[nms.LOGITS]]
|
||||
return actions
|
||||
|
||||
def init_hidden(self) -> Dict[str, ListOrTensor]:
|
||||
pass
|
||||
|
||||
def forward(self,
|
||||
observations: ListOrTensor,
|
||||
actions: ListOrTensor,
|
||||
hidden_actor: ListOrTensor,
|
||||
hidden_critic: ListOrTensor
|
||||
) -> Dict[str, ListOrTensor]:
|
||||
pass
|
||||
|
||||
@torch.no_grad()
|
||||
def train_loop(self, checkpointer=None):
|
||||
env = self.factory
|
||||
if self.cfg[nms.ENV][nms.TRAIN_RENDER]:
|
||||
env.render()
|
||||
n_steps, max_steps = [self.cfg[nms.ALGORITHM][k] for k in [nms.N_STEPS, nms.MAX_STEPS]]
|
||||
tm = MARLActorCriticMemory(self.n_agents, self.cfg[nms.ALGORITHM].get(nms.BUFFER_SIZE, n_steps))
|
||||
global_steps, episode, df_results = 0, 0, []
|
||||
reward_queue = deque(maxlen=2000)
|
||||
|
||||
while global_steps < max_steps:
|
||||
obs = env.reset()
|
||||
obs = list(obs.values())
|
||||
last_hiddens = self.init_hidden()
|
||||
last_action, reward = [-1] * self.n_agents, [0.] * self.n_agents
|
||||
done, rew_log = [False] * self.n_agents, 0
|
||||
|
||||
if self.reset_memory_after_epoch:
|
||||
tm.reset()
|
||||
|
||||
tm.add(observation=obs, action=last_action,
|
||||
logits=torch.zeros(self.n_agents, 1, self.cfg[nms.AGENT][nms.N_ACTIONS]),
|
||||
values=torch.zeros(self.n_agents, 1), reward=reward, done=done, **last_hiddens)
|
||||
|
||||
while not all(done):
|
||||
out = self.forward(obs, last_action, **last_hiddens)
|
||||
action = self.get_actions(out)
|
||||
_, next_obs, reward, done, info = env.step(action)
|
||||
done = [done] * self.n_agents if isinstance(done, bool) else done
|
||||
|
||||
if self.cfg[nms.ENV][nms.TRAIN_RENDER]:
|
||||
env.render()
|
||||
|
||||
last_hiddens = dict(hidden_actor=out[nms.HIDDEN_ACTOR],
|
||||
hidden_critic=out[nms.HIDDEN_CRITIC])
|
||||
|
||||
logits = torch.stack([tensor.squeeze(0) for tensor in out.get(nms.LOGITS, None)], dim=0)
|
||||
values = torch.stack([tensor.squeeze(0) for tensor in out.get(nms.CRITIC, None)], dim=0)
|
||||
|
||||
tm.add(observation=obs, action=action, reward=reward, done=done,
|
||||
logits=logits, values=values,
|
||||
**last_hiddens)
|
||||
|
||||
obs = next_obs
|
||||
last_action = action
|
||||
|
||||
if (global_steps+1) % n_steps == 0 or all(done):
|
||||
with torch.inference_mode(False):
|
||||
self.learn(tm)
|
||||
|
||||
global_steps += 1
|
||||
rew_log += sum(reward)
|
||||
reward_queue.extend(reward)
|
||||
|
||||
if checkpointer is not None:
|
||||
checkpointer.step([
|
||||
(f'agent#{i}', agent)
|
||||
for i, agent in enumerate([self.net] if not isinstance(self.net, List) else self.net)
|
||||
])
|
||||
|
||||
if global_steps >= max_steps:
|
||||
break
|
||||
if global_steps%100 == 0:
|
||||
print(f'reward at episode: {episode} = {rew_log}')
|
||||
episode += 1
|
||||
df_results.append([episode, rew_log, *reward])
|
||||
df_results = pd.DataFrame(df_results,
|
||||
columns=['steps', 'reward', *[f'agent#{i}' for i in range(self.n_agents)]]
|
||||
)
|
||||
if checkpointer is not None:
|
||||
df_results.to_csv(checkpointer.path / 'results.csv', index=False)
|
||||
return df_results
|
||||
|
||||
@torch.inference_mode(True)
|
||||
def eval_loop(self, n_episodes, render=False):
|
||||
env = self.factory
|
||||
if self.cfg[nms.ENV][nms.EVAL_RENDER]:
|
||||
env.render()
|
||||
episode, results = 0, []
|
||||
while episode < n_episodes:
|
||||
obs = env.reset()
|
||||
obs = list(obs.values())
|
||||
last_hiddens = self.init_hidden()
|
||||
last_action, reward = [-1] * self.n_agents, [0.] * self.n_agents
|
||||
done, rew_log, eps_rew = [False] * self.n_agents, 0, torch.zeros(self.n_agents)
|
||||
while not all(done):
|
||||
out = self.forward(obs, last_action, **last_hiddens)
|
||||
action = self.get_actions(out)
|
||||
_, next_obs, reward, done, info = env.step(action)
|
||||
|
||||
if self.cfg[nms.ENV][nms.EVAL_RENDER]:
|
||||
env.render()
|
||||
|
||||
if isinstance(done, bool):
|
||||
done = [done] * obs[0].shape[0]
|
||||
obs = next_obs
|
||||
last_action = action
|
||||
last_hiddens = dict(hidden_actor=out.get(nms.HIDDEN_ACTOR, None),
|
||||
hidden_critic=out.get(nms.HIDDEN_CRITIC, None)
|
||||
)
|
||||
eps_rew += torch.tensor(reward)
|
||||
results.append(eps_rew.tolist() + [sum(eps_rew).item()] + [episode])
|
||||
episode += 1
|
||||
agent_columns = [f'agent#{i}' for i in range(self.cfg[nms.ENV][nms.N_AGENTS])]
|
||||
results = pd.DataFrame(results, columns=agent_columns + ['sum', 'episode'])
|
||||
results = pd.melt(results, id_vars=['episode'], value_vars=agent_columns + ['sum'],
|
||||
value_name='reward', var_name='agent')
|
||||
return results
|
||||
|
||||
@staticmethod
|
||||
def compute_advantages(critic, reward, done, gamma, gae_coef=0.0):
|
||||
tds = (reward + gamma * (1.0 - done) * critic[:, 1:].detach()) - critic[:, :-1]
|
||||
|
||||
if gae_coef <= 0:
|
||||
return tds
|
||||
|
||||
gae = torch.zeros_like(tds[:, -1])
|
||||
gaes = []
|
||||
for t in range(tds.shape[1]-1, -1, -1):
|
||||
gae = tds[:, t] + gamma * gae_coef * (1.0 - done[:, t]) * gae
|
||||
gaes.insert(0, gae)
|
||||
gaes = torch.stack(gaes, dim=1)
|
||||
return gaes
|
||||
|
||||
def actor_critic(self, tm, network, gamma, entropy_coef, vf_coef, gae_coef=0.0, **kwargs):
|
||||
obs, actions, done, reward = tm.observation, tm.action, tm.done[:, 1:], tm.reward[:, 1:]
|
||||
|
||||
out = network(obs, actions, tm.hidden_actor[:, 0].squeeze(0), tm.hidden_critic[:, 0].squeeze(0))
|
||||
logits = out[nms.LOGITS][:, :-1] # last one only needed for v_{t+1}
|
||||
critic = out[nms.CRITIC]
|
||||
|
||||
entropy_loss = Categorical(logits=logits).entropy().mean(-1)
|
||||
advantages = self.compute_advantages(critic, reward, done, gamma, gae_coef)
|
||||
value_loss = advantages.pow(2).mean(-1) # n_agent
|
||||
|
||||
# policy loss
|
||||
log_ap = torch.log_softmax(logits, -1)
|
||||
log_ap = torch.gather(log_ap, dim=-1, index=actions[:, 1:].unsqueeze(-1)).squeeze()
|
||||
a2c_loss = -(advantages.detach() * log_ap).mean(-1)
|
||||
# weighted loss
|
||||
loss = a2c_loss + vf_coef*value_loss - entropy_coef * entropy_loss
|
||||
return loss.mean()
|
||||
|
||||
def learn(self, tm: MARLActorCriticMemory, **kwargs):
|
||||
loss = self.actor_critic(tm, self.net, **self.cfg[nms.ALGORITHM], **kwargs)
|
||||
# remove next_obs, will be added in next iter
|
||||
self.optimizer.zero_grad()
|
||||
loss.backward()
|
||||
torch.nn.utils.clip_grad_norm_(self.net.parameters(), 0.5)
|
||||
self.optimizer.step()
|
||||
|
@ -1,8 +0,0 @@
|
||||
marl_factory_grid>environment>rules.py#SpawnEntity.on_reset()
|
||||
marl_factory_grid>environment>rewards.py
|
||||
marl_factory_grid>modules>clean_up>groups.py#DirtPiles.trigger_spawn()
|
||||
marl_factory_grid>environment>rules.py#AgentSpawnRule
|
||||
marl_factory_grid>utils>states.py#GameState.__init__()
|
||||
marl_factory_grid>environment>factory.py>Factory#render
|
||||
marl_factory_grid>environment>factory.py>Factory#set_recorder
|
||||
marl_factory_grid>utils>renderer.py>Renderer#render
|
@ -1,57 +0,0 @@
|
||||
import torch
|
||||
from marl_factory_grid.algorithms.marl.base_ac import BaseActorCritic, nms
|
||||
from marl_factory_grid.algorithms.utils import instantiate_class
|
||||
from pathlib import Path
|
||||
from natsort import natsorted
|
||||
from marl_factory_grid.algorithms.marl.memory import MARLActorCriticMemory
|
||||
|
||||
|
||||
class LoopIAC(BaseActorCritic):
|
||||
|
||||
def __init__(self, cfg):
|
||||
super(LoopIAC, self).__init__(cfg)
|
||||
|
||||
def setup(self):
|
||||
self.net = [
|
||||
instantiate_class(self.cfg[nms.AGENT]) for _ in range(self.n_agents)
|
||||
]
|
||||
self.optimizer = [
|
||||
torch.optim.RMSprop(self.net[ag_i].parameters(), lr=3e-4, eps=1e-5) for ag_i in range(self.n_agents)
|
||||
]
|
||||
|
||||
def load_state_dict(self, path: Path):
|
||||
paths = natsorted(list(path.glob('*.pt')))
|
||||
for path, net in zip(paths, self.net):
|
||||
net.load_state_dict(torch.load(path))
|
||||
|
||||
@staticmethod
|
||||
def merge_dicts(ds): # todo could be recursive for more than 1 hierarchy
|
||||
d = {}
|
||||
for k in ds[0].keys():
|
||||
d[k] = [d[k] for d in ds]
|
||||
return d
|
||||
|
||||
def init_hidden(self):
|
||||
ha = [net.init_hidden_actor() for net in self.net]
|
||||
hc = [net.init_hidden_critic() for net in self.net]
|
||||
return dict(hidden_actor=ha, hidden_critic=hc)
|
||||
|
||||
def forward(self, observations, actions, hidden_actor, hidden_critic):
|
||||
outputs = [
|
||||
net(
|
||||
self._as_torch(observations[ag_i]).unsqueeze(0).unsqueeze(0), # agent x time
|
||||
self._as_torch(actions[ag_i]).unsqueeze(0),
|
||||
hidden_actor[ag_i],
|
||||
hidden_critic[ag_i]
|
||||
) for ag_i, net in enumerate(self.net)
|
||||
]
|
||||
return self.merge_dicts(outputs)
|
||||
|
||||
def learn(self, tms: MARLActorCriticMemory, **kwargs):
|
||||
for ag_i in range(self.n_agents):
|
||||
tm, net = tms(ag_i), self.net[ag_i]
|
||||
loss = self.actor_critic(tm, net, **self.cfg[nms.ALGORITHM], **kwargs)
|
||||
self.optimizer[ag_i].zero_grad()
|
||||
loss.backward()
|
||||
torch.nn.utils.clip_grad_norm_(net.parameters(), 0.5)
|
||||
self.optimizer[ag_i].step()
|
@ -1,66 +0,0 @@
|
||||
from marl_factory_grid.algorithms.marl.base_ac import Names as nms
|
||||
from marl_factory_grid.algorithms.marl.snac import LoopSNAC
|
||||
from marl_factory_grid.algorithms.marl.memory import MARLActorCriticMemory
|
||||
import torch
|
||||
from torch.distributions import Categorical
|
||||
from marl_factory_grid.algorithms.utils import instantiate_class
|
||||
|
||||
|
||||
class LoopMAPPO(LoopSNAC):
|
||||
def __init__(self, *args, **kwargs):
|
||||
super(LoopMAPPO, self).__init__(*args, **kwargs)
|
||||
self.reset_memory_after_epoch = False
|
||||
|
||||
def setup(self):
|
||||
self.net = instantiate_class(self.cfg[nms.AGENT])
|
||||
self.optimizer = torch.optim.Adam(self.net.parameters(), lr=3e-4, eps=1e-5)
|
||||
|
||||
def learn(self, tm: MARLActorCriticMemory, **kwargs):
|
||||
if len(tm) >= self.cfg['algorithm']['buffer_size']:
|
||||
# only learn when buffer is full
|
||||
for batch_i in range(self.cfg['algorithm']['n_updates']):
|
||||
batch = tm.chunk_dataloader(chunk_len=self.cfg['algorithm']['n_steps'],
|
||||
k=self.cfg['algorithm']['batch_size'])
|
||||
loss = self.mappo(batch, self.net, **self.cfg[nms.ALGORITHM], **kwargs)
|
||||
self.optimizer.zero_grad()
|
||||
loss.backward()
|
||||
torch.nn.utils.clip_grad_norm_(self.net.parameters(), 0.5)
|
||||
self.optimizer.step()
|
||||
|
||||
def monte_carlo_returns(self, rewards, done, gamma):
|
||||
rewards_ = []
|
||||
discounted_reward = torch.zeros_like(rewards[:, -1])
|
||||
for t in range(rewards.shape[1]-1, -1, -1):
|
||||
discounted_reward = rewards[:, t] + (gamma * (1.0 - done[:, t]) * discounted_reward)
|
||||
rewards_.insert(0, discounted_reward)
|
||||
rewards_ = torch.stack(rewards_, dim=1)
|
||||
return rewards_
|
||||
|
||||
def mappo(self, batch, network, gamma, entropy_coef, vf_coef, clip_range, **__):
|
||||
out = network(batch[nms.OBSERVATION], batch[nms.ACTION], batch[nms.HIDDEN_ACTOR], batch[nms.HIDDEN_CRITIC])
|
||||
logits = out[nms.LOGITS][:, :-1] # last one only needed for v_{t+1}
|
||||
|
||||
old_log_probs = torch.log_softmax(batch[nms.LOGITS], -1)
|
||||
old_log_probs = torch.gather(old_log_probs, index=batch[nms.ACTION][:, 1:].unsqueeze(-1), dim=-1).squeeze()
|
||||
|
||||
# monte carlo returns
|
||||
mc_returns = self.monte_carlo_returns(batch[nms.REWARD], batch[nms.DONE], gamma)
|
||||
mc_returns = (mc_returns - mc_returns.mean()) / (mc_returns.std() + 1e-8) # todo: norm across agent ok?
|
||||
advantages = mc_returns - out[nms.CRITIC][:, :-1]
|
||||
|
||||
# policy loss
|
||||
log_ap = torch.log_softmax(logits, -1)
|
||||
log_ap = torch.gather(log_ap, dim=-1, index=batch[nms.ACTION][:, 1:].unsqueeze(-1)).squeeze()
|
||||
ratio = (log_ap - old_log_probs).exp()
|
||||
surr1 = ratio * advantages.detach()
|
||||
surr2 = torch.clamp(ratio, 1 - clip_range, 1 + clip_range) * advantages.detach()
|
||||
policy_loss = -torch.min(surr1, surr2).mean(-1)
|
||||
|
||||
# entropy & value loss
|
||||
entropy_loss = Categorical(logits=logits).entropy().mean(-1)
|
||||
value_loss = advantages.pow(2).mean(-1) # n_agent
|
||||
|
||||
# weighted loss
|
||||
loss = policy_loss + vf_coef*value_loss - entropy_coef * entropy_loss
|
||||
|
||||
return loss.mean()
|
@ -1,221 +0,0 @@
|
||||
import numpy as np
|
||||
from collections import deque
|
||||
import torch
|
||||
from typing import Union
|
||||
from torch import Tensor
|
||||
from torch.utils.data import Dataset, ConcatDataset
|
||||
import random
|
||||
|
||||
|
||||
class ActorCriticMemory(object):
|
||||
def __init__(self, capacity=10):
|
||||
self.capacity = capacity
|
||||
self.reset()
|
||||
|
||||
def reset(self):
|
||||
self.__actions = LazyTensorFiFoQueue(maxlen=self.capacity+1)
|
||||
self.__hidden_actor = LazyTensorFiFoQueue(maxlen=self.capacity+1)
|
||||
self.__hidden_critic = LazyTensorFiFoQueue(maxlen=self.capacity+1)
|
||||
self.__states = LazyTensorFiFoQueue(maxlen=self.capacity+1)
|
||||
self.__rewards = LazyTensorFiFoQueue(maxlen=self.capacity+1)
|
||||
self.__dones = LazyTensorFiFoQueue(maxlen=self.capacity+1)
|
||||
self.__logits = LazyTensorFiFoQueue(maxlen=self.capacity+1)
|
||||
self.__values = LazyTensorFiFoQueue(maxlen=self.capacity+1)
|
||||
|
||||
def __len__(self):
|
||||
return len(self.__rewards) - 1
|
||||
|
||||
@property
|
||||
def observation(self, sls=slice(0, None)): # add time dimension through stacking
|
||||
return self.__states[sls].unsqueeze(0) # 1 x time x hidden dim
|
||||
|
||||
@property
|
||||
def hidden_actor(self, sls=slice(0, None)): # 1 x n_layers x dim
|
||||
return self.__hidden_actor[sls].unsqueeze(0) # 1 x time x n_layers x dim
|
||||
|
||||
@property
|
||||
def hidden_critic(self, sls=slice(0, None)): # 1 x n_layers x dim
|
||||
return self.__hidden_critic[sls].unsqueeze(0) # 1 x time x n_layers x dim
|
||||
|
||||
@property
|
||||
def reward(self, sls=slice(0, None)):
|
||||
return self.__rewards[sls].squeeze().unsqueeze(0) # 1 x time
|
||||
|
||||
@property
|
||||
def action(self, sls=slice(0, None)):
|
||||
return self.__actions[sls].long().squeeze().unsqueeze(0) # 1 x time
|
||||
|
||||
@property
|
||||
def done(self, sls=slice(0, None)):
|
||||
return self.__dones[sls].float().squeeze().unsqueeze(0) # 1 x time
|
||||
|
||||
@property
|
||||
def logits(self, sls=slice(0, None)): # assumes a trailing 1 for time dimension - common when using output from NN
|
||||
return self.__logits[sls].squeeze().unsqueeze(0) # 1 x time x actions
|
||||
|
||||
@property
|
||||
def values(self, sls=slice(0, None)):
|
||||
return self.__values[sls].squeeze().unsqueeze(0) # 1 x time x actions
|
||||
|
||||
def add_observation(self, state: Union[Tensor, np.ndarray]):
|
||||
self.__states.append(state if isinstance(state, Tensor) else torch.from_numpy(state))
|
||||
|
||||
def add_hidden_actor(self, hidden: Tensor):
|
||||
# layers x hidden dim
|
||||
self.__hidden_actor.append(hidden)
|
||||
|
||||
def add_hidden_critic(self, hidden: Tensor):
|
||||
# layers x hidden dim
|
||||
self.__hidden_critic.append(hidden)
|
||||
|
||||
def add_action(self, action: Union[int, Tensor]):
|
||||
if not isinstance(action, Tensor):
|
||||
action = torch.tensor(action)
|
||||
self.__actions.append(action)
|
||||
|
||||
def add_reward(self, reward: Union[float, Tensor]):
|
||||
if not isinstance(reward, Tensor):
|
||||
reward = torch.tensor(reward)
|
||||
self.__rewards.append(reward)
|
||||
|
||||
def add_done(self, done: bool):
|
||||
if not isinstance(done, Tensor):
|
||||
done = torch.tensor(done)
|
||||
self.__dones.append(done)
|
||||
|
||||
def add_logits(self, logits: Tensor):
|
||||
self.__logits.append(logits)
|
||||
|
||||
def add_values(self, values: Tensor):
|
||||
self.__values.append(values)
|
||||
|
||||
def add(self, **kwargs):
|
||||
for k, v in kwargs.items():
|
||||
func = getattr(ActorCriticMemory, f'add_{k}')
|
||||
func(self, v)
|
||||
|
||||
|
||||
class MARLActorCriticMemory(object):
|
||||
def __init__(self, n_agents, capacity):
|
||||
self.n_agents = n_agents
|
||||
self.memories = [
|
||||
ActorCriticMemory(capacity) for _ in range(n_agents)
|
||||
]
|
||||
|
||||
def __call__(self, agent_i):
|
||||
return self.memories[agent_i]
|
||||
|
||||
def __len__(self):
|
||||
return len(self.memories[0]) # todo add assertion check!
|
||||
|
||||
def reset(self):
|
||||
for mem in self.memories:
|
||||
mem.reset()
|
||||
|
||||
def add(self, **kwargs):
|
||||
for agent_i in range(self.n_agents):
|
||||
for k, v in kwargs.items():
|
||||
func = getattr(ActorCriticMemory, f'add_{k}')
|
||||
func(self.memories[agent_i], v[agent_i])
|
||||
|
||||
def __getattr__(self, attr):
|
||||
all_attrs = [getattr(mem, attr) for mem in self.memories]
|
||||
return torch.cat(all_attrs, 0) # agent x time ...
|
||||
|
||||
def chunk_dataloader(self, chunk_len, k):
|
||||
datasets = [ExperienceChunks(mem, chunk_len, k) for mem in self.memories]
|
||||
dataset = ConcatDataset(datasets)
|
||||
data = [dataset[i] for i in range(len(dataset))]
|
||||
data = custom_collate_fn(data)
|
||||
return data
|
||||
|
||||
|
||||
def custom_collate_fn(batch):
|
||||
elem = batch[0]
|
||||
return {key: torch.cat([d[key] for d in batch], dim=0) for key in elem}
|
||||
|
||||
|
||||
class ExperienceChunks(Dataset):
|
||||
def __init__(self, memory, chunk_len, k):
|
||||
assert chunk_len <= len(memory), 'chunk_len cannot be longer than the size of the memory'
|
||||
self.memory = memory
|
||||
self.chunk_len = chunk_len
|
||||
self.k = k
|
||||
|
||||
@property
|
||||
def whitelist(self):
|
||||
whitelist = torch.ones(len(self.memory) - self.chunk_len)
|
||||
for d in self.memory.done.squeeze().nonzero().flatten():
|
||||
whitelist[max((0, d-self.chunk_len-1)):d+2] = 0
|
||||
whitelist[0] = 0
|
||||
return whitelist.tolist()
|
||||
|
||||
def sample(self, start=1):
|
||||
cl = self.chunk_len
|
||||
sample = dict(observation=self.memory.observation[:, start:start+cl+1],
|
||||
action=self.memory.action[:, start-1:start+cl],
|
||||
hidden_actor=self.memory.hidden_actor[:, start-1],
|
||||
hidden_critic=self.memory.hidden_critic[:, start-1],
|
||||
reward=self.memory.reward[:, start:start + cl],
|
||||
done=self.memory.done[:, start:start + cl],
|
||||
logits=self.memory.logits[:, start:start + cl],
|
||||
values=self.memory.values[:, start:start + cl])
|
||||
return sample
|
||||
|
||||
def __len__(self):
|
||||
return self.k
|
||||
|
||||
def __getitem__(self, i):
|
||||
idx = random.choices(range(0, len(self.memory) - self.chunk_len), weights=self.whitelist, k=1)
|
||||
return self.sample(idx[0])
|
||||
|
||||
|
||||
class LazyTensorFiFoQueue:
|
||||
def __init__(self, maxlen):
|
||||
self.maxlen = maxlen
|
||||
self.reset()
|
||||
|
||||
def reset(self):
|
||||
self.__lazy_queue = deque(maxlen=self.maxlen)
|
||||
self.shape = None
|
||||
self.queue = None
|
||||
|
||||
def shape_init(self, tensor: Tensor):
|
||||
self.shape = torch.Size([self.maxlen, *tensor.shape])
|
||||
|
||||
def build_tensor_queue(self):
|
||||
if len(self.__lazy_queue) > 0:
|
||||
block = torch.stack(list(self.__lazy_queue), dim=0)
|
||||
l = block.shape[0]
|
||||
if self.queue is None:
|
||||
self.queue = block
|
||||
elif self.true_len() <= self.maxlen:
|
||||
self.queue = torch.cat((self.queue, block), dim=0)
|
||||
else:
|
||||
self.queue = torch.cat((self.queue[l:], block), dim=0)
|
||||
self.__lazy_queue.clear()
|
||||
|
||||
def append(self, data):
|
||||
if self.shape is None:
|
||||
self.shape_init(data)
|
||||
self.__lazy_queue.append(data)
|
||||
if len(self.__lazy_queue) >= self.maxlen:
|
||||
self.build_tensor_queue()
|
||||
|
||||
def true_len(self):
|
||||
return len(self.__lazy_queue) + (0 if self.queue is None else self.queue.shape[0])
|
||||
|
||||
def __len__(self):
|
||||
return min((self.true_len(), self.maxlen))
|
||||
|
||||
def __str__(self):
|
||||
return f'LazyTensorFiFoQueue\tmaxlen: {self.maxlen}, shape: {self.shape}, ' \
|
||||
f'len: {len(self)}, true_len: {self.true_len()}, elements in lazy queue: {len(self.__lazy_queue)}'
|
||||
|
||||
def __getitem__(self, item_or_slice):
|
||||
self.build_tensor_queue()
|
||||
return self.queue[item_or_slice]
|
||||
|
||||
|
||||
|
||||
|
@ -7,8 +7,8 @@ agent:
|
||||
hidden_size_critic: 64
|
||||
use_agent_embedding: False
|
||||
env:
|
||||
classname: marl_factory_grid.configs.custom
|
||||
env_name: "custom/MultiAgentConfigs/dirt_quadrant_train_config"
|
||||
classname: marl_factory_grid.environment.configs.marl_eval
|
||||
env_name: "marl_eval/dirt_quadrant_eval_config"
|
||||
n_agents: 2
|
||||
max_steps: 250
|
||||
pomdp_r: 2
|
@ -7,8 +7,8 @@ agent:
|
||||
hidden_size_critic: 64
|
||||
use_agent_embedding: False
|
||||
env:
|
||||
classname: marl_factory_grid.configs.custom
|
||||
env_name: "custom/two_rooms_one_door_modified_train_config"
|
||||
classname: marl_factory_grid.environment.configs.marl_eval
|
||||
env_name: "marl_eval/two_rooms_eval_config"
|
||||
n_agents: 2
|
||||
max_steps: 250
|
||||
pomdp_r: 2
|
@ -1,103 +0,0 @@
|
||||
import numpy as np
|
||||
import torch
|
||||
import torch.nn as nn
|
||||
import torch.nn.functional as F
|
||||
|
||||
|
||||
class RecurrentAC(nn.Module):
|
||||
def __init__(self, observation_size, n_actions, obs_emb_size,
|
||||
action_emb_size, hidden_size_actor, hidden_size_critic,
|
||||
n_agents, use_agent_embedding=True):
|
||||
super(RecurrentAC, self).__init__()
|
||||
observation_size = np.prod(observation_size)
|
||||
self.n_layers = 1
|
||||
self.n_actions = n_actions
|
||||
self.use_agent_embedding = use_agent_embedding
|
||||
self.hidden_size_actor = hidden_size_actor
|
||||
self.hidden_size_critic = hidden_size_critic
|
||||
self.action_emb_size = action_emb_size
|
||||
self.obs_proj = nn.Linear(observation_size, obs_emb_size)
|
||||
self.action_emb = nn.Embedding(n_actions+1, action_emb_size, padding_idx=0)
|
||||
self.agent_emb = nn.Embedding(n_agents, action_emb_size)
|
||||
mix_in_size = obs_emb_size+action_emb_size if not use_agent_embedding else obs_emb_size+n_agents*action_emb_size
|
||||
self.mix = nn.Sequential(nn.Tanh(),
|
||||
nn.Linear(mix_in_size, obs_emb_size),
|
||||
nn.Tanh(),
|
||||
nn.Linear(obs_emb_size, obs_emb_size)
|
||||
)
|
||||
self.gru_actor = nn.GRU(obs_emb_size, hidden_size_actor, batch_first=True, num_layers=self.n_layers)
|
||||
self.gru_critic = nn.GRU(obs_emb_size, hidden_size_critic, batch_first=True, num_layers=self.n_layers)
|
||||
self.action_head = nn.Sequential(
|
||||
nn.Linear(hidden_size_actor, hidden_size_actor),
|
||||
nn.Tanh(),
|
||||
nn.Linear(hidden_size_actor, n_actions)
|
||||
)
|
||||
# spectral_norm(nn.Linear(hidden_size_actor, hidden_size_actor)),
|
||||
self.critic_head = nn.Sequential(
|
||||
nn.Linear(hidden_size_critic, hidden_size_critic),
|
||||
nn.Tanh(),
|
||||
nn.Linear(hidden_size_critic, 1)
|
||||
)
|
||||
#self.action_head[-1].weight.data.uniform_(-3e-3, 3e-3)
|
||||
#self.action_head[-1].bias.data.uniform_(-3e-3, 3e-3)
|
||||
|
||||
def init_hidden_actor(self):
|
||||
return torch.zeros(1, self.n_layers, self.hidden_size_actor)
|
||||
|
||||
def init_hidden_critic(self):
|
||||
return torch.zeros(1, self.n_layers, self.hidden_size_critic)
|
||||
|
||||
def forward(self, observations, actions, hidden_actor=None, hidden_critic=None):
|
||||
n_agents, t, *_ = observations.shape
|
||||
obs_emb = self.obs_proj(observations.view(n_agents, t, -1).float())
|
||||
action_emb = self.action_emb(actions+1) # shift by one due to padding idx
|
||||
|
||||
if not self.use_agent_embedding:
|
||||
x_t = torch.cat((obs_emb, action_emb), -1)
|
||||
else:
|
||||
agent_emb = self.agent_emb(
|
||||
torch.cat([torch.arange(0, n_agents, 1).view(-1, 1)] * t, 1)
|
||||
)
|
||||
x_t = torch.cat((obs_emb, agent_emb, action_emb), -1)
|
||||
|
||||
mixed_x_t = self.mix(x_t)
|
||||
output_p, _ = self.gru_actor(input=mixed_x_t, hx=hidden_actor.swapaxes(1, 0))
|
||||
output_c, _ = self.gru_critic(input=mixed_x_t, hx=hidden_critic.swapaxes(1, 0))
|
||||
|
||||
logits = self.action_head(output_p)
|
||||
critic = self.critic_head(output_c).squeeze(-1)
|
||||
return dict(logits=logits, critic=critic, hidden_actor=output_p, hidden_critic=output_c)
|
||||
|
||||
|
||||
class RecurrentACL2(RecurrentAC):
|
||||
def __init__(self, *args, **kwargs):
|
||||
super().__init__(*args, **kwargs)
|
||||
self.action_head = nn.Sequential(
|
||||
nn.Linear(self.hidden_size_actor, self.hidden_size_actor),
|
||||
nn.Tanh(),
|
||||
NormalizedLinear(self.hidden_size_actor, self.n_actions, trainable_magnitude=True)
|
||||
)
|
||||
|
||||
|
||||
class NormalizedLinear(nn.Linear):
|
||||
def __init__(self, in_features: int, out_features: int,
|
||||
device=None, dtype=None, trainable_magnitude=False):
|
||||
super(NormalizedLinear, self).__init__(in_features, out_features, False, device, dtype)
|
||||
self.d_sqrt = in_features**0.5
|
||||
self.trainable_magnitude = trainable_magnitude
|
||||
self.scale = nn.Parameter(torch.tensor([1.]), requires_grad=trainable_magnitude)
|
||||
|
||||
def forward(self, in_array):
|
||||
normalized_input = F.normalize(in_array, dim=-1, p=2, eps=1e-5)
|
||||
normalized_weight = F.normalize(self.weight, dim=-1, p=2, eps=1e-5)
|
||||
return F.linear(normalized_input, normalized_weight) * self.d_sqrt * self.scale
|
||||
|
||||
|
||||
class L2Norm(nn.Module):
|
||||
def __init__(self, in_features, trainable_magnitude=False):
|
||||
super(L2Norm, self).__init__()
|
||||
self.d_sqrt = in_features**0.5
|
||||
self.scale = nn.Parameter(torch.tensor([1.]), requires_grad=trainable_magnitude)
|
||||
|
||||
def forward(self, x):
|
||||
return F.normalize(x, dim=-1, p=2, eps=1e-5) * self.d_sqrt * self.scale
|
@ -1,55 +0,0 @@
|
||||
import torch
|
||||
from torch.distributions import Categorical
|
||||
from marl_factory_grid.algorithms.marl.iac import LoopIAC
|
||||
from marl_factory_grid.algorithms.marl.base_ac import nms
|
||||
from marl_factory_grid.algorithms.marl.memory import MARLActorCriticMemory
|
||||
|
||||
|
||||
class LoopSEAC(LoopIAC):
|
||||
def __init__(self, cfg):
|
||||
super(LoopSEAC, self).__init__(cfg)
|
||||
|
||||
def actor_critic(self, tm, networks, gamma, entropy_coef, vf_coef, gae_coef=0.0, **kwargs):
|
||||
obs, actions, done, reward = tm.observation, tm.action, tm.done[:, 1:], tm.reward[:, 1:]
|
||||
outputs = [net(obs, actions, tm.hidden_actor[:, 0], tm.hidden_critic[:, 0]) for net in networks]
|
||||
|
||||
with torch.inference_mode(True):
|
||||
true_action_logp = torch.stack([
|
||||
torch.log_softmax(out[nms.LOGITS][ag_i, :-1], -1)
|
||||
.gather(index=actions[ag_i, 1:, None], dim=-1)
|
||||
for ag_i, out in enumerate(outputs)
|
||||
], 0).squeeze()
|
||||
|
||||
losses = []
|
||||
|
||||
for ag_i, out in enumerate(outputs):
|
||||
logits = out[nms.LOGITS][:, :-1] # last one only needed for v_{t+1}
|
||||
critic = out[nms.CRITIC]
|
||||
|
||||
entropy_loss = Categorical(logits=logits[ag_i]).entropy().mean()
|
||||
advantages = self.compute_advantages(critic, reward, done, gamma, gae_coef)
|
||||
|
||||
# policy loss
|
||||
log_ap = torch.log_softmax(logits, -1)
|
||||
log_ap = torch.gather(log_ap, dim=-1, index=actions[:, 1:].unsqueeze(-1)).squeeze()
|
||||
|
||||
# importance weights
|
||||
iw = (log_ap - true_action_logp).exp().detach() # importance_weights
|
||||
|
||||
a2c_loss = (-iw*log_ap * advantages.detach()).mean(-1)
|
||||
|
||||
value_loss = (iw*advantages.pow(2)).mean(-1) # n_agent
|
||||
|
||||
# weighted loss
|
||||
loss = (a2c_loss + vf_coef*value_loss - entropy_coef * entropy_loss).mean()
|
||||
losses.append(loss)
|
||||
|
||||
return losses
|
||||
|
||||
def learn(self, tms: MARLActorCriticMemory, **kwargs):
|
||||
losses = self.actor_critic(tms, self.net, **self.cfg[nms.ALGORITHM], **kwargs)
|
||||
for ag_i, loss in enumerate(losses):
|
||||
self.optimizer[ag_i].zero_grad()
|
||||
loss.backward()
|
||||
torch.nn.utils.clip_grad_norm_(self.net[ag_i].parameters(), 0.5)
|
||||
self.optimizer[ag_i].step()
|
@ -7,8 +7,8 @@ agent:
|
||||
hidden_size_critic: 64
|
||||
use_agent_embedding: False
|
||||
env:
|
||||
classname: marl_factory_grid.configs.custom
|
||||
env_name: "custom/dirt_quadrant_train_config"
|
||||
classname: marl_factory_grid.environment.configs.rl
|
||||
env_name: "rl/dirt_quadrant_train_config"
|
||||
n_agents: 1
|
||||
max_steps: 250
|
||||
pomdp_r: 2
|
@ -7,8 +7,8 @@ agent:
|
||||
hidden_size_critic: 64
|
||||
use_agent_embedding: False
|
||||
env:
|
||||
classname: marl_factory_grid.configs.custom
|
||||
env_name: "custom/two_rooms_one_door_modified_train_config"
|
||||
classname: marl_factory_grid.environment.configs.rl
|
||||
env_name: "rl/two_rooms_train_config"
|
||||
n_agents: 1
|
||||
max_steps: 250
|
||||
pomdp_r: 2
|
@ -1,33 +0,0 @@
|
||||
from marl_factory_grid.algorithms.marl.base_ac import BaseActorCritic
|
||||
from marl_factory_grid.algorithms.marl.base_ac import nms
|
||||
import torch
|
||||
from torch.distributions import Categorical
|
||||
from pathlib import Path
|
||||
|
||||
|
||||
class LoopSNAC(BaseActorCritic):
|
||||
def __init__(self, cfg):
|
||||
super().__init__(cfg)
|
||||
|
||||
def load_state_dict(self, path: Path):
|
||||
path2weights = list(path.glob('*.pt'))
|
||||
assert len(path2weights) == 1, f'Expected a single set of weights but got {len(path2weights)}'
|
||||
self.net.load_state_dict(torch.load(path2weights[0]))
|
||||
|
||||
def init_hidden(self):
|
||||
hidden_actor = self.net.init_hidden_actor()
|
||||
hidden_critic = self.net.init_hidden_critic()
|
||||
return dict(hidden_actor=torch.cat([hidden_actor] * self.n_agents, 0),
|
||||
hidden_critic=torch.cat([hidden_critic] * self.n_agents, 0)
|
||||
)
|
||||
|
||||
def get_actions(self, out):
|
||||
actions = Categorical(logits=out[nms.LOGITS]).sample().squeeze()
|
||||
return actions
|
||||
|
||||
def forward(self, observations, actions, hidden_actor, hidden_critic):
|
||||
out = self.net(self._as_torch(observations).unsqueeze(1),
|
||||
self._as_torch(actions).unsqueeze(1),
|
||||
hidden_actor, hidden_critic
|
||||
)
|
||||
return out
|
@ -37,7 +37,6 @@ class TSPBaseAgent(ABC):
|
||||
self._position_graph = self.generate_pos_graph()
|
||||
self._static_route = None
|
||||
self.cached_route = None
|
||||
self.fallback_action = None
|
||||
self.action_list = []
|
||||
|
||||
@abstractmethod
|
||||
@ -50,46 +49,6 @@ class TSPBaseAgent(ABC):
|
||||
"""
|
||||
return 0
|
||||
|
||||
def calculate_tsp_route(self, target_identifier):
|
||||
"""
|
||||
Calculate the TSP route to reach a target.
|
||||
|
||||
:param target_identifier: Identifier of the target entity
|
||||
:type target_identifier: str
|
||||
|
||||
:return: TSP route
|
||||
:rtype: List[int]
|
||||
"""
|
||||
target_positions = [x for x in self._env.state[target_identifier].positions if x != c.VALUE_NO_POS]
|
||||
|
||||
# if there are cached routes, search for one matching the current and target position
|
||||
if self._env.state.route_cache and (
|
||||
route := self._env.state.get_cached_route(self.state.pos, target_positions)) is not None:
|
||||
# print(f"Retrieved cached route: {route}")
|
||||
return route
|
||||
# if none are found, calculate tsp route and cache it
|
||||
else:
|
||||
start_time = time.time()
|
||||
if self.local_optimization:
|
||||
nodes = \
|
||||
[self.state.pos] + \
|
||||
[x for x in target_positions if max(abs(np.subtract(x, self.state.pos))) < 3]
|
||||
try:
|
||||
while len(nodes) < 7:
|
||||
nodes += [next(x for x in target_positions if x not in nodes)]
|
||||
except StopIteration:
|
||||
nodes = [self.state.pos] + target_positions
|
||||
|
||||
else:
|
||||
nodes = [self.state.pos] + target_positions
|
||||
|
||||
route = tsp.traveling_salesman_problem(self._position_graph,
|
||||
nodes=nodes, cycle=True, method=tsp.greedy_tsp)
|
||||
duration = time.time() - start_time
|
||||
print("TSP calculation took {:.2f} seconds to execute".format(duration))
|
||||
self._env.state.cache_route(route)
|
||||
return route
|
||||
|
||||
def _use_door_or_move(self, door, target):
|
||||
"""
|
||||
Helper method to decide whether to use a door or move towards a target.
|
||||
@ -108,6 +67,47 @@ class TSPBaseAgent(ABC):
|
||||
action = self._predict_move(target)
|
||||
return action
|
||||
|
||||
def calculate_tsp_route(self, target_identifier):
|
||||
"""
|
||||
Calculate the TSP route to reach a target.
|
||||
|
||||
:param target_identifier: Identifier of the target entity
|
||||
:type target_identifier: str
|
||||
|
||||
:return: TSP route
|
||||
:rtype: List[int]
|
||||
"""
|
||||
start_time = time.time()
|
||||
|
||||
if self.cached_route is not None:
|
||||
#print(f" Used cached route: {self.cached_route}")
|
||||
return copy.deepcopy(self.cached_route)
|
||||
|
||||
else:
|
||||
positions = [x for x in self._env.state[target_identifier].positions if x != c.VALUE_NO_POS]
|
||||
if self.local_optimization:
|
||||
nodes = \
|
||||
[self.state.pos] + \
|
||||
[x for x in positions if max(abs(np.subtract(x, self.state.pos))) < 3]
|
||||
try:
|
||||
while len(nodes) < 7:
|
||||
nodes += [next(x for x in positions if x not in nodes)]
|
||||
except StopIteration:
|
||||
nodes = [self.state.pos] + positions
|
||||
|
||||
else:
|
||||
nodes = [self.state.pos] + positions
|
||||
|
||||
route = tsp.traveling_salesman_problem(self._position_graph,
|
||||
nodes=nodes, cycle=True, method=tsp.greedy_tsp)
|
||||
self.cached_route = copy.deepcopy(route)
|
||||
#print(f"Cached route: {self.cached_route}")
|
||||
|
||||
end_time = time.time()
|
||||
duration = end_time - start_time
|
||||
#print("TSP calculation took {:.2f} seconds to execute".format(duration))
|
||||
return route
|
||||
|
||||
def _door_is_close(self, state):
|
||||
"""
|
||||
Check if a door is close to the agent's position.
|
||||
@ -173,11 +173,8 @@ class TSPBaseAgent(ABC):
|
||||
action = next(action for action, pos_diff in MOVEMAP.items() if
|
||||
np.all(diff == pos_diff) and action in allowed_directions)
|
||||
except StopIteration:
|
||||
print(f"No valid action found for pos diff: {diff}. Using fallback action: {self.fallback_action}.")
|
||||
if self.fallback_action and any(self.fallback_action == action.name for action in self.state.actions):
|
||||
action = self.fallback_action
|
||||
else:
|
||||
action = choice(self.state.actions).name
|
||||
print(f"No valid action found for pos diff: {diff}. Using fallback action.")
|
||||
action = choice(self.state.actions).name
|
||||
else:
|
||||
action = choice(self.state.actions).name
|
||||
# noinspection PyUnboundLocalVariable
|
||||
|
@ -1,76 +0,0 @@
|
||||
import numpy as np
|
||||
|
||||
from marl_factory_grid.algorithms.static.TSP_base_agent import TSPBaseAgent
|
||||
|
||||
from marl_factory_grid.modules.items import constants as i
|
||||
from marl_factory_grid.environment import constants as c
|
||||
|
||||
future_planning = 7
|
||||
inventory_size = 3
|
||||
|
||||
MODE_GET = 'Mode_Get'
|
||||
MODE_BRING = 'Mode_Bring'
|
||||
|
||||
|
||||
class TSPItemAgent(TSPBaseAgent):
|
||||
|
||||
def __init__(self, *args, mode=MODE_GET, **kwargs):
|
||||
"""
|
||||
Initializes a TSPItemAgent that colects items in the environment, stores them in his inventory and drops them off
|
||||
at a drop-off location.
|
||||
|
||||
:param mode: Mode of the agent, either MODE_GET or MODE_BRING.
|
||||
"""
|
||||
super(TSPItemAgent, self).__init__(*args, **kwargs)
|
||||
self.mode = mode
|
||||
self.fallback_action = c.NOOP
|
||||
|
||||
def predict(self, *_, **__):
|
||||
item_at_position = self._env.state[i.ITEM].by_pos(self.state.pos)
|
||||
dropoff_at_position = self._env.state[i.DROP_OFF].by_pos(self.state.pos)
|
||||
if item_at_position:
|
||||
# Translate the action_object to an integer to have the same output as any other model
|
||||
action = i.ITEM_ACTION
|
||||
elif dropoff_at_position:
|
||||
# Translate the action_object to an integer to have the same output as any other model
|
||||
action = i.ITEM_ACTION
|
||||
elif door := self._door_is_close(self._env.state):
|
||||
action = self._use_door_or_move(door, i.DROP_OFF if self.mode == MODE_BRING else i.ITEM)
|
||||
else:
|
||||
action = self._choose()
|
||||
self.action_list.append(action)
|
||||
# Translate the action_object to an integer to have the same output as any other model
|
||||
try:
|
||||
action_obj = next(action_i for action_i, a in enumerate(self.state.actions) if a.name == action)
|
||||
except (StopIteration, UnboundLocalError):
|
||||
print('Will not happen')
|
||||
raise EnvironmentError
|
||||
# noinspection PyUnboundLocalVariable
|
||||
if self.mode == MODE_BRING and len(self._env[i.INVENTORY].by_entity(self.state)):
|
||||
pass
|
||||
elif self.mode == MODE_BRING and not len(self._env[i.INVENTORY].by_entity(self.state)):
|
||||
self.mode = MODE_GET
|
||||
elif self.mode == MODE_GET and len(self._env[i.INVENTORY].by_entity(self.state)) > inventory_size:
|
||||
self.mode = MODE_BRING
|
||||
else:
|
||||
pass
|
||||
return action_obj
|
||||
|
||||
def _choose(self):
|
||||
"""
|
||||
Internal Usage. Chooses the action based on the agent's mode and the environment state.
|
||||
|
||||
:return: Chosen action.
|
||||
:rtype: int
|
||||
"""
|
||||
target = i.DROP_OFF if self.mode == MODE_BRING else i.ITEM
|
||||
if len(self._env.state[i.ITEM]) >= 1:
|
||||
action = self._predict_move(target)
|
||||
|
||||
elif len(self._env[i.INVENTORY].by_entity(self.state)):
|
||||
self.mode = MODE_BRING
|
||||
action = self._predict_move(target)
|
||||
else:
|
||||
action = int(np.random.randint(self._env.action_space.n))
|
||||
# noinspection PyUnboundLocalVariable
|
||||
return action
|
@ -1,27 +0,0 @@
|
||||
from random import randint
|
||||
|
||||
from marl_factory_grid.algorithms.static.TSP_base_agent import TSPBaseAgent
|
||||
|
||||
future_planning = 7
|
||||
|
||||
|
||||
class TSPRandomAgent(TSPBaseAgent):
|
||||
|
||||
def __init__(self, n_actions, *args, **kwargs):
|
||||
"""
|
||||
Initializes a TSPRandomAgent that performs random actions from within his action space.
|
||||
|
||||
:param n_actions: Number of possible actions.
|
||||
:type n_actions: int
|
||||
"""
|
||||
super(TSPRandomAgent, self).__init__(*args, **kwargs)
|
||||
self.n_action = n_actions
|
||||
|
||||
def predict(self, *_, **__):
|
||||
"""
|
||||
Predicts the next action randomly.
|
||||
|
||||
:return: Predicted action.
|
||||
:rtype: int
|
||||
"""
|
||||
return randint(0, self.n_action - 1)
|
@ -58,7 +58,7 @@ def load_yaml_file(path: Path):
|
||||
|
||||
def add_env_props(cfg):
|
||||
# Path to config File
|
||||
env_path = Path(f'../marl_factory_grid/configs/{cfg["env"]["env_name"]}.yaml')
|
||||
env_path = Path(f'../marl_factory_grid/environment/configs/{cfg["env"]["env_name"]}.yaml')
|
||||
|
||||
# Env Init
|
||||
factory = Factory(env_path)
|
||||
|
Reference in New Issue
Block a user