Mirror of https://github.com/illiumst/marl-factory-grid.git (synced 2025-06-18 18:52:52 +02:00)
Resolved some warnings and style issues
@@ -1 +1,4 @@
-import os, sys; sys.path.append(os.path.dirname(os.path.realpath(__file__)))
+import os
+import sys
+
+sys.path.append(os.path.dirname(os.path.realpath(__file__)))
@@ -1 +1 @@
-from marl_factory_grid.algorithms.marl.memory import MARLActorCriticMemory
+from marl_factory_grid.algorithms.marl.memory import MARLActorCriticMemory
@@ -28,6 +28,7 @@ class Names:
     BATCH_SIZE = 'bnatch_size'
     N_ACTIONS = 'n_actions'
 
+
 nms = Names
 ListOrTensor = Union[List, torch.Tensor]
 
@@ -112,10 +113,9 @@ class BaseActorCritic:
                 next_obs, reward, done, info = env.step(action)
                 done = [done] * self.n_agents if isinstance(done, bool) else done
 
-                last_hiddens = dict(hidden_actor =out[nms.HIDDEN_ACTOR],
+                last_hiddens = dict(hidden_actor=out[nms.HIDDEN_ACTOR],
                                     hidden_critic=out[nms.HIDDEN_CRITIC])
-
 
                 tm.add(observation=obs, action=action, reward=reward, done=done,
                        logits=out.get(nms.LOGITS, None), values=out.get(nms.CRITIC, None),
                        **last_hiddens)
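
The pattern in this hunk -- feed the previous step's hidden state back into the next forward pass and store it alongside the transition -- is what keeps the recurrent actor-critic stateful across a rollout. A minimal, self-contained sketch of that idea using a plain nn.GRU and a dummy buffer (not the project's tm memory API):

    import torch
    import torch.nn as nn

    gru = nn.GRU(input_size=4, hidden_size=8, batch_first=True)
    hidden = None                        # stands in for last_hiddens at episode start
    buffer = []                          # stands in for the rollout memory

    obs = torch.zeros(1, 1, 4)           # (batch, time, features)
    for _ in range(5):
        out, hidden = gru(obs, hidden)   # the previous hidden state is passed back in
        buffer.append({'obs': obs, 'hidden': hidden.detach()})
        obs = torch.randn(1, 1, 4)       # pretend env.step(...) returned a new observation
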
@@ -142,7 +142,9 @@ class BaseActorCritic:
                 print(f'reward at episode: {episode} = {rew_log}')
                 episode += 1
             df_results.append([episode, rew_log, *reward])
-        df_results = pd.DataFrame(df_results, columns=['steps', 'reward', *[f'agent#{i}' for i in range(self.n_agents)]])
+        df_results = pd.DataFrame(df_results,
+                                  columns=['steps', 'reward', *[f'agent#{i}' for i in range(self.n_agents)]]
+                                  )
         if checkpointer is not None:
             df_results.to_csv(checkpointer.path / 'results.csv', index=False)
         return df_results
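
For context, the rewrapped call still builds one row per logged episode: a step counter, the aggregate reward and one column per agent, optionally written next to the checkpoints. A tiny standalone sketch with made-up numbers (the file name and agent count are illustrative only):

    import pandas as pd

    n_agents = 2
    rows = [[100, 0.5, 0.2, 0.3],        # [steps, reward, agent#0, agent#1]
            [200, 1.0, 0.4, 0.6]]
    df = pd.DataFrame(rows, columns=['steps', 'reward', *[f'agent#{i}' for i in range(n_agents)]])
    df.to_csv('results.csv', index=False)
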
@@ -157,24 +159,27 @@ class BaseActorCritic:
             last_action, reward = [-1] * self.n_agents, [0.] * self.n_agents
             done, rew_log, eps_rew = [False] * self.n_agents, 0, torch.zeros(self.n_agents)
             while not all(done):
-                if render: env.render()
+                if render:
+                    env.render()
 
                 out = self.forward(obs, last_action, **last_hiddens)
                 action = self.get_actions(out)
                 next_obs, reward, done, info = env.step(action)
 
-                if isinstance(done, bool): done = [done] * obs.shape[0]
+                if isinstance(done, bool):
+                    done = [done] * obs.shape[0]
                 obs = next_obs
                 last_action = action
                 last_hiddens = dict(hidden_actor=out.get(nms.HIDDEN_ACTOR, None),
                                     hidden_critic=out.get(nms.HIDDEN_CRITIC, None)
                                     )
                 eps_rew += torch.tensor(reward)
-            results.append(eps_rew.tolist() + [sum(eps_rew).item()] + [episode])
+            results.append(eps_rew.tolist() + [np.sum(eps_rew).item()] + [episode])
             episode += 1
         agent_columns = [f'agent#{i}' for i in range(self.cfg['environment']['n_agents'])]
         results = pd.DataFrame(results, columns=agent_columns + ['sum', 'episode'])
-        results = pd.melt(results, id_vars=['episode'], value_vars=agent_columns + ['sum'], value_name='reward', var_name='agent')
+        results = pd.melt(results, id_vars=['episode'], value_vars=agent_columns + ['sum'],
+                          value_name='reward', var_name='agent')
         return results
 
     @staticmethod
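
The pd.melt call that was rewrapped here converts the wide per-agent reward table into long format, which is convenient for plotting libraries that expect one observation per row. A quick illustration on dummy data:

    import pandas as pd

    wide = pd.DataFrame({'episode': [0, 1],
                         'agent#0': [1.0, 2.0],
                         'agent#1': [0.5, 1.5],
                         'sum':     [1.5, 3.5]})
    long = pd.melt(wide, id_vars=['episode'], value_vars=['agent#0', 'agent#1', 'sum'],
                   value_name='reward', var_name='agent')
    print(long)
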
@@ -36,7 +36,7 @@ class LoopMAPPO(LoopSNAC):
         rewards_ = torch.stack(rewards_, dim=1)
         return rewards_
 
-    def mappo(self, batch, network, gamma, entropy_coef, vf_coef, clip_range, **kwargs):
+    def mappo(self, batch, network, gamma, entropy_coef, vf_coef, clip_range, **__):
         out = network(batch[nms.OBSERVATION], batch[nms.ACTION], batch[nms.HIDDEN_ACTOR], batch[nms.HIDDEN_CRITIC])
         logits = out[nms.LOGITS][:, :-1]  # last one only needed for v_{t+1}
 
@@ -45,7 +45,7 @@ class LoopMAPPO(LoopSNAC):
 
         # monte carlo returns
         mc_returns = self.monte_carlo_returns(batch[nms.REWARD], batch[nms.DONE], gamma)
-        mc_returns = (mc_returns - mc_returns.mean()) / (mc_returns.std() + 1e-8) #todo: norm across agent ok?
+        mc_returns = (mc_returns - mc_returns.mean()) / (mc_returns.std() + 1e-8)  # todo: norm across agent ok?
         advantages = mc_returns - out[nms.CRITIC][:, :-1]
 
         # policy loss
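
As the retained todo notes, the standardisation here is computed over all agents and timesteps jointly before the critic baseline is subtracted. A self-contained sketch of that step on dummy tensors (the shapes are assumptions, not the project's exact batch layout):

    import torch

    mc_returns = torch.randn(4, 10)      # e.g. (n_agents, t) Monte-Carlo returns
    values = torch.randn(4, 10)          # critic estimates for the same steps

    mc_returns = (mc_returns - mc_returns.mean()) / (mc_returns.std() + 1e-8)
    advantages = mc_returns - values
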
@@ -1,8 +1,7 @@
+import numpy as np
 import torch
 import torch.nn as nn
-import numpy as np
 import torch.nn.functional as F
 from torch.nn.utils import spectral_norm
-
 
 class RecurrentAC(nn.Module):
@@ -88,8 +87,8 @@ class NormalizedLinear(nn.Linear):
         self.trainable_magnitude = trainable_magnitude
         self.scale = nn.Parameter(torch.tensor([1.]), requires_grad=trainable_magnitude)
 
-    def forward(self, input):
-        normalized_input = F.normalize(input, dim=-1, p=2, eps=1e-5)
+    def forward(self, in_array):
+        normalized_input = F.normalize(in_array, dim=-1, p=2, eps=1e-5)
         normalized_weight = F.normalize(self.weight, dim=-1, p=2, eps=1e-5)
         return F.linear(normalized_input, normalized_weight) * self.d_sqrt * self.scale
 
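
The renamed forward L2-normalises both the input and the weight rows, so the layer effectively returns scaled cosine similarities. A self-contained sketch of that idea; the constructor, the d_sqrt initialisation and the CosineLinear name are assumptions for illustration, not the project's exact class:

    import math
    import torch
    import torch.nn as nn
    import torch.nn.functional as F

    class CosineLinear(nn.Linear):
        def __init__(self, in_features, out_features, trainable_magnitude=False):
            super().__init__(in_features, out_features, bias=False)
            self.d_sqrt = math.sqrt(in_features)
            self.scale = nn.Parameter(torch.tensor([1.]), requires_grad=trainable_magnitude)

        def forward(self, in_array):
            normalized_input = F.normalize(in_array, dim=-1, p=2, eps=1e-5)
            normalized_weight = F.normalize(self.weight, dim=-1, p=2, eps=1e-5)
            return F.linear(normalized_input, normalized_weight) * self.d_sqrt * self.scale

    layer = CosineLinear(16, 4)
    print(layer(torch.randn(2, 16)).shape)   # torch.Size([2, 4])

Normalising both operands keeps the pre-activation magnitude roughly independent of the raw weight scale, which is the usual motivation for this kind of layer.
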
@@ -16,7 +16,7 @@ class LoopSEAC(LoopIAC):
         with torch.inference_mode(True):
             true_action_logp = torch.stack([
                 torch.log_softmax(out[nms.LOGITS][ag_i, :-1], -1)
-                    .gather(index=actions[ag_i, 1:, None], dim=-1)
+                .gather(index=actions[ag_i, 1:, None], dim=-1)
                 for ag_i, out in enumerate(outputs)
             ], 0).squeeze()
 
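
The log_softmax(...).gather(...) chain evaluated under inference mode picks, for every timestep, the log-probability of the action that was actually taken. The same operation on dummy tensors:

    import torch

    logits = torch.randn(5, 3)                 # (t, n_actions)
    actions = torch.randint(0, 3, (5, 1))      # (t, 1) actions that were taken
    logp = torch.log_softmax(logits, -1).gather(index=actions, dim=-1)   # (t, 1)
    print(logp.squeeze(-1))
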
@@ -38,7 +38,6 @@ class LoopSEAC(LoopIAC):
 
         a2c_loss = (-iw*log_ap * advantages.detach()).mean(-1)
 
-
         value_loss = (iw*advantages.pow(2)).mean(-1)  # n_agent
 
         # weighted loss
@@ -53,4 +52,4 @@ class LoopSEAC(LoopIAC):
             self.optimizer[ag_i].zero_grad()
             loss.backward()
             torch.nn.utils.clip_grad_norm_(self.net[ag_i].parameters(), 0.5)
-            self.optimizer[ag_i].step()
+            self.optimizer[ag_i].step()
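
The four lines kept here follow the standard PyTorch update order: clear old gradients, backpropagate the loss, clip the gradient norm, then step the optimiser. A minimal sketch with a dummy model and optimiser (Adam, the learning rate and the shapes are placeholders, not the project's configuration):

    import torch

    model = torch.nn.Linear(4, 2)
    optimizer = torch.optim.Adam(model.parameters(), lr=3e-4)

    loss = model(torch.randn(8, 4)).pow(2).mean()
    optimizer.zero_grad()
    loss.backward()
    torch.nn.utils.clip_grad_norm_(model.parameters(), 0.5)
    optimizer.step()
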
@@ -30,4 +30,4 @@ class LoopSNAC(BaseActorCritic):
             self._as_torch(actions).unsqueeze(1),
             hidden_actor, hidden_critic
         )
-        return out
+        return out
@@ -56,8 +56,8 @@ class TSPBaseAgent(ABC):
 
     def _door_is_close(self, state):
         try:
-            # return next(y for x in self.state.tile.neighboring_floor for y in x.guests if do.DOOR in y.name)
-            return next(y for x in state.entities.neighboring_positions(self.state.pos) for y in state.entities.pos_dict[x] if do.DOOR in y.name)
+            return next(y for x in state.entities.neighboring_positions(self.state.pos)
+                        for y in state.entities.pos_dict[x] if do.DOOR in y.name)
         except StopIteration:
             return None
 
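
The rewrapped line relies on next() over a generator expression: it returns the first neighbouring entity whose name contains the door marker and raises StopIteration when there is none, which the method maps to None. The same pattern in isolation (first_or_none is a made-up helper, not part of the repository):

    def first_or_none(iterable, predicate):
        try:
            return next(x for x in iterable if predicate(x))
        except StopIteration:
            return None

    print(first_or_none(['Wall', 'Door', 'Agent'], lambda name: 'Door' in name))   # Door
    print(first_or_none(['Wall', 'Agent'], lambda name: 'Door' in name))           # None

An equivalent without the try/except is next(generator, None), which returns the default instead of raising.
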
@@ -14,8 +14,8 @@ class TSPTargetAgent(TSPBaseAgent):
     def _handle_doors(self, state):
 
         try:
-            # return next(y for x in self.state.tile.neighboring_floor for y in x.guests if do.DOOR in y.name)
-            return next(y for x in state.entities.neighboring_positions(self.state.pos) for y in state.entities.pos_dict[x] if do.DOOR in y.name)
+            return next(y for x in state.entities.neighboring_positions(self.state.pos)
+                        for y in state.entities.pos_dict[x] if do.DOOR in y.name)
         except StopIteration:
             return None
 
@@ -1,8 +1,9 @@
-import torch
-import numpy as np
-import yaml
 from pathlib import Path
 
+import numpy as np
+import torch
+import yaml
+
 
 def load_class(classname):
     from importlib import import_module
@@ -42,7 +43,6 @@ def get_class(arguments):
 
-
 def get_arguments(arguments):
     from importlib import import_module
     d = dict(arguments)
     if "classname" in d:
         del d["classname"]
@@ -82,4 +82,4 @@ class Checkpointer(object):
         for name, model in to_save:
             self.save_experiment(name, model)
         self.__current_checkpoint += 1
-        self.__current_step += 1
+        self.__current_step += 1