my update
Commit ea4582a59e (parent 6287380f60)
@@ -1,40 +0,0 @@
-from common import BaseLearner, TrajectoryBuffer
-
-
-class AWRLearner(BaseLearner):
-    def __init__(self, *args, buffer_size=1e5, **kwargs):
-        super(AWRLearner, self).__init__(*args, **kwargs)
-        assert self.train_every[0] == 'episode', 'AWR only supports the episodic RL setting!'
-        self.buffer = TrajectoryBuffer(buffer_size)
-
-    def train(self):
-        # convert to trajectory format
-        pass
-
-
-import numpy as np
-from matplotlib import pyplot as plt
-import pandas as pd
-import seaborn as sns
-
-sns.set(font_scale=1.25, rc={'text.usetex': True})
-data = np.array([[689, 74], [71, 647]])
-cats = ['Mask', 'No Mask']
-df = pd.DataFrame(data/np.sum(data), index=cats, columns=cats)
-
-group_counts = ['{0:0.0f}'.format(value) for value in
-                data.flatten()]
-group_percentages = [f'{value*100:.2f}' + r'$\%$' for value in
-                     data.flatten()/np.sum(data)]
-
-labels = [f'{v1}\n{v2}' for v1, v2 in
-          zip(group_counts,group_percentages)]
-labels = np.asarray(labels).reshape(2,2)
-
-with sns.axes_style("white"):
-    cmap = sns.diverging_palette(h_neg=100, h_pos=10, s=99, l=55, sep=3, as_cmap=True)
-    sns.heatmap(data, annot=labels, fmt='', cmap='Set2_r', square=True, cbar=False, xticklabels=cats,yticklabels=cats)
-    plt.title('Simple-CNN')
-    plt.ylabel('True label')
-    plt.xlabel('Predicted label')
-    plt.tight_layout()
-    plt.savefig('cnn.pdf', bbox_inches='tight')
@@ -2,9 +2,12 @@ from typing import NamedTuple, Union
 from collections import deque, OrderedDict, defaultdict
 import numpy as np
 import random
+
+import pandas as pd
 import torch
 import torch.nn as nn
+from tqdm import trange


 class Experience(NamedTuple):
     # can be used for a single (s_t, a, r, s_{t+1}) tuple
@@ -57,6 +60,9 @@ class BaseLearner:
     def train(self):
         pass

+    def reward(self, r):
+        return r
+
     def learn(self, n_steps):
         train_type, train_freq = self.train_every
         while self.step < n_steps:
@@ -70,7 +76,7 @@ class BaseLearner:
             next_obs, reward, done, info = self.env.step(action if not len(action) == 1 else action[0])

             experience = Experience(observation=obs, next_observation=next_obs,
-                                    action=action, reward=reward,
+                                    action=action, reward=self.reward(reward),
                                     done=done, episode=self.episode)  # do we really need to copy?
             self.on_new_experience(experience)
             # end of step routine
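Note: the rollout loop now routes every raw environment reward through the new `reward` hook (identity by default), so subclasses can shape or normalize rewards without touching `learn`. A minimal sketch of how a subclass might use it (the clipping and the class name are illustrative, not part of this commit):

    class ClippedRewardLearner(BaseLearner):
        # hypothetical subclass: clip raw rewards to [-1, 1] before they are stored
        def reward(self, r):
            return max(-1.0, min(1.0, r))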
@@ -90,7 +96,7 @@ class BaseLearner:
             self.running_reward.append(total_reward)
             self.episode += 1
             try:
-                if self.step % 10 == 0:
+                if self.step % 100 == 0:
                     print(
                         f'Step: {self.step} ({(self.step / n_steps) * 100:.2f}%)\tRunning reward: {sum(list(self.running_reward)) / len(self.running_reward):.2f}\t'
                         f' eps: {self.eps:.4f}\tRunning loss: {sum(list(self.running_loss)) / len(self.running_loss):.4f}\tUpdates:{self.n_updates}')
@@ -98,6 +104,21 @@ class BaseLearner:
                 pass
         self.on_all_done()

+    def evaluate(self, n_episodes=100, render=False):
+        with torch.no_grad():
+            data = []
+            for eval_i in trange(n_episodes):
+                obs, done = self.env.reset(), False
+                while not done:
+                    action = self.get_action(obs)
+                    next_obs, reward, done, info = self.env.step(action if not len(action) == 1 else action[0])
+                    if render: self.env.render()
+                    obs = next_obs  # advance the observation before the next step
+                    info.update({'reward': reward, 'eval_episode': eval_i})
+                    data.append(info)
+            return pd.DataFrame(data).fillna(0)
+

 class BaseBuffer:
     def __init__(self, size: int):
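Note: `evaluate` collects one `info` dict per environment step and returns them as a `pandas.DataFrame` with missing columns filled with 0. A hedged usage sketch, assuming a trained learner instance (`learner` is a placeholder name):

    eval_df = learner.evaluate(n_episodes=10, render=False)
    # per-episode return: sum the 'reward' column grouped by 'eval_episode'
    print(eval_df.groupby('eval_episode')['reward'].sum().describe())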
@@ -187,7 +208,7 @@ class BaseDDQN(BaseDQN):
 class BaseICM(nn.Module):
     def __init__(self, backbone_dims=[2*3*5*5, 64, 64], head_dims=[2*64, 64, 9]):
         super(BaseICM, self).__init__()
-        self.backbone = mlp_maker(backbone_dims, flatten=True)
+        self.backbone = mlp_maker(backbone_dims, flatten=True, activation_last='relu', activation='relu')
         self.icm = mlp_maker(head_dims)
         self.ce = nn.CrossEntropyLoss()

@@ -1,3 +1,4 @@
+import numpy as np
 import torch
 import torch.nn.functional as F
 from algorithms.q_learner import QLearner
@@ -53,19 +54,24 @@ class MQLearner(QLearner):
         self._backprop_loss(loss)

 from tqdm import trange
+from collections import deque
 class MQICMLearner(MQLearner):
     def __init__(self, *args, icm, **kwargs):
         super(MQICMLearner, self).__init__(*args, **kwargs)
         self.icm = icm
-        self.icm_optimizer = torch.optim.Adam(self.icm.parameters())
+        self.icm_optimizer = torch.optim.AdamW(self.icm.parameters())
+        self.normalize_reward = deque(maxlen=1000)

     def on_all_done(self):
-        for b in trange(50000):
+        from collections import deque
+        losses = deque(maxlen=100)
+        for b in trange(10000):
             batch = self.buffer.sample(128, 0)
             s0, s1, a = batch.observation, batch.next_observation, batch.action
             loss = self.icm(s0, s1, a.squeeze())['loss']
             self.icm_optimizer.zero_grad()
             loss.backward()
             self.icm_optimizer.step()
+            losses.append(loss.item())
             if b%100 == 0:
-                print(loss.item())
+                print(np.mean(losses))
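Note: the ICM pre-training loop now appends each loss to a bounded `deque` and prints the mean of that window every 100 batches, so the log shows a smoothed loss instead of a single noisy sample. A standalone sketch of the pattern (the loss values are made up):

    from collections import deque
    import numpy as np

    losses = deque(maxlen=100)          # only the 100 most recent values are kept
    for step, loss_value in enumerate([0.9, 0.7, 0.65, 0.6]):
        losses.append(loss_value)
        if step % 100 == 0:
            print(np.mean(losses))      # running mean over the window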
@@ -1,6 +1,7 @@
 from typing import Union
 import torch
 import numpy as np
+import pandas as pd
 from algorithms.q_learner import QLearner

@@ -37,4 +38,18 @@ class VDNLearner(QLearner):
             target_q_raw += next_q_values_raw
         target_q = experience.reward + (1 - experience.done) * self.gamma * target_q_raw
         loss = torch.mean(self.reg_weight * pred_q + torch.pow(pred_q - target_q, 2))
         self._backprop_loss(loss)
+
+    def evaluate(self, n_episodes=100, render=False):
+        with torch.no_grad():
+            data = []
+            for eval_i in range(n_episodes):
+                obs, done = self.env.reset(), False
+                while not done:
+                    action = self.get_action(obs)
+                    next_obs, reward, done, info = self.env.step(action)
+                    if render: self.env.render()
+                    obs = next_obs  # advance the observation before the next step
+                    info.update({'reward': reward, 'eval_episode': eval_i})
+                    data.append(info)
+            return pd.DataFrame(data).fillna(0)
@ -0,0 +1,27 @@
|
|||||||
|
def rooms(n_agents=1):
|
||||||
|
from environments.factory.factory_dirt_item import DirtItemFactory
|
||||||
|
from environments.factory.factory_item import ItemFactory, ItemProperties
|
||||||
|
from environments.factory.factory_dirt import DirtProperties, DirtFactory
|
||||||
|
from environments.utility_classes import MovementProperties, ObservationProperties, AgentRenderOptions
|
||||||
|
|
||||||
|
obs_props = ObservationProperties(render_agents=AgentRenderOptions.NOT,
|
||||||
|
omit_agent_self=True,
|
||||||
|
additional_agent_placeholder=None,
|
||||||
|
frames_to_stack=0,
|
||||||
|
pomdp_r=2
|
||||||
|
)
|
||||||
|
move_props = MovementProperties(allow_diagonal_movement=True,
|
||||||
|
allow_square_movement=True,
|
||||||
|
allow_no_op=False)
|
||||||
|
dirt_props = DirtProperties(initial_dirt_ratio=0.35, initial_dirt_spawn_r_var=0.1,
|
||||||
|
clean_amount=0.34,
|
||||||
|
max_spawn_amount=0.1, max_global_amount=20,
|
||||||
|
max_local_amount=1, spawn_frequency=0, max_spawn_ratio=0.05,
|
||||||
|
dirt_smear_amount=0.0, agent_can_interact=True)
|
||||||
|
factory_kwargs = dict(n_agents=n_agents, max_steps=400, parse_doors=True,
|
||||||
|
level_name='rooms', record_episodes=False, doors_have_area=False,
|
||||||
|
verbose=False,
|
||||||
|
mv_prop=move_props,
|
||||||
|
obs_prop=obs_props
|
||||||
|
)
|
||||||
|
return DirtFactory(dirt_props=dirt_props, **factory_kwargs)
|
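Note: `rooms` bundles the observation, movement and dirt settings into a ready-to-use `DirtFactory` on the 'rooms' level. A hedged usage sketch mirroring the updated studies script at the end of this commit (the action indices are illustrative):

    import random
    from gym.wrappers import FrameStack
    from environments.factory import rooms

    env = FrameStack(rooms(n_agents=1), num_stack=3)   # stack three observation frames
    state, *_ = env.reset()
    for _ in range(10):
        state, *_ = env.step([random.randint(0, 9)])   # one action index per agent
        env.render()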
@@ -1,57 +1,7 @@
-import random
-from pathlib import Path
-
-from environments.factory.factory_dirt import DirtFactory, DirtProperties
-from environments.factory.factory_item import ItemFactory, ItemProperties
-from environments.logging.recorder import RecorderCallback
-from environments.utility_classes import MovementProperties
-
+from environments.factory.factory_dirt import DirtFactory
+from environments.factory.factory_item import ItemFactory

 class DirtItemFactory(ItemFactory, DirtFactory):

     def __init__(self, *args, **kwargs):
         super().__init__(*args, **kwargs)
-
-
-if __name__ == '__main__':
-    with RecorderCallback(filepath=Path('debug_out') / f'recorder_xxxx.json', occupation_map=False,
-                          trajectory_map=False) as recorder:
-
-        dirt_props = DirtProperties(clean_amount=2, gain_amount=0.1, max_global_amount=20,
-                                    max_local_amount=1, spawn_frequency=3, max_spawn_ratio=0.05,
-                                    dirt_smear_amount=0.0, agent_can_interact=True)
-        item_props = ItemProperties(n_items=5, agent_can_interact=True)
-        move_props = MovementProperties(allow_diagonal_movement=True,
-                                        allow_square_movement=True,
-                                        allow_no_op=False)
-
-        render = True
-
-        factory = DirtItemFactory(n_agents=1, done_at_collision=False, frames_to_stack=0,
-                                  level_name='rooms', max_steps=200, combin_agent_obs=True,
-                                  omit_agent_in_obs=True, parse_doors=True, pomdp_r=3,
-                                  record_episodes=True, verbose=False, cast_shadows=True,
-                                  movement_properties=move_props, dirt_properties=dirt_props
-                                  )
-
-        # noinspection DuplicatedCode
-        n_actions = factory.action_space.n - 1
-        _ = factory.observation_space
-
-        for epoch in range(4):
-            random_actions = [[random.randint(0, n_actions) for _
-                               in range(factory.n_agents)] for _
-                              in range(factory.max_steps + 1)]
-            env_state = factory.reset()
-            r = 0
-            for agent_i_action in random_actions:
-                env_state, step_r, done_bool, info_obj = factory.step(agent_i_action)
-                # recorder.read_info(0, info_obj)
-                r += step_r
-                if render:
-                    factory.render()
-                if done_bool:
-                    # recorder.read_done(0, done_bool)
-                    break
-            print(f'Factory run {epoch} done, reward is:\n {r}')
-        pass
@@ -126,6 +126,6 @@ class Renderer:
 if __name__ == '__main__':
     renderer = Renderer(fps=2, cell_size=40)
     for i in range(15):
-        entity_1 = RenderEntity('agent', [5, i], 1, 'idle', 'idle')
+        entity_1 = RenderEntity('agent_collision', [5, i], 1, 'idle', 'idle')
         renderer.render([entity_1])

main.py (115 lines deleted)
@@ -1,115 +0,0 @@
-import warnings
-
-from pathlib import Path
-import time
-
-from stable_baselines3.common.callbacks import CallbackList
-from stable_baselines3.common.vec_env import SubprocVecEnv
-
-from environments.factory.factory_dirt_item import DirtItemFactory
-from environments.factory.factory_item import ItemFactory, ItemProperties
-from environments.factory.factory_dirt import DirtProperties, DirtFactory
-from environments.logging.monitor import MonitorCallback
-from environments.logging.recorder import RecorderCallback
-from environments.utility_classes import MovementProperties
-from plotting.compare_runs import compare_seed_runs, compare_model_runs
-
-warnings.filterwarnings('ignore', category=FutureWarning)
-warnings.filterwarnings('ignore', category=UserWarning)
-
-
-def make_env(env_kwargs_dict):
-
-    def _init():
-        with DirtFactory(**env_kwargs_dict) as init_env:
-            return init_env
-
-    return _init
-
-
-if __name__ == '__main__':
-
-    # combine_runs(Path('debug_out') / 'A2C_1630314192')
-    # exit()
-
-    # compare_runs(Path('debug_out'), 1623052687, ['step_reward'])
-    # exit()
-
-    from stable_baselines3 import PPO, DQN, A2C
-    # from algorithms.reg_dqn import RegDQN
-    # from sb3_contrib import QRDQN
-
-    dirt_props = DirtProperties(clean_amount=2, gain_amount=0.1, max_global_amount=20,
-                                max_local_amount=1, spawn_frequency=16, max_spawn_ratio=0.05,
-                                dirt_smear_amount=0.0, agent_can_interact=True)
-    item_props = ItemProperties(n_items=10, agent_can_interact=True,
-                                spawn_frequency=30, n_drop_off_locations=2,
-                                max_agent_inventory_capacity=15)
-    move_props = MovementProperties(allow_diagonal_movement=True,
-                                    allow_square_movement=True,
-                                    allow_no_op=False)
-    train_steps = 5e6
-    time_stamp = int(time.time())
-
-    out_path = None
-
-    for modeL_type in [A2C, PPO, DQN]: # ,RegDQN, QRDQN]:
-        for seed in range(3):
-            env_kwargs = dict(n_agents=1,
-                              # item_prop=item_props,
-                              dirt_properties=dirt_props,
-                              movement_properties=move_props,
-                              pomdp_r=2, max_steps=1000, parse_doors=False,
-                              level_name='rooms', frames_to_stack=4,
-                              omit_agent_in_obs=True, combin_agent_obs=True, record_episodes=False,
-                              cast_shadows=True, doors_have_area=False, env_seed=seed, verbose=False,
-                              )
-
-            if modeL_type.__name__ in ["PPO", "A2C"]:
-                kwargs = dict(ent_coef=0.01)
-                env = SubprocVecEnv([make_env(env_kwargs) for _ in range(10)], start_method="spawn")
-            elif modeL_type.__name__ in ["RegDQN", "DQN", "QRDQN"]:
-                env = make_env(env_kwargs)()
-                kwargs = dict(buffer_size=50000,
-                              learning_starts=64,
-                              batch_size=64,
-                              target_update_interval=5000,
-                              exploration_fraction=0.25,
-                              exploration_final_eps=0.025
-                              )
-            else:
-                raise NameError(f'The model "{modeL_type.__name__}" has the wrong name.')
-
-            model = modeL_type("MlpPolicy", env, verbose=1, seed=seed, device='cpu', **kwargs)
-
-            out_path = Path('debug_out') / f'{model.__class__.__name__}_{time_stamp}'
-
-            # identifier = f'{seed}_{model.__class__.__name__}_{time_stamp}'
-            identifier = f'{seed}_{model.__class__.__name__}_{time_stamp}'
-            out_path /= identifier
-
-            callbacks = CallbackList(
-                [MonitorCallback(filepath=out_path / f'monitor_{identifier}.pick'),
-                 RecorderCallback(filepath=out_path / f'recorder_{identifier}.json', occupation_map=False,
-                                  trajectory_map=False
-                                  )]
-            )
-
-            model.learn(total_timesteps=int(train_steps), callback=callbacks)
-
-            save_path = out_path / f'model_{identifier}.zip'
-            save_path.parent.mkdir(parents=True, exist_ok=True)
-            model.save(save_path)
-            param_path = out_path.parent / f'env_{model.__class__.__name__}_{time_stamp}.json'
-            try:
-                env.env_method('save_params', param_path)
-            except AttributeError:
-                env.save_params(param_path)
-            print("Model Trained and saved")
-        print("Model Group Done.. Plotting...")
-
-        if out_path:
-            compare_seed_runs(out_path.parent)
-    print("All Models Done... Evaluating")
-    if out_path:
-        compare_model_runs(Path('debug_out'), time_stamp, 'step_reward')
main_test.py (86 lines deleted)
@@ -1,86 +0,0 @@
-# foreign imports
-import warnings
-
-from pathlib import Path
-import yaml
-from gym.wrappers import FrameStack
-from natsort import natsorted
-
-from stable_baselines3.common.callbacks import CallbackList
-from stable_baselines3 import PPO, DQN, A2C
-
-# our imports
-from environments.factory.factory_dirt import DirtFactory, DirtProperties
-from environments.logging.monitor import MonitorCallback
-from algorithms.reg_dqn import RegDQN
-from main import compare_model_runs, compare_seed_runs
-
-warnings.filterwarnings('ignore', category=FutureWarning)
-warnings.filterwarnings('ignore', category=UserWarning)
-model_mapping = dict(A2C=A2C, PPO=PPO, DQN=DQN, RegDQN=RegDQN)
-
-
-if __name__ == '__main__':
-
-    # get n policies pi_1, ..., pi_n trained in single agent setting
-    # rewards = []
-    # repeat for x eval runs
-    # total reward = rollout game for y steps with n policies in multi-agent setting
-    # rewards += [total reward]
-    # boxplot total rewards
-
-    run_id = '1623923982'
-    model_name = 'A2C'
-
-    # -----------------------
-    out_path = Path(__file__).parent / 'debug_out'
-
-    # from sb3_contrib import QRDQN
-    model_path = out_path / f'{model_name}_{run_id}'
-    model_files = list(natsorted(model_path.rglob('model_*.zip')))
-    this_model = model_files[0]
-    render = True
-
-    model = model_mapping[model_name].load(this_model)
-
-    for seed in range(3):
-        with (model_path / f'env_{model_path.name}.yaml').open('r') as f:
-            env_kwargs = yaml.load(f, Loader=yaml.FullLoader)
-        dirt_props = DirtProperties(clean_amount=3, gain_amount=0.2, max_global_amount=30,
-                                    max_local_amount=3, spawn_frequency=1, max_spawn_ratio=0.05)
-        # env_kwargs.update(n_agents=1, dirt_prop=dirt_props)
-        env = DirtFactory(**env_kwargs)
-
-        env = FrameStack(env, 4)
-
-        exp_out_path = model_path / 'exp'
-        callbacks = CallbackList(
-            [MonitorCallback(filepath=exp_out_path / f'future_exp_name')]
-        )
-
-        n_actions = env.action_space.n
-
-        for epoch in range(100):
-            observations = env.reset()
-            if render:
-                if isinstance(env, FrameStack):
-                    env.env.render()
-                else:
-                    env.render()
-            done_bool = False
-            r = 0
-            while not done_bool:
-                if env.n_agents > 1:
-                    actions = [model.predict(obs, deterministic=False)[0] for obs in observations]
-                else:
-                    actions = model.predict(observations, deterministic=False)[0]
-
-                observations, r, done_bool, info_obj = env.step(actions)
-                if render:
-                    env.render()
-                if done_bool:
-                    break
-            print(f'Factory run {epoch} done, reward is:\n {r}')
-
-    if out_path:
-        compare_seed_runs(out_path.parent)
@@ -1,9 +1,11 @@
-import numpy as np
+from environments.factory import rooms
+import random
+from gym.wrappers import FrameStack

+env = rooms(n_agents=2)
+env = FrameStack(env, num_stack=3)
+state, *_ = env.reset()

-class SatMad(object):
-    def __init__(self):
-        pass
-
-if __name__ == '__main__':
-    pass
+for i in range(1000):
+    state, *_ = env.step([random.randint(0, 9), random.randint(0, 9)])
+    env.render()