my update

romue 2021-11-11 10:59:13 +01:00
parent 6287380f60
commit ea4582a59e
10 changed files with 88 additions and 308 deletions

View File

@@ -1,40 +0,0 @@
from common import BaseLearner, TrajectoryBuffer


class AWRLearner(BaseLearner):
    def __init__(self, *args, buffer_size=1e5, **kwargs):
        super(AWRLearner, self).__init__(*args, **kwargs)
        assert self.train_every[0] == 'episode', 'AWR only supports the episodic RL setting!'
        self.buffer = TrajectoryBuffer(buffer_size)

    def train(self):
        # convert to trajectory format
        pass


import numpy as np
from matplotlib import pyplot as plt
import pandas as pd
import seaborn as sns

sns.set(font_scale=1.25, rc={'text.usetex': True})
data = np.array([[689, 74], [71, 647]])
cats = ['Mask', 'No Mask']
df = pd.DataFrame(data/np.sum(data), index=cats, columns=cats)
group_counts = ['{0:0.0f}'.format(value) for value in
                data.flatten()]
group_percentages = [f'{value*100:.2f}' + r'$\%$' for value in
                     data.flatten()/np.sum(data)]
labels = [f'{v1}\n{v2}' for v1, v2 in
          zip(group_counts, group_percentages)]
labels = np.asarray(labels).reshape(2, 2)

with sns.axes_style("white"):
    cmap = sns.diverging_palette(h_neg=100, h_pos=10, s=99, l=55, sep=3, as_cmap=True)
    sns.heatmap(data, annot=labels, fmt='', cmap='Set2_r', square=True, cbar=False, xticklabels=cats, yticklabels=cats)
    plt.title('Simple-CNN')
    plt.ylabel('True label')
    plt.xlabel('Predicted label')
    plt.tight_layout()
    plt.savefig('cnn.pdf', bbox_inches='tight')

View File

@@ -2,9 +2,12 @@ from typing import NamedTuple, Union
from collections import deque, OrderedDict, defaultdict
import numpy as np
import random
import pandas as pd
import torch
import torch.nn as nn
from tqdm import trange
class Experience(NamedTuple):
    # can be used for a single (s_t, a, r, s_{t+1}) tuple
@@ -57,6 +60,9 @@ class BaseLearner:
    def train(self):
        pass

    def reward(self, r):
        return r

    def learn(self, n_steps):
        train_type, train_freq = self.train_every
        while self.step < n_steps:
@@ -70,7 +76,7 @@ class BaseLearner:
            next_obs, reward, done, info = self.env.step(action if not len(action) == 1 else action[0])
            experience = Experience(observation=obs, next_observation=next_obs,
                                    action=action, reward=reward,
                                    action=action, reward=self.reward(reward),
                                    done=done, episode=self.episode)  # do we really need to copy?
            self.on_new_experience(experience)
            # end of step routine
@@ -90,7 +96,7 @@ class BaseLearner:
            self.running_reward.append(total_reward)
            self.episode += 1
            try:
                if self.step % 10 == 0:
                if self.step % 100 == 0:
                    print(
                        f'Step: {self.step} ({(self.step / n_steps) * 100:.2f}%)\tRunning reward: {sum(list(self.running_reward)) / len(self.running_reward):.2f}\t'
                        f' eps: {self.eps:.4f}\tRunning loss: {sum(list(self.running_loss)) / len(self.running_loss):.4f}\tUpdates:{self.n_updates}')
@@ -98,6 +104,21 @@ class BaseLearner:
                pass
        self.on_all_done()

    def evaluate(self, n_episodes=100, render=False):
        with torch.no_grad():
            data = []
            for eval_i in trange(n_episodes):
                obs, done = self.env.reset(), False
                while not done:
                    action = self.get_action(obs)
                    next_obs, reward, done, info = self.env.step(action if not len(action) == 1 else action[0])
                    if render: self.env.render()
                    obs = next_obs  # advance to the next observation
                    info.update({'reward': reward, 'eval_episode': eval_i})
                    data.append(info)
        return pd.DataFrame(data).fillna(0)


class BaseBuffer:
    def __init__(self, size: int):
@@ -187,7 +208,7 @@ class BaseDDQN(BaseDQN):

class BaseICM(nn.Module):
    def __init__(self, backbone_dims=[2*3*5*5, 64, 64], head_dims=[2*64, 64, 9]):
        super(BaseICM, self).__init__()
        self.backbone = mlp_maker(backbone_dims, flatten=True)
        self.backbone = mlp_maker(backbone_dims, flatten=True, activation_last='relu', activation='relu')
        self.icm = mlp_maker(head_dims)
        self.ce = nn.CrossEntropyLoss()
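
Taken together, these hunks route every environment reward through the new reward() hook before it is stored in an Experience, and add an evaluate() helper that rolls out episodes under torch.no_grad() and returns a pandas DataFrame of per-step infos. A minimal sketch of a subclass using the hook for reward shaping (ClippedRewardLearner is hypothetical, not part of this commit):

class ClippedRewardLearner(BaseLearner):
    # The base learn() loop now calls self.reward(reward), so clipping
    # here changes what lands in the replay buffer.
    def reward(self, r):
        return max(min(r, 1.0), -1.0)

# eval_df = learner.evaluate(n_episodes=10)  # DataFrame of per-step infos, one row per step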

View File

@@ -1,3 +1,4 @@
import numpy as np
import torch
import torch.nn.functional as F
from algorithms.q_learner import QLearner
@@ -53,19 +54,24 @@ class MQLearner(QLearner):
        self._backprop_loss(loss)

from tqdm import trange
from collections import deque


class MQICMLearner(MQLearner):
    def __init__(self, *args, icm, **kwargs):
        super(MQICMLearner, self).__init__(*args, **kwargs)
        self.icm = icm
        self.icm_optimizer = torch.optim.Adam(self.icm.parameters())
        self.icm_optimizer = torch.optim.AdamW(self.icm.parameters())
        self.normalize_reward = deque(maxlen=1000)

    def on_all_done(self):
        for b in trange(50000):
        from collections import deque
        losses = deque(maxlen=100)
        for b in trange(10000):
            batch = self.buffer.sample(128, 0)
            s0, s1, a = batch.observation, batch.next_observation, batch.action
            loss = self.icm(s0, s1, a.squeeze())['loss']
            self.icm_optimizer.zero_grad()
            loss.backward()
            self.icm_optimizer.step()
            losses.append(loss.item())
            if b % 100 == 0:
                print(loss.item())
                print(np.mean(losses))
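
MQICMLearner now fits the ICM with AdamW in on_all_done() and logs a 100-batch running loss; the normalize_reward deque it allocates is not used in this hunk yet. A purely hypothetical sketch of how an ICM-derived bonus could be fed through the BaseLearner reward() hook and normalized with that deque (the bonus source and the scaling are assumptions, not part of this commit):

import numpy as np

class MQICMIntrinsicLearner(MQICMLearner):
    def reward(self, r):
        # `last_icm_error` is assumed to be filled elsewhere (e.g. in
        # on_new_experience) with the current ICM prediction error.
        bonus = float(getattr(self, 'last_icm_error', 0.0))
        self.normalize_reward.append(bonus)
        scale = np.std(self.normalize_reward) if len(self.normalize_reward) > 1 else 1.0
        return r + bonus / (scale + 1e-8)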

View File

@@ -1,6 +1,7 @@
from typing import Union
import torch
import numpy as np
import pandas as pd
from algorithms.q_learner import QLearner
@@ -38,3 +39,17 @@ class VDNLearner(QLearner):
            target_q = experience.reward + (1 - experience.done) * self.gamma * target_q_raw
        loss = torch.mean(self.reg_weight * pred_q + torch.pow(pred_q - target_q, 2))
        self._backprop_loss(loss)

    def evaluate(self, n_episodes=100, render=False):
        with torch.no_grad():
            data = []
            for eval_i in range(n_episodes):
                obs, done = self.env.reset(), False
                while not done:
                    action = self.get_action(obs)
                    next_obs, reward, done, info = self.env.step(action)
                    if render: self.env.render()
                    obs = next_obs  # advance to the next observation
                    info.update({'reward': reward, 'eval_episode': eval_i})
                    data.append(info)
        return pd.DataFrame(data).fillna(0)

View File

@@ -0,0 +1,27 @@
def rooms(n_agents=1):
    from environments.factory.factory_dirt_item import DirtItemFactory
    from environments.factory.factory_item import ItemFactory, ItemProperties
    from environments.factory.factory_dirt import DirtProperties, DirtFactory
    from environments.utility_classes import MovementProperties, ObservationProperties, AgentRenderOptions

    obs_props = ObservationProperties(render_agents=AgentRenderOptions.NOT,
                                      omit_agent_self=True,
                                      additional_agent_placeholder=None,
                                      frames_to_stack=0,
                                      pomdp_r=2
                                      )
    move_props = MovementProperties(allow_diagonal_movement=True,
                                    allow_square_movement=True,
                                    allow_no_op=False)
    dirt_props = DirtProperties(initial_dirt_ratio=0.35, initial_dirt_spawn_r_var=0.1,
                                clean_amount=0.34,
                                max_spawn_amount=0.1, max_global_amount=20,
                                max_local_amount=1, spawn_frequency=0, max_spawn_ratio=0.05,
                                dirt_smear_amount=0.0, agent_can_interact=True)
    factory_kwargs = dict(n_agents=n_agents, max_steps=400, parse_doors=True,
                          level_name='rooms', record_episodes=False, doors_have_area=False,
                          verbose=False,
                          mv_prop=move_props,
                          obs_prop=obs_props
                          )
    return DirtFactory(dirt_props=dirt_props, **factory_kwargs)
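
A quick way to exercise the new rooms() helper, condensed from the smoke test at the end of this commit (the 0-9 action range mirrors that test and is otherwise an assumption):

import random
from environments.factory import rooms

env = rooms(n_agents=2)
env.reset()
for _ in range(100):
    state, *_ = env.step([random.randint(0, 9), random.randint(0, 9)])
    env.render()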

View File

@@ -1,57 +1,7 @@
import random
from pathlib import Path

from environments.factory.factory_dirt import DirtFactory, DirtProperties
from environments.factory.factory_item import ItemFactory, ItemProperties
from environments.logging.recorder import RecorderCallback
from environments.utility_classes import MovementProperties
from environments.factory.factory_dirt import DirtFactory
from environments.factory.factory_item import ItemFactory


class DirtItemFactory(ItemFactory, DirtFactory):
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)


if __name__ == '__main__':
    with RecorderCallback(filepath=Path('debug_out') / f'recorder_xxxx.json', occupation_map=False,
                          trajectory_map=False) as recorder:
        dirt_props = DirtProperties(clean_amount=2, gain_amount=0.1, max_global_amount=20,
                                    max_local_amount=1, spawn_frequency=3, max_spawn_ratio=0.05,
                                    dirt_smear_amount=0.0, agent_can_interact=True)
        item_props = ItemProperties(n_items=5, agent_can_interact=True)
        move_props = MovementProperties(allow_diagonal_movement=True,
                                        allow_square_movement=True,
                                        allow_no_op=False)
        render = True
        factory = DirtItemFactory(n_agents=1, done_at_collision=False, frames_to_stack=0,
                                  level_name='rooms', max_steps=200, combin_agent_obs=True,
                                  omit_agent_in_obs=True, parse_doors=True, pomdp_r=3,
                                  record_episodes=True, verbose=False, cast_shadows=True,
                                  movement_properties=move_props, dirt_properties=dirt_props
                                  )
        # noinspection DuplicatedCode
        n_actions = factory.action_space.n - 1
        _ = factory.observation_space
        for epoch in range(4):
            random_actions = [[random.randint(0, n_actions) for _
                               in range(factory.n_agents)] for _
                              in range(factory.max_steps + 1)]
            env_state = factory.reset()
            r = 0
            for agent_i_action in random_actions:
                env_state, step_r, done_bool, info_obj = factory.step(agent_i_action)
                # recorder.read_info(0, info_obj)
                r += step_r
                if render:
                    factory.render()
                if done_bool:
                    # recorder.read_done(0, done_bool)
                    break
            print(f'Factory run {epoch} done, reward is:\n {r}')
        pass

View File

@@ -126,6 +126,6 @@ class Renderer:
if __name__ == '__main__':
    renderer = Renderer(fps=2, cell_size=40)
    for i in range(15):
        entity_1 = RenderEntity('agent', [5, i], 1, 'idle', 'idle')
        entity_1 = RenderEntity('agent_collision', [5, i], 1, 'idle', 'idle')
        renderer.render([entity_1])

main.py
View File

@ -1,115 +0,0 @@
import warnings
from pathlib import Path
import time
from stable_baselines3.common.callbacks import CallbackList
from stable_baselines3.common.vec_env import SubprocVecEnv
from environments.factory.factory_dirt_item import DirtItemFactory
from environments.factory.factory_item import ItemFactory, ItemProperties
from environments.factory.factory_dirt import DirtProperties, DirtFactory
from environments.logging.monitor import MonitorCallback
from environments.logging.recorder import RecorderCallback
from environments.utility_classes import MovementProperties
from plotting.compare_runs import compare_seed_runs, compare_model_runs
warnings.filterwarnings('ignore', category=FutureWarning)
warnings.filterwarnings('ignore', category=UserWarning)
def make_env(env_kwargs_dict):
def _init():
with DirtFactory(**env_kwargs_dict) as init_env:
return init_env
return _init
if __name__ == '__main__':
# combine_runs(Path('debug_out') / 'A2C_1630314192')
# exit()
# compare_runs(Path('debug_out'), 1623052687, ['step_reward'])
# exit()
from stable_baselines3 import PPO, DQN, A2C
# from algorithms.reg_dqn import RegDQN
# from sb3_contrib import QRDQN
dirt_props = DirtProperties(clean_amount=2, gain_amount=0.1, max_global_amount=20,
max_local_amount=1, spawn_frequency=16, max_spawn_ratio=0.05,
dirt_smear_amount=0.0, agent_can_interact=True)
item_props = ItemProperties(n_items=10, agent_can_interact=True,
spawn_frequency=30, n_drop_off_locations=2,
max_agent_inventory_capacity=15)
move_props = MovementProperties(allow_diagonal_movement=True,
allow_square_movement=True,
allow_no_op=False)
train_steps = 5e6
time_stamp = int(time.time())
out_path = None
for modeL_type in [A2C, PPO, DQN]: # ,RegDQN, QRDQN]:
for seed in range(3):
env_kwargs = dict(n_agents=1,
# item_prop=item_props,
dirt_properties=dirt_props,
movement_properties=move_props,
pomdp_r=2, max_steps=1000, parse_doors=False,
level_name='rooms', frames_to_stack=4,
omit_agent_in_obs=True, combin_agent_obs=True, record_episodes=False,
cast_shadows=True, doors_have_area=False, env_seed=seed, verbose=False,
)
if modeL_type.__name__ in ["PPO", "A2C"]:
kwargs = dict(ent_coef=0.01)
env = SubprocVecEnv([make_env(env_kwargs) for _ in range(10)], start_method="spawn")
elif modeL_type.__name__ in ["RegDQN", "DQN", "QRDQN"]:
env = make_env(env_kwargs)()
kwargs = dict(buffer_size=50000,
learning_starts=64,
batch_size=64,
target_update_interval=5000,
exploration_fraction=0.25,
exploration_final_eps=0.025
)
else:
raise NameError(f'The model "{modeL_type.__name__}" has the wrong name.')
model = modeL_type("MlpPolicy", env, verbose=1, seed=seed, device='cpu', **kwargs)
out_path = Path('debug_out') / f'{model.__class__.__name__}_{time_stamp}'
# identifier = f'{seed}_{model.__class__.__name__}_{time_stamp}'
identifier = f'{seed}_{model.__class__.__name__}_{time_stamp}'
out_path /= identifier
callbacks = CallbackList(
[MonitorCallback(filepath=out_path / f'monitor_{identifier}.pick'),
RecorderCallback(filepath=out_path / f'recorder_{identifier}.json', occupation_map=False,
trajectory_map=False
)]
)
model.learn(total_timesteps=int(train_steps), callback=callbacks)
save_path = out_path / f'model_{identifier}.zip'
save_path.parent.mkdir(parents=True, exist_ok=True)
model.save(save_path)
param_path = out_path.parent / f'env_{model.__class__.__name__}_{time_stamp}.json'
try:
env.env_method('save_params', param_path)
except AttributeError:
env.save_params(param_path)
print("Model Trained and saved")
print("Model Group Done.. Plotting...")
if out_path:
compare_seed_runs(out_path.parent)
print("All Models Done... Evaluating")
if out_path:
compare_model_runs(Path('debug_out'), time_stamp, 'step_reward')

View File

@@ -1,86 +0,0 @@
# foreign imports
import warnings
from pathlib import Path

import yaml
from gym.wrappers import FrameStack
from natsort import natsorted
from stable_baselines3.common.callbacks import CallbackList
from stable_baselines3 import PPO, DQN, A2C

# our imports
from environments.factory.factory_dirt import DirtFactory, DirtProperties
from environments.logging.monitor import MonitorCallback
from algorithms.reg_dqn import RegDQN
from main import compare_model_runs, compare_seed_runs

warnings.filterwarnings('ignore', category=FutureWarning)
warnings.filterwarnings('ignore', category=UserWarning)

model_mapping = dict(A2C=A2C, PPO=PPO, DQN=DQN, RegDQN=RegDQN)


if __name__ == '__main__':
    # get n policies pi_1, ..., pi_n trained in single agent setting
    # rewards = []
    # repeat for x eval runs
    #   total reward = rollout game for y steps with n policies in multi-agent setting
    #   rewards += [total reward]
    # boxplot total rewards

    run_id = '1623923982'
    model_name = 'A2C'
    # -----------------------
    out_path = Path(__file__).parent / 'debug_out'
    # from sb3_contrib import QRDQN
    model_path = out_path / f'{model_name}_{run_id}'
    model_files = list(natsorted(model_path.rglob('model_*.zip')))
    this_model = model_files[0]
    render = True

    model = model_mapping[model_name].load(this_model)

    for seed in range(3):
        with (model_path / f'env_{model_path.name}.yaml').open('r') as f:
            env_kwargs = yaml.load(f, Loader=yaml.FullLoader)
        dirt_props = DirtProperties(clean_amount=3, gain_amount=0.2, max_global_amount=30,
                                    max_local_amount=3, spawn_frequency=1, max_spawn_ratio=0.05)
        # env_kwargs.update(n_agents=1, dirt_prop=dirt_props)
        env = DirtFactory(**env_kwargs)
        env = FrameStack(env, 4)

        exp_out_path = model_path / 'exp'
        callbacks = CallbackList(
            [MonitorCallback(filepath=exp_out_path / f'future_exp_name')]
        )

        n_actions = env.action_space.n

        for epoch in range(100):
            observations = env.reset()
            if render:
                if isinstance(env, FrameStack):
                    env.env.render()
                else:
                    env.render()
            done_bool = False
            r = 0
            while not done_bool:
                if env.n_agents > 1:
                    actions = [model.predict(obs, deterministic=False)[0] for obs in observations]
                else:
                    actions = model.predict(observations, deterministic=False)[0]
                observations, r, done_bool, info_obj = env.step(actions)
                if render:
                    env.render()
                if done_bool:
                    break
            print(f'Factory run {epoch} done, reward is:\n {r}')
    if out_path:
        compare_seed_runs(out_path.parent)

View File

@@ -1,9 +1,11 @@
import numpy as np
from environments.factory import rooms
import random
from gym.wrappers import FrameStack

env = rooms(n_agents=2)
env = FrameStack(env, num_stack=3)
state, *_ = env.reset()


class SatMad(object):
    def __init__(self):
        pass


if __name__ == '__main__':
    pass
    for i in range(1000):
        state, *_ = env.step([random.randint(0, 9), random.randint(0, 9)])
        env.render()