diff --git a/algorithms/awr_learner.py b/algorithms/awr_learner.py
deleted file mode 100644
index c825ec6..0000000
--- a/algorithms/awr_learner.py
+++ /dev/null
@@ -1,40 +0,0 @@
-from common import BaseLearner, TrajectoryBuffer
-
-
-class AWRLearner(BaseLearner):
-    def __init__(self, *args, buffer_size=1e5, **kwargs):
-        super(AWRLearner, self).__init__(*args, **kwargs)
-        assert self.train_every[0] == 'episode', 'AWR only supports the episodic RL setting!'
-        self.buffer = TrajectoryBuffer(buffer_size)
-
-    def train(self):
-        # convert to trajectory format
-        pass
-
-import numpy as np
-from matplotlib import pyplot as plt
-import pandas as pd
-import seaborn as sns
-
-sns.set(font_scale=1.25, rc={'text.usetex': True})
-data = np.array([[689, 74], [71, 647]])
-cats = ['Mask', 'No Mask']
-df = pd.DataFrame(data/np.sum(data), index=cats, columns=cats)
-
-group_counts = ['{0:0.0f}'.format(value) for value in
-                data.flatten()]
-group_percentages = [f'{value*100:.2f}' + r'$\%$' for value in
-                     data.flatten()/np.sum(data)]
-
-labels = [f'{v1}\n{v2}' for v1, v2 in
-          zip(group_counts,group_percentages)]
-labels = np.asarray(labels).reshape(2,2)
-
-with sns.axes_style("white"):
-    cmap = sns.diverging_palette(h_neg=100, h_pos=10, s=99, l=55, sep=3, as_cmap=True)
-    sns.heatmap(data, annot=labels, fmt='', cmap='Set2_r', square=True, cbar=False, xticklabels=cats,yticklabels=cats)
-plt.title('Simple-CNN')
-plt.ylabel('True label')
-plt.xlabel('Predicted label')
-plt.tight_layout()
-plt.savefig('cnn.pdf', bbox_inches='tight')
\ No newline at end of file
diff --git a/algorithms/common.py b/algorithms/common.py
index 876f689..3ebb1d2 100644
--- a/algorithms/common.py
+++ b/algorithms/common.py
@@ -2,9 +2,12 @@ from typing import NamedTuple, Union
 from collections import deque, OrderedDict, defaultdict
 import numpy as np
 import random
+
+import pandas as pd
 import torch
 import torch.nn as nn
+from tqdm import trange
 
 
 class Experience(NamedTuple):
     # can be use for a single (s_t, a, r s_{t+1}) tuple
@@ -57,6 +60,9 @@ class BaseLearner:
     def train(self):
         pass
 
+    def reward(self, r):
+        return r
+
     def learn(self, n_steps):
         train_type, train_freq = self.train_every
         while self.step < n_steps:
@@ -70,7 +76,7 @@ class BaseLearner:
                 next_obs, reward, done, info = self.env.step(action if not len(action) == 1 else action[0])
 
                 experience = Experience(observation=obs, next_observation=next_obs,
-                                        action=action, reward=reward,
+                                        action=action, reward=self.reward(reward),
                                         done=done, episode=self.episode)  # do we really need to copy?
                 self.on_new_experience(experience)
                 # end of step routine
@@ -90,7 +96,7 @@ class BaseLearner:
             self.running_reward.append(total_reward)
             self.episode += 1
             try:
-                if self.step % 10 == 0:
+                if self.step % 100 == 0:
                     print(
                         f'Step: {self.step} ({(self.step / n_steps) * 100:.2f}%)\tRunning reward: {sum(list(self.running_reward)) / len(self.running_reward):.2f}\t'
                         f' eps: {self.eps:.4f}\tRunning loss: {sum(list(self.running_loss)) / len(self.running_loss):.4f}\tUpdates:{self.n_updates}')
@@ -98,6 +104,21 @@
                 pass
         self.on_all_done()
 
+    def evaluate(self, n_episodes=100, render=False):
+        with torch.no_grad():
+            data = []
+            for eval_i in trange(n_episodes):
+                obs, done = self.env.reset(), False
+                while not done:
+                    action = self.get_action(obs)
+                    next_obs, reward, done, info = self.env.step(action if not len(action) == 1 else action[0])
+                    if render: self.env.render()
+                    obs = next_obs  # advance to the next observation
+                    info.update({'reward': reward, 'eval_episode': eval_i})
+                    data.append(info)
+            return pd.DataFrame(data).fillna(0)
+
+
 class BaseBuffer:
 
     def __init__(self, size: int):
@@ -187,7 +208,7 @@ class BaseDDQN(BaseDQN):
 class BaseICM(nn.Module):
     def __init__(self, backbone_dims=[2*3*5*5, 64, 64], head_dims=[2*64, 64, 9]):
         super(BaseICM, self).__init__()
-        self.backbone = mlp_maker(backbone_dims, flatten=True)
+        self.backbone = mlp_maker(backbone_dims, flatten=True, activation_last='relu', activation='relu')
         self.icm = mlp_maker(head_dims)
         self.ce = nn.CrossEntropyLoss()
 
diff --git a/algorithms/m_q_learner.py b/algorithms/m_q_learner.py
index e4f85eb..bd57597 100644
--- a/algorithms/m_q_learner.py
+++ b/algorithms/m_q_learner.py
@@ -1,3 +1,4 @@
+import numpy as np
 import torch
 import torch.nn.functional as F
 from algorithms.q_learner import QLearner
@@ -53,19 +54,24 @@ class MQLearner(QLearner):
         self._backprop_loss(loss)
 
 from tqdm import trange
+from collections import deque
 
 
 class MQICMLearner(MQLearner):
     def __init__(self, *args, icm, **kwargs):
         super(MQICMLearner, self).__init__(*args, **kwargs)
         self.icm = icm
-        self.icm_optimizer = torch.optim.Adam(self.icm.parameters())
+        self.icm_optimizer = torch.optim.AdamW(self.icm.parameters())
+        self.normalize_reward = deque(maxlen=1000)
 
     def on_all_done(self):
-        for b in trange(50000):
+        from collections import deque
+        losses = deque(maxlen=100)
+        for b in trange(10000):
             batch = self.buffer.sample(128, 0)
             s0, s1, a = batch.observation, batch.next_observation, batch.action
             loss = self.icm(s0, s1, a.squeeze())['loss']
             self.icm_optimizer.zero_grad()
             loss.backward()
             self.icm_optimizer.step()
+            losses.append(loss.item())
             if b%100 == 0:
-                print(loss.item())
+                print(np.mean(losses))
diff --git a/algorithms/vdn_learner.py b/algorithms/vdn_learner.py
index 504adb0..d2e7067 100644
--- a/algorithms/vdn_learner.py
+++ b/algorithms/vdn_learner.py
@@ -1,6 +1,7 @@
 from typing import Union
 import torch
 import numpy as np
+import pandas as pd
 from algorithms.q_learner import QLearner
 
 
@@ -37,4 +38,18 @@ class VDNLearner(QLearner):
             target_q_raw += next_q_values_raw
         target_q = experience.reward + (1 - experience.done) * self.gamma * target_q_raw
         loss = torch.mean(self.reg_weight * pred_q + torch.pow(pred_q - target_q, 2))
-        self._backprop_loss(loss)
\ No newline at end of file
+        self._backprop_loss(loss)
+
+    def evaluate(self, n_episodes=100, render=False):
+        with torch.no_grad():
+            data = []
+            for eval_i in range(n_episodes):
+                obs, done = self.env.reset(), False
+                while not done:
+                    action = self.get_action(obs)
+                    next_obs, reward, done, info = self.env.step(action)
+                    if render: self.env.render()
+                    obs = next_obs  # advance to the next observation
+                    info.update({'reward': reward, 'eval_episode': eval_i})
+                    data.append(info)
+            return pd.DataFrame(data).fillna(0)
diff --git a/environments/factory/__init__.py b/environments/factory/__init__.py
index e69de29..c73b2d7 100644
--- a/environments/factory/__init__.py
+++ b/environments/factory/__init__.py
@@ -0,0 +1,27 @@
+def rooms(n_agents=1):
+    from environments.factory.factory_dirt_item import DirtItemFactory
+    from environments.factory.factory_item import ItemFactory, ItemProperties
+    from environments.factory.factory_dirt import DirtProperties, DirtFactory
+    from environments.utility_classes import MovementProperties, ObservationProperties, AgentRenderOptions
+
+    obs_props = ObservationProperties(render_agents=AgentRenderOptions.NOT,
+                                      omit_agent_self=True,
+                                      additional_agent_placeholder=None,
+                                      frames_to_stack=0,
+                                      pomdp_r=2
+                                      )
+    move_props = MovementProperties(allow_diagonal_movement=True,
+                                    allow_square_movement=True,
+                                    allow_no_op=False)
+    dirt_props = DirtProperties(initial_dirt_ratio=0.35, initial_dirt_spawn_r_var=0.1,
+                                clean_amount=0.34,
+                                max_spawn_amount=0.1, max_global_amount=20,
+                                max_local_amount=1, spawn_frequency=0, max_spawn_ratio=0.05,
+                                dirt_smear_amount=0.0, agent_can_interact=True)
+    factory_kwargs = dict(n_agents=n_agents, max_steps=400, parse_doors=True,
+                          level_name='rooms', record_episodes=False, doors_have_area=False,
+                          verbose=False,
+                          mv_prop=move_props,
+                          obs_prop=obs_props
+                          )
+    return DirtFactory(dirt_props=dirt_props, **factory_kwargs)
diff --git a/environments/factory/factory_dirt_item.py b/environments/factory/factory_dirt_item.py
index 04752ea..895cfe2 100644
--- a/environments/factory/factory_dirt_item.py
+++ b/environments/factory/factory_dirt_item.py
@@ -1,57 +1,7 @@
-import random
-from pathlib import Path
-
-from environments.factory.factory_dirt import DirtFactory, DirtProperties
-from environments.factory.factory_item import ItemFactory, ItemProperties
-from environments.logging.recorder import RecorderCallback
-from environments.utility_classes import MovementProperties
+from environments.factory.factory_dirt import DirtFactory
+from environments.factory.factory_item import ItemFactory
 
 
 class DirtItemFactory(ItemFactory, DirtFactory):
-
     def __init__(self, *args, **kwargs):
         super().__init__(*args, **kwargs)
-
-
-if __name__ == '__main__':
-    with RecorderCallback(filepath=Path('debug_out') / f'recorder_xxxx.json', occupation_map=False,
-                          trajectory_map=False) as recorder:
-
-        dirt_props = DirtProperties(clean_amount=2, gain_amount=0.1, max_global_amount=20,
-                                    max_local_amount=1, spawn_frequency=3, max_spawn_ratio=0.05,
-                                    dirt_smear_amount=0.0, agent_can_interact=True)
-        item_props = ItemProperties(n_items=5, agent_can_interact=True)
-        move_props = MovementProperties(allow_diagonal_movement=True,
-                                        allow_square_movement=True,
-                                        allow_no_op=False)
-
-        render = True
-
-        factory = DirtItemFactory(n_agents=1, done_at_collision=False, frames_to_stack=0,
-                                  level_name='rooms', max_steps=200, combin_agent_obs=True,
-                                  omit_agent_in_obs=True, parse_doors=True, pomdp_r=3,
-                                  record_episodes=True, verbose=False, cast_shadows=True,
-                                  movement_properties=move_props, dirt_properties=dirt_props
-                                  )
-
-        # noinspection DuplicatedCode
-        n_actions = factory.action_space.n - 1
-        _ = factory.observation_space
-
-        for epoch in range(4):
-            random_actions = [[random.randint(0, n_actions) for _
-                               in range(factory.n_agents)] for _
-                              in range(factory.max_steps + 1)]
-            env_state = factory.reset()
-            r = 0
-            for agent_i_action in random_actions:
-                env_state, step_r, done_bool, info_obj = factory.step(agent_i_action)
-                # recorder.read_info(0, info_obj)
-                r += step_r
-                if render:
-                    factory.render()
-                if done_bool:
-                    # recorder.read_done(0, done_bool)
-                    break
-            print(f'Factory run {epoch} done, reward is:\n {r}')
-    pass
diff --git a/environments/factory/renderer.py b/environments/factory/renderer.py
index a4ca734..e8f4297 100644
--- a/environments/factory/renderer.py
+++ b/environments/factory/renderer.py
@@ -126,6 +126,6 @@ class Renderer:
 if __name__ == '__main__':
     renderer = Renderer(fps=2, cell_size=40)
     for i in range(15):
-        entity_1 = RenderEntity('agent', [5, i], 1, 'idle', 'idle')
+        entity_1 = RenderEntity('agent_collision', [5, i], 1, 'idle', 'idle')
         renderer.render([entity_1])
 
diff --git a/main.py b/main.py
deleted file mode 100644
index 6c96296..0000000
--- a/main.py
+++ /dev/null
@@ -1,115 +0,0 @@
-import warnings
-
-from pathlib import Path
-import time
-
-from stable_baselines3.common.callbacks import CallbackList
-from stable_baselines3.common.vec_env import SubprocVecEnv
-
-from environments.factory.factory_dirt_item import DirtItemFactory
-from environments.factory.factory_item import ItemFactory, ItemProperties
-from environments.factory.factory_dirt import DirtProperties, DirtFactory
-from environments.logging.monitor import MonitorCallback
-from environments.logging.recorder import RecorderCallback
-from environments.utility_classes import MovementProperties
-from plotting.compare_runs import compare_seed_runs, compare_model_runs
-
-warnings.filterwarnings('ignore', category=FutureWarning)
-warnings.filterwarnings('ignore', category=UserWarning)
-
-
-def make_env(env_kwargs_dict):
-
-    def _init():
-        with DirtFactory(**env_kwargs_dict) as init_env:
-            return init_env
-
-    return _init
-
-
-if __name__ == '__main__':
-
-    # combine_runs(Path('debug_out') / 'A2C_1630314192')
-    # exit()
-
-    # compare_runs(Path('debug_out'), 1623052687, ['step_reward'])
-    # exit()
-
-    from stable_baselines3 import PPO, DQN, A2C
-    # from algorithms.reg_dqn import RegDQN
-    # from sb3_contrib import QRDQN
-
-    dirt_props = DirtProperties(clean_amount=2, gain_amount=0.1, max_global_amount=20,
-                                max_local_amount=1, spawn_frequency=16, max_spawn_ratio=0.05,
-                                dirt_smear_amount=0.0, agent_can_interact=True)
-    item_props = ItemProperties(n_items=10, agent_can_interact=True,
-                                spawn_frequency=30, n_drop_off_locations=2,
-                                max_agent_inventory_capacity=15)
-    move_props = MovementProperties(allow_diagonal_movement=True,
-                                    allow_square_movement=True,
-                                    allow_no_op=False)
-    train_steps = 5e6
-    time_stamp = int(time.time())
-
-    out_path = None
-
-    for modeL_type in [A2C, PPO, DQN]:  # ,RegDQN, QRDQN]:
-        for seed in range(3):
-            env_kwargs = dict(n_agents=1,
-                              # item_prop=item_props,
-                              dirt_properties=dirt_props,
-                              movement_properties=move_props,
-                              pomdp_r=2, max_steps=1000, parse_doors=False,
-                              level_name='rooms', frames_to_stack=4,
-                              omit_agent_in_obs=True, combin_agent_obs=True, record_episodes=False,
-                              cast_shadows=True, doors_have_area=False, env_seed=seed, verbose=False,
-                              )
-
-            if modeL_type.__name__ in ["PPO", "A2C"]:
-                kwargs = dict(ent_coef=0.01)
-                env = SubprocVecEnv([make_env(env_kwargs) for _ in range(10)], start_method="spawn")
-            elif modeL_type.__name__ in ["RegDQN", "DQN", "QRDQN"]:
-                env = make_env(env_kwargs)()
-                kwargs = dict(buffer_size=50000,
-                              learning_starts=64,
-                              batch_size=64,
-                              target_update_interval=5000,
-                              exploration_fraction=0.25,
-                              exploration_final_eps=0.025
-                              )
-            else:
-                raise NameError(f'The model "{modeL_type.__name__}" has the wrong name.')
-
-            model = modeL_type("MlpPolicy", env, verbose=1, seed=seed, device='cpu', **kwargs)
-
-            out_path = Path('debug_out') / f'{model.__class__.__name__}_{time_stamp}'
-
-            # identifier = f'{seed}_{model.__class__.__name__}_{time_stamp}'
-            identifier = f'{seed}_{model.__class__.__name__}_{time_stamp}'
-            out_path /= identifier
-
-            callbacks = CallbackList(
-                [MonitorCallback(filepath=out_path / f'monitor_{identifier}.pick'),
-                 RecorderCallback(filepath=out_path / f'recorder_{identifier}.json', occupation_map=False,
-                                  trajectory_map=False
-                                  )]
-            )
-
-            model.learn(total_timesteps=int(train_steps), callback=callbacks)
-
-            save_path = out_path / f'model_{identifier}.zip'
-            save_path.parent.mkdir(parents=True, exist_ok=True)
-            model.save(save_path)
-            param_path = out_path.parent / f'env_{model.__class__.__name__}_{time_stamp}.json'
-            try:
-                env.env_method('save_params', param_path)
-            except AttributeError:
-                env.save_params(param_path)
-            print("Model Trained and saved")
-        print("Model Group Done.. Plotting...")
-
-        if out_path:
-            compare_seed_runs(out_path.parent)
-    print("All Models Done... Evaluating")
-    if out_path:
-        compare_model_runs(Path('debug_out'), time_stamp, 'step_reward')
diff --git a/main_test.py b/main_test.py
deleted file mode 100644
index 2834288..0000000
--- a/main_test.py
+++ /dev/null
@@ -1,86 +0,0 @@
-# foreign imports
-import warnings
-
-from pathlib import Path
-import yaml
-from gym.wrappers import FrameStack
-from natsort import natsorted
-
-from stable_baselines3.common.callbacks import CallbackList
-from stable_baselines3 import PPO, DQN, A2C
-
-# our imports
-from environments.factory.factory_dirt import DirtFactory, DirtProperties
-from environments.logging.monitor import MonitorCallback
-from algorithms.reg_dqn import RegDQN
-from main import compare_model_runs, compare_seed_runs
-
-warnings.filterwarnings('ignore', category=FutureWarning)
-warnings.filterwarnings('ignore', category=UserWarning)
-model_mapping = dict(A2C=A2C, PPO=PPO, DQN=DQN, RegDQN=RegDQN)
-
-
-if __name__ == '__main__':
-
-    # get n policies pi_1, ..., pi_n trained in single agent setting
-    # rewards = []
-    # repeat for x eval runs
-    #     total reward = rollout game for y steps with n policies in multi-agent setting
-    #     rewards += [total reward]
-    # boxplot total rewards
-
-    run_id = '1623923982'
-    model_name = 'A2C'
-
-    # -----------------------
-    out_path = Path(__file__).parent / 'debug_out'
-
-    # from sb3_contrib import QRDQN
-    model_path = out_path / f'{model_name}_{run_id}'
-    model_files = list(natsorted(model_path.rglob('model_*.zip')))
-    this_model = model_files[0]
-    render = True
-
-    model = model_mapping[model_name].load(this_model)
-
-    for seed in range(3):
-        with (model_path / f'env_{model_path.name}.yaml').open('r') as f:
-            env_kwargs = yaml.load(f, Loader=yaml.FullLoader)
-        dirt_props = DirtProperties(clean_amount=3, gain_amount=0.2, max_global_amount=30,
-                                    max_local_amount=3, spawn_frequency=1, max_spawn_ratio=0.05)
-        # env_kwargs.update(n_agents=1, dirt_prop=dirt_props)
-        env = DirtFactory(**env_kwargs)
-
-        env = FrameStack(env, 4)
-
-        exp_out_path = model_path / 'exp'
-        callbacks = CallbackList(
-            [MonitorCallback(filepath=exp_out_path / f'future_exp_name')]
-        )
-
-        n_actions = env.action_space.n
-
-        for epoch in range(100):
-            observations = env.reset()
-            if render:
-                if isinstance(env, FrameStack):
-                    env.env.render()
-                else:
-                    env.render()
-            done_bool = False
-            r = 0
-            while not done_bool:
-                if env.n_agents > 1:
-                    actions = [model.predict(obs, deterministic=False)[0] for obs in observations]
-                else:
-                    actions = model.predict(observations, deterministic=False)[0]
-
-                observations, r, done_bool, info_obj = env.step(actions)
-                if render:
-                    env.render()
-                if done_bool:
-                    break
-            print(f'Factory run {epoch} done, reward is:\n {r}')
-
-    if out_path:
-        compare_seed_runs(out_path.parent)
diff --git a/studies/sat_mad.py b/studies/sat_mad.py
index 46754a9..29f9ccd 100644
--- a/studies/sat_mad.py
+++ b/studies/sat_mad.py
@@ -1,9 +1,11 @@
-import numpy as np
+from environments.factory import rooms
+import random
+from gym.wrappers import FrameStack
 
+env = rooms(n_agents=2)
+env = FrameStack(env, num_stack=3)
+state, *_ = env.reset()
 
-class SatMad(object):
-    def __init__(self):
-        pass
-
-if __name__ == '__main__':
-    pass
\ No newline at end of file
+for i in range(1000):
+    state, *_ = env.step([random.randint(0, 9), random.randint(0, 9)])
+    env.render()
\ No newline at end of file
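
Note on the new reward() hook: BaseLearner.learn() now routes every environment reward through self.reward(r) before it is stored in an Experience, and MQICMLearner gains a normalize_reward deque that this patch does not yet use. The sketch below shows one way such a hook could be combined with that deque for running reward normalization; it is illustrative only, and the RunningRewardNormalizer class, its parameters, and the example values are assumptions rather than code from this repository.

from collections import deque

import numpy as np


class RunningRewardNormalizer:
    """Illustrative only: rescales rewards by a running standard deviation."""

    def __init__(self, maxlen=1000, eps=1e-6):
        # plays the same role as MQICMLearner.normalize_reward in the hunks above
        self.buffer = deque(maxlen=maxlen)
        self.eps = eps

    def reward(self, r):
        # mirrors the call site reward=self.reward(reward): store the raw reward,
        # then return a rescaled value for the stored Experience
        self.buffer.append(r)
        std = np.std(self.buffer) if len(self.buffer) > 1 else 1.0
        return r / (std + self.eps)


if __name__ == '__main__':
    norm = RunningRewardNormalizer()
    print([round(norm.reward(r), 3) for r in [1.0, -0.5, 2.0, 0.0, 3.0]])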