From 9e8d2ac1dc071c057c742fbad5677d74495f27ab Mon Sep 17 00:00:00 2001
From: romue
Date: Tue, 27 Jul 2021 16:59:24 +0200
Subject: [PATCH] cleanup algos + adjusted renderer to support "ray casting"

---
 algorithms/awr_learner.py              |  40 ++++++
 algorithms/common.py                   |  27 +++-
 algorithms/m_q_learner.py              |   2 +-
 algorithms/q_learner.py                |  29 ++--
 algorithms/qtran_learner.py            |  48 -------
 algorithms/udr_learner.py              | 178 -------------------------
 environments/factory/renderer.py       |  26 ++--
 environments/factory/simple_factory.py |   4 +-
 environments/policy_adaption/test.py   |   2 +-
 9 files changed, 102 insertions(+), 254 deletions(-)
 create mode 100644 algorithms/awr_learner.py
 delete mode 100644 algorithms/qtran_learner.py
 delete mode 100644 algorithms/udr_learner.py

diff --git a/algorithms/awr_learner.py b/algorithms/awr_learner.py
new file mode 100644
index 0000000..c825ec6
--- /dev/null
+++ b/algorithms/awr_learner.py
@@ -0,0 +1,40 @@
+from algorithms.common import BaseLearner, TrajectoryBuffer
+
+
+class AWRLearner(BaseLearner):
+    def __init__(self, *args, buffer_size=1e5, **kwargs):
+        super(AWRLearner, self).__init__(*args, **kwargs)
+        assert self.train_every[0] == 'episode', 'AWR only supports the episodic RL setting!'
+        self.buffer = TrajectoryBuffer(buffer_size)
+
+    def train(self):
+        # convert to trajectory format
+        pass
+
+import numpy as np
+from matplotlib import pyplot as plt
+import pandas as pd
+import seaborn as sns
+
+sns.set(font_scale=1.25, rc={'text.usetex': True})
+data = np.array([[689, 74], [71, 647]])
+cats = ['Mask', 'No Mask']
+df = pd.DataFrame(data/np.sum(data), index=cats, columns=cats)
+
+group_counts = ['{0:0.0f}'.format(value) for value in
+                data.flatten()]
+group_percentages = [f'{value*100:.2f}' + r'$\%$' for value in
+                     data.flatten()/np.sum(data)]
+
+labels = [f'{v1}\n{v2}' for v1, v2 in
+          zip(group_counts, group_percentages)]
+labels = np.asarray(labels).reshape(2, 2)
+
+with sns.axes_style("white"):
+    cmap = sns.diverging_palette(h_neg=100, h_pos=10, s=99, l=55, sep=3, as_cmap=True)
+    sns.heatmap(data, annot=labels, fmt='', cmap='Set2_r', square=True, cbar=False, xticklabels=cats, yticklabels=cats)
+plt.title('Simple-CNN')
+plt.ylabel('True label')
+plt.xlabel('Predicted label')
+plt.tight_layout()
+plt.savefig('cnn.pdf', bbox_inches='tight')
\ No newline at end of file
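
Editor's note: the new AWRLearner leaves train() as a stub. Purely for orientation, here is a hedged sketch of one way an episodic advantage-weighted regression update could be filled in on top of the TrajectoryBuffer added to algorithms/common.py below. The names policy_net, value_net, the optimizers and the Experience field names (observation, action, reward) are assumptions, not repository code.

# Hedged sketch only - not part of this patch.
import torch
import torch.nn.functional as F

def awr_step(buffer, policy_net, value_net, policy_opt, value_opt,
             gamma=0.99, beta=0.05, max_weight=20.0):
    obs, act, ret = [], [], []
    for episode in buffer.experience.values():       # TrajectoryBuffer keeps one list per episode
        g = 0.0
        for exp in reversed(episode):                 # discounted return-to-go, computed backwards
            g = exp.reward + gamma * g
            obs.append(torch.as_tensor(exp.observation).float().flatten())
            act.append(int(exp.action))
            ret.append(g)
    obs = torch.stack(obs)
    act = torch.tensor(act)
    ret = torch.tensor(ret).view(-1, 1)

    value = value_net(obs)                            # critic: regress V(s) onto the return-to-go
    value_loss = F.mse_loss(value, ret)
    value_opt.zero_grad()
    value_loss.backward()
    value_opt.step()

    advantage = ret - value.detach()                  # actor: log-likelihood weighted by exp(A / beta)
    weight = torch.clamp(torch.exp(advantage / beta), max=max_weight)
    log_prob = torch.distributions.Categorical(logits=policy_net(obs)).log_prob(act)
    policy_loss = -(weight.squeeze(-1) * log_prob).mean()
    policy_opt.zero_grad()
    policy_loss.backward()
    policy_opt.step()
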
diff --git a/algorithms/common.py b/algorithms/common.py
index 1749f7e..97d166c 100644
--- a/algorithms/common.py
+++ b/algorithms/common.py
@@ -1,5 +1,5 @@
 from typing import NamedTuple, Union
-from collections import deque, OrderedDict
+from collections import deque, OrderedDict, defaultdict
 import numpy as np
 import random
 import torch
@@ -18,12 +18,13 @@ class Experience(NamedTuple):
 
 
 class BaseLearner:
-    def __init__(self, env, n_agents=1, train_every=('step', 4), n_grad_steps=1):
+    def __init__(self, env, n_agents=1, train_every=('step', 4), n_grad_steps=1, stack_n_frames=1):
         assert train_every[0] in ['step', 'episode'], 'train_every[0] must be one of ["step", "episode"]'
         self.env = env
         self.n_agents = n_agents
         self.n_grad_steps = n_grad_steps
         self.train_every = train_every
+        self.stack_n_frames = deque(stack_n_frames)
         self.device = 'cpu'
         self.n_updates = 0
         self.step = 0
@@ -102,8 +103,8 @@ class BaseBuffer:
     def __len__(self):
         return len(self.experience)
 
-    def add(self, experience):
-        self.experience.append(experience)
+    def add(self, exp: Experience):
+        self.experience.append(exp)
 
     def sample(self, k, cer=4):
         sample = random.choices(self.experience, k=k-cer)
@@ -113,9 +114,22 @@
         actions = torch.tensor([e.action for e in sample]).long()
         rewards = torch.tensor([e.reward for e in sample]).float().view(-1, 1)
         dones = torch.tensor([e.done for e in sample]).float().view(-1, 1)
+        #print(observations.shape, next_observations.shape, actions.shape, rewards.shape, dones.shape)
         return Experience(observations, next_observations, actions, rewards, dones)
 
 
+class TrajectoryBuffer(BaseBuffer):
+    def __init__(self, size):
+        super(TrajectoryBuffer, self).__init__(size)
+        self.experience = defaultdict(list)
+
+    def add(self, exp: Experience):
+        self.experience[exp.episode].append(exp)
+        if len(self.experience) > self.size:
+            oldest_traj_key = list(sorted(self.experience.keys()))[0]
+            del self.experience[oldest_traj_key]
+
+
 def soft_update(local_model, target_model, tau):
     # taken from https://github.com/BY571/Munchausen-RL/blob/master/M-DQN.ipynb
     for target_param, local_param in zip(target_model.parameters(), local_model.parameters()):
@@ -152,9 +166,10 @@ class BaseDDQN(BaseDQN):
     def __init__(self,
                  backbone_dims=[3*5*5, 64, 64],
                  value_dims=[64, 1],
-                 advantage_dims=[64, 9]):
+                 advantage_dims=[64, 9],
+                 activation='elu'):
         super(BaseDDQN, self).__init__(backbone_dims)
-        self.net = mlp_maker(backbone_dims, flatten=True)
+        self.net = mlp_maker(backbone_dims, activation=activation, flatten=True)
         self.value_head = mlp_maker(value_dims)
         self.advantage_head = mlp_maker(advantage_dims)
 
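
Editor's note: the new TrajectoryBuffer keys experiences by episode and, once the number of stored trajectories exceeds size, evicts the one with the smallest episode id. A hedged usage sketch follows; it assumes Experience exposes the fields observation, next_observation, action, reward, done and episode, which is inferred from this patch, not confirmed.

# Hedged usage sketch only - not part of this patch.
import numpy as np
from algorithms.common import Experience, TrajectoryBuffer

buffer = TrajectoryBuffer(size=2)                    # keep at most two trajectories
for episode in range(3):
    for step in range(5):
        buffer.add(Experience(observation=np.zeros((3, 5, 5)),
                              next_observation=np.zeros((3, 5, 5)),
                              action=0, reward=0.0, done=(step == 4),
                              episode=episode))
# episode 0 was evicted when episode 2 started; only the two newest remain
print(sorted(buffer.experience.keys()))              # expected: [1, 2]
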
diff --git a/algorithms/m_q_learner.py b/algorithms/m_q_learner.py
index 402c68c..ded972e 100644
--- a/algorithms/m_q_learner.py
+++ b/algorithms/m_q_learner.py
@@ -25,7 +25,7 @@ class MQLearner(QLearner):
         if len(self.buffer) < self.batch_size: return
         for _ in range(self.n_grad_steps):
 
-            experience = self.buffer.sample(self.batch_size, cer=self.train_every_n_steps)
+            experience = self.buffer.sample(self.batch_size, cer=self.train_every[-1])
 
             with torch.no_grad():
                 q_target_next = self.target_q_net(experience.next_observation)
diff --git a/algorithms/q_learner.py b/algorithms/q_learner.py
index 06a3384..0cd04f0 100644
--- a/algorithms/q_learner.py
+++ b/algorithms/q_learner.py
@@ -17,7 +17,7 @@ class QLearner(BaseLearner):
         self.q_net = q_net
         self.target_q_net = target_q_net
         self.target_q_net.eval()
-        soft_update(self.q_net, self.target_q_net, tau=1.0)
+        #soft_update(self.q_net, self.target_q_net, tau=1.0)
         self.buffer = BaseBuffer(buffer_size)
         self.target_update = target_update
         self.eps = eps_start
@@ -30,9 +30,7 @@ class QLearner(BaseLearner):
         self.reg_weight = reg_weight
         self.weight_decay = weight_decay
         self.lr = lr
-        self.optimizer = torch.optim.AdamW(self.q_net.parameters(),
-                                           lr=self.lr,
-                                           weight_decay=self.weight_decay)
+        self.optimizer = torch.optim.AdamW(self.q_net.parameters(), lr=self.lr, weight_decay=self.weight_decay)
         self.max_grad_norm = max_grad_norm
         self.running_reward = deque(maxlen=5)
         self.running_loss = deque(maxlen=5)
@@ -103,20 +101,31 @@
 
 if __name__ == '__main__':
     from environments.factory.simple_factory import SimpleFactory, DirtProperties, MovementProperties
     from algorithms.common import BaseDDQN
+    from algorithms.m_q_learner import MQLearner
     from algorithms.vdn_learner import VDNLearner
     from algorithms.udr_learner import UDRLearner
 
     N_AGENTS = 1
 
-    dirt_props = DirtProperties(clean_amount=3, gain_amount=0.2, max_global_amount=30,
-                                max_local_amount=5, spawn_frequency=1, max_spawn_ratio=0.05)
+    dirt_props = DirtProperties(clean_amount=1, gain_amount=0.1, max_global_amount=20,
+                                max_local_amount=1, spawn_frequency=5, max_spawn_ratio=0.05,
+                                dirt_smear_amount=0.0)
     move_props = MovementProperties(allow_diagonal_movement=True,
                                     allow_square_movement=True,
                                     allow_no_op=False)
-    env = SimpleFactory(dirt_properties=dirt_props, movement_properties=move_props, n_agents=N_AGENTS, pomdp_radius=2, max_steps=400, omit_agent_slice_in_obs=False, combin_agent_slices_in_obs=True)
-    dqn, target_dqn = BaseDDQN(), BaseDDQN()
-    learner = QLearner(dqn, target_dqn, env, 40000, target_update=3500, lr=0.0007, gamma=0.99, n_agents=N_AGENTS, tau=0.95, max_grad_norm=10,
-                       train_every=('step', 4), eps_end=0.025, n_grad_steps=1, reg_weight=0.1, exploration_fraction=0.25, batch_size=64)
+    env = SimpleFactory(n_agents=1, dirt_properties=dirt_props, pomdp_radius=2, max_steps=400, parse_doors=False,
+                        movement_properties=move_props, level_name='rooms', frames_to_stack=0,
+                        omit_agent_slice_in_obs=True, combin_agent_slices_in_obs=True, record_episodes=False
+                        )
+
+    obs_shape = np.prod(env.observation_space.shape)
+    n_actions = env.action_space.n
+
+    dqn, target_dqn = BaseDDQN(backbone_dims=[obs_shape, 128, 128], advantage_dims=[128, n_actions], value_dims=[128, 1], activation='leaky_relu'),\
+                      BaseDDQN(backbone_dims=[obs_shape, 128, 128], advantage_dims=[128, n_actions], value_dims=[128, 1], activation='leaky_relu')
+
+    learner = MQLearner(dqn, target_dqn, env, 50000, target_update=5000, lr=0.0007, gamma=0.99, n_agents=N_AGENTS, tau=0.95, max_grad_norm=10,
+                        train_every=('step', 4), eps_end=0.025, n_grad_steps=1, reg_weight=0.1, exploration_fraction=0.25, batch_size=64, weight_decay=1e-3)
     #learner.save(Path(__file__).parent / 'test' / 'testexperiment1337')
     learner.learn(100000)
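
Editor's note: the __main__ block above now trains with MQLearner. For reference, a hedged sketch of the standard Munchausen-DQN target (Vieillard et al., 2020) that a learner like MQLearner builds on; the function and hyperparameter names below are illustrative and not the repository's exact implementation. Shapes follow BaseBuffer.sample: action is (N, 1) long, reward and done are (N, 1) floats.

# Hedged reference sketch only - not part of this patch.
import torch
import torch.nn.functional as F

def munchausen_target(q_target_net, obs, next_obs, action, reward, done,
                      gamma=0.99, tau=0.03, alpha=0.9, l0=-1.0):
    with torch.no_grad():
        # Munchausen bonus: scaled, clipped log-policy of the action actually taken
        log_pi = F.log_softmax(q_target_net(obs) / tau, dim=-1)
        bonus = alpha * torch.clamp(tau * log_pi.gather(-1, action), min=l0, max=0.0)

        # soft expected value of the next state under the target network
        q_next = q_target_net(next_obs)
        pi_next = F.softmax(q_next / tau, dim=-1)
        log_pi_next = F.log_softmax(q_next / tau, dim=-1)
        soft_v_next = (pi_next * (q_next - tau * log_pi_next)).sum(-1, keepdim=True)

        return reward + bonus + gamma * (1 - done) * soft_v_next
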
diff --git a/algorithms/qtran_learner.py b/algorithms/qtran_learner.py
deleted file mode 100644
index fc6cc24..0000000
--- a/algorithms/qtran_learner.py
+++ /dev/null
@@ -1,48 +0,0 @@
-import torch
-from algorithms.q_learner import QLearner
-
-
-class QTRANLearner(QLearner):
-    def __init__(self, *args, weight_opt=1., weigt_nopt=1., **kwargs):
-        super(QTRANLearner, self).__init__(*args, **kwargs)
-        assert self.n_agents >= 2, 'QTRANLearner requires more than one agent, use QLearner instead'
-        self.weight_opt = weight_opt
-        self.weigt_nopt = weigt_nopt
-
-    def _training_routine(self, obs, next_obs, action):
-        # todo remove - is inherited - only used while implementing qtran
-        current_q_values = self.q_net(obs)
-        current_q_values = torch.gather(current_q_values, dim=-1, index=action)
-        next_q_values_raw = self.target_q_net(next_obs).max(dim=-1)[0].reshape(-1, 1).detach()
-        return current_q_values, next_q_values_raw
-
-    def local_qs(self, observations, actions):
-        Q_jt = torch.zeros_like(actions)  # placeholder to sum up individual q values
-        features = []
-        for agent_i in range(self.n_agents):
-            q_values_agent_i, features_agent_i = self.q_net(observations[:, agent_i])  # Individual action-value network
-            q_values_agent_i = torch.gather(q_values_agent_i, dim=-1, index=actions[:, agent_i].unsqueeze(-1))
-            Q_jt += q_values_agent_i
-            features.append(features_agent_i)
-        feature_sum = torch.stack(features, 0).sum(0)  # (n_agents x hdim) -> hdim
-        return Q_jt
-
-    def train(self):
-        if len(self.buffer) < self.batch_size: return
-        for _ in range(self.n_grad_steps):
-            experience = self.buffer.sample(self.batch_size, cer=self.train_every_n_steps)
-
-            Q_jt_prime = self.local_qs(experience.observation, experience.action)  # sum of individual q-vals
-            Q_jt = None
-            V_jt = None
-
-            pred_q, target_q_raw = torch.zeros((self.batch_size, 1)), torch.zeros((self.batch_size, 1))
-            for agent_i in range(self.n_agents):
-                q_values, next_q_values_raw = self._training_routine(experience.observation[:, agent_i],
-                                                                     experience.next_observation[:, agent_i],
-                                                                     experience.action[:, agent_i].unsqueeze(-1))
-                pred_q += q_values
-                target_q_raw += next_q_values_raw
-            target_q = experience.reward + (1 - experience.done) * self.gamma * target_q_raw
-            loss = torch.mean(self.reg_weight * pred_q + torch.pow(pred_q - target_q, 2))
-            self._backprop_loss(loss)
\ No newline at end of file
diff --git a/algorithms/udr_learner.py b/algorithms/udr_learner.py
deleted file mode 100644
index b99f10f..0000000
--- a/algorithms/udr_learner.py
+++ /dev/null
@@ -1,178 +0,0 @@
-import random
-from typing import Union, List
-from collections import deque
-import numpy as np
-import torch
-import torch.nn as nn
-from algorithms.common import BaseBuffer, Experience, BaseLearner, BaseDQN, mlp_maker
-from collections import defaultdict
-
-
-class UDRLBuffer(BaseBuffer):
-    def __init__(self, size):
-        super(UDRLBuffer, self).__init__(0)
-        self.experience = defaultdict(list)
-        self.size = size
-
-    def add(self, experience):
-        self.experience[experience.episode].append(experience)
-        if len(self.experience) > self.size:
-            self.sort_and_prune()
-
-    def select_time_steps(self, episode: List[Experience]):
-        T = len(episode)  # max horizon
-        t1 = random.randint(0, T - 1)
-        t2 = random.randint(t1 + 1, T)
-        return t1, t2, T
-
-    def sort_and_prune(self):
-        scores = []
-        for k, episode_experience in self.experience.items():
-            r = sum([e.reward for e in episode_experience])
-            scores.append((r, k))
-        sorted_scores = sorted(scores, reverse=True)
-        return sorted_scores
-
-    def sample(self, batch_size, cer=0):
-        random_episode_keys = random.choices(list(self.experience.keys()), k=batch_size)
-        lsts = (obs, desired_rewards, horizons, actions) = [], [], [], []
-        for ek in random_episode_keys:
-            episode = self.experience[ek]
-            t1, t2, T = self.select_time_steps(episode)
-            t2 = T  # TODO only good for episodic envs
-            observation = episode[t1].observation
-            desired_reward = sum([experience.reward for experience in episode[t1:t2]])
-            horizon = t2 - t1
-            action = episode[t1].action
-            for lst, val in zip(lsts, [observation, desired_reward, horizon, action]):
-                lst.append(val)
-        return (torch.stack([torch.from_numpy(o) for o in obs], 0).float(),
-                torch.tensor(desired_rewards).view(-1, 1).float(),
-                torch.tensor(horizons).view(-1, 1).float(),
-                torch.tensor(actions))
-
-
-class UDRLearner(BaseLearner):
-    # Upside Down Reinforcement Learner
-    def __init__(self, env, desired_reward, desired_horizon,
-                 behavior_fn=None, buffer_size=100, n_warm_up_episodes=8, best_x=20,
-                 batch_size=128, lr=1e-3, n_agents=1, train_every=('episode', 4), n_grad_steps=1):
-        super(UDRLearner, self).__init__(env, n_agents, train_every, n_grad_steps)
-        assert self.n_agents == 1, 'UDRL currently only supports single agent training'
-        self.behavior_fn = behavior_fn
-        self.buffer_size = buffer_size
-        self.n_warm_up_episodes = n_warm_up_episodes
-        self.buffer = UDRLBuffer(buffer_size)
-        self.batch_size = batch_size
-        self.mode = 'train'
-        self.best_x = best_x
-        self.desired_reward = desired_reward
-        self.desired_horizon = desired_horizon
-        self.lr = lr
-        self.optimizer = torch.optim.AdamW(self.behavior_fn.parameters(), lr=lr)
-
-        self.running_loss = deque(maxlen=self.n_grad_steps*5)
-
-    def sample_exploratory_commands(self):
-        top_x = self.buffer.sort_and_prune()[:self.best_x]
-        # The exploratory desired horizon dh0 is set to the mean of the lengths of the selected episodes
-        new_desired_horizon = np.mean([len(self.buffer.experience[k]) for _, k in top_x])
-        # save all top_X cumulative returns in a list
-        returns = [r for r, _ in top_x]
-        # from these returns calc the mean and std
-        mean_returns = np.mean([r for r, _ in top_x])
-        std_returns = np.std(returns)
-        # sample desired reward from a uniform distribution given the mean and the std
-        new_desired_reward = np.random.uniform(mean_returns, mean_returns + std_returns)
-        self.exploratory_commands = (new_desired_reward, new_desired_horizon)
-        return torch.tensor([[new_desired_reward]]).float(), torch.tensor([[new_desired_horizon]]).float()
-
-    def on_new_experience(self, experience):
-        self.buffer.add(experience)
-        self.desired_reward = self.desired_reward - torch.tensor(experience.reward).float().view(1, 1)
-
-    def on_step_end(self, n_steps):
-        one = torch.tensor([1.]).float().view(1, 1)
-        self.desired_horizon -= one
-        self.desired_horizon = self.desired_horizon if self.desired_horizon >= 1. else one
-
-    def on_episode_end(self, n_steps):
-        self.desired_reward, self.desired_horizon = self.sample_exploratory_commands()
-
-    def get_action(self, obs) -> Union[int, np.ndarray]:
-        o = torch.from_numpy(obs).unsqueeze(0) if self.n_agents <= 1 else torch.from_numpy(obs)
-        bf_out = self.behavior_fn(o.float(), self.desired_reward, self.desired_horizon)
-        dist = torch.distributions.Categorical(bf_out)
-        sample = dist.sample()
-        return [sample.item()]#[self.env.action_space.sample()]
-
-    def _backprop_loss(self, loss):
-        # log loss
-        self.running_loss.append(loss.item())
-        # Optimize the model
-        self.optimizer.zero_grad()
-        loss.backward()
-        #torch.nn.utils.clip_grad_norm_(self.behavior_fn.parameters(), 10)
-        self.optimizer.step()
-
-    def train(self):
-        if len(self.buffer) < self.n_warm_up_episodes: return
-        for _ in range(self.n_grad_steps):
-            experience = self.buffer.sample(self.batch_size)
-            bf_out = self.behavior_fn(*experience[:3])
-            labels = experience[-1]
-            #print(labels.shape)
-            loss = nn.CrossEntropyLoss()(bf_out, labels.squeeze())
-            mean_entropy = torch.distributions.Categorical(bf_out).entropy().mean()
-            self._backprop_loss(loss - 0.03*mean_entropy)
-        print(f'Running loss: {np.mean(list(self.running_loss)):.3f}\tRunning reward: {np.mean(self.running_reward):.2f}'
-              f'\td_r: {self.desired_reward.item():.2f}\ttd_h: {self.desired_horizon.item()}')
-
-
-class BF(BaseDQN):
-    def __init__(self, dims=[5*5*3, 64]):
-        super(BF, self).__init__(dims)
-        self.net = mlp_maker(dims, activation_last='identity')
-        self.command_net = mlp_maker([2, 64], activation_last='sigmoid')
-        self.common_branch = mlp_maker([64, 64, 64, 9])
-
-
-    def forward(self, observation, desired_reward, horizon):
-        command = torch.cat((desired_reward*(0.02), horizon*(0.01)), dim=-1)
-        obs_out = self.net(torch.flatten(observation, start_dim=1))
-        command_out = self.command_net(command)
-        combined = obs_out*command_out
-        out = self.common_branch(combined)
-        return torch.softmax(out, -1)
-
-
-if __name__ == '__main__':
-    from environments.factory.simple_factory import SimpleFactory, DirtProperties, MovementProperties
-    from algorithms.common import BaseDDQN
-    from algorithms.vdn_learner import VDNLearner
-
-    N_AGENTS = 1
-
-    dirt_props = DirtProperties(clean_amount=3, gain_amount=0.2, max_global_amount=30,
-                                max_local_amount=5, spawn_frequency=1, max_spawn_ratio=0.05)
-    move_props = MovementProperties(allow_diagonal_movement=True,
-                                    allow_square_movement=True,
-                                    allow_no_op=False)
-    env = SimpleFactory(dirt_properties=dirt_props, movement_properties=move_props, n_agents=N_AGENTS, pomdp_radius=2,
-                        max_steps=400, omit_agent_slice_in_obs=False, combin_agent_slices_in_obs=True)
-
-    bf = BF()
-    desired_reward = torch.tensor([200.]).view(1, 1).float()
-    desired_horizon = torch.tensor([400.]).view(1, 1).float()
-    learner = UDRLearner(env, behavior_fn=bf,
-                         train_every=('episode', 2),
-                         buffer_size=40,
-                         best_x=10,
-                         lr=1e-3,
-                         batch_size=64,
-                         n_warm_up_episodes=12,
-                         n_grad_steps=4,
-                         desired_reward=desired_reward,
-                         desired_horizon=desired_horizon)
-    #learner.save(Path(__file__).parent / 'test' / 'testexperiment1337')
-    learner.learn(500000)
diff --git a/environments/factory/renderer.py b/environments/factory/renderer.py
index 767a95f..ff4a58f 100644
--- a/environments/factory/renderer.py
+++ b/environments/factory/renderer.py
@@ -3,7 +3,7 @@ import numpy as np
 from pathlib import Path
 from collections import deque
 import pygame
-from typing import NamedTuple
+from typing import NamedTuple, Any
 import time
 
 
@@ -14,6 +14,7 @@ class Entity(NamedTuple):
     value_operation: str = 'none'
     state: str = None
     id: int = 0
+    aux: Any = None
 
 
 class Renderer:
@@ -73,6 +74,20 @@ class Renderer:
         asset = pygame.transform.smoothscale(asset, (s, s))
         return asset
 
+    def visibility_rects(self, bp, view):
+        rects = []
+        for i in range(-self.view_radius, self.view_radius+1):
+            for j in range(-self.view_radius, self.view_radius+1):
+                if bool(view[self.view_radius+j, self.view_radius+i]):
+                    visibility_rect = bp['dest'].copy()
+                    visibility_rect.centerx += i*self.cell_size
+                    visibility_rect.centery += j*self.cell_size
+                    shape_surf = pygame.Surface(visibility_rect.size, pygame.SRCALPHA)
+                    pygame.draw.rect(shape_surf, self.AGENT_VIEW_COLOR, shape_surf.get_rect())
+                    shape_surf.set_alpha(64)
+                    rects.append(dict(source=shape_surf, dest=visibility_rect))
+        return rects
+
     def render(self, entities):
         for event in pygame.event.get():
             if event.type == pygame.QUIT:
@@ -88,13 +103,8 @@ class Renderer:
                 blits.append(bp)
                 if entity.name.lower() == 'agent':
                     if self.view_radius > 0:
-                        visibility_rect = bp['dest'].inflate(
-                            (self.view_radius*2)*self.cell_size, (self.view_radius*2)*self.cell_size
-                        )
-                        shape_surf = pygame.Surface(visibility_rect.size, pygame.SRCALPHA)
-                        pygame.draw.rect(shape_surf, self.AGENT_VIEW_COLOR, shape_surf.get_rect())
-                        shape_surf.set_alpha(64)
-                        blits.appendleft(dict(source=shape_surf, dest=visibility_rect))
+                        vis_rects = self.visibility_rects(bp, entity.aux)
+                        blits.extendleft(vis_rects)
                     if entity.state != 'blank':
                         agent_state_blits = self.blit_params(
                             Entity(entity.state, (entity.pos[0]+0.12, entity.pos[1]), 0.48, 'scale')
diff --git a/environments/factory/simple_factory.py b/environments/factory/simple_factory.py
index fe4f055..cff9990 100644
--- a/environments/factory/simple_factory.py
+++ b/environments/factory/simple_factory.py
@@ -83,7 +83,7 @@ class SimpleFactory(BaseFactory):
         agents = []
         for i, agent in enumerate(self._agents):
             name, state = asset_str(agent)
-            agents.append(Entity(name, agent.pos, 1, 'none', state, i+1))
+            agents.append(Entity(name, agent.pos, 1, 'none', state, i+1, agent.temp_light_map))
         doors = []
         if self.parse_doors:
             for i, door in enumerate(self._doors):
@@ -229,7 +229,7 @@ if __name__ == '__main__':
                                     allow_no_op=False)
     factory = SimpleFactory(movement_properties=move_props, dirt_properties=dirt_props, n_agents=1,
                             combin_agent_slices_in_obs=False, level_name='rooms', parse_doors=True,
-                            pomdp_radius=3, cast_shadows=True)
+                            pomdp_radius=2, cast_shadows=True)
     n_actions = factory.action_space.n - 1
     _ = factory.observation_space
 
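
Editor's note: the renderer now receives a per-agent boolean light map through Entity.aux (SimpleFactory passes agent.temp_light_map) and shades each visible cell individually instead of inflating one big view rectangle. A hedged sketch of how such a light map could be produced by simple ray casting over a binary wall grid; the function and names below are illustrative, not the environment's actual shadow-casting code.

# Hedged sketch only - not part of this patch. Builds the boolean
# (2*radius+1, 2*radius+1) view consumed by Renderer.visibility_rects.
import numpy as np

def cast_light_map(walls: np.ndarray, agent_pos: tuple, radius: int) -> np.ndarray:
    """walls: binary grid (1 = blocked). Returns a boolean square view around the agent."""
    view = np.zeros((2 * radius + 1, 2 * radius + 1), dtype=bool)
    ay, ax = agent_pos
    for dy in range(-radius, radius + 1):
        for dx in range(-radius, radius + 1):
            # walk a straight ray from the agent towards the target cell
            steps = max(abs(dy), abs(dx), 1)
            blocked = False
            for s in range(1, steps + 1):
                y = ay + round(dy * s / steps)
                x = ax + round(dx * s / steps)
                if not (0 <= y < walls.shape[0] and 0 <= x < walls.shape[1]) or walls[y, x]:
                    blocked = True
                    break
            view[radius + dy, radius + dx] = not blocked
    view[radius, radius] = True  # the agent always sees its own cell
    return view
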
diff --git a/environments/policy_adaption/test.py b/environments/policy_adaption/test.py
index 1434205..8996e8d 100644
--- a/environments/policy_adaption/test.py
+++ b/environments/policy_adaption/test.py
@@ -4,5 +4,5 @@ from environments.policy_adaption.natural_rl_environment.imgsource import *
 from environments.policy_adaption.natural_rl_environment.natural_env import *
 
 if __name__ == "__main__":
-    env = make('SpaceInvaders-v0', 'color')  # gravitar, breakout, MsPacman, Space Invaders
+    env = make('SpaceInvaders-v0', 'video')  # gravitar, breakout, MsPacman, Space Invaders
     play.play(env, zoom=4)
\ No newline at end of file