cleanup algos + adjusted renderer to support "ray casting"

romue
2021-07-27 16:59:24 +02:00
parent 8429e3db9d
commit 9e8d2ac1dc
9 changed files with 102 additions and 254 deletions

algorithms/awr_learner.py (new file, 40 lines added)

@@ -0,0 +1,40 @@
from common import BaseLearner, TrajectoryBuffer


class AWRLearner(BaseLearner):
    def __init__(self, *args, buffer_size=1e5, **kwargs):
        super(AWRLearner, self).__init__(*args, **kwargs)
        assert self.train_every[0] == 'episode', 'AWR only supports the episodic RL setting!'
        self.buffer = TrajectoryBuffer(buffer_size)

    def train(self):
        # convert to trajectory format
        pass


import numpy as np
from matplotlib import pyplot as plt
import pandas as pd
import seaborn as sns

sns.set(font_scale=1.25, rc={'text.usetex': True})

data = np.array([[689, 74], [71, 647]])
cats = ['Mask', 'No Mask']
df = pd.DataFrame(data/np.sum(data), index=cats, columns=cats)

group_counts = ['{0:0.0f}'.format(value) for value in
                data.flatten()]
group_percentages = [f'{value*100:.2f}' + r'$\%$' for value in
                     data.flatten()/np.sum(data)]
labels = [f'{v1}\n{v2}' for v1, v2 in
          zip(group_counts, group_percentages)]
labels = np.asarray(labels).reshape(2, 2)

with sns.axes_style("white"):
    cmap = sns.diverging_palette(h_neg=100, h_pos=10, s=99, l=55, sep=3, as_cmap=True)
    sns.heatmap(data, annot=labels, fmt='', cmap='Set2_r', square=True, cbar=False,
                xticklabels=cats, yticklabels=cats)
plt.title('Simple-CNN')
plt.ylabel('True label')
plt.xlabel('Predicted label')
plt.tight_layout()
plt.savefig('cnn.pdf', bbox_inches='tight')
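Note: the train() stub above only marks where the buffered experience still has to be converted into trajectories. As a rough, hypothetical sketch (not part of this commit; every name below is a placeholder rather than an identifier from this repo), an advantage-weighted regression update on one prepared batch could look like this:

import torch
import torch.nn.functional as F

def awr_policy_loss(policy_logits, actions, returns, values, beta=0.05, max_weight=20.0):
    # weights w = exp((R - V(s)) / beta), clipped to keep the regression stable
    advantages = returns - values
    weights = torch.clamp(torch.exp(advantages / beta), max=max_weight)
    # advantage-weighted negative log-likelihood of the actions actually taken
    log_probs = F.log_softmax(policy_logits, dim=-1)
    chosen = log_probs.gather(-1, actions.view(-1, 1)).squeeze(-1)
    return -(weights.detach() * chosen).mean()

A full AWRLearner.train() would additionally need to fit a value baseline on the stored trajectory returns before computing these weights.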


@@ -1,5 +1,5 @@
 from typing import NamedTuple, Union
-from collections import deque, OrderedDict
+from collections import deque, OrderedDict, defaultdict
 import numpy as np
 import random
 import torch
@@ -18,12 +18,13 @@ class Experience(NamedTuple):
 class BaseLearner:
-    def __init__(self, env, n_agents=1, train_every=('step', 4), n_grad_steps=1):
+    def __init__(self, env, n_agents=1, train_every=('step', 4), n_grad_steps=1, stack_n_frames=1):
         assert train_every[0] in ['step', 'episode'], 'train_every[0] must be one of ["step", "episode"]'
         self.env = env
         self.n_agents = n_agents
         self.n_grad_steps = n_grad_steps
         self.train_every = train_every
+        self.stack_n_frames = deque(stack_n_frames)
         self.device = 'cpu'
         self.n_updates = 0
         self.step = 0
@@ -102,8 +103,8 @@ class BaseBuffer:
     def __len__(self):
         return len(self.experience)
 
-    def add(self, experience):
-        self.experience.append(experience)
+    def add(self, exp: Experience):
+        self.experience.append(exp)
 
     def sample(self, k, cer=4):
         sample = random.choices(self.experience, k=k-cer)
@@ -113,9 +114,22 @@ class BaseBuffer:
         actions = torch.tensor([e.action for e in sample]).long()
         rewards = torch.tensor([e.reward for e in sample]).float().view(-1, 1)
         dones = torch.tensor([e.done for e in sample]).float().view(-1, 1)
-        #print(observations.shape, next_observations.shape, actions.shape, rewards.shape, dones.shape)
         return Experience(observations, next_observations, actions, rewards, dones)
+
+
+class TrajectoryBuffer(BaseBuffer):
+    def __init__(self, size):
+        super(TrajectoryBuffer, self).__init__(size)
+        self.experience = defaultdict(list)
+
+    def add(self, exp: Experience):
+        self.experience[exp.episode].append(exp)
+        if len(self.experience) > self.size:
+            oldest_traj_key = list(sorted(self.experience.keys()))[0]
+            del self.experience[oldest_traj_key]
+
 
 def soft_update(local_model, target_model, tau):
     # taken from https://github.com/BY571/Munchausen-RL/blob/master/M-DQN.ipynb
     for target_param, local_param in zip(target_model.parameters(), local_model.parameters()):
@@ -152,9 +166,10 @@ class BaseDDQN(BaseDQN):
     def __init__(self,
                  backbone_dims=[3*5*5, 64, 64],
                  value_dims=[64, 1],
-                 advantage_dims=[64, 9]):
+                 advantage_dims=[64, 9],
+                 activation='elu'):
         super(BaseDDQN, self).__init__(backbone_dims)
-        self.net = mlp_maker(backbone_dims, flatten=True)
+        self.net = mlp_maker(backbone_dims, activation=activation, flatten=True)
         self.value_head = mlp_maker(value_dims)
         self.advantage_head = mlp_maker(advantage_dims)
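For context on the TrajectoryBuffer added above (a sketch of its semantics, assuming Experience carries the episode id that add() indexes on): experiences are grouped per episode, and once more than `size` distinct episodes are stored, the episode with the smallest key is evicted. A self-contained toy illustration of that eviction rule:

from collections import defaultdict, namedtuple

# stand-in for algorithms.common.Experience; only the episode field matters here
Exp = namedtuple('Exp', 'observation action reward done episode')

class TinyTrajectoryBuffer:
    """Mirrors the eviction rule of the TrajectoryBuffer above (illustration only)."""
    def __init__(self, size):
        self.size = size
        self.experience = defaultdict(list)

    def add(self, exp):
        self.experience[exp.episode].append(exp)
        if len(self.experience) > self.size:
            del self.experience[sorted(self.experience)[0]]  # drop the oldest episode key

buf = TinyTrajectoryBuffer(size=2)
for episode in range(3):
    for t in range(4):
        buf.add(Exp(None, 0, 1.0, t == 3, episode))
assert sorted(buf.experience) == [1, 2]  # episode 0 was evicted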


@@ -25,7 +25,7 @@ class MQLearner(QLearner):
         if len(self.buffer) < self.batch_size: return
         for _ in range(self.n_grad_steps):
-            experience = self.buffer.sample(self.batch_size, cer=self.train_every_n_steps)
+            experience = self.buffer.sample(self.batch_size, cer=self.train_every[-1])
             with torch.no_grad():
                 q_target_next = self.target_q_net(experience.next_observation)


@@ -17,7 +17,7 @@ class QLearner(BaseLearner):
         self.q_net = q_net
         self.target_q_net = target_q_net
         self.target_q_net.eval()
-        soft_update(self.q_net, self.target_q_net, tau=1.0)
+        #soft_update(self.q_net, self.target_q_net, tau=1.0)
         self.buffer = BaseBuffer(buffer_size)
         self.target_update = target_update
         self.eps = eps_start
@@ -30,9 +30,7 @@ class QLearner(BaseLearner):
         self.reg_weight = reg_weight
         self.weight_decay = weight_decay
         self.lr = lr
-        self.optimizer = torch.optim.AdamW(self.q_net.parameters(),
-                                           lr=self.lr,
-                                           weight_decay=self.weight_decay)
+        self.optimizer = torch.optim.AdamW(self.q_net.parameters(), lr=self.lr, weight_decay=self.weight_decay)
         self.max_grad_norm = max_grad_norm
         self.running_reward = deque(maxlen=5)
         self.running_loss = deque(maxlen=5)
@@ -103,20 +101,31 @@ class QLearner(BaseLearner):
 if __name__ == '__main__':
     from environments.factory.simple_factory import SimpleFactory, DirtProperties, MovementProperties
     from algorithms.common import BaseDDQN
+    from algorithms.m_q_learner import MQLearner
     from algorithms.vdn_learner import VDNLearner
     from algorithms.udr_learner import UDRLearner
 
     N_AGENTS = 1
 
-    dirt_props = DirtProperties(clean_amount=3, gain_amount=0.2, max_global_amount=30,
-                                max_local_amount=5, spawn_frequency=1, max_spawn_ratio=0.05)
+    dirt_props = DirtProperties(clean_amount=1, gain_amount=0.1, max_global_amount=20,
+                                max_local_amount=1, spawn_frequency=5, max_spawn_ratio=0.05,
+                                dirt_smear_amount=0.0)
     move_props = MovementProperties(allow_diagonal_movement=True,
                                     allow_square_movement=True,
                                     allow_no_op=False)
-    env = SimpleFactory(dirt_properties=dirt_props, movement_properties=move_props, n_agents=N_AGENTS, pomdp_radius=2, max_steps=400, omit_agent_slice_in_obs=False, combin_agent_slices_in_obs=True)
-    dqn, target_dqn = BaseDDQN(), BaseDDQN()
-    learner = QLearner(dqn, target_dqn, env, 40000, target_update=3500, lr=0.0007, gamma=0.99, n_agents=N_AGENTS, tau=0.95, max_grad_norm=10,
-                       train_every=('step', 4), eps_end=0.025, n_grad_steps=1, reg_weight=0.1, exploration_fraction=0.25, batch_size=64)
+    env = SimpleFactory(n_agents=1, dirt_properties=dirt_props, pomdp_radius=2, max_steps=400, parse_doors=False,
+                        movement_properties=move_props, level_name='rooms', frames_to_stack=0,
+                        omit_agent_slice_in_obs=True, combin_agent_slices_in_obs=True, record_episodes=False
+                        )
+
+    obs_shape = np.prod(env.observation_space.shape)
+    n_actions = env.action_space.n
+    dqn, target_dqn = BaseDDQN(backbone_dims=[obs_shape, 128, 128], advantage_dims=[128, n_actions], value_dims=[128, 1], activation='leaky_relu'),\
+                      BaseDDQN(backbone_dims=[obs_shape, 128, 128], advantage_dims=[128, n_actions], value_dims=[128, 1], activation='leaky_relu')
+    learner = MQLearner(dqn, target_dqn, env, 50000, target_update=5000, lr=0.0007, gamma=0.99, n_agents=N_AGENTS, tau=0.95, max_grad_norm=10,
+                        train_every=('step', 4), eps_end=0.025, n_grad_steps=1, reg_weight=0.1, exploration_fraction=0.25, batch_size=64, weight_decay=1e-3)
     #learner.save(Path(__file__).parent / 'test' / 'testexperiment1337')
     learner.learn(100000)


@@ -1,48 +0,0 @@
import torch
from algorithms.q_learner import QLearner


class QTRANLearner(QLearner):
    def __init__(self, *args, weight_opt=1., weigt_nopt=1., **kwargs):
        super(QTRANLearner, self).__init__(*args, **kwargs)
        assert self.n_agents >= 2, 'QTRANLearner requires more than one agent, use QLearner instead'
        self.weight_opt = weight_opt
        self.weigt_nopt = weigt_nopt

    def _training_routine(self, obs, next_obs, action):
        # todo remove - is inherited - only used while implementing qtran
        current_q_values = self.q_net(obs)
        current_q_values = torch.gather(current_q_values, dim=-1, index=action)
        next_q_values_raw = self.target_q_net(next_obs).max(dim=-1)[0].reshape(-1, 1).detach()
        return current_q_values, next_q_values_raw

    def local_qs(self, observations, actions):
        Q_jt = torch.zeros_like(actions)  # placeholder to sum up individual q values
        features = []
        for agent_i in range(self.n_agents):
            q_values_agent_i, features_agent_i = self.q_net(observations[:, agent_i])  # Individual action-value network
            q_values_agent_i = torch.gather(q_values_agent_i, dim=-1, index=actions[:, agent_i].unsqueeze(-1))
            Q_jt += q_values_agent_i
            features.append(features_agent_i)
        feature_sum = torch.stack(features, 0).sum(0)  # (n_agents x hdim) -> hdim
        return Q_jt

    def train(self):
        if len(self.buffer) < self.batch_size: return
        for _ in range(self.n_grad_steps):
            experience = self.buffer.sample(self.batch_size, cer=self.train_every_n_steps)
            Q_jt_prime = self.local_qs(experience.observation, experience.action)  # sum of individual q-vals
            Q_jt = None
            V_jt = None
            pred_q, target_q_raw = torch.zeros((self.batch_size, 1)), torch.zeros((self.batch_size, 1))
            for agent_i in range(self.n_agents):
                q_values, next_q_values_raw = self._training_routine(experience.observation[:, agent_i],
                                                                     experience.next_observation[:, agent_i],
                                                                     experience.action[:, agent_i].unsqueeze(-1))
                pred_q += q_values
                target_q_raw += next_q_values_raw
            target_q = experience.reward + (1 - experience.done) * self.gamma * target_q_raw
            loss = torch.mean(self.reg_weight * pred_q + torch.pow(pred_q - target_q, 2))
            self._backprop_loss(loss)

@@ -1,178 +0,0 @@
import random
from typing import Union, List
from collections import deque
import numpy as np
import torch
import torch.nn as nn
from algorithms.common import BaseBuffer, Experience, BaseLearner, BaseDQN, mlp_maker
from collections import defaultdict


class UDRLBuffer(BaseBuffer):
    def __init__(self, size):
        super(UDRLBuffer, self).__init__(0)
        self.experience = defaultdict(list)
        self.size = size

    def add(self, experience):
        self.experience[experience.episode].append(experience)
        if len(self.experience) > self.size:
            self.sort_and_prune()

    def select_time_steps(self, episode: List[Experience]):
        T = len(episode)  # max horizon
        t1 = random.randint(0, T - 1)
        t2 = random.randint(t1 + 1, T)
        return t1, t2, T

    def sort_and_prune(self):
        scores = []
        for k, episode_experience in self.experience.items():
            r = sum([e.reward for e in episode_experience])
            scores.append((r, k))
        sorted_scores = sorted(scores, reverse=True)
        return sorted_scores

    def sample(self, batch_size, cer=0):
        random_episode_keys = random.choices(list(self.experience.keys()), k=batch_size)
        lsts = (obs, desired_rewards, horizons, actions) = [], [], [], []
        for ek in random_episode_keys:
            episode = self.experience[ek]
            t1, t2, T = self.select_time_steps(episode)
            t2 = T  # TODO only good for episodic envs
            observation = episode[t1].observation
            desired_reward = sum([experience.reward for experience in episode[t1:t2]])
            horizon = t2 - t1
            action = episode[t1].action
            for lst, val in zip(lsts, [observation, desired_reward, horizon, action]):
                lst.append(val)
        return (torch.stack([torch.from_numpy(o) for o in obs], 0).float(),
                torch.tensor(desired_rewards).view(-1, 1).float(),
                torch.tensor(horizons).view(-1, 1).float(),
                torch.tensor(actions))


class UDRLearner(BaseLearner):
    # Upside Down Reinforcement Learner
    def __init__(self, env, desired_reward, desired_horizon,
                 behavior_fn=None, buffer_size=100, n_warm_up_episodes=8, best_x=20,
                 batch_size=128, lr=1e-3, n_agents=1, train_every=('episode', 4), n_grad_steps=1):
        super(UDRLearner, self).__init__(env, n_agents, train_every, n_grad_steps)
        assert self.n_agents == 1, 'UDRL currently only supports single agent training'
        self.behavior_fn = behavior_fn
        self.buffer_size = buffer_size
        self.n_warm_up_episodes = n_warm_up_episodes
        self.buffer = UDRLBuffer(buffer_size)
        self.batch_size = batch_size
        self.mode = 'train'
        self.best_x = best_x
        self.desired_reward = desired_reward
        self.desired_horizon = desired_horizon
        self.lr = lr
        self.optimizer = torch.optim.AdamW(self.behavior_fn.parameters(), lr=lr)
        self.running_loss = deque(maxlen=self.n_grad_steps*5)

    def sample_exploratory_commands(self):
        top_x = self.buffer.sort_and_prune()[:self.best_x]
        # The exploratory desired horizon dh0 is set to the mean of the lengths of the selected episodes
        new_desired_horizon = np.mean([len(self.buffer.experience[k]) for _, k in top_x])
        # save all top_X cumulative returns in a list
        returns = [r for r, _ in top_x]
        # from these returns calc the mean and std
        mean_returns = np.mean([r for r, _ in top_x])
        std_returns = np.std(returns)
        # sample desired reward from a uniform distribution given the mean and the std
        new_desired_reward = np.random.uniform(mean_returns, mean_returns + std_returns)
        self.exploratory_commands = (new_desired_reward, new_desired_horizon)
        return torch.tensor([[new_desired_reward]]).float(), torch.tensor([[new_desired_horizon]]).float()

    def on_new_experience(self, experience):
        self.buffer.add(experience)
        self.desired_reward = self.desired_reward - torch.tensor(experience.reward).float().view(1, 1)

    def on_step_end(self, n_steps):
        one = torch.tensor([1.]).float().view(1, 1)
        self.desired_horizon -= one
        self.desired_horizon = self.desired_horizon if self.desired_horizon >= 1. else one

    def on_episode_end(self, n_steps):
        self.desired_reward, self.desired_horizon = self.sample_exploratory_commands()

    def get_action(self, obs) -> Union[int, np.ndarray]:
        o = torch.from_numpy(obs).unsqueeze(0) if self.n_agents <= 1 else torch.from_numpy(obs)
        bf_out = self.behavior_fn(o.float(), self.desired_reward, self.desired_horizon)
        dist = torch.distributions.Categorical(bf_out)
        sample = dist.sample()
        return [sample.item()]  # [self.env.action_space.sample()]

    def _backprop_loss(self, loss):
        # log loss
        self.running_loss.append(loss.item())
        # Optimize the model
        self.optimizer.zero_grad()
        loss.backward()
        # torch.nn.utils.clip_grad_norm_(self.behavior_fn.parameters(), 10)
        self.optimizer.step()

    def train(self):
        if len(self.buffer) < self.n_warm_up_episodes: return
        for _ in range(self.n_grad_steps):
            experience = self.buffer.sample(self.batch_size)
            bf_out = self.behavior_fn(*experience[:3])
            labels = experience[-1]
            # print(labels.shape)
            loss = nn.CrossEntropyLoss()(bf_out, labels.squeeze())
            mean_entropy = torch.distributions.Categorical(bf_out).entropy().mean()
            self._backprop_loss(loss - 0.03*mean_entropy)
        print(f'Running loss: {np.mean(list(self.running_loss)):.3f}\tRunning reward: {np.mean(self.running_reward):.2f}'
              f'\td_r: {self.desired_reward.item():.2f}\ttd_h: {self.desired_horizon.item()}')


class BF(BaseDQN):
    def __init__(self, dims=[5*5*3, 64]):
        super(BF, self).__init__(dims)
        self.net = mlp_maker(dims, activation_last='identity')
        self.command_net = mlp_maker([2, 64], activation_last='sigmoid')
        self.common_branch = mlp_maker([64, 64, 64, 9])

    def forward(self, observation, desired_reward, horizon):
        command = torch.cat((desired_reward*(0.02), horizon*(0.01)), dim=-1)
        obs_out = self.net(torch.flatten(observation, start_dim=1))
        command_out = self.command_net(command)
        combined = obs_out*command_out
        out = self.common_branch(combined)
        return torch.softmax(out, -1)


if __name__ == '__main__':
    from environments.factory.simple_factory import SimpleFactory, DirtProperties, MovementProperties
    from algorithms.common import BaseDDQN
    from algorithms.vdn_learner import VDNLearner

    N_AGENTS = 1

    dirt_props = DirtProperties(clean_amount=3, gain_amount=0.2, max_global_amount=30,
                                max_local_amount=5, spawn_frequency=1, max_spawn_ratio=0.05)
    move_props = MovementProperties(allow_diagonal_movement=True,
                                    allow_square_movement=True,
                                    allow_no_op=False)
    env = SimpleFactory(dirt_properties=dirt_props, movement_properties=move_props, n_agents=N_AGENTS, pomdp_radius=2,
                        max_steps=400, omit_agent_slice_in_obs=False, combin_agent_slices_in_obs=True)

    bf = BF()
    desired_reward = torch.tensor([200.]).view(1, 1).float()
    desired_horizon = torch.tensor([400.]).view(1, 1).float()
    learner = UDRLearner(env, behavior_fn=bf,
                         train_every=('episode', 2),
                         buffer_size=40,
                         best_x=10,
                         lr=1e-3,
                         batch_size=64,
                         n_warm_up_episodes=12,
                         n_grad_steps=4,
                         desired_reward=desired_reward,
                         desired_horizon=desired_horizon)
    # learner.save(Path(__file__).parent / 'test' / 'testexperiment1337')
    learner.learn(500000)


@@ -3,7 +3,7 @@ import numpy as np
 from pathlib import Path
 from collections import deque
 import pygame
-from typing import NamedTuple
+from typing import NamedTuple, Any
 import time
@@ -14,6 +14,7 @@ class Entity(NamedTuple):
     value_operation: str = 'none'
     state: str = None
     id: int = 0
+    aux: Any = None
 
 
 class Renderer:
@@ -73,6 +74,20 @@ class Renderer:
         asset = pygame.transform.smoothscale(asset, (s, s))
         return asset
 
+    def visibility_rects(self, bp, view):
+        rects = []
+        for i in range(-self.view_radius, self.view_radius+1):
+            for j in range(-self.view_radius, self.view_radius+1):
+                if bool(view[self.view_radius+j, self.view_radius+i]):
+                    visibility_rect = bp['dest'].copy()
+                    visibility_rect.centerx += i*self.cell_size
+                    visibility_rect.centery += j*self.cell_size
+                    shape_surf = pygame.Surface(visibility_rect.size, pygame.SRCALPHA)
+                    pygame.draw.rect(shape_surf, self.AGENT_VIEW_COLOR, shape_surf.get_rect())
+                    shape_surf.set_alpha(64)
+                    rects.append(dict(source=shape_surf, dest=visibility_rect))
+        return rects
+
     def render(self, entities):
         for event in pygame.event.get():
             if event.type == pygame.QUIT:
@@ -88,13 +103,8 @@ class Renderer:
             blits.append(bp)
             if entity.name.lower() == 'agent':
                 if self.view_radius > 0:
-                    visibility_rect = bp['dest'].inflate(
-                        (self.view_radius*2)*self.cell_size, (self.view_radius*2)*self.cell_size
-                    )
-                    shape_surf = pygame.Surface(visibility_rect.size, pygame.SRCALPHA)
-                    pygame.draw.rect(shape_surf, self.AGENT_VIEW_COLOR, shape_surf.get_rect())
-                    shape_surf.set_alpha(64)
-                    blits.appendleft(dict(source=shape_surf, dest=visibility_rect))
+                    vis_rects = self.visibility_rects(bp, entity.aux)
+                    blits.extendleft(vis_rects)
                 if entity.state != 'blank':
                     agent_state_blits = self.blit_params(
                         Entity(entity.state, (entity.pos[0]+0.12, entity.pos[1]), 0.48, 'scale')
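On the "ray casting" part of the commit title: the renderer no longer tints a single inflated square around the agent. It now receives a per-agent boolean light map through the new Entity.aux field (filled with agent.temp_light_map in the factory diff below) and draws one translucent rect per visible cell. How that light map is computed is outside this diff; a naive grid ray cast that produces a map of the expected (2*radius+1, 2*radius+1) shape might look like the following (illustrative sketch only, not the repo's implementation):

import numpy as np

def ray_cast_light_map(walls: np.ndarray, pos, radius: int) -> np.ndarray:
    """Boolean (2*radius+1, 2*radius+1) visibility map centred on pos=(row, col)."""
    r = radius
    view = np.zeros((2 * r + 1, 2 * r + 1), dtype=bool)
    py, px = pos
    for dy in range(-r, r + 1):
        for dx in range(-r, r + 1):
            steps = max(abs(dy), abs(dx), 1)
            blocked = False
            for s in range(1, steps + 1):
                # walk a straight line from the agent towards the target cell
                y = py + round(dy * s / steps)
                x = px + round(dx * s / steps)
                if not (0 <= y < walls.shape[0] and 0 <= x < walls.shape[1]):
                    blocked = True
                    break
                # a wall blocks everything behind it but is itself visible
                if walls[y, x] and (y, x) != (py + dy, px + dx):
                    blocked = True
                    break
            view[r + dy, r + dx] = not blocked
    return view

# example: a wall directly east of the agent is visible, the cell behind it is shadowed
walls = np.zeros((7, 7), dtype=bool)
walls[3, 4] = True
light_map = ray_cast_light_map(walls, pos=(3, 3), radius=2)
assert light_map[2, 3] and not light_map[2, 4]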


@@ -83,7 +83,7 @@ class SimpleFactory(BaseFactory):
         agents = []
         for i, agent in enumerate(self._agents):
             name, state = asset_str(agent)
-            agents.append(Entity(name, agent.pos, 1, 'none', state, i+1))
+            agents.append(Entity(name, agent.pos, 1, 'none', state, i+1, agent.temp_light_map))
         doors = []
         if self.parse_doors:
             for i, door in enumerate(self._doors):
@@ -229,7 +229,7 @@ if __name__ == '__main__':
                                     allow_no_op=False)
     factory = SimpleFactory(movement_properties=move_props, dirt_properties=dirt_props, n_agents=1,
                             combin_agent_slices_in_obs=False, level_name='rooms', parse_doors=True,
-                            pomdp_radius=3, cast_shadows=True)
+                            pomdp_radius=2, cast_shadows=True)
     n_actions = factory.action_space.n - 1
     _ = factory.observation_space


@@ -4,5 +4,5 @@ from environments.policy_adaption.natural_rl_environment.imgsource import *
 from environments.policy_adaption.natural_rl_environment.natural_env import *
 
 if __name__ == "__main__":
-    env = make('SpaceInvaders-v0', 'color')  # gravitar, breakout, MsPacman, Space Invaders
+    env = make('SpaceInvaders-v0', 'video')  # gravitar, breakout, MsPacman, Space Invaders
     play.play(env, zoom=4)