my update

romue
2021-11-11 10:59:13 +01:00
parent 6287380f60
commit ea4582a59e
10 changed files with 88 additions and 308 deletions


@@ -1,40 +0,0 @@
from common import BaseLearner, TrajectoryBuffer
class AWRLearner(BaseLearner):
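# advantage-weighted regression (AWR) learner skeleton; train() below is not implemented yet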
def __init__(self, *args, buffer_size=1e5, **kwargs):
super(AWRLearner, self).__init__(*args, **kwargs)
assert self.train_every[0] == 'episode', 'AWR only supports the episodic RL setting!'
self.buffer = TrajectoryBuffer(buffer_size)
def train(self):
# TODO: convert the buffer contents to trajectory format
pass
import numpy as np
from matplotlib import pyplot as plt
import pandas as pd
import seaborn as sns
sns.set(font_scale=1.25, rc={'text.usetex': True})
data = np.array([[689, 74], [71, 647]])
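# confusion-matrix counts: rows are true labels, columns are predicted labels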
cats = ['Mask', 'No Mask']
df = pd.DataFrame(data/np.sum(data), index=cats, columns=cats)
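# df holds the normalized proportions; the heatmap below is drawn from the raw counts in data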
group_counts = ['{0:0.0f}'.format(value) for value in
data.flatten()]
group_percentages = [f'{value*100:.2f}' + r'$\%$' for value in
data.flatten()/np.sum(data)]
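# combine absolute counts and percentages into one annotation label per cell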
labels = [f'{v1}\n{v2}' for v1, v2 in
zip(group_counts,group_percentages)]
labels = np.asarray(labels).reshape(2,2)
with sns.axes_style("white"):
cmap = sns.diverging_palette(h_neg=100, h_pos=10, s=99, l=55, sep=3, as_cmap=True)
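# note: this diverging palette is defined but the heatmap call below uses the 'Set2_r' colormap instead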
sns.heatmap(data, annot=labels, fmt='', cmap='Set2_r', square=True, cbar=False, xticklabels=cats,yticklabels=cats)
plt.title('Simple-CNN')
plt.ylabel('True label')
plt.xlabel('Predicted label')
plt.tight_layout()
plt.savefig('cnn.pdf', bbox_inches='tight')


@@ -2,9 +2,12 @@ from typing import NamedTuple, Union
from collections import deque, OrderedDict, defaultdict
import numpy as np
import random
import pandas as pd
import torch
import torch.nn as nn
from tqdm import trange
class Experience(NamedTuple):
# can be used for a single (s_t, a, r, s_{t+1}) tuple
@@ -57,6 +60,9 @@ class BaseLearner:
def train(self):
pass
def reward(self, r):
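# identity by default; subclasses can override this hook to shape rewards before they are stored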
return r
def learn(self, n_steps):
train_type, train_freq = self.train_every
while self.step < n_steps:
@@ -70,7 +76,7 @@ class BaseLearner:
next_obs, reward, done, info = self.env.step(action if not len(action) == 1 else action[0])
experience = Experience(observation=obs, next_observation=next_obs,
action=action, reward=reward,
action=action, reward=self.reward(reward),
done=done, episode=self.episode)  # observations are stored by reference, not copied
self.on_new_experience(experience)
# end of step routine
@@ -90,7 +96,7 @@ class BaseLearner:
self.running_reward.append(total_reward)
self.episode += 1
try:
if self.step % 10 == 0:
if self.step % 100 == 0:
print(
f'Step: {self.step} ({(self.step / n_steps) * 100:.2f}%)\tRunning reward: {sum(list(self.running_reward)) / len(self.running_reward):.2f}\t'
f' eps: {self.eps:.4f}\tRunning loss: {sum(list(self.running_loss)) / len(self.running_loss):.4f}\tUpdates:{self.n_updates}')
@@ -98,6 +104,21 @@ class BaseLearner:
pass
self.on_all_done()
def evaluate(self, n_episodes=100, render=False):
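# roll out n_episodes without gradient tracking and collect the per-step info dicts into a DataFrame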
with torch.no_grad():
data = []
for eval_i in trange(n_episodes):
obs, done = self.env.reset(), False
while not done:
action = self.get_action(obs)
next_obs, reward, done, info = self.env.step(action if not len(action) == 1 else action[0])
if render: self.env.render()
obs = next_obs  # advance to the next observation
info.update({'reward': reward, 'eval_episode': eval_i})
data.append(info)
return pd.DataFrame(data).fillna(0)
class BaseBuffer:
def __init__(self, size: int):
@@ -187,7 +208,7 @@ class BaseDDQN(BaseDQN):
class BaseICM(nn.Module):
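# intrinsic curiosity module: the backbone encodes an observation pair and the head predicts the taken action (inverse-model style), trained with cross-entropy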
def __init__(self, backbone_dims=[2*3*5*5, 64, 64], head_dims=[2*64, 64, 9]):
super(BaseICM, self).__init__()
self.backbone = mlp_maker(backbone_dims, flatten=True)
self.backbone = mlp_maker(backbone_dims, flatten=True, activation_last='relu', activation='relu')
self.icm = mlp_maker(head_dims)
self.ce = nn.CrossEntropyLoss()


@@ -1,3 +1,4 @@
import numpy as np
import torch
import torch.nn.functional as F
from algorithms.q_learner import QLearner
@@ -53,19 +54,24 @@ class MQLearner(QLearner):
self._backprop_loss(loss)
from tqdm import trange
from collections import deque
class MQICMLearner(MQLearner):
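# MQLearner (presumably Munchausen Q-learning) combined with an ICM curiosity signal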
def __init__(self, *args, icm, **kwargs):
super(MQICMLearner, self).__init__(*args, **kwargs)
self.icm = icm
self.icm_optimizer = torch.optim.Adam(self.icm.parameters())
self.icm_optimizer = torch.optim.AdamW(self.icm.parameters())
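# running window of recent rewards, presumably used to normalize the curiosity bonus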
self.normalize_reward = deque(maxlen=1000)
def on_all_done(self):
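# once the main training loop finishes, fit the ICM on transitions sampled from the replay buffer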
for b in trange(50000):
from collections import deque
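# keep the last 100 ICM losses for smoothed logging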
losses = deque(maxlen=100)
for b in trange(10000):
batch = self.buffer.sample(128, 0)
s0, s1, a = batch.observation, batch.next_observation, batch.action
loss = self.icm(s0, s1, a.squeeze())['loss']
self.icm_optimizer.zero_grad()
loss.backward()
self.icm_optimizer.step()
losses.append(loss.item())
if b % 100 == 0:
print(loss.item())
print(np.mean(losses))


@@ -1,6 +1,7 @@
from typing import Union
import torch
import numpy as np
import pandas as pd
from algorithms.q_learner import QLearner
@@ -37,4 +38,18 @@ class VDNLearner(QLearner):
target_q_raw += next_q_values_raw
target_q = experience.reward + (1 - experience.done) * self.gamma * target_q_raw
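# squared TD error plus a small reg_weight penalty on the predicted joint Q-value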
loss = torch.mean(self.reg_weight * pred_q + torch.pow(pred_q - target_q, 2))
self._backprop_loss(loss)
def evaluate(self, n_episodes=100, render=False):
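# same evaluation loop as the base learner, but the joint action is passed to env.step unchanged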
with torch.no_grad():
data = []
for eval_i in range(n_episodes):
obs, done = self.env.reset(), False
while not done:
action = self.get_action(obs)
next_obs, reward, done, info = self.env.step(action)
if render: self.env.render()
obs = next_obs  # advance to the next observation
info.update({'reward': reward, 'eval_episode': eval_i})
data.append(info)
return pd.DataFrame(data).fillna(0)