my update
@@ -1,40 +0,0 @@
-from common import BaseLearner, TrajectoryBuffer
-
-
-class AWRLearner(BaseLearner):
-    def __init__(self, *args, buffer_size=1e5, **kwargs):
-        super(AWRLearner, self).__init__(*args, **kwargs)
-        assert self.train_every[0] == 'episode', 'AWR only supports the episodic RL setting!'
-        self.buffer = TrajectoryBuffer(buffer_size)
-
-    def train(self):
-        # convert to trajectory format
-        pass
-
-
-import numpy as np
-from matplotlib import pyplot as plt
-import pandas as pd
-import seaborn as sns
-
-sns.set(font_scale=1.25, rc={'text.usetex': True})
-data = np.array([[689, 74], [71, 647]])
-cats = ['Mask', 'No Mask']
-df = pd.DataFrame(data/np.sum(data), index=cats, columns=cats)
-
-group_counts = ['{0:0.0f}'.format(value) for value in
-                data.flatten()]
-group_percentages = [f'{value*100:.2f}' + r'$\%$' for value in
-                     data.flatten()/np.sum(data)]
-
-labels = [f'{v1}\n{v2}' for v1, v2 in
-          zip(group_counts, group_percentages)]
-labels = np.asarray(labels).reshape(2, 2)
-
-with sns.axes_style("white"):
-    cmap = sns.diverging_palette(h_neg=100, h_pos=10, s=99, l=55, sep=3, as_cmap=True)
-    sns.heatmap(data, annot=labels, fmt='', cmap='Set2_r', square=True, cbar=False, xticklabels=cats, yticklabels=cats)
-    plt.title('Simple-CNN')
-    plt.ylabel('True label')
-    plt.xlabel('Predicted label')
-    plt.tight_layout()
-    plt.savefig('cnn.pdf', bbox_inches='tight')
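The deleted script above hard-codes the confusion matrix [[689, 74], [71, 647]] for the 'Simple-CNN' mask classifier, with true labels on the rows and predicted labels on the columns (as in the axis labels). As a quick sanity check on those numbers, here is a small, self-contained snippet that is not part of the repository; it derives the usual summary metrics from the same array, treating 'Mask' as the positive class:

import numpy as np

data = np.array([[689, 74], [71, 647]])   # rows: true (Mask, No Mask), cols: predicted
tp, fn = data[0, 0], data[0, 1]
fp, tn = data[1, 0], data[1, 1]

accuracy = (tp + tn) / data.sum()          # (689 + 647) / 1481 ~ 0.902
precision = tp / (tp + fp)                 # 689 / 760 ~ 0.907
recall = tp / (tp + fn)                    # 689 / 763 ~ 0.903
f1 = 2 * precision * recall / (precision + recall)
print(f'acc={accuracy:.3f}  prec={precision:.3f}  rec={recall:.3f}  f1={f1:.3f}')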
@@ -2,9 +2,12 @@ from typing import NamedTuple, Union
 from collections import deque, OrderedDict, defaultdict
 import numpy as np
 import random
+import pandas as pd
 import torch
 import torch.nn as nn
+
+from tqdm import trange


 class Experience(NamedTuple):
     # can be used for a single (s_t, a, r, s_{t+1}) tuple
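The Experience definition itself is cut off by the hunk boundary. Judging from the constructor call further down (observation, next_observation, action, reward, done, episode), the full NamedTuple presumably looks roughly like the sketch below, reusing the imports above; the field types and the episode default are assumptions, not taken from the repository:

class Experience(NamedTuple):
    # can be used for a single (s_t, a, r, s_{t+1}) tuple or a whole batch of them
    observation: np.ndarray
    next_observation: np.ndarray
    action: Union[int, np.ndarray]
    reward: Union[float, np.ndarray]
    done: Union[bool, np.ndarray]
    episode: int = -1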
@@ -57,6 +60,9 @@ class BaseLearner:
     def train(self):
         pass

+    def reward(self, r):
+        return r
+
     def learn(self, n_steps):
         train_type, train_freq = self.train_every
         while self.step < n_steps:
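The newly added reward(self, r) hook is an identity by default; the next hunk routes every environment reward through it before the transition is stored. A minimal sketch of how a subclass could use the hook for reward scaling and clipping — the subclass name and constants are illustrative only, and np refers to the numpy import above:

class ScaledRewardLearner(BaseLearner):
    def reward(self, r):
        # scale down and clip the raw environment reward before it is stored and learned from
        return float(np.clip(r / 10.0, -1.0, 1.0))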
@@ -70,7 +76,7 @@ class BaseLearner:
                 next_obs, reward, done, info = self.env.step(action if not len(action) == 1 else action[0])

                 experience = Experience(observation=obs, next_observation=next_obs,
-                                        action=action, reward=reward,
+                                        action=action, reward=self.reward(reward),
                                         done=done, episode=self.episode)  # do we really need to copy?
                 self.on_new_experience(experience)
                 # end of step routine
@@ -90,7 +96,7 @@ class BaseLearner:
             self.running_reward.append(total_reward)
             self.episode += 1
             try:
-                if self.step % 10 == 0:
+                if self.step % 100 == 0:
                     print(
                         f'Step: {self.step} ({(self.step / n_steps) * 100:.2f}%)\tRunning reward: {sum(list(self.running_reward)) / len(self.running_reward):.2f}\t'
                         f' eps: {self.eps:.4f}\tRunning loss: {sum(list(self.running_loss)) / len(self.running_loss):.4f}\tUpdates:{self.n_updates}')
@@ -98,6 +104,21 @@ class BaseLearner:
                 pass
         self.on_all_done()

+    def evaluate(self, n_episodes=100, render=False):
+        with torch.no_grad():
+            data = []
+            for eval_i in trange(n_episodes):
+                obs, done = self.env.reset(), False
+                while not done:
+                    action = self.get_action(obs)
+                    next_obs, reward, done, info = self.env.step(action if not len(action) == 1 else action[0])
+                    if render: self.env.render()
+                    obs = next_obs  # advance to the freshly observed state
+                    info.update({'reward': reward, 'eval_episode': eval_i})
+                    data.append(info)
+            return pd.DataFrame(data).fillna(0)
+


 class BaseBuffer:
     def __init__(self, size: int):
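The new evaluate() collects one row per environment step, containing the step's info dict plus the added 'reward' and 'eval_episode' keys, and returns it as a DataFrame. A short usage sketch; learner stands for any concrete BaseLearner subclass, and any further columns depend on what the environment puts into info:

df = learner.evaluate(n_episodes=10, render=False)
episode_returns = df.groupby('eval_episode')['reward'].sum()   # return per evaluation episode
print(f'mean return: {episode_returns.mean():.2f} +/- {episode_returns.std():.2f}')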
@@ -187,7 +208,7 @@ class BaseDDQN(BaseDQN):
 class BaseICM(nn.Module):
     def __init__(self, backbone_dims=[2*3*5*5, 64, 64], head_dims=[2*64, 64, 9]):
         super(BaseICM, self).__init__()
-        self.backbone = mlp_maker(backbone_dims, flatten=True)
+        self.backbone = mlp_maker(backbone_dims, flatten=True, activation_last='relu', activation='relu')
         self.icm = mlp_maker(head_dims)
         self.ce = nn.CrossEntropyLoss()

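BaseICM's forward pass is not part of this diff, but the shapes suggest an inverse-dynamics model: the backbone embeds each observation to 64 units, the head maps the concatenated pair (2*64) to 9 action logits, and the cross-entropy loss scores them against the taken action. A sketch of what such a forward method could look like under that assumption, matching the self.icm(s0, s1, a)['loss'] call used by MQICMLearner below; this is not the repository's implementation:

    def forward(self, s0, s1, a):
        # embed current and next observation with the shared backbone
        phi0, phi1 = self.backbone(s0), self.backbone(s1)
        # inverse model: predict which action led from s0 to s1
        logits = self.icm(torch.cat([phi0, phi1], dim=-1))
        loss = self.ce(logits, a.long())
        return dict(loss=loss, logits=logits)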
@@ -1,3 +1,4 @@
+import numpy as np
 import torch
 import torch.nn.functional as F
 from algorithms.q_learner import QLearner
@@ -53,19 +54,24 @@ class MQLearner(QLearner):
         self._backprop_loss(loss)

 from tqdm import trange
+from collections import deque
 class MQICMLearner(MQLearner):
     def __init__(self, *args, icm, **kwargs):
         super(MQICMLearner, self).__init__(*args, **kwargs)
         self.icm = icm
-        self.icm_optimizer = torch.optim.Adam(self.icm.parameters())
+        self.icm_optimizer = torch.optim.AdamW(self.icm.parameters())
+        self.normalize_reward = deque(maxlen=1000)

     def on_all_done(self):
-        for b in trange(50000):
+        from collections import deque
+        losses = deque(maxlen=100)
+        for b in trange(10000):
             batch = self.buffer.sample(128, 0)
             s0, s1, a = batch.observation, batch.next_observation, batch.action
             loss = self.icm(s0, s1, a.squeeze())['loss']
             self.icm_optimizer.zero_grad()
             loss.backward()
             self.icm_optimizer.step()
+            losses.append(loss.item())
             if b % 100 == 0:
-                print(loss.item())
+                print(np.mean(losses))
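For context, a rough sketch of how the pieces above are presumably wired together; the constructor arguments besides icm come from MQLearner/QLearner and are not shown in this diff, so they are placeholders here:

icm = BaseICM(backbone_dims=[2*3*5*5, 64, 64], head_dims=[2*64, 64, 9])
learner = MQICMLearner(..., icm=icm)   # remaining QLearner arguments omitted, not shown in this diff
learner.learn(n_steps=100_000)         # on_all_done() then fits the ICM on the collected buffer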
@@ -1,6 +1,7 @@
 from typing import Union
 import torch
 import numpy as np
+import pandas as pd
 from algorithms.q_learner import QLearner


@@ -37,4 +38,18 @@ class VDNLearner(QLearner):
             target_q_raw += next_q_values_raw
         target_q = experience.reward + (1 - experience.done) * self.gamma * target_q_raw
         loss = torch.mean(self.reg_weight * pred_q + torch.pow(pred_q - target_q, 2))
         self._backprop_loss(loss)
+
+    def evaluate(self, n_episodes=100, render=False):
+        with torch.no_grad():
+            data = []
+            for eval_i in range(n_episodes):
+                obs, done = self.env.reset(), False
+                while not done:
+                    action = self.get_action(obs)
+                    next_obs, reward, done, info = self.env.step(action)
+                    if render: self.env.render()
+                    obs = next_obs  # advance to the freshly observed state
+                    info.update({'reward': reward, 'eval_episode': eval_i})
+                    data.append(info)
+            return pd.DataFrame(data).fillna(0)
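The target above appears to be the usual value-decomposition target: per-agent next-state values are accumulated into target_q_raw and bootstrapped with (1 - done) * gamma, and the loss adds a reg_weight * pred_q term (a penalty proportional to the predicted joint value) on top of the squared TD error. A toy, self-contained recomputation of the same quantities for a single transition with two agents and three actions; all numbers are illustrative, and taking the greedy max per agent is an assumption about what next_q_values_raw holds:

import torch

gamma, reg_weight = 0.99, 0.1
reward, done = torch.tensor([1.0]), torch.tensor([0.0])
pred_q = torch.tensor([2.5])                               # summed per-agent Q(s, a_i)

next_q_values = [torch.tensor([[0.2, 0.9, 0.1]]),          # agent 0: Q(s', .)
                 torch.tensor([[0.4, 0.3, 0.8]])]          # agent 1: Q(s', .)
target_q_raw = torch.zeros(1)
for q_agent in next_q_values:
    next_q_values_raw = q_agent.max(dim=-1).values         # per-agent greedy value (assumed)
    target_q_raw += next_q_values_raw                      # mirrors the accumulation above

target_q = reward + (1 - done) * gamma * target_q_raw      # 1 + 0.99 * (0.9 + 0.8) = 2.683
loss = torch.mean(reg_weight * pred_q + torch.pow(pred_q - target_q, 2))
print(target_q.item(), loss.item())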