added individual eps-greedy for VDN
@@ -6,21 +6,6 @@ import torch
 import torch.nn as nn
-
-
-class BaseLearner:
-    def __init__(self, env, n_agents, lr):
-        self.env = env
-        self.n_agents = n_agents
-        self.lr = lr
-        self.device = 'cpu'
-
-    def to(self, device):
-        self.device = device
-        for attr, value in self.__dict__.items():
-            if isinstance(value, nn.Module):
-                value = value.to(self.device)
-        return self
 
 
 class Experience(NamedTuple):
     # can be used for a single (s_t, a, r, s_{t+1}) tuple
     # or for a batch of tuples
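Since Experience serves double duty as a single transition and a batch, a minimal sketch of both uses may help; the field names are those used by the learn() loop in the next hunk, while the shapes and values are purely illustrative:

import numpy as np

# a single (s_t, a, r, s_{t+1}) transition
single = Experience(observation=np.zeros((3, 5, 5)), next_observation=np.zeros((3, 5, 5)),
                    action=np.array([2]), reward=1.0, done=False, episode=0)

# a batch of 32 stacked transitions: every field carries a leading batch axis,
# and episode falls back to its default of -1
batch = Experience(observation=np.zeros((32, 3, 5, 5)), next_observation=np.zeros((32, 3, 5, 5)),
                   action=np.zeros((32, 1), dtype=np.int64), reward=np.zeros(32),
                   done=np.zeros(32, dtype=bool))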
@@ -29,6 +14,84 @@ class Experience(NamedTuple):
     action: np.ndarray
     reward: Union[float, np.ndarray]
     done: Union[bool, np.ndarray]
+    episode: int = -1
+
+
+class BaseLearner:
+    def __init__(self, env, n_agents=1, train_every=('step', 4), n_grad_steps=1):
+        assert train_every[0] in ['step', 'episode'], 'train_every[0] must be one of ["step", "episode"]'
+        self.env = env
+        self.n_agents = n_agents
+        self.n_grad_steps = n_grad_steps
+        self.train_every = train_every
+        self.device = 'cpu'
+        self.n_updates = 0
+        self.step = 0
+        self.episode_step = 0
+        self.episode = 0
+        self.running_reward = deque(maxlen=5)
+
+    def to(self, device):
+        self.device = device
+        for attr, value in self.__dict__.items():
+            if isinstance(value, nn.Module):
+                value = value.to(self.device)
+        return self
+
+    def get_action(self, obs) -> Union[int, np.ndarray]:
+        pass
+
+    def on_new_experience(self, experience):
+        pass
+
+    def on_step_end(self, n_steps):
+        pass
+
+    def on_episode_end(self, n_steps):
+        pass
+
+    def train(self):
+        pass
+
+    def learn(self, n_steps):
+        train_type, train_freq = self.train_every
+        while self.step < n_steps:
+            obs, done = self.env.reset(), False
+            total_reward = 0
+            self.episode_step = 0
+            while not done:
+
+                action = self.get_action(obs)
+
+                next_obs, reward, done, info = self.env.step(action if not len(action) == 1 else action[0])
+
+                experience = Experience(observation=obs, next_observation=next_obs,
+                                        action=action, reward=reward,
+                                        done=done, episode=self.episode)  # do we really need to copy?
+                self.on_new_experience(experience)
+                # end of step routine
+                obs = next_obs
+                total_reward += reward
+                self.step += 1
+                self.episode_step += 1
+                self.on_step_end(n_steps)
+                if train_type == 'step' and (self.step % train_freq == 0):
+                    self.train()
+                    self.n_updates += 1
+            self.on_episode_end(n_steps)
+            if train_type == 'episode' and (self.episode % train_freq == 0):
+                self.train()
+                self.n_updates += 1
+
+            self.running_reward.append(total_reward)
+            self.episode += 1
+            try:
+                if self.step % 10 == 0:
+                    print(
+                        f'Step: {self.step} ({(self.step / n_steps) * 100:.2f}%)\tRunning reward: {sum(list(self.running_reward)) / len(self.running_reward):.2f}\t'
+                        f' eps: {self.eps:.4f}\tRunning loss: {sum(list(self.running_loss)) / len(self.running_loss):.4f}\tUpdates:{self.n_updates}')
+            except Exception as e:
+                pass
 
 
 class BaseBuffer:
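Note that the logging line reads self.eps and self.running_loss, which BaseLearner never defines; the try/except swallows the AttributeError until a subclass provides them. The individual eps-greedy from the commit title would plug into this API through get_action() and on_step_end(). The sketch below is an assumption of how that could look for VDN, not the repository's actual learner: the VDNLearner name, the per-agent q_nets, the gym-style action_space.sample(), the per-agent obs[i] indexing, and the decay constants are all illustrative.

import numpy as np
import torch

class VDNLearner(BaseLearner):
    def __init__(self, env, n_agents, eps=1.0, eps_min=0.05, eps_decay=0.999):
        super().__init__(env, n_agents=n_agents)
        # one exploration rate per agent, decayed independently
        self.eps = np.full(n_agents, eps)
        self.eps_min, self.eps_decay = eps_min, eps_decay

    def get_action(self, obs) -> np.ndarray:
        actions = np.empty(self.n_agents, dtype=int)
        for i in range(self.n_agents):
            if np.random.rand() < self.eps[i]:
                actions[i] = self.env.action_space.sample()  # explore
            else:
                # greedy action from agent i's own utility network (hypothetical attribute)
                q_values = self.q_nets[i](torch.from_numpy(obs[i]).float().unsqueeze(0))
                actions[i] = q_values.argmax(dim=-1).item()   # exploit
        return actions

    def on_step_end(self, n_steps):
        # each agent's epsilon anneals on its own schedule
        self.eps = np.maximum(self.eps * self.eps_decay, self.eps_min)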
@@ -60,7 +123,7 @@ def soft_update(local_model, target_model, tau):
 
 
 def mlp_maker(dims, flatten=False, activation='elu', activation_last='identity'):
-    activations = {'elu': nn.ELU, 'relu': nn.ReLU,
+    activations = {'elu': nn.ELU, 'relu': nn.ReLU, 'sigmoid': nn.Sigmoid,
                    'leaky_relu': nn.LeakyReLU, 'tanh': nn.Tanh,
                    'gelu': nn.GELU, 'identity': nn.Identity}
     layers = [('Flatten', nn.Flatten())] if flatten else []
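mlp_maker turns a list of layer widths into a named nn.Sequential, optionally prefixed with a Flatten. The hidden-layer loop is elided by the hunk, so the following usage is a sketch under the visible signature and return statement, with dims borrowed from BaseDQN's default below:

import torch

# 3*5*5 observation -> 64 -> 64 -> 9 action values
net = mlp_maker([3 * 5 * 5, 64, 64, 9], flatten=True)
q_values = net(torch.zeros(1, 3, 5, 5))  # Flatten -> Linear stack -> shape (1, 9)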
@@ -71,7 +134,6 @@ def mlp_maker(dims, flatten=False, activation='elu', activation_last='identity')
     return nn.Sequential(OrderedDict(layers))
 
 
-
 class BaseDQN(nn.Module):
     def __init__(self, dims=[3*5*5, 64, 64, 9]):
         super(BaseDQN, self).__init__()
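Only the signature of soft_update survives in the hunk header above; its body is not shown. For reference, the standard Polyak update that signature suggests, theta_target <- tau * theta_local + (1 - tau) * theta_target, would look like this (a sketch, not necessarily the repository's implementation):

def soft_update(local_model, target_model, tau):
    # Polyak averaging: blend the online weights into the target network
    for local_param, target_param in zip(local_model.parameters(),
                                         target_model.parameters()):
        target_param.data.copy_(tau * local_param.data + (1 - tau) * target_param.data)

# e.g. nudging a target network toward the online network after each gradient step
online, target = mlp_maker([75, 64, 9]), mlp_maker([75, 64, 9])
soft_update(online, target, tau=0.005)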