sparse net training

This commit is contained in:
Steffen Illium
2022-02-26 16:01:12 +01:00
parent 9d8496a725
commit c0db8e19a3
3 changed files with 116 additions and 92 deletions

View File

@@ -287,17 +287,17 @@ def flat_for_store(parameters):
if __name__ == '__main__': if __name__ == '__main__':
self_train = True self_train = True
training = False training = True
train_to_id_first = True train_to_id_first = False
train_to_task_first = False train_to_task_first = False
sequential_task_train = True sequential_task_train = True
force_st_for_n_from_last_epochs = 5 force_st_for_n_from_last_epochs = 5
n_st_per_batch = 3 n_st_per_batch = 3
activation = None # nn.ReLU() activation = None # nn.ReLU()
use_sparse_network = True use_sparse_network = False
for weight_hidden_size in [3, 4, 5, 6]: for weight_hidden_size in [8]:
tsk_threshold = 0.85 tsk_threshold = 0.85
weight_hidden_size = weight_hidden_size weight_hidden_size = weight_hidden_size
@@ -353,15 +353,16 @@ if __name__ == '__main__':
meta_weight_count = sum(p.numel() for p in next(dense_metanet.particles).parameters()) meta_weight_count = sum(p.numel() for p in next(dense_metanet.particles).parameters())
loss_fn = nn.CrossEntropyLoss() loss_fn = nn.CrossEntropyLoss()
dense_optimizer = torch.optim.SGD(dense_metanet.parameters(), lr=0.008, momentum=0.9) dense_optimizer = torch.optim.SGD(dense_metanet.parameters(), lr=0.004, momentum=0.9)
sparse_optimizer = torch.optim.SGD( sparse_optimizer = torch.optim.SGD(
sparse_metanet.parameters(), lr=0.008, momentum=0.9 sparse_metanet.parameters(), lr=0.004, momentum=0.9
) if use_sparse_network else dense_optimizer ) if use_sparse_network else dense_optimizer
train_store = new_storage_df('train', None) train_store = new_storage_df('train', None)
weight_store = new_storage_df('weights', meta_weight_count) weight_store = new_storage_df('weights', meta_weight_count)
init_tsk = train_to_task_first init_tsk = train_to_task_first
for epoch in tqdm(range(EPOCH), desc='MetaNet Train - Epochs'): for epoch in tqdm(range(EPOCH), desc=f'Train - Epochs'):
tqdm.write(f'{seed}: {exp_path}')
is_validation_epoch = epoch % VALIDATION_FRQ == 0 if not debug else True is_validation_epoch = epoch % VALIDATION_FRQ == 0 if not debug else True
is_self_train_epoch = epoch % SELF_TRAIN_FRQ == 0 if not debug else True is_self_train_epoch = epoch % SELF_TRAIN_FRQ == 0 if not debug else True
sparse_metanet = sparse_metanet.train() sparse_metanet = sparse_metanet.train()

View File

@@ -109,37 +109,38 @@ class Net(nn.Module):
if self._weight_pos_enc_and_mask is None: if self._weight_pos_enc_and_mask is None:
d = next(self.parameters()).device d = next(self.parameters()).device
weight_matrix = [] weight_matrix = []
for layer_id, layer in enumerate(self.layers): with torch.no_grad():
x = next(layer.parameters()) for layer_id, layer in enumerate(self.layers):
weight_matrix.append( x = next(layer.parameters())
torch.cat( weight_matrix.append(
( torch.cat(
# Those are the weights (
torch.full((x.numel(), 1), 0, device=d), # Those are the weights
# Layer enumeration torch.full((x.numel(), 1), 0, device=d),
torch.full((x.numel(), 1), layer_id, device=d), # Layer enumeration
# Cell Enumeration torch.full((x.numel(), 1), layer_id, device=d),
torch.arange(layer.out_features, device=d).repeat_interleave(layer.in_features).view(-1, 1), # Cell Enumeration
# Weight Enumeration within the Cells torch.arange(layer.out_features, device=d).repeat_interleave(layer.in_features).view(-1, 1),
torch.arange(layer.in_features, device=d).view(-1, 1).repeat(layer.out_features, 1), # Weight Enumeration within the Cells
*(torch.full((x.numel(), 1), 0, device=d) for _ in range(self.input_size-4)) torch.arange(layer.in_features, device=d).view(-1, 1).repeat(layer.out_features, 1),
), dim=1) *(torch.full((x.numel(), 1), 0, device=d) for _ in range(self.input_size-4))
) ), dim=1)
# Finalize )
weight_matrix = torch.cat(weight_matrix).float() # Finalize
weight_matrix = torch.cat(weight_matrix).float()
# Normalize 1,2,3 column of dim 1 # Normalize 1,2,3 column of dim 1
last_pos_idx = self.input_size - 4 last_pos_idx = self.input_size - 4
max_per_col, _ = weight_matrix[:, 1:-last_pos_idx].max(keepdim=True, dim=0) max_per_col, _ = weight_matrix[:, 1:-last_pos_idx].max(keepdim=True, dim=0)
weight_matrix[:, 1:-last_pos_idx] = (weight_matrix[:, 1:-last_pos_idx] / max_per_col) + 1e-8 weight_matrix[:, 1:-last_pos_idx] = (weight_matrix[:, 1:-last_pos_idx] / max_per_col) + 1e-8
# computations # computations
# create a mask where pos is 0 if it is to be replaced # create a mask where pos is 0 if it is to be replaced
mask = torch.ones_like(weight_matrix) mask = torch.ones_like(weight_matrix)
mask[:, 0] = 0 mask[:, 0] = 0
self._weight_pos_enc_and_mask = weight_matrix, mask self._weight_pos_enc_and_mask = weight_matrix, mask
return tuple(x.clone() for x in self._weight_pos_enc_and_mask) return self._weight_pos_enc_and_mask
def forward(self, x): def forward(self, x):
for layer in self.layers: for layer in self.layers:
@@ -328,20 +329,21 @@ class MetaCell(nn.Module):
def _bed_mask(self): def _bed_mask(self):
if self.__bed_mask is None: if self.__bed_mask is None:
d = next(self.parameters()).device d = next(self.parameters()).device
embedding = torch.zeros(1, self.weight_interface, device=d) embedding = torch.zeros(1, self.weight_interface, device=d, requires_grad=False)
# computations # computations
# create a mask where pos is 0 if it is to be replaced # create a mask where pos is 0 if it is to be replaced
mask = torch.ones_like(embedding) mask = torch.ones_like(embedding, requires_grad=False, device=d)
mask[:, -1] = 0 mask[:, -1] = 0
self.__bed_mask = embedding, mask self.__bed_mask = embedding, mask
return tuple(x.clone() for x in self.__bed_mask) return self.__bed_mask
def forward(self, x): def forward(self, x):
embedding, mask = self._bed_mask embedding, mask = self._bed_mask
expanded_mask = mask.expand(*x.shape, embedding.shape[-1]) expanded_mask = mask.expand(*x.shape, embedding.shape[-1])
embedding = embedding.repeat(*x.shape, 1) embedding = embedding.expand(*x.shape, embedding.shape[-1])
# embedding = embedding.repeat(*x.shape, 1)
# Row-wise # Row-wise
# xs = x.unsqueeze(-1).expand(-1, -1, embedding.shape[-1]).swapdims(0, 1) # xs = x.unsqueeze(-1).expand(-1, -1, embedding.shape[-1]).swapdims(0, 1)
@@ -444,7 +446,7 @@ class MetaNet(nn.Module):
residual = None residual = None
for idx, meta_layer in enumerate(self._meta_layer_list, start=1): for idx, meta_layer in enumerate(self._meta_layer_list, start=1):
if idx % 2 == 1 and self.residual_skip: if idx % 2 == 1 and self.residual_skip:
residual = tensor.clone() residual = tensor
tensor = meta_layer(tensor) tensor = meta_layer(tensor)
if idx % 2 == 0 and self.residual_skip: if idx % 2 == 0 and self.residual_skip:
tensor = tensor + residual tensor = tensor + residual
@@ -509,7 +511,7 @@ class MetaNetCompareBaseline(nn.Module):
for idx, meta_layer in enumerate(self._meta_layer_list, start=1): for idx, meta_layer in enumerate(self._meta_layer_list, start=1):
tensor = meta_layer(tensor) tensor = meta_layer(tensor)
if idx % 2 == 1 and self.residual_skip: if idx % 2 == 1 and self.residual_skip:
residual = tensor.clone() residual = tensor
if idx % 2 == 0 and self.residual_skip: if idx % 2 == 0 and self.residual_skip:
tensor = tensor + residual tensor = tensor + residual
if self.activation: if self.activation:

View File

@@ -1,5 +1,6 @@
from collections import defaultdict from collections import defaultdict
import pandas as pd
from torch import nn from torch import nn
import functionalities_test import functionalities_test
@@ -42,9 +43,10 @@ class SparseLayer(nn.Module):
self.weights.append(weights) self.weights.append(weights)
def coo_sparse_layer(self, layer_id): def coo_sparse_layer(self, layer_id):
layer_shape = self.dummy_net_shapes[layer_id] with torch.no_grad():
sparse_diagonal = np.eye(self.nr_nets).repeat(layer_shape[0], axis=-2).repeat(layer_shape[1], axis=-1) layer_shape = self.dummy_net_shapes[layer_id]
indices = torch.Tensor(np.argwhere(sparse_diagonal == 1).T) sparse_diagonal = np.eye(self.nr_nets).repeat(layer_shape[0], axis=-2).repeat(layer_shape[1], axis=-1)
indices = torch.Tensor(np.argwhere(sparse_diagonal == 1).T, )
values = torch.nn.Parameter(torch.randn((np.prod((*layer_shape, self.nr_nets)).item())), requires_grad=True) values = torch.nn.Parameter(torch.randn((np.prod((*layer_shape, self.nr_nets)).item())), requires_grad=True)
return indices, values, sparse_diagonal.shape return indices, values, sparse_diagonal.shape
@@ -54,23 +56,24 @@ class SparseLayer(nn.Module):
# i.e., first interface*hidden weights of layer1, first hidden*hidden weights of layer2 # i.e., first interface*hidden weights of layer1, first hidden*hidden weights of layer2
# and first hidden*out weights of layer3 = first net # and first hidden*out weights of layer3 = first net
# [nr_layers*[nr_net*nr_weights_layer_i]] # [nr_layers*[nr_net*nr_weights_layer_i]]
weights = [layer.view(-1, int(len(layer)/self.nr_nets)) for layer in self.weights] with torch.no_grad():
# [nr_net*[nr_weights]] weights = [layer.view(-1, int(len(layer)/self.nr_nets)).detach() for layer in self.weights]
weights_per_net = [torch.cat([layer[i] for layer in weights]).view(-1, 1) for i in range(self.nr_nets)] # [nr_net*[nr_weights]]
# (16, 25) weights_per_net = [torch.cat([layer[i] for layer in weights]).view(-1, 1) for i in range(self.nr_nets)]
# (16, 25)
encoding_matrix, mask = self.dummy_net_weight_pos_enc encoding_matrix, mask = self.dummy_net_weight_pos_enc
weight_device = weights_per_net[0].device weight_device = weights_per_net[0].device
if weight_device != encoding_matrix.device or weight_device != mask.device: if weight_device != encoding_matrix.device or weight_device != mask.device:
encoding_matrix, mask = encoding_matrix.to(weight_device), mask.to(weight_device) encoding_matrix, mask = encoding_matrix.to(weight_device), mask.to(weight_device)
self.dummy_net_weight_pos_enc = encoding_matrix, mask self.dummy_net_weight_pos_enc = encoding_matrix, mask
inputs = torch.hstack( inputs = torch.hstack(
[encoding_matrix * mask + weights_per_net[i].expand(-1, encoding_matrix.shape[-1]) * (1 - mask) [encoding_matrix * mask + weights_per_net[i].expand(-1, encoding_matrix.shape[-1]) * (1 - mask)
for i in range(self.nr_nets)] for i in range(self.nr_nets)]
) )
targets = torch.hstack(weights_per_net) targets = torch.hstack(weights_per_net)
return inputs.T.detach(), targets.T.detach() return inputs.T, targets.T
@property @property
def particles(self): def particles(self):
@@ -119,29 +122,44 @@ class SparseLayer(nn.Module):
def test_sparse_layer(): def test_sparse_layer():
net = SparseLayer(500) #50 parallel nets net = SparseLayer(1000)
loss_fn = torch.nn.MSELoss() loss_fn = torch.nn.MSELoss(reduction='mean')
optimizer = torch.optim.SGD(net.parameters(), lr=0.004, momentum=0.9) optimizer = torch.optim.SGD(net.parameters(), lr=0.008, momentum=0.9)
# optimizer = torch.optim.SGD([layer.coalesce().values() for layer in net.sparse_sub_layer], lr=0.004, momentum=0.9) # optimizer = torch.optim.SGD([layer.coalesce().values() for layer in net.sparse_sub_layer], lr=0.004, momentum=0.9)
df = pd.DataFrame(columns=['Epoch', 'Func Type', 'Count'])
for train_iteration in trange(1000): for train_iteration in trange(20000):
optimizer.zero_grad() optimizer.zero_grad()
X, Y = net.get_self_train_inputs_and_targets() X, Y = net.get_self_train_inputs_and_targets()
out = net(X) output = net(X)
loss = loss_fn(out, Y) loss = loss_fn(output, Y) * 100
# print("X:", X.shape, "Y:", Y.shape) # loss = sum([loss_fn(out, target) for out, target in zip(output, Y)]) / len(output) * 10
# print("OUT", out.shape)
# print("LOSS", loss.item())
loss.backward() loss.backward()
optimizer.step() optimizer.step()
if train_iteration % 500 == 0:
counter = defaultdict(lambda: 0)
id_functions = functionalities_test.test_for_fixpoints(counter, list(net.particles))
counter = dict(counter)
tqdm.write(f"identity_fn after {train_iteration + 1} self-train epochs: {counter}")
for key, value in counter.items():
df.loc[df.shape[0]] = (train_iteration, key, value)
counter = defaultdict(lambda: 0) counter = defaultdict(lambda: 0)
id_functions = functionalities_test.test_for_fixpoints(counter, list(net.particles)) id_functions = functionalities_test.test_for_fixpoints(counter, list(net.particles))
counter = dict(counter) counter = dict(counter)
print(f"identity_fn after {train_iteration + 1} self-train epochs: {counter}") tqdm.write(f"identity_fn after {train_iteration + 1} self-train epochs: {counter}")
for key, value in counter.items():
df.loc[df.shape[0]] = (train_iteration, key, value)
df.to_csv('counter.csv', mode='w')
import seaborn as sns
import matplotlib.pyplot as plt
c = pd.read_csv('counter.csv', index_col=0)
sns.lineplot(data=c, x='Epoch', y='Count', hue='Func Type')
plt.savefig('counter.png', dpi=300)
def embed_batch(x, repeat_dim): def embed_batch(x, repeat_dim):
@@ -241,12 +259,15 @@ class SparseNetwork(nn.Module):
def combined_self_train(self, optimizer, reduction='mean'): def combined_self_train(self, optimizer, reduction='mean'):
losses = [] losses = []
loss_fn = nn.MSELoss(reduction=reduction)
for layer in self.sparselayers: for layer in self.sparselayers:
optimizer.zero_grad() optimizer.zero_grad()
x, target_data = layer.get_self_train_inputs_and_targets() x, target_data = layer.get_self_train_inputs_and_targets()
output = layer(x) output = layer(x)
# loss = sum([loss_fn(out, target) for out, target in zip(output, target_data)]) / len(output)
loss = loss_fn(output, target_data) * 100
loss = F.mse_loss(output, target_data, reduction=reduction)
losses.append(loss.detach()) losses.append(loss.detach())
loss.backward() loss.backward()
optimizer.step() optimizer.step()
@@ -279,33 +300,33 @@ def test_sparse_net():
def test_sparse_net_sef_train(): def test_sparse_net_sef_train():
net = SparseNetwork(30, 5, 6, 10) net = SparseNetwork(5, 5, 6, 10)
epochs = 1000 epochs = 10000
if True: df = pd.DataFrame(columns=['Epoch', 'Func Type', 'Count'])
optimizer = torch.optim.SGD(net.parameters(), lr=0.004, momentum=0.9) optimizer = torch.optim.SGD(net.parameters(), lr=0.004, momentum=0.9)
for _ in trange(epochs): for epoch in trange(epochs):
_ = net.combined_self_train(optimizer) _ = net.combined_self_train(optimizer)
else: if epoch % 500 == 0:
optimizer_dict = { counter = defaultdict(lambda: 0)
key: torch.optim.SGD(layer.parameters(), lr=0.004, momentum=0.9) for key, layer in enumerate(net.sparselayers) id_functions = functionalities_test.test_for_fixpoints(counter, list(net.particles))
} counter = dict(counter)
loss_fn = torch.nn.MSELoss(reduction="mean") tqdm.write(f"identity_fn after {epoch + 1} self-train epochs: {counter}")
for key, value in counter.items():
df.loc[df.shape[0]] = (epoch, key, value)
for layer, optim in zip(net.sparselayers, optimizer_dict.values()):
for _ in trange(epochs):
optim.zero_grad()
x, target_data = layer.get_self_train_inputs_and_targets()
output = layer(x)
loss = loss_fn(output, target_data)
loss.backward()
optim.step()
# is each of the networks self-replicating?
counter = defaultdict(lambda: 0) counter = defaultdict(lambda: 0)
id_functions = functionalities_test.test_for_fixpoints(counter, list(net.particles)) id_functions = functionalities_test.test_for_fixpoints(counter, list(net.particles))
counter = dict(counter) counter = dict(counter)
print(f"identity_fn after {epochs} self-train epochs: {counter}") tqdm.write(f"identity_fn after {epochs} self-train epochs: {counter}")
for key, value in counter.items():
df.loc[df.shape[0]] = (epoch, key, value)
df.to_csv('counter.csv', mode='w')
import seaborn as sns
import matplotlib.pyplot as plt
c = pd.read_csv('counter.csv', index_col=0)
sns.lineplot(data=c, x='Epoch', y='Count', hue='Func Type')
plt.savefig('counter.png', dpi=300)
def test_manual_for_loop(): def test_manual_for_loop():