sparse net training
This commit is contained in:
@@ -287,17 +287,17 @@ def flat_for_store(parameters):
|
|||||||
if __name__ == '__main__':
|
if __name__ == '__main__':
|
||||||
|
|
||||||
self_train = True
|
self_train = True
|
||||||
training = False
|
training = True
|
||||||
train_to_id_first = True
|
train_to_id_first = False
|
||||||
train_to_task_first = False
|
train_to_task_first = False
|
||||||
sequential_task_train = True
|
sequential_task_train = True
|
||||||
force_st_for_n_from_last_epochs = 5
|
force_st_for_n_from_last_epochs = 5
|
||||||
n_st_per_batch = 3
|
n_st_per_batch = 3
|
||||||
activation = None # nn.ReLU()
|
activation = None # nn.ReLU()
|
||||||
|
|
||||||
use_sparse_network = True
|
use_sparse_network = False
|
||||||
|
|
||||||
for weight_hidden_size in [3, 4, 5, 6]:
|
for weight_hidden_size in [8]:
|
||||||
|
|
||||||
tsk_threshold = 0.85
|
tsk_threshold = 0.85
|
||||||
weight_hidden_size = weight_hidden_size
|
weight_hidden_size = weight_hidden_size
|
||||||
@@ -353,15 +353,16 @@ if __name__ == '__main__':
|
|||||||
meta_weight_count = sum(p.numel() for p in next(dense_metanet.particles).parameters())
|
meta_weight_count = sum(p.numel() for p in next(dense_metanet.particles).parameters())
|
||||||
|
|
||||||
loss_fn = nn.CrossEntropyLoss()
|
loss_fn = nn.CrossEntropyLoss()
|
||||||
dense_optimizer = torch.optim.SGD(dense_metanet.parameters(), lr=0.008, momentum=0.9)
|
dense_optimizer = torch.optim.SGD(dense_metanet.parameters(), lr=0.004, momentum=0.9)
|
||||||
sparse_optimizer = torch.optim.SGD(
|
sparse_optimizer = torch.optim.SGD(
|
||||||
sparse_metanet.parameters(), lr=0.008, momentum=0.9
|
sparse_metanet.parameters(), lr=0.004, momentum=0.9
|
||||||
) if use_sparse_network else dense_optimizer
|
) if use_sparse_network else dense_optimizer
|
||||||
|
|
||||||
train_store = new_storage_df('train', None)
|
train_store = new_storage_df('train', None)
|
||||||
weight_store = new_storage_df('weights', meta_weight_count)
|
weight_store = new_storage_df('weights', meta_weight_count)
|
||||||
init_tsk = train_to_task_first
|
init_tsk = train_to_task_first
|
||||||
for epoch in tqdm(range(EPOCH), desc='MetaNet Train - Epochs'):
|
for epoch in tqdm(range(EPOCH), desc=f'Train - Epochs'):
|
||||||
|
tqdm.write(f'{seed}: {exp_path}')
|
||||||
is_validation_epoch = epoch % VALIDATION_FRQ == 0 if not debug else True
|
is_validation_epoch = epoch % VALIDATION_FRQ == 0 if not debug else True
|
||||||
is_self_train_epoch = epoch % SELF_TRAIN_FRQ == 0 if not debug else True
|
is_self_train_epoch = epoch % SELF_TRAIN_FRQ == 0 if not debug else True
|
||||||
sparse_metanet = sparse_metanet.train()
|
sparse_metanet = sparse_metanet.train()
|
||||||
|
|||||||
16
network.py
16
network.py
@@ -109,6 +109,7 @@ class Net(nn.Module):
|
|||||||
if self._weight_pos_enc_and_mask is None:
|
if self._weight_pos_enc_and_mask is None:
|
||||||
d = next(self.parameters()).device
|
d = next(self.parameters()).device
|
||||||
weight_matrix = []
|
weight_matrix = []
|
||||||
|
with torch.no_grad():
|
||||||
for layer_id, layer in enumerate(self.layers):
|
for layer_id, layer in enumerate(self.layers):
|
||||||
x = next(layer.parameters())
|
x = next(layer.parameters())
|
||||||
weight_matrix.append(
|
weight_matrix.append(
|
||||||
@@ -139,7 +140,7 @@ class Net(nn.Module):
|
|||||||
mask[:, 0] = 0
|
mask[:, 0] = 0
|
||||||
|
|
||||||
self._weight_pos_enc_and_mask = weight_matrix, mask
|
self._weight_pos_enc_and_mask = weight_matrix, mask
|
||||||
return tuple(x.clone() for x in self._weight_pos_enc_and_mask)
|
return self._weight_pos_enc_and_mask
|
||||||
|
|
||||||
def forward(self, x):
|
def forward(self, x):
|
||||||
for layer in self.layers:
|
for layer in self.layers:
|
||||||
@@ -328,20 +329,21 @@ class MetaCell(nn.Module):
|
|||||||
def _bed_mask(self):
|
def _bed_mask(self):
|
||||||
if self.__bed_mask is None:
|
if self.__bed_mask is None:
|
||||||
d = next(self.parameters()).device
|
d = next(self.parameters()).device
|
||||||
embedding = torch.zeros(1, self.weight_interface, device=d)
|
embedding = torch.zeros(1, self.weight_interface, device=d, requires_grad=False)
|
||||||
|
|
||||||
# computations
|
# computations
|
||||||
# create a mask where pos is 0 if it is to be replaced
|
# create a mask where pos is 0 if it is to be replaced
|
||||||
mask = torch.ones_like(embedding)
|
mask = torch.ones_like(embedding, requires_grad=False, device=d)
|
||||||
mask[:, -1] = 0
|
mask[:, -1] = 0
|
||||||
|
|
||||||
self.__bed_mask = embedding, mask
|
self.__bed_mask = embedding, mask
|
||||||
return tuple(x.clone() for x in self.__bed_mask)
|
return self.__bed_mask
|
||||||
|
|
||||||
def forward(self, x):
|
def forward(self, x):
|
||||||
embedding, mask = self._bed_mask
|
embedding, mask = self._bed_mask
|
||||||
expanded_mask = mask.expand(*x.shape, embedding.shape[-1])
|
expanded_mask = mask.expand(*x.shape, embedding.shape[-1])
|
||||||
embedding = embedding.repeat(*x.shape, 1)
|
embedding = embedding.expand(*x.shape, embedding.shape[-1])
|
||||||
|
# embedding = embedding.repeat(*x.shape, 1)
|
||||||
|
|
||||||
# Row-wise
|
# Row-wise
|
||||||
# xs = x.unsqueeze(-1).expand(-1, -1, embedding.shape[-1]).swapdims(0, 1)
|
# xs = x.unsqueeze(-1).expand(-1, -1, embedding.shape[-1]).swapdims(0, 1)
|
||||||
@@ -444,7 +446,7 @@ class MetaNet(nn.Module):
|
|||||||
residual = None
|
residual = None
|
||||||
for idx, meta_layer in enumerate(self._meta_layer_list, start=1):
|
for idx, meta_layer in enumerate(self._meta_layer_list, start=1):
|
||||||
if idx % 2 == 1 and self.residual_skip:
|
if idx % 2 == 1 and self.residual_skip:
|
||||||
residual = tensor.clone()
|
residual = tensor
|
||||||
tensor = meta_layer(tensor)
|
tensor = meta_layer(tensor)
|
||||||
if idx % 2 == 0 and self.residual_skip:
|
if idx % 2 == 0 and self.residual_skip:
|
||||||
tensor = tensor + residual
|
tensor = tensor + residual
|
||||||
@@ -509,7 +511,7 @@ class MetaNetCompareBaseline(nn.Module):
|
|||||||
for idx, meta_layer in enumerate(self._meta_layer_list, start=1):
|
for idx, meta_layer in enumerate(self._meta_layer_list, start=1):
|
||||||
tensor = meta_layer(tensor)
|
tensor = meta_layer(tensor)
|
||||||
if idx % 2 == 1 and self.residual_skip:
|
if idx % 2 == 1 and self.residual_skip:
|
||||||
residual = tensor.clone()
|
residual = tensor
|
||||||
if idx % 2 == 0 and self.residual_skip:
|
if idx % 2 == 0 and self.residual_skip:
|
||||||
tensor = tensor + residual
|
tensor = tensor + residual
|
||||||
if self.activation:
|
if self.activation:
|
||||||
|
|||||||
@@ -1,5 +1,6 @@
|
|||||||
from collections import defaultdict
|
from collections import defaultdict
|
||||||
|
|
||||||
|
import pandas as pd
|
||||||
from torch import nn
|
from torch import nn
|
||||||
|
|
||||||
import functionalities_test
|
import functionalities_test
|
||||||
@@ -42,9 +43,10 @@ class SparseLayer(nn.Module):
|
|||||||
self.weights.append(weights)
|
self.weights.append(weights)
|
||||||
|
|
||||||
def coo_sparse_layer(self, layer_id):
|
def coo_sparse_layer(self, layer_id):
|
||||||
|
with torch.no_grad():
|
||||||
layer_shape = self.dummy_net_shapes[layer_id]
|
layer_shape = self.dummy_net_shapes[layer_id]
|
||||||
sparse_diagonal = np.eye(self.nr_nets).repeat(layer_shape[0], axis=-2).repeat(layer_shape[1], axis=-1)
|
sparse_diagonal = np.eye(self.nr_nets).repeat(layer_shape[0], axis=-2).repeat(layer_shape[1], axis=-1)
|
||||||
indices = torch.Tensor(np.argwhere(sparse_diagonal == 1).T)
|
indices = torch.Tensor(np.argwhere(sparse_diagonal == 1).T, )
|
||||||
values = torch.nn.Parameter(torch.randn((np.prod((*layer_shape, self.nr_nets)).item())), requires_grad=True)
|
values = torch.nn.Parameter(torch.randn((np.prod((*layer_shape, self.nr_nets)).item())), requires_grad=True)
|
||||||
|
|
||||||
return indices, values, sparse_diagonal.shape
|
return indices, values, sparse_diagonal.shape
|
||||||
@@ -54,7 +56,8 @@ class SparseLayer(nn.Module):
|
|||||||
# i.e., first interface*hidden weights of layer1, first hidden*hidden weights of layer2
|
# i.e., first interface*hidden weights of layer1, first hidden*hidden weights of layer2
|
||||||
# and first hidden*out weights of layer3 = first net
|
# and first hidden*out weights of layer3 = first net
|
||||||
# [nr_layers*[nr_net*nr_weights_layer_i]]
|
# [nr_layers*[nr_net*nr_weights_layer_i]]
|
||||||
weights = [layer.view(-1, int(len(layer)/self.nr_nets)) for layer in self.weights]
|
with torch.no_grad():
|
||||||
|
weights = [layer.view(-1, int(len(layer)/self.nr_nets)).detach() for layer in self.weights]
|
||||||
# [nr_net*[nr_weights]]
|
# [nr_net*[nr_weights]]
|
||||||
weights_per_net = [torch.cat([layer[i] for layer in weights]).view(-1, 1) for i in range(self.nr_nets)]
|
weights_per_net = [torch.cat([layer[i] for layer in weights]).view(-1, 1) for i in range(self.nr_nets)]
|
||||||
# (16, 25)
|
# (16, 25)
|
||||||
@@ -70,7 +73,7 @@ class SparseLayer(nn.Module):
|
|||||||
for i in range(self.nr_nets)]
|
for i in range(self.nr_nets)]
|
||||||
)
|
)
|
||||||
targets = torch.hstack(weights_per_net)
|
targets = torch.hstack(weights_per_net)
|
||||||
return inputs.T.detach(), targets.T.detach()
|
return inputs.T, targets.T
|
||||||
|
|
||||||
@property
|
@property
|
||||||
def particles(self):
|
def particles(self):
|
||||||
@@ -119,29 +122,44 @@ class SparseLayer(nn.Module):
|
|||||||
|
|
||||||
|
|
||||||
def test_sparse_layer():
|
def test_sparse_layer():
|
||||||
net = SparseLayer(500) #50 parallel nets
|
net = SparseLayer(1000)
|
||||||
loss_fn = torch.nn.MSELoss()
|
loss_fn = torch.nn.MSELoss(reduction='mean')
|
||||||
optimizer = torch.optim.SGD(net.parameters(), lr=0.004, momentum=0.9)
|
optimizer = torch.optim.SGD(net.parameters(), lr=0.008, momentum=0.9)
|
||||||
# optimizer = torch.optim.SGD([layer.coalesce().values() for layer in net.sparse_sub_layer], lr=0.004, momentum=0.9)
|
# optimizer = torch.optim.SGD([layer.coalesce().values() for layer in net.sparse_sub_layer], lr=0.004, momentum=0.9)
|
||||||
|
df = pd.DataFrame(columns=['Epoch', 'Func Type', 'Count'])
|
||||||
|
|
||||||
for train_iteration in trange(1000):
|
for train_iteration in trange(20000):
|
||||||
optimizer.zero_grad()
|
optimizer.zero_grad()
|
||||||
X, Y = net.get_self_train_inputs_and_targets()
|
X, Y = net.get_self_train_inputs_and_targets()
|
||||||
out = net(X)
|
output = net(X)
|
||||||
|
|
||||||
loss = loss_fn(out, Y)
|
loss = loss_fn(output, Y) * 100
|
||||||
|
|
||||||
# print("X:", X.shape, "Y:", Y.shape)
|
# loss = sum([loss_fn(out, target) for out, target in zip(output, Y)]) / len(output) * 10
|
||||||
# print("OUT", out.shape)
|
|
||||||
# print("LOSS", loss.item())
|
|
||||||
|
|
||||||
loss.backward()
|
loss.backward()
|
||||||
optimizer.step()
|
optimizer.step()
|
||||||
|
|
||||||
|
if train_iteration % 500 == 0:
|
||||||
counter = defaultdict(lambda: 0)
|
counter = defaultdict(lambda: 0)
|
||||||
id_functions = functionalities_test.test_for_fixpoints(counter, list(net.particles))
|
id_functions = functionalities_test.test_for_fixpoints(counter, list(net.particles))
|
||||||
counter = dict(counter)
|
counter = dict(counter)
|
||||||
print(f"identity_fn after {train_iteration + 1} self-train epochs: {counter}")
|
tqdm.write(f"identity_fn after {train_iteration + 1} self-train epochs: {counter}")
|
||||||
|
for key, value in counter.items():
|
||||||
|
df.loc[df.shape[0]] = (train_iteration, key, value)
|
||||||
|
|
||||||
|
counter = defaultdict(lambda: 0)
|
||||||
|
id_functions = functionalities_test.test_for_fixpoints(counter, list(net.particles))
|
||||||
|
counter = dict(counter)
|
||||||
|
tqdm.write(f"identity_fn after {train_iteration + 1} self-train epochs: {counter}")
|
||||||
|
for key, value in counter.items():
|
||||||
|
df.loc[df.shape[0]] = (train_iteration, key, value)
|
||||||
|
df.to_csv('counter.csv', mode='w')
|
||||||
|
import seaborn as sns
|
||||||
|
import matplotlib.pyplot as plt
|
||||||
|
c = pd.read_csv('counter.csv', index_col=0)
|
||||||
|
sns.lineplot(data=c, x='Epoch', y='Count', hue='Func Type')
|
||||||
|
plt.savefig('counter.png', dpi=300)
|
||||||
|
|
||||||
|
|
||||||
def embed_batch(x, repeat_dim):
|
def embed_batch(x, repeat_dim):
|
||||||
@@ -241,12 +259,15 @@ class SparseNetwork(nn.Module):
|
|||||||
|
|
||||||
def combined_self_train(self, optimizer, reduction='mean'):
|
def combined_self_train(self, optimizer, reduction='mean'):
|
||||||
losses = []
|
losses = []
|
||||||
|
loss_fn = nn.MSELoss(reduction=reduction)
|
||||||
for layer in self.sparselayers:
|
for layer in self.sparselayers:
|
||||||
optimizer.zero_grad()
|
optimizer.zero_grad()
|
||||||
x, target_data = layer.get_self_train_inputs_and_targets()
|
x, target_data = layer.get_self_train_inputs_and_targets()
|
||||||
output = layer(x)
|
output = layer(x)
|
||||||
|
# loss = sum([loss_fn(out, target) for out, target in zip(output, target_data)]) / len(output)
|
||||||
|
|
||||||
|
loss = loss_fn(output, target_data) * 100
|
||||||
|
|
||||||
loss = F.mse_loss(output, target_data, reduction=reduction)
|
|
||||||
losses.append(loss.detach())
|
losses.append(loss.detach())
|
||||||
loss.backward()
|
loss.backward()
|
||||||
optimizer.step()
|
optimizer.step()
|
||||||
@@ -279,33 +300,33 @@ def test_sparse_net():
|
|||||||
|
|
||||||
|
|
||||||
def test_sparse_net_sef_train():
|
def test_sparse_net_sef_train():
|
||||||
net = SparseNetwork(30, 5, 6, 10)
|
net = SparseNetwork(5, 5, 6, 10)
|
||||||
epochs = 1000
|
epochs = 10000
|
||||||
if True:
|
df = pd.DataFrame(columns=['Epoch', 'Func Type', 'Count'])
|
||||||
optimizer = torch.optim.SGD(net.parameters(), lr=0.004, momentum=0.9)
|
optimizer = torch.optim.SGD(net.parameters(), lr=0.004, momentum=0.9)
|
||||||
for _ in trange(epochs):
|
for epoch in trange(epochs):
|
||||||
_ = net.combined_self_train(optimizer)
|
_ = net.combined_self_train(optimizer)
|
||||||
|
|
||||||
else:
|
if epoch % 500 == 0:
|
||||||
optimizer_dict = {
|
|
||||||
key: torch.optim.SGD(layer.parameters(), lr=0.004, momentum=0.9) for key, layer in enumerate(net.sparselayers)
|
|
||||||
}
|
|
||||||
loss_fn = torch.nn.MSELoss(reduction="mean")
|
|
||||||
|
|
||||||
for layer, optim in zip(net.sparselayers, optimizer_dict.values()):
|
|
||||||
for _ in trange(epochs):
|
|
||||||
optim.zero_grad()
|
|
||||||
x, target_data = layer.get_self_train_inputs_and_targets()
|
|
||||||
output = layer(x)
|
|
||||||
loss = loss_fn(output, target_data)
|
|
||||||
loss.backward()
|
|
||||||
optim.step()
|
|
||||||
|
|
||||||
# is each of the networks self-replicating?
|
|
||||||
counter = defaultdict(lambda: 0)
|
counter = defaultdict(lambda: 0)
|
||||||
id_functions = functionalities_test.test_for_fixpoints(counter, list(net.particles))
|
id_functions = functionalities_test.test_for_fixpoints(counter, list(net.particles))
|
||||||
counter = dict(counter)
|
counter = dict(counter)
|
||||||
print(f"identity_fn after {epochs} self-train epochs: {counter}")
|
tqdm.write(f"identity_fn after {epoch + 1} self-train epochs: {counter}")
|
||||||
|
for key, value in counter.items():
|
||||||
|
df.loc[df.shape[0]] = (epoch, key, value)
|
||||||
|
|
||||||
|
counter = defaultdict(lambda: 0)
|
||||||
|
id_functions = functionalities_test.test_for_fixpoints(counter, list(net.particles))
|
||||||
|
counter = dict(counter)
|
||||||
|
tqdm.write(f"identity_fn after {epochs} self-train epochs: {counter}")
|
||||||
|
for key, value in counter.items():
|
||||||
|
df.loc[df.shape[0]] = (epoch, key, value)
|
||||||
|
df.to_csv('counter.csv', mode='w')
|
||||||
|
import seaborn as sns
|
||||||
|
import matplotlib.pyplot as plt
|
||||||
|
c = pd.read_csv('counter.csv', index_col=0)
|
||||||
|
sns.lineplot(data=c, x='Epoch', y='Count', hue='Func Type')
|
||||||
|
plt.savefig('counter.png', dpi=300)
|
||||||
|
|
||||||
|
|
||||||
def test_manual_for_loop():
|
def test_manual_for_loop():
|
||||||
|
|||||||
Reference in New Issue
Block a user