In [None]:
from network import Net
import torch
from typing import List
from functionalities_test import is_identity_function
from tqdm import tqdm,trange
import numpy as np

In [None]:
def construct_sparse_COO_layer(nets: List[torch.nn.Module], layer_idx: int) -> torch.Tensor:
    """Combine one weight layer of every net into a block-diagonal sparse COO matrix.

    For N nets whose parameter ``layer_idx`` has shape ``(nr_cells, nr_weights)``
    the result has shape ``(nr_cells * N, nr_weights * N)``.  The weights of net
    ``k`` occupy rows ``[k*nr_cells, (k+1)*nr_cells)`` and columns
    ``[k*nr_weights, (k+1)*nr_weights)``; everything else is an implicit zero:

        [W_net0]   0      ...    0
           0    [W_net1]  ...    0
          ...
           0       0      ... [W_netN-1]

    E.g. for per-net layer shapes (2x4), (2x2), (1x2):
        layer 0: (2x4) -> (2*N, 4*N)
        layer 1: (2x2) -> (2*N, 2*N)
        layer 2: (1x2) -> (1*N, 2*N)

    Args:
        nets: networks to combine. Only ``net.parameters()`` is used; the
            parameter at ``layer_idx`` must be 2-D and is expected to have the
            same shape in every net.
        layer_idx: index into ``list(net.parameters())``.

    Returns:
        Sparse COO tensor whose values are built with ``torch.cat`` from the
        nets' parameters, so gradients of anything computed from it flow back
        into those parameters.

    Raises:
        AssertionError: if ``layer_idx`` is not smaller than the number of
            parameters of the first net.
    """
    nr_layers = len(list(nets[0].parameters()))
    assert layer_idx < nr_layers, f"layer_idx {layer_idx} out of range for {nr_layers} layers"

    # Use the number of nets actually passed in, not a notebook-level global.
    nr_nets = len(nets)
    layers = [list(net.parameters())[layer_idx] for net in nets]
    nr_cells, nr_weights = layers[0].shape

    # Row-major flattening per net; torch.cat keeps the autograd link to the parameters
    # (a Python list of scalar tensors would be copied into a detached tensor).
    values = torch.cat([layer.reshape(-1) for layer in layers])

    # Coordinates in the same (net, cell, weight) order as `values`.
    rows = [net_idx * nr_cells + cell_idx
            for net_idx in range(nr_nets)
            for cell_idx in range(nr_cells)
            for _ in range(nr_weights)]
    cols = [net_idx * nr_weights + weight_idx
            for net_idx in range(nr_nets)
            for _ in range(nr_cells)
            for weight_idx in range(nr_weights)]
    indices = torch.tensor([rows, cols], dtype=torch.long, device=values.device)

    return torch.sparse_coo_tensor(indices, values, (nr_cells * nr_nets, nr_weights * nr_nets))


# For each net, append its weights to the combined sparse tensor.
# Construct one sparse tensor per layer; with nets of shape (4,2,1), each net contributes
# - [4x2] weights in the first (input) layer
# - [2x2] weights in the second (hidden) layer
# - [2x1] weights in the third (output) layer
#modules = [ construct_sparse_COO_layer(nets, layer_idx) for layer_idx in range(len(list(nets[0].parameters()))) ]
#modules
#for layer_idx in range(len(list(nets[0].parameters()))):
#    sparse_tensor = construct_sparse_COO_layer(nets, layer_idx)

In [None]:
# Experiment: self-train 50 nets at once by stacking each layer of all nets into one
# sparse COO matrix and running a single chain of sparse matmuls.
nr_nets = 50
nets = [Net(4,2,1) for _ in range(nr_nets)]
print(f"before: {sum([is_identity_function(net) for net in nets])}/{len(nets)} identity_fns")

modules = [ construct_sparse_COO_layer(nets, layer_idx) for layer_idx in range(len(list(nets[0].parameters()))) ]
# NOTE(review): this compares id() of two temporary objects created by the index expressions,
# so the printed boolean is not a reliable check that the sparse layer shares storage with the net.
print( id(list(nets[0].parameters())[0][0,0]) == id(modules[0][0,0]))

loss_fn = torch.nn.MSELoss(reduction="sum")
# The optimizer is given the parameters of the individual nets, not the combined sparse tensors.
optimizer = torch.optim.SGD([param for net in nets for param in net.parameters()], lr=0.004, momentum=0.9)
#optimizer = torch.optim.SGD([module for module in modules], lr=0.004, momentum=0.9)


for train_iteration in range(1000):
    optimizer.zero_grad()  
    # Inputs and targets of all nets, stacked horizontally and then transposed.
    # NOTE(review): the shape remarks at the end of the next two lines are the original author's; confirm them.
    X = torch.hstack( [net.input_weight_matrix() for net in nets] ).requires_grad_(True).T #(nr_nets*nr_weights, nr_weights)
    Y = torch.hstack( [net.create_target_weights(net.input_weight_matrix()) for net in nets] ).requires_grad_(True).T #(nr_nets*nr_weights,1)
    #print("X ", X.shape, "Y", Y.shape)

    # The sparse layers are rebuilt from the nets' current parameters in every iteration.
    # NOTE(review): verify that gradients of `loss` actually reach the nets' parameters through these modules.
    modules = [ construct_sparse_COO_layer(nets, layer_idx) for layer_idx in range(len(list(nets[0].parameters()))) ]

    # Forward pass: three sparse-times-dense products, no non-linearity in between.
    X1 = torch.sparse.mm(modules[0], X)
    #print("X1", X1.shape, X1)

    X2 = torch.sparse.mm(modules[1], X1)
    #print("X2", X2.shape)

    X3 = torch.sparse.mm(modules[2], X2)
    #print("X3", X3.shape)

    loss = loss_fn(X3, Y)
    #print(loss)
    loss.backward()
    optimizer.step()

# `train_iteration` still holds the last loop index here, hence the +1.
print(f"after {train_iteration+1} iterations of combined self_train: {sum([is_identity_function(net) for net in nets])}/{len(nets)} identity_fns")

In [None]:
# Baseline: self-train 500 nets one after another, each with its own SGD optimizer.
nr_nets = 500
nets = [Net(5,2,1) for _ in range(nr_nets)]
loss_fn = torch.nn.MSELoss(reduction="sum")
rounds = 1000  # optimisation steps per net

# NOTE(review): the loop variables `net` and `i` stay defined at module level after this cell;
# check whether later cells rely on them.
for net in tqdm(nets):
    # Fresh optimizer (and therefore fresh momentum state) for every net.
    optimizer = torch.optim.SGD(net.parameters(), lr=0.004, momentum=0.9)
    for i in range(rounds):
        optimizer.zero_grad()
        # Inputs and targets are re-derived from the net in every step.
        input_data = net.input_weight_matrix()
        target_data = net.create_target_weights(input_data)
        output = net(input_data)
        loss = loss_fn(output, target_data)
        loss.backward()
        optimizer.step()

# Cell output: number of nets for which is_identity_function(net) is truthy.
sum([is_identity_function(net) for net in nets])

In [None]:
def construct_sparse_CRS_layer(nets: List[torch.nn.Module], layer_idx: int) -> torch.Tensor:
    """Combine one weight layer of every net into a block-diagonal sparse CSR matrix.

    The 2-D parameter at position ``layer_idx`` of each net is placed on the
    diagonal of a dense matrix (net ``k`` below and to the right of net
    ``k-1``), which is then converted with ``to_sparse_csr()``.  For N nets
    with a ``(rows, cols)`` layer the result has shape ``(rows*N, cols*N)``.

    Args:
        nets: networks to combine; only ``net.parameters()`` is used.
        layer_idx: index into ``list(net.parameters())``.

    Returns:
        The block-diagonal matrix in CSR layout. Its shape is printed as a
        side effect.

    Raises:
        AssertionError: if ``layer_idx`` is not smaller than the number of
            parameters of the first net.
    """
    nr_layers = len(list(nets[0].parameters()))
    assert layer_idx < nr_layers, f"layer_idx {layer_idx} out of range for {nr_layers} layers"

    # Look each layer up once instead of re-listing the parameters for every shape query.
    layers = [list(net.parameters())[layer_idx] for net in nets]

    # block_diag yields exactly the [zeros | W_k | zeros] rows that were previously concatenated by hand.
    s = torch.block_diag(*layers).to_sparse_csr()

    print(s.shape)
    return s

In [None]:
# Experiment: same combined self-training as with COO layers, but with CSR sparse layers;
# runs a single iteration and prints the intermediate shapes.
nr_nets = 5
nets = [Net(4,2,1) for _ in range(nr_nets)]
print(f"before: {sum([is_identity_function(net) for net in nets])}/{len(nets)} identity_fns")

#modules = [ construct_sparse_tensor_layer(nets, layer_idx) for layer_idx in range(len(list(nets[0].parameters()))) ]
# NOTE(review): with the line above commented out, `modules` used below is whatever an earlier cell
# left in the kernel, not something built from the nets created in this cell. The id() comparison
# also only compares temporary objects, so the printed boolean carries no information.
print( id(list(nets[0].parameters())[0][0,0]) == id(modules[0][0,0]))

loss_fn = torch.nn.MSELoss(reduction="sum")
# The optimizer is given the parameters of the individual nets, not the combined sparse tensors.
optimizer = torch.optim.SGD([param for net in nets for param in net.parameters()], lr=0.004, momentum=0.9)
#optimizer = torch.optim.SGD([module for module in modules], lr=0.004, momentum=0.9)


for train_iteration in range(1):
    optimizer.zero_grad()  
    # Inputs and targets of all nets, stacked horizontally and then transposed.
    # NOTE(review): the shape remarks at the end of the next two lines are the original author's; confirm them.
    X = torch.hstack( [net.input_weight_matrix() for net in nets] ).requires_grad_(True).T #(nr_nets*nr_weights, nr_weights)
    Y = torch.hstack( [net.create_target_weights(net.input_weight_matrix()) for net in nets] ).requires_grad_(True).T #(nr_nets*nr_weights,1)
    #print("X ", X.shape, "Y", Y.shape)

    # One CSR block-diagonal matrix per layer, rebuilt from the nets' current parameters.
    num_layers = len(list(nets[0].parameters()))
    modules = [ construct_sparse_CRS_layer(nets, layer_idx) for layer_idx in range(num_layers)]

    # Forward pass via Tensor.matmul; the prints show the shape of each result and its is_sparse flag.
    X1 = modules[0].matmul(X)
    print("X1", X1.shape, X1.is_sparse)

    X2 = modules[1].matmul(X1)
    print("X2", X2.shape, X2.is_sparse)

    X3 = modules[2].matmul(X2)
    print("X3", X3.shape, X3.is_sparse)

    loss = loss_fn(X3, Y)
    #print(loss)
    loss.backward()
    optimizer.step()

# `train_iteration` still holds the last loop index here, hence the +1.
print(f"after {train_iteration+1} iterations of combined self_train: {sum([is_identity_function(net) for net in nets])}/{len(nets)} identity_fns")

In [None]:
# Two small nets for trying out the concatenation-based COO construction defined in this cell.
nr_nets = 2
nets = [Net(4,2,1) for _ in range(nr_nets)]

def cat_COO_layer(nets, layer_idx):
    """Build a block-diagonal sparse COO matrix from one weight layer of every net.

    The 2-D parameter at position ``layer_idx`` of each net is placed on the
    diagonal of a dense matrix (net ``k`` below and to the right of net
    ``k-1``), which is then converted to a COO tensor with two sparse
    dimensions, the layout ``torch.sparse.mm`` operates on.  The function
    depends only on its arguments, not on notebook-level variables.

    Args:
        nets: networks to combine; only ``net.parameters()`` is used.
        layer_idx: index into ``list(net.parameters())``.

    Returns:
        Sparse COO tensor of shape ``(rows * len(nets), cols * len(nets))`` for
        per-net layers of shape ``(rows, cols)``.  It is derived from the
        parameters through differentiable ops.  The shape and the ``is_sparse``
        flag are printed as a side effect.
    """
    layers = [list(net.parameters())[layer_idx] for net in nets]

    # Dense block-diagonal first, then to_sparse(): this produces proper 2-D COO indices
    # instead of a hybrid tensor that stores whole dense rows as values.
    s = torch.block_diag(*layers).to_sparse()

    print(s.shape, s.is_sparse)
    return s

# Smoke test: build the combined layer for layer_idx 0 of the nets defined above; the result is the cell output.
cat_COO_layer(nets, 0)

In [None]:
# Experiment: combined self-training with the sparse layers produced by cat_COO_layer
# (single iteration, prints intermediate shapes).
nr_nets = 5
nets = [Net(4,2,1) for _ in range(nr_nets)]
print(f"before: {sum([is_identity_function(net) for net in nets])}/{len(nets)} identity_fns")

# Unlike the other combined-training cells, the sparse layers are built once, before the loop.
nr_layers = len(list(nets[0].parameters()))
modules = [ cat_COO_layer(nets, layer_idx) for layer_idx in range(nr_layers) ]

loss_fn = torch.nn.MSELoss(reduction="sum")
# The optimizer is given the parameters of the individual nets, not the combined sparse tensors.
optimizer = torch.optim.SGD([param for net in nets for param in net.parameters()], lr=0.004, momentum=0.9)
#optimizer = torch.optim.SGD([module for module in modules], lr=0.004, momentum=0.9)


for train_iteration in range(1):
    optimizer.zero_grad()  
    # Inputs and targets of all nets, stacked horizontally and then transposed.
    # NOTE(review): the shape remarks at the end of the next two lines are the original author's; confirm them.
    X = torch.hstack( [net.input_weight_matrix() for net in nets] ).requires_grad_(True).T #(nr_nets*nr_weights, nr_weights)
    Y = torch.hstack( [net.create_target_weights(net.input_weight_matrix()) for net in nets] ).requires_grad_(True).T #(nr_nets*nr_weights,1)
    print("X ", X.shape, "Y", Y.shape)

    # Forward pass: three sparse-times-dense products, no non-linearity in between.
    X1 = torch.sparse.mm(modules[0], X)
    print("X1", X1.shape)

    X2 = torch.sparse.mm(modules[1], X1)
    print("X2", X2.shape)

    X3 = torch.sparse.mm(modules[2], X2)
    print("X3", X3.shape)

    loss = loss_fn(X3, Y)
    #print(loss)
    loss.backward()
    optimizer.step()

# `train_iteration` still holds the last loop index here, hence the +1.
print(f"after {train_iteration+1} iterations of combined self_train: {sum([is_identity_function(net) for net in nets])}/{len(nets)} identity_fns")

In [None]:
class SparseLayer():
    """A group of ``nr_nets`` small networks stored as block-diagonal sparse matrices.

    For every sub-layer of the small network, one sparse COO matrix holds the
    weights of all nets on its block diagonal, so a forward pass of the whole
    group is a chain of sparse-times-dense matrix products.

    Attributes:
        nr_nets: number of small networks in the group.
        interface_dim, depth_dim, hidden_dim, out_dim: constructor arguments as given.
        dummy_net: a ``Net(interface, width, out)`` used to read the per-layer
            weight shapes and the positional encoding.
        sparse_sub_layer: one block-diagonal sparse COO tensor per sub-layer.
        weights: the trainable value vector (``torch.nn.Parameter``) behind each
            sparse tensor, in the same order; these are what an optimizer should get.
    """

    def __init__(self, nr_nets, interface=5, depth=3, width=2, out=1):
        """Create ``depth`` randomly initialised block-diagonal sub-layers.

        Args:
            nr_nets: number of small networks to hold.
            interface: first argument passed to ``Net`` for the shape template.
            depth: number of sub-layers to build; sub-layer ``i`` takes its
                shape from parameter ``i`` of the template net.
            width: second argument passed to ``Net``.
            out: third argument passed to ``Net``.
        """
        self.nr_nets = nr_nets
        self.interface_dim = interface
        self.depth_dim = depth
        self.hidden_dim = width
        self.out_dim = out
        self.dummy_net = Net(self.interface_dim, self.hidden_dim, self.out_dim)

        self.sparse_sub_layer = []
        self.weights = []
        for layer_id in range(depth):
            layer, weights = self.coo_sparse_layer(layer_id)
            self.sparse_sub_layer.append(layer)
            self.weights.append(weights)

    def coo_sparse_layer(self, layer_id):
        """Build the block-diagonal sparse matrix for sub-layer ``layer_id``.

        Args:
            layer_id: index into ``list(self.dummy_net.parameters())``; that
                parameter's 2-D shape ``(out_cells, in_cells)`` is the block size.

        Returns:
            ``(s, values)`` where ``s`` is a sparse COO tensor of shape
            ``(nr_nets * out_cells, nr_nets * in_cells)`` and ``values`` is the
            ``torch.nn.Parameter`` (drawn with ``torch.randn``) that provides
            its non-zero entries.  The shape of ``s`` is printed as a side effect.
        """
        layer_shape = list(self.dummy_net.parameters())[layer_id].shape
        # (out_cells, in_cells), e.g. (2,5), (2,2), (1,2) for Net(5,2,1) according to the original note

        # Dense 0/1 mask with one (out_cells x in_cells) block of ones per net on the diagonal.
        sparse_diagonal = np.eye(self.nr_nets).repeat(layer_shape[0], axis=-2).repeat(layer_shape[1], axis=-1)
        # argwhere walks the mask row by row, so the values are ordered net by net,
        # and within a net row by row of its weight block.
        indices = np.argwhere(sparse_diagonal == 1).T
        values = torch.nn.Parameter(torch.randn((self.nr_nets * (layer_shape[0] * layer_shape[1]))))
        s = torch.sparse_coo_tensor(indices, values, sparse_diagonal.shape, requires_grad=True)
        print(f"L{layer_id}:", s.shape)
        return s, values

    def get_self_train_inputs_and_targets(self):
        """Assemble self-training inputs and targets for all nets from the current weights.

        Returns:
            ``(inputs.T, targets.T)``.  Before transposition, ``targets`` has one
            column per net holding all of that net's weights (layer by layer),
            and ``inputs`` horizontally stacks, per net, the encoding matrix with
            the net's weights written into the positions where ``mask`` is 0.
        """
        # NOTE(review): `_weight_pos_enc` is a project-internal attribute of Net; it is unpacked
        # here as an (encoding_matrix, mask) pair — confirm shapes against Net.
        encoding_matrix, mask = self.dummy_net._weight_pos_enc

        # View the weights of each sub-layer in equal chunks, one row per net: the first chunk of
        # every sub-layer belongs to the first net, the second chunk to the second net, and so on.
        weights = [layer.view(-1, len(layer) // self.nr_nets) for layer in self.weights]    #[nr_layers*[nr_net*nr_weights_layer_i]]
        weights_per_net = [torch.cat([layer[i] for layer in weights]).view(-1, 1) for i in range(self.nr_nets)]   #[nr_net*[nr_weights]]
        inputs = torch.hstack([
            encoding_matrix * mask + weights_per_net[i].expand(-1, encoding_matrix.shape[-1]) * (1 - mask)
            for i in range(self.nr_nets)
        ])
        targets = torch.hstack(weights_per_net)
        return inputs.T, targets.T

    def __call__(self, x):
        """Apply all sub-layers in order as sparse-times-dense products.

        Every entry of ``self.sparse_sub_layer`` is used, so the forward pass
        follows the configured depth instead of assuming exactly three sub-layers.

        Args:
            x: dense matrix whose first dimension matches the column count of
                the first sub-layer.

        Returns:
            The dense result after the last sub-layer.
        """
        for sub_layer in self.sparse_sub_layer:
            x = torch.sparse.mm(sub_layer, x)
        return x

# Self-train a SparseLayer holding 5 small nets for a few iterations.
# NOTE(review): `net` is rebound here to a SparseLayer; earlier cells use the same name for single Net objects.
net = SparseLayer(5)
loss_fn = torch.nn.MSELoss(reduction="sum")
# Optimise the raw value vectors; the sparse tensors themselves are not handed to the optimizer.
optimizer = torch.optim.SGD([weight for weight in net.weights], lr=0.004, momentum=0.9)
#optimizer = torch.optim.SGD([layer for layer in net.sparse_sub_layer], lr=0.004, momentum=0.9)

for train_iteration in trange(10):
    optimizer.zero_grad()  
    X,Y = net.get_self_train_inputs_and_targets()
    out = net(X)
    
    loss = loss_fn(out, Y)

    # print("X:", X.shape, "Y:", Y.shape)
    # print("OUT", out.shape)
    # print("LOSS", loss.item())
    
    # The sparse sub-layers are created once in SparseLayer.__init__ and reused by every forward
    # pass; presumably that shared part of the graph is why retain_graph=True is needed — confirm.
    loss.backward(retain_graph=True)
    optimizer.step()

    

# Absolute tolerance for the per-net replication check below.
epsilon=pow(10, -5)
# is the (the whole layer) self-replicating? -> wrong
#print(torch.allclose(out, Y,rtol=0, atol=epsilon))

# is each of the networks self-replicating?
# Compares row i of `out` with row i of `Y` for every net.
# NOTE(review): `out` and `Y` were computed before the final optimizer.step(), so this checks the
# weights as they were going into the last update.
print(f"identity_fn after {train_iteration+1} self-train iterations: {sum([torch.allclose(out[i], Y[i], rtol=0, atol=epsilon) for i in range(net.nr_nets)])}/{net.nr_nets}")

In [None]:
# Scratch cell: rebuild the self-train inputs/targets of the SparseLayer `net` step by step
# (same recipe as SparseLayer.get_self_train_inputs_and_targets) to inspect the intermediates.
#
# Alternative way to look at the per-net chunks of every flat weight vector:
# for layer in net.weights:
#     n=int(len(layer)/net.nr_nets)
#     print( [layer[i:i+n] for i in range(0, len(layer), n)])

encoding_matrix, mask = Net(5,2,1)._weight_pos_enc
print(encoding_matrix, mask)

# View the weights of each sub-layer in equal chunks, one row per net.
weights = [
    flat_layer.view(-1, len(flat_layer) // net.nr_nets)
    for flat_layer in net.weights
]

# All weights of one net as a single column vector: its chunk of sub-layer 1,
# followed by its chunk of sub-layer 2, and so on.
weights_per_net = [
    torch.cat([chunks[net_idx] for chunks in weights]).view(-1, 1)
    for net_idx in range(net.nr_nets)
]

# Per net: keep the encoding where mask is 1 and write the net's weights where mask is 0.
inputs = torch.hstack([
    encoding_matrix * mask + net_weights.expand(-1, encoding_matrix.shape[-1]) * (1 - mask)
    for net_weights in weights_per_net
])  # 16, 25 according to the original note

targets = torch.hstack(weights_per_net)
targets.shape

In [None]:
import numpy as np
from pathlib import Path
import torch
from torch.nn import Flatten
from torch.utils.data import Dataset, DataLoader
from torchvision.datasets import MNIST
from torchvision.transforms import ToTensor, Compose, Resize
from tqdm import tqdm, trange


# --- Configuration for the MNIST experiment -------------------------------------------
WORKER = 8        # DataLoader worker processes
BATCHSIZE = 10    # samples per batch
EPOCH = 1
if torch.cuda.is_available():
    DEVICE = torch.device('cuda')
else:
    DEVICE = torch.device('cpu')
data_path = Path('data')

# Every image is resized to 10x10, converted to a tensor and flattened to one vector.
utility_transforms = Compose([
    Resize((10, 10)),
    ToTensor(),
    Flatten(start_dim=0),
])

# --- Data ------------------------------------------------------------------------------
dataset = MNIST(str(data_path), transform=utility_transforms)
d = DataLoader(
    dataset,
    batch_size=BATCHSIZE,
    shuffle=True,
    drop_last=True,
    num_workers=WORKER,
)


In [None]:
def embed_batch(x, repeat_dim, encoding_dim=5):
    """Embed every scalar of a batch as the last entry of a zero-padded encoding vector.

    Each value ``v`` becomes ``[0, ..., 0, v]`` of length ``encoding_dim``, and
    that vector is tiled ``repeat_dim`` times along the last dimension.

    Args:
        x: tensor of shape ``(batchsize, flat_img_dim)``.
        repeat_dim: how many copies of the encoding to place side by side.
        encoding_dim: length of one encoding vector; the default of 5
            reproduces the previously hard-coded four zero columns.

    Returns:
        Tensor of shape ``(batchsize, flat_img_dim, encoding_dim * repeat_dim)``.
    """
    x = x.unsqueeze(-1)  # (batchsize, flat_img_dim, 1)
    # Allocate the padding on the input's device so that the concatenation also works off-CPU.
    padding = torch.zeros(x.shape[0], x.shape[1], encoding_dim - 1, device=x.device)
    return torch.cat((padding, x), dim=2).repeat(1, 1, repeat_dim)

def embed_vector(x, repeat_dim, encoding_dim=5):
    """Embed every scalar of a vector as the last entry of a zero-padded encoding vector.

    Each value ``v`` becomes ``[0, ..., 0, v]`` of length ``encoding_dim``, and
    that vector is tiled ``repeat_dim`` times along the last dimension.

    Args:
        x: tensor of shape ``(flat_img_dim,)``.
        repeat_dim: how many copies of the encoding to place side by side.
        encoding_dim: length of one encoding vector; the default of 5
            reproduces the previously hard-coded four zero columns.

    Returns:
        Tensor of shape ``(flat_img_dim, encoding_dim * repeat_dim)``.
    """
    x = x.unsqueeze(-1)  # (flat_img_dim, 1)
    # Allocate the padding on the input's device so that the concatenation also works off-CPU.
    padding = torch.zeros(x.shape[0], encoding_dim - 1, device=x.device)
    return torch.cat((padding, x), dim=1).repeat(1, repeat_dim)

class SparseNetwork():
    """A feed-forward network whose layers are SparseLayer groups of small nets.

    Layer sizes follow ``input_dim -> width -> ... -> width -> out``; the
    SparseLayer of a layer holds one small net per (input, output) pair, i.e.
    ``in_features * out_features`` nets.
    """

    def __init__(self, input_dim, depth, width, out):
        """Create the sparse layers.

        Args:
            input_dim: number of input features.
            depth: total layer count; ``depth - 2`` width-to-width layers are
                placed between the input layer and the output layer.
            width: number of features of every hidden layer.
            out: number of output features.
        """
        self.input_dim = input_dim
        self.depth_dim = depth
        self.hidden_dim = width
        self.out_dim = out
        self.sparse_layers = []
        self.sparse_layers.append(  SparseLayer( self.input_dim  * self.hidden_dim  ))
        self.sparse_layers.extend([ SparseLayer( self.hidden_dim * self.hidden_dim  ) for layer_idx in range(self.depth_dim - 2)])
        self.sparse_layers.append(  SparseLayer( self.hidden_dim * self.out_dim     ))

    def _forward_layer(self, sparse_layer, x, out_features):
        """Push ``x`` through one sparse layer and reduce to ``out_features`` values per sample."""
        # Number of incoming features; the last dimension works for both a batch (2-D) and a single vector (1-D).
        in_features = x.shape[-1]

        def reduce(embedded):
            # Sum each net's output over its columns, group the nets as
            # (out_features, in_features) and sum over the incoming features.
            return sparse_layer(embedded.T).sum(dim=1).view(out_features, in_features).sum(dim=1)

        if len(x.shape) > 1:
            # batch pass (one by one, sparse bmm doesn't support grad)
            return torch.stack([reduce(inpt) for inpt in embed_batch(x, sparse_layer.nr_nets)])
        # single vector
        return reduce(embed_vector(x, sparse_layer.nr_nets))

    def __call__(self, x):
        """Run a forward pass.

        Args:
            x: either a batch of shape ``(batchsize, input_dim)`` or a single
                vector of shape ``(input_dim,)``.

        Returns:
            Tensor of shape ``(batchsize, out)`` for a batch, ``(out,)`` for a
            vector.  The shape after every layer is printed as a side effect.
        """
        for sparse_layer in self.sparse_layers[:-1]:
            x = self._forward_layer(sparse_layer, x, self.hidden_dim)
            print("out", x.shape)

        # output layer
        x = self._forward_layer(self.sparse_layers[-1], x, self.out_dim)
        print("out", x.shape)
        return x

# Build a SparseNetwork sized for the transformed images and push one batch through it.
data_dim = np.prod(dataset[0][0].shape)  # number of elements in the first transformed sample
metanet = SparseNetwork(data_dim, depth=3, width=5, out=10)
batchx, batchy = next(iter(d))  # first batch from the DataLoader
batchx.shape, batchy.shape  # bare expression in the middle of the cell: evaluated, but only the last expression is displayed
metanet(batchx)