Dataset Redone

This commit is contained in:
Si11ium
2020-06-19 08:17:35 +02:00
parent 4898e98851
commit 63605ae33a
14 changed files with 239 additions and 362 deletions

View File

@@ -1,89 +0,0 @@
import pickle
from collections import defaultdict
from abc import ABC
from pathlib import Path
from torch.utils.data import Dataset, ConcatDataset
from ml_lib.point_toolset.sampling import FarthestpointSampling, RandomSampling
import numpy as np
class _Point_Dataset(ABC, Dataset):
    """Base class for point-cloud datasets read from raw ``.xyz`` files.

    Every raw file is parsed once and cached as a pickle below
    ``root / setting``; later accesses are served from that cache.

    Subclasses must assign ``self.setting`` *before* calling this class'
    ``__init__`` (it names the cache directory and is used to glob the raw
    files) and have to provide ``name`` and ``__len__``.
    """

    # Column names of one raw row, in file order.
    headers = ['x', 'y', 'z', 'xn', 'yn', 'zn', 'label', 'cl_idx']
    # Maps the ``sampling`` constructor argument to the sampler class.
    samplers = dict(fps=FarthestpointSampling, rnd=RandomSampling)

    @property
    def name(self):
        """Human readable dataset name; must be provided by subclasses."""
        raise NotImplementedError

    @property
    def sample_shape(self):
        """Shape of the first array of the first sample."""
        # FixMe: This does not work when more then x/y tuples are returned
        return self[0][0].shape

    def __init__(self, root=Path('data'), norm_as_feature=True, sampling_k=2048, sampling='rnd',
                 transforms=None, load_preprocessed=True, split='train', *args, **kwargs):
        """Set up paths, the sampler and the list of raw files.

        Args:
            root: Dataset root; raw files are expected in ``root/raw/<split>``.
            norm_as_feature: Stored on the instance; not used by this class.
            sampling_k: Passed to the sampler as ``K``.
            sampling: Key into ``samplers`` (``'fps'`` or ``'rnd'``).
            transforms: Optional callable; defaults to the identity.
            load_preprocessed: If false, the cached pickle of an item is
                deleted and rebuilt on every access.
            split: Name of the sub folder below ``root/raw``.

        Raises:
            KeyError: If ``sampling`` is not a key of ``samplers``.
        """
        super(_Point_Dataset, self).__init__()

        # Declared only; the value has to be assigned by the subclass.
        self.setting: str

        self.split = split
        self.norm_as_feature = norm_as_feature
        self.load_preprocessed = load_preprocessed
        self.transforms = transforms if transforms else lambda x: x
        self.sampling_k = sampling_k
        self.sampling = self.samplers[sampling](K=self.sampling_k)

        self.root = Path(root)
        self.raw = self.root / 'raw' / self.split
        self.processed_ext = '.pik'
        self.raw_ext = '.xyz'
        self.processed = self.root / self.setting
        self.processed.mkdir(parents=True, exist_ok=True)

        self._files = list(self.raw.glob(f'*{self.setting}*'))

    def _read_or_load(self, item):
        """Make sure the cache file for raw file ``item`` exists and return its path.

        The raw file is parsed into a ``defaultdict`` mapping each header to a
        numpy array and pickled next to the other cache files.
        """
        raw_file_path = self._files[item]
        processed_file_path = self.processed / raw_file_path.name.replace(self.raw_ext, self.processed_ext)

        if not self.load_preprocessed:
            processed_file_path.unlink(missing_ok=True)

        if not processed_file_path.exists():
            pointcloud = defaultdict(list)
            with raw_file_path.open('r') as raw_file:
                for row in raw_file:
                    tokens = row.split()
                    if not tokens:
                        # Blank rows (e.g. a trailing newline) carry no point.
                        continue
                    values = [float(x) for x in tokens]
                    for header, value in zip(self.headers, values):
                        pointcloud[header].append(value)
            for key in pointcloud.keys():
                pointcloud[key] = np.asarray(pointcloud[key])
            with processed_file_path.open('wb') as processed_file:
                pickle.dump(pointcloud, processed_file)

        return processed_file_path

    def __len__(self):
        raise NotImplementedError

    def __getitem__(self, item):
        """Return ``(normal, position, label, cl_label)`` for the sampled points of file ``item``."""
        processed_file_path = self._read_or_load(item)

        # NOTE: pickle must only be used on trusted data; these files are the
        # cache written by ``_read_or_load``.
        with processed_file_path.open('rb') as processed_file:
            pointcloud = pickle.load(processed_file)

        position = np.stack((pointcloud['x'], pointcloud['y'], pointcloud['z']), axis=-1)
        normal = np.stack((pointcloud['xn'], pointcloud['yn'], pointcloud['zn']), axis=-1)
        label = pointcloud['label']
        cl_label = pointcloud['cl_idx']

        sample_idxs = self.sampling(position)

        # ``np.float`` / ``np.int`` were plain aliases of the builtins and have
        # been removed from NumPy; the builtins yield the same dtypes.
        return (normal[sample_idxs].astype(float),
                position[sample_idxs].astype(float),
                label[sample_idxs].astype(int),
                cl_label[sample_idxs].astype(int)
                )

View File

@@ -1,19 +0,0 @@
import pickle
from collections import defaultdict
import numpy as np
from ._point_dataset import _Point_Dataset
class FullCloudsDataset(_Point_Dataset):
    """Point-cloud dataset with one item per raw file found by the base class."""

    split: str
    name = 'FullCloudsDataset'

    def __init__(self, *args, setting='pc', **kwargs):
        """Store ``setting`` and forward everything else to the base class."""
        # The base initialiser reads ``self.setting``, so it has to exist first.
        self.setting = setting
        super().__init__(*args, **kwargs)

    def __len__(self):
        """Return the number of raw files collected in ``self._files``."""
        return len(self._files)

View File

@@ -1,83 +0,0 @@
import pickle
from collections import defaultdict
import numpy as np
from torch.utils.data import ConcatDataset
from tqdm import trange
from ._point_dataset import _Point_Dataset
class GridClusters(_Point_Dataset):
    """Point-cloud dataset that serves one randomly chosen cluster per access.

    The raw rows of a file are grouped by their last column (converted to
    ``int``); the cache file stores one dict of numpy arrays per cluster.
    """

    split: str
    name = 'GridClusters'

    def __init__(self, *args, n_spatial_clusters=3*3*3, setting='pc', **kwargs):
        """Store the cluster settings and forward the rest to the base class.

        Args:
            n_spatial_clusters: Stored on the instance; not read by the
                methods of this class.
            setting: Sub string used to glob raw files and to name the cache
                directory.
        """
        self.n_spatial_clusters = n_spatial_clusters
        # The base initialiser reads ``self.setting``, so set it first.
        self.setting = setting
        super(GridClusters, self).__init__(*args, **kwargs)

    def __len__(self):
        return len(self._files)

    def _read_or_load(self, item):
        """Make sure the per-cluster cache file for ``item`` exists and return its path."""
        raw_file_path = self._files[item]
        processed_file_path = self.processed / raw_file_path.name.replace(self.raw_ext, self.processed_ext)

        if not self.load_preprocessed:
            processed_file_path.unlink(missing_ok=True)

        if not processed_file_path.exists():
            # nested default dict: cluster id -> header -> list of values
            pointcloud = defaultdict(lambda: defaultdict(list))
            with raw_file_path.open('r') as raw_file:
                for row in raw_file:
                    tokens = row.split()
                    if not tokens:
                        # Blank rows (e.g. a trailing newline) carry no point.
                        continue
                    values = [float(x) for x in tokens]
                    cluster_id = int(values[-1])
                    for header, value in zip(self.headers, values):
                        pointcloud[cluster_id][header].append(value)

            # Plain dicts of arrays; the lambda factory above is not picklable.
            pointcloud = {
                cluster: {key: np.asarray(column) for key, column in columns.items()}
                for cluster, columns in pointcloud.items()
            }
            with processed_file_path.open('wb') as processed_file:
                pickle.dump(pointcloud, processed_file)

        return processed_file_path

    def __getitem__(self, item):
        """Return ``(normal, position, label, cl_label)`` of one random cluster of file ``item``.

        The index array delivered by the sampler is repeated until it holds
        ``sampling_k`` entries.

        Raises:
            ValueError: If the sampler returns no indices although
                ``sampling_k`` is positive (padding would never terminate).
        """
        processed_file_path = self._read_or_load(item)

        # NOTE: pickle must only be used on trusted data; these files are the
        # cache written by ``_read_or_load``.
        with processed_file_path.open('rb') as processed_file:
            pointcloud = pickle.load(processed_file)

        # Random variant: pick any of the stored clusters.
        cl_idx = np.random.randint(0, len(pointcloud))
        pointcloud = pointcloud[list(pointcloud.keys())[cl_idx]]

        position = np.stack((pointcloud['x'], pointcloud['y'], pointcloud['z']), axis=-1)
        normal = np.stack((pointcloud['xn'], pointcloud['yn'], pointcloud['zn']), axis=-1)
        label = pointcloud['label']
        cl_label = pointcloud['cl_idx']

        sample_idxs = self.sampling(position)

        if sample_idxs.shape[0] < self.sampling_k:
            if sample_idxs.shape[0] == 0:
                # Doubling an empty array never grows it.
                raise ValueError('The sampler returned no indices; cannot pad to "sampling_k" points.')
            while sample_idxs.shape[0] < self.sampling_k:
                sample_idxs = np.concatenate((sample_idxs, sample_idxs))[:self.sampling_k]

        # ``np.float`` / ``np.int`` were plain aliases of the builtins and have
        # been removed from NumPy; the builtins yield the same dtypes.
        normal = normal[sample_idxs].astype(float)
        position = position[sample_idxs].astype(float)

        normal = self.transforms(normal)
        position = self.transforms(position)

        return (normal, position,
                label[sample_idxs].astype(int),
                cl_label[sample_idxs].astype(int)
                )

212
datasets/shapenet.py Normal file
View File

@@ -0,0 +1,212 @@
from pathlib import Path
import numpy as np
from collections import defaultdict
import os
from tqdm import tqdm
import glob
import torch
from torch_geometric.data import InMemoryDataset
from torch_geometric.data import Data
from torch.utils.data import Dataset
import re
from utils.project_config import Classes, DataSplit
def save_names(name_list, path):
    """Write all entries of *name_list* to *path*, one directly after another.

    The file is opened in binary mode. ``bytes`` entries are written as they
    are; ``str`` entries are encoded as UTF-8 first (they used to raise a
    ``TypeError``). No separators are added, so entries have to carry their
    own line endings.

    Args:
        name_list: Iterable of ``str`` or bytes-like names.
        path: Target file; an existing file is overwritten.
    """
    with open(path, 'wb') as f:
        f.writelines(name.encode('utf-8') if isinstance(name, str) else name for name in name_list)
class CustomShapeNet(InMemoryDataset):
    """In-memory dataset built from the ``*.xyz`` files in ``root/raw/<mode>``.

    Only files whose name contains ``'grid'`` are read. The last column of a
    row is used as segment id (``y_c``), the second to last as label (``y``)
    and the columns in front of those as point features. The collated result
    is stored as ``root/processed/<mode>.pt``.
    """

    categories = {key: val for val, key in Classes().items()}
    modes = {key: val for val, key in DataSplit().items()}
    name = 'CustomShapeNet'

    @property
    def raw_dir(self):
        return self.root / 'raw'

    @property
    def raw_file_names(self):
        return [self.mode]

    @property
    def processed_dir(self):
        return self.root / 'processed'

    def __init__(self, root_dir, collate_per_segment=True, mode='train', transform=None, pre_filter=None,
                 pre_transform=None, refresh=False, with_normals=False):
        """Create the dataset and load (or build) its processed file.

        Args:
            root_dir: Folder that holds the ``raw`` and ``processed`` folders.
            collate_per_segment: If true every segment of a cloud becomes its
                own sample, otherwise one sample per cloud is created.
            mode: One of the keys of ``modes``.
            transform, pre_filter, pre_transform: Forwarded to
                ``InMemoryDataset``.
            refresh: If true the processed file is deleted and rebuilt.
            with_normals: If true ``pos`` holds the first six columns and
                ``norm`` is ``None``; otherwise ``pos`` holds the first three
                columns and ``norm`` columns three to five.
        """
        assert mode in self.modes.keys(), f'"mode" must be one of {self.modes.keys()}'

        # Set the Dataset Parameters
        self.collate_per_segment, self.mode, self.refresh = collate_per_segment, mode, refresh
        self.with_normals = with_normals
        root_dir = Path(root_dir)
        super(CustomShapeNet, self).__init__(root_dir, transform, pre_transform, pre_filter)
        self.data, self.slices = self._load_dataset()
        print("Initialized")

    @property
    def processed_file_names(self):
        return [f'{self.mode}.pt']

    def download(self):
        """Check that raw data is present; nothing is actually downloaded.

        Returns:
            The number of folders found in ``raw_dir``.

        Raises:
            IOError: If ``raw_dir`` contains no folder.
        """
        dir_count = len([name for name in os.listdir(self.raw_dir) if os.path.isdir(os.path.join(self.raw_dir, name))])

        if dir_count:
            print(f'{dir_count} folders have been found....')
            return dir_count
        raise IOError("No raw pointclouds have been found.")

    @property
    def num_classes(self):
        return len(self.categories)

    def _load_dataset(self):
        """Load ``(data, slices)`` from the processed file, building it if needed.

        Raises:
            FileNotFoundError: If the processed file is still missing after
                ``process()`` ran, i.e. no usable raw point cloud was found.
        """
        filepath = self.processed_paths[0]
        if self.refresh:
            try:
                os.remove(filepath)
                print('Processed Location "Refreshed" (We deleted the Files)')
            except FileNotFoundError:
                print('You meant to refresh the allready processed dataset, but there were none...')
                print('continue processing')

        try:
            data, slices = torch.load(filepath)
        except FileNotFoundError:
            # Build the file once. Retrying without end would hang whenever
            # ``process()`` has nothing to write.
            self.process()
            try:
                data, slices = torch.load(filepath)
            except FileNotFoundError as err:
                raise FileNotFoundError(
                    f'Processing produced no dataset file at "{filepath}".'
                ) from err
        print('Dataset Loaded')
        return data, slices

    def _transform_and_filter(self, data):
        """Placeholder hook for ``pre_filter`` / ``pre_transform``.

        Neither is supported yet: a rejecting ``pre_filter`` or any
        ``pre_transform`` ends in ``NotImplementedError``.
        """
        # ToDo: ANy filter to apply? Then do it here.
        if self.pre_filter is not None and not self.pre_filter(data):
            data = self.pre_filter(data)
            raise NotImplementedError
        # ToDo: ANy transformation to apply? Then do it here.
        if self.pre_transform is not None:
            data = self.pre_transform(data)
            raise NotImplementedError
        return data

    def process(self, delimiter=' '):
        """Parse the raw clouds of the current mode and save the collated dataset.

        Args:
            delimiter: Column separator of the raw files.
        """
        datasets = defaultdict(list)
        path_to_clouds = self.raw_dir / self.mode

        for pointcloud in tqdm(path_to_clouds.glob('*.xyz')):
            if 'grid' not in pointcloud.name:
                continue
            data = None

            # Group the rows by their last column.
            with pointcloud.open('r') as f:
                src = defaultdict(list)
                for row in f:
                    row = row.rstrip()
                    if not row:
                        # Rows read from a file still carry their newline, so
                        # blank lines have to be detected after stripping.
                        continue
                    vals = [float(x) if x not in ['-nan(ind)', 'nan(ind)'] else 0 for x in row.split(delimiter)]
                    src[vals[-1]].append(vals)

            if not src:
                continue

            # One 2D tensor (points, columns) per segment.
            segments = {key: torch.tensor(rows, dtype=torch.double) for key, rows in src.items()}
            if self.collate_per_segment:
                src = {key: segment.squeeze() for key, segment in segments.items()}
            else:
                # Segments differ in their number of points, hence they are
                # concatenated along the point axis rather than stacked.
                src = dict(all=torch.cat(list(segments.values())))

            for values in src.values():
                try:
                    points = values[:, :-2]
                except IndexError:
                    # A squeezed single-row segment is 1D and is skipped.
                    continue
                y = torch.as_tensor(values[:, -2], dtype=torch.long)
                y_c = torch.as_tensor(values[:, -1], dtype=torch.long)
                ####################################
                # This is where you define the keys
                attr_dict = dict(y=y, y_c=y_c)
                if self.with_normals:
                    attr_dict.update(pos=points[:, :6], norm=None)
                else:
                    attr_dict.update(pos=points[:, :3], norm=points[:, 3:6])
                ####################################
                if self.collate_per_segment:
                    data = Data(**attr_dict)
                else:
                    if data is None:
                        data = defaultdict(list)
                    for attr_name, attr_val in attr_dict.items():
                        data[attr_name].append(attr_val)

                data = self._transform_and_filter(data)
                if self.collate_per_segment:
                    datasets[self.mode].append(data)

            if not self.collate_per_segment and data:
                # Merge the collected lists into one sample for this cloud;
                # attributes that are ``None`` cannot be concatenated.
                merged = {
                    attr_name: torch.cat(attr_vals)
                    for attr_name, attr_vals in data.items()
                    if all(val is not None for val in attr_vals)
                }
                datasets[self.mode].append(Data(**merged))

        if datasets[self.mode]:
            os.makedirs(self.processed_dir, exist_ok=True)
            torch.save(self.collate(datasets[self.mode]), self.processed_paths[0])

    def __repr__(self):
        return f'{self.__class__.__name__}({len(self)})'
class ShapeNetPartSegDataset(Dataset):
    """Wrap ``CustomShapeNet`` and resample every item to a fixed number of points.

    In ``'predict'`` mode all points of an item are returned exactly once (in
    random order); in every other mode ``npoints`` points are drawn with
    replacement. Labels are passed through unchanged.
    """
    name = 'ShapeNetPartSegDataset'

    def __init__(self, root_dir, npoints=1024, mode='train', **kwargs):
        """Build the wrapped ``CustomShapeNet``.

        Args:
            root_dir: Forwarded to ``CustomShapeNet``.
            npoints: Number of points per sample outside of ``'predict'`` mode.
            mode: Forwarded to ``CustomShapeNet``.
            **kwargs: Further keyword arguments for ``CustomShapeNet``.
        """
        super(ShapeNetPartSegDataset, self).__init__()
        self.mode = mode
        kwargs.update(dict(root_dir=root_dir, mode=self.mode))
        self.npoints = npoints
        self.dataset = CustomShapeNet(**kwargs)

    def __getitem__(self, index):
        """Return a ``Data`` object with the resampled ``pos``, ``y`` and ``norm``."""
        data = self.dataset[index]
        n_available = data.pos.shape[0]
        predicting = self.mode == 'predict'

        # Resample to fixed number of points
        try:
            npoints = n_available if predicting else self.npoints
            choice = np.random.choice(n_available, npoints, replace=not predicting)
        except ValueError:
            # Best effort: an item that cannot be sampled yields an empty selection.
            choice = []

        pos, y = data.pos[choice, :], data.y[choice]
        # ``norm`` is None when the normals are stored as part of ``pos``.
        norm = getattr(data, 'norm', None)
        if norm is not None:
            norm = norm[choice]

        sample = Data(**dict(pos=pos,  # torch.Tensor (n, 3/6)
                             y=y,  # torch.Tensor (n,)
                             norm=norm  # torch.Tensor (n, 3) or None
                             )
                      )

        return sample

    def __len__(self):
        return len(self.dataset)

    def num_classes(self):
        """Return the number of classes of the wrapped dataset."""
        return self.dataset.num_classes

View File

@@ -1,8 +1,6 @@
from torch.utils.data import Dataset
from._point_dataset import _Point_Dataset
# Template
class TemplateDataset(_Point_Dataset):
class TemplateDataset(object):
def __init__(self, *args, **kwargs):
super(TemplateDataset, self).__init__()