intermediate backup

This commit is contained in:
2025-05-03 20:46:14 +02:00
parent 2b0a5728d4
commit 6542caf48f
38 changed files with 4513 additions and 1067 deletions

View File

@ -5,9 +5,10 @@ import torch
from torch.utils.data import Dataset, DataLoader
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from typing import Tuple, Generator, List, Optional, Union, Dict, Literal, Type
import math # Add math import
# Use relative import for utils within the package
from .utils.config_model import DataConfig, FeatureConfig, TrainingConfig, EvaluationConfig, CrossValidationConfig
from .utils.forecast_config_model import DataConfig, FeatureConfig, TrainingConfig, EvaluationConfig, CrossValidationConfig
# Optional: Import wavelet library if needed later
# import pywt
@ -264,31 +265,39 @@ def engineer_features(df: pd.DataFrame, target_col: str, feature_config: Feature
if isinstance(nan_handler, str):
if nan_handler in ['ffill', 'bfill']:
fill_method = nan_handler
logger.debug(f"Filling NaNs in generated features using method: '{fill_method}'")
logger.debug(f"Selected NaN fill method for generated features: '{fill_method}'")
elif nan_handler == 'mean':
logger.warning("NaN filling with 'mean' in generated features is applied globally here;"
" consider per-fold mean filling if lookahead is a concern.")
# Calculate mean only on the slice provided, potentially leaking info if slice includes val/test
# Better to use ffill/bfill here or handle after split
fill_value = features_df[feature_cols_generated].mean() # Calculate mean per feature column
logger.debug("Filling NaNs in generated features using column means.")
fill_value = features_df[feature_cols_generated].mean()
logger.debug("Selected NaN fill method: column means.")
else:
logger.warning(f"Unsupported string fill_nan method '{nan_handler}' for generated features. Using 'ffill'.")
fill_method = 'ffill'
fill_method = 'ffill' # Default to ffill if unsupported string
elif isinstance(nan_handler, (int, float)):
fill_value = float(nan_handler)
logger.debug(f"Filling NaNs in generated features with value: {fill_value}")
logger.debug(f"Selected NaN fill value for generated features: {fill_value}")
else:
logger.warning(f"Invalid fill_nan type: {type(nan_handler)}. NaNs in features may remain.")
# Apply filling only to generated feature columns
if fill_method:
features_df[feature_cols_generated] = features_df[feature_cols_generated].fillna(method=fill_method)
if fill_method == 'ffill':
features_df[feature_cols_generated] = features_df[feature_cols_generated].fillna(method='bfill')
# Apply filling only to generated feature columns using recommended methods
if fill_method == 'ffill':
logger.debug("Applying .ffill() to generated features...")
features_df[feature_cols_generated] = features_df[feature_cols_generated].ffill()
# Apply bfill afterwards to handle any NaNs remaining at the very beginning
logger.debug("Applying .bfill() to handle any remaining NaNs at the start...")
features_df[feature_cols_generated] = features_df[feature_cols_generated].bfill()
elif fill_method == 'bfill':
logger.debug("Applying .bfill() to generated features...")
features_df[feature_cols_generated] = features_df[feature_cols_generated].bfill()
# Optionally apply ffill after bfill if you need to fill trailing NaNs (less common)
# features_df[feature_cols_generated] = features_df[feature_cols_generated].ffill()
elif fill_value is not None:
# fillna with Series/dict for column-wise mean, or scalar for constant value
logger.debug(f"Applying .fillna(value={fill_value}) to generated features...")
features_df[feature_cols_generated] = features_df[feature_cols_generated].fillna(value=fill_value)
# No else needed, if fill_method and fill_value are None, no filling happens
else:
logger.warning("`fill_nan` is None. NaNs generated by feature engineering may remain.")
@ -366,36 +375,31 @@ class TimeSeriesCrossValidationSplitter:
# Estimate if None
elif self.initial_train_size is None:
min_samples_per_split_step = 2 # Heuristic minimum samples for val+test in one step
# Estimate val/test based on *potential* train size (crude)
# Assume train is roughly (1 - val - test) fraction for estimation
estimated_train_frac = max(0.1, 1.0 - self.val_frac - self.test_frac) # Ensure non-zero
estimated_train_n = int(self.n_samples * estimated_train_frac)
val_test_size_per_step = max(min_samples_per_split_step, int(estimated_train_n * (self.val_frac + self.test_frac)))
logger.info("Estimating fixed train size based on n_splits, val_frac, test_frac.")
# Estimate based on the total space needed for all splits:
# n_samples >= fixed_train_n + val_size + test_size + (n_splits - 1) * step_size
# n_samples >= fixed_train_n + int(fixed_train_n*val_frac) + n_splits * int(fixed_train_n*test_frac)
# n_samples >= fixed_train_n * (1 + val_frac + n_splits * test_frac)
# fixed_train_n <= n_samples / (1 + val_frac + n_splits * test_frac)
# Tentative initial train size is total minus one val/test block
fixed_train_n_est = self.n_samples - val_test_size_per_step
denominator = 1.0 + self.val_frac + self.n_splits * self.test_frac
if denominator <= 1.0: # Avoid division by zero or non-positive, and ensure train frac < 1
raise ValueError(f"Cannot estimate initial_train_size. Combination of val_frac ({self.val_frac}), "
f"test_frac ({self.test_frac}), and n_splits ({self.n_splits}) is invalid (denominator {denominator:.2f} <= 1.0).")
# Basic sanity checks
if fixed_train_n_est <= 0:
raise ValueError("Could not estimate a valid initial_train_size (<= 0). Please specify it or check CV fractions.")
# Need at least 1 sample for train, val, test each theoretically
est_val_size = max(1, int(fixed_train_n_est * self.val_frac))
est_test_size = max(1, int(fixed_train_n_est * self.test_frac))
if fixed_train_n_est + est_val_size + est_test_size > self.n_samples:
# If the simple estimate is too large, reduce it more drastically
# Try setting train size = 50% and see if val/test fit?
fixed_train_n_est = int(self.n_samples * 0.5)
est_val_size = max(1, int(fixed_train_n_est * self.val_frac))
est_test_size = max(1, int(fixed_train_n_est * self.test_frac))
if fixed_train_n_est <=0 or (fixed_train_n_est + est_val_size + est_test_size > self.n_samples):
raise ValueError("Could not estimate a valid initial_train_size. Data too small relative to val/test fractions? Please specify initial_train_size.")
estimated_size = int(self.n_samples / denominator)
logger.warning(f"initial_train_size not set, estimated fixed train size for rolling window: {fixed_train_n_est}. "
"This is a heuristic; viability depends on n_splits and step size. Validation happens in split().")
return fixed_train_n_est
# Add a sanity check: ensure estimated size is reasonably large
min_required_for_features = 1 # Placeholder - ideally get from FeatureConfig if possible, but complex here
if estimated_size < min_required_for_features:
raise ValueError(f"Estimated fixed train size ({estimated_size}) is too small. "
f"Check CV config (n_splits={self.n_splits}, val_frac={self.val_frac}, test_frac={self.test_frac}) "
f"relative to total samples ({self.n_samples}). Consider specifying initial_train_size manually.")
logger.info(f"Estimated fixed training window size: {estimated_size}")
return estimated_size
else:
raise ValueError(f"Invalid initial_train_size: {self.initial_train_size}")
raise ValueError(f"Invalid initial_train_size type or value: {self.initial_train_size}")
def split(self) -> Generator[Tuple[np.ndarray, np.ndarray, np.ndarray], None, None]:
@ -483,28 +487,31 @@ class TimeSeriesDataset(Dataset):
"""
PyTorch Dataset for time series forecasting.
Takes a NumPy array (features + target), a sequence length, and a list of
specific forecast horizons. Returns (input_sequence, target_vector) tuples,
where target_vector contains the target values at the specified future steps.
Compatible with PyTorch DataLoaders used by PyTorch Lightning.
"""
def __init__(self, data_array: np.ndarray, sequence_length: int, forecast_horizon: int, target_col_index: int = 0):
def __init__(self, data_array: np.ndarray, sequence_length: int, forecast_horizon: List[int], target_col_index: int = 0):
"""
Args:
data_array: Numpy array of shape (n_samples, n_features).
Assumes the target variable is one of the columns.
sequence_length: Length of the input sequence (lookback window).
forecast_horizon: Number of steps ahead to predict.
forecast_horizon: List of specific steps ahead to predict (e.g., [1, 6, 12]).
target_col_index: Index of the target column in data_array. Defaults to 0.
"""
if sequence_length <= 0:
raise ValueError("sequence_length must be positive.")
if forecast_horizon <= 0:
raise ValueError("forecast_horizon must be positive.")
if not forecast_horizon or not isinstance(forecast_horizon, list) or any(h <= 0 for h in forecast_horizon):
raise ValueError("forecast_horizon must be a non-empty list of positive integers.")
if data_array.ndim != 2:
raise ValueError(f"data_array must be 2D, but got shape {data_array.shape}")
min_len_required = sequence_length + forecast_horizon
self.max_horizon = max(forecast_horizon) # Find the furthest point needed
min_len_required = sequence_length + self.max_horizon
if min_len_required > data_array.shape[0]:
raise ValueError(f"sequence_length ({sequence_length}) + forecast_horizon ({forecast_horizon}) = {min_len_required} "
raise ValueError(f"sequence_length ({sequence_length}) + max_horizon ({self.max_horizon}) = {min_len_required} "
f"exceeds total samples provided ({data_array.shape[0]})")
if not (0 <= target_col_index < data_array.shape[1]):
raise ValueError(f"target_col_index ({target_col_index}) out of bounds for data with {data_array.shape[1]} columns.")
@ -512,32 +519,37 @@ class TimeSeriesDataset(Dataset):
self.data = torch.tensor(data_array, dtype=torch.float32)
self.sequence_length = sequence_length
self.forecast_horizon = forecast_horizon
self.forecast_horizon_list = sorted(forecast_horizon)
self.target_col_index = target_col_index
self.n_samples = data_array.shape[0]
self.n_features = data_array.shape[1]
logger.debug(f"TimeSeriesDataset created: data shape={self.data.shape}, "
f"seq_len={self.sequence_length}, forecast_horizon={self.forecast_horizon}, "
f"target_idx={self.target_col_index}")
f"seq_len={self.sequence_length}, forecast_horizons={self.forecast_horizon_list}, "
f"max_horizon={self.max_horizon}, target_idx={self.target_col_index}")
def __len__(self) -> int:
"""Returns the total number of sequences that can be generated."""
return self.n_samples - self.sequence_length - self.forecast_horizon + 1
return self.n_samples - self.sequence_length - self.max_horizon + 1
def __getitem__(self, idx: int) -> Tuple[torch.Tensor, torch.Tensor]:
"""
Returns a single (input_sequence, target_sequence) pair.
Returns a single (input_sequence, target_vector) pair.
Target vector contains values for the specified forecast horizons.
"""
if not (0 <= idx < len(self)):
raise IndexError(f"Index {idx} out of bounds for dataset with length {len(self)}")
input_start = idx
input_end = idx + self.sequence_length
input_sequence = self.data[input_start:input_end, :]
target_start = input_end
target_end = target_start + self.forecast_horizon
target_sequence = self.data[target_start:target_end, self.target_col_index]
return input_sequence, target_sequence
input_sequence = self.data[input_start:input_end, :] # Shape: (seq_len, n_features)
# Calculate indices for each horizon relative to the end of the input sequence
# Horizon h corresponds to index: input_end + h - 1
target_indices = [input_end + h - 1 for h in self.forecast_horizon_list]
target_vector = self.data[target_indices, self.target_col_index] # Shape: (len(forecast_horizon_list),)
return input_sequence, target_vector
# --- Data Preparation ---
def prepare_fold_data_and_loaders(
@ -576,6 +588,7 @@ def prepare_fold_data_and_loaders(
feature_config: Configuration for feature engineering.
train_config: Configuration for training (used for batch size, device hints).
eval_config: Configuration for evaluation (used for batch size).
Returns:
Tuple containing:
@ -598,13 +611,25 @@ def prepare_fold_data_and_loaders(
if feature_config.lags:
max_lookback = max(max_lookback, max(feature_config.lags))
if feature_config.rolling_window_sizes:
max_lookback = max(max_lookback, max(feature_config.rolling_window_sizes) -1 )
max_history_needed = max(max_lookback, feature_config.sequence_length)
max_lookback = max(max_lookback, max(feature_config.rolling_window_sizes) -1)
# Also need history for the input sequence length and max target horizon
max_horizon_needed = max(feature_config.forecast_horizon) if feature_config.forecast_horizon else 0
# Max history needed is max of lookback for features OR (sequence_length + max_horizon - 1) for targets/inputs
# Correct logic: Need `sequence_length` history for input, and `max_horizon` steps *after* the train data for targets/evaluation.
# The slicing needs to ensure enough data *before* train_idx[0] for feature lookback *and* sequence_length.
# Max history *before* the start of the training set
max_history_needed_before_train = max(max_lookback, feature_config.sequence_length)
slice_start_idx = max(0, train_idx[0] - max_history_needed_before_train)
# The end index needs to cover the test set PLUS the maximum horizon needed for the last test target
slice_end_idx = test_idx[-1] + max_horizon_needed # Go up to the last needed target
# Ensure end index is within bounds
slice_end_idx = min(slice_end_idx + 1, len(full_df)) # +1 because iloc is exclusive
slice_start_idx = max(0, train_idx[0] - max_history_needed)
slice_end_idx = test_idx[-1] + 1
if slice_start_idx >= slice_end_idx:
raise ValueError(f"Calculated slice start ({slice_start_idx}) >= slice end ({slice_end_idx}). Check indices.")
raise ValueError(f"Calculated slice start ({slice_start_idx}) >= slice end ({slice_end_idx}). Check indices and horizon.")
fold_data_slice = full_df.iloc[slice_start_idx:slice_end_idx]
logger.debug(f"Required data slice for fold: indices {slice_start_idx} to {slice_end_idx-1} "
@ -709,22 +734,38 @@ def prepare_fold_data_and_loaders(
input_size = train_data_scaled.shape[1]
# --- Ensure final data arrays are float32 for PyTorch ---
try:
# Explicitly convert to float32 AFTER scaling (or non-scaling)
train_data_final = train_data_scaled.astype(np.float32)
val_data_final = val_data_scaled.astype(np.float32)
test_data_final = test_data_scaled.astype(np.float32)
logger.debug("Ensured final data arrays are float32.")
except ValueError as e:
# This might happen if data cannot be safely cast (e.g., strings remain unexpectedly)
logger.error(f"Failed to convert data arrays to float32 before creating Tensors: {e}", exc_info=True)
# Consider adding more debug info here if it fails, e.g.:
# logger.debug(f"Data types in train_df before conversion: \n{train_df.dtypes}")
raise ValueError("Data could not be converted to numeric type (float32) for PyTorch.") from e
# 6. Dataset Instantiation
logger.debug("Creating TimeSeriesDataset instances for the fold.")
try:
# Use the explicitly converted arrays
train_dataset = TimeSeriesDataset(
train_data_scaled, feature_config.sequence_length, feature_config.forecast_horizon, target_col_index=target_col_index_in_features
train_data_final, feature_config.sequence_length, feature_config.forecast_horizon, target_col_index=target_col_index_in_features
)
val_dataset = TimeSeriesDataset(
val_data_scaled, feature_config.sequence_length, feature_config.forecast_horizon, target_col_index=target_col_index_in_features
val_data_final, feature_config.sequence_length, feature_config.forecast_horizon, target_col_index=target_col_index_in_features
)
test_dataset = TimeSeriesDataset(
test_data_scaled, feature_config.sequence_length, feature_config.forecast_horizon, target_col_index=target_col_index_in_features
test_data_final, feature_config.sequence_length, feature_config.forecast_horizon, target_col_index=target_col_index_in_features
)
except ValueError as e:
logger.error(f"Error creating TimeSeriesDataset: {e}")
logger.error(f"Shapes fed to Dataset: Train={train_data_scaled.shape}, Val={val_data_scaled.shape}, Test={test_data_scaled.shape}")
logger.error(f"SeqLen={feature_config.sequence_length}, Horizon={feature_config.forecast_horizon}")
logger.error(f"Shapes fed to Dataset: Train={train_data_final.shape}, Val={val_data_final.shape}, Test={test_data_final.shape}")
logger.error(f"SeqLen={feature_config.sequence_length}, Horizons={feature_config.forecast_horizon}")
raise
@ -748,4 +789,69 @@ def prepare_fold_data_and_loaders(
logger.info("Data loaders prepared successfully for the fold.")
return train_loader, val_loader, test_loader, target_scaler, input_size
return train_loader, val_loader, test_loader, target_scaler, input_size
# --- Classic Train/Val/Test Split ---
def split_data_classic(
    n_samples: int,
    val_frac: float,
    test_frac: float,
    start_from_end: bool = True
) -> Tuple[np.ndarray, np.ndarray, np.ndarray]:
    """
    Splits data indices into one train, one validation, and one test set based on fractions.

    The three sets are contiguous, chronologically ordered blocks
    (train, then validation, then test) that together cover every index in
    ``range(n_samples)``.

    Args:
        n_samples: Total number of samples in the dataset.
        val_frac: Fraction of the *total* data to use for validation.
        test_frac: Fraction of the *total* data to use for testing.
        start_from_end: Kept for interface compatibility. Because the train size is
                        defined as ``n_samples - val_size - test_size``, the blocks
                        always tile the full index range, so taking val/test "from
                        the end" and placing them "directly after train" give the
                        same indices. The flag therefore does not change the result.

    Returns:
        Tuple of (train_indices, val_indices, test_indices).

    Raises:
        ValueError: If n_samples is not positive, if a fraction is outside (0, 1),
                    if the fractions sum to >= 1, or if no samples are left for
                    training.
    """
    if n_samples <= 0:
        raise ValueError(f"n_samples must be positive, got {n_samples}")
    if not (0 < val_frac < 1):
        raise ValueError(f"val_frac must be between 0 and 1, got {val_frac}")
    if not (0 < test_frac < 1):
        raise ValueError(f"test_frac must be between 0 and 1, got {test_frac}")
    if val_frac + test_frac >= 1:
        raise ValueError(f"Sum of val_frac ({val_frac}) and test_frac ({test_frac}) must be less than 1.")

    def _block_size(frac: float) -> int:
        # Round before ceil so binary floating-point noise does not add a spurious
        # extra sample (100 * 0.07 evaluates to 7.000000000000001, which a plain
        # ceil would turn into 8). Ceil still guarantees at least one sample for
        # tiny fractions; max() covers products that round down to exactly 0.
        return max(1, math.ceil(round(n_samples * frac, 9)))

    test_size = _block_size(test_frac)
    val_size = _block_size(val_frac)
    train_size = n_samples - val_size - test_size

    if train_size <= 0:
        raise ValueError(f"Calculated train_size ({train_size}) is not positive. Adjust fractions or increase data.")

    indices = np.arange(n_samples)
    val_end = train_size + val_size
    train_indices = indices[:train_size]
    val_indices = indices[train_size:val_end]
    # val_end + test_size == n_samples, so the test block is exactly the tail.
    test_indices = indices[val_end:val_end + test_size]

    logger.info(f"Classic split: Train indices {train_indices[0]}-{train_indices[-1]} (size {len(train_indices)}), "
                f"Val indices {val_indices[0]}-{val_indices[-1]} (size {len(val_indices)}), "
                f"Test indices {test_indices[0]}-{test_indices[-1]} (size {len(test_indices)})")

    return train_indices, val_indices, test_indices