intermediate backup
@@ -5,9 +5,10 @@ import torch
from torch.utils.data import Dataset, DataLoader
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from typing import Tuple, Generator, List, Optional, Union, Dict, Literal, Type
import math  # Add math import

# Use relative import for utils within the package
from .utils.config_model import DataConfig, FeatureConfig, TrainingConfig, EvaluationConfig, CrossValidationConfig
from .utils.forecast_config_model import DataConfig, FeatureConfig, TrainingConfig, EvaluationConfig, CrossValidationConfig
# Optional: Import wavelet library if needed later
# import pywt

@@ -264,31 +265,39 @@ def engineer_features(df: pd.DataFrame, target_col: str, feature_config: Feature
        if isinstance(nan_handler, str):
            if nan_handler in ['ffill', 'bfill']:
                fill_method = nan_handler
                logger.debug(f"Filling NaNs in generated features using method: '{fill_method}'")
                logger.debug(f"Selected NaN fill method for generated features: '{fill_method}'")
            elif nan_handler == 'mean':
                logger.warning("NaN filling with 'mean' in generated features is applied globally here;"
                               " consider per-fold mean filling if lookahead is a concern.")
                # Calculate mean only on the slice provided, potentially leaking info if slice includes val/test
                # Better to use ffill/bfill here or handle after split
                fill_value = features_df[feature_cols_generated].mean()  # Calculate mean per feature column
                logger.debug("Filling NaNs in generated features using column means.")
                fill_value = features_df[feature_cols_generated].mean()
                logger.debug("Selected NaN fill method: column means.")
            else:
                logger.warning(f"Unsupported string fill_nan method '{nan_handler}' for generated features. Using 'ffill'.")
                fill_method = 'ffill'
                fill_method = 'ffill'  # Default to ffill if unsupported string
        elif isinstance(nan_handler, (int, float)):
            fill_value = float(nan_handler)
            logger.debug(f"Filling NaNs in generated features with value: {fill_value}")
            logger.debug(f"Selected NaN fill value for generated features: {fill_value}")
        else:
            logger.warning(f"Invalid fill_nan type: {type(nan_handler)}. NaNs in features may remain.")

        # Apply filling only to generated feature columns
        if fill_method:
            features_df[feature_cols_generated] = features_df[feature_cols_generated].fillna(method=fill_method)
            if fill_method == 'ffill':
                features_df[feature_cols_generated] = features_df[feature_cols_generated].fillna(method='bfill')
        # Apply filling only to generated feature columns using recommended methods
        if fill_method == 'ffill':
            logger.debug("Applying .ffill() to generated features...")
            features_df[feature_cols_generated] = features_df[feature_cols_generated].ffill()
            # Apply bfill afterwards to handle any NaNs remaining at the very beginning
            logger.debug("Applying .bfill() to handle any remaining NaNs at the start...")
            features_df[feature_cols_generated] = features_df[feature_cols_generated].bfill()
        elif fill_method == 'bfill':
            logger.debug("Applying .bfill() to generated features...")
            features_df[feature_cols_generated] = features_df[feature_cols_generated].bfill()
            # Optionally apply ffill after bfill if you need to fill trailing NaNs (less common)
            # features_df[feature_cols_generated] = features_df[feature_cols_generated].ffill()
        elif fill_value is not None:
            # fillna with Series/dict for column-wise mean, or scalar for constant value
            logger.debug(f"Applying .fillna(value={fill_value}) to generated features...")
            features_df[feature_cols_generated] = features_df[feature_cols_generated].fillna(value=fill_value)
        # No else needed, if fill_method and fill_value are None, no filling happens

    else:
        logger.warning("`fill_nan` is None. NaNs generated by feature engineering may remain.")

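For orientation, a minimal sketch of the ffill-then-bfill pattern this hunk applies to the generated feature columns only; the frame and column names below are made up for illustration:

import numpy as np
import pandas as pd

df = pd.DataFrame({
    "target": [10.0, 11.0, 12.0, 13.0],
    "target_lag_1": [np.nan, 10.0, 11.0, 12.0],          # lag features start with NaNs
    "target_roll_mean_3": [np.nan, np.nan, 11.0, 12.0],  # rolling features too
})
feature_cols = ["target_lag_1", "target_roll_mean_3"]

# Forward-fill first, then back-fill to cover NaNs left at the very start of the slice.
df[feature_cols] = df[feature_cols].ffill()
df[feature_cols] = df[feature_cols].bfill()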
@@ -366,36 +375,31 @@ class TimeSeriesCrossValidationSplitter:

        # Estimate if None
        elif self.initial_train_size is None:
            min_samples_per_split_step = 2  # Heuristic minimum samples for val+test in one step
            # Estimate val/test based on *potential* train size (crude)
            # Assume train is roughly (1 - val - test) fraction for estimation
            estimated_train_frac = max(0.1, 1.0 - self.val_frac - self.test_frac)  # Ensure non-zero
            estimated_train_n = int(self.n_samples * estimated_train_frac)
            val_test_size_per_step = max(min_samples_per_split_step, int(estimated_train_n * (self.val_frac + self.test_frac)))
            logger.info("Estimating fixed train size based on n_splits, val_frac, test_frac.")
            # Estimate based on the total space needed for all splits:
            # n_samples >= fixed_train_n + val_size + test_size + (n_splits - 1) * step_size
            # n_samples >= fixed_train_n + int(fixed_train_n*val_frac) + n_splits * int(fixed_train_n*test_frac)
            # n_samples >= fixed_train_n * (1 + val_frac + n_splits * test_frac)
            # fixed_train_n <= n_samples / (1 + val_frac + n_splits * test_frac)

            # Tentative initial train size is total minus one val/test block
            fixed_train_n_est = self.n_samples - val_test_size_per_step
            denominator = 1.0 + self.val_frac + self.n_splits * self.test_frac
            if denominator <= 1.0:  # Avoid division by zero or non-positive, and ensure train frac < 1
                raise ValueError(f"Cannot estimate initial_train_size. Combination of val_frac ({self.val_frac}), "
                                 f"test_frac ({self.test_frac}), and n_splits ({self.n_splits}) is invalid (denominator {denominator:.2f} <= 1.0).")

            # Basic sanity checks
            if fixed_train_n_est <= 0:
                raise ValueError("Could not estimate a valid initial_train_size (<= 0). Please specify it or check CV fractions.")
            # Need at least 1 sample for train, val, test each theoretically
            est_val_size = max(1, int(fixed_train_n_est * self.val_frac))
            est_test_size = max(1, int(fixed_train_n_est * self.test_frac))
            if fixed_train_n_est + est_val_size + est_test_size > self.n_samples:
                # If the simple estimate is too large, reduce it more drastically
                # Try setting train size = 50% and see if val/test fit?
                fixed_train_n_est = int(self.n_samples * 0.5)
                est_val_size = max(1, int(fixed_train_n_est * self.val_frac))
                est_test_size = max(1, int(fixed_train_n_est * self.test_frac))
                if fixed_train_n_est <= 0 or (fixed_train_n_est + est_val_size + est_test_size > self.n_samples):
                    raise ValueError("Could not estimate a valid initial_train_size. Data too small relative to val/test fractions? Please specify initial_train_size.")
            estimated_size = int(self.n_samples / denominator)

            logger.warning(f"initial_train_size not set, estimated fixed train size for rolling window: {fixed_train_n_est}. "
                           "This is a heuristic; viability depends on n_splits and step size. Validation happens in split().")
            return fixed_train_n_est
            # Add a sanity check: ensure estimated size is reasonably large
            min_required_for_features = 1  # Placeholder - ideally get from FeatureConfig if possible, but complex here
            if estimated_size < min_required_for_features:
                raise ValueError(f"Estimated fixed train size ({estimated_size}) is too small. "
                                 f"Check CV config (n_splits={self.n_splits}, val_frac={self.val_frac}, test_frac={self.test_frac}) "
                                 f"relative to total samples ({self.n_samples}). Consider specifying initial_train_size manually.")

            logger.info(f"Estimated fixed training window size: {estimated_size}")
            return estimated_size
        else:
            raise ValueError(f"Invalid initial_train_size: {self.initial_train_size}")
            raise ValueError(f"Invalid initial_train_size type or value: {self.initial_train_size}")


    def split(self) -> Generator[Tuple[np.ndarray, np.ndarray, np.ndarray], None, None]:
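The comment block above derives the closed-form bound fixed_train_n <= n_samples / (1 + val_frac + n_splits * test_frac). A quick numeric check of that estimate, with purely hypothetical values:

n_samples, n_splits = 1000, 5
val_frac, test_frac = 0.1, 0.1

denominator = 1.0 + val_frac + n_splits * test_frac    # 1.6
estimated_size = int(n_samples / denominator)           # 625

# Space consumed by a rolling window that steps forward by one test block per split:
val_size = int(estimated_size * val_frac)               # 62
test_size = int(estimated_size * test_frac)             # 62
assert estimated_size + val_size + n_splits * test_size <= n_samples  # 997 <= 1000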
@@ -483,28 +487,31 @@ class TimeSeriesDataset(Dataset):
    """
    PyTorch Dataset for time series forecasting.

    Takes a NumPy array (features + target), sequence length, and forecast horizon,
    and returns (input_sequence, target_sequence) tuples. Compatible with PyTorch
    DataLoaders used by PyTorch Lightning.
    Takes a NumPy array (features + target), sequence length, and a list of
    specific forecast horizons. Returns (input_sequence, target_vector) tuples,
    where target_vector contains the target values at the specified future steps.
    """
    def __init__(self, data_array: np.ndarray, sequence_length: int, forecast_horizon: int, target_col_index: int = 0):
    def __init__(self, data_array: np.ndarray, sequence_length: int, forecast_horizon: List[int], target_col_index: int = 0):
        """
        Args:
            data_array: Numpy array of shape (n_samples, n_features).
                Assumes the target variable is one of the columns.
            sequence_length: Length of the input sequence (lookback window).
            forecast_horizon: Number of steps ahead to predict.
            forecast_horizon: List of specific steps ahead to predict (e.g., [1, 6, 12]).
            target_col_index: Index of the target column in data_array. Defaults to 0.
        """
        if sequence_length <= 0:
            raise ValueError("sequence_length must be positive.")
        if forecast_horizon <= 0:
            raise ValueError("forecast_horizon must be positive.")
        if not forecast_horizon or not isinstance(forecast_horizon, list) or any(h <= 0 for h in forecast_horizon):
            raise ValueError("forecast_horizon must be a non-empty list of positive integers.")
        if data_array.ndim != 2:
            raise ValueError(f"data_array must be 2D, but got shape {data_array.shape}")
        min_len_required = sequence_length + forecast_horizon

        self.max_horizon = max(forecast_horizon)  # Find the furthest point needed

        min_len_required = sequence_length + self.max_horizon
        if min_len_required > data_array.shape[0]:
            raise ValueError(f"sequence_length ({sequence_length}) + forecast_horizon ({forecast_horizon}) = {min_len_required} "
            raise ValueError(f"sequence_length ({sequence_length}) + max_horizon ({self.max_horizon}) = {min_len_required} "
                             f"exceeds total samples provided ({data_array.shape[0]})")
        if not (0 <= target_col_index < data_array.shape[1]):
            raise ValueError(f"target_col_index ({target_col_index}) out of bounds for data with {data_array.shape[1]} columns.")
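A quick sanity check of the new list-based horizon validation, with made-up arguments:

sequence_length = 48
forecast_horizon = [1, 6, 12]

assert isinstance(forecast_horizon, list) and forecast_horizon and all(h > 0 for h in forecast_horizon)
max_horizon = max(forecast_horizon)                # 12, the furthest step needed
min_len_required = sequence_length + max_horizon   # 60 samples at minimum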
@@ -512,32 +519,37 @@ class TimeSeriesDataset(Dataset):

        self.data = torch.tensor(data_array, dtype=torch.float32)
        self.sequence_length = sequence_length
        self.forecast_horizon = forecast_horizon
        self.forecast_horizon_list = sorted(forecast_horizon)
        self.target_col_index = target_col_index
        self.n_samples = data_array.shape[0]
        self.n_features = data_array.shape[1]

        logger.debug(f"TimeSeriesDataset created: data shape={self.data.shape}, "
                     f"seq_len={self.sequence_length}, forecast_horizon={self.forecast_horizon}, "
                     f"target_idx={self.target_col_index}")
                     f"seq_len={self.sequence_length}, forecast_horizons={self.forecast_horizon_list}, "
                     f"max_horizon={self.max_horizon}, target_idx={self.target_col_index}")

    def __len__(self) -> int:
        """Returns the total number of sequences that can be generated."""
        return self.n_samples - self.sequence_length - self.forecast_horizon + 1
        return self.n_samples - self.sequence_length - self.max_horizon + 1

    def __getitem__(self, idx: int) -> Tuple[torch.Tensor, torch.Tensor]:
        """
        Returns a single (input_sequence, target_sequence) pair.
        Returns a single (input_sequence, target_vector) pair.
        Target vector contains values for the specified forecast horizons.
        """
        if not (0 <= idx < len(self)):
            raise IndexError(f"Index {idx} out of bounds for dataset with length {len(self)}")

        input_start = idx
        input_end = idx + self.sequence_length
        input_sequence = self.data[input_start:input_end, :]
        target_start = input_end
        target_end = target_start + self.forecast_horizon
        target_sequence = self.data[target_start:target_end, self.target_col_index]
        return input_sequence, target_sequence
        input_sequence = self.data[input_start:input_end, :]  # Shape: (seq_len, n_features)

        # Calculate indices for each horizon relative to the end of the input sequence
        # Horizon h corresponds to index: input_end + h - 1
        target_indices = [input_end + h - 1 for h in self.forecast_horizon_list]
        target_vector = self.data[target_indices, self.target_col_index]  # Shape: (len(forecast_horizon_list),)

        return input_sequence, target_vector

# --- Data Preparation ---
def prepare_fold_data_and_loaders(
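For orientation, a small standalone sketch of the multi-horizon indexing performed in __getitem__; the shapes and values below are made up:

import numpy as np
import torch

data = torch.tensor(np.arange(40, dtype=np.float32).reshape(20, 2))  # (n_samples=20, n_features=2)
sequence_length, horizons, target_col = 5, [1, 6, 12], 0

idx = 0
input_end = idx + sequence_length
input_sequence = data[idx:input_end, :]                          # shape (5, 2)
target_indices = [input_end + h - 1 for h in sorted(horizons)]   # [5, 10, 16]
target_vector = data[target_indices, target_col]                 # shape (3,)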
@@ -576,6 +588,7 @@ def prepare_fold_data_and_loaders(
        feature_config: Configuration for feature engineering.
        train_config: Configuration for training (used for batch size, device hints).
        eval_config: Configuration for evaluation (used for batch size).


    Returns:
        Tuple containing:
@@ -598,13 +611,25 @@ def prepare_fold_data_and_loaders(
    if feature_config.lags:
        max_lookback = max(max_lookback, max(feature_config.lags))
    if feature_config.rolling_window_sizes:
        max_lookback = max(max_lookback, max(feature_config.rolling_window_sizes) - 1)
    max_history_needed = max(max_lookback, feature_config.sequence_length)
        max_lookback = max(max_lookback, max(feature_config.rolling_window_sizes) - 1)

    # Also need history for the input sequence length and max target horizon
    max_horizon_needed = max(feature_config.forecast_horizon) if feature_config.forecast_horizon else 0
    # Max history needed is max of lookback for features OR (sequence_length + max_horizon - 1) for targets/inputs
    # Correct logic: Need `sequence_length` history for input, and `max_horizon` steps *after* the train data for targets/evaluation.
    # The slicing needs to ensure enough data *before* train_idx[0] for feature lookback *and* sequence_length.
    # Max history *before* the start of the training set
    max_history_needed_before_train = max(max_lookback, feature_config.sequence_length)

    slice_start_idx = max(0, train_idx[0] - max_history_needed_before_train)
    # The end index needs to cover the test set PLUS the maximum horizon needed for the last test target
    slice_end_idx = test_idx[-1] + max_horizon_needed  # Go up to the last needed target

    # Ensure end index is within bounds
    slice_end_idx = min(slice_end_idx + 1, len(full_df))  # +1 because iloc is exclusive

    slice_start_idx = max(0, train_idx[0] - max_history_needed)
    slice_end_idx = test_idx[-1] + 1
    if slice_start_idx >= slice_end_idx:
        raise ValueError(f"Calculated slice start ({slice_start_idx}) >= slice end ({slice_end_idx}). Check indices.")
        raise ValueError(f"Calculated slice start ({slice_start_idx}) >= slice end ({slice_end_idx}). Check indices and horizon.")

    fold_data_slice = full_df.iloc[slice_start_idx:slice_end_idx]
    logger.debug(f"Required data slice for fold: indices {slice_start_idx} to {slice_end_idx-1} "
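A numeric sketch of the new slice-bound arithmetic; all index values below are hypothetical:

train_idx_start = 500        # first training index of the fold
test_idx_end = 900           # last test index of the fold
n_total = 1000               # len(full_df)
max_lookback = 24            # from lags / rolling windows
sequence_length = 48
max_horizon_needed = 12      # max(feature_config.forecast_horizon)

max_history_needed_before_train = max(max_lookback, sequence_length)          # 48
slice_start_idx = max(0, train_idx_start - max_history_needed_before_train)   # 452
slice_end_idx = min(test_idx_end + max_horizon_needed + 1, n_total)           # 913, end-exclusive for iloc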
@@ -709,22 +734,38 @@ def prepare_fold_data_and_loaders(

    input_size = train_data_scaled.shape[1]

    # --- Ensure final data arrays are float32 for PyTorch ---
    try:
        # Explicitly convert to float32 AFTER scaling (or non-scaling)
        train_data_final = train_data_scaled.astype(np.float32)
        val_data_final = val_data_scaled.astype(np.float32)
        test_data_final = test_data_scaled.astype(np.float32)
        logger.debug("Ensured final data arrays are float32.")
    except ValueError as e:
        # This might happen if data cannot be safely cast (e.g., strings remain unexpectedly)
        logger.error(f"Failed to convert data arrays to float32 before creating Tensors: {e}", exc_info=True)
        # Consider adding more debug info here if it fails, e.g.:
        # logger.debug(f"Data types in train_df before conversion: \n{train_df.dtypes}")
        raise ValueError("Data could not be converted to numeric type (float32) for PyTorch.") from e


    # 6. Dataset Instantiation
    logger.debug("Creating TimeSeriesDataset instances for the fold.")
    try:
        # Use the explicitly converted arrays
        train_dataset = TimeSeriesDataset(
            train_data_scaled, feature_config.sequence_length, feature_config.forecast_horizon, target_col_index=target_col_index_in_features
            train_data_final, feature_config.sequence_length, feature_config.forecast_horizon, target_col_index=target_col_index_in_features
        )
        val_dataset = TimeSeriesDataset(
            val_data_scaled, feature_config.sequence_length, feature_config.forecast_horizon, target_col_index=target_col_index_in_features
            val_data_final, feature_config.sequence_length, feature_config.forecast_horizon, target_col_index=target_col_index_in_features
        )
        test_dataset = TimeSeriesDataset(
            test_data_scaled, feature_config.sequence_length, feature_config.forecast_horizon, target_col_index=target_col_index_in_features
            test_data_final, feature_config.sequence_length, feature_config.forecast_horizon, target_col_index=target_col_index_in_features
        )
    except ValueError as e:
        logger.error(f"Error creating TimeSeriesDataset: {e}")
        logger.error(f"Shapes fed to Dataset: Train={train_data_scaled.shape}, Val={val_data_scaled.shape}, Test={test_data_scaled.shape}")
        logger.error(f"SeqLen={feature_config.sequence_length}, Horizon={feature_config.forecast_horizon}")
        logger.error(f"Shapes fed to Dataset: Train={train_data_final.shape}, Val={val_data_final.shape}, Test={test_data_final.shape}")
        logger.error(f"SeqLen={feature_config.sequence_length}, Horizons={feature_config.forecast_horizon}")
        raise


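A minimal sketch of the float32 guard added above; the array is made up, and non-numeric (object-dtype) data is what typically triggers the except branch:

import numpy as np

train_data_scaled = np.array([[1.0, 2.0], [3.5, 4.5]])  # e.g. float64 output of a scaler
try:
    train_data_final = train_data_scaled.astype(np.float32)
except ValueError as e:
    raise ValueError("Data could not be converted to numeric type (float32) for PyTorch.") from e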
@@ -748,4 +789,69 @@ def prepare_fold_data_and_loaders(

    logger.info("Data loaders prepared successfully for the fold.")

    return train_loader, val_loader, test_loader, target_scaler, input_size
    return train_loader, val_loader, test_loader, target_scaler, input_size

# --- Classic Train/Val/Test Split ---

def split_data_classic(
    n_samples: int,
    val_frac: float,
    test_frac: float,
    start_from_end: bool = True
) -> Tuple[np.ndarray, np.ndarray, np.ndarray]:
    """
    Splits data indices into one train, one validation, and one test set based on fractions.

    Args:
        n_samples: Total number of samples in the dataset.
        val_frac: Fraction of the *total* data to use for validation.
        test_frac: Fraction of the *total* data to use for testing.
        start_from_end: If True (default), test and validation sets are taken from the end
            of the series. If False, they are taken after the initial training block.
            Default is True for typical time series evaluation.

    Returns:
        Tuple of (train_indices, val_indices, test_indices).

    Raises:
        ValueError: If fractions are invalid or sum to >= 1.
    """
    if not (0 < val_frac < 1):
        raise ValueError(f"val_frac must be between 0 and 1, got {val_frac}")
    if not (0 < test_frac < 1):
        raise ValueError(f"test_frac must be between 0 and 1, got {test_frac}")
    if val_frac + test_frac >= 1:
        raise ValueError(f"Sum of val_frac ({val_frac}) and test_frac ({test_frac}) must be less than 1.")

    test_size = math.ceil(n_samples * test_frac)  # Use ceil to ensure at least one sample if frac is tiny
    val_size = math.ceil(n_samples * val_frac)
    train_size = n_samples - val_size - test_size

    if train_size <= 0:
        raise ValueError(f"Calculated train_size ({train_size}) is not positive. Adjust fractions or increase data.")
    if val_size <= 0:
        raise ValueError("Calculated val_size is not positive.")
    if test_size <= 0:
        raise ValueError("Calculated test_size is not positive.")


    indices = np.arange(n_samples)

    if start_from_end:
        train_indices = indices[:train_size]
        val_indices = indices[train_size:train_size + val_size]
        test_indices = indices[train_size + val_size:]
        # Adjust if ceil caused slight overallocation in test
        test_indices = test_indices[:test_size]
    else:
        # Less common: place val/test directly after train
        train_indices = indices[:train_size]
        val_indices = indices[train_size:train_size + val_size]
        test_indices = indices[train_size + val_size:train_size + val_size + test_size]
        # Remaining data is unused in this scenario

    logger.info(f"Classic split: Train indices {train_indices[0]}-{train_indices[-1]} (size {len(train_indices)}), "
                f"Val indices {val_indices[0]}-{val_indices[-1]} (size {len(val_indices)}), "
                f"Test indices {test_indices[0]}-{test_indices[-1]} (size {len(test_indices)})")

    return train_indices, val_indices, test_indices
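For illustration, calling the classic split with hypothetical sizes:

n_samples, val_frac, test_frac = 1000, 0.15, 0.15

train_idx, val_idx, test_idx = split_data_classic(n_samples, val_frac, test_frac)
# test_size = ceil(1000 * 0.15) = 150, val_size = 150, train_size = 700
# train -> indices 0..699, val -> 700..849, test -> 850..999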