intermediate backup

2025-05-03 20:46:14 +02:00
parent 2b0a5728d4
commit 6542caf48f
38 changed files with 4513 additions and 1067 deletions

View File

@ -15,7 +15,7 @@ from .data_processing import (
prepare_fold_data_and_loaders,
TimeSeriesDataset
)
from .model import LSTMForecastLightningModule
from forecasting_model.train.model import LSTMForecastLightningModule
from .evaluation import (
evaluate_fold_predictions,
# Optionally expose the standalone evaluation utility if needed externally

View File

@ -5,9 +5,10 @@ import torch
from torch.utils.data import Dataset, DataLoader
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from typing import Tuple, Generator, List, Optional, Union, Dict, Literal, Type
import math # Add math import
# Use relative import for utils within the package
from .utils.config_model import DataConfig, FeatureConfig, TrainingConfig, EvaluationConfig, CrossValidationConfig
from .utils.forecast_config_model import DataConfig, FeatureConfig, TrainingConfig, EvaluationConfig, CrossValidationConfig
# Optional: Import wavelet library if needed later
# import pywt
@ -264,31 +265,39 @@ def engineer_features(df: pd.DataFrame, target_col: str, feature_config: Feature
if isinstance(nan_handler, str):
if nan_handler in ['ffill', 'bfill']:
fill_method = nan_handler
logger.debug(f"Filling NaNs in generated features using method: '{fill_method}'")
logger.debug(f"Selected NaN fill method for generated features: '{fill_method}'")
elif nan_handler == 'mean':
logger.warning("NaN filling with 'mean' in generated features is applied globally here;"
" consider per-fold mean filling if lookahead is a concern.")
# Calculate mean only on the slice provided, potentially leaking info if slice includes val/test
# Better to use ffill/bfill here or handle after split
fill_value = features_df[feature_cols_generated].mean() # Calculate mean per feature column
logger.debug("Filling NaNs in generated features using column means.")
fill_value = features_df[feature_cols_generated].mean()
logger.debug("Selected NaN fill method: column means.")
else:
logger.warning(f"Unsupported string fill_nan method '{nan_handler}' for generated features. Using 'ffill'.")
fill_method = 'ffill'
fill_method = 'ffill' # Default to ffill if unsupported string
elif isinstance(nan_handler, (int, float)):
fill_value = float(nan_handler)
logger.debug(f"Filling NaNs in generated features with value: {fill_value}")
logger.debug(f"Selected NaN fill value for generated features: {fill_value}")
else:
logger.warning(f"Invalid fill_nan type: {type(nan_handler)}. NaNs in features may remain.")
# Apply filling only to generated feature columns
if fill_method:
features_df[feature_cols_generated] = features_df[feature_cols_generated].fillna(method=fill_method)
if fill_method == 'ffill':
features_df[feature_cols_generated] = features_df[feature_cols_generated].fillna(method='bfill')
# Apply filling only to generated feature columns using recommended methods
if fill_method == 'ffill':
logger.debug("Applying .ffill() to generated features...")
features_df[feature_cols_generated] = features_df[feature_cols_generated].ffill()
# Apply bfill afterwards to handle any NaNs remaining at the very beginning
logger.debug("Applying .bfill() to handle any remaining NaNs at the start...")
features_df[feature_cols_generated] = features_df[feature_cols_generated].bfill()
elif fill_method == 'bfill':
logger.debug("Applying .bfill() to generated features...")
features_df[feature_cols_generated] = features_df[feature_cols_generated].bfill()
# Optionally apply ffill after bfill if you need to fill trailing NaNs (less common)
# features_df[feature_cols_generated] = features_df[feature_cols_generated].ffill()
elif fill_value is not None:
# fillna with Series/dict for column-wise mean, or scalar for constant value
logger.debug(f"Applying .fillna(value={fill_value}) to generated features...")
features_df[feature_cols_generated] = features_df[feature_cols_generated].fillna(value=fill_value)
# No else needed, if fill_method and fill_value are None, no filling happens
else:
logger.warning("`fill_nan` is None. NaNs generated by feature engineering may remain.")
@ -366,36 +375,31 @@ class TimeSeriesCrossValidationSplitter:
# Estimate if None
elif self.initial_train_size is None:
min_samples_per_split_step = 2 # Heuristic minimum samples for val+test in one step
# Estimate val/test based on *potential* train size (crude)
# Assume train is roughly (1 - val - test) fraction for estimation
estimated_train_frac = max(0.1, 1.0 - self.val_frac - self.test_frac) # Ensure non-zero
estimated_train_n = int(self.n_samples * estimated_train_frac)
val_test_size_per_step = max(min_samples_per_split_step, int(estimated_train_n * (self.val_frac + self.test_frac)))
logger.info("Estimating fixed train size based on n_splits, val_frac, test_frac.")
# Estimate based on the total space needed for all splits:
# n_samples >= fixed_train_n + val_size + test_size + (n_splits - 1) * step_size
# n_samples >= fixed_train_n + int(fixed_train_n*val_frac) + n_splits * int(fixed_train_n*test_frac)
# n_samples >= fixed_train_n * (1 + val_frac + n_splits * test_frac)
# fixed_train_n <= n_samples / (1 + val_frac + n_splits * test_frac)
# Tentative initial train size is total minus one val/test block
fixed_train_n_est = self.n_samples - val_test_size_per_step
denominator = 1.0 + self.val_frac + self.n_splits * self.test_frac
if denominator <= 1.0: # Avoid division by zero or non-positive, and ensure train frac < 1
raise ValueError(f"Cannot estimate initial_train_size. Combination of val_frac ({self.val_frac}), "
f"test_frac ({self.test_frac}), and n_splits ({self.n_splits}) is invalid (denominator {denominator:.2f} <= 1.0).")
# Basic sanity checks
if fixed_train_n_est <= 0:
raise ValueError("Could not estimate a valid initial_train_size (<= 0). Please specify it or check CV fractions.")
# Need at least 1 sample for train, val, test each theoretically
est_val_size = max(1, int(fixed_train_n_est * self.val_frac))
est_test_size = max(1, int(fixed_train_n_est * self.test_frac))
if fixed_train_n_est + est_val_size + est_test_size > self.n_samples:
# If the simple estimate is too large, reduce it more drastically
# Try setting train size = 50% and see if val/test fit?
fixed_train_n_est = int(self.n_samples * 0.5)
est_val_size = max(1, int(fixed_train_n_est * self.val_frac))
est_test_size = max(1, int(fixed_train_n_est * self.test_frac))
if fixed_train_n_est <=0 or (fixed_train_n_est + est_val_size + est_test_size > self.n_samples):
raise ValueError("Could not estimate a valid initial_train_size. Data too small relative to val/test fractions? Please specify initial_train_size.")
estimated_size = int(self.n_samples / denominator)
logger.warning(f"initial_train_size not set, estimated fixed train size for rolling window: {fixed_train_n_est}. "
"This is a heuristic; viability depends on n_splits and step size. Validation happens in split().")
return fixed_train_n_est
# Add a sanity check: ensure estimated size is reasonably large
min_required_for_features = 1 # Placeholder - ideally get from FeatureConfig if possible, but complex here
if estimated_size < min_required_for_features:
raise ValueError(f"Estimated fixed train size ({estimated_size}) is too small. "
f"Check CV config (n_splits={self.n_splits}, val_frac={self.val_frac}, test_frac={self.test_frac}) "
f"relative to total samples ({self.n_samples}). Consider specifying initial_train_size manually.")
logger.info(f"Estimated fixed training window size: {estimated_size}")
return estimated_size
else:
raise ValueError(f"Invalid initial_train_size: {self.initial_train_size}")
raise ValueError(f"Invalid initial_train_size type or value: {self.initial_train_size}")
def split(self) -> Generator[Tuple[np.ndarray, np.ndarray, np.ndarray], None, None]:
@ -483,28 +487,31 @@ class TimeSeriesDataset(Dataset):
"""
PyTorch Dataset for time series forecasting.
Takes a NumPy array (features + target), sequence length, and forecast horizon,
and returns (input_sequence, target_sequence) tuples. Compatible with PyTorch
DataLoaders used by PyTorch Lightning.
Takes a NumPy array (features + target), sequence length, and a list of
specific forecast horizons. Returns (input_sequence, target_vector) tuples,
where target_vector contains the target values at the specified future steps.
"""
def __init__(self, data_array: np.ndarray, sequence_length: int, forecast_horizon: int, target_col_index: int = 0):
def __init__(self, data_array: np.ndarray, sequence_length: int, forecast_horizon: List[int], target_col_index: int = 0):
"""
Args:
data_array: Numpy array of shape (n_samples, n_features).
Assumes the target variable is one of the columns.
sequence_length: Length of the input sequence (lookback window).
forecast_horizon: Number of steps ahead to predict.
forecast_horizon: List of specific steps ahead to predict (e.g., [1, 6, 12]).
target_col_index: Index of the target column in data_array. Defaults to 0.
"""
if sequence_length <= 0:
raise ValueError("sequence_length must be positive.")
if forecast_horizon <= 0:
raise ValueError("forecast_horizon must be positive.")
if not forecast_horizon or not isinstance(forecast_horizon, list) or any(h <= 0 for h in forecast_horizon):
raise ValueError("forecast_horizon must be a non-empty list of positive integers.")
if data_array.ndim != 2:
raise ValueError(f"data_array must be 2D, but got shape {data_array.shape}")
min_len_required = sequence_length + forecast_horizon
self.max_horizon = max(forecast_horizon) # Find the furthest point needed
min_len_required = sequence_length + self.max_horizon
if min_len_required > data_array.shape[0]:
raise ValueError(f"sequence_length ({sequence_length}) + forecast_horizon ({forecast_horizon}) = {min_len_required} "
raise ValueError(f"sequence_length ({sequence_length}) + max_horizon ({self.max_horizon}) = {min_len_required} "
f"exceeds total samples provided ({data_array.shape[0]})")
if not (0 <= target_col_index < data_array.shape[1]):
raise ValueError(f"target_col_index ({target_col_index}) out of bounds for data with {data_array.shape[1]} columns.")
@ -512,32 +519,37 @@ class TimeSeriesDataset(Dataset):
self.data = torch.tensor(data_array, dtype=torch.float32)
self.sequence_length = sequence_length
self.forecast_horizon = forecast_horizon
self.forecast_horizon_list = sorted(forecast_horizon)
self.target_col_index = target_col_index
self.n_samples = data_array.shape[0]
self.n_features = data_array.shape[1]
logger.debug(f"TimeSeriesDataset created: data shape={self.data.shape}, "
f"seq_len={self.sequence_length}, forecast_horizon={self.forecast_horizon}, "
f"target_idx={self.target_col_index}")
f"seq_len={self.sequence_length}, forecast_horizons={self.forecast_horizon_list}, "
f"max_horizon={self.max_horizon}, target_idx={self.target_col_index}")
def __len__(self) -> int:
"""Returns the total number of sequences that can be generated."""
return self.n_samples - self.sequence_length - self.forecast_horizon + 1
return self.n_samples - self.sequence_length - self.max_horizon + 1
def __getitem__(self, idx: int) -> Tuple[torch.Tensor, torch.Tensor]:
"""
Returns a single (input_sequence, target_sequence) pair.
Returns a single (input_sequence, target_vector) pair.
Target vector contains values for the specified forecast horizons.
"""
if not (0 <= idx < len(self)):
raise IndexError(f"Index {idx} out of bounds for dataset with length {len(self)}")
input_start = idx
input_end = idx + self.sequence_length
input_sequence = self.data[input_start:input_end, :]
target_start = input_end
target_end = target_start + self.forecast_horizon
target_sequence = self.data[target_start:target_end, self.target_col_index]
return input_sequence, target_sequence
input_sequence = self.data[input_start:input_end, :] # Shape: (seq_len, n_features)
# Calculate indices for each horizon relative to the end of the input sequence
# Horizon h corresponds to index: input_end + h - 1
target_indices = [input_end + h - 1 for h in self.forecast_horizon_list]
target_vector = self.data[target_indices, self.target_col_index] # Shape: (len(forecast_horizon_list),)
return input_sequence, target_vector
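# Minimal usage sketch of the multi-horizon TimeSeriesDataset above, on a toy array with
# the target in column 0: with sequence_length=4 and horizons [1, 3], each item is a
# (4, n_features) input window plus a 2-element target vector for steps t+1 and t+3.
import numpy as np

_toy = np.arange(40, dtype=np.float32).reshape(20, 2)  # (n_samples=20, n_features=2)
_ds = TimeSeriesDataset(_toy, sequence_length=4, forecast_horizon=[1, 3], target_col_index=0)
_x, _y = _ds[0]
assert _x.shape == (4, 2) and _y.shape == (2,)
assert len(_ds) == 20 - 4 - 3 + 1  # == 14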
# --- Data Preparation ---
def prepare_fold_data_and_loaders(
@ -576,6 +588,7 @@ def prepare_fold_data_and_loaders(
feature_config: Configuration for feature engineering.
train_config: Configuration for training (used for batch size, device hints).
eval_config: Configuration for evaluation (used for batch size).
Returns:
Tuple containing:
@ -598,13 +611,25 @@ def prepare_fold_data_and_loaders(
if feature_config.lags:
max_lookback = max(max_lookback, max(feature_config.lags))
if feature_config.rolling_window_sizes:
max_lookback = max(max_lookback, max(feature_config.rolling_window_sizes) -1 )
max_history_needed = max(max_lookback, feature_config.sequence_length)
max_lookback = max(max_lookback, max(feature_config.rolling_window_sizes) -1)
# Also need history for the input sequence length and max target horizon
max_horizon_needed = max(feature_config.forecast_horizon) if feature_config.forecast_horizon else 0
# History needed *before* train_idx[0]: the larger of the feature lookback and the input sequence_length.
# History needed *after* test_idx[-1]: max_horizon steps, so the last test input still has all of its targets.
max_history_needed_before_train = max(max_lookback, feature_config.sequence_length)
slice_start_idx = max(0, train_idx[0] - max_history_needed_before_train)
# The end index needs to cover the test set PLUS the maximum horizon needed for the last test target
slice_end_idx = test_idx[-1] + max_horizon_needed # Go up to the last needed target
# Ensure end index is within bounds
slice_end_idx = min(slice_end_idx + 1, len(full_df)) # +1 because iloc is exclusive
slice_start_idx = max(0, train_idx[0] - max_history_needed)
slice_end_idx = test_idx[-1] + 1
if slice_start_idx >= slice_end_idx:
raise ValueError(f"Calculated slice start ({slice_start_idx}) >= slice end ({slice_end_idx}). Check indices.")
raise ValueError(f"Calculated slice start ({slice_start_idx}) >= slice end ({slice_end_idx}). Check indices and horizon.")
fold_data_slice = full_df.iloc[slice_start_idx:slice_end_idx]
logger.debug(f"Required data slice for fold: indices {slice_start_idx} to {slice_end_idx-1} "
@ -709,22 +734,38 @@ def prepare_fold_data_and_loaders(
input_size = train_data_scaled.shape[1]
# --- Ensure final data arrays are float32 for PyTorch ---
try:
# Explicitly convert to float32 AFTER scaling (or non-scaling)
train_data_final = train_data_scaled.astype(np.float32)
val_data_final = val_data_scaled.astype(np.float32)
test_data_final = test_data_scaled.astype(np.float32)
logger.debug("Ensured final data arrays are float32.")
except ValueError as e:
# This might happen if data cannot be safely cast (e.g., strings remain unexpectedly)
logger.error(f"Failed to convert data arrays to float32 before creating Tensors: {e}", exc_info=True)
# Consider adding more debug info here if it fails, e.g.:
# logger.debug(f"Data types in train_df before conversion: \n{train_df.dtypes}")
raise ValueError("Data could not be converted to numeric type (float32) for PyTorch.") from e
# 6. Dataset Instantiation
logger.debug("Creating TimeSeriesDataset instances for the fold.")
try:
# Use the explicitly converted arrays
train_dataset = TimeSeriesDataset(
train_data_scaled, feature_config.sequence_length, feature_config.forecast_horizon, target_col_index=target_col_index_in_features
train_data_final, feature_config.sequence_length, feature_config.forecast_horizon, target_col_index=target_col_index_in_features
)
val_dataset = TimeSeriesDataset(
val_data_scaled, feature_config.sequence_length, feature_config.forecast_horizon, target_col_index=target_col_index_in_features
val_data_final, feature_config.sequence_length, feature_config.forecast_horizon, target_col_index=target_col_index_in_features
)
test_dataset = TimeSeriesDataset(
test_data_scaled, feature_config.sequence_length, feature_config.forecast_horizon, target_col_index=target_col_index_in_features
test_data_final, feature_config.sequence_length, feature_config.forecast_horizon, target_col_index=target_col_index_in_features
)
except ValueError as e:
logger.error(f"Error creating TimeSeriesDataset: {e}")
logger.error(f"Shapes fed to Dataset: Train={train_data_scaled.shape}, Val={val_data_scaled.shape}, Test={test_data_scaled.shape}")
logger.error(f"SeqLen={feature_config.sequence_length}, Horizon={feature_config.forecast_horizon}")
logger.error(f"Shapes fed to Dataset: Train={train_data_final.shape}, Val={val_data_final.shape}, Test={test_data_final.shape}")
logger.error(f"SeqLen={feature_config.sequence_length}, Horizons={feature_config.forecast_horizon}")
raise
@ -748,4 +789,69 @@ def prepare_fold_data_and_loaders(
logger.info("Data loaders prepared successfully for the fold.")
return train_loader, val_loader, test_loader, target_scaler, input_size
return train_loader, val_loader, test_loader, target_scaler, input_size
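# Quick sketch of how the resulting datasets feed a DataLoader (the batch size here is a
# placeholder; in the pipeline it comes from train_config / eval_config).
import numpy as np
from torch.utils.data import DataLoader

_toy = np.random.rand(64, 3).astype(np.float32)
_ds = TimeSeriesDataset(_toy, sequence_length=8, forecast_horizon=[1, 4], target_col_index=0)
_loader = DataLoader(_ds, batch_size=16, shuffle=False)
_xb, _yb = next(iter(_loader))
assert _xb.shape == (16, 8, 3) and _yb.shape == (16, 2)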
# --- Classic Train/Val/Test Split ---
def split_data_classic(
n_samples: int,
val_frac: float,
test_frac: float,
start_from_end: bool = True
) -> Tuple[np.ndarray, np.ndarray, np.ndarray]:
"""
Splits data indices into one train, one validation, and one test set based on fractions.
Args:
n_samples: Total number of samples in the dataset.
val_frac: Fraction of the *total* data to use for validation.
test_frac: Fraction of the *total* data to use for testing.
start_from_end: If True (default), test and validation sets are taken from the end
of the series. If False, they are taken after the initial training block.
Default is True for typical time series evaluation.
Returns:
Tuple of (train_indices, val_indices, test_indices).
Raises:
ValueError: If fractions are invalid or sum to >= 1.
"""
if not (0 < val_frac < 1):
raise ValueError(f"val_frac must be between 0 and 1, got {val_frac}")
if not (0 < test_frac < 1):
raise ValueError(f"test_frac must be between 0 and 1, got {test_frac}")
if val_frac + test_frac >= 1:
raise ValueError(f"Sum of val_frac ({val_frac}) and test_frac ({test_frac}) must be less than 1.")
test_size = math.ceil(n_samples * test_frac) # Use ceil to ensure at least one sample if frac is tiny
val_size = math.ceil(n_samples * val_frac)
train_size = n_samples - val_size - test_size
if train_size <= 0:
raise ValueError(f"Calculated train_size ({train_size}) is not positive. Adjust fractions or increase data.")
if val_size <= 0:
raise ValueError("Calculated val_size is not positive.")
if test_size <= 0:
raise ValueError("Calculated test_size is not positive.")
indices = np.arange(n_samples)
if start_from_end:
train_indices = indices[:train_size]
val_indices = indices[train_size:train_size + val_size]
test_indices = indices[train_size + val_size:]
# Adjust if ceil caused slight overallocation in test
test_indices = test_indices[:test_size]
else:
# Less common: place val/test directly after train
train_indices = indices[:train_size]
val_indices = indices[train_size:train_size + val_size]
test_indices = indices[train_size + val_size:train_size + val_size + test_size]
# Remaining data is unused in this scenario
logger.info(f"Classic split: Train indices {train_indices[0]}-{train_indices[-1]} (size {len(train_indices)}), "
f"Val indices {val_indices[0]}-{val_indices[-1]} (size {len(val_indices)}), "
f"Test indices {test_indices[0]}-{test_indices[-1]} (size {len(test_indices)})")
return train_indices, val_indices, test_indices
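# Usage sketch for split_data_classic, assuming 1000 samples with 25% validation and
# 25% test taken from the end of the series (the default behaviour).
_train_idx, _val_idx, _test_idx = split_data_classic(n_samples=1000, val_frac=0.25, test_frac=0.25)
assert (len(_train_idx), len(_val_idx), len(_test_idx)) == (500, 250, 250)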

View File

@ -1,24 +1,22 @@
import logging
import os
from pathlib import Path # Added
import numpy as np
import torch
import torchmetrics
from torch.utils.data import DataLoader
from sklearn.preprocessing import StandardScaler, MinMaxScaler # For type hinting target_scaler
from typing import Dict, Any, Optional, Union, List, Tuple
# import matplotlib.pyplot as plt # No longer needed directly
# import seaborn as sns # No longer needed directly
from typing import Dict, Optional, Union, List
import pandas as pd # For time index type hint
# Assuming config_model and io.plotting are accessible
from forecasting_model.utils.config_model import EvaluationConfig
from forecasting_model.io.plotting import ( # Import the plotting utilities
from forecasting_model.utils.forecast_config_model import EvaluationConfig
from forecasting_model.train.model import LSTMForecastLightningModule
from forecasting_model.io.plotting import (
setup_plot_style,
save_plot,
create_time_series_plot,
create_scatter_plot,
create_residuals_plot,
create_residuals_distribution_plot
create_residuals_distribution_plot,
)
@ -82,90 +80,101 @@ def calculate_rmse_np(y_true: np.ndarray, y_pred: np.ndarray) -> float:
return float(rmse)
# --- Plotting Functions (Utilities) ---
# REMOVED - These are now imported from io.plotting
# --- Fold Evaluation Function ---
def evaluate_fold_predictions(
y_true_scaled: np.ndarray,
y_pred_scaled: np.ndarray,
y_true_scaled: np.ndarray, # Shape: (n_samples, len(horizons))
y_pred_scaled: np.ndarray, # Shape: (n_samples, len(horizons))
target_scaler: Union[StandardScaler, MinMaxScaler, None],
eval_config: EvaluationConfig,
fold_num: int,
output_dir: str, # Base output directory (e.g., output/cv_results)
time_index: Optional[np.ndarray] = None # Optional: Pass time index for x-axis
fold_num: int, # Zero-based fold index
output_dir: str, # Base output directory
plot_subdir: Optional[str] = "plots",
# time_index: Optional[Union[np.ndarray, pd.Index]] = None, # OLD: Index for samples
prediction_time_index: Optional[pd.Index] = None, # Index corresponding to the prediction times (n_samples,)
forecast_horizons: Optional[List[int]] = None, # The list of horizons predicted (e.g., [1, 6, 12])
plot_title_prefix: Optional[str] = None
) -> Dict[str, float]:
"""
Processes prediction results for a fold's test set using torchmetrics.
Processes prediction results (multiple horizons) for a fold or ensemble.
Takes scaled predictions and targets, inverse transforms them,
calculates final metrics (MAE, RMSE) using torchmetrics.functional,
and generates evaluation plots using utilities from io.plotting. Assumes
model inference is already done.
Takes scaled predictions and targets (shape: samples, num_horizons),
inverse transforms them, calculates overall metrics (MAE, RMSE) across all horizons,
and generates evaluation plots *for the first specified horizon only*.
Args:
y_true_scaled: Numpy array of scaled ground truth targets (n_samples, horizon).
y_pred_scaled: Numpy array of scaled model predictions (n_samples, horizon).
target_scaler: The scaler fitted on the target variable during training. Needed
for inverse transforming to original scale. Can be None.
eval_config: Configuration object for evaluation parameters (e.g., plotting).
fold_num: The current fold number (e.g., 0, 1, ...).
output_dir: The base directory to save fold-specific outputs (plots, metrics).
time_index: Optional array representing the time index for the test set,
used for x-axis in time-based plots. If None, uses integer indices.
y_true_scaled: Numpy array of scaled ground truth targets (n_samples, len(horizons)).
y_pred_scaled: Numpy array of scaled model predictions (n_samples, len(horizons)).
target_scaler: The scaler fitted on the target variable.
eval_config: Configuration object for evaluation parameters.
fold_num: The current fold number (zero-based or -1 for classic).
output_dir: The base directory to save outputs.
plot_subdir: Specific subdirectory under output_dir for plots.
prediction_time_index: Pandas Index representing the time for each prediction point (n_samples,).
Required for meaningful time plots.
forecast_horizons: List of horizons predicted (e.g., [1, 6, 12]). Required for plotting.
plot_title_prefix: Optional string to prepend to plot titles.
Returns:
Dictionary containing evaluation metrics {'MAE': value, 'RMSE': value} on the
original scale. Metrics will be NaN if inverse transform or calculation fails.
Raises:
ValueError: If input shapes are inconsistent or required scaler is missing.
original scale, calculated *across all predicted horizons*.
"""
logger.info(f"Processing evaluation results for Fold {fold_num + 1}...")
fold_id = fold_num + 1 # Use 1-based indexing for reporting/filenames
fold_id_str = f"Fold {fold_num + 1}" if fold_num >= 0 else "Classic Run"
eval_context_str = f"{plot_title_prefix} {fold_id_str}" if plot_title_prefix else fold_id_str
logger.info(f"Processing evaluation results for: {eval_context_str}")
if y_true_scaled.shape != y_pred_scaled.shape:
raise ValueError(f"Shape mismatch between targets and predictions: "
raise ValueError(f"Shape mismatch between targets and predictions for {eval_context_str}: "
f"{y_true_scaled.shape} vs {y_pred_scaled.shape}")
if y_true_scaled.ndim != 2:
raise ValueError(f"Expected 2D arrays for targets and predictions, got {y_true_scaled.ndim}D")
raise ValueError(f"Expected 2D arrays (samples, num_horizons) for {eval_context_str}, got {y_true_scaled.ndim}D")
n_samples, horizon = y_true_scaled.shape
logger.debug(f"Processing {n_samples} samples with horizon {horizon}.")
n_samples, n_horizons = y_true_scaled.shape
logger.debug(f"Processing {n_samples} samples across {n_horizons} horizons for {eval_context_str}.")
# --- Inverse Transform (Outputs NumPy) ---
y_true_flat_scaled = y_true_scaled.reshape(-1, 1)
y_pred_flat_scaled = y_pred_scaled.reshape(-1, 1)
# Flatten the multi-horizon arrays for the scaler (which expects (N, 1))
y_true_flat_scaled = y_true_scaled.reshape(-1, 1) # Shape: (n_samples * n_horizons, 1)
y_pred_flat_scaled = y_pred_scaled.reshape(-1, 1) # Shape: (n_samples * n_horizons, 1)
y_true_inv_np: np.ndarray
y_pred_inv_np: np.ndarray
if target_scaler is not None:
try:
logger.debug("Inverse transforming predictions and targets.")
y_true_inv_np = target_scaler.inverse_transform(y_true_flat_scaled)
y_pred_inv_np = target_scaler.inverse_transform(y_pred_flat_scaled)
# Flatten NumPy arrays for metric calculation and plotting
y_true_np = y_true_inv_np.flatten()
y_pred_np = y_pred_inv_np.flatten()
logger.debug(f"Inverse transforming predictions and targets for {eval_context_str}.")
y_true_inv_flat = target_scaler.inverse_transform(y_true_flat_scaled)
y_pred_inv_flat = target_scaler.inverse_transform(y_pred_flat_scaled)
# Reshape back to (n_samples, n_horizons) for potential per-horizon analysis later
y_true_inv_np = y_true_inv_flat.reshape(n_samples, n_horizons)
y_pred_inv_np = y_pred_inv_flat.reshape(n_samples, n_horizons)
except Exception as e:
logger.error(f"Error during inverse scaling for Fold {fold_id}: {e}", exc_info=True)
logger.error(f"Error during inverse scaling for {eval_context_str}: {e}", exc_info=True)
logger.error("Metrics calculation will be skipped due to inverse transform failure.")
return {'MAE': np.nan, 'RMSE': np.nan}
else:
logger.info("No target scaler provided, assuming inputs are already on original scale.")
# Flatten NumPy arrays for metric calculation and plotting
y_true_np = y_true_flat_scaled.flatten()
y_pred_np = y_pred_flat_scaled.flatten()
# --- Calculate Metrics using torchmetrics.functional ---
metrics: Dict[str, float] = {'MAE': np.nan, 'RMSE': np.nan} # Initialize with NaN
else:
logger.info(f"No target scaler provided for {eval_context_str}, assuming inputs are on original scale.")
y_true_inv_np = y_true_scaled # Keep original shape (n_samples, n_horizons)
y_pred_inv_np = y_pred_scaled # Keep original shape
# --- Calculate Metrics using torchmetrics.functional (Overall across all horizons) ---
metrics: Dict[str, float] = {'MAE': np.nan, 'RMSE': np.nan}
try:
if len(y_true_np) > 0: # Check if data exists after potential failures
y_true_tensor = torch.from_numpy(y_true_np).float().cpu()
y_pred_tensor = torch.from_numpy(y_pred_np).float().cpu()
# Flatten arrays for overall metrics calculation
y_true_flat_for_metrics = y_true_inv_np.flatten()
y_pred_flat_for_metrics = y_pred_inv_np.flatten()
valid_mask = ~np.isnan(y_true_flat_for_metrics) & ~np.isnan(y_pred_flat_for_metrics)
if np.sum(valid_mask) < len(y_true_flat_for_metrics):
nan_count = len(y_true_flat_for_metrics) - np.sum(valid_mask)
logger.warning(f"{nan_count} NaN values found in predictions/targets (across all horizons) for {eval_context_str}. These will be excluded from metrics.")
if np.sum(valid_mask) > 0:
y_true_tensor = torch.from_numpy(y_true_flat_for_metrics[valid_mask]).float().cpu()
y_pred_tensor = torch.from_numpy(y_pred_flat_for_metrics[valid_mask]).float().cpu()
mae_tensor = torchmetrics.functional.mean_absolute_error(y_pred_tensor, y_true_tensor)
mse_tensor = torchmetrics.functional.mean_squared_error(y_pred_tensor, y_true_tensor)
@ -174,82 +183,95 @@ def evaluate_fold_predictions(
metrics['MAE'] = mae_tensor.item()
metrics['RMSE'] = rmse_tensor.item()
logger.info(f"Fold {fold_id} Test Set Metrics (torchmetrics): MAE={metrics['MAE']:.4f}, RMSE={metrics['RMSE']:.4f}")
logger.info(f"{eval_context_str} Test Set Overall Metrics (torchmetrics): MAE={metrics['MAE']:.4f}, RMSE={metrics['RMSE']:.4f} (across all horizons)")
else:
logger.warning(f"Skipping metric calculation for Fold {fold_id} due to empty data after inverse transform.")
logger.warning(f"Skipping metric calculation for {eval_context_str} due to no valid (non-NaN) data points.")
except Exception as e:
logger.error(f"Failed to calculate metrics using torchmetrics for Fold {fold_id}: {e}", exc_info=True)
# metrics already initialized to NaN
logger.error(f"Failed to calculate overall metrics using torchmetrics for {eval_context_str}: {e}", exc_info=True)
# --- Generate Plots (Optional - uses plotting utilities) ---
if eval_config.save_plots and len(y_true_np) > 0:
logger.info(f"Generating evaluation plots for Fold {fold_id}...")
# Define plot directory and setup style
fold_plot_dir = Path(output_dir) / f"fold_{fold_id:02d}" / "plots"
setup_plot_style() # Apply consistent styling
# --- Generate Plots (Optional - Focus on FIRST horizon) ---
if eval_config.save_plots and np.sum(valid_mask) > 0:
if not forecast_horizons:
logger.warning(f"Skipping plot generation for {eval_context_str}: `forecast_horizons` list not provided.")
elif prediction_time_index is None or len(prediction_time_index) != n_samples:
logger.warning(f"Skipping plot generation for {eval_context_str}: `prediction_time_index` is missing or has incorrect length ({len(prediction_time_index) if prediction_time_index is not None else 'None'} != {n_samples}).")
else:
logger.info(f"Generating evaluation plots for {eval_context_str} (using first horizon H+{forecast_horizons[0]} only)...")
base_plot_dir = Path(output_dir)
fold_plot_dir = base_plot_dir / plot_subdir if plot_subdir else base_plot_dir
setup_plot_style()
title_suffix = f"Fold {fold_id} Test Set"
residuals_np = y_true_np - y_pred_np
# --- Plotting for the FIRST horizon ---
first_horizon = forecast_horizons[0]
y_true_h1 = y_true_inv_np[:, 0] # Data for the first horizon
y_pred_h1 = y_pred_inv_np[:, 0] # Data for the first horizon
residuals_h1 = y_true_h1 - y_pred_h1
# Determine x-axis: use provided time_index if available, else integer indices
# Note: Flattened y_true/y_pred have length n_samples * horizon
# Need an appropriate index for this flattened view if time_index is provided.
# Simple approach: use integer indices for flattened data.
plot_indices = np.arange(len(y_true_np))
xlabel = "Time Index (Flattened Horizon x Samples)"
# If time_index corresponding to the start of each forecast is passed,
# more sophisticated x-axis handling could be done, but integer indices are simpler.
# Calculate the actual time index for the first horizon's targets
# Requires the original dataset's frequency if available, otherwise assumes simple offset
target_time_index_h1 = prediction_time_index
try:
# Assuming prediction_time_index corresponds to the *time* of prediction
# The target for H+h occurs `h` steps later.
# This requires a DatetimeIndex with a frequency.
if isinstance(prediction_time_index, pd.DatetimeIndex) and prediction_time_index.freq:
# Shift by `first_horizon` steps of the index's own frequency (robust to multi-unit freqs like '15min')
target_time_index_h1 = prediction_time_index + first_horizon * prediction_time_index.freq
xlabel_h1 = f"Time (Target H+{first_horizon})"
else:
logger.warning(f"Prediction time index lacks frequency info. Using original prediction time for H+{first_horizon} plot x-axis.")
xlabel_h1 = f"Prediction Time (Plotting H+{first_horizon})"
except Exception as time_err:
logger.warning(f"Could not calculate target time index for H+{first_horizon}: {time_err}. Using prediction time index for x-axis.")
xlabel_h1 = f"Prediction Time (Plotting H+{first_horizon})"
try:
# Create and save each plot using utility functions
fig_ts = create_time_series_plot(
plot_indices, y_true_np, y_pred_np,
f"Predictions vs Actual - {title_suffix}",
xlabel=xlabel,
ylabel="Value (Original Scale)",
max_points=eval_config.plot_sample_size
)
save_plot(fig_ts, fold_plot_dir / "predictions_vs_actual.png")
title_suffix = f"- {eval_context_str} (H+{first_horizon})"
fig_scatter = create_scatter_plot(
y_true_np, y_pred_np,
f"Scatter Plot - {title_suffix}",
xlabel="Actual Values (Original Scale)",
ylabel="Predicted Values (Original Scale)"
)
save_plot(fig_scatter, fold_plot_dir / "scatter_predictions.png")
try:
fig_ts = create_time_series_plot(
target_time_index_h1, y_true_h1, y_pred_h1, # Use H1 data and time
f"Predictions vs Actual {title_suffix}",
xlabel=xlabel_h1, ylabel="Value (Original Scale)",
max_points=eval_config.plot_sample_size
)
save_plot(fig_ts, fold_plot_dir / f"predictions_vs_actual_h{first_horizon}.png")
fig_res_time = create_residuals_plot(
plot_indices, residuals_np,
f"Residuals Over Time - {title_suffix}",
xlabel=xlabel,
ylabel="Residual (Original Scale)",
max_points=eval_config.plot_sample_size
)
save_plot(fig_res_time, fold_plot_dir / "residuals_time.png")
fig_scatter = create_scatter_plot(
y_true_h1, y_pred_h1, # Use H1 data
f"Scatter Plot {title_suffix}",
xlabel="Actual Values (Original Scale)", ylabel="Predicted Values (Original Scale)"
)
save_plot(fig_scatter, fold_plot_dir / f"scatter_predictions_h{first_horizon}.png")
fig_res_dist = create_residuals_distribution_plot(
residuals_np,
f"Residuals Distribution - {title_suffix}",
xlabel="Residual Value (Original Scale)",
ylabel="Density"
)
save_plot(fig_res_dist, fold_plot_dir / "residuals_distribution.png")
fig_res_time = create_residuals_plot(
target_time_index_h1, residuals_h1, # Use H1 residuals and time
f"Residuals Over Time {title_suffix}",
xlabel=xlabel_h1, ylabel="Residual (Original Scale)",
max_points=eval_config.plot_sample_size
)
save_plot(fig_res_time, fold_plot_dir / f"residuals_time_h{first_horizon}.png")
logger.info(f"Evaluation plots saved to: {fold_plot_dir}")
# Residual distribution can use residuals from ALL horizons
residuals_all = y_true_inv_np.flatten() - y_pred_inv_np.flatten()
fig_res_dist = create_residuals_distribution_plot(
residuals_all, # Use all residuals
f"Residuals Distribution {eval_context_str} (All Horizons)", # Adjusted title
xlabel="Residual Value (Original Scale)", ylabel="Density"
)
save_plot(fig_res_dist, fold_plot_dir / "residuals_distribution_all_horizons.png")
except Exception as e:
logger.error(f"Failed to generate or save one or more plots for Fold {fold_id}: {e}", exc_info=True)
# Continue without plots, metrics are already calculated.
logger.info(f"Evaluation plots saved to: {fold_plot_dir}")
elif eval_config.save_plots and len(y_true_np) == 0:
logger.warning(f"Skipping plot generation for Fold {fold_id} due to empty data.")
except Exception as e:
logger.error(f"Failed to generate or save one or more plots for {eval_context_str}: {e}", exc_info=True)
elif eval_config.save_plots and np.sum(valid_mask) == 0:
logger.warning(f"Skipping plot generation for {eval_context_str} due to no valid data points.")
logger.info(f"Evaluation processing finished for Fold {fold_id}.")
logger.info(f"Evaluation processing finished for {eval_context_str}.")
return metrics
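# Self-contained sketch of the overall-metric step above: inverse-transformed arrays of
# shape (n_samples, n_horizons) are flattened, NaNs masked out, and MAE/RMSE computed
# with torchmetrics.functional (all values below are synthetic).
import numpy as np
import torch
import torchmetrics

_y_true = np.array([[1.0, 2.0], [3.0, np.nan]])
_y_pred = np.array([[1.1, 1.9], [2.7, 4.0]])
_t, _p = _y_true.flatten(), _y_pred.flatten()
_mask = ~np.isnan(_t) & ~np.isnan(_p)
_t_t = torch.from_numpy(_t[_mask]).float()
_p_t = torch.from_numpy(_p[_mask]).float()
_mae = torchmetrics.functional.mean_absolute_error(_p_t, _t_t)
_rmse = torch.sqrt(torchmetrics.functional.mean_squared_error(_p_t, _t_t))
print(f"MAE={_mae.item():.3f}, RMSE={_rmse.item():.3f}")  # MAE=0.167, RMSE=0.191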
@ -257,63 +279,90 @@ def evaluate_fold_predictions(
# This function still calls evaluate_fold_predictions internally, so it benefits
# from the updated plotting logic without needing direct changes here.
def evaluate_model_on_fold_test_set(
model: torch.nn.Module,
model: LSTMForecastLightningModule, # Use the specific type
test_loader: DataLoader,
device: torch.device,
target_scaler: Union[StandardScaler, MinMaxScaler, None],
eval_config: EvaluationConfig,
fold_num: int,
output_dir: str
output_dir: str,
# time_index: Optional[Union[np.ndarray, pd.Index]] = None, # OLD
prediction_time_index: Optional[pd.Index] = None, # Pass prediction time index
forecast_horizons: Optional[List[int]] = None # Pass horizons
) -> Dict[str, float]:
"""
[Optional Function] Evaluates a given model on a fold's test set.
Runs the inference loop, collects scaled results, then processes them using
`evaluate_fold_predictions` (which now uses plotting utilities).
Useful for standalone testing or if not using pl.Trainer.test().
Handles multiple forecast horizons.
"""
# ... (Implementation of inference loop remains the same) ...
logger.info(f"Starting full evaluation (inference + processing) for Fold {fold_num + 1}...")
model.eval()
model.to(device)
all_preds_scaled_list: List[torch.Tensor] = []
all_targets_scaled_list: List[torch.Tensor] = []
with torch.no_grad():
for i, (X_batch, y_batch) in enumerate(test_loader):
for i, batch in enumerate(test_loader):
try:
X_batch = X_batch.to(device)
outputs = model(X_batch) # Scaled outputs
if isinstance(batch, (list, tuple)) and len(batch) == 2:
X_batch, y_batch = batch # y_batch shape: (batch, len(horizons))
targets_present = True
else:
X_batch = batch
y_batch = None
targets_present = False
# Ensure outputs match target shape (e.g., handle trailing dimension)
if outputs.shape != y_batch.shape:
if outputs.ndim == y_batch.ndim + 1 and outputs.shape[-1] == 1:
outputs = outputs.squeeze(-1)
if outputs.shape != y_batch.shape:
raise ValueError(f"Shape mismatch: Output {outputs.shape}, Target {y_batch.shape}")
X_batch = X_batch.to(device)
outputs = model(X_batch) # Scaled outputs: (batch, len(horizons))
all_preds_scaled_list.append(outputs.cpu())
all_targets_scaled_list.append(y_batch.cpu()) # Keep targets on CPU
if targets_present and y_batch is not None:
if outputs.shape != y_batch.shape:
raise ValueError(f"Shape mismatch: Output {outputs.shape}, Target {y_batch.shape}")
all_targets_scaled_list.append(y_batch.cpu())
# ... error/warning if targets expected but not found ...
except Exception as e:
logger.error(f"Error during inference batch {i} for Fold {fold_num+1}: {e}", exc_info=True)
raise ValueError(f"Inference failed on batch {i} for Fold {fold_num+1}")
logger.error(f"Error during inference batch {i} for Fold {fold_num+1}: {e}", exc_info=True)
raise ValueError(f"Inference failed on batch {i} for Fold {fold_num+1}")
# Concatenate results from all batches
# --- Concatenate results ---
try:
if not all_preds_scaled_list or not all_targets_scaled_list:
logger.error(f"No prediction results collected for Fold {fold_num + 1}. Check test_loader.")
if not all_preds_scaled_list:
# ... handle no predictions ...
return {'MAE': np.nan, 'RMSE': np.nan}
# Resulting shapes: (n_samples, len(horizons))
y_pred_scaled = torch.cat(all_preds_scaled_list, dim=0).numpy()
y_true_scaled = None
if all_targets_scaled_list:
y_true_scaled = torch.cat(all_targets_scaled_list, dim=0).numpy()
elif targets_present:
# ... handle missing targets ...
return {'MAE': np.nan, 'RMSE': np.nan}
else:
# ... handle no targets available ...
return {'MAE': np.nan, 'RMSE': np.nan}
y_pred_scaled = torch.cat(all_preds_scaled_list, dim=0).numpy()
y_true_scaled = torch.cat(all_targets_scaled_list, dim=0).numpy()
except Exception as e:
logger.error(f"Error concatenating prediction results for Fold {fold_num + 1}: {e}", exc_info=True)
# ... error handling ...
raise ValueError("Failed to combine batch results during evaluation inference.")
# Process the collected predictions using the refactored function
# No time_index passed here by default, plotting will use integer indices
if y_true_scaled is None:
# ... handle missing targets ...
return {'MAE': np.nan, 'RMSE': np.nan}
# Ensure forecast_horizons are passed if available from the model
# Retrieve from model's hparams if not passed explicitly
if forecast_horizons is None:
try:
# Assuming forecast_horizon list is stored in model_config hparam
forecast_horizons = model.hparams.model_config.forecast_horizon
except AttributeError:
logger.warning("Could not retrieve forecast_horizons from model hparams for evaluation.")
# Process the collected predictions
return evaluate_fold_predictions(
y_true_scaled=y_true_scaled,
y_pred_scaled=y_pred_scaled,
@ -321,5 +370,8 @@ def evaluate_model_on_fold_test_set(
eval_config=eval_config,
fold_num=fold_num,
output_dir=output_dir,
time_index=None # Explicitly pass None
# time_index=time_index # OLD
prediction_time_index=prediction_time_index, # Pass through
forecast_horizons=forecast_horizons, # Pass through
plot_title_prefix=f"Test Fold {fold_num + 1}" # Example prefix
)

View File

@ -1,11 +1,15 @@
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from typing import Optional, Union
from typing import Optional, Union, List
import logging
import pandas as pd
from pathlib import Path
# Assuming sklearn scalers are available
from sklearn.preprocessing import StandardScaler, MinMaxScaler
logger = logging.getLogger(__name__)
def setup_plot_style(use_seaborn: bool = True) -> None:
@ -17,14 +21,16 @@ def setup_plot_style(use_seaborn: bool = True) -> None:
"""
if use_seaborn:
try:
sns.set_theme(style="whitegrid", palette="muted")
plt.rcParams['figure.figsize'] = (12, 6) # Default figure size
# Use a different style that might be better for multiple lines
sns.set_theme(style="whitegrid", palette="viridis") # Changed palette
plt.rcParams['figure.figsize'] = (15, 7) # Slightly larger default figure size
logger.debug("Seaborn plot style set.")
except Exception as e:
logger.warning(f"Failed to set seaborn theme: {e}. Using default matplotlib style.")
else:
# Optional: Define a default matplotlib style if seaborn is not used
plt.style.use('default')
plt.rcParams['figure.figsize'] = (15, 7)
logger.debug("Using default matplotlib plot style.")
def save_plot(fig: plt.Figure, filename: Union[str, Path]) -> None:
@ -49,16 +55,21 @@ def save_plot(fig: plt.Figure, filename: Union[str, Path]) -> None:
logger.info(f"Plot saved successfully to: {filepath}")
except OSError as e:
logger.error(f"Failed to create directory for plot {filepath}: {e}", exc_info=True)
raise # Re-raise OSError for directory creation issues
# Don't re-raise immediately, try closing figure first
# raise # Re-raise OSError for directory creation issues - Removed to ensure finally runs
except Exception as e:
logger.error(f"Failed to save plot to {filepath}: {e}", exc_info=True)
raise # Re-raise other saving errors
# Don't re-raise immediately, try closing figure first
finally:
# Close the figure to free up memory, regardless of saving success
plt.close(fig)
# Close the figure to free up memory, regardless of saving success or failure
try:
plt.close(fig)
logger.debug(f"Closed figure for plot {filepath}.")
except Exception as e:
logger.warning(f"Failed to close figure for plot {filepath}: {e}")
def create_time_series_plot(
x: np.ndarray,
x: Union[np.ndarray, pd.Index], # Allow pd.Index for time axis
y_true: np.ndarray,
y_pred: np.ndarray,
title: str,
@ -68,9 +79,9 @@ def create_time_series_plot(
) -> plt.Figure:
"""
Create a time series plot comparing actual vs predicted values.
NOTE: When using multi-horizon forecasts, this typically plots only ONE selected horizon.
Args:
x: The array for the x-axis (e.g., time steps, indices).
x: The array or index for the x-axis (e.g., time steps, datetime index). Should align with y_true/y_pred.
y_true: Ground truth values (1D array).
y_pred: Predicted values (1D array).
title: Title for the plot.
@ -84,8 +95,9 @@ def create_time_series_plot(
Raises:
ValueError: If input array shapes are incompatible.
"""
if not (x.shape == y_true.shape == y_pred.shape and x.ndim == 1):
raise ValueError("Input arrays (x, y_true, y_pred) must be 1D and have the same shape.")
# Add check for pd.Index for x
if not isinstance(x, (np.ndarray, pd.Index)) or x.shape[0] != y_true.shape[0] or x.shape[0] != y_pred.shape[0] or y_true.ndim != 1 or y_pred.ndim != 1:
raise ValueError(f"Input shapes mismatch or invalid types: x({type(x)}, {x.shape if hasattr(x, 'shape') else 'N/A'}), y_true({y_true.shape}), y_pred({y_pred.shape}). Expecting 1D y arrays and matching length x.")
if len(x) == 0:
logger.warning("Attempting to create time series plot with empty data.")
# Return an empty figure rather than raising an error.
@ -304,4 +316,243 @@ def create_residuals_distribution_plot(
ax.grid(True, axis='y', linestyle='--', alpha=0.6)
fig.tight_layout()
return fig
return fig
def create_multi_horizon_time_series_plot(
y_true_scaled_all_horizons: np.ndarray, # (N, H)
y_pred_scaled_all_horizons: np.ndarray, # (N, H)
target_scaler: Optional[Union[StandardScaler, MinMaxScaler]],
prediction_time_index_h1: pd.DatetimeIndex, # Time index for the first horizon predictions
forecast_horizons: List[int],
title: str,
xlabel: str = "Time",
ylabel: str = "Value (Original Scale)",
max_points: Optional[int] = 1000 # Limit points for clarity
) -> plt.Figure:
"""
Create a time series plot comparing actual values to predictions for multiple horizons.
Predictions for each horizon are plotted on their corresponding target time step.
Args:
y_true_scaled_all_horizons: Ground truth values (N, H array) on scaled scale.
y_pred_scaled_all_horizons: Predicted values (N, H array) on scaled scale.
target_scaler: The scaler used for the target variable, needed for inverse transform.
prediction_time_index_h1: DatetimeIndex for the first horizon (h=h1) predictions.
Length should be N.
forecast_horizons: List of forecast horizons (e.g., [1, 6, 12, 24]).
title: Title for the plot.
xlabel: Label for the x-axis.
ylabel: Label for the y-axis.
max_points: Maximum number of points to display (subsamples if needed).
Returns:
The generated matplotlib Figure object.
Raises:
ValueError: If input shapes are incompatible or horizons list is invalid.
"""
if y_true_scaled_all_horizons.shape != y_pred_scaled_all_horizons.shape:
raise ValueError(f"Shapes of y_true_scaled_all_horizons {y_true_scaled_all_horizons.shape} and y_pred_scaled_all_horizons {y_pred_scaled_all_horizons.shape} must match.")
if y_true_scaled_all_horizons.ndim != 2 or y_true_scaled_all_horizons.shape[1] != len(forecast_horizons):
raise ValueError(f"y arrays must be 2D (N, H) where H is the number of horizons ({len(forecast_horizons)}). Shape is {y_true_scaled_all_horizons.shape}.")
if len(prediction_time_index_h1) != y_true_scaled_all_horizons.shape[0]:
raise ValueError(f"Length of prediction_time_index_h1 ({len(prediction_time_index_h1)}) must match the number of predictions ({y_true_scaled_all_horizons.shape[0]}).")
if not isinstance(prediction_time_index_h1, pd.DatetimeIndex):
logger.warning("prediction_time_index_h1 is not a DatetimeIndex. Time shifts may not work as expected.")
if not forecast_horizons:
raise ValueError("forecast_horizons list cannot be empty.")
logger.debug(f"Creating multi-horizon time series plot: {title}")
setup_plot_style() # Apply standard style
fig, ax = plt.subplots(figsize=(18, 8)) # Larger figure for multi-horizon
n_points = y_true_scaled_all_horizons.shape[0]
plot_indices = np.arange(n_points)
if max_points and n_points > max_points:
step = max(1, n_points // max_points)
plot_indices = plot_indices[::step]
# Subsample the data and index
y_true_scaled_plot = y_true_scaled_all_horizons[plot_indices]
y_pred_scaled_plot = y_pred_scaled_all_horizons[plot_indices]
time_index_h1_plot = prediction_time_index_h1[plot_indices]
effective_title = f'{title} (Sampled {len(plot_indices)} points)'
else:
y_true_scaled_plot = y_true_scaled_all_horizons
y_pred_scaled_plot = y_pred_scaled_all_horizons
time_index_h1_plot = prediction_time_index_h1
effective_title = title
# Inverse transform the subsampled data
y_true_inv_plot = None
y_pred_inv_plot = None
if target_scaler is not None:
try:
# Scaler expects (N * H, 1), reshape (N, H) to (N*H, 1)
y_true_inv_plot_flat = target_scaler.inverse_transform(y_true_scaled_plot.reshape(-1, 1))
y_pred_inv_plot_flat = target_scaler.inverse_transform(y_pred_scaled_plot.reshape(-1, 1))
# Reshape back to (N, H)
y_true_inv_plot = y_true_inv_plot_flat.reshape(y_true_scaled_plot.shape)
y_pred_inv_plot = y_pred_inv_plot_flat.reshape(y_pred_scaled_plot.shape)
logger.debug("Successfully inverse-transformed data for multi-horizon plot.")
except Exception as e:
logger.error(f"Failed to inverse transform data for multi-horizon plot: {e}", exc_info=True)
# Fallback to plotting scaled data if inverse transform fails
y_true_inv_plot = y_true_scaled_plot
y_pred_inv_plot = y_pred_scaled_plot
ylabel = f"{ylabel} (Scaled Data - Inverse Transform Failed)"
if y_true_inv_plot is None or y_pred_inv_plot is None:
# This should not happen with the fallback, but as a safeguard
logger.error("Inverse transformed data is None, cannot plot.")
return fig # Return empty figure
# Plot Actuals (using h1's time index, as it's the reference point)
ax.plot(time_index_h1_plot, y_true_inv_plot[:, 0], label='Actuals', marker='.', linestyle='-', markersize=4, linewidth=1.5, color='black') # Actuals for H1
# Plot predictions for each horizon
colors = sns.color_palette("viridis", len(forecast_horizons)) # Use palette for distinct colors
linestyles = ['-', '--', '-.', ':'] * (len(forecast_horizons) // 4 + 1) # Cycle through linestyles
for i, horizon in enumerate(forecast_horizons):
preds_h = y_pred_inv_plot[:, i]
# Calculate time index for this specific horizon by shifting the h1 index
# Assumes the time index frequency is appropriate for the horizon steps
try:
time_index_h = time_index_h1_plot + pd.to_timedelta(horizon - forecast_horizons[0], unit='h') # Assuming 'h' for hours
ax.plot(time_index_h, preds_h, label=f'Predicted (h={horizon})', marker='x', linestyle=linestyles[i], markersize=4, alpha=0.8, linewidth=1, color=colors[i])
except Exception as e:
logger.warning(f"Could not calculate time index for horizon {horizon}: {e}. Skipping plot for this horizon.", exc_info=True)
# Configure plot appearance
ax.set_title(effective_title, fontsize=16) # Slightly larger title
ax.set_xlabel(xlabel, fontsize=12)
ax.set_ylabel(ylabel, fontsize=12)
ax.legend(fontsize=10) # Smaller legend font
ax.grid(True, linestyle='--', alpha=0.6)
# Improve x-axis readability for datetimes
fig.autofmt_xdate() # Auto-rotate date labels
fig.tight_layout()
return fig
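# Hypothetical usage sketch for create_multi_horizon_time_series_plot, assuming an hourly
# index, a fitted StandardScaler and horizons [1, 6]; all data below is synthetic.
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler

_rng = np.random.default_rng(0)
_n, _horizons = 48, [1, 6]
_scaler = StandardScaler().fit(_rng.normal(size=(200, 1)))
_y_true_scaled = _rng.normal(size=(_n, len(_horizons)))
_y_pred_scaled = _y_true_scaled + _rng.normal(scale=0.1, size=(_n, len(_horizons)))
_time_index = pd.date_range("2025-01-01", periods=_n, freq="h")

_fig = create_multi_horizon_time_series_plot(
    _y_true_scaled, _y_pred_scaled, _scaler, _time_index,
    forecast_horizons=_horizons, title="Synthetic multi-horizon check",
)
save_plot(_fig, "multi_horizon_check.png")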
def plot_loss_curve_from_csv(
metrics_csv_path: Union[str, Path],
output_path: Union[str, Path],
title: str = "Training Loss Curve",
train_loss_col: str = "train_loss", # Changed to match logging in model.py
val_loss_col: str = "val_loss", # Common validation loss metric logged by PL
epoch_col: str = "epoch"
) -> None:
"""
Reads training metrics from a PyTorch Lightning CSVLogger file and plots
training and validation loss curves over epochs.
Args:
metrics_csv_path: Path to the metrics.csv file generated by CSVLogger.
output_path: Path where the plot image will be saved.
title: Title for the plot.
train_loss_col: Name of the column containing epoch-level training loss.
val_loss_col: Name of the column containing epoch-level validation loss.
epoch_col: Name of the column containing the epoch number.
Raises:
FileNotFoundError: If the metrics_csv_path does not exist.
KeyError: If required columns are not found in the CSV.
Exception: For other plotting or file reading errors.
"""
logger.info(f"Generating loss curve plot from: {metrics_csv_path}")
metrics_path = Path(metrics_csv_path)
if not metrics_path.is_file():
raise FileNotFoundError(f"Metrics CSV file not found at: {metrics_path}")
try:
metrics_df = pd.read_csv(metrics_path)
# Check if required columns exist
required_cols = [epoch_col, train_loss_col]
# Val loss column might be the scaled loss or the original scale MAE
possible_val_cols = [val_loss_col, 'val_MeanAbsoluteError_Original_Scale', 'val_mae_orig_scale'] # Include potential names
found_val_col = None
for col in possible_val_cols:
if col in metrics_df.columns:
found_val_col = col
break
missing_cols = [col for col in required_cols if col not in metrics_df.columns]
if missing_cols or not found_val_col:
raise KeyError(f"Missing required columns in {metrics_path}: {missing_cols} and/or a suitable validation loss/metric column from {possible_val_cols}.")
# --- Plotting ---
setup_plot_style() # Apply standard style
fig, ax1 = plt.subplots(figsize=(12, 6))
color1 = 'tab:red'
ax1.set_xlabel(epoch_col.capitalize())
# Adjust ylabel based on actual column name used for train loss
ax1.set_ylabel(train_loss_col.replace('_epoch','').replace('_',' ').capitalize(), color=color1)
# Drop NaNs specific to this column for plotting integrity
train_plot_data = metrics_df[[epoch_col, train_loss_col]].dropna(subset=[train_loss_col])
# Filter for epoch column only if needed (usually not for loss plots)
# train_plot_data = train_plot_data[train_plot_data[epoch_col].notna()]
# Ensure epoch starts from 0 or 1 consistently
if train_plot_data[epoch_col].min() > 0 and 0 in metrics_df[epoch_col].unique():
# If epoch starts from 1 in plot data but 0 exists, adjust x-axis for alignment
ax1.plot(train_plot_data[epoch_col] + 1, train_plot_data[train_loss_col], color=color1, label='Train Loss', marker='.', linestyle='-')
logger.debug("Adjusting train loss x-axis by +1 for epoch alignment.")
else:
ax1.plot(train_plot_data[epoch_col], train_plot_data[train_loss_col], color=color1, label='Train Loss', marker='.', linestyle='-')
ax1.tick_params(axis='y', labelcolor=color1)
ax1.grid(True, axis='y', linestyle='--', alpha=0.6, which='major')
# Validation loss/metric plotting on twin axis
ax2 = ax1.twinx()
color2 = 'tab:blue'
# Adjust ylabel based on actual column name used for val metric
ax2.set_ylabel(found_val_col.replace('_epoch','').replace('_',' ').capitalize(), color=color2)
# Drop NaNs specific to the found validation column
val_plot_data = metrics_df[[epoch_col, found_val_col]].dropna(subset=[found_val_col])
# val_plot_data = val_plot_data[val_plot_data[epoch_col].notna()] # Ensure epoch is not NaN
# Ensure epoch starts from 0 or 1 consistently
if val_plot_data[epoch_col].min() > 0 and 0 in metrics_df[epoch_col].unique():
# If epoch starts from 1 in plot data but 0 exists, adjust x-axis for alignment
ax2.plot(val_plot_data[epoch_col] + 1, val_plot_data[found_val_col], color=color2, label='Validation Metric', marker='x', linestyle='--')
logger.debug("Adjusting val metric x-axis by +1 for epoch alignment.")
else:
ax2.plot(val_plot_data[epoch_col], val_plot_data[found_val_col], color=color2, label='Validation Metric', marker='x', linestyle='--')
ax2.tick_params(axis='y', labelcolor=color2)
# Add legend manually combining lines from both axes
lines, labels = ax1.get_legend_handles_labels()
lines2, labels2 = ax2.get_legend_handles_labels()
ax2.legend(lines + lines2, labels + labels2, loc='upper right')
plt.title(title, fontsize=14)
fig.tight_layout() # Otherwise the right y-label is slightly clipped
# Save the plot
save_plot(fig, output_path)
except pd.errors.EmptyDataError:
logger.error(f"Metrics CSV file is empty: {metrics_csv_path}")
except KeyError as e:
logger.error(f"Could not find expected column in {metrics_csv_path}: {e}")
raise # Re-raise specific error after logging
except Exception as e:
logger.error(f"Failed to create or save loss curve plot from {metrics_csv_path}: {e}", exc_info=True)
raise # Re-raise general errors
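# Hypothetical usage sketch: plot the loss curve from a PyTorch Lightning CSVLogger run.
# The paths below are assumptions; CSVLogger typically writes
# <save_dir>/<name>/version_<k>/metrics.csv.
plot_loss_curve_from_csv(
    metrics_csv_path="output/classic_run/training_logs/version_0/metrics.csv",
    output_path="output/classic_run/plots/loss_curve.png",
    title="Classic Run Training Loss",
)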

View File

@ -0,0 +1,20 @@
"""
TODO
"""
__version__ = "0.1.0"
# Expose core components for easier import
from .ensemble_evaluation import (
run_ensemble_evaluation
)
# Expose main configuration class from utils
from ..utils import MainConfig
# Define __all__ for explicit public API (optional but good practice)
__all__ = [
"run_ensemble_evaluation",
"MainConfig",
]

View File

@ -0,0 +1,276 @@
"""
Classic training routine: Train on initial data segment, validate and test on final segments.
"""
import logging
import time
from pathlib import Path
import pandas as pd
import torch
import yaml
import pytorch_lightning as pl
from pytorch_lightning.callbacks import EarlyStopping, ModelCheckpoint, LearningRateMonitor
from pytorch_lightning.loggers import CSVLogger
from typing import Dict, Optional
from forecasting_model.utils.forecast_config_model import MainConfig
from forecasting_model.data_processing import prepare_fold_data_and_loaders, split_data_classic
from forecasting_model.train.model import LSTMForecastLightningModule
from forecasting_model.evaluation import evaluate_fold_predictions
from forecasting_model.utils.helper import save_results
from forecasting_model.io.plotting import plot_loss_curve_from_csv
logger = logging.getLogger(__name__)
def run_classic_training(
config: MainConfig,
full_df: pd.DataFrame,
output_base_dir: Path
) -> Optional[Dict[str, float]]:
"""
Runs a single training pipeline using a classic train/val/test split.
Args:
config: The main configuration object.
full_df: The complete raw DataFrame.
output_base_dir: The base directory where general outputs are saved.
Classic results will be saved in a subdirectory.
Returns:
A dictionary containing test metrics (e.g., {'MAE': ..., 'RMSE': ...})
for the classic run, or None if it fails.
"""
run_start_time = time.perf_counter()
logger.info("--- Starting Classic Training Run ---")
# Define a specific output directory for this run
classic_output_dir = output_base_dir / "classic_run"
classic_output_dir.mkdir(parents=True, exist_ok=True)
logger.info(f"Classic run outputs will be saved to: {classic_output_dir}")
test_metrics: Optional[Dict[str, float]] = None
best_val_score: Optional[float] = None
best_model_path: Optional[str] = None
try:
# --- Data Splitting ---
logger.info("Splitting data into classic train/val/test sets...")
n_samples = len(full_df)
val_frac = config.cross_validation.val_size_fraction
test_frac = config.cross_validation.test_size_fraction
train_idx, val_idx, test_idx = split_data_classic(n_samples, val_frac, test_frac)
# Store test datetime index for evaluation plotting
test_datetime_index = full_df.iloc[test_idx].index
# --- Data Preparation ---
logger.info("Preparing data loaders for the classic split...")
train_loader, val_loader, test_loader, target_scaler, input_size = prepare_fold_data_and_loaders(
full_df=full_df,
train_idx=train_idx,
val_idx=val_idx,
test_idx=test_idx,
target_col=config.data.target_col,
feature_config=config.features,
train_config=config.training,
eval_config=config.evaluation
)
logger.info(f"Data loaders prepared. Input size determined: {input_size}")
# Save artifacts specific to this run if needed (e.g., for later inference)
torch.save(test_loader, classic_output_dir / "classic_test_loader.pt")
torch.save(target_scaler, classic_output_dir / "classic_target_scaler.pt")
torch.save(input_size, classic_output_dir / "classic_input_size.pt")
# Save config for this run
        try: config_dump = config.model_dump() # Pydantic v2
        except AttributeError: config_dump = config.dict() # Fallback for Pydantic v1
with open(classic_output_dir / "config.yaml", 'w') as f:
yaml.dump(config_dump, f, default_flow_style=False)
# --- Model Initialization ---
model = LSTMForecastLightningModule(
model_config=config.model,
train_config=config.training,
input_size=input_size,
target_scaler=target_scaler
)
logger.info("Classic LSTMForecastLightningModule initialized.")
# --- PyTorch Lightning Callbacks ---
monitor_metric = "val_MeanAbsoluteError" # Monitor same metric as CV folds
monitor_mode = "min"
early_stop_callback = None
if config.training.early_stopping_patience is not None and config.training.early_stopping_patience > 0:
early_stop_callback = EarlyStopping(
monitor=monitor_metric, min_delta=0.0001,
patience=config.training.early_stopping_patience, verbose=True, mode=monitor_mode
)
logger.info(f"Enabled EarlyStopping: monitor='{monitor_metric}', patience={config.training.early_stopping_patience}")
checkpoint_callback = ModelCheckpoint(
dirpath=classic_output_dir / "checkpoints",
filename="best_classic_model", # Simple filename
save_top_k=1, monitor=monitor_metric, mode=monitor_mode, verbose=True
)
logger.info(f"Enabled ModelCheckpoint: monitor='{monitor_metric}', mode='{monitor_mode}'")
lr_monitor = LearningRateMonitor(logging_interval='epoch')
callbacks = [checkpoint_callback, lr_monitor]
if early_stop_callback: callbacks.append(early_stop_callback)
# --- PyTorch Lightning Logger ---
pl_logger = CSVLogger(save_dir=str(classic_output_dir), name="training_logs")
logger.info(f"Using CSVLogger, logs will be saved in: {pl_logger.log_dir}")
# --- PyTorch Lightning Trainer ---
accelerator = 'gpu' if torch.cuda.is_available() else 'cpu'
devices = 1 if accelerator == 'gpu' else None
precision = getattr(config.training, 'precision', 32)
trainer = pl.Trainer(
accelerator=accelerator, devices=devices,
max_epochs=config.training.epochs,
callbacks=callbacks, logger=pl_logger,
log_every_n_steps=max(1, len(train_loader)//10),
enable_progress_bar=True,
gradient_clip_val=getattr(config.training, 'gradient_clip_val', None),
precision=precision,
)
logger.info(f"Initialized PyTorch Lightning Trainer: accelerator='{accelerator}', devices={devices}, precision={precision}")
# --- Training ---
logger.info("Starting classic model training...")
trainer.fit(model, train_dataloaders=train_loader, val_dataloaders=val_loader)
logger.info("Classic model training finished.")
# Store best validation score and path
best_val_score_tensor = trainer.checkpoint_callback.best_model_score
best_model_path = trainer.checkpoint_callback.best_model_path
best_val_score = best_val_score_tensor.item() if best_val_score_tensor is not None else None
if best_val_score is not None:
logger.info(f"Best validation score ({monitor_metric}): {best_val_score:.4f}")
logger.info(f"Best model checkpoint path: {best_model_path}")
else:
logger.warning(f"Could not retrieve best validation score/path (metric: {monitor_metric}). Evaluation might use last model.")
best_model_path = None
# --- Prediction on Test Set ---
logger.info("Starting prediction on classic test set using best checkpoint...")
prediction_results_list = trainer.predict(
ckpt_path=best_model_path if best_model_path else 'last',
dataloaders=test_loader
)
# --- Evaluation ---
if not prediction_results_list:
logger.error("Predict phase did not return any results for classic run.")
test_metrics = None
else:
try:
# Shapes: (n_samples, len(horizons))
all_preds_scaled = torch.cat([b['preds_scaled'] for b in prediction_results_list], dim=0).numpy()
n_predictions = len(all_preds_scaled) # Number of samples actually predicted
if 'targets_scaled' in prediction_results_list[0]:
all_targets_scaled = torch.cat([b['targets_scaled'] for b in prediction_results_list], dim=0).numpy()
if len(all_targets_scaled) != n_predictions:
logger.error(f"Classic Run: Mismatch between number of predictions ({n_predictions}) and targets ({len(all_targets_scaled)}).")
raise ValueError("Prediction and target count mismatch during classic evaluation.")
else:
raise ValueError("Targets missing from prediction results.")
logger.info(f"Processing {n_predictions} prediction results for classic test set...")
# --- Calculate Correct Time Index for Plotting (First Horizon) ---
target_time_index_for_plotting = None
if test_idx is not None and config.features.forecast_horizon:
try:
test_block_index = full_df.index[test_idx] # Use the test_idx from classic split
seq_len = config.features.sequence_length
first_horizon = config.features.forecast_horizon[0]
start_offset = seq_len + first_horizon - 1
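                        # Illustrative example (numbers not from the config): with sequence_length=24
                        # and first horizon=1, the first input window covers test_block_index[0:24]
                        # and its prediction targets test_block_index[24], i.e. start_offset = 24 + 1 - 1.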
if start_offset < len(test_block_index):
end_index = min(start_offset + n_predictions, len(test_block_index))
target_time_index_for_plotting = test_block_index[start_offset:end_index]
if len(target_time_index_for_plotting) != n_predictions:
logger.warning(f"Classic Run: Calculated target time index length ({len(target_time_index_for_plotting)}) "
f"does not match prediction count ({n_predictions}). Plotting x-axis might be misaligned.")
target_time_index_for_plotting = None
else:
logger.warning(f"Classic Run: Cannot calculate target time index, start offset ({start_offset}) "
f"exceeds test block length ({len(test_block_index)}).")
except Exception as e:
logger.error(f"Classic Run: Error calculating target time index for plotting: {e}", exc_info=True)
target_time_index_for_plotting = None # Ensure it's None if error occurs
else:
logger.warning(f"Classic Run: Skipping target time index calculation (missing test_idx or forecast_horizon).")
# --- End Index Calculation ---
# Use the classic run specific objects and config
test_metrics = evaluate_fold_predictions(
y_true_scaled=all_targets_scaled,
y_pred_scaled=all_preds_scaled,
target_scaler=target_scaler,
eval_config=config.evaluation,
fold_num=-1, # Indicate classic run
output_dir=str(classic_output_dir),
plot_subdir="plots",
prediction_time_index=target_time_index_for_plotting, # Pass the correctly calculated index
forecast_horizons=config.features.forecast_horizon,
plot_title_prefix="Classic Run"
)
# Save metrics
save_results({"overall_metrics": test_metrics}, classic_output_dir / "test_metrics.json")
logger.info(f"Classic run test metrics (overall): {test_metrics}")
# --- Plot Loss Curve for Classic Run ---
try:
# Adjusted logic to find metrics.csv inside potential version_*/ directories
classic_log_dir = classic_output_dir / "training_logs"
metrics_file = None
version_dirs = list(classic_log_dir.glob("version_*"))
if version_dirs:
# Assuming the latest version directory contains the relevant logs
latest_version_dir = max(version_dirs, key=lambda p: p.stat().st_mtime)
potential_metrics_file = latest_version_dir / "metrics.csv"
if potential_metrics_file.is_file():
metrics_file = potential_metrics_file
else:
logger.warning(f"Classic Run: metrics.csv not found in latest version directory: {latest_version_dir}")
else:
# Fallback if no version_* directories exist (less common with CSVLogger)
potential_metrics_file = classic_log_dir / "metrics.csv"
if potential_metrics_file.is_file():
metrics_file = potential_metrics_file
if metrics_file and metrics_file.is_file():
plot_loss_curve_from_csv(
metrics_csv_path=metrics_file,
output_path=classic_output_dir / "loss_curve.png",
title="Classic Run Training Progression",
train_loss_col='train_loss', # Changed from 'train_loss_epoch'
val_loss_col='val_loss' # Keep as 'val_loss'
)
logger.info(f"Generating loss curve for classic run from: {metrics_file}")
else:
logger.warning(f"Classic Run: Could not find metrics.csv in {classic_log_dir} or its version subdirectories for loss curve plot.")
except Exception as plot_e:
logger.error(f"Classic Run: Failed to generate loss curve plot: {plot_e}", exc_info=True)
# --- End Classic Loss Plotting ---
except (KeyError, ValueError, Exception) as e:
logger.error(f"Error processing classic prediction results: {e}", exc_info=True)
test_metrics = None
except Exception as e:
logger.error(f"An error occurred during the classic training pipeline: {e}", exc_info=True)
test_metrics = None # Indicate failure
finally:
if torch.cuda.is_available(): torch.cuda.empty_cache()
run_end_time = time.perf_counter()
logger.info(f"--- Finished Classic Training Run in {run_end_time - run_start_time:.2f} seconds ---")
return test_metrics
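
For orientation, a minimal driver-side sketch of how this routine would be invoked; the data-loading line is hypothetical and `load_config` refers to the helper shown later in this commit:

# Hedged usage sketch (driver side, not part of this module):
#   from pathlib import Path
#   import pandas as pd
#   config = load_config(Path("config.yaml"))                               # helper defined later in this commit
#   full_df = pd.read_csv("data/series.csv", index_col=0, parse_dates=True) # hypothetical loading step
#   metrics = run_classic_training(config, full_df, Path(config.output_dir))
#   # -> {'MAE': ..., 'RMSE': ...} on success, None on failure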

View File

@@ -0,0 +1,425 @@
"""
Ensemble evaluation for time series forecasting models.
This module provides functionality to evaluate ensemble predictions
by combining predictions from n-1 folds and testing on the remaining fold.
"""
import logging
import numpy as np
import torch
import yaml # For loading fold config
from pathlib import Path
from typing import Dict, List, Optional, Tuple, Union
from sklearn.preprocessing import StandardScaler, MinMaxScaler
import pandas as pd # For time index handling
import pickle # Need pickle for the specific error check
from forecasting_model.evaluation import evaluate_fold_predictions
from forecasting_model.train.model import LSTMForecastLightningModule
from forecasting_model.utils.forecast_config_model import MainConfig
logger = logging.getLogger(__name__)
def load_fold_model_and_objects(
fold_dir: Path,
) -> Optional[Tuple[LSTMForecastLightningModule, MainConfig, torch.utils.data.DataLoader, Union[StandardScaler, MinMaxScaler, None], int, Optional[pd.Index], List[int]]]:
"""
Load a trained model, its config, dataloader, scaler, input_size, prediction time index, and forecast horizons.
Args:
fold_dir: Directory containing the fold's artifacts (checkpoint, config, loader, etc.).
Returns:
A tuple containing (model, config, test_loader, target_scaler, input_size, prediction_target_time_index, forecast_horizons)
or None if any essential artifact is missing or loading fails.
"""
try:
logger.info(f"Loading artifacts from: {fold_dir}")
# 1. Load Fold Configuration
config_path = fold_dir / "config.yaml"
if not config_path.is_file():
logger.error(f"Fold config file not found in {fold_dir}")
return None
with open(config_path, 'r') as f:
fold_config_dict = yaml.safe_load(f)
fold_config = MainConfig(**fold_config_dict) # Validate fold's config
# 2. Load Saved Objects using torch.load
test_loader_path = fold_dir / "test_loader.pt"
scaler_path = fold_dir / "target_scaler.pt"
input_size_path = fold_dir / "input_size.pt"
prediction_index_path = fold_dir / "prediction_target_time_index.pt"
if not all([p.is_file() for p in [test_loader_path, scaler_path, input_size_path]]):
logger.error(f"Missing one or more required artifacts (test_loader, target_scaler, input_size) in {fold_dir}")
return None
try:
# --- Explicitly set weights_only=False for non-model objects ---
test_loader = torch.load(test_loader_path, weights_only=False)
target_scaler = torch.load(scaler_path, weights_only=False)
input_size = torch.load(input_size_path, weights_only=False)
# --- End Modification ---
except pickle.UnpicklingError as e:
# Catch potential unpickling errors even with weights_only=False
logger.error(f"Failed to unpickle saved object in {fold_dir}: {e}", exc_info=True)
return None
except AttributeError as e:
# Catch potential issues if class definitions changed between saving and loading
logger.error(f"AttributeError loading saved object in {fold_dir} (class definition changed?): {e}", exc_info=True)
return None
except Exception as e:
# Catch other potential loading errors
logger.error(f"Unexpected error loading saved objects (loader/scaler/size) from {fold_dir}: {e}", exc_info=True)
return None
# Retrieve forecast horizon list from the fold's config
forecast_horizons = fold_config.features.forecast_horizon
# --- Extract prediction target time index (if available) ---
prediction_target_time_index: Optional[pd.Index] = None
if prediction_index_path.is_file():
try:
prediction_target_time_index = torch.load(prediction_index_path, weights_only=False)
# Basic validation
if not isinstance(prediction_target_time_index, pd.Index):
logger.warning(f"Loaded prediction index from {prediction_index_path} is not a pandas Index.")
prediction_target_time_index = None
else:
logger.debug(f"Loaded prediction target time index from {prediction_index_path}")
except Exception as e:
logger.warning(f"Failed to load prediction target time index from {prediction_index_path}: {e}")
else:
logger.warning(f"Prediction target time index file not found at {prediction_index_path}. Plotting x-axis might be inaccurate for ensemble plots.")
# --- End Index Extraction ---
# 3. Find Checkpoint and Load Model
checkpoint_path = None
try:
            # Search recursively ('**' glob) for a checkpoint that may be nested in subdirectories
checkpoints = list(fold_dir.glob("**/best_model_fold_*.ckpt"))
if not checkpoints:
logger.error(f"No 'best_model_fold_*.ckpt' checkpoint found in {fold_dir} or subdirectories.")
return None
if len(checkpoints) > 1:
logger.warning(f"Multiple checkpoints found in {fold_dir}, using the first one: {checkpoints[0]}")
checkpoint_path = checkpoints[0]
logger.info(f"Loading model from checkpoint: {checkpoint_path}")
model = LSTMForecastLightningModule.load_from_checkpoint(
checkpoint_path,
map_location=torch.device('cpu'), # Optional: load to CPU first if memory is tight
model_config=fold_config.model,
train_config=fold_config.training,
input_size=input_size,
target_scaler=target_scaler
)
model.eval()
logger.info(f"Successfully loaded model and artifacts from {fold_dir}")
return model, fold_config, test_loader, target_scaler, input_size, prediction_target_time_index, forecast_horizons
except FileNotFoundError:
logger.error(f"Checkpoint file not found: {checkpoint_path}")
return None
except Exception as e:
logger.error(f"Failed to load model from checkpoint {checkpoint_path} in {fold_dir}: {e}", exc_info=True)
return None
except Exception as e:
logger.error(f"Generic error loading artifacts from {fold_dir}: {e}", exc_info=True)
return None
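# Illustrative unpacking of the 7-tuple returned above (order matters):
#   model, fold_cfg, test_loader, scaler, input_size, pred_time_index, horizons = load_fold_model_and_objects(fold_dir)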
def make_ensemble_predictions(
models: List[LSTMForecastLightningModule],
test_loader: torch.utils.data.DataLoader,
device: Optional[torch.device] = None
) -> Tuple[Optional[Dict[str, np.ndarray]], Optional[np.ndarray]]:
"""
Make predictions using an ensemble of models efficiently.
Processes the test_loader once, getting predictions from all models per batch.
Args:
models: List of trained models (already in eval mode).
test_loader: DataLoader for the test set.
device: Device to run predictions on (e.g., torch.device("cuda:0")).
If None, attempts to use GPU if available, else CPU.
Returns:
Tuple of (ensemble_predictions, targets):
- ensemble_predictions: Dict containing ensemble predictions keyed by method
('mean', 'median', 'min', 'max'). Values are np.arrays.
Returns None if prediction fails.
- targets: Ground truth values as a single np.array. Returns None if prediction fails
or targets are unavailable in loader.
"""
if not models:
logger.warning("make_ensemble_predictions received an empty list of models.")
return None, None
if device is None:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
logger.info(f"Running ensemble predictions on device: {device}")
# Move all models to the target device
for model in models:
model.to(device)
all_batch_preds: List[List[np.ndarray]] = [[] for _ in models] # Outer list: models, Inner list: batches
all_batch_targets: List[np.ndarray] = []
targets_available = True
with torch.no_grad():
for batch_idx, batch in enumerate(test_loader):
try:
# Determine if batch contains targets
if isinstance(batch, (list, tuple)) and len(batch) == 2:
x, y = batch
x = x.to(device)
# Keep targets on CPU until needed for concatenation
all_batch_targets.append(y.cpu().numpy())
else:
x = batch.to(device)
targets_available = False # No targets found in this batch
# Get predictions from all models for this batch
for i, model in enumerate(models):
try:
pred = model(x) # Shape: (batch, horizon)
all_batch_preds[i].append(pred.cpu().numpy())
except Exception as model_err:
logger.error(f"Error during prediction with model {i} on batch {batch_idx}: {model_err}", exc_info=True)
# Handle error: Fill with NaNs? Skip model? For now, fill with NaNs of expected shape
# Infer expected shape: (batch_size, horizon)
batch_size = x.shape[0]
horizon = models[0].output_size # Assume all models have same horizon
nan_preds = np.full((batch_size, horizon), np.nan)
all_batch_preds[i].append(nan_preds)
except Exception as batch_err:
logger.error(f"Error processing batch {batch_idx} for ensemble prediction: {batch_err}", exc_info=True)
# If a batch fails catastrophically, we might not be able to proceed reliably
return None, None # Indicate failure
# Concatenate batch results for each model
model_preds_concat = []
for i in range(len(models)):
if not all_batch_preds[i]: # Check if any predictions were collected for this model
logger.warning(f"No predictions collected for model index {i}. Skipping this model in ensemble.")
continue # Skip this model if it failed on all batches
try:
model_preds_concat.append(np.concatenate(all_batch_preds[i], axis=0))
except ValueError as e:
logger.error(f"Failed to concatenate predictions for model index {i}: {e}. Check for shape mismatches or empty lists.")
# Decide how to handle: skip model or fail? Let's skip for robustness.
continue
if not model_preds_concat:
logger.error("No valid predictions collected from any model in the ensemble.")
return None, None
# Concatenate targets if available
targets_concat = None
if targets_available and all_batch_targets:
try:
targets_concat = np.concatenate(all_batch_targets, axis=0)
except ValueError as e:
logger.error(f"Failed to concatenate targets: {e}")
return None, None # Fail if targets were expected but couldn't be combined
elif targets_available and not all_batch_targets:
logger.warning("Targets were expected based on first batch, but none were collected.")
# Proceed without targets, returning None for them
# Stack predictions from all models: Shape (num_models, num_samples, horizon)
try:
stacked_preds = np.stack(model_preds_concat, axis=0)
except ValueError as e:
logger.error(f"Failed to stack model predictions: {e}. Check if all models produced compatible shapes.")
return None, targets_concat # Return targets if available, but no ensemble preds
# Calculate different ensemble predictions (handle NaNs potentially introduced by model failures)
# np.nanmean, np.nanmedian etc. ignore NaNs
ensemble_preds = {
'mean': np.nanmean(stacked_preds, axis=0),
'median': np.nanmedian(stacked_preds, axis=0),
'min': np.nanmin(stacked_preds, axis=0),
'max': np.nanmax(stacked_preds, axis=0)
}
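    # Illustrative note: with three models, per-element predictions [10.0, 12.0, nan] yield a mean
    # and median of 11.0, because the NaN contributed by a failed model is ignored by the
    # nan-aware reductions above.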
logger.info(f"Ensemble predictions generated using {stacked_preds.shape[0]} models.")
return ensemble_preds, targets_concat
def evaluate_ensemble_for_test_fold(
test_fold_num: int,
all_fold_dirs: List[Path],
output_base_dir: Path,
# full_data_index: Optional[pd.Index] = None # Removed, get from loaded objects
) -> Optional[Dict[str, Dict[str, float]]]:
"""
Evaluates ensemble predictions for a specific test fold.
Args:
test_fold_num: The 1-based number of the fold to use as the test set.
all_fold_dirs: List of paths to all fold directories.
output_base_dir: Base directory for saving evaluation results/plots.
Returns:
Dictionary containing metrics for each ensemble method for this test fold,
or None if evaluation fails.
"""
logger.info(f"--- Evaluating Ensemble: Test Fold {test_fold_num} ---")
test_fold_dir = output_base_dir / f"fold_{test_fold_num:02d}"
load_result = load_fold_model_and_objects(test_fold_dir)
if load_result is None:
logger.error(f"Failed to load necessary artifacts for test fold {test_fold_num}. Skipping ensemble evaluation for this fold.")
return None
# Unpack results including the prediction time index and horizons
_, test_fold_config, test_loader, target_scaler, _, prediction_target_time_index, test_forecast_horizons = load_result
# Load models from all *other* folds
ensemble_models: List[LSTMForecastLightningModule] = []
model_forecast_horizons = None # Track horizons from loaded models
for i, fold_dir in enumerate(all_fold_dirs):
current_fold_num = i + 1
if current_fold_num == test_fold_num:
continue # Skip the test fold itself
model_load_result = load_fold_model_and_objects(fold_dir)
if model_load_result:
model, _, _, _, _, _, fold_horizons = model_load_result # Only need the model here
if model:
ensemble_models.append(model)
# Store horizons from the first successful model load
if model_forecast_horizons is None:
model_forecast_horizons = fold_horizons
# Optional: Check consistency of horizons across ensemble models
elif set(model_forecast_horizons) != set(fold_horizons):
logger.error(f"Inconsistent forecast horizons between ensemble models! Test fold {test_fold_num} expected {test_forecast_horizons}, "
f"Model {i+1} has {fold_horizons}. Ensemble may be invalid.")
# Decide how to handle: error out, or proceed with caution?
# return None # Option: Fail hard
else:
logger.warning(f"Could not load model from fold {current_fold_num} to include in ensemble for test fold {test_fold_num}.")
if len(ensemble_models) < 2:
logger.warning(f"Skipping ensemble evaluation for test fold {test_fold_num}: "
f"Need at least 2 models for ensemble, only loaded {len(ensemble_models)}.")
return {} # Return empty dict, not None, to indicate process ran but no ensemble formed
# Check consistency between test fold horizons and ensemble model horizons
if model_forecast_horizons is None: # Should not happen if len(ensemble_models) >= 1
logger.error(f"Could not determine forecast horizons from ensemble models for test fold {test_fold_num}.")
return None
if set(test_forecast_horizons) != set(model_forecast_horizons):
logger.error(f"Forecast horizons of test fold {test_fold_num} ({test_forecast_horizons}) do not match "
f"horizons from ensemble models ({model_forecast_horizons}). Cannot evaluate.")
return None
# Make ensemble predictions using the loaded models and the test fold's data loader
# Use the test fold's config to determine device implicitly
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
ensemble_preds_dict, targets_np = make_ensemble_predictions(ensemble_models, test_loader, device=device)
if ensemble_preds_dict is None or targets_np is None:
logger.error(f"Failed to generate ensemble predictions or retrieve targets for test fold {test_fold_num}.")
return None # Indicate failure
# Evaluate each ensemble method's predictions against the test fold's targets
fold_ensemble_results: Dict[str, Dict[str, float]] = {}
for method, preds_np in ensemble_preds_dict.items():
logger.info(f"Evaluating ensemble method '{method}' for test fold {test_fold_num}...")
# Define a unique output directory for this method's plots
method_plot_dir = output_base_dir / "ensemble_eval_plots" / f"test_fold_{test_fold_num:02d}" / f"method_{method}"
# Use the prediction_target_time_index loaded earlier
prediction_time_index_for_plot = None
if prediction_target_time_index is not None:
if len(prediction_target_time_index) == targets_np.shape[0]:
prediction_time_index_for_plot = prediction_target_time_index
else:
logger.warning(f"Length of loaded prediction target time index ({len(prediction_target_time_index)}) does not match "
f"number of samples ({targets_np.shape[0]}) for test fold {test_fold_num}, method '{method}'. Plot x-axis may be incorrect.")
# Call the standard evaluation function
metrics = evaluate_fold_predictions(
y_true_scaled=targets_np,
y_pred_scaled=preds_np,
target_scaler=target_scaler,
eval_config=test_fold_config.evaluation,
fold_num=test_fold_num - 1,
output_dir=str(method_plot_dir.parent.parent),
plot_subdir=f"method_{method}",
prediction_time_index=prediction_time_index_for_plot, # Pass the index
forecast_horizons=test_forecast_horizons,
plot_title_prefix=f"Ensemble ({method})"
)
fold_ensemble_results[method] = metrics
logger.info(f"--- Finished Ensemble Evaluation: Test Fold {test_fold_num} ---")
return fold_ensemble_results
def run_ensemble_evaluation(
config: MainConfig, # Pass main config for context if needed, though fold configs are loaded
output_base_dir: Path,
# full_data_index: Optional[pd.Index] = None # Removed, get index from loaded objects
) -> Dict[int, Dict[str, Dict[str, float]]]:
"""
Run ensemble evaluation across all folds, treating each as the test set once.
Args:
config: The main configuration object (potentially unused if fold configs sufficient).
output_base_dir: Base directory where fold outputs are stored.
Returns:
Dictionary containing ensemble metrics for each test fold:
{ test_fold_num: { ensemble_method: { metric_name: value, ... }, ... }, ... }
"""
logger.info("===== Starting Cross-Validated Ensemble Evaluation =====")
all_ensemble_results: Dict[int, Dict[str, Dict[str, float]]] = {}
# Discover fold directories
fold_dirs = sorted([d for d in output_base_dir.glob("fold_*") if d.is_dir()])
if not fold_dirs:
logger.error(f"No fold directories found in {output_base_dir} for ensemble evaluation.")
return {}
if len(fold_dirs) < 2:
logger.warning(f"Need at least 2 folds for ensemble evaluation, found {len(fold_dirs)}. Skipping.")
return {}
logger.info(f"Found {len(fold_dirs)} fold directories.")
# Iterate through each fold, designating it as the test fold
for i, test_fold_dir in enumerate(fold_dirs):
test_fold_num = i + 1 # 1-based fold number
try:
results_for_test_fold = evaluate_ensemble_for_test_fold(
test_fold_num=test_fold_num,
all_fold_dirs=fold_dirs,
output_base_dir=output_base_dir,
# full_data_index=full_data_index # Removed
)
if results_for_test_fold is not None:
# Only add results if the evaluation didn't fail completely
all_ensemble_results[test_fold_num] = results_for_test_fold
except Exception as e:
# Catch unexpected errors during a specific test fold evaluation
logger.error(f"Unexpected error during ensemble evaluation with test fold {test_fold_num}: {e}", exc_info=True)
continue # Continue to the next fold
# Saving is handled by the main script (`forecasting_model_run.py`) which calls this
if not all_ensemble_results:
logger.warning("Ensemble evaluation finished, but no results were generated.")
else:
logger.info("===== Finished Cross-Validated Ensemble Evaluation =====")
return all_ensemble_results
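
A minimal, hedged sketch of the call site and of the returned structure (metric names follow the conventions used elsewhere in this commit):

# Hedged usage sketch (not part of this module):
#   results = run_ensemble_evaluation(config, Path(config.output_dir))
#   results[1]["mean"]    # -> {'MAE': ..., 'RMSE': ...} for the mean ensemble tested on fold 1
#   results[1]["median"]  # -> metrics for the median ensemble; 'min' and 'max' likewise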

View File

View File

@@ -9,7 +9,7 @@ from typing import Optional, Dict, Any, Union, List, Tuple
from sklearn.preprocessing import StandardScaler, MinMaxScaler
# Assuming config_model is in sibling directory utils/
from forecasting_model.utils.config_model import ModelConfig, TrainingConfig
from forecasting_model.utils.forecast_config_model import ModelConfig, TrainingConfig
logger = logging.getLogger(__name__)
@@ -30,41 +30,42 @@ class LSTMForecastLightningModule(pl.LightningModule):
super().__init__()
# --- Validate & Store Configs ---
# Validate the input_size passed during instantiation
if input_size <= 0:
raise ValueError("`input_size` must be provided as a positive integer during model instantiation.")
self._input_size = input_size # Use a temporary attribute
# Store the validated input_size directly for use in layer definitions
self._input_size = input_size # Use a temporary attribute before hparams are saved
# Ensure forecast_horizon is a valid list in the config
if not hasattr(model_config, 'forecast_horizon') or \
not isinstance(model_config.forecast_horizon, list) or \
not model_config.forecast_horizon or \
any(h <= 0 for h in model_config.forecast_horizon):
raise ValueError("ModelConfig requires `forecast_horizon` to be a non-empty list of positive integers.")
# Ensure forecast_horizon is set in the config for the output layer
if not hasattr(model_config, 'forecast_horizon') or model_config.forecast_horizon is None or model_config.forecast_horizon <= 0:
raise ValueError("ModelConfig requires `forecast_horizon` to be set and positive.")
self.output_size = model_config.forecast_horizon
# Output size is the number of horizons we predict
self.output_size = len(model_config.forecast_horizon)
# Store the actual horizon list for reference if needed, ensure sorted
self.forecast_horizons = sorted(model_config.forecast_horizon)
# Store configurations - input_size argument will be saved via save_hyperparameters
self.model_config = model_config
self.train_config = train_config
self.target_scaler = target_scaler # Store scaler for this fold
# Use save_hyperparameters() to automatically log configs and allow loading
# Pass input_size explicitly to be saved in hparams
# Exclude scaler as it's stateful and fold-specific
# Use save_hyperparameters() - forecast_horizon is part of model_config which is saved
self.save_hyperparameters('model_config', 'train_config', 'input_size', ignore=['target_scaler'])
# Note: Pydantic models might not be perfectly saved/loaded by PL's hparams, check if needed.
# If issues arise loading, might need to flatten relevant hparams manually.
# --- Define Model Layers ---
# Access input_size via hparams now
self.lstm = nn.LSTM(
input_size=self.hparams.input_size,
hidden_size=self.hparams.model_config.hidden_size,
num_layers=self.hparams.model_config.num_layers,
batch_first=True, # Input shape: (batch, seq_len, features)
batch_first=True,
dropout=self.hparams.model_config.dropout if self.hparams.model_config.num_layers > 1 else 0.0
)
self.dropout = nn.Dropout(self.hparams.model_config.dropout)
# Output layer maps LSTM hidden state to the forecast horizon
# We typically take the output of the last time step
# Output layer maps LSTM hidden state to the number of forecast horizons
self.fc = nn.Linear(self.hparams.model_config.hidden_size, self.output_size)
# Optional residual connection handling
@@ -96,7 +97,7 @@ class LSTMForecastLightningModule(pl.LightningModule):
self.val_metrics = metrics.clone(prefix='val_')
self.test_metrics = metrics.clone(prefix='test_')
self.val_mae_original_scale = torchmetrics.MeanAbsoluteError()
self.val_MeanAbsoluteError_Original_Scale = torchmetrics.MeanAbsoluteError()
def forward(self, x: torch.Tensor) -> torch.Tensor:
@@ -107,7 +108,8 @@ class LSTMForecastLightningModule(pl.LightningModule):
x: Input tensor of shape (batch_size, sequence_length, input_size)
Returns:
Predictions tensor of shape (batch_size, forecast_horizon)
Predictions tensor of shape (batch_size, len(forecast_horizons))
where each element corresponds to a predicted horizon in sorted order.
"""
# LSTM forward pass
lstm_out, (hidden, cell) = self.lstm(x) # Shape: (batch, seq_len, hidden_size)
@@ -126,59 +128,50 @@ class LSTMForecastLightningModule(pl.LightningModule):
last_time_step_out = last_time_step_out + residual
# Final fully connected layer
predictions = self.fc(last_time_step_out) # Shape: (batch_size, output_size/horizon)
predictions = self.fc(last_time_step_out) # Shape: (batch_size, output_size/len(horizons))
return predictions # Shape: (batch_size, forecast_horizon)
return predictions # Shape: (batch_size, len(forecast_horizons))
def _calculate_loss(self, outputs, targets):
# Ensure shapes match before loss calculation
# Shapes should now be (batch_size, len(horizons)) for both
if outputs.shape != targets.shape:
# Squeeze potential extra dim: (batch, horizon, 1) -> (batch, horizon)
if outputs.ndim == targets.ndim + 1 and outputs.shape[-1] == 1:
outputs = outputs.squeeze(-1)
if outputs.shape != targets.shape:
raise ValueError(f"Output shape {outputs.shape} doesn't match target shape {targets.shape} for loss calculation.")
# Minimal check, dataset __getitem__ should ensure this
raise ValueError(f"Output shape {outputs.shape} doesn't match target shape {targets.shape} for loss calculation.")
return self.criterion(outputs, targets)
def _inverse_transform(self, data: torch.Tensor) -> Optional[torch.Tensor]:
"""Helper to inverse transform data using the stored target scaler."""
"""Helper to inverse transform data (preds or targets) using the stored target scaler."""
if self.target_scaler is None:
# logger.warning("Cannot inverse transform: target_scaler not available.")
return None # Cannot inverse transform
return None
data_cpu = data.detach().cpu().numpy().astype(np.float64)
original_shape = data_cpu.shape # e.g., (batch_size, len(horizons))
num_elements = data_cpu.size
# Scaler expects 2D input (N, 1)
# Ensure data is on CPU and is float64 for sklearn scaler typically
data_cpu = data.detach().cpu().numpy().astype(np.float64)
original_shape = data_cpu.shape
if data_cpu.ndim == 1:
data_flat = data_cpu.reshape(-1, 1)
elif data_cpu.ndim == 2: # (batch, horizon)
data_flat = data_cpu.reshape(-1, 1)
else:
logger.warning(f"Unexpected shape for inverse transform: {original_shape}. Reshaping to (-1, 1).")
data_flat = data_cpu.reshape(-1, 1)
data_flat = data_cpu.reshape(num_elements, 1)
try:
inversed_np = self.target_scaler.inverse_transform(data_flat)
# Return as tensor on the original device
# Return as tensor on the original device, potentially reshaped
inversed_tensor = torch.from_numpy(inversed_np).float().to(data.device)
# Reshape back? Or keep flat? Keep flat for direct metric use often.
return inversed_tensor.flatten()
# return inversed_tensor.reshape(original_shape) # If original shape needed
# Reshape back to original multi-horizon shape
return inversed_tensor.reshape(original_shape)
# return inversed_tensor.flatten() # Keep flat if needed for specific metric inputs
except Exception as e:
logger.error(f"Failed to inverse transform data: {e}", exc_info=True)
return None # Return None if inverse transform fails
return None
def training_step(self, batch: Tuple[torch.Tensor, torch.Tensor], batch_idx: int) -> torch.Tensor:
x, y = batch # Shapes: x=(batch, seq_len, features), y=(batch, horizon)
outputs = self(x) # Scaled outputs: (batch, horizon)
x, y = batch # Shapes: x=(batch, seq_len, features), y=(batch, len(horizons))
outputs = self(x) # Scaled outputs: (batch, len(horizons))
loss = self._calculate_loss(outputs, y)
# Log scaled metrics
metrics = self.train_metrics(outputs, y) # Update internal state
self.train_metrics.update(outputs, y)
self.log('train_loss', loss, on_step=False, on_epoch=True, prog_bar=True, logger=True)
self.log_dict(self.train_metrics, on_step=False, on_epoch=True, logger=True) # Log all metrics in collection
self.log_dict(self.train_metrics, on_step=False, on_epoch=True, logger=True)
return loss
@@ -188,20 +181,22 @@ class LSTMForecastLightningModule(pl.LightningModule):
loss = self._calculate_loss(outputs, y)
# Log scaled metrics
metrics = self.val_metrics(outputs, y) # Update internal state
self.val_metrics.update(outputs, y)
self.log('val_loss', loss, on_step=False, on_epoch=True, prog_bar=True, logger=True)
self.log_dict(self.val_metrics, on_step=False, on_epoch=True, logger=True)
# Log MAE on ORIGINAL scale if scaler is available (often the primary metric for checkpointing/Optuna)
# Log MAE on ORIGINAL scale (primary metric for checkpoints)
if self.target_scaler is not None:
# Inverse transform keeps the (batch, len(horizons)) shape
outputs_inv = self._inverse_transform(outputs)
y_inv = self._inverse_transform(y)
if outputs_inv is not None and y_inv is not None:
# Ensure shapes are compatible (flattened by _inverse_transform)
# Ensure shapes match
if outputs_inv.shape == y_inv.shape:
self.val_mae_original_scale.update(outputs_inv, y_inv)
self.log('val_mae_orig_scale', self.val_mae_original_scale, on_step=False, on_epoch=True, prog_bar=True, logger=True)
# It will compute the average MAE across all elements if multi-dim
self.val_MeanAbsoluteError_Original_Scale.update(outputs_inv, y_inv)
self.log('val_MeanAbsoluteError_Original_Scale', self.val_MeanAbsoluteError_Original_Scale, on_step=False, on_epoch=True, prog_bar=True, logger=True)
else:
logger.warning(f"Shape mismatch after inverse transform in validation: Preds {outputs_inv.shape}, Targets {y_inv.shape}")
else:
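
The hunk above switches the module to one output per requested horizon; a shape-only sketch with invented numbers makes the contract concrete:

# Hedged sketch of the multi-horizon output contract (values invented):
#   horizons = [1, 6, 12]        # FeatureConfig.forecast_horizon, kept sorted by the module
#   x = torch.randn(8, 24, 5)    # (batch, sequence_length, input_size)
#   preds = model(x)             # shape (8, 3); column k is the horizons[k]-step-ahead forecast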

View File

@@ -5,7 +5,7 @@ This package contains configuration models, helper functions, and other utilitie
"""
# Expose configuration models
from .config_model import (
from .forecast_config_model import (
MainConfig,
DataConfig,
FeatureConfig,

View File

@@ -44,7 +44,7 @@ class DataConfig(BaseModel):
class FeatureConfig(BaseModel):
"""Configuration for feature engineering and preprocessing."""
sequence_length: int = Field(..., gt=0)
forecast_horizon: int = Field(..., gt=0)
forecast_horizon: List[int] = Field(..., min_length=1, description="List of specific forecast horizons to predict (e.g., [1, 6, 12]).")
lags: List[int] = []
rolling_window_sizes: List[int] = []
use_time_features: bool = True
@@ -55,11 +55,11 @@
clipping: ClippingConfig = ClippingConfig() # Default instance
scaling_method: Optional[Literal['standard', 'minmax']] = 'standard' # Added literal validation
@field_validator('lags', 'rolling_window_sizes')
@field_validator('lags', 'rolling_window_sizes', 'forecast_horizon')
@classmethod
def check_positive_list_values(cls, v: List[int]) -> List[int]:
if any(val <= 0 for val in v):
raise ValueError('Lists lags/rolling_window_sizes must contain only positive values')
raise ValueError('Lists lags, rolling_window_sizes, and forecast_horizon must contain only positive values')
return v
class ModelConfig(BaseModel):
@@ -69,8 +69,8 @@
num_layers: int = Field(..., gt=0)
dropout: float = Field(..., ge=0.0, le=1.0)
use_residual_skips: bool = False
# Add forecast_horizon here to ensure LightningModule gets it directly
forecast_horizon: Optional[int] = Field(None, gt=0) # Will be set from FeatureConfig
# forecast_horizon: Optional[int] = Field(None, gt=0) # OLD
forecast_horizon: Optional[List[int]] = Field(None, min_length=1) # Will be set from FeatureConfig
class TrainingConfig(BaseModel):
"""Configuration for the training process (PyTorch Lightning)."""
@@ -103,26 +103,35 @@
class OptunaConfig(BaseModel):
"""Optional configuration for Optuna hyperparameter optimization."""
enabled: bool = False
study_name: str = "default_study" # Added study_name
n_trials: int = Field(20, gt=0)
storage: Optional[str] = None # e.g., "sqlite:///output/hpo_results/study.db"
direction: Literal['minimize', 'maximize'] = 'minimize'
metric_to_optimize: str = 'val_mae_orig_scale'
pruning: bool = True
metric_to_optimize: str = 'val_MeanAbsoluteError_Original_Scale' # Updated default metric
pruning: bool = True
# --- Top-Level Configuration Model ---
class MainConfig(BaseModel):
"""Main configuration model nesting all sections."""
project_name: str = "TimeSeriesForecasting"
random_seed: Optional[int] = 42 # Added top-level seed
random_seed: Optional[int] = 42
log_level: Literal['DEBUG', 'INFO', 'WARNING', 'ERROR', 'CRITICAL'] = 'INFO'
output_dir: str = Field("output/cv_results", description="Base directory for saving all outputs (results, logs, models, plots).")
# --- Execution Control ---
run_cross_validation: bool = Field(True, description="Run the main cross-validation training loop?")
run_classic_training: bool = Field(True, description="Run a single classic train/val/test split training?")
run_ensemble_evaluation: bool = Field(True, description="Run ensemble evaluation using CV fold models?")
# --- End Execution Control ---
data: DataConfig
features: FeatureConfig
model: ModelConfig # ModelConfig no longer contains input_size
model: ModelConfig
training: TrainingConfig
cross_validation: CrossValidationConfig
evaluation: EvaluationConfig
optuna: Optional[OptunaConfig] = OptunaConfig() # Added optional Optuna config
optuna: Optional[OptunaConfig] = OptunaConfig()
@model_validator(mode='after')
def check_forecast_horizon_consistency(self) -> 'MainConfig':
@@ -131,20 +140,33 @@ class MainConfig(BaseModel):
if self.model.forecast_horizon is None:
# If model config doesn't have it, set it from features config
self.model.forecast_horizon = self.features.forecast_horizon
elif self.model.forecast_horizon != self.features.forecast_horizon:
elif set(self.model.forecast_horizon) != set(self.features.forecast_horizon): # Compare sets for content equality
# If both are set but differ, raise error
raise ValueError(
f"ModelConfig forecast_horizon ({self.model.forecast_horizon}) must match "
f"FeatureConfig forecast_horizon ({self.features.forecast_horizon})."
)
# After potential setting, ensure model.forecast_horizon is actually set
if self.model and (self.model.forecast_horizon is None or self.model.forecast_horizon <= 0):
raise ValueError("ModelConfig requires a positive forecast_horizon (must be set in features config if not set explicitly in model config).")
# After potential setting, ensure model.forecast_horizon is actually set and valid
if self.model and (
self.model.forecast_horizon is None or
not isinstance(self.model.forecast_horizon, list) or # Check type
len(self.model.forecast_horizon) == 0 or # Check not empty
any(h <= 0 for h in self.model.forecast_horizon) # Check positive values
):
raise ValueError("ModelConfig requires a non-empty list of positive forecast_horizon values (must be set in features config if not set explicitly in model config).")
# Input size check is removed as it's not part of static config anymore
return self
@model_validator(mode='after')
def check_execution_flags(self) -> 'MainConfig':
if not self.run_cross_validation and not self.run_classic_training:
raise ValueError("At least one of 'run_cross_validation' or 'run_classic_training' must be True.")
if self.run_ensemble_evaluation and not self.run_cross_validation:
raise ValueError("'run_ensemble_evaluation' requires 'run_cross_validation' to be True (needs CV fold models).")
return self
class Config:
# Example configuration for Pydantic itself
validate_assignment = True # Re-validate on assignment
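
A hedged illustration of what the updated consistency validator enforces (horizon values invented):

# features.forecast_horizon = [1, 6, 12], model.forecast_horizon = None
#   -> the validator copies the list into the model config.
# features.forecast_horizon = [1, 6, 12], model.forecast_horizon = [1, 24]
#   -> MainConfig(...) raises a ValueError because the horizon sets differ.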

View File

@@ -0,0 +1,173 @@
import argparse
import json
import logging
import random
from pathlib import Path
from typing import Optional, List, Dict
import numpy as np
import pandas as pd
import torch
import yaml
from forecasting_model import MainConfig
# Get the root logger
logger = logging.getLogger(__name__)
def parse_arguments():
"""Parses command-line arguments."""
parser = argparse.ArgumentParser(
description="Run the Time Series Forecasting training pipeline using a configuration file.",
formatter_class=argparse.ArgumentDefaultsHelpFormatter
)
parser.add_argument(
'-c', '--config',
type=str,
default='config.yaml',
help="Path to the YAML configuration file."
)
# Removed seed, debug, and output-dir arguments
args = parser.parse_args()
return args
def load_config(config_path: Path) -> MainConfig:
"""
Load and validate configuration from YAML file using Pydantic.
Args:
config_path: Path to the YAML configuration file.
Returns:
Validated MainConfig object.
Raises:
FileNotFoundError: If the config file doesn't exist.
yaml.YAMLError: If the file is not valid YAML.
pydantic.ValidationError: If the config doesn't match the schema.
"""
if not config_path.is_file():
logger.error(f"Configuration file not found at: {config_path}")
raise FileNotFoundError(f"Config file not found: {config_path}")
logger.info(f"Loading configuration from: {config_path}")
try:
with open(config_path, 'r') as f:
config_dict = yaml.safe_load(f)
# Validate configuration using Pydantic model
config = MainConfig(**config_dict)
logger.info("Configuration loaded and validated successfully.")
return config
except yaml.YAMLError as e:
logger.error(f"Error parsing YAML file {config_path}: {e}", exc_info=True)
raise
except Exception as e: # Catches Pydantic validation errors too
logger.error(f"Error validating configuration {config_path}: {e}", exc_info=True)
raise
def set_seeds(seed: Optional[int] = 42) -> None:
"""
Set random seeds for reproducibility across libraries.
Args:
seed: The seed value to use. If None, uses default 42.
"""
actual_seed = seed if seed is not None else 42
if seed is None:
logger.warning(f"No random_seed specified in config, using default seed: {actual_seed}")
else:
logger.info(f"Setting random seed from config: {actual_seed}")
random.seed(actual_seed)
np.random.seed(actual_seed)
torch.manual_seed(actual_seed)
# Ensure reproducibility for CUDA operations where possible
if torch.cuda.is_available():
torch.cuda.manual_seed(actual_seed)
torch.cuda.manual_seed_all(actual_seed) # For multi-GPU
# These settings can slow down training but improve reproducibility
# torch.backends.cudnn.deterministic = True
# torch.backends.cudnn.benchmark = False
# PyTorch Lightning seeding (optional, as we seed torch directly)
# pl.seed_everything(seed, workers=True) # workers=True ensures dataloader reproducibility
def aggregate_cv_metrics(all_fold_metrics: List[Dict[str, float]]) -> Dict[str, Dict[str, float]]:
"""
Calculate mean and standard deviation of metrics across folds.
Handles potential NaN values by ignoring them.
Args:
all_fold_metrics: A list where each element is a dictionary of
metrics for one fold (e.g., {'MAE': v1, 'RMSE': v2}).
Returns:
A dictionary where keys are metric names and values are dicts
containing 'mean' and 'std' for that metric across folds.
Example: {'MAE': {'mean': m, 'std': s}, 'RMSE': {'mean': m2, 'std': s2}}
"""
if not all_fold_metrics:
logger.warning("Received empty list for metric aggregation.")
return {}
aggregated: Dict[str, Dict[str, float]] = {}
# Get metric names from the first valid fold's results
first_valid_metrics = next((m for m in all_fold_metrics if m), None)
if not first_valid_metrics:
logger.warning("No valid fold metrics found for aggregation.")
return {}
metric_names = list(first_valid_metrics.keys())
for metric in metric_names:
# Collect values for this metric across all folds, ignoring NaNs
values = [fold_metrics.get(metric) for fold_metrics in all_fold_metrics if fold_metrics and metric in fold_metrics]
valid_values = [v for v in values if v is not None and not np.isnan(v)]
if not valid_values:
logger.warning(f"No valid values found for metric '{metric}' across folds.")
mean_val = np.nan
std_val = np.nan
else:
mean_val = float(np.mean(valid_values))
std_val = float(np.std(valid_values))
logger.debug(f"Aggregated '{metric}': Mean={mean_val:.4f}, Std={std_val:.4f} from {len(valid_values)} folds.")
aggregated[metric] = {'mean': mean_val, 'std': std_val}
return aggregated
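# Illustrative call (values invented): NaNs are dropped per metric before aggregating.
#   aggregate_cv_metrics([{'MAE': 1.0, 'RMSE': 2.0}, {'MAE': 3.0, 'RMSE': float('nan')}])
#   -> {'MAE': {'mean': 2.0, 'std': 1.0}, 'RMSE': {'mean': 2.0, 'std': 0.0}}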
def save_results(results: Dict, filename: Path):
"""Save dictionary results to a JSON file."""
try:
filename.parent.mkdir(parents=True, exist_ok=True)
# Convert numpy types to native Python types for JSON serialization
results_serializable = json.loads(json.dumps(results, cls=NumpyEncoder))
with open(filename, 'w') as f:
json.dump(results_serializable, f, indent=4)
logger.info(f"Saved results to {filename}")
except TypeError as e:
logger.error(f"Serialization error saving results to {filename}. Check for non-serializable types (e.g., numpy types): {e}", exc_info=True)
except Exception as e:
logger.error(f"Failed to save results to {filename}: {e}", exc_info=True)
class NumpyEncoder(json.JSONEncoder):
def default(self, obj):
if isinstance(obj, np.integer):
return int(obj)
elif isinstance(obj, np.floating):
return float(obj)
elif isinstance(obj, np.ndarray):
return obj.tolist()
elif isinstance(obj, (np.bool_, bool)):
return bool(obj)
elif pd.isna(obj): # Handle pandas NaT or numpy NaN gracefully
return None
return super(NumpyEncoder, self).default(obj)
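# Illustrative round-trip (path and values invented): numpy types are converted to native Python
# types before writing, so
#   save_results({"MAE": np.float64(1.23), "per_fold": np.array([1, 2])}, Path("out/metrics.json"))
# writes {"MAE": 1.23, "per_fold": [1, 2]}.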