# entrix_case_challange/forecasting_config.yaml

# Configuration for Time Series Forecasting Pipeline

# Name for the project/run.
project_name: 'TimeSeriesForecasting'
# Optional: global random seed for reproducibility.
random_seed: 42
# Logging verbosity: INFO or DEBUG.
log_level: INFO
# Base directory for all outputs (logs, models, results).
output_dir: 'output'

# --- Execution Control ---
# Run the main cross-validation loop?
run_cross_validation: true
# Run a single classic train/val/test split?
run_classic_training: true
# Run ensemble evaluation? Requires run_cross_validation to be true.
run_ensemble_evaluation: true
# --- End Execution Control ---

# --- Data Loading Configuration ---
data:
  # Path to the input CSV file.
  data_path: 'data/Day-ahead_Prices_60min.csv'

  # --- Raw Data Specifics ---
  # Datetime column name, EXACTLY as it appears in the raw CSV.
  raw_datetime_col: 'MTU (CET/CEST)'
  # Target column name, EXACTLY as it appears in the raw CSV.
  raw_target_col: 'Day-ahead Price [EUR/MWh]'
  # Format string for the raw datetime column.
  # NOTE(review): an earlier comment on this key stated that the format
  # string is now hardcoded in load_raw_data - confirm whether this value
  # is still read by the loader.
  raw_datetime_format: '%d.%m.%Y %H:%M'

  # --- Standardized Names & Processing ---
  # Desired name for the index after processing.
  datetime_col: 'Timestamp'
  # Desired name for the target column after processing.
  target_col: 'Price'
  # Expected frequency ('h', 'D', '15min', etc.), or null.
  expected_frequency: 'h'
  # Fill target NaNs immediately after loading?
  fill_initial_target_nans: true

# --- Feature Engineering & Preprocessing Configuration ---
features:
  # REQUIRED: lookback window size (e.g., 72 hours = 3 days).
  # The window includes all features and lags.
  sequence_length: 72
  # REQUIRED: list of steps ahead to predict.
  # Currently every hourly step from 1 to 24.
  forecast_horizon:
    [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12,
     13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24]
  # Lag features to create, in hours (168 = 1 week).
  lags: [1, 2, 3, 24, 48, 168]
  # Window sizes for rolling statistics (mean, std).
  rolling_window_sizes: [72, 168]
  # Create calendar features (hour, dayofweek, month, etc.)?
  use_time_features: true
  # Create a sinusoidal feature for time of day?
  sinus_curve: true
  # Create a cosinusoidal feature for time of day?
  cosine_curve: true
  # Method used to fill NaNs created by lags/rolling windows
  # ('ffill', 'bfill', 0, etc.).
  fill_nan: 'ffill'
  # Scaling method, fit per fold: 'standard', 'minmax', or null for none.
  # Write YAML `null` to disable; a bare `None` is parsed as the string "None".
  scaling_method: 'standard'

  # Optional: wavelet transform settings.
  wavelet_transform:
    # Apply the wavelet transform?
    apply: false
    # 'target' applies it to the target before other features are built;
    # 'feature' applies it afterwards.
    target_or_feature: 'target'
    # Wavelet type (e.g., 'db4', 'sym4').
    wavelet_type: 'db4'
    # Decomposition level (must be > 0).
    level: 3
    # Which coefficients to use as features.
    use_coeffs: ['approx', 'detail_1']

  # Optional: feature clipping settings.
  clipping:
    # Apply clipping to generated features (excluding the target)?
    apply: false
    # Minimum value for clipping.
    clip_min: 0
    # Maximum value for clipping.
    clip_max: 400

# --- Model Architecture Configuration ---
model:
  # input_size is intentionally absent: it is calculated automatically from
  # the features and passed directly to the model.
  # input_size: null

  # REQUIRED: number of units in the LSTM hidden layers.
  hidden_size: 128
  # REQUIRED: number of LSTM layers.
  num_layers: 2
  # REQUIRED: dropout rate (between 0.0 and 1.0).
  dropout: 0.2
  # Add a residual connection from the input to the LSTM output?
  use_residual_skips: false

  # forecast_horizon is set automatically from features.forecast_horizon.
  # forecast_horizon: null

# --- Training Configuration (PyTorch Lightning) ---
training:
  # REQUIRED: batch size for training.
  batch_size: 64
  # REQUIRED: maximum number of training epochs per fold.
  epochs: 72
  # REQUIRED: initial learning rate for the Adam optimizer.
  learning_rate: 0.0001
  # Presumably: run validation every N epochs - TODO confirm against the
  # trainer setup.
  check_val_n_epoch: 3
  # Loss function: 'MSE' or 'MAE'.
  loss_function: 'MSE'
  # Optional: patience for early stopping (epochs). Must be >= 1 if set;
  # use null to disable.
  # NOTE(review): if this value feeds Lightning's EarlyStopping, patience is
  # counted in validation checks, so with check_val_n_epoch: 3 it would span
  # 30 epochs - confirm.
  early_stopping_patience: 10
  # Optional: step size for the StepLR scheduler (epochs). Must be > 0 if
  # set; use null to disable.
  scheduler_step_size: null
  # Optional: gamma factor for the StepLR scheduler. Must satisfy
  # 0 < gamma < 1 if set; use null to disable.
  scheduler_gamma: null
  # Optional: value for gradient clipping. Must be >= 0.0 if set; use null
  # to disable.
  gradient_clip_val: 1.0
  # Number of DataLoader workers (>= 0). 0 means data loading happens in
  # the main process.
  num_workers: 4
  # Training precision (16, 32, 64, 'bf16').
  precision: 32

# --- Cross-Validation Configuration (Rolling Window) ---
cross_validation:
  # REQUIRED: number of CV folds (must be > 0).
  n_splits: 3
  # REQUIRED: test-set size as a fraction of the *fixed training window
  # size* (0 < frac < 1).
  test_size_fraction: 0.1
  # REQUIRED: validation-set size as a fraction of the *fixed training
  # window size* (0 < frac < 1).
  val_size_fraction: 0.1
  # Optional: size of the fixed training window, given either as an integer
  # number of samples or as a float fraction of the total data (> 0).
  # If null, it is estimated automatically.
  initial_train_size: null

# --- Evaluation Configuration ---
evaluation:
  # REQUIRED: batch size for evaluation/testing (must be > 0).
  eval_batch_size: 128
  # Save evaluation plots (predictions, residuals)?
  save_plots: true
  # Optional: maximum number of points in time series plots
  # (must be > 0 if set).
  plot_sample_size: 1000

# --- Optuna Hyperparameter Optimization Configuration ---
optuna:
  # Set to true to actually run HPO via optuna_run.py.
  enabled: true
  # Specific name for this study.
  study_name: 'lstm_price_forecast'
  # Number of trials to run.
  n_trials: 100
  # Path to the database file.
  storage: 'sqlite:///study_v1.db'
  # Optimization direction: 'minimize' or 'maximize'.
  direction: 'minimize'
  # Metric to optimize; logged in validation_step.
  metric_to_optimize: 'val_MeanAbsoluteError'
  # Enable pruning.
  pruning: true