intermediate backup

2025-05-02 14:36:19 +02:00
parent 980696aef5
commit 2b0a5728d4
16 changed files with 2780 additions and 316 deletions
--- a/forecasting_config.yaml
+++ b/forecasting_config.yaml
@@ -1,22 +1,88 @@
-# Configuration for the forecasting model EDA
-# This file defines the settings for data loading, analysis, and visualization
+# Configuration for Time Series Forecasting Pipeline

-# -- General Settings --
-log_level: INFO  # Logging level (DEBUG, INFO, WARNING, ERROR, CRITICAL)
-debug: true
+project_name: "TimeSeriesForecasting" # Name for the project/run
+random_seed: 42 # Optional: Global random seed for reproducibility

-# -- IO Settings --
-data_file: data/Day-ahead_Prices_60min.csv               # Path to the input data CSV relative to project root
-output_dir: output/reports                               # Directory to save generated plots and report artifacts
-latex_template_file: null                                # Path to the LaTeX template file relative to project root
+# --- Data Loading Configuration ---
+data:
+  data_path: "data/Day-ahead_Prices_60min.csv"      # Path to your CSV
+  # --- Raw Data Specifics ---
+  raw_datetime_col: "MTU (CET/CEST)"                # EXACT name in your raw CSV
+  raw_target_col: "Day-ahead Price [EUR/MWh]"       # EXACT name in your raw CSV
+  raw_datetime_format: '%d.%m.%Y %H:%M'           # Datetime format of raw_datetime_col. NOTE: this format is reportedly now hardcoded in load_raw_data (based on analysis), so this value may be ignored - confirm

+  # --- Standardized Names & Processing ---
+  datetime_col: "Timestamp" # Desired name for the index after processing
+  target_col: "Price" # Desired name for the target column after processing
+  expected_frequency: "h" # Expected frequency ('h', 'D', '15min', etc. or null)
+  fill_initial_target_nans: true # Fill target NaNs immediately after loading?

-# -- Zoom Settings (Plotting and Analysis) --
-# Optional: Specify a date range for zoomed-in plots (YYYY-MM-DD format)
-# Example: zoom_start_date: "2023-01-01"
-# Example: zoom_end_date: "2023-12-31"
-zoom_start_date: null                                    # Default to null
-zoom_end_date: null                                      # Default to null
+# --- Feature Engineering & Preprocessing Configuration ---
+features:
+  sequence_length: 72 # REQUIRED: Lookback window size (e.g., 72 hours = 3 days)
+  forecast_horizon: 24 # REQUIRED: Number of steps ahead to predict (e.g., 24 hours)
+  lags: [24, 48, 72, 168] # List of lag features to create (e.g., 1 day, 2 days, 3 days, 1 week)
+  rolling_window_sizes: [24, 72, 168] # List of window sizes for rolling stats (mean, std)
+  use_time_features: true # Create calendar features (hour, dayofweek, month, etc.)?
+  sinus_curve: true # Create sine feature for time of day?
+  cosin_curve: true # Create cosine feature for time of day?
+  fill_nan: 'ffill' # Method to fill NaNs created by lags/rolling windows ('ffill', 'bfill', 0, etc.)
+  scaling_method: 'standard' # Scaling method: 'standard', 'minmax', or null for no scaling (use YAML null; a bare None is read as the string "None"). The scaler is fit per fold.

-# -- Data Settings --
-expected_data_frequency: "h"                             # Expected frequency of the time series data (h=hourly, D=daily, M=monthly, Y=yearly)
+  # Optional: Wavelet Transform configuration
+  wavelet_transform:
+    apply: false # Apply wavelet transform?
+    target_or_feature: "target" # Apply to 'target' before other features, or 'feature' after?
+    wavelet_type: "db4" # Type of wavelet (e.g., 'db4', 'sym4')
+    level: 3 # Decomposition level (must be > 0)
+    use_coeffs: ["approx", "detail_1"] # Which coefficients to use as features
+
+  # Optional: Feature Clipping configuration
+  clipping:
+    apply: false # Apply clipping to generated features (excluding target)?
+    clip_min: 0 # Minimum value for clipping
+    clip_max: 400 # Maximum value for clipping
+
+# --- Model Architecture Configuration ---
+model:
+  # input_size: null # Removed: Calculated automatically based on features and passed directly to model
+  hidden_size: 128 # REQUIRED: Number of units in LSTM hidden layers
+  num_layers: 2 # REQUIRED: Number of LSTM layers
+  dropout: 0.2 # REQUIRED: Dropout rate (between 0.0 and 1.0)
+  use_residual_skips: false # Add residual connection from input to LSTM output?
+  # forecast_horizon: null # Set automatically from features.forecast_horizon
+
+# --- Training Configuration (PyTorch Lightning) ---
+training:
+  batch_size: 64 # REQUIRED: Batch size for training
+  epochs: 50 # REQUIRED: Max number of training epochs per fold
+  learning_rate: 0.001 # REQUIRED: Initial learning rate for Adam optimizer
+  loss_function: "MSE" # Loss function ('MSE' or 'MAE')
+  early_stopping_patience: 10 # Optional: Patience for early stopping (epochs). Set to null to disable. Must be >= 1 if set.
+  scheduler_step_size: null # Optional: Step size for StepLR scheduler (epochs). Set to null to disable. Must be > 0 if set.
+  scheduler_gamma: null # Optional: Gamma factor for StepLR scheduler. Set to null to disable. Must satisfy 0 < gamma < 1 if set.
+  gradient_clip_val: 1.0 # Optional: Value for gradient clipping. Set to null to disable. Must be >= 0.0 if set.
+  num_workers: 0 # Number of workers for DataLoader (>= 0). 0 means data loading happens in the main process.
+  precision: 32 # Training precision (16, 32, 64, 'bf16')
+
+# --- Cross-Validation Configuration (Rolling Window) ---
+cross_validation:
+  n_splits: 5 # REQUIRED: Number of CV folds (must be > 0)
+  test_size_fraction: 0.1 # REQUIRED: Fraction of the *fixed training window size* for the test set (0 < frac < 1)
+  val_size_fraction: 0.1 # REQUIRED: Fraction of the *fixed training window size* for the validation set (0 < frac < 1)
+  initial_train_size: null # Optional: Size of the fixed training window (integer samples or float fraction of total data > 0). If null, estimated automatically.
+
+# --- Evaluation Configuration ---
+evaluation:
+  eval_batch_size: 128 # REQUIRED: Batch size for evaluation/testing (must be > 0)
+  save_plots: true # Save evaluation plots (predictions, residuals)?
+  plot_sample_size: 1000 # Optional: Max number of points in time series plots (must be > 0 if set)
+
+# --- Optuna Hyperparameter Optimization Configuration ---
+optuna:
+  enabled: false # Enable Optuna HPO? If true, requires optuna.py script.
+  n_trials: 20 # Number of trials to run (must be > 0)
+  storage: null # Optional: Optuna storage URL (e.g., "sqlite:///output/hpo_results/study.db"). If null, uses in-memory.
+  direction: "minimize" # Optimization direction ('minimize' or 'maximize')
+  metric_to_optimize: "val_mae_orig_scale" # Metric logged by LightningModule to optimize
+  pruning: true # Enable Optuna trial pruning?