intermediate backup

2025-05-02 14:36:19 +02:00
parent 980696aef5
commit 2b0a5728d4
16 changed files with 2780 additions and 316 deletions

forecasting_model_run.py (new file)

@@ -0,0 +1,468 @@
import argparse
import logging
import sys
import os
import random
from pathlib import Path
import time
import json
import numpy as np
import pandas as pd
import torch
import yaml
import pytorch_lightning as pl
from pytorch_lightning.callbacks import EarlyStopping, ModelCheckpoint, LearningRateMonitor
from pytorch_lightning.loggers import CSVLogger
# Import necessary components from your project structure
# Assuming forecasting_model is a package installable or in PYTHONPATH
from forecasting_model.utils.config_model import MainConfig
from forecasting_model.data_processing import (
    load_raw_data,
    TimeSeriesCrossValidationSplitter,
    prepare_fold_data_and_loaders
)
from forecasting_model.model import LSTMForecastLightningModule
from forecasting_model.evaluation import evaluate_fold_predictions
from typing import Dict, List, Any, Optional
# Silence overly verbose libraries if needed
mpl_logger = logging.getLogger('matplotlib')
mpl_logger.setLevel(logging.WARNING)
pil_logger = logging.getLogger('PIL')
pil_logger.setLevel(logging.WARNING)
# --- Basic Logging Setup ---
# Configure logging early. Level might be adjusted by config.
logging.basicConfig(level=logging.INFO,
                    format='%(asctime)s - %(levelname)-7s - %(message)s',
                    datefmt='%H:%M:%S')
# Get the root logger
logger = logging.getLogger()
# --- Argument Parsing ---
def parse_arguments():
    """Parses command-line arguments."""
    parser = argparse.ArgumentParser(
        description="Run the Time Series Forecasting training pipeline.",
        formatter_class=argparse.ArgumentDefaultsHelpFormatter
    )
    parser.add_argument(
        '-c', '--config',
        type=str,
        default='config.yaml',
        help="Path to the YAML configuration file."
    )
    parser.add_argument(
        '--seed',
        type=int,
        default=None,  # Default to None, use config value if not provided
        help="Override random seed defined in config."
    )
    parser.add_argument(
        '--debug',
        action='store_true',
        help="Override log level to DEBUG."
    )
    parser.add_argument(
        '--output-dir',
        type=str,
        default='output/cv_results',  # Default output base directory
        help="Base directory for saving cross-validation results (checkpoints, logs, plots)."
    )
    args = parser.parse_args()
    return args

# --- Helper Functions ---
def load_config(config_path: Path) -> MainConfig:
    """
    Load and validate configuration from a YAML file using Pydantic.

    Args:
        config_path: Path to the YAML configuration file.

    Returns:
        Validated MainConfig object.

    Raises:
        FileNotFoundError: If the config file doesn't exist.
        yaml.YAMLError: If the file is not valid YAML.
        pydantic.ValidationError: If the config doesn't match the schema.
    """
    if not config_path.is_file():
        logger.error(f"Configuration file not found at: {config_path}")
        raise FileNotFoundError(f"Config file not found: {config_path}")
    logger.info(f"Loading configuration from: {config_path}")
    try:
        with open(config_path, 'r') as f:
            config_dict = yaml.safe_load(f)
        # Validate configuration using the Pydantic model
        config = MainConfig(**config_dict)
        logger.info("Configuration loaded and validated successfully.")
        return config
    except yaml.YAMLError as e:
        logger.error(f"Error parsing YAML file {config_path}: {e}", exc_info=True)
        raise
    except Exception as e:  # Catches Pydantic validation errors too
        logger.error(f"Error validating configuration {config_path}: {e}", exc_info=True)
        raise

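# Note (illustrative only, not taken from the repo): the authoritative schema is
# forecasting_model.utils.config_model.MainConfig. Based purely on the fields this
# script reads (data.target_col, cross_validation.n_splits, training.epochs,
# training.early_stopping_patience, random_seed), a config.yaml might look roughly
# like the sketch below; all other nested keys are placeholders, not the real schema.
#
#   random_seed: 42
#   data:
#     target_col: "target"        # column to forecast (assumed key name)
#   features: {}                  # feature engineering settings
#   model: {}                     # LSTM hyperparameters (input_size is NOT set here)
#   training:
#     epochs: 50
#     early_stopping_patience: 10
#   cross_validation:
#     n_splits: 5
#   evaluation: {}                # metrics / plotting settings
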
def set_seeds(seed: Optional[int] = 42) -> None:
    """
    Set random seeds for reproducibility across libraries.

    Args:
        seed: The seed value to use. If None, uses the default of 42.
    """
    if seed is None:
        seed = 42
        logger.warning(f"No seed provided, using default seed: {seed}")
    else:
        logger.info(f"Setting random seed: {seed}")
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    # Ensure reproducibility for CUDA operations where possible
    if torch.cuda.is_available():
        torch.cuda.manual_seed(seed)
        torch.cuda.manual_seed_all(seed)  # For multi-GPU
        # These settings can slow down training but improve reproducibility
        # torch.backends.cudnn.deterministic = True
        # torch.backends.cudnn.benchmark = False
    # PyTorch Lightning seeding (optional, as we seed torch directly)
    # pl.seed_everything(seed, workers=True)  # workers=True ensures dataloader reproducibility

def aggregate_cv_metrics(all_fold_metrics: List[Dict[str, float]]) -> Dict[str, Dict[str, float]]:
    """
    Calculate mean and standard deviation of metrics across folds.
    Handles potential NaN values by ignoring them.

    Args:
        all_fold_metrics: A list where each element is a dictionary of
            metrics for one fold (e.g., {'MAE': v1, 'RMSE': v2}).

    Returns:
        A dictionary where keys are metric names and values are dicts
        containing 'mean' and 'std' for that metric across folds.
        Example: {'MAE': {'mean': m, 'std': s}, 'RMSE': {'mean': m2, 'std': s2}}
    """
    if not all_fold_metrics:
        logger.warning("Received empty list for metric aggregation.")
        return {}
    aggregated: Dict[str, Dict[str, float]] = {}
    # Get metric names from the first valid fold's results
    first_valid_metrics = next((m for m in all_fold_metrics if m), None)
    if not first_valid_metrics:
        logger.warning("No valid fold metrics found for aggregation.")
        return {}
    metric_names = list(first_valid_metrics.keys())
    for metric in metric_names:
        # Collect values for this metric across all folds, ignoring NaNs
        values = [fold_metrics.get(metric) for fold_metrics in all_fold_metrics if fold_metrics and metric in fold_metrics]
        valid_values = [v for v in values if v is not None and not np.isnan(v)]
        if not valid_values:
            logger.warning(f"No valid values found for metric '{metric}' across folds.")
            mean_val = np.nan
            std_val = np.nan
        else:
            mean_val = float(np.mean(valid_values))
            std_val = float(np.std(valid_values))
            logger.debug(f"Aggregated '{metric}': Mean={mean_val:.4f}, Std={std_val:.4f} from {len(valid_values)} folds.")
        aggregated[metric] = {'mean': mean_val, 'std': std_val}
    return aggregated

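# Worked example (illustrative): with np.std's default population std (ddof=0),
#   aggregate_cv_metrics([{'MAE': 1.0, 'RMSE': 2.0},
#                         {'MAE': 3.0, 'RMSE': float('nan')}])
# returns {'MAE': {'mean': 2.0, 'std': 1.0}, 'RMSE': {'mean': 2.0, 'std': 0.0}};
# the NaN RMSE in the second fold is ignored rather than propagated.
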
def save_results(results: Dict, filename: Path):
    """Save dictionary results to a JSON file."""
    try:
        filename.parent.mkdir(parents=True, exist_ok=True)
        with open(filename, 'w') as f:
            json.dump(results, f, indent=4)
        logger.info(f"Saved results to {filename}")
    except Exception as e:
        logger.error(f"Failed to save results to {filename}: {e}", exc_info=True)

# --- Main Training & Evaluation Function ---
def run_training_pipeline(config: MainConfig, output_base_dir: Path):
    """Runs the full cross-validation training and evaluation pipeline."""
    start_time = time.perf_counter()

    # --- Data Loading ---
    try:
        df = load_raw_data(config.data)
    except Exception as e:
        logger.critical(f"Failed to load raw data: {e}", exc_info=True)
        sys.exit(1)  # Cannot proceed without data

    # --- Cross-Validation Setup ---
    try:
        cv_splitter = TimeSeriesCrossValidationSplitter(config.cross_validation, len(df))
    except ValueError as e:
        logger.critical(f"Failed to initialize CV splitter: {e}", exc_info=True)
        sys.exit(1)

    all_fold_test_metrics: List[Dict[str, float]] = []
    all_fold_best_val_scores: Dict[int, Optional[float]] = {}  # Store best val score per fold

    # --- Cross-Validation Loop ---
    logger.info(f"Starting {config.cross_validation.n_splits}-Fold Cross-Validation...")
    for fold_num, (train_idx, val_idx, test_idx) in enumerate(cv_splitter.split()):
        fold_start_time = time.perf_counter()
        fold_id = fold_num + 1
        logger.info(f"--- Starting Fold {fold_id}/{config.cross_validation.n_splits} ---")
        fold_output_dir = output_base_dir / f"fold_{fold_id:02d}"
        fold_output_dir.mkdir(parents=True, exist_ok=True)
        logger.debug(f"Fold output directory: {fold_output_dir}")

        try:
            # --- Per-Fold Data Preparation ---
            logger.info("Preparing data loaders for the fold...")
            train_loader, val_loader, test_loader, target_scaler, input_size = prepare_fold_data_and_loaders(
                full_df=df,
                train_idx=train_idx,
                val_idx=val_idx,
                test_idx=test_idx,
                target_col=config.data.target_col,  # Pass target col name explicitly
                feature_config=config.features,
                train_config=config.training,
                eval_config=config.evaluation
            )
            logger.info(f"Data loaders prepared. Input size determined: {input_size}")

            # --- Model Initialization ---
            # Pass input_size directly; ModelConfig no longer holds it.
            # Forecast horizon consistency is checked during MainConfig validation.
            current_model_config = config.model  # Use the validated model config
            model = LSTMForecastLightningModule(
                model_config=current_model_config,  # Does not contain input_size
                train_config=config.training,
                input_size=input_size,  # Pass the dynamically determined input_size
                target_scaler=target_scaler  # Pass the fold-specific scaler
            )
            logger.info("LSTMForecastLightningModule initialized.")

            # --- PyTorch Lightning Callbacks ---
            # Monitor the validation MAE on the original scale (logged by the LightningModule)
            monitor_metric = "val_mae_orig_scale"
            monitor_mode = "min"

            early_stop_callback = None
            if config.training.early_stopping_patience is not None and config.training.early_stopping_patience > 0:
                early_stop_callback = EarlyStopping(
                    monitor=monitor_metric,
                    min_delta=0.0001,  # Minimum change to qualify as improvement
                    patience=config.training.early_stopping_patience,
                    verbose=True,
                    mode=monitor_mode
                )
                logger.info(f"Enabled EarlyStopping: monitor='{monitor_metric}', patience={config.training.early_stopping_patience}")

            # Checkpoint callback to save the best model based on the validation metric
            checkpoint_callback = ModelCheckpoint(
                dirpath=fold_output_dir / "checkpoints",
                filename=f"best_model_fold_{fold_id}",  # patterns like {epoch}-{val_loss:.2f} are also possible
                save_top_k=1,
                monitor=monitor_metric,
                mode=monitor_mode,
                verbose=True
            )
            logger.info(f"Enabled ModelCheckpoint: monitor='{monitor_metric}', mode='{monitor_mode}'")

            # Learning rate monitor callback
            lr_monitor = LearningRateMonitor(logging_interval='epoch')
            callbacks = [checkpoint_callback, lr_monitor]
            if early_stop_callback:
                callbacks.append(early_stop_callback)

            # --- PyTorch Lightning Logger ---
            # Log metrics to a CSV file within the fold directory
            pl_logger = CSVLogger(save_dir=str(output_base_dir), name=f"fold_{fold_id:02d}", version='logs')
            logger.info(f"Using CSVLogger, logs will be saved in: {pl_logger.log_dir}")

            # --- PyTorch Lightning Trainer ---
            # Determine accelerator and devices based on PyTorch check
            accelerator = 'gpu' if torch.cuda.is_available() else 'cpu'
            devices = 1 if accelerator == 'gpu' else None  # Or specify specific GPU IDs: [0], [1], etc.
            precision = getattr(config.training, 'precision', 32)  # Default to 32-bit

            trainer = pl.Trainer(
                accelerator=accelerator,
                devices=devices,
                max_epochs=config.training.epochs,
                callbacks=callbacks,
                logger=pl_logger,
                log_every_n_steps=max(1, len(train_loader) // 10),  # Log ~10 times per epoch
                enable_progress_bar=True,  # Set to False for less verbose runs (e.g., HPO)
                gradient_clip_val=getattr(config.training, 'gradient_clip_val', None),
                precision=precision,
                # deterministic=True,  # For stricter reproducibility (can slow down training)
            )
            logger.info(f"Initialized PyTorch Lightning Trainer: accelerator='{accelerator}', devices={devices}, precision={precision}")

            # --- Training ---
            logger.info(f"Starting training for Fold {fold_id}...")
            trainer.fit(model, train_dataloaders=train_loader, val_dataloaders=val_loader)
            logger.info(f"Training finished for Fold {fold_id}.")

            # Store the best validation score for this fold
            best_val_score = trainer.checkpoint_callback.best_model_score
            best_model_path = trainer.checkpoint_callback.best_model_path
            all_fold_best_val_scores[fold_id] = best_val_score.item() if best_val_score is not None else None
            if best_val_score is not None:
                logger.info(f"Best validation score ({monitor_metric}) for Fold {fold_id}: {all_fold_best_val_scores[fold_id]:.4f}")
                logger.info(f"Best model checkpoint path: {best_model_path}")
            else:
                logger.warning(f"Could not retrieve best validation score/path for Fold {fold_id} (metric: {monitor_metric}). Evaluation might use the last model.")
                best_model_path = None  # Ensure evaluation doesn't try to load 'best' if checkpointing failed

            # --- Prediction on Test Set ---
            # Use trainer.predict() to get model outputs
            logger.info(f"Starting prediction for Fold {fold_id} using best checkpoint...")
            # predict_step returns a dict {'preds_scaled': ..., 'targets_scaled': ...}
            # We pass the test_loader here, which yields (x, y) pairs, so predict_step will include targets
            prediction_results_list = trainer.predict(
                # model=model,  # Not needed when using ckpt_path
                ckpt_path=best_model_path if best_model_path else 'last',  # Load the best model, or the last one if best failed
                dataloaders=test_loader
                # return_predictions=True  # Default is True
            )

            # Check if prediction returned results
            if not prediction_results_list:
                logger.error(f"Predict phase did not return any results for Fold {fold_id}. Check predict_step and logs.")
                fold_metrics = {'MAE': np.nan, 'RMSE': np.nan}
            else:
                try:
                    # Concatenate predictions and targets from predict_step results
                    # (.cpu() guards against tensors that predict_step left on the GPU)
                    all_preds_scaled = torch.cat([batch_res['preds_scaled'] for batch_res in prediction_results_list], dim=0).cpu().numpy()
                    # Check if targets were included (they should be when using test_loader)
                    if 'targets_scaled' in prediction_results_list[0]:
                        all_targets_scaled = torch.cat([batch_res['targets_scaled'] for batch_res in prediction_results_list], dim=0).cpu().numpy()
                    else:
                        # This case shouldn't happen when using test_loader, but it is a useful safeguard
                        logger.error(f"Targets not found in prediction results for Fold {fold_id}. Cannot evaluate.")
                        raise ValueError("Targets missing from prediction results.")

                    # --- Final Evaluation & Plotting ---
                    logger.info(f"Processing prediction results for Fold {fold_id}...")
                    fold_metrics = evaluate_fold_predictions(
                        y_true_scaled=all_targets_scaled,
                        y_pred_scaled=all_preds_scaled,
                        target_scaler=target_scaler,  # Use the scaler from this fold
                        eval_config=config.evaluation,
                        fold_num=fold_num,  # Pass the zero-based index
                        output_dir=output_base_dir,  # Base dir for saving plots etc.
                        # time_index=df.iloc[test_idx].index  # Pass time index if needed
                    )
                    # Save fold metrics
                    save_results(fold_metrics, fold_output_dir / "test_metrics.json")
                except KeyError as e:
                    logger.error(f"KeyError processing prediction results for Fold {fold_id}: missing key {e}. Check the predict_step return format.", exc_info=True)
                    fold_metrics = {'MAE': np.nan, 'RMSE': np.nan}
                except Exception as e:
                    logger.error(f"Error processing prediction results for Fold {fold_id}: {e}", exc_info=True)
                    fold_metrics = {'MAE': np.nan, 'RMSE': np.nan}

            all_fold_test_metrics.append(fold_metrics)

            # --- (Optional) Log final test metrics using trainer.test() ---
            # If you want the metrics logged by test_step aggregated, call test now.
            # logger.info(f"Logging final test metrics via trainer.test() for Fold {fold_id}...")
            # try:
            #     trainer.test(ckpt_path=best_model_path if best_model_path else 'last', dataloaders=test_loader, verbose=False)
            # except Exception as e:
            #     logger.warning(f"trainer.test() call failed for Fold {fold_id}: {e}")

        except Exception as e:
            # Catch errors during fold processing (data prep, training, prediction, evaluation)
            logger.error(f"An error occurred during the Fold {fold_id} pipeline: {e}", exc_info=True)
            all_fold_test_metrics.append({'MAE': np.nan, 'RMSE': np.nan})

        # --- Cleanup per fold ---
        if torch.cuda.is_available():
            torch.cuda.empty_cache()
            logger.debug("Cleared CUDA cache.")

        fold_end_time = time.perf_counter()
        logger.info(f"--- Finished Fold {fold_id} in {fold_end_time - fold_start_time:.2f} seconds ---")

    # --- Aggregation and Final Reporting ---
    logger.info("Cross-validation finished. Aggregating results...")
    aggregated_metrics = aggregate_cv_metrics(all_fold_test_metrics)

    # Save aggregated results
    final_results = {
        'aggregated_test_metrics': aggregated_metrics,
        'per_fold_test_metrics': all_fold_test_metrics,
        'per_fold_best_val_scores': all_fold_best_val_scores,
    }
    save_results(final_results, output_base_dir / "aggregated_cv_results.json")

    # Log final results
    logger.info("--- Aggregated Cross-Validation Test Results ---")
    if aggregated_metrics:
        for metric, stats in aggregated_metrics.items():
            logger.info(f"{metric}: {stats['mean']:.4f} ± {stats['std']:.4f}")
    else:
        logger.warning("No metrics available for aggregation.")
    logger.info("-------------------------------------------------")

    end_time = time.perf_counter()
    logger.info(f"Training pipeline finished successfully in {end_time - start_time:.2f} seconds.")

# --- Main Execution ---
def run():
    """Main execution function."""
    args = parse_arguments()
    config_path = Path(args.config)
    output_dir = Path(args.output_dir)

    # Adjust log level if the debug flag is set
    if args.debug:
        logger.setLevel(logging.DEBUG)
        logger.debug("# --- Debug mode enabled. --- #")

    # --- Configuration Loading ---
    try:
        config = load_config(config_path)
    except Exception:
        # Error already logged in load_config
        sys.exit(1)

    # --- Seed Setting ---
    # Use the command-line seed if provided, otherwise fall back to the config seed
    seed = args.seed if args.seed is not None else getattr(config, 'random_seed', 42)
    set_seeds(seed)

    # --- Pipeline Execution ---
    try:
        run_training_pipeline(config, output_dir)
    except SystemExit as e:
        logger.warning(f"Pipeline exited with code {e.code}.")
        sys.exit(e.code)  # Propagate exit code
    except Exception as e:
        logger.critical(f"A critical error occurred during pipeline execution: {e}", exc_info=True)
        sys.exit(1)


if __name__ == "__main__":
    run()
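
# Example invocations (assuming the forecasting_model package is importable and a
# config.yaml exists; flags as defined in parse_arguments above):
#   python forecasting_model_run.py --config config.yaml
#   python forecasting_model_run.py -c config.yaml --seed 123 --output-dir output/cv_results
#   python forecasting_model_run.py -c config.yaml --debug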