# entrix_case_challange/forecasting_model_run.py

import argparse
import logging
import sys
import os
import random
from pathlib import Path
import time
import json
import numpy as np
import pandas as pd
import torch
import yaml
import pytorch_lightning as pl
from pytorch_lightning.callbacks import EarlyStopping, ModelCheckpoint, LearningRateMonitor
from pytorch_lightning.loggers import CSVLogger

# Import necessary components from your project structure
# Assuming forecasting_model is a package installable or in PYTHONPATH
from forecasting_model.utils.config_model import MainConfig
from forecasting_model.data_processing import (
    load_raw_data,
    TimeSeriesCrossValidationSplitter,
    prepare_fold_data_and_loaders
)
from forecasting_model.model import LSTMForecastLightningModule
from forecasting_model.evaluation import evaluate_fold_predictions
from typing import Dict, List, Any, Optional

# Third-party libraries that are chatty at INFO/DEBUG are capped at WARNING
# so their output does not drown the pipeline's own messages.
mpl_logger = logging.getLogger('matplotlib')
pil_logger = logging.getLogger('PIL')
for _noisy_logger in (mpl_logger, pil_logger):
    _noisy_logger.setLevel(logging.WARNING)
del _noisy_logger  # keep the module namespace free of the loop variable

# --- Basic Logging Setup ---
# Root logging is configured at import time, starting at INFO; run() lowers
# the threshold to DEBUG when the --debug flag is given.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)-7s - %(message)s',
    datefmt='%H:%M:%S',
)
# All functions in this module log through the root logger.
logger = logging.getLogger()

# --- Argument Parsing ---
def parse_arguments(argv: Optional[List[str]] = None) -> argparse.Namespace:
    """Parses command-line arguments.

    Args:
        argv: Argument strings to parse, without the program name. When None
            (the default) argparse reads ``sys.argv[1:]``, which is the
            behaviour existing callers rely on. Passing an explicit list
            lets the parser be driven programmatically.

    Returns:
        Namespace with the attributes ``config`` (str), ``seed``
        (int or None), ``debug`` (bool) and ``output_dir`` (str).
    """
    parser = argparse.ArgumentParser(
        description="Run the Time Series Forecasting training pipeline.",
        formatter_class=argparse.ArgumentDefaultsHelpFormatter
    )
    parser.add_argument(
        '-c', '--config',
        type=str,
        default='config.yaml',
        help="Path to the YAML configuration file."
    )
    parser.add_argument(
        '--seed',
        type=int,
        default=None,  # None means "not overridden"; the caller falls back to the config value
        help="Override random seed defined in config."
    )
    parser.add_argument(
        '--debug',
        action='store_true',
        help="Override log level to DEBUG."
    )
    parser.add_argument(
        '--output-dir',
        type=str,
        default='output/cv_results',  # Default output base directory
        help="Base directory for saving cross-validation results (checkpoints, logs, plots)."
    )

    return parser.parse_args(argv)

# --- Helper Functions ---

def load_config(config_path: Path) -> MainConfig:
    """
    Load and validate configuration from YAML file using Pydantic.

    Args:
        config_path: Path to the YAML configuration file.

    Returns:
        Validated MainConfig object.

    Raises:
        FileNotFoundError: If the config file doesn't exist.
        yaml.YAMLError: If the file is not valid YAML.
        TypeError: If the YAML document is empty or its top level is not a
            mapping (so it cannot be expanded into MainConfig keywords).
        pydantic.ValidationError: If the config doesn't match the schema.
    """
    if not config_path.is_file():
        logger.error(f"Configuration file not found at: {config_path}")
        raise FileNotFoundError(f"Config file not found: {config_path}")

    logger.info(f"Loading configuration from: {config_path}")
    try:
        # Decode explicitly as UTF-8 so the result does not depend on the
        # platform's default locale encoding.
        with open(config_path, 'r', encoding='utf-8') as f:
            config_dict = yaml.safe_load(f)

        # safe_load returns None for an empty file and a list/scalar for
        # non-mapping documents; fail with a clear message instead of the
        # opaque "argument after ** must be a mapping" error.
        if not isinstance(config_dict, dict):
            raise TypeError(
                f"Configuration file {config_path} must contain a top-level mapping, "
                f"got {type(config_dict).__name__}."
            )

        # Validate configuration using Pydantic model
        config = MainConfig(**config_dict)
        logger.info("Configuration loaded and validated successfully.")
        return config
    except yaml.YAMLError as e:
        logger.error(f"Error parsing YAML file {config_path}: {e}", exc_info=True)
        raise
    except Exception as e:  # Catches Pydantic validation errors too
        logger.error(f"Error validating configuration {config_path}: {e}", exc_info=True)
        raise

def set_seeds(seed: Optional[int] = 42) -> None:
    """
    Seed Python's `random`, NumPy and PyTorch with the same value.

    Args:
        seed: Seed to apply. Passing None selects 42 and logs a warning.
    """
    if seed is not None:
        logger.info(f"Setting random seed: {seed}")
    else:
        seed = 42
        logger.warning(f"No seed provided, using default seed: {seed}")

    # CPU-side generators, seeded in a fixed order.
    for seed_fn in (random.seed, np.random.seed, torch.manual_seed):
        seed_fn(seed)

    # CUDA generators are only touched when a GPU is actually present.
    if torch.cuda.is_available():
        torch.cuda.manual_seed(seed)
        torch.cuda.manual_seed_all(seed)  # covers every visible GPU
        # cuDNN determinism flags and pl.seed_everything are intentionally
        # left unset here; enabling them trades training speed for stricter
        # reproducibility.

def aggregate_cv_metrics(all_fold_metrics: List[Dict[str, float]]) -> Dict[str, Dict[str, float]]:
    """
    Calculate mean and standard deviation of metrics across folds.
    Handles potential NaN values by ignoring them.

    Metric names are taken from the union of all folds (in first-seen order),
    so a metric that is missing from the first fold -- e.g. because that fold
    failed and only carries placeholder values -- is still aggregated.

    Args:
        all_fold_metrics: A list where each element is a dictionary of
                          metrics for one fold (e.g., {'MAE': v1, 'RMSE': v2}).
                          Empty or None entries are skipped.

    Returns:
        A dictionary where keys are metric names and values are dicts
        containing 'mean' and 'std' for that metric across folds. The
        standard deviation is the population value (NumPy default, ddof=0).
        A metric with no valid value in any fold maps to NaN for both.
        Example: {'MAE': {'mean': m, 'std': s}, 'RMSE': {'mean': m2, 'std': s2}}
    """
    if not all_fold_metrics:
        logger.warning("Received empty list for metric aggregation.")
        return {}

    usable_folds = [fold_metrics for fold_metrics in all_fold_metrics if fold_metrics]

    # A dict is used as an ordered set: keys keep the order in which a metric
    # name was first encountered across the folds.
    metric_names: Dict[str, None] = {}
    for fold_metrics in usable_folds:
        metric_names.update(dict.fromkeys(fold_metrics))

    if not metric_names:
        logger.warning("No valid fold metrics found for aggregation.")
        return {}

    aggregated: Dict[str, Dict[str, float]] = {}
    for metric in metric_names:
        # Collect values for this metric across all folds, ignoring None/NaN
        valid_values = [
            value
            for value in (fold_metrics.get(metric) for fold_metrics in usable_folds)
            if value is not None and not np.isnan(value)
        ]

        if not valid_values:
            logger.warning(f"No valid values found for metric '{metric}' across folds.")
            mean_val = np.nan
            std_val = np.nan
        else:
            mean_val = float(np.mean(valid_values))
            std_val = float(np.std(valid_values))
            logger.debug(f"Aggregated '{metric}': Mean={mean_val:.4f}, Std={std_val:.4f} from {len(valid_values)} folds.")

        aggregated[metric] = {'mean': mean_val, 'std': std_val}

    return aggregated

def _json_default(value: Any) -> Any:
    """Convert NumPy scalars/arrays and paths into JSON-serializable builtins."""
    if isinstance(value, np.generic):
        return value.item()
    if isinstance(value, np.ndarray):
        return value.tolist()
    if isinstance(value, Path):
        return str(value)
    raise TypeError(f"Object of type {type(value).__name__} is not JSON serializable")


def save_results(results: Dict, filename: Path):
    """Save dictionary results to a JSON file.

    The payload is serialized in memory before the target file is opened, so
    a serialization failure never leaves a truncated file behind nor
    overwrites an earlier good one. NumPy scalars/arrays and Path objects
    among the values are converted to plain Python types. NaN values are
    written as the bare token ``NaN`` (Python's json default), which strict
    JSON parsers may reject.

    Saving is best-effort: any failure is logged and swallowed.

    Args:
        results: Dictionary to serialize.
        filename: Destination path; missing parent directories are created.
    """
    try:
        payload = json.dumps(results, indent=4, default=_json_default)
        filename.parent.mkdir(parents=True, exist_ok=True)
        with open(filename, 'w', encoding='utf-8') as f:
            f.write(payload)
        logger.info(f"Saved results to {filename}")
    except Exception as e:
        logger.error(f"Failed to save results to {filename}: {e}", exc_info=True)


# --- Main Training & Evaluation Function ---
def _nan_fold_metrics() -> Dict[str, float]:
    """Return the placeholder metrics recorded for a fold that could not be evaluated."""
    return {'MAE': np.nan, 'RMSE': np.nan}


def run_training_pipeline(config: MainConfig, output_base_dir: Path):
    """Runs the full cross-validation training and evaluation pipeline.

    Loads the raw data, then for every fold yielded by the CV splitter:
    builds the data loaders, trains an LSTMForecastLightningModule with
    checkpointing (plus early stopping when a positive patience is
    configured), predicts on the test loader, evaluates the predictions and
    writes ``test_metrics.json`` into the fold directory. Afterwards the
    per-fold metrics are aggregated and written, together with the per-fold
    best validation scores, to ``aggregated_cv_results.json``.

    An exception inside a fold is logged and recorded as NaN metrics; the
    remaining folds still run.

    Args:
        config: Validated pipeline configuration.
        output_base_dir: Directory that receives one ``fold_XX`` subdirectory
            per fold and the aggregated results file.

    Raises:
        SystemExit: With code 1 when the raw data cannot be loaded or the
            CV splitter raises ValueError during construction.
    """
    start_time = time.perf_counter()

    # --- Data Loading ---
    try:
        df = load_raw_data(config.data)
    except Exception as e:
        logger.critical(f"Failed to load raw data: {e}", exc_info=True)
        sys.exit(1)  # Cannot proceed without data

    # --- Cross-Validation Setup ---
    try:
        cv_splitter = TimeSeriesCrossValidationSplitter(config.cross_validation, len(df))
    except ValueError as e:
        logger.critical(f"Failed to initialize CV splitter: {e}", exc_info=True)
        sys.exit(1)

    all_fold_test_metrics: List[Dict[str, float]] = []
    all_fold_best_val_scores: Dict[int, Optional[float]] = {}  # Store best val score per fold

    # --- Cross-Validation Loop ---
    logger.info(f"Starting {config.cross_validation.n_splits}-Fold Cross-Validation...")
    for fold_num, (train_idx, val_idx, test_idx) in enumerate(cv_splitter.split()):
        fold_start_time = time.perf_counter()
        fold_id = fold_num + 1  # one-based id used for directories and log messages
        logger.info(f"--- Starting Fold {fold_id}/{config.cross_validation.n_splits} ---")

        fold_output_dir = output_base_dir / f"fold_{fold_id:02d}"
        fold_output_dir.mkdir(parents=True, exist_ok=True)
        logger.debug(f"Fold output directory: {fold_output_dir}")

        try:
            # --- Per-Fold Data Preparation ---
            logger.info("Preparing data loaders for the fold...")
            train_loader, val_loader, test_loader, target_scaler, input_size = prepare_fold_data_and_loaders(
                full_df=df,
                train_idx=train_idx,
                val_idx=val_idx,
                test_idx=test_idx,
                target_col=config.data.target_col,  # Pass target col name explicitly
                feature_config=config.features,
                train_config=config.training,
                eval_config=config.evaluation
            )
            logger.info(f"Data loaders prepared. Input size determined: {input_size}")

            # --- Model Initialization ---
            # input_size comes from the prepared fold data, not from the model config.
            current_model_config = config.model

            model = LSTMForecastLightningModule(
                model_config=current_model_config,  # Does not contain input_size
                train_config=config.training,
                input_size=input_size,  # Pass the dynamically determined input_size
                target_scaler=target_scaler  # Pass the fold-specific scaler
            )
            logger.info("LSTMForecastLightningModule initialized.")

            # --- PyTorch Lightning Callbacks ---
            # Metric name the callbacks watch; the LightningModule is expected
            # to log it during validation (original-scale validation MAE).
            monitor_metric = "val_mae_orig_scale"
            monitor_mode = "min"

            early_stop_callback = None
            if config.training.early_stopping_patience is not None and config.training.early_stopping_patience > 0:
                early_stop_callback = EarlyStopping(
                    monitor=monitor_metric,
                    min_delta=0.0001,  # Minimum change to qualify as improvement
                    patience=config.training.early_stopping_patience,
                    verbose=True,
                    mode=monitor_mode
                )
                logger.info(f"Enabled EarlyStopping: monitor='{monitor_metric}', patience={config.training.early_stopping_patience}")

            # Checkpoint callback to save the best model based on validation metric
            checkpoint_callback = ModelCheckpoint(
                dirpath=fold_output_dir / "checkpoints",
                filename=f"best_model_fold_{fold_id}",  # {{epoch}}-{{val_loss:.2f}} etc. possible
                save_top_k=1,
                monitor=monitor_metric,
                mode=monitor_mode,
                verbose=True
            )
            logger.info(f"Enabled ModelCheckpoint: monitor='{monitor_metric}', mode='{monitor_mode}'")

            # Learning rate monitor callback
            lr_monitor = LearningRateMonitor(logging_interval='epoch')

            callbacks = [checkpoint_callback, lr_monitor]
            if early_stop_callback:
                callbacks.append(early_stop_callback)

            # --- PyTorch Lightning Logger ---
            # save_dir/name/version combine to <output_base_dir>/fold_XX/logs.
            pl_logger = CSVLogger(save_dir=str(output_base_dir), name=f"fold_{fold_id:02d}", version='logs')
            logger.info(f"Using CSVLogger, logs will be saved in: {pl_logger.log_dir}")

            # --- PyTorch Lightning Trainer ---
            accelerator = 'gpu' if torch.cuda.is_available() else 'cpu'
            # Always request exactly one device. Passing devices=None for the
            # CPU accelerator is rejected by Lightning 2.x, while devices=1 is
            # accepted by both the CPU and GPU accelerators.
            devices = 1
            precision = getattr(config.training, 'precision', 32)  # Default to 32-bit

            trainer = pl.Trainer(
                accelerator=accelerator,
                devices=devices,
                max_epochs=config.training.epochs,
                callbacks=callbacks,
                logger=pl_logger,
                log_every_n_steps=max(1, len(train_loader) // 10),  # Log ~10 times per epoch
                enable_progress_bar=True,  # Set to False for less verbose runs (e.g., HPO)
                gradient_clip_val=getattr(config.training, 'gradient_clip_val', None),
                precision=precision,
            )
            logger.info(f"Initialized PyTorch Lightning Trainer: accelerator='{accelerator}', devices={devices}, precision={precision}")

            # --- Training ---
            logger.info(f"Starting training for Fold {fold_id}...")
            trainer.fit(model, train_dataloaders=train_loader, val_dataloaders=val_loader)
            logger.info(f"Training finished for Fold {fold_id}.")

            # Store best validation score for this fold.
            # Compare against None explicitly: a tensor holding 0.0 is falsy,
            # and a perfect score must not be mistaken for "no score".
            best_val_score = trainer.checkpoint_callback.best_model_score
            best_model_path = trainer.checkpoint_callback.best_model_path
            if best_val_score is not None:
                all_fold_best_val_scores[fold_id] = best_val_score.item()
                logger.info(f"Best validation score ({monitor_metric}) for Fold {fold_id}: {all_fold_best_val_scores[fold_id]:.4f}")
                logger.info(f"Best model checkpoint path: {best_model_path}")
            else:
                all_fold_best_val_scores[fold_id] = None
                logger.warning(f"Could not retrieve best validation score/path for Fold {fold_id} (metric: {monitor_metric}). Evaluation might use last model.")
                best_model_path = None  # Ensure evaluation doesn't try to load 'best' if checkpointing failed

            # --- Prediction on Test Set ---
            logger.info(f"Starting prediction for Fold {fold_id} using best checkpoint...")
            # Each batch result is expected to be a dict with 'preds_scaled'
            # and, because the test loader yields targets, 'targets_scaled'.
            prediction_results_list = trainer.predict(
                ckpt_path=best_model_path if best_model_path else 'last',  # Load best model or last if best failed
                dataloaders=test_loader
            )

            # Check if prediction returned results
            if not prediction_results_list:
                logger.error(f"Predict phase did not return any results for Fold {fold_id}. Check predict_step and logs.")
                fold_metrics = _nan_fold_metrics()
            else:
                try:
                    # Concatenate predictions and targets from predict_step results.
                    # detach().cpu() makes the NumPy conversion safe regardless
                    # of the device the tensors were produced on.
                    all_preds_scaled = torch.cat(
                        [batch_res['preds_scaled'] for batch_res in prediction_results_list], dim=0
                    ).detach().cpu().numpy()
                    # Check if targets were included (they should be if using test_loader)
                    if 'targets_scaled' in prediction_results_list[0]:
                        all_targets_scaled = torch.cat(
                            [batch_res['targets_scaled'] for batch_res in prediction_results_list], dim=0
                        ).detach().cpu().numpy()
                    else:
                        # This case shouldn't happen if using test_loader, but good safeguard
                        logger.error(f"Targets not found in prediction results for Fold {fold_id}. Cannot evaluate.")
                        raise ValueError("Targets missing from prediction results.")

                    # --- Final Evaluation & Plotting ---
                    logger.info(f"Processing prediction results for Fold {fold_id}...")
                    fold_metrics = evaluate_fold_predictions(
                        y_true_scaled=all_targets_scaled,
                        y_pred_scaled=all_preds_scaled,
                        target_scaler=target_scaler,  # Use the scaler from this fold
                        eval_config=config.evaluation,
                        fold_num=fold_num,  # Pass zero-based index
                        output_dir=output_base_dir,  # Base dir for saving plots etc.
                    )
                    # Save fold metrics
                    save_results(fold_metrics, fold_output_dir / "test_metrics.json")

                except KeyError as e:
                    logger.error(f"KeyError processing prediction results for Fold {fold_id}: Missing key {e}. Check predict_step return format.", exc_info=True)
                    fold_metrics = _nan_fold_metrics()
                except Exception as e:
                    logger.error(f"Error processing prediction results for Fold {fold_id}: {e}", exc_info=True)
                    fold_metrics = _nan_fold_metrics()

            all_fold_test_metrics.append(fold_metrics)

        except Exception as e:
            # Catch errors during the fold processing (data prep, training, prediction, eval)
            logger.error(f"An error occurred during Fold {fold_id} pipeline: {e}", exc_info=True)
            all_fold_test_metrics.append(_nan_fold_metrics())
            # Make sure every fold has an entry, even when it failed before a
            # validation score could be recorded.
            all_fold_best_val_scores.setdefault(fold_id, None)

        # --- Cleanup per fold ---
        if torch.cuda.is_available():
            torch.cuda.empty_cache()
            logger.debug("Cleared CUDA cache.")

        fold_end_time = time.perf_counter()
        logger.info(f"--- Finished Fold {fold_id} in {fold_end_time - fold_start_time:.2f} seconds ---")

    # --- Aggregation and Final Reporting ---
    logger.info("Cross-validation finished. Aggregating results...")
    aggregated_metrics = aggregate_cv_metrics(all_fold_test_metrics)

    # Save aggregated results
    final_results = {
        'aggregated_test_metrics': aggregated_metrics,
        'per_fold_test_metrics': all_fold_test_metrics,
        'per_fold_best_val_scores': all_fold_best_val_scores,
    }
    save_results(final_results, output_base_dir / "aggregated_cv_results.json")

    # Log final results
    logger.info("--- Aggregated Cross-Validation Test Results ---")
    if aggregated_metrics:
        for metric, stats in aggregated_metrics.items():
            logger.info(f"{metric}: {stats['mean']:.4f} ± {stats['std']:.4f}")
    else:
        logger.warning("No metrics available for aggregation.")
    logger.info("-------------------------------------------------")

    end_time = time.perf_counter()
    logger.info(f"Training pipeline finished successfully in {end_time - start_time:.2f} seconds.")


# --- Main Execution ---
def run():
    """Script entry point: parse the CLI, load the config, seed, run the pipeline.

    The process exits with status 1 when the configuration cannot be loaded
    or the pipeline raises an unexpected exception. A SystemExit raised by
    the pipeline is logged and re-issued with its original exit code.
    """
    cli = parse_arguments()
    config_file = Path(cli.config)
    results_dir = Path(cli.output_dir)

    # --debug lowers the root logger threshold to DEBUG.
    if cli.debug:
        logger.setLevel(logging.DEBUG)
        logger.debug("# --- Debug mode enabled. --- #")

    # --- Configuration Loading ---
    try:
        config = load_config(config_file)
    except Exception:
        # load_config has already logged the failure in detail.
        sys.exit(1)

    # --- Seed Setting ---
    # A seed given on the command line takes precedence; otherwise the
    # config's `random_seed` attribute is used, or 42 if it has none.
    if cli.seed is not None:
        seed = cli.seed
    else:
        seed = getattr(config, 'random_seed', 42)
    set_seeds(seed)

    # --- Pipeline Execution ---
    try:
        run_training_pipeline(config, results_dir)
    except SystemExit as exit_request:
        logger.warning(f"Pipeline exited with code {exit_request.code}.")
        sys.exit(exit_request.code)  # keep the pipeline's own exit status
    except Exception as err:
        logger.critical(f"An critical error occurred during pipeline execution: {err}", exc_info=True)
        sys.exit(1)

# Run the pipeline only when this file is executed as a script, not on import.
if __name__ == "__main__":
    run()