intermediate backup

2025-05-02 14:36:19 +02:00
parent 980696aef5
commit 2b0a5728d4
16 changed files with 2780 additions and 316 deletions

forecasting_model_run.py (new file)

@@ -0,0 +1,468 @@
import argparse
import logging
import sys
import os
import random
from pathlib import Path
import time
import json
import numpy as np
import pandas as pd
import torch
import yaml
import pytorch_lightning as pl
from pytorch_lightning.callbacks import EarlyStopping, ModelCheckpoint, LearningRateMonitor
from pytorch_lightning.loggers import CSVLogger
# Import necessary components from your project structure
# Assuming forecasting_model is a package installable or in PYTHONPATH
from forecasting_model.utils.config_model import MainConfig
from forecasting_model.data_processing import (
    load_raw_data,
    TimeSeriesCrossValidationSplitter,
    prepare_fold_data_and_loaders
)
from forecasting_model.model import LSTMForecastLightningModule
from forecasting_model.evaluation import evaluate_fold_predictions
from typing import Dict, List, Any, Optional
# Silence overly verbose libraries if needed
mpl_logger = logging.getLogger('matplotlib')
mpl_logger.setLevel(logging.WARNING)
pil_logger = logging.getLogger('PIL')
pil_logger.setLevel(logging.WARNING)
# --- Basic Logging Setup ---
# Configure logging early. Level might be adjusted by config.
logging.basicConfig(level=logging.INFO,
                    format='%(asctime)s - %(levelname)-7s - %(message)s',
                    datefmt='%H:%M:%S')
# Get the root logger
logger = logging.getLogger()
# --- Argument Parsing ---
def parse_arguments():
    """Parses command-line arguments."""
    parser = argparse.ArgumentParser(
        description="Run the Time Series Forecasting training pipeline.",
        formatter_class=argparse.ArgumentDefaultsHelpFormatter
    )
    parser.add_argument(
        '-c', '--config',
        type=str,
        default='config.yaml',
        help="Path to the YAML configuration file."
    )
    parser.add_argument(
        '--seed',
        type=int,
        default=None,  # Default to None, use config value if not provided
        help="Override random seed defined in config."
    )
    parser.add_argument(
        '--debug',
        action='store_true',
        help="Override log level to DEBUG."
    )
    parser.add_argument(
        '--output-dir',
        type=str,
        default='output/cv_results',  # Default output base directory
        help="Base directory for saving cross-validation results (checkpoints, logs, plots)."
    )
    args = parser.parse_args()
    return args

# --- Helper Functions ---
def load_config(config_path: Path) -> MainConfig:
    """
    Load and validate configuration from a YAML file using Pydantic.

    Args:
        config_path: Path to the YAML configuration file.

    Returns:
        Validated MainConfig object.

    Raises:
        FileNotFoundError: If the config file doesn't exist.
        yaml.YAMLError: If the file is not valid YAML.
        pydantic.ValidationError: If the config doesn't match the schema.
    """
    if not config_path.is_file():
        logger.error(f"Configuration file not found at: {config_path}")
        raise FileNotFoundError(f"Config file not found: {config_path}")
    logger.info(f"Loading configuration from: {config_path}")
    try:
        with open(config_path, 'r') as f:
            config_dict = yaml.safe_load(f)
        # Validate configuration using the Pydantic model
        config = MainConfig(**config_dict)
        logger.info("Configuration loaded and validated successfully.")
        return config
    except yaml.YAMLError as e:
        logger.error(f"Error parsing YAML file {config_path}: {e}", exc_info=True)
        raise
    except Exception as e:  # Catches Pydantic validation errors too
        logger.error(f"Error validating configuration {config_path}: {e}", exc_info=True)
        raise

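# Note (illustrative only, not taken from the repo): the authoritative schema is
# forecasting_model.utils.config_model.MainConfig. Based purely on the fields this
# script reads (data.target_col, cross_validation.n_splits, training.epochs,
# training.early_stopping_patience, random_seed), a config.yaml might look roughly
# like the sketch below; all other nested keys are placeholders, not the real schema.
#
#   random_seed: 42
#   data:
#     target_col: "target"        # column to forecast (assumed key name)
#   features: {}                  # feature engineering settings
#   model: {}                     # LSTM hyperparameters (input_size is NOT set here)
#   training:
#     epochs: 50
#     early_stopping_patience: 10
#   cross_validation:
#     n_splits: 5
#   evaluation: {}                # metrics / plotting settings
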
def set_seeds(seed: Optional[int] = 42) -> None:
    """
    Set random seeds for reproducibility across libraries.

    Args:
        seed: The seed value to use. If None, uses the default of 42.
    """
    if seed is None:
        seed = 42
        logger.warning(f"No seed provided, using default seed: {seed}")
    else:
        logger.info(f"Setting random seed: {seed}")
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    # Ensure reproducibility for CUDA operations where possible
    if torch.cuda.is_available():
        torch.cuda.manual_seed(seed)
        torch.cuda.manual_seed_all(seed)  # For multi-GPU
        # These settings can slow down training but improve reproducibility
        # torch.backends.cudnn.deterministic = True
        # torch.backends.cudnn.benchmark = False
    # PyTorch Lightning seeding (optional, as we seed torch directly)
    # pl.seed_everything(seed, workers=True)  # workers=True ensures dataloader reproducibility

def aggregate_cv_metrics(all_fold_metrics: List[Dict[str, float]]) -> Dict[str, Dict[str, float]]:
    """
    Calculate mean and standard deviation of metrics across folds.
    Handles potential NaN values by ignoring them.

    Args:
        all_fold_metrics: A list where each element is a dictionary of
            metrics for one fold (e.g., {'MAE': v1, 'RMSE': v2}).

    Returns:
        A dictionary where keys are metric names and values are dicts
        containing 'mean' and 'std' for that metric across folds.
        Example: {'MAE': {'mean': m, 'std': s}, 'RMSE': {'mean': m2, 'std': s2}}
    """
    if not all_fold_metrics:
        logger.warning("Received empty list for metric aggregation.")
        return {}
    aggregated: Dict[str, Dict[str, float]] = {}
    # Get metric names from the first valid fold's results
    first_valid_metrics = next((m for m in all_fold_metrics if m), None)
    if not first_valid_metrics:
        logger.warning("No valid fold metrics found for aggregation.")
        return {}
    metric_names = list(first_valid_metrics.keys())
    for metric in metric_names:
        # Collect values for this metric across all folds, ignoring NaNs
        values = [fold_metrics.get(metric) for fold_metrics in all_fold_metrics if fold_metrics and metric in fold_metrics]
        valid_values = [v for v in values if v is not None and not np.isnan(v)]
        if not valid_values:
            logger.warning(f"No valid values found for metric '{metric}' across folds.")
            mean_val = np.nan
            std_val = np.nan
        else:
            mean_val = float(np.mean(valid_values))
            std_val = float(np.std(valid_values))
            logger.debug(f"Aggregated '{metric}': Mean={mean_val:.4f}, Std={std_val:.4f} from {len(valid_values)} folds.")
        aggregated[metric] = {'mean': mean_val, 'std': std_val}
    return aggregated

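# Worked example (illustrative): with np.std's default population std (ddof=0),
#   aggregate_cv_metrics([{'MAE': 1.0, 'RMSE': 2.0},
#                         {'MAE': 3.0, 'RMSE': float('nan')}])
# returns {'MAE': {'mean': 2.0, 'std': 1.0}, 'RMSE': {'mean': 2.0, 'std': 0.0}};
# the NaN RMSE in the second fold is ignored rather than propagated.
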
def save_results(results: Dict, filename: Path):
    """Save dictionary results to a JSON file."""
    try:
        filename.parent.mkdir(parents=True, exist_ok=True)
        with open(filename, 'w') as f:
            json.dump(results, f, indent=4)
        logger.info(f"Saved results to {filename}")
    except Exception as e:
        logger.error(f"Failed to save results to {filename}: {e}", exc_info=True)

# --- Main Training & Evaluation Function ---
def run_training_pipeline(config: MainConfig, output_base_dir: Path):
    """Runs the full cross-validation training and evaluation pipeline."""
    start_time = time.perf_counter()

    # --- Data Loading ---
    try:
        df = load_raw_data(config.data)
    except Exception as e:
        logger.critical(f"Failed to load raw data: {e}", exc_info=True)
        sys.exit(1)  # Cannot proceed without data

    # --- Cross-Validation Setup ---
    try:
        cv_splitter = TimeSeriesCrossValidationSplitter(config.cross_validation, len(df))
    except ValueError as e:
        logger.critical(f"Failed to initialize CV splitter: {e}", exc_info=True)
        sys.exit(1)

    all_fold_test_metrics: List[Dict[str, float]] = []
    all_fold_best_val_scores: Dict[int, Optional[float]] = {}  # Store best val score per fold

    # --- Cross-Validation Loop ---
    logger.info(f"Starting {config.cross_validation.n_splits}-Fold Cross-Validation...")
    for fold_num, (train_idx, val_idx, test_idx) in enumerate(cv_splitter.split()):
        fold_start_time = time.perf_counter()
        fold_id = fold_num + 1
        logger.info(f"--- Starting Fold {fold_id}/{config.cross_validation.n_splits} ---")
        fold_output_dir = output_base_dir / f"fold_{fold_id:02d}"
        fold_output_dir.mkdir(parents=True, exist_ok=True)
        logger.debug(f"Fold output directory: {fold_output_dir}")

        try:
            # --- Per-Fold Data Preparation ---
            logger.info("Preparing data loaders for the fold...")
            train_loader, val_loader, test_loader, target_scaler, input_size = prepare_fold_data_and_loaders(
                full_df=df,
                train_idx=train_idx,
                val_idx=val_idx,
                test_idx=test_idx,
                target_col=config.data.target_col,  # Pass target col name explicitly
                feature_config=config.features,
                train_config=config.training,
                eval_config=config.evaluation
            )
            logger.info(f"Data loaders prepared. Input size determined: {input_size}")

            # --- Model Initialization ---
            # Pass input_size directly; ModelConfig no longer holds it.
            # Forecast horizon consistency is checked during MainConfig validation.
            current_model_config = config.model  # Use the validated model config
            model = LSTMForecastLightningModule(
                model_config=current_model_config,  # Does not contain input_size
                train_config=config.training,
                input_size=input_size,  # Pass the dynamically determined input_size
                target_scaler=target_scaler  # Pass the fold-specific scaler
            )
            logger.info("LSTMForecastLightningModule initialized.")

            # --- PyTorch Lightning Callbacks ---
            # Monitor the validation MAE on the original scale (logged by the LightningModule)
            monitor_metric = "val_mae_orig_scale"
            monitor_mode = "min"

            early_stop_callback = None
            if config.training.early_stopping_patience is not None and config.training.early_stopping_patience > 0:
                early_stop_callback = EarlyStopping(
                    monitor=monitor_metric,
                    min_delta=0.0001,  # Minimum change to qualify as improvement
                    patience=config.training.early_stopping_patience,
                    verbose=True,
                    mode=monitor_mode
                )
                logger.info(f"Enabled EarlyStopping: monitor='{monitor_metric}', patience={config.training.early_stopping_patience}")

            # Checkpoint callback to save the best model based on the validation metric
            checkpoint_callback = ModelCheckpoint(
                dirpath=fold_output_dir / "checkpoints",
                filename=f"best_model_fold_{fold_id}",  # patterns like {epoch}-{val_loss:.2f} are also possible
                save_top_k=1,
                monitor=monitor_metric,
                mode=monitor_mode,
                verbose=True
            )
            logger.info(f"Enabled ModelCheckpoint: monitor='{monitor_metric}', mode='{monitor_mode}'")

            # Learning rate monitor callback
            lr_monitor = LearningRateMonitor(logging_interval='epoch')
            callbacks = [checkpoint_callback, lr_monitor]
            if early_stop_callback:
                callbacks.append(early_stop_callback)

            # --- PyTorch Lightning Logger ---
            # Log metrics to a CSV file within the fold directory
            pl_logger = CSVLogger(save_dir=str(output_base_dir), name=f"fold_{fold_id:02d}", version='logs')
            logger.info(f"Using CSVLogger, logs will be saved in: {pl_logger.log_dir}")

            # --- PyTorch Lightning Trainer ---
            # Determine accelerator and devices based on PyTorch check
            accelerator = 'gpu' if torch.cuda.is_available() else 'cpu'
            devices = 1 if accelerator == 'gpu' else None  # Or specify specific GPU IDs: [0], [1], etc.
            precision = getattr(config.training, 'precision', 32)  # Default to 32-bit

            trainer = pl.Trainer(
                accelerator=accelerator,
                devices=devices,
                max_epochs=config.training.epochs,
                callbacks=callbacks,
                logger=pl_logger,
                log_every_n_steps=max(1, len(train_loader) // 10),  # Log ~10 times per epoch
                enable_progress_bar=True,  # Set to False for less verbose runs (e.g., HPO)
                gradient_clip_val=getattr(config.training, 'gradient_clip_val', None),
                precision=precision,
                # deterministic=True,  # For stricter reproducibility (can slow down training)
            )
            logger.info(f"Initialized PyTorch Lightning Trainer: accelerator='{accelerator}', devices={devices}, precision={precision}")

            # --- Training ---
            logger.info(f"Starting training for Fold {fold_id}...")
            trainer.fit(model, train_dataloaders=train_loader, val_dataloaders=val_loader)
            logger.info(f"Training finished for Fold {fold_id}.")

            # Store the best validation score for this fold
            best_val_score = trainer.checkpoint_callback.best_model_score
            best_model_path = trainer.checkpoint_callback.best_model_path
            all_fold_best_val_scores[fold_id] = best_val_score.item() if best_val_score is not None else None
            if best_val_score is not None:
                logger.info(f"Best validation score ({monitor_metric}) for Fold {fold_id}: {all_fold_best_val_scores[fold_id]:.4f}")
                logger.info(f"Best model checkpoint path: {best_model_path}")
            else:
                logger.warning(f"Could not retrieve best validation score/path for Fold {fold_id} (metric: {monitor_metric}). Evaluation might use the last model.")
                best_model_path = None  # Ensure evaluation doesn't try to load 'best' if checkpointing failed

            # --- Prediction on Test Set ---
            # Use trainer.predict() to get model outputs
            logger.info(f"Starting prediction for Fold {fold_id} using best checkpoint...")
            # predict_step returns a dict {'preds_scaled': ..., 'targets_scaled': ...}
            # We pass the test_loader here, which yields (x, y) pairs, so predict_step will include targets
            prediction_results_list = trainer.predict(
                # model=model,  # Not needed when using ckpt_path
                ckpt_path=best_model_path if best_model_path else 'last',  # Load the best model, or the last one if best failed
                dataloaders=test_loader
                # return_predictions=True  # Default is True
            )

            # Check if prediction returned results
            if not prediction_results_list:
                logger.error(f"Predict phase did not return any results for Fold {fold_id}. Check predict_step and logs.")
                fold_metrics = {'MAE': np.nan, 'RMSE': np.nan}
            else:
                try:
                    # Concatenate predictions and targets from predict_step results
                    # (.cpu() guards against tensors that predict_step left on the GPU)
                    all_preds_scaled = torch.cat([batch_res['preds_scaled'] for batch_res in prediction_results_list], dim=0).cpu().numpy()
                    # Check if targets were included (they should be when using test_loader)
                    if 'targets_scaled' in prediction_results_list[0]:
                        all_targets_scaled = torch.cat([batch_res['targets_scaled'] for batch_res in prediction_results_list], dim=0).cpu().numpy()
                    else:
                        # This case shouldn't happen when using test_loader, but it is a useful safeguard
                        logger.error(f"Targets not found in prediction results for Fold {fold_id}. Cannot evaluate.")
                        raise ValueError("Targets missing from prediction results.")

                    # --- Final Evaluation & Plotting ---
                    logger.info(f"Processing prediction results for Fold {fold_id}...")
                    fold_metrics = evaluate_fold_predictions(
                        y_true_scaled=all_targets_scaled,
                        y_pred_scaled=all_preds_scaled,
                        target_scaler=target_scaler,  # Use the scaler from this fold
                        eval_config=config.evaluation,
                        fold_num=fold_num,  # Pass the zero-based index
                        output_dir=output_base_dir,  # Base dir for saving plots etc.
                        # time_index=df.iloc[test_idx].index  # Pass time index if needed
                    )
                    # Save fold metrics
                    save_results(fold_metrics, fold_output_dir / "test_metrics.json")
                except KeyError as e:
                    logger.error(f"KeyError processing prediction results for Fold {fold_id}: missing key {e}. Check the predict_step return format.", exc_info=True)
                    fold_metrics = {'MAE': np.nan, 'RMSE': np.nan}
                except Exception as e:
                    logger.error(f"Error processing prediction results for Fold {fold_id}: {e}", exc_info=True)
                    fold_metrics = {'MAE': np.nan, 'RMSE': np.nan}

            all_fold_test_metrics.append(fold_metrics)

            # --- (Optional) Log final test metrics using trainer.test() ---
            # If you want the metrics logged by test_step aggregated, call test now.
            # logger.info(f"Logging final test metrics via trainer.test() for Fold {fold_id}...")
            # try:
            #     trainer.test(ckpt_path=best_model_path if best_model_path else 'last', dataloaders=test_loader, verbose=False)
            # except Exception as e:
            #     logger.warning(f"trainer.test() call failed for Fold {fold_id}: {e}")

        except Exception as e:
            # Catch errors during fold processing (data prep, training, prediction, evaluation)
            logger.error(f"An error occurred during the Fold {fold_id} pipeline: {e}", exc_info=True)
            all_fold_test_metrics.append({'MAE': np.nan, 'RMSE': np.nan})

        # --- Cleanup per fold ---
        if torch.cuda.is_available():
            torch.cuda.empty_cache()
            logger.debug("Cleared CUDA cache.")

        fold_end_time = time.perf_counter()
        logger.info(f"--- Finished Fold {fold_id} in {fold_end_time - fold_start_time:.2f} seconds ---")

    # --- Aggregation and Final Reporting ---
    logger.info("Cross-validation finished. Aggregating results...")
    aggregated_metrics = aggregate_cv_metrics(all_fold_test_metrics)

    # Save aggregated results
    final_results = {
        'aggregated_test_metrics': aggregated_metrics,
        'per_fold_test_metrics': all_fold_test_metrics,
        'per_fold_best_val_scores': all_fold_best_val_scores,
    }
    save_results(final_results, output_base_dir / "aggregated_cv_results.json")

    # Log final results
    logger.info("--- Aggregated Cross-Validation Test Results ---")
    if aggregated_metrics:
        for metric, stats in aggregated_metrics.items():
            logger.info(f"{metric}: {stats['mean']:.4f} ± {stats['std']:.4f}")
    else:
        logger.warning("No metrics available for aggregation.")
    logger.info("-------------------------------------------------")

    end_time = time.perf_counter()
    logger.info(f"Training pipeline finished successfully in {end_time - start_time:.2f} seconds.")

# --- Main Execution ---
def run():
    """Main execution function."""
    args = parse_arguments()
    config_path = Path(args.config)
    output_dir = Path(args.output_dir)

    # Adjust log level if the debug flag is set
    if args.debug:
        logger.setLevel(logging.DEBUG)
        logger.debug("# --- Debug mode enabled. --- #")

    # --- Configuration Loading ---
    try:
        config = load_config(config_path)
    except Exception:
        # Error already logged in load_config
        sys.exit(1)

    # --- Seed Setting ---
    # Use the command-line seed if provided, otherwise fall back to the config seed
    seed = args.seed if args.seed is not None else getattr(config, 'random_seed', 42)
    set_seeds(seed)

    # --- Pipeline Execution ---
    try:
        run_training_pipeline(config, output_dir)
    except SystemExit as e:
        logger.warning(f"Pipeline exited with code {e.code}.")
        sys.exit(e.code)  # Propagate exit code
    except Exception as e:
        logger.critical(f"A critical error occurred during pipeline execution: {e}", exc_info=True)
        sys.exit(1)


if __name__ == "__main__":
    run()
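
# Example invocations (assuming the forecasting_model package is importable and a
# config.yaml exists; flags as defined in parse_arguments above):
#   python forecasting_model_run.py --config config.yaml
#   python forecasting_model_run.py -c config.yaml --seed 123 --output-dir output/cv_results
#   python forecasting_model_run.py -c config.yaml --debug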