intermediate backup

forecasting_model_run.py (new file, 468 lines)
import argparse
import logging
import sys
import os
import random
import time
import json
from pathlib import Path
from typing import Dict, List, Any, Optional

import numpy as np
import pandas as pd
import torch
import yaml
import pytorch_lightning as pl
from pytorch_lightning.callbacks import EarlyStopping, ModelCheckpoint, LearningRateMonitor
from pytorch_lightning.loggers import CSVLogger

# Import necessary components from your project structure
# Assuming forecasting_model is a package installable or in PYTHONPATH
from forecasting_model.utils.config_model import MainConfig
from forecasting_model.data_processing import (
    load_raw_data,
    TimeSeriesCrossValidationSplitter,
    prepare_fold_data_and_loaders,
)
from forecasting_model.model import LSTMForecastLightningModule
from forecasting_model.evaluation import evaluate_fold_predictions

# Silence overly verbose libraries if needed
mpl_logger = logging.getLogger('matplotlib')
mpl_logger.setLevel(logging.WARNING)
pil_logger = logging.getLogger('PIL')
pil_logger.setLevel(logging.WARNING)

# --- Basic Logging Setup ---
# Configure logging early; the level may be adjusted later from the config.
logging.basicConfig(level=logging.INFO,
                    format='%(asctime)s - %(levelname)-7s - %(message)s',
                    datefmt='%H:%M:%S')
# Get the root logger
logger = logging.getLogger()


# --- Argument Parsing ---
def parse_arguments():
    """Parses command-line arguments."""
    parser = argparse.ArgumentParser(
        description="Run the Time Series Forecasting training pipeline.",
        formatter_class=argparse.ArgumentDefaultsHelpFormatter
    )
    parser.add_argument(
        '-c', '--config',
        type=str,
        default='config.yaml',
        help="Path to the YAML configuration file."
    )
    parser.add_argument(
        '--seed',
        type=int,
        default=None,  # Default to None; the config value is used if not provided
        help="Override random seed defined in config."
    )
    parser.add_argument(
        '--debug',
        action='store_true',
        help="Override log level to DEBUG."
    )
    parser.add_argument(
        '--output-dir',
        type=str,
        default='output/cv_results',  # Default output base directory
        help="Base directory for saving cross-validation results (checkpoints, logs, plots)."
    )

    args = parser.parse_args()
    return args
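

# Example invocation (illustrative paths; the defaults above are used when
# flags are omitted):
#   python forecasting_model_run.py --config config.yaml --seed 123 \
#       --output-dir output/cv_results --debug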


# --- Helper Functions ---

def load_config(config_path: Path) -> MainConfig:
    """
    Load and validate configuration from a YAML file using Pydantic.

    Args:
        config_path: Path to the YAML configuration file.

    Returns:
        Validated MainConfig object.

    Raises:
        FileNotFoundError: If the config file doesn't exist.
        yaml.YAMLError: If the file is not valid YAML.
        pydantic.ValidationError: If the config doesn't match the schema.
    """
    if not config_path.is_file():
        logger.error(f"Configuration file not found at: {config_path}")
        raise FileNotFoundError(f"Config file not found: {config_path}")

    logger.info(f"Loading configuration from: {config_path}")
    try:
        with open(config_path, 'r') as f:
            config_dict = yaml.safe_load(f)

        # Validate configuration using the Pydantic model
        config = MainConfig(**config_dict)
        logger.info("Configuration loaded and validated successfully.")
        return config
    except yaml.YAMLError as e:
        logger.error(f"Error parsing YAML file {config_path}: {e}", exc_info=True)
        raise
    except Exception as e:  # Catches Pydantic validation errors too
        logger.error(f"Error validating configuration {config_path}: {e}", exc_info=True)
        raise
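

# Sketch of the expected YAML layout, inferred only from the attribute accesses
# in this script; the authoritative schema is MainConfig, and the exact fields
# inside each section are assumptions here:
#
#   random_seed: 42
#   data:
#     target_col: "..."
#   features: { ... }
#   model: { ... }
#   training:
#     epochs: 100
#     early_stopping_patience: 10
#   cross_validation:
#     n_splits: 5
#   evaluation: { ... }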


def set_seeds(seed: Optional[int] = 42) -> None:
    """
    Set random seeds for reproducibility across libraries.

    Args:
        seed: The seed value to use. If None, defaults to 42.
    """
    if seed is None:
        seed = 42
        logger.warning(f"No seed provided, using default seed: {seed}")
    else:
        logger.info(f"Setting random seed: {seed}")

    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    # Ensure reproducibility for CUDA operations where possible
    if torch.cuda.is_available():
        torch.cuda.manual_seed(seed)
        torch.cuda.manual_seed_all(seed)  # For multi-GPU
        # These settings can slow down training but improve reproducibility
        # torch.backends.cudnn.deterministic = True
        # torch.backends.cudnn.benchmark = False
    # PyTorch Lightning seeding (optional, as we seed torch directly)
    # pl.seed_everything(seed, workers=True)  # workers=True ensures dataloader reproducibility


def aggregate_cv_metrics(all_fold_metrics: List[Dict[str, float]]) -> Dict[str, Dict[str, float]]:
    """
    Calculate the mean and standard deviation of metrics across folds,
    ignoring NaN values.

    Args:
        all_fold_metrics: A list where each element is a dictionary of
                          metrics for one fold (e.g., {'MAE': v1, 'RMSE': v2}).

    Returns:
        A dictionary where keys are metric names and values are dicts
        containing 'mean' and 'std' for that metric across folds.
        Example: {'MAE': {'mean': m, 'std': s}, 'RMSE': {'mean': m2, 'std': s2}}
    """
    if not all_fold_metrics:
        logger.warning("Received empty list for metric aggregation.")
        return {}

    aggregated: Dict[str, Dict[str, float]] = {}
    # Get metric names from the first valid fold's results
    first_valid_metrics = next((m for m in all_fold_metrics if m), None)
    if not first_valid_metrics:
        logger.warning("No valid fold metrics found for aggregation.")
        return {}
    metric_names = list(first_valid_metrics.keys())

    for metric in metric_names:
        # Collect values for this metric across all folds, ignoring NaNs
        values = [fold_metrics.get(metric) for fold_metrics in all_fold_metrics if fold_metrics]
        valid_values = [v for v in values if v is not None and not np.isnan(v)]

        if not valid_values:
            logger.warning(f"No valid values found for metric '{metric}' across folds.")
            mean_val = np.nan
            std_val = np.nan
        else:
            mean_val = float(np.mean(valid_values))
            std_val = float(np.std(valid_values))
            logger.debug(f"Aggregated '{metric}': Mean={mean_val:.4f}, Std={std_val:.4f} from {len(valid_values)} folds.")

        aggregated[metric] = {'mean': mean_val, 'std': std_val}

    return aggregated
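

# Worked example (illustrative numbers):
#   aggregate_cv_metrics([{'MAE': 1.0, 'RMSE': 2.0}, {'MAE': 3.0, 'RMSE': 4.0}])
#   -> {'MAE': {'mean': 2.0, 'std': 1.0}, 'RMSE': {'mean': 3.0, 'std': 1.0}}
# (np.std computes the population standard deviation, ddof=0.)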


def save_results(results: Dict, filename: Path):
    """Save a dictionary of results to a JSON file."""
    try:
        filename.parent.mkdir(parents=True, exist_ok=True)
        with open(filename, 'w') as f:
            json.dump(results, f, indent=4)
        logger.info(f"Saved results to {filename}")
    except Exception as e:
        logger.error(f"Failed to save results to {filename}: {e}", exc_info=True)


# --- Main Training & Evaluation Function ---
def run_training_pipeline(config: MainConfig, output_base_dir: Path):
    """Runs the full cross-validation training and evaluation pipeline."""
    start_time = time.perf_counter()

    # --- Data Loading ---
    try:
        df = load_raw_data(config.data)
    except Exception as e:
        logger.critical(f"Failed to load raw data: {e}", exc_info=True)
        sys.exit(1)  # Cannot proceed without data

    # --- Cross-Validation Setup ---
    try:
        cv_splitter = TimeSeriesCrossValidationSplitter(config.cross_validation, len(df))
    except ValueError as e:
        logger.critical(f"Failed to initialize CV splitter: {e}", exc_info=True)
        sys.exit(1)

    all_fold_test_metrics: List[Dict[str, float]] = []
    all_fold_best_val_scores: Dict[int, Optional[float]] = {}  # Best validation score per fold

    # --- Cross-Validation Loop ---
    logger.info(f"Starting {config.cross_validation.n_splits}-fold cross-validation...")
    for fold_num, (train_idx, val_idx, test_idx) in enumerate(cv_splitter.split()):
        fold_start_time = time.perf_counter()
        fold_id = fold_num + 1
        logger.info(f"--- Starting Fold {fold_id}/{config.cross_validation.n_splits} ---")

        fold_output_dir = output_base_dir / f"fold_{fold_id:02d}"
        fold_output_dir.mkdir(parents=True, exist_ok=True)
        logger.debug(f"Fold output directory: {fold_output_dir}")

        try:
            # --- Per-Fold Data Preparation ---
            logger.info("Preparing data loaders for the fold...")
            train_loader, val_loader, test_loader, target_scaler, input_size = prepare_fold_data_and_loaders(
                full_df=df,
                train_idx=train_idx,
                val_idx=val_idx,
                test_idx=test_idx,
                target_col=config.data.target_col,  # Pass target column name explicitly
                feature_config=config.features,
                train_config=config.training,
                eval_config=config.evaluation
            )
            logger.info(f"Data loaders prepared. Input size determined: {input_size}")

            # --- Model Initialization ---
            # Pass input_size directly; ModelConfig no longer holds it.
            # Forecast horizon consistency is checked during MainConfig validation.
            current_model_config = config.model  # Use the validated model config

            model = LSTMForecastLightningModule(
                model_config=current_model_config,  # Does not contain input_size
                train_config=config.training,
                input_size=input_size,  # Pass the dynamically determined input size
                target_scaler=target_scaler  # Pass the fold-specific scaler
            )
            logger.info("LSTMForecastLightningModule initialized.")

            # --- PyTorch Lightning Callbacks ---
            # Monitor the validation MAE on the original scale (logged by the LightningModule)
            monitor_metric = "val_mae_orig_scale"
            monitor_mode = "min"

            early_stop_callback = None
            if config.training.early_stopping_patience is not None and config.training.early_stopping_patience > 0:
                early_stop_callback = EarlyStopping(
                    monitor=monitor_metric,
                    min_delta=0.0001,  # Minimum change to qualify as improvement
                    patience=config.training.early_stopping_patience,
                    verbose=True,
                    mode=monitor_mode
                )
                logger.info(f"Enabled EarlyStopping: monitor='{monitor_metric}', patience={config.training.early_stopping_patience}")

            # Checkpoint callback to save the best model based on the validation metric
            checkpoint_callback = ModelCheckpoint(
                dirpath=fold_output_dir / "checkpoints",
                filename=f"best_model_fold_{fold_id}",
                save_top_k=1,
                monitor=monitor_metric,
                mode=monitor_mode,
                verbose=True
            )
            logger.info(f"Enabled ModelCheckpoint: monitor='{monitor_metric}', mode='{monitor_mode}'")

            # Learning rate monitor callback
            lr_monitor = LearningRateMonitor(logging_interval='epoch')

            callbacks = [checkpoint_callback, lr_monitor]
            if early_stop_callback:
                callbacks.append(early_stop_callback)

            # --- PyTorch Lightning Logger ---
            # Log metrics to a CSV file within the fold directory
            pl_logger = CSVLogger(save_dir=str(output_base_dir), name=f"fold_{fold_id:02d}", version='logs')
            logger.info(f"Using CSVLogger, logs will be saved in: {pl_logger.log_dir}")

            # --- PyTorch Lightning Trainer ---
            # Determine accelerator and devices based on CUDA availability
            accelerator = 'gpu' if torch.cuda.is_available() else 'cpu'
            devices = 1 if accelerator == 'gpu' else None  # Or specify GPU IDs such as [0], [1]
            precision = getattr(config.training, 'precision', 32)  # Default to 32-bit

            trainer = pl.Trainer(
                accelerator=accelerator,
                devices=devices,
                max_epochs=config.training.epochs,
                callbacks=callbacks,
                logger=pl_logger,
                log_every_n_steps=max(1, len(train_loader) // 10),  # Log ~10 times per epoch
                enable_progress_bar=True,  # Set to False for less verbose runs (e.g., HPO)
                gradient_clip_val=getattr(config.training, 'gradient_clip_val', None),
                precision=precision,
                # deterministic=True,  # For stricter reproducibility (can slow down training)
            )
            logger.info(f"Initialized PyTorch Lightning Trainer: accelerator='{accelerator}', devices={devices}, precision={precision}")

            # --- Training ---
            logger.info(f"Starting training for Fold {fold_id}...")
            trainer.fit(model, train_dataloaders=train_loader, val_dataloaders=val_loader)
            logger.info(f"Training finished for Fold {fold_id}.")

            # Store the best validation score for this fold.
            # Compare against None explicitly: a 0-d tensor holding 0.0 would be falsy.
            best_val_score = trainer.checkpoint_callback.best_model_score
            best_model_path = trainer.checkpoint_callback.best_model_path
            all_fold_best_val_scores[fold_id] = best_val_score.item() if best_val_score is not None else None
            if best_val_score is not None:
                logger.info(f"Best validation score ({monitor_metric}) for Fold {fold_id}: {all_fold_best_val_scores[fold_id]:.4f}")
                logger.info(f"Best model checkpoint path: {best_model_path}")
            else:
                logger.warning(f"Could not retrieve best validation score/path for Fold {fold_id} (metric: {monitor_metric}). Evaluation might use the last model.")
                best_model_path = None  # Ensure evaluation doesn't try to load 'best' if checkpointing failed

            # --- Prediction on Test Set ---
            # Use trainer.predict() to get model outputs.
            logger.info(f"Starting prediction for Fold {fold_id} using best checkpoint...")
            # predict_step returns a dict {'preds_scaled': ..., 'targets_scaled': ...}.
            # The test_loader yields (x, y) pairs, so predict_step includes targets.
            prediction_results_list = trainer.predict(
                # model=model,  # Not needed when loading from ckpt_path
                ckpt_path=best_model_path if best_model_path else 'last',  # Load the best model, or the last one if best failed
                dataloaders=test_loader
                # return_predictions=True  # Default is True
            )

            # Check that prediction returned results
            if not prediction_results_list:
                logger.error(f"Predict phase did not return any results for Fold {fold_id}. Check predict_step and logs.")
                fold_metrics = {'MAE': np.nan, 'RMSE': np.nan}
            else:
                try:
                    # Concatenate predictions and targets from the predict_step results
                    # (.cpu() before .numpy() in case tensors are still on the GPU)
                    all_preds_scaled = torch.cat([batch_res['preds_scaled'] for batch_res in prediction_results_list], dim=0).cpu().numpy()
                    # Check that targets were included (they should be when using test_loader)
                    if 'targets_scaled' in prediction_results_list[0]:
                        all_targets_scaled = torch.cat([batch_res['targets_scaled'] for batch_res in prediction_results_list], dim=0).cpu().numpy()
                    else:
                        # This case shouldn't happen when using test_loader, but it's a good safeguard
                        logger.error(f"Targets not found in prediction results for Fold {fold_id}. Cannot evaluate.")
                        raise ValueError("Targets missing from prediction results.")

                    # --- Final Evaluation & Plotting ---
                    logger.info(f"Processing prediction results for Fold {fold_id}...")
                    fold_metrics = evaluate_fold_predictions(
                        y_true_scaled=all_targets_scaled,
                        y_pred_scaled=all_preds_scaled,
                        target_scaler=target_scaler,  # Use the scaler from this fold
                        eval_config=config.evaluation,
                        fold_num=fold_num,  # Pass the zero-based index
                        output_dir=output_base_dir,  # Base dir for saving plots etc.
                        # time_index=df.iloc[test_idx].index  # Pass time index if needed
                    )
                    # Save fold metrics
                    save_results(fold_metrics, fold_output_dir / "test_metrics.json")

                except KeyError as e:
                    logger.error(f"KeyError processing prediction results for Fold {fold_id}: missing key {e}. Check the predict_step return format.", exc_info=True)
                    fold_metrics = {'MAE': np.nan, 'RMSE': np.nan}
                except Exception as e:
                    logger.error(f"Error processing prediction results for Fold {fold_id}: {e}", exc_info=True)
                    fold_metrics = {'MAE': np.nan, 'RMSE': np.nan}

            all_fold_test_metrics.append(fold_metrics)

            # --- (Optional) Log final test metrics using trainer.test() ---
            # If you want the metrics logged by test_step aggregated, call test here:
            # logger.info(f"Logging final test metrics via trainer.test() for Fold {fold_id}...")
            # try:
            #     trainer.test(ckpt_path=best_model_path if best_model_path else 'last', dataloaders=test_loader, verbose=False)
            # except Exception as e:
            #     logger.warning(f"trainer.test() call failed for Fold {fold_id}: {e}")

        except Exception as e:
            # Catch errors during fold processing (data prep, training, prediction, evaluation)
            logger.error(f"An error occurred during the Fold {fold_id} pipeline: {e}", exc_info=True)
            all_fold_test_metrics.append({'MAE': np.nan, 'RMSE': np.nan})

        # --- Cleanup per fold ---
        if torch.cuda.is_available():
            torch.cuda.empty_cache()
            logger.debug("Cleared CUDA cache.")

        fold_end_time = time.perf_counter()
        logger.info(f"--- Finished Fold {fold_id} in {fold_end_time - fold_start_time:.2f} seconds ---")

    # --- Aggregation and Final Reporting ---
    logger.info("Cross-validation finished. Aggregating results...")
    aggregated_metrics = aggregate_cv_metrics(all_fold_test_metrics)

    # Save aggregated results
    final_results = {
        'aggregated_test_metrics': aggregated_metrics,
        'per_fold_test_metrics': all_fold_test_metrics,
        'per_fold_best_val_scores': all_fold_best_val_scores,
    }
    save_results(final_results, output_base_dir / "aggregated_cv_results.json")
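
    # Illustrative shape of aggregated_cv_results.json (numbers are examples;
    # note that json.dump turns the integer fold IDs into string keys):
    #   {
    #     "aggregated_test_metrics": {"MAE": {"mean": 1.23, "std": 0.05}, ...},
    #     "per_fold_test_metrics": [{"MAE": 1.20, "RMSE": 2.10}, ...],
    #     "per_fold_best_val_scores": {"1": 1.18, "2": null, ...}
    #   }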

    # Log final results
    logger.info("--- Aggregated Cross-Validation Test Results ---")
    if aggregated_metrics:
        for metric, stats in aggregated_metrics.items():
            logger.info(f"{metric}: {stats['mean']:.4f} ± {stats['std']:.4f}")
    else:
        logger.warning("No metrics available for aggregation.")
    logger.info("-------------------------------------------------")

    end_time = time.perf_counter()
    logger.info(f"Training pipeline finished successfully in {end_time - start_time:.2f} seconds.")


# --- Main Execution ---
def run():
    """Main execution function."""
    args = parse_arguments()
    config_path = Path(args.config)
    output_dir = Path(args.output_dir)

    # Adjust log level if the debug flag is set
    if args.debug:
        logger.setLevel(logging.DEBUG)
        logger.debug("--- Debug mode enabled ---")

    # --- Configuration Loading ---
    try:
        config = load_config(config_path)
    except Exception:
        # Error already logged in load_config
        sys.exit(1)

    # --- Seed Setting ---
    # Use the command-line seed if provided, otherwise the config seed
    seed = args.seed if args.seed is not None else getattr(config, 'random_seed', 42)
    set_seeds(seed)

    # --- Pipeline Execution ---
    try:
        run_training_pipeline(config, output_dir)
    except SystemExit as e:
        logger.warning(f"Pipeline exited with code {e.code}.")
        sys.exit(e.code)  # Propagate the exit code
    except Exception as e:
        logger.critical(f"A critical error occurred during pipeline execution: {e}", exc_info=True)
        sys.exit(1)


if __name__ == "__main__":
    run()
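

# Expected output layout under --output-dir (derived from the paths used above;
# the metrics.csv name is Lightning's CSVLogger default and the .ckpt extension
# is ModelCheckpoint's default):
#   output/cv_results/
#     fold_01/
#       checkpoints/best_model_fold_1.ckpt
#       logs/metrics.csv
#       test_metrics.json
#     ...
#     aggregated_cv_results.json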