Files
entrix_case_challange/forecasting_model/data_processing.py
2025-05-02 10:45:06 +02:00

67 lines
2.2 KiB
Python

import numpy as np
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from typing import Tuple, Generator, List, Optional
from utils.config_model import DataConfig, FeatureConfig, TrainingConfig, EvaluationConfig
# --- Data Loading ---
def load_raw_data(config: DataConfig) -> pd.DataFrame:
    """
    Load and preprocess raw data from CSV.

    This is still a placeholder: no loading logic exists yet, so the call
    fails loudly instead of silently returning ``None`` in place of the
    annotated ``pd.DataFrame``.

    Args:
        config: Data configuration object. Which of its fields describe the
            CSV source is not established in this module -- TODO confirm
            when implementing.

    Returns:
        The loaded data as a DataFrame (once implemented).

    Raises:
        NotImplementedError: Always, until the loader is written.
    """
    # TODO: Implement CSV loading and datetime parsing
    raise NotImplementedError(
        "load_raw_data is not implemented yet "
        "(CSV loading and datetime parsing)."
    )
# --- Feature Engineering ---
def engineer_features(
    df: pd.DataFrame,
    target_col: str,
    feature_config: FeatureConfig,
) -> pd.DataFrame:
    """
    Create features from the target column and datetime index.

    This is still a placeholder: no feature logic exists yet, so the call
    fails loudly instead of silently returning ``None`` in place of the
    annotated ``pd.DataFrame``.

    Args:
        df: Input frame. Presumably indexed by datetime, since the features
            are meant to use the datetime index -- TODO confirm.
        target_col: Name of the column the features are derived from.
        feature_config: Feature configuration object; its fields are not
            visible in this module -- TODO confirm when implementing.

    Returns:
        A DataFrame with the engineered features (once implemented).

    Raises:
        NotImplementedError: Always, until feature engineering is written.
    """
    # TODO: Implement feature engineering (lags, rolling stats, time features, wavelets)
    raise NotImplementedError(
        "engineer_features is not implemented yet "
        "(lags, rolling stats, time features, wavelets)."
    )
# --- Cross Validation ---
class TimeSeriesCrossValidationSplitter:
    """
    Splitter meant to produce expanding-window train/val/test index splits.

    Only the constructor is functional; ``split`` is still a placeholder.
    """

    # NOTE: the ``config`` annotation is quoted on purpose. This module's
    # imports do not bring ``CrossValidationConfig`` into scope, and an
    # unquoted annotation is evaluated when the method is defined, which
    # raises NameError as soon as the module is imported.
    # TODO: import CrossValidationConfig (presumably from utils.config_model
    # -- confirm it is defined there) and drop the quotes.
    def __init__(self, config: "CrossValidationConfig", n_samples: int):
        """
        Store the splitter settings.

        Args:
            config: Cross-validation configuration object; its fields are
                not visible in this module.
            n_samples: Total number of samples to split.
        """
        self.config = config
        self.n_samples = n_samples

    def split(self) -> Generator[Tuple[np.ndarray, np.ndarray, np.ndarray], None, None]:
        """
        Generate train/val/test splits using expanding window approach.

        Returns:
            A generator of ``(train, val, test)`` array triples (once
            implemented); presumably index arrays -- TODO confirm.

        Raises:
            NotImplementedError: Always, until the splitter is written.
                Raised explicitly so callers do not try to iterate ``None``.
        """
        # TODO: Implement expanding window CV splitter
        raise NotImplementedError(
            "TimeSeriesCrossValidationSplitter.split is not implemented yet."
        )
# --- Dataset Class ---
class TimeSeriesDataset(Dataset):
    """
    Dataset wrapper around a single array of time-series data.

    Only the constructor is functional; ``__len__`` and ``__getitem__`` are
    still placeholders and raise ``NotImplementedError``.
    """

    def __init__(self, data_array: np.ndarray, sequence_length: int, forecast_horizon: int):
        """
        Store the data and window settings.

        Args:
            data_array: The array samples will be drawn from. Its layout
                (which axis is time, where the target lives) is not
                established in this module -- TODO confirm.
            sequence_length: Presumably the number of input time steps per
                sample -- TODO confirm.
            forecast_horizon: Presumably the number of future steps to
                predict per sample -- TODO confirm.
        """
        self.data = data_array
        self.sequence_length = sequence_length
        self.forecast_horizon = forecast_horizon

    def __len__(self) -> int:
        """
        Return the number of samples.

        Raises:
            NotImplementedError: Always, until the length calculation is
                written. Raised explicitly because returning ``None`` makes
                ``len()`` fail with an unrelated-looking TypeError.
        """
        # TODO: Implement length calculation
        raise NotImplementedError(
            "TimeSeriesDataset.__len__ is not implemented yet."
        )

    def __getitem__(self, idx: int) -> Tuple[torch.Tensor, torch.Tensor]:
        """
        Return the pair of tensors for sample ``idx``.

        Raises:
            NotImplementedError: Always, until sequence extraction is
                written.
        """
        # TODO: Implement sequence extraction
        raise NotImplementedError(
            "TimeSeriesDataset.__getitem__ is not implemented yet."
        )
# --- Data Preparation ---
def prepare_fold_data_and_loaders(
    full_df: pd.DataFrame,
    train_idx: np.ndarray,
    val_idx: np.ndarray,
    test_idx: np.ndarray,
    feature_config: FeatureConfig,
    train_config: TrainingConfig,
    eval_config: EvaluationConfig
) -> Tuple[DataLoader, DataLoader, DataLoader, object, int]:
    """
    Prepare data loaders for a single fold.

    This is still a placeholder: no preparation logic exists yet, so the
    call fails loudly instead of silently returning ``None`` in place of the
    annotated 5-tuple.

    Args:
        full_df: The complete data frame the fold is taken from.
        train_idx: Array selecting the training portion of the fold.
        val_idx: Array selecting the validation portion of the fold.
        test_idx: Array selecting the test portion of the fold.
        feature_config: Feature configuration object.
        train_config: Training configuration object.
        eval_config: Evaluation configuration object.

    Returns:
        A 5-tuple annotated as three ``DataLoader`` objects, an ``object``
        and an ``int`` (once implemented). Presumably the loaders are
        train/val/test in that order and the last two are a fitted scaler
        and the input feature count -- TODO confirm when implementing.

    Raises:
        NotImplementedError: Always, until the pipeline is written.
    """
    # TODO: Implement data preparation pipeline
    raise NotImplementedError(
        "prepare_fold_data_and_loaders is not implemented yet."
    )