# entrix_case_challange/data_analysis/io/data_handling.py
import logging
from pathlib import Path
import pandas as pd
from typing import Tuple, Optional, Dict, Any
from data_analysis.utils.data_config_model import settings
logger = logging.getLogger(__name__)
# Define constants for column names related to raw loading
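# These match the headers of ENTSO-E-style day-ahead price CSV exports.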
TIME_COL_RAW = "MTU (CET/CEST)"
PRICE_COL_RAW = "Day-ahead Price [EUR/MWh]"
PRICE_COL = "Price" # Standardized column name after processing


def load_and_prepare_data(file_path: Path) -> Tuple[Optional[pd.DataFrame], Optional[str]]:
"""
    Loads the energy price CSV, parses the time column, sets a DatetimeIndex,
    standardizes column names, drops duplicate timestamps, checks the index
    frequency, and handles missing values.
Args:
file_path: Path to the input CSV file.
Returns:
A tuple containing:
- pd.DataFrame: Processed DataFrame with DatetimeIndex and 'Price' column.
May include other columns if they exist in the source.
- str | None: Error message if loading fails, otherwise None.
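
    Example (a minimal sketch; the CSV path is hypothetical):
        >>> df, err = load_and_prepare_data(Path("data/day_ahead_prices.csv"))
        >>> if err is not None:
        ...     raise RuntimeError(err)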
"""
logger.info(f"Attempting to load data from: {file_path.resolve()}")
err = None
df = None
try:
# Load data, assuming header is on the first row
df = pd.read_csv(file_path, header=0)
# Basic check for expected columns
if TIME_COL_RAW not in df.columns or PRICE_COL_RAW not in df.columns:
err = f"Missing expected columns '{TIME_COL_RAW}' or '{PRICE_COL_RAW}' in {file_path}"
logger.error(err)
return None, err
# --- Time Parsing ---
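        # The raw MTU column holds delivery intervals like
        # "01.01.2023 00:00 - 01.01.2023 01:00"; keep only the interval start.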
df['StartTime'] = df[TIME_COL_RAW].str.split(' - ', expand=True)[0]
df['Timestamp'] = pd.to_datetime(df['StartTime'], format='%d.%m.%Y %H:%M', errors='coerce')
original_len = len(df)
df = df.dropna(subset=['Timestamp'])
if len(df) < original_len:
logger.warning(f"Dropped {original_len - len(df)} rows due to timestamp parsing errors.")
# --- Set Index and Select Columns ---
df = df.set_index('Timestamp')
# Convert price column to numeric, coercing errors
df[PRICE_COL] = pd.to_numeric(df[PRICE_COL_RAW], errors='coerce')
        # Keep the standardized price column plus any other potential exogenous
        # columns; drop the raw time/price columns and the intermediate 'StartTime'.
cols_to_keep = [PRICE_COL] + [col for col in df.columns if col not in [TIME_COL_RAW, PRICE_COL_RAW, 'StartTime', PRICE_COL]]
df = df[cols_to_keep].copy()
# --- Handle Missing Prices ---
missing_prices = df[PRICE_COL].isnull().sum()
if missing_prices > 0:
logger.warning(f"Found {missing_prices} missing '{PRICE_COL}' values. Forward-filling (ffill).")
df[PRICE_COL] = df[PRICE_COL].ffill()
if df[PRICE_COL].isnull().any():
logger.warning("Missing values remain after ffill. Backward-filling (bfill).")
df[PRICE_COL] = df[PRICE_COL].bfill()
        # --- Check Time Index Frequency ---
        df = df.sort_index()
        # Drop duplicate timestamps first: asfreq() reindexes under the hood and
        # raises on a non-unique index, and duplicates also break freq inference.
        duplicates = df.index.duplicated().sum()
        if duplicates > 0:
            logger.warning(f"Found {duplicates} duplicate timestamps. Keeping the first occurrence.")
            df = df[~df.index.duplicated(keep='first')]
        inferred_freq = pd.infer_freq(df.index)
        if inferred_freq == settings.expected_data_frequency:
            logger.info(f"Inferred index frequency matches the expected '{settings.expected_data_frequency}'. Setting it on the index.")
            df = df.asfreq(settings.expected_data_frequency)
            missing_after_asfreq = df[PRICE_COL].isnull().sum()
            if missing_after_asfreq > 0:
                logger.warning(f"{missing_after_asfreq} NaNs appeared after setting the frequency to '{settings.expected_data_frequency}'. Filling again (ffill, then bfill).")
                df[PRICE_COL] = df[PRICE_COL].ffill().bfill()
        elif inferred_freq:
            logger.warning(f"Inferred frequency is '{inferred_freq}', not the expected '{settings.expected_data_frequency}'. Proceeding without setting frequency.")
        else:
            logger.warning("Could not infer frequency. Check data for gaps or irregularities. Proceeding without setting frequency.")
logger.info(f"Data loaded and prepared. Final shape: {df.shape}")
except FileNotFoundError:
err = f"Data file not found: {file_path}"
logger.error(err)
except Exception as e:
err = f"An unexpected error occurred during data loading/preparation: {e}"
logger.error(err, exc_info=True)
df = None
return df, err


def get_data_summary(df: pd.DataFrame) -> Tuple[Optional[Dict[str, Any]], Optional[str]]:
"""
Generates summary information about the DataFrame.
Args:
df: The input DataFrame.
Returns:
A tuple containing:
- dict | None: Dictionary with summary data ('head', 'tail', 'dtypes', 'missing').
- str | None: Error message, otherwise None.
"""
logger.info("Generating data summary...")
summary = None
err = None
if df is None or df.empty:
return None, "Input DataFrame is empty or None."
try:
summary = {
'head': df.head(),
'tail': df.tail(),
'dtypes': df.dtypes,
'missing': df.isnull().sum()
}
logger.info("Data summary generated.")
except Exception as e:
err = f"Error generating data summary: {e}"
logger.error(err, exc_info=True)
return summary, err


def get_descriptive_stats(df: pd.DataFrame, price_col: str | list[str] = PRICE_COL) -> Tuple[Optional[pd.Series | pd.DataFrame], Optional[str]]:
"""
Calculates descriptive statistics for specified column(s).
Args:
df: The input DataFrame.
price_col: The name of the column (or list of columns) for stats.
Defaults to the standard 'Price' column.
Returns:
A tuple containing:
- pd.Series | pd.DataFrame | None: Series/DataFrame with descriptive statistics.
- str | None: Error message, otherwise None.
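
    Example (sketch; works on a DataFrame from load_and_prepare_data):
        >>> stats, err = get_descriptive_stats(df)            # default 'Price' column
        >>> stats, err = get_descriptive_stats(df, ["Price"]) # list -> DataFrame of stats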
"""
logger.info(f"Calculating descriptive statistics for column(s): '{price_col}'...")
stats = None
err = None
if df is None or df.empty:
return None, "Input DataFrame is empty or None."
try:
# Check if the target column(s) exist
target_cols = [price_col] if isinstance(price_col, str) else price_col
missing_cols = [col for col in target_cols if col not in df.columns]
if missing_cols:
err = f"Column(s) not found in DataFrame: {', '.join(missing_cols)}."
logger.error(err)
return None, err
stats = df[price_col].describe() # .describe() works on Series and DataFrame
logger.info("Descriptive statistics calculated.")
except Exception as e:
err = f"Error calculating descriptive statistics: {e}"
logger.error(err, exc_info=True)
return stats, err
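

if __name__ == "__main__":
    # Minimal usage sketch for a manual run. The CSV path below is hypothetical;
    # point it at an ENTSO-E-style day-ahead price export to exercise the pipeline.
    logging.basicConfig(level=logging.INFO)
    example_path = Path("data/day_ahead_prices.csv")  # hypothetical path
    df, err = load_and_prepare_data(example_path)
    if err is not None:
        raise SystemExit(err)
    summary, _ = get_data_summary(df)
    if summary is not None:
        print(summary['head'])
        print(summary['missing'])
    stats, _ = get_descriptive_stats(df)
    if stats is not None:
        print(stats)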