# entrix_case_challange/data_analysis/io/data_handling.py
import logging
from pathlib import Path
import pandas as pd
from typing import Tuple, Optional, Dict, Any
from data_analysis.utils.data_config_model import settings
logger = logging.getLogger(__name__)
# Define constants for column names related to raw loading
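# These match the headers of ENTSO-E-style day-ahead price CSV exports.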
TIME_COL_RAW = "MTU (CET/CEST)"
PRICE_COL_RAW = "Day-ahead Price [EUR/MWh]"
PRICE_COL = "Price" # Standardized column name after processing


def load_and_prepare_data(file_path: Path) -> Tuple[Optional[pd.DataFrame], Optional[str]]:
"""
    Loads the energy price CSV, parses the time column, sets a DatetimeIndex,
    standardizes column names, drops duplicate timestamps, checks the index
    frequency, and handles missing values.
Args:
file_path: Path to the input CSV file.
Returns:
A tuple containing:
- pd.DataFrame: Processed DataFrame with DatetimeIndex and 'Price' column.
May include other columns if they exist in the source.
- str | None: Error message if loading fails, otherwise None.
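
    Example (a minimal sketch; the CSV path is hypothetical):
        >>> df, err = load_and_prepare_data(Path("data/day_ahead_prices.csv"))
        >>> if err is not None:
        ...     raise RuntimeError(err)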
"""
logger.info(f"Attempting to load data from: {file_path.resolve()}")
err = None
df = None
try:
# Load data, assuming header is on the first row
df = pd.read_csv(file_path, header=0)
# Basic check for expected columns
if TIME_COL_RAW not in df.columns or PRICE_COL_RAW not in df.columns:
err = f"Missing expected columns '{TIME_COL_RAW}' or '{PRICE_COL_RAW}' in {file_path}"
logger.error(err)
return None, err
# --- Time Parsing ---
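        # The raw MTU column holds delivery intervals like
        # "01.01.2023 00:00 - 01.01.2023 01:00"; keep only the interval start.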
df['StartTime'] = df[TIME_COL_RAW].str.split(' - ', expand=True)[0]
df['Timestamp'] = pd.to_datetime(df['StartTime'], format='%d.%m.%Y %H:%M', errors='coerce')
original_len = len(df)
df = df.dropna(subset=['Timestamp'])
if len(df) < original_len:
logger.warning(f"Dropped {original_len - len(df)} rows due to timestamp parsing errors.")
# --- Set Index and Select Columns ---
df = df.set_index('Timestamp')
# Convert price column to numeric, coercing errors
df[PRICE_COL] = pd.to_numeric(df[PRICE_COL_RAW], errors='coerce')
        # Keep the standardized price column plus any other potential exogenous
        # columns; drop the raw time/price columns and the intermediate 'StartTime'.
cols_to_keep = [PRICE_COL] + [col for col in df.columns if col not in [TIME_COL_RAW, PRICE_COL_RAW, 'StartTime', PRICE_COL]]
df = df[cols_to_keep].copy()
# --- Handle Missing Prices ---
missing_prices = df[PRICE_COL].isnull().sum()
if missing_prices > 0:
logger.warning(f"Found {missing_prices} missing '{PRICE_COL}' values. Forward-filling (ffill).")
df[PRICE_COL] = df[PRICE_COL].ffill()
if df[PRICE_COL].isnull().any():
logger.warning("Missing values remain after ffill. Backward-filling (bfill).")
df[PRICE_COL] = df[PRICE_COL].bfill()
        # --- Check Time Index Frequency ---
        df = df.sort_index()
        # Drop duplicate timestamps first: asfreq() reindexes under the hood and
        # raises on a non-unique index, and duplicates also break freq inference.
        duplicates = df.index.duplicated().sum()
        if duplicates > 0:
            logger.warning(f"Found {duplicates} duplicate timestamps. Keeping the first occurrence.")
            df = df[~df.index.duplicated(keep='first')]
        inferred_freq = pd.infer_freq(df.index)
        if inferred_freq == settings.expected_data_frequency:
            logger.info(f"Inferred index frequency matches the expected '{settings.expected_data_frequency}'. Setting it on the index.")
            df = df.asfreq(settings.expected_data_frequency)
            missing_after_asfreq = df[PRICE_COL].isnull().sum()
            if missing_after_asfreq > 0:
                logger.warning(f"{missing_after_asfreq} NaNs appeared after setting the frequency to '{settings.expected_data_frequency}'. Filling again (ffill, then bfill).")
                df[PRICE_COL] = df[PRICE_COL].ffill().bfill()
        elif inferred_freq:
            logger.warning(f"Inferred frequency is '{inferred_freq}', not the expected '{settings.expected_data_frequency}'. Proceeding without setting frequency.")
        else:
            logger.warning("Could not infer frequency. Check data for gaps or irregularities. Proceeding without setting frequency.")
logger.info(f"Data loaded and prepared. Final shape: {df.shape}")
except FileNotFoundError:
err = f"Data file not found: {file_path}"
logger.error(err)
except Exception as e:
err = f"An unexpected error occurred during data loading/preparation: {e}"
logger.error(err, exc_info=True)
df = None
return df, err


def get_data_summary(df: pd.DataFrame) -> Tuple[Optional[Dict[str, Any]], Optional[str]]:
"""
Generates summary information about the DataFrame.
Args:
df: The input DataFrame.
Returns:
A tuple containing:
- dict | None: Dictionary with summary data ('head', 'tail', 'dtypes', 'missing').
- str | None: Error message, otherwise None.
"""
logger.info("Generating data summary...")
summary = None
err = None
if df is None or df.empty:
return None, "Input DataFrame is empty or None."
try:
summary = {
'head': df.head(),
'tail': df.tail(),
'dtypes': df.dtypes,
'missing': df.isnull().sum()
}
logger.info("Data summary generated.")
except Exception as e:
err = f"Error generating data summary: {e}"
logger.error(err, exc_info=True)
return summary, err


def get_descriptive_stats(df: pd.DataFrame, price_col: str | list[str] = PRICE_COL) -> Tuple[Optional[pd.Series | pd.DataFrame], Optional[str]]:
"""
Calculates descriptive statistics for specified column(s).
Args:
df: The input DataFrame.
price_col: The name of the column (or list of columns) for stats.
Defaults to the standard 'Price' column.
Returns:
A tuple containing:
- pd.Series | pd.DataFrame | None: Series/DataFrame with descriptive statistics.
- str | None: Error message, otherwise None.
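
    Example (sketch; works on a DataFrame from load_and_prepare_data):
        >>> stats, err = get_descriptive_stats(df)            # default 'Price' column
        >>> stats, err = get_descriptive_stats(df, ["Price"]) # list -> DataFrame of stats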
"""
logger.info(f"Calculating descriptive statistics for column(s): '{price_col}'...")
stats = None
err = None
if df is None or df.empty:
return None, "Input DataFrame is empty or None."
try:
# Check if the target column(s) exist
target_cols = [price_col] if isinstance(price_col, str) else price_col
missing_cols = [col for col in target_cols if col not in df.columns]
if missing_cols:
err = f"Column(s) not found in DataFrame: {', '.join(missing_cols)}."
logger.error(err)
return None, err
stats = df[price_col].describe() # .describe() works on Series and DataFrame
logger.info("Descriptive statistics calculated.")
except Exception as e:
err = f"Error calculating descriptive statistics: {e}"
logger.error(err, exc_info=True)
return stats, err
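

if __name__ == "__main__":
    # Minimal usage sketch for a manual run. The CSV path below is hypothetical;
    # point it at an ENTSO-E-style day-ahead price export to exercise the pipeline.
    logging.basicConfig(level=logging.INFO)
    example_path = Path("data/day_ahead_prices.csv")  # hypothetical path
    df, err = load_and_prepare_data(example_path)
    if err is not None:
        raise SystemExit(err)
    summary, _ = get_data_summary(df)
    if summary is not None:
        print(summary['head'])
        print(summary['missing'])
    stats, _ = get_descriptive_stats(df)
    if stats is not None:
        print(stats)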