init
This commit is contained in:
168
data_analysis/io/data_handling.py
Normal file
168
data_analysis/io/data_handling.py
Normal file
@ -0,0 +1,168 @@
|
||||
import logging
|
||||
from pathlib import Path
|
||||
import pandas as pd
|
||||
from typing import Tuple, Optional, Dict, Any
|
||||
|
||||
from data_analysis.utils.config_model import settings
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
# Column names exactly as they appear in the raw input CSV.
TIME_COL_RAW = "MTU (CET/CEST)"
PRICE_COL_RAW = "Day-ahead Price [EUR/MWh]"

# Standardized column name used everywhere after preprocessing.
PRICE_COL = "Price"
|
||||
|
||||
def load_and_prepare_data(file_path: Path) -> Tuple[Optional[pd.DataFrame], Optional[str]]:
    """
    Loads the energy price CSV data, parses the time column, sets a
    DatetimeIndex, renames columns, de-duplicates timestamps, checks
    frequency, and handles missing values.

    Args:
        file_path: Path to the input CSV file.

    Returns:
        A tuple containing:
            - pd.DataFrame: Processed DataFrame with DatetimeIndex and 'Price' column.
              May include other columns if they exist in the source. None on failure.
            - str | None: Error message if loading fails, otherwise None.
    """
    logger.info(f"Attempting to load data from: {file_path.resolve()}")
    err = None
    df = None
    try:
        # Load data, assuming header is on the first row
        df = pd.read_csv(file_path, header=0)

        # Basic check for expected columns
        if TIME_COL_RAW not in df.columns or PRICE_COL_RAW not in df.columns:
            err = f"Missing expected columns '{TIME_COL_RAW}' or '{PRICE_COL_RAW}' in {file_path}"
            logger.error(err)
            return None, err

        # --- Time Parsing ---
        # Raw values are interval strings ("<start> - <end>"); keep the interval start.
        df['StartTime'] = df[TIME_COL_RAW].str.split(' - ', expand=True)[0]
        df['Timestamp'] = pd.to_datetime(df['StartTime'], format='%d.%m.%Y %H:%M', errors='coerce')

        original_len = len(df)
        df = df.dropna(subset=['Timestamp'])
        if len(df) < original_len:
            logger.warning(f"Dropped {original_len - len(df)} rows due to timestamp parsing errors.")

        # --- Set Index and Select Columns ---
        df = df.set_index('Timestamp')
        # Convert price column to numeric, coercing errors
        df[PRICE_COL] = pd.to_numeric(df[PRICE_COL_RAW], errors='coerce')

        # Keep the price column and any other potential exogenous columns
        # For now, just keep PRICE_COL, drop raw ones. Adapt if exog needed.
        cols_to_keep = [PRICE_COL] + [col for col in df.columns if col not in [TIME_COL_RAW, PRICE_COL_RAW, 'StartTime', PRICE_COL]]
        df = df[cols_to_keep].copy()

        # --- Handle Missing Prices ---
        missing_prices = df[PRICE_COL].isnull().sum()
        if missing_prices > 0:
            logger.warning(f"Found {missing_prices} missing '{PRICE_COL}' values. Forward-filling (ffill).")
            df[PRICE_COL] = df[PRICE_COL].ffill()
            if df[PRICE_COL].isnull().any():
                logger.warning("Missing values remain after ffill. Backward-filling (bfill).")
                df[PRICE_COL] = df[PRICE_COL].bfill()

        # --- Sort and De-duplicate ---
        # BUGFIX: duplicates must be removed BEFORE frequency handling.
        # pd.infer_freq cannot infer a frequency on a duplicated index, and
        # asfreq raises on duplicate labels, so the old post-asfreq dedup
        # defeated the frequency check whenever duplicates were present.
        df = df.sort_index()
        duplicates = df.index.duplicated().sum()
        if duplicates > 0:
            logger.warning(f"Found {duplicates} duplicate timestamps. Keeping the first occurrence.")
            df = df[~df.index.duplicated(keep='first')]

        # --- Check Time Index Frequency ---
        inferred_freq = pd.infer_freq(df.index)
        if inferred_freq == settings.expected_data_frequency:
            logger.info(f"Inferred index frequency matches the expected '{settings.expected_data_frequency}': ({inferred_freq}). Setting frequency as {inferred_freq}.")
            # BUGFIX: use the inferred frequency rather than a hard-coded 'h'.
            # The old code forced hourly even when the configured expectation
            # (and the log message above) referred to a different frequency.
            df = df.asfreq(inferred_freq)
            missing_after_asfreq = df[PRICE_COL].isnull().sum()
            if missing_after_asfreq > 0:
                logger.warning(f"{missing_after_asfreq} NaNs appeared after setting frequency to '{inferred_freq}'. Forward-filling again.")
                df[PRICE_COL] = df[PRICE_COL].ffill().bfill()
        elif inferred_freq:
            logger.warning(f"Inferred frequency is '{inferred_freq}', not the expected '{settings.expected_data_frequency}'. Proceeding without setting frequency.")
        else:
            logger.warning("Could not infer frequency. Check data for gaps or irregularities. Proceeding without setting frequency.")

        logger.info(f"Data loaded and prepared. Final shape: {df.shape}")

    except FileNotFoundError:
        err = f"Data file not found: {file_path}"
        logger.error(err)
    except Exception as e:
        err = f"An unexpected error occurred during data loading/preparation: {e}"
        logger.error(err, exc_info=True)
        df = None

    return df, err
|
||||
|
||||
|
||||
def get_data_summary(df: pd.DataFrame) -> Tuple[Optional[Dict[str, Any]], Optional[str]]:
    """
    Generates summary information about the DataFrame.

    Args:
        df: The input DataFrame.

    Returns:
        A tuple containing:
            - dict | None: Dictionary with summary data ('head', 'tail', 'dtypes', 'missing').
            - str | None: Error message, otherwise None.
    """
    logger.info("Generating data summary...")
    summary, err = None, None

    # Guard clause: nothing to summarize.
    if df is None or df.empty:
        return None, "Input DataFrame is empty or None."

    try:
        summary = {
            'head': df.head(),
            'tail': df.tail(),
            'dtypes': df.dtypes,
            'missing': df.isnull().sum(),
        }
    except Exception as exc:
        err = f"Error generating data summary: {exc}"
        logger.error(err, exc_info=True)
    else:
        logger.info("Data summary generated.")

    return summary, err
|
||||
|
||||
|
||||
def get_descriptive_stats(df: pd.DataFrame, price_col: str = PRICE_COL) -> Tuple[Optional[pd.Series | pd.DataFrame], Optional[str]]:
    """
    Calculates descriptive statistics for specified column(s).

    Args:
        df: The input DataFrame.
        price_col: The name of the column (or list of columns) for stats.
            Defaults to the standard 'Price' column.

    Returns:
        A tuple containing:
            - pd.Series | pd.DataFrame | None: Series/DataFrame with descriptive statistics.
            - str | None: Error message, otherwise None.
    """
    logger.info(f"Calculating descriptive statistics for column(s): '{price_col}'...")
    stats, err = None, None

    # Guard clause: nothing to describe.
    if df is None or df.empty:
        return None, "Input DataFrame is empty or None."

    try:
        # Normalize to a list so a single name and a list of names share
        # one existence check.
        wanted = [price_col] if isinstance(price_col, str) else price_col
        absent = [col for col in wanted if col not in df.columns]
        if absent:
            err = f"Column(s) not found in DataFrame: {', '.join(absent)}."
            logger.error(err)
            return None, err

        stats = df[price_col].describe()  # .describe() works on Series and DataFrame
        logger.info("Descriptive statistics calculated.")
    except Exception as exc:
        err = f"Error calculating descriptive statistics: {exc}"
        logger.error(err, exc_info=True)

    return stats, err
|
Reference in New Issue
Block a user