import logging
from pathlib import Path
from typing import Any, Dict, Optional, Tuple

import pandas as pd

from data_analysis.utils.data_config_model import settings

logger = logging.getLogger(__name__)

# Constants for column names used during raw loading
TIME_COL_RAW = "MTU (CET/CEST)"
PRICE_COL_RAW = "Day-ahead Price [EUR/MWh]"
PRICE_COL = "Price"  # Standardized column name after processing


def load_and_prepare_data(file_path: Path) -> Tuple[Optional[pd.DataFrame], Optional[str]]:
    """
    Loads the energy price CSV data, parses the time column, sets a
    DatetimeIndex, standardizes the price column name, checks the index
    frequency, and handles missing values.

    Args:
        file_path: Path to the input CSV file.

    Returns:
        A tuple containing:
        - pd.DataFrame | None: Processed DataFrame with a DatetimeIndex and a
          'Price' column. May include other columns if they exist in the source.
        - str | None: Error message if loading fails, otherwise None.
    """
    logger.info(f"Attempting to load data from: {file_path.resolve()}")
    err = None
    df = None
    try:
        # Load data, assuming the header is on the first row
        df = pd.read_csv(file_path, header=0)

        # Basic check for expected columns
        if TIME_COL_RAW not in df.columns or PRICE_COL_RAW not in df.columns:
            err = f"Missing expected columns '{TIME_COL_RAW}' or '{PRICE_COL_RAW}' in {file_path}"
            logger.error(err)
            return None, err

        # --- Time Parsing ---
        # The raw MTU column holds intervals (e.g. "01.01.2024 00:00 - 01.01.2024 01:00");
        # keep the interval start as the timestamp.
        df['StartTime'] = df[TIME_COL_RAW].str.split(' - ', expand=True)[0]
        df['Timestamp'] = pd.to_datetime(df['StartTime'], format='%d.%m.%Y %H:%M', errors='coerce')
        original_len = len(df)
        df = df.dropna(subset=['Timestamp'])
        if len(df) < original_len:
            logger.warning(f"Dropped {original_len - len(df)} rows due to timestamp parsing errors.")

        # --- Set Index and Select Columns ---
        df = df.set_index('Timestamp')
        # Convert the price column to numeric, coercing errors to NaN
        df[PRICE_COL] = pd.to_numeric(df[PRICE_COL_RAW], errors='coerce')
        # Keep the price column and any other potential exogenous columns;
        # drop the raw time/price columns. Adapt if exogenous data is needed.
        cols_to_keep = [PRICE_COL] + [
            col for col in df.columns
            if col not in [TIME_COL_RAW, PRICE_COL_RAW, 'StartTime', PRICE_COL]
        ]
        df = df[cols_to_keep].copy()

        # --- Handle Missing Prices ---
        missing_prices = df[PRICE_COL].isnull().sum()
        if missing_prices > 0:
            logger.warning(f"Found {missing_prices} missing '{PRICE_COL}' values. Forward-filling (ffill).")
            df[PRICE_COL] = df[PRICE_COL].ffill()
            if df[PRICE_COL].isnull().any():
                logger.warning("Missing values remain after ffill. Backward-filling (bfill).")
                df[PRICE_COL] = df[PRICE_COL].bfill()

        # --- Deduplicate Timestamps ---
        # Must happen before asfreq(), which requires a unique index.
        df = df.sort_index()
        duplicates = df.index.duplicated().sum()
        if duplicates > 0:
            logger.warning(f"Found {duplicates} duplicate timestamps. Keeping the first occurrence.")
            df = df[~df.index.duplicated(keep='first')]

        # --- Check Time Index Frequency ---
        inferred_freq = pd.infer_freq(df.index)
        if inferred_freq == settings.expected_data_frequency:
            logger.info(
                f"Inferred index frequency matches the expected "
                f"'{settings.expected_data_frequency}'. Setting index frequency to '{inferred_freq}'."
            )
            df = df.asfreq(inferred_freq)
            missing_after_asfreq = df[PRICE_COL].isnull().sum()
            if missing_after_asfreq > 0:
                logger.warning(
                    f"{missing_after_asfreq} NaNs appeared after setting the frequency "
                    f"(gaps in the index). Filling with ffill/bfill."
                )
                df[PRICE_COL] = df[PRICE_COL].ffill().bfill()
        elif inferred_freq:
            logger.warning(
                f"Inferred frequency is '{inferred_freq}', not the expected "
                f"'{settings.expected_data_frequency}'. Proceeding without setting frequency."
            )
        else:
            logger.warning(
                "Could not infer frequency. Check data for gaps or irregularities. "
                "Proceeding without setting frequency."
            )

        logger.info(f"Data loaded and prepared. Final shape: {df.shape}")

    except FileNotFoundError:
        err = f"Data file not found: {file_path}"
        logger.error(err)
    except Exception as e:
        err = f"An unexpected error occurred during data loading/preparation: {e}"
        logger.error(err, exc_info=True)
        df = None

    return df, err


def get_data_summary(df: pd.DataFrame) -> Tuple[Optional[Dict[str, Any]], Optional[str]]:
    """
    Generates summary information about the DataFrame.

    Args:
        df: The input DataFrame.

    Returns:
        A tuple containing:
        - dict | None: Dictionary with summary data ('head', 'tail', 'dtypes', 'missing').
        - str | None: Error message, otherwise None.
    """
    logger.info("Generating data summary...")
    summary = None
    err = None
    if df is None or df.empty:
        return None, "Input DataFrame is empty or None."
    try:
        summary = {
            'head': df.head(),
            'tail': df.tail(),
            'dtypes': df.dtypes,
            'missing': df.isnull().sum(),
        }
        logger.info("Data summary generated.")
    except Exception as e:
        err = f"Error generating data summary: {e}"
        logger.error(err, exc_info=True)
    return summary, err


def get_descriptive_stats(
    df: pd.DataFrame, price_col: str | list[str] = PRICE_COL
) -> Tuple[Optional[pd.Series | pd.DataFrame], Optional[str]]:
    """
    Calculates descriptive statistics for the specified column(s).

    Args:
        df: The input DataFrame.
        price_col: The name of the column (or list of columns) for stats.
            Defaults to the standard 'Price' column.

    Returns:
        A tuple containing:
        - pd.Series | pd.DataFrame | None: Descriptive statistics (a Series for
          a single column, a DataFrame for a list of columns).
        - str | None: Error message, otherwise None.
    """
    logger.info(f"Calculating descriptive statistics for column(s): '{price_col}'...")
    stats = None
    err = None
    if df is None or df.empty:
        return None, "Input DataFrame is empty or None."
    try:
        # Check that the target column(s) exist
        target_cols = [price_col] if isinstance(price_col, str) else price_col
        missing_cols = [col for col in target_cols if col not in df.columns]
        if missing_cols:
            err = f"Column(s) not found in DataFrame: {', '.join(missing_cols)}."
            logger.error(err)
            return None, err
        stats = df[price_col].describe()  # .describe() works on Series and DataFrame
        logger.info("Descriptive statistics calculated.")
    except Exception as e:
        err = f"Error calculating descriptive statistics: {e}"
        logger.error(err, exc_info=True)
    return stats, err
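

# --- Usage sketch (illustrative) ---
# A minimal sketch of how these helpers might be chained together when run as
# a script; the CSV path and logging setup below are assumptions for the
# example, not part of this module's API.
if __name__ == "__main__":
    logging.basicConfig(level=logging.INFO)

    sample_path = Path("data/raw/day_ahead_prices.csv")  # hypothetical path
    df, load_err = load_and_prepare_data(sample_path)
    if load_err or df is None:
        raise SystemExit(load_err or "Loading failed.")

    summary, _ = get_data_summary(df)
    if summary:
        print(summary['head'])

    stats, _ = get_descriptive_stats(df)
    if stats is not None:
        print(stats)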