init
This commit is contained in:
168
data_analysis/io/data_handling.py
Normal file
168
data_analysis/io/data_handling.py
Normal file
@ -0,0 +1,168 @@
|
||||
import logging
|
||||
from pathlib import Path
|
||||
import pandas as pd
|
||||
from typing import Tuple, Optional, Dict, Any
|
||||
|
||||
from data_analysis.utils.config_model import settings
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
# Column names exactly as they appear in the raw input CSV.
TIME_COL_RAW = "MTU (CET/CEST)"
PRICE_COL_RAW = "Day-ahead Price [EUR/MWh]"

# Standardized column name used everywhere after preprocessing.
PRICE_COL = "Price"
|
||||
|
||||
def load_and_prepare_data(file_path: Path) -> Tuple[Optional[pd.DataFrame], Optional[str]]:
    """
    Loads the energy price CSV data, parses the time column, sets a
    DatetimeIndex, renames columns, de-duplicates timestamps, checks
    frequency, and handles missing values.

    Args:
        file_path: Path to the input CSV file.

    Returns:
        A tuple containing:
            - pd.DataFrame: Processed DataFrame with DatetimeIndex and 'Price' column.
              May include other columns if they exist in the source. None on failure.
            - str | None: Error message if loading fails, otherwise None.
    """
    logger.info(f"Attempting to load data from: {file_path.resolve()}")
    err = None
    df = None
    try:
        # Load data, assuming header is on the first row
        df = pd.read_csv(file_path, header=0)

        # Basic check for expected columns
        if TIME_COL_RAW not in df.columns or PRICE_COL_RAW not in df.columns:
            err = f"Missing expected columns '{TIME_COL_RAW}' or '{PRICE_COL_RAW}' in {file_path}"
            logger.error(err)
            return None, err

        # --- Time Parsing ---
        # Raw values are interval strings ("<start> - <end>"); keep the interval start.
        df['StartTime'] = df[TIME_COL_RAW].str.split(' - ', expand=True)[0]
        df['Timestamp'] = pd.to_datetime(df['StartTime'], format='%d.%m.%Y %H:%M', errors='coerce')

        original_len = len(df)
        df = df.dropna(subset=['Timestamp'])
        if len(df) < original_len:
            logger.warning(f"Dropped {original_len - len(df)} rows due to timestamp parsing errors.")

        # --- Set Index and Select Columns ---
        df = df.set_index('Timestamp')
        # Convert price column to numeric, coercing errors
        df[PRICE_COL] = pd.to_numeric(df[PRICE_COL_RAW], errors='coerce')

        # Keep the price column and any other potential exogenous columns
        # For now, just keep PRICE_COL, drop raw ones. Adapt if exog needed.
        cols_to_keep = [PRICE_COL] + [col for col in df.columns if col not in [TIME_COL_RAW, PRICE_COL_RAW, 'StartTime', PRICE_COL]]
        df = df[cols_to_keep].copy()

        # --- Handle Missing Prices ---
        missing_prices = df[PRICE_COL].isnull().sum()
        if missing_prices > 0:
            logger.warning(f"Found {missing_prices} missing '{PRICE_COL}' values. Forward-filling (ffill).")
            df[PRICE_COL] = df[PRICE_COL].ffill()
            if df[PRICE_COL].isnull().any():
                logger.warning("Missing values remain after ffill. Backward-filling (bfill).")
                df[PRICE_COL] = df[PRICE_COL].bfill()

        # --- Sort and De-duplicate ---
        # BUGFIX: duplicates must be removed BEFORE frequency handling.
        # pd.infer_freq cannot infer a frequency on a duplicated index, and
        # asfreq raises on duplicate labels, so the old post-asfreq dedup
        # defeated the frequency check whenever duplicates were present.
        df = df.sort_index()
        duplicates = df.index.duplicated().sum()
        if duplicates > 0:
            logger.warning(f"Found {duplicates} duplicate timestamps. Keeping the first occurrence.")
            df = df[~df.index.duplicated(keep='first')]

        # --- Check Time Index Frequency ---
        inferred_freq = pd.infer_freq(df.index)
        if inferred_freq == settings.expected_data_frequency:
            logger.info(f"Inferred index frequency matches the expected '{settings.expected_data_frequency}': ({inferred_freq}). Setting frequency as {inferred_freq}.")
            # BUGFIX: use the inferred frequency rather than a hard-coded 'h'.
            # The old code forced hourly even when the configured expectation
            # (and the log message above) referred to a different frequency.
            df = df.asfreq(inferred_freq)
            missing_after_asfreq = df[PRICE_COL].isnull().sum()
            if missing_after_asfreq > 0:
                logger.warning(f"{missing_after_asfreq} NaNs appeared after setting frequency to '{inferred_freq}'. Forward-filling again.")
                df[PRICE_COL] = df[PRICE_COL].ffill().bfill()
        elif inferred_freq:
            logger.warning(f"Inferred frequency is '{inferred_freq}', not the expected '{settings.expected_data_frequency}'. Proceeding without setting frequency.")
        else:
            logger.warning("Could not infer frequency. Check data for gaps or irregularities. Proceeding without setting frequency.")

        logger.info(f"Data loaded and prepared. Final shape: {df.shape}")

    except FileNotFoundError:
        err = f"Data file not found: {file_path}"
        logger.error(err)
    except Exception as e:
        err = f"An unexpected error occurred during data loading/preparation: {e}"
        logger.error(err, exc_info=True)
        df = None

    return df, err
|
||||
|
||||
|
||||
def get_data_summary(df: pd.DataFrame) -> Tuple[Optional[Dict[str, Any]], Optional[str]]:
    """
    Generates summary information about the DataFrame.

    Args:
        df: The input DataFrame.

    Returns:
        A tuple containing:
            - dict | None: Dictionary with summary data ('head', 'tail', 'dtypes', 'missing').
            - str | None: Error message, otherwise None.
    """
    logger.info("Generating data summary...")
    summary, err = None, None

    # Guard clause: nothing to summarize.
    if df is None or df.empty:
        return None, "Input DataFrame is empty or None."

    try:
        summary = {
            'head': df.head(),
            'tail': df.tail(),
            'dtypes': df.dtypes,
            'missing': df.isnull().sum(),
        }
    except Exception as exc:
        err = f"Error generating data summary: {exc}"
        logger.error(err, exc_info=True)
    else:
        logger.info("Data summary generated.")

    return summary, err
|
||||
|
||||
|
||||
def get_descriptive_stats(df: pd.DataFrame, price_col: str = PRICE_COL) -> Tuple[Optional[pd.Series | pd.DataFrame], Optional[str]]:
    """
    Calculates descriptive statistics for specified column(s).

    Args:
        df: The input DataFrame.
        price_col: The name of the column (or list of columns) for stats.
            Defaults to the standard 'Price' column.

    Returns:
        A tuple containing:
            - pd.Series | pd.DataFrame | None: Series/DataFrame with descriptive statistics.
            - str | None: Error message, otherwise None.
    """
    logger.info(f"Calculating descriptive statistics for column(s): '{price_col}'...")
    stats, err = None, None

    # Guard clause: nothing to describe.
    if df is None or df.empty:
        return None, "Input DataFrame is empty or None."

    try:
        # Normalize to a list so a single name and a list of names share
        # one existence check.
        wanted = [price_col] if isinstance(price_col, str) else price_col
        absent = [col for col in wanted if col not in df.columns]
        if absent:
            err = f"Column(s) not found in DataFrame: {', '.join(absent)}."
            logger.error(err)
            return None, err

        stats = df[price_col].describe()  # .describe() works on Series and DataFrame
        logger.info("Descriptive statistics calculated.")
    except Exception as exc:
        err = f"Error calculating descriptive statistics: {exc}"
        logger.error(err, exc_info=True)

    return stats, err
|
Reference in New Issue
Block a user