2025-05-02 10:45:06 +02:00
commit 7c9d809a82
29 changed files with 2931 additions and 0 deletions

View File

@@ -0,0 +1,126 @@
import logging
import pandas as pd
from typing import Tuple, Optional, Dict, Any
import warnings
from statsmodels.tools.sm_exceptions import InterpolationWarning
# Import analysis tools - ensure statsmodels is installed
from statsmodels.tsa.seasonal import seasonal_decompose, DecomposeResult
from statsmodels.tsa.stattools import adfuller, kpss
logger = logging.getLogger(__name__)
# PRICE_COL constant moved to io.data_handling
def perform_decomposition(series: pd.Series, model: str = 'additive', period: int = 24) -> Tuple[Optional[DecomposeResult], Optional[str]]:
"""
Performs time series decomposition using statsmodels.
Args:
series: The time series data (e.g., df['Price']).
model: Type of decomposition ('additive' or 'multiplicative').
period: The period of the seasonality.
Returns:
A tuple containing:
- DecomposeResult | None: The decomposition result object.
- str | None: Error message, otherwise None.
"""
logger.info(f"Performing {model} decomposition with period {period}...")
result = None
err = None
# Check if series is empty or None before proceeding
if series is None or series.empty:
err = "Input series for decomposition is empty or None."
logger.error(err)
return None, err
try:
if len(series) < 2 * period:
err = f"Series is too short for decomposition with period {period} (length {len(series)})."
logger.error(err)
return None, err
# Ensure Series has a DatetimeIndex with frequency for extrapolate_trend
if not isinstance(series.index, pd.DatetimeIndex) or series.index.freq is None:
logger.warning("Series index is not a DatetimeIndex with frequency. Decomposition might be less reliable.")
# Consider removing extrapolate_trend or handling differently if freq is often missing
result = seasonal_decompose(series, model=model, period=period)
else:
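# extrapolate_trend='freq' extrapolates the trend component to the series edges,
# so the trend and residual components do not start and end with NaNs.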
result = seasonal_decompose(series, model=model, period=period, extrapolate_trend='freq')
logger.info("Decomposition successful.")
except ValueError as ve:
# Catch specific ValueError often related to NaNs or period issues
err = f"ValueError during decomposition (check for NaNs or period > series length/2): {ve}"
logger.error(err, exc_info=True)
except Exception as e:
err = f"Error during decomposition: {e}"
logger.error(err, exc_info=True)
return result, err
def perform_stationarity_tests(series: pd.Series) -> Tuple[Optional[Dict[str, Any]], Optional[str]]:
"""
Performs ADF and KPSS stationarity tests.
Args:
series: The time series to test (often residuals or differenced series).
Returns:
A tuple containing:
- dict | None: Dictionary containing test results ('adf', 'kpss').
- str | None: Error message, otherwise None.
"""
logger.info("Performing stationarity tests (ADF, KPSS)...")
results = {}
err = None
# Check if series is empty or None
if series is None or series.empty:
err = "Input series for stationarity tests is empty or None."
logger.error(err)
return None, err
# Check for NaNs
if series.isnull().any():
err = "Input series contains NaNs. Please handle missing values before testing stationarity."
logger.error(err)
return None, err
try:
# ADF Test
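# Null hypothesis of ADF: the series has a unit root (non-stationary);
# a small p-value is evidence in favour of stationarity.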
adf_test = adfuller(series, autolag='AIC')
adf_keys = ['Test Statistic',
'p-value',
'#Lags Used',
'#Observations Used',
'Critical Values',
'IC Best' # Returned when autolag is used
]
# Only map existing keys from result tuple
results['adf'] = {key: val for key, val in zip(adf_keys, adf_test) if key != 'IC Best'}
# Add IC Best separately if it exists
if len(adf_test) > 5: results['adf']['IC Best'] = adf_test[5]
logger.debug(f"ADF Test Results: {results['adf']}")
# KPSS Test (common to test for level stationarity 'c')
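# Null hypothesis of KPSS: the series is level-stationary; a small p-value suggests
# non-stationarity. Running ADF and KPSS together gives a more robust picture.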
with warnings.catch_warnings(): # Suppress known KPSS p-value interpolation warnings
warnings.filterwarnings("ignore", category=InterpolationWarning)
kpss_test = kpss(series, regression='c', nlags="auto")
kpss_keys = ['Test Statistic',
'p-value',
'#Lags Used',
'Critical Values'
]
results['kpss'] = {key: val for key, val in zip(kpss_keys, kpss_test)}
# Handle potential p-value bounds reported as strings
if isinstance(results['kpss']['p-value'], str):
logger.warning(f"KPSS p-value reported as bounds: {results['kpss']['p-value']}")
logger.debug(f"KPSS Test Results: {results['kpss']}")
logger.info("Stationarity tests completed.")
except Exception as e:
err = f"Error performing stationarity tests: {e}"
logger.error(err, exc_info=True)
results = None
return results, err
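# Example usage (sketch; assumes `df` is an hourly DataFrame with a 'Price' column):
#   decomp, err = perform_decomposition(df['Price'], model='additive', period=24)
#   if decomp is not None:
#       residuals = decomp.resid.dropna()
#       test_results, err = perform_stationarity_tests(residuals)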

View File

@@ -0,0 +1,387 @@
import logging
from pathlib import Path
import pandas as pd
import json
from typing import Optional, Dict, List, Any
# Use utils for config if that's the structure
from data_analysis.utils.config_model import settings
import datetime
logger = logging.getLogger(__name__)
# --- Import data handling functions from io ---
from data_analysis.io.data_handling import (
load_and_prepare_data,
get_data_summary,
get_descriptive_stats,
PRICE_COL, # Standardized price column name
PRICE_COL_RAW # Raw price column name (needed for check below)
)
# --- Import analysis functions from analysis ---
from .data import (
perform_decomposition,
perform_stationarity_tests,
)
# --- Import plotting functions ---
from data_analysis.io.plotting import (
plot_full_time_series,
plot_zoomed_time_series,
plot_boxplot_by_period,
plot_histogram,
plot_decomposition as plot_decomposition_results, # Rename to avoid clash
plot_residuals,
plot_acf_pacf,
plot_seasonal_subseries,
plot_cross_correlation,
plot_weekly_autocorrelation
)
# --- Import report generator ---
from ..io.report import generate_latex_report
from data_analysis.utils.report_model import ReportData
# --- Modified Pipeline Function ---
def run_eda_pipeline():
"""
Orchestrates the Exploratory Data Analysis process using loaded settings
and generates a LaTeX report.
"""
logger.info("Starting Exploratory Data Analysis Pipeline (LaTeX Report)...")
output_dir = settings.output_dir
plots_dir = output_dir / "plots" # Define plots subdirectory
# Ensure output directories exist
try:
output_dir.mkdir(parents=True, exist_ok=True)
plots_dir.mkdir(parents=True, exist_ok=True)
logger.info(f"Output directory set to: {output_dir.resolve()}")
logger.info(f"Plots directory set to: {plots_dir.resolve()}")
except Exception as e:
logger.error(f"Failed to create output directories: {e}", exc_info=True)
raise SystemExit(1) from e
# --- Data Holders ---
df: Optional[pd.DataFrame] = None
summary_data_dict: Optional[dict] = None
desc_stats_price: Optional[pd.Series] = None
residuals_daily: Optional[pd.Series] = None
residuals_weekly: Optional[pd.Series] = None
stationarity_results_dict: Optional[dict] = None
series_name_stat_tested: Optional[str] = None
# --- Plot Path Collectors ---
other_plot_paths: Dict[str, str] = {}
acf_pacf_plot_paths: Dict[str, str] = {}
decomposition_plot_paths: Dict[str, str] = {}
# --- Pipeline Steps ---
# 1. Load Data
logger.info("--- Step 1: Load Data ---")
# Store initial raw state temporarily to check missing values before preparation
df_raw = pd.read_csv(settings.data_file, header=0)
initial_missing_price = 0
if PRICE_COL_RAW in df_raw.columns:
# Check missing in the raw numeric column before full processing
initial_missing_price = pd.to_numeric(df_raw[PRICE_COL_RAW], errors='coerce').isnull().sum()
else:
logger.warning(f"Raw price column '{PRICE_COL_RAW}' not found for initial missing value check.")
df, err = load_and_prepare_data(settings.data_file)
if err or df is None:
logger.error(f"Data loading failed: {err or 'Unknown error'}. Stopping pipeline.")
raise SystemExit(1)
logger.info(f"Data loaded successfully. Shape: {df.shape}")
logger.info(f"Columns: {', '.join(df.columns)}")
# Construct imputation message based on initial check and final state
imputation_msg = "No missing price values detected."
final_missing_price = df[PRICE_COL].isnull().sum() # Should be 0 after load_and_prepare
if initial_missing_price > 0:
if final_missing_price == 0:
imputation_msg = f"{initial_missing_price} missing price value(s) were detected and imputed (ffill/bfill)."
else:
imputation_msg = f"{initial_missing_price} missing price value(s) were detected, imputation may be incomplete ({final_missing_price} remain)."
elif df.isnull().sum().sum() > final_missing_price: # Check whether other columns still contain NaNs
imputation_msg = "Missing values detected in non-price columns; the price column had no missing values."
# 2. Initial Inspection & Summary Stats
logger.info("--- Step 2: Initial Inspection & Summary ---")
summary_data_dict, err = get_data_summary(df)
summary_file_path = output_dir / "summary_data.txt"
if err:
logger.error(f"Failed to get data summary: {err}")
elif summary_data_dict:
logger.info(f"Saving data summary to {summary_file_path}")
try:
with open(summary_file_path, 'w') as f:
f.write("--- Data Summary ---\n\n")
f.write(f"Data Source: {settings.data_file.name}\n")
f.write(f"Date Range: {df.index.min()} to {df.index.max()}\n")
f.write(f"Number of Points: {len(df)}\n\n")
f.write("First 5 Rows:\n")
f.write(summary_data_dict['head'].to_string())
f.write("\n\nLast 5 Rows:\n")
f.write(summary_data_dict['tail'].to_string())
f.write("\n\nData Types:\n")
f.write(summary_data_dict['dtypes'].to_string())
f.write("\n\nMissing Value Counts (Post Initial Handling):\n") # Updated comment
f.write(summary_data_dict['missing'].to_string())
f.write("\n")
except IOError as e:
logger.error(f"Failed to write data summary to {summary_file_path}: {e}")
# Log summaries as well
logger.info(f"Head:\n{summary_data_dict['head'].to_string()}")
logger.info(f"Tail:\n{summary_data_dict['tail'].to_string()}")
logger.info(f"Data Types:\n{summary_data_dict['dtypes']}")
# Keep it for later
# logger.info(f"Missing Values (Post Initial Handling):\n{summary_data_dict['missing']}")
# Descriptive Stats
desc_stats_price, err = get_descriptive_stats(df, price_col=PRICE_COL)
desc_stats_file_path = output_dir / "descriptive_stats_price.csv" # Make filename specific
if err:
logger.error(f"Failed to get descriptive stats for {PRICE_COL}: {err}")
elif desc_stats_price is not None:
logger.info(f"Saving price descriptive stats to {desc_stats_file_path}")
try:
# Ensure it's a Series before calling to_csv with header=True
if isinstance(desc_stats_price, pd.Series):
desc_stats_price.to_csv(desc_stats_file_path, header=True)
else: # If it returns DataFrame (unlikely for single col but safe)
desc_stats_price.to_csv(desc_stats_file_path)
except IOError as e:
logger.error(f"Failed to write price descriptive stats to {desc_stats_file_path}: {e}")
logger.info(f"Price Descriptive Stats:\n{desc_stats_price.to_string()}")
# 3. Visualizations (Main Price Series)
logger.info("--- Step 3: Visualizations (Price) ---")
plot_name = "01_full_timeseries.png"
err = plot_full_time_series(df, PRICE_COL, plots_dir / plot_name)
if not err: other_plot_paths['full_timeseries'] = plot_name
else: logger.warning(f"Plotting error (full series): {err}")
if settings.zoom_start_date and settings.zoom_end_date:
plot_name = "02_zoomed_timeseries.png"
err = plot_zoomed_time_series(df, PRICE_COL, settings.zoom_start_date, settings.zoom_end_date, plots_dir / plot_name)
if not err: other_plot_paths['zoomed_timeseries'] = plot_name
else: logger.warning(f"Plotting error (zoomed series): {err}")
for period in ['hour', 'dayofweek', 'month', 'year']:
plot_name = f"03_boxplot_{period}.png"
err = plot_boxplot_by_period(df, PRICE_COL, period, plots_dir / plot_name)
if not err: other_plot_paths[f'boxplot_{period}'] = plot_name
else: logger.warning(f"Plotting error (boxplot {period}): {err}")
plot_name = "04_histogram_price.png"
err = plot_histogram(df, PRICE_COL, plots_dir / plot_name)
if not err: other_plot_paths['histogram_price'] = plot_name
else: logger.warning(f"Plotting error (histogram): {err}")
# Optional: Seasonal Subseries Plots
plot_name = "04a_seasonal_subseries_daily.png"
err = plot_seasonal_subseries(df, PRICE_COL, period=24, period_name="Daily", output_path=plots_dir / plot_name)
if not err: other_plot_paths['seasonal_subseries_daily'] = plot_name
else: logger.warning(f"Plotting error (subseries daily): {err}")
if len(df) > 168: # Check if enough data for weekly
plot_name = "04b_seasonal_subseries_weekly.png"
err = plot_seasonal_subseries(df, PRICE_COL, period=168, period_name="Weekly", output_path=plots_dir / plot_name)
if not err: other_plot_paths['seasonal_subseries_weekly'] = plot_name
else: logger.warning(f"Plotting error (subseries weekly): {err}")
# 4. Decomposition
logger.info("--- Step 4: Decomposition ---")
residuals_for_analysis: Optional[pd.Series] = None # Track which residuals to use later
# Daily
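# period=24 targets the daily cycle of the hourly data; period=168 (below) targets the weekly cycle.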
decomp_daily, err = perform_decomposition(df[PRICE_COL], model='additive', period=24)
if err: logger.error(f"Daily decomposition failed: {err}")
elif decomp_daily:
plot_name = "05_decomposition_daily.png"
err = plot_decomposition_results(decomp_daily, "Daily (Period=24)", plots_dir / plot_name)
if not err: decomposition_plot_paths['daily'] = plot_name
else: logger.warning(f"Plotting error (daily decomp): {err}")
residuals_daily = decomp_daily.resid.dropna()
plot_name = "06_residuals_daily.png"
err = plot_residuals(residuals_daily, "Daily Decomp", plots_dir / plot_name)
# Save path regardless of error, report might reference it
other_plot_paths['residuals_daily'] = plot_name
if err: logger.warning(f"Plotting error (daily residuals): {err}")
if not residuals_daily.empty: residuals_for_analysis = residuals_daily # Prefer daily initially
# Weekly
if len(df) >= 168 * 2:
decomp_weekly, err = perform_decomposition(df[PRICE_COL], model='additive', period=168)
if err: logger.error(f"Weekly decomposition failed: {err}")
elif decomp_weekly:
plot_name = "07_decomposition_weekly.png"
err = plot_decomposition_results(decomp_weekly, "Weekly (Period=168)", plots_dir / plot_name)
if not err: decomposition_plot_paths['weekly'] = plot_name
else: logger.warning(f"Plotting error (weekly decomp): {err}")
residuals_weekly = decomp_weekly.resid.dropna()
plot_name = "08_residuals_weekly.png"
err = plot_residuals(residuals_weekly, "Weekly Decomp", plots_dir / plot_name)
other_plot_paths['residuals_weekly'] = plot_name
if err: logger.warning(f"Plotting error (weekly residuals): {err}")
if not residuals_weekly.empty: residuals_for_analysis = residuals_weekly # Prefer weekly if available
else:
logger.warning("Skipping weekly decomposition, data length insufficient.")
# Decide which residuals plot to link in stationarity section
# Guard against the case where both residual series are None (`None is None` would otherwise match)
if residuals_weekly is not None and residuals_for_analysis is residuals_weekly:
other_plot_paths['residuals'] = other_plot_paths.get('residuals_weekly', 'placeholder.png')
series_name_stat_tested = "Weekly Residuals"
elif residuals_daily is not None and residuals_for_analysis is residuals_daily:
other_plot_paths['residuals'] = other_plot_paths.get('residuals_daily', 'placeholder.png')
series_name_stat_tested = "Daily Residuals"
else:
series_name_stat_tested = None # No residuals available for tests
# 5. Stationarity Analysis
logger.info("--- Step 5: Stationarity Analysis ---")
stationarity_file_path = output_dir / "stationarity_tests.json"
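# Note: the tests are run on decomposition residuals rather than the raw price series,
# since the strong daily/weekly seasonality would otherwise dominate the result.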
if series_name_stat_tested and residuals_for_analysis is not None and not residuals_for_analysis.empty:
logger.info(f"Performing tests on: {series_name_stat_tested}")
stationarity_results_dict, err = perform_stationarity_tests(residuals_for_analysis)
if err: logger.error(f"Stationarity tests failed: {err}")
elif stationarity_results_dict:
logger.info(f"Saving stationarity test results to {stationarity_file_path}")
try:
# Convert numpy arrays/types in critical values to lists for JSON serialization
adf_res = stationarity_results_dict.get('adf', {})
kpss_res = stationarity_results_dict.get('kpss', {})
adf_crit = adf_res.get('Critical Values', {})
kpss_crit = kpss_res.get('Critical Values', {})
if isinstance(adf_crit, dict):
adf_res['Critical Values'] = {k: float(v) for k, v in adf_crit.items()}
if isinstance(kpss_crit, dict):
kpss_res['Critical Values'] = {k: float(v) for k, v in kpss_crit.items()}
results_to_save = {
"series_tested": series_name_stat_tested,
"adf": adf_res,
"kpss": kpss_res
}
with open(stationarity_file_path, 'w') as f:
json.dump(results_to_save, f, indent=4)
except (IOError, TypeError) as e:
logger.error(f"Failed to write stationarity results to {stationarity_file_path}: {e}")
# Log key results
logger.info(f"Stationarity Test Results ({series_name_stat_tested}):")
if 'adf' in stationarity_results_dict and stationarity_results_dict['adf']:
logger.info(f" ADF p-value: {stationarity_results_dict['adf'].get('p-value', 'N/A'):.4f}")
if 'kpss' in stationarity_results_dict and stationarity_results_dict['kpss']:
# Handle string p-values from KPSS
kpss_p = stationarity_results_dict['kpss'].get('p-value', 'N/A')
if isinstance(kpss_p, str):
logger.info(f" KPSS p-value: {kpss_p}")
else:
logger.info(f" KPSS p-value: {kpss_p:.4f}")
else:
logger.warning("Skipping Stationarity Analysis as no suitable residual series is available.")
# 6. Autocorrelation Analysis
logger.info("--- Step 6: Autocorrelation Analysis ---")
# Import plot_acf, plot_pacf from statsmodels graphics directly for saving
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf
import matplotlib.pyplot as plt
if series_name_stat_tested and residuals_for_analysis is not None and not residuals_for_analysis.empty:
series_name_acf = series_name_stat_tested.lower().replace(' ','_')
base_name = f"09_{series_name_acf}"
err_acf = None; err_pacf = None
try:
# Create figure and axes explicitly
fig_acf, ax_acf = plt.subplots()
plot_acf(residuals_for_analysis, lags=48, ax=ax_acf, title=f'ACF - {series_name_stat_tested}')
plot_name_acf = f"{base_name}_acf.png"
fig_acf.savefig(plots_dir / plot_name_acf)
plt.close(fig_acf) # Close figure after saving
acf_pacf_plot_paths['acf'] = plot_name_acf
except Exception as e: err_acf = e
try:
# Create figure and axes explicitly
fig_pacf, ax_pacf = plt.subplots()
plot_pacf(residuals_for_analysis, lags=48, ax=ax_pacf, title=f'PACF - {series_name_stat_tested}', method='ywm')
plot_name_pacf = f"{base_name}_pacf.png"
fig_pacf.savefig(plots_dir / plot_name_pacf)
plt.close(fig_pacf) # Close figure after saving
acf_pacf_plot_paths['pacf'] = plot_name_pacf
except Exception as e: err_pacf = e
if err_acf: logger.warning(f"Plotting error (ACF for {series_name_stat_tested}): {err_acf}")
if err_pacf: logger.warning(f"Plotting error (PACF for {series_name_stat_tested}): {err_pacf}")
# Add Weekly Autocorrelation Analysis
try:
plot_name = f"09c_weekly_autocorr_{series_name_acf}.png"
err = plot_weekly_autocorrelation(
series=residuals_for_analysis,
series_name=series_name_stat_tested,
output_path=plots_dir / plot_name,
max_weeks=4
)
if not err: acf_pacf_plot_paths['weekly_autocorr'] = plot_name
else: logger.warning(f"Plotting error (weekly autocorrelation): {err}")
except Exception as e:
logger.warning(f"Error in weekly autocorrelation analysis: {e}")
else:
logger.warning("Skipping Autocorrelation Analysis as no suitable series is available.")
# 7. Exogenous Variable Analysis (if any exist)
logger.info("--- Step 7: Exogenous Variable Analysis ---")
logger.info("--- There are none.... Skipping ---")
# 8. Generate LaTeX Report
logger.info("--- Step 8: Generate LaTeX Report ---")
# --- Determine Decomposition Model and ACF/PACF Lags Used ---
# These are currently hardcoded in the pipeline steps
decomp_model_used = 'additive'
acf_pacf_lags_used = 48
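# 48 lags span two full daily cycles of hourly data, enough to expose the 24-hour pattern twice.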
# Create ReportData object, now including imputation_message
report_data = ReportData(
descriptive_stats={'desc_price': desc_stats_price} if desc_stats_price is not None else None,
stationarity_tests=stationarity_results_dict,
summary_data=summary_data_dict, # Pass the summary dict directly
imputation_message=imputation_msg # Pass the generated message
)
try:
generate_latex_report(
output_dir=output_dir,
df=df,
report_data=report_data,
series_name_stat=series_name_stat_tested,
acf_pacf_plot_paths=acf_pacf_plot_paths,
decomposition_plot_paths=decomposition_plot_paths,
other_plot_paths=other_plot_paths,
decomposition_model=decomp_model_used, # Pass the model used
acf_pacf_lags=acf_pacf_lags_used, # Pass the lags used
template_path=settings.latex_template_file
)
except (FileNotFoundError, IOError, ValueError, RuntimeError) as e:
logger.error(f"Report generation failed: {e}", exc_info=True)
# Decide if pipeline should stop or continue
# raise SystemExit(1) from e # Option to stop pipeline
logger.info(f"EDA Pipeline execution finished. Review logs and generated files in {output_dir}.")
# The message about compiling manually is now handled within generate_latex_report if compilation fails
# logger.info(f"Compile the report: cd '{output_dir.resolve()}' && pdflatex eda_report.tex")
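# Example invocation (sketch; the import path is hypothetical and depends on the package layout):
#   from data_analysis.analysis.eda import run_eda_pipeline  # hypothetical module path
#   run_eda_pipeline()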