import logging
from pathlib import Path
import pandas as pd
import json
from typing import Optional, Dict, List, Any

# Project settings are loaded from the utils config module
from data_analysis.utils.data_config_model import settings
import datetime

logger = logging.getLogger(__name__)

# --- Import data handling functions from io ---
from data_analysis.io.data_handling import (
    load_and_prepare_data,
    get_data_summary,
    get_descriptive_stats,
    PRICE_COL,      # Standardized price column name
    PRICE_COL_RAW,  # Raw price column name (needed for the check below)
)

# --- Import analysis functions from analysis ---
from .data import (
    perform_decomposition,
    perform_stationarity_tests,
)

# --- Import plotting functions ---
from data_analysis.io.plotting import (
    plot_full_time_series,
    plot_zoomed_time_series,
    plot_boxplot_by_period,
    plot_histogram,
    plot_decomposition as plot_decomposition_results,  # Rename to avoid clash
    plot_residuals,
    plot_acf_pacf,
    plot_seasonal_subseries,
    plot_cross_correlation,
    plot_weekly_autocorrelation,
)

# --- Import report generator ---
from ..io.report import generate_latex_report
from data_analysis.utils.report_model import ReportData


# --- Pipeline Function ---
def run_eda_pipeline():
    """
    Orchestrates the Exploratory Data Analysis process using loaded settings
    and generates a LaTeX report.
    """
    logger.info("Starting Exploratory Data Analysis Pipeline (LaTeX Report)...")

    output_dir = settings.output_dir
    plots_dir = output_dir / "plots"  # Plots subdirectory

    # Ensure output directories exist
    try:
        output_dir.mkdir(parents=True, exist_ok=True)
        plots_dir.mkdir(parents=True, exist_ok=True)
        logger.info(f"Output directory set to: {output_dir.resolve()}")
        logger.info(f"Plots directory set to: {plots_dir.resolve()}")
    except Exception as e:
        logger.error(f"Failed to create output directories: {e}", exc_info=True)
        raise SystemExit(1) from e

    # --- Data Holders ---
    df: Optional[pd.DataFrame] = None
    summary_data_dict: Optional[dict] = None
    desc_stats_price: Optional[pd.Series] = None
    residuals_daily: Optional[pd.Series] = None
    residuals_weekly: Optional[pd.Series] = None
    stationarity_results_dict: Optional[dict] = None
    series_name_stat_tested: Optional[str] = None

    # --- Plot Path Collectors ---
    other_plot_paths: Dict[str, str] = {}
    acf_pacf_plot_paths: Dict[str, str] = {}
    decomposition_plot_paths: Dict[str, str] = {}

    # --- Pipeline Steps ---

    # 1. Load Data
    logger.info("--- Step 1: Load Data ---")
    # Read the raw file first so missing price values can be counted before preparation
    df_raw = pd.read_csv(settings.data_file, header=0)
    initial_missing_price = 0
    if PRICE_COL_RAW in df_raw.columns:
        # Count missing values in the raw numeric column before full processing
        initial_missing_price = pd.to_numeric(df_raw[PRICE_COL_RAW], errors='coerce').isnull().sum()
    else:
        logger.warning(f"Raw price column '{PRICE_COL_RAW}' not found for initial missing value check.")

    df, err = load_and_prepare_data(settings.data_file)
    if err or df is None:
        logger.error(f"Data loading failed: {err or 'Unknown error'}. Stopping pipeline.")
        raise SystemExit(1)
    logger.info(f"Data loaded successfully. Shape: {df.shape}")
    logger.info(f"Columns: {', '.join(df.columns)}")

    # Construct the imputation message based on the initial check and the final state
    imputation_msg = "No missing price values detected."
    final_missing_price = df[PRICE_COL].isnull().sum()  # Should be 0 after load_and_prepare
    if initial_missing_price > 0:
        if final_missing_price == 0:
            imputation_msg = (
                f"{initial_missing_price} missing price value(s) were detected "
                f"and imputed (ffill/bfill)."
            )
        else:
            imputation_msg = (
                f"{initial_missing_price} missing price value(s) were detected; "
                f"imputation may be incomplete ({final_missing_price} remain)."
            )
    elif df.isnull().sum().sum() > final_missing_price:
        # Other columns contain NaNs even though the price column is complete
        imputation_msg = (
            "Missing values detected in non-price columns. "
            "Price column had no missing values."
        )

    # 2. Initial Inspection & Summary Stats
    logger.info("--- Step 2: Initial Inspection & Summary ---")
    summary_data_dict, err = get_data_summary(df)
    summary_file_path = output_dir / "summary_data.txt"
    if err:
        logger.error(f"Failed to get data summary: {err}")
    elif summary_data_dict:
        logger.info(f"Saving data summary to {summary_file_path}")
        try:
            with open(summary_file_path, 'w') as f:
                f.write("--- Data Summary ---\n\n")
                f.write(f"Data Source: {settings.data_file.name}\n")
                f.write(f"Date Range: {df.index.min()} to {df.index.max()}\n")
                f.write(f"Number of Points: {len(df)}\n\n")
                f.write("First 5 Rows:\n")
                f.write(summary_data_dict['head'].to_string())
                f.write("\n\nLast 5 Rows:\n")
                f.write(summary_data_dict['tail'].to_string())
                f.write("\n\nData Types:\n")
                f.write(summary_data_dict['dtypes'].to_string())
                f.write("\n\nMissing Value Counts (Post Initial Handling):\n")
                f.write(summary_data_dict['missing'].to_string())
                f.write("\n")
        except IOError as e:
            logger.error(f"Failed to write data summary to {summary_file_path}: {e}")

        # Log summaries as well
        logger.info(f"Head:\n{summary_data_dict['head'].to_string()}")
        logger.info(f"Tail:\n{summary_data_dict['tail'].to_string()}")
        logger.info(f"Data Types:\n{summary_data_dict['dtypes']}")
        # Keep for later if needed:
        # logger.info(f"Missing Values (Post Initial Handling):\n{summary_data_dict['missing']}")

    # Descriptive Stats
    desc_stats_price, err = get_descriptive_stats(df, price_col=PRICE_COL)
    desc_stats_file_path = output_dir / "descriptive_stats_price.csv"
    if err:
        logger.error(f"Failed to get descriptive stats for {PRICE_COL}: {err}")
    elif desc_stats_price is not None:
        logger.info(f"Saving price descriptive stats to {desc_stats_file_path}")
        try:
            if isinstance(desc_stats_price, pd.Series):
                # Series: keep the header row in the CSV
                desc_stats_price.to_csv(desc_stats_file_path, header=True)
            else:
                # DataFrame (unlikely for a single column, but safe)
                desc_stats_price.to_csv(desc_stats_file_path)
        except IOError as e:
            logger.error(f"Failed to write price descriptive stats to {desc_stats_file_path}: {e}")
        logger.info(f"Price Descriptive Stats:\n{desc_stats_price.to_string()}")
    # 3. Visualizations (Main Price Series)
    logger.info("--- Step 3: Visualizations (Price) ---")
    plot_name = "01_full_timeseries.png"
    err = plot_full_time_series(df, PRICE_COL, plots_dir / plot_name)
    if not err:
        other_plot_paths['full_timeseries'] = plot_name
    else:
        logger.warning(f"Plotting error (full series): {err}")

    if settings.zoom_start_date and settings.zoom_end_date:
        plot_name = "02_zoomed_timeseries.png"
        err = plot_zoomed_time_series(df, PRICE_COL, settings.zoom_start_date,
                                      settings.zoom_end_date, plots_dir / plot_name)
        if not err:
            other_plot_paths['zoomed_timeseries'] = plot_name
        else:
            logger.warning(f"Plotting error (zoomed series): {err}")

    for period in ['hour', 'dayofweek', 'month', 'year']:
        plot_name = f"03_boxplot_{period}.png"
        err = plot_boxplot_by_period(df, PRICE_COL, period, plots_dir / plot_name)
        if not err:
            other_plot_paths[f'boxplot_{period}'] = plot_name
        else:
            logger.warning(f"Plotting error (boxplot {period}): {err}")

    plot_name = "04_histogram_price.png"
    err = plot_histogram(df, PRICE_COL, plots_dir / plot_name)
    if not err:
        other_plot_paths['histogram_price'] = plot_name
    else:
        logger.warning(f"Plotting error (histogram): {err}")

    # Optional: Seasonal Subseries Plots
    plot_name = "04a_seasonal_subseries_daily.png"
    err = plot_seasonal_subseries(df, PRICE_COL, period=24, period_name="Daily",
                                  output_path=plots_dir / plot_name)
    if not err:
        other_plot_paths['seasonal_subseries_daily'] = plot_name
    else:
        logger.warning(f"Plotting error (subseries daily): {err}")

    if len(df) > 168:  # Enough data for a weekly subseries plot
        plot_name = "04b_seasonal_subseries_weekly.png"
        err = plot_seasonal_subseries(df, PRICE_COL, period=168, period_name="Weekly",
                                      output_path=plots_dir / plot_name)
        if not err:
            other_plot_paths['seasonal_subseries_weekly'] = plot_name
        else:
            logger.warning(f"Plotting error (subseries weekly): {err}")
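    # Note: the 24- and 168-observation periods used here (and in the decomposition below)
    # assume an hourly DatetimeIndex, i.e. 24 points per day and 168 per week. If the data
    # frequency ever changes, these hardcoded periods need to be revisited.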
    # 4. Decomposition
    logger.info("--- Step 4: Decomposition ---")
    residuals_for_analysis: Optional[pd.Series] = None  # Track which residuals to use later

    # Daily
    decomp_daily, err = perform_decomposition(df[PRICE_COL], model='additive', period=24)
    if err:
        logger.error(f"Daily decomposition failed: {err}")
    elif decomp_daily:
        plot_name = "05_decomposition_daily.png"
        err = plot_decomposition_results(decomp_daily, "Daily (Period=24)", plots_dir / plot_name)
        if not err:
            decomposition_plot_paths['daily'] = plot_name
        else:
            logger.warning(f"Plotting error (daily decomp): {err}")

        residuals_daily = decomp_daily.resid.dropna()
        plot_name = "06_residuals_daily.png"
        err = plot_residuals(residuals_daily, "Daily Decomp", plots_dir / plot_name)
        # Save the path regardless of error; the report might reference it
        other_plot_paths['residuals_daily'] = plot_name
        if err:
            logger.warning(f"Plotting error (daily residuals): {err}")
        if not residuals_daily.empty:
            residuals_for_analysis = residuals_daily  # Prefer daily initially

    # Weekly
    if len(df) >= 168 * 2:
        decomp_weekly, err = perform_decomposition(df[PRICE_COL], model='additive', period=168)
        if err:
            logger.error(f"Weekly decomposition failed: {err}")
        elif decomp_weekly:
            plot_name = "07_decomposition_weekly.png"
            err = plot_decomposition_results(decomp_weekly, "Weekly (Period=168)", plots_dir / plot_name)
            if not err:
                decomposition_plot_paths['weekly'] = plot_name
            else:
                logger.warning(f"Plotting error (weekly decomp): {err}")

            residuals_weekly = decomp_weekly.resid.dropna()
            plot_name = "08_residuals_weekly.png"
            err = plot_residuals(residuals_weekly, "Weekly Decomp", plots_dir / plot_name)
            other_plot_paths['residuals_weekly'] = plot_name
            if err:
                logger.warning(f"Plotting error (weekly residuals): {err}")
            if not residuals_weekly.empty:
                residuals_for_analysis = residuals_weekly  # Prefer weekly if available
    else:
        logger.warning("Skipping weekly decomposition, data length insufficient.")

    # Decide which residuals plot to link in the stationarity section.
    # Guard against the all-None case so the labels are only set when residuals actually exist.
    if residuals_for_analysis is not None and residuals_for_analysis is residuals_weekly:
        other_plot_paths['residuals'] = other_plot_paths.get('residuals_weekly', 'placeholder.png')
        series_name_stat_tested = "Weekly Residuals"
    elif residuals_for_analysis is not None and residuals_for_analysis is residuals_daily:
        other_plot_paths['residuals'] = other_plot_paths.get('residuals_daily', 'placeholder.png')
        series_name_stat_tested = "Daily Residuals"
    else:
        series_name_stat_tested = None  # No residuals available for tests
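    # Rationale: the stationarity tests below run on decomposition residuals rather than the
    # raw price so that the strong daily/weekly seasonal structure is removed first; the test
    # results then describe the remainder component that a downstream model would need to fit.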
    # 5. Stationarity Analysis
    logger.info("--- Step 5: Stationarity Analysis ---")
    stationarity_file_path = output_dir / "stationarity_tests.json"
    if series_name_stat_tested and residuals_for_analysis is not None and not residuals_for_analysis.empty:
        logger.info(f"Performing tests on: {series_name_stat_tested}")
        stationarity_results_dict, err = perform_stationarity_tests(residuals_for_analysis)
        if err:
            logger.error(f"Stationarity tests failed: {err}")
        elif stationarity_results_dict:
            logger.info(f"Saving stationarity test results to {stationarity_file_path}")
            try:
                # Convert numpy scalar types in the critical values to plain floats
                # so the results are JSON-serializable
                adf_res = stationarity_results_dict.get('adf', {})
                kpss_res = stationarity_results_dict.get('kpss', {})
                adf_crit = adf_res.get('Critical Values', {})
                kpss_crit = kpss_res.get('Critical Values', {})
                if isinstance(adf_crit, dict):
                    adf_res['Critical Values'] = {k: float(v) for k, v in adf_crit.items()}
                if isinstance(kpss_crit, dict):
                    kpss_res['Critical Values'] = {k: float(v) for k, v in kpss_crit.items()}
                results_to_save = {
                    "series_tested": series_name_stat_tested,
                    "adf": adf_res,
                    "kpss": kpss_res,
                }
                with open(stationarity_file_path, 'w') as f:
                    json.dump(results_to_save, f, indent=4)
            except (IOError, TypeError) as e:
                logger.error(f"Failed to write stationarity results to {stationarity_file_path}: {e}")

            # Log key results (p-values may come back as strings, e.g. capped KPSS values)
            logger.info(f"Stationarity Test Results ({series_name_stat_tested}):")
            if stationarity_results_dict.get('adf'):
                adf_p = stationarity_results_dict['adf'].get('p-value', 'N/A')
                if isinstance(adf_p, str):
                    logger.info(f"  ADF p-value: {adf_p}")
                else:
                    logger.info(f"  ADF p-value: {adf_p:.4f}")
            if stationarity_results_dict.get('kpss'):
                kpss_p = stationarity_results_dict['kpss'].get('p-value', 'N/A')
                if isinstance(kpss_p, str):
                    logger.info(f"  KPSS p-value: {kpss_p}")
                else:
                    logger.info(f"  KPSS p-value: {kpss_p:.4f}")
    else:
        logger.warning("Skipping Stationarity Analysis as no suitable residual series is available.")
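    # Interpretation note: ADF and KPSS have opposite null hypotheses. ADF's null is a unit
    # root (non-stationary), so a small p-value argues FOR stationarity; KPSS's null is
    # stationarity, so a small p-value argues AGAINST it. A low ADF p-value combined with a
    # high KPSS p-value is therefore the clearest indication of stationary residuals, while
    # mixed outcomes deserve a closer look. The string handling above exists because KPSS
    # p-values are only reported within a tabulated range, so the upstream helper may return
    # a capped, text-form value.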
    # 6. Autocorrelation Analysis
    logger.info("--- Step 6: Autocorrelation Analysis ---")
    # Import plot_acf / plot_pacf from statsmodels graphics directly for saving
    from statsmodels.graphics.tsaplots import plot_acf, plot_pacf
    import matplotlib.pyplot as plt

    if series_name_stat_tested and residuals_for_analysis is not None and not residuals_for_analysis.empty:
        series_name_acf = series_name_stat_tested.lower().replace(' ', '_')
        base_name = f"09_{series_name_acf}"
        err_acf = None
        err_pacf = None
        try:
            # Create figure and axes explicitly
            fig_acf, ax_acf = plt.subplots()
            plot_acf(residuals_for_analysis, lags=48, ax=ax_acf,
                     title=f'ACF - {series_name_stat_tested}')
            plot_name_acf = f"{base_name}_acf.png"
            fig_acf.savefig(plots_dir / plot_name_acf)
            plt.close(fig_acf)  # Close figure after saving
            acf_pacf_plot_paths['acf'] = plot_name_acf
        except Exception as e:
            err_acf = e
        try:
            # Create figure and axes explicitly
            fig_pacf, ax_pacf = plt.subplots()
            plot_pacf(residuals_for_analysis, lags=48, ax=ax_pacf,
                      title=f'PACF - {series_name_stat_tested}', method='ywm')
            plot_name_pacf = f"{base_name}_pacf.png"
            fig_pacf.savefig(plots_dir / plot_name_pacf)
            plt.close(fig_pacf)  # Close figure after saving
            acf_pacf_plot_paths['pacf'] = plot_name_pacf
        except Exception as e:
            err_pacf = e

        if err_acf:
            logger.warning(f"Plotting error (ACF for {series_name_stat_tested}): {err_acf}")
        if err_pacf:
            logger.warning(f"Plotting error (PACF for {series_name_stat_tested}): {err_pacf}")

        # Weekly Autocorrelation Analysis
        try:
            plot_name = f"09c_weekly_autocorr_{series_name_acf}.png"
            err = plot_weekly_autocorrelation(
                series=residuals_for_analysis,
                series_name=series_name_stat_tested,
                output_path=plots_dir / plot_name,
                max_weeks=4
            )
            if not err:
                acf_pacf_plot_paths['weekly_autocorr'] = plot_name
            else:
                logger.warning(f"Plotting error (weekly autocorrelation): {err}")
        except Exception as e:
            logger.warning(f"Error in weekly autocorrelation analysis: {e}")
    else:
        logger.warning("Skipping Autocorrelation Analysis as no suitable series is available.")

    # 7. Exogenous Variable Analysis (if any exist)
    logger.info("--- Step 7: Exogenous Variable Analysis ---")
    logger.info("--- No exogenous variables available. Skipping. ---")

    # 8. Generate LaTeX Report
    logger.info("--- Step 8: Generate LaTeX Report ---")
    # Decomposition model and ACF/PACF lags are currently hardcoded in the steps above
    decomp_model_used = 'additive'
    acf_pacf_lags_used = 48

    # Create the ReportData object, including the imputation message
    report_data = ReportData(
        descriptive_stats={'desc_price': desc_stats_price} if desc_stats_price is not None else None,
        stationarity_tests=stationarity_results_dict,
        summary_data=summary_data_dict,    # Pass the summary dict directly
        imputation_message=imputation_msg  # Pass the generated message
    )
    try:
        generate_latex_report(
            output_dir=output_dir,
            df=df,
            report_data=report_data,
            series_name_stat=series_name_stat_tested,
            acf_pacf_plot_paths=acf_pacf_plot_paths,
            decomposition_plot_paths=decomposition_plot_paths,
            other_plot_paths=other_plot_paths,
            decomposition_model=decomp_model_used,  # Model used in Step 4
            acf_pacf_lags=acf_pacf_lags_used,       # Lags used in Step 6
            template_path=settings.latex_template_file
        )
    except (FileNotFoundError, IOError, ValueError, RuntimeError) as e:
        logger.error(f"Report generation failed: {e}", exc_info=True)
        # The pipeline currently continues so earlier outputs remain usable;
        # uncomment to abort instead:
        # raise SystemExit(1) from e
    logger.info(f"EDA Pipeline execution finished. Review logs and generated files in {output_dir}.")
    # The hint about compiling manually is emitted by generate_latex_report if compilation fails:
    # logger.info(f"Compile the report: cd '{output_dir.resolve()}' && pdflatex eda_report.tex")
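

# Minimal sketch of a direct entry point, in case this module is executed as a script.
# Illustrative only: it assumes `settings` is fully resolved at import time and that the
# project does not already provide a CLI wrapper; adjust the logging configuration to the
# project's conventions. Because of the relative imports above, invoke it as a module
# (python -m <package>.<module>) rather than by file path.
if __name__ == "__main__":
    logging.basicConfig(
        level=logging.INFO,
        format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
    )
    run_eda_pipeline()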