init

data_analysis/analysis/pipeline.py (new file, 387 lines)
@@ -0,0 +1,387 @@
import logging
import json
from typing import Optional, Dict

import pandas as pd

# Use utils for config if that's the structure
from data_analysis.utils.config_model import settings

logger = logging.getLogger(__name__)

# --- Import data handling functions from io ---
from data_analysis.io.data_handling import (
    load_and_prepare_data,
    get_data_summary,
    get_descriptive_stats,
    PRICE_COL,      # Standardized price column name
    PRICE_COL_RAW,  # Raw price column name (needed for the check below)
)
# --- Import analysis functions from analysis ---
from .data import (
    perform_decomposition,
    perform_stationarity_tests,
)
# --- Import plotting functions ---
from data_analysis.io.plotting import (
    plot_full_time_series,
    plot_zoomed_time_series,
    plot_boxplot_by_period,
    plot_histogram,
    plot_decomposition as plot_decomposition_results,  # Rename to avoid clash
    plot_residuals,
    plot_acf_pacf,
    plot_seasonal_subseries,
    plot_cross_correlation,
    plot_weekly_autocorrelation,
)
# --- Import report generator ---
from ..io.report import generate_latex_report
from data_analysis.utils.report_model import ReportData


# --- Modified Pipeline Function ---
def run_eda_pipeline():
    """
    Orchestrates the Exploratory Data Analysis process using loaded settings
    and generates a LaTeX report.
    """
    logger.info("Starting Exploratory Data Analysis Pipeline (LaTeX Report)...")
    output_dir = settings.output_dir
    plots_dir = output_dir / "plots"  # Define plots subdirectory

    # Ensure output directories exist
    try:
        output_dir.mkdir(parents=True, exist_ok=True)
        plots_dir.mkdir(parents=True, exist_ok=True)
        logger.info(f"Output directory set to: {output_dir.resolve()}")
        logger.info(f"Plots directory set to: {plots_dir.resolve()}")
    except OSError as e:
        logger.error(f"Failed to create output directories: {e}", exc_info=True)
        raise SystemExit(1) from e

    # --- Data Holders ---
    df: Optional[pd.DataFrame] = None
    summary_data_dict: Optional[dict] = None
    desc_stats_price: Optional[pd.Series] = None
    residuals_daily: Optional[pd.Series] = None
    residuals_weekly: Optional[pd.Series] = None
    stationarity_results_dict: Optional[dict] = None
    series_name_stat_tested: Optional[str] = None

    # --- Plot Path Collectors ---
    other_plot_paths: Dict[str, str] = {}
    acf_pacf_plot_paths: Dict[str, str] = {}
    decomposition_plot_paths: Dict[str, str] = {}

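    # Convention note: the io/ and analysis/ helpers used below appear to
    # return (result, error) pairs, with error None on success, e.g.:
    #
    #   df, err = load_and_prepare_data(settings.data_file)
    #   if err: ...  # handle failure
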
    # --- Pipeline Steps ---

    # 1. Load Data
    logger.info("--- Step 1: Load Data ---")
    # Read the raw data temporarily to check missing values before preparation
    try:
        df_raw = pd.read_csv(settings.data_file, header=0)
    except (IOError, pd.errors.ParserError) as e:
        logger.error(f"Failed to read data file {settings.data_file}: {e}", exc_info=True)
        raise SystemExit(1) from e
    initial_missing_price = 0
    if PRICE_COL_RAW in df_raw.columns:
        # Check missing values in the raw numeric column before full processing
        initial_missing_price = pd.to_numeric(df_raw[PRICE_COL_RAW], errors='coerce').isnull().sum()
    else:
        logger.warning(f"Raw price column '{PRICE_COL_RAW}' not found for initial missing value check.")

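    # Note: errors='coerce' turns unparseable entries into NaN, so the count
    # above includes both truly missing cells and malformed numbers, e.g.:
    #
    #   pd.to_numeric(pd.Series(['1.5', '', 'n/a']), errors='coerce')
    #   # -> [1.5, NaN, NaN]
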
    df, err = load_and_prepare_data(settings.data_file)
    if err or df is None:
        logger.error(f"Data loading failed: {err or 'Unknown error'}. Stopping pipeline.")
        raise SystemExit(1)
    logger.info(f"Data loaded successfully. Shape: {df.shape}")
    logger.info(f"Columns: {', '.join(df.columns)}")

    # Construct the imputation message from the initial check and the final state
    imputation_msg = "No missing price values detected."
    final_missing_price = df[PRICE_COL].isnull().sum()  # Should be 0 after load_and_prepare_data
    if initial_missing_price > 0:
        if final_missing_price == 0:
            imputation_msg = f"{initial_missing_price} missing price value(s) were detected and imputed (ffill/bfill)."
        else:
            imputation_msg = f"{initial_missing_price} missing price value(s) were detected; imputation may be incomplete ({final_missing_price} remain)."
    elif df.isnull().sum().sum() > final_missing_price:  # NaNs exist in other columns
        imputation_msg = "Missing values detected in non-price columns only; the price column was complete."

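    # The ffill/bfill wording assumes load_and_prepare_data imputes roughly
    # like this (a sketch, not the actual implementation):
    #
    #   df[PRICE_COL] = df[PRICE_COL].ffill().bfill()
    #
    # i.e. forward-fill gaps, then back-fill any leading gap ffill cannot reach.
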
    # 2. Initial Inspection & Summary Stats
    logger.info("--- Step 2: Initial Inspection & Summary ---")
    summary_data_dict, err = get_data_summary(df)
    summary_file_path = output_dir / "summary_data.txt"
    if err:
        logger.error(f"Failed to get data summary: {err}")
    elif summary_data_dict:
        logger.info(f"Saving data summary to {summary_file_path}")
        try:
            with open(summary_file_path, 'w') as f:
                f.write("--- Data Summary ---\n\n")
                f.write(f"Data Source: {settings.data_file.name}\n")
                f.write(f"Date Range: {df.index.min()} to {df.index.max()}\n")
                f.write(f"Number of Points: {len(df)}\n\n")
                f.write("First 5 Rows:\n")
                f.write(summary_data_dict['head'].to_string())
                f.write("\n\nLast 5 Rows:\n")
                f.write(summary_data_dict['tail'].to_string())
                f.write("\n\nData Types:\n")
                f.write(summary_data_dict['dtypes'].to_string())
                f.write("\n\nMissing Value Counts (Post Initial Handling):\n")
                f.write(summary_data_dict['missing'].to_string())
                f.write("\n")
        except IOError as e:
            logger.error(f"Failed to write data summary to {summary_file_path}: {e}")
        # Log summaries as well
        logger.info(f"Head:\n{summary_data_dict['head'].to_string()}")
        logger.info(f"Tail:\n{summary_data_dict['tail'].to_string()}")
        logger.info(f"Data Types:\n{summary_data_dict['dtypes']}")
        # Keep it for later
        # logger.info(f"Missing Values (Post Initial Handling):\n{summary_data_dict['missing']}")

    # Descriptive Stats
    desc_stats_price, err = get_descriptive_stats(df, price_col=PRICE_COL)
    desc_stats_file_path = output_dir / "descriptive_stats_price.csv"  # Price-specific filename
    if err:
        logger.error(f"Failed to get descriptive stats for {PRICE_COL}: {err}")
    elif desc_stats_price is not None:
        logger.info(f"Saving price descriptive stats to {desc_stats_file_path}")
        try:
            # Ensure it's a Series before calling to_csv with header=True
            if isinstance(desc_stats_price, pd.Series):
                desc_stats_price.to_csv(desc_stats_file_path, header=True)
            else:  # If it returns a DataFrame (unlikely for a single column, but safe)
                desc_stats_price.to_csv(desc_stats_file_path)
        except IOError as e:
            logger.error(f"Failed to write price descriptive stats to {desc_stats_file_path}: {e}")
        logger.info(f"Price Descriptive Stats:\n{desc_stats_price.to_string()}")

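    # get_descriptive_stats presumably wraps pandas' describe() (an assumption,
    # not confirmed here); for a numeric Series that yields count, mean, std,
    # min, the quartiles, and max:
    #
    #   df[PRICE_COL].describe()
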
    # 3. Visualizations (Main Price Series)
    logger.info("--- Step 3: Visualizations (Price) ---")
    plot_name = "01_full_timeseries.png"
    err = plot_full_time_series(df, PRICE_COL, plots_dir / plot_name)
    if not err: other_plot_paths['full_timeseries'] = plot_name
    else: logger.warning(f"Plotting error (full series): {err}")

    if settings.zoom_start_date and settings.zoom_end_date:
        plot_name = "02_zoomed_timeseries.png"
        err = plot_zoomed_time_series(df, PRICE_COL, settings.zoom_start_date, settings.zoom_end_date, plots_dir / plot_name)
        if not err: other_plot_paths['zoomed_timeseries'] = plot_name
        else: logger.warning(f"Plotting error (zoomed series): {err}")

    for period in ['hour', 'dayofweek', 'month', 'year']:
        plot_name = f"03_boxplot_{period}.png"
        err = plot_boxplot_by_period(df, PRICE_COL, period, plots_dir / plot_name)
        if not err: other_plot_paths[f'boxplot_{period}'] = plot_name
        else: logger.warning(f"Plotting error (boxplot {period}): {err}")

    plot_name = "04_histogram_price.png"
    err = plot_histogram(df, PRICE_COL, plots_dir / plot_name)
    if not err: other_plot_paths['histogram_price'] = plot_name
    else: logger.warning(f"Plotting error (histogram): {err}")

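    # The period keys presumably map to DatetimeIndex attributes inside
    # plot_boxplot_by_period (an assumption), e.g. df.index.hour (0-23) or
    # df.index.dayofweek (0=Monday .. 6=Sunday).
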
    # Optional: Seasonal Subseries Plots
    plot_name = "04a_seasonal_subseries_daily.png"
    err = plot_seasonal_subseries(df, PRICE_COL, period=24, period_name="Daily", output_path=plots_dir / plot_name)
    if not err: other_plot_paths['seasonal_subseries_daily'] = plot_name
    else: logger.warning(f"Plotting error (subseries daily): {err}")

    if len(df) > 168:  # Enough data for a weekly view (168 hours = 1 week)
        plot_name = "04b_seasonal_subseries_weekly.png"
        err = plot_seasonal_subseries(df, PRICE_COL, period=168, period_name="Weekly", output_path=plots_dir / plot_name)
        if not err: other_plot_paths['seasonal_subseries_weekly'] = plot_name
        else: logger.warning(f"Plotting error (subseries weekly): {err}")

    # 4. Decomposition
    logger.info("--- Step 4: Decomposition ---")
    residuals_for_analysis: Optional[pd.Series] = None  # Track which residuals to use later

    # Daily (period=24: one seasonal cycle per day for hourly data)
    decomp_daily, err = perform_decomposition(df[PRICE_COL], model='additive', period=24)
    if err: logger.error(f"Daily decomposition failed: {err}")
    elif decomp_daily:
        plot_name = "05_decomposition_daily.png"
        err = plot_decomposition_results(decomp_daily, "Daily (Period=24)", plots_dir / plot_name)
        if not err: decomposition_plot_paths['daily'] = plot_name
        else: logger.warning(f"Plotting error (daily decomp): {err}")

        residuals_daily = decomp_daily.resid.dropna()
        plot_name = "06_residuals_daily.png"
        err = plot_residuals(residuals_daily, "Daily Decomp", plots_dir / plot_name)
        # Save the path regardless of error; the report might reference it
        other_plot_paths['residuals_daily'] = plot_name
        if err: logger.warning(f"Plotting error (daily residuals): {err}")
        if not residuals_daily.empty: residuals_for_analysis = residuals_daily  # Prefer daily initially

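    # Assuming perform_decomposition wraps statsmodels' seasonal_decompose
    # (a sketch of the presumed call, not the actual implementation):
    #
    #   from statsmodels.tsa.seasonal import seasonal_decompose
    #   result = seasonal_decompose(series, model='additive', period=24)
    #
    # seasonal_decompose needs at least two complete cycles, which is why the
    # weekly pass below is guarded by len(df) >= 168 * 2.
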
    # Weekly (period=168: one seasonal cycle per week for hourly data)
    if len(df) >= 168 * 2:
        decomp_weekly, err = perform_decomposition(df[PRICE_COL], model='additive', period=168)
        if err: logger.error(f"Weekly decomposition failed: {err}")
        elif decomp_weekly:
            plot_name = "07_decomposition_weekly.png"
            err = plot_decomposition_results(decomp_weekly, "Weekly (Period=168)", plots_dir / plot_name)
            if not err: decomposition_plot_paths['weekly'] = plot_name
            else: logger.warning(f"Plotting error (weekly decomp): {err}")

            residuals_weekly = decomp_weekly.resid.dropna()
            plot_name = "08_residuals_weekly.png"
            err = plot_residuals(residuals_weekly, "Weekly Decomp", plots_dir / plot_name)
            other_plot_paths['residuals_weekly'] = plot_name
            if err: logger.warning(f"Plotting error (weekly residuals): {err}")
            if not residuals_weekly.empty: residuals_for_analysis = residuals_weekly  # Prefer weekly if available
    else:
        logger.warning("Skipping weekly decomposition: insufficient data length.")

    # Decide which residuals plot to link in the stationarity section.
    # Guard against the all-None case: if decomposition produced nothing,
    # 'None is None' would otherwise match the first branch.
    if residuals_for_analysis is not None and residuals_for_analysis is residuals_weekly:
        other_plot_paths['residuals'] = other_plot_paths.get('residuals_weekly', 'placeholder.png')
        series_name_stat_tested = "Weekly Residuals"
    elif residuals_for_analysis is not None and residuals_for_analysis is residuals_daily:
        other_plot_paths['residuals'] = other_plot_paths.get('residuals_daily', 'placeholder.png')
        series_name_stat_tested = "Daily Residuals"
    else:
        series_name_stat_tested = None  # No residuals available for the tests

    # 5. Stationarity Analysis
    logger.info("--- Step 5: Stationarity Analysis ---")
    stationarity_file_path = output_dir / "stationarity_tests.json"
    if series_name_stat_tested and residuals_for_analysis is not None and not residuals_for_analysis.empty:
        logger.info(f"Performing tests on: {series_name_stat_tested}")
        stationarity_results_dict, err = perform_stationarity_tests(residuals_for_analysis)
        if err: logger.error(f"Stationarity tests failed: {err}")
        elif stationarity_results_dict:
            logger.info(f"Saving stationarity test results to {stationarity_file_path}")
            try:
                # Convert numpy scalars in the critical values to plain floats
                # so they serialize cleanly to JSON
                adf_res = stationarity_results_dict.get('adf', {})
                kpss_res = stationarity_results_dict.get('kpss', {})
                adf_crit = adf_res.get('Critical Values', {})
                kpss_crit = kpss_res.get('Critical Values', {})
                if isinstance(adf_crit, dict):
                    adf_res['Critical Values'] = {k: float(v) for k, v in adf_crit.items()}
                if isinstance(kpss_crit, dict):
                    kpss_res['Critical Values'] = {k: float(v) for k, v in kpss_crit.items()}

                results_to_save = {
                    "series_tested": series_name_stat_tested,
                    "adf": adf_res,
                    "kpss": kpss_res
                }
                with open(stationarity_file_path, 'w') as f:
                    json.dump(results_to_save, f, indent=4)

            except (IOError, TypeError) as e:
                logger.error(f"Failed to write stationarity results to {stationarity_file_path}: {e}")

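            # Note: json.dump cannot serialize numpy arrays, and some numpy
            # scalar types (e.g. np.float32) are not JSON-serializable either,
            # hence the defensive float() casts above.
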
            # Log key results
            logger.info(f"Stationarity Test Results ({series_name_stat_tested}):")
            if 'adf' in stationarity_results_dict and stationarity_results_dict['adf']:
                adf_p = stationarity_results_dict['adf'].get('p-value', 'N/A')
                # Guard the format spec: the 'N/A' fallback is a string, and :.4f would raise on it
                if isinstance(adf_p, str):
                    logger.info(f"  ADF p-value: {adf_p}")
                else:
                    logger.info(f"  ADF p-value: {adf_p:.4f}")
            if 'kpss' in stationarity_results_dict and stationarity_results_dict['kpss']:
                # Handle string p-values from KPSS
                kpss_p = stationarity_results_dict['kpss'].get('p-value', 'N/A')
                if isinstance(kpss_p, str):
                    logger.info(f"  KPSS p-value: {kpss_p}")
                else:
                    logger.info(f"  KPSS p-value: {kpss_p:.4f}")
    else:
        logger.warning("Skipping Stationarity Analysis as no suitable residual series is available.")

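    # Interpretation reminder: ADF tests the null hypothesis of a unit root
    # (a low p-value is evidence of stationarity), while KPSS tests the null
    # of stationarity (a low p-value is evidence of non-stationarity).
    # Residuals that reject ADF and fail to reject KPSS are the clean case.
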
    # 6. Autocorrelation Analysis
    logger.info("--- Step 6: Autocorrelation Analysis ---")
    # Import plot_acf / plot_pacf from statsmodels graphics directly for saving
    from statsmodels.graphics.tsaplots import plot_acf, plot_pacf
    import matplotlib.pyplot as plt

    if series_name_stat_tested and residuals_for_analysis is not None and not residuals_for_analysis.empty:
        series_name_acf = series_name_stat_tested.lower().replace(' ', '_')
        base_name = f"09_{series_name_acf}"
        err_acf = None
        err_pacf = None
        try:
            # Create figure and axes explicitly so the figure can be saved and closed
            fig_acf, ax_acf = plt.subplots()
            plot_acf(residuals_for_analysis, lags=48, ax=ax_acf, title=f'ACF - {series_name_stat_tested}')
            plot_name_acf = f"{base_name}_acf.png"
            fig_acf.savefig(plots_dir / plot_name_acf)
            plt.close(fig_acf)  # Close the figure after saving to free memory
            acf_pacf_plot_paths['acf'] = plot_name_acf
        except Exception as e:
            err_acf = e

        try:
            fig_pacf, ax_pacf = plt.subplots()
            plot_pacf(residuals_for_analysis, lags=48, ax=ax_pacf, title=f'PACF - {series_name_stat_tested}', method='ywm')
            plot_name_pacf = f"{base_name}_pacf.png"
            fig_pacf.savefig(plots_dir / plot_name_pacf)
            plt.close(fig_pacf)
            acf_pacf_plot_paths['pacf'] = plot_name_pacf
        except Exception as e:
            err_pacf = e

        if err_acf: logger.warning(f"Plotting error (ACF for {series_name_stat_tested}): {err_acf}")
        if err_pacf: logger.warning(f"Plotting error (PACF for {series_name_stat_tested}): {err_pacf}")

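        # lags=48 spans two full daily cycles of hourly data, enough to expose
        # a 24-hour seasonal signature; 'ywm' is one of statsmodels'
        # Yule-Walker PACF estimators.
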
        # Weekly Autocorrelation Analysis
        try:
            plot_name = f"09c_weekly_autocorr_{series_name_acf}.png"
            err = plot_weekly_autocorrelation(
                series=residuals_for_analysis,
                series_name=series_name_stat_tested,
                output_path=plots_dir / plot_name,
                max_weeks=4
            )
            if not err: acf_pacf_plot_paths['weekly_autocorr'] = plot_name
            else: logger.warning(f"Plotting error (weekly autocorrelation): {err}")
        except Exception as e:
            logger.warning(f"Error in weekly autocorrelation analysis: {e}")

    else:
        logger.warning("Skipping Autocorrelation Analysis as no suitable series is available.")

    # 7. Exogenous Variable Analysis (if any exist)
    logger.info("--- Step 7: Exogenous Variable Analysis ---")
    logger.info("--- No exogenous variables in this dataset; skipping. ---")

    # 8. Generate LaTeX Report
    logger.info("--- Step 8: Generate LaTeX Report ---")

    # --- Decomposition model and ACF/PACF lags used ---
    # These are currently hardcoded in the pipeline steps above
    decomp_model_used = 'additive'
    acf_pacf_lags_used = 48

    # Create the ReportData object, now including the imputation message
    report_data = ReportData(
        descriptive_stats={'desc_price': desc_stats_price} if desc_stats_price is not None else None,
        stationarity_tests=stationarity_results_dict,
        summary_data=summary_data_dict,    # Pass the summary dict directly
        imputation_message=imputation_msg  # Pass the generated message
    )
    try:
        generate_latex_report(
            output_dir=output_dir,
            df=df,
            report_data=report_data,
            series_name_stat=series_name_stat_tested,
            acf_pacf_plot_paths=acf_pacf_plot_paths,
            decomposition_plot_paths=decomposition_plot_paths,
            other_plot_paths=other_plot_paths,
            decomposition_model=decomp_model_used,  # Pass the model used
            acf_pacf_lags=acf_pacf_lags_used,       # Pass the lags used
            template_path=settings.latex_template_file
        )
    except (FileNotFoundError, IOError, ValueError, RuntimeError) as e:
        logger.error(f"Report generation failed: {e}", exc_info=True)
        # The pipeline continues so earlier artifacts remain usable; uncomment
        # to make a report failure fatal instead:
        # raise SystemExit(1) from e

logger.info(f"EDA Pipeline execution finished. Review logs and generated files in {output_dir}.")
|
||||
# The message about compiling manually is now handled within generate_latex_report if compilation fails
|
||||
# logger.info(f"Compile the report: cd '{output_dir.resolve()}' && pdflatex eda_report.tex")
|
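

# Typical invocation (a usage sketch; the real entry point may differ):
#
#   from data_analysis.analysis.pipeline import run_eda_pipeline
#   run_eda_pipeline()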