init

data_analysis/analysis/pipeline.py (new file, 387 lines)
@@ -0,0 +1,387 @@
import logging
import json
from typing import Optional, Dict

import pandas as pd

# Use utils for config if that's the structure
from data_analysis.utils.config_model import settings

logger = logging.getLogger(__name__)

# --- Import data handling functions from io ---
from data_analysis.io.data_handling import (
    load_and_prepare_data,
    get_data_summary,
    get_descriptive_stats,
    PRICE_COL,      # Standardized price column name
    PRICE_COL_RAW,  # Raw price column name (needed for the check below)
)
# --- Import analysis functions from analysis ---
from .data import (
    perform_decomposition,
    perform_stationarity_tests,
)
# --- Import plotting functions ---
from data_analysis.io.plotting import (
    plot_full_time_series,
    plot_zoomed_time_series,
    plot_boxplot_by_period,
    plot_histogram,
    plot_decomposition as plot_decomposition_results,  # Rename to avoid clash
    plot_residuals,
    plot_acf_pacf,
    plot_seasonal_subseries,
    plot_cross_correlation,
    plot_weekly_autocorrelation,
)
# --- Import report generator ---
from ..io.report import generate_latex_report
from data_analysis.utils.report_model import ReportData


# --- Modified Pipeline Function ---
def run_eda_pipeline():
    """
    Orchestrates the Exploratory Data Analysis process using loaded settings
    and generates a LaTeX report.
    """
    logger.info("Starting Exploratory Data Analysis Pipeline (LaTeX Report)...")
    output_dir = settings.output_dir
    plots_dir = output_dir / "plots"  # Define plots subdirectory

    # Ensure output directories exist
    try:
        output_dir.mkdir(parents=True, exist_ok=True)
        plots_dir.mkdir(parents=True, exist_ok=True)
        logger.info(f"Output directory set to: {output_dir.resolve()}")
        logger.info(f"Plots directory set to: {plots_dir.resolve()}")
    except OSError as e:
        logger.error(f"Failed to create output directories: {e}", exc_info=True)
        raise SystemExit(1) from e

    # --- Data Holders ---
    df: Optional[pd.DataFrame] = None
    summary_data_dict: Optional[dict] = None
    desc_stats_price: Optional[pd.Series] = None
    residuals_daily: Optional[pd.Series] = None
    residuals_weekly: Optional[pd.Series] = None
    stationarity_results_dict: Optional[dict] = None
    series_name_stat_tested: Optional[str] = None

    # --- Plot Path Collectors ---
    other_plot_paths: Dict[str, str] = {}
    acf_pacf_plot_paths: Dict[str, str] = {}
    decomposition_plot_paths: Dict[str, str] = {}

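    # Convention note: the io/ and analysis/ helpers used below appear to
    # return (result, error) pairs, with error None on success, e.g.:
    #
    #   df, err = load_and_prepare_data(settings.data_file)
    #   if err: ...  # handle failure
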
    # --- Pipeline Steps ---

    # 1. Load Data
    logger.info("--- Step 1: Load Data ---")
    # Read the raw data temporarily to check missing values before preparation
    try:
        df_raw = pd.read_csv(settings.data_file, header=0)
    except (IOError, pd.errors.ParserError) as e:
        logger.error(f"Failed to read data file {settings.data_file}: {e}", exc_info=True)
        raise SystemExit(1) from e
    initial_missing_price = 0
    if PRICE_COL_RAW in df_raw.columns:
        # Check missing values in the raw numeric column before full processing
        initial_missing_price = pd.to_numeric(df_raw[PRICE_COL_RAW], errors='coerce').isnull().sum()
    else:
        logger.warning(f"Raw price column '{PRICE_COL_RAW}' not found for initial missing value check.")

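    # Note: errors='coerce' turns unparseable entries into NaN, so the count
    # above includes both truly missing cells and malformed numbers, e.g.:
    #
    #   pd.to_numeric(pd.Series(['1.5', '', 'n/a']), errors='coerce')
    #   # -> [1.5, NaN, NaN]
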
    df, err = load_and_prepare_data(settings.data_file)
    if err or df is None:
        logger.error(f"Data loading failed: {err or 'Unknown error'}. Stopping pipeline.")
        raise SystemExit(1)
    logger.info(f"Data loaded successfully. Shape: {df.shape}")
    logger.info(f"Columns: {', '.join(df.columns)}")

    # Construct the imputation message from the initial check and the final state
    imputation_msg = "No missing price values detected."
    final_missing_price = df[PRICE_COL].isnull().sum()  # Should be 0 after load_and_prepare_data
    if initial_missing_price > 0:
        if final_missing_price == 0:
            imputation_msg = f"{initial_missing_price} missing price value(s) were detected and imputed (ffill/bfill)."
        else:
            imputation_msg = f"{initial_missing_price} missing price value(s) were detected; imputation may be incomplete ({final_missing_price} remain)."
    elif df.isnull().sum().sum() > final_missing_price:  # NaNs exist in other columns
        imputation_msg = "Missing values detected in non-price columns only; the price column was complete."

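    # The ffill/bfill wording assumes load_and_prepare_data imputes roughly
    # like this (a sketch, not the actual implementation):
    #
    #   df[PRICE_COL] = df[PRICE_COL].ffill().bfill()
    #
    # i.e. forward-fill gaps, then back-fill any leading gap ffill cannot reach.
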
    # 2. Initial Inspection & Summary Stats
    logger.info("--- Step 2: Initial Inspection & Summary ---")
    summary_data_dict, err = get_data_summary(df)
    summary_file_path = output_dir / "summary_data.txt"
    if err:
        logger.error(f"Failed to get data summary: {err}")
    elif summary_data_dict:
        logger.info(f"Saving data summary to {summary_file_path}")
        try:
            with open(summary_file_path, 'w') as f:
                f.write("--- Data Summary ---\n\n")
                f.write(f"Data Source: {settings.data_file.name}\n")
                f.write(f"Date Range: {df.index.min()} to {df.index.max()}\n")
                f.write(f"Number of Points: {len(df)}\n\n")
                f.write("First 5 Rows:\n")
                f.write(summary_data_dict['head'].to_string())
                f.write("\n\nLast 5 Rows:\n")
                f.write(summary_data_dict['tail'].to_string())
                f.write("\n\nData Types:\n")
                f.write(summary_data_dict['dtypes'].to_string())
                f.write("\n\nMissing Value Counts (Post Initial Handling):\n")
                f.write(summary_data_dict['missing'].to_string())
                f.write("\n")
        except IOError as e:
            logger.error(f"Failed to write data summary to {summary_file_path}: {e}")
        # Log summaries as well
        logger.info(f"Head:\n{summary_data_dict['head'].to_string()}")
        logger.info(f"Tail:\n{summary_data_dict['tail'].to_string()}")
        logger.info(f"Data Types:\n{summary_data_dict['dtypes']}")
        # Keep it for later
        # logger.info(f"Missing Values (Post Initial Handling):\n{summary_data_dict['missing']}")

    # Descriptive Stats
    desc_stats_price, err = get_descriptive_stats(df, price_col=PRICE_COL)
    desc_stats_file_path = output_dir / "descriptive_stats_price.csv"  # Price-specific filename
    if err:
        logger.error(f"Failed to get descriptive stats for {PRICE_COL}: {err}")
    elif desc_stats_price is not None:
        logger.info(f"Saving price descriptive stats to {desc_stats_file_path}")
        try:
            # Ensure it's a Series before calling to_csv with header=True
            if isinstance(desc_stats_price, pd.Series):
                desc_stats_price.to_csv(desc_stats_file_path, header=True)
            else:  # If it returns a DataFrame (unlikely for a single column, but safe)
                desc_stats_price.to_csv(desc_stats_file_path)
        except IOError as e:
            logger.error(f"Failed to write price descriptive stats to {desc_stats_file_path}: {e}")
        logger.info(f"Price Descriptive Stats:\n{desc_stats_price.to_string()}")

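    # get_descriptive_stats presumably wraps pandas' describe() (an assumption,
    # not confirmed here); for a numeric Series that yields count, mean, std,
    # min, the quartiles, and max:
    #
    #   df[PRICE_COL].describe()
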
    # 3. Visualizations (Main Price Series)
    logger.info("--- Step 3: Visualizations (Price) ---")
    plot_name = "01_full_timeseries.png"
    err = plot_full_time_series(df, PRICE_COL, plots_dir / plot_name)
    if not err: other_plot_paths['full_timeseries'] = plot_name
    else: logger.warning(f"Plotting error (full series): {err}")

    if settings.zoom_start_date and settings.zoom_end_date:
        plot_name = "02_zoomed_timeseries.png"
        err = plot_zoomed_time_series(df, PRICE_COL, settings.zoom_start_date, settings.zoom_end_date, plots_dir / plot_name)
        if not err: other_plot_paths['zoomed_timeseries'] = plot_name
        else: logger.warning(f"Plotting error (zoomed series): {err}")

    for period in ['hour', 'dayofweek', 'month', 'year']:
        plot_name = f"03_boxplot_{period}.png"
        err = plot_boxplot_by_period(df, PRICE_COL, period, plots_dir / plot_name)
        if not err: other_plot_paths[f'boxplot_{period}'] = plot_name
        else: logger.warning(f"Plotting error (boxplot {period}): {err}")

    plot_name = "04_histogram_price.png"
    err = plot_histogram(df, PRICE_COL, plots_dir / plot_name)
    if not err: other_plot_paths['histogram_price'] = plot_name
    else: logger.warning(f"Plotting error (histogram): {err}")

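    # The period keys presumably map to DatetimeIndex attributes inside
    # plot_boxplot_by_period (an assumption), e.g. df.index.hour (0-23) or
    # df.index.dayofweek (0=Monday .. 6=Sunday).
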
    # Optional: Seasonal Subseries Plots
    plot_name = "04a_seasonal_subseries_daily.png"
    err = plot_seasonal_subseries(df, PRICE_COL, period=24, period_name="Daily", output_path=plots_dir / plot_name)
    if not err: other_plot_paths['seasonal_subseries_daily'] = plot_name
    else: logger.warning(f"Plotting error (subseries daily): {err}")

    if len(df) > 168:  # Enough data for a weekly view (168 hours = 1 week)
        plot_name = "04b_seasonal_subseries_weekly.png"
        err = plot_seasonal_subseries(df, PRICE_COL, period=168, period_name="Weekly", output_path=plots_dir / plot_name)
        if not err: other_plot_paths['seasonal_subseries_weekly'] = plot_name
        else: logger.warning(f"Plotting error (subseries weekly): {err}")

    # 4. Decomposition
    logger.info("--- Step 4: Decomposition ---")
    residuals_for_analysis: Optional[pd.Series] = None  # Track which residuals to use later

    # Daily (period=24: one seasonal cycle per day for hourly data)
    decomp_daily, err = perform_decomposition(df[PRICE_COL], model='additive', period=24)
    if err: logger.error(f"Daily decomposition failed: {err}")
    elif decomp_daily:
        plot_name = "05_decomposition_daily.png"
        err = plot_decomposition_results(decomp_daily, "Daily (Period=24)", plots_dir / plot_name)
        if not err: decomposition_plot_paths['daily'] = plot_name
        else: logger.warning(f"Plotting error (daily decomp): {err}")

        residuals_daily = decomp_daily.resid.dropna()
        plot_name = "06_residuals_daily.png"
        err = plot_residuals(residuals_daily, "Daily Decomp", plots_dir / plot_name)
        # Save the path regardless of error; the report might reference it
        other_plot_paths['residuals_daily'] = plot_name
        if err: logger.warning(f"Plotting error (daily residuals): {err}")
        if not residuals_daily.empty: residuals_for_analysis = residuals_daily  # Prefer daily initially

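    # Assuming perform_decomposition wraps statsmodels' seasonal_decompose
    # (a sketch of the presumed call, not the actual implementation):
    #
    #   from statsmodels.tsa.seasonal import seasonal_decompose
    #   result = seasonal_decompose(series, model='additive', period=24)
    #
    # seasonal_decompose needs at least two complete cycles, which is why the
    # weekly pass below is guarded by len(df) >= 168 * 2.
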
    # Weekly (period=168: one seasonal cycle per week for hourly data)
    if len(df) >= 168 * 2:
        decomp_weekly, err = perform_decomposition(df[PRICE_COL], model='additive', period=168)
        if err: logger.error(f"Weekly decomposition failed: {err}")
        elif decomp_weekly:
            plot_name = "07_decomposition_weekly.png"
            err = plot_decomposition_results(decomp_weekly, "Weekly (Period=168)", plots_dir / plot_name)
            if not err: decomposition_plot_paths['weekly'] = plot_name
            else: logger.warning(f"Plotting error (weekly decomp): {err}")

            residuals_weekly = decomp_weekly.resid.dropna()
            plot_name = "08_residuals_weekly.png"
            err = plot_residuals(residuals_weekly, "Weekly Decomp", plots_dir / plot_name)
            other_plot_paths['residuals_weekly'] = plot_name
            if err: logger.warning(f"Plotting error (weekly residuals): {err}")
            if not residuals_weekly.empty: residuals_for_analysis = residuals_weekly  # Prefer weekly if available
    else:
        logger.warning("Skipping weekly decomposition: insufficient data length.")

    # Decide which residuals plot to link in the stationarity section.
    # Guard against the all-None case: if decomposition produced nothing,
    # 'None is None' would otherwise match the first branch.
    if residuals_for_analysis is not None and residuals_for_analysis is residuals_weekly:
        other_plot_paths['residuals'] = other_plot_paths.get('residuals_weekly', 'placeholder.png')
        series_name_stat_tested = "Weekly Residuals"
    elif residuals_for_analysis is not None and residuals_for_analysis is residuals_daily:
        other_plot_paths['residuals'] = other_plot_paths.get('residuals_daily', 'placeholder.png')
        series_name_stat_tested = "Daily Residuals"
    else:
        series_name_stat_tested = None  # No residuals available for the tests

    # 5. Stationarity Analysis
    logger.info("--- Step 5: Stationarity Analysis ---")
    stationarity_file_path = output_dir / "stationarity_tests.json"
    if series_name_stat_tested and residuals_for_analysis is not None and not residuals_for_analysis.empty:
        logger.info(f"Performing tests on: {series_name_stat_tested}")
        stationarity_results_dict, err = perform_stationarity_tests(residuals_for_analysis)
        if err: logger.error(f"Stationarity tests failed: {err}")
        elif stationarity_results_dict:
            logger.info(f"Saving stationarity test results to {stationarity_file_path}")
            try:
                # Convert numpy scalars in the critical values to plain floats
                # so they serialize cleanly to JSON
                adf_res = stationarity_results_dict.get('adf', {})
                kpss_res = stationarity_results_dict.get('kpss', {})
                adf_crit = adf_res.get('Critical Values', {})
                kpss_crit = kpss_res.get('Critical Values', {})
                if isinstance(adf_crit, dict):
                    adf_res['Critical Values'] = {k: float(v) for k, v in adf_crit.items()}
                if isinstance(kpss_crit, dict):
                    kpss_res['Critical Values'] = {k: float(v) for k, v in kpss_crit.items()}

                results_to_save = {
                    "series_tested": series_name_stat_tested,
                    "adf": adf_res,
                    "kpss": kpss_res
                }
                with open(stationarity_file_path, 'w') as f:
                    json.dump(results_to_save, f, indent=4)

            except (IOError, TypeError) as e:
                logger.error(f"Failed to write stationarity results to {stationarity_file_path}: {e}")

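            # Note: json.dump cannot serialize numpy arrays, and some numpy
            # scalar types (e.g. np.float32) are not JSON-serializable either,
            # hence the defensive float() casts above.
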
            # Log key results
            logger.info(f"Stationarity Test Results ({series_name_stat_tested}):")
            if 'adf' in stationarity_results_dict and stationarity_results_dict['adf']:
                adf_p = stationarity_results_dict['adf'].get('p-value', 'N/A')
                # Guard the format spec: the 'N/A' fallback is a string, and :.4f would raise on it
                if isinstance(adf_p, str):
                    logger.info(f"  ADF p-value: {adf_p}")
                else:
                    logger.info(f"  ADF p-value: {adf_p:.4f}")
            if 'kpss' in stationarity_results_dict and stationarity_results_dict['kpss']:
                # Handle string p-values from KPSS
                kpss_p = stationarity_results_dict['kpss'].get('p-value', 'N/A')
                if isinstance(kpss_p, str):
                    logger.info(f"  KPSS p-value: {kpss_p}")
                else:
                    logger.info(f"  KPSS p-value: {kpss_p:.4f}")
    else:
        logger.warning("Skipping Stationarity Analysis as no suitable residual series is available.")

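    # Interpretation reminder: ADF tests the null hypothesis of a unit root
    # (a low p-value is evidence of stationarity), while KPSS tests the null
    # of stationarity (a low p-value is evidence of non-stationarity).
    # Residuals that reject ADF and fail to reject KPSS are the clean case.
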
    # 6. Autocorrelation Analysis
    logger.info("--- Step 6: Autocorrelation Analysis ---")
    # Import plot_acf / plot_pacf from statsmodels graphics directly for saving
    from statsmodels.graphics.tsaplots import plot_acf, plot_pacf
    import matplotlib.pyplot as plt

    if series_name_stat_tested and residuals_for_analysis is not None and not residuals_for_analysis.empty:
        series_name_acf = series_name_stat_tested.lower().replace(' ', '_')
        base_name = f"09_{series_name_acf}"
        err_acf = None
        err_pacf = None
        try:
            # Create figure and axes explicitly so the figure can be saved and closed
            fig_acf, ax_acf = plt.subplots()
            plot_acf(residuals_for_analysis, lags=48, ax=ax_acf, title=f'ACF - {series_name_stat_tested}')
            plot_name_acf = f"{base_name}_acf.png"
            fig_acf.savefig(plots_dir / plot_name_acf)
            plt.close(fig_acf)  # Close the figure after saving to free memory
            acf_pacf_plot_paths['acf'] = plot_name_acf
        except Exception as e:
            err_acf = e

        try:
            fig_pacf, ax_pacf = plt.subplots()
            plot_pacf(residuals_for_analysis, lags=48, ax=ax_pacf, title=f'PACF - {series_name_stat_tested}', method='ywm')
            plot_name_pacf = f"{base_name}_pacf.png"
            fig_pacf.savefig(plots_dir / plot_name_pacf)
            plt.close(fig_pacf)
            acf_pacf_plot_paths['pacf'] = plot_name_pacf
        except Exception as e:
            err_pacf = e

        if err_acf: logger.warning(f"Plotting error (ACF for {series_name_stat_tested}): {err_acf}")
        if err_pacf: logger.warning(f"Plotting error (PACF for {series_name_stat_tested}): {err_pacf}")

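        # lags=48 spans two full daily cycles of hourly data, enough to expose
        # a 24-hour seasonal signature; 'ywm' is one of statsmodels'
        # Yule-Walker PACF estimators.
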
        # Weekly Autocorrelation Analysis
        try:
            plot_name = f"09c_weekly_autocorr_{series_name_acf}.png"
            err = plot_weekly_autocorrelation(
                series=residuals_for_analysis,
                series_name=series_name_stat_tested,
                output_path=plots_dir / plot_name,
                max_weeks=4
            )
            if not err: acf_pacf_plot_paths['weekly_autocorr'] = plot_name
            else: logger.warning(f"Plotting error (weekly autocorrelation): {err}")
        except Exception as e:
            logger.warning(f"Error in weekly autocorrelation analysis: {e}")

    else:
        logger.warning("Skipping Autocorrelation Analysis as no suitable series is available.")

    # 7. Exogenous Variable Analysis (if any exist)
    logger.info("--- Step 7: Exogenous Variable Analysis ---")
    logger.info("--- No exogenous variables in this dataset; skipping. ---")

    # 8. Generate LaTeX Report
    logger.info("--- Step 8: Generate LaTeX Report ---")

    # --- Decomposition model and ACF/PACF lags used ---
    # These are currently hardcoded in the pipeline steps above
    decomp_model_used = 'additive'
    acf_pacf_lags_used = 48

    # Create the ReportData object, now including the imputation message
    report_data = ReportData(
        descriptive_stats={'desc_price': desc_stats_price} if desc_stats_price is not None else None,
        stationarity_tests=stationarity_results_dict,
        summary_data=summary_data_dict,    # Pass the summary dict directly
        imputation_message=imputation_msg  # Pass the generated message
    )
    try:
        generate_latex_report(
            output_dir=output_dir,
            df=df,
            report_data=report_data,
            series_name_stat=series_name_stat_tested,
            acf_pacf_plot_paths=acf_pacf_plot_paths,
            decomposition_plot_paths=decomposition_plot_paths,
            other_plot_paths=other_plot_paths,
            decomposition_model=decomp_model_used,  # Pass the model used
            acf_pacf_lags=acf_pacf_lags_used,       # Pass the lags used
            template_path=settings.latex_template_file
        )
    except (FileNotFoundError, IOError, ValueError, RuntimeError) as e:
        logger.error(f"Report generation failed: {e}", exc_info=True)
        # The pipeline continues so earlier artifacts remain usable; uncomment
        # to make a report failure fatal instead:
        # raise SystemExit(1) from e

logger.info(f"EDA Pipeline execution finished. Review logs and generated files in {output_dir}.")
|
||||
# The message about compiling manually is now handled within generate_latex_report if compilation fails
|
||||
# logger.info(f"Compile the report: cd '{output_dir.resolve()}' && pdflatex eda_report.tex")
|
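

# Typical invocation (a usage sketch; the real entry point may differ):
#
#   from data_analysis.analysis.pipeline import run_eda_pipeline
#   run_eda_pipeline()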