init
.gitignore (vendored, new file)
@@ -0,0 +1,110 @@
# ---> General Python <---
__pycache__/
*.py[cod]
*$py.class
*.so
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST

# PyInstaller
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py,cover
.hypothesis/
.pytest_cache/
cover/

# Translations
*.mo
*.pot

# Django/Flask/Scrapy/Sphinx/PyBuilder specific (can often be removed if not used)
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal
instance/
.webassets-cache
.scrapy
docs/_build/
.pybuilder/
target/

# Jupyter Notebook
.ipynb_checkpoints

# IPython
profile_default/
ipython_config.py

# Environments & Secrets
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/
secrets.yaml
*.credential

# IDE / Editor Settings
.vscode/
.idea/
.spyderproject
.spyproject
.ropeproject

# Linting / Type Checking Caches
.mypy_cache/
.dmypy.json
dmypy.json
.pyre/
.pytype/
cython_debug/

# Other Tooling
# Cursor state
.continue/

# ---> Project Specific <---
input/
output/
tests/
# Often generated reports
*.xlsx
# Often generated
*.csv

# If temporary planning files are used
data/
output/
config.yaml (new file)
@@ -0,0 +1 @@

data_analysis.py (new file)
@@ -0,0 +1,75 @@
import argparse
import logging
import sys
from pathlib import Path
import time

# Import necessary components from your project structure
from data_analysis.utils.config_model import load_settings, Settings  # Import loading function and model
from data_analysis.analysis.pipeline import run_eda_pipeline  # Import the pipeline entry point

# Silence overly verbose libraries if needed (e.g., matplotlib)
mpl_logger = logging.getLogger('matplotlib')
mpl_logger.setLevel(logging.WARNING)  # Example: set to WARNING or ERROR

# --- Basic Logging Setup ---
# Configure logging early to catch basic issues.
# The level might be adjusted after config loading.
logging.basicConfig(level=logging.INFO,
                    format='%(asctime)s - %(levelname)-7s - %(message)s',
                    datefmt='%H:%M:%S')
# Get the root logger
logger = logging.getLogger()

# --- Argument Parsing ---
def parse_arguments():
    """Parses command-line arguments."""
    parser = argparse.ArgumentParser(
        description="Run the Energy Forecasting EDA pipeline.",
        formatter_class=argparse.ArgumentDefaultsHelpFormatter
    )
    parser.add_argument(
        '-c', '--config',
        type=str,
        default='config.yaml',  # Provide a default config file name
        help="Path to the YAML configuration file."
    )
    # Add other potential command-line overrides here if needed later
    # parser.add_argument('--debug', action='store_true', help="Override log level to DEBUG.")

    args = parser.parse_args()
    return args

# --- Main Execution ---
def main():
    """Main execution function."""
    args = parse_arguments()
    config_path = Path(args.config)
    start_time = time.perf_counter()

    # --- Configuration Loading ---
    _ = load_settings(config_path)
    logger.info(f"Using configuration from: {config_path.resolve()} (or defaults if loading failed)")

    # --- Pipeline Execution ---
    try:
        # Call the main function from your pipeline module
        run_eda_pipeline()

        end_time = time.perf_counter()
        logger.info(f"Main script finished successfully in {end_time - start_time:.2f} seconds.")

    except SystemExit as e:
        # Catch SystemExit if the pipeline runner exits intentionally
        logger.warning(f"Pipeline exited with code {e.code}.")
        sys.exit(e.code)  # Propagate exit code
    except Exception as e:
        logger.critical(f"A critical error occurred during pipeline execution: {e}", exc_info=True)
        end_time = time.perf_counter()
        logger.info(f"Main script failed after {end_time - start_time:.2f} seconds.")
        sys.exit(1)
    return

if __name__ == "__main__":
    main()
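Note: data_analysis/utils/config_model.py is imported above (and throughout the package) but its hunk is not shown in this view. Purely for orientation, a minimal sketch of what the entry point appears to assume; the pydantic base class, the defaults, and the YAML loading are assumptions, and only the field names are taken from how `settings` is used elsewhere in this commit.

# Hypothetical sketch only -- not the committed data_analysis/utils/config_model.py.
from pathlib import Path
from typing import Optional
import logging

import yaml
from pydantic import BaseModel  # assumption: a pydantic-style settings model

logger = logging.getLogger(__name__)

class Settings(BaseModel):
    # Field names mirror how `settings` is used in the modules of this commit.
    data_file: Path = Path("input/prices.csv")   # placeholder default
    output_dir: Path = Path("output")
    zoom_start_date: Optional[str] = None
    zoom_end_date: Optional[str] = None
    expected_data_frequency: str = "h"
    latex_template_file: Optional[Path] = None
    debug: bool = False

settings = Settings()  # module-level instance, as imported by the pipeline modules

def load_settings(config_path: Path) -> Settings:
    """Load the YAML config if present; otherwise fall back to defaults."""
    global settings
    try:
        with open(config_path) as f:
            data = yaml.safe_load(f) or {}
        settings = Settings(**data)
    except FileNotFoundError:
        logger.warning(f"Config file {config_path} not found; using defaults.")
    return settings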
data_analysis/__init__.py (new file, empty)

data_analysis/analysis/__init__.py (new file, empty)
data_analysis/analysis/data.py (new file)
@@ -0,0 +1,126 @@
import logging
import pandas as pd
from typing import Tuple, Optional, Dict, Any

import warnings
from statsmodels.tools.sm_exceptions import InterpolationWarning

# Import analysis tools - ensure statsmodels is installed
from statsmodels.tsa.seasonal import seasonal_decompose, DecomposeResult
from statsmodels.tsa.stattools import adfuller, kpss

logger = logging.getLogger(__name__)

# PRICE_COL constant moved to io.data_handling

def perform_decomposition(series: pd.Series, model: str = 'additive', period: int = 24) -> Tuple[Optional[DecomposeResult], Optional[str]]:
    """
    Performs time series decomposition using statsmodels.

    Args:
        series: The time series data (e.g., df['Price']).
        model: Type of decomposition ('additive' or 'multiplicative').
        period: The period of the seasonality.

    Returns:
        A tuple containing:
        - DecomposeResult | None: The decomposition result object.
        - str | None: Error message, otherwise None.
    """
    logger.info(f"Performing {model} decomposition with period {period}...")
    result = None
    err = None
    # Check if series is empty or None before proceeding
    if series is None or series.empty:
        err = "Input series for decomposition is empty or None."
        logger.error(err)
        return None, err
    try:
        if len(series) < 2 * period:
            err = f"Series is too short for decomposition with period {period} (length {len(series)})."
            logger.error(err)
            return None, err
        # Ensure Series has a DatetimeIndex with frequency for extrapolate_trend
        if not isinstance(series.index, pd.DatetimeIndex) or series.index.freq is None:
            logger.warning("Series index is not a DatetimeIndex with frequency. Decomposition might be less reliable.")
            # Consider removing extrapolate_trend or handling differently if freq is often missing
            result = seasonal_decompose(series, model=model, period=period)
        else:
            result = seasonal_decompose(series, model=model, period=period, extrapolate_trend='freq')
        logger.info("Decomposition successful.")
    except ValueError as ve:
        # Catch specific ValueError often related to NaNs or period issues
        err = f"ValueError during decomposition (check for NaNs or period > series length/2): {ve}"
        logger.error(err, exc_info=True)
    except Exception as e:
        err = f"Error during decomposition: {e}"
        logger.error(err, exc_info=True)

    return result, err


def perform_stationarity_tests(series: pd.Series) -> Tuple[Optional[Dict[str, Any]], Optional[str]]:
    """
    Performs ADF and KPSS stationarity tests.

    Args:
        series: The time series to test (often residuals or differenced series).

    Returns:
        A tuple containing:
        - dict | None: Dictionary containing test results ('adf', 'kpss').
        - str | None: Error message, otherwise None.
    """
    logger.info("Performing stationarity tests (ADF, KPSS)...")
    results = {}
    err = None
    # Check if series is empty or None
    if series is None or series.empty:
        err = "Input series for stationarity tests is empty or None."
        logger.error(err)
        return None, err
    # Check for NaNs
    if series.isnull().any():
        err = "Input series contains NaNs. Please handle missing values before testing stationarity."
        logger.error(err)
        return None, err

    try:
        # ADF Test
        adf_test = adfuller(series, autolag='AIC')
        adf_keys = ['Test Statistic',
                    'p-value',
                    '#Lags Used',
                    '#Observations Used',
                    'Critical Values',
                    'IC Best'  # Added by newer statsmodels
                    ]
        # Only map existing keys from result tuple
        results['adf'] = {key: val for key, val in zip(adf_keys, adf_test) if key != 'IC Best'}
        # Add IC Best separately if it exists
        if len(adf_test) > 5: results['adf']['IC Best'] = adf_test[5]
        logger.debug(f"ADF Test Results: {results['adf']}")

        # KPSS Test (common to test for level stationarity 'c')
        with warnings.catch_warnings():  # Suppress known KPSS p-value interpolation warnings
            warnings.filterwarnings("ignore", category=InterpolationWarning)
            kpss_test = kpss(series, regression='c', nlags="auto")
        kpss_keys = ['Test Statistic',
                     'p-value',
                     '#Lags Used',
                     'Critical Values'
                     ]
        results['kpss'] = {key: val for key, val in zip(kpss_keys, kpss_test)}
        # Handle potential p-value bounds reported as strings
        if isinstance(results['kpss']['p-value'], str):
            logger.warning(f"KPSS p-value reported as bounds: {results['kpss']['p-value']}")
        logger.debug(f"KPSS Test Results: {results['kpss']}")

        logger.info("Stationarity tests completed.")

    except Exception as e:
        err = f"Error performing stationarity tests: {e}"
        logger.error(err, exc_info=True)
        results = None

    return results, err
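Not part of the commit, but as a quick orientation: a hedged usage sketch of the two helpers above on a synthetic hourly series (the data, seed, and values are illustrative). The 2*period length requirement and the NaN-free input for the stationarity tests come directly from the checks in the code above.

# Illustrative usage only (not part of the commit).
import numpy as np
import pandas as pd

from data_analysis.analysis.data import perform_decomposition, perform_stationarity_tests

idx = pd.date_range("2024-01-01", periods=24 * 14, freq="h")  # two weeks of hourly data
prices = pd.Series(
    50 + 10 * np.sin(2 * np.pi * idx.hour / 24) + np.random.default_rng(0).normal(0, 2, len(idx)),
    index=idx, name="Price",
)

decomp, err = perform_decomposition(prices, model="additive", period=24)
if err is None:
    residuals = decomp.resid.dropna()  # drop any edge NaNs before testing
    tests, err = perform_stationarity_tests(residuals)
    if err is None:
        print("ADF p-value:", tests["adf"]["p-value"])
        print("KPSS p-value:", tests["kpss"]["p-value"])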
data_analysis/analysis/pipeline.py (new file)
@@ -0,0 +1,387 @@
import logging
from pathlib import Path
import pandas as pd
import json
from typing import Optional, Dict, List, Any
# Use utils for config if that's the structure
from data_analysis.utils.config_model import settings
import datetime

logger = logging.getLogger(__name__)

# --- Import data handling functions from io ---
from data_analysis.io.data_handling import (
    load_and_prepare_data,
    get_data_summary,
    get_descriptive_stats,
    PRICE_COL,      # Standardized price column name
    PRICE_COL_RAW   # Raw price column name (needed for check below)
)
# --- Import analysis functions from analysis ---
from .data import (
    perform_decomposition,
    perform_stationarity_tests,
)
# --- Import plotting functions ---
from data_analysis.io.plotting import (
    plot_full_time_series,
    plot_zoomed_time_series,
    plot_boxplot_by_period,
    plot_histogram,
    plot_decomposition as plot_decomposition_results,  # Rename to avoid clash
    plot_residuals,
    plot_acf_pacf,
    plot_seasonal_subseries,
    plot_cross_correlation,
    plot_weekly_autocorrelation
)
# --- Import report generator ---
from ..io.report import generate_latex_report
from data_analysis.utils.report_model import ReportData


# --- Modified Pipeline Function ---
def run_eda_pipeline():
    """
    Orchestrates the Exploratory Data Analysis process using loaded settings
    and generates a LaTeX report.
    """
    logger.info("Starting Exploratory Data Analysis Pipeline (LaTeX Report)...")
    output_dir = settings.output_dir
    plots_dir = output_dir / "plots"  # Define plots subdirectory

    # Ensure output directories exist
    try:
        output_dir.mkdir(parents=True, exist_ok=True)
        plots_dir.mkdir(parents=True, exist_ok=True)
        logger.info(f"Output directory set to: {output_dir.resolve()}")
        logger.info(f"Plots directory set to: {plots_dir.resolve()}")
    except Exception as e:
        logger.error(f"Failed to create output directories: {e}", exc_info=True)
        raise SystemExit(1) from e

    # --- Data Holders ---
    df: Optional[pd.DataFrame] = None
    summary_data_dict: Optional[dict] = None
    desc_stats_price: Optional[pd.Series] = None
    residuals_daily: Optional[pd.Series] = None
    residuals_weekly: Optional[pd.Series] = None
    stationarity_results_dict: Optional[dict] = None
    series_name_stat_tested: Optional[str] = None

    # --- Plot Path Collectors ---
    other_plot_paths: Dict[str, str] = {}
    acf_pacf_plot_paths: Dict[str, str] = {}
    decomposition_plot_paths: Dict[str, str] = {}


    # --- Pipeline Steps ---

    # 1. Load Data
    logger.info("--- Step 1: Load Data ---")
    # Store initial raw state temporarily to check missing values before preparation
    df_raw = pd.read_csv(settings.data_file, header=0)
    initial_missing_price = 0
    if PRICE_COL_RAW in df_raw.columns:
        # Check missing in the raw numeric column before full processing
        initial_missing_price = pd.to_numeric(df_raw[PRICE_COL_RAW], errors='coerce').isnull().sum()
    else:
        logger.warning(f"Raw price column '{PRICE_COL_RAW}' not found for initial missing value check.")

    df, err = load_and_prepare_data(settings.data_file)
    if err or df is None:
        logger.error(f"Data loading failed: {err or 'Unknown error'}. Stopping pipeline.")
        raise SystemExit(1)
    logger.info(f"Data loaded successfully. Shape: {df.shape}")
    logger.info(f"Columns: {', '.join(df.columns)}")

    # Construct imputation message based on initial check and final state
    imputation_msg = "No missing price values detected."
    final_missing_price = df[PRICE_COL].isnull().sum()  # Should be 0 after load_and_prepare
    if initial_missing_price > 0:
        if final_missing_price == 0:
            imputation_msg = f"{initial_missing_price} missing price value(s) were detected and imputed (ffill/bfill)."
        else:
            imputation_msg = f"{initial_missing_price} missing price value(s) were detected, imputation may be incomplete ({final_missing_price} remain)."
    elif df.isnull().sum().sum() > final_missing_price:  # Check if other columns have NaNs
        imputation_msg = "Missing values detected in non-price columns (if any). Price column had no missing values."


    # 2. Initial Inspection & Summary Stats
    logger.info("--- Step 2: Initial Inspection & Summary ---")
    summary_data_dict, err = get_data_summary(df)
    summary_file_path = output_dir / "summary_data.txt"
    if err:
        logger.error(f"Failed to get data summary: {err}")
    elif summary_data_dict:
        logger.info(f"Saving data summary to {summary_file_path}")
        try:
            with open(summary_file_path, 'w') as f:
                f.write("--- Data Summary ---\n\n")
                f.write(f"Data Source: {settings.data_file.name}\n")
                f.write(f"Date Range: {df.index.min()} to {df.index.max()}\n")
                f.write(f"Number of Points: {len(df)}\n\n")
                f.write("First 5 Rows:\n")
                f.write(summary_data_dict['head'].to_string())
                f.write("\n\nLast 5 Rows:\n")
                f.write(summary_data_dict['tail'].to_string())
                f.write("\n\nData Types:\n")
                f.write(summary_data_dict['dtypes'].to_string())
                f.write("\n\nMissing Value Counts (Post Initial Handling):\n")  # Updated comment
                f.write(summary_data_dict['missing'].to_string())
                f.write("\n")
        except IOError as e:
            logger.error(f"Failed to write data summary to {summary_file_path}: {e}")
        # Log summaries as well
        logger.info(f"Head:\n{summary_data_dict['head'].to_string()}")
        logger.info(f"Tail:\n{summary_data_dict['tail'].to_string()}")
        logger.info(f"Data Types:\n{summary_data_dict['dtypes']}")
        # Keep it for later
        # logger.info(f"Missing Values (Post Initial Handling):\n{summary_data_dict['missing']}")


    # Descriptive Stats
    desc_stats_price, err = get_descriptive_stats(df, price_col=PRICE_COL)
    desc_stats_file_path = output_dir / "descriptive_stats_price.csv"  # Make filename specific
    if err:
        logger.error(f"Failed to get descriptive stats for {PRICE_COL}: {err}")
    elif desc_stats_price is not None:
        logger.info(f"Saving price descriptive stats to {desc_stats_file_path}")
        try:
            # Ensure it's a Series before calling to_csv with header=True
            if isinstance(desc_stats_price, pd.Series):
                desc_stats_price.to_csv(desc_stats_file_path, header=True)
            else:  # If it returns a DataFrame (unlikely for a single column, but safe)
                desc_stats_price.to_csv(desc_stats_file_path)
        except IOError as e:
            logger.error(f"Failed to write price descriptive stats to {desc_stats_file_path}: {e}")
        logger.info(f"Price Descriptive Stats:\n{desc_stats_price.to_string()}")


    # 3. Visualizations (Main Price Series)
    logger.info("--- Step 3: Visualizations (Price) ---")
    plot_name = "01_full_timeseries.png"
    err = plot_full_time_series(df, PRICE_COL, plots_dir / plot_name)
    if not err: other_plot_paths['full_timeseries'] = plot_name
    else: logger.warning(f"Plotting error (full series): {err}")

    if settings.zoom_start_date and settings.zoom_end_date:
        plot_name = "02_zoomed_timeseries.png"
        err = plot_zoomed_time_series(df, PRICE_COL, settings.zoom_start_date, settings.zoom_end_date, plots_dir / plot_name)
        if not err: other_plot_paths['zoomed_timeseries'] = plot_name
        else: logger.warning(f"Plotting error (zoomed series): {err}")

    for period in ['hour', 'dayofweek', 'month', 'year']:
        plot_name = f"03_boxplot_{period}.png"
        err = plot_boxplot_by_period(df, PRICE_COL, period, plots_dir / plot_name)
        if not err: other_plot_paths[f'boxplot_{period}'] = plot_name
        else: logger.warning(f"Plotting error (boxplot {period}): {err}")

    plot_name = "04_histogram_price.png"
    err = plot_histogram(df, PRICE_COL, plots_dir / plot_name)
    if not err: other_plot_paths['histogram_price'] = plot_name
    else: logger.warning(f"Plotting error (histogram): {err}")

    # Optional: Seasonal Subseries Plots
    plot_name = "04a_seasonal_subseries_daily.png"
    err = plot_seasonal_subseries(df, PRICE_COL, period=24, period_name="Daily", output_path=plots_dir / plot_name)
    if not err: other_plot_paths['seasonal_subseries_daily'] = plot_name
    else: logger.warning(f"Plotting error (subseries daily): {err}")

    if len(df) > 168:  # Check if enough data for weekly
        plot_name = "04b_seasonal_subseries_weekly.png"
        err = plot_seasonal_subseries(df, PRICE_COL, period=168, period_name="Weekly", output_path=plots_dir / plot_name)
        if not err: other_plot_paths['seasonal_subseries_weekly'] = plot_name
        else: logger.warning(f"Plotting error (subseries weekly): {err}")


    # 4. Decomposition
    logger.info("--- Step 4: Decomposition ---")
    residuals_for_analysis: Optional[pd.Series] = None  # Track which residuals to use later

    # Daily
    decomp_daily, err = perform_decomposition(df[PRICE_COL], model='additive', period=24)
    if err: logger.error(f"Daily decomposition failed: {err}")
    elif decomp_daily:
        plot_name = "05_decomposition_daily.png"
        err = plot_decomposition_results(decomp_daily, "Daily (Period=24)", plots_dir / plot_name)
        if not err: decomposition_plot_paths['daily'] = plot_name
        else: logger.warning(f"Plotting error (daily decomp): {err}")

        residuals_daily = decomp_daily.resid.dropna()
        plot_name = "06_residuals_daily.png"
        err = plot_residuals(residuals_daily, "Daily Decomp", plots_dir / plot_name)
        # Save path regardless of error, report might reference it
        other_plot_paths['residuals_daily'] = plot_name
        if err: logger.warning(f"Plotting error (daily residuals): {err}")
        if not residuals_daily.empty: residuals_for_analysis = residuals_daily  # Prefer daily initially

    # Weekly
    if len(df) >= 168 * 2:
        decomp_weekly, err = perform_decomposition(df[PRICE_COL], model='additive', period=168)
        if err: logger.error(f"Weekly decomposition failed: {err}")
        elif decomp_weekly:
            plot_name = "07_decomposition_weekly.png"
            err = plot_decomposition_results(decomp_weekly, "Weekly (Period=168)", plots_dir / plot_name)
            if not err: decomposition_plot_paths['weekly'] = plot_name
            else: logger.warning(f"Plotting error (weekly decomp): {err}")

            residuals_weekly = decomp_weekly.resid.dropna()
            plot_name = "08_residuals_weekly.png"
            err = plot_residuals(residuals_weekly, "Weekly Decomp", plots_dir / plot_name)
            other_plot_paths['residuals_weekly'] = plot_name
            if err: logger.warning(f"Plotting error (weekly residuals): {err}")
            if not residuals_weekly.empty: residuals_for_analysis = residuals_weekly  # Prefer weekly if available
    else:
        logger.warning("Skipping weekly decomposition, data length insufficient.")

    # Decide which residuals plot to link in the stationarity section
    if residuals_for_analysis is residuals_weekly:
        other_plot_paths['residuals'] = other_plot_paths.get('residuals_weekly', 'placeholder.png')
        series_name_stat_tested = "Weekly Residuals"
    elif residuals_for_analysis is residuals_daily:
        other_plot_paths['residuals'] = other_plot_paths.get('residuals_daily', 'placeholder.png')
        series_name_stat_tested = "Daily Residuals"
    else:
        series_name_stat_tested = None  # No residuals available for tests


    # 5. Stationarity Analysis
    logger.info("--- Step 5: Stationarity Analysis ---")
    stationarity_file_path = output_dir / "stationarity_tests.json"
    if series_name_stat_tested and residuals_for_analysis is not None and not residuals_for_analysis.empty:
        logger.info(f"Performing tests on: {series_name_stat_tested}")
        stationarity_results_dict, err = perform_stationarity_tests(residuals_for_analysis)
        if err: logger.error(f"Stationarity tests failed: {err}")
        elif stationarity_results_dict:
            logger.info(f"Saving stationarity test results to {stationarity_file_path}")
            try:
                # Convert numpy arrays/types in critical values to plain floats for JSON serialization
                adf_res = stationarity_results_dict.get('adf', {})
                kpss_res = stationarity_results_dict.get('kpss', {})
                adf_crit = adf_res.get('Critical Values', {})
                kpss_crit = kpss_res.get('Critical Values', {})
                if isinstance(adf_crit, dict):
                    adf_res['Critical Values'] = {k: float(v) for k, v in adf_crit.items()}
                if isinstance(kpss_crit, dict):
                    kpss_res['Critical Values'] = {k: float(v) for k, v in kpss_crit.items()}

                results_to_save = {
                    "series_tested": series_name_stat_tested,
                    "adf": adf_res,
                    "kpss": kpss_res
                }
                with open(stationarity_file_path, 'w') as f:
                    json.dump(results_to_save, f, indent=4)

            except (IOError, TypeError) as e:
                logger.error(f"Failed to write stationarity results to {stationarity_file_path}: {e}")

            # Log key results
            logger.info(f"Stationarity Test Results ({series_name_stat_tested}):")
            if 'adf' in stationarity_results_dict and stationarity_results_dict['adf']:
                logger.info(f"  ADF p-value: {stationarity_results_dict['adf'].get('p-value', 'N/A'):.4f}")
            if 'kpss' in stationarity_results_dict and stationarity_results_dict['kpss']:
                # Handle string p-values from KPSS
                kpss_p = stationarity_results_dict['kpss'].get('p-value', 'N/A')
                if isinstance(kpss_p, str):
                    logger.info(f"  KPSS p-value: {kpss_p}")
                else:
                    logger.info(f"  KPSS p-value: {kpss_p:.4f}")
    else:
        logger.warning("Skipping Stationarity Analysis as no suitable residual series is available.")


    # 6. Autocorrelation Analysis
    logger.info("--- Step 6: Autocorrelation Analysis ---")
    # Import plot_acf, plot_pacf from statsmodels graphics directly for saving
    from statsmodels.graphics.tsaplots import plot_acf, plot_pacf
    import matplotlib.pyplot as plt

    if series_name_stat_tested and residuals_for_analysis is not None and not residuals_for_analysis.empty:
        series_name_acf = series_name_stat_tested.lower().replace(' ', '_')
        base_name = f"09_{series_name_acf}"
        err_acf = None; err_pacf = None
        try:
            # Create figure and axes explicitly
            fig_acf, ax_acf = plt.subplots()
            plot_acf(residuals_for_analysis, lags=48, ax=ax_acf, title=f'ACF - {series_name_stat_tested}')
            plot_name_acf = f"{base_name}_acf.png"
            fig_acf.savefig(plots_dir / plot_name_acf)
            plt.close(fig_acf)  # Close figure after saving
            acf_pacf_plot_paths['acf'] = plot_name_acf
        except Exception as e: err_acf = e

        try:
            # Create figure and axes explicitly
            fig_pacf, ax_pacf = plt.subplots()
            plot_pacf(residuals_for_analysis, lags=48, ax=ax_pacf, title=f'PACF - {series_name_stat_tested}', method='ywm')
            plot_name_pacf = f"{base_name}_pacf.png"
            fig_pacf.savefig(plots_dir / plot_name_pacf)
            plt.close(fig_pacf)  # Close figure after saving
            acf_pacf_plot_paths['pacf'] = plot_name_pacf
        except Exception as e: err_pacf = e

        if err_acf: logger.warning(f"Plotting error (ACF for {series_name_stat_tested}): {err_acf}")
        if err_pacf: logger.warning(f"Plotting error (PACF for {series_name_stat_tested}): {err_pacf}")

        # Add Weekly Autocorrelation Analysis
        try:
            plot_name = f"09c_weekly_autocorr_{series_name_acf}.png"
            err = plot_weekly_autocorrelation(
                series=residuals_for_analysis,
                series_name=series_name_stat_tested,
                output_path=plots_dir / plot_name,
                max_weeks=4
            )
            if not err: acf_pacf_plot_paths['weekly_autocorr'] = plot_name
            else: logger.warning(f"Plotting error (weekly autocorrelation): {err}")
        except Exception as e:
            logger.warning(f"Error in weekly autocorrelation analysis: {e}")

    else:
        logger.warning("Skipping Autocorrelation Analysis as no suitable series is available.")


    # 7. Exogenous Variable Analysis (if any exist)
    logger.info("--- Step 7: Exogenous Variable Analysis ---")
    logger.info("--- There are none.... Skipping ---")


    # 8. Generate LaTeX Report
    logger.info("--- Step 8: Generate LaTeX Report ---")

    # --- Determine Decomposition Model and ACF/PACF Lags Used ---
    # These are currently hardcoded in the pipeline steps
    decomp_model_used = 'additive'
    acf_pacf_lags_used = 48


    # Create ReportData object, now including imputation_message
    report_data = ReportData(
        descriptive_stats={'desc_price': desc_stats_price} if desc_stats_price is not None else None,
        stationarity_tests=stationarity_results_dict,
        summary_data=summary_data_dict,  # Pass the summary dict directly
        imputation_message=imputation_msg  # Pass the generated message
    )
    try:
        generate_latex_report(
            output_dir=output_dir,
            df=df,
            report_data=report_data,
            series_name_stat=series_name_stat_tested,
            acf_pacf_plot_paths=acf_pacf_plot_paths,
            decomposition_plot_paths=decomposition_plot_paths,
            other_plot_paths=other_plot_paths,
            decomposition_model=decomp_model_used,  # Pass the model used
            acf_pacf_lags=acf_pacf_lags_used,  # Pass the lags used
            template_path=settings.latex_template_file
        )
    except (FileNotFoundError, IOError, ValueError, RuntimeError) as e:
        logger.error(f"Report generation failed: {e}", exc_info=True)
        # Decide if pipeline should stop or continue
        # raise SystemExit(1) from e  # Option to stop pipeline

    logger.info(f"EDA Pipeline execution finished. Review logs and generated files in {output_dir}.")
    # The message about compiling manually is now handled within generate_latex_report if compilation fails
    # logger.info(f"Compile the report: cd '{output_dir.resolve()}' && pdflatex eda_report.tex")
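Note: data_analysis/utils/report_model.py is imported above but its hunk is not shown in this view. A minimal sketch of what the pipeline appears to assume; the field names are exactly those passed to ReportData above, while the dataclass form and the types are assumptions.

# Hypothetical sketch only -- not the committed data_analysis/utils/report_model.py.
from dataclasses import dataclass
from typing import Optional, Dict, Any

import pandas as pd

@dataclass
class ReportData:
    # Containers the pipeline fills in before calling generate_latex_report().
    descriptive_stats: Optional[Dict[str, pd.Series]] = None
    stationarity_tests: Optional[Dict[str, Any]] = None
    summary_data: Optional[Dict[str, Any]] = None
    imputation_message: Optional[str] = None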
data_analysis/io/__init__.py (new file, empty)
data_analysis/io/data_handling.py (new file)
@@ -0,0 +1,168 @@
import logging
from pathlib import Path
import pandas as pd
from typing import Tuple, Optional, Dict, Any

from data_analysis.utils.config_model import settings

logger = logging.getLogger(__name__)

# Define constants for column names related to raw loading
TIME_COL_RAW = "MTU (CET/CEST)"
PRICE_COL_RAW = "Day-ahead Price [EUR/MWh]"
PRICE_COL = "Price"  # Standardized column name after processing

def load_and_prepare_data(file_path: Path) -> Tuple[Optional[pd.DataFrame], Optional[str]]:
    """
    Loads the energy price CSV data, parses the time column, sets a
    DatetimeIndex, renames columns, checks frequency, and handles missing values.

    Args:
        file_path: Path to the input CSV file.

    Returns:
        A tuple containing:
        - pd.DataFrame: Processed DataFrame with DatetimeIndex and 'Price' column.
          May include other columns if they exist in the source.
        - str | None: Error message if loading fails, otherwise None.
    """
    logger.info(f"Attempting to load data from: {file_path.resolve()}")
    err = None
    df = None
    try:
        # Load data, assuming header is on the first row
        df = pd.read_csv(file_path, header=0)

        # Basic check for expected columns
        if TIME_COL_RAW not in df.columns or PRICE_COL_RAW not in df.columns:
            err = f"Missing expected columns '{TIME_COL_RAW}' or '{PRICE_COL_RAW}' in {file_path}"
            logger.error(err)
            return None, err

        # --- Time Parsing ---
        df['StartTime'] = df[TIME_COL_RAW].str.split(' - ', expand=True)[0]
        df['Timestamp'] = pd.to_datetime(df['StartTime'], format='%d.%m.%Y %H:%M', errors='coerce')

        original_len = len(df)
        df = df.dropna(subset=['Timestamp'])
        if len(df) < original_len:
            logger.warning(f"Dropped {original_len - len(df)} rows due to timestamp parsing errors.")

        # --- Set Index and Select Columns ---
        df = df.set_index('Timestamp')
        # Convert price column to numeric, coercing errors
        df[PRICE_COL] = pd.to_numeric(df[PRICE_COL_RAW], errors='coerce')

        # Keep the price column and any other potential exogenous columns
        # For now, just keep PRICE_COL, drop raw ones. Adapt if exog needed.
        cols_to_keep = [PRICE_COL] + [col for col in df.columns if col not in [TIME_COL_RAW, PRICE_COL_RAW, 'StartTime', PRICE_COL]]
        df = df[cols_to_keep].copy()

        # --- Handle Missing Prices ---
        missing_prices = df[PRICE_COL].isnull().sum()
        if missing_prices > 0:
            logger.warning(f"Found {missing_prices} missing '{PRICE_COL}' values. Forward-filling (ffill).")
            df[PRICE_COL] = df[PRICE_COL].ffill()
            if df[PRICE_COL].isnull().any():
                logger.warning("Missing values remain after ffill. Backward-filling (bfill).")
                df[PRICE_COL] = df[PRICE_COL].bfill()

        # --- Check Time Index Frequency ---
        df = df.sort_index()
        inferred_freq = pd.infer_freq(df.index)
        if inferred_freq == settings.expected_data_frequency:
            logger.info(f"Inferred index frequency matches the expected '{settings.expected_data_frequency}': ({inferred_freq}). Setting frequency as {inferred_freq}.")
            df = df.asfreq('h')
            missing_after_asfreq = df[PRICE_COL].isnull().sum()
            if missing_after_asfreq > 0:
                logger.warning(f"{missing_after_asfreq} NaNs appeared after setting frequency to Hourly. Forward-filling again.")
                df[PRICE_COL] = df[PRICE_COL].ffill().bfill()
        elif inferred_freq:
            logger.warning(f"Inferred frequency is '{inferred_freq}', not the expected '{settings.expected_data_frequency}'. Proceeding without setting frequency.")
        else:
            logger.warning("Could not infer frequency. Check data for gaps or irregularities. Proceeding without setting frequency.")
        duplicates = df.index.duplicated().sum()
        if duplicates > 0:
            logger.warning(f"Found {duplicates} duplicate timestamps. Keeping the first occurrence.")
            df = df[~df.index.duplicated(keep='first')]

        logger.info(f"Data loaded and prepared. Final shape: {df.shape}")

    except FileNotFoundError:
        err = f"Data file not found: {file_path}"
        logger.error(err)
    except Exception as e:
        err = f"An unexpected error occurred during data loading/preparation: {e}"
        logger.error(err, exc_info=True)
        df = None

    return df, err


def get_data_summary(df: pd.DataFrame) -> Tuple[Optional[Dict[str, Any]], Optional[str]]:
    """
    Generates summary information about the DataFrame.

    Args:
        df: The input DataFrame.

    Returns:
        A tuple containing:
        - dict | None: Dictionary with summary data ('head', 'tail', 'dtypes', 'missing').
        - str | None: Error message, otherwise None.
    """
    logger.info("Generating data summary...")
    summary = None
    err = None
    if df is None or df.empty:
        return None, "Input DataFrame is empty or None."
    try:
        summary = {
            'head': df.head(),
            'tail': df.tail(),
            'dtypes': df.dtypes,
            'missing': df.isnull().sum()
        }
        logger.info("Data summary generated.")
    except Exception as e:
        err = f"Error generating data summary: {e}"
        logger.error(err, exc_info=True)

    return summary, err


def get_descriptive_stats(df: pd.DataFrame, price_col: str = PRICE_COL) -> Tuple[Optional[pd.Series | pd.DataFrame], Optional[str]]:
    """
    Calculates descriptive statistics for specified column(s).

    Args:
        df: The input DataFrame.
        price_col: The name of the column (or list of columns) for stats.
                   Defaults to the standard 'Price' column.

    Returns:
        A tuple containing:
        - pd.Series | pd.DataFrame | None: Series/DataFrame with descriptive statistics.
        - str | None: Error message, otherwise None.
    """
    logger.info(f"Calculating descriptive statistics for column(s): '{price_col}'...")
    stats = None
    err = None
    if df is None or df.empty:
        return None, "Input DataFrame is empty or None."
    try:
        # Check if the target column(s) exist
        target_cols = [price_col] if isinstance(price_col, str) else price_col
        missing_cols = [col for col in target_cols if col not in df.columns]
        if missing_cols:
            err = f"Column(s) not found in DataFrame: {', '.join(missing_cols)}."
            logger.error(err)
            return None, err

        stats = df[price_col].describe()  # .describe() works on Series and DataFrame
        logger.info("Descriptive statistics calculated.")
    except Exception as e:
        err = f"Error calculating descriptive statistics: {e}"
        logger.error(err, exc_info=True)

    return stats, err
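For orientation only (not part of the commit): a hedged sketch of calling the loader on a tiny CSV. The raw column names and the "start - end" timestamp format come from the constants and parsing above; the file path and values are made up, and the run still depends on the package's `settings` being importable.

# Illustrative usage only (not part of the commit).
from pathlib import Path
import pandas as pd

from data_analysis.io.data_handling import load_and_prepare_data, get_descriptive_stats, PRICE_COL

raw = pd.DataFrame({
    "MTU (CET/CEST)": [
        "01.01.2024 00:00 - 01.01.2024 01:00",
        "01.01.2024 01:00 - 01.01.2024 02:00",
        "01.01.2024 02:00 - 01.01.2024 03:00",
    ],
    "Day-ahead Price [EUR/MWh]": [50.1, 47.3, 45.8],
})
tmp_csv = Path("sample_prices.csv")  # hypothetical path
raw.to_csv(tmp_csv, index=False)

df, err = load_and_prepare_data(tmp_csv)
if err is None:
    stats, _ = get_descriptive_stats(df, price_col=PRICE_COL)
    print(stats)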
data_analysis/io/plotting.py (new file)
@@ -0,0 +1,398 @@
import logging
from pathlib import Path
import pandas as pd
import numpy as np  # Import numpy for CI calculation
import matplotlib.pyplot as plt
import seaborn as sns
from typing import Optional, List

# Import analysis tools for plotting results
from statsmodels.tsa.seasonal import DecomposeResult
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf, seasonal_plot
from statsmodels.tsa.stattools import ccf  # Import ccf

logger = logging.getLogger(__name__)

# --- Plotting Configuration ---
# Increase default figure size for better readability
plt.rcParams['figure.figsize'] = (15, 7)
# Use a clean style
plt.style.use('seaborn-v0_8-whitegrid')


def _save_plot(fig: plt.Figure, output_path: Path) -> Optional[str]:
    """Helper to save plots and handle errors."""
    err = None
    try:
        fig.tight_layout()  # Adjust layout before saving
        fig.savefig(output_path, dpi=150, bbox_inches='tight')
        logger.info(f"Plot saved to: {output_path}")
        plt.close(fig)  # Close the figure to free memory
    except Exception as e:
        err = f"Failed to save plot to {output_path}: {e}"
        logger.error(err, exc_info=True)
        plt.close(fig)  # Still try to close figure on error
    return err


def plot_full_time_series(df: pd.DataFrame, price_col: str, output_path: Path) -> Optional[str]:
    """Plots the entire time series."""
    logger.info(f"Generating full time series plot to {output_path}")
    fig, ax = plt.subplots()
    err = None
    try:
        sns.lineplot(data=df, x=df.index, y=price_col, ax=ax, linewidth=1)
        ax.set_title('Full Time Series: Price Over Time')
        ax.set_xlabel('Time')
        ax.set_ylabel(price_col)
        err = _save_plot(fig, output_path)
    except Exception as e:
        err = f"Error plotting full time series: {e}"
        logger.error(err, exc_info=True)
        plt.close(fig)
    return err


def plot_zoomed_time_series(df: pd.DataFrame, price_col: str, start_date: str, end_date: str, output_path: Path) -> Optional[str]:
    """Plots a specified time range of the series."""
    logger.info(f"Generating zoomed time series plot ({start_date} to {end_date}) to {output_path}")
    fig, ax = plt.subplots()
    err = None
    try:
        # Ensure start_date and end_date are compatible with index type
        df_zoomed = df.loc[start_date:end_date]
        if df_zoomed.empty:
            err = f"No data found in the specified zoom range: {start_date} to {end_date}"
            logger.warning(err)  # Use warning for empty range, not necessarily error
            plt.close(fig)
            return err
        sns.lineplot(data=df_zoomed, x=df_zoomed.index, y=price_col, ax=ax, linewidth=1)
        ax.set_title(f'Time Series: {start_date} to {end_date}')
        ax.set_xlabel('Time')
        ax.set_ylabel(price_col)
        err = _save_plot(fig, output_path)
    except Exception as e:
        err = f"Error plotting zoomed time series: {e}"
        logger.error(err, exc_info=True)
        plt.close(fig)
    return err


def plot_boxplot_by_period(df: pd.DataFrame, price_col: str, period: str, output_path: Path) -> Optional[str]:
    """
    Generates box plots of the price grouped by a specific time period.
    Periods: 'hour', 'dayofweek', 'month', 'year'.
    """
    logger.info(f"Generating box plot by {period} to {output_path}")
    fig, ax = plt.subplots()
    err = None
    try:
        # Create temporary grouping values for the period
        if period == 'hour':
            group_col = df.index.hour
            title = 'Price Distribution by Hour of Day'
            x_label = 'Hour'
        elif period == 'dayofweek':
            group_col = df.index.dayofweek  # Monday=0, Sunday=6
            title = 'Price Distribution by Day of Week'
            x_label = 'Day of Week (0=Mon, 6=Sun)'
        elif period == 'month':
            group_col = df.index.month
            title = 'Price Distribution by Month'
            x_label = 'Month'
        elif period == 'year':
            group_col = df.index.year
            title = 'Price Distribution by Year'
            x_label = 'Year'
        else:
            err = f"Unsupported period '{period}' for boxplot."
            logger.error(err)
            plt.close(fig)
            return err

        # Ensure group_col is numeric or categorical for plotting
        sns.boxplot(x=group_col, y=df[price_col], ax=ax, palette="viridis", hue=group_col)
        ax.set_title(title)
        ax.set_xlabel(x_label)
        ax.set_ylabel(price_col)
        err = _save_plot(fig, output_path)
    except Exception as e:
        err = f"Error plotting boxplot by {period}: {e}"
        logger.error(err, exc_info=True)
        plt.close(fig)
    return err


# New function signature for seasonal subseries plot
def plot_seasonal_subseries(df: pd.DataFrame, price_col: str, period: int, period_name: str, output_path: Path) -> Optional[str]:
    """
    Generates a seasonal subseries plot for a given period (e.g., 24 for daily).
    """
    logger.info(f"Generating seasonal subseries plot for {period_name} (period={period}) to {output_path}")
    err = None
    try:
        # Ensure the index is datetime and frequency is set or can be inferred
        if not isinstance(df.index, pd.DatetimeIndex):
            err = "DataFrame index must be a DatetimeIndex for seasonal subseries plot."
            logger.error(err)
            return err

        # Create the appropriate grouping based on the period
        if period == 24:  # Daily
            grouped = df[price_col].groupby(df.index.hour)
            xticklabels = [f"{i:02d}:00" for i in range(24)]
        elif period == 168:  # Weekly
            grouped = df[price_col].groupby(df.index.dayofweek)
            xticklabels = ['Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat', 'Sun']
        else:
            # For other periods, create a custom grouping
            grouped = df[price_col].groupby(df.index % period)
            xticklabels = [str(i) for i in range(period)]

        # Create the plot using seasonal_plot
        fig = seasonal_plot(grouped, xticklabels=xticklabels, ylabel=price_col)
        fig.suptitle(f'Seasonal Subseries Plot ({period_name})', y=1.02)
        fig.set_size_inches(15, 10)
        err = _save_plot(fig, output_path)
    except Exception as e:
        err = f"Error plotting seasonal subseries ({period_name}): {e}"
        logger.error(err, exc_info=True)
        plt.close('all')
    return err


def plot_histogram(df: pd.DataFrame, price_col: str, output_path: Path, bins: int = 50) -> Optional[str]:
    """Plots a histogram of the price values."""
    logger.info(f"Generating histogram of '{price_col}' to {output_path}")
    fig, ax = plt.subplots()
    err = None
    try:
        sns.histplot(data=df, x=price_col, bins=bins, kde=True, ax=ax)
        ax.set_title(f'Distribution of {price_col}')
        ax.set_xlabel(price_col)
        ax.set_ylabel('Frequency')
        err = _save_plot(fig, output_path)
    except Exception as e:
        err = f"Error plotting histogram: {e}"
        logger.error(err, exc_info=True)
        plt.close(fig)
    return err


def plot_decomposition(decomposition_result: DecomposeResult, period_name: str, output_path: Path) -> Optional[str]:
    """
    Plots the observed, trend, seasonal, and residual components from a
    time series decomposition result.
    """
    logger.info(f"Generating {period_name} decomposition plot to {output_path}")
    err = None
    try:
        # The plot method of DecomposeResult returns a Figure
        fig = decomposition_result.plot()
        fig.set_size_inches(15, 10)  # Adjust size for better visibility
        fig.suptitle(f'Time Series Decomposition ({period_name} Seasonality)', y=1.02)
        err = _save_plot(fig, output_path)
    except Exception as e:
        err = f"Error plotting decomposition ({period_name}): {e}"
        logger.error(err, exc_info=True)
        # No access to the fig object if decomposition_result.plot() fails early
        # Close all figures as a fallback
        plt.close('all')
    return err


def plot_residuals(residuals: pd.Series, title_suffix: str, output_path: Path) -> Optional[str]:
    """Plots the residuals over time."""
    logger.info(f"Generating residuals plot ({title_suffix}) to {output_path}")
    fig, ax = plt.subplots()
    err = None
    try:
        residuals.plot(ax=ax, title=f'Residuals ({title_suffix})')
        ax.set_xlabel('Time')
        ax.set_ylabel('Residual Value')
        # Add a horizontal line at zero
        ax.axhline(0, color='r', linestyle='--', alpha=0.7)
        err = _save_plot(fig, output_path)
    except Exception as e:
        err = f"Error plotting residuals ({title_suffix}): {e}"
        logger.error(err, exc_info=True)
        plt.close(fig)
    return err

def plot_acf_pacf(series: pd.Series, series_name: str, lags: int | None, output_path_base: Path) -> Optional[str]:
    """
    Plots the Autocorrelation Function (ACF) and Partial Autocorrelation
    Function (PACF) for a given series, saving them as separate files.
    """
    logger.info(f"Generating ACF/PACF plots for {series_name} to {output_path_base.parent}")
    err_acf = None
    err_pacf = None

    # Plot ACF
    try:
        fig_acf = plt.figure()
        ax_acf = fig_acf.add_subplot(111)
        plot_acf(series, lags=lags, ax=ax_acf, title=f'ACF - {series_name}')
        acf_path = output_path_base.with_name(f"{output_path_base.stem}_acf.png")
        err_acf = _save_plot(fig_acf, acf_path)
    except Exception as e:
        err_acf = f"Error plotting ACF for {series_name}: {e}"
        logger.error(err_acf, exc_info=True)
        plt.close(fig_acf)

    # Plot PACF
    try:
        fig_pacf = plt.figure()
        ax_pacf = fig_pacf.add_subplot(111)
        # Use method='ywm' for Yule-Walker method, often preferred
        plot_pacf(series, lags=lags, ax=ax_pacf, title=f'PACF - {series_name}', method='ywm')
        pacf_path = output_path_base.with_name(f"{output_path_base.stem}_pacf.png")
        err_pacf = _save_plot(fig_pacf, pacf_path)
    except Exception as e:
        err_pacf = f"Error plotting PACF for {series_name}: {e}"
        logger.error(err_pacf, exc_info=True)
        plt.close(fig_pacf)

    # Return the first error encountered, or None if both succeeded
    return err_acf or err_pacf


# Update cross-correlation plot function
def plot_cross_correlation(
    target_series: pd.Series,
    exog_series: pd.Series,
    target_name: str,
    exog_name: str,
    max_lags: int,
    output_path: Path
) -> Optional[str]:
    """
    Generates and saves a cross-correlation plot between a target series and an exogenous series.
    Plots correlation of target_series(t) with exog_series(t-lag).

    Args:
        target_series: The main time series to analyze
        exog_series: The exogenous time series to correlate with
        target_name: Name of the target series for labeling
        exog_name: Name of the exogenous series for labeling
        max_lags: Maximum number of lags to compute correlation for
        output_path: Where to save the plot

    Returns:
        Optional[str]: Error message if something went wrong, None if successful
    """
    logger.info(f"Generating cross-correlation plot ({target_name} vs {exog_name}) for lags up to {max_lags} to {output_path}")
    err = None
    try:
        # Ensure series are aligned and have no NaNs affecting calculation
        combined = pd.concat([target_series.rename(target_name), exog_series.rename(exog_name)], axis=1).dropna()

        # Check if we have enough data points
        if combined.empty or len(combined) <= max_lags:
            err = f"Not enough overlapping non-NaN data points between {target_name} and {exog_name} for CCF calculation (need > {max_lags})."
            # The caller logs this as a warning
            # logger.warning(err)
            return err

        # Check if the exogenous variable actually varies
        if exog_series.nunique() <= 1:
            err = f"Cannot compute cross-correlation: {exog_name} has no variation (all values are the same)."
            # The caller logs this as a warning
            # logger.warning(err)
            return err

        # Calculate CCF: ccf(x, y) computes corr(x[t], y[t-lag])
        # We want corr(target[t], exog[t-lag]), so order is ccf(target, exog)
        cross_corr_values = ccf(combined[target_name], combined[exog_name], adjusted=False, nlags=max_lags)
        lags_range = range(max_lags + 1)  # CCF includes lag 0

        # Plotting
        fig, ax = plt.subplots()
        markerline, stemlines, baseline = ax.stem(
            lags_range, cross_corr_values, markerfmt='o', basefmt="gray"
        )
        plt.setp(markerline, markersize=5)
        plt.setp(stemlines, linewidth=1)

        # Add approximate 95% confidence intervals (Bartlett's formula approximation)
        conf_level = 1.96 / np.sqrt(len(combined))
        ax.axhspan(-conf_level, conf_level, alpha=0.2, color='blue', zorder=0)

        ax.set_title(f'Cross-Correlation: {target_name}(t) vs {exog_name}(t-lag)')
        ax.set_xlabel('Lag (k)')
        ax.set_ylabel(f'Corr({target_name}(t), {exog_name}(t-k))')
        ax.grid(True, which='both', linestyle='--', linewidth=0.5)

        err = _save_plot(fig, output_path)
    except Exception as e:
        err = f"Error plotting cross-correlation ({target_name} vs {exog_name}): {e}"
        logger.error(err, exc_info=True)
        plt.close(fig)
    return err

def plot_weekly_autocorrelation(
    series: pd.Series,
    series_name: str,
    output_path: Path,
    max_weeks: int = 4
) -> Optional[str]:
    """
    Generates and saves an autocorrelation plot between a series and its weekly lags.
    This helps identify weekly seasonality patterns.

    Args:
        series: The time series to analyze
        series_name: Name of the series for labeling
        output_path: Where to save the plot
        max_weeks: Maximum number of weeks to look back (default: 4)

    Returns:
        Optional[str]: Error message if something went wrong, None if successful
    """
    logger.info(f"Generating weekly autocorrelation plot for {series_name} up to {max_weeks} weeks to {output_path}")
    err = None
    try:
        # Ensure series has no NaNs
        series = series.dropna()
        if series.empty:
            err = f"Series {series_name} is empty after dropping NaNs."
            logger.warning(err)
            return err

        # Calculate weekly lags (168 hours = 1 week)
        hours_per_week = 24 * 7
        max_lags = max_weeks * hours_per_week

        # Calculate autocorrelation
        autocorr_values = ccf(series, series, adjusted=False, nlags=max_lags)
        lags_range = list(range(0, min(max_lags + 1, autocorr_values.size - 1), hours_per_week))  # Only plot weekly intervals

        # Plotting
        fig, ax = plt.subplots()
        markerline, stemlines, baseline = ax.stem(
            [lag / hours_per_week for lag in lags_range],  # Convert to weeks for x-axis
            autocorr_values[lags_range],
            markerfmt='o',
            basefmt="gray"
        )
        plt.setp(markerline, markersize=5)
        plt.setp(stemlines, linewidth=1)

        # Add approximate 95% confidence intervals
        conf_level = 1.96 / np.sqrt(len(series))
        ax.axhspan(-conf_level, conf_level, alpha=0.2, color='blue', zorder=0)

        ax.set_title(f'Weekly Autocorrelation: {series_name}')
        ax.set_xlabel('Lag (weeks)')
        ax.set_ylabel(f'Corr({series_name}(t), {series_name}(t-lag))')
        ax.grid(True, which='both', linestyle='--', linewidth=0.5)

        # Add vertical lines at each week
        for week in range(max_weeks + 1):
            ax.axvline(x=week, color='gray', linestyle=':', alpha=0.3)

        err = _save_plot(fig, output_path)
    except Exception as e:
        err = f"Error plotting weekly autocorrelation for {series_name}: {e}"
        logger.error(err, exc_info=True)
        plt.close(fig)
    return err
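For orientation only (not part of the commit): a hedged sketch of calling two of the helpers above. The helpers return an error string (or None) instead of raising, so callers check the return value; the paths and synthetic data below are made up.

# Illustrative usage only (not part of the commit).
from pathlib import Path
import numpy as np
import pandas as pd

from data_analysis.io.plotting import plot_full_time_series, plot_acf_pacf

idx = pd.date_range("2024-01-01", periods=24 * 7, freq="h")  # one synthetic week
df = pd.DataFrame({"Price": 50 + np.random.default_rng(1).normal(0, 5, len(idx))}, index=idx)

out_dir = Path("output/plots")  # hypothetical output location
out_dir.mkdir(parents=True, exist_ok=True)

err = plot_full_time_series(df, "Price", out_dir / "full_timeseries.png")
if err:
    print(f"Plotting failed: {err}")

# plot_acf_pacf derives "<stem>_acf.png" / "<stem>_pacf.png" from the base path
err = plot_acf_pacf(df["Price"], "Price", lags=48, output_path_base=out_dir / "price")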
556
data_analysis/io/report.py
Normal file
556
data_analysis/io/report.py
Normal file
@ -0,0 +1,556 @@
|
||||
import datetime
|
||||
import logging
|
||||
import re
|
||||
import subprocess
|
||||
|
||||
from pathlib import Path
|
||||
from typing import Optional, Dict, Any
|
||||
import shutil
|
||||
|
||||
import pandas as pd
|
||||
|
||||
from data_analysis.utils.config_model import settings # Assuming settings are configured
|
||||
from data_analysis.utils.report_model import ReportData
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
|
||||
# --- Helper function to format DataFrames/Series as LaTeX tables ---
|
||||
CHARS = {
|
||||
'&': r'\&',
|
||||
'%': r'\%',
|
||||
'$': r'\$',
|
||||
'#': r'\#',
|
||||
'_': r'\_',
|
||||
'{': r'\{',
|
||||
'}': r'\}',
|
||||
'~': r'\textasciitilde{}',
|
||||
'^': r'\^{}',
|
||||
'\\': r'\textbackslash{}',
|
||||
'<': r'\textless{}',
|
||||
'>': r'\textgreater{}',
|
||||
}
|
||||
|
||||
def _escape_latex(text: str) -> str:
|
||||
"""Escapes special LaTeX characters in a string."""
|
||||
# Convert input to string first to handle potential non-string types
|
||||
t = str(text)
|
||||
# Use a compiled regex for efficiency if called many times
|
||||
# The pattern needs to be carefully ordered to handle overlapping keys (e.g., '\' vs '\\') correctly,
|
||||
# although the current CHARS doesn't have overlaps. Sorting by length desc is safest.
|
||||
pattern = re.compile('|'.join(re.escape(str(key)) for key in sorted(CHARS.keys(), key=lambda item: - len(item))))
|
||||
t = pattern.sub(lambda match: CHARS[match.group()], t)
|
||||
return t
|
||||
|
||||
|
||||
def dataframe_to_latex(df: Optional[pd.DataFrame], title: Optional[str] = None, caption: Optional[str] = None, label: Optional[str] = None, escape: bool = True) -> Optional[str]:
|
||||
"""Converts a pandas DataFrame to a LaTeX tabular environment using booktabs."""
|
||||
if df is None or df.empty:
|
||||
return None
|
||||
|
||||
# Prepare DataFrame for LaTeX conversion
|
||||
df_copy = df.copy()
|
||||
# Include index if it's named or not a simple RangeIndex
|
||||
include_index = df_copy.index.name is not None or not isinstance(df_copy.index, pd.RangeIndex)
|
||||
|
||||
# Escape column names and data if required
|
||||
if escape:
|
||||
# Ensure column names are strings before escaping
|
||||
df_copy.columns = [_escape_latex(str(col)) for col in df_copy.columns]
|
||||
if include_index and df_copy.index.name:
|
||||
# Ensure index name is a string before escaping
|
||||
df_copy.index.name = _escape_latex(str(df_copy.index.name))
|
||||
# Escape data - map works element-wise, ensure elements are str first if necessary
|
||||
# Using applymap instead of map for broader compatibility
|
||||
df_copy = df_copy.map(lambda x: _escape_latex(str(x)))
|
||||
|
||||
# Determine column format (e.g., 'llr' for left, left, right)
|
||||
# Default to left-aligned ('l') for all columns
|
||||
num_cols = len(df_copy.columns) + (1 if include_index else 0)
|
||||
col_format = "l" * num_cols
|
||||
|
||||
try:
|
||||
# Ensure title and caption are escaped if they exist and escape=True was requested
|
||||
# However, dataframe_to_latex itself handles caption/label escaping internally if its `escape` is True.
|
||||
# We are setting escape=False because we do it manually above.
|
||||
# If a title is provided separately, it should be escaped before adding.
|
||||
escaped_title = _escape_latex(str(title)) if title and escape else title
|
||||
escaped_caption = _escape_latex(str(caption)) if caption and escape else caption
|
||||
|
||||
latex_str = df_copy.to_latex(
|
||||
index=include_index,
|
||||
escape=False, # We already escaped manually if escape=True
|
||||
column_format=col_format,
|
||||
header=True,
|
||||
# Pass potentially pre-escaped caption/title to to_latex's caption
|
||||
caption=escaped_caption if escaped_caption else escaped_title,
|
||||
label=f"tab:{label}" if label else None,
|
||||
position='!htbp', # Placement suggestion
|
||||
)
|
||||
# Add the pre-escaped title above the table if provided and different from caption
|
||||
if escaped_title and escaped_title != escaped_caption:
|
||||
# Ensure title is treated as LaTeX command if needed, or just text
|
||||
# Using \textbf might require braces if title contains commands
|
||||
latex_str = fr"\textbf{{{escaped_title}}}\par\par\medskip{latex_str}" # Already escaped title
|
||||
|
||||
return latex_str
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"Failed to convert DataFrame to LaTeX: {e}", exc_info=True)
|
||||
# Escape the error message itself for safe inclusion in LaTeX
|
||||
return fr"\textit{{Error generating LaTeX table: {_escape_latex(str(e))}}}"
|
||||
|
||||
def series_to_latex(series: Optional[pd.Series], title: Optional[str] = None, caption: Optional[str] = None, label: Optional[str] = None, escape: bool = True) -> str:
|
||||
"""Converts a pandas Series to a LaTeX table (two columns: Index, Value)."""
|
||||
if series is None or series.empty:
|
||||
# Ensure the default string is safe for LaTeX
|
||||
return r"\textit{N/A}\par"
|
||||
# Convert series to DataFrame
|
||||
df = series.reset_index()
|
||||
# Use clear default column names if none exist, ensure they are strings
|
||||
index_name = str(series.index.name) if series.index.name else 'Index'
|
||||
value_name = str(series.name) if series.name else 'Value'
|
||||
df.columns = [index_name, value_name]
|
||||
# Delegate to dataframe_to_latex, passing the escape parameter
|
||||
return dataframe_to_latex(df, title=title, caption=caption, label=label, escape=escape)
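# Usage sketch (values are illustrative only):
#   series_to_latex(pd.Series({"count": 8760, "mean": 85.3}, name="Price"), label="desc")
# yields a two-column table (headers "Index" and "Price") wrapped in a LaTeX
# table environment labelled tab:desc.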
|
||||
|
||||
|
||||
# --- Report Generation Function (LaTeX) ---
|
||||
def compile_latex_report(report_tex_path: Path, output_dir: Path) -> bool:
|
||||
"""
|
||||
Attempts to compile the LaTeX report using the local LaTeX installation.
|
||||
|
||||
Args:
|
||||
report_tex_path: Path to the .tex file
|
||||
output_dir: Directory where the PDF should be saved
|
||||
|
||||
Returns:
|
||||
bool: True if compilation was successful, False otherwise
|
||||
"""
|
||||
logger.info(f"Attempting to compile LaTeX report: {report_tex_path}")
|
||||
|
||||
# Create necessary directories
|
||||
reports_dir = output_dir / "reports"
|
||||
tmp_dir = output_dir / "_tmp"
|
||||
reports_dir.mkdir(parents=True, exist_ok=True)
|
||||
tmp_dir.mkdir(parents=True, exist_ok=True)
|
||||
|
||||
try:
|
||||
# Run pdflatex twice to ensure proper references and table of contents
|
||||
for i in range(2):
|
||||
logger.info(f"Running pdflatex (attempt {i+1}/2)...")
|
||||
result = subprocess.run(
|
||||
["pdflatex", "-interaction=nonstopmode", "-output-directory", str(tmp_dir), str(report_tex_path)],
|
||||
capture_output=not settings.debug,
|
||||
text=True
|
||||
)
|
||||
|
||||
if result.returncode != 0:
|
||||
logger.error(f"LaTeX compilation failed (attempt {i+1})")
|
||||
return False
|
||||
|
||||
# Move the PDF to the reports directory
|
||||
pdf_path = tmp_dir / f"{report_tex_path.stem}.pdf"
|
||||
if pdf_path.exists():
|
||||
target_pdf = reports_dir / "report.pdf"
|
||||
shutil.move(str(pdf_path), str(target_pdf))
|
||||
logger.info(f"Successfully compiled and moved report to: {target_pdf}")
|
||||
|
||||
# Clean up the _tmp directory
|
||||
shutil.rmtree(tmp_dir)
|
||||
logger.info("Cleaned up temporary LaTeX files")
|
||||
|
||||
return True
|
||||
else:
|
||||
logger.error(f"Expected PDF file not found: {pdf_path}")
|
||||
return False
|
||||
|
||||
except FileNotFoundError:
|
||||
logger.error("pdflatex command not found. Please ensure LaTeX is installed and in your PATH.")
|
||||
return False
|
||||
except Exception as e:
|
||||
logger.error(f"Unexpected error during LaTeX compilation: {e}")
|
||||
return False
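# For manual debugging, the same compilation can be reproduced from a shell
# (paths below are illustrative; run it twice so references resolve):
#   pdflatex -interaction=nonstopmode -output-directory output/_tmp output/_tmp/eda_report.tex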
|
||||
|
||||
def get_plot_path(key: str, plot_paths: Optional[Dict[str, str]]) -> str:
|
||||
"""Get the correct path for a plot file."""
|
||||
if plot_paths is None:
|
||||
# Return placeholder if the entire dictionary is missing
|
||||
return "reports/plots/placeholder.png"
|
||||
|
||||
# Lookup the specific filename using the key
|
||||
filename = plot_paths.get(key)
|
||||
|
||||
# Construct path or return placeholder if key wasn't found
|
||||
return f"reports/plots/{filename}" if filename else "reports/plots/placeholder.png"
|
||||
|
||||
def _format_latex_command(macro_name: str, value: str) -> str:
|
||||
"""Formats a LaTeX \newcommand definition. Assumes value is correctly escaped/formatted."""
|
||||
# Creates \newcommand{\macroName}{value}
|
||||
# Using simple string concatenation to avoid f-string/raw-string issues.
|
||||
return "\\newcommand{\\" + macro_name + "}{" + value + "}"
|
||||
|
||||
def _format_stationarity_results(results: Optional[Dict[str, Any]], test_name: str) -> str:
|
||||
"""Formats stationarity test results dictionary into a LaTeX string."""
|
||||
default_na = r"\textit{N/A}"
|
||||
if not results:
|
||||
return default_na
|
||||
|
||||
test_data = results.get(test_name.lower())
|
||||
if not test_data:
|
||||
return default_na
|
||||
|
||||
# Ensure keys and values are escaped correctly *before* creating the Series
|
||||
formatted_data = {}
|
||||
for key, value in test_data.items():
|
||||
escaped_key = _escape_latex(str(key)) # Escape the key
|
||||
if isinstance(value, dict): # Handle Critical Values
|
||||
# Escape keys and format values within the string
|
||||
cv_str = ", ".join([f"{_escape_latex(k)}: {v:.3f}" for k, v in value.items()])
|
||||
formatted_data[escaped_key] = cv_str
|
||||
elif isinstance(value, (int, float)):
|
||||
# Apply specific formatting for p-value and test statistic
|
||||
if 'p-value' in key.lower():
|
||||
formatted_data[escaped_key] = f"{value:.4f}"
|
||||
elif 'statistic' in key.lower():
|
||||
formatted_data[escaped_key] = f"{value:.3f}"
|
||||
else:
|
||||
# Convert non-float numbers to string
|
||||
formatted_data[escaped_key] = str(value)
|
||||
else:
|
||||
# Escape other string values
|
||||
formatted_data[escaped_key] = _escape_latex(str(value))
|
||||
|
||||
if not formatted_data:
|
||||
return default_na
|
||||
|
||||
series = pd.Series(formatted_data)
|
||||
series.name = "Value" # This name doesn't get escaped by default in series_to_latex
|
||||
series.index.name = "Metric" # This name doesn't get escaped by default in series_to_latex
|
||||
|
||||
# Use series_to_latex for table structure, disable its internal escaping
|
||||
# as we have already escaped the content. Title also needs pre-escaping.
|
||||
escaped_title = _escape_latex(f"{test_name.upper()} Test Results")
|
||||
return series_to_latex(series, title=escaped_title, label=f"{test_name.lower()}_results", escape=False)
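# Assumed input shape (the exact keys are produced elsewhere in the pipeline), e.g.
#   {"adf": {"Test Statistic": -3.21, "p-value": 0.019,
#            "Critical Values": {"1%": -3.43, "5%": -2.86}}}
# is rendered as a small Metric/Value table titled "ADF Test Results".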
|
||||
|
||||
def generate_latex_report(
|
||||
output_dir: Path,
|
||||
df: Optional[pd.DataFrame],
|
||||
report_data: ReportData,
|
||||
series_name_stat: Optional[str],
|
||||
acf_pacf_plot_paths: Optional[Dict[str, str]] = None,
|
||||
decomposition_plot_paths: Optional[Dict[str, str]] = None,
|
||||
other_plot_paths: Optional[Dict[str, str]] = None,
|
||||
decomposition_model: str = 'additive',
|
||||
acf_pacf_lags: Optional[int] = 48,
|
||||
template_path: Path = Path("data_analysis/utils/_latex_report_template.tex")
|
||||
):
|
||||
"""Generates the LaTeX report (.tex file) by filling the template using macros."""
|
||||
logger.info(f"Generating LaTeX EDA report using template: {template_path.resolve()}")
|
||||
|
||||
reports_dir = output_dir / "reports"
|
||||
source_plots_dir = reports_dir / "plots" # Define source plot dir
|
||||
tmp_dir = output_dir / "_tmp"
|
||||
tmp_plots_dir = tmp_dir / "plots" # Define target plot dir within tmp
|
||||
reports_dir.mkdir(parents=True, exist_ok=True)
|
||||
tmp_dir.mkdir(parents=True, exist_ok=True)
|
||||
# Ensure the target plot dir exists and is empty before copying
|
||||
if tmp_plots_dir.exists():
|
||||
shutil.rmtree(tmp_plots_dir)
|
||||
tmp_plots_dir.mkdir()
|
||||
shutil.copytree(source_plots_dir, tmp_plots_dir, dirs_exist_ok=True)
|
||||
|
||||
report_tex_path = tmp_dir / "eda_report.tex"
|
||||
|
||||
if not template_path.exists():
|
||||
logger.error(f"Report template not found at {template_path.resolve()}. Cannot generate report.")
|
||||
raise FileNotFoundError(f"Report template not found: {template_path.resolve()}")
|
||||
|
||||
try:
|
||||
with open(template_path, 'r', encoding='utf-8') as f:
|
||||
template = f.read()
|
||||
except Exception as e:
|
||||
logger.error(f"Failed to read report template {template_path.resolve()}: {e}", exc_info=True)
|
||||
raise IOError(f"Failed to read report template {template_path.resolve()}: {e}") from e
|
||||
|
||||
# --- Prepare LaTeX Definitions ---
|
||||
latex_definitions = []
|
||||
default_na = r"\textit{N/A}"
|
||||
default_text = r"\textit{Not provided - requires manual interpretation or more data.}\medskip"
|
||||
|
||||
# Refined helper to add definitions
|
||||
def add_def(macro_name: str, value: Optional[Any], formatter=None, default=default_na, escape_if_plain: bool = True):
|
||||
"""
|
||||
Adds a LaTeX definition. Handles None values, applies formatter if provided,
|
||||
and escapes the result if it's considered plain text.
|
||||
|
||||
Args:
|
||||
macro_name: The name of the LaTeX macro (without backslash).
|
||||
value: The value for the macro.
|
||||
formatter: A function to format the value (e.g., dataframe_to_latex).
|
||||
If None, str() is used. If the formatter returns LaTeX code,
|
||||
set escape_if_plain=False.
|
||||
default: The default string to use if value is None. Assumed safe for LaTeX.
|
||||
escape_if_plain: If True and the final value is not known to be LaTeX
|
||||
(i.e., not from specific formatters or defaults), apply _escape_latex.
|
||||
"""
|
||||
final_str = default
|
||||
is_known_latex = False
|
||||
|
||||
if value is not None:
|
||||
if formatter:
|
||||
final_str = formatter(value)
|
||||
# Assume formatters producing tables/complex output return valid LaTeX
|
||||
if formatter in [dataframe_to_latex, series_to_latex, _format_stationarity_results]:
|
||||
is_known_latex = True
|
||||
else:
|
||||
final_str = str(value) # Default to string conversion
|
||||
else:
|
||||
# Value is None, using default. Check if default is known LaTeX.
|
||||
if default in [default_na, default_text]:
|
||||
is_known_latex = True
|
||||
|
||||
# Convert to string one last time in case formatter returned non-string
|
||||
final_str = str(final_str)
|
||||
|
||||
# Escape the result *unless* it's known LaTeX or escaping is turned off
|
||||
if escape_if_plain and not is_known_latex:
|
||||
final_str = _escape_latex(final_str)
|
||||
|
||||
latex_definitions.append(_format_latex_command(macro_name, final_str))
|
||||
|
||||
|
||||
# Helper for paths - Now points to plots/filename within the _tmp directory
|
||||
# Uses example-image-a as the default placeholder
|
||||
def add_path_def(macro_name: str, path_dict: Optional[Dict[str, str]], key: str, default_filename='example-image-a'): # Changed default
|
||||
filename = default_filename
|
||||
is_placeholder = True # Flag to track if we're using the placeholder
|
||||
source_filename = None
|
||||
|
||||
if path_dict and key in path_dict and path_dict[key]:
|
||||
actual_filename_from_dict = Path(path_dict[key]).name
|
||||
if actual_filename_from_dict: # Check if it's not an empty string
|
||||
filename = actual_filename_from_dict
|
||||
source_filename = path_dict[key] # Keep original potentially relative path for source lookup
|
||||
is_placeholder = False
|
||||
# else: filename remains default_filename ('example-image-a')
|
||||
|
||||
# Construct path for \includegraphics
|
||||
# If it's a real plot, use the "plots/" prefix for the copied location.
|
||||
# If it's the placeholder, use the name directly (LaTeX finds it).
|
||||
if not is_placeholder:
|
||||
formatted_path = f"plots/{filename}".replace('\\', '/')
|
||||
else:
|
||||
# Ensure placeholder name itself doesn't get 'plots/' prefix
|
||||
formatted_path = Path(filename).name # Use Path().name just in case
|
||||
|
||||
# Pass the path string to add_def, explicitly disable escaping
|
||||
add_def(macro_name, formatted_path, escape_if_plain=False)
|
||||
|
||||
# Copy the actual plot file only if it's NOT the placeholder
|
||||
if not is_placeholder and source_filename:
|
||||
# Resolve source relative to the main reports/plots dir
|
||||
source_file_path = source_plots_dir / Path(source_filename).name
|
||||
target_file_path = tmp_plots_dir / filename # Target uses just the filename
|
||||
if source_file_path.is_file():
|
||||
try:
|
||||
shutil.copy2(source_file_path, target_file_path)
|
||||
except Exception as copy_e:
|
||||
logger.warning(f"Could not copy plot file {source_file_path} to {target_file_path}: {copy_e}")
|
||||
# else: # Optionally log if source plot missing
|
||||
# logger.warning(f"Source plot file not found: {source_file_path}")
|
||||
|
||||
# Return the boolean flag indicating if it was a real plot or placeholder
|
||||
return not is_placeholder
|
||||
|
||||
|
||||
# --- Generate Definitions using the new add_def ---
|
||||
# Basic Info
|
||||
add_def("reportDateGenerated", datetime.date.today(), formatter=lambda d: d.strftime("%Y-%m-%d"))
|
||||
add_def("dataSourceDescription", f"Hourly prices from {settings.data_file.name}")
|
||||
add_def("priceVariableName", settings.data_file.stem)
|
||||
|
||||
# Info from DataFrame
|
||||
if df is not None and not df.empty:
|
||||
add_def("dateRangeStart", df.index.min().date())
|
||||
add_def("dateRangeEnd", df.index.max().date())
|
||||
add_def("numDataPoints", len(df))
|
||||
freq_info = "Irregular/Not Inferred"
|
||||
if isinstance(df.index, pd.DatetimeIndex):
|
||||
try:
|
||||
inferred = pd.infer_freq(df.index)
|
||||
freq_info = inferred if inferred else freq_info
|
||||
except Exception: # Handle potential errors in infer_freq
|
||||
logger.warning("Could not infer frequency.", exc_info=True)
|
||||
add_def("timeIndexFrequency", f"Hourly (Inferred: {freq_info})")
|
||||
add_def("timeIndexConfirmation", f"DatetimeIndex, Hourly (Inferred: {freq_info})")
|
||||
# Escape column names individually before joining
|
||||
all_cols_str = ", ".join([_escape_latex(str(c)) for c in df.columns])
|
||||
add_def("otherColumnsList", all_cols_str if all_cols_str else "None", escape_if_plain=False) # Already escaped
|
||||
else:
|
||||
add_def("dateRangeStart", None, default=default_na)
|
||||
add_def("dateRangeEnd", None, default=default_na)
|
||||
add_def("numDataPoints", None, default=default_na)
|
||||
add_def("timeIndexFrequency", None, default=default_na)
|
||||
add_def("timeIndexConfirmation", None, default=default_na)
|
||||
add_def("otherColumnsList", "None") # Simple string, escape
|
||||
|
||||
# Section 1 Tables
|
||||
summary_data = report_data.summary_data or {}
|
||||
add_def("tableHeadData", summary_data.get('head'),
|
||||
formatter=lambda df_val: dataframe_to_latex(df_val, title="First 5 Rows", label="head", escape=True),
|
||||
escape_if_plain=False, default=default_na)
|
||||
add_def("tableTailData", summary_data.get('tail'),
|
||||
formatter=lambda df_val: dataframe_to_latex(df_val, title="Last 5 Rows", label="tail", escape=True),
|
||||
escape_if_plain=False, default=default_na)
|
||||
add_def("tableDtypesInfo", summary_data.get('dtypes'),
|
||||
formatter=lambda s: series_to_latex(s, title="Data Types", label="dtypes", escape=True),
|
||||
escape_if_plain=False, default=default_na)
|
||||
|
||||
# Section 2 Tables
|
||||
desc_stats = report_data.descriptive_stats or {}
|
||||
escaped_desc_title = _escape_latex(f"Descriptive Statistics ({settings.data_file.stem})")
|
||||
add_def("tableDescriptiveStats", desc_stats.get('desc_price'),
|
||||
formatter=lambda s: series_to_latex(s, title=escaped_desc_title, label="desc_price", escape=True),
|
||||
escape_if_plain=False, default=default_na)
|
||||
|
||||
missing_counts = summary_data.get('missing')
|
||||
add_def("tableMissingCounts", missing_counts,
|
||||
formatter=lambda s: series_to_latex(s, title="Missing Value Counts (Post-Imputation)", label="missing_counts", escape=True),
|
||||
escape_if_plain=False, default=default_na)
|
||||
|
||||
missing_pct = None
|
||||
if missing_counts is not None and df is not None and len(df) > 0:
|
||||
missing_pct = (missing_counts / len(df)) * 100
|
||||
missing_pct = missing_pct.round(3)
|
||||
|
||||
add_def("tableMissingPercentages", missing_pct,
|
||||
formatter=lambda s: series_to_latex(s, title="Missing Value Percentage (Post-Imputation)", label="missing_pct", escape=True),
|
||||
escape_if_plain=False, default=default_na)
|
||||
|
||||
add_def("missingValuesObservations", report_data.imputation_message, default="Missing value check information not available.")
|
||||
|
||||
# Section 3 Plots
|
||||
add_path_def("plotFullTimeseries", other_plot_paths, 'full_timeseries')
|
||||
# Capture the return value of add_path_def to see if a real plot was added
|
||||
show_zoomed = add_path_def("plotZoomedTimeseries", other_plot_paths, 'zoomed_timeseries')
|
||||
add_def("ifShowZoomedTimeseries", "true" if show_zoomed else "false", escape_if_plain=False) # Add boolean macro
|
||||
|
||||
add_path_def("plotHistogram", other_plot_paths, 'histogram_price')
|
||||
add_path_def("plotBoxplotHour", other_plot_paths, 'boxplot_hour')
|
||||
add_path_def("plotBoxplotDayofweek", other_plot_paths, 'boxplot_dayofweek')
|
||||
add_path_def("plotBoxplotMonth", other_plot_paths, 'boxplot_month')
|
||||
add_path_def("plotBoxplotYear", other_plot_paths, 'boxplot_year')
|
||||
add_path_def("plotSeasonalSubseriesDaily", other_plot_paths, 'seasonal_subseries_daily')
|
||||
add_path_def("plotSeasonalSubseriesWeekly", other_plot_paths, 'seasonal_subseries_weekly')
|
||||
add_def("seasonalInteractionsObservations", None, default=default_text, escape_if_plain=False)
|
||||
|
||||
# Section 4 Decomposition
|
||||
add_def("decompositionMethodDetails", f"Statsmodels seasonal_decompose (model='{decomposition_model}')")
|
||||
add_path_def("plotDecompositionDaily", decomposition_plot_paths, 'daily')
|
||||
add_path_def("plotDecompositionWeekly", decomposition_plot_paths, 'weekly')
|
||||
# Capture the return value for yearly decomp
|
||||
show_yearly = add_path_def("plotDecompositionYearly", decomposition_plot_paths, 'yearly')
|
||||
add_def("ifShowYearlyDecomp", "true" if show_yearly else "false", escape_if_plain=False) # Add boolean macro
|
||||
|
||||
add_def("decompositionObservations", None, default=default_text, escape_if_plain=False)
|
||||
|
||||
# Section 5 Stationarity
|
||||
stationarity_tests = report_data.stationarity_tests or {}
|
||||
add_def("stationaritySeriesTested", series_name_stat)
|
||||
add_path_def("plotResiduals", other_plot_paths, 'residuals')
|
||||
|
||||
add_def("tableAdfResults", stationarity_tests,
|
||||
formatter=lambda tests: _format_stationarity_results(tests, "ADF"),
|
||||
escape_if_plain=False, default=default_na)
|
||||
add_def("tableKpssResults", stationarity_tests,
|
||||
formatter=lambda tests: _format_stationarity_results(tests, "KPSS"),
|
||||
escape_if_plain=False, default=default_na)
|
||||
|
||||
findings_summary = r"\textit{Analysis requires both ADF and KPSS results.}"
|
||||
try:
|
||||
adf_res = stationarity_tests.get('adf')
|
||||
kpss_res = stationarity_tests.get('kpss')
|
||||
adf_p = adf_res.get('p-value') if adf_res else None
|
||||
kpss_p = kpss_res.get('p-value') if kpss_res else None
|
||||
|
||||
if adf_p is not None and kpss_p is not None:
|
||||
if adf_p < 0.05 and kpss_p >= 0.05:
|
||||
findings_summary = "Tests suggest the series is stationary (ADF rejects H0, KPSS fails to reject H0)."
|
||||
elif adf_p >= 0.05 and kpss_p < 0.05:
|
||||
findings_summary = "Tests suggest the series is non-stationary (trend-stationary) and requires differencing (ADF fails to reject H0, KPSS rejects H0)."
|
||||
elif adf_p < 0.05 and kpss_p < 0.05:
|
||||
findings_summary = "Test results conflict: ADF suggests stationarity, KPSS suggests non-stationarity. May indicate difference-stationarity."
|
||||
else:
|
||||
findings_summary = "Tests suggest the series is non-stationary (unit root present) and requires differencing (Both fail to reject H0)."
|
||||
elif adf_p is not None:
|
||||
findings_summary = f"ADF test p-value: {adf_p:.4f}. Stationarity conclusion requires KPSS test."
|
||||
elif kpss_p is not None:
|
||||
findings_summary = f"KPSS test p-value: {kpss_p:.4f}. Stationarity conclusion requires ADF test."
|
||||
|
||||
except Exception as e:
|
||||
logger.warning(f"Could not generate stationarity summary: {e}")
|
||||
findings_summary = r"\textit{Error generating summary.}"
|
||||
|
||||
add_def("stationarityFindingsSummary", findings_summary)
|
||||
|
||||
# Section 6 Autocorrelation
|
||||
add_def("autocorrSeriesAnalyzed", series_name_stat)
|
||||
add_def("autocorrLagsShown", acf_pacf_lags)
|
||||
add_path_def("plotAcf", acf_pacf_plot_paths, 'acf')
|
||||
add_path_def("plotPacf", acf_pacf_plot_paths, 'pacf')
|
||||
add_def("autocorrObservations", None, default=default_text, escape_if_plain=False)
|
||||
|
||||
# Section 7 Summary & Implications
|
||||
add_def("summaryTrendCycles", None, default=default_text, escape_if_plain=False)
|
||||
add_def("summarySeasonality", None, default=default_text, escape_if_plain=False)
|
||||
add_def("summaryStationarity", None, default=default_text, escape_if_plain=False)
|
||||
add_def("summaryAutocorrelations", None, default=default_text, escape_if_plain=False)
|
||||
add_def("summaryOutliersVolatility", None, default=default_text, escape_if_plain=False)
|
||||
add_def("implicationsModelChoice", None, default=default_text, escape_if_plain=False)
|
||||
add_def("implicationsFeatureEngineering", None, default=default_text, escape_if_plain=False)
|
||||
add_def("implicationsPreprocessing", None, default=default_text, escape_if_plain=False)
|
||||
add_def("implicationsEvaluation", None, default=default_text, escape_if_plain=False)
|
||||
add_def("implicationsProbabilistic", None, default=default_text, escape_if_plain=False)
|
||||
|
||||
# Section 8 Conclusion
|
||||
add_def("conclusionStatement", None, default=default_text, escape_if_plain=False)
|
||||
|
||||
|
||||
# --- Apply Definitions to Template ---
|
||||
definitions_block = "\n".join(latex_definitions)
|
||||
if "{{LATEX_DEFINITIONS}}" not in template:
|
||||
logger.error("Placeholder '{{LATEX_DEFINITIONS}}' not found in the LaTeX template preamble.")
|
||||
raise ValueError("Template missing '{{LATEX_DEFINITIONS}}' placeholder in preamble.")
|
||||
report_content = template.replace("{{LATEX_DEFINITIONS}}", definitions_block)
|
||||
|
||||
# --- Write Report ---
|
||||
try:
|
||||
with open(report_tex_path, 'w', encoding='utf-8') as f:
|
||||
f.write(report_content)
|
||||
logger.info(f"Successfully generated LaTeX report source: {report_tex_path}")
|
||||
|
||||
# --- Copy Plots ---
|
||||
# This is now handled within add_path_def to copy files individually
|
||||
# logger.info(f"Copying plots from {source_plots_dir} to {tmp_plots_dir}")
|
||||
# try:
|
||||
# shutil.copytree(source_plots_dir, tmp_plots_dir, dirs_exist_ok=True) # dirs_exist_ok=True allows overwriting
|
||||
# except FileNotFoundError:
|
||||
# logger.error(f"Source plots directory not found: {source_plots_dir}")
|
||||
# raise # Re-raise error if plots dir is essential
|
||||
# except Exception as e:
|
||||
# logger.error(f"Failed to copy plots directory: {e}", exc_info=True)
|
||||
# raise # Re-raise error
|
||||
|
||||
# Attempt to compile the report
|
||||
if compile_latex_report(report_tex_path, output_dir):
|
||||
logger.info("LaTeX report successfully compiled to PDF")
|
||||
else:
|
||||
logger.warning("LaTeX compilation failed. Check logs above. The .tex file is available for manual compilation.")
|
||||
# Consider raising an error if PDF generation is critical
|
||||
# raise RuntimeError("LaTeX compilation failed.")
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"Failed to write LaTeX report to {report_tex_path}: {e}", exc_info=True)
|
||||
raise IOError(f"Failed to write LaTeX report to {report_tex_path}: {e}") from e
|
0
data_analysis/utils/__init__.py
Normal file
0
data_analysis/utils/__init__.py
Normal file
306
data_analysis/utils/_latex_report_template.tex
Normal file
306
data_analysis/utils/_latex_report_template.tex
Normal file
@ -0,0 +1,306 @@
|
||||
% LaTeX EDA Report Template
|
||||
\documentclass[11pt, a4paper]{article}
|
||||
|
||||
% --- Packages ---
|
||||
\usepackage[utf8]{inputenc}
|
||||
\usepackage[T1]{fontenc}
|
||||
\usepackage{lmodern} % Use Latin Modern fonts
|
||||
\usepackage[margin=1in]{geometry} % Set page margins
|
||||
\usepackage{graphicx} % Required for including images
|
||||
% \graphicspath{{../reports/plots/}} % REMOVE OR COMMENT OUT THIS LINE
|
||||
\usepackage{booktabs} % For professional quality tables (\toprule, \midrule, \bottomrule)
|
||||
\usepackage{amsmath} % For math symbols and environments
|
||||
\usepackage{datetime2} % For date formatting (optional, can use simple text)
|
||||
\usepackage{float} % For finer control over figure placement (e.g., [H] option)
|
||||
\usepackage{caption} % For customizing captions
|
||||
\usepackage{hyperref} % For clickable links (optional)
|
||||
\usepackage{sectsty} % To potentially adjust section font sizes/styles (optional)
|
||||
\usepackage{parskip} % Use vertical space between paragraphs instead of indentation
|
||||
\usepackage{ifthen} % ADD THIS PACKAGE for conditional logic
|
||||
|
||||
% --- Hyperref Setup (Optional) ---
|
||||
\hypersetup{
|
||||
colorlinks=true,
|
||||
linkcolor=blue,
|
||||
filecolor=magenta,
|
||||
urlcolor=cyan,
|
||||
pdftitle={Time Series EDA Report},
|
||||
pdfpagemode=FullScreen,
|
||||
}
|
||||
|
||||
% --- Custom LaTeX Definitions Placeholder ---
|
||||
{{LATEX_DEFINITIONS}} % Python script will insert \newcommand definitions here
|
||||
% Define boolean flags if they don't exist (e.g., for manual compilation)
|
||||
\ifdefined\ifShowZoomedTimeseries\else\newcommand{\ifShowZoomedTimeseries}{false}\fi
|
||||
\ifdefined\ifShowYearlyDecomp\else\newcommand{\ifShowYearlyDecomp}{false}\fi
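% Illustrative example of the definitions the Python generator injects above
% (macro names are real, values are made up):
%   \newcommand{\reportDateGenerated}{2024-01-01}
%   \newcommand{\numDataPoints}{8760}
%   \newcommand{\ifShowZoomedTimeseries}{true}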
|
||||
|
||||
% --- Document Information ---
|
||||
\title{Time Series Exploratory Data Analysis Report: Hourly Prices}
|
||||
\author{Generated Automatically}
|
||||
\date{\reportDateGenerated} % Use the macro defined in Python
|
||||
|
||||
% --- Start Document ---
|
||||
\begin{document}
|
||||
|
||||
\maketitle
|
||||
|
||||
% --- Overview Section ---
|
||||
\section*{Report Overview}
|
||||
\begin{itemize}
|
||||
\item \textbf{Data Source:} \dataSourceDescription
|
||||
\item \textbf{Time Series Variable:} `\priceVariableName`
|
||||
\item \textbf{Time Index Frequency:} \timeIndexFrequency
|
||||
\item \textbf{Date Range:} \dateRangeStart \ to \dateRangeEnd
|
||||
\end{itemize}
|
||||
|
||||
% --- Section 1: Data Overview ---
|
||||
\section{Data Overview and Initial Inspection}
|
||||
Purpose: Understand the basic structure, size, and data types of the dataset. Check the time index integrity.
|
||||
|
||||
\subsection*{Key Information}
|
||||
\begin{itemize}
|
||||
\item Number of data points (length of the series): \numDataPoints
|
||||
\item Confirmation of time index format and frequency: \timeIndexConfirmation
|
||||
\item Presence of other columns/variables: \otherColumnsList
|
||||
\end{itemize}
|
||||
|
||||
\subsection*{Raw Data Sample}
|
||||
% Placeholder for Table: First 5 Rows
|
||||
\tableHeadData
|
||||
\vspace{\baselineskip} % Add some vertical space
|
||||
|
||||
% Placeholder for Table: Last 5 Rows
|
||||
\tableTailData
|
||||
|
||||
\subsection*{Data Types}
|
||||
% Placeholder for Table: Data Types (`df.info()`)
|
||||
\tableDtypesInfo
|
||||
|
||||
% --- Section 2: Descriptive Statistics & Missing Values ---
|
||||
\section{Descriptive Statistics and Missing Values}
|
||||
Purpose: Summarize the central tendency, dispersion, and distribution of the price variable and identify data completeness issues. Note any unusual values (like negative prices).
|
||||
|
||||
\subsection*{Price Variable Statistics}
|
||||
% Placeholder for Table: Descriptive Statistics (`df['Price'].describe()`)
|
||||
\tableDescriptiveStats
|
||||
|
||||
\subsection*{Missing Values}
|
||||
% Placeholder for Table: Count of Missing Values
|
||||
\tableMissingCounts
|
||||
\vspace{\baselineskip}
|
||||
|
||||
% Placeholder for Table: Percentage of Missing Values
|
||||
\tableMissingPercentages
|
||||
\vspace{\baselineskip}
|
||||
|
||||
Observations on missing values: \missingValuesObservations % Add a text placeholder
|
||||
|
||||
% --- Section 3: Visual Exploration ---
|
||||
\section{Visual Exploration of Time Series Patterns}
|
||||
Purpose: Visually identify overall trends, seasonality (daily, weekly, yearly), cycles, outliers, and changes in variance. Investigate interactions between patterns.
|
||||
|
||||
\begin{figure}[H] % Use [H] from float package to place figure 'here' if possible
|
||||
\centering
|
||||
% Placeholder for Plot: Full Time Series
|
||||
\includegraphics[width=0.9\textwidth]{\plotFullTimeseries}
|
||||
\caption{Full Time Series: Price vs. Time.}
|
||||
\label{fig:full_ts}
|
||||
\end{figure}
|
||||
|
||||
% --- Conditionally include Zoomed Timeseries Plot ---
|
||||
\ifthenelse{\boolean{\ifShowZoomedTimeseries}}{%
|
||||
\begin{figure}[H]
|
||||
\centering
|
||||
% Placeholder for Plot: Zoomed Time Series
|
||||
\includegraphics[width=0.9\textwidth]{\plotZoomedTimeseries}
|
||||
\caption{Zoomed Time Series (Specific Period).}
|
||||
\label{fig:zoomed_ts}
|
||||
\end{figure}
|
||||
}{} % Empty 'else' part - include nothing if false
|
||||
|
||||
\begin{figure}[H]
|
||||
\centering
|
||||
% Placeholder for Plot: Histogram
|
||||
\includegraphics[width=0.7\textwidth]{\plotHistogram}
|
||||
\caption{Distribution of Price Values.}
|
||||
\label{fig:histogram}
|
||||
\end{figure}
|
||||
|
||||
\subsection*{Seasonal Patterns \& Interactions}
|
||||
|
||||
\begin{figure}[H]
|
||||
\centering
|
||||
% Placeholder for Plot: Box Plots by Hour
|
||||
\includegraphics[width=0.9\textwidth]{\plotBoxplotHour}
|
||||
\caption{Price Distribution by Hour of Day.}
|
||||
\label{fig:boxplot_hour}
|
||||
\end{figure}
|
||||
|
||||
\begin{figure}[H]
|
||||
\centering
|
||||
% Placeholder for Plot: Box Plots by Day of Week
|
||||
\includegraphics[width=0.9\textwidth]{\plotBoxplotDayofweek}
|
||||
\caption{Price Distribution by Day of Week.}
|
||||
\label{fig:boxplot_dayofweek}
|
||||
\end{figure}
|
||||
|
||||
\begin{figure}[H]
|
||||
\centering
|
||||
% Placeholder for Plot: Box Plots by Month
|
||||
\includegraphics[width=0.9\textwidth]{\plotBoxplotMonth}
|
||||
\caption{Price Distribution by Month.}
|
||||
\label{fig:boxplot_month}
|
||||
\end{figure}
|
||||
|
||||
\begin{figure}[H]
|
||||
\centering
|
||||
% Placeholder for Plot: Box Plots by Year
|
||||
\includegraphics[width=0.9\textwidth]{\plotBoxplotYear}
|
||||
\caption{Price Distribution by Year.}
|
||||
\label{fig:boxplot_year}
|
||||
\end{figure}
|
||||
|
||||
% Optional Seasonal Subseries Plots
|
||||
\textit{Optional: Seasonal Sub-series plots below.}
|
||||
|
||||
\begin{figure}[H]
|
||||
\centering
|
||||
% Placeholder for Optional Plot: Seasonal Sub-series Daily
|
||||
\includegraphics[width=0.9\textwidth]{\plotSeasonalSubseriesDaily}
|
||||
\caption{Seasonal Sub-series Plot (Daily Pattern).}
|
||||
\label{fig:subseries_daily}
|
||||
\end{figure}
|
||||
|
||||
\begin{figure}[H]
|
||||
\centering
|
||||
% Placeholder for Optional Plot: Seasonal Sub-series Weekly
|
||||
\includegraphics[width=0.9\textwidth]{\plotSeasonalSubseriesWeekly}
|
||||
\caption{Seasonal Sub-series Plot (Weekly Pattern).}
|
||||
\label{fig:subseries_weekly}
|
||||
\end{figure}
|
||||
|
||||
Observations on seasonal interactions: \seasonalInteractionsObservations % Placeholder
|
||||
|
||||
% --- Section 4: Time Series Decomposition ---
|
||||
\section{Time Series Decomposition}
|
||||
Purpose: Separate the time series into its underlying components: Trend, Seasonality, and Residuals. Assess how well the decomposition captures the main patterns.
|
||||
|
||||
Method Used: \decompositionMethodDetails
|
||||
|
||||
\begin{figure}[H]
|
||||
\centering
|
||||
% Placeholder for Plot: Decomposition (Daily Period)
|
||||
\includegraphics[width=0.9\textwidth]{\plotDecompositionDaily}
|
||||
\caption{Time Series Decomposition (Daily Seasonality, Period=24).}
|
||||
\label{fig:decomp_daily}
|
||||
\end{figure}
|
||||
|
||||
\begin{figure}[H]
|
||||
\centering
|
||||
% Placeholder for Plot: Decomposition (Weekly Period)
|
||||
\includegraphics[width=0.9\textwidth]{\plotDecompositionWeekly}
|
||||
\caption{Time Series Decomposition (Weekly Seasonality, Period=168).}
|
||||
\label{fig:decomp_weekly}
|
||||
\end{figure}
|
||||
|
||||
% Optional Yearly Decomposition
|
||||
\textit{Optional: Yearly decomposition plot below.}
|
||||
% --- Conditionally include Yearly Decomposition Plot ---
|
||||
\ifthenelse{\boolean{\ifShowYearlyDecomp}}{%
|
||||
\begin{figure}[H]
|
||||
\centering
|
||||
% Placeholder for Plot: Decomposition (Yearly Period) - Optional
|
||||
\includegraphics[width=0.9\textwidth]{\plotDecompositionYearly}
|
||||
\caption{Time Series Decomposition (Yearly Seasonality, Period=8760).}
|
||||
\label{fig:decomp_yearly}
|
||||
\end{figure}
|
||||
}{} % Empty 'else' part - include nothing if false
|
||||
|
||||
Observations on decomposition: \decompositionObservations % Placeholder
|
||||
|
||||
% --- Section 5: Stationarity Analysis ---
|
||||
\section{Stationarity Analysis}
|
||||
Purpose: Determine if the statistical properties (mean, variance, autocorrelation) are constant over time.
|
||||
|
||||
Methods: Visual inspection, Augmented Dickey-Fuller (ADF) Test, KPSS Test.
|
||||
|
||||
Series Tested: \stationaritySeriesTested
|
||||
|
||||
\subsection*{Visual Inspection (Residuals)}
|
||||
Refer to the trend component in the decomposition plots (Figures \ref{fig:decomp_daily}, \ref{fig:decomp_weekly}).
|
||||
|
||||
\begin{figure}[H]
|
||||
\centering
|
||||
% Placeholder for Plot: Residuals
|
||||
\includegraphics[width=0.9\textwidth]{\plotResiduals}
|
||||
\caption{Residuals from Decomposition (used for stationarity tests).}
|
||||
\label{fig:residuals}
|
||||
\end{figure}
|
||||
|
||||
\subsection*{Statistical Test Results}
|
||||
% Placeholder for Table: ADF Test Results
|
||||
\tableAdfResults
|
||||
\vspace{\baselineskip}
|
||||
|
||||
% Placeholder for Table: KPSS Test Results
|
||||
\tableKpssResults
|
||||
|
||||
\subsection*{Findings}
|
||||
\stationarityFindingsSummary % Placeholder
|
||||
|
||||
% --- Section 6: Autocorrelation Analysis ---
|
||||
\section{Autocorrelation Analysis}
|
||||
Purpose: Understand the linear dependence between the series (or tested series) and its past values.
|
||||
|
||||
Series Analyzed: \autocorrSeriesAnalyzed
|
||||
|
||||
Lags Shown: \autocorrLagsShown
|
||||
|
||||
\begin{figure}[H]
|
||||
\centering
|
||||
% Placeholder for Plot: ACF
|
||||
\includegraphics[width=0.9\textwidth]{\plotAcf}
|
||||
\caption{Autocorrelation Function (ACF).}
|
||||
\label{fig:acf}
|
||||
\end{figure}
|
||||
|
||||
\begin{figure}[H]
|
||||
\centering
|
||||
% Placeholder for Plot: PACF
|
||||
\includegraphics[width=0.9\textwidth]{\plotPacf}
|
||||
\caption{Partial Autocorrelation Function (PACF).}
|
||||
\label{fig:pacf}
|
||||
\end{figure}
|
||||
|
||||
Observations: \autocorrObservations % Placeholder
|
||||
|
||||
% --- Section 7: Summary and Implications ---
|
||||
\section{Analysis Summary and Implications for Forecasting}
|
||||
Purpose: Synthesize the findings and discuss their relevance for modeling.
|
||||
|
||||
\subsection*{Key Findings Summary}
|
||||
\begin{itemize}
|
||||
\item \textbf{Trend \& Cycles:} \summaryTrendCycles
|
||||
\item \textbf{Seasonality:} \summarySeasonality
|
||||
\item \textbf{Stationarity:} \summaryStationarity
|
||||
\item \textbf{Autocorrelations:} \summaryAutocorrelations
|
||||
\item \textbf{Outliers/Volatility:} \summaryOutliersVolatility
|
||||
\end{itemize}
|
||||
|
||||
\subsection*{Implications for Day-Ahead Model}
|
||||
\begin{itemize}
|
||||
\item \textbf{Model Choice:} \implicationsModelChoice
|
||||
\item \textbf{Feature Engineering:} \implicationsFeatureEngineering
|
||||
\item \textbf{Preprocessing:} \implicationsPreprocessing
|
||||
\item \textbf{Evaluation:} \implicationsEvaluation
|
||||
\item \textbf{Probabilistic Forecasting:} \implicationsProbabilistic
|
||||
\end{itemize}
|
||||
|
||||
% --- Section 8: Conclusion ---
|
||||
\section{Conclusion}
|
||||
Purpose: Briefly summarize the EDA process.
|
||||
|
||||
\conclusionStatement % Placeholder
|
||||
|
||||
% --- End Document ---
|
||||
\end{document}
|
166
data_analysis/utils/config_model.py
Normal file
166
data_analysis/utils/config_model.py
Normal file
@ -0,0 +1,166 @@
|
||||
import logging
|
||||
import yaml
|
||||
from pathlib import Path
|
||||
from pydantic import BaseModel, Field, ValidationError, field_validator # Use BaseModel for direct dict init
|
||||
from typing import Optional # Use Optional for type hints
|
||||
|
||||
# --- Logger Setup ---
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
# --- Configuration File Path ---
|
||||
# Define the default path for the configuration file
|
||||
CONFIG_YAML_PATH = Path("config.yaml")
|
||||
|
||||
# --- Settings Model ---
|
||||
class Settings(BaseModel):
|
||||
"""
|
||||
Application settings loaded from YAML configuration.
|
||||
|
||||
This class defines the configuration structure for the forecasting model,
|
||||
including data paths, logging settings, and analysis parameters.
|
||||
"""
|
||||
# -- General Settings --
|
||||
debug: bool = Field(
|
||||
default=False,
|
||||
description="Enable debug mode for detailed logging and latex stderr output",
|
||||
examples=[True, False]
|
||||
)
|
||||
log_level: str = Field(
|
||||
default="INFO",
|
||||
description="Logging level for the application",
|
||||
examples=["DEBUG", "INFO", "WARNING", "ERROR", "CRITICAL"]
|
||||
)
|
||||
# -- IO Settings --
|
||||
data_file: Path = Field(
|
||||
default=Path("data/energy_prices.csv"),
|
||||
description="Path to the input data CSV file relative to project root",
|
||||
examples=["data/energy_prices.csv", "data/Day-ahead_Prices_60min.csv"]
|
||||
)
|
||||
latex_template_file: Optional[Path] = Field(
|
||||
default=Path("data_analysis/utils/_latex_report_template.tex"),
|
||||
description="Path to the LTX template file relative to project root",
|
||||
examples=["data_analysis/utils/_latex_report_template.tex", "data/byo_template.tex"]
|
||||
)
|
||||
output_dir: Path = Field(
|
||||
default=Path("output/reports"),
|
||||
description="Directory to save generated plots and report artifacts",
|
||||
examples=["output/reports", "analysis/results"]
|
||||
)
|
||||
# -- Zoom Settings (Plotting and Analysis) --
|
||||
zoom_start_date: Optional[str] = Field(
|
||||
default=None,
|
||||
description="Start date for zoomed-in analysis plots (YYYY-MM-DD format)",
|
||||
examples=["2023-01-01"]
|
||||
)
|
||||
zoom_end_date: Optional[str] = Field(
|
||||
default=None,
|
||||
description="End date for zoomed-in analysis plots (YYYY-MM-DD format)",
|
||||
examples=["2023-12-31"]
|
||||
)
|
||||
|
||||
# -- Data Settings --
|
||||
expected_data_frequency: str = Field(
|
||||
default="h",
|
||||
description="Expected frequency of the time series data",
|
||||
examples=["h", "D", "M", "Y"]
|
||||
)
|
||||
|
||||
@field_validator('log_level')
|
||||
def validate_log_level(cls, v):
|
||||
"""Validate that log_level is one of the standard logging levels."""
|
||||
valid_levels = ["DEBUG", "INFO", "WARNING", "ERROR", "CRITICAL"]
|
||||
if v.upper() not in valid_levels:
|
||||
raise ValueError(f"log_level must be one of {valid_levels}")
|
||||
return v.upper()
|
||||
|
||||
@field_validator('expected_data_frequency')
|
||||
def validate_frequency(cls, v):
|
||||
"""Validate that frequency is a valid pandas frequency string."""
|
||||
valid_freqs = ["h", "D", "M", "Y"]
|
||||
v_lower = v.lower() # Convert input to lowercase for comparison
|
||||
if v_lower not in [f.lower() for f in valid_freqs]:
|
||||
raise ValueError(f"expected_data_frequency must be one of {valid_freqs}")
|
||||
return v_lower # Return normalized lowercase value
|
||||
|
||||
@field_validator('zoom_start_date', 'zoom_end_date')
|
||||
def validate_date_format(cls, v):
|
||||
"""Validate date format if provided."""
|
||||
if v is None:
|
||||
return v
|
||||
try:
|
||||
from datetime import datetime
|
||||
datetime.strptime(v, "%Y-%m-%d")
|
||||
return v
|
||||
except ValueError:
|
||||
raise ValueError("Date must be in YYYY-MM-DD format")
|
||||
|
||||
@field_validator('latex_template_file')
|
||||
def validate_latex_template_file(cls, latex_template_file):
|
||||
return latex_template_file or cls.model_fields['latex_template_file'].default
|
||||
|
||||
@classmethod
|
||||
def from_yaml(cls, yaml_path: Path) -> 'Settings':
|
||||
"""
|
||||
Load settings from a YAML file.
|
||||
|
||||
Args:
|
||||
yaml_path: Path to the YAML configuration file
|
||||
|
||||
Returns:
|
||||
Settings instance with values from the YAML file
|
||||
|
||||
Raises:
|
||||
FileNotFoundError: If the YAML file doesn't exist
|
||||
yaml.YAMLError: If the YAML file is invalid
|
||||
ValidationError: If the YAML values don't match the schema
|
||||
"""
|
||||
if not yaml_path.exists():
|
||||
raise FileNotFoundError(f"Configuration file not found: {yaml_path}")
|
||||
|
||||
try:
|
||||
with open(yaml_path, 'r') as f:
|
||||
config = yaml.safe_load(f)
|
||||
return cls(**config)
|
||||
except yaml.YAMLError as e:
|
||||
logger.error(f"Error parsing YAML file {yaml_path}: {e}")
|
||||
raise
|
||||
except Exception as e:
|
||||
logger.error(f"Error loading settings from {yaml_path}: {e}")
|
||||
raise
|
||||
|
||||
# --- Loading Function ---
|
||||
def load_settings(config_path: Path = CONFIG_YAML_PATH) -> Settings:
|
||||
"""Loads settings from a YAML file."""
|
||||
logger.info(f"Attempting to load configuration from: {config_path.resolve()}")
|
||||
try:
|
||||
with open(config_path, 'r') as f:
|
||||
config_data = yaml.safe_load(f)
|
||||
if not config_data:
|
||||
logger.warning(f"Configuration file {config_path} is empty. Using default settings.")
|
||||
return Settings() # Return default settings if file is empty
|
||||
|
||||
settings = Settings(**config_data)
|
||||
logger.info("Configuration loaded successfully.")
|
||||
|
||||
# Update logger level based on loaded settings
|
||||
logging.getLogger().setLevel(settings.log_level.upper())
|
||||
logger.info(f"Log level set to: {settings.log_level.upper()}")
|
||||
logger.debug(settings.model_dump_json(indent=2)) # Log loaded settings at debug level
|
||||
return settings
|
||||
|
||||
except FileNotFoundError:
|
||||
logger.warning(f"Configuration file {config_path} not found. Using default settings.")
|
||||
return Settings() # Return default settings if file not found
|
||||
except yaml.YAMLError as e:
|
||||
logger.error(f"Error parsing YAML file {config_path}: {e}. Using default settings.")
|
||||
return Settings() # Return default settings on parse error
|
||||
except ValidationError as e:
|
||||
logger.error(f"Configuration validation error: {e}. Using default settings.")
|
||||
return Settings() # Return default settings on validation error
|
||||
except Exception as e:
|
||||
logger.error(f"An unexpected error occurred while loading settings: {e}. Using default settings.")
|
||||
return Settings() # Catch other potential errors
|
||||
|
||||
# --- Global Settings Instance ---
|
||||
# Load settings when the module is imported
|
||||
settings = load_settings()
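# Typical usage elsewhere in the package (sketch):
#   from data_analysis.utils.config_model import settings
#   print(settings.data_file, settings.output_dir)
# or, to load an explicit file:
#   Settings.from_yaml(Path("data_analysis_config.yaml"))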
|
11
data_analysis/utils/report_model.py
Normal file
11
data_analysis/utils/report_model.py
Normal file
@ -0,0 +1,11 @@
|
||||
from typing import Optional, Dict, Any

from pydantic import BaseModel


class ReportData(BaseModel):
    """Container for all report-related data."""
    descriptive_stats: Optional[Dict[str, Any]] = None
    stationarity_tests: Optional[Dict[str, Any]] = None
    summary_data: Optional[Dict[str, Any]] = None
    imputation_message: Optional[str] = None
22
data_analysis_config.yaml
Normal file
22
data_analysis_config.yaml
Normal file
@ -0,0 +1,22 @@
|
||||
# Configuration for the forecasting model EDA
# This file defines the settings for data loading, analysis, and visualization

# -- General Settings --
log_level: INFO # Logging level (DEBUG, INFO, WARNING, ERROR, CRITICAL)
debug: true

# -- IO Settings --
data_file: data/Day-ahead_Prices_60min.csv # Path to the input data CSV relative to project root
output_dir: output/reports # Directory to save generated plots and report artifacts
latex_template_file: null # Path to the LaTeX template file relative to project root


# -- Zoom Settings (Plotting and Analysis) --
# Optional: Specify a date range for zoomed-in plots (YYYY-MM-DD format)
# Example: zoom_start_date: "2023-01-01"
# Example: zoom_end_date: "2023-12-31"
zoom_start_date: null # Default to null
zoom_end_date: null # Default to null

# -- Data Settings --
expected_data_frequency: "h" # Expected frequency of the time series data (h=hourly, D=daily, M=monthly, Y=yearly)
22
forecasting_config.yaml
Normal file
22
forecasting_config.yaml
Normal file
@ -0,0 +1,22 @@
|
||||
# Configuration for the forecasting model EDA
# This file defines the settings for data loading, analysis, and visualization

# -- General Settings --
log_level: INFO # Logging level (DEBUG, INFO, WARNING, ERROR, CRITICAL)
debug: true

# -- IO Settings --
data_file: data/Day-ahead_Prices_60min.csv # Path to the input data CSV relative to project root
output_dir: output/reports # Directory to save generated plots and report artifacts
latex_template_file: null # Path to the LaTeX template file relative to project root


# -- Zoom Settings (Plotting and Analysis) --
# Optional: Specify a date range for zoomed-in plots (YYYY-MM-DD format)
# Example: zoom_start_date: "2023-01-01"
# Example: zoom_end_date: "2023-12-31"
zoom_start_date: null # Default to null
zoom_end_date: null # Default to null

# -- Data Settings --
expected_data_frequency: "h" # Expected frequency of the time series data (h=hourly, D=daily, M=monthly, Y=yearly)
75
forecasting_model.py
Normal file
75
forecasting_model.py
Normal file
@ -0,0 +1,75 @@
|
||||
import argparse
|
||||
import logging
|
||||
import sys
|
||||
from pathlib import Path
|
||||
import time
|
||||
|
||||
# Import necessary components from your project structure
|
||||
from data_analysis.utils.config_model import load_settings, Settings # Import loading function and model
|
||||
from data_analysis.analysis.pipeline import run_eda_pipeline # Import the pipeline entry point
|
||||
|
||||
# Silence overly verbose libraries if needed (e.g., matplotlib)
|
||||
mpl_logger = logging.getLogger('matplotlib')
|
||||
mpl_logger.setLevel(logging.WARNING) # Example: set to WARNING or ERROR
|
||||
|
||||
# --- Basic Logging Setup ---
|
||||
# Configure logging early to catch basic issues.
|
||||
# The level might be adjusted after config loading.
|
||||
logging.basicConfig(level=logging.INFO,
|
||||
format='%(asctime)s - %(levelname)-7s - %(message)s',
|
||||
datefmt='%H:%M:%S')
|
||||
# Get the root logger
|
||||
logger = logging.getLogger()
|
||||
|
||||
# --- Argument Parsing ---
|
||||
def parse_arguments():
|
||||
"""Parses command-line arguments."""
|
||||
parser = argparse.ArgumentParser(
|
||||
description="Run the Energy Forecasting EDA pipeline.",
|
||||
formatter_class=argparse.ArgumentDefaultsHelpFormatter
|
||||
)
|
||||
parser.add_argument(
|
||||
'-c', '--config',
|
||||
type=str,
|
||||
default='config.yaml', # Provide a default config file name
|
||||
help="Path to the YAML configuration file."
|
||||
)
|
||||
# Add other potential command-line overrides here if needed later
|
||||
# parser.add_argument('--debug', action='store_true', help="Override log level to DEBUG.")
|
||||
|
||||
args = parser.parse_args()
|
||||
return args
|
||||
|
||||
# --- Main Execution ---
|
||||
def main():
|
||||
"""Main execution function."""
|
||||
args = parse_arguments()
|
||||
config_path = Path(args.config)
|
||||
start_time = time.perf_counter()
|
||||
|
||||
# --- Configuration Loading ---
|
||||
_ = load_settings(config_path)
|
||||
logger.info(f"Using configuration from: {config_path.resolve()} (or defaults if loading failed)")
|
||||
|
||||
# --- Pipeline Execution ---
|
||||
try:
|
||||
# Call the main function from your pipeline module
|
||||
run_eda_pipeline()
|
||||
|
||||
end_time = time.perf_counter()
|
||||
logger.info(f"Main script finished successfully in {end_time - start_time:.2f} seconds.")
|
||||
|
||||
except SystemExit as e:
|
||||
# Catch SystemExit if pipeline runner exits intentionally
|
||||
logger.warning(f"Pipeline exited with code {e.code}.")
|
||||
sys.exit(e.code) # Propagate exit code
|
||||
except Exception as e:
|
||||
logger.critical(f"An critical error occurred during pipeline execution: {e}", exc_info=True)
|
||||
end_time = time.perf_counter()
|
||||
logger.info(f"Main script failed after {end_time - start_time:.2f} seconds.")
|
||||
sys.exit(1)
|
||||
return
|
||||
|
||||
if __name__ == "__main__":
    main()
|
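# Example invocation (the config path falls back to the argparse default):
#   python forecasting_model.py --config config.yaml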
8
forecasting_model/__init__.py
Normal file
8
forecasting_model/__init__.py
Normal file
@ -0,0 +1,8 @@
|
||||
"""
|
||||
Time Series Forecasting Module with LSTM
|
||||
|
||||
This module provides a configurable PyTorch-based LSTM model for time series forecasting,
|
||||
with support for feature engineering, cross-validation, and evaluation.
|
||||
"""
|
||||
|
||||
__version__ = "0.1.0"
|
67
forecasting_model/data_processing.py
Normal file
67
forecasting_model/data_processing.py
Normal file
@ -0,0 +1,67 @@
|
||||
import numpy as np
|
||||
import pandas as pd
|
||||
import torch
|
||||
from torch.utils.data import Dataset, DataLoader
|
||||
from sklearn.preprocessing import StandardScaler, MinMaxScaler
|
||||
from typing import Tuple, Generator, List, Optional
|
||||
from utils.config_model import DataConfig, FeatureConfig, TrainingConfig, EvaluationConfig, CrossValidationConfig  # CrossValidationConfig is referenced below; assumed to be defined alongside the other config models
|
||||
|
||||
# --- Data Loading ---
|
||||
def load_raw_data(config: DataConfig) -> pd.DataFrame:
|
||||
"""
|
||||
Load and preprocess raw data from CSV.
|
||||
"""
|
||||
# TODO: Implement CSV loading and datetime parsing
|
||||
pass
|
||||
|
||||
# --- Feature Engineering ---
|
||||
def engineer_features(df: pd.DataFrame, target_col: str, feature_config: FeatureConfig) -> pd.DataFrame:
|
||||
"""
|
||||
Create features from the target column and datetime index.
|
||||
"""
|
||||
# TODO: Implement feature engineering (lags, rolling stats, time features, wavelets)
|
||||
pass
|
||||
|
||||
# --- Cross Validation ---
|
||||
class TimeSeriesCrossValidationSplitter:
|
||||
def __init__(self, config: CrossValidationConfig, n_samples: int):
|
||||
self.config = config
|
||||
self.n_samples = n_samples
|
||||
|
||||
def split(self) -> Generator[Tuple[np.ndarray, np.ndarray, np.ndarray], None, None]:
|
||||
"""
|
||||
Generate train/val/test splits using expanding window approach.
|
||||
"""
|
||||
# TODO: Implement expanding window CV splitter
|
||||
pass
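# Hedged sketch of the intended expanding-window behaviour (field names on
# CrossValidationConfig such as n_splits / val_frac / test_frac are assumptions):
#   for fold k: train = indices [0, t_k), val = [t_k, t_k + n_val),
#               test  = [t_k + n_val, t_k + n_val + n_test)
# with t_k growing each fold, so the training window expands while the
# validation and test windows keep a fixed size.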
|
||||
|
||||
# --- Dataset Class ---
|
||||
class TimeSeriesDataset(Dataset):
|
||||
def __init__(self, data_array: np.ndarray, sequence_length: int, forecast_horizon: int):
|
||||
self.data = data_array
|
||||
self.sequence_length = sequence_length
|
||||
self.forecast_horizon = forecast_horizon
|
||||
|
||||
def __len__(self) -> int:
|
||||
# TODO: Implement length calculation
|
||||
pass
|
||||
|
||||
def __getitem__(self, idx: int) -> Tuple[torch.Tensor, torch.Tensor]:
|
||||
# TODO: Implement sequence extraction
|
||||
pass
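# Sketch of the sliding-window contract these two methods are expected to meet
# (assumes the target series is stored in column 0 of data_array):
#   __len__           -> len(self.data) - self.sequence_length - self.forecast_horizon + 1
#   __getitem__(idx)  -> (data[idx : idx + seq_len],                        # input window
#                         data[idx + seq_len : idx + seq_len + horizon, 0]) # targets
# with both pieces converted to float32 torch tensors.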
|
||||
|
||||
# --- Data Preparation ---
|
||||
def prepare_fold_data_and_loaders(
|
||||
full_df: pd.DataFrame,
|
||||
train_idx: np.ndarray,
|
||||
val_idx: np.ndarray,
|
||||
test_idx: np.ndarray,
|
||||
feature_config: FeatureConfig,
|
||||
train_config: TrainingConfig,
|
||||
eval_config: EvaluationConfig
|
||||
) -> Tuple[DataLoader, DataLoader, DataLoader, object, int]:
|
||||
"""
|
||||
Prepare data loaders for a single fold.
|
||||
"""
|
||||
# TODO: Implement data preparation pipeline
|
||||
pass
|
82
forecasting_model/evaluation.py
Normal file
82
forecasting_model/evaluation.py
Normal file
@ -0,0 +1,82 @@
|
||||
import numpy as np
|
||||
import torch
|
||||
from torch.utils.data import DataLoader
|
||||
from typing import Dict, Any, Optional
|
||||
from utils.config_model import EvaluationConfig
|
||||
|
||||
def calculate_mae(y_true: np.ndarray, y_pred: np.ndarray) -> float:
|
||||
"""
|
||||
Calculate Mean Absolute Error.
|
||||
"""
|
||||
# TODO: Implement MAE calculation
|
||||
pass
|
||||
|
||||
def calculate_rmse(y_true: np.ndarray, y_pred: np.ndarray) -> float:
|
||||
"""
|
||||
Calculate Root Mean Squared Error.
|
||||
"""
|
||||
# TODO: Implement RMSE calculation
|
||||
pass
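# Minimal sketch of the two metrics (NumPy only), for when the TODOs are filled in:
#   mae  = float(np.mean(np.abs(y_true - y_pred)))
#   rmse = float(np.sqrt(np.mean((y_true - y_pred) ** 2)))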
|
||||
|
||||
def plot_predictions_vs_actual(
|
||||
y_true: np.ndarray,
|
||||
y_pred: np.ndarray,
|
||||
title_suffix: str,
|
||||
filename: str,
|
||||
max_points: Optional[int] = None
|
||||
) -> None:
|
||||
"""
|
||||
Create line plot of predictions vs actual values.
|
||||
"""
|
||||
# TODO: Implement prediction vs actual plot
|
||||
pass
|
||||
|
||||
def plot_scatter_predictions(
|
||||
y_true: np.ndarray,
|
||||
y_pred: np.ndarray,
|
||||
title_suffix: str,
|
||||
filename: str
|
||||
) -> None:
|
||||
"""
|
||||
Create scatter plot of predictions vs actual values.
|
||||
"""
|
||||
# TODO: Implement scatter plot
|
||||
pass
|
||||
|
||||
def plot_residuals_time(
|
||||
residuals: np.ndarray,
|
||||
title_suffix: str,
|
||||
filename: str,
|
||||
max_points: Optional[int] = None
|
||||
) -> None:
|
||||
"""
|
||||
Create plot of residuals over time.
|
||||
"""
|
||||
# TODO: Implement residuals time plot
|
||||
pass
|
||||
|
||||
def plot_residuals_distribution(
|
||||
residuals: np.ndarray,
|
||||
title_suffix: str,
|
||||
filename: str
|
||||
) -> None:
|
||||
"""
|
||||
Create histogram/KDE of residuals.
|
||||
"""
|
||||
# TODO: Implement residuals distribution plot
|
||||
pass
|
||||
|
||||
def evaluate_fold(
|
||||
model: torch.nn.Module,
|
||||
test_loader: DataLoader,
|
||||
loss_fn: torch.nn.Module,
|
||||
device: torch.device,
|
||||
target_scaler: Any,
|
||||
eval_config: EvaluationConfig,
|
||||
fold_num: int
|
||||
) -> Dict[str, float]:
|
||||
"""
|
||||
Evaluate model on test set and generate plots.
|
||||
"""
|
||||
# TODO: Implement full evaluation pipeline
|
||||
pass
|
5
forecasting_model/io/__init__.py
Normal file
5
forecasting_model/io/__init__.py
Normal file
@ -0,0 +1,5 @@
|
||||
"""
|
||||
IO utilities for the forecasting model.
|
||||
|
||||
This package contains utilities for data loading, saving, and visualization.
|
||||
"""
|
75
forecasting_model/io/plotting.py
Normal file
75
forecasting_model/io/plotting.py
Normal file
@ -0,0 +1,75 @@
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from typing import Optional
import logging

logger = logging.getLogger(__name__)


def setup_plot_style() -> None:
    """
    Set up consistent plotting style.
    """
    # TODO: Implement plot style configuration
    pass


def save_plot(fig: plt.Figure, filename: str) -> None:
    """
    Save plot to file with proper error handling.
    """
    # TODO: Implement plot saving with error handling
    pass


def create_time_series_plot(
    x: np.ndarray,
    y_true: np.ndarray,
    y_pred: np.ndarray,
    title: str,
    xlabel: str,
    ylabel: str,
    max_points: Optional[int] = None
) -> plt.Figure:
    """
    Create a time series plot with actual vs predicted values.
    """
    # TODO: Implement time series plot creation
    pass


def create_scatter_plot(
    y_true: np.ndarray,
    y_pred: np.ndarray,
    title: str,
    xlabel: str,
    ylabel: str
) -> plt.Figure:
    """
    Create a scatter plot of actual vs predicted values.
    """
    # TODO: Implement scatter plot creation
    pass


def create_residuals_plot(
    x: np.ndarray,
    residuals: np.ndarray,
    title: str,
    xlabel: str,
    ylabel: str,
    max_points: Optional[int] = None
) -> plt.Figure:
    """
    Create a plot of residuals over time.
    """
    # TODO: Implement residuals plot creation
    pass


def create_residuals_distribution_plot(
    residuals: np.ndarray,
    title: str,
    xlabel: str,
    ylabel: str
) -> plt.Figure:
    """
    Create a distribution plot of residuals.
    """
    # TODO: Implement residuals distribution plot creation
    pass
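setup_plot_style and save_plot above are also stubs. The sketch below shows one plausible way to fill them in with seaborn defaults and guarded saving; the chosen style values and the decision to always close the figure are the editor's assumptions.

# Hedged sketch (editor's addition): illustrative style setup and safe saving.
import logging
from pathlib import Path
import matplotlib.pyplot as plt
import seaborn as sns

logger = logging.getLogger(__name__)


def setup_plot_style() -> None:
    """Apply a consistent seaborn/matplotlib look (values are illustrative)."""
    sns.set_theme(style="whitegrid")
    plt.rcParams.update({"figure.figsize": (12, 5), "figure.dpi": 120})


def save_plot(fig: plt.Figure, filename: str) -> None:
    """Save a figure, creating parent directories and always closing the figure."""
    path = Path(filename)
    path.parent.mkdir(parents=True, exist_ok=True)
    try:
        fig.savefig(path, bbox_inches="tight")
        logger.info("Saved plot to %s", path)
    except OSError as exc:
        logger.error("Failed to save plot %s: %s", path, exc)
    finally:
        plt.close(fig)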
28
forecasting_model/model.py
Normal file
@ -0,0 +1,28 @@
import torch
import torch.nn as nn
from typing import Optional

# Absolute import so this module resolves when imported as part of the
# forecasting_model package (main.py imports forecasting_model.model).
from forecasting_model.utils.config_model import ModelConfig


class LSTMForecastModel(nn.Module):
    def __init__(self, model_config: ModelConfig):
        super().__init__()
        self.config = model_config
        self.use_residual_skips = model_config.use_residual_skips

        # TODO: Initialize LSTM layers
        # TODO: Initialize dropout
        # TODO: Initialize output layer
        # TODO: Initialize residual connection layer if needed

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """
        Forward pass through the LSTM network.

        Args:
            x: Input tensor of shape (batch_size, sequence_length, input_size)

        Returns:
            Predictions tensor of shape (batch_size, forecast_horizon)
        """
        # TODO: Implement forward pass with optional residual connections
        pass
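The model's layers and forward pass are left as TODOs. Below is a hedged sketch of one way they could look, using a standard batch-first LSTM whose last hidden state feeds a linear head sized to output_size (the forecast horizon); the optional skip projection is only one possible reading of use_residual_skips, not the project's decided design.

# Hedged sketch (editor's addition): illustrative LSTM forecaster, assuming
# input_size and output_size are set on the config before construction.
import torch
import torch.nn as nn


class LSTMForecastModelSketch(nn.Module):
    def __init__(self, model_config) -> None:
        super().__init__()
        self.use_residual_skips = model_config.use_residual_skips
        self.lstm = nn.LSTM(
            input_size=model_config.input_size,
            hidden_size=model_config.hidden_size,
            num_layers=model_config.num_layers,
            dropout=model_config.dropout if model_config.num_layers > 1 else 0.0,
            batch_first=True,
        )
        self.dropout = nn.Dropout(model_config.dropout)
        self.head = nn.Linear(model_config.hidden_size, model_config.output_size)
        if self.use_residual_skips:
            # Project the last raw input step onto the hidden size so it can be
            # added to the LSTM output (one possible skip-connection design).
            self.skip_proj = nn.Linear(model_config.input_size, model_config.hidden_size)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        # x: (batch, sequence_length, input_size)
        lstm_out, _ = self.lstm(x)           # (batch, seq_len, hidden_size)
        last_hidden = lstm_out[:, -1, :]     # (batch, hidden_size)
        if self.use_residual_skips:
            last_hidden = last_hidden + self.skip_proj(x[:, -1, :])
        return self.head(self.dropout(last_hidden))  # (batch, forecast_horizon)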
50
forecasting_model/trainer.py
Normal file
@ -0,0 +1,50 @@
import torch
import torch.nn as nn
from torch.utils.data import DataLoader
from typing import Optional, Dict, Any

# Absolute import for consistency with main.py; the original relative import
# (..utils.config_model) would resolve above the forecasting_model package.
from forecasting_model.utils.config_model import TrainingConfig


class Trainer:
    def __init__(
        self,
        model: nn.Module,
        train_loader: DataLoader,
        val_loader: DataLoader,
        loss_fn: nn.Module,
        device: torch.device,
        config: TrainingConfig,
        scheduler: Optional[torch.optim.lr_scheduler._LRScheduler] = None,
        target_scaler: Optional[Any] = None
    ):
        self.model = model
        self.train_loader = train_loader
        self.val_loader = val_loader
        self.loss_fn = loss_fn
        self.device = device
        self.config = config
        self.scheduler = scheduler
        self.target_scaler = target_scaler

        # TODO: Initialize optimizer (Adam)
        # TODO: Initialize early stopping if configured

    def train_epoch(self) -> Dict[str, float]:
        """
        Train for one epoch.
        """
        # TODO: Implement training loop for one epoch
        pass

    def evaluate(self, loader: DataLoader) -> Dict[str, float]:
        """
        Evaluate model on given data loader.
        """
        # TODO: Implement evaluation with metrics on original scale
        pass

    def train(self) -> Dict[str, Any]:
        """
        Main training loop with validation and early stopping.
        """
        # TODO: Implement full training loop with validation
        pass
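The Trainer's optimizer setup and per-epoch loop are TODOs as well. The sketch below shows a conventional Adam-based loop as free functions; the helper names and the returned metric key are assumptions, not the project's final interface.

# Hedged sketch (editor's addition): one conventional way to realise the
# optimizer and train-one-epoch TODOs above.
import torch
import torch.nn as nn
from torch.utils.data import DataLoader


def build_optimizer(model: nn.Module, learning_rate: float) -> torch.optim.Adam:
    # Adam over all model parameters, as the TODO in __init__ suggests.
    return torch.optim.Adam(model.parameters(), lr=learning_rate)


def train_one_epoch(model: nn.Module, loader: DataLoader, loss_fn: nn.Module,
                    optimizer: torch.optim.Optimizer, device: torch.device) -> dict:
    model.train()
    total_loss, n_batches = 0.0, 0
    for x, y in loader:
        x, y = x.to(device), y.to(device)
        optimizer.zero_grad()
        loss = loss_fn(model(x), y)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
        n_batches += 1
    return {"train_loss": total_loss / max(n_batches, 1)}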
5
forecasting_model/utils/__init__.py
Normal file
@ -0,0 +1,5 @@
"""
Utility functions and classes for the forecasting model.

This package contains configuration models, helper functions, and other utilities.
"""
62
forecasting_model/utils/config_model.py
Normal file
@ -0,0 +1,62 @@
from pydantic import BaseModel, Field
from typing import Optional, List, Union
from enum import Enum


class WaveletTransformConfig(BaseModel):
    apply: bool = False
    target_or_feature: str = "target"
    wavelet_type: str = "db4"
    level: int = 3
    use_coeffs: List[str] = ["approx", "detail_1"]


class DataConfig(BaseModel):
    data_path: str
    datetime_col: str
    target_col: str


class FeatureConfig(BaseModel):
    sequence_length: int
    forecast_horizon: int
    lags: List[int]
    rolling_window_sizes: List[int]
    use_time_features: bool
    scaling_method: Optional[str] = None
    wavelet_transform: Optional[WaveletTransformConfig] = None


class ModelConfig(BaseModel):
    input_size: Optional[int] = None  # Will be calculated
    hidden_size: int
    num_layers: int
    dropout: float
    use_residual_skips: bool = False
    output_size: Optional[int] = None  # Will be calculated


class TrainingConfig(BaseModel):
    batch_size: int
    epochs: int
    learning_rate: float
    loss_function: str
    device: str
    early_stopping_patience: Optional[int] = None
    scheduler_step_size: Optional[int] = None
    scheduler_gamma: Optional[float] = None


class CrossValidationConfig(BaseModel):
    n_splits: int
    test_size_fraction: float
    val_size_fraction: float
    initial_train_size: Optional[Union[int, float]] = None


class EvaluationConfig(BaseModel):
    metrics: List[str]
    eval_batch_size: int
    save_plots: bool
    plot_sample_size: int


class MainConfig(BaseModel):
    data: DataConfig
    features: FeatureConfig
    model: ModelConfig
    training: TrainingConfig
    cross_validation: CrossValidationConfig
    evaluation: EvaluationConfig
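Since config.yaml is committed empty, here is a hypothetical configuration that satisfies MainConfig, embedded as a YAML string and validated with pydantic; every value is illustrative rather than a project default.

# Hedged sketch (editor's addition): an example config that MainConfig accepts.
import yaml
from forecasting_model.utils.config_model import MainConfig

# Illustrative values only; the committed config.yaml is still empty.
EXAMPLE_CONFIG = """
data:
  data_path: data/energy.csv
  datetime_col: timestamp
  target_col: load
features:
  sequence_length: 48
  forecast_horizon: 24
  lags: [24, 48, 168]
  rolling_window_sizes: [24, 168]
  use_time_features: true
  scaling_method: standard
model:
  hidden_size: 64
  num_layers: 2
  dropout: 0.2
training:
  batch_size: 64
  epochs: 50
  learning_rate: 0.001
  loss_function: MSE
  device: cpu
  early_stopping_patience: 5
cross_validation:
  n_splits: 3
  test_size_fraction: 0.1
  val_size_fraction: 0.1
evaluation:
  metrics: [MAE, RMSE]
  eval_batch_size: 128
  save_plots: true
  plot_sample_size: 1000
"""

config = MainConfig(**yaml.safe_load(EXAMPLE_CONFIG))
print(config.model.hidden_size)  # 64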
126
main.py
Normal file
@ -0,0 +1,126 @@
import logging
import torch
import numpy as np
from pathlib import Path
from typing import Dict, List, Any

from forecasting_model.utils.config_model import MainConfig
from forecasting_model.data_processing import (
    load_raw_data,
    TimeSeriesCrossValidationSplitter,
    prepare_fold_data_and_loaders
)
from forecasting_model.model import LSTMForecastModel
from forecasting_model.trainer import Trainer
from forecasting_model.evaluation import evaluate_fold

# Configure logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
)
logger = logging.getLogger(__name__)


def load_config(config_path: Path) -> MainConfig:
    """
    Load and validate configuration from YAML file.
    """
    # TODO: Implement config loading
    pass


def set_seeds(seed: int = 42) -> None:
    """
    Set random seeds for reproducibility.
    """
    # TODO: Implement seed setting
    pass


def determine_device(config: MainConfig) -> torch.device:
    """
    Determine the device to use for training.
    """
    # TODO: Implement device determination
    pass


def aggregate_cv_metrics(all_fold_metrics: List[Dict[str, float]]) -> Dict[str, Dict[str, float]]:
    """
    Calculate mean and standard deviation of metrics across folds.
    """
    # TODO: Implement metric aggregation
    pass


def main():
    # Load configuration
    config = load_config(Path("config.yaml"))

    # Set random seeds
    set_seeds()

    # Determine device
    device = determine_device(config)

    # Load raw data
    df = load_raw_data(config.data)

    # Initialize CV splitter
    cv_splitter = TimeSeriesCrossValidationSplitter(config.cross_validation, len(df))

    # Initialize list to store fold metrics
    all_fold_metrics = []

    # Cross-validation loop
    for fold_num, (train_idx, val_idx, test_idx) in enumerate(cv_splitter.split(), 1):
        logger.info(f"Starting fold {fold_num}")

        # Prepare data loaders
        train_loader, val_loader, test_loader, target_scaler, input_size = prepare_fold_data_and_loaders(
            df, train_idx, val_idx, test_idx,
            config.features, config.training, config.evaluation
        )

        # Update model config with input size
        config.model.input_size = input_size

        # Initialize model
        model = LSTMForecastModel(config.model).to(device)

        # Initialize loss function
        loss_fn = torch.nn.MSELoss() if config.training.loss_function == "MSE" else torch.nn.L1Loss()

        # Initialize scheduler if configured
        scheduler = None
        if config.training.scheduler_step_size is not None:
            # TODO: Initialize scheduler
            pass

        # Initialize trainer
        trainer = Trainer(
            model, train_loader, val_loader, loss_fn, device,
            config.training, scheduler, target_scaler
        )

        # Train model
        trainer.train()

        # Evaluate on test set
        fold_metrics = evaluate_fold(
            model, test_loader, loss_fn, device,
            target_scaler, config.evaluation, fold_num
        )

        all_fold_metrics.append(fold_metrics)

        # Optional: Clear GPU memory
        if device.type == "cuda":
            torch.cuda.empty_cache()

    # Aggregate metrics
    aggregated_metrics = aggregate_cv_metrics(all_fold_metrics)

    # Log final results
    logger.info("Cross-validation results:")
    for metric, stats in aggregated_metrics.items():
        logger.info(f"{metric}: {stats['mean']:.4f} ± {stats['std']:.4f}")


if __name__ == "__main__":
    main()
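main.py leaves load_config, set_seeds, determine_device, and aggregate_cv_metrics unimplemented. The sketch below shows plausible versions; the special "auto" device value is an assumption on the editor's part, since TrainingConfig only declares device as a string.

# Hedged sketch (editor's addition): plausible fillings for the four TODO
# helpers in main.py, not the project's committed implementation.
import random
from pathlib import Path
from typing import Dict, List

import numpy as np
import torch
import yaml

from forecasting_model.utils.config_model import MainConfig


def load_config(config_path: Path) -> MainConfig:
    # Parse the YAML file and let pydantic validate the structure.
    with open(config_path, "r") as f:
        raw = yaml.safe_load(f)
    return MainConfig(**raw)


def set_seeds(seed: int = 42) -> None:
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)


def determine_device(config: MainConfig) -> torch.device:
    # Honour an explicit setting; "auto" (assumed value) falls back to CUDA when available.
    if config.training.device != "auto":
        return torch.device(config.training.device)
    return torch.device("cuda" if torch.cuda.is_available() else "cpu")


def aggregate_cv_metrics(all_fold_metrics: List[Dict[str, float]]) -> Dict[str, Dict[str, float]]:
    # Mean and standard deviation per metric name across folds.
    aggregated = {}
    for name in all_fold_metrics[0]:
        values = [fold[name] for fold in all_fold_metrics]
        aggregated[name] = {"mean": float(np.mean(values)), "std": float(np.std(values))}
    return aggregated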