init
This commit is contained in:
0
data_analysis/utils/__init__.py
Normal file
0
data_analysis/utils/__init__.py
Normal file
306
data_analysis/utils/_latex_report_template.tex
Normal file
306
data_analysis/utils/_latex_report_template.tex
Normal file
@ -0,0 +1,306 @@
|
||||
% LaTeX EDA Report Template
|
||||
\documentclass[11pt, a4paper]{article}
|
||||
|
||||
% --- Packages ---
|
||||
\usepackage[utf8]{inputenc}
|
||||
\usepackage[T1]{fontenc}
|
||||
\usepackage{lmodern} % Use Latin Modern fonts
|
||||
\usepackage[margin=1in]{geometry} % Set page margins
|
||||
\usepackage{graphicx} % Required for including images
|
||||
% \graphicspath{{../reports/plots/}} % Disabled; kept for reference only
|
||||
\usepackage{booktabs} % For professional quality tables (\toprule, \midrule, \bottomrule)
|
||||
\usepackage{amsmath} % For math symbols and environments
|
||||
\usepackage{datetime2} % For date formatting (optional, can use simple text)
|
||||
\usepackage{float} % For finer control over figure placement (e.g., [H] option)
|
||||
\usepackage{caption} % For customizing captions
|
||||
\usepackage{hyperref} % For clickable links (optional)
|
||||
\usepackage{sectsty} % To potentially adjust section font sizes/styles (optional)
|
||||
\usepackage{parskip} % Use vertical space between paragraphs instead of indentation
|
||||
\usepackage{ifthen} % Conditional logic (\ifthenelse, \boolean) for the optional figures
|
||||
|
||||
% --- Hyperref Setup (Optional) ---
|
||||
\hypersetup{
|
||||
colorlinks=true,
|
||||
linkcolor=blue,
|
||||
filecolor=magenta,
|
||||
urlcolor=cyan,
|
||||
pdftitle={Time Series EDA Report},
|
||||
pdfpagemode=FullScreen,
|
||||
}
|
||||
|
||||
% --- Custom LaTeX Definitions Placeholder ---
|
||||
{{LATEX_DEFINITIONS}} % Python script will insert \newcommand definitions here
|
||||
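% Fallback definitions for the optional-figure flags (e.g., for manual compilation).
% Each flag is a macro that must expand to the word `true` or `false`; \boolean{...} below turns it into \iftrue / \iffalse.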
|
||||
\ifdefined\ifShowZoomedTimeseries\else\newcommand{\ifShowZoomedTimeseries}{false}\fi
|
||||
\ifdefined\ifShowYearlyDecomp\else\newcommand{\ifShowYearlyDecomp}{false}\fi
|
||||
|
||||
% --- Document Information ---
|
||||
\title{Time Series Exploratory Data Analysis Report: Hourly Prices}
|
||||
\author{Generated Automatically}
|
||||
\date{\reportDateGenerated} % Use the macro defined in Python
|
||||
|
||||
% --- Start Document ---
|
||||
\begin{document}
|
||||
|
||||
\maketitle
|
||||
|
||||
% --- Overview Section ---
|
||||
\section*{Report Overview}
|
||||
\begin{itemize}
|
||||
\item \textbf{Data Source:} \dataSourceDescription
|
||||
\item \textbf{Time Series Variable:} \texttt{\priceVariableName}
|
||||
\item \textbf{Time Index Frequency:} \timeIndexFrequency
|
||||
\item \textbf{Date Range:} \dateRangeStart \ to \dateRangeEnd
|
||||
\end{itemize}
|
||||
|
||||
% --- Section 1: Data Overview ---
|
||||
\section{Data Overview and Initial Inspection}
|
||||
Purpose: Understand the basic structure, size, and data types of the dataset. Check the time index integrity.
|
||||
|
||||
\subsection*{Key Information}
|
||||
\begin{itemize}
|
||||
\item Number of data points (length of the series): \numDataPoints
|
||||
\item Confirmation of time index format and frequency: \timeIndexConfirmation
|
||||
\item Presence of other columns/variables: \otherColumnsList
|
||||
\end{itemize}
|
||||
|
||||
\subsection*{Raw Data Sample}
|
||||
% Placeholder for Table: First 5 Rows
|
||||
\tableHeadData
|
||||
\vspace{\baselineskip} % Add some vertical space
|
||||
|
||||
% Placeholder for Table: Last 5 Rows
|
||||
\tableTailData
|
||||
|
||||
\subsection*{Data Types}
|
||||
% Placeholder for Table: Data Types (`df.info()`)
|
||||
\tableDtypesInfo
|
||||
|
||||
% --- Section 2: Descriptive Statistics & Missing Values ---
|
||||
\section{Descriptive Statistics and Missing Values}
|
||||
Purpose: Summarize the central tendency, dispersion, and distribution of the price variable and identify data completeness issues. Note any unusual values (like negative prices).
|
||||
|
||||
\subsection*{Price Variable Statistics}
|
||||
% Placeholder for Table: Descriptive Statistics (`df['Price'].describe()`)
|
||||
\tableDescriptiveStats
|
||||
|
||||
\subsection*{Missing Values}
|
||||
% Placeholder for Table: Count of Missing Values
|
||||
\tableMissingCounts
|
||||
\vspace{\baselineskip}
|
||||
|
||||
% Placeholder for Table: Percentage of Missing Values
|
||||
\tableMissingPercentages
|
||||
\vspace{\baselineskip}
|
||||
|
||||
Observations on missing values: \missingValuesObservations % Add a text placeholder
|
||||
|
||||
% --- Section 3: Visual Exploration ---
|
||||
\section{Visual Exploration of Time Series Patterns}
|
||||
Purpose: Visually identify overall trends, seasonality (daily, weekly, yearly), cycles, outliers, and changes in variance. Investigate interactions between patterns.
|
||||
|
||||
\begin{figure}[H] % Use [H] from float package to place figure 'here' if possible
|
||||
\centering
|
||||
% Placeholder for Plot: Full Time Series
|
||||
\includegraphics[width=0.9\textwidth]{\plotFullTimeseries}
|
||||
\caption{Full Time Series: Price vs. Time.}
|
||||
\label{fig:full_ts}
|
||||
\end{figure}
|
||||
|
||||
% --- Conditionally include Zoomed Timeseries Plot ---
|
||||
\ifthenelse{\boolean{\ifShowZoomedTimeseries}}{%
|
||||
\begin{figure}[H]
|
||||
\centering
|
||||
% Placeholder for Plot: Zoomed Time Series
|
||||
\includegraphics[width=0.9\textwidth]{\plotZoomedTimeseries}
|
||||
\caption{Zoomed Time Series (Specific Period).}
|
||||
\label{fig:zoomed_ts}
|
||||
\end{figure}
|
||||
}{} % Empty 'else' part - include nothing if false
|
||||
|
||||
\begin{figure}[H]
|
||||
\centering
|
||||
% Placeholder for Plot: Histogram
|
||||
\includegraphics[width=0.7\textwidth]{\plotHistogram}
|
||||
\caption{Distribution of Price Values.}
|
||||
\label{fig:histogram}
|
||||
\end{figure}
|
||||
|
||||
\subsection*{Seasonal Patterns \& Interactions}
|
||||
|
||||
\begin{figure}[H]
|
||||
\centering
|
||||
% Placeholder for Plot: Box Plots by Hour
|
||||
\includegraphics[width=0.9\textwidth]{\plotBoxplotHour}
|
||||
\caption{Price Distribution by Hour of Day.}
|
||||
\label{fig:boxplot_hour}
|
||||
\end{figure}
|
||||
|
||||
\begin{figure}[H]
|
||||
\centering
|
||||
% Placeholder for Plot: Box Plots by Day of Week
|
||||
\includegraphics[width=0.9\textwidth]{\plotBoxplotDayofweek}
|
||||
\caption{Price Distribution by Day of Week.}
|
||||
\label{fig:boxplot_dayofweek}
|
||||
\end{figure}
|
||||
|
||||
\begin{figure}[H]
|
||||
\centering
|
||||
% Placeholder for Plot: Box Plots by Month
|
||||
\includegraphics[width=0.9\textwidth]{\plotBoxplotMonth}
|
||||
\caption{Price Distribution by Month.}
|
||||
\label{fig:boxplot_month}
|
||||
\end{figure}
|
||||
|
||||
\begin{figure}[H]
|
||||
\centering
|
||||
% Placeholder for Plot: Box Plots by Year
|
||||
\includegraphics[width=0.9\textwidth]{\plotBoxplotYear}
|
||||
\caption{Price Distribution by Year.}
|
||||
\label{fig:boxplot_year}
|
||||
\end{figure}
|
||||
|
||||
% Optional Seasonal Subseries Plots
|
||||
\textit{Optional: Seasonal Sub-series plots below.}
|
||||
|
||||
\begin{figure}[H]
|
||||
\centering
|
||||
% Placeholder for Optional Plot: Seasonal Sub-series Daily
|
||||
\includegraphics[width=0.9\textwidth]{\plotSeasonalSubseriesDaily}
|
||||
\caption{Seasonal Sub-series Plot (Daily Pattern).}
|
||||
\label{fig:subseries_daily}
|
||||
\end{figure}
|
||||
|
||||
\begin{figure}[H]
|
||||
\centering
|
||||
% Placeholder for Optional Plot: Seasonal Sub-series Weekly
|
||||
\includegraphics[width=0.9\textwidth]{\plotSeasonalSubseriesWeekly}
|
||||
\caption{Seasonal Sub-series Plot (Weekly Pattern).}
|
||||
\label{fig:subseries_weekly}
|
||||
\end{figure}
|
||||
|
||||
Observations on seasonal interactions: \seasonalInteractionsObservations % Placeholder
|
||||
|
||||
% --- Section 4: Time Series Decomposition ---
|
||||
\section{Time Series Decomposition}
|
||||
Purpose: Separate the time series into its underlying components: Trend, Seasonality, and Residuals. Assess how well the decomposition captures the main patterns.
|
||||
|
||||
Method Used: \decompositionMethodDetails
|
||||
|
||||
\begin{figure}[H]
|
||||
\centering
|
||||
% Placeholder for Plot: Decomposition (Daily Period)
|
||||
\includegraphics[width=0.9\textwidth]{\plotDecompositionDaily}
|
||||
\caption{Time Series Decomposition (Daily Seasonality, Period=24).}
|
||||
\label{fig:decomp_daily}
|
||||
\end{figure}
|
||||
|
||||
\begin{figure}[H]
|
||||
\centering
|
||||
% Placeholder for Plot: Decomposition (Weekly Period)
|
||||
\includegraphics[width=0.9\textwidth]{\plotDecompositionWeekly}
|
||||
\caption{Time Series Decomposition (Weekly Seasonality, Period=168).}
|
||||
\label{fig:decomp_weekly}
|
||||
\end{figure}
|
||||
|
||||
% Optional Yearly Decomposition
|
||||
\textit{Optional: Yearly decomposition plot below.}
|
||||
% --- Conditionally include Yearly Decomposition Plot ---
|
||||
\ifthenelse{\boolean{\ifShowYearlyDecomp}}{%
|
||||
\begin{figure}[H]
|
||||
\centering
|
||||
% Placeholder for Plot: Decomposition (Yearly Period) - Optional
|
||||
\includegraphics[width=0.9\textwidth]{\plotDecompositionYearly}
|
||||
\caption{Time Series Decomposition (Yearly Seasonality, Period=8760).}
|
||||
\label{fig:decomp_yearly}
|
||||
\end{figure}
|
||||
}{} % Empty 'else' part - include nothing if false
|
||||
|
||||
Observations on decomposition: \decompositionObservations % Placeholder
|
||||
|
||||
% --- Section 5: Stationarity Analysis ---
|
||||
\section{Stationarity Analysis}
|
||||
Purpose: Determine if the statistical properties (mean, variance, autocorrelation) are constant over time.
|
||||
|
||||
Methods: Visual inspection, Augmented Dickey-Fuller (ADF) Test, KPSS Test.
|
||||
|
||||
Series Tested: \stationaritySeriesTested
|
||||
|
||||
\subsection*{Visual Inspection (Residuals)}
|
||||
Refer to the trend component in the decomposition plots (Figures \ref{fig:decomp_daily}, \ref{fig:decomp_weekly}).
|
||||
|
||||
\begin{figure}[H]
|
||||
\centering
|
||||
% Placeholder for Plot: Residuals
|
||||
\includegraphics[width=0.9\textwidth]{\plotResiduals}
|
||||
\caption{Residuals from Decomposition (used for stationarity tests).}
|
||||
\label{fig:residuals}
|
||||
\end{figure}
|
||||
|
||||
\subsection*{Statistical Test Results}
|
||||
% Placeholder for Table: ADF Test Results
|
||||
\tableAdfResults
|
||||
\vspace{\baselineskip}
|
||||
|
||||
% Placeholder for Table: KPSS Test Results
|
||||
\tableKpssResults
|
||||
|
||||
\subsection*{Findings}
|
||||
\stationarityFindingsSummary % Placeholder
|
||||
|
||||
% --- Section 6: Autocorrelation Analysis ---
|
||||
\section{Autocorrelation Analysis}
|
||||
Purpose: Understand the linear dependence between the series (or tested series) and its past values.
|
||||
|
||||
Series Analyzed: \autocorrSeriesAnalyzed
|
||||
|
||||
Lags Shown: \autocorrLagsShown
|
||||
|
||||
\begin{figure}[H]
|
||||
\centering
|
||||
% Placeholder for Plot: ACF
|
||||
\includegraphics[width=0.9\textwidth]{\plotAcf}
|
||||
\caption{Autocorrelation Function (ACF).}
|
||||
\label{fig:acf}
|
||||
\end{figure}
|
||||
|
||||
\begin{figure}[H]
|
||||
\centering
|
||||
% Placeholder for Plot: PACF
|
||||
\includegraphics[width=0.9\textwidth]{\plotPacf}
|
||||
\caption{Partial Autocorrelation Function (PACF).}
|
||||
\label{fig:pacf}
|
||||
\end{figure}
|
||||
|
||||
Observations: \autocorrObservations % Placeholder
|
||||
|
||||
% --- Section 7: Summary and Implications ---
|
||||
\section{Analysis Summary and Implications for Forecasting}
|
||||
Purpose: Synthesize the findings and discuss their relevance for modeling.
|
||||
|
||||
\subsection*{Key Findings Summary}
|
||||
\begin{itemize}
|
||||
\item \textbf{Trend \& Cycles:} \summaryTrendCycles
|
||||
\item \textbf{Seasonality:} \summarySeasonality
|
||||
\item \textbf{Stationarity:} \summaryStationarity
|
||||
\item \textbf{Autocorrelations:} \summaryAutocorrelations
|
||||
\item \textbf{Outliers/Volatility:} \summaryOutliersVolatility
|
||||
\end{itemize}
|
||||
|
||||
\subsection*{Implications for Day-Ahead Model}
|
||||
\begin{itemize}
|
||||
\item \textbf{Model Choice:} \implicationsModelChoice
|
||||
\item \textbf{Feature Engineering:} \implicationsFeatureEngineering
|
||||
\item \textbf{Preprocessing:} \implicationsPreprocessing
|
||||
\item \textbf{Evaluation:} \implicationsEvaluation
|
||||
\item \textbf{Probabilistic Forecasting:} \implicationsProbabilistic
|
||||
\end{itemize}
|
||||
|
||||
% --- Section 8: Conclusion ---
|
||||
\section{Conclusion}
|
||||
Purpose: Briefly summarize the EDA process.
|
||||
|
||||
\conclusionStatement % Placeholder
|
||||
|
||||
% --- End Document ---
|
||||
\end{document}
|
166
data_analysis/utils/config_model.py
Normal file
166
data_analysis/utils/config_model.py
Normal file
@ -0,0 +1,166 @@
|
||||
import logging
|
||||
import yaml
|
||||
from pathlib import Path
|
||||
from pydantic import BaseModel, Field, ValidationError, field_validator # Use BaseModel for direct dict init
|
||||
from typing import Optional # Use Optional for type hints
|
||||
|
||||
# --- Logger Setup ---
|
||||
# Module-level logger named after this module; load_settings() below adjusts the root logger's level.
logger = logging.getLogger(__name__)
|
||||
|
||||
# --- Configuration File Path ---
|
||||
# Define the default path for the configuration file
|
||||
# NOTE: this is a relative path, so it is resolved against the current working directory when opened.
CONFIG_YAML_PATH = Path("config.yaml")
|
||||
|
||||
# --- Settings Model ---
|
||||
class Settings(BaseModel):
    """
    Application settings loaded from YAML configuration.

    This class defines the configuration structure for the forecasting model,
    including data paths, logging settings, and analysis parameters.

    Every field has a default, so ``Settings()`` can be built without any
    input. Validators normalise ``log_level`` to upper case and
    ``expected_data_frequency`` to lower case, check the zoom dates against
    the ``YYYY-MM-DD`` format, and replace a ``None`` template path with the
    field default.
    """
    # -- General Settings --
    debug: bool = Field(
        default=False,
        description="Enable debug mode for detailed logging and latex stderr output",
        examples=[True, False]
    )
    log_level: str = Field(
        default="INFO",
        description="Logging level for the application",
        examples=["DEBUG", "INFO", "WARNING", "ERROR", "CRITICAL"]
    )
    # -- IO Settings --
    data_file: Path = Field(
        default=Path("data/energy_prices.csv"),
        description="Path to the input data CSV file relative to project root",
        examples=["data/energy_prices.csv", "data/Day-ahead_Prices_60min.csv"]
    )
    latex_template_file: Optional[Path] = Field(
        default=Path("data_analysis/utils/_latex_report_template.tex"),
        description="Path to the LTX template file relative to project root",
        examples=["data_analysis/utils/_latex_report_template.tex", "data/byo_template.tex"]
    )
    output_dir: Path = Field(
        default=Path("output/reports"),
        description="Directory to save generated plots and report artifacts",
        examples=["output/reports", "analysis/results"]
    )
    # -- Zoom Settings (Plotting and Analysis) --
    zoom_start_date: Optional[str] = Field(
        default=None,
        description="Start date for zoomed-in analysis plots (YYYY-MM-DD format)",
        examples=["2023-01-01"]
    )
    zoom_end_date: Optional[str] = Field(
        default=None,
        description="End date for zoomed-in analysis plots (YYYY-MM-DD format)",
        examples=["2023-12-31"]
    )

    # -- Data Settings --
    expected_data_frequency: str = Field(
        default="h",
        description="Expected frequency of the time series data",
        examples=["h", "D", "M", "Y"]
    )

    @field_validator('log_level')
    @classmethod
    def validate_log_level(cls, v: str) -> str:
        """Validate that log_level is one of the standard logging levels.

        The comparison is case-insensitive; the upper-cased name is returned.

        Raises:
            ValueError: If the value is not a standard level name.
        """
        valid_levels = ["DEBUG", "INFO", "WARNING", "ERROR", "CRITICAL"]
        normalized = v.upper()
        if normalized not in valid_levels:
            raise ValueError(f"log_level must be one of {valid_levels}")
        return normalized

    @field_validator('expected_data_frequency')
    @classmethod
    def validate_frequency(cls, v: str) -> str:
        """Validate that frequency is one of the supported frequency strings.

        The comparison is case-insensitive and the lower-cased value is
        returned (so ``"D"`` becomes ``"d"``).

        Raises:
            ValueError: If the value is not one of the supported frequencies.
        """
        valid_freqs = ["h", "D", "M", "Y"]
        # NOTE(review): the returned value is lower-cased, which differs from the
        # casing listed in valid_freqs for "D", "M" and "Y"; confirm that the
        # consumers of this setting expect the lower-case form.
        v_lower = v.lower()
        if v_lower not in [f.lower() for f in valid_freqs]:
            raise ValueError(f"expected_data_frequency must be one of {valid_freqs}")
        return v_lower

    @field_validator('zoom_start_date', 'zoom_end_date')
    @classmethod
    def validate_date_format(cls, v: Optional[str]) -> Optional[str]:
        """Validate date format if provided.

        ``None`` is passed through unchanged; any other value must be
        parseable with the ``%Y-%m-%d`` format. The string itself is returned
        unmodified.

        Raises:
            ValueError: If the value cannot be parsed as ``YYYY-MM-DD``.
        """
        if v is None:
            return v
        # Imported locally because this module has no top-level datetime import.
        from datetime import datetime
        try:
            datetime.strptime(v, "%Y-%m-%d")
        except ValueError as err:
            # Chain the parser's error so the underlying reason stays visible.
            raise ValueError("Date must be in YYYY-MM-DD format") from err
        return v

    @field_validator('latex_template_file')
    @classmethod
    def validate_latex_template_file(cls, latex_template_file: Optional[Path]) -> Path:
        """Fall back to the field's default template path when the value is None."""
        return latex_template_file or cls.model_fields['latex_template_file'].default

    @classmethod
    def from_yaml(cls, yaml_path: Path) -> 'Settings':
        """
        Load settings from a YAML file.

        Unlike ``load_settings``, this constructor does not fall back to
        defaults on failure: errors are logged and re-raised. An empty YAML
        document is treated as "no overrides" and yields the default settings.

        Args:
            yaml_path: Path to the YAML configuration file

        Returns:
            Settings instance with values from the YAML file

        Raises:
            FileNotFoundError: If the YAML file doesn't exist
            yaml.YAMLError: If the YAML file is invalid
            ValidationError: If the YAML values don't match the schema
            TypeError: If the top-level YAML value is a non-empty non-mapping
        """
        # Accept plain strings as well as Path objects.
        yaml_path = Path(yaml_path)
        if not yaml_path.exists():
            raise FileNotFoundError(f"Configuration file not found: {yaml_path}")

        try:
            # Explicit encoding: do not depend on the platform's locale default.
            with open(yaml_path, 'r', encoding='utf-8') as f:
                config = yaml.safe_load(f)
            # safe_load returns None for an empty document; unpacking None
            # with ** would raise TypeError, so substitute an empty mapping.
            return cls(**(config or {}))
        except yaml.YAMLError as e:
            logger.error(f"Error parsing YAML file {yaml_path}: {e}")
            raise
        except Exception as e:
            logger.error(f"Error loading settings from {yaml_path}: {e}")
            raise
|
||||
|
||||
# --- Loading Function ---
|
||||
def load_settings(config_path: Path = CONFIG_YAML_PATH) -> Settings:
    """Load settings from a YAML file, falling back to defaults on failure.

    This is a best-effort loader: a missing, empty, unparsable or invalid
    configuration file is logged and answered with ``Settings()`` defaults
    rather than an exception. On success the root logger's level is set to
    the configured ``log_level``; the fallback paths leave it untouched.

    Args:
        config_path: Location of the YAML configuration file. A plain string
            is accepted as well as a ``Path``.

    Returns:
        The validated settings, or default settings if loading failed.
    """
    # Coerce first: the log line below calls .resolve(), which a str lacks.
    config_path = Path(config_path)
    logger.info(f"Attempting to load configuration from: {config_path.resolve()}")
    try:
        # Explicit encoding: do not depend on the platform's locale default.
        with open(config_path, 'r', encoding='utf-8') as f:
            config_data = yaml.safe_load(f)

        if not config_data:
            logger.warning(f"Configuration file {config_path} is empty. Using default settings.")
            return Settings()  # Return default settings if file is empty

        if not isinstance(config_data, dict):
            # A top-level list or scalar cannot be unpacked into keyword arguments.
            logger.error(
                f"Configuration file {config_path} must contain a mapping at the top level. "
                "Using default settings."
            )
            return Settings()

        settings = Settings(**config_data)
        logger.info("Configuration loaded successfully.")

        # Update logger level based on loaded settings
        level_name = settings.log_level.upper()
        logging.getLogger().setLevel(level_name)
        logger.info(f"Log level set to: {level_name}")
        logger.debug(settings.model_dump_json(indent=2))  # Log loaded settings at debug level
        return settings

    except FileNotFoundError:
        logger.warning(f"Configuration file {config_path} not found. Using default settings.")
        return Settings()  # Return default settings if file not found
    except yaml.YAMLError as e:
        logger.error(f"Error parsing YAML file {config_path}: {e}. Using default settings.")
        return Settings()  # Return default settings on parse error
    except ValidationError as e:
        logger.error(f"Configuration validation error: {e}. Using default settings.")
        return Settings()  # Return default settings on validation error
    except Exception as e:
        # Deliberate catch-all: configuration problems must not abort the import of this module.
        logger.error(f"An unexpected error occurred while loading settings: {e}. Using default settings.")
        return Settings()  # Catch other potential errors
|
||||
|
||||
# --- Global Settings Instance ---
|
||||
# Load settings when the module is imported
|
||||
# NOTE: import-time side effect. Importing this module reads CONFIG_YAML_PATH from disk;
# load_settings() logs load/parse/validation problems and returns default Settings instead of raising.
settings = load_settings()
|
11
data_analysis/utils/report_model.py
Normal file
11
data_analysis/utils/report_model.py
Normal file
@ -0,0 +1,11 @@
|
||||
from typing import Optional, Dict, Any
|
||||
|
||||
from pydantic import BaseModel
|
||||
|
||||
|
||||
class ReportData(BaseModel):
    """Container for all report-related data.

    Every field is optional and defaults to ``None``. The dictionary fields
    accept arbitrary string-keyed content; their expected keys are not
    defined in this module.
    """
    # NOTE(review): presumably summary statistics of the analysed series; confirm against the producer.
    descriptive_stats: Optional[Dict[str, Any]] = None
    # NOTE(review): presumably results of the stationarity tests; confirm against the producer.
    stationarity_tests: Optional[Dict[str, Any]] = None
    # NOTE(review): presumably values for the report's summary section; confirm against the producer.
    summary_data: Optional[Dict[str, Any]] = None
    # NOTE(review): presumably a human-readable note about missing-value imputation; confirm against the producer.
    imputation_message: Optional[str] = None
|
Reference in New Issue
Block a user