Files
entrix_case_challange/data_analysis/io/report.py
2025-05-03 20:46:14 +02:00

557 lines
27 KiB
Python

import datetime
import logging
import re
import subprocess
from pathlib import Path
from typing import Optional, Dict, Any
import shutil
import pandas as pd
from data_analysis.utils.data_config_model import settings # Assuming settings are configured
from data_analysis.utils.report_model import ReportData
logger = logging.getLogger(__name__)
# --- Helper function to format DataFrames/Series as LaTeX tables ---
CHARS = {
'&': r'\&',
'%': r'\%',
'$': r'\$',
'#': r'\#',
'_': r'\_',
'{': r'\{',
'}': r'\}',
'~': r'\textasciitilde{}',
'^': r'\^{}',
'\\': r'\textbackslash{}',
'<': r'\textless{}',
'>': r'\textgreater{}',
}
def _escape_latex(text: str) -> str:
"""Escapes special LaTeX characters in a string."""
# Convert input to string first to handle potential non-string types
t = str(text)
# Use a compiled regex for efficiency if called many times
# The pattern needs to be carefully ordered to handle overlapping keys (e.g., '\' vs '\\') correctly,
# although the current CHARS doesn't have overlaps. Sorting by length desc is safest.
pattern = re.compile('|'.join(re.escape(str(key)) for key in sorted(CHARS.keys(), key=lambda item: - len(item))))
t = pattern.sub(lambda match: CHARS[match.group()], t)
return t
def dataframe_to_latex(df: Optional[pd.DataFrame], title: Optional[str] = None, caption: Optional[str] = None, label: Optional[str] = None, escape: bool = True) -> Optional[str]:
    r"""Convert a pandas DataFrame to a LaTeX table via ``DataFrame.to_latex``.

    Args:
        df: Frame to render. ``None`` or an empty frame yields ``None``.
        title: Optional heading. It is printed in bold above the table
            whenever it differs from ``caption``; when no caption is given it
            is additionally used as the table caption.
        caption: Optional table caption.
        label: Optional label; emitted as ``tab:<label>``.
        escape: When True, column names, the index name, string index labels,
            every cell, the title and the caption are passed through
            ``_escape_latex``. Pass False when the content is already LaTeX.

    Returns:
        The LaTeX source, ``None`` when there is nothing to render, or a short
        ``\textit{...}`` error note when the conversion raises.
    """
    if df is None or df.empty:
        return None

    df_copy = df.copy()
    # The index is only worth a column when it carries information, i.e. it is
    # named or is something other than the default RangeIndex.
    include_index = df_copy.index.name is not None or not isinstance(df_copy.index, pd.RangeIndex)

    if escape:
        df_copy.columns = [_escape_latex(str(col)) for col in df_copy.columns]

        if include_index:
            index = df_copy.index
            index_name = index.name
            # to_latex is called with escape=False below, so textual index
            # labels must be escaped here. Non-text indexes (datetimes,
            # numbers) and MultiIndex tuples are left to pandas' formatting.
            has_text_labels = not isinstance(index, pd.MultiIndex) and (
                pd.api.types.is_object_dtype(index.dtype)
                or pd.api.types.is_string_dtype(index.dtype)
            )
            if has_text_labels:
                df_copy.index = index.map(
                    lambda v: _escape_latex(v) if isinstance(v, str) else v
                )
                df_copy.index.name = index_name
            if index_name:
                df_copy.index.name = _escape_latex(str(index_name))

        def _escape_cell(x):
            return _escape_latex(str(x))

        # DataFrame.map only exists in pandas >= 2.1; older releases expose the
        # same element-wise operation as applymap.
        if hasattr(pd.DataFrame, "map"):
            df_copy = df_copy.map(_escape_cell)
        else:
            df_copy = df_copy.applymap(_escape_cell)

    # Left-align every column, including the index column when it is shown.
    num_cols = len(df_copy.columns) + (1 if include_index else 0)
    col_format = "l" * num_cols

    try:
        escaped_title = _escape_latex(str(title)) if title and escape else title
        escaped_caption = _escape_latex(str(caption)) if caption and escape else caption
        latex_str = df_copy.to_latex(
            index=include_index,
            escape=False,  # Everything was escaped above when escape=True
            column_format=col_format,
            header=True,
            caption=escaped_caption if escaped_caption else escaped_title,
            label=f"tab:{label}" if label else None,
            position='!htbp',
        )
        if escaped_title and escaped_title != escaped_caption:
            latex_str = fr"\textbf{{{escaped_title}}}\par\par\medskip{latex_str}"
        return latex_str
    except Exception as e:
        # Best effort: a broken table must not abort the whole report, so the
        # failure is logged and a LaTeX-safe note is returned in its place.
        logger.error(f"Failed to convert DataFrame to LaTeX: {e}", exc_info=True)
        return fr"\textit{{Error generating LaTeX table: {_escape_latex(str(e))}}}"
def series_to_latex(series: Optional[pd.Series], title: Optional[str] = None, caption: Optional[str] = None, label: Optional[str] = None, escape: bool = True) -> str:
    r"""Render a pandas Series as a two-column LaTeX table.

    The series is turned into a frame whose first column holds the index and
    whose second column holds the values; rendering is then delegated to
    ``dataframe_to_latex`` with the same ``title``/``caption``/``label``/
    ``escape`` arguments. A missing or empty series yields ``\textit{N/A}\par``.
    """
    nothing_to_show = series is None or series.empty
    if nothing_to_show:
        return r"\textit{N/A}\par"

    # Fall back to generic headers when the series or its index is unnamed.
    headers = [
        str(series.index.name or 'Index'),
        str(series.name or 'Value'),
    ]
    table_frame = series.reset_index()
    table_frame.columns = headers
    return dataframe_to_latex(table_frame, title=title, caption=caption, label=label, escape=escape)
# --- Report Generation Function (LaTeX) ---
def compile_latex_report(report_tex_path: Path, output_dir: Path) -> bool:
    """Compile a .tex report to PDF with the locally installed ``pdflatex``.

    pdflatex is run twice with ``<output_dir>/_tmp`` as its output directory.
    On success the PDF is moved to ``<output_dir>/reports/report.pdf`` and the
    whole ``_tmp`` directory is removed; on failure ``_tmp`` is left in place.

    Args:
        report_tex_path: Path to the .tex file.
        output_dir: Base directory under which ``reports`` and ``_tmp`` live.

    Returns:
        True if both pdflatex runs succeeded and the PDF was moved, False
        otherwise (including when pdflatex is not installed).
    """
    logger.info(f"Attempting to compile LaTeX report: {report_tex_path}")

    reports_dir = output_dir / "reports"
    tmp_dir = output_dir / "_tmp"
    reports_dir.mkdir(parents=True, exist_ok=True)
    tmp_dir.mkdir(parents=True, exist_ok=True)

    command = [
        "pdflatex",
        "-interaction=nonstopmode",
        "-output-directory", str(tmp_dir),
        str(report_tex_path),
    ]

    try:
        # In debug mode pdflatex writes straight to the console; otherwise its
        # output is captured so it can be logged if the run fails.
        capture = not settings.debug

        # Two passes so that references and the table of contents resolve.
        for i in range(2):
            logger.info(f"Running pdflatex (attempt {i+1}/2)...")
            result = subprocess.run(
                command,
                capture_output=capture,
                text=True,
                # pdflatex output is not guaranteed to be valid in the locale
                # encoding; strict decoding would raise UnicodeDecodeError and
                # turn a successful build into a reported failure.
                errors="replace",
            )
            if result.returncode != 0:
                logger.error(f"LaTeX compilation failed (attempt {i+1})")
                if result.stdout:
                    tail = "\n".join(result.stdout.splitlines()[-30:])
                    logger.error(f"Last lines of pdflatex output:\n{tail}")
                return False

        pdf_path = tmp_dir / f"{report_tex_path.stem}.pdf"
        if not pdf_path.exists():
            logger.error(f"Expected PDF file not found: {pdf_path}")
            return False

        target_pdf = reports_dir / "report.pdf"
        shutil.move(str(pdf_path), str(target_pdf))
        logger.info(f"Successfully compiled and moved report to: {target_pdf}")

        shutil.rmtree(tmp_dir)
        logger.info("Cleaned up temporary LaTeX files")
        return True
    except FileNotFoundError:
        logger.error("pdflatex command not found. Please ensure LaTeX is installed and in your PATH.")
        return False
    except Exception as e:
        logger.error(f"Unexpected error during LaTeX compilation: {e}", exc_info=True)
        return False
def get_plot_path(key: str, plot_paths: Optional[Dict[str, str]]) -> str:
    """Return the ``reports/plots/...`` path registered for ``key``.

    The placeholder image path is returned when no mapping is supplied, when
    ``key`` is absent from it, or when the stored filename is empty.
    """
    placeholder = "reports/plots/placeholder.png"
    if plot_paths is None:
        return placeholder

    filename = plot_paths.get(key)
    if not filename:
        return placeholder
    return f"reports/plots/{filename}"
def _format_latex_command(macro_name: str, value: str) -> str:
"""Formats a LaTeX \newcommand definition. Assumes value is correctly escaped/formatted."""
# Creates \newcommand{\macroName}{value}
# Using simple string concatenation to avoid f-string/raw-string issues.
return "\\newcommand{\\" + macro_name + "}{" + value + "}"
def _format_stationarity_results(results: Optional[Dict[str, Any]], test_name: str) -> str:
"""Formats stationarity test results dictionary into a LaTeX string."""
default_na = r"\textit{N/A}"
if not results:
return default_na
test_data = results.get(test_name.lower())
if not test_data:
return default_na
# Ensure keys and values are escaped correctly *before* creating the Series
formatted_data = {}
for key, value in test_data.items():
escaped_key = _escape_latex(str(key)) # Escape the key
if isinstance(value, dict): # Handle Critical Values
# Escape keys and format values within the string
cv_str = ", ".join([f"{_escape_latex(k)}: {v:.3f}" for k, v in value.items()])
formatted_data[escaped_key] = cv_str
elif isinstance(value, (int, float)):
# Apply specific formatting for p-value and test statistic
if 'p-value' in key.lower():
formatted_data[escaped_key] = f"{value:.4f}"
elif 'statistic' in key.lower():
formatted_data[escaped_key] = f"{value:.3f}"
else:
# Convert non-float numbers to string
formatted_data[escaped_key] = str(value)
else:
# Escape other string values
formatted_data[escaped_key] = _escape_latex(str(value))
if not formatted_data:
return default_na
series = pd.Series(formatted_data)
series.name = "Value" # This name doesn't get escaped by default in series_to_latex
series.index.name = "Metric" # This name doesn't get escaped by default in series_to_latex
# Use series_to_latex for table structure, disable its internal escaping
# as we have already escaped the content. Title also needs pre-escaping.
escaped_title = _escape_latex(f"{test_name.upper()} Test Results")
return series_to_latex(series, title=escaped_title, label=f"{test_name.lower()}_results", escape=False)
def generate_latex_report(
    output_dir: Path,
    df: Optional[pd.DataFrame],
    report_data: ReportData,
    series_name_stat: Optional[str],
    acf_pacf_plot_paths: Optional[Dict[str, str]] = None,
    decomposition_plot_paths: Optional[Dict[str, str]] = None,
    other_plot_paths: Optional[Dict[str, str]] = None,
    decomposition_model: str = 'additive',
    acf_pacf_lags: Optional[int] = 48,
    template_path: Path = Path("data_analysis/utils/_latex_report_template.tex")
):
    r"""Generate the LaTeX EDA report (.tex) from the template and compile it.

    Every piece of report content is emitted as a ``\newcommand`` definition;
    the resulting block replaces the ``{{LATEX_DEFINITIONS}}`` placeholder in
    the template. The .tex file is written to ``<output_dir>/_tmp`` and then
    handed to ``compile_latex_report``. A failed compilation is only logged.

    Args:
        output_dir: Base directory; ``reports``, ``_tmp`` and ``plots`` are
            resolved beneath it.
        df: The analysed data. ``None`` or empty yields N/A placeholders.
        report_data: Analysis results; ``summary_data``, ``descriptive_stats``,
            ``stationarity_tests`` and ``imputation_message`` are read.
        series_name_stat: Name of the series used for the stationarity and
            autocorrelation sections.
        acf_pacf_plot_paths: Plot filenames keyed by ``'acf'`` / ``'pacf'``.
        decomposition_plot_paths: Plot filenames keyed by ``'daily'``,
            ``'weekly'`` and ``'yearly'``.
        other_plot_paths: Filenames of the remaining plots, keyed by plot name.
        decomposition_model: Model name quoted in the decomposition section.
        acf_pacf_lags: Number of lags quoted in the autocorrelation section.
        template_path: Location of the LaTeX template.

    Raises:
        FileNotFoundError: The template does not exist.
        IOError: The template cannot be read or the .tex file cannot be written.
        ValueError: The template lacks the ``{{LATEX_DEFINITIONS}}`` placeholder.
    """
    logger.info(f"Generating LaTeX EDA report using template: {template_path.resolve()}")

    reports_dir = output_dir / "reports"
    source_plots_dir = reports_dir / "plots"
    tmp_dir = output_dir / "_tmp"
    tmp_plots_dir = tmp_dir / "plots"
    reports_dir.mkdir(parents=True, exist_ok=True)
    tmp_dir.mkdir(parents=True, exist_ok=True)

    # Start from an empty plots directory next to the .tex file.
    if tmp_plots_dir.exists():
        shutil.rmtree(tmp_plots_dir)
    tmp_plots_dir.mkdir()

    # Bulk-copy the plots when the directory exists. A missing directory is
    # not fatal: plots that were never produced fall back to the placeholder
    # image, so the report can still be generated.
    bulk_plots_dir = output_dir / "plots"
    if bulk_plots_dir.is_dir():
        shutil.copytree(bulk_plots_dir, tmp_plots_dir, dirs_exist_ok=True)
    else:
        logger.warning(f"Plots directory not found, skipping bulk copy: {bulk_plots_dir}")

    report_tex_path = tmp_dir / "eda_report.tex"

    if not template_path.exists():
        logger.error(f"Report template not found at {template_path.resolve()}. Cannot generate report.")
        raise FileNotFoundError(f"Report template not found: {template_path.resolve()}")
    try:
        with open(template_path, 'r', encoding='utf-8') as f:
            template = f.read()
    except Exception as e:
        logger.error(f"Failed to read report template {template_path.resolve()}: {e}", exc_info=True)
        raise IOError(f"Failed to read report template {template_path.resolve()}: {e}") from e

    # --- Prepare LaTeX Definitions ---
    latex_definitions = []
    default_na = r"\textit{N/A}"
    default_text = r"\textit{Not provided - requires manual interpretation or more data.}\medskip"

    def add_def(macro_name: str, value: Optional[Any], formatter=None, default=default_na, escape_if_plain: bool = True):
        """Append one macro definition to ``latex_definitions``.

        Args:
            macro_name: Macro name without the leading backslash.
            value: Value for the macro; ``None`` selects ``default``.
            formatter: Optional callable turning ``value`` into a string. When
                it returns ``None`` (nothing to render) ``default`` is used.
                If it returns LaTeX code, pass ``escape_if_plain=False``.
            default: Text used when there is no value.
            escape_if_plain: Escape the final text unless it is known LaTeX.
        """
        final_str = default
        is_known_latex = False
        use_default = value is None

        if not use_default:
            if formatter:
                formatted = formatter(value)
                if formatted is None:
                    # e.g. dataframe_to_latex on an empty frame: show the
                    # default instead of the literal text "None".
                    use_default = True
                else:
                    final_str = formatted
                    if formatter in [dataframe_to_latex, series_to_latex, _format_stationarity_results]:
                        is_known_latex = True
            else:
                final_str = str(value)

        if use_default:
            final_str = default
            # The two module defaults are LaTeX markup and must not be escaped.
            is_known_latex = default in [default_na, default_text]

        final_str = str(final_str)
        if escape_if_plain and not is_known_latex:
            final_str = _escape_latex(final_str)
        latex_definitions.append(_format_latex_command(macro_name, final_str))

    def add_path_def(macro_name: str, path_dict: Optional[Dict[str, str]], key: str, default_filename='example-image-a'):
        r"""Define a macro holding the \includegraphics path for one plot.

        Returns True when a real plot filename was found for ``key`` and False
        when the placeholder image name was used instead.
        """
        filename = default_filename
        is_placeholder = True
        source_filename = None
        if path_dict and key in path_dict and path_dict[key]:
            actual_filename_from_dict = Path(path_dict[key]).name
            if actual_filename_from_dict:
                filename = actual_filename_from_dict
                source_filename = path_dict[key]
                is_placeholder = False

        # Real plots live in plots/ next to the .tex file; the placeholder is
        # referenced by bare name.
        if not is_placeholder:
            formatted_path = f"plots/{filename}".replace('\\', '/')
        else:
            formatted_path = Path(filename).name

        # Paths are passed through verbatim (no escaping).
        add_def(macro_name, formatted_path, escape_if_plain=False)

        if not is_placeholder and source_filename:
            source_file_path = source_plots_dir / Path(source_filename).name
            target_file_path = tmp_plots_dir / filename
            if source_file_path.is_file():
                try:
                    shutil.copy2(source_file_path, target_file_path)
                except Exception as copy_e:
                    logger.warning(f"Could not copy plot file {source_file_path} to {target_file_path}: {copy_e}")

        return not is_placeholder

    # --- Basic Info ---
    add_def("reportDateGenerated", datetime.date.today(), formatter=lambda d: d.strftime("%Y-%m-%d"))
    add_def("dataSourceDescription", f"Hourly prices from {settings.data_file.name}")
    add_def("priceVariableName", settings.data_file.stem)

    # --- Info from DataFrame ---
    if df is not None and not df.empty:
        range_start = df.index.min()
        range_end = df.index.max()
        # Timestamps are reduced to their date; any other index value is shown
        # as-is instead of failing on a missing .date() method.
        add_def("dateRangeStart", range_start.date() if hasattr(range_start, "date") else range_start)
        add_def("dateRangeEnd", range_end.date() if hasattr(range_end, "date") else range_end)
        add_def("numDataPoints", len(df))

        freq_info = "Irregular/Not Inferred"
        if isinstance(df.index, pd.DatetimeIndex):
            try:
                inferred = pd.infer_freq(df.index)
                freq_info = inferred if inferred else freq_info
            except Exception:
                logger.warning("Could not infer frequency.", exc_info=True)
        add_def("timeIndexFrequency", f"Hourly (Inferred: {freq_info})")
        add_def("timeIndexConfirmation", f"DatetimeIndex, Hourly (Inferred: {freq_info})")

        # Column names are escaped one by one, so add_def must not escape again.
        all_cols_str = ", ".join([_escape_latex(str(c)) for c in df.columns])
        add_def("otherColumnsList", all_cols_str if all_cols_str else "None", escape_if_plain=False)
    else:
        add_def("dateRangeStart", None, default=default_na)
        add_def("dateRangeEnd", None, default=default_na)
        add_def("numDataPoints", None, default=default_na)
        add_def("timeIndexFrequency", None, default=default_na)
        add_def("timeIndexConfirmation", None, default=default_na)
        add_def("otherColumnsList", "None")

    # --- Section 1 Tables ---
    summary_data = report_data.summary_data or {}
    add_def("tableHeadData", summary_data.get('head'),
            formatter=lambda df_val: dataframe_to_latex(df_val, title="First 5 Rows", label="head", escape=True),
            escape_if_plain=False, default=default_na)
    add_def("tableTailData", summary_data.get('tail'),
            formatter=lambda df_val: dataframe_to_latex(df_val, title="Last 5 Rows", label="tail", escape=True),
            escape_if_plain=False, default=default_na)
    add_def("tableDtypesInfo", summary_data.get('dtypes'),
            formatter=lambda s: series_to_latex(s, title="Data Types", label="dtypes", escape=True),
            escape_if_plain=False, default=default_na)

    # --- Section 2 Tables ---
    desc_stats = report_data.descriptive_stats or {}
    # Passed unescaped: series_to_latex(escape=True) escapes the title itself,
    # so escaping here as well would escape it twice.
    desc_title = f"Descriptive Statistics ({settings.data_file.stem})"
    add_def("tableDescriptiveStats", desc_stats.get('desc_price'),
            formatter=lambda s: series_to_latex(s, title=desc_title, label="desc_price", escape=True),
            escape_if_plain=False, default=default_na)

    missing_counts = summary_data.get('missing')
    add_def("tableMissingCounts", missing_counts,
            formatter=lambda s: series_to_latex(s, title="Missing Value Counts (Post-Imputation)", label="missing_counts", escape=True),
            escape_if_plain=False, default=default_na)

    missing_pct = None
    if missing_counts is not None and df is not None and len(df) > 0:
        missing_pct = (missing_counts / len(df)) * 100
        missing_pct = missing_pct.round(3)
    add_def("tableMissingPercentages", missing_pct,
            formatter=lambda s: series_to_latex(s, title="Missing Value Percentage (Post-Imputation)", label="missing_pct", escape=True),
            escape_if_plain=False, default=default_na)
    add_def("missingValuesObservations", report_data.imputation_message, default="Missing value check information not available.")

    # --- Section 3 Plots ---
    add_path_def("plotFullTimeseries", other_plot_paths, 'full_timeseries')
    # The return value tells the template whether a real plot is available.
    show_zoomed = add_path_def("plotZoomedTimeseries", other_plot_paths, 'zoomed_timeseries')
    add_def("ifShowZoomedTimeseries", "true" if show_zoomed else "false", escape_if_plain=False)
    add_path_def("plotHistogram", other_plot_paths, 'histogram_price')
    add_path_def("plotBoxplotHour", other_plot_paths, 'boxplot_hour')
    add_path_def("plotBoxplotDayofweek", other_plot_paths, 'boxplot_dayofweek')
    add_path_def("plotBoxplotMonth", other_plot_paths, 'boxplot_month')
    add_path_def("plotBoxplotYear", other_plot_paths, 'boxplot_year')
    add_path_def("plotSeasonalSubseriesDaily", other_plot_paths, 'seasonal_subseries_daily')
    add_path_def("plotSeasonalSubseriesWeekly", other_plot_paths, 'seasonal_subseries_weekly')
    add_def("seasonalInteractionsObservations", None, default=default_text, escape_if_plain=False)

    # --- Section 4 Decomposition ---
    add_def("decompositionMethodDetails", f"Statsmodels seasonal_decompose (model='{decomposition_model}')")
    add_path_def("plotDecompositionDaily", decomposition_plot_paths, 'daily')
    add_path_def("plotDecompositionWeekly", decomposition_plot_paths, 'weekly')
    show_yearly = add_path_def("plotDecompositionYearly", decomposition_plot_paths, 'yearly')
    add_def("ifShowYearlyDecomp", "true" if show_yearly else "false", escape_if_plain=False)
    add_def("decompositionObservations", None, default=default_text, escape_if_plain=False)

    # --- Section 5 Stationarity ---
    stationarity_tests = report_data.stationarity_tests or {}
    add_def("stationaritySeriesTested", series_name_stat)
    add_path_def("plotResiduals", other_plot_paths, 'residuals')
    add_def("tableAdfResults", stationarity_tests,
            formatter=lambda tests: _format_stationarity_results(tests, "ADF"),
            escape_if_plain=False, default=default_na)
    add_def("tableKpssResults", stationarity_tests,
            formatter=lambda tests: _format_stationarity_results(tests, "KPSS"),
            escape_if_plain=False, default=default_na)

    # findings_summary always holds finished LaTeX: the fallbacks are markup,
    # and plain-text conclusions are escaped before being stored. It is
    # therefore added with escaping disabled.
    findings_summary = r"\textit{Analysis requires both ADF and KPSS results.}"
    try:
        adf_res = stationarity_tests.get('adf')
        kpss_res = stationarity_tests.get('kpss')
        adf_p = adf_res.get('p-value') if adf_res else None
        kpss_p = kpss_res.get('p-value') if kpss_res else None

        plain_summary = None
        if adf_p is not None and kpss_p is not None:
            # ADF H0: unit root (non-stationary). KPSS H0: stationary.
            if adf_p < 0.05 and kpss_p >= 0.05:
                plain_summary = "Tests suggest the series is stationary (ADF rejects H0, KPSS fails to reject H0)."
            elif adf_p >= 0.05 and kpss_p < 0.05:
                # Both tests point to non-stationarity.
                plain_summary = "Tests suggest the series is non-stationary (unit root present) and requires differencing (ADF fails to reject H0, KPSS rejects H0)."
            elif adf_p < 0.05 and kpss_p < 0.05:
                plain_summary = "Test results conflict: ADF suggests stationarity, KPSS suggests non-stationarity. May indicate difference-stationarity."
            else:
                # ADF cannot rule out a unit root while KPSS cannot rule out
                # stationarity: the usual reading is trend-stationarity.
                plain_summary = "Test results conflict: ADF suggests non-stationarity, KPSS suggests stationarity. May indicate trend-stationarity (Both fail to reject H0)."
        elif adf_p is not None:
            plain_summary = f"ADF test p-value: {adf_p:.4f}. Stationarity conclusion requires KPSS test."
        elif kpss_p is not None:
            plain_summary = f"KPSS test p-value: {kpss_p:.4f}. Stationarity conclusion requires ADF test."

        if plain_summary is not None:
            findings_summary = _escape_latex(plain_summary)
    except Exception as e:
        logger.warning(f"Could not generate stationarity summary: {e}")
        findings_summary = r"\textit{Error generating summary.}"
    add_def("stationarityFindingsSummary", findings_summary, escape_if_plain=False)

    # --- Section 6 Autocorrelation ---
    add_def("autocorrSeriesAnalyzed", series_name_stat)
    add_def("autocorrLagsShown", acf_pacf_lags)
    add_path_def("plotAcf", acf_pacf_plot_paths, 'acf')
    add_path_def("plotPacf", acf_pacf_plot_paths, 'pacf')
    add_def("autocorrObservations", None, default=default_text, escape_if_plain=False)

    # --- Section 7 Summary & Implications ---
    add_def("summaryTrendCycles", None, default=default_text, escape_if_plain=False)
    add_def("summarySeasonality", None, default=default_text, escape_if_plain=False)
    add_def("summaryStationarity", None, default=default_text, escape_if_plain=False)
    add_def("summaryAutocorrelations", None, default=default_text, escape_if_plain=False)
    add_def("summaryOutliersVolatility", None, default=default_text, escape_if_plain=False)
    add_def("implicationsModelChoice", None, default=default_text, escape_if_plain=False)
    add_def("implicationsFeatureEngineering", None, default=default_text, escape_if_plain=False)
    add_def("implicationsPreprocessing", None, default=default_text, escape_if_plain=False)
    add_def("implicationsEvaluation", None, default=default_text, escape_if_plain=False)
    add_def("implicationsProbabilistic", None, default=default_text, escape_if_plain=False)

    # --- Section 8 Conclusion ---
    add_def("conclusionStatement", None, default=default_text, escape_if_plain=False)

    # --- Apply Definitions to Template ---
    definitions_block = "\n".join(latex_definitions)
    if "{{LATEX_DEFINITIONS}}" not in template:
        logger.error("Placeholder '{{LATEX_DEFINITIONS}}' not found in the LaTeX template preamble.")
        raise ValueError("Template missing '{{LATEX_DEFINITIONS}}' placeholder in preamble.")
    report_content = template.replace("{{LATEX_DEFINITIONS}}", definitions_block)

    # --- Write Report ---
    # Only the write itself is guarded, so the "Failed to write" error is
    # never reported for a problem that occurred during compilation.
    try:
        with open(report_tex_path, 'w', encoding='utf-8') as f:
            f.write(report_content)
    except Exception as e:
        logger.error(f"Failed to write LaTeX report to {report_tex_path}: {e}", exc_info=True)
        raise IOError(f"Failed to write LaTeX report to {report_tex_path}: {e}") from e
    logger.info(f"Successfully generated LaTeX report source: {report_tex_path}")

    # --- Compile ---
    # A failed compilation is logged but does not raise.
    if compile_latex_report(report_tex_path, output_dir):
        logger.info("LaTeX report successfully compiled to PDF")
    else:
        logger.warning("LaTeX compilation failed. Check logs above. The .tex file is available for manual compilation.")