# Source code for edaflow.analysis.report

"""
Automated profiling and reporting module for edaflow.

This module provides comprehensive EDA reporting functionality similar to ydata-profiling's
ProfileReport, generating detailed analysis reports with statistics, visualizations, and insights.
"""

import pandas as pd
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
from typing import Any, Dict, List, Optional, Union
import tempfile
import os
from io import BytesIO
import base64

# Import existing edaflow functions
from .core import (
    check_null_columns,
    visualize_histograms,
    visualize_heatmap
)


def profile_report(
    df: pd.DataFrame,
    top_n_categorical: int = 5,
    output_format: str = "html"
) -> Any:
    """
    Generate a comprehensive profiling report for a DataFrame.

    This function creates an automated EDA report similar to ydata-profiling's
    ProfileReport, including dataset overview, missing value analysis,
    categorical insights, and visualizations.

    Args:
        df (pd.DataFrame): The input DataFrame to profile
        top_n_categorical (int, optional): Maximum number of categorical
            columns to analyze. Columns are ranked by their number of unique
            values in ascending order, so the lowest-cardinality columns are
            the ones kept. Defaults to 5.
        output_format (str, optional): Output format for the report.
            Options: "html" (saves to temp file), "dict" (returns dict).
            Defaults to "html".

    Returns:
        Any: If output_format="html", returns path to HTML file.
            If output_format="dict", returns dict with:
            - 'overview': DataFrame with dataset info
            - 'data_types': DataFrame with the count of columns per dtype
            - 'summary_stats': DataFrame with summary statistics
            - 'missing_values': DataFrame with null analysis
            - 'categorical_insights': Dict with category distributions
            - 'numeric_insights': Dict with numeric column info
            - 'visualizations': Dict with matplotlib figures (an entry is
              None when that figure could not be drawn)

    Raises:
        ValueError: If df is empty, output_format is invalid, or
            top_n_categorical is not a positive integer
        TypeError: If df is not a pandas DataFrame

    Note:
        The function calls ``matplotlib.use('Agg')`` before plotting, which
        changes the matplotlib backend for the whole process.

    Examples:
        >>> import pandas as pd
        >>> import edaflow
        >>>
        >>> # Create sample data
        >>> df = pd.DataFrame({
        ...     'age': [25, 30, 35, 28, None, 45],
        ...     'salary': [50000, 60000, 70000, 55000, 65000, 80000],
        ...     'department': ['HR', 'IT', 'IT', 'HR', 'Finance', 'IT'],
        ...     'city': ['NYC', 'LA', 'NYC', 'LA', 'NYC', 'LA']
        ... })
        >>>
        >>> # Generate HTML report
        >>> report_path = edaflow.profile_report(df)
        >>> print(f"Report saved to: {report_path}")
        >>>
        >>> # Generate dict report
        >>> report_dict = edaflow.profile_report(df, output_format="dict")
        >>> print(report_dict['overview'])
        >>>
        >>> # Analyze top 3 categorical columns
        >>> report = edaflow.profile_report(df, top_n_categorical=3, output_format="dict")
        >>> print(report['categorical_insights'])

        Alternative import:
        >>> from edaflow.analysis import profile_report
        >>> report = profile_report(df)
    """
    # Input validation (defensive programming)
    if not isinstance(df, pd.DataFrame):
        raise TypeError(f"df must be a pandas DataFrame, got {type(df)}")

    if df.empty:
        raise ValueError("DataFrame is empty. Cannot generate profile report.")

    if output_format not in ("html", "dict"):
        raise ValueError(f"output_format must be 'html' or 'dict', got '{output_format}'")

    if not isinstance(top_n_categorical, int) or top_n_categorical < 1:
        raise ValueError(f"top_n_categorical must be a positive integer, got {top_n_categorical}")

    report_data = {}

    # 1. Dataset Overview
    report_data['overview'] = _overview_table(df)

    # 2. Data Types Summary
    dtype_counts = df.dtypes.value_counts()
    report_data['data_types'] = pd.DataFrame({
        'Data Type': [str(dtype) for dtype in dtype_counts.index],
        'Count': dtype_counts.values.tolist()
    })

    # 3. Summary Statistics for Numeric Columns
    numeric_cols = df.select_dtypes(include=[np.number]).columns.tolist()
    if numeric_cols:
        report_data['summary_stats'] = _numeric_summary(df, numeric_cols)
        report_data['numeric_insights'] = {
            'columns': numeric_cols,
            'count': len(numeric_cols),
            'total_values': len(df) * len(numeric_cols),
            'missing_values': df[numeric_cols].isnull().sum().sum()
        }
    else:
        report_data['summary_stats'] = pd.DataFrame()
        report_data['numeric_insights'] = {
            'columns': [],
            'count': 0,
            'message': 'No numeric columns found'
        }

    # 4. Missing Value Analysis
    report_data['missing_values'] = _missing_value_table(df)

    # 5. Categorical Insights
    categorical_cols = df.select_dtypes(include=['object', 'category']).columns.tolist()
    if categorical_cols:
        report_data['categorical_insights'] = _categorical_insights(
            df, categorical_cols, top_n_categorical
        )
    else:
        report_data['categorical_insights'] = {
            'message': 'No categorical columns found'
        }

    # 6. Visualizations
    visualizations = {}

    # Set matplotlib backend for non-interactive plotting
    matplotlib.use('Agg')

    # A key is only present when the corresponding figure was attempted.
    if numeric_cols:
        visualizations['histograms'] = _histogram_figure(df, numeric_cols)

    if len(numeric_cols) >= 2:
        visualizations['correlation_heatmap'] = _correlation_figure(df, numeric_cols)

    report_data['visualizations'] = visualizations

    # 7. Generate output based on format
    if output_format == "dict":
        return report_data

    # output_format was validated above, so this is the "html" branch.
    html_content = _generate_html_report(report_data, df)

    # delete=False: the caller receives the path and owns the file afterwards.
    # The context manager guarantees the handle is closed even if write fails.
    with tempfile.NamedTemporaryFile(
        mode='w',
        suffix='.html',
        delete=False,
        encoding='utf-8'
    ) as temp_file:
        temp_file.write(html_content)

    return temp_file.name


def _overview_table(df: pd.DataFrame) -> pd.DataFrame:
    """Build the Metric/Value overview table (shape, memory, duplicates, missing cells)."""
    n_rows, n_cols = df.shape
    memory_usage = df.memory_usage(deep=True).sum() / 1024**2  # bytes -> MB
    total_missing = df.isnull().sum().sum()
    # The caller rejects empty frames, so n_rows * n_cols is never zero here.
    missing_pct = total_missing / (n_rows * n_cols) * 100

    return pd.DataFrame({
        'Metric': [
            'Number of Rows',
            'Number of Columns',
            'Memory Usage (MB)',
            'Duplicate Rows',
            'Total Missing Cells',
            'Missing %'
        ],
        'Value': [
            f"{n_rows:,}",
            f"{n_cols:,}",
            f"{memory_usage:.2f}",
            f"{df.duplicated().sum():,}",
            f"{total_missing:,}",
            f"{missing_pct:.2f}%"
        ]
    })


def _numeric_summary(df: pd.DataFrame, numeric_cols: List[Any]) -> pd.DataFrame:
    """Return describe() statistics per numeric column, extended with missing/unique/zero counts."""
    numeric = df[numeric_cols]
    n_total = len(df)

    summary_stats = numeric.describe().T

    missing = numeric.isnull().sum()
    zeros = (numeric == 0).sum()
    summary_stats['missing'] = missing
    summary_stats['missing_%'] = (missing / n_total * 100).round(2)
    summary_stats['unique'] = numeric.nunique()
    summary_stats['zeros'] = zeros
    summary_stats['zeros_%'] = (zeros / n_total * 100).round(2)

    # Reorder columns for better readability
    column_order = ['count', 'missing', 'missing_%', 'unique', 'mean', 'std',
                    'min', '25%', '50%', '75%', 'max', 'zeros', 'zeros_%']
    return summary_stats[[col for col in column_order if col in summary_stats.columns]]


def _missing_value_table(df: pd.DataFrame) -> pd.DataFrame:
    """Return per-column missing counts/percentages, or a one-row placeholder/error table."""
    # Best-effort by design: a failure here is reported inside the table
    # instead of aborting the whole report.
    try:
        null_analysis = df.isnull().sum()
        null_percentages = (null_analysis / len(df) * 100).round(2)

        missing_df = pd.DataFrame({
            'Column': null_analysis.index,
            'Missing Count': null_analysis.values,
            'Missing %': null_percentages.values
        })
        missing_df = missing_df[missing_df['Missing Count'] > 0].sort_values(
            'Missing %', ascending=False
        )

        if missing_df.empty:
            missing_df = pd.DataFrame({
                'Column': ['No missing values'],
                'Missing Count': [0],
                'Missing %': [0.0]
            })
        return missing_df
    except Exception as e:
        return pd.DataFrame({
            'Error': [f"Could not analyze missing values: {str(e)}"]
        })


def _categorical_insights(
    df: pd.DataFrame,
    categorical_cols: List[Any],
    top_n_categorical: int
) -> Dict[Any, Any]:
    """Summarise the ``top_n_categorical`` lowest-cardinality categorical columns."""
    cat_info = []
    for col in categorical_cols:
        try:
            cat_info.append({
                'column': col,
                'unique': df[col].nunique(),
                'missing': df[col].isnull().sum()
            })
        except Exception:
            # Skip columns that cannot be summarised (e.g. unhashable cells),
            # but let KeyboardInterrupt/SystemExit propagate.
            continue

    # Ascending sort: the columns with the fewest distinct values come first.
    cat_info.sort(key=lambda info: info['unique'])

    categorical_insights = {}
    for info in cat_info[:top_n_categorical]:
        col = info['column']
        try:
            top_values = df[col].value_counts().head(10)  # Top 10 most frequent
            categorical_insights[col] = {
                'unique_values': info['unique'],
                'missing_values': info['missing'],
                'top_categories': {
                    'values': top_values.index.tolist(),
                    'counts': top_values.values.tolist(),
                    'percentages': (top_values / len(df) * 100).round(2).tolist()
                }
            }
        except Exception as e:
            categorical_insights[col] = {
                'error': f"Could not analyze: {str(e)}"
            }
    return categorical_insights


def _histogram_figure(df: pd.DataFrame, numeric_cols: List[Any]) -> Optional[Any]:
    """Draw one histogram per numeric column; return the figure, or None on failure."""
    fig = None
    try:
        # Calculate subplot grid (at most three plots per row)
        n_plots = len(numeric_cols)
        n_cols_plot = min(3, n_plots)
        n_rows_plot = (n_plots + n_cols_plot - 1) // n_cols_plot

        # Three inches of height per subplot row, never less than four in total.
        fig = plt.figure(figsize=(14, max(4, 3 * n_rows_plot)))

        for idx, col in enumerate(numeric_cols, 1):
            ax = fig.add_subplot(n_rows_plot, n_cols_plot, idx)

            # Filter out NaN values
            data = df[col].dropna()

            if len(data) > 0:
                ax.hist(data, bins='auto', alpha=0.7, color='steelblue', edgecolor='black')
                ax.set_xlabel(col)
                ax.set_ylabel('Frequency')
                ax.set_title(f'Distribution of {col}')
                ax.grid(True, alpha=0.3)

                # Add mean line
                mean_val = data.mean()
                ax.axvline(mean_val, color='red', linestyle='--', linewidth=2,
                           label=f'Mean: {mean_val:.2f}')
                ax.legend()
            else:
                ax.text(0.5, 0.5, 'No data', ha='center', va='center')
                ax.set_title(f'{col} (No data)')

        fig.tight_layout()
        return fig
    except Exception as e:
        print(f"Warning: Could not create histograms: {e}")
        # Release the half-built figure so it does not linger in pyplot.
        if fig is not None:
            plt.close(fig)
        return None


def _correlation_figure(df: pd.DataFrame, numeric_cols: List[Any]) -> Optional[Any]:
    """Draw an annotated correlation heatmap; return the figure, or None on failure."""
    fig = None
    try:
        fig = plt.figure(figsize=(10, 8))
        ax = fig.add_subplot(1, 1, 1)

        # Calculate correlation matrix
        corr_values = df[numeric_cols].corr().to_numpy()

        # Create heatmap
        im = ax.imshow(corr_values, cmap='RdYlBu_r', vmin=-1, vmax=1, aspect='auto')
        fig.colorbar(im, ax=ax, label='Correlation')

        # Add labels
        tick_positions = list(range(len(numeric_cols)))
        ax.set_xticks(tick_positions)
        ax.set_xticklabels(numeric_cols, rotation=45, ha='right')
        ax.set_yticks(tick_positions)
        ax.set_yticklabels(numeric_cols)

        # Add correlation values (x is the matrix column, y is the matrix row)
        for (i, j), value in np.ndenumerate(corr_values):
            ax.text(j, i, f'{value:.2f}',
                    ha='center', va='center', color='black', fontsize=10)

        ax.set_title('Correlation Heatmap', fontsize=14, pad=20)
        fig.tight_layout()
        return fig
    except Exception as e:
        print(f"Warning: Could not create correlation heatmap: {e}")
        # Release the half-built figure so it does not linger in pyplot.
        if fig is not None:
            plt.close(fig)
        return None


def _generate_html_report(report_data: Dict, df: pd.DataFrame) -> str:
    """
    Generate HTML report from report data.

    DataFrame sections are rendered with ``DataFrame.to_html``. Column names,
    category values and error messages are HTML-escaped before being inserted
    into hand-written markup, so data containing ``<``, ``>`` or ``&`` cannot
    break the page. Any figure found under ``report_data['visualizations']``
    is embedded as a base64 PNG and then closed.

    Args:
        report_data: Dictionary containing report components
        df: Original DataFrame (currently unused; kept for interface
            compatibility)

    Returns:
        str: HTML content
    """
    from html import escape

    html_parts = []

    # HTML Header
    html_parts.append("""
<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
    <meta name="viewport" content="width=device-width, initial-scale=1.0">
    <title>edaflow Profile Report</title>
    <style>
        body {
            font-family: 'Segoe UI', Tahoma, Geneva, Verdana, sans-serif;
            margin: 0;
            padding: 20px;
            background-color: #f5f5f5;
        }
        .container {
            max-width: 1200px;
            margin: 0 auto;
            background-color: white;
            padding: 30px;
            border-radius: 10px;
            box-shadow: 0 2px 10px rgba(0,0,0,0.1);
        }
        h1 {
            color: #2c3e50;
            border-bottom: 3px solid #3498db;
            padding-bottom: 10px;
        }
        h2 {
            color: #34495e;
            margin-top: 30px;
            border-left: 4px solid #3498db;
            padding-left: 15px;
        }
        h3 {
            color: #7f8c8d;
            margin-top: 20px;
        }
        table {
            border-collapse: collapse;
            width: 100%;
            margin: 20px 0;
            box-shadow: 0 1px 3px rgba(0,0,0,0.1);
        }
        th {
            background-color: #3498db;
            color: white;
            text-align: left;
            padding: 12px;
            font-weight: 600;
        }
        td {
            padding: 10px 12px;
            border-bottom: 1px solid #ecf0f1;
        }
        tr:hover {
            background-color: #f8f9fa;
        }
        .metric-card {
            background-color: #ecf0f1;
            padding: 15px;
            border-radius: 5px;
            margin: 10px 0;
        }
        .warning {
            color: #e74c3c;
            font-weight: bold;
        }
        .good {
            color: #27ae60;
            font-weight: bold;
        }
        .section {
            margin: 30px 0;
        }
        img {
            max-width: 100%;
            height: auto;
            margin: 20px 0;
            border-radius: 5px;
            box-shadow: 0 2px 8px rgba(0,0,0,0.1);
        }
        .footer {
            margin-top: 50px;
            padding-top: 20px;
            border-top: 1px solid #ecf0f1;
            text-align: center;
            color: #95a5a6;
            font-size: 14px;
        }
        .categorical-item {
            background-color: #f8f9fa;
            padding: 15px;
            margin: 15px 0;
            border-radius: 5px;
            border-left: 3px solid #3498db;
        }
    </style>
</head>
<body>
    <div class="container">
        <h1>📊 edaflow Profile Report</h1>
        <p style="color: #7f8c8d; font-size: 14px;">Generated using edaflow automated profiling</p>
""")

    # Overview Section
    html_parts.append("<div class='section'>")
    html_parts.append("<h2>🔍 Dataset Overview</h2>")
    if 'overview' in report_data and not report_data['overview'].empty:
        html_parts.append(report_data['overview'].to_html(index=False, classes=''))
    html_parts.append("</div>")

    # Data Types Section
    if 'data_types' in report_data and not report_data['data_types'].empty:
        html_parts.append("<div class='section'>")
        html_parts.append("<h2>📋 Data Types</h2>")
        html_parts.append(report_data['data_types'].to_html(index=False, classes=''))
        html_parts.append("</div>")

    # Missing Values Section
    html_parts.append("<div class='section'>")
    html_parts.append("<h2>❓ Missing Values Analysis</h2>")
    if 'missing_values' in report_data and not report_data['missing_values'].empty:
        html_parts.append(report_data['missing_values'].to_html(index=False, classes=''))
    else:
        html_parts.append("<p class='good'>✓ No missing values detected!</p>")
    html_parts.append("</div>")

    # Summary Statistics Section
    if 'summary_stats' in report_data and not report_data['summary_stats'].empty:
        html_parts.append("<div class='section'>")
        html_parts.append("<h2>📈 Summary Statistics (Numeric Columns)</h2>")
        html_parts.append(report_data['summary_stats'].to_html(classes=''))
        html_parts.append("</div>")

    # Categorical Insights Section
    cat_insights = report_data.get('categorical_insights')
    if isinstance(cat_insights, dict):
        # The "no categorical columns" placeholder maps 'message' to a plain
        # string, whereas a real column called "message" maps to a dict of
        # insights. Checking the value type keeps such a column from hiding
        # the whole section.
        is_placeholder = isinstance(cat_insights.get('message'), str)
        if not is_placeholder:
            html_parts.append("<div class='section'>")
            html_parts.append("<h2>🏷️ Categorical Column Insights</h2>")

            for col_name, col_data in cat_insights.items():
                if not isinstance(col_data, dict):
                    continue

                safe_name = escape(str(col_name))
                html_parts.append("<div class='categorical-item'>")
                html_parts.append(f"<h3>{safe_name}</h3>")

                if 'error' in col_data:
                    # Surface the failure instead of silently dropping the column.
                    safe_error = escape(str(col_data['error']))
                    html_parts.append(f"<p class='warning'>{safe_error}</p>")
                    html_parts.append("</div>")
                    continue

                html_parts.append(f"<p><strong>Unique Values:</strong> {col_data.get('unique_values', 'N/A')}</p>")
                html_parts.append(f"<p><strong>Missing Values:</strong> {col_data.get('missing_values', 'N/A')}</p>")

                if 'top_categories' in col_data:
                    top_cats = col_data['top_categories']
                    html_parts.append("<p><strong>Top Categories:</strong></p>")
                    html_parts.append("<table><tr><th>Value</th><th>Count</th><th>Percentage</th></tr>")
                    for val, cnt, pct in zip(top_cats['values'], top_cats['counts'], top_cats['percentages']):
                        # Category values come straight from user data: escape them.
                        safe_val = escape(str(val))
                        html_parts.append(f"<tr><td>{safe_val}</td><td>{cnt}</td><td>{pct}%</td></tr>")
                    html_parts.append("</table>")

                html_parts.append("</div>")

            html_parts.append("</div>")

    # Visualizations Section
    if 'visualizations' in report_data:
        visualizations = report_data['visualizations']
        html_parts.append("<div class='section'>")
        html_parts.append("<h2>📊 Visualizations</h2>")

        # (key in report_data['visualizations'], section heading, image alt text)
        figure_specs = (
            ('histograms', 'Distribution Histograms', 'Histograms'),
            ('correlation_heatmap', 'Correlation Heatmap', 'Correlation Heatmap'),
        )
        for key, heading, alt_text in figure_specs:
            fig = visualizations.get(key)
            if fig is None:
                continue
            try:
                img_data = _fig_to_base64(fig)
            finally:
                # Always release the figure, even if PNG encoding failed.
                plt.close(fig)
            html_parts.append(f"<h3>{heading}</h3>")
            html_parts.append(f'<img src="data:image/png;base64,{img_data}" alt="{alt_text}"/>')

        html_parts.append("</div>")

    # Footer
    html_parts.append("""
        <div class="footer">
            <p>Generated by <strong>edaflow</strong> - Exploratory Data Analysis Workflows</p>
            <p>For more information, visit: <a href="https://edaflow.readthedocs.io">edaflow.readthedocs.io</a></p>
        </div>
    </div>
</body>
</html>
""")

    return "".join(html_parts)


def _fig_to_base64(fig) -> str:
    """
    Convert matplotlib figure to base64 string for HTML embedding.

    The figure is rendered to an in-memory PNG (100 dpi, tight bounding box);
    nothing is written to disk and the figure itself is left open.

    Args:
        fig: Matplotlib figure object

    Returns:
        str: Base64 encoded image string
    """
    with BytesIO() as buffer:
        fig.savefig(buffer, format='png', dpi=100, bbox_inches='tight')
        # Base64 output is pure ASCII, so the decode cannot fail.
        return base64.b64encode(buffer.getvalue()).decode('ascii')


__all__ = ['profile_report']