"""
Automated profiling and reporting module for edaflow.
This module provides comprehensive EDA reporting functionality similar to ydata-profiling's
ProfileReport, generating detailed analysis reports with statistics, visualizations, and insights.
"""
import pandas as pd
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
from typing import Any, Dict, List, Optional, Union
import tempfile
import os
from io import BytesIO
import base64
# Import existing edaflow functions
from .core import (
check_null_columns,
visualize_histograms,
visualize_heatmap
)
# [docs] -- Sphinx "view source" link marker left over from the rendered listing; not part of the module.
def profile_report(
    df: pd.DataFrame,
    top_n_categorical: int = 5,
    output_format: str = "html"
) -> Any:
    """
    Generate a comprehensive profiling report for a DataFrame.

    This function creates an automated EDA report similar to ydata-profiling's
    ProfileReport, including dataset overview, data types, missing value
    analysis, numeric summary statistics, categorical insights, and
    visualizations.

    Args:
        df (pd.DataFrame): The input DataFrame to profile.
        top_n_categorical (int, optional): Maximum number of categorical
            (object/category dtype) columns to analyze. Columns are ranked by
            their number of unique values in ascending order, so the
            lowest-cardinality columns are the ones kept. Defaults to 5.
        output_format (str, optional): Output format for the report.
            Options: "html" (saves to temp file), "dict" (returns dict).
            Defaults to "html".

    Returns:
        Any: If output_format="html", returns the path of an HTML file written
        to the system temp directory (the file is not deleted automatically).
        If output_format="dict", returns dict with:
            - 'overview': DataFrame with dataset info
            - 'data_types': DataFrame with the count of columns per dtype
            - 'summary_stats': DataFrame with summary statistics
            - 'missing_values': DataFrame with null analysis
            - 'categorical_insights': Dict with category distributions
            - 'numeric_insights': Dict with numeric column info
            - 'visualizations': Dict with matplotlib figures (or None for a
              plot that could not be created). The figures are left open;
              the caller is responsible for closing them.

    Raises:
        ValueError: If df is empty, output_format is invalid, or
            top_n_categorical is not a positive integer
        TypeError: If df is not a pandas DataFrame

    Note:
        When the DataFrame has numeric columns, the matplotlib backend is
        switched to the non-interactive 'Agg' backend before plotting.

    Examples:
        >>> import pandas as pd
        >>> import edaflow
        >>>
        >>> # Create sample data
        >>> df = pd.DataFrame({
        ...     'age': [25, 30, 35, 28, None, 45],
        ...     'salary': [50000, 60000, 70000, 55000, 65000, 80000],
        ...     'department': ['HR', 'IT', 'IT', 'HR', 'Finance', 'IT'],
        ...     'city': ['NYC', 'LA', 'NYC', 'LA', 'NYC', 'LA']
        ... })
        >>>
        >>> # Generate HTML report
        >>> report_path = edaflow.profile_report(df)
        >>> print(f"Report saved to: {report_path}")
        >>>
        >>> # Generate dict report
        >>> report_dict = edaflow.profile_report(df, output_format="dict")
        >>> print(report_dict['overview'])
        >>>
        >>> # Analyze top 3 categorical columns
        >>> report = edaflow.profile_report(df, top_n_categorical=3, output_format="dict")
        >>> print(report['categorical_insights'])

        Alternative import:
        >>> from edaflow.analysis import profile_report
        >>> report = profile_report(df)
    """
    # Input validation (defensive programming)
    if not isinstance(df, pd.DataFrame):
        raise TypeError(f"df must be a pandas DataFrame, got {type(df)}")
    if df.empty:
        raise ValueError("DataFrame is empty. Cannot generate profile report.")
    if output_format not in ("html", "dict"):
        raise ValueError(f"output_format must be 'html' or 'dict', got '{output_format}'")
    if not isinstance(top_n_categorical, int) or top_n_categorical < 1:
        raise ValueError(f"top_n_categorical must be a positive integer, got {top_n_categorical}")

    # Initialize report components
    report_data = {}

    # 1. Dataset Overview
    n_rows, n_cols = df.shape
    memory_usage = df.memory_usage(deep=True).sum() / 1024**2  # MB
    total_missing = df.isnull().sum().sum()
    try:
        duplicate_rows = f"{df.duplicated().sum():,}"
    except TypeError:
        # Rows holding unhashable cells (lists, dicts, ...) cannot be hashed
        # for duplicate detection; report that instead of aborting the report.
        duplicate_rows = "N/A"

    overview_data = {
        'Metric': [
            'Number of Rows',
            'Number of Columns',
            'Memory Usage (MB)',
            'Duplicate Rows',
            'Total Missing Cells',
            'Missing %'
        ],
        'Value': [
            f"{n_rows:,}",
            f"{n_cols:,}",
            f"{memory_usage:.2f}",
            duplicate_rows,
            f"{total_missing:,}",
            # df.empty was rejected above, so n_rows * n_cols is never zero
            f"{(total_missing / (n_rows * n_cols) * 100):.2f}%"
        ]
    }
    report_data['overview'] = pd.DataFrame(overview_data)

    # 2. Data Types Summary
    dtype_counts = df.dtypes.value_counts()
    dtypes_data = {
        'Data Type': [str(dtype) for dtype in dtype_counts.index],
        'Count': dtype_counts.values.tolist()
    }
    report_data['data_types'] = pd.DataFrame(dtypes_data)

    # 3. Summary Statistics for Numeric Columns
    numeric_cols = df.select_dtypes(include=[np.number]).columns.tolist()
    if numeric_cols:
        numeric_df = df[numeric_cols]
        numeric_nulls = numeric_df.isnull().sum()
        numeric_zeros = (numeric_df == 0).sum()

        # Get standard summary stats
        summary_stats = numeric_df.describe().T
        # Add additional useful statistics
        summary_stats['missing'] = numeric_nulls
        summary_stats['missing_%'] = (numeric_nulls / len(df) * 100).round(2)
        summary_stats['unique'] = numeric_df.nunique()
        summary_stats['zeros'] = numeric_zeros
        summary_stats['zeros_%'] = (numeric_zeros / len(df) * 100).round(2)
        # Reorder columns for better readability
        column_order = ['count', 'missing', 'missing_%', 'unique', 'mean', 'std',
                        'min', '25%', '50%', '75%', 'max', 'zeros', 'zeros_%']
        summary_stats = summary_stats[[col for col in column_order if col in summary_stats.columns]]
        report_data['summary_stats'] = summary_stats
        report_data['numeric_insights'] = {
            'columns': numeric_cols,
            'count': len(numeric_cols),
            'total_values': len(df) * len(numeric_cols),
            'missing_values': numeric_nulls.sum()
        }
    else:
        report_data['summary_stats'] = pd.DataFrame()
        report_data['numeric_insights'] = {
            'columns': [],
            'count': 0,
            'message': 'No numeric columns found'
        }

    # 4. Missing Value Analysis
    try:
        # Computed directly with pandas so the result is a plain DataFrame
        # that can be both returned and rendered to HTML.
        null_analysis = df.isnull().sum()
        null_percentages = (null_analysis / len(df) * 100).round(2)
        missing_df = pd.DataFrame({
            'Column': null_analysis.index,
            'Missing Count': null_analysis.values,
            'Missing %': null_percentages.values
        })
        missing_df = missing_df[missing_df['Missing Count'] > 0].sort_values(
            'Missing %', ascending=False
        )
        if missing_df.empty:
            # Placeholder row so consumers always receive a non-empty table
            missing_df = pd.DataFrame({
                'Column': ['No missing values'],
                'Missing Count': [0],
                'Missing %': [0.0]
            })
        report_data['missing_values'] = missing_df
    except Exception as e:
        # Best effort: a failure here should not prevent the rest of the report
        report_data['missing_values'] = pd.DataFrame({
            'Error': [f"Could not analyze missing values: {str(e)}"]
        })

    # 5. Categorical Insights
    categorical_cols = df.select_dtypes(include=['object', 'category']).columns.tolist()
    if categorical_cols:
        cat_info = []
        for col in categorical_cols:
            try:
                cat_info.append({
                    'column': col,
                    'unique': df[col].nunique(),
                    'missing': df[col].isnull().sum()
                })
            except Exception:
                # Best effort: skip columns that cannot be profiled (e.g.
                # unhashable cell values) without swallowing
                # KeyboardInterrupt/SystemExit as a bare except would.
                continue

        # Rank by unique count (ascending, i.e. lowest cardinality first)
        # and keep at most top_n_categorical columns.
        top_cats = sorted(cat_info, key=lambda info: info['unique'])[:top_n_categorical]

        categorical_insights = {}
        for cat_dict in top_cats:
            col = cat_dict['column']
            try:
                top_values = df[col].value_counts().head(10)  # Top 10 most frequent
                categorical_insights[col] = {
                    'unique_values': cat_dict['unique'],
                    'missing_values': cat_dict['missing'],
                    'top_categories': {
                        'values': top_values.index.tolist(),
                        'counts': top_values.values.tolist(),
                        # Share of all rows (missing values included in the denominator)
                        'percentages': (top_values / len(df) * 100).round(2).tolist()
                    }
                }
            except Exception as e:
                categorical_insights[col] = {
                    'error': f"Could not analyze: {str(e)}"
                }
        report_data['categorical_insights'] = categorical_insights
    else:
        report_data['categorical_insights'] = {
            'message': 'No categorical columns found'
        }

    # 6. Visualizations
    visualizations = {}
    if numeric_cols:
        # Set matplotlib backend for non-interactive plotting. Only done when
        # there is something to plot, so profiling a DataFrame without numeric
        # columns leaves the caller's backend untouched.
        matplotlib.use('Agg')

        # 6a. Histograms for numeric columns
        fig_hist = None
        try:
            # Calculate subplot grid (at most 3 plots per row)
            n_plots = len(numeric_cols)
            n_cols_plot = min(3, n_plots)
            n_rows_plot = (n_plots + n_cols_plot - 1) // n_cols_plot
            # 3 inches of height per subplot row, with a 4 inch minimum
            fig_hist = plt.figure(figsize=(14, max(4, 3 * n_rows_plot)))
            for idx, col in enumerate(numeric_cols, 1):
                plt.subplot(n_rows_plot, n_cols_plot, idx)
                # Filter out NaN values
                data = df[col].dropna()
                if len(data) > 0:
                    plt.hist(data, bins='auto', alpha=0.7, color='steelblue', edgecolor='black')
                    plt.xlabel(col)
                    plt.ylabel('Frequency')
                    plt.title(f'Distribution of {col}')
                    plt.grid(True, alpha=0.3)
                    # Add mean line
                    mean_val = data.mean()
                    plt.axvline(mean_val, color='red', linestyle='--', linewidth=2,
                                label=f'Mean: {mean_val:.2f}')
                    plt.legend()
                else:
                    plt.text(0.5, 0.5, 'No data', ha='center', va='center')
                    plt.title(f'{col} (No data)')
            plt.tight_layout()
            visualizations['histograms'] = fig_hist
        except Exception as e:
            print(f"Warning: Could not create histograms: {e}")
            visualizations['histograms'] = None
            # Close the partially drawn figure so it does not leak
            if fig_hist is not None:
                plt.close(fig_hist)

        # 6b. Correlation Heatmap (needs at least two numeric columns)
        if len(numeric_cols) >= 2:
            fig_corr = None
            try:
                fig_corr = plt.figure(figsize=(10, 8))
                # Calculate correlation matrix
                corr_matrix = df[numeric_cols].corr()
                # Create heatmap
                im = plt.imshow(corr_matrix, cmap='RdYlBu_r', vmin=-1, vmax=1, aspect='auto')
                plt.colorbar(im, label='Correlation')
                # Add labels
                tick_positions = range(len(numeric_cols))
                plt.xticks(tick_positions, numeric_cols, rotation=45, ha='right')
                plt.yticks(tick_positions, numeric_cols)
                # Add correlation values to each cell
                for i, row_values in enumerate(corr_matrix.to_numpy()):
                    for j, corr_value in enumerate(row_values):
                        plt.text(j, i, f'{corr_value:.2f}',
                                 ha='center', va='center', color='black', fontsize=10)
                plt.title('Correlation Heatmap', fontsize=14, pad=20)
                plt.tight_layout()
                visualizations['correlation_heatmap'] = fig_corr
            except Exception as e:
                print(f"Warning: Could not create correlation heatmap: {e}")
                visualizations['correlation_heatmap'] = None
                # Close the partially drawn figure so it does not leak
                if fig_corr is not None:
                    plt.close(fig_corr)
    report_data['visualizations'] = visualizations

    # 7. Generate output based on format
    if output_format == "dict":
        return report_data

    # output_format was validated above, so the only remaining option is "html"
    html_content = _generate_html_report(report_data, df)
    # delete=False: the path is handed back to the caller, so the file must
    # outlive this function.
    with tempfile.NamedTemporaryFile(
        mode='w',
        suffix='.html',
        delete=False,
        encoding='utf-8'
    ) as temp_file:
        temp_file.write(html_content)
    return temp_file.name
def _generate_html_report(report_data: Dict, df: pd.DataFrame) -> str:
    """
    Generate HTML report from report data.

    Sections are emitted in a fixed order: overview, data types, missing
    values, summary statistics, categorical insights, visualizations, footer.
    Sections whose key is absent from ``report_data`` (or whose DataFrame is
    empty) are skipped, except that the overview and missing-values headings
    are always written.

    Column names, category values and per-column error messages are
    HTML-escaped before being interpolated, because they come straight from
    the profiled data.

    Any matplotlib figures found under ``report_data['visualizations']`` are
    embedded as base64 PNG images and then closed.

    Args:
        report_data: Dictionary containing report components
        df: Original DataFrame (currently not used by this function)

    Returns:
        str: HTML content
    """
    # Function-scope import: only this helper needs it.
    from html import escape

    html_parts = []

    # HTML Header
    html_parts.append("""
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<title>edaflow Profile Report</title>
<style>
body {
font-family: 'Segoe UI', Tahoma, Geneva, Verdana, sans-serif;
margin: 0;
padding: 20px;
background-color: #f5f5f5;
}
.container {
max-width: 1200px;
margin: 0 auto;
background-color: white;
padding: 30px;
border-radius: 10px;
box-shadow: 0 2px 10px rgba(0,0,0,0.1);
}
h1 {
color: #2c3e50;
border-bottom: 3px solid #3498db;
padding-bottom: 10px;
}
h2 {
color: #34495e;
margin-top: 30px;
border-left: 4px solid #3498db;
padding-left: 15px;
}
h3 {
color: #7f8c8d;
margin-top: 20px;
}
table {
border-collapse: collapse;
width: 100%;
margin: 20px 0;
box-shadow: 0 1px 3px rgba(0,0,0,0.1);
}
th {
background-color: #3498db;
color: white;
text-align: left;
padding: 12px;
font-weight: 600;
}
td {
padding: 10px 12px;
border-bottom: 1px solid #ecf0f1;
}
tr:hover {
background-color: #f8f9fa;
}
.metric-card {
background-color: #ecf0f1;
padding: 15px;
border-radius: 5px;
margin: 10px 0;
}
.warning {
color: #e74c3c;
font-weight: bold;
}
.good {
color: #27ae60;
font-weight: bold;
}
.section {
margin: 30px 0;
}
img {
max-width: 100%;
height: auto;
margin: 20px 0;
border-radius: 5px;
box-shadow: 0 2px 8px rgba(0,0,0,0.1);
}
.footer {
margin-top: 50px;
padding-top: 20px;
border-top: 1px solid #ecf0f1;
text-align: center;
color: #95a5a6;
font-size: 14px;
}
.categorical-item {
background-color: #f8f9fa;
padding: 15px;
margin: 15px 0;
border-radius: 5px;
border-left: 3px solid #3498db;
}
</style>
</head>
<body>
<div class="container">
<h1>📊 edaflow Profile Report</h1>
<p style="color: #7f8c8d; font-size: 14px;">Generated using edaflow automated profiling</p>
""")

    # Overview Section
    html_parts.append("<div class='section'>")
    html_parts.append("<h2>🔍 Dataset Overview</h2>")
    if 'overview' in report_data and not report_data['overview'].empty:
        html_parts.append(report_data['overview'].to_html(index=False, classes=''))
    html_parts.append("</div>")

    # Data Types Section
    if 'data_types' in report_data and not report_data['data_types'].empty:
        html_parts.append("<div class='section'>")
        html_parts.append("<h2>📋 Data Types</h2>")
        html_parts.append(report_data['data_types'].to_html(index=False, classes=''))
        html_parts.append("</div>")

    # Missing Values Section
    html_parts.append("<div class='section'>")
    html_parts.append("<h2>❓ Missing Values Analysis</h2>")
    if 'missing_values' in report_data and not report_data['missing_values'].empty:
        html_parts.append(report_data['missing_values'].to_html(index=False, classes=''))
    else:
        html_parts.append("<p class='good'>✓ No missing values detected!</p>")
    html_parts.append("</div>")

    # Summary Statistics Section
    if 'summary_stats' in report_data and not report_data['summary_stats'].empty:
        html_parts.append("<div class='section'>")
        html_parts.append("<h2>📈 Summary Statistics (Numeric Columns)</h2>")
        html_parts.append(report_data['summary_stats'].to_html(classes=''))
        html_parts.append("</div>")

    # Categorical Insights Section
    if 'categorical_insights' in report_data:
        cat_insights = report_data['categorical_insights']
        # A top-level 'message' key marks "no categorical columns found"
        if isinstance(cat_insights, dict) and 'message' not in cat_insights:
            html_parts.append("<div class='section'>")
            html_parts.append("<h2>🏷️ Categorical Column Insights</h2>")
            for col_name, col_data in cat_insights.items():
                safe_name = escape(str(col_name))
                html_parts.append("<div class='categorical-item'>")
                html_parts.append(f"<h3>{safe_name}</h3>")
                if 'error' in col_data:
                    # Surface the failure instead of silently dropping the column
                    html_parts.append(f"<p class='warning'>{escape(str(col_data['error']))}</p>")
                    html_parts.append("</div>")
                    continue
                html_parts.append(f"<p><strong>Unique Values:</strong> {col_data.get('unique_values', 'N/A')}</p>")
                html_parts.append(f"<p><strong>Missing Values:</strong> {col_data.get('missing_values', 'N/A')}</p>")
                if 'top_categories' in col_data:
                    top_cats = col_data['top_categories']
                    html_parts.append("<p><strong>Top Categories:</strong></p>")
                    html_parts.append("<table><tr><th>Value</th><th>Count</th><th>Percentage</th></tr>")
                    for val, cnt, pct in zip(top_cats['values'], top_cats['counts'], top_cats['percentages']):
                        # Category values are raw data: escape before embedding
                        html_parts.append(f"<tr><td>{escape(str(val))}</td><td>{cnt}</td><td>{pct}%</td></tr>")
                    html_parts.append("</table>")
                html_parts.append("</div>")
            html_parts.append("</div>")

    # Visualizations Section
    if 'visualizations' in report_data:
        figures = report_data['visualizations']
        html_parts.append("<div class='section'>")
        html_parts.append("<h2>📊 Visualizations</h2>")
        # (key in report_data['visualizations'], section heading, <img> alt text)
        figure_specs = (
            ('histograms', 'Distribution Histograms', 'Histograms'),
            ('correlation_heatmap', 'Correlation Heatmap', 'Correlation Heatmap'),
        )
        for fig_key, heading, alt_text in figure_specs:
            fig = figures.get(fig_key)
            if fig is None:
                continue
            try:
                img_data = _fig_to_base64(fig)
            finally:
                # Release the figure even if PNG encoding fails
                plt.close(fig)
            html_parts.append(f"<h3>{heading}</h3>")
            html_parts.append(f'<img src="data:image/png;base64,{img_data}" alt="{alt_text}"/>')
        html_parts.append("</div>")

    # Footer
    html_parts.append("""
<div class="footer">
<p>Generated by <strong>edaflow</strong> - Exploratory Data Analysis Workflows</p>
<p>For more information, visit: <a href="https://edaflow.readthedocs.io">edaflow.readthedocs.io</a></p>
</div>
</div>
</body>
</html>
""")
    return "".join(html_parts)
def _fig_to_base64(fig) -> str:
    """
    Render a matplotlib figure as PNG and return it base64-encoded.

    The result is suitable for a ``data:image/png;base64,...`` URI in HTML.

    Args:
        fig: Matplotlib figure object (anything exposing ``savefig``)

    Returns:
        str: Base64 encoded image string
    """
    with BytesIO() as png_stream:
        # Render into memory: 100 dpi, tight bounding box around the content
        fig.savefig(png_stream, format='png', dpi=100, bbox_inches='tight')
        encoded = base64.b64encode(png_stream.getvalue())
    return encoded.decode()
# Public API of this module: only profile_report is exported by
# `from ... import *`; the underscore-prefixed helpers are internal.
__all__ = ['profile_report']