"""
edaflow.ml.leaderboard - Model comparison and ranking functionality
This module provides utilities for comparing multiple models, ranking them
based on performance metrics, and displaying comprehensive leaderboards.
"""
import pandas as pd
import numpy as np
from typing import Dict, List, Tuple, Optional, Any, Union
from sklearn.metrics import (
accuracy_score, precision_score, recall_score, f1_score, roc_auc_score,
mean_squared_error, mean_absolute_error, r2_score
)
from sklearn.model_selection import cross_val_score
from sklearn.base import BaseEstimator
import matplotlib.pyplot as plt
import seaborn as sns
import time
import warnings
[docs]
def compare_models(
models: Dict[str, BaseEstimator],
X_train: Optional[pd.DataFrame] = None,
X_val: Optional[pd.DataFrame] = None,
X_test: Optional[pd.DataFrame] = None,
y_train: Optional[pd.Series] = None,
y_val: Optional[pd.Series] = None,
y_test: Optional[pd.Series] = None,
experiment_config: Optional[Dict[str, Any]] = None,
problem_type: str = 'auto',
metrics: Optional[List[str]] = None,
cv_folds: int = 5,
scoring: Optional[Union[str, List[str]]] = None,
verbose: bool = True
) -> pd.DataFrame:
"""
Compare multiple models across various performance metrics.
Parameters:
-----------
models : Dict[str, BaseEstimator]
Dictionary of model name -> fitted model pairs
X_train : pd.DataFrame, optional
Training features (can be provided via experiment_config)
X_val : pd.DataFrame, optional
Validation features (can be provided via experiment_config)
X_test : pd.DataFrame, optional
Test features for final evaluation
y_train : pd.Series, optional
Training target (can be provided via experiment_config)
y_val : pd.Series, optional
Validation target (can be provided via experiment_config)
y_test : pd.Series, optional
Test target for final evaluation
experiment_config : Dict[str, Any], optional
Complete experiment configuration from setup_ml_experiment()
If provided, will extract X_train, X_val, y_train, y_val from it
problem_type : str, default='auto'
'classification', 'regression', or 'auto' to detect
metrics : List[str], optional
Specific metrics to calculate. If None, uses default metrics
cv_folds : int, default=5
Number of cross-validation folds (if applicable)
scoring : str or List[str], optional
Scoring metric(s) to use for evaluation
verbose : bool, default=True
Whether to print comparison progress
Returns:
--------
pd.DataFrame
Comparison results with models as rows and metrics as columns
"""
# Extract data from experiment_config if provided
if experiment_config is not None:
X_train = experiment_config['X_train']
X_val = experiment_config['X_val']
y_train = experiment_config['y_train']
y_val = experiment_config['y_val']
# Extract test data if available in experiment config
X_test = experiment_config.get('X_test', X_test)
y_test = experiment_config.get('y_test', y_test)
# Use problem type from experiment if available
if problem_type == 'auto' and 'experiment_config' in experiment_config:
problem_type = experiment_config['experiment_config'].get('problem_type', 'auto')
if verbose:
exp_name = experiment_config.get('experiment_config', {}).get('experiment_name', 'Unknown')
print(f"📋 Using experiment: {exp_name}")
# Prioritize test data for evaluation if available, otherwise use validation data
eval_X = X_test if X_test is not None else X_val
eval_y = y_test if y_test is not None else y_val
eval_label = "test" if X_test is not None else "validation"
# Validate required data is available
if X_train is None or eval_X is None or y_train is None or eval_y is None:
raise ValueError("Must provide either (X_train, y_train, X_val/X_test, y_val/y_test) OR experiment_config")
if verbose:
print("🏆 Comparing Models...")
print(f"📊 Models to compare: {len(models)}")
print(f"📈 Training samples: {len(X_train)}")
print(f"🔍 Evaluation samples ({eval_label}): {len(eval_X)}")
if scoring is not None:
print(f"📏 Custom scoring: {scoring}")
if cv_folds > 1:
print(f"🔄 Cross-validation folds: {cv_folds}")
# Auto-detect problem type
if problem_type == 'auto':
problem_type = _detect_problem_type(y_train)
# Set default metrics based on problem type and scoring parameter
if metrics is None:
if scoring is not None:
# Use scoring parameter if provided
if isinstance(scoring, str):
metrics = [scoring]
else:
metrics = list(scoring)
else:
# Use default metrics
if problem_type == 'classification':
metrics = ['accuracy', 'precision', 'recall', 'f1', 'roc_auc']
else:
metrics = ['mse', 'mae', 'r2']
results = []
for model_name, model in models.items():
if verbose:
print(f"⚡ Evaluating {model_name}...")
start_time = time.time()
# Make predictions
try:
y_pred = model.predict(eval_X)
if problem_type == 'classification' and hasattr(model, 'predict_proba'):
y_proba = model.predict_proba(eval_X)
except Exception as e:
if verbose:
print(f"❌ Error with {model_name}: {str(e)}")
continue
# Calculate metrics
model_results = {'model': model_name}
for metric in metrics:
try:
if problem_type == 'classification':
score = _calculate_classification_metric(metric, eval_y, y_pred, y_proba if 'y_proba' in locals() else None)
else:
score = _calculate_regression_metric(metric, eval_y, y_pred)
model_results[metric] = score
except Exception as e:
if verbose:
print(f"⚠️ Could not calculate {metric} for {model_name}: {str(e)}")
model_results[metric] = np.nan
# Calculate training time (if available)
end_time = time.time()
model_results['eval_time_ms'] = (end_time - start_time) * 1000
# Add model complexity info if available
if hasattr(model, 'get_params'):
n_params = len(str(model.get_params()))
model_results['complexity'] = n_params
results.append(model_results)
# Convert to DataFrame
comparison_df = pd.DataFrame(results)
if verbose:
print(f"✅ Comparison complete! {len(comparison_df)} models evaluated.")
return comparison_df
[docs]
def rank_models(
comparison_df: pd.DataFrame,
primary_metric: str,
ascending: bool = False,
secondary_metrics: Optional[List[str]] = None,
weights: Optional[Dict[str, float]] = None,
return_format: str = 'dataframe'
) -> Union[pd.DataFrame, List[Dict]]:
"""
Rank models based on performance metrics.
Parameters:
-----------
comparison_df : pd.DataFrame
Results from compare_models()
primary_metric : str
Main metric to rank by
ascending : bool, default=False
Whether to sort in ascending order (True for error metrics)
secondary_metrics : List[str], optional
Additional metrics to consider for tie-breaking
weights : Dict[str, float], optional
Weights for weighted ranking across multiple metrics
return_format : str, default='dataframe'
Format to return: 'dataframe' or 'list'
Returns:
--------
Union[pd.DataFrame, List[Dict]]
If 'dataframe': Ranked models DataFrame
If 'list': List of dicts for easy access with pattern [0]["model_name"]
Examples:
---------
# DataFrame format (default)
ranked_df = rank_models(results, 'accuracy')
best_model = ranked_df.iloc[0]['model']
# List format for easier access
ranked_list = rank_models(results, 'accuracy', return_format='list')
best_model = ranked_list[0]["model_name"]
"""
ranked_df = comparison_df.copy()
# Validate primary metric exists
if primary_metric not in ranked_df.columns:
raise ValueError(f"Primary metric '{primary_metric}' not found in comparison results")
# Simple ranking by primary metric
if weights is None:
ranked_df = ranked_df.sort_values(
by=[primary_metric] + (secondary_metrics or []),
ascending=ascending
).reset_index(drop=True)
ranked_df['rank'] = range(1, len(ranked_df) + 1)
ranked_df['rank_score'] = ranked_df[primary_metric]
# Weighted ranking across multiple metrics
else:
# Normalize metrics to 0-1 scale
metric_columns = [col for col in weights.keys() if col in ranked_df.columns]
normalized_df = ranked_df[metric_columns].copy()
for metric in metric_columns:
col_values = ranked_df[metric].dropna()
if len(col_values) > 0:
min_val, max_val = col_values.min(), col_values.max()
if max_val > min_val:
# Normalize to 0-1, flip if lower is better (like error metrics)
if metric.lower() in ['mse', 'mae', 'rmse', 'error']:
normalized_df[metric] = 1 - (ranked_df[metric] - min_val) / (max_val - min_val)
else:
normalized_df[metric] = (ranked_df[metric] - min_val) / (max_val - min_val)
# Calculate weighted score
weighted_scores = []
for idx, row in normalized_df.iterrows():
score = sum(row[metric] * weights[metric] for metric in metric_columns if not pd.isna(row[metric]))
weighted_scores.append(score)
ranked_df['rank_score'] = weighted_scores
ranked_df = ranked_df.sort_values('rank_score', ascending=False).reset_index(drop=True)
ranked_df['rank'] = range(1, len(ranked_df) + 1)
# Return in requested format
if return_format == 'list':
# Convert to list of dictionaries for easy access
result_list = []
for _, row in ranked_df.iterrows():
model_dict = row.to_dict()
# Add model_name key for consistency with user's pattern
if 'model' in model_dict:
model_dict['model_name'] = model_dict['model']
result_list.append(model_dict)
return result_list
return ranked_df
[docs]
def display_leaderboard(
comparison_results: pd.DataFrame = None,
ranked_df: pd.DataFrame = None,
sort_by: str = None,
ascending: bool = False,
show_std: bool = False,
top_n: int = 10,
show_metrics: Optional[List[str]] = None,
highlight_best: bool = True,
figsize: Tuple[int, int] = (12, 8)
) -> None:
"""
Display a visual leaderboard of model performance.
Parameters:
-----------
comparison_results : pd.DataFrame, optional
Raw comparison results from compare_models()
ranked_df : pd.DataFrame, optional
Pre-ranked results (alternative to comparison_results)
sort_by : str, optional
Metric to sort by. If None, uses first numeric column
ascending : bool, default=False
Whether to sort in ascending order
show_std : bool, default=False
Whether to show standard deviation columns
top_n : int, default=10
Number of top models to display
show_metrics : List[str], optional
Specific metrics to show. If None, shows all numeric metrics
highlight_best : bool, default=True
Whether to highlight the best performing model
figsize : Tuple[int, int], default=(12, 8)
Figure size for the visualization
"""
# Handle input data
if comparison_results is not None:
display_df = comparison_results.copy()
# Sort by specified metric
if sort_by is not None and sort_by in display_df.columns:
display_df = display_df.sort_values(sort_by, ascending=ascending)
elif len(display_df.select_dtypes(include=[np.number]).columns) > 0:
# Sort by first numeric column if sort_by not specified
numeric_cols = display_df.select_dtypes(include=[np.number]).columns
display_df = display_df.sort_values(numeric_cols[0], ascending=ascending)
elif ranked_df is not None:
display_df = ranked_df.copy()
else:
raise ValueError("Must provide either comparison_results or ranked_df")
# Filter out std columns if not requested
if not show_std:
std_cols = [col for col in display_df.columns if '_std' in col.lower() or 'std_' in col.lower()]
display_df = display_df.drop(columns=std_cols, errors='ignore')
# Filter to specific metrics if requested
if show_metrics is not None:
keep_cols = ['model'] + [col for col in display_df.columns if any(metric in col.lower() for metric in show_metrics)]
display_df = display_df[keep_cols]
print("🏆 MODEL LEADERBOARD 🏆")
print("=" * 50)
# Take top_n results
display_df = display_df.head(top_n).copy()
# Highlight best model if requested
if highlight_best and len(display_df) > 0:
best_model = display_df.iloc[0]['model']
print(f"🥇 Best Model: {best_model}")
print()
# Display the results
print(display_df.to_string(index=False))
print()
# Create simple visualization if matplotlib is available
try:
import matplotlib.pyplot as plt
# Get numeric columns for plotting
numeric_cols = display_df.select_dtypes(include=[np.number]).columns.tolist()
if len(numeric_cols) > 0:
# Create a simple bar plot for the first metric
plt.figure(figsize=figsize)
first_metric = numeric_cols[0]
models = display_df['model'].tolist()
scores = display_df[first_metric].tolist()
bars = plt.barh(range(len(models)), scores)
plt.yticks(range(len(models)), models)
plt.xlabel(first_metric.title())
plt.title(f'Model Comparison - {first_metric.title()}')
# Highlight best model
if highlight_best and len(bars) > 0:
bars[0].set_color('gold')
plt.tight_layout()
plt.show()
except ImportError:
print("📊 Matplotlib not available for visualization")
return display_df
def _detect_problem_type(y):
"""Detect if problem is classification or regression"""
if hasattr(y, 'dtype'):
if y.dtype.name in ['object', 'category', 'bool']:
return 'classification'
elif len(np.unique(y)) <= 10: # Likely categorical
return 'classification'
else:
return 'regression'
else:
unique_values = len(set(y))
if unique_values <= 10:
return 'classification'
else:
return 'regression'
[docs]
def export_model_comparison(
comparison_df: pd.DataFrame,
filepath: str,
include_config: bool = True,
format: str = 'csv'
) -> None:
"""
Export model comparison results to file.
Parameters:
-----------
comparison_df : pd.DataFrame
Comparison results to export
filepath : str
Path where to save the file
include_config : bool, default=True
Whether to include experiment configuration
format : str, default='csv'
Export format ('csv', 'excel', 'json')
"""
print(f"💾 Exporting comparison results to {filepath}...")
if format.lower() == 'csv':
comparison_df.to_csv(filepath, index=False)
elif format.lower() == 'excel':
comparison_df.to_excel(filepath, index=False)
elif format.lower() == 'json':
comparison_df.to_json(filepath, indent=2)
else:
raise ValueError(f"Unsupported format: {format}")
print("✅ Export completed!")
def _calculate_classification_metric(metric: str, y_true: pd.Series, y_pred: np.ndarray, y_proba: Optional[np.ndarray] = None) -> float:
"""Calculate classification metric."""
metric = metric.lower()
if metric == 'accuracy':
return accuracy_score(y_true, y_pred)
elif metric == 'precision':
return precision_score(y_true, y_pred, average='weighted', zero_division=0)
elif metric == 'recall':
return recall_score(y_true, y_pred, average='weighted', zero_division=0)
elif metric == 'f1':
return f1_score(y_true, y_pred, average='weighted', zero_division=0)
elif metric == 'roc_auc':
if y_proba is not None and len(np.unique(y_true)) == 2:
return roc_auc_score(y_true, y_proba[:, 1])
else:
return np.nan
else:
raise ValueError(f"Unknown classification metric: {metric}")
def _calculate_regression_metric(metric: str, y_true: pd.Series, y_pred: np.ndarray) -> float:
"""Calculate regression metric."""
metric = metric.lower()
if metric == 'mse':
return mean_squared_error(y_true, y_pred)
elif metric == 'mae':
return mean_absolute_error(y_true, y_pred)
elif metric == 'rmse':
return np.sqrt(mean_squared_error(y_true, y_pred))
elif metric == 'r2':
return r2_score(y_true, y_pred)
else:
raise ValueError(f"Unknown regression metric: {metric}")