"""
edaflow.ml.config - Configuration and setup utilities for ML experiments
This module provides utilities for setting up machine learning experiments,
configuring model pipelines, and validating data for ML workflows.
"""
import pandas as pd
import numpy as np
from typing import Dict, List, Tuple, Optional, Any, Union
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
import warnings
[docs]
def setup_ml_experiment(
data: Optional[pd.DataFrame] = None,
target_column: Optional[str] = None,
test_size: float = 0.2,
validation_size: Optional[float] = None,
random_state: int = 42,
stratify: bool = True,
verbose: bool = True,
experiment_name: Optional[str] = None,
# Alternative sklearn-style parameters
X: Optional[pd.DataFrame] = None,
y: Optional[pd.Series] = None,
# Alternative parameter names for compatibility
val_size: Optional[float] = None,
primary_metric: Optional[str] = None
) -> Dict[str, Any]:
"""
Set up a complete ML experiment with train/validation/test splits.
This function supports two calling patterns:
1. DataFrame with target column: setup_ml_experiment(data, target_column)
2. Sklearn-style: setup_ml_experiment(X=X, y=y)
Parameters:
-----------
...existing parameters...
primary_metric : str, optional
The main metric used for model selection and ranking (e.g., 'roc_auc', 'f1', 'accuracy', 'r2').
This will be stored in the config for downstream use.
-----------
data : pd.DataFrame, optional
The complete dataset including features and target
target_column : str, optional
Name of the target variable column (required if using data parameter)
test_size : float, default=0.2
Proportion of data to use for testing
validation_size : float, optional
Proportion of training data to use for validation (default=0.2)
random_state : int, default=42
Random seed for reproducibility
stratify : bool, default=True
Whether to stratify the splits (for classification)
verbose : bool, default=True
Whether to print experiment setup details
experiment_name : str, optional
Name for the experiment (default='ml_experiment')
X : pd.DataFrame, optional
Feature matrix (alternative to data + target_column pattern)
y : pd.Series, optional
Target vector (alternative to data + target_column pattern)
val_size : float, optional
Alternative name for validation_size (for compatibility)
Returns:
--------
Dict[str, Any]
Dictionary containing X_train, X_val, X_test, y_train, y_val, y_test,
feature_names, target_name, and experiment_config
Examples:
---------
# Method 1: DataFrame with target column (recommended)
>>> experiment = ml.setup_ml_experiment(df, target_column='target')
# Method 2: Sklearn-style (also supported)
>>> X = df.drop('target', axis=1)
>>> y = df['target']
>>> experiment = ml.setup_ml_experiment(X=X, y=y)
"""
# Handle parameter compatibility
# Use val_size if provided, otherwise use validation_size, default to 0.2
if val_size is not None:
validation_size = val_size
elif validation_size is None:
validation_size = 0.2
# Set default experiment name if not provided
if experiment_name is None:
experiment_name = "ml_experiment"
# Handle different calling patterns
if data is not None and target_column is not None:
# Standard edaflow pattern: DataFrame + target_column
if verbose:
print("๐งช Setting up ML Experiment...")
print(f"๐ Dataset shape: {data.shape}")
print(f"๐ฏ Target column: {target_column}")
# Validate target column exists
if target_column not in data.columns:
raise ValueError(f"Target column '{target_column}' not found in dataset")
# Separate features and target
X = data.drop(columns=[target_column])
y = data[target_column]
target_name = target_column
elif X is not None and y is not None:
# Sklearn-style pattern: separate X and y
if not isinstance(X, pd.DataFrame):
raise TypeError("X must be a pandas DataFrame")
if not isinstance(y, (pd.Series, pd.DataFrame)):
raise TypeError("y must be a pandas Series or DataFrame")
# Convert y to Series if it's a DataFrame
if isinstance(y, pd.DataFrame):
if y.shape[1] != 1:
raise ValueError("y DataFrame must have exactly one column")
target_name = y.columns[0]
y = y.iloc[:, 0]
else:
target_name = y.name if y.name else 'target'
if verbose:
print("๐งช Setting up ML Experiment (sklearn-style)...")
print(f"๐ Features shape: {X.shape}")
print(f"๐ Target shape: {y.shape}")
print(f"๐ฏ Target name: {target_name}")
else:
raise ValueError(
"Must provide either:\n"
"1. data and target_column parameters, or\n"
"2. X and y parameters"
)
# Determine problem type
is_classification = _is_classification_problem(y)
problem_type = "classification" if is_classification else "regression"
if verbose:
print(f"๐ Problem type: {problem_type}")
print(f"๐ Features: {len(X.columns)}")
if is_classification:
print(f"๐ท๏ธ Classes: {len(y.unique())} unique values")
else:
print(f"๐ Target range: [{y.min():.3f}, {y.max():.3f}]")
# Configure stratification
stratify_param = y if (stratify and is_classification) else None
# First split: separate test set
X_temp, X_test, y_temp, y_test = train_test_split(
X, y,
test_size=test_size,
random_state=random_state,
stratify=stratify_param
)
# Second split: training and validation from remaining data
val_size_adjusted = validation_size / (1 - test_size)
stratify_temp = y_temp if (stratify and is_classification) else None
X_train, X_val, y_train, y_val = train_test_split(
X_temp, y_temp,
test_size=val_size_adjusted,
random_state=random_state,
stratify=stratify_temp
)
if verbose:
print(f"โ
Train set: {X_train.shape[0]} samples")
print(f"โ
Validation set: {X_val.shape[0]} samples")
print(f"โ
Test set: {X_test.shape[0]} samples")
# Create experiment configuration
experiment_config = {
'experiment_name': experiment_name,
'problem_type': problem_type,
'target_column': target_name,
'feature_names': list(X.columns),
'n_classes': len(y.unique()) if is_classification else None,
'test_size': test_size,
'validation_size': validation_size,
'random_state': random_state,
'stratified': stratify and is_classification,
'total_samples': len(X) + len(y),
'train_samples': len(X_train),
'val_samples': len(X_val),
'test_samples': len(X_test),
'primary_metric': primary_metric
}
return {
'X_train': X_train,
'X_val': X_val,
'X_test': X_test,
'y_train': y_train,
'y_val': y_val,
'y_test': y_test,
'feature_names': list(X.columns),
'target_name': target_name,
'experiment_name': experiment_name, # Add experiment_name to top level for easy access
'primary_metric': primary_metric,
'experiment_config': experiment_config
}
[docs]
def validate_ml_data(
experiment_data: Optional[Dict[str, Any]] = None,
check_missing: bool = True,
check_duplicates: bool = True,
check_outliers: bool = True,
verbose: bool = True,
# Alternative sklearn-style parameters
X: Optional[pd.DataFrame] = None,
y: Optional[pd.Series] = None,
# Additional parameters for direct X,y usage
check_cardinality: bool = True,
check_distributions: bool = True
) -> Dict[str, Any]:
"""
Validate data quality for ML experiments.
This function supports two calling patterns:
1. Experiment config: validate_ml_data(experiment_config)
2. Sklearn-style: validate_ml_data(X=X_train, y=y_train)
Parameters:
-----------
experiment_data : Dict[str, Any], optional
Dictionary from setup_ml_experiment containing splits
check_missing : bool, default=True
Whether to check for missing values
check_duplicates : bool, default=True
Whether to check for duplicate rows
check_outliers : bool, default=True
Whether to check for outliers
verbose : bool, default=True
Whether to print validation details
X : pd.DataFrame, optional
Feature data (alternative to experiment_data)
y : pd.Series, optional
Target data (alternative to experiment_data)
check_cardinality : bool, default=True
Whether to check feature cardinality
check_distributions : bool, default=True
Whether to check feature distributions
Returns:
--------
Dict[str, Any]
Dictionary containing validation results and recommendations
"""
if verbose:
print("๐ Validating ML Data Quality...")
validation_results = {}
recommendations = []
# Handle different calling patterns
if experiment_data is not None:
# Pattern 1: Using experiment config
X_train = experiment_data['X_train']
X_val = experiment_data.get('X_val')
X_test = experiment_data.get('X_test')
y_train = experiment_data['y_train']
if verbose:
print(f"๐ Validating experiment: {experiment_data.get('experiment_config', {}).get('experiment_name', 'Unknown')}")
elif X is not None and y is not None:
# Pattern 2: Using X, y directly (sklearn-style)
X_train = X
X_val = None
X_test = None
y_train = y
if verbose:
print("๐ Validating provided X, y data")
else:
raise ValueError("Must provide either experiment_data OR both X and y parameters")
# Get primary data for validation
primary_X = X_train
primary_y = y_train
# Check missing values
if check_missing:
train_missing = primary_X.isnull().sum()
missing_cols = train_missing[train_missing > 0]
validation_results['missing_values'] = {
'total_missing': train_missing.sum(),
'columns_with_missing': len(missing_cols),
'missing_percentages': (missing_cols / len(primary_X) * 100).to_dict()
}
if len(missing_cols) > 0:
recommendations.append(f"โ ๏ธ {len(missing_cols)} columns have missing values")
if verbose:
print(f"โ Missing values found in {len(missing_cols)} columns")
# Check for class imbalance (classification only)
# Determine problem type
if experiment_data is not None:
problem_type = experiment_data.get('experiment_config', {}).get('problem_type', 'auto')
else:
# Auto-detect for X, y pattern
if len(primary_y.unique()) <= 20:
problem_type = 'classification'
else:
problem_type = 'regression'
if problem_type == 'classification':
class_counts = primary_y.value_counts()
class_ratios = class_counts / len(primary_y)
min_class_ratio = class_ratios.min()
validation_results['class_balance'] = {
'class_counts': class_counts.to_dict(),
'class_ratios': class_ratios.to_dict(),
'min_class_ratio': min_class_ratio,
'is_imbalanced': min_class_ratio < 0.1
}
if min_class_ratio < 0.1:
recommendations.append(f"โ ๏ธ Class imbalance detected (min class: {min_class_ratio:.1%})")
if verbose:
print(f"โ๏ธ Class imbalance: smallest class is {min_class_ratio:.1%}")
# Check duplicates
if check_duplicates:
train_duplicates = primary_X.duplicated().sum()
validation_results['duplicates'] = {
'duplicate_rows': train_duplicates,
'duplicate_percentage': (train_duplicates / len(primary_X) * 100)
}
if train_duplicates > 0:
recommendations.append(f"โ ๏ธ {train_duplicates} duplicate rows found")
if verbose:
print(f"๐ {train_duplicates} duplicate rows detected")
# Additional checks for X, y pattern
if X is not None and y is not None:
# Check cardinality
if check_cardinality:
high_cardinality_cols = []
for col in primary_X.columns:
if primary_X[col].dtype == 'object':
unique_count = primary_X[col].nunique()
if unique_count > 50:
high_cardinality_cols.append((col, unique_count))
validation_results['cardinality'] = {
'high_cardinality_columns': high_cardinality_cols
}
if high_cardinality_cols:
recommendations.append(f"โ ๏ธ {len(high_cardinality_cols)} high-cardinality columns found")
if verbose:
print(f"๐ข High cardinality columns: {len(high_cardinality_cols)}")
# Check distributions
if check_distributions:
skewed_cols = []
for col in primary_X.select_dtypes(include=[np.number]).columns:
skewness = abs(primary_X[col].skew())
if skewness > 2.0:
skewed_cols.append((col, skewness))
validation_results['distributions'] = {
'skewed_columns': skewed_cols
}
if skewed_cols:
recommendations.append(f"โ ๏ธ {len(skewed_cols)} highly skewed columns found")
if verbose:
print(f"๐ Skewed distributions: {len(skewed_cols)}")
# Data quality score
quality_score = 100.0
if validation_results.get('missing_values', {}).get('total_missing', 0) > 0:
quality_score -= 20
if validation_results.get('duplicates', {}).get('duplicate_rows', 0) > 0:
quality_score -= 10
if validation_results.get('class_balance', {}).get('is_imbalanced', False):
quality_score -= 15
validation_results['quality_score'] = quality_score
validation_results['recommendations'] = recommendations
if verbose:
print(f"๐ Data Quality Score: {quality_score:.1f}/100")
if recommendations:
print("๐ Recommendations:")
for rec in recommendations:
print(f" {rec}")
else:
print("โ
No major data quality issues detected!")
return validation_results
def _is_classification_problem(y: pd.Series) -> bool:
"""
Determine if the target variable represents a classification problem.
Parameters:
-----------
y : pd.Series
Target variable
Returns:
--------
bool
True if classification, False if regression
"""
# Check data type
if y.dtype == 'object' or pd.api.types.is_categorical_dtype(y):
return True
# Check if all values are integers and relatively few unique values
if y.dtype in ['int64', 'int32']:
unique_ratio = len(y.unique()) / len(y)
if unique_ratio < 0.05 or len(y.unique()) <= 20:
return True
# Check for boolean values
if y.dtype == 'bool':
return True
# Default to regression for continuous values
return False