Source code for edaflow.ml.config

"""
edaflow.ml.config - Configuration and setup utilities for ML experiments

This module provides utilities for setting up machine learning experiments,
configuring model pipelines, and validating data for ML workflows.
"""

import pandas as pd
import numpy as np
from typing import Dict, List, Tuple, Optional, Any, Union
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
import warnings


[docs] def setup_ml_experiment( data: Optional[pd.DataFrame] = None, target_column: Optional[str] = None, test_size: float = 0.2, validation_size: Optional[float] = None, random_state: int = 42, stratify: bool = True, verbose: bool = True, experiment_name: Optional[str] = None, # Alternative sklearn-style parameters X: Optional[pd.DataFrame] = None, y: Optional[pd.Series] = None, # Alternative parameter names for compatibility val_size: Optional[float] = None, primary_metric: Optional[str] = None ) -> Dict[str, Any]: """ Set up a complete ML experiment with train/validation/test splits. This function supports two calling patterns: 1. DataFrame with target column: setup_ml_experiment(data, target_column) 2. Sklearn-style: setup_ml_experiment(X=X, y=y) Parameters: ----------- ...existing parameters... primary_metric : str, optional The main metric used for model selection and ranking (e.g., 'roc_auc', 'f1', 'accuracy', 'r2'). This will be stored in the config for downstream use. ----------- data : pd.DataFrame, optional The complete dataset including features and target target_column : str, optional Name of the target variable column (required if using data parameter) test_size : float, default=0.2 Proportion of data to use for testing validation_size : float, optional Proportion of training data to use for validation (default=0.2) random_state : int, default=42 Random seed for reproducibility stratify : bool, default=True Whether to stratify the splits (for classification) verbose : bool, default=True Whether to print experiment setup details experiment_name : str, optional Name for the experiment (default='ml_experiment') X : pd.DataFrame, optional Feature matrix (alternative to data + target_column pattern) y : pd.Series, optional Target vector (alternative to data + target_column pattern) val_size : float, optional Alternative name for validation_size (for compatibility) Returns: -------- Dict[str, Any] Dictionary containing X_train, X_val, X_test, y_train, y_val, y_test, feature_names, target_name, and experiment_config Examples: --------- # Method 1: DataFrame with target column (recommended) >>> experiment = ml.setup_ml_experiment(df, target_column='target') # Method 2: Sklearn-style (also supported) >>> X = df.drop('target', axis=1) >>> y = df['target'] >>> experiment = ml.setup_ml_experiment(X=X, y=y) """ # Handle parameter compatibility # Use val_size if provided, otherwise use validation_size, default to 0.2 if val_size is not None: validation_size = val_size elif validation_size is None: validation_size = 0.2 # Set default experiment name if not provided if experiment_name is None: experiment_name = "ml_experiment" # Handle different calling patterns if data is not None and target_column is not None: # Standard edaflow pattern: DataFrame + target_column if verbose: print("๐Ÿงช Setting up ML Experiment...") print(f"๐Ÿ“Š Dataset shape: {data.shape}") print(f"๐ŸŽฏ Target column: {target_column}") # Validate target column exists if target_column not in data.columns: raise ValueError(f"Target column '{target_column}' not found in dataset") # Separate features and target X = data.drop(columns=[target_column]) y = data[target_column] target_name = target_column elif X is not None and y is not None: # Sklearn-style pattern: separate X and y if not isinstance(X, pd.DataFrame): raise TypeError("X must be a pandas DataFrame") if not isinstance(y, (pd.Series, pd.DataFrame)): raise TypeError("y must be a pandas Series or DataFrame") # Convert y to Series if it's a DataFrame if isinstance(y, pd.DataFrame): if y.shape[1] != 1: raise ValueError("y DataFrame must have exactly one column") target_name = y.columns[0] y = y.iloc[:, 0] else: target_name = y.name if y.name else 'target' if verbose: print("๐Ÿงช Setting up ML Experiment (sklearn-style)...") print(f"๐Ÿ“Š Features shape: {X.shape}") print(f"๐Ÿ“Š Target shape: {y.shape}") print(f"๐ŸŽฏ Target name: {target_name}") else: raise ValueError( "Must provide either:\n" "1. data and target_column parameters, or\n" "2. X and y parameters" ) # Determine problem type is_classification = _is_classification_problem(y) problem_type = "classification" if is_classification else "regression" if verbose: print(f"๐Ÿ“ˆ Problem type: {problem_type}") print(f"๐Ÿ“‹ Features: {len(X.columns)}") if is_classification: print(f"๐Ÿท๏ธ Classes: {len(y.unique())} unique values") else: print(f"๐Ÿ“Š Target range: [{y.min():.3f}, {y.max():.3f}]") # Configure stratification stratify_param = y if (stratify and is_classification) else None # First split: separate test set X_temp, X_test, y_temp, y_test = train_test_split( X, y, test_size=test_size, random_state=random_state, stratify=stratify_param ) # Second split: training and validation from remaining data val_size_adjusted = validation_size / (1 - test_size) stratify_temp = y_temp if (stratify and is_classification) else None X_train, X_val, y_train, y_val = train_test_split( X_temp, y_temp, test_size=val_size_adjusted, random_state=random_state, stratify=stratify_temp ) if verbose: print(f"โœ… Train set: {X_train.shape[0]} samples") print(f"โœ… Validation set: {X_val.shape[0]} samples") print(f"โœ… Test set: {X_test.shape[0]} samples") # Create experiment configuration experiment_config = { 'experiment_name': experiment_name, 'problem_type': problem_type, 'target_column': target_name, 'feature_names': list(X.columns), 'n_classes': len(y.unique()) if is_classification else None, 'test_size': test_size, 'validation_size': validation_size, 'random_state': random_state, 'stratified': stratify and is_classification, 'total_samples': len(X) + len(y), 'train_samples': len(X_train), 'val_samples': len(X_val), 'test_samples': len(X_test), 'primary_metric': primary_metric } return { 'X_train': X_train, 'X_val': X_val, 'X_test': X_test, 'y_train': y_train, 'y_val': y_val, 'y_test': y_test, 'feature_names': list(X.columns), 'target_name': target_name, 'experiment_name': experiment_name, # Add experiment_name to top level for easy access 'primary_metric': primary_metric, 'experiment_config': experiment_config }
[docs] def configure_model_pipeline( data_config: Dict[str, Any], numerical_strategy: str = 'standard', categorical_strategy: str = 'onehot', handle_missing: str = 'drop', verbose: bool = True ) -> Pipeline: """ Configure a preprocessing pipeline for the ML experiment. Parameters: ----------- data_config : Dict[str, Any] Configuration dictionary from setup_ml_experiment numerical_strategy : str, default='standard' Scaling strategy for numerical features ('standard', 'minmax', 'robust', 'none') categorical_strategy : str, default='onehot' Encoding strategy for categorical features ('onehot', 'target', 'none') handle_missing : str, default='drop' Missing value strategy ('drop', 'impute', 'flag') verbose : bool, default=True Whether to print pipeline configuration details Returns: -------- Pipeline Configured sklearn Pipeline for preprocessing """ if verbose: print("๐Ÿ”ง Configuring Model Pipeline...") print(f"๐Ÿ“Š Numerical strategy: {numerical_strategy}") print(f"๐Ÿท๏ธ Categorical strategy: {categorical_strategy}") print(f"โ“ Missing values: {handle_missing}") # Get sample data to analyze column types X_sample = data_config['X_train'] # Identify numerical and categorical columns numerical_columns = X_sample.select_dtypes(include=[np.number]).columns.tolist() categorical_columns = X_sample.select_dtypes(include=['object', 'category']).columns.tolist() if verbose: print(f"๐Ÿ“ˆ Numerical columns: {len(numerical_columns)}") print(f"๐Ÿ“‹ Categorical columns: {len(categorical_columns)}") # Configure transformers transformers = [] # Numerical preprocessing if numerical_columns and numerical_strategy != 'none': if numerical_strategy == 'standard': num_transformer = StandardScaler() elif numerical_strategy == 'minmax': num_transformer = MinMaxScaler() elif numerical_strategy == 'robust': num_transformer = RobustScaler() else: raise ValueError(f"Unknown numerical strategy: {numerical_strategy}") transformers.append(('num', num_transformer, numerical_columns)) # Categorical preprocessing if categorical_columns and categorical_strategy != 'none': if categorical_strategy == 'onehot': from sklearn.preprocessing import OneHotEncoder cat_transformer = OneHotEncoder(drop='first', sparse_output=False) transformers.append(('cat', cat_transformer, categorical_columns)) elif categorical_strategy == 'target': warnings.warn("Target encoding not implemented yet. Using OneHot encoding.") from sklearn.preprocessing import OneHotEncoder cat_transformer = OneHotEncoder(drop='first', sparse_output=False) transformers.append(('cat', cat_transformer, categorical_columns)) # Create column transformer if transformers: preprocessor = ColumnTransformer( transformers=transformers, remainder='passthrough' # Keep other columns as-is ) else: # No transformation needed from sklearn.preprocessing import FunctionTransformer preprocessor = FunctionTransformer(validate=False) # Create pipeline pipeline = Pipeline([ ('preprocessor', preprocessor) ]) if verbose: print("โœ… Pipeline configured successfully!") return pipeline
[docs] def validate_ml_data( experiment_data: Optional[Dict[str, Any]] = None, check_missing: bool = True, check_duplicates: bool = True, check_outliers: bool = True, verbose: bool = True, # Alternative sklearn-style parameters X: Optional[pd.DataFrame] = None, y: Optional[pd.Series] = None, # Additional parameters for direct X,y usage check_cardinality: bool = True, check_distributions: bool = True ) -> Dict[str, Any]: """ Validate data quality for ML experiments. This function supports two calling patterns: 1. Experiment config: validate_ml_data(experiment_config) 2. Sklearn-style: validate_ml_data(X=X_train, y=y_train) Parameters: ----------- experiment_data : Dict[str, Any], optional Dictionary from setup_ml_experiment containing splits check_missing : bool, default=True Whether to check for missing values check_duplicates : bool, default=True Whether to check for duplicate rows check_outliers : bool, default=True Whether to check for outliers verbose : bool, default=True Whether to print validation details X : pd.DataFrame, optional Feature data (alternative to experiment_data) y : pd.Series, optional Target data (alternative to experiment_data) check_cardinality : bool, default=True Whether to check feature cardinality check_distributions : bool, default=True Whether to check feature distributions Returns: -------- Dict[str, Any] Dictionary containing validation results and recommendations """ if verbose: print("๐Ÿ” Validating ML Data Quality...") validation_results = {} recommendations = [] # Handle different calling patterns if experiment_data is not None: # Pattern 1: Using experiment config X_train = experiment_data['X_train'] X_val = experiment_data.get('X_val') X_test = experiment_data.get('X_test') y_train = experiment_data['y_train'] if verbose: print(f"๐Ÿ“Š Validating experiment: {experiment_data.get('experiment_config', {}).get('experiment_name', 'Unknown')}") elif X is not None and y is not None: # Pattern 2: Using X, y directly (sklearn-style) X_train = X X_val = None X_test = None y_train = y if verbose: print("๐Ÿ“Š Validating provided X, y data") else: raise ValueError("Must provide either experiment_data OR both X and y parameters") # Get primary data for validation primary_X = X_train primary_y = y_train # Check missing values if check_missing: train_missing = primary_X.isnull().sum() missing_cols = train_missing[train_missing > 0] validation_results['missing_values'] = { 'total_missing': train_missing.sum(), 'columns_with_missing': len(missing_cols), 'missing_percentages': (missing_cols / len(primary_X) * 100).to_dict() } if len(missing_cols) > 0: recommendations.append(f"โš ๏ธ {len(missing_cols)} columns have missing values") if verbose: print(f"โ“ Missing values found in {len(missing_cols)} columns") # Check for class imbalance (classification only) # Determine problem type if experiment_data is not None: problem_type = experiment_data.get('experiment_config', {}).get('problem_type', 'auto') else: # Auto-detect for X, y pattern if len(primary_y.unique()) <= 20: problem_type = 'classification' else: problem_type = 'regression' if problem_type == 'classification': class_counts = primary_y.value_counts() class_ratios = class_counts / len(primary_y) min_class_ratio = class_ratios.min() validation_results['class_balance'] = { 'class_counts': class_counts.to_dict(), 'class_ratios': class_ratios.to_dict(), 'min_class_ratio': min_class_ratio, 'is_imbalanced': min_class_ratio < 0.1 } if min_class_ratio < 0.1: recommendations.append(f"โš ๏ธ Class imbalance detected (min class: {min_class_ratio:.1%})") if verbose: print(f"โš–๏ธ Class imbalance: smallest class is {min_class_ratio:.1%}") # Check duplicates if check_duplicates: train_duplicates = primary_X.duplicated().sum() validation_results['duplicates'] = { 'duplicate_rows': train_duplicates, 'duplicate_percentage': (train_duplicates / len(primary_X) * 100) } if train_duplicates > 0: recommendations.append(f"โš ๏ธ {train_duplicates} duplicate rows found") if verbose: print(f"๐Ÿ”„ {train_duplicates} duplicate rows detected") # Additional checks for X, y pattern if X is not None and y is not None: # Check cardinality if check_cardinality: high_cardinality_cols = [] for col in primary_X.columns: if primary_X[col].dtype == 'object': unique_count = primary_X[col].nunique() if unique_count > 50: high_cardinality_cols.append((col, unique_count)) validation_results['cardinality'] = { 'high_cardinality_columns': high_cardinality_cols } if high_cardinality_cols: recommendations.append(f"โš ๏ธ {len(high_cardinality_cols)} high-cardinality columns found") if verbose: print(f"๐Ÿ”ข High cardinality columns: {len(high_cardinality_cols)}") # Check distributions if check_distributions: skewed_cols = [] for col in primary_X.select_dtypes(include=[np.number]).columns: skewness = abs(primary_X[col].skew()) if skewness > 2.0: skewed_cols.append((col, skewness)) validation_results['distributions'] = { 'skewed_columns': skewed_cols } if skewed_cols: recommendations.append(f"โš ๏ธ {len(skewed_cols)} highly skewed columns found") if verbose: print(f"๐Ÿ“Š Skewed distributions: {len(skewed_cols)}") # Data quality score quality_score = 100.0 if validation_results.get('missing_values', {}).get('total_missing', 0) > 0: quality_score -= 20 if validation_results.get('duplicates', {}).get('duplicate_rows', 0) > 0: quality_score -= 10 if validation_results.get('class_balance', {}).get('is_imbalanced', False): quality_score -= 15 validation_results['quality_score'] = quality_score validation_results['recommendations'] = recommendations if verbose: print(f"๐Ÿ“Š Data Quality Score: {quality_score:.1f}/100") if recommendations: print("๐Ÿ“‹ Recommendations:") for rec in recommendations: print(f" {rec}") else: print("โœ… No major data quality issues detected!") return validation_results
def _is_classification_problem(y: pd.Series) -> bool: """ Determine if the target variable represents a classification problem. Parameters: ----------- y : pd.Series Target variable Returns: -------- bool True if classification, False if regression """ # Check data type if y.dtype == 'object' or pd.api.types.is_categorical_dtype(y): return True # Check if all values are integers and relatively few unique values if y.dtype in ['int64', 'int32']: unique_ratio = len(y.unique()) / len(y) if unique_ratio < 0.05 or len(y.unique()) <= 20: return True # Check for boolean values if y.dtype == 'bool': return True # Default to regression for continuous values return False