# Source code for edaflow.ml.artifacts

"""
edaflow.ml.artifacts - Model persistence and experiment tracking

This module provides utilities for saving and loading model artifacts,
tracking experiments, and generating comprehensive model reports.
"""

import pandas as pd
import numpy as np
from typing import Dict, List, Tuple, Optional, Any, Union
import pickle
import joblib
import json
import os
from datetime import datetime
from pathlib import Path
import warnings


def save_model_artifacts(
    model: Any,
    model_name: str,
    experiment_config: Dict[str, Any],
    performance_metrics: Dict[str, float],
    save_dir: str = "model_artifacts",
    include_data_sample: bool = True,
    X_sample: Optional[pd.DataFrame] = None,
    format: str = 'joblib'
) -> Dict[str, str]:
    """
    Save complete model artifacts including model, config, and metadata.

    Every file is written into ``save_dir`` and shares the base name
    ``{model_name}_{YYYYmmdd_HHMMSS}``, followed by ``_model``, ``_config``,
    ``_metrics``, ``_metadata`` or ``_data_sample``.

    Parameters:
    -----------
    model : Any
        The trained model to save
    model_name : str
        Name of the model for file naming
    experiment_config : Dict[str, Any]
        Configuration dictionary from setup_ml_experiment
    performance_metrics : Dict[str, float]
        Dictionary of performance metrics
    save_dir : str, default="model_artifacts"
        Directory to save artifacts (created if missing)
    include_data_sample : bool, default=True
        Whether to save a sample of training data
    X_sample : pd.DataFrame, optional
        Sample data to save. At most the first 100 rows are written.
        When omitted, no data sample is saved even if
        ``include_data_sample`` is True.
    format : str, default='joblib'
        Format to save model ('joblib' or 'pickle', case-insensitive)

    Returns:
    --------
    Dict[str, str]
        Dictionary with paths to saved artifacts. Keys are 'model',
        'config', 'metrics', 'metadata' and, when a sample was written,
        'data_sample'.

    Raises:
    -------
    ValueError
        If ``format`` is neither 'joblib' nor 'pickle'.
    """
    print(f"๐Ÿ’พ Saving model artifacts for '{model_name}'...")

    # Reject an unsupported format before anything is written to disk.
    model_format = format.lower()
    if model_format not in ('joblib', 'pickle'):
        raise ValueError(f"Unsupported format: {format}")

    # Create save directory
    save_path = Path(save_dir)
    save_path.mkdir(parents=True, exist_ok=True)

    # Generate timestamp for unique naming
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    base_filename = f"{model_name}_{timestamp}"

    saved_files = {}

    # Save the model
    if model_format == 'joblib':
        model_file = save_path / f"{base_filename}_model.joblib"
        joblib.dump(model, model_file)
    else:
        model_file = save_path / f"{base_filename}_model.pkl"
        with open(model_file, 'wb') as f:
            pickle.dump(model, f)
    saved_files['model'] = str(model_file)
    print(f"โœ… Model saved: {model_file}")

    # Save experiment configuration (non-serializable values are converted)
    config_file = save_path / f"{base_filename}_config.json"
    with open(config_file, 'w') as f:
        json.dump(_make_json_serializable(experiment_config), f, indent=2)
    saved_files['config'] = str(config_file)
    print(f"โœ… Config saved: {config_file}")

    # Save performance metrics
    metrics_file = save_path / f"{base_filename}_metrics.json"
    with open(metrics_file, 'w') as f:
        json.dump(_make_json_serializable(performance_metrics), f, indent=2)
    saved_files['metrics'] = str(metrics_file)
    print(f"โœ… Metrics saved: {metrics_file}")

    # Save model metadata
    metadata = {
        'model_name': model_name,
        'model_type': type(model).__name__,
        'saved_at': datetime.now().isoformat(),
        'saved_format': format,
        'feature_count': experiment_config.get('feature_count', 'unknown'),
        'problem_type': experiment_config.get('problem_type', 'unknown'),
        'training_samples': experiment_config.get('train_samples', 'unknown')
    }
    metadata_file = save_path / f"{base_filename}_metadata.json"
    with open(metadata_file, 'w') as f:
        # Values copied from experiment_config may be NumPy scalars, which
        # json cannot encode directly, so convert them like the config.
        json.dump(_make_json_serializable(metadata), f, indent=2)
    saved_files['metadata'] = str(metadata_file)
    print(f"โœ… Metadata saved: {metadata_file}")

    # Save data sample if requested
    if include_data_sample and X_sample is not None:
        sample_file = save_path / f"{base_filename}_data_sample.csv"
        # head(100) returns the whole frame when it has fewer than 100 rows.
        X_sample.head(100).to_csv(sample_file, index=False)
        saved_files['data_sample'] = str(sample_file)
        print(f"โœ… Data sample saved: {sample_file}")

    print(f"๐ŸŽ‰ All artifacts saved to: {save_path}")

    return saved_files
def load_model_artifacts(
    artifact_path: str,
    load_model: bool = True,
    load_config: bool = True,
    load_metrics: bool = True
) -> Dict[str, Any]:
    """
    Load model artifacts from saved files.

    Companion files are looked up next to the model file as
    ``{base}_config.json``, ``{base}_metrics.json`` and
    ``{base}_metadata.json``, where ``{base}`` is the model file's stem
    without its trailing ``_model``. Metadata is always loaded when its
    file exists.

    Loading is best-effort: a failure to read any single artifact is
    printed and that key is left out of the result instead of raising.

    Parameters:
    -----------
    artifact_path : str
        Path to the model file or directory containing artifacts. For a
        directory, the ``*_model.joblib`` / ``*_model.pkl`` file with the
        greatest ``os.path.getctime`` is used.
    load_model : bool, default=True
        Whether to load the model
    load_config : bool, default=True
        Whether to load the configuration
    load_metrics : bool, default=True
        Whether to load the metrics

    Returns:
    --------
    Dict[str, Any]
        Dictionary containing loaded artifacts, with any of the keys
        'model', 'config', 'metrics' and 'metadata'.

    Raises:
    -------
    FileNotFoundError
        If ``artifact_path`` is a directory that holds no model files.
    """
    print(f"๐Ÿ“‚ Loading model artifacts from: {artifact_path}")

    artifact_path = Path(artifact_path)
    loaded_artifacts = {}

    if artifact_path.is_dir():
        # If path is a directory, find model files
        model_files = (
            list(artifact_path.glob("*_model.joblib"))
            + list(artifact_path.glob("*_model.pkl"))
        )
        if not model_files:
            raise FileNotFoundError(f"No model files found in {artifact_path}")
        # Use the most recent model file
        model_file = max(model_files, key=os.path.getctime)
    else:
        # Path is a specific model file
        model_file = artifact_path
        artifact_path = model_file.parent

    # Strip only the trailing "_model". The model name itself may contain
    # "_model" (e.g. "my_model_v2"), and removing every occurrence would
    # point the companion-file lookup at files that do not exist.
    model_suffix = "_model"
    base_name = model_file.stem
    if base_name.endswith(model_suffix):
        base_name = base_name[:-len(model_suffix)]

    # Load model
    if load_model:
        # SECURITY: joblib and pickle both execute code embedded in the file
        # while loading. Only load artifacts from a trusted source.
        try:
            if model_file.suffix == '.joblib':
                model = joblib.load(model_file)
            elif model_file.suffix == '.pkl':
                with open(model_file, 'rb') as f:
                    model = pickle.load(f)
            else:
                raise ValueError(f"Unsupported model file format: {model_file.suffix}")

            loaded_artifacts['model'] = model
            print(f"โœ… Model loaded: {model_file}")
        except Exception as e:
            # Unpickling can raise almost anything (missing classes, version
            # mismatches), so this stays broad and reports the failure.
            print(f"โŒ Failed to load model: {str(e)}")

    # JSON companions: (result key, requested?, warn when the file is missing?)
    json_artifacts = (
        ('config', load_config, True),
        ('metrics', load_metrics, True),
        ('metadata', True, False),
    )
    for key, requested, warn_if_missing in json_artifacts:
        if not requested:
            continue

        label = key.capitalize()
        json_file = artifact_path / f"{base_name}_{key}.json"
        if not json_file.exists():
            if warn_if_missing:
                print(f"โš ๏ธ {label} file not found: {json_file}")
            continue

        try:
            with open(json_file, 'r') as f:
                loaded_artifacts[key] = json.load(f)
        except (OSError, ValueError) as e:
            # ValueError covers json.JSONDecodeError and decoding errors.
            print(f"โŒ Failed to load {key}: {str(e)}")
        else:
            print(f"โœ… {label} loaded: {json_file}")

    print(f"๐ŸŽ‰ Loaded {len(loaded_artifacts)} artifact types")

    return loaded_artifacts
def track_experiment(
    experiment_name: str,
    model_results: Dict[str, Any],
    experiment_config: Dict[str, Any],
    notes: Optional[str] = None,
    log_file: str = "experiment_log.csv"
) -> None:
    """
    Track experiment results in a CSV log file.

    One row is appended per call. The whole log is read, extended and
    rewritten, so rows whose columns differ (for example runs with and
    without cross-validation scores) can share one file.

    Parameters:
    -----------
    experiment_name : str
        Name of the experiment
    model_results : Dict[str, Any]
        Results dictionary from model comparison. The keys 'model_name',
        'best_score' and 'optimization_time' are read, with 'unknown' as
        the fallback. If 'cv_results' is a non-empty DataFrame, the first
        row's 'mean_test_score' and 'std_test_score' are logged too.
    experiment_config : Dict[str, Any]
        Configuration dictionary from setup_ml_experiment
    notes : str, optional
        Additional notes about the experiment
    log_file : str, default="experiment_log.csv"
        Path to the log file. Missing parent directories are created.
    """
    print(f"๐Ÿ“Š Tracking experiment: {experiment_name}")

    # A missing or None 'feature_names' counts as zero features.
    feature_names = experiment_config.get('feature_names')
    feature_count = len(feature_names) if feature_names is not None else 0

    # Prepare experiment record
    record = {
        'timestamp': datetime.now().isoformat(),
        'experiment_name': experiment_name,
        'model_type': model_results.get('model_name', 'unknown'),
        'problem_type': experiment_config.get('problem_type', 'unknown'),
        'dataset_size': experiment_config.get('total_samples', 'unknown'),
        'feature_count': feature_count,
        'best_score': model_results.get('best_score', 'unknown'),
        'optimization_time': model_results.get('optimization_time', 'unknown'),
        'notes': notes or ''
    }

    # Add specific metrics if available
    cv_results = model_results.get('cv_results')
    if isinstance(cv_results, pd.DataFrame) and not cv_results.empty:
        for column in ('mean_test_score', 'std_test_score'):
            # `in` on a DataFrame tests column labels.
            record[column] = (
                cv_results[column].iloc[0] if column in cv_results else 'unknown'
            )

    # Convert record to DataFrame
    record_df = pd.DataFrame([record])

    log_path = Path(log_file)
    # Create the target directory, as the other writers in this module do.
    log_path.parent.mkdir(parents=True, exist_ok=True)

    existing_log = None
    if log_path.exists():
        try:
            existing_log = pd.read_csv(log_path)
        except pd.errors.EmptyDataError:
            # A zero-byte log (e.g. a pre-created file) is treated as new.
            existing_log = None

    if existing_log is not None:
        # Append to existing log
        updated_log = pd.concat([existing_log, record_df], ignore_index=True)
    else:
        # Create new log
        updated_log = record_df

    # Save updated log
    updated_log.to_csv(log_path, index=False)
    print(f"โœ… Experiment logged to: {log_file}")
def create_model_report(
    model: Any,
    model_name: str,
    experiment_config: Dict[str, Any],
    performance_metrics: Dict[str, float],
    feature_importance: Optional[pd.DataFrame] = None,
    validation_results: Optional[Dict[str, Any]] = None,
    save_path: Optional[str] = None
) -> str:
    """
    Generate a comprehensive model report.

    The report is plain text with these sections: model information,
    dataset information, performance metrics, data quality (only when
    ``validation_results`` is given), top 10 features (only when
    ``feature_importance`` is a non-empty frame), model parameters (only
    when the model has a ``get_params`` method that returns something)
    and experiment configuration.

    Parameters:
    -----------
    model : Any
        The trained model
    model_name : str
        Name of the model
    experiment_config : Dict[str, Any]
        Configuration dictionary
    performance_metrics : Dict[str, float]
        Performance metrics dictionary. Numeric values, including NumPy
        scalars, are shown with four decimals; anything else is shown
        as-is.
    feature_importance : pd.DataFrame, optional
        Feature importance data. Read through its 'feature' and
        'importance' columns; only the first 10 rows are used.
    validation_results : Dict[str, Any], optional
        Validation results from validate_ml_data. Uses 'quality_score'
        and 'recommendations' when present.
    save_path : str, optional
        Path to save the report. Parent directories are created and the
        file is written as UTF-8.

    Returns:
    --------
    str
        The generated report as a string
    """
    print(f"๐Ÿ“„ Generating model report for '{model_name}'...")

    # NumPy scalars such as np.float32 are not `float` instances, so they
    # are listed explicitly to get the same numeric formatting.
    numeric_types = (int, float, np.integer, np.floating)

    # A missing or None 'feature_names' counts as zero features.
    feature_names = experiment_config.get('feature_names')
    feature_count = len(feature_names) if feature_names is not None else 0

    # Generate report content
    report_lines = [
        "=" * 80,
        "MODEL PERFORMANCE REPORT",
        f"Generated on: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}",
        "=" * 80,
        "",
        "๐Ÿ“Š MODEL INFORMATION",
        f"Model Name: {model_name}",
        f"Model Type: {type(model).__name__}",
        f"Problem Type: {experiment_config.get('problem_type', 'unknown')}",
        "",
        "๐Ÿ“ˆ DATASET INFORMATION",
        f"Total Samples: {experiment_config.get('total_samples', 'unknown')}",
        f"Training Samples: {experiment_config.get('train_samples', 'unknown')}",
        f"Validation Samples: {experiment_config.get('val_samples', 'unknown')}",
        f"Test Samples: {experiment_config.get('test_samples', 'unknown')}",
        f"Number of Features: {feature_count}",
        "",
        "๐ŸŽฏ PERFORMANCE METRICS",
    ]

    # Add performance metrics
    for metric_name, metric_value in performance_metrics.items():
        if isinstance(metric_value, numeric_types):
            report_lines.append(f"{metric_name.upper()}: {metric_value:.4f}")
        else:
            report_lines.append(f"{metric_name.upper()}: {metric_value}")

    report_lines.append("")

    # Add data quality information if available
    if validation_results:
        quality_score = validation_results.get('quality_score', 'unknown')
        # Apply the float format only to numbers; the 'unknown' fallback
        # (or any other non-numeric value) would make ':.1f' raise.
        if isinstance(quality_score, numeric_types):
            quality_text = f"{quality_score:.1f}"
        else:
            quality_text = str(quality_score)

        report_lines.extend([
            "๐Ÿ“Š DATA QUALITY ASSESSMENT",
            f"Quality Score: {quality_text}/100"
        ])

        if validation_results.get('recommendations'):
            report_lines.append("Recommendations:")
            for rec in validation_results['recommendations']:
                report_lines.append(f" - {rec}")

        report_lines.append("")

    # Add feature importance if available
    if feature_importance is not None and not feature_importance.empty:
        report_lines.append("๐Ÿ” TOP 10 MOST IMPORTANT FEATURES")

        for _, row in feature_importance.head(10).iterrows():
            report_lines.append(f"{row['feature']}: {row['importance']:.4f}")

        report_lines.append("")

    # Add model parameters if available
    if hasattr(model, 'get_params'):
        params = model.get_params()
        if params:
            report_lines.append("โš™๏ธ MODEL PARAMETERS")
            for param_name, param_value in params.items():
                report_lines.append(f"{param_name}: {param_value}")
            report_lines.append("")

    # Add experiment configuration
    report_lines.extend([
        "๐Ÿงช EXPERIMENT CONFIGURATION",
        f"Random State: {experiment_config.get('random_state', 'unknown')}",
        f"Test Size: {experiment_config.get('test_size', 'unknown')}",
        f"Validation Size: {experiment_config.get('validation_size', 'unknown')}",
        f"Stratified Split: {experiment_config.get('stratified', 'unknown')}",
    ])

    report_lines.extend([
        "",
        "=" * 80,
        "Report generated by edaflow.ml",
        "=" * 80
    ])

    # Combine into single report string
    report_content = "\n".join(report_lines)

    # Save report if path provided
    if save_path:
        report_path = Path(save_path)
        report_path.parent.mkdir(parents=True, exist_ok=True)

        # The headings contain non-ASCII characters; an explicit encoding
        # avoids UnicodeEncodeError where the platform default is not UTF-8.
        with open(report_path, 'w', encoding='utf-8') as f:
            f.write(report_content)

        print(f"โœ… Report saved to: {report_path}")

    return report_content
def _make_json_serializable(obj: Any) -> Any: """Convert non-JSON-serializable objects to serializable format.""" if isinstance(obj, dict): return {key: _make_json_serializable(value) for key, value in obj.items()} elif isinstance(obj, list): return [_make_json_serializable(item) for item in obj] elif isinstance(obj, (np.integer, np.int64, np.int32)): return int(obj) elif isinstance(obj, (np.floating, np.float64, np.float32)): return float(obj) elif isinstance(obj, np.ndarray): return obj.tolist() elif pd.isna(obj): return None else: try: # Test if object is JSON serializable json.dumps(obj) return obj except (TypeError, ValueError): # Convert to string if not serializable return str(obj)