Source code for nirs4all.controllers.models.autogluon_model

"""
AutoGluon Model Controller - Controller for AutoGluon TabularPredictor

This controller handles AutoGluon TabularPredictor with support for:
- Automatic model selection and ensembling
- Training on tabular data (samples x features)
- Model persistence and prediction storage
- Integration with the nirs4all pipeline

AutoGluon differs from sklearn models in that:
- It trains an ensemble of models automatically
- It uses DataFrames internally, not numpy arrays
- It manages its own model directory for persistence
- It has its own hyperparameter tuning (no need for Optuna)

Lazy loading pattern: AutoGluon is only imported when actually needed
for training or prediction, not at module import time.
"""

from typing import Any, Dict, List, Tuple, Optional, TYPE_CHECKING
import numpy as np
import pandas as pd
import copy
import tempfile
import shutil
import os

from ..models.base_model import BaseModelController
from nirs4all.controllers.registry import register_controller
from nirs4all.core.logging import get_logger
from nirs4all.utils.backend import is_available, require_backend, BackendNotAvailableError

logger = get_logger(__name__)

from nirs4all.pipeline.steps.parser import ParsedStep
from nirs4all.pipeline.config.context import ExecutionContext, RuntimeContext
from nirs4all.pipeline.storage.artifacts.artifact_persistence import ArtifactMeta

# Fast availability check at module level - AutoGluon itself is not imported here.
# Evaluated once when this module is imported; the helpers and the controller
# below read this flag to bail out early when AutoGluon is missing.
AUTOGLUON_AVAILABLE = is_available('autogluon')

if TYPE_CHECKING:
    from nirs4all.pipeline.runner import PipelineRunner
    from nirs4all.data.dataset import SpectroDataset
    try:
        from autogluon.tabular import TabularPredictor
    except ImportError:
        pass


# Lazy-loaded module cache: filled on first use by _get_tabular_predictor(),
# which stores the TabularPredictor class under the key 'TabularPredictor'.
_ag_modules: Dict[str, Any] = {}


def _get_tabular_predictor():
    """Return the AutoGluon ``TabularPredictor`` class, importing it on first use.

    The class is memoized in the module-level ``_ag_modules`` cache so the
    backend check and the import only happen once per process.
    """
    try:
        return _ag_modules['TabularPredictor']
    except KeyError:
        pass

    # First call: verify the backend, then import and remember the class.
    require_backend('autogluon', feature='AutoGluon AutoML')
    from autogluon.tabular import TabularPredictor as predictor_cls

    _ag_modules['TabularPredictor'] = predictor_cls
    return predictor_cls


def _is_autogluon_predictor(obj: Any) -> bool:
    """Tell whether ``obj`` refers to an AutoGluon TabularPredictor.

    Cheap checks (dict keys, defining module name) run first so that
    AutoGluon is only imported when they are inconclusive.

    Args:
        obj: Object to check.

    Returns:
        bool: True if object is a TabularPredictor instance or class,
        a dict config referencing autogluon, or an instance of a class
        defined in an ``autogluon`` module.
    """
    if obj is None or not AUTOGLUON_AVAILABLE:
        return False

    # Dict configs: explicit framework key or a class path mentioning autogluon.
    if isinstance(obj, dict) and (
        ('framework' in obj and obj['framework'] == 'autogluon')
        or ('class' in obj and 'autogluon' in str(obj['class']))
    ):
        return True

    # Instances: look at the module that defines the object's class.
    owner_type = getattr(obj, '__class__', None)
    if owner_type is not None and 'autogluon' in owner_type.__module__:
        return True

    # Last resort: load TabularPredictor and compare directly.
    try:
        predictor_cls = _get_tabular_predictor()
    except (ImportError, BackendNotAvailableError):
        return False

    return isinstance(obj, predictor_cls) or obj is predictor_cls


@register_controller
class AutoGluonModelController(BaseModelController):
    """Controller for AutoGluon TabularPredictor.

    This controller handles AutoGluon models with automatic model selection,
    ensembling, and integration with the nirs4all pipeline.

    AutoGluon automatically:
    - Trains multiple models (LightGBM, CatBoost, XGBoost, Neural Networks, etc.)
    - Performs cross-validation
    - Creates weighted ensembles
    - Handles hyperparameter tuning internally

    Uses lazy loading - AutoGluon is only imported when training starts.

    Attributes:
        priority (int): Controller priority (5) - higher than sklearn (6)
            to prioritize AutoGluon when explicitly requested.
    """

    priority = 5  # Higher priority than sklearn to catch autogluon configs

    @classmethod
    def matches(cls, step: Any, operator: Any, keyword: str) -> bool:
        """Match AutoGluon TabularPredictor configurations.

        Args:
            step (Any): Pipeline step to check.
            operator (Any): Optional operator object.
            keyword (str): Pipeline keyword (unused).

        Returns:
            bool: True if the step matches an AutoGluon configuration.
        """
        if not AUTOGLUON_AVAILABLE:
            return False

        # Check if step is an explicit AutoGluon config
        if isinstance(step, dict):
            # Explicit framework key
            if step.get('framework') == 'autogluon':
                return True

            # 'model' entry referencing an AutoGluon predictor
            if _is_autogluon_predictor(step.get('model')):
                return True

            # Class path containing autogluon
            if 'class' in step and 'autogluon' in str(step.get('class', '')):
                return True

        # Operator or the step itself may be an AutoGluon predictor
        return _is_autogluon_predictor(operator) or _is_autogluon_predictor(step)

    def _get_model_instance(
        self,
        dataset: 'SpectroDataset',
        model_config: Dict[str, Any],
        force_params: Optional[Dict[str, Any]] = None
    ) -> Any:
        """Build the AutoGluon configuration used later to create the predictor.

        No TabularPredictor is created here: the predictor is instantiated in
        ``_train_model``. This method allocates a temporary model directory
        and splits the user parameters into constructor and fit parameters.

        Args:
            dataset (SpectroDataset): Dataset for context-aware configuration.
            model_config (Dict[str, Any]): Model configuration; its optional
                ``'params'`` entry holds the user parameters.
            force_params (Optional[Dict[str, Any]]): Parameters to override.

        Returns:
            Dict[str, Any]: Config dict with keys ``'predictor_params'``,
            ``'fit_params'``, ``'temp_dir'`` and ``'random_state'``.
        """
        require_backend('autogluon', feature='AutoGluon AutoML')

        # Copy so the caller's config is never mutated; tolerate params=None.
        params = dict(model_config.get('params') or {})
        if force_params:
            params.update(force_params)

        # random_state is not a fit() argument; it is forwarded separately.
        random_state = params.pop('random_state', None)

        # Determine problem type from dataset
        problem_type = None
        if dataset.task_type:
            if dataset.task_type.is_classification:
                if dataset.task_type.value == 'binary_classification':
                    problem_type = 'binary'
                else:
                    problem_type = 'multiclass'
            else:
                problem_type = 'regression'

        # Temporary directory in which AutoGluon will write its models.
        temp_dir = tempfile.mkdtemp(prefix='autogluon_')

        predictor_params = {
            'label': '__target__',  # Placeholder, set again in _train_model
            'path': temp_dir,
            'problem_type': problem_type,
            'verbosity': params.get('verbosity', 0),
        }

        # Add optional parameters if provided
        if 'eval_metric' in params:
            predictor_params['eval_metric'] = params['eval_metric']

        # Everything that is not a constructor argument is passed to fit().
        self._fit_params = {
            k: v for k, v in params.items()
            if k not in predictor_params
        }

        return {
            'predictor_params': predictor_params,
            'fit_params': self._fit_params,
            'temp_dir': temp_dir,
            'random_state': random_state
        }

    def _train_model(
        self,
        model: Any,
        X_train: np.ndarray,
        y_train: np.ndarray,
        X_val: Optional[np.ndarray] = None,
        y_val: Optional[np.ndarray] = None,
        **kwargs
    ) -> Any:
        """Train AutoGluon TabularPredictor.

        Builds DataFrames from the numpy arrays, creates the predictor from
        the config dict and calls ``TabularPredictor.fit()``.

        Args:
            model: Model config dict from _get_model_instance.
            X_train (np.ndarray): Training features.
            y_train (np.ndarray): Training targets.
            X_val (Optional[np.ndarray]): Validation features (used as tuning_data).
            y_val (Optional[np.ndarray]): Validation targets.
            **kwargs: Additional training parameters. ``verbose`` and
                ``task_type`` are consumed here; the rest is passed to fit().

        Returns:
            TabularPredictor: Trained AutoGluon predictor.
        """
        TabularPredictor = _get_tabular_predictor()

        # Controller-specific parameters must not reach fit().
        verbose = kwargs.pop('verbose', 0)
        kwargs.pop('task_type', None)

        predictor_params = model['predictor_params'].copy()
        fit_params = model.get('fit_params', {}).copy()
        random_state = model.get('random_state', None)

        label_col = '__target__'
        predictor_params['label'] = label_col

        train_df = pd.DataFrame(X_train)
        train_df[label_col] = y_train.ravel() if y_train.ndim > 1 else y_train

        predictor = TabularPredictor(**predictor_params)

        # Default to 'medium_quality' (faster); pass presets='best_quality'
        # in the model params for better accuracy.
        fit_kwargs: Dict[str, Any] = {'presets': 'medium_quality'}

        # Add validation data if available
        if X_val is not None and y_val is not None and len(X_val) > 0:
            val_df = pd.DataFrame(X_val)
            val_df[label_col] = y_val.ravel() if y_val.ndim > 1 else y_val
            fit_kwargs['tuning_data'] = val_df

        # User fit parameters (time_limit, presets, hyperparameters,
        # num_bag_folds, ...) override the defaults; explicit kwargs win last.
        fit_kwargs.update(fit_params)
        fit_kwargs.update(kwargs)

        # Add the seed AFTER merging so a user-supplied ag_args_fit is extended
        # instead of silently replacing the seed. An explicit 'random_seed'
        # inside the user's ag_args_fit still takes precedence.
        if random_state is not None:
            ag_args_fit = dict(fit_kwargs.get('ag_args_fit') or {})
            ag_args_fit.setdefault('random_seed', random_state)
            fit_kwargs['ag_args_fit'] = ag_args_fit

        if verbose > 0:
            logger.starting("Training AutoGluon TabularPredictor...")
            logger.info(f"Presets: {fit_kwargs.get('presets', 'default')}")
            logger.info(f"Time limit: {fit_kwargs.get('time_limit', 'None')}")

        predictor.fit(train_df, **fit_kwargs)

        if verbose > 0:
            logger.info("AutoGluon Model Leaderboard:")
            try:
                leaderboard = predictor.leaderboard(silent=True)
                logger.info(leaderboard.to_string())
            except Exception as e:
                # Display only: never fail training because of the leaderboard.
                logger.warning(f"Could not display AutoGluon leaderboard: {e}")

        return predictor

    def _predict_model(self, model: Any, X: np.ndarray) -> np.ndarray:
        """Generate predictions with AutoGluon predictor.

        Args:
            model (TabularPredictor): Trained AutoGluon predictor.
            X (np.ndarray): Input features.

        Returns:
            np.ndarray: Model predictions; 1D outputs are reshaped to
            (n_samples, 1).
        """
        require_backend('autogluon', feature='AutoGluon prediction')

        test_df = pd.DataFrame(X)
        predictions = np.array(model.predict(test_df))

        if predictions.ndim == 1:
            predictions = predictions.reshape(-1, 1)

        return predictions

    def _predict_proba_model(self, model: Any, X: np.ndarray) -> Optional[np.ndarray]:
        """Get class probabilities for AutoGluon classification models.

        Args:
            model (TabularPredictor): Trained AutoGluon predictor.
            X (np.ndarray): Input features.

        Returns:
            Optional[np.ndarray]: Class probabilities, or None when AutoGluon
            is unavailable, the model cannot predict probabilities, or the
            prediction fails.
        """
        if not AUTOGLUON_AVAILABLE:
            return None

        if not getattr(model, 'can_predict_proba', False):
            return None

        try:
            return np.array(model.predict_proba(pd.DataFrame(X)))
        except Exception as e:
            # Best effort: probabilities are optional for callers.
            logger.warning(f"Error in AutoGluon predict_proba: {e}")
            return None

    def _prepare_data(
        self,
        X: np.ndarray,
        y: np.ndarray,
        context: 'ExecutionContext'
    ) -> Tuple[Optional[np.ndarray], Optional[np.ndarray]]:
        """Prepare data for AutoGluon (ensure 2D arrays).

        Args:
            X (np.ndarray): Input features.
            y (np.ndarray): Target values.
            context (ExecutionContext): Pipeline context (unused here).

        Returns:
            Tuple: Prepared (X, y) arrays, or (None, None) when X is None.
        """
        if X is None:
            return None, None

        # Flatten extra feature dimensions; promote 1D to a single column.
        if X.ndim > 2:
            X = X.reshape(X.shape[0], -1)
        elif X.ndim == 1:
            X = X.reshape(-1, 1)

        if y is not None:
            if y.ndim == 1:
                y = y.reshape(-1, 1)
            elif y.ndim > 2:
                y = y.reshape(y.shape[0], -1)

        return X, y

    def _evaluate_model(
        self,
        model: Any,
        X_val: np.ndarray,
        y_val: np.ndarray
    ) -> float:
        """Evaluate AutoGluon model using its internal evaluation.

        Args:
            model (TabularPredictor): AutoGluon predictor.
            X_val (np.ndarray): Validation features.
            y_val (np.ndarray): Validation targets.

        Returns:
            float: Negated evaluation score (so that lower is better), or
            ``inf`` when AutoGluon is unavailable or evaluation fails.
        """
        if not AUTOGLUON_AVAILABLE:
            return float('inf')

        try:
            label_col = '__target__'
            val_df = pd.DataFrame(X_val)
            val_df[label_col] = y_val.ravel() if y_val.ndim > 1 else y_val

            eval_result = model.evaluate(val_df, silent=True)

            if isinstance(eval_result, dict):
                primary_metric = model.eval_metric.name if hasattr(model, 'eval_metric') else None
                if primary_metric and primary_metric in eval_result:
                    score = eval_result[primary_metric]
                else:
                    # Fall back to the first reported metric
                    score = next(iter(eval_result.values()))
            else:
                score = float(eval_result)

            # AutoGluon metrics are typically higher-is-better
            # Return negative for minimization-based optimization
            return -score

        except Exception as e:
            logger.warning(f"Error in AutoGluon evaluation: {e}")
            return float('inf')

    def get_preferred_layout(self) -> str:
        """Return the preferred data layout for AutoGluon.

        Returns:
            str: Data layout preference, '2d' for AutoGluon.
        """
        return "2d"

    def _clone_model(self, model: Any) -> Any:
        """Clone AutoGluon model configuration or fitted predictor.

        Config dicts are deep-copied. A fitted TabularPredictor is cloned
        into a fresh temporary directory with AutoGluon's own ``clone``.

        Args:
            model: Model config dict or TabularPredictor.

        Returns:
            Cloned configuration, cloned predictor, or a deep copy.
        """
        if isinstance(model, dict):
            return copy.deepcopy(model)

        if AUTOGLUON_AVAILABLE:
            try:
                TabularPredictor = _get_tabular_predictor()
                if isinstance(model, TabularPredictor):
                    temp_dir = tempfile.mkdtemp(prefix='autogluon_clone_')
                    # mkdtemp already created the directory, so clone() must be
                    # told it may exist; return_clone=True returns the loaded
                    # predictor instead of the destination path string.
                    return model.clone(
                        path=temp_dir,
                        return_clone=True,
                        dirs_exist_ok=True
                    )
            except (TypeError, ImportError, BackendNotAvailableError):
                pass

        return copy.deepcopy(model)

    def _sample_hyperparameters(
        self,
        trial,
        finetune_params: Dict[str, Any]
    ) -> Dict[str, Any]:
        """Sample hyperparameters for AutoGluon.

        AutoGluon has its own internal hyperparameter tuning, so this method
        only samples high-level settings: ``presets``, ``time_limit``,
        ``num_bag_folds`` and ``random_state``.

        Args:
            trial: Optuna trial object.
            finetune_params (Dict[str, Any]): Hyperparameter search space.
                A list means a categorical choice (``presets`` and
                ``random_state`` only); a 3-tuple is read as
                ``(_, low, high)`` and sampled as an integer; any other
                value is used as-is.

        Returns:
            Dict[str, Any]: Sampled configuration parameters.
        """
        def int_range_or_fixed(name: str, config: Any) -> Any:
            # 3-tuple -> integer range on items 1 and 2; otherwise a fixed value.
            if isinstance(config, tuple) and len(config) == 3:
                return trial.suggest_int(name, config[1], config[2])
            return config

        params = {}

        if 'presets' in finetune_params:
            preset_config = finetune_params['presets']
            if isinstance(preset_config, list):
                params['presets'] = trial.suggest_categorical('presets', preset_config)
            else:
                params['presets'] = preset_config

        for name in ('time_limit', 'num_bag_folds'):
            if name in finetune_params:
                params[name] = int_range_or_fixed(name, finetune_params[name])

        # random_state is typically fixed, but can be searched
        if 'random_state' in finetune_params:
            rs_config = finetune_params['random_state']
            if isinstance(rs_config, list):
                params['random_state'] = trial.suggest_categorical(
                    'random_state', rs_config
                )
            else:
                params['random_state'] = int_range_or_fixed('random_state', rs_config)

        return params

    def save_model(self, model: Any, filepath: str) -> None:
        """Save AutoGluon model to disk.

        AutoGluon models are saved as directories. The predictor is first
        saved in its own directory, then its artifacts are copied to
        ``filepath``.

        Args:
            model (TabularPredictor): Trained AutoGluon predictor.
            filepath (str): Target directory; a trailing ``.pkl`` is removed.
        """
        require_backend('autogluon', feature='AutoGluon model saving')

        # AutoGluon saves to a directory, not a pickle file
        if filepath.endswith('.pkl'):
            filepath = filepath[:-4]

        # TabularPredictor.save() takes no destination argument (its only
        # parameter is `silent`); it always writes to the predictor's own path.
        model.save(silent=True)

        # Copy the artifacts to the requested location unless already there
        # (clone() refuses to clone a predictor onto itself).
        if os.path.abspath(str(model.path)) != os.path.abspath(filepath):
            model.clone(path=filepath, dirs_exist_ok=True)

    def load_model(self, filepath: str) -> Any:
        """Load AutoGluon model from disk.

        Args:
            filepath (str): Path to the saved model directory; a trailing
                ``.pkl`` is removed.

        Returns:
            TabularPredictor: Loaded AutoGluon predictor.
        """
        TabularPredictor = _get_tabular_predictor()

        if filepath.endswith('.pkl'):
            filepath = filepath[:-4]

        return TabularPredictor.load(filepath)

    def execute(
        self,
        step_info: ParsedStep,
        dataset: 'SpectroDataset',
        context: ExecutionContext,
        runtime_context: RuntimeContext,
        source: int = -1,
        mode: str = "train",
        loaded_binaries: Optional[List[Tuple[str, bytes]]] = None,
        prediction_store: Optional[Any] = None
    ) -> Tuple[ExecutionContext, List[ArtifactMeta]]:
        """Execute AutoGluon model controller.

        Sets the data layout on the context, then delegates to the parent
        controller's ``execute``.

        Args:
            step_info: Parsed step containing model configuration.
            dataset (SpectroDataset): Dataset containing features and targets.
            context (ExecutionContext): Pipeline execution context.
            runtime_context (RuntimeContext): Runtime context.
            source (int): Source index. Defaults to -1.
            mode (str): Execution mode. Defaults to 'train'.
            loaded_binaries: Pre-loaded model binaries for prediction.
            prediction_store: Store for managing predictions.

        Returns:
            Tuple[ExecutionContext, List[ArtifactMeta]]: Updated context and
            list of model binaries.
        """
        # Set layout preference (force_layout overrides preferred)
        context = context.with_layout(self.get_effective_layout(step_info))

        return super().execute(
            step_info, dataset, context, runtime_context,
            source, mode, loaded_binaries, prediction_store
        )