Source code for nirs4all.controllers.models.autogluon_model

"""
AutoGluon Model Controller - Controller for AutoGluon TabularPredictor

This controller handles AutoGluon TabularPredictor with support for:
- Automatic model selection and ensembling
- Training on tabular data (samples x features)
- Model persistence and prediction storage
- Integration with the nirs4all pipeline

AutoGluon differs from sklearn models in that:
- It trains an ensemble of models automatically
- It uses DataFrames internally, not numpy arrays
- It manages its own model directory for persistence
- It has its own hyperparameter tuning (no need for Optuna)

Lazy loading pattern: AutoGluon is only imported when actually needed
for training or prediction, not at module import time.
"""

from typing import Any, Dict, List, Tuple, Optional, TYPE_CHECKING
import numpy as np
import pandas as pd
import copy
import tempfile
import shutil
import os

from ..models.base_model import BaseModelController
from nirs4all.controllers.registry import register_controller
from nirs4all.core.logging import get_logger
from nirs4all.utils.backend import is_available, require_backend, BackendNotAvailableError

logger = get_logger(__name__)

from nirs4all.pipeline.steps.parser import ParsedStep
from nirs4all.pipeline.config.context import ExecutionContext, RuntimeContext
from nirs4all.pipeline.storage.artifacts.artifact_persistence import ArtifactMeta

# Availability flag evaluated once at import time via ``is_available``.
# Per the original note this check is expected not to import AutoGluon
# itself (NOTE(review): confirm against nirs4all.utils.backend).
AUTOGLUON_AVAILABLE = is_available('autogluon')

# These imports are only evaluated by static type checkers; at runtime
# ``TYPE_CHECKING`` is False, so none of them are executed.
if TYPE_CHECKING:
    from nirs4all.pipeline.runner import PipelineRunner
    from nirs4all.data.dataset import SpectroDataset
    try:
        from autogluon.tabular import TabularPredictor
    except ImportError:
        pass


# Cache for lazily imported AutoGluon objects; ``_get_tabular_predictor``
# stores the TabularPredictor class here under the key 'TabularPredictor'.
_ag_modules: Dict[str, Any] = {}


def _get_tabular_predictor():
    """Return the ``TabularPredictor`` class, importing AutoGluon on first use.

    The class is memoised in the module-level ``_ag_modules`` dict under the
    key ``'TabularPredictor'``; the backend check and the import only run
    when that key is absent.

    Returns:
        The ``autogluon.tabular.TabularPredictor`` class.
    """
    try:
        return _ag_modules['TabularPredictor']
    except KeyError:
        pass

    # First use: verify the backend, then import and remember the class.
    require_backend('autogluon', feature='AutoGluon AutoML')
    from autogluon.tabular import TabularPredictor as predictor_cls
    _ag_modules['TabularPredictor'] = predictor_cls
    return predictor_cls


def _is_autogluon_predictor(obj: Any) -> bool:
    """Check whether an object refers to an AutoGluon predictor.

    The check is purely introspective: it never imports AutoGluon, so it is
    cheap to call on arbitrary pipeline steps. An object is recognised when:

    - it is a dict whose ``'framework'`` value is ``'autogluon'`` or whose
      ``'class'`` value mentions ``'autogluon'``;
    - it is an instance whose class (or any base class) is defined in a
      module whose name contains ``'autogluon'``;
    - it is a class that is itself (or inherits from a class) defined in
      such a module, e.g. ``TabularPredictor`` passed uninstantiated.

    Args:
        obj: Object to check.

    Returns:
        bool: True if ``obj`` looks like an AutoGluon predictor instance,
        class, or configuration. Always False when ``AUTOGLUON_AVAILABLE``
        is falsy or ``obj`` is None.
    """
    if not AUTOGLUON_AVAILABLE or obj is None:
        return False

    # Dict configs can name AutoGluon explicitly.
    if isinstance(obj, dict):
        if obj.get('framework') == 'autogluon':
            return True
        if 'class' in obj and 'autogluon' in str(obj['class']):
            return True

    # Walk the MRO instead of calling isinstance(obj, TabularPredictor):
    # any instance or subclass of an AutoGluon class has that class in its
    # MRO, so the module names answer the question without triggering the
    # (slow) AutoGluon import for unrelated objects.
    candidates = list(type(obj).__mro__)
    if isinstance(obj, type):
        candidates.extend(obj.__mro__)

    return any(
        'autogluon' in str(getattr(klass, '__module__', '') or '')
        for klass in candidates
    )



[docs]
@register_controller
class AutoGluonModelController(BaseModelController):
    """Controller for AutoGluon TabularPredictor.

    This controller handles AutoGluon models with automatic model selection,
    ensembling, and integration with the nirs4all pipeline.

    AutoGluon automatically:
    - Trains multiple models (LightGBM, CatBoost, XGBoost, Neural Networks, etc.)
    - Performs cross-validation
    - Creates weighted ensembles
    - Handles hyperparameter tuning internally

    Uses lazy loading - AutoGluon is only imported when training starts.

    Attributes:
        priority (int): Controller priority (5) - higher than sklearn (6) to
            prioritize AutoGluon when explicitly requested.
    """

    priority = 5  # Higher priority than sklearn to catch autogluon configs


[docs]
    @classmethod
    def matches(cls, step: Any, operator: Any, keyword: str) -> bool:
        """Decide whether this controller should handle the given step.

        Args:
            step (Any): Pipeline step to inspect (dict config or object).
            operator (Any): Operator object attached to the step, if any.
            keyword (str): Pipeline keyword; not used by this controller.

        Returns:
            bool: True when the step, its ``'model'`` entry, or the operator
            refers to AutoGluon. Always False when AutoGluon is unavailable.
        """
        if not AUTOGLUON_AVAILABLE:
            return False

        if isinstance(step, dict):
            # Explicit framework declaration.
            declares_framework = step.get('framework') == 'autogluon'
            if declares_framework:
                return True

            # The 'model' entry is itself an AutoGluon reference.
            if _is_autogluon_predictor(step.get('model')):
                return True

            # A class path that mentions autogluon.
            if 'class' in step and 'autogluon' in str(step.get('class', '')):
                return True

        # Finally look at the operator, then at the step object itself.
        return _is_autogluon_predictor(operator) or _is_autogluon_predictor(step)


    def _get_model_instance(
        self,
        dataset: 'SpectroDataset',
        model_config: Dict[str, Any],
        force_params: Optional[Dict[str, Any]] = None
    ) -> Any:
        """Build the configuration used later to create and fit a predictor.

        No ``TabularPredictor`` is instantiated here. The method returns a
        plain dict that ``_train_model`` consumes.

        Args:
            dataset (SpectroDataset): Dataset whose ``task_type`` is used to
                infer the AutoGluon ``problem_type``.
            model_config (Dict[str, Any]): Model configuration; its
                ``'params'`` entry (may be missing or None) holds user
                parameters.
            force_params (Optional[Dict[str, Any]]): Parameters that override
                those found in ``model_config['params']``.

        Returns:
            dict: A config with the keys ``'predictor_params'`` (constructor
            arguments), ``'fit_params'`` (everything else, forwarded to
            ``fit``), ``'temp_dir'`` (freshly created temporary directory)
            and ``'random_state'``.
        """
        require_backend('autogluon', feature='AutoGluon AutoML')

        # Copy so the caller's config is never mutated; tolerate params=None.
        params = dict(model_config.get('params') or {})
        if force_params:
            params.update(force_params)

        # random_state is handled separately by the training step.
        random_state = params.pop('random_state', None)

        # An explicit problem_type from the user wins; otherwise infer it
        # from the dataset task type (left as None when that is unknown).
        problem_type = params.get('problem_type')
        if problem_type is None and dataset.task_type:
            if dataset.task_type.is_classification:
                if dataset.task_type.value == 'binary_classification':
                    problem_type = 'binary'
                else:
                    problem_type = 'multiclass'
            else:
                problem_type = 'regression'

        # Temporary directory in which AutoGluon stores its models.
        temp_dir = tempfile.mkdtemp(prefix='autogluon_')

        predictor_params = {
            'label': '__target__',  # column name used for the target in _train_model
            'path': temp_dir,
            'problem_type': problem_type,
            'verbosity': params.get('verbosity', 0),
        }

        # Optional constructor argument.
        if 'eval_metric' in params:
            predictor_params['eval_metric'] = params['eval_metric']

        # Everything that is not a constructor argument is forwarded to fit().
        self._fit_params = {
            k: v for k, v in params.items()
            if k not in predictor_params
        }

        return {
            'predictor_params': predictor_params,
            'fit_params': self._fit_params,
            'temp_dir': temp_dir,
            'random_state': random_state
        }

    def _train_model(
        self,
        model: Any,
        X_train: np.ndarray,
        y_train: np.ndarray,
        X_val: Optional[np.ndarray] = None,
        y_val: Optional[np.ndarray] = None,
        **kwargs
    ) -> Any:
        """Create and fit an AutoGluon ``TabularPredictor``.

        The numpy arrays are wrapped in DataFrames whose target column is
        named ``'__target__'``, a predictor is created from the config
        produced by ``_get_model_instance`` and ``fit`` is called on it.

        Args:
            model: Config dict with ``'predictor_params'`` and optionally
                ``'fit_params'`` and ``'random_state'``.
            X_train (np.ndarray): Training features.
            y_train (np.ndarray): Training targets.
            X_val (Optional[np.ndarray]): Validation features; passed to
                ``fit`` as ``tuning_data`` when non-empty.
            y_val (Optional[np.ndarray]): Validation targets.
            **kwargs: ``verbose`` and ``task_type`` are consumed here; any
                other entry is forwarded to ``fit``.

        Returns:
            TabularPredictor: The fitted predictor.
        """
        TabularPredictor = _get_tabular_predictor()

        # Controller-level options must not reach TabularPredictor.fit().
        verbose = kwargs.pop('verbose', 0)
        kwargs.pop('task_type', None)

        predictor_params = model['predictor_params'].copy()
        fit_params = model.get('fit_params', {}).copy()
        random_state = model.get('random_state', None)

        label_col = '__target__'
        predictor_params['label'] = label_col

        # Cloned configs all carry the same 'path'. Give every fit its own
        # sub-directory so that predictors trained from copies of one config
        # (e.g. one per fold) never write into each other's model files.
        base_dir = predictor_params.get('path')
        if base_dir:
            os.makedirs(base_dir, exist_ok=True)
            predictor_params['path'] = tempfile.mkdtemp(prefix='fit_', dir=base_dir)

        # Features get default integer column names; the target is appended.
        train_df = pd.DataFrame(X_train)
        train_df[label_col] = y_train.ravel() if y_train.ndim > 1 else y_train

        predictor = TabularPredictor(**predictor_params)

        fit_kwargs = {}

        # Validation data, when present, is used as AutoGluon tuning data.
        if X_val is not None and y_val is not None and len(X_val) > 0:
            val_df = pd.DataFrame(X_val)
            val_df[label_col] = y_val.ravel() if y_val.ndim > 1 else y_val
            fit_kwargs['tuning_data'] = val_df

        # User fit parameters (time_limit, presets, hyperparameters,
        # num_bag_folds, ...) first, then call-time overrides.
        fit_kwargs.update(fit_params)
        fit_kwargs.update(kwargs)

        # 'medium_quality' is the default preset when none was requested.
        fit_kwargs.setdefault('presets', 'medium_quality')

        # Merge the seed into ag_args_fit AFTER the updates above so a
        # user-supplied ag_args_fit cannot silently drop it. An explicit
        # 'random_seed' given by the user is kept; the dict is copied so the
        # caller's object is not mutated.
        if random_state is not None:
            ag_args_fit = dict(fit_kwargs.get('ag_args_fit') or {})
            ag_args_fit.setdefault('random_seed', random_state)
            fit_kwargs['ag_args_fit'] = ag_args_fit

        if verbose > 0:
            logger.starting("Training AutoGluon TabularPredictor...")
            logger.info(f"Presets: {fit_kwargs.get('presets', 'default')}")
            logger.info(f"Time limit: {fit_kwargs.get('time_limit', 'None')}")

        predictor.fit(train_df, **fit_kwargs)

        if verbose > 0:
            logger.info("AutoGluon Model Leaderboard:")
            try:
                leaderboard = predictor.leaderboard(silent=True)
                logger.info(leaderboard.to_string())
            except Exception as e:
                # The leaderboard is informational only: report and carry on.
                logger.warning(f"Could not display AutoGluon leaderboard: {e}")

        return predictor

    def _predict_model(self, model: Any, X: np.ndarray) -> np.ndarray:
        """Run the predictor on ``X`` and return the predictions as an array.

        Args:
            model (TabularPredictor): Trained AutoGluon predictor.
            X (np.ndarray): Input features.

        Returns:
            np.ndarray: Predictions; a 1-D result is reshaped to a single
            column ``(n_samples, 1)``, other shapes are returned unchanged.
        """
        require_backend('autogluon', feature='AutoGluon prediction')

        # AutoGluon works on DataFrames, so wrap the array first.
        frame = pd.DataFrame(X)
        output = np.array(model.predict(frame))

        return output.reshape(-1, 1) if output.ndim == 1 else output

    def _predict_proba_model(self, model: Any, X: np.ndarray) -> Optional[np.ndarray]:
        """Get class probabilities from an AutoGluon predictor.

        Args:
            model (TabularPredictor): Trained AutoGluon predictor.
            X (np.ndarray): Input features.

        Returns:
            Optional[np.ndarray]: Array built from ``model.predict_proba``,
            or None when AutoGluon is unavailable, when the model has no
            truthy ``can_predict_proba`` attribute, or when the probability
            call fails (the failure is logged as a warning).
        """
        if not AUTOGLUON_AVAILABLE:
            return None

        if not getattr(model, 'can_predict_proba', False):
            return None

        try:
            test_df = pd.DataFrame(X)
            return np.array(model.predict_proba(test_df))
        except Exception as e:
            # Best effort: probabilities are optional, but do not hide why
            # they could not be produced.
            logger.warning(f"Error in AutoGluon predict_proba: {e}")
            return None

    def _prepare_data(
        self,
        X: np.ndarray,
        y: np.ndarray,
        context: 'ExecutionContext'
    ) -> Tuple[np.ndarray, np.ndarray]:
        """Bring features and targets into two-dimensional form.

        Args:
            X (np.ndarray): Input features.
            y (np.ndarray): Target values (may be None).
            context (ExecutionContext): Pipeline context (not used here).

        Returns:
            Tuple[np.ndarray, np.ndarray]: ``(X, y)`` where 1-D arrays became
            a single column and arrays with more than two dimensions were
            flattened to ``(n_samples, -1)``. ``(None, None)`` when ``X`` is
            None.
        """
        def _as_2d(arr):
            # 1-D -> one column; >2-D -> flatten trailing axes; else as is.
            if arr.ndim == 1:
                return arr.reshape(-1, 1)
            if arr.ndim > 2:
                return arr.reshape(arr.shape[0], -1)
            return arr

        if X is None:
            return None, None

        features = _as_2d(X)
        targets = y if y is None else _as_2d(y)
        return features, targets

    def _evaluate_model(
        self,
        model: Any,
        X_val: np.ndarray,
        y_val: np.ndarray
    ) -> float:
        """Score a fitted predictor on validation data.

        The data is wrapped in a DataFrame with the target in the
        ``'__target__'`` column and passed to ``model.evaluate``. When the
        result is a dict, the entry named after ``model.eval_metric.name``
        is used if present, otherwise the first entry.

        Args:
            model (TabularPredictor): AutoGluon predictor.
            X_val (np.ndarray): Validation features.
            y_val (np.ndarray): Validation targets.

        Returns:
            float: The negated score, so that lower is better for a
            minimising optimiser; ``inf`` when AutoGluon is unavailable or
            the evaluation fails.
        """
        if not AUTOGLUON_AVAILABLE:
            return float('inf')

        try:
            label_col = '__target__'
            val_df = pd.DataFrame(X_val)
            val_df[label_col] = y_val.ravel() if y_val.ndim > 1 else y_val

            # The fitted predictor is all that is needed here; importing the
            # TabularPredictor class again would be wasted work.
            eval_result = model.evaluate(val_df, silent=True)

            if isinstance(eval_result, dict):
                # A metric without a usable name falls back to the first
                # reported value instead of discarding the evaluation.
                metric_name = getattr(getattr(model, 'eval_metric', None), 'name', None)
                if metric_name and metric_name in eval_result:
                    score = eval_result[metric_name]
                else:
                    score = next(iter(eval_result.values()))
            else:
                score = eval_result

            # Negate: the caller minimises, the reported score is treated as
            # higher-is-better.
            return -float(score)

        except Exception as e:
            logger.warning(f"Error in AutoGluon evaluation: {e}")
            return float('inf')


[docs]
    def get_preferred_layout(self) -> str:
        """Tell the pipeline which data layout this controller wants.

        Returns:
            str: Always ``"2d"``.
        """
        layout = "2d"
        return layout


    def _clone_model(self, model: Any) -> Any:
        """Return an independent copy of a model config or predictor.

        Args:
            model: Config dict (as produced by ``_get_model_instance``) or a
                ``TabularPredictor``.

        Returns:
            A deep copy for dicts; for a ``TabularPredictor`` the predictor
            loaded from a clone written to a new temporary directory; a deep
            copy for anything else, or when the predictor's ``clone`` does
            not accept the arguments used here.
        """
        if isinstance(model, dict):
            return copy.deepcopy(model)

        if AUTOGLUON_AVAILABLE:
            try:
                TabularPredictor = _get_tabular_predictor()
            except (ImportError, BackendNotAvailableError):
                TabularPredictor = None

            if TabularPredictor is not None and isinstance(model, TabularPredictor):
                temp_dir = tempfile.mkdtemp(prefix='autogluon_clone_')
                try:
                    # mkdtemp already created the directory, so the copy must
                    # be allowed into an existing path; return_clone=True
                    # yields the predictor object rather than its path.
                    return model.clone(
                        path=temp_dir, return_clone=True, dirs_exist_ok=True
                    )
                except TypeError:
                    # clone() does not support these arguments: fall back to
                    # a deep copy below and drop the unused directory.
                    shutil.rmtree(temp_dir, ignore_errors=True)
                except Exception:
                    # Do not leave an orphaned temporary directory behind.
                    shutil.rmtree(temp_dir, ignore_errors=True)
                    raise

        return copy.deepcopy(model)

    def _sample_hyperparameters(
        self,
        trial,
        finetune_params: Dict[str, Any]
    ) -> Dict[str, Any]:
        """Draw high-level AutoGluon settings from an Optuna trial.

        Only ``presets``, ``time_limit``, ``num_bag_folds`` and
        ``random_state`` are considered. A list is sampled categorically
        (``presets`` and ``random_state`` only), a 3-tuple is sampled as an
        integer between its second and third element, and any other value
        is passed through unchanged.

        Args:
            trial: Optuna trial object.
            finetune_params (Dict[str, Any]): Hyperparameter search space.

        Returns:
            Dict[str, Any]: Sampled configuration parameters.
        """
        def _int_range_or_fixed(name, spec):
            # 3-tuple -> integer range (first element ignored); else as is.
            if isinstance(spec, tuple) and len(spec) == 3:
                _, lower, upper = spec
                return trial.suggest_int(name, lower, upper)
            return spec

        sampled = {}

        if 'presets' in finetune_params:
            preset_spec = finetune_params['presets']
            sampled['presets'] = (
                trial.suggest_categorical('presets', preset_spec)
                if isinstance(preset_spec, list)
                else preset_spec
            )

        for key in ('time_limit', 'num_bag_folds'):
            if key in finetune_params:
                sampled[key] = _int_range_or_fixed(key, finetune_params[key])

        # The seed is usually fixed, but it may also be searched.
        if 'random_state' in finetune_params:
            seed_spec = finetune_params['random_state']
            if isinstance(seed_spec, list):
                sampled['random_state'] = trial.suggest_categorical(
                    'random_state', seed_spec
                )
            else:
                sampled['random_state'] = _int_range_or_fixed('random_state', seed_spec)

        return sampled


[docs]
    def save_model(self, model: Any, filepath: str) -> None:
        """Save an AutoGluon predictor to ``filepath``.

        AutoGluon predictors are stored as directories. ``model.save`` only
        writes to the predictor's own ``path``, so the predictor is saved
        there first and then cloned into ``filepath``.

        Args:
            model (TabularPredictor): Trained AutoGluon predictor.
            filepath (str): Target directory; a trailing ``.pkl`` is removed.
        """
        require_backend('autogluon', feature='AutoGluon model saving')

        # AutoGluon saves to a directory, not to a pickle file.
        if filepath.endswith('.pkl'):
            filepath = filepath[:-4]

        # Persist the current state in the predictor's own directory. The
        # argument of save() is the 'silent' flag, not a destination.
        model.save(silent=True)

        # Already living at the requested location: nothing left to copy.
        source_dir = getattr(model, 'path', None)
        if source_dir is not None and (
            os.path.abspath(str(source_dir)) == os.path.abspath(filepath)
        ):
            return

        # Copy the predictor and all of its artifacts to the target path.
        model.clone(path=filepath, dirs_exist_ok=True)



[docs]
    def load_model(self, filepath: str) -> Any:
        """Load an AutoGluon predictor from a saved model directory.

        Args:
            filepath (str): Path to the saved model directory; a trailing
                ``.pkl`` is removed before loading.

        Returns:
            TabularPredictor: Loaded AutoGluon predictor.
        """
        predictor_cls = _get_tabular_predictor()

        # Predictors are stored as directories, so drop a pickle suffix.
        model_dir = filepath[:-4] if filepath.endswith('.pkl') else filepath

        return predictor_cls.load(model_dir)



[docs]
    def execute(
        self,
        step_info: ParsedStep,
        dataset: 'SpectroDataset',
        context: ExecutionContext,
        runtime_context: RuntimeContext,
        source: int = -1,
        mode: str = "train",
        loaded_binaries: Optional[List[Tuple[str, bytes]]] = None,
        prediction_store: Optional[Any] = None
    ) -> Tuple[ExecutionContext, List[ArtifactMeta]]:
        """Run the AutoGluon step through the base controller.

        The execution context is first switched to the layout returned by
        ``get_effective_layout(step_info)``; everything else is delegated to
        the parent class ``execute``.

        Args:
            step_info: Parsed step containing model configuration.
            dataset (SpectroDataset): Dataset containing features and targets.
            context (ExecutionContext): Pipeline execution context.
            runtime_context (RuntimeContext): Runtime context.
            source (int): Source index. Defaults to -1.
            mode (str): Execution mode. Defaults to 'train'.
            loaded_binaries: Pre-loaded model binaries for prediction.
            prediction_store: Store for managing predictions.

        Returns:
            Tuple[ExecutionContext, List[ArtifactMeta]]: Whatever the parent
            ``execute`` returns for the layout-adjusted context.
        """
        # A forced layout, if any, takes precedence over the preferred one.
        layout = self.get_effective_layout(step_info)
        layout_context = context.with_layout(layout)

        return super().execute(
            step_info, dataset, layout_context, runtime_context,
            source, mode, loaded_binaries, prediction_store
        )