Source code for nirs4all.operators.models.meta
"""Meta-model operator for stacking ensemble.
This module provides the MetaModel operator for building stacking ensembles
that use predictions from previously trained models as input features.
The meta-model trains on out-of-fold (OOF) predictions from base models
to prevent data leakage and overfitting.
Example:
>>> from nirs4all.operators.models import MetaModel
>>> from sklearn.linear_model import Ridge
>>>
>>> pipeline = [
... MinMaxScaler(),
... KFold(n_splits=5),
... PLSRegression(n_components=10),
... RandomForestRegressor(n_estimators=100),
... {"model": MetaModel(model=Ridge(), source_models="all")},
... ]
"""
from dataclasses import dataclass, field
from enum import Enum
from typing import Any, Dict, List, Optional, Union
from .base import BaseModelOperator
[docs]
class CoverageStrategy(Enum):
"""Strategy for handling partial coverage in OOF reconstruction.
When some samples are missing predictions (e.g., from sample partitioning),
this determines how to handle them.
Attributes:
STRICT: Raise error if any sample is missing predictions (default).
DROP_INCOMPLETE: Drop samples missing any source model predictions.
IMPUTE_ZERO: Fill missing predictions with zeros.
IMPUTE_MEAN: Fill missing predictions with mean of available predictions.
IMPUTE_FOLD_MEAN: Fill with mean from the same fold.
"""
STRICT = "strict"
DROP_INCOMPLETE = "drop_incomplete"
IMPUTE_ZERO = "impute_zero"
IMPUTE_MEAN = "impute_mean"
IMPUTE_FOLD_MEAN = "impute_fold_mean"
[docs]
class TestAggregation(Enum):
"""Strategy for aggregating test predictions from multiple folds.
When base models are trained with cross-validation, each fold produces
predictions for the test set. This determines how to combine them.
Attributes:
MEAN: Simple average across folds (default).
WEIGHTED_MEAN: Weighted average by validation scores.
BEST_FOLD: Use prediction from best-scoring fold only.
"""
MEAN = "mean"
WEIGHTED_MEAN = "weighted"
BEST_FOLD = "best"
[docs]
class BranchScope(Enum):
"""Which branches to include as source models.
Controls which branches' predictions are used for stacking when
the pipeline contains branching.
Attributes:
CURRENT_ONLY: Only use models from the current branch (default).
ALL_BRANCHES: Use models from all branches (requires compatible samples).
SPECIFIED: Use explicit list from source_models parameter.
"""
CURRENT_ONLY = "current_only"
ALL_BRANCHES = "all_branches"
SPECIFIED = "specified"
[docs]
class StackingLevel(Enum):
"""Level of stacking in multi-level stacking architecture.
Indicates where this meta-model sits in a stacking hierarchy.
Used for validation and dependency tracking.
Attributes:
AUTO: Automatically detect level based on source models (default).
LEVEL_1: First meta-level (stacks on base models only).
LEVEL_2: Second meta-level (can stack on LEVEL_1 meta-models).
LEVEL_3: Third meta-level (can stack on LEVEL_1 and LEVEL_2).
"""
AUTO = "auto"
LEVEL_1 = 1
LEVEL_2 = 2
LEVEL_3 = 3
[docs]
@dataclass
class StackingConfig:
"""Configuration for meta-model training set reconstruction.
Controls how out-of-fold predictions are collected and processed
to build the training features for the meta-model.
Attributes:
coverage_strategy: How to handle samples with missing predictions.
test_aggregation: How to aggregate test predictions across folds.
branch_scope: Which branches to include as source models.
allow_no_cv: If True, allow stacking without cross-validation (with warning).
min_coverage_ratio: Minimum ratio of source models required per sample.
level: Stacking level for multi-level stacking (AUTO, LEVEL_1, LEVEL_2, LEVEL_3).
allow_meta_sources: If True, allow other MetaModels as source models.
max_level: Maximum allowed stacking level (for validation).
Example:
>>> config = StackingConfig(
... coverage_strategy=CoverageStrategy.DROP_INCOMPLETE,
... test_aggregation=TestAggregation.WEIGHTED_MEAN,
... min_coverage_ratio=0.5,
... level=StackingLevel.AUTO,
... allow_meta_sources=True
... )
"""
coverage_strategy: CoverageStrategy = CoverageStrategy.STRICT
test_aggregation: TestAggregation = TestAggregation.MEAN
branch_scope: BranchScope = BranchScope.CURRENT_ONLY
allow_no_cv: bool = False
min_coverage_ratio: float = 1.0
level: StackingLevel = StackingLevel.AUTO
allow_meta_sources: bool = True
max_level: int = 3
[docs]
def __post_init__(self):
"""Validate configuration after initialization."""
if not 0.0 <= self.min_coverage_ratio <= 1.0:
raise ValueError(
f"min_coverage_ratio must be between 0 and 1, got {self.min_coverage_ratio}"
)
if self.max_level < 1 or self.max_level > 10:
raise ValueError(
f"max_level must be between 1 and 10, got {self.max_level}"
)
# Convert string values to enums if needed
if isinstance(self.coverage_strategy, str):
self.coverage_strategy = CoverageStrategy(self.coverage_strategy)
if isinstance(self.test_aggregation, str):
self.test_aggregation = TestAggregation(self.test_aggregation)
if isinstance(self.branch_scope, str):
self.branch_scope = BranchScope(self.branch_scope)
if isinstance(self.level, str):
if self.level == "auto":
self.level = StackingLevel.AUTO
else:
self.level = StackingLevel(int(self.level))
if isinstance(self.level, int):
self.level = StackingLevel(self.level)
[docs]
class MetaModel(BaseModelOperator):
"""Wrapper for meta-model stacking using pipeline predictions.
Creates a meta-learner that uses predictions from previously trained
models in the pipeline as input features. Implements stacked generalization
with proper out-of-fold prediction handling to prevent data leakage.
The meta-model:
1. Collects out-of-fold (OOF) predictions from specified source models
2. Constructs training features from these predictions
3. Trains on these features using the provided sklearn-compatible model
4. For test data, aggregates source model predictions across folds
Multi-Level Stacking (Phase 7):
MetaModel supports multi-level stacking where meta-models can use predictions
from other meta-models as sources. This enables hierarchical ensemble architectures:
- Level 0: Base models (PLS, RF, XGBoost, etc.)
- Level 1: First meta-models (stack on Level 0)
- Level 2: Second meta-models (stack on Level 0 + Level 1)
- Level 3: Third meta-models (stack on all previous levels)
The level is auto-detected by default but can be explicitly set via
stacking_config.level. Circular dependencies are automatically prevented.
Attributes:
model: Sklearn-compatible model to use as meta-learner.
source_models: Which models to use as sources ("all" or list of names).
use_proba: For classification, use probabilities instead of class predictions.
stacking_config: Configuration for OOF reconstruction and multi-level stacking.
selector: Optional custom source model selector.
finetune_space: Optional hyperparameter search space for Optuna finetuning.
Example:
>>> # Basic usage - stack all previous models
>>> MetaModel(model=Ridge())
>>>
>>> # Explicit source selection
>>> MetaModel(
... model=Ridge(),
... source_models=["PLS", "RandomForest", "XGBoost"]
... )
>>>
>>> # Multi-level stacking
>>> pipeline = [
... KFold(n_splits=5),
... PLSRegression(n_components=5), # Level 0
... RandomForestRegressor(), # Level 0
... {"model": MetaModel(model=Ridge())}, # Level 1 (auto-detected)
... {"model": MetaModel( # Level 2 (uses Level 0 + Level 1)
... model=Lasso(),
... stacking_config=StackingConfig(level=StackingLevel.LEVEL_2)
... )},
... ]
>>>
>>> # With probability features for classification
>>> MetaModel(
... model=LogisticRegression(),
... use_proba=True
... )
>>>
>>> # With Optuna hyperparameter tuning
>>> MetaModel(
... model=Ridge(),
... finetune_space={"model__alpha": (0.001, 100.0)}
... )
Notes:
- Source models must be from earlier steps in the pipeline
- In branched pipelines, only models from the current branch are used by default
- For sample_partitioner branches, stacking is done within each partition
- Multi-level stacking supports up to 3 levels by default (configurable)
- Circular dependencies are automatically detected and prevented
"""
def __init__(
self,
model: Any,
source_models: Union[str, List[str]] = "all",
use_proba: bool = False,
stacking_config: Optional[StackingConfig] = None,
selector: Optional[Any] = None,
name: Optional[str] = None,
finetune_space: Optional[Dict[str, Any]] = None,
):
"""Initialize MetaModel operator.
Args:
model: Sklearn-compatible model to use as meta-learner.
Must implement fit() and predict() methods.
source_models: Specifies which models to use as sources:
- "all": Use all previous models in the pipeline (default)
- List[str]: Explicit list of model names to use
use_proba: For classification tasks, if True use class probabilities
as features instead of class predictions. For binary classification,
uses probability of positive class. For multiclass, uses all
class probabilities. Default False.
stacking_config: Configuration for OOF reconstruction and multi-level
stacking. If None, uses default StackingConfig.
selector: Optional SourceModelSelector instance for custom selection.
If provided, overrides source_models parameter.
name: Optional name for the meta-model. If None, uses model class name.
finetune_space: Optional hyperparameter search space for Optuna.
Keys should use 'model__param' format for meta-learner params.
Example: {"model__alpha": (0.001, 100.0)}
Raises:
ValueError: If model doesn't have required fit/predict methods.
ValueError: If source_models is not "all" or a list of strings.
"""
# Validate model has required methods
if not hasattr(model, 'fit') or not hasattr(model, 'predict'):
raise ValueError(
f"Model must have fit() and predict() methods, "
f"got {type(model).__name__}"
)
# Validate source_models
if source_models != "all" and not isinstance(source_models, list):
raise ValueError(
f"source_models must be 'all' or a list of model names, "
f"got {type(source_models).__name__}"
)
if isinstance(source_models, list):
if not all(isinstance(s, str) for s in source_models):
raise ValueError("All source_models entries must be strings")
self.model = model
self.source_models = source_models
self.use_proba = use_proba
self.stacking_config = stacking_config or StackingConfig()
self.selector = selector
self._name = name
self.finetune_space = finetune_space
# Track detected level (will be set during execution)
self._detected_level: Optional[int] = None
[docs]
def get_controller_type(self) -> str:
"""Return the type of controller that handles this operator.
Returns:
str: "meta" to indicate MetaModelController should handle this.
"""
return "meta"
@property
def level(self) -> int:
"""Get the stacking level of this meta-model.
Returns the detected level if AUTO, otherwise the configured level.
Returns:
int: Stacking level (1, 2, or 3).
"""
if self.stacking_config.level == StackingLevel.AUTO:
return self._detected_level or 1
return self.stacking_config.level.value
[docs]
def get_finetune_params(self) -> Optional[Dict[str, Any]]:
"""Get finetuning parameters for Optuna optimization.
Returns the finetune_space with proper formatting for the Optuna manager.
Returns:
Dict with finetune configuration or None if no finetuning configured.
"""
if not self.finetune_space:
return None
return {
'model_params': self.finetune_space,
'n_trials': self.finetune_space.get('n_trials', 50),
'approach': self.finetune_space.get('approach', 'grouped'),
'eval_mode': self.finetune_space.get('eval_mode', 'best'),
'verbose': self.finetune_space.get('verbose', 0),
}
[docs]
def get_params(self, deep: bool = True) -> Dict[str, Any]:
"""Get parameters for this operator.
Parameters:
deep: If True, returns nested parameters from the model.
Returns:
dict: Parameter names mapped to their values.
"""
params = {
'model': self.model,
'source_models': self.source_models,
'use_proba': self.use_proba,
'stacking_config': self.stacking_config,
'selector': self.selector,
'name': self._name,
'finetune_space': self.finetune_space,
}
if deep and hasattr(self.model, 'get_params'):
model_params = self.model.get_params(deep=True)
for key, value in model_params.items():
params[f'model__{key}'] = value
return params
[docs]
def set_params(self, **params) -> 'MetaModel':
"""Set the parameters of this operator.
Parameters:
**params: Operator parameters. Supports nested parameters
for the model using 'model__param_name' syntax.
Returns:
self: MetaModel instance.
"""
# Separate model params from operator params
model_params = {}
operator_params = {}
for key, value in params.items():
if key.startswith('model__'):
model_params[key[7:]] = value # Strip 'model__' prefix
else:
operator_params[key] = value
# Set operator params
for key, value in operator_params.items():
if hasattr(self, key):
setattr(self, key, value)
elif key == 'name':
self._name = value
else:
raise ValueError(f"Unknown parameter: {key}")
# Set model params
if model_params and hasattr(self.model, 'set_params'):
self.model.set_params(**model_params)
return self
@property
def name(self) -> str:
"""Get the display name for this meta-model.
Returns:
str: User-provided name or 'MetaModel_<model_class>'.
"""
if self._name:
return self._name
return f"MetaModel_{type(self.model).__name__}"
[docs]
def __repr__(self) -> str:
"""Return string representation."""
source_str = (
self.source_models if isinstance(self.source_models, str)
else f"[{', '.join(self.source_models[:3])}{'...' if len(self.source_models) > 3 else ''}]"
)
return (
f"MetaModel(model={type(self.model).__name__}, "
f"source_models={source_str}, "
f"use_proba={self.use_proba})"
)