Source code for nirs4all.operators.models.sklearn.dipls

"""Dynamic PLS (DiPLS) regressor for nirs4all.

See pls.py for full documentation and usage examples.
"""
import numpy as np
from sklearn.base import BaseEstimator, RegressorMixin

def _check_trendfitter_available():
    """Report whether the optional ``trendfitter`` dependency can be imported.

    Returns
    -------
    bool
        ``True`` when ``import trendfitter`` succeeds, ``False`` when it
        raises ``ImportError``.
    """
    try:
        import trendfitter  # noqa: F401 -- imported only to probe availability
    except ImportError:
        return False
    return True


[docs]
class DiPLS(BaseEstimator, RegressorMixin):
    """Dynamic PLS (DiPLS) regressor.

    DiPLS extends PLS to handle dynamic systems by including time-lagged
    variables. It uses the `trendfitter` package.

    Parameters
    ----------
    n_components : int, default=5
        Number of latent variables to extract.
    lags : int, default=1
        Number of time lags to consider (s parameter in DiPLS).
    cv_splits : int, default=7
        Number of cross-validation splits for automatic component selection.
    tol : float, default=1e-8
        Convergence tolerance.
    max_iter : int, default=1000
        Maximum number of iterations.

    Attributes
    ----------
    n_features_in_ : int
        Number of features seen during fit.
    n_components_ : int
        Actual number of components used.

    Examples
    --------
    >>> from nirs4all.operators.models.sklearn.dipls import DiPLS
    >>> import numpy as np
    >>> X = np.random.randn(100, 50)
    >>> y = np.random.randn(100)
    >>> model = DiPLS(n_components=5, lags=2).fit(X, y)
    >>> predictions = model.predict(X)

    Notes
    -----
    Requires the `trendfitter` package: ``pip install trendfitter``

    DiPLS is particularly useful for:

    - Process monitoring with temporal dependencies
    - NIR data collected over time
    - Batch process analytics

    See Also
    --------
    sklearn.cross_decomposition.PLSRegression : Standard PLS without dynamics.

    References
    ----------
    .. [1] Dong, Y., & Qin, S. J. (2018). A novel dynamic PLS soft sensor
           based on moving-window modeling. Chemical Engineering Research
           and Design, 131, 509-519.
    """

    # Explicitly declare estimator type for sklearn compatibility (e.g., StackingRegressor)
    _estimator_type = "regressor"

    def __init__(
        self,
        n_components: int = 5,
        lags: int = 1,
        cv_splits: int = 7,
        tol: float = 1e-8,
        max_iter: int = 1000,
    ):
        """Initialize DiPLS regressor.

        Parameters
        ----------
        n_components : int, default=5
            Number of latent variables to extract.
        lags : int, default=1
            Number of time lags to consider.
        cv_splits : int, default=7
            Number of cross-validation splits.
        tol : float, default=1e-8
            Convergence tolerance.
        max_iter : int, default=1000
            Maximum number of iterations.
        """
        # Hyperparameters are stored untouched; they are only consumed in fit().
        self.n_components = n_components
        self.lags = lags
        self.cv_splits = cv_splits
        self.tol = tol
        self.max_iter = max_iter

    def fit(self, X, y):
        """Fit the DiPLS model.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Training data (time-ordered measurements).
        y : array-like of shape (n_samples,) or (n_samples, n_targets)
            Target values.

        Returns
        -------
        self : DiPLS
            Fitted estimator.

        Raises
        ------
        ImportError
            If trendfitter package is not installed.
        ValueError
            If ``X`` is not 2-dimensional or ``X`` and ``y`` have a different
            number of samples.
        """
        if not _check_trendfitter_available():
            raise ImportError(
                "trendfitter package is required for DiPLS. "
                "Install it with: pip install trendfitter"
            )

        from trendfitter.models import DiPLS as TFDiPLS

        X = np.asarray(X)
        y = np.asarray(y)

        if X.ndim != 2:
            raise ValueError(
                f"X must be 2-dimensional (n_samples, n_features); "
                f"got an array with {X.ndim} dimension(s)."
            )

        # The backend is always handed a column-shaped target.
        if y.ndim == 1:
            y = y.reshape(-1, 1)

        if y.ndim == 0 or y.shape[0] != X.shape[0]:
            raise ValueError(
                f"X and y have inconsistent numbers of samples: "
                f"X has {X.shape[0]}, y has shape {y.shape}."
            )

        # Build and fit on a local first so that a failure inside the backend
        # does not leave this estimator looking fitted.
        model = TFDiPLS(
            cv_splits_number=self.cv_splits,
            tol=self.tol,
            loop_limit=self.max_iter,
        )
        model.fit(
            X, y,
            latent_variables=self.n_components,
            s=self.lags,
        )

        self._model = model
        self.n_features_in_ = X.shape[1]
        self.n_components_ = self.n_components

        return self

    def predict(self, X):
        """Predict using the DiPLS model.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Samples to predict.

        Returns
        -------
        y_pred : ndarray of shape (n_samples,) or (n_samples, n_targets)
            Predicted values.

        Raises
        ------
        sklearn.exceptions.NotFittedError
            If the estimator has not been fitted yet.
        ValueError
            If the underlying model returns no predictions for a non-empty
            ``X`` (there is then no value to pad with).

        Notes
        -----
        DiPLS uses Hankelization which may produce fewer predictions than
        input samples. This implementation pads the beginning with the first
        predicted value to maintain compatibility with sklearn cross-validation.
        """
        from sklearn.utils.validation import check_is_fitted

        # NotFittedError derives from both ValueError and AttributeError, so
        # callers that caught the former bare AttributeError keep working.
        check_is_fitted(self, "_model")

        X = np.asarray(X)
        n_samples = X.shape[0]

        y_pred = np.asarray(self._model.predict(X))
        n_pred = y_pred.shape[0]

        if n_pred < n_samples:
            if n_pred == 0:
                raise ValueError(
                    f"DiPLS produced no predictions for {n_samples} input "
                    f"sample(s); more samples are needed for lags={self.lags}."
                )
            # Repeat the first prediction at the front so the output length
            # matches the input; slicing with [:1] keeps this valid for both
            # 1-D and 2-D prediction arrays.
            n_pad = n_samples - n_pred
            y_pred = np.concatenate(
                [np.repeat(y_pred[:1], n_pad, axis=0), y_pred], axis=0
            )

        # Flatten if single target
        if y_pred.ndim > 1 and y_pred.shape[1] == 1:
            y_pred = y_pred.ravel()

        return y_pred

    def get_params(self, deep=True):
        """Get parameters for this estimator.

        Parameters
        ----------
        deep : bool, default=True
            Accepted for scikit-learn API compatibility. It has no effect
            here: the returned dict always holds the five constructor
            parameters.

        Returns
        -------
        params : dict
            Parameter names mapped to their values.
        """
        return {
            'n_components': self.n_components,
            'lags': self.lags,
            'cv_splits': self.cv_splits,
            'tol': self.tol,
            'max_iter': self.max_iter,
        }

    def set_params(self, **params):
        """Set the parameters of this estimator.

        Parameters
        ----------
        **params : dict
            Estimator parameters. Every key must be one of the names
            returned by :meth:`get_params`.

        Returns
        -------
        self : DiPLS
            Estimator instance.

        Raises
        ------
        ValueError
            If any key is not a valid parameter name. Nothing is modified
            in that case.
        """
        if not params:
            return self

        valid = self.get_params(deep=False)
        # Validate everything up front so a typo cannot leave the estimator
        # partially updated or create a stray attribute.
        unknown = [key for key in params if key not in valid]
        if unknown:
            raise ValueError(
                f"Invalid parameter(s) {unknown!r} for estimator "
                f"{type(self).__name__}. "
                f"Valid parameters are: {sorted(valid)!r}."
            )

        for key, value in params.items():
            setattr(self, key, value)
        return self