Source code for nirs4all.pipeline.runner

"""Pipeline runner - Main entry point for pipeline execution.

This module provides the PipelineRunner class, which serves as the main interface
for executing ML pipelines on spectroscopic datasets. It delegates execution to
PipelineOrchestrator and provides prediction/explanation capabilities via
Predictor and Explainer classes.
"""
from pathlib import Path
from typing import Any, Dict, List, Optional, Tuple, Union

import os

import numpy as np

from nirs4all.core.logging import configure_logging
from nirs4all.data.config import DatasetConfigs
from nirs4all.data.dataset import SpectroDataset
from nirs4all.data.predictions import Predictions
from nirs4all.pipeline.config.pipeline_config import PipelineConfigs
from nirs4all.pipeline.config.context import ExecutionContext
from nirs4all.pipeline.execution.orchestrator import PipelineOrchestrator
from nirs4all.pipeline.predictor import Predictor
from nirs4all.pipeline.explainer import Explainer
from nirs4all.pipeline.retrainer import Retrainer, RetrainMode, StepMode, ExtractedPipeline


def _get_default_workspace_path() -> Path:
    """Get the default workspace path.

    Checks NIRS4ALL_WORKSPACE environment variable first, then falls back
    to ./workspace in the current working directory.

    Returns:
        Default workspace path.
    """
    env_workspace = os.environ.get("NIRS4ALL_WORKSPACE")
    if env_workspace:
        return Path(env_workspace)
    return Path.cwd() / "workspace"


[docs] def init_global_random_state(seed: Optional[int] = None): """Initialize global random state for reproducibility. Sets random seeds for numpy, Python's random module, TensorFlow, PyTorch, and sklearn to ensure reproducible results across runs. Args: seed: Random seed value. If None, uses default seed of 42 for TensorFlow and PyTorch. """ import numpy as np import random import os if seed is not None: np.random.seed(seed) random.seed(seed) os.environ['PYTHONHASHSEED'] = str(seed) try: import tensorflow as tf tf.random.set_seed(seed if seed is not None else 42) except ImportError: pass try: import torch torch.manual_seed(seed if seed is not None else 42) if torch.cuda.is_available(): torch.cuda.manual_seed_all(seed if seed is not None else 42) except ImportError: pass try: from sklearn.utils import check_random_state _ = check_random_state(seed) except ImportError: pass
[docs] class PipelineRunner: """Main pipeline execution interface. Orchestrates pipeline execution on datasets, providing a simplified interface for training, prediction, and explanation workflows. Delegates actual execution to PipelineOrchestrator, Predictor, and Explainer. Attributes: workspace_path (Path): Root workspace directory verbose (int): Verbosity level (0=quiet, 1=info, 2=debug, 3=trace) mode (str): Execution mode ('train', 'predict', 'explain') save_artifacts (bool): Whether to save binary artifacts (models, transformers) save_charts (bool): Whether to save charts and visual outputs enable_tab_reports (bool): Whether to generate tabular reports continue_on_error (bool): Whether to continue on step failures show_spinner (bool): Whether to show progress spinners keep_datasets (bool): Whether to keep raw/preprocessed data snapshots plots_visible (bool): Whether to display plots interactively orchestrator (PipelineOrchestrator): Underlying orchestrator for execution predictor (Predictor): Handler for prediction mode explainer (Explainer): Handler for explanation mode raw_data (Dict[str, np.ndarray]): Raw dataset snapshots (if keep_datasets=True) pp_data (Dict[str, Dict[str, np.ndarray]]): Preprocessed data snapshots Example: >>> # Training workflow >>> runner = PipelineRunner(workspace_path="./workspace", verbose=1) >>> pipeline = [{"preprocessing": StandardScaler()}, {"model": SVC()}] >>> X, y = load_data() >>> predictions, dataset_preds = runner.run(pipeline, (X, y)) >>> # Prediction workflow >>> runner = PipelineRunner(mode="predict") >>> y_pred, preds = runner.predict(best_model, X_new) >>> # Explanation workflow >>> runner = PipelineRunner(mode="explain") >>> shap_results, out_dir = runner.explain(best_model, X_test) """ def __init__( self, workspace_path: Optional[Union[str, Path]] = None, verbose: int = 0, mode: str = "train", save_artifacts: bool = True, save_charts: bool = True, enable_tab_reports: bool = True, continue_on_error: bool = False, show_spinner: bool = True, keep_datasets: bool = True, plots_visible: bool = False, random_state: Optional[int] = None, # Logging configuration log_file: bool = True, log_format: str = "pretty", use_unicode: Optional[bool] = None, use_colors: Optional[bool] = None, show_progress_bar: bool = True, json_output: bool = False, ): """Initialize pipeline runner. Args: workspace_path: Workspace root directory. Defaults to './workspace' verbose: Verbosity level (0=quiet, 1=info, 2=debug, 3=trace) mode: Execution mode ('train', 'predict', 'explain') save_artifacts: Whether to save binary artifacts (models, transformers) save_charts: Whether to save charts and visual outputs enable_tab_reports: Whether to generate tabular reports continue_on_error: Whether to continue on step failures show_spinner: Whether to show progress spinners keep_datasets: Whether to keep data snapshots (raw/preprocessed) plots_visible: Whether to display plots interactively random_state: Random seed for reproducibility log_file: Whether to write logs to workspace/logs/ directory log_format: Output format: "pretty" (default), "minimal", or "json" use_unicode: Use Unicode symbols (auto-detected if None). Set to False for HPC/cluster environments without Unicode support. use_colors: Use ANSI colors (auto-detected if None). Set to False to disable colored output. show_progress_bar: Whether to show TTY-aware progress bars json_output: Also write JSON Lines log file for machine parsing """ if random_state is not None: init_global_random_state(random_state) if workspace_path is None: workspace_path = _get_default_workspace_path() self.workspace_path = Path(workspace_path) self.verbose = verbose self.mode = mode self.save_artifacts = save_artifacts self.save_charts = save_charts self.enable_tab_reports = enable_tab_reports self.continue_on_error = continue_on_error self.show_spinner = show_spinner self.keep_datasets = keep_datasets self.plots_visible = plots_visible # Store logging configuration self.log_file = log_file self.log_format = log_format self.use_unicode = use_unicode self.use_colors = use_colors self.show_progress_bar = show_progress_bar self.json_output = json_output # Configure logging system log_dir = self.workspace_path / "logs" if log_file else None configure_logging( verbose=verbose, log_file=log_file, log_dir=log_dir, log_format=log_format, use_unicode=use_unicode, use_colors=use_colors, show_progress=True, show_progress_bar=show_progress_bar, json_output=json_output, ) # Create orchestrator self.orchestrator = PipelineOrchestrator( workspace_path=self.workspace_path, verbose=verbose, mode=mode, save_artifacts=save_artifacts, save_charts=save_charts, enable_tab_reports=enable_tab_reports, continue_on_error=continue_on_error, show_spinner=show_spinner, keep_datasets=keep_datasets, plots_visible=plots_visible ) # Create predictor and explainer self.predictor = Predictor(self) self.explainer = Explainer(self) self.retrainer = Retrainer(self) # Expose orchestrator state for convenience self.raw_data = self.orchestrator.raw_data self.pp_data = self.orchestrator.pp_data self._figure_refs = self.orchestrator._figure_refs # Model capture support for explainer self._capture_model: bool = False # Last run aggregate settings (for visualization integration) self._last_aggregate_column: Optional[str] = None self._last_aggregate_method: Optional[str] = None self._last_aggregate_exclude_outliers: bool = False # Execution state (synchronized from executor during execution) self.step_number: int = 0 self.substep_number: int = -1 self.operation_count: int = 0 # Runtime components (set by executor during execution) self.saver: Any = None # SimulationSaver self.manifest_manager: Any = None # ManifestManager self.artifact_loader: Any = None # ArtifactLoader for predict/explain modes self.pipeline_uid: Optional[str] = None # Current pipeline UID self.target_model: Optional[Dict] = None # Target model for predict/explain modes self.last_execution_trace: Any = None # ExecutionTrace from last run # Library for template management self._library: Any = None # PipelineLibrary (lazy)
[docs] def run( self, pipeline: Union[PipelineConfigs, List[Any], Dict, str], dataset: Union[DatasetConfigs, SpectroDataset, List[SpectroDataset], np.ndarray, Tuple[np.ndarray, ...], Dict, List[Dict], str, List[str]], pipeline_name: str = "", dataset_name: str = "dataset", max_generation_count: int = 10000 ) -> Tuple[Predictions, Dict[str, Any]]: """Execute pipeline on dataset(s). Main entry point for training workflows. Executes one or more pipeline configurations on one or more datasets, tracking predictions and artifacts. Args: pipeline: Pipeline definition (PipelineConfigs, list of steps, dict, or path) dataset: Dataset definition (see DatasetConfigs for supported formats) pipeline_name: Optional pipeline name for identification dataset_name: Name for array-based datasets max_generation_count: Max pipeline combinations to generate Returns: Tuple of (run_predictions, datasets_predictions) """ run_predictions, dataset_predictions = self.orchestrator.execute( pipeline=pipeline, dataset=dataset, pipeline_name=pipeline_name, dataset_name=dataset_name, max_generation_count=max_generation_count, artifact_loader=self.artifact_loader, target_model=self.target_model, explainer=self.explainer ) # Sync state if self.keep_datasets: self.raw_data = self.orchestrator.raw_data self.pp_data = self.orchestrator.pp_data self._figure_refs = self.orchestrator._figure_refs # Sync runtime components from last executed pipeline if self.orchestrator.last_saver is not None: self.saver = self.orchestrator.last_saver if self.orchestrator.last_pipeline_uid is not None: self.pipeline_uid = self.orchestrator.last_pipeline_uid if self.orchestrator.last_manifest_manager is not None: self.manifest_manager = self.orchestrator.last_manifest_manager # Sync execution state from last executor (via orchestrator) # Note: These values come from the last executed pipeline if hasattr(self.orchestrator, 'last_executor'): if self.orchestrator.last_executor: self.step_number = self.orchestrator.last_executor.step_number self.substep_number = self.orchestrator.last_executor.substep_number self.operation_count = self.orchestrator.last_executor.operation_count # Sync aggregate column from last dataset for visualization integration if hasattr(self.orchestrator, 'last_aggregate_column'): self._last_aggregate_column = self.orchestrator.last_aggregate_column if hasattr(self.orchestrator, 'last_aggregate_method'): self._last_aggregate_method = self.orchestrator.last_aggregate_method if hasattr(self.orchestrator, 'last_aggregate_exclude_outliers'): self._last_aggregate_exclude_outliers = self.orchestrator.last_aggregate_exclude_outliers # Sync execution trace for post-run visualization if hasattr(self.orchestrator, 'last_execution_trace'): self.last_execution_trace = self.orchestrator.last_execution_trace return run_predictions, dataset_predictions
[docs] def predict( self, prediction_obj: Union[Dict[str, Any], str], dataset: Union[DatasetConfigs, SpectroDataset, List[SpectroDataset], np.ndarray, Tuple[np.ndarray, ...], Dict, List[Dict], str, List[str]], dataset_name: str = "prediction_dataset", all_predictions: bool = False, verbose: int = 0 ) -> Union[Tuple[np.ndarray, Predictions], Tuple[Dict[str, Any], Predictions]]: """Run prediction using a saved model on new dataset. Delegates to Predictor class for actual execution. Args: prediction_obj: Model identifier (dict with config_path or prediction ID) dataset: New dataset to predict on dataset_name: Name for the dataset all_predictions: If True, return all predictions; if False, return single best verbose: Verbosity level Returns: If all_predictions=False: (y_pred, predictions) If all_predictions=True: (predictions_dict, predictions) """ return self.predictor.predict( prediction_obj=prediction_obj, dataset=dataset, dataset_name=dataset_name, all_predictions=all_predictions, verbose=verbose )
[docs] def explain( self, prediction_obj: Union[Dict[str, Any], str], dataset: Union[DatasetConfigs, SpectroDataset, np.ndarray, Tuple[np.ndarray, ...], Dict, List[Dict], str, List[str]], dataset_name: str = "explain_dataset", shap_params: Optional[Dict[str, Any]] = None, verbose: int = 0, plots_visible: bool = True ) -> Tuple[Dict[str, Any], str]: """Generate SHAP explanations for a saved model. Delegates to Explainer class for actual execution. Args: prediction_obj: Model identifier (dict with config_path or prediction ID) dataset: Dataset to explain on dataset_name: Name for the dataset shap_params: SHAP configuration parameters verbose: Verbosity level plots_visible: Whether to display plots interactively Returns: Tuple of (shap_results_dict, output_directory_path) """ return self.explainer.explain( prediction_obj=prediction_obj, dataset=dataset, dataset_name=dataset_name, shap_params=shap_params, verbose=verbose, plots_visible=plots_visible )
[docs] def export_best_for_dataset( self, dataset_name: str, mode: str = "predictions" ) -> Optional[Path]: """Export best results for a dataset to exports/ folder. Args: dataset_name: Name of the dataset to export mode: Export mode ('predictions' or other) Returns: Path to exported file, or None if export failed """ saver = self.saver or getattr(self.predictor, 'saver', None) or getattr(self.explainer, 'saver', None) if saver is None: raise ValueError("No saver configured. Run a pipeline first.") return saver.export_best_for_dataset( dataset_name, self.workspace_path, self.orchestrator.runs_dir, mode )
@property def current_run_dir(self) -> Optional[Path]: """Get current run directory. Returns: Path to current run directory, or None if not set """ if self.saver and hasattr(self.saver, 'base_path'): return self.saver.base_path # Fallback for predictor/explainer modes saver = getattr(self.predictor, 'saver', None) or getattr(self.explainer, 'saver', None) return getattr(saver, 'base_path', None) if saver else None @property def runs_dir(self) -> Path: """Get runs directory. Returns: Path to runs directory in workspace """ return self.orchestrator.runs_dir @property def last_aggregate(self) -> Optional[str]: """Get aggregate column from the last executed dataset. Returns the aggregation setting from the last dataset processed by run(). This can be used to create a PredictionAnalyzer with matching defaults. Returns: Aggregate column name ('y' for y-based aggregation, column name for metadata-based aggregation, or None if no aggregation was set). Example: >>> runner = PipelineRunner() >>> predictions, _ = runner.run(pipeline, DatasetConfigs(path, aggregate='sample_id')) >>> # Create analyzer with same aggregate setting >>> analyzer = PredictionAnalyzer(predictions, default_aggregate=runner.last_aggregate) """ return self._last_aggregate_column @property def last_aggregate_method(self) -> Optional[str]: """Get aggregate method from the last executed dataset. Returns: Aggregate method ('mean', 'median', 'vote') or None for default. """ return self._last_aggregate_method @property def last_aggregate_exclude_outliers(self) -> bool: """Get aggregate exclude_outliers setting from the last executed dataset. Returns: True if T² outlier exclusion was enabled, False otherwise. """ return self._last_aggregate_exclude_outliers @property def library(self) -> "PipelineLibrary": """Get pipeline library for template management. Returns: PipelineLibrary instance for managing pipeline templates """ if self._library is None: from nirs4all.pipeline.storage.library import PipelineLibrary self._library = PipelineLibrary(self.workspace_path) return self._library
[docs] def next_op(self) -> int: """Get the next operation ID (for controller compatibility). Returns: Next operation counter value """ self.operation_count += 1 return self.operation_count
[docs] def export( self, source: Union[Dict[str, Any], str, Path], output_path: Union[str, Path], format: str = "n4a", include_metadata: bool = True, compress: bool = True ) -> Path: """Export a trained pipeline to a standalone bundle. Creates a self-contained prediction bundle that can be used for deployment, sharing, or archival without requiring the original workspace or full nirs4all installation. Supported formats: - 'n4a': Full bundle (ZIP archive with artifacts and metadata) - 'n4a.py': Portable Python script with embedded artifacts Phase 6 Feature: This method enables exporting trained pipelines as standalone bundles that can be loaded and used for prediction without the original workspace structure. Args: source: Prediction source to export. Can be: - prediction dict: From a previous run's Predictions object - folder path: Path to a pipeline directory - Run object: Best prediction from a Run output_path: Path for the output bundle file format: Bundle format ('n4a' or 'n4a.py') include_metadata: Whether to include full metadata in bundle compress: Whether to compress artifacts (for .n4a format) Returns: Path to the created bundle file Raises: ValueError: If format is not supported FileNotFoundError: If source cannot be resolved Example: >>> runner = PipelineRunner() >>> predictions, _ = runner.run(pipeline, dataset) >>> best_pred = predictions.top(n=1)[0] >>> >>> # Export to .n4a bundle >>> runner.export(best_pred, "exports/wheat_model.n4a") >>> >>> # Export to portable Python script >>> runner.export(best_pred, "exports/wheat_model.n4a.py", format='n4a.py') >>> >>> # Later, predict from bundle >>> y_pred, _ = runner.predict("exports/wheat_model.n4a", X_new) """ from nirs4all.pipeline.bundle import BundleGenerator generator = BundleGenerator( workspace_path=self.workspace_path, verbose=self.verbose ) return generator.export( source=source, output_path=output_path, format=format, include_metadata=include_metadata, compress=compress )
[docs] def export_model( self, source: Union[Dict[str, Any], str, Path], output_path: Union[str, Path], format: Optional[str] = None, fold: Optional[int] = None ) -> Path: """Export only the model artifact from a trained pipeline. Unlike `export()` which creates a full bundle with all preprocessing artifacts and metadata, this method exports just the model binary. This is useful when you want a lightweight model file that can be loaded directly into other pipelines or used with external tools. The output format is determined by the file extension or can be specified explicitly. The model can then be reloaded using: - Direct path in pipeline config: {"model": "path/to/model.joblib"} - As prediction source: runner.predict("path/to/model.joblib", data) Args: source: Prediction source to export from. Can be: - prediction dict: From a previous run's Predictions object - folder path: Path to a pipeline directory - bundle path: Path to a .n4a bundle output_path: Path for the output model file. Extension determines format: .joblib, .pkl, .h5, .keras, .pt format: Optional explicit format ('joblib', 'pickle', 'keras_h5'). If None, determined from output_path extension. fold: Optional fold index to export. If None, exports fold 0 or the primary model artifact. Returns: Path to the created model file Raises: ValueError: If no model artifact found FileNotFoundError: If source cannot be resolved Example: >>> runner = PipelineRunner() >>> predictions, _ = runner.run(pipeline, dataset) >>> best_pred = predictions.top(n=1)[0] >>> >>> # Export just the model >>> runner.export_model(best_pred, "exports/pls_model.joblib") >>> >>> # Later, use in new pipeline >>> new_pipeline = [ ... MinMaxScaler(), ... {"model": "exports/pls_model.joblib", "name": "pretrained"} ... ] """ from nirs4all.pipeline.resolver import PredictionResolver from nirs4all.pipeline.storage.artifacts.artifact_persistence import to_bytes output_path = Path(output_path) output_path.parent.mkdir(parents=True, exist_ok=True) # Resolve the prediction source resolver = PredictionResolver( workspace_path=self.workspace_path, runs_dir=self.runs_dir ) resolved = resolver.resolve(source, verbose=self.verbose) if resolved.model_step_index is None: raise ValueError("No model step found in the resolved prediction") # Get the model artifact if resolved.artifact_provider is None: raise ValueError("No artifact provider available for this source") artifacts = resolved.artifact_provider.get_artifacts_for_step( resolved.model_step_index ) if not artifacts: raise ValueError( f"No model artifacts found at step {resolved.model_step_index}" ) # Select the fold if fold is not None: # Find artifact for specific fold model = None for artifact_id, artifact in artifacts: if f":{fold}" in str(artifact_id) or artifact_id.endswith(f"_{fold}"): model = artifact break if model is None: raise ValueError(f"No artifact found for fold {fold}") else: # Use first artifact (fold 0 or primary) _, model = artifacts[0] # Determine format from extension if not specified if format is None: ext = output_path.suffix.lower() format_map = { '.joblib': 'joblib', '.pkl': 'cloudpickle', '.pickle': 'cloudpickle', '.h5': 'keras_h5', '.hdf5': 'keras_h5', '.keras': 'tensorflow_keras', '.pt': 'pytorch_state_dict', '.pth': 'pytorch_state_dict', } format = format_map.get(ext, 'joblib') # Serialize and write data, actual_format = to_bytes(model, format) with open(output_path, 'wb') as f: f.write(data) if self.verbose > 0: print(f"Exported model to {output_path} (format: {actual_format})") return output_path
[docs] def retrain( self, source: Union[Dict[str, Any], str, Path], dataset: Union[DatasetConfigs, SpectroDataset, np.ndarray, Tuple[np.ndarray, ...], Dict, List[Dict], str, List[str]], mode: str = "full", dataset_name: str = "retrain_dataset", new_model: Optional[Any] = None, epochs: Optional[int] = None, step_modes: Optional[List[StepMode]] = None, verbose: int = 0, **kwargs ) -> Tuple[Predictions, Dict[str, Any]]: """Retrain a pipeline on new data. Enables retraining trained pipelines with various modes: - full: Train from scratch with same pipeline structure - transfer: Use existing preprocessing artifacts, train new model - finetune: Continue training existing model with new data Phase 7 Feature: This method enables retraining pipelines without having to reconstruct the pipeline configuration manually. It uses the resolved prediction source (from Phase 3/4) to extract the pipeline structure and optionally reuse preprocessing artifacts. Args: source: Prediction source to retrain from. Can be: - prediction dict: From a previous run's Predictions object - folder path: Path to a pipeline directory - Run object: Best prediction from a Run - artifact_id: Direct artifact reference - bundle: Exported prediction bundle (.n4a) dataset: New dataset to train on. Supports same formats as run() mode: Retrain mode: - 'full': Train everything from scratch (same pipeline structure) - 'transfer': Use existing preprocessing, train new model - 'finetune': Continue training existing model dataset_name: Name for the dataset if array-based new_model: Optional new model for transfer mode (replaces original) epochs: Optional epochs for fine-tuning step_modes: Optional per-step mode overrides for fine-grained control verbose: Verbosity level **kwargs: Additional parameters: - learning_rate: Learning rate for fine-tuning - freeze_layers: List of layers to freeze during fine-tuning Returns: Tuple of (run_predictions, datasets_predictions) Raises: ValueError: If mode is invalid or source cannot be resolved FileNotFoundError: If source references files that don't exist Example: >>> runner = PipelineRunner() >>> predictions, _ = runner.run(pipeline, dataset) >>> best_pred = predictions.top(n=1)[0] >>> >>> # Full retrain on new data >>> new_preds, _ = runner.retrain(best_pred, new_data, mode='full') >>> >>> # Transfer: use preprocessing from old model, train new one >>> new_preds, _ = runner.retrain( ... best_pred, new_data, mode='transfer', ... new_model=XGBRegressor() ... ) >>> >>> # Finetune: continue training existing model >>> new_preds, _ = runner.retrain( ... best_pred, new_data, mode='finetune', epochs=10 ... ) >>> >>> # Fine-grained control: specify per-step modes >>> from nirs4all.pipeline import StepMode >>> step_modes = [ ... StepMode(step_index=1, mode='predict'), # Use existing ... StepMode(step_index=2, mode='train'), # Retrain ... ] >>> new_preds, _ = runner.retrain( ... best_pred, new_data, mode='full', step_modes=step_modes ... ) """ return self.retrainer.retrain( source=source, dataset=dataset, mode=mode, dataset_name=dataset_name, new_model=new_model, epochs=epochs, step_modes=step_modes, verbose=verbose, **kwargs )
[docs] def extract( self, source: Union[Dict[str, Any], str, Path] ) -> ExtractedPipeline: """Extract a trained pipeline for inspection or modification. Loads a trained pipeline from a prediction source and returns an ExtractedPipeline object that can be inspected, modified, and then executed with runner.run(). Phase 7 Feature: This method enables extracting and modifying trained pipelines without retraining from scratch. Args: source: Prediction source to extract. Can be: - prediction dict: From a previous run's Predictions object - folder path: Path to a pipeline directory - Run object: Best prediction from a Run - artifact_id: Direct artifact reference - bundle: Exported prediction bundle (.n4a) Returns: ExtractedPipeline object with: - steps: List of pipeline steps (can be modified) - trace: Original execution trace (read-only) - artifact_provider: Provider for original artifacts - model_step_index: Index of the model step - preprocessing_chain: Summary of preprocessing Example: >>> runner = PipelineRunner() >>> predictions, _ = runner.run(pipeline, dataset) >>> best_pred = predictions.top(n=1)[0] >>> >>> # Extract for inspection >>> extracted = runner.extract(best_pred) >>> print(f"Steps: {len(extracted.steps)}") >>> print(f"Preprocessing: {extracted.preprocessing_chain}") >>> >>> # Modify and run >>> from sklearn.ensemble import RandomForestRegressor >>> extracted.set_model(RandomForestRegressor()) >>> new_preds, _ = runner.run(extracted.steps, new_data) """ return self.retrainer.extract(source, verbose=self.verbose)