Source code for nirs4all.data.performance.lazy_loader

"""
Lazy loading support for datasets.

This module provides lazy loading capabilities to defer data loading
until it is actually needed, improving startup time and memory usage.

Phase 8 Implementation - Dataset Configuration Roadmap
Section 8.5: Performance Optimization - Lazy Loading
"""

from pathlib import Path
from typing import Any, Callable, Dict, List, Optional, Tuple, Union

import numpy as np

from nirs4all.core.logging import get_logger

logger = get_logger(__name__)


[docs] class LazyArray: """A lazy-loading array wrapper. Defers loading until array data is actually accessed. Supports numpy array interface for compatibility. Example: ```python # Create lazy array lazy = LazyArray( loader=lambda: np.load("large_file.npy"), shape=(10000, 500), dtype=np.float32 ) # Array not loaded yet print(lazy.shape) # (10000, 500) # Triggers loading on first access data = lazy[0:100] # Now loads the data # Explicit loading lazy.load() full_data = lazy.data ``` """ def __init__( self, loader: Callable[[], np.ndarray], shape: Optional[Tuple[int, ...]] = None, dtype: Optional[np.dtype] = None, source_path: Optional[str] = None, ): """Initialize lazy array. Args: loader: Function to load the array data. shape: Expected shape (for metadata before loading). dtype: Expected dtype (for metadata before loading). source_path: Path to source file (for cache key). """ self._loader = loader self._shape = shape self._dtype = dtype self._source_path = source_path self._data: Optional[np.ndarray] = None self._loaded = False @property def is_loaded(self) -> bool: """Check if data has been loaded.""" return self._loaded @property def shape(self) -> Optional[Tuple[int, ...]]: """Get array shape (may trigger load if unknown).""" if self._shape is not None: return self._shape self.load() return self._data.shape if self._data is not None else None @property def dtype(self) -> Optional[np.dtype]: """Get array dtype (may trigger load if unknown).""" if self._dtype is not None: return self._dtype self.load() return self._data.dtype if self._data is not None else None @property def ndim(self) -> int: """Get number of dimensions.""" shape = self.shape return len(shape) if shape else 0 @property def data(self) -> np.ndarray: """Get the loaded data (triggers load if needed).""" self.load() return self._data
[docs] def load(self) -> np.ndarray: """Load the data if not already loaded. Returns: The loaded numpy array. """ if not self._loaded: logger.debug(f"Loading lazy array from {self._source_path or 'memory'}") self._data = self._loader() self._loaded = True # Update shape/dtype from actual data if self._data is not None: self._shape = self._data.shape self._dtype = self._data.dtype return self._data
[docs] def unload(self) -> None: """Unload data to free memory.""" if self._loaded: logger.debug(f"Unloading lazy array from {self._source_path or 'memory'}") self._data = None self._loaded = False
[docs] def __len__(self) -> int: """Get length (first dimension).""" shape = self.shape return shape[0] if shape else 0
[docs] def __getitem__(self, key): """Get item from array (triggers load).""" return self.data[key]
[docs] def __array__(self, dtype=None): """Support numpy array conversion.""" data = self.data if dtype is not None: return np.asarray(data, dtype=dtype) return data
def __repr__(self) -> str: status = "loaded" if self._loaded else "not loaded" return f"LazyArray(shape={self._shape}, dtype={self._dtype}, {status})"
[docs] class LazyDataset: """A lazy-loading dataset wrapper. Wraps multiple data components (X, y, metadata) as lazy arrays that load on demand. Example: ```python # Create from loader functions dataset = LazyDataset( x_loader=lambda: load_features("X.csv"), y_loader=lambda: load_targets("Y.csv"), metadata_loader=lambda: load_metadata("M.csv") ) # Nothing loaded yet print(dataset.x_shape) # Returns cached shape if known # Triggers X loading only X_data = dataset.X # Load everything dataset.load_all() ``` """ def __init__( self, x_loader: Optional[Callable[[], np.ndarray]] = None, y_loader: Optional[Callable[[], np.ndarray]] = None, metadata_loader: Optional[Callable[[], Any]] = None, x_shape: Optional[Tuple[int, ...]] = None, y_shape: Optional[Tuple[int, ...]] = None, name: str = "dataset", ): """Initialize lazy dataset. Args: x_loader: Function to load features. y_loader: Function to load targets. metadata_loader: Function to load metadata. x_shape: Expected shape of X (for pre-load info). y_shape: Expected shape of y (for pre-load info). name: Dataset name. """ self.name = name self._x_loader = x_loader self._y_loader = y_loader self._metadata_loader = metadata_loader self._X: Optional[LazyArray] = None self._y: Optional[LazyArray] = None self._metadata: Optional[Any] = None self._metadata_loaded = False if x_loader: self._X = LazyArray(x_loader, shape=x_shape) if y_loader: self._y = LazyArray(y_loader, shape=y_shape) @property def X(self) -> Optional[np.ndarray]: """Get features (triggers load if needed).""" if self._X is None: return None return self._X.data @property def y(self) -> Optional[np.ndarray]: """Get targets (triggers load if needed).""" if self._y is None: return None return self._y.data @property def metadata(self) -> Optional[Any]: """Get metadata (triggers load if needed).""" if self._metadata_loader is not None and not self._metadata_loaded: logger.debug(f"Loading metadata for {self.name}") self._metadata = self._metadata_loader() self._metadata_loaded = True return self._metadata @property def x_shape(self) -> Optional[Tuple[int, ...]]: """Get X shape without loading.""" return self._X.shape if self._X else None @property def y_shape(self) -> Optional[Tuple[int, ...]]: """Get y shape without loading.""" return self._y.shape if self._y else None @property def n_samples(self) -> int: """Get number of samples.""" shape = self.x_shape return shape[0] if shape else 0 @property def n_features(self) -> int: """Get number of features.""" shape = self.x_shape return shape[1] if shape and len(shape) > 1 else 0 @property def is_x_loaded(self) -> bool: """Check if X is loaded.""" return self._X is not None and self._X.is_loaded @property def is_y_loaded(self) -> bool: """Check if y is loaded.""" return self._y is not None and self._y.is_loaded @property def is_metadata_loaded(self) -> bool: """Check if metadata is loaded.""" return self._metadata_loaded
[docs] def load_all(self) -> None: """Load all data components.""" if self._X: self._X.load() if self._y: self._y.load() if self._metadata_loader and not self._metadata_loaded: self._metadata = self._metadata_loader() self._metadata_loaded = True
[docs] def unload_all(self) -> None: """Unload all data to free memory.""" if self._X: self._X.unload() if self._y: self._y.unload() self._metadata = None self._metadata_loaded = False
def __repr__(self) -> str: x_status = "loaded" if self.is_x_loaded else "not loaded" y_status = "loaded" if self.is_y_loaded else "not loaded" return ( f"LazyDataset(name={self.name}, " f"X={self.x_shape} {x_status}, " f"y={self.y_shape} {y_status})" )
[docs] def create_lazy_dataset( train_x_path: Optional[str] = None, train_y_path: Optional[str] = None, train_group_path: Optional[str] = None, load_params: Optional[Dict[str, Any]] = None, ) -> LazyDataset: """Create a lazy dataset from file paths. Args: train_x_path: Path to training features. train_y_path: Path to training targets. train_group_path: Path to training metadata. load_params: Loading parameters. Returns: LazyDataset instance. """ from nirs4all.data.loaders.csv_loader import load_csv load_params = load_params or {} def make_x_loader(): if train_x_path is None: return None def loader(): result = load_csv(train_x_path, data_type='x', **load_params) return result[0].values if result[0] is not None else None return loader def make_y_loader(): if train_y_path is None: return None def loader(): result = load_csv(train_y_path, data_type='y', **load_params) return result[0].values if result[0] is not None else None return loader def make_metadata_loader(): if train_group_path is None: return None def loader(): result = load_csv(train_group_path, data_type='x', **load_params) return result[0] if result[0] is not None else None return loader return LazyDataset( x_loader=make_x_loader(), y_loader=make_y_loader(), metadata_loader=make_metadata_loader(), name=Path(train_x_path).stem if train_x_path else "dataset", )