# Source code for nirs4all.data.loaders.numpy_loader

"""
NumPy file loader implementation.

This module provides the NumpyLoader class for loading NumPy array files,
including .npy (single array) and .npz (multiple arrays) formats.
"""

from pathlib import Path
from typing import Any, ClassVar, Dict, List, Optional, Tuple

import numpy as np
import pandas as pd

from .base import (
    FileLoadError,
    FileLoader,
    LoaderResult,
    register_loader,
)


@register_loader
class NumpyLoader(FileLoader):
    """Loader for NumPy array files.

    Supports:
        - Single array files (.npy)
        - Multi-array archives (.npz)

    Parameters:
        allow_pickle: Whether to allow loading pickled objects (default: False).
            Setting this to True may pose a security risk with untrusted files.
        key: For .npz files, the key of the array to load.
            If not specified, uses the first array.
        header_unit: Unit for generated headers ('cm-1', 'nm', 'index', etc.)

    Security Note:
        NumPy's allow_pickle=True can execute arbitrary code when loading
        untrusted files. Only enable this for files you trust completely.
    """

    supported_extensions: ClassVar[Tuple[str, ...]] = (".npy", ".npz")
    name: ClassVar[str] = "NumPy Loader"
    priority: ClassVar[int] = 40  # Higher priority than CSV for numpy files

    @classmethod
    def supports(cls, path: Path) -> bool:
        """Check if this loader supports the given file.

        The check is case-insensitive and based on the file suffix only.
        """
        return path.suffix.lower() in cls.supported_extensions

    def load(
        self,
        path: Path,
        allow_pickle: bool = False,
        key: Optional[str] = None,
        header_unit: str = "index",
        data_type: str = "x",
        **params: Any,
    ) -> LoaderResult:
        """Load data from a NumPy file.

        The array is coerced to 2D (1D arrays become a single column, arrays
        with more than two dimensions are flattened along all trailing axes),
        wrapped in a DataFrame, and rows containing any NA value are removed.
        Failures are never raised to the caller: they are recorded in
        ``report["error"]`` and a result without data is returned.

        Args:
            path: Path to the NumPy file.
            allow_pickle: Whether to allow loading pickled objects.
            key: For .npz files, the key of the array to load.
            header_unit: Unit type for generated headers. ``"index"`` yields
                headers ``"0", "1", ...``; any other value yields
                ``"feature_0", "feature_1", ...``.
            data_type: Type of data ('x', 'y', or 'metadata'). Accepted for
                interface compatibility; not used by this loader.
            **params: Additional parameters (ignored).

        Returns:
            LoaderResult with the loaded data as a DataFrame. ``na_mask`` is
            the boolean row mask computed before NA rows were removed.
        """
        report: Dict[str, Any] = {
            "file_path": str(path),
            "format": "npy" if path.suffix.lower() == ".npy" else "npz",
            "allow_pickle": allow_pickle,
            "key_used": key,
            "initial_shape": None,
            "final_shape": None,
            "na_handling": {
                "strategy": "remove",
                "na_detected": False,
                "nb_removed_rows": 0,
                "removed_rows_indices": [],
            },
            "warnings": [],
            "error": None,
        }

        try:
            if not path.exists():
                raise FileNotFoundError(f"File not found: {path}")

            # Load the array; on failure _load_array has already filled
            # report["error"].
            array = self._load_array(path, allow_pickle, key, report)
            if array is None:
                return LoaderResult(report=report, header_unit=header_unit)

            # A 0-d (scalar) array has no row axis; report it explicitly
            # instead of failing later with an opaque IndexError on shape[1].
            if array.ndim == 0:
                report["error"] = (
                    f"Cannot load 0-dimensional (scalar) array from {path}: "
                    f"expected an array with at least 1 dimension."
                )
                return LoaderResult(report=report, header_unit=header_unit)

            # Ensure 2D array
            if array.ndim == 1:
                array = array.reshape(-1, 1)
            elif array.ndim > 2:
                report["warnings"].append(
                    f"Array has {array.ndim} dimensions. Reshaping to 2D."
                )
                array = array.reshape(array.shape[0], -1)

            # Recorded after the 2D coercion, i.e. before NA removal.
            report["initial_shape"] = array.shape

            # Generate column headers
            n_cols = array.shape[1]
            if header_unit == "index":
                headers = [str(i) for i in range(n_cols)]
            else:
                headers = [f"feature_{i}" for i in range(n_cols)]

            # Convert to DataFrame
            try:
                data = pd.DataFrame(array, columns=headers)
            except Exception as e:
                report["error"] = f"Failed to convert array to DataFrame: {e}"
                return LoaderResult(report=report, header_unit=header_unit)

            # Handle NA values: drop every row that contains at least one NA.
            na_mask = data.isna().any(axis=1)
            report["na_handling"]["na_detected"] = bool(na_mask.any())

            if na_mask.any():
                report["na_handling"]["nb_removed_rows"] = int(na_mask.sum())
                report["na_handling"]["removed_rows_indices"] = (
                    data.index[na_mask].tolist()
                )
                data = data[~na_mask].copy()

            report["final_shape"] = data.shape

            return LoaderResult(
                data=data,
                report=report,
                na_mask=na_mask,
                headers=headers,
                header_unit=header_unit,
            )

        except FileNotFoundError as e:
            report["error"] = str(e)
            return LoaderResult(report=report, header_unit=header_unit)

        except Exception as e:
            # Boundary handler: the loader reports errors through the result
            # object rather than raising, so keep the traceback in the report.
            import traceback

            report["error"] = (
                f"Error loading NumPy file: {e}\n{traceback.format_exc()}"
            )
            return LoaderResult(report=report, header_unit=header_unit)

    @staticmethod
    def _format_load_error(exc: Exception, suffix: str) -> str:
        """Build the report message for a failed ``np.load`` of a *suffix* file."""
        # NumPy mentions "allow_pickle" in the message it raises when a file
        # needs pickling but allow_pickle is False.
        if "allow_pickle" in str(exc).lower():
            return (
                f"Cannot load pickled array: {exc}. "
                f"Set allow_pickle=True if you trust this file."
            )
        return f"Failed to load {suffix} file: {exc}"

    def _load_array(
        self,
        path: Path,
        allow_pickle: bool,
        key: Optional[str],
        report: Dict[str, Any],
    ) -> Optional[np.ndarray]:
        """Load array from .npy or .npz file.

        Args:
            path: Path to the file.
            allow_pickle: Whether to allow pickled objects.
            key: For .npz, the array key to load.
            report: Report dict to update. ``format_details`` is always
                written on success; ``key_used`` and ``warnings`` may be
                updated for .npz files; ``error`` is set on failure.

        Returns:
            Loaded numpy array or None on error.
        """
        suffix = path.suffix.lower()

        if suffix == ".npy":
            try:
                array = np.load(path, allow_pickle=allow_pickle)
                report["format_details"] = {
                    "type": "npy",
                    "dtype": str(array.dtype),
                }
                return array
            except Exception as e:
                report["error"] = self._format_load_error(e, suffix)
                return None

        if suffix == ".npz":
            try:
                # The archive object keeps the underlying file open; the
                # context manager closes it on every exit path, including
                # the early error returns below.
                with np.load(path, allow_pickle=allow_pickle) as npz_file:
                    available_keys = list(npz_file.keys())

                    report["format_details"] = {
                        "type": "npz",
                        "available_keys": available_keys,
                    }

                    if not available_keys:
                        report["error"] = f"No arrays found in .npz file: {path}"
                        return None

                    # Select the array to use
                    if key is not None:
                        if key not in available_keys:
                            report["error"] = (
                                f"Key '{key}' not found in .npz file. "
                                f"Available keys: {available_keys}"
                            )
                            return None
                        selected_key = key
                    else:
                        # Use first key
                        selected_key = available_keys[0]
                        if len(available_keys) > 1:
                            report["warnings"].append(
                                f"Multiple arrays in .npz file. Using '{selected_key}'. "
                                f"Specify 'key' parameter to choose a specific array."
                            )

                    report["key_used"] = selected_key
                    # Indexing reads the member into memory, so the array
                    # stays valid after the archive is closed.
                    array = npz_file[selected_key]

                report["format_details"]["selected_dtype"] = str(array.dtype)
                return array

            except Exception as e:
                report["error"] = self._format_load_error(e, suffix)
                return None

        report["error"] = f"Unsupported NumPy format: {suffix}"
        return None
def load_numpy(
    path,
    allow_pickle: bool = False,
    key: Optional[str] = None,
    header_unit: str = "index",
    **params,
):
    """Load a NumPy file and return the result as a plain tuple.

    Thin functional wrapper around ``NumpyLoader.load`` kept for backward
    compatibility: it builds a loader, delegates to it, and unpacks the
    returned result object.

    Args:
        path: Path to the NumPy file (converted with ``Path(path)``).
        allow_pickle: Whether to allow pickled objects.
        key: For .npz files, the array key to load.
        header_unit: Unit type for generated headers.
        **params: Additional parameters forwarded to the loader.

    Returns:
        Tuple of (DataFrame, report, na_mask, headers, header_unit).
    """
    outcome = NumpyLoader().load(
        Path(path),
        allow_pickle=allow_pickle,
        key=key,
        header_unit=header_unit,
        **params,
    )
    # Unpack in the order documented above.
    fields = ("data", "report", "na_mask", "headers", "header_unit")
    return tuple(getattr(outcome, field) for field in fields)