Source code for nirs4all.data.loaders

"""
File loaders module for nirs4all.

This module provides a pluggable file loading system supporting multiple file formats
with automatic format detection and configurable loading parameters.

Supported Formats:
    - CSV (.csv, .csv.gz, .csv.zip) - via CSVLoader
    - NumPy (.npy, .npz) - via NumpyLoader
    - Parquet (.parquet, .pq) - via ParquetLoader (requires pyarrow or fastparquet)
    - Excel (.xlsx, .xls) - via ExcelLoader (requires openpyxl/xlrd)
    - MATLAB (.mat) - via MatlabLoader (requires scipy, optionally h5py)
    - Archives (.tar, .tar.gz, .tgz, .zip) - via TarLoader, EnhancedZipLoader

Usage:
    >>> from nirs4all.data.loaders import LoaderRegistry, load_file
    >>>
    >>> # Using the registry
    >>> registry = LoaderRegistry.get_instance()
    >>> result = registry.load("data.csv", delimiter=",")
    >>>
    >>> # Or using the convenience function
    >>> data, report, na_mask, headers, unit = load_file("data.csv")
    >>>
    >>> # Direct loader usage
    >>> from nirs4all.data.loaders import CSVLoader
    >>> loader = CSVLoader()
    >>> result = loader.load(Path("data.csv"))

Adding Custom Loaders:
    >>> from nirs4all.data.loaders import FileLoader, register_loader
    >>>
    >>> @register_loader
    ... class MyLoader(FileLoader):
    ...     supported_extensions = (".myext",)
    ...     name = "My Loader"
    ...
    ...     @classmethod
    ...     def supports(cls, path):
    ...         return path.suffix.lower() == ".myext"
    ...
    ...     def load(self, path, **params):
    ...         # Load implementation
    ...         pass

Backward Compatibility:
    The legacy load_csv function is still available for existing code:
    >>> from nirs4all.data.loaders.csv_loader import load_csv
"""

from pathlib import Path
from typing import Any, Dict, List, Optional, Tuple, Type, Union

import pandas as pd

# Base classes and utilities
from .base import (
    ArchiveHandler,
    FileLoadError,
    FileLoader,
    FormatNotSupportedError,
    LoaderError,
    LoaderRegistry,
    LoaderResult,
    register_loader,
)

# Format-specific loaders
# Note: Importing these modules automatically registers them via @register_loader
from .csv_loader_new import CSVLoader, load_csv as load_csv_new
from .numpy_loader import NumpyLoader, load_numpy
from .parquet_loader import ParquetLoader, load_parquet
from .excel_loader import ExcelLoader, load_excel
from .matlab_loader import MatlabLoader, load_matlab
from .archive_loader import TarLoader, EnhancedZipLoader, list_archive_members

# Legacy imports for backward compatibility
from .csv_loader import load_csv



[docs]
def load_file(
    path: Union[str, Path],
    **params: Any,
) -> Tuple[Optional[pd.DataFrame], Dict[str, Any], Optional[pd.Series], List[str], str]:
    """Load a data file, picking the loader from the shared registry.

    Main entry point for loading files: the call is delegated to the
    ``LoaderRegistry`` singleton, and the fields of the result object it
    returns are unpacked into a plain tuple.

    Args:
        path: Path to the file to load.
        **params: Format-specific loading parameters, forwarded unchanged to
            the registry. Common parameters include:
            - header_unit: Unit for headers ('cm-1', 'nm', 'text', etc.)
            - data_type: Type of data ('x', 'y', or 'metadata')
            - delimiter: CSV delimiter
            - sheet_name: Excel sheet to load
            - variable: MATLAB variable name
            - member: Archive member to extract

    Returns:
        A 5-tuple, in this order:
            - DataFrame with loaded data (or None on error)
            - Report dictionary with loading metadata
            - NA mask Series (rows with missing values)
            - List of column headers
            - Header unit string

    Raises:
        FormatNotSupportedError: If no loader supports the file format.

    Example:
        >>> data, report, na_mask, headers, unit = load_file("data.csv")
        >>> if report.get("error"):
        ...     print(f"Error: {report['error']}")
        ... else:
        ...     print(f"Loaded {data.shape[0]} samples with {data.shape[1]} features")
    """
    loaded = LoaderRegistry.get_instance().load(path, **params)

    # Flatten the result object into the tuple shape documented above.
    return loaded.data, loaded.report, loaded.na_mask, loaded.headers, loaded.header_unit




[docs]
def get_supported_formats() -> Dict[str, List[str]]:
    """Report which file extensions each registered loader handles.

    Returns:
        Dictionary keyed by each registered loader class's ``name``, whose
        values are that class's ``supported_extensions`` copied into a list.

    Example:
        >>> formats = get_supported_formats()
        >>> for name, exts in formats.items():
        ...     print(f"{name}: {', '.join(exts)}")
    """
    loader_classes = LoaderRegistry.get_instance().get_registered_loaders()

    # If two loaders share a name, the later one wins.
    return {cls.name: list(cls.supported_extensions) for cls in loader_classes}




[docs]
def get_loader_for_file(path: Union[str, Path]) -> FileLoader:
    """Look up the loader that handles a given file.

    The lookup is delegated to the ``LoaderRegistry`` singleton.

    Args:
        path: Path to the file.

    Returns:
        Instance of the appropriate FileLoader subclass.

    Raises:
        FormatNotSupportedError: If no loader supports the file format.
    """
    return LoaderRegistry.get_instance().get_loader(path)



# Public API of this package: the names exported by a star-import.
# Every entry is either imported at the top of this module or defined in it.
__all__ = [
    # Base classes and registry infrastructure (imported from .base)
    "FileLoader",
    "LoaderResult",
    "LoaderRegistry",
    "ArchiveHandler",
    # Exceptions (imported from .base)
    "LoaderError",
    "FileLoadError",
    "FormatNotSupportedError",
    # Class decorator used to register a loader (imported from .base)
    "register_loader",
    # Format-specific loader classes
    "CSVLoader",
    "NumpyLoader",
    "ParquetLoader",
    "ExcelLoader",
    "MatlabLoader",
    "TarLoader",
    "EnhancedZipLoader",
    # Convenience functions
    "load_file",
    "load_csv",  # legacy function from .csv_loader, kept for backward compatibility
    "load_csv_new",  # alias of load_csv from .csv_loader_new
    "load_numpy",
    "load_parquet",
    "load_excel",
    "load_matlab",
    "list_archive_members",
    "get_supported_formats",
    "get_loader_for_file",
]