Source code for nirs4all.api.generate

"""
Top-level generate() API for synthetic NIRS data generation.

This module provides the primary entry points for generating synthetic
NIRS datasets within nirs4all.

Example:
    >>> import nirs4all
    >>>
    >>> # Simple generation
    >>> dataset = nirs4all.generate(n_samples=1000, random_state=42)
    >>>
    >>> # Convenience functions
    >>> dataset = nirs4all.generate.regression(n_samples=500)
    >>> dataset = nirs4all.generate.classification(n_samples=300, n_classes=3)
    >>>
    >>> # Builder access
    >>> builder = nirs4all.generate.builder(n_samples=1000)
"""

from __future__ import annotations

from typing import TYPE_CHECKING, Any, Dict, List, Literal, Optional, Tuple, Union

import numpy as np

if TYPE_CHECKING:
    from pathlib import Path
    from nirs4all.data.dataset import SpectroDataset
    from nirs4all.data.synthetic import SyntheticDatasetBuilder



[docs]
def generate(
    n_samples: int = 1000,
    *,
    random_state: Optional[int] = None,
    complexity: Literal["simple", "realistic", "complex"] = "simple",
    wavelength_range: Optional[Tuple[float, float]] = None,
    components: Optional[List[str]] = None,
    target_range: Optional[Tuple[float, float]] = None,
    train_ratio: float = 0.8,
    as_dataset: bool = True,
    name: str = "synthetic_nirs",
    **kwargs: Any,
) -> Union["SpectroDataset", Tuple[np.ndarray, np.ndarray]]:
    """
    Generate a synthetic NIRS dataset.

    This is the primary function for creating synthetic spectroscopic data.
    It configures a ``SyntheticDatasetBuilder`` from the arguments below and
    returns the result of its ``build()`` method.

    Args:
        n_samples: Number of samples to generate.
        random_state: Random seed for reproducibility.
        complexity: Complexity level affecting noise, scatter, etc.
            Options: 'simple' (fast, minimal noise), 'realistic' (typical NIR),
            'complex' (challenging scenarios).
        wavelength_range: Tuple of (start, end) wavelengths in nm.
            Only forwarded to the builder when given; otherwise the builder's
            own default applies (documented as (1000, 2500)).
        components: List of predefined component names to use.
            Options: 'water', 'protein', 'lipid', 'starch', 'cellulose',
            'chlorophyll', 'oil', 'nitrogen_compound'.
            Only forwarded to the builder when given.
        target_range: Optional (min, max) range for scaling targets.
        train_ratio: Proportion of samples for training partition.
        as_dataset: If True, returns SpectroDataset. If False, returns (X, y) tuple.
        name: Dataset name.
        **kwargs: Advanced options. The recognised keys are:

            - ``distribution``: forwarded to ``builder.with_targets``.
            - ``batch_effects``: forwarded as
              ``builder.with_batch_effects(enabled=...)``.

            Any other key is not used; a ``UserWarning`` naming the ignored
            keys is emitted so that typos do not go unnoticed.

    Returns:
        If as_dataset=True: SpectroDataset ready for pipeline use.
        If as_dataset=False: Tuple of (X, y) numpy arrays.

    Example:
        >>> import nirs4all
        >>>
        >>> # Basic usage
        >>> dataset = nirs4all.generate(n_samples=1000, random_state=42)
        >>>
        >>> # Quick arrays for prototyping
        >>> X, y = nirs4all.generate(n_samples=500, as_dataset=False)
        >>>
        >>> # Realistic spectra
        >>> dataset = nirs4all.generate(
        ...     n_samples=1000,
        ...     complexity="realistic",
        ...     components=["water", "protein", "lipid"],
        ...     target_range=(0, 100),
        ...     random_state=42
        ... )

    See Also:
        generate.regression: Convenience function for regression datasets.
        generate.classification: Convenience function for classification datasets.
        generate.builder: Access the full builder API.
    """
    import warnings

    from nirs4all.data.synthetic import SyntheticDatasetBuilder

    builder = SyntheticDatasetBuilder(
        n_samples=n_samples,
        random_state=random_state,
        name=name,
    )

    # Configure features: optional settings are only passed when supplied so
    # the builder keeps its own defaults otherwise.
    feature_kwargs: Dict[str, Any] = {"complexity": complexity}
    if wavelength_range is not None:
        feature_kwargs["wavelength_range"] = wavelength_range
    if components is not None:
        feature_kwargs["components"] = components
    builder.with_features(**feature_kwargs)

    # Configure targets in ONE call (same pattern as regression()), so that
    # the range and the distribution cannot overwrite each other the way two
    # separate with_targets() calls might.
    target_kwargs: Dict[str, Any] = {}
    if target_range is not None:
        target_kwargs["range"] = target_range
    if "distribution" in kwargs:
        target_kwargs["distribution"] = kwargs.pop("distribution")
    if target_kwargs:
        builder.with_targets(**target_kwargs)

    # Configure partitions
    builder.with_partitions(train_ratio=train_ratio)

    # Configure output
    builder.with_output(as_dataset=as_dataset)

    # Optional advanced configuration
    if "batch_effects" in kwargs:
        builder.with_batch_effects(enabled=kwargs.pop("batch_effects"))

    # Whatever is left was not consumed above; report it instead of dropping
    # it silently.
    if kwargs:
        ignored = ", ".join(sorted(kwargs))
        warnings.warn(
            f"generate() ignored unsupported keyword argument(s): {ignored}",
            UserWarning,
            stacklevel=2,
        )

    return builder.build()




[docs]
def regression(
    n_samples: int = 1000,
    *,
    random_state: Optional[int] = None,
    complexity: Literal["simple", "realistic", "complex"] = "simple",
    target_range: Optional[Tuple[float, float]] = None,
    target_component: Optional[Union[str, int]] = None,
    distribution: Literal["dirichlet", "uniform", "lognormal", "correlated"] = "dirichlet",
    train_ratio: float = 0.8,
    as_dataset: bool = True,
    name: str = "synthetic_regression",
) -> Union["SpectroDataset", Tuple[np.ndarray, np.ndarray]]:
    """
    Build a synthetic NIRS dataset intended for regression experiments.

    A ``SyntheticDatasetBuilder`` is configured with the requested feature
    complexity, target settings, train partition ratio and output form, and
    the result of its ``build()`` call is returned.

    Args:
        n_samples: Number of samples to generate.
        random_state: Random seed for reproducibility.
        complexity: Complexity level ('simple', 'realistic', 'complex').
        target_range: Target value range (min, max) for scaling. Only
            forwarded to the builder when given.
        target_component: Which component to use as target. Only forwarded
            to the builder when given; if None, all components are used
            (multi-output regression).
        distribution: Concentration distribution method (always forwarded).
        train_ratio: Proportion of samples for training partition.
        as_dataset: If True, returns SpectroDataset. If False, returns (X, y).
        name: Dataset name.

    Returns:
        If as_dataset=True: SpectroDataset ready for pipeline use.
        If as_dataset=False: Tuple of (X, y) numpy arrays.

    Example:
        >>> import nirs4all
        >>>
        >>> # Simple regression dataset
        >>> dataset = nirs4all.generate.regression(n_samples=500)
        >>>
        >>> # Single target with scaling
        >>> dataset = nirs4all.generate.regression(
        ...     n_samples=1000,
        ...     target_range=(0, 100),
        ...     target_component="protein",
        ...     random_state=42
        ... )
    """
    from nirs4all.data.synthetic import SyntheticDatasetBuilder

    # The distribution is always sent; range/component only when supplied.
    maybe_options = {"range": target_range, "component": target_component}
    target_options: Dict[str, Any] = {"distribution": distribution}
    target_options.update(
        {key: value for key, value in maybe_options.items() if value is not None}
    )

    ds_builder = SyntheticDatasetBuilder(
        n_samples=n_samples, random_state=random_state, name=name
    )
    ds_builder.with_features(complexity=complexity)
    ds_builder.with_targets(**target_options)
    ds_builder.with_partitions(train_ratio=train_ratio)
    ds_builder.with_output(as_dataset=as_dataset)

    result = ds_builder.build()
    return result




[docs]
def classification(
    n_samples: int = 1000,
    *,
    n_classes: int = 2,
    random_state: Optional[int] = None,
    complexity: Literal["simple", "realistic", "complex"] = "simple",
    class_separation: float = 1.0,
    class_weights: Optional[List[float]] = None,
    train_ratio: float = 0.8,
    as_dataset: bool = True,
    name: str = "synthetic_classification",
) -> Union["SpectroDataset", Tuple[np.ndarray, np.ndarray]]:
    """
    Build a synthetic NIRS dataset with discrete class labels.

    A ``SyntheticDatasetBuilder`` is configured with the requested feature
    complexity, classification settings, train partition ratio and output
    form, and the result of its ``build()`` call is returned.

    Args:
        n_samples: Number of samples to generate.
        n_classes: Number of classes (2 for binary, >2 for multiclass).
        random_state: Random seed for reproducibility.
        complexity: Complexity level ('simple', 'realistic', 'complex').
        class_separation: Separation factor between classes.
            Higher values make classes more distinguishable.
        class_weights: Optional class proportions for imbalanced datasets.
            Should sum to 1.0 (not checked here; passed through as given).
        train_ratio: Proportion of samples for training partition.
        as_dataset: If True, returns SpectroDataset. If False, returns (X, y).
        name: Dataset name.

    Returns:
        If as_dataset=True: SpectroDataset ready for pipeline use.
        If as_dataset=False: Tuple of (X, y) numpy arrays where y is integer labels.

    Example:
        >>> import nirs4all
        >>>
        >>> # Binary classification
        >>> dataset = nirs4all.generate.classification(n_samples=500, n_classes=2)
        >>>
        >>> # Multiclass with imbalanced classes
        >>> dataset = nirs4all.generate.classification(
        ...     n_samples=1000,
        ...     n_classes=3,
        ...     class_weights=[0.5, 0.3, 0.2],
        ...     random_state=42
        ... )
    """
    from nirs4all.data.synthetic import SyntheticDatasetBuilder

    # Gather the constructor and label settings up front, then apply them.
    init_args: Dict[str, Any] = {
        "n_samples": n_samples,
        "random_state": random_state,
        "name": name,
    }
    label_args: Dict[str, Any] = {
        "n_classes": n_classes,
        "separation": class_separation,  # builder calls this "separation"
        "class_weights": class_weights,
    }

    ds_builder = SyntheticDatasetBuilder(**init_args)
    ds_builder.with_features(complexity=complexity)
    ds_builder.with_classification(**label_args)
    ds_builder.with_partitions(train_ratio=train_ratio)
    ds_builder.with_output(as_dataset=as_dataset)

    result = ds_builder.build()
    return result




[docs]
def builder(
    n_samples: int = 1000,
    random_state: Optional[int] = None,
    name: str = "synthetic_nirs",
) -> "SyntheticDatasetBuilder":
    """
    Return a fresh SyntheticDatasetBuilder for fine-grained control.

    Nothing is configured beyond the three constructor arguments; the caller
    drives the rest through the builder's own methods.

    Args:
        n_samples: Number of samples to generate.
        random_state: Random seed for reproducibility.
        name: Dataset name.

    Returns:
        SyntheticDatasetBuilder instance for method chaining.

    Example:
        >>> import nirs4all
        >>>
        >>> dataset = (
        ...     nirs4all.generate.builder(n_samples=1000, random_state=42)
        ...     .with_features(
        ...         wavelength_range=(1000, 2500),
        ...         complexity="realistic",
        ...         components=["water", "protein", "lipid"]
        ...     )
        ...     .with_targets(
        ...         distribution="lognormal",
        ...         range=(5, 50),
        ...         component="protein"
        ...     )
        ...     .with_metadata(n_groups=3)
        ...     .with_partitions(train_ratio=0.8)
        ...     .with_batch_effects(n_batches=3)
        ...     .build()
        ... )
    """
    from nirs4all.data.synthetic import SyntheticDatasetBuilder

    init_args = {
        "n_samples": n_samples,
        "random_state": random_state,
        "name": name,
    }
    new_builder = SyntheticDatasetBuilder(**init_args)
    return new_builder




[docs]
def multi_source(
    n_samples: int = 1000,
    sources: Optional[List[Dict[str, Any]]] = None,
    *,
    random_state: Optional[int] = None,
    target_range: Optional[Tuple[float, float]] = None,
    train_ratio: float = 0.8,
    as_dataset: bool = True,
    name: str = "multi_source_synthetic",
) -> Union["SpectroDataset", Tuple[np.ndarray, np.ndarray]]:
    """
    Generate a synthetic multi-source NIRS dataset.

    Multi-source datasets combine different types of data, such as
    multiple NIR spectral ranges or NIR spectra with auxiliary measurements.
    The work is delegated to ``nirs4all.data.synthetic.generate_multi_source``.

    Args:
        n_samples: Number of samples to generate.
        sources: List of source configurations. Each source is a dict with:
            - name: Unique source identifier (required).
            - type: Source type - "nir", "vis", "aux", "markers" (default: "nir").
            - wavelength_range: (start, end) for NIR sources.
            - n_features: Number of features for auxiliary sources.
            - complexity: Complexity level for NIR sources.
            - components: Component names for NIR sources.
            If None, two default sources are used: an "nir" source named
            "NIR" with wavelength_range (1000, 2500) and an "aux" source
            named "markers" with 10 features. A new list is created on each
            call, so the default is never shared between calls.
        random_state: Random seed for reproducibility.
        target_range: Optional (min, max) for scaling targets.
        train_ratio: Proportion of samples for training partition.
        as_dataset: If True, returns SpectroDataset. If False, returns (X, y).
        name: Dataset name.

    Returns:
        If as_dataset=True: SpectroDataset with multiple sources.
        If as_dataset=False: Tuple of (X, y) where X is concatenated features.

    Example:
        >>> import nirs4all
        >>>
        >>> # NIR + markers
        >>> dataset = nirs4all.generate.multi_source(
        ...     n_samples=500,
        ...     sources=[
        ...         {"name": "NIR", "type": "nir", "wavelength_range": (1000, 2500)},
        ...         {"name": "markers", "type": "aux", "n_features": 15}
        ...     ],
        ...     random_state=42
        ... )
        >>>
        >>> # Multiple NIR ranges
        >>> dataset = nirs4all.generate.multi_source(
        ...     n_samples=500,
        ...     sources=[
        ...         {"name": "VIS-NIR", "type": "nir", "wavelength_range": (400, 1100)},
        ...         {"name": "SWIR", "type": "nir", "wavelength_range": (1100, 2500)}
        ...     ]
        ... )
    """
    from nirs4all.data.synthetic import generate_multi_source as _generate_multi_source

    if sources is None:
        # Default: NIR + markers (built per call to avoid a shared mutable default)
        sources = [
            {"name": "NIR", "type": "nir", "wavelength_range": (1000, 2500)},
            {"name": "markers", "type": "aux", "n_features": 10},
        ]

    return _generate_multi_source(
        n_samples=n_samples,
        sources=sources,
        random_state=random_state,
        target_range=target_range,
        as_dataset=as_dataset,
        train_ratio=train_ratio,
        name=name,
    )




[docs]
def to_folder(
    path: Union[str, "Path"],
    n_samples: int = 1000,
    *,
    random_state: Optional[int] = None,
    complexity: Literal["simple", "realistic", "complex"] = "simple",
    train_ratio: float = 0.8,
    format: Literal["standard", "single", "fragmented"] = "standard",
    wavelength_range: Optional[Tuple[float, float]] = None,
    components: Optional[List[str]] = None,
    target_range: Optional[Tuple[float, float]] = None,
) -> "Path":
    """
    Generate synthetic data and write it to a folder.

    A ``SyntheticDatasetBuilder`` is configured from the arguments and its
    ``export(path, format=...)`` result is returned. The output is described
    as CSV files compatible with nirs4all's DatasetConfigs loader.

    Args:
        path: Output folder path.
        n_samples: Number of samples to generate.
        random_state: Random seed for reproducibility.
        complexity: Complexity level.
        train_ratio: Train/test split ratio.
        format: Export format ('standard', 'single', 'fragmented').
        wavelength_range: Optional (start, end) wavelengths; only forwarded
            when given.
        components: Optional list of component names; only forwarded when
            given.
        target_range: Optional (min, max) for target scaling; targets are
            only configured when this is given.

    Returns:
        Path to created folder.

    Example:
        >>> import nirs4all
        >>> path = nirs4all.generate.to_folder(
        ...     "data/synthetic",
        ...     n_samples=1000,
        ...     train_ratio=0.8,
        ...     random_state=42
        ... )
    """
    from nirs4all.data.synthetic import SyntheticDatasetBuilder

    exporter = SyntheticDatasetBuilder(
        n_samples=n_samples, random_state=random_state
    )

    # Features: complexity is always sent, the rest only when supplied.
    maybe_features = {
        "wavelength_range": wavelength_range,
        "components": components,
    }
    feature_options: Dict[str, Any] = {"complexity": complexity}
    feature_options.update(
        {key: value for key, value in maybe_features.items() if value is not None}
    )
    exporter.with_features(**feature_options)

    # Targets are left at the builder's defaults unless a range is requested.
    if target_range is not None:
        exporter.with_targets(range=target_range)

    exporter.with_partitions(train_ratio=train_ratio)

    exported_path = exporter.export(path, format=format)
    return exported_path




[docs]
def to_csv(
    path: Union[str, "Path"],
    n_samples: int = 1000,
    *,
    random_state: Optional[int] = None,
    complexity: Literal["simple", "realistic", "complex"] = "simple",
    wavelength_range: Optional[Tuple[float, float]] = None,
    target_range: Optional[Tuple[float, float]] = None,
    components: Optional[List[str]] = None,
) -> "Path":
    """
    Generate synthetic data and export to a single CSV file.

    A ``SyntheticDatasetBuilder`` is configured from the arguments and its
    ``export_to_csv(path)`` result is returned.

    Args:
        path: Output file path.
        n_samples: Number of samples to generate.
        random_state: Random seed for reproducibility.
        complexity: Complexity level.
        wavelength_range: Optional (start, end) wavelengths; only forwarded
            when given.
        target_range: Optional (min, max) for target scaling; targets are
            only configured when this is given.
        components: Optional list of component names, forwarded to
            ``with_features`` in the same way as in ``to_folder``. When
            omitted the builder's defaults are used.

    Returns:
        Path to created file.

    Example:
        >>> import nirs4all
        >>> path = nirs4all.generate.to_csv("data.csv", n_samples=500)
    """
    from nirs4all.data.synthetic import SyntheticDatasetBuilder

    builder = SyntheticDatasetBuilder(
        n_samples=n_samples,
        random_state=random_state,
    )

    # Configure features: optional settings are only passed when supplied.
    feature_kwargs: Dict[str, Any] = {"complexity": complexity}
    if wavelength_range is not None:
        feature_kwargs["wavelength_range"] = wavelength_range
    if components is not None:
        feature_kwargs["components"] = components
    builder.with_features(**feature_kwargs)

    # Configure targets
    if target_range is not None:
        builder.with_targets(range=target_range)

    return builder.export_to_csv(path)




[docs]
def from_template(
    template: Union[str, "Path", np.ndarray, "SpectroDataset"],
    n_samples: int = 1000,
    *,
    random_state: Optional[int] = None,
    wavelengths: Optional[np.ndarray] = None,
    as_dataset: bool = True,
) -> Union["SpectroDataset", Tuple[np.ndarray, np.ndarray]]:
    """
    Generate synthetic data mimicking a real dataset template.

    The template is handed to ``SyntheticDatasetBuilder.fit_to`` and the
    builder's ``build()`` result is returned.

    Args:
        template: Real data to mimic. Can be:
            - Path to dataset folder (str or os.PathLike such as
              pathlib.Path). It is loaded through ``DatasetConfigs`` and the
              first dataset found is used as the template.
            - Numpy array (n_samples, n_wavelengths).
            - SpectroDataset object.
        n_samples: Number of samples to generate.
        random_state: Random seed for reproducibility.
        wavelengths: Wavelength grid (required if template is array).
            For a path template, the loaded dataset's ``wavelengths`` take
            precedence when they are available and not None; otherwise this
            argument is used as given.
        as_dataset: If True, returns SpectroDataset. If False, returns (X, y).

    Returns:
        Synthetic dataset or arrays with properties similar to template.

    Raises:
        ValueError: If ``template`` is a path and no dataset is found there.

    Example:
        >>> import nirs4all
        >>>
        >>> # From a dataset path
        >>> dataset = nirs4all.generate.from_template(
        ...     "sample_data/regression",
        ...     n_samples=1000
        ... )
        >>>
        >>> # From numpy array
        >>> dataset = nirs4all.generate.from_template(
        ...     X_real,
        ...     n_samples=500,
        ...     wavelengths=wavelengths
        ... )
    """
    import os

    from nirs4all.data.synthetic import SyntheticDatasetBuilder

    builder = SyntheticDatasetBuilder(
        n_samples=n_samples,
        random_state=random_state,
    )

    # Treat both plain strings and path objects as a dataset location.
    if isinstance(template, (str, os.PathLike)):
        from nirs4all.data import DatasetConfigs

        template_path = os.fspath(template)
        datasets = DatasetConfigs(template_path).get_datasets()
        if not datasets:
            raise ValueError(f"No datasets found at {template_path}")

        template_ds = datasets[0]
        template_array = template_ds.x({}, layout="2d")

        # Best effort: the dataset may not expose usable wavelengths.
        try:
            detected_wavelengths = template_ds.wavelengths
        except (AttributeError, TypeError):
            detected_wavelengths = None
        # Only override the caller's grid with a real value, never with None.
        if detected_wavelengths is not None:
            wavelengths = detected_wavelengths

        builder.fit_to(template_array, wavelengths=wavelengths)
    else:
        builder.fit_to(template, wavelengths=wavelengths)

    builder.with_output(as_dataset=as_dataset)
    return builder.build()



class _GenerateNamespace:
    """
    Callable object that also exposes the generator helpers as attributes.

    An instance supports both call styles:
        nirs4all.generate(n_samples=1000)
        nirs4all.generate.regression(n_samples=500)

    Every attribute is a ``staticmethod`` wrapper around the module-level
    function of the same name, so no instance is passed to it.
    """

    # Calling the instance runs the module-level generate()
    __call__ = staticmethod(generate)

    # Dataset generators
    regression = staticmethod(regression)
    classification = staticmethod(classification)
    multi_source = staticmethod(multi_source)
    from_template = staticmethod(from_template)

    # Builder access
    builder = staticmethod(builder)

    # Export helpers
    to_folder = staticmethod(to_folder)
    to_csv = staticmethod(to_csv)

    def __repr__(self) -> str:
        """Return a multi-line summary of the available entry points."""
        summary_lines = (
            "<nirs4all.generate namespace>",
            "  generate(n_samples, ...) - Generate synthetic NIRS dataset",
            "  generate.regression(...) - Generate regression dataset",
            "  generate.classification(...) - Generate classification dataset",
            "  generate.multi_source(...) - Generate multi-source dataset",
            "  generate.builder(...) - Get fluent builder for full control",
            "  generate.to_folder(...) - Generate and export to folder",
            "  generate.to_csv(...) - Generate and export to CSV file",
            "  generate.from_template(...) - Generate mimicking real data",
        )
        return "\n".join(summary_lines)


# Single shared instance of the callable namespace defined above.
# NOTE(review): presumably the package exposes this object as
# `nirs4all.generate` in place of this module — that wiring is not visible
# in this file; confirm in the package __init__.
generate_namespace = _GenerateNamespace()


# Public names of this module: the plain functions (for direct function
# access) plus the namespace instance.
__all__ = [
    "generate",
    "regression",
    "classification",
    "builder",
    "multi_source",
    "to_folder",
    "to_csv",
    "from_template",
    "generate_namespace",
]