Source code for nirs4all.data.synthetic.builder

"""
Fluent builder for synthetic NIRS dataset construction.

This module provides a builder pattern interface for creating synthetic
NIRS datasets with fine-grained control over all generation parameters.

Example:
    >>> from nirs4all.data.synthetic import SyntheticDatasetBuilder
    >>>
    >>> dataset = (
    ...     SyntheticDatasetBuilder(n_samples=1000, random_state=42)
    ...     .with_features(complexity="realistic")
    ...     .with_targets(distribution="lognormal", range=(0, 100))
    ...     .with_partitions(train_ratio=0.8)
    ...     .build()
    ... )
"""

from __future__ import annotations

from dataclasses import dataclass, field
from typing import TYPE_CHECKING, Any, Dict, List, Literal, Optional, Tuple, Union

import numpy as np

from .generator import SyntheticNIRSGenerator
from .components import ComponentLibrary
from .config import (
    SyntheticDatasetConfig,
    FeatureConfig,
    TargetConfig,
    MetadataConfig,
    PartitionConfig,
    BatchEffectConfig,
    OutputConfig,
)
from ._constants import DEFAULT_WAVELENGTH_START, DEFAULT_WAVELENGTH_END, DEFAULT_WAVELENGTH_STEP
from .metadata import MetadataGenerator, MetadataGenerationResult
from .targets import TargetGenerator, NonLinearTargetProcessor, NonLinearTargetConfig

if TYPE_CHECKING:
    from pathlib import Path
    from nirs4all.data.dataset import SpectroDataset
    from .sources import SourceConfig



[docs]
@dataclass
class BuilderState:
    """
    Internal state container for the builder.

    This holds all configuration accumulated through the builder methods.
    Every field has a default value, so a state can be created without
    arguments. The underscore-prefixed fields at the end are cache slots for
    generated data: they default to ``None`` and are excluded from the
    dataclass ``repr``.
    """

    # Core dataset settings
    n_samples: int = 1000
    random_state: Optional[int] = None
    name: str = "synthetic_nirs"

    # Feature configuration
    wavelength_start: float = DEFAULT_WAVELENGTH_START
    wavelength_end: float = DEFAULT_WAVELENGTH_END
    wavelength_step: float = DEFAULT_WAVELENGTH_STEP
    custom_wavelengths: Optional[np.ndarray] = None  # Phase 6: custom wavelength array
    instrument_wavelength_grid: Optional[str] = None  # Phase 6: predefined instrument grid
    complexity: Literal["simple", "realistic", "complex"] = "simple"
    component_names: Optional[List[str]] = None
    component_library: Optional[ComponentLibrary] = None

    # === Custom physics parameters (override complexity presets) ===
    custom_params: Optional[Dict[str, Any]] = None  # Dict with any of:
    # path_length_std, baseline_amplitude, scatter_alpha_std, scatter_beta_std,
    # tilt_std, global_slope_mean, global_slope_std, shift_std, stretch_std,
    # instrumental_fwhm, noise_base, noise_signal_dep, artifact_prob

    # === Instrument simulation (Phase 2) ===
    instrument: Optional[str] = None  # Instrument archetype name
    measurement_mode: Optional[str] = None  # Measurement mode

    # Target configuration
    concentration_method: Literal["dirichlet", "uniform", "lognormal", "correlated"] = "dirichlet"
    target_range: Optional[Tuple[float, float]] = None
    target_component: Optional[Union[str, int]] = None  # Component name or index
    target_transform: Optional[Literal["log", "sqrt"]] = None

    # Classification configuration
    n_classes: Optional[int] = None  # None = no classification configured
    class_separation: float = 1.5
    class_weights: Optional[List[float]] = None
    class_separation_method: Literal["component", "threshold", "cluster"] = "component"

    # Metadata configuration
    generate_sample_ids: bool = False
    sample_id_prefix: str = "sample"
    n_groups: Optional[int] = None
    n_repetitions: Union[int, Tuple[int, int]] = 1  # Fixed count or (min, max) tuple
    group_names: Optional[List[str]] = None

    # Multi-source configuration
    sources: Optional[List[Any]] = None  # List of SourceConfig or dicts

    # Aggregate configuration (Phase 4)
    aggregate_name: Optional[str] = None  # Predefined aggregate name
    aggregate_variability: bool = False  # Sample from variability ranges

    # Partition configuration
    train_ratio: float = 0.8
    stratify: bool = False
    shuffle: bool = True

    # Batch effect configuration
    batch_effects_enabled: bool = False
    n_batches: int = 3

    # Output configuration
    as_dataset: bool = True
    include_metadata: bool = False

    # === Proposition 1: Non-linear interactions ===
    nonlinear_interactions: Literal["none", "polynomial", "synergistic", "antagonistic"] = "none"
    interaction_strength: float = 0.5  # 0 = linear, 1 = strong non-linearity
    hidden_factors: int = 0  # Latent variables affecting y but not spectra
    polynomial_degree: int = 2  # Degree for polynomial interactions

    # === Proposition 2: Spectral-target decoupling / confounders ===
    signal_to_confound_ratio: float = 1.0  # 1.0 = fully predictable, 0.5 = 50% predictable
    n_confounders: int = 0  # Variables affecting both spectra and target differently
    spectral_masking: float = 0.0  # Fraction of signal hidden in noisy regions
    temporal_drift: bool = False  # Target relationship changes over samples

    # === Proposition 3: Multi-regime target landscapes ===
    n_regimes: int = 1  # Number of different relationship regimes
    regime_method: Literal["concentration", "spectral", "random"] = "concentration"
    regime_overlap: float = 0.2  # Overlap between regimes (transition zones)
    noise_heteroscedasticity: float = 0.0  # Noise varies by regime (0 = homoscedastic)

    # Cached generated data (all start empty and are hidden from repr)
    _X: Optional[np.ndarray] = field(default=None, repr=False)
    _y: Optional[np.ndarray] = field(default=None, repr=False)
    _C: Optional[np.ndarray] = field(default=None, repr=False)  # Concentrations
    _wavelengths: Optional[np.ndarray] = field(default=None, repr=False)
    _metadata: Optional[Dict[str, Any]] = field(default=None, repr=False)
    _sample_metadata: Optional[MetadataGenerationResult] = field(default=None, repr=False)




[docs]
class SyntheticDatasetBuilder:
    """
    Fluent builder for constructing synthetic NIRS datasets.

    This builder provides a chainable interface for configuring all aspects
    of synthetic data generation, from spectral features to targets and metadata.

    The builder accumulates configuration through method calls, then generates
    the dataset when ``build()`` is called.

    Attributes:
        state: Internal BuilderState containing all configuration.

    Args:
        n_samples: Number of samples to generate.
        random_state: Random seed for reproducibility.
        name: Dataset name.

    Example:
        >>> from nirs4all.data.synthetic import SyntheticDatasetBuilder
        >>>
        >>> # Simple usage
        >>> dataset = SyntheticDatasetBuilder(n_samples=500).build()
        >>>
        >>> # Full configuration
        >>> dataset = (
        ...     SyntheticDatasetBuilder(n_samples=1000, random_state=42)
        ...     .with_features(
        ...         wavelength_range=(1000, 2500),
        ...         complexity="realistic",
        ...         components=["water", "protein", "lipid"]
        ...     )
        ...     .with_targets(
        ...         distribution="lognormal",
        ...         range=(5, 50),
        ...         component="protein"
        ...     )
        ...     .with_metadata(
        ...         n_groups=3,
        ...         n_repetitions=(2, 5)
        ...     )
        ...     .with_partitions(train_ratio=0.8)
        ...     .build()
        ... )

    See Also:
        nirs4all.generate: Top-level convenience function.
        SyntheticNIRSGenerator: Core generation engine.
    """

    def __init__(
        self,
        n_samples: int = 1000,
        random_state: Optional[int] = None,
        name: str = "synthetic_nirs",
    ) -> None:
        """
        Initialize the builder with basic configuration.

        Args:
            n_samples: Number of samples to generate.
            random_state: Random seed for reproducibility.
            name: Dataset name.

        Raises:
            ValueError: If n_samples is less than 1.
        """
        if n_samples < 1:
            raise ValueError(f"n_samples must be >= 1, got {n_samples}")

        self.state = BuilderState(
            n_samples=n_samples,
            random_state=random_state,
            name=name,
        )
        self._built = False


[docs]
    def with_features(
        self,
        *,
        wavelength_range: Optional[Tuple[float, float]] = None,
        wavelength_step: Optional[float] = None,
        complexity: Optional[Literal["simple", "realistic", "complex"]] = None,
        components: Optional[List[str]] = None,
        component_library: Optional[ComponentLibrary] = None,
        # Custom physics parameters (override complexity presets)
        path_length_std: Optional[float] = None,
        baseline_amplitude: Optional[float] = None,
        scatter_alpha_std: Optional[float] = None,
        scatter_beta_std: Optional[float] = None,
        tilt_std: Optional[float] = None,
        global_slope_mean: Optional[float] = None,
        global_slope_std: Optional[float] = None,
        shift_std: Optional[float] = None,
        stretch_std: Optional[float] = None,
        instrumental_fwhm: Optional[float] = None,
        noise_base: Optional[float] = None,
        noise_signal_dep: Optional[float] = None,
        artifact_prob: Optional[float] = None,
        # Instrument simulation (Phase 2)
        instrument: Optional[str] = None,
        measurement_mode: Optional[str] = None,
    ) -> SyntheticDatasetBuilder:
        """
        Configure spectral feature generation.

        Args:
            wavelength_range: Tuple of (start, end) wavelengths in nm.
            wavelength_step: Wavelength sampling step in nm.
            complexity: Complexity level affecting noise, scatter, etc.
                Options: 'simple', 'realistic', 'complex'.
            components: List of predefined component names to use.
            component_library: Pre-configured ComponentLibrary instance.
            path_length_std: Standard deviation of optical path length variation.
            baseline_amplitude: Amplitude of polynomial baseline drift.
            scatter_alpha_std: MSC-like multiplicative scattering coefficient variation.
            scatter_beta_std: Additive scattering offset variation.
            tilt_std: Standard deviation of linear spectral tilt.
            global_slope_mean: Mean slope across all spectra.
            global_slope_std: Standard deviation of global slope.
            shift_std: Random wavelength axis shift (nm).
            stretch_std: Wavelength axis stretching/compression factor.
            instrumental_fwhm: Instrumental broadening FWHM (nm).
            noise_base: Constant noise floor (detector noise).
            noise_signal_dep: Noise proportional to signal intensity (shot noise).
            artifact_prob: Probability of spectral artifacts.
            instrument: Instrument archetype name (e.g., 'foss_xds', 'bruker_mpa').
            measurement_mode: Measurement mode ('transmittance', 'reflectance', 'atr', etc.).

        Returns:
            Self for method chaining.

        Raises:
            ValueError: If both components and component_library are specified.

        Example:
            >>> # Simple usage with preset
            >>> builder.with_features(
            ...     wavelength_range=(1000, 2500),
            ...     complexity="realistic",
            ...     components=["water", "protein"]
            ... )

            >>> # Advanced usage with custom physics parameters
            >>> builder.with_features(
            ...     wavelength_range=(1000, 2500),
            ...     components=["water", "protein", "lipid"],
            ...     noise_base=0.003,
            ...     noise_signal_dep=0.008,
            ...     baseline_amplitude=0.015,
            ...     scatter_alpha_std=0.04,
            ...     instrument="foss_xds"
            ... )
        """
        if components is not None and component_library is not None:
            raise ValueError("Cannot specify both 'components' and 'component_library'")

        if wavelength_range is not None:
            self.state.wavelength_start, self.state.wavelength_end = wavelength_range

        if wavelength_step is not None:
            self.state.wavelength_step = wavelength_step

        if complexity is not None:
            self.state.complexity = complexity

        if components is not None:
            self.state.component_names = components

        if component_library is not None:
            self.state.component_library = component_library

        # Store custom physics parameters
        custom_params = {}
        param_mappings = {
            'path_length_std': path_length_std,
            'baseline_amplitude': baseline_amplitude,
            'scatter_alpha_std': scatter_alpha_std,
            'scatter_beta_std': scatter_beta_std,
            'tilt_std': tilt_std,
            'global_slope_mean': global_slope_mean,
            'global_slope_std': global_slope_std,
            'shift_std': shift_std,
            'stretch_std': stretch_std,
            'instrumental_fwhm': instrumental_fwhm,
            'noise_base': noise_base,
            'noise_signal_dep': noise_signal_dep,
            'artifact_prob': artifact_prob,
        }
        for key, value in param_mappings.items():
            if value is not None:
                custom_params[key] = value

        if custom_params:
            self.state.custom_params = custom_params

        # Instrument simulation
        if instrument is not None:
            self.state.instrument = instrument
        if measurement_mode is not None:
            self.state.measurement_mode = measurement_mode

        return self



[docs]
    def with_wavelengths(
        self,
        wavelengths: Optional[np.ndarray] = None,
        *,
        instrument_grid: Optional[str] = None,
    ) -> SyntheticDatasetBuilder:
        """
        Configure custom wavelength grid for spectrum generation.

        This method allows generating spectra at specific wavelengths matching
        a real instrument's wavelength grid, which is essential for transfer
        learning and domain adaptation experiments.

        Priority: wavelengths > instrument_grid > wavelength_range (in with_features)

        Args:
            wavelengths: Custom wavelength array in nm. If provided, overrides
                the wavelength_range set in with_features().
            instrument_grid: Name of predefined instrument wavelength grid.
                Available grids include: 'micronir_onsite', 'foss_xds',
                'scio', 'neospectra_micro', 'asd_fieldspec', 'bruker_mpa', etc.
                See ``list_instrument_wavelength_grids()`` for all options.

        Returns:
            Self for method chaining.

        Raises:
            ValueError: If instrument_grid name is not recognized.

        Example:
            >>> # Use predefined instrument wavelength grid
            >>> builder.with_wavelengths(instrument_grid="micronir_onsite")

            >>> # Use custom wavelength array
            >>> custom_wl = np.linspace(1000, 2000, 100)
            >>> builder.with_wavelengths(wavelengths=custom_wl)

            >>> # Full example
            >>> from nirs4all.data.synthetic import SyntheticDatasetBuilder
            >>> dataset = (
            ...     SyntheticDatasetBuilder(n_samples=500)
            ...     .with_wavelengths(instrument_grid="micronir_onsite")
            ...     .with_features(complexity="realistic")
            ...     .build()
            ... )

        See Also:
            get_instrument_wavelengths: Get wavelengths for a specific instrument.
            list_instrument_wavelength_grids: List all available instrument grids.
        """
        if wavelengths is not None and instrument_grid is not None:
            raise ValueError("Cannot specify both 'wavelengths' and 'instrument_grid'")

        if wavelengths is not None:
            self.state.custom_wavelengths = np.asarray(wavelengths)
            self.state.instrument_wavelength_grid = None

        if instrument_grid is not None:
            # Validate instrument grid name
            from .instruments import get_instrument_wavelengths
            _ = get_instrument_wavelengths(instrument_grid)  # Raises ValueError if unknown
            self.state.instrument_wavelength_grid = instrument_grid
            self.state.custom_wavelengths = None

        return self



[docs]
    def with_targets(
        self,
        *,
        distribution: Optional[Literal["dirichlet", "uniform", "lognormal", "correlated"]] = None,
        range: Optional[Tuple[float, float]] = None,
        component: Optional[Union[str, int]] = None,
        transform: Optional[Literal["log", "sqrt"]] = None,
    ) -> SyntheticDatasetBuilder:
        """
        Configure target variable generation for regression tasks.

        Args:
            distribution: Concentration distribution method.
                Options: 'dirichlet', 'uniform', 'lognormal', 'correlated'.
            range: Target value range (min, max) for scaling.
            component: Which component to use as target.
                If None, uses all components (multi-output).
                If str, uses the component with that name.
                If int, uses the component at that index.
            transform: Optional transformation to apply ('log', 'sqrt').

        Returns:
            Self for method chaining.

        Example:
            >>> builder.with_targets(
            ...     distribution="lognormal",
            ...     range=(5, 50),
            ...     component="protein"
            ... )
        """
        # Clear classification settings when configuring regression
        self.state.n_classes = None

        if distribution is not None:
            self.state.concentration_method = distribution

        if range is not None:
            self.state.target_range = range

        if component is not None:
            self.state.target_component = component

        if transform is not None:
            self.state.target_transform = transform

        return self



[docs]
    def with_classification(
        self,
        *,
        n_classes: int = 2,
        separation: float = 1.5,
        class_weights: Optional[List[float]] = None,
        separation_method: Literal["component", "threshold", "cluster"] = "component",
    ) -> SyntheticDatasetBuilder:
        """
        Configure target generation for classification tasks.

        This creates discrete class labels with controllable separation
        between classes, enabling classification experiments with varying
        difficulty levels.

        Args:
            n_classes: Number of classes to generate.
            separation: Class separation factor (higher = more separable).
                Values around 0.5-1.0: overlapping classes (challenging).
                Values around 1.5-2.0: moderate separation (realistic).
                Values around 2.5+: well-separated classes (easy).
            class_weights: Optional class weights for imbalanced datasets.
                Should sum to 1.0.
            separation_method: How to create class differences:
                - "component": Different component concentration profiles per class.
                - "threshold": Classes based on concentration thresholds.
                - "cluster": K-means-like cluster assignment.

        Returns:
            Self for method chaining.

        Example:
            >>> builder.with_classification(
            ...     n_classes=3,
            ...     separation=2.0,
            ...     class_weights=[0.5, 0.3, 0.2]
            ... )
        """
        if n_classes < 2:
            raise ValueError(f"n_classes must be >= 2, got {n_classes}")

        if class_weights is not None:
            if len(class_weights) != n_classes:
                raise ValueError(
                    f"class_weights length ({len(class_weights)}) must match "
                    f"n_classes ({n_classes})"
                )
            if abs(sum(class_weights) - 1.0) > 0.01:
                raise ValueError(f"class_weights must sum to 1.0, got {sum(class_weights)}")

        self.state.n_classes = n_classes
        self.state.class_separation = separation
        self.state.class_weights = class_weights
        self.state.class_separation_method = separation_method

        return self



[docs]
    def with_metadata(
        self,
        *,
        sample_ids: bool = True,
        sample_id_prefix: Optional[str] = None,
        n_groups: Optional[int] = None,
        n_repetitions: Optional[Union[int, Tuple[int, int]]] = None,
        group_names: Optional[List[str]] = None,
    ) -> SyntheticDatasetBuilder:
        """
        Configure sample metadata generation.

        Generates realistic metadata including sample IDs, biological sample
        groupings (with repetitions), and group assignments.

        Args:
            sample_ids: Whether to generate sample IDs.
            sample_id_prefix: Prefix for sample ID strings.
            n_groups: Number of sample groups (for grouped cross-validation).
            n_repetitions: Repetitions per biological sample. Either a fixed int
                or a (min, max) tuple for random variation. When set, each
                "biological sample" gets multiple spectral measurements.
            group_names: Optional list of group names. If None and n_groups > 0,
                generates names like "Group_0", "Group_1", etc.

        Returns:
            Self for method chaining.

        Example:
            >>> builder.with_metadata(
            ...     n_groups=5,
            ...     n_repetitions=(2, 4),
            ...     group_names=["Field_A", "Field_B", "Field_C", "Field_D", "Field_E"]
            ... )
        """
        self.state.generate_sample_ids = sample_ids

        if sample_id_prefix is not None:
            self.state.sample_id_prefix = sample_id_prefix

        if n_groups is not None:
            self.state.n_groups = n_groups

        if n_repetitions is not None:
            self.state.n_repetitions = n_repetitions

        if group_names is not None:
            self.state.group_names = group_names

        return self



[docs]
    def with_sources(
        self,
        sources: List[Union[Dict[str, Any], Any]],
    ) -> SyntheticDatasetBuilder:
        """
        Configure multi-source generation.

        Multi-source datasets combine different types of data, such as
        multiple NIR spectral ranges or NIR spectra with auxiliary measurements.

        Args:
            sources: List of source configurations. Each source is a dict with:
                - name: Unique source identifier (required).
                - type: Source type - "nir", "vis", "aux", "markers" (default: "nir").
                - wavelength_range: (start, end) for NIR sources.
                - n_features: Number of features for auxiliary sources.
                - complexity: Complexity level for NIR sources.
                - components: Component names for NIR sources.

        Returns:
            Self for method chaining.

        Example:
            >>> builder.with_sources([
            ...     {"name": "NIR", "type": "nir", "wavelength_range": (1000, 2500)},
            ...     {"name": "markers", "type": "aux", "n_features": 15}
            ... ])
        """
        self.state.sources = sources
        return self



[docs]
    def with_aggregate(
        self,
        name: str,
        *,
        variability: bool = False,
        target_component: Optional[str] = None,
    ) -> SyntheticDatasetBuilder:
        """
        Configure generation from a predefined aggregate component.

        Aggregates are predefined compositions representing common sample types
        (e.g., "wheat_grain", "milk", "tablet_excipient_base"). Selecting an
        aggregate sets the builder's component names to the aggregate's
        components.

        All validation happens before the builder state is modified, so a
        rejected call leaves the previous configuration intact.

        Args:
            name: Aggregate name (e.g., "wheat_grain", "milk", "cheese_cheddar").
            variability: If True, sample compositions from realistic variability
                ranges instead of using fixed base values.
            target_component: Optional component to use as regression target.
                Must be one of the aggregate's components. If omitted, the
                currently configured target component is left unchanged.

        Returns:
            Self for method chaining.

        Raises:
            ValueError: If the aggregate lookup rejects the name, or if
                target_component is not part of the aggregate.

        Example:
            >>> # Generate wheat samples with protein as target
            >>> dataset = (
            ...     SyntheticDatasetBuilder(n_samples=1000, random_state=42)
            ...     .with_aggregate("wheat_grain", variability=True)
            ...     .with_targets(component="protein", range=(8, 18))
            ...     .build()
            ... )

        See Also:
            nirs4all.data.synthetic.list_aggregates: List available aggregates.
            nirs4all.data.synthetic.aggregate_info: Get aggregate details.
        """
        from ._aggregates import get_aggregate

        # Look the aggregate up first; an unknown name is expected to raise
        # ValueError from get_aggregate.
        agg = get_aggregate(name)
        available = list(agg.components.keys())

        # Check the requested target BEFORE touching the state, so a bad
        # target_component cannot leave the builder half-configured.
        if target_component is not None and target_component not in agg.components:
            raise ValueError(
                f"Target component '{target_component}' not in aggregate '{name}'. "
                f"Available: {available}"
            )

        self.state.aggregate_name = name
        self.state.aggregate_variability = variability

        # Set component names from aggregate
        self.state.component_names = available

        if target_component is not None:
            self.state.target_component = target_component

        return self



[docs]
    def with_partitions(
        self,
        *,
        train_ratio: Optional[float] = None,
        stratify: Optional[bool] = None,
        shuffle: Optional[bool] = None,
    ) -> SyntheticDatasetBuilder:
        """
        Configure data partitioning (train/test split).

        Args:
            train_ratio: Proportion of samples for training (0.0-1.0).
            stratify: Whether to stratify by target (for classification).
            shuffle: Whether to shuffle before splitting.

        Returns:
            Self for method chaining.

        Example:
            >>> builder.with_partitions(train_ratio=0.75, shuffle=True)
        """
        if train_ratio is not None:
            if not 0.0 < train_ratio <= 1.0:
                raise ValueError(f"train_ratio must be in (0, 1], got {train_ratio}")
            self.state.train_ratio = train_ratio

        if stratify is not None:
            self.state.stratify = stratify

        if shuffle is not None:
            self.state.shuffle = shuffle

        return self



[docs]
    def with_batch_effects(
        self,
        *,
        enabled: bool = True,
        n_batches: int = 3,
    ) -> SyntheticDatasetBuilder:
        """
        Configure batch/session effects simulation.

        Batch effects introduce systematic variations between measurement
        sessions, useful for domain adaptation research.

        Args:
            enabled: Whether to enable batch effects.
            n_batches: Number of measurement batches.

        Returns:
            Self for method chaining.

        Example:
            >>> builder.with_batch_effects(n_batches=5)
        """
        self.state.batch_effects_enabled = enabled
        self.state.n_batches = n_batches
        return self



[docs]
    def with_nonlinear_targets(
        self,
        *,
        interactions: Literal["none", "polynomial", "synergistic", "antagonistic"] = "polynomial",
        interaction_strength: float = 0.5,
        hidden_factors: int = 0,
        polynomial_degree: int = 2,
    ) -> SyntheticDatasetBuilder:
        """
        Configure non-linear relationships between concentrations and targets.

        This introduces non-linear mixture effects that make targets harder to
        predict with simple linear models, simulating real chemical interactions.

        Args:
            interactions: Type of non-linear interaction:
                - "none": Pure linear relationship (default behavior).
                - "polynomial": Include terms like C1², C1×C2, etc.
                - "synergistic": Non-additive effects where combinations enhance target.
                - "antagonistic": Saturation/inhibition (Michaelis-Menten-like).
            interaction_strength: Blend factor between linear and non-linear.
                0 = purely linear, 1 = fully non-linear. Default 0.5.
            hidden_factors: Number of latent variables that affect target but have
                NO spectral signature. Forces models to learn robust features.
            polynomial_degree: Maximum degree for polynomial interactions (2 or 3).

        Returns:
            Self for method chaining.

        Example:
            >>> # Make targets require non-linear models
            >>> builder.with_nonlinear_targets(
            ...     interactions="polynomial",
            ...     interaction_strength=0.7,
            ...     hidden_factors=2
            ... )
        """
        if interaction_strength < 0 or interaction_strength > 1:
            raise ValueError(f"interaction_strength must be in [0, 1], got {interaction_strength}")
        if polynomial_degree not in (2, 3):
            raise ValueError(f"polynomial_degree must be 2 or 3, got {polynomial_degree}")
        if hidden_factors < 0:
            raise ValueError(f"hidden_factors must be >= 0, got {hidden_factors}")

        self.state.nonlinear_interactions = interactions
        self.state.interaction_strength = interaction_strength
        self.state.hidden_factors = hidden_factors
        self.state.polynomial_degree = polynomial_degree
        return self



[docs]
    def with_target_complexity(
        self,
        *,
        signal_to_confound_ratio: float = 0.7,
        n_confounders: int = 2,
        spectral_masking: float = 0.0,
        temporal_drift: bool = False,
    ) -> SyntheticDatasetBuilder:
        """
        Configure spectral-target decoupling and confounding effects.

        This introduces factors that make the target only partially predictable
        from spectral features, simulating real-world irreducible error.

        Args:
            signal_to_confound_ratio: Proportion of target variance explainable
                from spectra. 1.0 = fully predictable, 0.5 = 50% unexplainable.
                Default 0.7 (70% predictable).
            n_confounders: Number of confounding variables that affect both
                spectra and target in different ways. Default 2.
            spectral_masking: Fraction of predictive signal hidden in high-noise
                wavelength regions (0.0-0.5). Default 0.0.
            temporal_drift: If True, the target-spectra relationship gradually
                changes across samples, testing model robustness.

        Returns:
            Self for method chaining.

        Example:
            >>> # Add realistic confounding
            >>> builder.with_target_complexity(
            ...     signal_to_confound_ratio=0.6,
            ...     n_confounders=3,
            ...     temporal_drift=True
            ... )
        """
        if signal_to_confound_ratio < 0 or signal_to_confound_ratio > 1:
            raise ValueError(
                f"signal_to_confound_ratio must be in [0, 1], got {signal_to_confound_ratio}"
            )
        if spectral_masking < 0 or spectral_masking > 0.5:
            raise ValueError(f"spectral_masking must be in [0, 0.5], got {spectral_masking}")
        if n_confounders < 0:
            raise ValueError(f"n_confounders must be >= 0, got {n_confounders}")

        self.state.signal_to_confound_ratio = signal_to_confound_ratio
        self.state.n_confounders = n_confounders
        self.state.spectral_masking = spectral_masking
        self.state.temporal_drift = temporal_drift
        return self



[docs]
    def with_complex_target_landscape(
        self,
        *,
        n_regimes: int = 3,
        regime_method: Literal["concentration", "spectral", "random"] = "concentration",
        regime_overlap: float = 0.2,
        noise_heteroscedasticity: float = 0.5,
    ) -> SyntheticDatasetBuilder:
        """
        Configure multi-regime target landscapes with spatially-varying relationships.

        This creates regions in feature space where the target-spectra relationship
        differs, simulating subpopulations like ripe/unripe fruit or healthy/diseased.

        Args:
            n_regimes: Number of different relationship regimes. Default 3.
            regime_method: How to partition samples into regimes:
                - "concentration": Regimes based on concentration space clustering.
                - "spectral": Regimes based on spectral feature patterns.
                - "random": Random regime assignment (baseline difficulty).
            regime_overlap: Overlap between regimes creating transition zones.
                0 = hard boundaries, 0.5 = smooth transitions. Default 0.2.
            noise_heteroscedasticity: How much prediction noise varies by regime.
                0 = same noise everywhere, 1 = very different noise levels.
                Default 0.5.

        Returns:
            Self for method chaining.

        Example:
            >>> # Create challenging multi-regime landscape
            >>> builder.with_complex_target_landscape(
            ...     n_regimes=4,
            ...     regime_method="concentration",
            ...     regime_overlap=0.3,
            ...     noise_heteroscedasticity=0.7
            ... )
        """
        if n_regimes < 1:
            raise ValueError(f"n_regimes must be >= 1, got {n_regimes}")
        if regime_overlap < 0 or regime_overlap > 0.5:
            raise ValueError(f"regime_overlap must be in [0, 0.5], got {regime_overlap}")
        if noise_heteroscedasticity < 0 or noise_heteroscedasticity > 1:
            raise ValueError(
                f"noise_heteroscedasticity must be in [0, 1], got {noise_heteroscedasticity}"
            )

        self.state.n_regimes = n_regimes
        self.state.regime_method = regime_method
        self.state.regime_overlap = regime_overlap
        self.state.noise_heteroscedasticity = noise_heteroscedasticity
        return self



[docs]
    def with_output(
        self,
        *,
        as_dataset: Optional[bool] = None,
        include_metadata: Optional[bool] = None,
    ) -> SyntheticDatasetBuilder:
        """
        Configure output format.

        Args:
            as_dataset: If True, returns SpectroDataset. If False, returns tuple.
            include_metadata: Whether to include generation metadata in output.

        Returns:
            Self for method chaining.

        Example:
            >>> builder.with_output(as_dataset=False)  # Returns (X, y) tuple
        """
        if as_dataset is not None:
            self.state.as_dataset = as_dataset

        if include_metadata is not None:
            self.state.include_metadata = include_metadata

        return self


    def _create_generator(self) -> SyntheticNIRSGenerator:
        """Instantiate a SyntheticNIRSGenerator from the current builder state."""
        state = self.state

        # An explicit library wins; otherwise derive one from component names
        # when they were given. With neither, None is handed to the generator.
        library = state.component_library
        if library is None and state.component_names is not None:
            library = ComponentLibrary.from_predefined(
                state.component_names,
                random_state=state.random_state,
            )

        generator_kwargs = dict(
            wavelength_start=state.wavelength_start,
            wavelength_end=state.wavelength_end,
            wavelength_step=state.wavelength_step,
            wavelengths=state.custom_wavelengths,
            instrument_wavelength_grid=state.instrument_wavelength_grid,
            component_library=library,
            complexity=state.complexity,
            custom_params=state.custom_params,
            instrument=state.instrument,
            measurement_mode=state.measurement_mode,
            random_state=state.random_state,
        )
        return SyntheticNIRSGenerator(**generator_kwargs)

    def _generate_data(self, generator: SyntheticNIRSGenerator) -> None:
        """Generate raw spectral data using the generator."""
        # Check if using aggregate-based generation
        if self.state.aggregate_name is not None:
            X, C, metadata = self._generate_from_aggregate(generator)
        else:
            X, C, _E, metadata = generator.generate(
                n_samples=self.state.n_samples,
                concentration_method=self.state.concentration_method,
                include_batch_effects=self.state.batch_effects_enabled,
                n_batches=self.state.n_batches,
                return_metadata=True,
            )

        # Store wavelengths and concentrations
        self.state._wavelengths = generator.wavelengths.copy()
        self.state._metadata = metadata
        self.state._C = C

        # Generate sample metadata if requested
        if self.state.generate_sample_ids or self.state.n_groups is not None:
            self._generate_sample_metadata()

        # Process targets
        y = self._process_targets(C, generator)

        self.state._X = X
        self.state._y = y

    def _generate_sample_metadata(self) -> None:
        """Run MetadataGenerator and keep its result in ``state._sample_metadata``."""
        state = self.state
        state._sample_metadata = MetadataGenerator(
            random_state=state.random_state,
        ).generate(
            n_samples=state.n_samples,
            sample_id_prefix=state.sample_id_prefix,
            n_groups=state.n_groups,
            group_names=state.group_names,
            n_repetitions=state.n_repetitions,
        )

    def _generate_from_aggregate(
        self,
        generator: SyntheticNIRSGenerator,
    ) -> Tuple[np.ndarray, np.ndarray, Dict[str, Any]]:
        """
        Produce spectra from an aggregate's composition.

        One composition is expanded per sample (optionally with variability),
        laid out as a concentration row following the aggregate's component
        order, and the resulting matrix is passed to the generator.

        Args:
            generator: Configured SyntheticNIRSGenerator instance.

        Returns:
            Tuple of (X, C, metadata) where:
                X: Spectra array (n_samples, n_wavelengths)
                C: Concentration array (n_samples, n_components)
                metadata: Generation metadata dict
        """
        from ._aggregates import get_aggregate, expand_aggregate

        state = self.state
        aggregate = get_aggregate(state.aggregate_name)
        ordered_names = list(aggregate.components.keys())
        rng = np.random.default_rng(state.random_state)

        # One row per sample, one column per aggregate component.
        C = np.zeros((state.n_samples, len(ordered_names)))

        for row in C:
            # A fresh seed is drawn only when variability is requested, so the
            # random stream is consumed exactly once per sample in that case.
            seed = rng.integers(0, 2**31) if state.aggregate_variability else None
            composition = expand_aggregate(
                state.aggregate_name,
                variability=state.aggregate_variability,
                random_state=seed,
            )
            # Components missing from the expanded composition count as 0.0.
            row[:] = [composition.get(name, 0.0) for name in ordered_names]

        X, metadata = generator.generate_from_concentrations(
            C,
            include_batch_effects=state.batch_effects_enabled,
            n_batches=state.n_batches,
        )

        # Record which aggregate produced the data.
        metadata["aggregate_name"] = state.aggregate_name
        metadata["aggregate_variability"] = state.aggregate_variability

        return X, C, metadata

    def _process_targets(
        self,
        C: np.ndarray,
        generator: SyntheticNIRSGenerator,
    ) -> np.ndarray:
        """Process concentration matrix into target values."""
        # For classification, use the TargetGenerator
        if self.state.n_classes is not None:
            target_gen = TargetGenerator(random_state=self.state.random_state)
            y = target_gen.classification(
                n_samples=C.shape[0],
                concentrations=C,
                n_classes=self.state.n_classes,
                class_weights=self.state.class_weights,
                separation=self.state.class_separation,
                separation_method=self.state.class_separation_method,
            )
            return y

        # For regression, process as before
        # Select target component(s)
        if self.state.target_component is None:
            y = C
        elif isinstance(self.state.target_component, str):
            comp_idx = generator.library.component_names.index(self.state.target_component)
            y = C[:, comp_idx]
        else:
            y = C[:, self.state.target_component]

        # Ensure 2D
        if y.ndim == 1:
            y = y.reshape(-1, 1)

        # Apply transformation
        if self.state.target_transform == "log":
            y = np.log1p(y)
        elif self.state.target_transform == "sqrt":
            y = np.sqrt(y)

        # Apply range scaling
        if self.state.target_range is not None:
            min_val, max_val = self.state.target_range
            y_min, y_max = y.min(), y.max()
            if y_max > y_min:
                y = (y - y_min) / (y_max - y_min) * (max_val - min_val) + min_val
            else:
                y = np.full_like(y, (min_val + max_val) / 2)

        # === Apply non-linear complexity (Propositions 1, 2, 3) ===
        if self._has_nonlinear_complexity():
            y = self._apply_nonlinear_complexity(C, y)

        # Flatten if single target
        if y.ndim > 1 and y.shape[1] == 1:
            y = y.ravel()

        return y

    def _has_nonlinear_complexity(self) -> bool:
        """Check if any non-linear complexity options are enabled."""
        return (
            self.state.nonlinear_interactions != "none"
            or self.state.hidden_factors > 0
            or self.state.signal_to_confound_ratio < 1.0
            or self.state.n_confounders > 0
            or self.state.temporal_drift
            or self.state.n_regimes > 1
            or self.state.noise_heteroscedasticity > 0
        )

    def _apply_nonlinear_complexity(
        self,
        C: np.ndarray,
        y: np.ndarray,
    ) -> np.ndarray:
        """
        Run the base targets through NonLinearTargetProcessor.

        The processor is configured from the builder state and receives the
        concentrations, the base targets, and whatever spectra are currently
        stored in ``state._X``.

        Args:
            C: Concentration matrix.
            y: Base target values to be processed.

        Returns:
            The array returned by ``NonLinearTargetProcessor.process``.
        """
        state = self.state

        # Proposition 1: Non-linear interactions
        interaction_options = dict(
            nonlinear_interactions=state.nonlinear_interactions,
            interaction_strength=state.interaction_strength,
            hidden_factors=state.hidden_factors,
            polynomial_degree=state.polynomial_degree,
        )
        # Proposition 2: Confounders
        confounder_options = dict(
            signal_to_confound_ratio=state.signal_to_confound_ratio,
            n_confounders=state.n_confounders,
            spectral_masking=state.spectral_masking,
            temporal_drift=state.temporal_drift,
        )
        # Proposition 3: Multi-regime
        regime_options = dict(
            n_regimes=state.n_regimes,
            regime_method=state.regime_method,
            regime_overlap=state.regime_overlap,
            noise_heteroscedasticity=state.noise_heteroscedasticity,
        )

        processor = NonLinearTargetProcessor(
            config=NonLinearTargetConfig(
                **interaction_options,
                **confounder_options,
                **regime_options,
            ),
            random_state=state.random_state,
        )

        # Spectra are taken from the state (used for spectral-based regime
        # assignment); this is whatever value state._X holds at call time.
        return processor.process(
            concentrations=C,
            y_base=y,
            spectra=state._X,
        )

    def _build_dataset(self) -> SpectroDataset:
        """
        Assemble a SpectroDataset from the generated spectra and targets.

        Samples are split by position into a "train" partition (the first
        ``int(n_samples * train_ratio)`` indices) and a "test" partition (the
        rest), after an optional seeded shuffle. The test partition is only
        added when it is non-empty.

        Returns:
            SpectroDataset holding the partitions and their targets.
        """
        from nirs4all.data import SpectroDataset

        state = self.state
        X, y = state._X, state._y
        n_samples = state.n_samples

        # Number of leading indices that go to the training partition.
        n_train = int(n_samples * state.train_ratio)

        # Sample order: seeded permutation when shuffling, natural order otherwise.
        rng = np.random.default_rng(state.random_state)
        order = rng.permutation(n_samples) if state.shuffle else np.arange(n_samples)

        partitions = [("train", order[:n_train])]
        if n_samples - n_train > 0:
            partitions.append(("test", order[n_train:]))

        dataset = SpectroDataset(name=state.name)

        # Wavelength headers are the integer part of each wavelength, as text.
        headers = [str(int(wl)) for wl in state._wavelengths]

        # Same call sequence per partition: spectra first, then their targets.
        for partition_name, partition_indices in partitions:
            dataset.add_samples(
                X[partition_indices],
                indexes={"partition": partition_name},
                headers=headers,
                header_unit="nm",
            )
            dataset.add_targets(y[partition_indices])

        return dataset

    def _build_arrays(self) -> Tuple[np.ndarray, np.ndarray]:
        """Build raw numpy arrays from generated data."""
        return self.state._X.copy(), self.state._y.copy()


[docs]
    def build(self) -> Union[SpectroDataset, Tuple[np.ndarray, np.ndarray]]:
        """
        Build the synthetic dataset with all configured options.

        This method generates the data and returns it in the configured format.

        Returns:
            If as_dataset=True (default): SpectroDataset instance.
            If as_dataset=False: Tuple of (X, y) numpy arrays.

        Raises:
            RuntimeError: If build() was already called on this builder.

        Example:
            >>> dataset = builder.build()
            >>> print(dataset.num_samples)
            1000
        """
        if self._built:
            raise RuntimeError(
                "build() can only be called once per builder instance. "
                "Create a new builder for additional datasets."
            )

        # Check for multi-source generation
        if self.state.sources is not None:
            return self._build_multi_source()

        # Single-source generation
        generator = self._create_generator()
        self._generate_data(generator)

        self._built = True

        # Return in requested format
        if self.state.as_dataset:
            return self._build_dataset()
        else:
            return self._build_arrays()


    def _build_multi_source(self) -> Union[SpectroDataset, Tuple[np.ndarray, np.ndarray]]:
        """
        Generate multi-source data through MultiSourceGenerator.

        Marks the builder as built once generation has succeeded.

        Returns:
            The dataset from ``create_dataset`` when ``state.as_dataset`` is
            truthy, otherwise a tuple of (combined features, targets).
        """
        from .sources import MultiSourceGenerator

        state = self.state
        multi_generator = MultiSourceGenerator(random_state=state.random_state)

        if not state.as_dataset:
            # Array output: generate, then combine the per-source features.
            result = multi_generator.generate(
                n_samples=state.n_samples,
                sources=state.sources,
                target_range=state.target_range,
            )
            self._built = True
            return result.get_combined_features(), result.targets

        # Dataset output: the generator builds the dataset itself.
        dataset = multi_generator.create_dataset(
            n_samples=state.n_samples,
            sources=state.sources,
            train_ratio=state.train_ratio,
            target_range=state.target_range,
            name=state.name,
        )
        self._built = True
        return dataset


[docs]
    def build_arrays(self) -> Tuple[np.ndarray, np.ndarray]:
        """
        Build and return raw numpy arrays.

        This is a convenience method equivalent to calling
        ``with_output(as_dataset=False).build()``.

        Returns:
            Tuple of (X, y) numpy arrays.

        Example:
            >>> X, y = builder.build_arrays()
        """
        self.state.as_dataset = False
        return self.build()



[docs]
    def build_dataset(self) -> SpectroDataset:
        """
        Build and return a SpectroDataset.

        This is a convenience method equivalent to calling
        ``with_output(as_dataset=True).build()``.

        Returns:
            SpectroDataset instance.

        Example:
            >>> dataset = builder.build_dataset()
        """
        self.state.as_dataset = True
        return self.build()



[docs]
    def get_config(self) -> SyntheticDatasetConfig:
        """
        Snapshot the builder settings as a SyntheticDatasetConfig object.

        Only the state fields passed to the config classes below are carried
        over into the returned object.

        Returns:
            SyntheticDatasetConfig with the current settings.

        Example:
            >>> config = builder.get_config()
            >>> print(config.n_samples)
            1000
        """
        state = self.state

        feature_config = FeatureConfig(
            wavelength_start=state.wavelength_start,
            wavelength_end=state.wavelength_end,
            wavelength_step=state.wavelength_step,
            complexity=state.complexity,
            component_names=state.component_names,
        )
        # The builder's concentration_method is exposed as the target distribution.
        target_config = TargetConfig(
            distribution=state.concentration_method,
            range=state.target_range,
            transform=state.target_transform,
        )
        metadata_config = MetadataConfig(
            generate_sample_ids=state.generate_sample_ids,
            sample_id_prefix=state.sample_id_prefix,
            n_groups=state.n_groups,
            n_repetitions=state.n_repetitions,
        )
        partition_config = PartitionConfig(
            train_ratio=state.train_ratio,
            stratify=state.stratify,
            shuffle=state.shuffle,
        )
        batch_effect_config = BatchEffectConfig(
            enabled=state.batch_effects_enabled,
            n_batches=state.n_batches,
        )
        output_config = OutputConfig(
            as_dataset=state.as_dataset,
            include_metadata=state.include_metadata,
        )

        return SyntheticDatasetConfig(
            n_samples=state.n_samples,
            random_state=state.random_state,
            features=feature_config,
            targets=target_config,
            metadata=metadata_config,
            partitions=partition_config,
            batch_effects=batch_effect_config,
            output=output_config,
            name=state.name,
        )



[docs]
    @classmethod
    def from_config(
        cls,
        config: SyntheticDatasetConfig,
    ) -> SyntheticDatasetBuilder:
        """
        Create a builder from a SyntheticDatasetConfig object.

        Args:
            config: Configuration object to use.

        Returns:
            Configured SyntheticDatasetBuilder instance.

        Example:
            >>> config = SyntheticDatasetConfig(n_samples=500)
            >>> builder = SyntheticDatasetBuilder.from_config(config)
            >>> dataset = builder.build()
        """
        builder = cls(
            n_samples=config.n_samples,
            random_state=config.random_state,
            name=config.name,
        )

        # Apply feature config
        builder.state.wavelength_start = config.features.wavelength_start
        builder.state.wavelength_end = config.features.wavelength_end
        builder.state.wavelength_step = config.features.wavelength_step
        builder.state.complexity = config.features.complexity
        builder.state.component_names = config.features.component_names

        # Apply target config
        builder.state.concentration_method = config.targets.distribution
        builder.state.target_range = config.targets.range
        builder.state.target_transform = config.targets.transform

        # Apply metadata config
        builder.state.generate_sample_ids = config.metadata.generate_sample_ids
        builder.state.sample_id_prefix = config.metadata.sample_id_prefix
        builder.state.n_groups = config.metadata.n_groups
        builder.state.n_repetitions = config.metadata.n_repetitions

        # Apply partition config
        builder.state.train_ratio = config.partitions.train_ratio
        builder.state.stratify = config.partitions.stratify
        builder.state.shuffle = config.partitions.shuffle

        # Apply batch effect config
        builder.state.batch_effects_enabled = config.batch_effects.enabled
        builder.state.n_batches = config.batch_effects.n_batches

        # Apply output config
        builder.state.as_dataset = config.output.as_dataset
        builder.state.include_metadata = config.output.include_metadata

        return builder



[docs]
    def export(
        self,
        path: Union[str, "Path"],
        format: Literal["standard", "single", "fragmented"] = "standard",
    ) -> "Path":
        """
        Generate the data (if needed) and write it to a folder.

        The folder layout is meant to be compatible with nirs4all's
        DatasetConfigs loader. Data already present on the builder state is
        reused; otherwise it is generated first. For multi-source builders no
        wavelength grid is passed to the exporter.

        Args:
            path: Output folder path.
            format: Export format:
                - 'standard': Xcal, Ycal, Xval, Yval files.
                - 'single': All data in one file with partition column.
                - 'fragmented': Multiple small files (for testing).

        Returns:
            Path to created folder.

        Example:
            >>> builder = SyntheticDatasetBuilder(n_samples=1000)
            >>> path = builder.export("data/synthetic", format="standard")
        """
        from .exporter import DatasetExporter

        state = self.state

        if state._X is None and state.sources is not None:
            # Multi-source - generate and export differently
            result = self._build_multi_source()
            if hasattr(result, 'x'):
                # It's a dataset: pull the arrays back out of it.
                X = result.x({}, layout='2d')
                y = result.y({})
            else:
                X, y = result
            wavelengths = None  # Multi-source doesn't have simple wavelengths
        else:
            # Single-source: generate only when nothing is stored yet.
            if state._X is None:
                self._generate_data(self._create_generator())
            X, y, wavelengths = state._X, state._y, state._wavelengths

        return DatasetExporter().to_folder(
            path,
            X, y,
            train_ratio=state.train_ratio,
            wavelengths=wavelengths,
            format=format,
            random_state=state.random_state,
        )



[docs]
    def export_to_csv(
        self,
        path: Union[str, "Path"],
        include_targets: bool = True,
    ) -> "Path":
        """
        Generate the data (if needed) and write it to a single CSV file.

        Data already present on the builder state is reused; otherwise it is
        generated with the single-source generator first.

        Args:
            path: Output file path.
            include_targets: Whether to include target column(s).

        Returns:
            Path to created file.

        Example:
            >>> path = builder.export_to_csv("data.csv")
        """
        from .exporter import DatasetExporter

        state = self.state

        # NOTE(review): unlike export(), this path does not branch on
        # state.sources; confirm whether multi-source builders should be handled.
        if state._X is None:
            self._generate_data(self._create_generator())

        return DatasetExporter().to_csv(
            path,
            state._X,
            state._y,
            wavelengths=state._wavelengths,
            include_targets=include_targets,
        )



[docs]
    def fit_to(
        self,
        template: Union[np.ndarray, "SpectroDataset"],
        wavelengths: Optional[np.ndarray] = None,
        *,
        match_statistics: bool = True,
        match_structure: bool = True,
    ) -> "SyntheticDatasetBuilder":
        """
        Tune the builder so generated data resembles a template.

        The template is analysed with RealDataFitter. The fitted wavelength
        start, end and step are always copied onto the builder state; the
        fitted complexity is copied only when ``match_structure`` is True.

        Args:
            template: Real data to mimic (array or SpectroDataset).
            wavelengths: Wavelength grid (if template is array).
            match_statistics: Match statistical properties (mean, std).
                NOTE(review): this flag is not read anywhere in this method.
            match_structure: Match PCA structure and complexity.

        Returns:
            Self for method chaining.

        Example:
            >>> builder = SyntheticDatasetBuilder(n_samples=1000)
            >>> builder.fit_to(X_real, wavelengths=wl)
            >>> X_synth, y = builder.build_arrays()
        """
        from .fitter import RealDataFitter

        fitted = RealDataFitter().fit(template, wavelengths=wavelengths)
        state = self.state

        # Wavelength grid parameters are taken from the fit unconditionally.
        for attribute in ("wavelength_start", "wavelength_end", "wavelength_step"):
            setattr(state, attribute, getattr(fitted, attribute))

        # Complexity is adopted only on request.
        if match_structure:
            state.complexity = fitted.complexity

        return self



[docs]
    def __repr__(self) -> str:
        """Return string representation of the builder."""
        return (
            f"SyntheticDatasetBuilder("
            f"n_samples={self.state.n_samples}, "
            f"complexity='{self.state.complexity}', "
            f"random_state={self.state.random_state})"
        )