nirs4all.data.synthetic.builder module

Fluent builder for synthetic NIRS dataset construction.

This module provides a builder pattern interface for creating synthetic NIRS datasets with fine-grained control over all generation parameters.

Example

>>> from nirs4all.data.synthetic import SyntheticDatasetBuilder
>>>
>>> dataset = (
...     SyntheticDatasetBuilder(n_samples=1000, random_state=42)
...     .with_features(complexity="realistic")
...     .with_targets(distribution="lognormal", range=(0, 100))
...     .with_partitions(train_ratio=0.8)
...     .build()
... )

class nirs4all.data.synthetic.builder.BuilderState(n_samples: int = 1000, random_state: int | None = None, name: str = 'synthetic_nirs', wavelength_start: float = 350.0, wavelength_end: float = 2500.0, wavelength_step: float = 2.0, custom_wavelengths: ndarray | None = None, instrument_wavelength_grid: str | None = None, complexity: Literal['simple', 'realistic', 'complex'] = 'simple', component_names: List[str] | None = None, component_library: ComponentLibrary | None = None, custom_params: Dict[str, Any] | None = None, instrument: str | None = None, measurement_mode: str | None = None, concentration_method: Literal['dirichlet', 'uniform', 'lognormal', 'correlated'] = 'dirichlet', target_range: Tuple[float, float] | None = None, target_component: str | int | None = None, target_transform: Literal['log', 'sqrt'] | None = None, n_classes: int | None = None, class_separation: float = 1.5, class_weights: List[float] | None = None, class_separation_method: Literal['component', 'threshold', 'cluster'] = 'component', generate_sample_ids: bool = False, sample_id_prefix: str = 'sample', n_groups: int | None = None, n_repetitions: int | Tuple[int, int] = 1, group_names: List[str] | None = None, sources: List[Any] | None = None, aggregate_name: str | None = None, aggregate_variability: bool = False, train_ratio: float = 0.8, stratify: bool = False, shuffle: bool = True, batch_effects_enabled: bool = False, n_batches: int = 3, as_dataset: bool = True, include_metadata: bool = False, nonlinear_interactions: Literal['none', 'polynomial', 'synergistic', 'antagonistic'] = 'none', interaction_strength: float = 0.5, hidden_factors: int = 0, polynomial_degree: int = 2, signal_to_confound_ratio: float = 1.0, n_confounders: int = 0, spectral_masking: float = 0.0, temporal_drift: bool = False, n_regimes: int = 1, regime_method: Literal['concentration', 'spectral', 'random'] = 'concentration', regime_overlap: float = 0.2, noise_heteroscedasticity: float = 0.0, _X: ndarray | None = None, _y: ndarray | None = None, _C: ndarray | None = None, _wavelengths: ndarray | None = None, _metadata: Dict[str, Any] | None = None, _sample_metadata: MetadataGenerationResult | None = None)[source]

Bases: object

Internal state container for the builder.

This holds all configuration accumulated through the builder methods.

aggregate_name: str | None = None

aggregate_variability: bool = False

as_dataset: bool = True

batch_effects_enabled: bool = False

class_separation: float = 1.5

class_separation_method: Literal['component', 'threshold', 'cluster'] = 'component'

class_weights: List[float] | None = None

complexity: Literal['simple', 'realistic', 'complex'] = 'simple'

component_library: ComponentLibrary | None = None

component_names: List[str] | None = None

concentration_method: Literal['dirichlet', 'uniform', 'lognormal', 'correlated'] = 'dirichlet'

custom_params: Dict[str, Any] | None = None

custom_wavelengths: ndarray | None = None

generate_sample_ids: bool = False

group_names: List[str] | None = None

hidden_factors: int = 0

include_metadata: bool = False

instrument: str | None = None

instrument_wavelength_grid: str | None = None

interaction_strength: float = 0.5

measurement_mode: str | None = None

n_batches: int = 3

n_classes: int | None = None

n_confounders: int = 0

n_groups: int | None = None

n_regimes: int = 1

n_repetitions: int | Tuple[int, int] = 1

n_samples: int = 1000

name: str = 'synthetic_nirs'

noise_heteroscedasticity: float = 0.0

nonlinear_interactions: Literal['none', 'polynomial', 'synergistic', 'antagonistic'] = 'none'

polynomial_degree: int = 2

random_state: int | None = None

regime_method: Literal['concentration', 'spectral', 'random'] = 'concentration'

regime_overlap: float = 0.2

sample_id_prefix: str = 'sample'

shuffle: bool = True

signal_to_confound_ratio: float = 1.0

sources: List[Any] | None = None

spectral_masking: float = 0.0

stratify: bool = False

target_component: str | int | None = None

target_range: Tuple[float, float] | None = None

target_transform: Literal['log', 'sqrt'] | None = None

temporal_drift: bool = False

train_ratio: float = 0.8

wavelength_end: float = 2500.0

wavelength_start: float = 350.0

wavelength_step: float = 2.0

class nirs4all.data.synthetic.builder.SyntheticDatasetBuilder(n_samples: int = 1000, random_state: int | None = None, name: str = 'synthetic_nirs')[source]

Bases: object

Fluent builder for constructing synthetic NIRS datasets.

This builder provides a chainable interface for configuring all aspects of synthetic data generation, from spectral features to targets and metadata.

The builder accumulates configuration through method calls, then generates the dataset when build() is called.

state: Internal BuilderState containing all configuration.

Parameters:

n_samples – Number of samples to generate.
random_state – Random seed for reproducibility.
name – Dataset name.

Example

>>> from nirs4all.data.synthetic import SyntheticDatasetBuilder
>>>
>>> # Simple usage
>>> dataset = SyntheticDatasetBuilder(n_samples=500).build()
>>>
>>> # Full configuration
>>> dataset = (
...     SyntheticDatasetBuilder(n_samples=1000, random_state=42)
...     .with_features(
...         wavelength_range=(1000, 2500),
...         complexity="realistic",
...         components=["water", "protein", "lipid"]
...     )
...     .with_targets(
...         distribution="lognormal",
...         range=(5, 50),
...         component="protein"
...     )
...     .with_metadata(
...         n_groups=3,
...         n_repetitions=(2, 5)
...     )
...     .with_partitions(train_ratio=0.8)
...     .build()
... )

See also

nirs4all.data.synthetic.list_aggregates: List available aggregates.
nirs4all.data.synthetic.aggregate_info: Get aggregate details.

with_batch_effects(*, enabled: bool = True, n_batches: int = 3) → SyntheticDatasetBuilder[source]

Configure batch/session effects simulation.

Batch effects introduce systematic variations between measurement sessions, useful for domain adaptation research.

Parameters:

enabled – Whether to enable batch effects.
n_batches – Number of measurement batches.

Returns:

Self for method chaining.

Example

>>> builder.with_batch_effects(n_batches=5)

with_classification(*, n_classes: int = 2, separation: float = 1.5, class_weights: List[float] | None = None, separation_method: Literal['component', 'threshold', 'cluster'] = 'component') → SyntheticDatasetBuilder[source]

Configure target generation for classification tasks.

This creates discrete class labels with controllable separation between classes, enabling classification experiments with varying difficulty levels.

Parameters:

n_classes – Number of classes to generate.
separation – Class separation factor (higher = more separable). Values around 0.5-1.0: overlapping classes (challenging). Values around 1.5-2.0: moderate separation (realistic). Values around 2.5+: well-separated classes (easy).
class_weights – Optional class weights for imbalanced datasets. Should sum to 1.0.
separation_method – How to create class differences: - “component”: Different component concentration profiles per class. - “threshold”: Classes based on concentration thresholds. - “cluster”: K-means-like cluster assignment.

Returns:

Self for method chaining.

Example

>>> builder.with_classification(
...     n_classes=3,
...     separation=2.0,
...     class_weights=[0.5, 0.3, 0.2]
... )

with_complex_target_landscape(*, n_regimes: int = 3, regime_method: Literal['concentration', 'spectral', 'random'] = 'concentration', regime_overlap: float = 0.2, noise_heteroscedasticity: float = 0.5) → SyntheticDatasetBuilder[source]

Configure multi-regime target landscapes with spatially-varying relationships.

This creates regions in feature space where the target-spectra relationship differs, simulating subpopulations like ripe/unripe fruit or healthy/diseased.

Parameters:

n_regimes – Number of different relationship regimes. Default 3.
regime_method – How to partition samples into regimes: - “concentration”: Regimes based on concentration space clustering. - “spectral”: Regimes based on spectral feature patterns. - “random”: Random regime assignment (baseline difficulty).
regime_overlap – Overlap between regimes creating transition zones. 0 = hard boundaries, 0.5 = smooth transitions. Default 0.2.
noise_heteroscedasticity – How much prediction noise varies by regime. 0 = same noise everywhere, 1 = very different noise levels. Default 0.5.

Returns:

Self for method chaining.

Example

>>> # Create challenging multi-regime landscape
>>> builder.with_complex_target_landscape(
...     n_regimes=4,
...     regime_method="concentration",
...     regime_overlap=0.3,
...     noise_heteroscedasticity=0.7
... )

with_features(...) → SyntheticDatasetBuilder

Configure spectral feature generation.

Parameters:

wavelength_range – Tuple of (start, end) wavelengths in nm.
wavelength_step – Wavelength sampling step in nm.
complexity – Complexity level affecting noise, scatter, etc. Options: ‘simple’, ‘realistic’, ‘complex’.
components – List of predefined component names to use.
component_library – Pre-configured ComponentLibrary instance.
path_length_std – Standard deviation of optical path length variation.
baseline_amplitude – Amplitude of polynomial baseline drift.
scatter_alpha_std – MSC-like multiplicative scattering coefficient variation.
scatter_beta_std – Additive scattering offset variation.
tilt_std – Standard deviation of linear spectral tilt.
global_slope_mean – Mean slope across all spectra.
global_slope_std – Standard deviation of global slope.
shift_std – Random wavelength axis shift (nm).
stretch_std – Wavelength axis stretching/compression factor.
instrumental_fwhm – Instrumental broadening FWHM (nm).
noise_base – Constant noise floor (detector noise).
noise_signal_dep – Noise proportional to signal intensity (shot noise).
artifact_prob – Probability of spectral artifacts.
instrument – Instrument archetype name (e.g., ‘foss_xds’, ‘bruker_mpa’).
measurement_mode – Measurement mode (‘transmittance’, ‘reflectance’, ‘atr’, etc.).

Returns:

Self for method chaining.

Raises:

ValueError – If both components and component_library are specified.

Example

>>> # Simple usage with preset
>>> builder.with_features(
...     wavelength_range=(1000, 2500),
...     complexity="realistic",
...     components=["water", "protein"]
... )

>>> # Advanced usage with custom physics parameters
>>> builder.with_features(
...     wavelength_range=(1000, 2500),
...     components=["water", "protein", "lipid"],
...     noise_base=0.003,
...     noise_signal_dep=0.008,
...     baseline_amplitude=0.015,
...     scatter_alpha_std=0.04,
...     instrument="foss_xds"
... )

with_metadata(...) → SyntheticDatasetBuilder

Configure sample metadata generation.

Generates realistic metadata including sample IDs, biological sample groupings (with repetitions), and group assignments.

Parameters:

sample_ids – Whether to generate sample IDs.
sample_id_prefix – Prefix for sample ID strings.
n_groups – Number of sample groups (for grouped cross-validation).
n_repetitions – Repetitions per biological sample. Either a fixed int or a (min, max) tuple for random variation. When set, each “biological sample” gets multiple spectral measurements.
group_names – Optional list of group names. If None and n_groups > 0, generates names like “Group_0”, “Group_1”, etc.

Returns:

Self for method chaining.

Example

>>> builder.with_metadata(
...     n_groups=5,
...     n_repetitions=(2, 4),
...     group_names=["Field_A", "Field_B", "Field_C", "Field_D", "Field_E"]
... )

with_nonlinear_targets(*, interactions: Literal['none', 'polynomial', 'synergistic', 'antagonistic'] = 'polynomial', interaction_strength: float = 0.5, hidden_factors: int = 0, polynomial_degree: int = 2) → SyntheticDatasetBuilder[source]

Configure non-linear relationships between concentrations and targets.

This introduces non-linear mixture effects that make targets harder to predict with simple linear models, simulating real chemical interactions.

Parameters:

interactions – Type of non-linear interaction: - “none”: Pure linear relationship (the builder's behavior when this method is not called; note that this parameter itself defaults to “polynomial”). - “polynomial”: Include terms like C1², C1×C2, etc. - “synergistic”: Non-additive effects where combinations enhance target. - “antagonistic”: Saturation/inhibition (Michaelis-Menten-like).
interaction_strength – Blend factor between linear and non-linear. 0 = purely linear, 1 = fully non-linear. Default 0.5.
hidden_factors – Number of latent variables that affect target but have NO spectral signature. Forces models to learn robust features.
polynomial_degree – Maximum degree for polynomial interactions (2 or 3).

Returns:

Self for method chaining.

Example

>>> # Make targets require non-linear models
>>> builder.with_nonlinear_targets(
...     interactions="polynomial",
...     interaction_strength=0.7,
...     hidden_factors=2
... )

with_output(*, as_dataset: bool | None = None, include_metadata: bool | None = None) → SyntheticDatasetBuilder[source]

Configure output format.

Parameters:

as_dataset – If True, returns SpectroDataset. If False, returns tuple.
include_metadata – Whether to include generation metadata in output.

Returns:

Self for method chaining.

Example

>>> builder.with_output(as_dataset=False)  # Returns (X, y) tuple

with_partitions(*, train_ratio: float | None = None, stratify: bool | None = None, shuffle: bool | None = None) → SyntheticDatasetBuilder[source]

Configure data partitioning (train/test split).

Parameters:

train_ratio – Proportion of samples for training (0.0-1.0).
stratify – Whether to stratify by target (for classification).
shuffle – Whether to shuffle before splitting.

Returns:

Self for method chaining.

Example

>>> builder.with_partitions(train_ratio=0.75, shuffle=True)

with_sources(sources: List[Dict[str, Any] | Any]) → SyntheticDatasetBuilder[source]

Configure multi-source generation.

Multi-source datasets combine different types of data, such as multiple NIR spectral ranges or NIR spectra with auxiliary measurements.

Parameters:

sources – List of source configurations. Each source is a dict with: - name: Unique source identifier (required). - type: Source type - “nir”, “vis”, “aux”, “markers” (default: “nir”). - wavelength_range: (start, end) for NIR sources. - n_features: Number of features for auxiliary sources. - complexity: Complexity level for NIR sources. - components: Component names for NIR sources.

Returns:

Self for method chaining.

Example

>>> builder.with_sources([
...     {"name": "NIR", "type": "nir", "wavelength_range": (1000, 2500)},
...     {"name": "markers", "type": "aux", "n_features": 15}
... ])

with_target_complexity(*, signal_to_confound_ratio: float = 0.7, n_confounders: int = 2, spectral_masking: float = 0.0, temporal_drift: bool = False) → SyntheticDatasetBuilder[source]

Configure spectral-target decoupling and confounding effects.

This introduces factors that make the target only partially predictable from spectral features, simulating real-world irreducible error.

Parameters:

signal_to_confound_ratio – Proportion of target variance explainable from spectra. 1.0 = fully predictable, 0.5 = 50% unexplainable. Default 0.7 (70% predictable).
n_confounders – Number of confounding variables that affect both spectra and target in different ways. Default 2.
spectral_masking – Fraction of predictive signal hidden in high-noise wavelength regions (0.0-0.5). Default 0.0.
temporal_drift – If True, the target-spectra relationship gradually changes across samples, testing model robustness.

Returns:

Self for method chaining.

Example

>>> # Add realistic confounding
>>> builder.with_target_complexity(
...     signal_to_confound_ratio=0.6,
...     n_confounders=3,
...     temporal_drift=True
... )

with_targets(*, distribution: Literal['dirichlet', 'uniform', 'lognormal', 'correlated'] | None = None, range: Tuple[float, float] | None = None, component: str | int | None = None, transform: Literal['log', 'sqrt'] | None = None) → SyntheticDatasetBuilder[source]

Configure target variable generation for regression tasks.

Parameters:

distribution – Concentration distribution method. Options: ‘dirichlet’, ‘uniform’, ‘lognormal’, ‘correlated’.
range – Target value range (min, max) for scaling.
component – Which component to use as target. If None, uses all components (multi-output). If str, uses the component with that name. If int, uses the component at that index.
transform – Optional transformation to apply (‘log’, ‘sqrt’).

Returns:

Self for method chaining.

Example

>>> builder.with_targets(
...     distribution="lognormal",
...     range=(5, 50),
...     component="protein"
... )

with_wavelengths(wavelengths: ndarray | None = None, *, instrument_grid: str | None = None) → SyntheticDatasetBuilder[source]

Configure custom wavelength grid for spectrum generation.

This method allows generating spectra at specific wavelengths matching a real instrument’s wavelength grid, which is essential for transfer learning and domain adaptation experiments.

Priority: wavelengths > instrument_grid > wavelength_range (in with_features)

Parameters:

wavelengths – Custom wavelength array in nm. If provided, overrides the wavelength_range set in with_features().
instrument_grid – Name of predefined instrument wavelength grid. Available grids include: ‘micronir_onsite’, ‘foss_xds’, ‘scio’, ‘neospectra_micro’, ‘asd_fieldspec’, ‘bruker_mpa’, etc. See list_instrument_wavelength_grids() for all options.

Returns:

Self for method chaining.

Raises:

ValueError – If instrument_grid name is not recognized.

Example

>>> # Use predefined instrument wavelength grid
>>> builder.with_wavelengths(instrument_grid="micronir_onsite")

>>> # Use custom wavelength array
>>> import numpy as np
>>> custom_wl = np.linspace(1000, 2000, 100)
>>> builder.with_wavelengths(wavelengths=custom_wl)

>>> # Full example
>>> from nirs4all.data.synthetic import SyntheticDatasetBuilder
>>> dataset = (
...     SyntheticDatasetBuilder(n_samples=500)
...     .with_wavelengths(instrument_grid="micronir_onsite")
...     .with_features(complexity="realistic")
...     .build()
... )