# Source code for nirs4all.data.synthetic.sources

"""
Multi-source dataset generation for synthetic NIRS data.

This module provides tools for generating synthetic datasets with multiple
data sources, such as combining NIR spectra with molecular markers or
auxiliary measurements.

Example:
    >>> from nirs4all.data.synthetic.sources import MultiSourceGenerator
    >>>
    >>> generator = MultiSourceGenerator(random_state=42)
    >>>
    >>> dataset = generator.generate(
    ...     n_samples=500,
    ...     sources=[
    ...         {"name": "NIR_low", "type": "nir", "wavelength_range": (1000, 1700)},
    ...         {"name": "NIR_high", "type": "nir", "wavelength_range": (1700, 2500)},
    ...         {"name": "markers", "type": "aux", "n_features": 15},
    ...     ]
    ... )
"""

from __future__ import annotations

from dataclasses import dataclass, field
from typing import TYPE_CHECKING, Any, Dict, List, Literal, Optional, Tuple, Union

import numpy as np

from .generator import SyntheticNIRSGenerator
from .components import ComponentLibrary

if TYPE_CHECKING:
    from nirs4all.data.dataset import SpectroDataset


@dataclass
class SourceConfig:
    """
    Configuration for a single data source.

    Attributes:
        name: Unique identifier for the source.
        source_type: Type of source ('nir', 'vis', 'aux', 'markers').
        n_features: Number of features for auxiliary sources. When left as
            ``None`` the generator picks its own default.

        # NIR-specific
        wavelength_start: Start wavelength for NIR sources.
        wavelength_end: End wavelength for NIR sources.
        wavelength_step: Wavelength step for NIR sources.
        components: Component names for NIR sources.
        complexity: Complexity level for NIR sources.

        # Auxiliary-specific
        distribution: Distribution for auxiliary features.
        correlation_with_target: How correlated aux features are with target.
    """

    name: str
    source_type: Literal["nir", "vis", "aux", "markers"] = "nir"
    n_features: Optional[int] = None

    # NIR-specific options
    wavelength_start: Optional[float] = None
    wavelength_end: Optional[float] = None
    wavelength_step: float = 2.0
    components: Optional[List[str]] = None
    complexity: Literal["simple", "realistic", "complex"] = "simple"

    # Auxiliary-specific options
    distribution: Literal["normal", "uniform", "lognormal"] = "normal"
    correlation_with_target: float = 0.5

    @classmethod
    def from_dict(cls, config: Dict[str, Any]) -> SourceConfig:
        """
        Create a SourceConfig from a dictionary.

        Two shorthand keys are accepted on top of the dataclass field names:

        - ``"wavelength_range"``: a ``(start, end)`` pair expanded into
          ``wavelength_start`` and ``wavelength_end``.
        - ``"type"``: alias for ``source_type``.

        The input dictionary is left untouched, so the same dictionary can be
        reused for several calls.

        Args:
            config: Mapping of option names to values.

        Returns:
            The populated SourceConfig.

        Raises:
            TypeError: If ``config`` contains a key that is neither a field
                name nor one of the shorthands above, or omits ``name``.
        """
        # Work on a shallow copy: the shorthand keys are popped below and the
        # caller's dictionary must not lose them.
        options = dict(config)

        # Handle wavelength_range shorthand
        if "wavelength_range" in options:
            wl_range = options.pop("wavelength_range")
            options["wavelength_start"] = wl_range[0]
            options["wavelength_end"] = wl_range[1]

        # Handle type -> source_type mapping
        if "type" in options:
            options["source_type"] = options.pop("type")

        return cls(**options)
@dataclass
class MultiSourceResult:
    """
    Container for multi-source generation results.

    Attributes:
        sources: Dictionary mapping source names to feature arrays.
        targets: Target values.
        source_configs: Source configuration objects.
        wavelengths: Dictionary mapping NIR source names to wavelength arrays.
        metadata: Optional metadata dictionary.
    """

    sources: Dict[str, np.ndarray]
    targets: np.ndarray
    source_configs: List[SourceConfig]
    wavelengths: Dict[str, np.ndarray] = field(default_factory=dict)
    metadata: Optional[Dict[str, Any]] = None

    @property
    def source_names(self) -> List[str]:
        """Get list of source names, in insertion order of ``sources``."""
        return list(self.sources)

    def get_combined_features(self) -> np.ndarray:
        """
        Concatenate all sources column-wise into a single feature matrix.

        Sources are stacked in the order given by ``source_names``.

        Returns:
            Array with one row per sample and ``n_features_total`` columns.
            When there are no sources an ``(n_samples, 0)`` array is returned
            instead of letting ``np.hstack`` fail on an empty sequence.
        """
        if not self.sources:
            # Stay consistent with n_features_total, which reports 0 here.
            return np.empty((self.n_samples, 0))
        return np.hstack(list(self.sources.values()))

    @property
    def n_samples(self) -> int:
        """Get number of samples (length of ``targets``)."""
        return len(self.targets)

    @property
    def n_features_total(self) -> int:
        """Get total number of features across all sources."""
        return sum(arr.shape[1] for arr in self.sources.values())
class MultiSourceGenerator:
    """
    Generate synthetic multi-source NIRS datasets.

    This class creates datasets combining multiple data sources, such as:

    - Multiple NIR spectral ranges (e.g., visible-NIR + shortwave-NIR)
    - NIR spectra + molecular markers
    - NIR spectra + auxiliary measurements

    The generated sources share common underlying structure through
    component concentrations, creating realistic inter-source correlations.

    Attributes:
        rng: NumPy random generator.

    Args:
        random_state: Random seed for reproducibility.

    Example:
        >>> generator = MultiSourceGenerator(random_state=42)
        >>>
        >>> result = generator.generate(
        ...     n_samples=500,
        ...     sources=[
        ...         {
        ...             "name": "NIR",
        ...             "type": "nir",
        ...             "wavelength_range": (1000, 2500),
        ...             "complexity": "realistic"
        ...         },
        ...         {
        ...             "name": "markers",
        ...             "type": "aux",
        ...             "n_features": 20,
        ...             "correlation_with_target": 0.7
        ...         }
        ...     ],
        ...     target_range=(0, 100)
        ... )
        >>>
        >>> print(result.source_names)
        ['NIR', 'markers']
    """

    # Fallbacks applied when a source configuration leaves a value unset.
    _DEFAULT_WAVELENGTH_START = 1000
    _DEFAULT_WAVELENGTH_END = 2500
    _DEFAULT_AUX_FEATURES = 10

    def __init__(self, random_state: Optional[int] = None) -> None:
        """
        Initialize the multi-source generator.

        Args:
            random_state: Random seed for reproducibility.
        """
        self.rng = np.random.default_rng(random_state)
        # Kept so per-source spectral generators can be seeded the same way.
        self._random_state = random_state

    def generate(
        self,
        n_samples: int,
        sources: List[Union[SourceConfig, Dict[str, Any]]],
        *,
        target_range: Optional[Tuple[float, float]] = None,
        concentration_method: str = "dirichlet",
        n_components: int = 5,
    ) -> MultiSourceResult:
        """
        Generate a multi-source dataset.

        All sources share underlying component concentrations, which creates
        realistic correlations between sources. NIR sources generate spectra
        from these concentrations, while auxiliary sources create features
        correlated with the same underlying structure.

        Args:
            n_samples: Number of samples to generate.
            sources: List of source configurations (SourceConfig or dict).
            target_range: Optional (min, max) for scaling target values.
            concentration_method: Method for generating component
                concentrations ('dirichlet', 'uniform' or 'lognormal').
            n_components: Number of underlying components.

        Returns:
            MultiSourceResult containing all generated data.

        Raises:
            ValueError: If source names are not unique, a source type is
                unknown, or the concentration method is unknown.

        Example:
            >>> result = generator.generate(
            ...     n_samples=300,
            ...     sources=[
            ...         {"name": "VIS-NIR", "type": "nir", "wavelength_range": (400, 1100)},
            ...         {"name": "SWIR", "type": "nir", "wavelength_range": (1100, 2500)},
            ...     ]
            ... )
        """
        # Parse source configurations; dictionaries are copied so the
        # caller's objects are never modified during parsing.
        parsed_sources = [
            s if isinstance(s, SourceConfig) else SourceConfig.from_dict(dict(s))
            for s in sources
        ]

        # Validate source names are unique
        names = [s.name for s in parsed_sources]
        if len(names) != len(set(names)):
            raise ValueError("Source names must be unique")

        # Generate shared component concentrations
        concentrations = self._generate_concentrations(
            n_samples, n_components, concentration_method
        )

        # Generate each source
        source_data: Dict[str, np.ndarray] = {}
        wavelengths: Dict[str, np.ndarray] = {}

        for source_config in parsed_sources:
            if source_config.source_type in ("nir", "vis"):
                X, wl = self._generate_nir_source(
                    n_samples, concentrations, source_config
                )
                source_data[source_config.name] = X
                wavelengths[source_config.name] = wl
            elif source_config.source_type in ("aux", "markers"):
                source_data[source_config.name] = self._generate_aux_source(
                    n_samples, concentrations, source_config
                )
            else:
                raise ValueError(
                    f"Unknown source type: '{source_config.source_type}'"
                )

        # Generate targets from concentrations
        targets = self._generate_targets(concentrations, target_range)

        return MultiSourceResult(
            sources=source_data,
            targets=targets,
            source_configs=parsed_sources,
            wavelengths=wavelengths,
        )

    def _generate_concentrations(
        self,
        n_samples: int,
        n_components: int,
        method: str,
    ) -> np.ndarray:
        """
        Generate shared component concentrations.

        Args:
            n_samples: Number of rows to draw.
            n_components: Number of columns (components).
            method: 'dirichlet' and 'lognormal' yield rows that sum to 1;
                'uniform' draws each entry independently from [0, 1).

        Returns:
            Array of shape ``(n_samples, n_components)``.

        Raises:
            ValueError: If ``method`` is not one of the supported names.
        """
        if method == "dirichlet":
            alpha = np.ones(n_components) * 2.0
            return self.rng.dirichlet(alpha, size=n_samples)
        if method == "uniform":
            return self.rng.uniform(0, 1, size=(n_samples, n_components))
        if method == "lognormal":
            C = self.rng.lognormal(0, 0.5, size=(n_samples, n_components))
            return C / C.sum(axis=1, keepdims=True)
        raise ValueError(f"Unknown concentration method: '{method}'")

    def _generate_nir_source(
        self,
        n_samples: int,
        concentrations: np.ndarray,
        config: SourceConfig,
    ) -> Tuple[np.ndarray, np.ndarray]:
        """
        Generate NIR spectral source.

        Args:
            n_samples: Number of spectra to generate.
            concentrations: Shared concentrations, one row per sample.
            config: Source configuration; unset wavelength bounds fall back
                to 1000 and 2500.

        Returns:
            Tuple ``(X, wavelengths)`` with the spectra and a copy of the
            generator's wavelength grid.
        """
        # Use default wavelength range if not specified. Compare against None
        # explicitly so a bound of 0 is not silently replaced by the default.
        wl_start = (
            self._DEFAULT_WAVELENGTH_START
            if config.wavelength_start is None
            else config.wavelength_start
        )
        wl_end = (
            self._DEFAULT_WAVELENGTH_END
            if config.wavelength_end is None
            else config.wavelength_end
        )
        wl_step = config.wavelength_step

        # Create NIR generator for this source
        library = None
        if config.components:
            library = ComponentLibrary.from_predefined(
                config.components, random_state=self._random_state
            )

        generator = SyntheticNIRSGenerator(
            wavelength_start=wl_start,
            wavelength_end=wl_end,
            wavelength_step=wl_step,
            component_library=library,
            complexity=config.complexity,
            random_state=self._random_state,
        )

        # Generate spectra
        # Note: We use the shared concentrations, but the generator may have
        # different number of components, so we adapt
        n_gen_components = generator.library.n_components
        n_shared_components = concentrations.shape[1]

        if n_gen_components == n_shared_components:
            C_adapted = concentrations
        elif n_gen_components < n_shared_components:
            # Use subset of concentrations, renormalised to sum to 1 per row
            C_adapted = concentrations[:, :n_gen_components]
            C_adapted = C_adapted / C_adapted.sum(axis=1, keepdims=True)
        else:
            # Extend concentrations with noise: shared part weighted 0.8,
            # extra random components weighted 0.2, then renormalised
            extra = self.rng.dirichlet(
                np.ones(n_gen_components - n_shared_components) * 0.5,
                size=n_samples
            ) * 0.2
            C_adapted = np.hstack([concentrations * 0.8, extra])
            C_adapted = C_adapted / C_adapted.sum(axis=1, keepdims=True)

        # Generate using the generator's internal methods
        X = generator._apply_beer_lambert(C_adapted)
        X = generator._apply_path_length(X)
        X = X + generator._generate_baseline(n_samples)
        X = generator._apply_global_slope(X)
        X = generator._apply_scatter(X)
        X = generator._apply_wavelength_shift(X)
        X = generator._apply_instrumental_response(X)
        X = generator._add_noise(X)

        return X, generator.wavelengths.copy()

    def _generate_aux_source(
        self,
        n_samples: int,
        concentrations: np.ndarray,
        config: SourceConfig,
    ) -> np.ndarray:
        """
        Generate auxiliary/marker source.

        Features are a random linear mix of the shared concentrations plus
        Gaussian noise whose scale is set by ``correlation_with_target``.
        Only the magnitude of the correlation is used: the mixing weights
        have random signs, so a sign on the correlation carries no meaning.

        Args:
            n_samples: Number of rows to generate.
            concentrations: Shared concentrations, one row per sample.
            config: Source configuration; ``n_features`` falls back to 10
                when unset.

        Returns:
            Array of shape ``(n_samples, n_features)``.

        Raises:
            ValueError: If ``n_features`` is below 1 or the correlation
                magnitude exceeds 1 (which would otherwise yield NaN).
        """
        n_features = (
            self._DEFAULT_AUX_FEATURES
            if config.n_features is None
            else config.n_features
        )
        if n_features < 1:
            raise ValueError(
                f"n_features must be a positive integer, got {n_features}"
            )

        strength = abs(config.correlation_with_target)
        if not (strength <= 1.0):  # also rejects NaN
            raise ValueError(
                "correlation_with_target must be within [-1, 1], "
                f"got {config.correlation_with_target}"
            )

        # Generate features correlated with concentrations
        # Use a linear combination of concentrations plus noise
        n_components = concentrations.shape[1]

        # Create mixing matrix
        mixing = self.rng.normal(0, 1, size=(n_components, n_features))

        # Base features from concentrations
        base_features = concentrations @ mixing

        # Add noise based on correlation level
        if strength > 0:
            noise_std = np.sqrt((1 - strength**2) / strength**2)
            signal = base_features
        else:
            # Zero correlation means no signal at all: keep only noise,
            # scaled like the signal would have been.
            noise_std = 1.0
            signal = np.zeros_like(base_features)
        noise = self.rng.normal(0, noise_std, size=(n_samples, n_features))
        X = signal + noise * np.std(base_features)

        # Apply distribution transformation
        if config.distribution == "lognormal":
            X = np.exp((X - X.mean()) / X.std())
        elif config.distribution == "uniform":
            # Rank transform each column to [0, 1]; the max() guard avoids a
            # 0/0 division when there is a single sample.
            ranks = np.argsort(np.argsort(X, axis=0), axis=0)
            X = ranks / max(n_samples - 1, 1)

        return X

    def _generate_targets(
        self,
        concentrations: np.ndarray,
        target_range: Optional[Tuple[float, float]],
    ) -> np.ndarray:
        """
        Generate target values from concentrations.

        Args:
            concentrations: Shared concentrations, one row per sample.
            target_range: Optional (min, max); when given, targets are
                min-max rescaled onto it.

        Returns:
            One target value per sample.
        """
        # Weighted combination of components
        weights = self.rng.dirichlet(np.ones(concentrations.shape[1]))
        y = concentrations @ weights

        # Scale to range
        if target_range is not None:
            min_val, max_val = target_range
            y_min, y_max = y.min(), y.max()
            if y_max > y_min:
                y = (y - y_min) / (y_max - y_min) * (max_val - min_val) + min_val
            else:
                # Constant targets cannot be rescaled; use the midpoint
                y = np.full_like(y, (min_val + max_val) / 2)

        return y

    @staticmethod
    def _build_headers(
        result: MultiSourceResult,
    ) -> Tuple[List[str], Optional[str]]:
        """
        Build one header per column of the combined feature matrix.

        Args:
            result: Generation result; ``source_names``, ``sources`` and
                ``wavelengths`` are read.

        Returns:
            Tuple ``(headers, header_unit)``:

            - every source has wavelengths: the concatenated integer
              wavelengths and unit ``"nm"``;
            - no source has wavelengths: ``feature_<i>`` names, unit ``None``;
            - mixed: ``<source>_<wavelength>`` for spectral columns and
              ``<source>_<index>`` for the others, unit ``None``.
        """
        names = result.source_names
        spectral = [name for name in names if name in result.wavelengths]

        if names and len(spectral) == len(names):
            headers = [
                str(int(wl)) for name in names for wl in result.wavelengths[name]
            ]
            return headers, "nm"

        if not spectral:
            n_total = sum(arr.shape[1] for arr in result.sources.values())
            return [f"feature_{i}" for i in range(n_total)], None

        # Mixed sources: a wavelength unit would be wrong for the auxiliary
        # columns, so every column gets a source-prefixed label instead.
        headers: List[str] = []
        for name in names:
            if name in result.wavelengths:
                headers.extend(
                    f"{name}_{int(wl)}" for wl in result.wavelengths[name]
                )
            else:
                headers.extend(
                    f"{name}_{j}" for j in range(result.sources[name].shape[1])
                )
        return headers, None

    def create_dataset(
        self,
        n_samples: int,
        sources: List[Union[SourceConfig, Dict[str, Any]]],
        *,
        train_ratio: float = 0.8,
        target_range: Optional[Tuple[float, float]] = None,
        name: str = "multi_source_synthetic",
    ) -> SpectroDataset:
        """
        Create a SpectroDataset from multi-source generation.

        All sources are concatenated column-wise into one feature matrix, and
        the headers passed to the dataset always cover every column.

        Args:
            n_samples: Number of samples to generate.
            sources: List of source configurations.
            train_ratio: Proportion of samples for training.
            target_range: Optional (min, max) for target scaling.
            name: Dataset name.

        Returns:
            SpectroDataset holding the train partition and, when non-empty,
            the test partition.

        Example:
            >>> dataset = generator.create_dataset(
            ...     n_samples=500,
            ...     sources=[
            ...         {"name": "NIR", "type": "nir", "wavelength_range": (1000, 2500)},
            ...         {"name": "markers", "type": "aux", "n_features": 10}
            ...     ],
            ...     train_ratio=0.8
            ... )
        """
        from nirs4all.data import SpectroDataset

        # Generate data
        result = self.generate(
            n_samples=n_samples,
            sources=sources,
            target_range=target_range,
        )

        # Create dataset
        dataset = SpectroDataset(name=name)

        # Calculate partition sizes
        n_train = int(n_samples * train_ratio)

        # Shuffle indices
        indices = self.rng.permutation(n_samples)
        train_indices = indices[:n_train]
        test_indices = indices[n_train:]

        # Combine all sources into one feature matrix and label every column
        X_combined = result.get_combined_features()
        headers, header_unit = self._build_headers(result)

        # Add training samples
        dataset.add_samples(
            X_combined[train_indices],
            indexes={"partition": "train"},
            headers=headers,
            header_unit=header_unit,
        )

        # Add test samples
        if len(test_indices) > 0:
            dataset.add_samples(
                X_combined[test_indices],
                indexes={"partition": "test"},
                headers=headers,
                header_unit=header_unit,
            )

        # Add targets in the same order the samples were added
        y = result.targets
        dataset.add_targets(y[train_indices])
        if len(test_indices) > 0:
            dataset.add_targets(y[test_indices])

        return dataset
def generate_multi_source(
    n_samples: int,
    sources: Optional[List[Dict[str, Any]]] = None,
    *,
    random_state: Optional[int] = None,
    target_range: Optional[Tuple[float, float]] = None,
    as_dataset: bool = True,
    train_ratio: float = 0.8,
    name: str = "multi_source_synthetic",
) -> Union[SpectroDataset, MultiSourceResult]:
    """
    Convenience function for generating multi-source datasets.

    Builds a MultiSourceGenerator seeded with ``random_state`` and delegates
    to either ``create_dataset`` or ``generate``.

    Args:
        n_samples: Number of samples.
        sources: List of source configurations. If None, uses default
            single NIR source with wavelength range (1000, 2500).
        random_state: Random seed.
        target_range: Target value range.
        as_dataset: If True, returns SpectroDataset; otherwise the raw
            MultiSourceResult.
        train_ratio: Training set proportion (only used when
            ``as_dataset`` is True).
        name: Dataset name (only used when ``as_dataset`` is True).

    Returns:
        SpectroDataset or MultiSourceResult depending on as_dataset.

    Example:
        >>> dataset = generate_multi_source(
        ...     n_samples=500,
        ...     sources=[
        ...         {"name": "NIR", "type": "nir", "wavelength_range": (1000, 2500)},
        ...         {"name": "markers", "type": "aux", "n_features": 15}
        ...     ],
        ...     random_state=42
        ... )
    """
    # Fall back to a single full-range NIR source when nothing is specified
    source_specs = (
        [{"name": "NIR", "type": "nir", "wavelength_range": (1000, 2500)}]
        if sources is None
        else sources
    )

    builder = MultiSourceGenerator(random_state=random_state)

    # Raw result requested: no partitioning, no dataset wrapper
    if not as_dataset:
        return builder.generate(
            n_samples=n_samples,
            sources=source_specs,
            target_range=target_range,
        )

    return builder.create_dataset(
        n_samples=n_samples,
        sources=source_specs,
        train_ratio=train_ratio,
        target_range=target_range,
        name=name,
    )