Source code for nirs4all.data.synthetic.domains

"""
Application domain configurations for synthetic NIRS data generation.

This module provides domain-specific priors and configurations for generating
realistic synthetic NIRS data tailored to specific application areas such as
agriculture, pharmaceutical, food processing, petrochemical, and others.

Each domain configuration includes:
- Typical spectral components (chemical compounds)
- Concentration distributions specific to the domain
- Wavelength ranges commonly used
- Typical number of components in samples
- Domain-specific noise and artifact characteristics

Key Features:
    - 15+ predefined application domains
    - Domain-aware component selection
    - Realistic concentration priors
    - Easy integration with generators

Example:
    >>> from nirs4all.data.synthetic.domains import (
    ...     get_domain_config,
    ...     APPLICATION_DOMAINS,
    ...     DomainConfig
    ... )
    >>>
    >>> # Get configuration for agricultural samples
    >>> config = get_domain_config("agriculture_grain")
    >>> print(config.typical_components)
    ['starch', 'protein', 'moisture', 'lipid', 'cellulose', 'gluten', 'hemicellulose', 'dietary_fiber']

References:
    - Burns, D. A., & Ciurczak, E. W. (2007). Handbook of Near-Infrared
      Analysis (3rd ed.). CRC Press.
    - Williams, P. C., & Norris, K. H. (2001). Near-Infrared Technology
      in the Agricultural and Food Industries (2nd ed.). AACC International.
    - Reich, G. (2005). Near-Infrared Spectroscopy and Imaging: Basic Principles
      and Pharmaceutical Applications. Advanced Drug Delivery Reviews.
"""

from __future__ import annotations

from dataclasses import dataclass, field
from enum import Enum
from typing import Dict, List, Optional, Tuple, Any

import numpy as np


class DomainCategory(str, Enum):
    """Top-level domain categories.

    The enum mixes in ``str``, so every member is also a string equal to its
    lowercase value (for example ``DomainCategory.FOOD == "food"``).
    """

    AGRICULTURE = "agriculture"
    FOOD = "food"
    PHARMACEUTICAL = "pharmaceutical"
    PETROCHEMICAL = "petrochemical"
    TEXTILE = "textile"
    ENVIRONMENTAL = "environmental"
    BIOMEDICAL = "biomedical"
    # NOTE(review): no entry of APPLICATION_DOMAINS in this module uses
    # POLYMER; the polymer domain is filed under PETROCHEMICAL.
    POLYMER = "polymer"
    BEVERAGE = "beverage"
@dataclass
class ConcentrationPrior:
    """
    Bounded random distribution from which component concentrations are drawn.

    Attributes:
        distribution: Distribution family: 'uniform', 'normal', 'lognormal'
            or 'beta'. Any other name falls back to a uniform draw on [0, 1].
        params: Parameters of the chosen family. Keys read per family:
            uniform -> 'low', 'high'; normal -> 'mean', 'std';
            lognormal -> 'mean', 'sigma'; beta -> 'a', 'b'. Missing keys
            fall back to the defaults hard-coded in ``sample``.
        min_value: Lower bound; draws below it are clipped up to it.
        max_value: Upper bound; draws above it are clipped down to it.
    """

    distribution: str = "uniform"
    params: Dict[str, float] = field(default_factory=lambda: {"low": 0.0, "high": 1.0})
    min_value: float = 0.0
    max_value: float = 1.0

    def sample(self, rng: np.random.Generator, n_samples: int = 1) -> np.ndarray:
        """
        Draw concentrations from the prior and clip them to the allowed range.

        Args:
            rng: Random number generator used for the draw.
            n_samples: Number of values to draw.

        Returns:
            Array of ``n_samples`` values clipped to [min_value, max_value].
        """
        family = self.distribution
        lookup = self.params.get

        # Resolve the generator method and its positional arguments first,
        # then perform a single draw at the end.
        if family == "uniform":
            draw, args = rng.uniform, (lookup("low", 0.0), lookup("high", 1.0))
        elif family == "normal":
            draw, args = rng.normal, (lookup("mean", 0.5), lookup("std", 0.1))
        elif family == "lognormal":
            draw, args = rng.lognormal, (lookup("mean", -1.0), lookup("sigma", 0.5))
        elif family == "beta":
            draw, args = rng.beta, (lookup("a", 2.0), lookup("b", 5.0))
        else:
            # Unrecognised family: plain uniform on [0, 1], ignoring params.
            draw, args = rng.uniform, (0, 1)

        raw = draw(*args, size=n_samples)
        return np.clip(raw, self.min_value, self.max_value)
@dataclass
class DomainConfig:
    """
    Configuration for a specific application domain.

    Encapsulates all domain-specific parameters needed for generating
    realistic synthetic NIRS data.

    Attributes:
        name: Human-readable domain name.
        category: Domain category (agriculture, pharmaceutical, etc.).
        description: Brief description of the domain.
        typical_components: List of predefined component names commonly found.
        component_weights: Relative importance of each component (for
            selection). When None, all typical components are equally likely.
        concentration_priors: Per-component concentration distributions.
        wavelength_range: Typical measurement range (nm).
        n_components_range: Inclusive (min, max) number of components per sample.
        noise_level: Typical noise level ('low', 'medium', 'high').
        measurement_mode: Typical measurement geometry.
        typical_sample_types: Examples of sample types in this domain.
        complexity: Overall complexity level for generation.
        additional_params: Domain-specific additional parameters.
    """

    name: str
    category: DomainCategory
    description: str = ""
    typical_components: List[str] = field(default_factory=list)
    component_weights: Optional[Dict[str, float]] = None
    concentration_priors: Dict[str, ConcentrationPrior] = field(default_factory=dict)
    wavelength_range: Tuple[float, float] = (1000, 2500)
    n_components_range: Tuple[int, int] = (3, 8)
    noise_level: str = "medium"
    measurement_mode: str = "reflectance"
    typical_sample_types: List[str] = field(default_factory=list)
    complexity: str = "realistic"
    additional_params: Dict[str, Any] = field(default_factory=dict)

    def get_component_weights(self) -> Dict[str, float]:
        """
        Get normalized component weights for selection.

        Returns:
            A new dict mapping component name to a weight; the weights sum
            to 1. Explicit ``component_weights`` are rescaled when they do
            not already sum to 1; otherwise every entry of
            ``typical_components`` gets an equal share. An empty dict is
            returned when there is nothing to weight.

        Raises:
            ValueError: If an explicit weight is negative, or the explicit
                weights do not add up to a positive number.
        """
        if self.component_weights is None:
            # Default: equal weights
            n = len(self.typical_components)
            if n == 0:
                return {}
            return {comp: 1.0 / n for comp in self.typical_components}

        # Copy so callers can never mutate the configuration through the
        # returned mapping.
        weights = dict(self.component_weights)
        if not weights:
            return {}

        total = sum(weights.values())
        # `not total > 0` also rejects NaN totals.
        if any(w < 0 for w in weights.values()) or not total > 0:
            raise ValueError(
                f"component_weights for domain '{self.name}' must be "
                f"non-negative with a positive sum, got {weights}"
            )

        # Already normalized up to rounding noise: keep the hand-written
        # values untouched so tables such as {"starch": 0.25, ...} round-trip
        # exactly.
        if abs(total - 1.0) <= 1e-9:
            return weights
        return {comp: w / total for comp, w in weights.items()}

    def sample_components(
        self,
        rng: np.random.Generator,
        n_components: Optional[int] = None
    ) -> List[str]:
        """
        Sample components for a sample based on domain priors.

        Args:
            rng: Random number generator.
            n_components: Number of components. If None, samples from
                ``n_components_range`` (both ends inclusive).

        Returns:
            List of distinct component names as plain ``str`` objects. The
            list is shorter than ``n_components`` when the domain has fewer
            components with a non-zero weight, and empty when the domain has
            no components at all.
        """
        if n_components is None:
            n_components = rng.integers(
                self.n_components_range[0],
                self.n_components_range[1] + 1
            )

        weights = self.get_component_weights()
        if not weights:
            # Nothing to choose from; rng.choice would reject an empty p.
            return []

        components = list(weights.keys())
        probs = np.array(list(weights.values()), dtype=float)
        probs = probs / probs.sum()

        # Sampling without replacement can only return entries whose
        # probability is non-zero, so cap the request at that count.
        n_available = int(np.count_nonzero(probs))
        n_to_sample = min(int(n_components), n_available)

        selected = rng.choice(
            components,
            size=n_to_sample,
            replace=False,
            p=probs
        )
        # tolist() converts numpy string scalars to built-in str, so the
        # result prints as ['water', ...] on every NumPy version.
        return selected.tolist()

    def sample_concentrations(
        self,
        rng: np.random.Generator,
        components: List[str],
        n_samples: int = 1
    ) -> np.ndarray:
        """
        Sample concentrations for selected components.

        Each component is drawn independently from its own prior; rows are
        not rescaled to sum to 1.

        Args:
            rng: Random number generator.
            components: List of component names.
            n_samples: Number of samples.

        Returns:
            Concentration matrix (n_samples, n_components).
        """
        n_components = len(components)
        concentrations = np.zeros((n_samples, n_components))

        for i, comp in enumerate(components):
            if comp in self.concentration_priors:
                prior = self.concentration_priors[comp]
            else:
                # Default prior for components without a domain-specific one
                prior = ConcentrationPrior(
                    distribution="beta",
                    params={"a": 2, "b": 5}
                )
            concentrations[:, i] = prior.sample(rng, n_samples)

        return concentrations
# ============================================================================
# Predefined Domain Configurations
# ============================================================================
#
# Registry of the predefined domains. Each key is the identifier accepted by
# get_domain_config(); each value is a fully populated DomainConfig.
#
# Reading notes:
# - ConcentrationPrior is built positionally as
#   (distribution, params, min_value, max_value).
# - A domain that omits component_weights gets equal weights over its
#   typical_components (see DomainConfig.get_component_weights).
# - A component without an entry in concentration_priors is sampled from the
#   default beta prior used in DomainConfig.sample_concentrations.

APPLICATION_DOMAINS: Dict[str, DomainConfig] = {
    # =========================================================================
    # AGRICULTURE DOMAINS
    # =========================================================================
    "agriculture_grain": DomainConfig(
        name="Grain and Cereals",
        category=DomainCategory.AGRICULTURE,
        description="NIR analysis of wheat, corn, barley, rice, and other cereals",
        typical_components=[
            "starch", "protein", "moisture", "lipid", "cellulose",
            "gluten", "hemicellulose", "dietary_fiber"
        ],
        component_weights={
            "starch": 0.25, "protein": 0.20, "moisture": 0.20, "lipid": 0.10,
            "cellulose": 0.10, "gluten": 0.08, "hemicellulose": 0.05, "dietary_fiber": 0.02
        },
        concentration_priors={
            "starch": ConcentrationPrior("normal", {"mean": 0.65, "std": 0.10}, 0.3, 0.8),
            "protein": ConcentrationPrior("normal", {"mean": 0.12, "std": 0.03}, 0.05, 0.25),
            "moisture": ConcentrationPrior("normal", {"mean": 0.12, "std": 0.02}, 0.08, 0.18),
            "lipid": ConcentrationPrior("beta", {"a": 2, "b": 20}, 0.01, 0.10),
        },
        wavelength_range=(1100, 2500),
        n_components_range=(4, 7),
        noise_level="medium",
        measurement_mode="reflectance",
        typical_sample_types=["wheat flour", "corn meal", "whole grain", "ground samples"],
        complexity="realistic",
    ),
    "agriculture_forage": DomainConfig(
        name="Forage and Feed",
        category=DomainCategory.AGRICULTURE,
        description="NIR analysis of hay, silage, and animal feed",
        typical_components=[
            "protein", "moisture", "cellulose", "hemicellulose", "lignin",
            "starch", "lipid", "nitrogen_compound", "dietary_fiber"
        ],
        component_weights={
            "cellulose": 0.20, "protein": 0.18, "moisture": 0.18, "hemicellulose": 0.12,
            "lignin": 0.10, "starch": 0.08, "lipid": 0.06, "nitrogen_compound": 0.05,
            "dietary_fiber": 0.03
        },
        concentration_priors={
            "protein": ConcentrationPrior("normal", {"mean": 0.15, "std": 0.05}, 0.05, 0.30),
            "moisture": ConcentrationPrior("normal", {"mean": 0.15, "std": 0.05}, 0.05, 0.40),
            "cellulose": ConcentrationPrior("normal", {"mean": 0.30, "std": 0.08}, 0.15, 0.50),
        },
        wavelength_range=(1100, 2500),
        n_components_range=(5, 9),
        noise_level="high",
        measurement_mode="reflectance",
        typical_sample_types=["hay", "silage", "TMR", "pasture"],
        complexity="complex",
    ),
    "agriculture_oilseeds": DomainConfig(
        name="Oilseeds",
        category=DomainCategory.AGRICULTURE,
        description="NIR analysis of soybeans, canola, sunflower, and other oilseeds",
        typical_components=[
            "oil", "protein", "moisture", "starch", "cellulose",
            "unsaturated_fat", "saturated_fat"
        ],
        component_weights={
            "oil": 0.25, "protein": 0.25, "moisture": 0.15, "starch": 0.12,
            "cellulose": 0.10, "unsaturated_fat": 0.08, "saturated_fat": 0.05
        },
        concentration_priors={
            "oil": ConcentrationPrior("normal", {"mean": 0.20, "std": 0.05}, 0.10, 0.45),
            "protein": ConcentrationPrior("normal", {"mean": 0.35, "std": 0.05}, 0.25, 0.50),
            "moisture": ConcentrationPrior("normal", {"mean": 0.10, "std": 0.02}, 0.05, 0.15),
        },
        wavelength_range=(1100, 2500),
        n_components_range=(4, 7),
        noise_level="medium",
        measurement_mode="reflectance",
        typical_sample_types=["soybeans", "canola", "sunflower seeds", "cottonseed"],
    ),
    "agriculture_fruit": DomainConfig(
        name="Fruits and Vegetables",
        category=DomainCategory.AGRICULTURE,
        description="NIR analysis of fresh produce quality",
        typical_components=[
            "water", "glucose", "fructose", "sucrose", "starch",
            "cellulose", "malic_acid", "citric_acid", "carotenoid"
        ],
        component_weights={
            "water": 0.25, "glucose": 0.15, "fructose": 0.15, "sucrose": 0.12,
            "cellulose": 0.10, "starch": 0.08, "malic_acid": 0.06, "citric_acid": 0.05,
            "carotenoid": 0.04
        },
        concentration_priors={
            "water": ConcentrationPrior("normal", {"mean": 0.85, "std": 0.05}, 0.70, 0.95),
            "glucose": ConcentrationPrior("beta", {"a": 2, "b": 10}, 0.02, 0.15),
            "fructose": ConcentrationPrior("beta", {"a": 2, "b": 10}, 0.02, 0.15),
        },
        wavelength_range=(700, 1100),  # Shorter range for fresh produce
        n_components_range=(5, 8),
        noise_level="medium",
        measurement_mode="reflectance",
        typical_sample_types=["apples", "tomatoes", "citrus", "berries"],
    ),

    # =========================================================================
    # FOOD DOMAINS
    # =========================================================================
    "food_dairy": DomainConfig(
        name="Dairy Products",
        category=DomainCategory.FOOD,
        description="NIR analysis of milk, cheese, and dairy products",
        typical_components=[
            "water", "lactose", "casein", "lipid", "moisture", "protein"
        ],
        component_weights={
            "water": 0.25, "lactose": 0.20, "casein": 0.18, "lipid": 0.18,
            "moisture": 0.10, "protein": 0.09
        },
        concentration_priors={
            "water": ConcentrationPrior("normal", {"mean": 0.87, "std": 0.02}, 0.80, 0.92),
            "lipid": ConcentrationPrior("normal", {"mean": 0.04, "std": 0.01}, 0.01, 0.08),
            "protein": ConcentrationPrior("normal", {"mean": 0.035, "std": 0.005}, 0.02, 0.05),
            "lactose": ConcentrationPrior("normal", {"mean": 0.048, "std": 0.003}, 0.04, 0.055),
        },
        wavelength_range=(1100, 2500),
        n_components_range=(4, 6),
        noise_level="low",
        measurement_mode="transflectance",
        typical_sample_types=["milk", "cheese", "yogurt", "cream"],
    ),
    "food_meat": DomainConfig(
        name="Meat and Poultry",
        category=DomainCategory.FOOD,
        description="NIR analysis of meat composition and quality",
        typical_components=[
            "water", "protein", "lipid", "moisture", "collagen"
        ],
        component_weights={
            "water": 0.25, "protein": 0.30, "lipid": 0.25, "moisture": 0.12, "collagen": 0.08
        },
        concentration_priors={
            "water": ConcentrationPrior("normal", {"mean": 0.70, "std": 0.05}, 0.55, 0.80),
            "protein": ConcentrationPrior("normal", {"mean": 0.20, "std": 0.03}, 0.12, 0.28),
            "lipid": ConcentrationPrior("lognormal", {"mean": -2, "sigma": 0.5}, 0.02, 0.40),
        },
        wavelength_range=(900, 1700),
        n_components_range=(4, 5),
        noise_level="medium",
        measurement_mode="reflectance",
        typical_sample_types=["beef", "pork", "chicken", "ground meat"],
        additional_params={"collagen": "derived from protein group"},
    ),
    "food_bakery": DomainConfig(
        name="Bakery Products",
        category=DomainCategory.FOOD,
        description="NIR analysis of bread, cookies, and baked goods",
        typical_components=[
            "starch", "gluten", "moisture", "lipid", "sucrose",
            "glucose", "protein", "cellulose"
        ],
        component_weights={
            "starch": 0.25, "gluten": 0.18, "moisture": 0.18, "lipid": 0.12,
            "sucrose": 0.10, "glucose": 0.07, "protein": 0.06, "cellulose": 0.04
        },
        wavelength_range=(1100, 2500),
        n_components_range=(5, 8),
        noise_level="medium",
        measurement_mode="reflectance",
        typical_sample_types=["bread", "cookies", "crackers", "pastries"],
    ),
    "food_chocolate": DomainConfig(
        name="Confectionery and Chocolate",
        category=DomainCategory.FOOD,
        description="NIR analysis of chocolate and confectionery products",
        typical_components=[
            "lipid", "sucrose", "moisture", "protein", "starch",
            "caffeine", "unsaturated_fat"
        ],
        component_weights={
            "lipid": 0.25, "sucrose": 0.25, "moisture": 0.12, "protein": 0.12,
            "starch": 0.10, "caffeine": 0.08, "unsaturated_fat": 0.08
        },
        wavelength_range=(1100, 2500),
        n_components_range=(4, 7),
        noise_level="low",
        measurement_mode="reflectance",
        typical_sample_types=["dark chocolate", "milk chocolate", "cocoa powder"],
    ),

    # =========================================================================
    # PHARMACEUTICAL DOMAINS
    # =========================================================================
    "pharma_tablets": DomainConfig(
        name="Pharmaceutical Tablets",
        category=DomainCategory.PHARMACEUTICAL,
        description="NIR analysis of tablet formulations and API content",
        typical_components=[
            "starch", "cellulose", "lactose", "moisture",
            "aspirin", "paracetamol", "caffeine"
        ],
        component_weights={
            "starch": 0.18, "cellulose": 0.18, "lactose": 0.18, "moisture": 0.12,
            "aspirin": 0.12, "paracetamol": 0.12, "caffeine": 0.10
        },
        concentration_priors={
            "moisture": ConcentrationPrior("normal", {"mean": 0.02, "std": 0.005}, 0.005, 0.05),
        },
        wavelength_range=(1100, 2500),
        n_components_range=(4, 7),
        noise_level="low",
        measurement_mode="reflectance",
        typical_sample_types=["tablets", "capsules", "granules"],
        complexity="simple",
    ),
    "pharma_powder_blends": DomainConfig(
        name="Pharmaceutical Powder Blends",
        category=DomainCategory.PHARMACEUTICAL,
        description="NIR monitoring of powder blending uniformity",
        typical_components=[
            "starch", "cellulose", "lactose", "moisture",
            "aspirin", "paracetamol", "caffeine"
        ],
        wavelength_range=(1100, 2500),
        n_components_range=(3, 6),
        noise_level="medium",
        measurement_mode="reflectance",
        typical_sample_types=["powder blend", "premix", "granulation"],
    ),
    "pharma_raw_materials": DomainConfig(
        name="Pharmaceutical Raw Materials",
        category=DomainCategory.PHARMACEUTICAL,
        description="NIR identification and verification of raw materials",
        typical_components=[
            "starch", "cellulose", "lactose", "glucose", "sucrose",
            "aspirin", "paracetamol", "caffeine", "urea"
        ],
        wavelength_range=(1100, 2500),
        n_components_range=(1, 3),  # Usually single component verification
        noise_level="low",
        measurement_mode="reflectance",
        typical_sample_types=["excipients", "APIs", "intermediates"],
    ),

    # =========================================================================
    # PETROCHEMICAL DOMAINS
    # =========================================================================
    "petrochem_fuels": DomainConfig(
        name="Petroleum Fuels",
        category=DomainCategory.PETROCHEMICAL,
        description="NIR analysis of gasoline, diesel, and aviation fuels",
        typical_components=[
            "alkane", "aromatic", "oil", "unsaturated_fat", "methanol", "ethanol"
        ],
        component_weights={
            "alkane": 0.35, "aromatic": 0.25, "oil": 0.20, "unsaturated_fat": 0.10,
            "methanol": 0.05, "ethanol": 0.05
        },
        wavelength_range=(900, 1700),
        n_components_range=(3, 6),
        noise_level="low",
        measurement_mode="transmission",
        typical_sample_types=["gasoline", "diesel", "jet fuel", "biodiesel"],
    ),
    "petrochem_polymers": DomainConfig(
        name="Petrochemical Polymers",
        category=DomainCategory.PETROCHEMICAL,
        description="NIR analysis of synthetic polymers and plastics",
        typical_components=[
            "polyethylene", "polystyrene", "nylon", "polyester", "natural_rubber"
        ],
        wavelength_range=(1100, 2500),
        n_components_range=(1, 4),
        noise_level="low",
        measurement_mode="reflectance",
        typical_sample_types=["pellets", "films", "fibers", "molded parts"],
    ),

    # =========================================================================
    # TEXTILE DOMAINS
    # =========================================================================
    "textile_natural": DomainConfig(
        name="Natural Fibers",
        category=DomainCategory.TEXTILE,
        description="NIR analysis of cotton, wool, and natural fibers",
        typical_components=[
            "cellulose", "cotton", "moisture", "protein", "waxes", "lignin"
        ],
        component_weights={
            "cellulose": 0.30, "cotton": 0.25, "moisture": 0.20, "protein": 0.10,
            "waxes": 0.10, "lignin": 0.05
        },
        wavelength_range=(1100, 2500),
        n_components_range=(3, 6),
        noise_level="medium",
        measurement_mode="reflectance",
        typical_sample_types=["cotton", "wool", "silk", "linen"],
    ),
    "textile_synthetic": DomainConfig(
        name="Synthetic Fibers",
        category=DomainCategory.TEXTILE,
        description="NIR analysis of polyester, nylon, and synthetic fibers",
        typical_components=[
            "polyester", "nylon", "polystyrene", "moisture"
        ],
        wavelength_range=(1100, 2500),
        n_components_range=(2, 4),
        noise_level="low",
        measurement_mode="reflectance",
        typical_sample_types=["polyester", "nylon", "acrylic", "blends"],
    ),

    # =========================================================================
    # ENVIRONMENTAL DOMAINS
    # =========================================================================
    "environmental_soil": DomainConfig(
        name="Soil Analysis",
        category=DomainCategory.ENVIRONMENTAL,
        description="NIR analysis of soil properties and composition",
        typical_components=[
            "moisture", "carbonates", "kaolinite", "gypsum",
            "cellulose", "lignin", "protein"
        ],
        component_weights={
            "moisture": 0.25, "carbonates": 0.18, "kaolinite": 0.18, "cellulose": 0.12,
            "lignin": 0.10, "gypsum": 0.10, "protein": 0.07
        },
        wavelength_range=(1100, 2500),
        n_components_range=(4, 7),
        noise_level="high",
        measurement_mode="reflectance",
        typical_sample_types=["topsoil", "subsoil", "sediments"],
        complexity="complex",
    ),
    "environmental_water": DomainConfig(
        name="Water Quality",
        category=DomainCategory.ENVIRONMENTAL,
        description="NIR analysis of water quality parameters",
        typical_components=[
            "water", "glucose", "protein", "urea", "acetic_acid"
        ],
        wavelength_range=(900, 1100),  # Short-wave NIR for water
        n_components_range=(2, 4),
        noise_level="medium",
        measurement_mode="transmission",
        typical_sample_types=["surface water", "wastewater", "process water"],
    ),

    # =========================================================================
    # BEVERAGE DOMAINS
    # =========================================================================
    "beverage_wine": DomainConfig(
        name="Wine and Spirits",
        category=DomainCategory.BEVERAGE,
        description="NIR analysis of wine, beer, and alcoholic beverages",
        typical_components=[
            "water", "ethanol", "glucose", "fructose", "glycerol",
            "tartaric_acid", "malic_acid", "tannins"
        ],
        component_weights={
            "water": 0.20, "ethanol": 0.25, "glucose": 0.10, "fructose": 0.10,
            "glycerol": 0.10, "tartaric_acid": 0.10, "malic_acid": 0.08, "tannins": 0.07
        },
        concentration_priors={
            "ethanol": ConcentrationPrior("normal", {"mean": 0.13, "std": 0.02}, 0.08, 0.18),
            "glucose": ConcentrationPrior("lognormal", {"mean": -3, "sigma": 0.8}, 0.0, 0.15),
        },
        wavelength_range=(900, 1700),
        n_components_range=(5, 8),
        noise_level="low",
        measurement_mode="transmission",
        typical_sample_types=["red wine", "white wine", "beer", "spirits"],
    ),
    "beverage_juice": DomainConfig(
        name="Fruit Juices",
        category=DomainCategory.BEVERAGE,
        description="NIR analysis of fruit juices and beverages",
        typical_components=[
            "water", "glucose", "fructose", "sucrose",
            "citric_acid", "malic_acid", "carotenoid"
        ],
        wavelength_range=(900, 1100),
        n_components_range=(4, 7),
        noise_level="low",
        measurement_mode="transmission",
        typical_sample_types=["orange juice", "apple juice", "grape juice"],
    ),

    # =========================================================================
    # BIOMEDICAL DOMAINS
    # =========================================================================
    "biomedical_tissue": DomainConfig(
        name="Tissue Analysis",
        category=DomainCategory.BIOMEDICAL,
        description="NIR spectroscopy of biological tissues",
        typical_components=[
            "water", "lipid", "protein", "glucose", "hemoglobin"
        ],
        component_weights={
            "water": 0.35, "lipid": 0.20, "protein": 0.25, "glucose": 0.10, "hemoglobin": 0.10
        },
        wavelength_range=(700, 1100),  # Optical window
        n_components_range=(3, 5),
        noise_level="high",
        measurement_mode="reflectance",
        typical_sample_types=["skin", "muscle", "fat tissue"],
        additional_params={"hemoglobin": "simulated with carotenoid"},
    ),
}


# ============================================================================
# Domain Access Functions
# ============================================================================
def get_domain_config(domain_name: str) -> DomainConfig:
    """
    Look up the predefined configuration registered under a domain name.

    Args:
        domain_name: Key of the domain in APPLICATION_DOMAINS.

    Returns:
        The DomainConfig stored for that key (the registry's own object,
        not a copy).

    Raises:
        ValueError: If no domain is registered under ``domain_name``; the
            message lists every available key.

    Example:
        >>> config = get_domain_config("agriculture_grain")
        >>> print(config.name)
        Grain and Cereals
    """
    try:
        return APPLICATION_DOMAINS[domain_name]
    except KeyError:
        available = list(APPLICATION_DOMAINS.keys())
        raise ValueError(
            f"Unknown domain: '{domain_name}'. Available domains: {available}"
        ) from None
def list_domains(category: Optional[DomainCategory] = None) -> List[str]:
    """
    Return the names of the registered domains, optionally by category.

    Args:
        category: When given, only domains whose ``category`` equals this
            value are returned. When None, every domain is returned.

    Returns:
        Domain names in the order they appear in APPLICATION_DOMAINS.

    Example:
        >>> list_domains(DomainCategory.AGRICULTURE)
        ['agriculture_grain', 'agriculture_forage', ...]
    """
    if category is None:
        return list(APPLICATION_DOMAINS)
    return [
        key
        for key, entry in APPLICATION_DOMAINS.items()
        if entry.category == category
    ]
def get_domain_components(domain_name: str) -> List[str]:
    """
    Return the typical component names configured for a domain.

    Args:
        domain_name: Key of the domain in APPLICATION_DOMAINS.

    Returns:
        The domain's ``typical_components`` list (the configuration's own
        list object, not a copy).

    Raises:
        ValueError: Propagated from get_domain_config for an unknown domain.

    Example:
        >>> get_domain_components("food_dairy")
        ['water', 'lactose', 'casein', 'lipid', 'moisture', 'protein']
    """
    return get_domain_config(domain_name).typical_components
def get_domains_for_component(component_name: str) -> List[str]:
    """
    List the domains whose typical components include a given component.

    Args:
        component_name: Component name to look for.

    Returns:
        Names of every domain listing ``component_name`` in its
        ``typical_components``, in APPLICATION_DOMAINS order. Empty when no
        domain contains it.

    Example:
        >>> get_domains_for_component("protein")
        ['agriculture_grain', 'agriculture_forage', 'agriculture_oilseeds', ...]
    """
    return [
        key
        for key, entry in APPLICATION_DOMAINS.items()
        if component_name in entry.typical_components
    ]
def create_domain_aware_library(
    domain_name: str,
    n_samples: int = 100,
    random_state: Optional[int] = None
) -> Tuple[List[str], np.ndarray]:
    """
    Draw a component set and matching concentrations from a domain's priors.

    One component selection is sampled for the whole call. Concentrations
    for those components are then sampled ``n_samples`` times, using the
    same random generator for both steps.

    Args:
        domain_name: Key of the domain in APPLICATION_DOMAINS.
        n_samples: Number of concentration rows to generate.
        random_state: Seed passed to ``np.random.default_rng``; None gives
            a non-reproducible draw.

    Returns:
        Tuple ``(component_names, concentration_matrix)`` where the matrix
        has shape (n_samples, len(component_names)).

    Raises:
        ValueError: Propagated from get_domain_config for an unknown domain.

    Example:
        The selected components depend on the seed; the values below are
        illustrative.

        >>> components, concentrations = create_domain_aware_library(
        ...     "food_dairy",
        ...     n_samples=50,
        ...     random_state=42
        ... )
        >>> concentrations.shape == (50, len(components))
        True
    """
    domain = get_domain_config(domain_name)
    generator = np.random.default_rng(random_state)

    # Component selection must come first: it consumes the generator before
    # the concentration draws, and fixes the matrix width.
    chosen = domain.sample_components(generator)
    matrix = domain.sample_concentrations(generator, chosen, n_samples)
    return chosen, matrix
# ============================================================================ # Module-level exports # ============================================================================ __all__ = [ # Classes "DomainCategory", "ConcentrationPrior", "DomainConfig", # Data "APPLICATION_DOMAINS", # Functions "get_domain_config", "list_domains", "get_domain_components", "get_domains_for_component", "create_domain_aware_library", ]