"""
Benchmark dataset utilities for synthetic data validation.
This module provides information about standard NIR benchmark datasets
that can be used to validate synthetic data quality.
Phase 4 Features:
- Benchmark dataset registry with metadata
- Dataset characteristic summaries
- Reference spectral properties
- Loader utilities for common formats
Note:
This module provides metadata and loading utilities for benchmark datasets.
The actual dataset files need to be obtained from their respective sources
due to licensing restrictions.
References:
- Corn (Cargill): M5spec competition dataset
- Tecator (meat): StatLib - meat protein/fat/moisture
- Shootout 2002: IDRC shootout pharmaceutical tablets
- Wheat: Hard red wheat kernels dataset
"""
from __future__ import annotations
from dataclasses import dataclass, field
from enum import Enum
from pathlib import Path
from typing import Any, Callable, Dict, List, Optional, Tuple, Union
import numpy as np
# ============================================================================
# Benchmark Dataset Registry
# ============================================================================
[docs]
class BenchmarkDomain(str, Enum):
    """
    Application areas used to categorise benchmark datasets.

    Members also subclass ``str``, so each one compares equal to its
    lowercase value and can be looked up from it, e.g.
    ``BenchmarkDomain("food")``.
    """

    AGRICULTURE = "agriculture"
    FOOD = "food"
    PHARMACEUTICAL = "pharmaceutical"
    PETROCHEMICAL = "petrochemical"
    ENVIRONMENTAL = "environmental"
    GENERAL = "general"
[docs]
@dataclass
class BenchmarkDatasetInfo:
    """
    Descriptive record for one benchmark dataset.

    Only metadata is stored here; no spectra are held by this class.

    Attributes:
        name: Short identifier of the dataset.
        full_name: Long, human-readable title.
        domain: Application domain the dataset belongs to.
        n_samples: Sample count (approximate when it varies).
        n_wavelengths: Count of wavelength points per spectrum.
        wavelength_range: (min, max) wavelength in nm.
        targets: Names of the target variables.
        sample_type: Free-text description of the samples.
        measurement_mode: Typical measurement mode.
        source_url: Where the dataset can be obtained.
        reference: Publication or source reference.
        license: License information.
        typical_snr: Typical signal-to-noise ratio range.
        typical_peak_density: Typical peaks per 100 nm.
        notes: Any additional remarks.
    """

    name: str
    full_name: str
    domain: BenchmarkDomain
    n_samples: int
    n_wavelengths: int
    wavelength_range: Tuple[float, float]
    targets: List[str]
    sample_type: str
    measurement_mode: str
    source_url: str
    reference: str
    license: str = "Unknown"
    typical_snr: Tuple[float, float] = (50, 500)
    typical_peak_density: Tuple[float, float] = (1.0, 5.0)
    notes: str = ""

    def summary(self) -> str:
        """Build a multi-line, human-readable description of the dataset."""
        report = "\n".join((
            f"Dataset: {self.full_name} ({self.name})",
            f"Domain: {self.domain.value}",
            f"Samples: {self.n_samples}",
            f"Wavelengths: {self.n_wavelengths} ({self.wavelength_range[0]}-{self.wavelength_range[1]} nm)",
            f"Targets: {', '.join(self.targets)}",
            f"Sample Type: {self.sample_type}",
            f"Measurement: {self.measurement_mode}",
            f"Source: {self.source_url}",
            f"Reference: {self.reference}",
        ))
        # The notes line is only emitted when there is something to say.
        if not self.notes:
            return report
        return "\n".join((report, f"Notes: {self.notes}"))
# Registry of benchmark datasets
# Maps each dataset's short identifier to its metadata record. Every key is
# identical to the `name` field of the record it points to. All entries are
# static literals: nothing is read from disk or the network when this is built.
# NOTE(review): the sample counts, wavelength counts and ranges below are
# hand-entered reference figures — verify against the actual files before
# relying on them for exact array shapes.
BENCHMARK_DATASETS: Dict[str, BenchmarkDatasetInfo] = {
    "corn": BenchmarkDatasetInfo(
        name="corn",
        full_name="Corn/Maize M5spec Dataset",
        domain=BenchmarkDomain.AGRICULTURE,
        n_samples=80,
        n_wavelengths=700,
        wavelength_range=(1100, 2498),
        targets=["moisture", "oil", "protein", "starch"],
        sample_type="Ground corn samples",
        measurement_mode="reflectance",
        source_url="http://www.eigenvector.com/data/Corn/",
        reference="Eigenvector Research, Cargill Inc.",
        license="Free for research use",
        typical_snr=(100, 500),
        typical_peak_density=(1.5, 4.0),
        notes="Classic small sample size calibration challenge. 3 instruments.",
    ),
    "tecator": BenchmarkDatasetInfo(
        name="tecator",
        full_name="Tecator Meat Dataset",
        domain=BenchmarkDomain.FOOD,
        n_samples=215,
        n_wavelengths=100,
        wavelength_range=(850, 1050),
        targets=["fat", "moisture", "protein"],
        sample_type="Finely chopped meat samples",
        measurement_mode="transmittance",
        source_url="http://lib.stat.cmu.edu/datasets/tecator",
        reference="Tecator Infratec Food and Feed Analyzer",
        license="Free for academic use",
        typical_snr=(200, 1000),
        typical_peak_density=(0.5, 2.0),
        notes="Wet meat samples. Narrow wavelength range.",
    ),
    "shootout2002": BenchmarkDatasetInfo(
        name="shootout2002",
        full_name="IDRC Shootout 2002 Pharmaceutical Tablets",
        domain=BenchmarkDomain.PHARMACEUTICAL,
        n_samples=654,
        n_wavelengths=404,
        wavelength_range=(600, 1898),
        targets=["api_content", "hardness", "active_weight"],
        sample_type="Pharmaceutical tablets",
        measurement_mode="reflectance",
        source_url="http://www.idrc-chambersburg.org/shootout.html",
        reference="IDRC (International Diffuse Reflectance Conference)",
        license="Free for research use",
        typical_snr=(100, 400),
        typical_peak_density=(2.0, 5.0),
        notes="Blend uniformity challenge. Multiple manufacturing lots.",
    ),
    "wheat_kernels": BenchmarkDatasetInfo(
        name="wheat_kernels",
        full_name="Hard Red Wheat Kernels",
        domain=BenchmarkDomain.AGRICULTURE,
        n_samples=155,
        n_wavelengths=100,
        wavelength_range=(1100, 2498),
        targets=["protein", "moisture", "hardness"],
        sample_type="Intact wheat kernels",
        measurement_mode="reflectance",
        source_url="http://www.eigenvector.com/data/Wheat/",
        reference="Eigenvector Research",
        license="Free for research use",
        typical_snr=(50, 300),
        typical_peak_density=(1.0, 3.0),
        notes="Intact kernel analysis (not ground). High scatter variation.",
    ),
    "diesel": BenchmarkDatasetInfo(
        name="diesel",
        full_name="Diesel Fuel NIR Dataset",
        domain=BenchmarkDomain.PETROCHEMICAL,
        n_samples=245,
        n_wavelengths=401,
        wavelength_range=(750, 1550),
        targets=["cetane", "density", "viscosity", "total_aromatics"],
        sample_type="Diesel fuel samples",
        measurement_mode="transmittance",
        source_url="http://www.eigenvector.com/data/SWRI/",
        reference="Southwest Research Institute",
        license="Free for research use",
        typical_snr=(300, 1000),
        typical_peak_density=(0.5, 2.0),
        notes="Clear liquid samples. Low scattering.",
    ),
    # The three entries below carry free-text placeholders in `source_url`
    # and `reference` rather than a fetchable URL or a single citation.
    "tablet_api": BenchmarkDatasetInfo(
        name="tablet_api",
        full_name="Tablet Active Pharmaceutical Ingredient Dataset",
        domain=BenchmarkDomain.PHARMACEUTICAL,
        n_samples=310,
        n_wavelengths=650,
        wavelength_range=(1100, 2498),
        targets=["api_concentration"],
        sample_type="Intact pharmaceutical tablets",
        measurement_mode="reflectance",
        source_url="Various publications",
        reference="Multiple sources",
        license="Various",
        typical_snr=(80, 400),
        typical_peak_density=(2.0, 6.0),
        notes="Typical intact tablet analysis scenario.",
    ),
    "milk": BenchmarkDatasetInfo(
        name="milk",
        full_name="Milk Composition Dataset",
        domain=BenchmarkDomain.FOOD,
        n_samples=300,
        n_wavelengths=1050,
        wavelength_range=(400, 2500),
        targets=["fat", "protein", "lactose"],
        sample_type="Raw milk samples",
        measurement_mode="transflectance",
        source_url="Various dairy research",
        reference="Dairy research literature",
        license="Various",
        typical_snr=(200, 800),
        typical_peak_density=(1.5, 4.0),
        notes="Emulsion samples. Water dominates spectrum.",
    ),
    "olive_oil": BenchmarkDatasetInfo(
        name="olive_oil",
        full_name="Olive Oil Authenticity Dataset",
        domain=BenchmarkDomain.FOOD,
        n_samples=120,
        n_wavelengths=1050,
        wavelength_range=(400, 2500),
        targets=["adulterant_fraction", "acidity", "peroxide_value"],
        sample_type="Olive oil samples",
        measurement_mode="transmittance",
        source_url="Various food authenticity research",
        reference="Food authenticity literature",
        license="Various",
        typical_snr=(400, 1200),
        typical_peak_density=(0.5, 2.0),
        notes="Clear liquid. Classification and regression tasks.",
    ),
}
# ============================================================================
# Dataset Loader Utilities
# ============================================================================
[docs]
@dataclass
class LoadedBenchmarkDataset:
    """
    Bundle holding a benchmark dataset once it has been read.

    Attributes:
        info: Registry metadata describing the dataset.
        X: Spectra, shaped (n_samples, n_wavelengths).
        y: Targets, shaped (n_samples, n_targets) or (n_samples,).
        wavelengths: Wavelength axis for the columns of ``X``.
        sample_ids: Sample identifiers, when available.
        metadata: Free-form extra information; a fresh empty dict is
            created for every instance.
    """

    info: BenchmarkDatasetInfo
    X: np.ndarray
    y: np.ndarray
    wavelengths: np.ndarray
    sample_ids: Optional[np.ndarray] = None
    metadata: Dict[str, Any] = field(default_factory=dict)
[docs]
def list_benchmark_datasets() -> List[str]:
    """
    Return the names of every dataset in the registry.

    Returns:
        A new list of registry keys, in registry order.

    Example:
        >>> datasets = list_benchmark_datasets()
        >>> print(datasets)
    """
    # Iterating a dict yields its keys, so no explicit .keys() is needed.
    return list(BENCHMARK_DATASETS)
[docs]
def get_benchmark_info(name: str) -> BenchmarkDatasetInfo:
    """
    Look up the metadata record of a benchmark dataset.

    Args:
        name: Registry key of the dataset.

    Returns:
        The BenchmarkDatasetInfo registered under ``name``.

    Raises:
        KeyError: If ``name`` is not a registry key; the message lists
            the available names.

    Example:
        >>> info = get_benchmark_info("corn")
        >>> print(info.summary())
    """
    if name in BENCHMARK_DATASETS:
        return BENCHMARK_DATASETS[name]

    available = ", ".join(BENCHMARK_DATASETS.keys())
    raise KeyError(f"Unknown benchmark dataset '{name}'. Available: {available}")
[docs]
def get_datasets_by_domain(domain: Union[str, BenchmarkDomain]) -> List[str]:
    """
    Collect the registry names of all datasets in one domain.

    Args:
        domain: Domain given as its string value or as an enum member.

    Returns:
        Names of the matching datasets, in registry order.

    Example:
        >>> pharma_datasets = get_datasets_by_domain("pharmaceutical")
        >>> print(pharma_datasets)
    """
    # Strings are normalised through the enum constructor.
    wanted = BenchmarkDomain(domain) if isinstance(domain, str) else domain

    matches = []
    for key, entry in BENCHMARK_DATASETS.items():
        if entry.domain == wanted:
            matches.append(key)
    return matches
[docs]
def load_benchmark_dataset(
    name: str,
    data_dir: Optional[Union[str, Path]] = None,
    format: str = "auto",
) -> LoadedBenchmarkDataset:
    """
    Load a benchmark dataset from disk.

    A fixed list of conventional file locations under ``data_dir`` is
    probed in order and the first existing file is loaded.

    Args:
        name: Dataset name from registry.
        data_dir: Directory containing dataset files.
        format: File format: "auto", "csv" or "mat". With "auto" the
            format is taken from the extension of the file that was found.
            With "csv" or "mat" only files carrying that extension are
            considered, so a CSV lying next to a MAT file is never handed
            to the wrong loader.

    Returns:
        LoadedBenchmarkDataset with data.

    Raises:
        FileNotFoundError: If ``data_dir`` is None or no candidate file
            exists.
        KeyError: If dataset name not in registry.
        ValueError: If the requested or detected format has no loader.

    Example:
        >>> dataset = load_benchmark_dataset("corn", data_dir="./datasets/")
        >>> print(dataset.X.shape, dataset.y.shape)

    Note:
        Dataset files must be obtained separately from their sources.
        This function provides standardized loading once files are available.
    """
    info = get_benchmark_info(name)

    if data_dir is None:
        raise FileNotFoundError(
            f"Dataset '{name}' requires data_dir parameter. "
            f"Please obtain the dataset from: {info.source_url}"
        )
    data_dir = Path(data_dir)

    loaders = {
        "csv": _load_csv_dataset,
        "mat": _load_mat_dataset,
    }

    # Conventional locations, in priority order.
    possible_files = [
        data_dir / f"{name}.csv",
        data_dir / f"{name}.mat",
        data_dir / f"{name}_spectra.csv",
        data_dir / name / "spectra.csv",
        data_dir / name / f"{name}.csv",
        data_dir / name / f"{name}.mat",
    ]

    # An explicit, supported format restricts discovery to matching files.
    # Unsupported formats keep the full list so the ValueError below is still
    # what callers see when a file exists.
    if format in loaders:
        wanted_suffix = f".{format}"
        possible_files = [p for p in possible_files if p.suffix == wanted_suffix]

    data_file = next((p for p in possible_files if p.exists()), None)
    if data_file is None:
        raise FileNotFoundError(
            f"Could not find dataset files for '{name}' in {data_dir}. "
            f"Tried: {[str(f) for f in possible_files]}"
        )

    if format == "auto":
        format = data_file.suffix.lstrip(".")

    loader = loaders.get(format)
    if loader is None:
        raise ValueError(f"Unsupported format: {format}")
    return loader(data_file, info)
def _load_csv_dataset(
    filepath: Path,
    info: BenchmarkDatasetInfo,
) -> LoadedBenchmarkDataset:
    """
    Load dataset from CSV format.

    The first row is treated as a header and discarded. In every following
    row the first ``len(info.targets)`` columns are taken as target values
    and the remaining columns as the spectrum. Empty or whitespace-only
    cells become NaN; completely blank lines are skipped.

    The wavelength axis is not read from the file: it is generated as
    evenly spaced points over ``info.wavelength_range``, one per spectral
    column.

    Args:
        filepath: Path of the CSV file.
        info: Registry metadata of the dataset being loaded.

    Returns:
        LoadedBenchmarkDataset with ``X``, ``y`` and ``wavelengths`` set.

    Raises:
        ValueError: If the file is empty, has no data rows, contains a
            non-numeric cell, has rows of differing length, or has no
            columns left for spectra after the target columns.
    """
    import csv

    rows = []
    # newline="" is what the csv module requires for correct newline
    # handling. Undecodable bytes are replaced instead of raising: they can
    # only matter in the header, which is discarded, since numeric cells are
    # plain ASCII.
    with open(filepath, "r", newline="", encoding="utf-8", errors="replace") as f:
        reader = csv.reader(f)
        if next(reader, None) is None:
            raise ValueError(f"CSV file is empty: {filepath}")
        for row in reader:
            # Blank lines come back as [] (or one whitespace-only cell).
            if not row or (len(row) == 1 and not row[0].strip()):
                continue
            try:
                rows.append(
                    [float(cell) if cell.strip() else np.nan for cell in row]
                )
            except ValueError as exc:
                raise ValueError(
                    f"Non-numeric value in {filepath} at line "
                    f"{reader.line_num}: {exc}"
                ) from exc

    if not rows:
        raise ValueError(f"CSV file contains no data rows: {filepath}")

    widths = {len(r) for r in rows}
    if len(widths) != 1:
        raise ValueError(
            f"Rows in {filepath} have inconsistent column counts: "
            f"{sorted(widths)}"
        )

    data = np.array(rows, dtype=float)

    # Assume first few columns are targets, rest are spectra
    n_targets = len(info.targets)
    if data.shape[1] <= n_targets:
        raise ValueError(
            f"{filepath} has {data.shape[1]} columns but {n_targets} target "
            f"columns are expected, leaving no spectral data"
        )
    y = data[:, :n_targets]
    X = data[:, n_targets:]

    # Generate wavelength array
    wl_start, wl_end = info.wavelength_range
    wavelengths = np.linspace(wl_start, wl_end, X.shape[1])

    return LoadedBenchmarkDataset(
        info=info,
        X=X,
        y=y,
        wavelengths=wavelengths,
    )
def _load_mat_dataset(
    filepath: Path,
    info: BenchmarkDatasetInfo,
) -> LoadedBenchmarkDataset:
    """
    Read a dataset stored as a MATLAB .mat file.

    Spectra, targets and wavelengths are each searched for under a short
    list of commonly used variable names; the first name present wins.
    Missing targets are replaced by a single column of zeros, and a missing
    wavelength axis is generated as evenly spaced points over
    ``info.wavelength_range``.

    Raises:
        ValueError: If none of the known spectra variable names is present.
    """
    from scipy.io import loadmat

    contents = loadmat(str(filepath))

    def first_present(*names):
        """Return the value of the first listed variable found, else None."""
        for candidate in names:
            if candidate in contents:
                return contents[candidate]
        return None

    X = first_present('X', 'spectra', 'Spectra', 'NIR')
    y = first_present('Y', 'y', 'targets', 'Targets', 'reference')
    wavelengths = first_present('wavelengths', 'wl', 'Wavelengths', 'nm')
    if wavelengths is not None:
        wavelengths = wavelengths.flatten()

    if X is None:
        raise ValueError(f"Could not find spectral data in {filepath}")

    if y is None:
        # No reference values stored: substitute placeholder targets.
        y = np.zeros((X.shape[0], 1))

    if wavelengths is None:
        wl_start, wl_end = info.wavelength_range
        wavelengths = np.linspace(wl_start, wl_end, X.shape[1])

    return LoadedBenchmarkDataset(
        info=info,
        X=X,
        y=y,
        wavelengths=wavelengths,
    )
# ============================================================================
# Synthetic Dataset Generation Matching Benchmark
# ============================================================================
[docs]
def get_benchmark_spectral_properties(name: str) -> Dict[str, Any]:
    """
    Get spectral properties to match when generating synthetic data.

    Args:
        name: Benchmark dataset name.

    Returns:
        Dictionary with the keys ``wavelength_start``, ``wavelength_end``,
        ``wavelength_step``, ``measurement_mode``, ``typical_components``,
        ``n_samples``, ``expected_snr`` and ``expected_peak_density``.

        ``wavelength_step`` is the spacing of ``n_wavelengths`` evenly
        spaced points that include both ends of the range, i.e.
        ``(end - start) / (n_wavelengths - 1)``. When the registry lists
        fewer than two wavelength points the full span is returned instead.

    Raises:
        KeyError: If ``name`` is not in the registry.

    Example:
        >>> props = get_benchmark_spectral_properties("corn")
        >>> print(props["wavelength_step"])
        2.0
    """
    info = get_benchmark_info(name)

    # Map domain to likely components
    domain_components = {
        BenchmarkDomain.AGRICULTURE: ["water", "protein", "starch", "cellulose", "lipid"],
        BenchmarkDomain.FOOD: ["water", "protein", "lipid", "glucose", "lactose"],
        BenchmarkDomain.PHARMACEUTICAL: ["cellulose", "starch", "paracetamol", "water"],
        BenchmarkDomain.PETROCHEMICAL: ["alkane", "aromatic", "oil"],
    }

    wl_start = info.wavelength_range[0]
    wl_end = info.wavelength_range[1]
    span = wl_end - wl_start

    # n points covering [start, end] inclusively are separated by n - 1
    # intervals; dividing by n would give e.g. 1.997 nm instead of 2 nm for
    # 700 points over 1100-2498 nm. This matches the np.linspace axis the
    # loaders in this module build.
    # NOTE(review): assumes the synthetic generator treats wavelength_end as
    # inclusive — confirm against the generator's grid construction.
    n_intervals = info.n_wavelengths - 1
    wavelength_step = span / n_intervals if n_intervals > 0 else float(span)

    return {
        "wavelength_start": wl_start,
        "wavelength_end": wl_end,
        "wavelength_step": wavelength_step,
        "measurement_mode": info.measurement_mode,
        # Domains without an entry above fall back to a generic pair.
        "typical_components": domain_components.get(info.domain, ["water", "protein"]),
        "n_samples": info.n_samples,
        "expected_snr": info.typical_snr,
        "expected_peak_density": info.typical_peak_density,
    }
[docs]
def create_synthetic_matching_benchmark(
    benchmark_name: str,
    n_samples: Optional[int] = None,
    random_state: Optional[int] = None,
) -> Tuple[np.ndarray, np.ndarray, np.ndarray]:
    """
    Generate synthetic spectra configured after a benchmark dataset.

    The wavelength grid and component list come from
    ``get_benchmark_spectral_properties`` for the chosen benchmark.

    Args:
        benchmark_name: Registry name of the benchmark to imitate.
        n_samples: How many samples to generate; the benchmark's own
            sample count is used when None.
        random_state: Seed passed to the generator for reproducibility.

    Returns:
        Tuple of (spectra, concentrations, component_spectra).

    Example:
        >>> X, C, E = create_synthetic_matching_benchmark("corn", random_state=42)
        >>> print(X.shape)
    """
    # Deferred imports keep this module free of circular imports.
    from .generator import SyntheticNIRSGenerator
    from .components import ComponentLibrary

    target = get_benchmark_spectral_properties(benchmark_name)

    components = ComponentLibrary.from_predefined(target["typical_components"])
    synth = SyntheticNIRSGenerator(
        component_library=components,
        wavelength_start=target["wavelength_start"],
        wavelength_end=target["wavelength_end"],
        wavelength_step=target["wavelength_step"],
        random_state=random_state,
    )

    sample_count = target["n_samples"] if n_samples is None else n_samples
    spectra, concentrations, component_spectra = synth.generate(n_samples=sample_count)
    return spectra, concentrations, component_spectra