Source code for nirs4all.data.synthetic.metadata
"""
Metadata generation for synthetic NIRS datasets.
This module provides tools for generating realistic sample metadata including
sample IDs, biological sample groupings, repetitions, and custom columns.
Example:
>>> from nirs4all.data.synthetic.metadata import MetadataGenerator
>>>
>>> generator = MetadataGenerator(random_state=42)
>>> metadata = generator.generate(
... n_samples=100,
... sample_id_prefix="S",
... n_groups=3,
... n_repetitions=(2, 4)
... )
>>> print(metadata.to_dict().keys())
dict_keys(['sample_id', 'bio_sample_id', 'repetition', 'group', 'group_idx'])
"""
from __future__ import annotations
from dataclasses import dataclass
from typing import Any, Callable, Dict, List, Literal, Optional, Tuple, Union
import numpy as np
[docs]
@dataclass
class MetadataGenerationResult:
    """
    Container for generated metadata.

    Equality is array-aware: two results compare equal when they are of the
    same class and every field holds the same values. (The comparison that
    ``@dataclass`` would generate applies ``==`` to the NumPy arrays and then
    raises ``ValueError`` when it tries to reduce the element-wise result to
    a single boolean.)

    Attributes:
        sample_ids: Unique sample identifiers.
        bio_sample_ids: Biological sample identifiers (before repetitions).
        repetitions: Repetition number for each sample.
        groups: Group assignments.
        group_indices: Integer group indices (for stratification).
        n_bio_samples: Number of unique biological samples.
        additional_columns: Any extra columns generated.
    """

    sample_ids: np.ndarray
    bio_sample_ids: Optional[np.ndarray] = None
    repetitions: Optional[np.ndarray] = None
    groups: Optional[np.ndarray] = None
    group_indices: Optional[np.ndarray] = None
    n_bio_samples: int = 0
    additional_columns: Optional[Dict[str, np.ndarray]] = None

    @staticmethod
    def _same_array(left: Optional[np.ndarray], right: Optional[np.ndarray]) -> bool:
        """Return True when both values are None or hold equal arrays."""
        if left is None or right is None:
            return left is right
        return bool(np.array_equal(left, right))

    @staticmethod
    def _same_columns(
        left: Optional[Dict[str, np.ndarray]],
        right: Optional[Dict[str, np.ndarray]],
    ) -> bool:
        """Return True when both mappings are None or hold equal columns."""
        if left is None or right is None:
            return left is right
        if left.keys() != right.keys():
            return False
        return all(np.array_equal(left[name], right[name]) for name in left)

    def __eq__(self, other: object) -> bool:
        """
        Compare two results field by field using array-aware equality.

        Returns:
            True if ``other`` is of the identical class and all fields match,
            False if a field differs, ``NotImplemented`` for other types.
        """
        if other.__class__ is not self.__class__:
            return NotImplemented
        return (
            self.n_bio_samples == other.n_bio_samples
            and self._same_array(self.sample_ids, other.sample_ids)
            and self._same_array(self.bio_sample_ids, other.bio_sample_ids)
            and self._same_array(self.repetitions, other.repetitions)
            and self._same_array(self.groups, other.groups)
            and self._same_array(self.group_indices, other.group_indices)
            and self._same_columns(
                self.additional_columns, other.additional_columns
            )
        )

    def to_dict(self) -> Dict[str, np.ndarray]:
        """
        Convert to dictionary format suitable for DataFrame or SpectroDataset.

        Fields that are ``None`` are left out. Additional columns are merged
        last, so an extra column that reuses one of the built-in keys
        replaces that entry.

        Returns:
            Dictionary with string keys and array values.
        """
        result: Dict[str, np.ndarray] = {"sample_id": self.sample_ids}
        optional_fields = (
            ("bio_sample_id", self.bio_sample_ids),
            ("repetition", self.repetitions),
            ("group", self.groups),
            ("group_idx", self.group_indices),
        )
        for key, values in optional_fields:
            if values is not None:
                result[key] = values
        if self.additional_columns:
            result.update(self.additional_columns)
        return result
[docs]
class MetadataGenerator:
    """
    Generate realistic metadata for synthetic NIRS datasets.

    This class creates sample identifiers, biological sample groupings,
    repetition structures, and group assignments that mimic real
    spectroscopy datasets.

    Attributes:
        rng: NumPy random generator for reproducibility.

    Args:
        random_state: Random seed for reproducibility.

    Example:
        >>> generator = MetadataGenerator(random_state=42)
        >>>
        >>> # Generate with repetitions and groups
        >>> metadata = generator.generate(
        ...     n_samples=100,
        ...     sample_id_prefix="WHEAT",
        ...     n_groups=3,
        ...     group_names=["Field_A", "Field_B", "Field_C"],
        ...     n_repetitions=(2, 4)
        ... )
        >>>
        >>> # Result: Each biological sample has 2-4 spectral measurements
        >>> print(f"Bio samples: {metadata.n_bio_samples}")
        >>> print(f"Total samples: {len(metadata.sample_ids)}")
    """

    def __init__(self, random_state: Optional[int] = None) -> None:
        """
        Initialize the metadata generator.

        Args:
            random_state: Random seed for reproducibility.
        """
        self.rng = np.random.default_rng(random_state)
        self._random_state = random_state

    def generate(
        self,
        n_samples: int,
        *,
        sample_id_prefix: str = "S",
        n_groups: Optional[int] = None,
        group_names: Optional[List[str]] = None,
        n_repetitions: Union[int, Tuple[int, int]] = 1,
        bio_sample_prefix: str = "B",
        additional_columns: Optional[Dict[str, Any]] = None,
    ) -> MetadataGenerationResult:
        """
        Generate complete metadata for a synthetic dataset.

        When repetitions are requested, biological samples are created
        first and each one is replicated to produce the final samples.
        Groups are assigned per biological sample, so all repetitions of a
        biological sample share one group.

        Args:
            n_samples: Total number of samples (spectra) to generate.
            sample_id_prefix: Prefix for sample ID strings.
            n_groups: Number of groups (None or a value <= 0 for no grouping).
            group_names: Optional list of group names. If None and
                n_groups > 0, names "Group_0", "Group_1", ... are generated.
            n_repetitions: Number of repetitions per biological sample.
                If int (Python or NumPy integer): fixed number of repetitions.
                If tuple (min, max): random number in range [min, max].
                When ``n_samples`` cannot be reached exactly with counts in
                that range, the last biological sample absorbs the
                difference and may fall outside the range.
            bio_sample_prefix: Prefix for biological sample IDs.
            additional_columns: Dictionary of additional columns to generate.
                Keys are column names, values can be:
                - Callable(n_samples, rng) -> np.ndarray
                - List (or array) of values to randomly sample from
                - Tuple (distribution, params) for numeric data

        Returns:
            MetadataGenerationResult containing all generated metadata.

        Raises:
            ValueError: If n_samples is less than 1, if the minimum number
                of repetitions is less than 1, if the maximum is smaller
                than the minimum, if group_names does not have n_groups
                entries, or if an additional column specification is invalid.

        Example:
            >>> generator = MetadataGenerator(random_state=42)
            >>>
            >>> # Simple case: 100 samples, no repetitions
            >>> result = generator.generate(100)
            >>> assert len(result.sample_ids) == 100
            >>>
            >>> # With repetitions: 50 bio samples, each measured 2 times
            >>> result = generator.generate(100, n_repetitions=2)
            >>> assert result.n_bio_samples == 50
            >>>
            >>> # Variable repetitions
            >>> result = generator.generate(100, n_repetitions=(1, 3))
        """
        if n_samples < 1:
            raise ValueError(f"n_samples must be >= 1, got {n_samples}")

        # Parse repetition config; NumPy integer scalars count as a fixed
        # repetition number just like built-in ints.
        if isinstance(n_repetitions, (int, np.integer)):
            min_reps = max_reps = n_repetitions
        else:
            min_reps, max_reps = n_repetitions
        if min_reps < 1:
            raise ValueError(f"Minimum repetitions must be >= 1, got {min_reps}")
        if max_reps < min_reps:
            # Fail here with a clear message instead of letting the random
            # generator reject the inverted range later on.
            raise ValueError(
                f"Maximum repetitions ({max_reps}) must be >= minimum "
                f"repetitions ({min_reps})"
            )

        if min_reps == max_reps == 1:
            # No repetitions: every sample is its own biological sample.
            n_bio_samples = n_samples
            bio_sample_ids = None
            repetitions = None
            sample_bio_mapping = np.arange(n_samples)
        else:
            bio_sample_ids, repetitions, sample_bio_mapping, n_bio_samples = (
                self._generate_repetition_structure(
                    n_samples, min_reps, max_reps, bio_sample_prefix
                )
            )

        sample_ids = self._generate_sample_ids(n_samples, sample_id_prefix)

        groups = None
        group_indices = None
        if n_groups is not None and n_groups > 0:
            groups, group_indices = self._generate_groups(
                n_samples=n_samples,
                n_bio_samples=n_bio_samples,
                sample_bio_mapping=sample_bio_mapping,
                n_groups=n_groups,
                group_names=group_names,
            )

        extra_columns = None
        if additional_columns:
            extra_columns = self._generate_additional_columns(
                n_samples, additional_columns
            )

        return MetadataGenerationResult(
            sample_ids=sample_ids,
            bio_sample_ids=bio_sample_ids,
            repetitions=repetitions,
            groups=groups,
            group_indices=group_indices,
            n_bio_samples=n_bio_samples,
            additional_columns=extra_columns,
        )

    def _generate_sample_ids(self, n_samples: int, prefix: str) -> np.ndarray:
        """Generate unique, zero-padded sample ID strings (at least 4 digits)."""
        n_digits = max(4, len(str(n_samples)))
        return np.array([f"{prefix}{i:0{n_digits}d}" for i in range(n_samples)])

    def _generate_repetition_structure(
        self,
        n_samples: int,
        min_reps: int,
        max_reps: int,
        bio_sample_prefix: str,
    ) -> Tuple[np.ndarray, np.ndarray, np.ndarray, int]:
        """
        Generate biological sample structure with repetitions.

        An initial set of per-bio-sample repetition counts is drawn, then
        adjusted (adding, shrinking or dropping trailing bio samples) until
        the counts sum to ``n_samples``. If the bounded adjustment cannot
        reach the target with in-range counts, the last bio sample absorbs
        the remaining difference.

        Returns:
            Tuple of (bio_sample_ids, repetitions, sample_bio_mapping,
            n_bio_samples). The three arrays have ``n_samples`` entries.
        """
        fixed = min_reps == max_reps
        avg_reps = min_reps if fixed else (min_reps + max_reps) / 2

        # Estimate number of biological samples and draw their counts.
        n_bio_estimate = int(np.ceil(n_samples / avg_reps))
        if fixed:
            initial = np.full(n_bio_estimate, min_reps)
        else:
            initial = self.rng.integers(
                min_reps, max_reps + 1, size=n_bio_estimate
            )

        # Work on a plain list with a running total so that each adjustment
        # step is O(1); the array is rebuilt once at the end with the same
        # integer dtype the initial draw produced.
        count_dtype = initial.dtype
        counts: List[int] = initial.tolist()
        total = sum(counts)

        max_iterations = n_samples * 2  # Safety limit
        iteration = 0
        while total != n_samples and iteration < max_iterations:
            iteration += 1
            diff = n_samples - total
            if diff > 0:
                # Need more samples: add one bio sample.
                if min_reps <= diff <= max_reps:
                    new_reps = diff  # exact fit, loop ends after this
                elif diff <= max_reps:
                    new_reps = min_reps  # overshoots; adjusted afterwards
                else:
                    new_reps = int(self.rng.integers(min_reps, max_reps + 1))
                counts.append(new_reps)
                total += new_reps
            elif not counts:
                break  # Safety: nothing left to shrink
            elif counts[-1] > min_reps:
                # Too many samples: shrink the last bio sample.
                reduction = min(-diff, counts[-1] - min_reps)
                counts[-1] -= reduction
                total -= reduction
            else:
                # Last bio sample is already at the minimum: drop it.
                total -= counts.pop()

        # Force an exact match. Trailing bio samples that would end up with
        # fewer than one repetition are dropped; if none remain, a single
        # bio sample carries all requested samples.
        shortfall = n_samples - total
        while counts and counts[-1] + shortfall < 1:
            shortfall += counts.pop()
        if counts:
            counts[-1] += shortfall
        else:
            counts.append(shortfall)

        rep_counts = np.asarray(counts, dtype=count_dtype)
        n_bio_samples = len(rep_counts)

        # Generate bio sample IDs
        n_digits = max(4, len(str(n_bio_samples)))
        bio_ids_unique = np.array(
            [f"{bio_sample_prefix}{i:0{n_digits}d}" for i in range(n_bio_samples)]
        )

        # Expand to all samples
        bio_sample_ids = np.repeat(bio_ids_unique, rep_counts)
        sample_bio_mapping = np.repeat(np.arange(n_bio_samples), rep_counts)

        # Repetition numbers restart at 1 for every bio sample.
        repetitions = np.concatenate(
            [np.arange(1, count + 1) for count in rep_counts]
        )
        return bio_sample_ids, repetitions, sample_bio_mapping, n_bio_samples

    def _generate_groups(
        self,
        n_samples: int,
        n_bio_samples: int,
        sample_bio_mapping: np.ndarray,
        n_groups: int,
        group_names: Optional[List[str]],
    ) -> Tuple[np.ndarray, np.ndarray]:
        """
        Generate group assignments.

        Groups are assigned at the biological sample level to ensure all
        repetitions of a sample belong to the same group. Group sizes (in
        bio samples) differ by at most one.

        Returns:
            Tuple of (group names per sample, group indices per sample).

        Raises:
            ValueError: If group_names is given but its length differs
                from n_groups.
        """
        if group_names is None:
            group_names = [f"Group_{i}" for i in range(n_groups)]
        elif len(group_names) != n_groups:
            raise ValueError(
                f"group_names length ({len(group_names)}) must match "
                f"n_groups ({n_groups})"
            )

        # Balanced assignment: the first `remainder` groups get one extra
        # bio sample. Laid out as contiguous runs, then shuffled.
        samples_per_group, remainder = divmod(n_bio_samples, n_groups)
        per_group = np.full(n_groups, samples_per_group)
        per_group[:remainder] += 1
        bio_group_indices = np.repeat(
            np.arange(n_groups, dtype=np.int32), per_group
        )
        self.rng.shuffle(bio_group_indices)

        # Expand to all samples
        group_indices = bio_group_indices[sample_bio_mapping]
        groups = np.array([group_names[i] for i in group_indices])
        return groups, group_indices

    def _generate_additional_columns(
        self,
        n_samples: int,
        columns: Dict[str, Any],
    ) -> Dict[str, np.ndarray]:
        """
        Generate additional metadata columns based on specifications.

        Raises:
            ValueError: If a specification is not a callable, a list/array,
                or a 2-tuple of (distribution name, params).
        """
        result: Dict[str, np.ndarray] = {}
        for col_name, spec in columns.items():
            if callable(spec):
                # User-provided generator function; its return value is
                # stored as-is.
                result[col_name] = spec(n_samples, self.rng)
            elif isinstance(spec, (list, np.ndarray)):
                # Random sampling from provided values
                result[col_name] = self.rng.choice(spec, size=n_samples)
            elif isinstance(spec, tuple) and len(spec) == 2:
                dist_name, params = spec
                result[col_name] = self._generate_from_distribution(
                    n_samples, dist_name, params
                )
            else:
                raise ValueError(
                    f"Invalid specification for column '{col_name}': {spec}. "
                    f"Expected callable, list, or (distribution, params) tuple."
                )
        return result

    def _generate_from_distribution(
        self,
        n_samples: int,
        dist_name: str,
        params: Dict[str, Any],
    ) -> np.ndarray:
        """
        Generate values from a named distribution.

        Supported names and their parameters (with defaults):
            - "uniform": low=0, high=1
            - "normal": mean=0, std=1
            - "integers": low=0, high=10 (high is inclusive)
            - "choice": values=[0, 1], probs=None

        Raises:
            ValueError: If dist_name is not one of the supported names.
        """
        # Treat a missing parameter mapping as "use all defaults".
        params = {} if params is None else params

        if dist_name == "uniform":
            low = params.get("low", 0)
            high = params.get("high", 1)
            return self.rng.uniform(low, high, size=n_samples)
        if dist_name == "normal":
            mean = params.get("mean", 0)
            std = params.get("std", 1)
            return self.rng.normal(mean, std, size=n_samples)
        if dist_name == "integers":
            low = params.get("low", 0)
            high = params.get("high", 10)
            # +1 because the generator's upper bound is exclusive.
            return self.rng.integers(low, high + 1, size=n_samples)
        if dist_name == "choice":
            values = params.get("values", [0, 1])
            probs = params.get("probs", None)
            return self.rng.choice(values, size=n_samples, p=probs)
        raise ValueError(f"Unknown distribution: '{dist_name}'")
[docs]
def generate_sample_metadata(
    n_samples: int,
    *,
    random_state: Optional[int] = None,
    sample_id_prefix: str = "S",
    n_groups: Optional[int] = None,
    group_names: Optional[List[str]] = None,
    n_repetitions: Union[int, Tuple[int, int]] = 1,
) -> Dict[str, np.ndarray]:
    """
    Build sample metadata in one call and return it as a plain dictionary.

    A shortcut for the common case: it creates a MetadataGenerator seeded
    with ``random_state``, runs ``generate`` with the given options and
    converts the result with ``to_dict()``.

    Args:
        n_samples: Total number of samples to generate.
        random_state: Random seed for reproducibility.
        sample_id_prefix: Prefix for sample ID strings.
        n_groups: Number of groups (None for no grouping).
        group_names: Optional list of group names.
        n_repetitions: Repetitions per biological sample.

    Returns:
        Dictionary with metadata arrays.

    Example:
        >>> metadata = generate_sample_metadata(
        ...     n_samples=100,
        ...     random_state=42,
        ...     n_groups=3,
        ...     n_repetitions=(2, 4)
        ... )
        >>> print(metadata.keys())
    """
    options = {
        "sample_id_prefix": sample_id_prefix,
        "n_groups": n_groups,
        "group_names": group_names,
        "n_repetitions": n_repetitions,
    }
    metadata = MetadataGenerator(random_state=random_state).generate(
        n_samples=n_samples, **options
    )
    return metadata.to_dict()