"""
Dataset export utilities for synthetic NIRS data.
This module provides tools for exporting synthetic datasets to various
file formats and folder structures compatible with nirs4all loaders.
Key Features:
- Export to CSV files (single or multi-file format)
- Export to nirs4all standard folder structure (Xcal, Ycal, Xval, Yval)
- Export with metadata (sample IDs, groups, etc.)
- Generate CSV variations for loader testing
Example:
>>> from nirs4all.data.synthetic import SyntheticDatasetBuilder, DatasetExporter
>>>
>>> builder = SyntheticDatasetBuilder(n_samples=1000, random_state=42)
>>> X, y = builder.build_arrays()
>>>
>>> exporter = DatasetExporter()
>>> path = exporter.to_folder(
... "output/synthetic_data",
... X, y,
... train_ratio=0.8,
... wavelengths=builder.state._wavelengths
... )
"""
from __future__ import annotations
from dataclasses import dataclass, field
from pathlib import Path
from typing import Any, Dict, List, Literal, Optional, Tuple, Union
import numpy as np
# pandas is an optional dependency: the CSV exporters check HAS_PANDAS and
# raise ImportError at call time instead of failing when this module loads.
try:
    import pandas as pd

    HAS_PANDAS = True
except ImportError:
    HAS_PANDAS = False
@dataclass
class ExportConfig:
    """
    Configuration for dataset export.

    Attributes:
        format: Export format ('standard', 'single', 'fragmented').
            - 'standard': Separate Xcal, Ycal, Xval, Yval files.
            - 'single': All data in one file with partition column.
            - 'fragmented': Multiple small files (for loader testing).
        separator: CSV delimiter character.
        float_precision: Decimal precision for floating point values.
        include_headers: Whether to include column headers in CSV.
        include_index: Whether to include row index.
        compression: Optional compression ('gzip', 'zip', None).
        file_extension: File extension to use.

    Raises:
        ValueError: If ``float_precision`` is negative.
    """

    format: Literal["standard", "single", "fragmented"] = "standard"
    separator: str = ";"
    float_precision: int = 6
    include_headers: bool = True
    include_index: bool = False
    compression: Optional[Literal["gzip", "zip"]] = None
    file_extension: str = ".csv"

    def __post_init__(self) -> None:
        """Reject a precision that cannot form a valid ``%.Nf`` format string."""
        # The precision is interpolated into a printf-style float format; a
        # negative value would only fail later, at write time, with an
        # unrelated-looking formatting error.
        if self.float_precision < 0:
            raise ValueError(
                f"float_precision must be >= 0, got {self.float_precision}"
            )
class DatasetExporter:
    """
    Export synthetic datasets to various file formats.

    This class provides methods for exporting synthetic NIRS datasets
    to files and folders compatible with nirs4all's data loaders.

    Attributes:
        config: Export configuration settings.

    Args:
        config: Optional ExportConfig. Uses defaults if None.

    Example:
        >>> exporter = DatasetExporter()
        >>>
        >>> # Export to standard folder structure
        >>> path = exporter.to_folder(
        ...     "output/data",
        ...     X, y,
        ...     train_ratio=0.8,
        ...     wavelengths=wavelengths
        ... )
        >>>
        >>> # Export to single CSV
        >>> path = exporter.to_csv(
        ...     "output/all_data.csv",
        ...     X, y,
        ...     wavelengths=wavelengths
        ... )
    """

    def __init__(self, config: Optional["ExportConfig"] = None) -> None:
        """
        Initialize the exporter.

        Args:
            config: Export configuration. Uses defaults if None.
        """
        self.config = config if config is not None else ExportConfig()

    # ------------------------------------------------------------------
    # Shared helpers
    # ------------------------------------------------------------------
    @staticmethod
    def _validate_arrays(X: np.ndarray, y: np.ndarray) -> Tuple[np.ndarray, np.ndarray]:
        """Coerce X and y to arrays and check that their sample counts agree."""
        X = np.asarray(X)
        y = np.asarray(y)
        if X.ndim != 2:
            raise ValueError(
                f"X must be 2D (n_samples, n_features). Got shape {X.shape}"
            )
        if X.shape[0] != y.shape[0]:
            raise ValueError(
                f"X and y must have same number of samples. "
                f"Got X: {X.shape[0]}, y: {y.shape[0]}"
            )
        return X, y

    @staticmethod
    def _split_indices(
        n_samples: int,
        train_ratio: float,
        random_state: Optional[int],
    ) -> Tuple[np.ndarray, np.ndarray]:
        """Return shuffled (train, test) row indices for a random split."""
        # A ratio outside [0, 1] would make the slice bounds below negative or
        # oversized and silently produce a split unrelated to the request.
        if not 0.0 <= train_ratio <= 1.0:
            raise ValueError(
                f"train_ratio must be between 0 and 1, got {train_ratio}"
            )
        n_train = int(n_samples * train_ratio)
        rng = np.random.default_rng(random_state)
        indices = rng.permutation(n_samples)
        return indices[:n_train], indices[n_train:]

    @staticmethod
    def _feature_columns(
        X: np.ndarray,
        wavelengths: Optional[np.ndarray],
    ) -> List[str]:
        """Build one header label per feature column of X."""
        n_features = X.shape[1]
        if wavelengths is None:
            return [f"feature_{i}" for i in range(n_features)]

        wavelengths = np.asarray(wavelengths).ravel()
        if wavelengths.shape[0] != n_features:
            raise ValueError(
                f"wavelengths must have one value per feature. "
                f"Got {wavelengths.shape[0]} wavelengths for {n_features} features"
            )

        columns = [str(int(wl)) for wl in wavelengths]
        if len(set(columns)) != len(columns):
            # Integer truncation merged distinct wavelengths (e.g. 1000.0 and
            # 1000.5). Duplicate labels would overwrite each other in the
            # dict-built frames, so keep the fractional part instead.
            columns = [str(float(wl)) for wl in wavelengths]
        return columns

    @staticmethod
    def _as_2d_targets(y: np.ndarray) -> Tuple[np.ndarray, List[str]]:
        """Return y as a 2D array together with its target column names."""
        y = np.asarray(y)
        if y.ndim == 1:
            y = y.reshape(-1, 1)
        n_targets = y.shape[1]
        if n_targets == 1:
            return y, ["target"]
        return y, [f"target_{i}" for i in range(n_targets)]

    def _write_csv(
        self,
        frame: "pd.DataFrame",
        file_path: Path,
        *,
        use_float_format: bool = True,
    ) -> None:
        """Write *frame* to *file_path* using the configured CSV options."""
        options: Dict[str, Any] = {
            "sep": self.config.separator,
            "index": self.config.include_index,
            "header": self.config.include_headers,
            # "infer" is pandas' own default, so an unset compression keeps
            # extension-based inference (e.g. a ".csv.gz" file_extension).
            "compression": self.config.compression or "infer",
        }
        if use_float_format:
            options["float_format"] = f"%.{self.config.float_precision}f"
        frame.to_csv(file_path, **options)

    # ------------------------------------------------------------------
    # Folder export
    # ------------------------------------------------------------------
    def to_folder(
        self,
        path: Union[str, Path],
        X: np.ndarray,
        y: np.ndarray,
        *,
        train_ratio: float = 0.8,
        wavelengths: Optional[np.ndarray] = None,
        metadata: Optional[Dict[str, np.ndarray]] = None,
        random_state: Optional[int] = None,
        format: Optional[Literal["standard", "single", "fragmented"]] = None,
    ) -> Path:
        """
        Export dataset to a folder structure.

        Creates a folder with CSV files compatible with nirs4all's
        DatasetConfigs loader.

        Args:
            path: Output folder path.
            X: Feature matrix (n_samples, n_features).
            y: Target values (n_samples,) or (n_samples, n_targets).
            train_ratio: Proportion for training set, between 0 and 1.
            wavelengths: Optional wavelength values for column headers.
            metadata: Optional dict of metadata arrays (same length as X).
                Written to separate metadata files by the 'standard' format,
                added as columns by the 'single' format, and not written by
                the 'fragmented' format.
            random_state: Random seed for train/test split.
            format: Override config format for this export.

        Returns:
            Path to created folder.

        Raises:
            ValueError: If X and y have incompatible shapes, if train_ratio
                is outside [0, 1], if wavelengths does not match the number
                of features, or if the format is unknown.
            ImportError: If pandas is not available.

        Example:
            >>> exporter.to_folder(
            ...     "data/synthetic",
            ...     X, y,
            ...     train_ratio=0.8,
            ...     wavelengths=np.arange(1000, 2500, 2)
            ... )
        """
        if not HAS_PANDAS:
            raise ImportError("pandas is required for CSV export")

        X, y = self._validate_arrays(X, y)

        export_format = format or self.config.format
        writers = {
            "standard": self._export_standard,
            "single": self._export_single,
            "fragmented": self._export_fragmented,
        }
        if export_format not in writers:
            raise ValueError(f"Unknown format: {export_format}")

        # Create the folder only once the cheap argument checks have passed,
        # so a rejected call does not leave an empty directory behind.
        path = Path(path)
        path.mkdir(parents=True, exist_ok=True)
        return writers[export_format](
            path, X, y, train_ratio, wavelengths, metadata, random_state
        )

    def _export_standard(
        self,
        path: Path,
        X: np.ndarray,
        y: np.ndarray,
        train_ratio: float,
        wavelengths: Optional[np.ndarray],
        metadata: Optional[Dict[str, np.ndarray]],
        random_state: Optional[int],
    ) -> Path:
        """Export to standard Xcal/Ycal/Xval/Yval structure."""
        train_idx, test_idx = self._split_indices(X.shape[0], train_ratio, random_state)
        columns = self._feature_columns(X, wavelengths)
        y, y_columns = self._as_2d_targets(y)
        ext = self.config.file_extension

        # Training data
        self._write_csv(pd.DataFrame(X[train_idx], columns=columns), path / f"Xcal{ext}")
        self._write_csv(pd.DataFrame(y[train_idx], columns=y_columns), path / f"Ycal{ext}")

        # Test data (skipped entirely when the split leaves no test rows)
        if len(test_idx) > 0:
            self._write_csv(pd.DataFrame(X[test_idx], columns=columns), path / f"Xval{ext}")
            self._write_csv(pd.DataFrame(y[test_idx], columns=y_columns), path / f"Yval{ext}")

        if metadata:
            self._export_metadata(path, metadata, train_idx, test_idx)
        return path

    def _export_single(
        self,
        path: Path,
        X: np.ndarray,
        y: np.ndarray,
        train_ratio: float,
        wavelengths: Optional[np.ndarray],
        metadata: Optional[Dict[str, np.ndarray]],
        random_state: Optional[int],
    ) -> Path:
        """Export all data to a single CSV file with partition column."""
        n_samples = X.shape[0]
        _, test_idx = self._split_indices(n_samples, train_ratio, random_state)
        feature_columns = self._feature_columns(X, wavelengths)
        y, y_columns = self._as_2d_targets(y)

        # Rows keep their input order; only the partition label is randomized.
        partition = np.full(n_samples, "train", dtype="<U5")
        partition[test_idx] = "test"

        data: Dict[str, Any] = {"partition": partition}

        # Sample IDs go right after the partition column when provided.
        if metadata and "sample_id" in metadata:
            data["sample_id"] = metadata["sample_id"]

        for i, col in enumerate(feature_columns):
            data[col] = X[:, i]
        for i, col in enumerate(y_columns):
            data[col] = y[:, i]

        # Remaining metadata columns are appended after the targets.
        if metadata:
            for key, values in metadata.items():
                if key != "sample_id":
                    data[key] = values

        self._write_csv(pd.DataFrame(data), path / f"data{self.config.file_extension}")
        return path

    def _export_fragmented(
        self,
        path: Path,
        X: np.ndarray,
        y: np.ndarray,
        train_ratio: float,
        wavelengths: Optional[np.ndarray],
        metadata: Optional[Dict[str, np.ndarray]],
        random_state: Optional[int],
    ) -> Path:
        """Export to multiple small files (for loader testing).

        Training rows are written as ``train/X_part{i}`` / ``train/Y_part{i}``
        pairs and test rows as a single ``test/X`` / ``test/Y`` pair. The
        ``metadata`` argument is accepted for signature parity with the other
        export formats but is not written.
        """
        train_idx, test_idx = self._split_indices(X.shape[0], train_ratio, random_state)
        columns = self._feature_columns(X, wavelengths)
        y, y_columns = self._as_2d_targets(y)
        ext = self.config.file_extension

        train_path = path / "train"
        train_path.mkdir(parents=True, exist_ok=True)

        # Aim for about five chunks, but never fewer than 10 rows per chunk,
        # so small training sets end up with fewer (possibly one) chunk.
        chunk_size = max(10, len(train_idx) // 5)
        for part, start in enumerate(range(0, len(train_idx), chunk_size)):
            chunk_idx = train_idx[start:start + chunk_size]
            self._write_csv(
                pd.DataFrame(X[chunk_idx], columns=columns),
                train_path / f"X_part{part}{ext}",
            )
            self._write_csv(
                pd.DataFrame(y[chunk_idx], columns=y_columns),
                train_path / f"Y_part{part}{ext}",
            )

        if len(test_idx) > 0:
            test_path = path / "test"
            test_path.mkdir(parents=True, exist_ok=True)
            self._write_csv(pd.DataFrame(X[test_idx], columns=columns), test_path / f"X{ext}")
            self._write_csv(pd.DataFrame(y[test_idx], columns=y_columns), test_path / f"Y{ext}")
        return path

    def _export_metadata(
        self,
        path: Path,
        metadata: Dict[str, np.ndarray],
        train_idx: np.ndarray,
        test_idx: np.ndarray,
    ) -> None:
        """Export metadata to separate CSV files."""
        meta_df = pd.DataFrame(metadata)
        ext = self.config.file_extension

        # Metadata is written without the float format applied to spectra.
        self._write_csv(
            meta_df.iloc[train_idx],
            path / f"metadata_cal{ext}",
            use_float_format=False,
        )
        if len(test_idx) > 0:
            self._write_csv(
                meta_df.iloc[test_idx],
                path / f"metadata_val{ext}",
                use_float_format=False,
            )

    # ------------------------------------------------------------------
    # Single-file exports
    # ------------------------------------------------------------------
    def to_csv(
        self,
        path: Union[str, Path],
        X: np.ndarray,
        y: np.ndarray,
        *,
        wavelengths: Optional[np.ndarray] = None,
        metadata: Optional[Dict[str, np.ndarray]] = None,
        include_targets: bool = True,
    ) -> Path:
        """
        Export dataset to a single CSV file.

        Creates a CSV file with features (and optionally targets) combined.
        Columns are ordered: metadata, features, targets.

        Args:
            path: Output file path.
            X: Feature matrix (n_samples, n_features).
            y: Target values (n_samples,) or (n_samples, n_targets).
            wavelengths: Optional wavelength values for column headers.
            metadata: Optional dict of metadata arrays.
            include_targets: Whether to include target column(s).

        Returns:
            Path to created file.

        Raises:
            ValueError: If X and y have incompatible shapes or wavelengths
                does not match the number of features.
            ImportError: If pandas is not available.

        Example:
            >>> exporter.to_csv("data.csv", X, y, wavelengths=wavelengths)
        """
        if not HAS_PANDAS:
            raise ImportError("pandas is required for CSV export")

        X, y = self._validate_arrays(X, y)
        feature_columns = self._feature_columns(X, wavelengths)

        data: Dict[str, Any] = {}
        # Metadata first (sample IDs, etc.)
        if metadata:
            data.update(metadata)
        for i, col in enumerate(feature_columns):
            data[col] = X[:, i]
        if include_targets:
            y_2d, y_columns = self._as_2d_targets(y)
            for i, col in enumerate(y_columns):
                data[col] = y_2d[:, i]

        path = Path(path)
        path.parent.mkdir(parents=True, exist_ok=True)
        self._write_csv(pd.DataFrame(data), path)
        return path

    def to_numpy(
        self,
        path: Union[str, Path],
        X: np.ndarray,
        y: np.ndarray,
        *,
        wavelengths: Optional[np.ndarray] = None,
        compressed: bool = False,
    ) -> Path:
        """
        Export dataset to a numpy .npz archive.

        The archive holds the arrays under the keys ``X``, ``y`` and, when
        given, ``wavelengths``.

        Args:
            path: Output file path (without extension). Any suffix already
                present is replaced by ``.npz``.
            X: Feature matrix (n_samples, n_features).
            y: Target values.
            wavelengths: Optional wavelength values.
            compressed: Whether to compress the archive
                (``np.savez_compressed`` instead of ``np.savez``).

        Returns:
            Path to created file.

        Example:
            >>> exporter.to_numpy("data", X, y, compressed=True)
        """
        path = Path(path)
        path.parent.mkdir(parents=True, exist_ok=True)

        arrays = {"X": X, "y": y}
        if wavelengths is not None:
            arrays["wavelengths"] = wavelengths

        # Both savers produce an .npz archive; only the compression differs.
        save_path = path.with_suffix(".npz")
        save = np.savez_compressed if compressed else np.savez
        save(save_path, **arrays)
        return save_path
class CSVVariationGenerator:
    """
    Generate CSV files with various format variations for loader testing.

    This class creates folder exports that differ in delimiter, header and
    row-index options, file layout and float precision, to test the
    robustness of CSV loaders.

    Attributes:
        base_exporter: DatasetExporter created with the default configuration.
            The variation methods build their own exporter per call.

    Example:
        >>> generator = CSVVariationGenerator()
        >>>
        >>> # Generate all variations
        >>> paths = generator.generate_all_variations(
        ...     "test_data",
        ...     X, y,
        ...     wavelengths=wavelengths
        ... )
        >>>
        >>> # Generate specific variation
        >>> path = generator.with_semicolon_delimiter(
        ...     "data_semicolon",
        ...     X, y
        ... )
    """

    def __init__(self) -> None:
        """Initialize the variation generator."""
        self.base_exporter = DatasetExporter()

    @staticmethod
    def _export_variation(
        config: "ExportConfig",
        path: Union[str, Path],
        X: np.ndarray,
        y: np.ndarray,
        *,
        wavelengths: Optional[np.ndarray],
        train_ratio: float,
        random_state: Optional[int],
        format: Optional[Literal["standard", "single", "fragmented"]] = None,
    ) -> Path:
        """Run a folder export with a dedicated exporter built from *config*."""
        return DatasetExporter(config).to_folder(
            path, X, y,
            wavelengths=wavelengths,
            train_ratio=train_ratio,
            random_state=random_state,
            format=format,
        )

    def generate_all_variations(
        self,
        base_path: Union[str, Path],
        X: np.ndarray,
        y: np.ndarray,
        *,
        wavelengths: Optional[np.ndarray] = None,
        train_ratio: float = 0.8,
        random_state: Optional[int] = None,
    ) -> Dict[str, Path]:
        """
        Generate CSV files with all format variations.

        Creates multiple versions of the dataset with different CSV
        format options for comprehensive loader testing. Each variation
        is written to its own sub-folder of ``base_path``.

        Args:
            base_path: Base output folder path.
            X: Feature matrix.
            y: Target values.
            wavelengths: Optional wavelength values.
            train_ratio: Train/test split ratio.
            random_state: Random seed.

        Returns:
            Dictionary mapping variation name to created path.

        Example:
            >>> paths = generator.generate_all_variations(
            ...     "test_variations",
            ...     X, y,
            ...     random_state=42
            ... )
            >>> print(paths.keys())
        """
        base_path = Path(base_path)
        base_path.mkdir(parents=True, exist_ok=True)

        split = {"train_ratio": train_ratio, "random_state": random_state}
        common = {"wavelengths": wavelengths, **split}
        paths: Dict[str, Path] = {}

        # Delimiter variations
        paths["standard_semicolon"] = self.with_semicolon_delimiter(
            base_path / "standard_semicolon", X, y, **common
        )
        paths["comma_separated"] = self.with_comma_delimiter(
            base_path / "comma_separated", X, y, **common
        )
        paths["tab_separated"] = self.with_tab_delimiter(
            base_path / "tab_separated", X, y, **common
        )

        # Header / index variations (wavelengths are not forwarded to the
        # header-less export)
        paths["no_headers"] = self.without_headers(
            base_path / "no_headers", X, y, **split
        )
        paths["with_index"] = self.with_row_index(
            base_path / "with_index", X, y, **common
        )

        # Layout variations
        paths["single_file"] = self.as_single_file(
            base_path / "single_file", X, y, **common
        )
        paths["fragmented"] = self.as_fragmented(
            base_path / "fragmented", X, y, **common
        )

        # Precision variations
        paths["low_precision"] = self.with_precision(
            base_path / "low_precision", X, y, precision=2, **common
        )
        paths["high_precision"] = self.with_precision(
            base_path / "high_precision", X, y, precision=10, **common
        )
        return paths

    def with_semicolon_delimiter(
        self,
        path: Union[str, Path],
        X: np.ndarray,
        y: np.ndarray,
        *,
        wavelengths: Optional[np.ndarray] = None,
        train_ratio: float = 0.8,
        random_state: Optional[int] = None,
    ) -> Path:
        """Create CSV with semicolon delimiter (nirs4all default)."""
        return self._export_variation(
            ExportConfig(separator=";"), path, X, y,
            wavelengths=wavelengths,
            train_ratio=train_ratio,
            random_state=random_state,
        )

    def with_comma_delimiter(
        self,
        path: Union[str, Path],
        X: np.ndarray,
        y: np.ndarray,
        *,
        wavelengths: Optional[np.ndarray] = None,
        train_ratio: float = 0.8,
        random_state: Optional[int] = None,
    ) -> Path:
        """Create CSV with comma delimiter."""
        return self._export_variation(
            ExportConfig(separator=","), path, X, y,
            wavelengths=wavelengths,
            train_ratio=train_ratio,
            random_state=random_state,
        )

    def with_tab_delimiter(
        self,
        path: Union[str, Path],
        X: np.ndarray,
        y: np.ndarray,
        *,
        wavelengths: Optional[np.ndarray] = None,
        train_ratio: float = 0.8,
        random_state: Optional[int] = None,
    ) -> Path:
        """Create tab-delimited files with a ``.tsv`` extension."""
        return self._export_variation(
            ExportConfig(separator="\t", file_extension=".tsv"), path, X, y,
            wavelengths=wavelengths,
            train_ratio=train_ratio,
            random_state=random_state,
        )

    def without_headers(
        self,
        path: Union[str, Path],
        X: np.ndarray,
        y: np.ndarray,
        *,
        wavelengths: Optional[np.ndarray] = None,
        train_ratio: float = 0.8,
        random_state: Optional[int] = None,
    ) -> Path:
        """Request a header-less export (``include_headers=False`` config).

        ``wavelengths`` is accepted for signature parity with the other
        variations and forwarded unchanged to the exporter.
        """
        return self._export_variation(
            ExportConfig(include_headers=False), path, X, y,
            wavelengths=wavelengths,
            train_ratio=train_ratio,
            random_state=random_state,
        )

    def with_row_index(
        self,
        path: Union[str, Path],
        X: np.ndarray,
        y: np.ndarray,
        *,
        wavelengths: Optional[np.ndarray] = None,
        train_ratio: float = 0.8,
        random_state: Optional[int] = None,
    ) -> Path:
        """Create CSV with row index column."""
        return self._export_variation(
            ExportConfig(include_index=True), path, X, y,
            wavelengths=wavelengths,
            train_ratio=train_ratio,
            random_state=random_state,
        )

    def as_single_file(
        self,
        path: Union[str, Path],
        X: np.ndarray,
        y: np.ndarray,
        *,
        wavelengths: Optional[np.ndarray] = None,
        train_ratio: float = 0.8,
        random_state: Optional[int] = None,
    ) -> Path:
        """Create single CSV file with all data and partition column."""
        return self._export_variation(
            ExportConfig(format="single"), path, X, y,
            wavelengths=wavelengths,
            train_ratio=train_ratio,
            random_state=random_state,
            format="single",
        )

    def as_fragmented(
        self,
        path: Union[str, Path],
        X: np.ndarray,
        y: np.ndarray,
        *,
        wavelengths: Optional[np.ndarray] = None,
        train_ratio: float = 0.8,
        random_state: Optional[int] = None,
    ) -> Path:
        """Create fragmented dataset with multiple small files."""
        return self._export_variation(
            ExportConfig(), path, X, y,
            wavelengths=wavelengths,
            train_ratio=train_ratio,
            random_state=random_state,
            format="fragmented",
        )

    def with_precision(
        self,
        path: Union[str, Path],
        X: np.ndarray,
        y: np.ndarray,
        *,
        wavelengths: Optional[np.ndarray] = None,
        train_ratio: float = 0.8,
        random_state: Optional[int] = None,
        precision: int = 6,
    ) -> Path:
        """Create CSV with specified floating point precision."""
        return self._export_variation(
            ExportConfig(float_precision=precision), path, X, y,
            wavelengths=wavelengths,
            train_ratio=train_ratio,
            random_state=random_state,
        )
def export_to_folder(
    path: Union[str, Path],
    X: np.ndarray,
    y: np.ndarray,
    *,
    train_ratio: float = 0.8,
    wavelengths: Optional[np.ndarray] = None,
    format: Literal["standard", "single", "fragmented"] = "standard",
    random_state: Optional[int] = None,
    metadata: Optional[Dict[str, np.ndarray]] = None,
) -> Path:
    """
    Quick function to export synthetic data to folder.

    Convenience function for simple export use cases. Builds a
    DatasetExporter with the default configuration and forwards every
    argument to its ``to_folder`` method.

    Args:
        path: Output folder path.
        X: Feature matrix.
        y: Target values.
        train_ratio: Train/test split ratio.
        wavelengths: Optional wavelength values.
        format: Export format.
        random_state: Random seed.
        metadata: Optional dict of metadata arrays, forwarded unchanged.

    Returns:
        Path to created folder.

    Example:
        >>> path = export_to_folder(
        ...     "data/synthetic",
        ...     X, y,
        ...     train_ratio=0.8,
        ...     wavelengths=wavelengths
        ... )
    """
    exporter = DatasetExporter()
    return exporter.to_folder(
        path, X, y,
        train_ratio=train_ratio,
        wavelengths=wavelengths,
        metadata=metadata,
        format=format,
        random_state=random_state,
    )
def export_to_csv(
    path: Union[str, Path],
    X: np.ndarray,
    y: np.ndarray,
    *,
    wavelengths: Optional[np.ndarray] = None,
    metadata: Optional[Dict[str, np.ndarray]] = None,
    include_targets: bool = True,
) -> Path:
    """
    Quick function to export synthetic data to single CSV.

    Builds a DatasetExporter with the default configuration and forwards
    every argument to its ``to_csv`` method.

    Args:
        path: Output file path.
        X: Feature matrix.
        y: Target values.
        wavelengths: Optional wavelength values.
        metadata: Optional dict of metadata arrays, forwarded unchanged.
        include_targets: Whether to include target column(s).

    Returns:
        Path to created file.

    Example:
        >>> path = export_to_csv("data.csv", X, y)
    """
    exporter = DatasetExporter()
    return exporter.to_csv(
        path, X, y,
        wavelengths=wavelengths,
        metadata=metadata,
        include_targets=include_targets,
    )