Source code for nirs4all.data.selection.sample_linker

"""
Sample linker for dataset configuration.

This module provides key-based sample linking across multiple data files,
enabling joining of features, targets, and metadata by a common identifier.

Example:
    >>> linker = SampleLinker()
    >>> result = linker.link(
    ...     {"features": features_df, "targets": targets_df},
    ...     link_by="sample_id"
    ... )
    >>> print(result.linked_data)  # Joined DataFrame
"""

from dataclasses import dataclass, field
from typing import Any, Dict, List, Optional, Set, Union

import pandas as pd


class LinkingError(Exception):
    """Signal that samples could not be linked across sources."""
@dataclass
class LinkingResult:
    """Outcome of linking several sources on a shared key column.

    Attributes:
        linked_data: Linked DataFrames keyed by source name
            (key column removed).
        key_column: Name of the column the sources were linked on.
        matched_keys: Keys that are present in all sources.
        missing_keys: For each source name, the keys that source lacks.
        sample_count: Number of linked samples.
        report: Detailed linking report; an empty dict unless provided.
    """

    linked_data: Dict[str, pd.DataFrame]
    key_column: str
    matched_keys: Set[Any]
    missing_keys: Dict[str, Set[Any]]
    sample_count: int
    # default_factory gives every instance its own dict instead of a shared one.
    report: Dict[str, Any] = field(default_factory=dict)
class SampleLinker:
    """Link samples across multiple data files by key column.

    Supports multiple linking modes:

    - "inner": Keep only samples present in all sources (default)
    - "left": Keep all samples from the first source
    - "outer": Keep all samples from any source

    Example:
        >>> linker = SampleLinker()
        >>> result = linker.link(
        ...     {
        ...         "X": features_df,  # Has columns: sample_id, feature1, feature2
        ...         "Y": targets_df,   # Has columns: sample_id, target
        ...         "M": metadata_df,  # Has columns: sample_id, group, date
        ...     },
        ...     link_by="sample_id"
        ... )
        >>> # Linked DataFrames have aligned rows
        >>> X_linked = result.linked_data["X"]  # Without sample_id column
    """

    def __init__(
        self,
        mode: str = "inner",
        on_missing: str = "warn",
    ):
        """Initialize the sample linker.

        Args:
            mode: Linking mode - "inner", "left", or "outer".
            on_missing: Action when keys are missing - "warn", "error",
                or "ignore".

        Raises:
            ValueError: If ``mode`` or ``on_missing`` is not one of the
                accepted values.
        """
        if mode not in ("inner", "left", "outer"):
            raise ValueError(f"Invalid mode: {mode}. Expected 'inner', 'left', or 'outer'.")
        if on_missing not in ("warn", "error", "ignore"):
            raise ValueError(f"Invalid on_missing: {on_missing}. Expected 'warn', 'error', or 'ignore'.")

        self.mode = mode
        self.on_missing = on_missing

    def create_sample_index(
        self,
        sources: Dict[str, pd.DataFrame],
        link_by: str,
    ) -> pd.DataFrame:
        """Create a sample index showing key presence across sources.

        Rows are ordered by first appearance of each key, scanning the
        sources in the order they are given. This keeps the result
        reproducible between runs; iterating a plain ``set`` of string keys
        would not be, because string hashing is randomized per process.

        Args:
            sources: Dictionary of source DataFrames.
            link_by: Key column name.

        Returns:
            DataFrame indexed by ``link_by`` with one boolean ``in_<name>``
            column per source plus an ``in_all`` column that is True when
            the key is present in every source. A source that lacks the
            ``link_by`` column contributes no keys, so its column is all
            False.
        """
        # Unique keys per source, kept in the order they occur in the data.
        ordered_keys: Dict[str, List[Any]] = {
            name: list(df[link_by].unique()) if link_by in df.columns else []
            for name, df in sources.items()
        }
        # Set views of the same keys for O(1) membership tests below.
        key_sets: Dict[str, Set[Any]] = {
            name: set(keys) for name, keys in ordered_keys.items()
        }

        # dict.fromkeys de-duplicates while preserving first-seen order.
        all_keys = list(
            dict.fromkeys(key for keys in ordered_keys.values() for key in keys)
        )

        index_data: Dict[str, List[Any]] = {link_by: all_keys}
        for name, keys in key_sets.items():
            index_data[f"in_{name}"] = [key in keys for key in all_keys]

        result = pd.DataFrame(index_data)
        result["in_all"] = result[[f"in_{name}" for name in key_sets]].all(axis=1)
        return result.set_index(link_by)