# Source code for nirs4all.data.selection.column_selector

"""
Column selector for dataset configuration.

This module provides flexible column selection for DataFrames, supporting
multiple selection syntaxes including indices, names, ranges, regex patterns,
and exclusion.

Example:
    >>> selector = ColumnSelector()
    >>> # By name
    >>> cols = selector.select(df, ["col1", "col2"])
    >>> # By index
    >>> cols = selector.select(df, [0, 1, 2])
    >>> # By range (slice syntax)
    >>> cols = selector.select(df, "2:-1")
    >>> # By regex pattern
    >>> cols = selector.select(df, {"regex": "^feature_.*"})
    >>> # By exclusion
    >>> cols = selector.select(df, {"exclude": ["id", "date"]})
"""

import re
from dataclasses import dataclass
from typing import Any, Dict, List, Optional, Sequence, Tuple, Union

import numpy as np
import pandas as pd



# [docs]
class ColumnSelectionError(Exception):
    """Signal that a column selection could not be resolved.

    Used throughout this module for invalid selection specifications,
    unknown column names, out-of-range indices, and selections that end
    up matching no column at all.
    """



# Type alias for column selection specification.
# It mirrors the input kinds ``ColumnSelector.select`` dispatches on; tuples
# are listed explicitly because ``select`` handles them exactly like lists.
ColumnSpec = Union[
    int,                     # Single index
    str,                     # Single name or range string
    List[int],               # List of indices
    List[str],               # List of names
    Tuple[int, ...],         # Tuple of indices
    Tuple[str, ...],         # Tuple of names
    Dict[str, Any],          # Complex selection (regex, exclude, etc.)
    slice,                   # Python slice object
    None,                    # Select all columns
]



# [docs]
@dataclass
class SelectionResult:
    """Outcome of resolving a column selection against a DataFrame.

    Attributes:
        indices: Zero-based positions of the selected columns.
        names: Labels of the selected columns.
        data: DataFrame subset holding the selected columns.
    """

    indices: List[int]
    names: List[str]
    data: pd.DataFrame




# [docs]
class ColumnSelector:
    """Flexible column selector for DataFrames.

    Supports multiple selection methods:
    - By name: `["col1", "col2"]` or `"col_name"`
    - By index: `[0, 1, 2]` or `0`
    - By range: `"2:-1"` (slice syntax as string)
    - By regex pattern: `{"regex": "^feature_.*"}`
    - By exclusion: `{"exclude": ["id", "date"]}`
    - Combined: `{"include": [0, 1], "exclude": ["id"]}`

    Example:
        >>> selector = ColumnSelector()
        >>> result = selector.select(df, "2:-1")
        >>> print(result.names)  # Column names in range
        >>> print(result.data)   # Selected columns as DataFrame
    """

    def __init__(self, case_sensitive: bool = True):
        """Initialize the column selector.

        Args:
            case_sensitive: Whether column name matching is case-sensitive.
        """
        self.case_sensitive = case_sensitive


    # [docs]
    def select(
        self,
        df: pd.DataFrame,
        selection: ColumnSpec,
    ) -> SelectionResult:
        """Select columns from a DataFrame.

        Dispatches on the type of ``selection`` to the matching private
        resolver of this class.

        Args:
            df: The DataFrame to select columns from.
            selection: Column selection specification. Can be:
                - None: Select all columns
                - int (or NumPy integer scalar): Single column index
                - str: Single column name or range string ("2:-1")
                - slice: Python slice over column positions
                - list/tuple of int: Column indices
                - list/tuple of str: Column names
                - dict: Complex selection (see class docstring)

        Returns:
            SelectionResult with indices, names, and selected data.

        Raises:
            ColumnSelectionError: If selection is invalid or columns not found.
        """
        expected = "Expected int, str, slice, list, tuple, dict, or None."

        if selection is None:
            # Select all columns
            return SelectionResult(
                indices=list(range(len(df.columns))),
                names=df.columns.tolist(),
                data=df.copy(),
            )

        # ``bool`` is a subclass of ``int``; reject it up front so True/False
        # are never silently interpreted as column positions.
        if isinstance(selection, bool):
            raise ColumnSelectionError(
                f"Unsupported selection type: {type(selection).__name__}. "
                f"{expected}"
            )

        # NumPy integer scalars are not ``int`` instances, but list selections
        # already accept them, so a lone scalar is accepted as well.
        if isinstance(selection, (int, np.integer)):
            return self._select_by_single_index(df, int(selection))

        if isinstance(selection, str):
            return self._select_by_string(df, selection)

        if isinstance(selection, slice):
            return self._select_by_slice(df, selection)

        if isinstance(selection, (list, tuple)):
            return self._select_by_list(df, selection)

        if isinstance(selection, dict):
            return self._select_by_dict(df, selection)

        raise ColumnSelectionError(
            f"Unsupported selection type: {type(selection).__name__}. "
            f"{expected}"
        )


    def _select_by_single_index(
        self,
        df: pd.DataFrame,
        index: int,
    ) -> SelectionResult:
        """Select a single column by position.

        Args:
            df: The DataFrame to select from.
            index: Column position; negative values count from the end.

        Returns:
            SelectionResult holding exactly one column.

        Raises:
            ColumnSelectionError: If ``index`` falls outside the frame.
        """
        n_cols = len(df.columns)

        # Resolve negative positions into a separate variable so the error
        # message can report the value the caller actually supplied.
        position = n_cols + index if index < 0 else index

        if position < 0 or position >= n_cols:
            raise ColumnSelectionError(
                f"Column index {index} out of range. "
                f"DataFrame has {n_cols} columns (0-{n_cols - 1})."
            )

        return SelectionResult(
            indices=[position],
            names=[df.columns[position]],
            data=df.iloc[:, [position]].copy(),
        )

    def _select_by_string(
        self,
        df: pd.DataFrame,
        selection: str,
    ) -> SelectionResult:
        """Select by string (column name or range).

        A string containing ":" is first interpreted as slice syntax. If
        that interpretation fails, the string is tried as a literal column
        name, so labels such as "time:stamp" remain selectable. Strings
        without ":" are always treated as column names.

        Args:
            df: The DataFrame to select from.
            selection: Column name or range string such as "2:-1".

        Returns:
            SelectionResult for the resolved column(s).

        Raises:
            ColumnSelectionError: If the string is neither a usable range
                nor an existing column name. For strings containing ":"
                the range error is the one reported.
        """
        if ":" not in selection:
            return self._select_by_name(df, selection)

        try:
            return self._select_by_range_string(df, selection)
        except ColumnSelectionError:
            # A colon does not guarantee slice syntax: fall back to an exact
            # name lookup before giving up.
            try:
                by_name = self._select_by_name(df, selection)
            except ColumnSelectionError:
                by_name = None
            if by_name is None:
                # Re-raise the original range error unchanged.
                raise
            return by_name

    def _select_by_name(
        self,
        df: pd.DataFrame,
        name: str,
    ) -> SelectionResult:
        """Select a single column by name.

        The first column whose label matches ``name`` is returned. Matching
        honours ``self.case_sensitive``.

        Args:
            df: The DataFrame to select from.
            name: Column label to look up.

        Returns:
            SelectionResult holding exactly one column.

        Raises:
            ColumnSelectionError: If no column matches ``name``.
        """
        col_names = df.columns.tolist()

        if self.case_sensitive:
            candidates, target = col_names, name
        else:
            # Non-string labels (e.g. integers) cannot be lower-cased; keep
            # them as they are so the lookup does not crash on mixed columns.
            candidates = [
                label.lower() if isinstance(label, str) else label
                for label in col_names
            ]
            target = name.lower()

        try:
            idx = candidates.index(target)
        except ValueError:
            raise ColumnSelectionError(
                f"Column '{name}' not found. Available columns: {col_names[:10]}"
                + ("..." if len(col_names) > 10 else "")
            ) from None

        # Select positionally: a label lookup would return every column that
        # shares the label, disagreeing with the single reported index.
        return SelectionResult(
            indices=[idx],
            names=[col_names[idx]],
            data=df.iloc[:, [idx]].copy(),
        )

    def _select_by_range_string(
        self,
        df: pd.DataFrame,
        range_str: str,
    ) -> SelectionResult:
        """Select columns by range string (slice syntax).

        Examples:
            "2:-1" -> columns from index 2 to second-to-last
            ":5" -> first 5 columns
            "3:" -> columns from index 3 to end
            "1:10:2" -> columns 1, 3, 5, 7, 9

        Args:
            df: The DataFrame to select from.
            range_str: "start:stop" or "start:stop:step"; any part may be
                left empty.

        Returns:
            SelectionResult for the columns covered by the range.

        Raises:
            ColumnSelectionError: If the string has the wrong number of
                parts, a part is not an integer, or the step is zero.
        """
        parts = range_str.split(":")
        if len(parts) not in (2, 3):
            raise ColumnSelectionError(
                f"Invalid range format: '{range_str}'. "
                f"Expected format: 'start:stop' or 'start:stop:step'."
            )

        # Pad a two-part range with an empty step so all three bounds are
        # parsed the same way.
        start_str, stop_str, step_str = parts + [""] * (3 - len(parts))

        try:
            start = int(start_str) if start_str.strip() else None
            stop = int(stop_str) if stop_str.strip() else None
            step = int(step_str) if step_str.strip() else None
        except ValueError as e:
            raise ColumnSelectionError(
                f"Invalid range values in '{range_str}': {e}"
            ) from e

        # A zero step would otherwise surface later as a bare ValueError from
        # ``slice.indices``.
        if step == 0:
            raise ColumnSelectionError(
                f"Invalid range values in '{range_str}': step cannot be zero."
            )

        return self._select_by_slice(df, slice(start, stop, step))

    def _select_by_slice(
        self,
        df: pd.DataFrame,
        slc: slice,
    ) -> SelectionResult:
        """Select columns by slice object.

        Args:
            df: The DataFrame to select from.
            slc: Slice over column positions.

        Returns:
            SelectionResult for the columns covered by the slice.

        Raises:
            ColumnSelectionError: If the slice is invalid (zero step or
                non-integer bounds) or covers no column.
        """
        n_cols = len(df.columns)

        try:
            indices = list(range(*slc.indices(n_cols)))
        except (ValueError, TypeError) as e:
            # ``slice.indices`` raises ValueError for a zero step and
            # TypeError for non-integer bounds; report both as selection
            # errors.
            raise ColumnSelectionError(f"Invalid slice {slc}: {e}") from e

        if not indices:
            raise ColumnSelectionError(
                f"Slice {slc} results in empty selection for DataFrame with {n_cols} columns."
            )

        names = [df.columns[i] for i in indices]
        return SelectionResult(
            indices=indices,
            names=names,
            data=df.iloc[:, indices].copy(),
        )

    def _select_by_list(
        self,
        df: pd.DataFrame,
        selection: Sequence,
    ) -> SelectionResult:
        """Select columns by list of indices or names.

        The type of the first item decides whether the sequence is treated
        as positions or as names.

        Args:
            df: The DataFrame to select from.
            selection: Non-empty sequence of column indices or column names.

        Returns:
            SelectionResult for the listed columns, in the given order.

        Raises:
            ColumnSelectionError: If the sequence is empty, its first item
                is neither int nor str, or an index list contains a
                non-integer item.
        """
        if not selection:
            raise ColumnSelectionError("Empty selection list provided.")

        # Determine if list contains indices or names
        first_item = selection[0]

        if isinstance(first_item, (int, np.integer)):
            # Index resolution compares every item numerically, so a stray
            # non-integer would otherwise fail with a raw TypeError.
            for item in selection:
                if not isinstance(item, (int, np.integer)):
                    raise ColumnSelectionError(
                        f"Mixed selection list: expected only int indices, "
                        f"got {type(item).__name__} ({item!r})."
                    )
            return self._select_by_index_list(df, list(selection))

        if isinstance(first_item, str):
            return self._select_by_name_list(df, list(selection))

        raise ColumnSelectionError(
            f"Unsupported list item type: {type(first_item).__name__}. "
            f"Expected int or str."
        )

    def _select_by_index_list(
        self,
        df: pd.DataFrame,
        indices: List[int],
    ) -> SelectionResult:
        """Resolve a list of column positions into a selection.

        Negative positions count from the last column. The result keeps
        the order (and any repetitions) of ``indices``.

        Raises:
            ColumnSelectionError: If any position lies outside the frame.
        """
        labels = df.columns
        total = len(labels)
        positions = []

        for raw in indices:
            # Translate "from the end" positions into absolute ones.
            if raw >= 0:
                absolute = raw
            else:
                absolute = total + raw

            if absolute >= total or absolute < 0:
                raise ColumnSelectionError(
                    f"Column index {raw} out of range. "
                    f"DataFrame has {total} columns (0-{total - 1})."
                )
            positions.append(absolute)

        picked_names = [labels[pos] for pos in positions]
        subset = df.iloc[:, positions].copy()
        return SelectionResult(indices=positions, names=picked_names, data=subset)

    def _select_by_name_list(
        self,
        df: pd.DataFrame,
        names: List[str],
    ) -> SelectionResult:
        """Select columns by list of names.

        Each name resolves to the first column whose label matches it.
        Matching honours ``self.case_sensitive``. The result keeps the
        order of ``names``.

        Args:
            df: The DataFrame to select from.
            names: Column labels to look up.

        Returns:
            SelectionResult for the resolved columns.

        Raises:
            ColumnSelectionError: If any name matches no column.
        """
        col_names = df.columns.tolist()

        # Build the lookup list once rather than once per requested name.
        if self.case_sensitive:
            lookup = col_names
            qualifier = ""
        else:
            # Non-string labels cannot be lower-cased; keep them as they are.
            lookup = [
                label.lower() if isinstance(label, str) else label
                for label in col_names
            ]
            qualifier = " (case-insensitive)"

        resolved_indices = []
        for name in names:
            if self.case_sensitive or not isinstance(name, str):
                key = name
            else:
                key = name.lower()
            try:
                resolved_indices.append(lookup.index(key))
            except ValueError:
                raise ColumnSelectionError(
                    f"Column '{name}' not found{qualifier}. "
                    f"Available: {col_names[:10]}"
                    + ("..." if len(col_names) > 10 else "")
                ) from None

        resolved_names = [col_names[i] for i in resolved_indices]

        # Select positionally so ``data`` stays aligned with ``indices`` even
        # when several columns share a label.
        return SelectionResult(
            indices=resolved_indices,
            names=resolved_names,
            data=df.iloc[:, resolved_indices].copy(),
        )

    def _select_by_dict(
        self,
        df: pd.DataFrame,
        selection: Dict[str, Any],
    ) -> SelectionResult:
        """Select columns by dictionary specification.

        Every filter present in ``selection`` narrows the candidate set, so
        filters combine with AND. ``exclude`` is applied last. Keys other
        than the ones listed below are ignored.

        Supported keys:
        - "include": Columns to start from (any spec ``select`` accepts);
          defaults to all columns
        - "regex": Regular expression searched in column names
        - "startswith": Prefix to match
        - "endswith": Suffix to match
        - "contains": Substring to match
        - "dtype": Select columns by dtype (see ``_dtype_matches``)
        - "exclude": Columns to remove (any spec ``select`` accepts)

        Text and regex filters honour ``self.case_sensitive`` and are
        applied to ``str(label)``, so non-string column labels do not
        crash the match.

        Args:
            df: The DataFrame to select from.
            selection: Mapping of filter keys to their arguments.

        Returns:
            SelectionResult with the surviving columns in DataFrame order.

        Raises:
            ColumnSelectionError: If the regex is invalid, a nested
                include/exclude spec fails, or no column survives.
        """
        col_names = df.columns.tolist()
        # String form of every label, used by the text-based filters.
        labels = [str(name) for name in col_names]

        # Start with all columns or specified include set
        if "include" in selection:
            selected_indices = set(self.select(df, selection["include"]).indices)
        else:
            selected_indices = set(range(len(col_names)))

        # Apply regex filter
        if "regex" in selection:
            pattern = selection["regex"]
            flags = 0 if self.case_sensitive else re.IGNORECASE
            try:
                regex = re.compile(pattern, flags)
            except (re.error, TypeError, ValueError) as e:
                raise ColumnSelectionError(
                    f"Invalid regex pattern '{pattern}': {e}"
                ) from e
            selected_indices &= {
                i for i, label in enumerate(labels) if regex.search(label)
            }

        def fold(text):
            """Lower-case ``text`` unless matching is case-sensitive."""
            return text if self.case_sensitive else text.lower()

        # Apply startswith / endswith / contains filters, in that order
        text_filters = (
            ("startswith", lambda label, needle: label.startswith(needle)),
            ("endswith", lambda label, needle: label.endswith(needle)),
            ("contains", lambda label, needle: needle in label),
        )
        for key, matches in text_filters:
            if key in selection:
                needle = fold(selection[key])
                selected_indices &= {
                    i for i, label in enumerate(labels)
                    if matches(fold(label), needle)
                }

        # Apply dtype filter
        if "dtype" in selection:
            dtype_spec = selection["dtype"]
            # Walk ``df.dtypes`` positionally: ``df[col]`` returns a DataFrame
            # (no ``.dtype``) when several columns share a label.
            selected_indices &= {
                i for i, col_dtype in enumerate(df.dtypes)
                if self._dtype_matches(col_dtype, dtype_spec)
            }

        # Apply exclude filter
        if "exclude" in selection:
            exclude_result = self.select(df, selection["exclude"])
            selected_indices -= set(exclude_result.indices)

        if not selected_indices:
            raise ColumnSelectionError(
                f"Selection {selection} resulted in no columns. "
                f"Available columns: {col_names[:10]}" +
                ("..." if len(col_names) > 10 else "")
            )

        # Sort indices to maintain column order
        sorted_indices = sorted(selected_indices)
        names = [col_names[i] for i in sorted_indices]

        return SelectionResult(
            indices=sorted_indices,
            names=names,
            data=df.iloc[:, sorted_indices].copy(),
        )

    def _dtype_matches(self, actual_dtype, dtype_spec: str) -> bool:
        """Check if a dtype matches a specification."""
        dtype_str = str(actual_dtype).lower()
        spec_lower = dtype_spec.lower()

        # Handle common dtype categories
        dtype_categories = {
            "numeric": ["int", "float", "complex"],
            "integer": ["int"],
            "float": ["float"],
            "string": ["object", "string", "str"],
            "categorical": ["category"],
            "datetime": ["datetime"],
            "bool": ["bool"],
        }

        if spec_lower in dtype_categories:
            return any(cat in dtype_str for cat in dtype_categories[spec_lower])

        # Direct match
        return spec_lower in dtype_str


    # [docs]
    def parse_selection(
        self,
        selection: Any,
        available_columns: List[str],
    ) -> List[int]:
        """Resolve a selection to column indices using only column names.

        Convenience wrapper for callers that know the column names but have
        no DataFrame at hand. The selection is resolved by ``select``
        against an empty placeholder frame with those columns.

        Args:
            selection: Column selection specification.
            available_columns: List of available column names.

        Returns:
            List of column indices.

        Raises:
            ColumnSelectionError: If selection is invalid.
        """
        # An empty frame carrying only the column labels is enough for
        # ``select`` to resolve positions.
        placeholder = pd.DataFrame(columns=available_columns)
        return self.select(placeholder, selection).indices