# Source code for nirs4all.data.parsers.legacy_parser

"""
Legacy parser for dataset configuration.

This parser handles the current train_x/test_x format that is fully implemented
and widely used. It provides backward compatibility with existing configurations.
"""

from pathlib import Path
from typing import Any, Dict, Optional

from .base import BaseParser, ParserResult


# Alias table used for key normalization.
# Every canonical key lists all accepted (lower-case) spellings. The order of
# the canonical keys and of the aliases inside each tuple defines the
# insertion order of KEY_MAPPINGS below.
_ALIASES_BY_CANONICAL_KEY = {
    'train_x': ('train_x', 'x_train', 'xtrain', 'trainx'),
    'train_y': ('train_y', 'y_train', 'ytrain', 'trainy'),

    # Validation ("val") spellings are folded into the test partition.
    'test_x': (
        'test_x', 'x_test', 'xtest', 'testx',
        'val_x', 'x_val', 'xval', 'valx',
    ),
    'test_y': (
        'test_y', 'y_test', 'ytest', 'testy',
        'val_y', 'y_val', 'yval', 'valy',
    ),

    # Group / metadata spellings ("group", "metadata", "meta", "m").
    'train_group': (
        'train_group', 'group_train', 'grouptrain', 'traingroup',
        'train_metadata', 'metadata_train', 'metadatatrain', 'trainmetadata',
        'train_meta', 'meta_train', 'metatrain', 'trainmeta',
        'train_m', 'm_train', 'mtrain', 'trainm',
    ),
    'test_group': (
        'test_group', 'group_test', 'grouptest', 'testgroup',
        'test_metadata', 'metadata_test', 'metadatatest', 'testmetadata',
        'test_meta', 'meta_test', 'metatest', 'testmeta',
        'test_m', 'm_test', 'mtest', 'testm',
        'val_group', 'group_val', 'groupval', 'valgroup',
        'val_metadata', 'metadata_val', 'metadataval', 'valmetadata',
        'val_meta', 'meta_val', 'metaval', 'valmeta',
        'val_m', 'm_val', 'mval', 'valm',
    ),
}

# Flat lookup: accepted spelling -> canonical key.
KEY_MAPPINGS = {
    alias: canonical_key
    for canonical_key, aliases in _ALIASES_BY_CANONICAL_KEY.items()
    for alias in aliases
}



def normalize_config_keys(config: Dict[str, Any]) -> Dict[str, Any]:
    """Normalize dataset configuration keys to standard format.

    Each string key is looked up case-insensitively in ``KEY_MAPPINGS``, so
    variations like 'x_train', 'X_train', 'Xtrain' become 'train_x' and
    metadata variations like 'metadata_train', 'train_metadata', 'm_train'
    become 'train_group'. Keys without a mapping are kept exactly as given
    (original case included). Non-string keys cannot be aliases and are
    passed through unchanged instead of raising.

    If several input keys normalize to the same standard key, the value of
    the last one (in the input's iteration order) is the one kept.

    Args:
        config: Original configuration dictionary. It is not modified.

    Returns:
        A new dictionary with standardized keys and the original values.
    """
    normalized: Dict[str, Any] = {}

    for key, value in config.items():
        # Only strings have a lower-case form to look up; calling .lower()
        # on anything else (int, tuple, ...) would raise AttributeError.
        if isinstance(key, str):
            key = KEY_MAPPINGS.get(key.lower(), key)
        normalized[key] = value

    return normalized




class LegacyParser(BaseParser):
    """Parser for legacy train_x/test_x configuration format.

    This parser handles dictionary configurations using the established
    key format: train_x, train_y, test_x, test_y, train_group, test_group.

    It also handles flexible key naming (X_train, Xtrain, etc.): keys are run
    through ``normalize_config_keys`` before they are inspected.
    """

    def can_parse(self, input_data: Any) -> bool:
        """Check if this is a legacy format configuration.

        This is a pure probe: it never raises, whatever ``input_data`` is.

        Args:
            input_data: The input to check.

        Returns:
            True if input is a dict that has a 'folder' key or at least one
            key that normalizes to a legacy key; False otherwise.
        """
        if not isinstance(input_data, dict):
            return False

        # Folder dict format is accepted regardless of the other keys.
        # NOTE(review): parse() only succeeds when train_x/test_x is present,
        # so a folder-only dict is presumably expanded before parse() is
        # called - confirm against the caller.
        if 'folder' in input_data:
            return True

        # Key normalization lowercases keys, so only string keys are handed
        # to it; a non-string key can never be a legacy key anyway.
        string_keyed = {
            key: value for key, value in input_data.items() if isinstance(key, str)
        }
        normalized = normalize_config_keys(string_keyed)

        legacy_keys = ('train_x', 'train_y', 'test_x', 'test_y', 'train_group', 'test_group')
        return any(key in normalized for key in legacy_keys)

    def parse(self, input_data: Dict[str, Any]) -> ParserResult:
        """Parse a legacy format configuration.

        Problems are reported through the returned ``ParserResult`` rather
        than raised: a non-dict input, keys that cannot be normalized, and a
        configuration without train_x/test_x all yield ``success=False``.
        When several input keys normalize to the same standard key, parsing
        still succeeds but one warning per collision is attached.

        Args:
            input_data: Dictionary configuration to parse.

        Returns:
            ParserResult with normalized configuration.
        """
        if not isinstance(input_data, dict):
            return ParserResult(
                success=False,
                errors=[f"Expected dict, got {type(input_data).__name__}"],
                source_type="unknown"
            )

        # Normalize keys. Normalization lowercases each key, which can fail
        # with AttributeError on a non-string key; report that as a parse
        # error instead of letting it escape.
        try:
            config = normalize_config_keys(input_data)
        except AttributeError:
            bad_keys = [key for key in input_data if not isinstance(key, str)]
            return ParserResult(
                success=False,
                errors=[f"Configuration keys must be strings, got: {bad_keys!r}"],
                source_type="dict"
            )

        warnings = self._alias_collision_warnings(input_data)

        # Extract dataset name
        dataset_name = self._infer_dataset_name(config)

        # Validate required data: at least one X source must be provided.
        has_train = config.get('train_x') is not None
        has_test = config.get('test_x') is not None

        if not has_train and not has_test:
            return ParserResult(
                success=False,
                errors=["No data source found. Provide train_x or test_x."],
                source_type="dict"
            )

        return ParserResult(
            success=True,
            config=config,
            dataset_name=dataset_name,
            warnings=warnings,
            source_type="dict"
        )

    @staticmethod
    def _alias_collision_warnings(input_data: Dict[str, Any]) -> list:
        """Describe input keys that normalize to the same standard key.

        Normalization keeps a single value per standard key, so providing
        e.g. both 'test_x' and 'val_x' silently drops one of them. This
        helper builds one human-readable warning per such collision.

        Args:
            input_data: The configuration dictionary as given by the user.

        Returns:
            List of warning strings; empty when no two keys collide.
        """
        aliases_by_key: Dict[Any, list] = {}
        for key in input_data:
            if not isinstance(key, str):
                continue
            # Normalize one key at a time to learn which standard key it
            # ends up under, without duplicating the mapping rules here.
            standard_key = next(iter(normalize_config_keys({key: None})))
            aliases_by_key.setdefault(standard_key, []).append(key)

        return [
            f"Keys {aliases!r} all map to '{standard_key}'; "
            f"only the value of '{aliases[-1]}' is kept."
            for standard_key, aliases in aliases_by_key.items()
            if len(aliases) > 1
        ]

    def _infer_dataset_name(self, config: Dict[str, Any]) -> str:
        """Infer dataset name from configuration.

        Priority:
        1. 'name' key in config (ignored when its value is None)
        2. Path from train_x or test_x, as '<parent folder>_<file stem>'
        3. Default 'array_dataset'

        Args:
            config: Normalized configuration dictionary.

        Returns:
            Inferred dataset name.
        """
        # Check for explicit name; a None placeholder does not count as one.
        explicit_name = config.get('name')
        if explicit_name is not None:
            return explicit_name

        # Try to extract from file path
        for key in ('train_x', 'test_x'):
            path_value = config.get(key)
            if path_value is None:
                continue

            # Handle list (multi-source) - use first path
            if isinstance(path_value, list):
                if not path_value:
                    continue
                path_value = path_value[0]

            # String paths and Path objects are named the same way. Any
            # other value (presumably in-memory data) has no path to use,
            # so the next key is tried.
            if isinstance(path_value, (str, Path)):
                path = Path(path_value)
                return f"{path.parent.name}_{path.stem}"

        return "array_dataset"