Source code for nirs4all.data.parsers.legacy_parser

"""
Legacy parser for dataset configuration.

This parser handles the current train_x/test_x format that is fully implemented
and widely used. It provides backward compatibility with existing configurations.
"""

from pathlib import Path
from typing import Any, Dict, Optional

from .base import BaseParser, ParserResult


# Alias table used to normalize dataset configuration keys.
# Each accepted (lower-case) spelling maps to one of six canonical keys:
# train_x, train_y, test_x, test_y, train_group, test_group.
# "val" spellings are folded into "test", and "metadata" / "meta" / "m"
# spellings are folded into "group".
KEY_MAPPINGS = {
    # train_x variations
    **dict.fromkeys(
        ('train_x', 'x_train', 'xtrain', 'trainx'),
        'train_x',
    ),

    # train_y variations
    **dict.fromkeys(
        ('train_y', 'y_train', 'ytrain', 'trainy'),
        'train_y',
    ),

    # test_x variations (including val)
    **dict.fromkeys(
        (
            'test_x', 'x_test', 'xtest', 'testx',
            'val_x', 'x_val', 'xval', 'valx',
        ),
        'test_x',
    ),

    # test_y variations (including val)
    **dict.fromkeys(
        (
            'test_y', 'y_test', 'ytest', 'testy',
            'val_y', 'y_val', 'yval', 'valy',
        ),
        'test_y',
    ),

    # train_group (metadata) variations
    **dict.fromkeys(
        (
            'train_group', 'group_train', 'grouptrain', 'traingroup',
            'train_metadata', 'metadata_train', 'metadatatrain', 'trainmetadata',
            'train_meta', 'meta_train', 'metatrain', 'trainmeta',
            'train_m', 'm_train', 'mtrain', 'trainm',
        ),
        'train_group',
    ),

    # test_group (metadata) variations (including val)
    **dict.fromkeys(
        (
            'test_group', 'group_test', 'grouptest', 'testgroup',
            'test_metadata', 'metadata_test', 'metadatatest', 'testmetadata',
            'test_meta', 'meta_test', 'metatest', 'testmeta',
            'test_m', 'm_test', 'mtest', 'testm',
            'val_group', 'group_val', 'groupval', 'valgroup',
            'val_metadata', 'metadata_val', 'metadataval', 'valmetadata',
            'val_meta', 'meta_val', 'metaval', 'valmeta',
            'val_m', 'm_val', 'mval', 'valm',
        ),
        'test_group',
    ),
}


def normalize_config_keys(config: Dict[str, Any]) -> Dict[str, Any]:
    """Normalize dataset configuration keys to the standard format.

    String keys are looked up case-insensitively in ``KEY_MAPPINGS``, so
    variations such as 'x_train', 'X_train' or 'Xtrain' become 'train_x', and
    metadata variations such as 'metadata_train', 'train_metadata' or
    'm_train' become 'train_group'. Keys without a known alias are kept
    exactly as given (original casing included).

    Non-string keys are copied through untouched instead of raising, so the
    function can safely be used to probe arbitrary dictionaries.

    If several input keys normalize to the same standard key, the one that
    appears last in ``config`` wins.

    Args:
        config: Original configuration dictionary. It is not modified.

    Returns:
        A new dictionary with standardized keys and the original values.
    """
    normalized = {}
    for key, value in config.items():
        # Only strings can be aliases; calling .lower() on anything else
        # (int, tuple, ...) would raise AttributeError.
        if isinstance(key, str):
            key = KEY_MAPPINGS.get(key.lower(), key)
        normalized[key] = value
    return normalized
class LegacyParser(BaseParser):
    """Parser for the legacy train_x/test_x configuration format.

    This parser handles dictionary configurations using the established key
    format: train_x, train_y, test_x, test_y, train_group, test_group.

    It also handles flexible key naming (X_train, Xtrain, etc.) by
    normalizing keys to the standard format before inspecting them.
    """

    def can_parse(self, input_data: Any) -> bool:
        """Check whether the input looks like a legacy format configuration.

        Args:
            input_data: The input to check.

        Returns:
            True if ``input_data`` is a dict that, after key normalization,
            contains at least one legacy key, or that contains a 'folder'
            key. False otherwise (including for any non-dict input).
        """
        if not isinstance(input_data, dict):
            return False

        # Normalize first so aliases such as 'X_train' are recognized.
        normalized = normalize_config_keys(input_data)

        legacy_keys = (
            'train_x', 'train_y', 'test_x', 'test_y',
            'train_group', 'test_group',
        )
        has_legacy_keys = any(key in normalized for key in legacy_keys)

        # Also accept the folder dict format.
        # NOTE(review): a dict holding only 'folder' is accepted here, yet
        # parse() reports an error unless train_x or test_x is set - confirm
        # whether folder dicts are expected to be expanded before parse().
        has_folder = 'folder' in input_data

        return has_legacy_keys or has_folder

    def parse(self, input_data: Dict[str, Any]) -> ParserResult:
        """Parse a legacy format configuration.

        Args:
            input_data: Dictionary configuration to parse.

        Returns:
            ParserResult carrying the key-normalized configuration and the
            inferred dataset name on success. On failure, ``success`` is
            False and ``errors`` explains why: either the input is not a
            dict, or neither train_x nor test_x holds a non-None value.
        """
        if not isinstance(input_data, dict):
            return ParserResult(
                success=False,
                errors=[f"Expected dict, got {type(input_data).__name__}"],
                source_type="unknown"
            )

        # Normalize keys
        config = normalize_config_keys(input_data)
        warnings = []

        # Extract dataset name
        dataset_name = self._infer_dataset_name(config)

        # Validate required data: at least one X source must be provided.
        has_train = config.get('train_x') is not None
        has_test = config.get('test_x') is not None

        if not has_train and not has_test:
            return ParserResult(
                success=False,
                errors=["No data source found. Provide train_x or test_x."],
                source_type="dict"
            )

        return ParserResult(
            success=True,
            config=config,
            dataset_name=dataset_name,
            warnings=warnings,
            source_type="dict"
        )

    def _infer_dataset_name(self, config: Dict[str, Any]) -> str:
        """Infer the dataset name from a configuration.

        Priority:
        1. The 'name' entry of the config, when present and not None.
        2. The path found in train_x, then test_x, rendered as
           ``"<parent directory name>_<file stem>"``. For a list
           (multi-source) only the first element is considered. A bare
           file name has an empty parent directory name, so the result then
           starts with an underscore.
        3. The default 'array_dataset', used when no path is available
           (for example when the data is given as in-memory arrays).

        Args:
            config: Normalized configuration dictionary.

        Returns:
            Inferred dataset name.
        """
        # An explicit name wins; a None placeholder (e.g. an empty YAML
        # value) is treated as "not provided" so a usable name is inferred.
        explicit_name = config.get('name')
        if explicit_name is not None:
            return explicit_name

        # Try to extract the name from a file path.
        for key in ('train_x', 'test_x'):
            path_value = config.get(key)
            if path_value is None:
                continue

            # Handle list (multi-source) - use the first entry.
            if isinstance(path_value, list):
                if not path_value:
                    continue
                path_value = path_value[0]

            # String paths and Path objects share the same naming rule.
            if isinstance(path_value, (str, Path)):
                path = Path(path_value)
                return f"{path.parent.name}_{path.stem}"

        return "array_dataset"