Source code for nirs4all.data.parsers.base

"""
Base parser interface for dataset configuration.

This module defines the abstract base class for configuration parsers.
All parsers should inherit from BaseParser and implement the required methods.
"""

from abc import ABC, abstractmethod
from dataclasses import dataclass, field
from pathlib import Path
from typing import Any, Dict, List, Optional, Tuple, Union


@dataclass
class ParserResult:
    """Outcome of one attempt to parse a dataset configuration.

    Attributes:
        success: True when parsing succeeded.
        config: The parsed configuration dictionary (defaults to None).
        dataset_name: Dataset name that was extracted or inferred
            (defaults to None).
        errors: Error messages collected when parsing failed.
        warnings: Warning messages for non-fatal issues.
        source_type: Kind of source that was parsed
            ('dict', 'file', 'folder', 'array').
    """

    success: bool
    config: Optional[Dict[str, Any]] = None
    dataset_name: Optional[str] = None
    # default_factory gives every instance its own list instead of a shared one.
    errors: List[str] = field(default_factory=list)
    warnings: List[str] = field(default_factory=list)
    source_type: Optional[str] = None

    def __str__(self) -> str:
        """Return a short summary: the dataset name on success, the errors otherwise."""
        if not self.success:
            return f"ParserResult(success=False, errors={self.errors})"
        return f"ParserResult(success=True, name='{self.dataset_name}')"
class BaseParser(ABC):
    """Abstract base class for configuration parsers.

    Subclasses must implement:
        - can_parse(): Check if this parser can handle the input
        - parse(): Parse the input and return a ParserResult

    Two helpers for deriving a dataset name from a path are provided:
    ``_extract_name_from_path`` and ``_clean_name``.
    """

    @abstractmethod
    def can_parse(self, input_data: Any) -> bool:
        """Check if this parser can handle the given input.

        Args:
            input_data: The input to check.

        Returns:
            True if this parser can handle the input, False otherwise.
        """

    @abstractmethod
    def parse(self, input_data: Any) -> ParserResult:
        """Parse the input and return a configuration.

        Args:
            input_data: The input to parse.

        Returns:
            ParserResult with parsed configuration or errors.
        """

    def _extract_name_from_path(self, path: Union[str, Path]) -> str:
        """Extract a dataset name from a file or folder path.

        An existing directory is named after the folder itself. Otherwise,
        an existing file, or any path that has a suffix, is named after its
        stem (the final component without its last extension). Every other
        path is named after its final component.

        Args:
            path: Path to extract the name from.

        Returns:
            The name after cleaning with ``_clean_name``.
        """
        path_obj = Path(path)

        # Check for a real directory first: a folder such as "soil.v2" has a
        # non-empty suffix and would otherwise be cut down to "soil".
        if path_obj.is_dir():
            raw_name = path_obj.name
        elif path_obj.is_file() or path_obj.suffix:
            raw_name = path_obj.stem
        else:
            raw_name = path_obj.name

        return self._clean_name(raw_name)

    def _clean_name(self, name: str) -> str:
        """Clean a name to be a valid dataset identifier.

        Every character for which ``str.isalnum()`` is false becomes an
        underscore, runs of underscores collapse to one, leading and trailing
        underscores are dropped, and the result is lower-cased.

        Args:
            name: Raw name to clean.

        Returns:
            The cleaned name, or "unnamed_dataset" when nothing is left.
        """
        substituted = ''.join(c if c.isalnum() else '_' for c in name)

        # Splitting on '_' and discarding empty pieces collapses repeated
        # underscores and trims the ends in a single pass.
        pieces = [piece for piece in substituted.split('_') if piece]
        cleaned = '_'.join(pieces).lower()

        return cleaned or "unnamed_dataset"