# Source code for nirs4all.data.schema.validation.error_codes

"""
Error codes and diagnostics for dataset configuration.

This module provides comprehensive error codes, diagnostic messages,
and suggestion systems for configuration issues.

Phase 8 Implementation - Dataset Configuration Roadmap
Section 8.4: Error Handling & Diagnostics
"""

from dataclasses import dataclass, field
from enum import Enum
from typing import Any, Dict, List, Optional, Type


class ErrorCategory(str, Enum):
    """Top-level grouping of a configuration error.

    Members are plain strings (the class mixes in ``str``), so a member
    compares equal to its lowercase value, e.g.
    ``ErrorCategory.FILE == "file"``.
    """

    # Schema validation errors
    SCHEMA = "schema"
    # File-related errors
    FILE = "file"
    # Data content errors
    DATA = "data"
    # Loading/parsing errors
    LOADING = "loading"
    # Partition-related errors
    PARTITION = "partition"
    # Aggregation errors
    AGGREGATION = "aggregation"
    # Variation/source errors
    VARIATION = "variation"
    # Cross-validation fold errors
    FOLD = "fold"
    # Runtime errors
    RUNTIME = "runtime"
class ErrorSeverity(str, Enum):
    """How serious a diagnostic is.

    Members are plain strings (the class mixes in ``str``), so a member
    compares equal to its lowercase value, e.g.
    ``ErrorSeverity.ERROR == "error"``.
    """

    # Fatal error, cannot proceed
    ERROR = "error"
    # Non-fatal issue
    WARNING = "warning"
    # Informational message
    INFO = "info"
@dataclass
class ErrorCode:
    """Static description of one kind of configuration problem.

    Attributes:
        code: Unique identifier of the error (e.g., "E001").
        category: The ErrorCategory this error belongs to.
        severity: The ErrorSeverity of this error.
        message_template: Message text containing ``{placeholder}`` fields.
        suggestion_template: Optional text describing how to fix the
            problem; may contain ``{placeholder}`` fields as well.
        documentation_url: Optional link to relevant documentation.
    """

    # Required fields.
    code: str
    category: ErrorCategory
    severity: ErrorSeverity
    message_template: str

    # Optional fields.
    suggestion_template: Optional[str] = None
    documentation_url: Optional[str] = None
# =============================================================================
# Error Code Registry
# =============================================================================
class ErrorRegistry:
    """Registry of all error codes.

    Every error code is a class attribute whose name equals the code string
    (``ErrorRegistry.E200.code == "E200"``).  The hundreds digit groups the
    codes by category: E1xx schema, E2xx file, E3xx data, E4xx loading,
    E5xx partition, E6xx aggregation, E7xx variation/source, E8xx fold,
    E9xx runtime.
    """

    # Kept for backward compatibility; no method of this class reads it.
    _codes: Dict[str, ErrorCode] = {}

    # ------------------------------------------------------------------
    # Schema Errors (E1xx)
    # ------------------------------------------------------------------
    E100 = ErrorCode(
        code="E100",
        category=ErrorCategory.SCHEMA,
        severity=ErrorSeverity.ERROR,
        message_template="Invalid configuration structure: {details}",
        suggestion_template="Check that the configuration is a valid dictionary.",
    )
    E101 = ErrorCode(
        code="E101",
        category=ErrorCategory.SCHEMA,
        severity=ErrorSeverity.ERROR,
        message_template="Missing required field: {field}",
        suggestion_template="Add the '{field}' field to your configuration.",
    )
    E102 = ErrorCode(
        code="E102",
        category=ErrorCategory.SCHEMA,
        severity=ErrorSeverity.ERROR,
        message_template="Invalid type for '{field}': expected {expected}, got {actual}",
        suggestion_template="Change '{field}' to be of type {expected}.",
    )
    E103 = ErrorCode(
        code="E103",
        category=ErrorCategory.SCHEMA,
        severity=ErrorSeverity.ERROR,
        message_template="Invalid value for '{field}': {value}. Valid values: {valid_values}",
        suggestion_template="Use one of the valid values: {valid_values}",
    )
    E104 = ErrorCode(
        code="E104",
        category=ErrorCategory.SCHEMA,
        severity=ErrorSeverity.ERROR,
        message_template="No data source specified",
        suggestion_template="Add 'train_x', 'test_x', 'folder', 'sources', or 'variations' to your configuration.",
    )

    # ------------------------------------------------------------------
    # File Errors (E2xx)
    # ------------------------------------------------------------------
    E200 = ErrorCode(
        code="E200",
        category=ErrorCategory.FILE,
        severity=ErrorSeverity.ERROR,
        message_template="File not found: {path}",
        suggestion_template="Check that the file path is correct and the file exists.",
    )
    E201 = ErrorCode(
        code="E201",
        category=ErrorCategory.FILE,
        severity=ErrorSeverity.ERROR,
        message_template="Cannot read file: {path}. Error: {error}",
        suggestion_template="Check file permissions and encoding.",
    )
    E202 = ErrorCode(
        code="E202",
        category=ErrorCategory.FILE,
        severity=ErrorSeverity.ERROR,
        message_template="Unsupported file format: {format}",
        suggestion_template="Supported formats: CSV, NPY, NPZ, Parquet, Excel, MATLAB",
    )
    E203 = ErrorCode(
        code="E203",
        category=ErrorCategory.FILE,
        severity=ErrorSeverity.ERROR,
        message_template="Empty file: {path}",
        suggestion_template="Ensure the file contains data.",
    )
    E204 = ErrorCode(
        code="E204",
        category=ErrorCategory.FILE,
        severity=ErrorSeverity.WARNING,
        message_template="File encoding issue: {path}. Using fallback encoding: {encoding}",
        suggestion_template="Specify the encoding explicitly in loading parameters.",
    )

    # ------------------------------------------------------------------
    # Data Errors (E3xx)
    # ------------------------------------------------------------------
    E300 = ErrorCode(
        code="E300",
        category=ErrorCategory.DATA,
        severity=ErrorSeverity.ERROR,
        message_template="Data shape mismatch: {details}",
        suggestion_template="Ensure all data arrays have consistent sample counts.",
    )
    E301 = ErrorCode(
        code="E301",
        category=ErrorCategory.DATA,
        severity=ErrorSeverity.ERROR,
        message_template="NA values found in data and na_policy='abort': {details}",
        suggestion_template="Set na_policy='remove' or clean your data before loading.",
    )
    E302 = ErrorCode(
        code="E302",
        category=ErrorCategory.DATA,
        severity=ErrorSeverity.WARNING,
        message_template="NA values removed: {count} rows affected",
        suggestion_template="Review your data for missing values.",
    )
    E303 = ErrorCode(
        code="E303",
        category=ErrorCategory.DATA,
        severity=ErrorSeverity.ERROR,
        message_template="Column not found: '{column}' in {file}",
        suggestion_template="Available columns: {available}",
    )
    E304 = ErrorCode(
        code="E304",
        category=ErrorCategory.DATA,
        severity=ErrorSeverity.WARNING,
        message_template="Non-numeric values in feature data at column(s): {columns}",
        suggestion_template="Features should be numeric. Non-numeric values will be converted to NaN.",
    )

    # ------------------------------------------------------------------
    # Loading Errors (E4xx)
    # ------------------------------------------------------------------
    E400 = ErrorCode(
        code="E400",
        category=ErrorCategory.LOADING,
        severity=ErrorSeverity.ERROR,
        message_template="Failed to parse CSV: {error}",
        suggestion_template="Check delimiter and encoding settings.",
    )
    E401 = ErrorCode(
        code="E401",
        category=ErrorCategory.LOADING,
        severity=ErrorSeverity.ERROR,
        message_template="Invalid JSON configuration at line {line}: {error}",
        suggestion_template="Check JSON syntax around line {line}.",
    )
    E402 = ErrorCode(
        code="E402",
        category=ErrorCategory.LOADING,
        severity=ErrorSeverity.ERROR,
        message_template="Invalid YAML configuration at line {line}: {error}",
        suggestion_template="Check YAML indentation and syntax around line {line}.",
    )
    E403 = ErrorCode(
        code="E403",
        category=ErrorCategory.LOADING,
        severity=ErrorSeverity.ERROR,
        message_template="Archive error: {error}",
        suggestion_template="Ensure the archive is not corrupted and contains the expected files.",
    )

    # ------------------------------------------------------------------
    # Partition Errors (E5xx)
    # ------------------------------------------------------------------
    E500 = ErrorCode(
        code="E500",
        category=ErrorCategory.PARTITION,
        severity=ErrorSeverity.ERROR,
        message_template="Invalid partition specification: {details}",
        suggestion_template="Use 'train', 'test', column-based, or percentage-based partition.",
    )
    E501 = ErrorCode(
        code="E501",
        category=ErrorCategory.PARTITION,
        severity=ErrorSeverity.ERROR,
        message_template="Partition column not found: '{column}'",
        suggestion_template="Available columns: {available}",
    )
    E502 = ErrorCode(
        code="E502",
        category=ErrorCategory.PARTITION,
        severity=ErrorSeverity.ERROR,
        message_template="Partition indices out of range: max index {max_index}, data has {n_samples} samples",
        suggestion_template="Ensure partition indices are within valid range.",
    )
    E503 = ErrorCode(
        code="E503",
        category=ErrorCategory.PARTITION,
        severity=ErrorSeverity.WARNING,
        message_template="Overlapping partition indices detected",
        suggestion_template="Train and test indices should not overlap.",
    )

    # ------------------------------------------------------------------
    # Aggregation Errors (E6xx)
    # ------------------------------------------------------------------
    E600 = ErrorCode(
        code="E600",
        category=ErrorCategory.AGGREGATION,
        severity=ErrorSeverity.ERROR,
        message_template="Aggregation column not found: '{column}'",
        suggestion_template="Available columns in metadata: {available}",
    )
    E601 = ErrorCode(
        code="E601",
        category=ErrorCategory.AGGREGATION,
        severity=ErrorSeverity.WARNING,
        message_template="Group '{group}' has only {count} sample(s), below minimum {min_samples}",
        suggestion_template="Consider lowering aggregate_min_samples or reviewing your data.",
    )
    E602 = ErrorCode(
        code="E602",
        category=ErrorCategory.AGGREGATION,
        severity=ErrorSeverity.ERROR,
        message_template="Invalid aggregation method: '{method}'",
        suggestion_template="Valid methods: mean, median, vote, min, max, sum, std, first, last",
    )

    # ------------------------------------------------------------------
    # Variation/Source Errors (E7xx)
    # ------------------------------------------------------------------
    E700 = ErrorCode(
        code="E700",
        category=ErrorCategory.VARIATION,
        severity=ErrorSeverity.ERROR,
        message_template="Duplicate source name: '{name}'",
        suggestion_template="Each source must have a unique name.",
    )
    E701 = ErrorCode(
        code="E701",
        category=ErrorCategory.VARIATION,
        severity=ErrorSeverity.ERROR,
        message_template="Duplicate variation name: '{name}'",
        suggestion_template="Each variation must have a unique name.",
    )
    E702 = ErrorCode(
        code="E702",
        category=ErrorCategory.VARIATION,
        severity=ErrorSeverity.ERROR,
        message_template="Unknown variation(s) in variation_select: {names}",
        suggestion_template="Available variations: {available}",
    )
    E703 = ErrorCode(
        code="E703",
        category=ErrorCategory.VARIATION,
        severity=ErrorSeverity.ERROR,
        message_template="variation_mode='select' requires 'variation_select' to be specified",
        suggestion_template="Add 'variation_select: [\"var1\", \"var2\"]' to your configuration.",
    )
    E704 = ErrorCode(
        code="E704",
        category=ErrorCategory.VARIATION,
        severity=ErrorSeverity.ERROR,
        message_template="Sample count mismatch across sources: {details}",
        suggestion_template="All sources must have the same number of samples.",
    )

    # ------------------------------------------------------------------
    # Fold Errors (E8xx)
    # ------------------------------------------------------------------
    E800 = ErrorCode(
        code="E800",
        category=ErrorCategory.FOLD,
        severity=ErrorSeverity.ERROR,
        message_template="Invalid fold file format: {error}",
        suggestion_template="Fold files should be CSV with fold columns or JSON/YAML with fold definitions.",
    )
    E801 = ErrorCode(
        code="E801",
        category=ErrorCategory.FOLD,
        severity=ErrorSeverity.ERROR,
        message_template="Fold sample IDs do not match dataset: {details}",
        suggestion_template="Ensure fold file was generated for this dataset.",
    )
    E802 = ErrorCode(
        code="E802",
        category=ErrorCategory.FOLD,
        severity=ErrorSeverity.WARNING,
        message_template="Fold file has {fold_samples} samples, dataset has {data_samples} samples",
        suggestion_template="Folds will be adjusted to match current dataset size.",
    )

    # ------------------------------------------------------------------
    # Runtime Errors (E9xx)
    # ------------------------------------------------------------------
    E900 = ErrorCode(
        code="E900",
        category=ErrorCategory.RUNTIME,
        severity=ErrorSeverity.ERROR,
        message_template="Unexpected error during loading: {error}",
        suggestion_template="Please report this issue with the full error traceback.",
    )

    @classmethod
    def get(cls, code: str) -> Optional[ErrorCode]:
        """Get error code by code string.

        Args:
            code: Attribute name of the error code, e.g. ``"E200"``.

        Returns:
            The matching ErrorCode, or ``None`` when ``code`` does not name
            an ErrorCode attribute of the registry.
        """
        candidate = getattr(cls, code, None)
        # The lookup is a plain attribute access, so names such as "get",
        # "_codes" or "__doc__" resolve to non-ErrorCode objects; filter
        # them out so the declared Optional[ErrorCode] contract holds.
        return candidate if isinstance(candidate, ErrorCode) else None

    @classmethod
    def all_codes(cls) -> Dict[str, ErrorCode]:
        """Get all error codes.

        Returns:
            Mapping of attribute name to ErrorCode, in definition order.
            Codes inherited from base classes are included (a subclass
            attribute of the same name overrides the inherited one), so the
            result agrees with what ``get`` can resolve.
        """
        collected: Dict[str, ErrorCode] = {}
        # Walk from the most basic class to ``cls`` so that overrides in
        # more derived classes replace the inherited definitions.
        for klass in reversed(cls.__mro__):
            for name, value in vars(klass).items():
                if isinstance(value, ErrorCode):
                    collected[name] = value
        return collected
@dataclass
class DiagnosticMessage:
    """One fully rendered diagnostic.

    Attributes:
        error_code: The ErrorCode definition this message was built from.
        message: Formatted error message.
        suggestion: Formatted suggestion (if any).
        context: Additional context information.
        location: File/line location (if applicable).
    """

    error_code: ErrorCode
    message: str
    suggestion: Optional[str] = None
    context: Dict[str, Any] = field(default_factory=dict)
    location: Optional[str] = None

    @property
    def code(self) -> str:
        """Code string of the underlying error definition."""
        definition = self.error_code
        return definition.code

    @property
    def severity(self) -> ErrorSeverity:
        """Severity of the underlying error definition."""
        definition = self.error_code
        return definition.severity

    @property
    def category(self) -> ErrorCategory:
        """Category of the underlying error definition."""
        definition = self.error_code
        return definition.category

    def __str__(self) -> str:
        # Assemble the pieces in reading order: optional location prefix,
        # the "[code] message" core, then the optional suggestion.
        segments = []
        if self.location:
            segments.append(f"At {self.location}:")
        segments.append(f"[{self.code}] {self.message}")
        if self.suggestion:
            segments.append(f"\n Suggestion: {self.suggestion}")
        return " ".join(segments)

    def to_dict(self) -> Dict[str, Any]:
        """Return the message as a dictionary.

        Category and severity are emitted as their ``.value``; ``context``
        is the same dict object held by the message, not a copy.
        """
        return dict(
            code=self.code,
            category=self.category.value,
            severity=self.severity.value,
            message=self.message,
            suggestion=self.suggestion,
            location=self.location,
            context=self.context,
        )
class _PlaceholderDefaults(dict):
    """Mapping for ``str.format_map`` that keeps unknown placeholders verbatim.

    Every name that is looked up but was not supplied is recorded once in
    ``missing`` (in lookup order) and rendered back as ``{name}``.
    """

    def __init__(self, values):
        super().__init__(values)
        self.missing: List[str] = []

    def __missing__(self, key):
        if key not in self.missing:
            self.missing.append(key)
        return "{" + str(key) + "}"


class DiagnosticBuilder:
    """Builder for diagnostic messages.

    Example:
        ```python
        builder = DiagnosticBuilder()

        # Create error message
        error = builder.create(
            ErrorRegistry.E200,
            path="/path/to/file.csv"
        )

        # Create with location
        error = builder.create(
            ErrorRegistry.E401,
            line=10,
            error="Unexpected token",
            location="config.json:10"
        )
        ```
    """

    @staticmethod
    def _render_template(template: str, values: Dict[str, Any]) -> tuple:
        """Fill ``template`` with ``values``, tolerating missing placeholders.

        Args:
            template: ``str.format``-style template.
            values: Placeholder values.

        Returns:
            ``(text, missing)`` where ``missing`` lists the ``repr`` of every
            placeholder name that had no value (empty when the template was
            fully rendered).  Missing placeholders stay in ``text`` as
            ``{name}`` while all supplied ones are substituted.
        """
        try:
            return template.format(**values), []
        except KeyError as exc:
            # str(KeyError("path")) == "'path'", same text as repr("path").
            first_missing = str(exc)

        tracker = _PlaceholderDefaults(values)
        try:
            rendered = template.format_map(tracker)
        except (AttributeError, IndexError, KeyError, TypeError, ValueError):
            # A compound field on a missing name (e.g. "{obj.attr}") cannot
            # be partially rendered; fall back to the untouched template.
            return template, [first_missing]
        missing = [repr(name) for name in tracker.missing] or [first_missing]
        return rendered, missing

    def create(
        self,
        error_code: ErrorCode,
        location: Optional[str] = None,
        **kwargs,
    ) -> DiagnosticMessage:
        """Create a diagnostic message.

        Placeholders without a matching keyword argument are left in the
        text as ``{name}``; all supplied values are still substituted.  For
        the message, the missing names are appended as
        ``" (missing: 'a', 'b')"``.

        Args:
            error_code: The error code definition.
            location: Optional file/line location.
            **kwargs: Parameters for message template.  Also stored as the
                message ``context``.

        Returns:
            DiagnosticMessage instance.
        """
        # Format message
        message, missing = self._render_template(
            error_code.message_template, kwargs
        )
        if missing:
            message = f"{message} (missing: {', '.join(missing)})"

        # Format suggestion (missing placeholders are not reported here)
        suggestion = None
        if error_code.suggestion_template:
            suggestion, _ = self._render_template(
                error_code.suggestion_template, kwargs
            )

        return DiagnosticMessage(
            error_code=error_code,
            message=message,
            suggestion=suggestion,
            context=kwargs,
            location=location,
        )

    def file_not_found(self, path: str) -> DiagnosticMessage:
        """Create file not found error (``ErrorRegistry.E200``)."""
        return self.create(ErrorRegistry.E200, path=path)

    def invalid_value(
        self,
        field: str,
        value: Any,
        valid_values: List[Any],
    ) -> DiagnosticMessage:
        """Create invalid value error (``ErrorRegistry.E103``)."""
        return self.create(
            ErrorRegistry.E103,
            field=field,
            value=value,
            valid_values=valid_values,
        )

    def missing_field(self, field: str) -> DiagnosticMessage:
        """Create missing field error (``ErrorRegistry.E101``)."""
        return self.create(ErrorRegistry.E101, field=field)

    def type_error(
        self,
        field: str,
        expected: str,
        actual: str,
    ) -> DiagnosticMessage:
        """Create type error (``ErrorRegistry.E102``)."""
        return self.create(
            ErrorRegistry.E102,
            field=field,
            expected=expected,
            actual=actual,
        )
@dataclass
class DiagnosticReport:
    """An ordered collection of diagnostics for one configuration.

    Attributes:
        messages: List of diagnostic messages.
        config_path: Path to the configuration file (if any).
    """

    messages: List[DiagnosticMessage] = field(default_factory=list)
    config_path: Optional[str] = None

    def add(self, message: DiagnosticMessage) -> None:
        """Append an already-built diagnostic to the report."""
        self.messages.append(message)

    def add_error(
        self,
        error_code: ErrorCode,
        location: Optional[str] = None,
        **kwargs,
    ) -> DiagnosticMessage:
        """Build a diagnostic from ``error_code``, add it, and return it.

        ``kwargs`` are forwarded to ``DiagnosticBuilder.create`` as the
        template parameters.
        """
        created = DiagnosticBuilder().create(error_code, location, **kwargs)
        self.add(created)
        return created

    @property
    def errors(self) -> List[DiagnosticMessage]:
        """Messages whose severity is ``ErrorSeverity.ERROR``."""
        return [
            entry
            for entry in self.messages
            if entry.severity == ErrorSeverity.ERROR
        ]

    @property
    def warnings(self) -> List[DiagnosticMessage]:
        """Messages whose severity is ``ErrorSeverity.WARNING``."""
        return [
            entry
            for entry in self.messages
            if entry.severity == ErrorSeverity.WARNING
        ]

    @property
    def has_errors(self) -> bool:
        """True when at least one message has ERROR severity."""
        for entry in self.messages:
            if entry.severity == ErrorSeverity.ERROR:
                return True
        return False

    @property
    def is_valid(self) -> bool:
        """True when the report contains no ERROR-severity message."""
        if self.has_errors:
            return False
        return True

    def __str__(self) -> str:
        out = []

        # Optional header naming the configuration file.
        if self.config_path:
            out += [f"Diagnostics for: {self.config_path}", "=" * 60]

        if not self.messages:
            out.append("✓ No issues found")
            return "\n".join(out)

        failures = self.errors
        cautions = self.warnings

        # One titled section per severity, errors first; empty sections
        # are omitted.
        sections = (
            ("Errors", "✗", failures),
            ("Warnings", "⚠", cautions),
        )
        for title, marker, group in sections:
            if group:
                out.append(f"\n{title} ({len(group)}):")
                out.extend(f" {marker} {entry}" for entry in group)

        if not failures:
            out.append("\n✓ Configuration is valid")

        return "\n".join(out)

    def to_dict(self) -> Dict[str, Any]:
        """Return the report as a dictionary.

        Includes the validity flag, error and warning counts, and every
        message converted with its own ``to_dict``.
        """
        error_total = len(self.errors)
        warning_total = len(self.warnings)
        serialized = [entry.to_dict() for entry in self.messages]
        return {
            "config_path": self.config_path,
            "is_valid": self.is_valid,
            "error_count": error_total,
            "warning_count": warning_total,
            "messages": serialized,
        }