# Source code for nirs4all.cli.installation_test

"""
Installation testing utilities for nirs4all CLI.
"""

import importlib
import os
import sys
import tempfile
import time
from typing import Dict, List, Optional, Tuple

import numpy as np

from nirs4all.core.logging import get_logger

logger = get_logger(__name__)


def check_dependency(name: str, min_version: Optional[str] = None) -> Tuple[bool, str]:
    """
    Check if a dependency is importable and optionally verify its minimum version.

    Args:
        name: Import name of the dependency/module to check.
        min_version: Minimum required version (optional). It is only enforced
            when the module exposes a ``__version__`` attribute and the
            ``packaging`` library is importable.

    Returns:
        Tuple of (is_available, version_string). ``version_string`` is the
        module's ``__version__`` converted to ``str`` ('unknown' when absent),
        or a human-readable reason when ``is_available`` is False.
    """
    try:
        module = importlib.import_module(name)
    except ImportError:
        return False, "Not installed"
    except Exception as exc:
        # A package that is present but broken can raise anything at import
        # time; a diagnostic helper should report that instead of crashing.
        return False, f"Import failed ({type(exc).__name__}: {exc})"

    # __version__ is not guaranteed to be a string (or to be set at all).
    raw_version = getattr(module, '__version__', None)
    version = 'unknown' if raw_version is None else str(raw_version)

    if not min_version or version == 'unknown':
        return True, version

    try:
        from packaging import version as pkg_version
    except ImportError:
        # Without packaging the minimum version cannot be verified; the
        # dependency is still reported as available.
        return True, version

    required = pkg_version.parse(min_version)
    try:
        installed = pkg_version.parse(version)
    except pkg_version.InvalidVersion:
        # Non-PEP 440 version string: cannot be compared, so do not reject it.
        return True, version

    if installed < required:
        return False, f"{version} (< {min_version} required)"
    return True, version
def _import_component(module_name, attr_names):
    """Import ``module_name`` and verify that every name in ``attr_names`` exists on it.

    Raises:
        ImportError: if the module cannot be imported or a name is missing.
    """
    module = importlib.import_module(module_name)
    for attr_name in attr_names:
        if not hasattr(module, attr_name):
            raise ImportError(f"cannot import name {attr_name!r} from {module_name!r}")
    return module


def test_installation() -> bool:
    """
    Test basic installation and show dependency versions.

    Logs the Python version, the status of every required and optional
    dependency (via ``check_dependency``), and whether the core nirs4all
    components can be imported. Each component is checked independently so
    that one failure does not hide the status of the others.

    Returns:
        True if the Python version is supported, all required dependencies
        are available and all nirs4all components import; False otherwise.
        Missing optional ML frameworks only produce warnings.
    """
    logger.info("Testing NIRS4ALL Installation...")
    logger.info("=" * 50)

    # Core required dependencies from pyproject.toml
    required_deps = {
        'numpy': '1.20.0',
        'pandas': '2.0.0',
        'scipy': '1.5.0',
        'sklearn': '0.24.0',  # scikit-learn is imported as sklearn
        'pywt': '1.1.0',  # PyWavelets is imported as pywt
        'joblib': '0.16.0',
        'jsonschema': '3.2.0',
        'optuna': '2.0.0',
        'matplotlib': '3.0.0',
        'polars': '0.18.0',
        'yaml': '5.4.0',  # pyyaml is imported as yaml
        'seaborn': '0.11.0',
        'h5py': '3.0.0',
        'packaging': '20.0',
        'shap': '0.41.0',
    }

    # Optional ML framework dependencies
    optional_deps = {
        'tensorflow': '2.0.0',
        'torch': '1.4.0',
        'keras': None,
        'jax': None,
    }

    # Test Python version (check first so an unsupported interpreter is not
    # reported as a success before being rejected)
    python_version = f"{sys.version_info.major}.{sys.version_info.minor}.{sys.version_info.micro}"
    if sys.version_info < (3, 7):
        logger.error(f"Python version {python_version} is not supported (requires >=3.7)")
        return False
    logger.success(f"Python: {python_version}")
    logger.info("")

    # Test required dependencies
    logger.info("\nRequired Dependencies:")
    all_required_ok = True
    for dep_name, min_version in required_deps.items():
        is_available, version = check_dependency(dep_name, min_version)
        if is_available:
            logger.success(f" {dep_name}: {version}")
        else:
            logger.error(f" {dep_name}: {version}")
            all_required_ok = False
    logger.info("")

    # Test optional dependencies (missing ones are warnings, not failures)
    logger.info("\nOptional ML Frameworks:")
    optional_available = {}
    for dep_name, min_version in optional_deps.items():
        is_available, version = check_dependency(dep_name, min_version)
        if is_available:
            logger.success(f" {dep_name}: {version}")
        else:
            logger.warning(f" {dep_name}: {version}")
        optional_available[dep_name] = is_available
    logger.info("")

    # Test nirs4all itself: (module path, names that must be importable from it)
    logger.info("\nNIRS4ALL Components:")
    components = (
        ("nirs4all.pipeline.runner", ("PipelineRunner",)),
        ("nirs4all.data.dataset", ("SpectroDataset",)),
        ("nirs4all.controllers", ("register_controller", "CONTROLLER_REGISTRY")),
        ("nirs4all.operators.transforms", ("StandardNormalVariate", "SavitzkyGolay")),
        ("nirs4all.utils.backend", ("is_tensorflow_available", "is_torch_available", "is_gpu_available")),
    )
    for module_name, attr_names in components:
        try:
            module = _import_component(module_name, attr_names)
            detail = ""
            if "CONTROLLER_REGISTRY" in attr_names:
                detail = f" ({len(module.CONTROLLER_REGISTRY)} controllers registered)"
        except Exception as e:
            # Report any import-time failure rather than aborting the whole
            # diagnostic, then keep checking the remaining components.
            logger.error(f" nirs4all import error: {e}")
            all_required_ok = False
            continue
        logger.success(f" {module_name}: OK{detail}")
    logger.info("")

    # Summary
    if not all_required_ok:
        logger.error("Basic installation test FAILED!")
        logger.info("Please install missing dependencies using:")
        logger.info(" pip install nirs4all")
        return False

    logger.success("Basic installation test PASSED!")
    logger.info("All required dependencies are available")
    available_frameworks = [name for name, available in optional_available.items() if available]
    if available_frameworks:
        logger.info(f"Available ML frameworks: {', '.join(available_frameworks)}")
    else:
        logger.info("No optional ML frameworks detected")
    return True
def test_integration() -> bool:
    """
    Run integration test with sklearn, tensorflow, and optuna pipelines.

    Based on examples Q1.py, Q1_finetune.py, Q2.py but using synthetic data.
    Monitors execution time of each test.

    The TensorFlow and Optuna tests are skipped (and counted as passed) when
    their optional dependency cannot be imported.

    Returns:
        True if at least one pipeline test passed (partial success is
        reported with a warning), False if the required modules cannot be
        imported or every pipeline test failed.
    """
    import shutil

    logger.info("NIRS4ALL Integration Test...")
    logger.info("=" * 50)

    # NOTE: test_installation() is not run first; import problems are
    # reported directly by the import check below.
    logger.info("\n" + "=" * 50)
    logger.info("Running Pipeline Integration Tests...")
    logger.info("=" * 50)

    # Store test results with timing: (name, success, elapsed_seconds, error)
    test_results = []

    try:
        # Import required modules based on examples
        from nirs4all.pipeline import PipelineConfigs, PipelineRunner
        from nirs4all.data import DatasetConfigs
        from sklearn.preprocessing import MinMaxScaler
        from sklearn.model_selection import ShuffleSplit
        from sklearn.ensemble import RandomForestRegressor
        from sklearn.cross_decomposition import PLSRegression
        from nirs4all.operators.transforms import StandardNormalVariate
        logger.success("Successfully imported NIRS4ALL modules")
    except ImportError as e:
        logger.error(f"Failed to import required modules: {e}")
        return False

    def require(condition, message):
        """Raise AssertionError(message) unless condition holds.

        Used instead of ``assert`` so the checks still run under ``python -O``.
        """
        if not condition:
            raise AssertionError(message)

    def create_synthetic_dataset_files(temp_dir, task_type="regression", n_samples=100, n_features=500):
        """Create synthetic CSV files matching the expected format.

        Writes Xcal.csv/Xval.csv (';'-separated) and Ycal.csv/Yval.csv into
        ``temp_dir`` using a 70/30 train/test split, and returns ``temp_dir``.
        """
        # Seeds NumPy's *global* RNG on purpose: besides making the data
        # reproducible, unseeded components run afterwards (presumably e.g.
        # ShuffleSplit without random_state) may draw from the same state.
        np.random.seed(42)

        # Create realistic spectral-like data
        X = np.random.normal(0.1, 0.05, (n_samples, n_features))
        X = np.clip(X, 0.001, 0.5)  # Typical absorbance range

        # Add some spectral structure: a Gaussian bump every 100 features
        peak_width = 20
        two_sigma_sq = 2 * (peak_width / 3) ** 2
        for i in range(0, n_features, 100):
            for j in range(max(0, i - peak_width), min(n_features, i + peak_width)):
                X[:, j] += 0.02 * np.exp(-((j - i) ** 2) / two_sigma_sq)

        # Create wavelength-like column names
        wavelengths = np.linspace(2500, 400, n_features)  # Typical NIR range
        columns = [f"X{int(w)}" for w in wavelengths]

        # Split data
        n_train = int(0.7 * n_samples)
        X_train, X_test = X[:n_train], X[n_train:]

        if task_type == "regression":
            # Regression target correlated with spectral features
            y = (X[:, 100:200].mean(axis=1) * 100 + np.random.normal(0, 2, n_samples))
        else:  # classification
            # Binary target based on spectral threshold
            band_mean = X[:, 150:250].mean(axis=1)
            y = (band_mean > np.median(band_mean)).astype(int)
        y_train, y_test = y[:n_train], y[n_train:]

        # Create CSV files
        import pandas as pd

        target_column = "value" if task_type == "regression" else "label"

        # Training data
        pd.DataFrame(X_train, columns=columns).to_csv(
            os.path.join(temp_dir, "Xcal.csv"), index=False, sep=";"
        )
        pd.DataFrame(y_train, columns=[target_column]).to_csv(
            os.path.join(temp_dir, "Ycal.csv"), index=False
        )

        # Test data
        pd.DataFrame(X_test, columns=columns).to_csv(
            os.path.join(temp_dir, "Xval.csv"), index=False, sep=";"
        )
        pd.DataFrame(y_test, columns=[target_column]).to_csv(
            os.path.join(temp_dir, "Yval.csv"), index=False
        )

        return temp_dir

    def run_pipeline(pipeline, config_name, dataset_dir):
        """Run ``pipeline`` on the dataset in ``dataset_dir`` and return its predictions."""
        pipeline_config = PipelineConfigs(pipeline, config_name)
        dataset_config = DatasetConfigs(dataset_dir)
        runner = PipelineRunner(save_artifacts=False, save_charts=False, verbose=0)
        predictions, _ = runner.run(pipeline_config, dataset_config)
        require(predictions is not None, "No predictions returned")
        return predictions

    def run_test(test_name, test_func):
        """Run a test with timing and error handling; record the outcome in test_results."""
        logger.info(f"\nTest: {test_name}")
        # perf_counter is monotonic, so durations are immune to clock changes
        start_time = time.perf_counter()
        try:
            success = test_func()
        except Exception as e:
            elapsed = time.perf_counter() - start_time
            logger.error(f" {test_name} failed with error ({elapsed:.2f}s): {e}")
            test_results.append((test_name, False, elapsed, str(e)))
            return False

        elapsed = time.perf_counter() - start_time
        if success:
            logger.success(f" {test_name} completed successfully ({elapsed:.2f}s)")
            test_results.append((test_name, True, elapsed, None))
            return True

        logger.error(f" {test_name} failed ({elapsed:.2f}s)")
        test_results.append((test_name, False, elapsed, "Test function returned False"))
        return False

    def test_sklearn_pipeline():
        """Test sklearn-based pipeline (based on Q2.py) - Extended version."""
        # Create temporary dataset with more samples for thorough testing
        temp_dir = tempfile.mkdtemp()
        try:
            create_synthetic_dataset_files(temp_dir, "regression", 120, 500)  # More samples and features

            # Extended pipeline based on Q2.py example with more models
            pipeline = [
                MinMaxScaler(feature_range=(0.1, 0.8)),
                StandardNormalVariate(),
                ShuffleSplit(n_splits=4),  # More folds
                {"model": PLSRegression(n_components=2)},
                {"model": PLSRegression(n_components=3)},
                {"model": PLSRegression(n_components=4)},
                {"model": RandomForestRegressor(n_estimators=50, max_depth=8, random_state=42)},
                {"model": RandomForestRegressor(n_estimators=30, max_depth=5, random_state=123)},
            ]

            predictions = run_pipeline(pipeline, "sklearn_test", temp_dir)

            # Verify results
            num_predictions = predictions.num_predictions
            logger.info(f" Pipeline executed successfully, {num_predictions} predictions generated")

            # Additional validation
            require(num_predictions >= 10, f"Expected at least 10 predictions, got {num_predictions}")
            return True
        finally:
            # Cleanup (best effort)
            shutil.rmtree(temp_dir, ignore_errors=True)

    def test_tensorflow_pipeline():
        """Test TensorFlow-based pipeline (based on Q2.py)."""
        try:
            import tensorflow  # noqa: F401 -- availability check only
            from nirs4all.operators.models.tensorflow.nicon import nicon
        except ImportError:
            logger.warning(" TensorFlow/NIRS models not available, skipping test")
            return True  # Skip but don't fail

        # Create temporary dataset
        temp_dir = tempfile.mkdtemp()
        try:
            create_synthetic_dataset_files(temp_dir, "regression", 60, 300)

            # Pipeline based on Q2.py example
            pipeline = [
                MinMaxScaler(),
                StandardNormalVariate(),
                ShuffleSplit(n_splits=2),  # Fewer splits for speed
                {
                    "model": nicon,
                    "train_params": {
                        "epochs": 3,  # Very few epochs for speed
                        "patience": 10,
                        "verbose": 0
                    },
                },
            ]

            run_pipeline(pipeline, "tensorflow_test", temp_dir)
            logger.info(" TensorFlow model trained successfully")
            return True
        finally:
            # Cleanup (best effort)
            shutil.rmtree(temp_dir, ignore_errors=True)

    def test_optuna_pipeline():
        """Test Optuna hyperparameter optimization (based on Q1_finetune.py) - Extended version."""
        try:
            import optuna  # noqa: F401 -- availability check only
        except ImportError:
            logger.warning(" Optuna not available, skipping test")
            return True  # Skip but don't fail

        # Create temporary dataset with more samples
        temp_dir = tempfile.mkdtemp()
        try:
            create_synthetic_dataset_files(temp_dir, "regression", 100, 400)  # More samples

            # Extended pipeline based on Q1_finetune.py example with more comprehensive optimization
            pipeline = [
                MinMaxScaler(),
                StandardNormalVariate(),
                ShuffleSplit(n_splits=3),  # Keep splits moderate for speed
                {
                    "model": PLSRegression(),
                    "name": "PLS-Finetuned-Extended",
                    "finetune_params": {
                        "n_trials": 15,  # More trials for better optimization
                        "verbose": 0,
                        "approach": "grouped",  # Test grouped approach
                        "eval_mode": "best",
                        "model_params": {
                            'n_components': ('int', 1, 5),  # Safe range for small training sets
                        },
                    }
                },
                # Add a second optimization test
                {
                    "model": PLSRegression(),
                    "name": "PLS-Single-Optim",
                    "finetune_params": {
                        "n_trials": 25,  # More trials for comprehensive optimization
                        "verbose": 0,
                        "approach": "single",  # Test single approach
                        "model_params": {
                            'n_components': ('int', 1, 5),  # Safe range for small training sets
                        },
                    }
                },
            ]

            predictions = run_pipeline(pipeline, "optuna_test", temp_dir)

            # Verify results
            num_predictions = predictions.num_predictions
            logger.info(f" Optuna optimization completed, {num_predictions} predictions generated")

            # Additional validation - should have predictions from both optimizations
            require(
                num_predictions >= 4,
                f"Expected at least 4 predictions from optimization, got {num_predictions}",
            )
            return True
        finally:
            # Cleanup (best effort)
            shutil.rmtree(temp_dir, ignore_errors=True)

    # Run all tests
    tests = [
        ("Sklearn Extended Pipeline (Multiple PLS + RandomForest)", test_sklearn_pipeline),
        ("TensorFlow Pipeline (NICON Neural Network)", test_tensorflow_pipeline),
        ("Optuna Extended Pipeline (Comprehensive PLS Optimization)", test_optuna_pipeline),
    ]

    success_count = 0
    for test_name, test_func in tests:
        if run_test(test_name, test_func):
            success_count += 1

    # Print summary with timing
    logger.info("\n" + "=" * 50)
    logger.info("Integration Test Summary")
    logger.info("=" * 50)

    total_time = sum(result[2] for result in test_results)
    for name, success, elapsed, error in test_results:
        if success:
            logger.success(f"PASS {name}: {elapsed:.2f}s")
        else:
            logger.error(f"FAIL {name}: {elapsed:.2f}s")
            if error:
                logger.info(f" Error: {error}")

    logger.info(f"\nTotal execution time: {total_time:.2f}s")

    if success_count == len(tests):
        logger.success("Integration test PASSED!")
        logger.success(f"All {len(tests)} pipeline tests completed successfully")
        logger.info("NIRS4ALL is ready for use!")
        return True

    logger.warning(f"Partial success: {success_count}/{len(tests)} tests passed")
    if success_count > 0:
        logger.success("Basic pipeline functionality is working")
        logger.warning("Some optional features may have issues")
        return True  # Return True for partial success

    logger.error("Integration test FAILED!")
    logger.error("Pipeline execution is not working properly")
    return False