Source code for maldiamrkit.preprocessing.quality

"""Quality metrics for MALDI-TOF spectra."""

from __future__ import annotations

import warnings
from dataclasses import dataclass
from enum import Enum
from typing import TYPE_CHECKING

import numpy as np
import pandas as pd
from scipy.signal import find_peaks
from scipy.stats import median_abs_deviation, norm


[docs] class SignalMethod(str, Enum): """Method for estimating the signal level in SNR computation. Attributes ---------- max : str Maximum intensity (standard approach). median_peaks : str Median intensity of the top detected peaks (more robust). """ max = "max" median_peaks = "median_peaks"
if TYPE_CHECKING: from maldiamrkit.spectrum import MaldiSpectrum def _get_spectrum_df(spectrum: MaldiSpectrum) -> pd.DataFrame: """Return preprocessed data if available, otherwise raw.""" return spectrum.get_data(prefer="preprocessed")
[docs] @dataclass class SpectrumQualityReport: """ Quality metrics report for a single MALDI-TOF spectrum. Attributes ---------- snr : float Signal-to-noise ratio. total_ion_count : float Sum of all intensities (total ion count). peak_count : int Number of detected peaks. baseline_fraction : float Fraction of data points below noise floor (baseline contamination). noise_level : float Estimated noise level (standard deviation). dynamic_range : float Log10 ratio of max to median signal intensity. """ snr: float total_ion_count: float peak_count: int baseline_fraction: float noise_level: float dynamic_range: float
[docs] class SpectrumQuality: """ Comprehensive quality assessment for MALDI-TOF spectra. Provides methods to compute various quality metrics for individual spectra, useful for quality control and filtering poor-quality acquisitions. Parameters ---------- noise_region : tuple of (float, float), default=(19500, 20000) m/z range to use for noise estimation. Should be a region with minimal peaks (typically high m/z range). peak_prominence : float, default=1e-4 Minimum prominence for peak detection. signal_method : str, default="max" How to estimate the signal level for SNR calculation: - ``"max"``: use the maximum intensity (standard, but sensitive to single outlier peaks). - ``"median_peaks"``: use the median intensity of the top *n_top_peaks* detected peaks (more robust). n_top_peaks : int, default=10 Number of top peaks to consider when ``signal_method="median_peaks"``. Examples -------- >>> from maldiamrkit import MaldiSpectrum >>> from maldiamrkit.preprocessing.quality import SpectrumQuality >>> spec = MaldiSpectrum("spectrum.txt").preprocess() >>> qc = SpectrumQuality(noise_region=(19500, 20000)) >>> report = qc.assess(spec) >>> print(f"SNR: {report.snr:.1f}") >>> print(f"TIC: {report.total_ion_count:.2e}") >>> print(f"Peaks: {report.peak_count}") """
[docs] def __init__( self, noise_region: tuple[float, float] = (19500, 20000), peak_prominence: float = 1e-4, signal_method: str | SignalMethod = SignalMethod.max, n_top_peaks: int = 10, ): self.noise_region = noise_region self.peak_prominence = peak_prominence self.signal_method = SignalMethod(signal_method) self.n_top_peaks = n_top_peaks
[docs] def estimate_noise_level(self, spectrum: MaldiSpectrum) -> float: """ Estimate noise level using MAD in noise region. Parameters ---------- spectrum : MaldiSpectrum Spectrum to assess. Uses preprocessed data if available, otherwise raw. Returns ------- float Estimated noise standard deviation. Returns 0 if noise region is empty. """ df = _get_spectrum_df(spectrum) noise_mask = df["mass"].between(*self.noise_region) noise = df.loc[noise_mask, "intensity"] if len(noise) == 0: return 0.0 mad = np.median(np.abs(noise - np.median(noise))) # 1/Phi^{-1}(3/4), exact for Gaussian noise return mad / norm.ppf(0.75)
[docs] def estimate_mad_noise( self, spectrum: MaldiSpectrum, mz_region: tuple[float, float] | None = None, constant: float = 1.4826, ) -> float: """ Estimate noise level via median absolute deviation (MAD). Uses :func:`scipy.stats.median_abs_deviation` on the intensities in the selected m/z region and multiplies the raw MAD by ``constant``. The default ``constant = 1.4826 = 1 / Phi^{-1}(3/4)`` rescales MAD to match the standard deviation of a Gaussian (Rousseeuw & Croux 1993), matching the convention used by :meth:`estimate_noise_level`. Parameters ---------- spectrum : MaldiSpectrum Spectrum to assess. Uses preprocessed data if available, otherwise raw. mz_region : tuple of (float, float), optional m/z range to use for noise estimation. When ``None`` (the default) falls back to ``self.noise_region``. constant : float, default=1.4826 Scale factor applied to the raw MAD. Use ``1.4826`` for a standard-normal-scaled estimator (equivalent to ``scale='normal'`` in scipy). Returns ------- float Estimated noise level. Returns ``0.0`` when the selected region contains no data points. """ region = self.noise_region if mz_region is None else mz_region df = _get_spectrum_df(spectrum) noise = df.loc[df["mass"].between(*region), "intensity"].to_numpy(dtype=float) if len(noise) == 0: return 0.0 mad = median_abs_deviation(noise, scale=1.0, nan_policy="omit") return float(constant * mad)
[docs] def estimate_baseline_fraction(self, spectrum: MaldiSpectrum) -> float: """ Estimate fraction of intensity below noise floor. This indicates how much of the spectrum is dominated by baseline rather than signal. High values suggest poor acquisition quality or excessive baseline. Parameters ---------- spectrum : MaldiSpectrum Spectrum to assess. Uses preprocessed data if available, otherwise raw. Returns ------- float Fraction of data points below 2x noise level (0 to 1). """ df = _get_spectrum_df(spectrum) noise_level = self.estimate_noise_level(spectrum) if noise_level == 0: return 0.0 baseline_threshold = 2 * noise_level baseline_points = (df["intensity"] < baseline_threshold).sum() return baseline_points / len(df)
[docs] def estimate_dynamic_range(self, spectrum: MaldiSpectrum) -> float: """ Estimate dynamic range as log10 ratio of max to median signal. Higher values indicate better separation between signal and background. Parameters ---------- spectrum : MaldiSpectrum Spectrum to assess. Uses preprocessed data if available, otherwise raw. Returns ------- float Log10 ratio of max to median intensity. Returns 0 if median is zero. """ df = _get_spectrum_df(spectrum) # Exclude very low values (likely noise/baseline) signal_mask = df["intensity"] > df["intensity"].quantile(0.1) if signal_mask.sum() == 0: return 0.0 max_signal = df.loc[signal_mask, "intensity"].max() median_signal = df.loc[signal_mask, "intensity"].median() if median_signal <= 0: return 0.0 return np.log10(max_signal / median_signal)
[docs] def count_peaks(self, spectrum: MaldiSpectrum) -> int: """ Count the number of peaks in the spectrum. Parameters ---------- spectrum : MaldiSpectrum Spectrum to assess. Uses preprocessed data if available, otherwise raw. Returns ------- int Number of detected peaks. """ df = _get_spectrum_df(spectrum) noise_level = self.estimate_noise_level(spectrum) min_prominence = max(self.peak_prominence, noise_level * 3) peaks, _ = find_peaks(df["intensity"].values, prominence=min_prominence) return len(peaks)
[docs] def assess(self, spectrum: MaldiSpectrum) -> SpectrumQualityReport: """ Perform full quality assessment of a spectrum. Parameters ---------- spectrum : MaldiSpectrum Spectrum to assess. Uses preprocessed data if available, otherwise raw. Returns ------- SpectrumQualityReport Dataclass containing all quality metrics. """ df = _get_spectrum_df(spectrum) mz_max = df["mass"].max() if self.noise_region[0] > mz_max: warnings.warn( f"noise_region {self.noise_region} is outside spectrum range " f"(max m/z={mz_max:.0f}). Quality metrics will be unreliable. " f"Adjust noise_region to fall within the trimmed spectrum range.", UserWarning, stacklevel=2, ) noise_level = self.estimate_noise_level(spectrum) snr = estimate_snr( spectrum, self.noise_region, signal_method=self.signal_method, n_top_peaks=self.n_top_peaks, ) return SpectrumQualityReport( snr=snr, total_ion_count=df["intensity"].sum(), peak_count=self.count_peaks(spectrum), baseline_fraction=self.estimate_baseline_fraction(spectrum), noise_level=noise_level, dynamic_range=self.estimate_dynamic_range(spectrum), )
[docs] def estimate_snr( spectrum: MaldiSpectrum, noise_region: tuple[float, float] = (19500, 20000), signal_method: str | SignalMethod = SignalMethod.max, n_top_peaks: int = 10, ) -> float: """ Estimate signal-to-noise ratio of a spectrum. Uses median absolute deviation (MAD) in a noise region to estimate noise level. The signal level is determined by *signal_method*. Parameters ---------- spectrum : MaldiSpectrum Spectrum to assess. Uses preprocessed data if available, otherwise raw. noise_region : tuple of (float, float), default=(19500, 20000) m/z range to use for noise estimation. Should be a region with minimal peaks (typically high m/z range). signal_method : str, default="max" How to estimate the signal level: - ``"max"``: maximum intensity (standard approach). - ``"median_peaks"``: median intensity of the top *n_top_peaks* detected peaks. More robust to single outlier peaks. n_top_peaks : int, default=10 Number of top peaks to consider when ``signal_method="median_peaks"``. Returns ------- float Estimated signal-to-noise ratio, capped at ``1e6``. Returns ``1e6`` when the noise standard deviation is zero or the configured noise region contains no data points. Raises ------ ValueError If ``signal_method`` is not one of 'max' or 'median_peaks'. Notes ----- The MAD-to-standard-deviation conversion factor (1.4826) assumes normally distributed noise. Examples -------- >>> from maldiamrkit import MaldiSpectrum >>> from maldiamrkit.preprocessing import estimate_snr >>> spec = MaldiSpectrum("spectrum.txt").preprocess() >>> snr = estimate_snr(spec) >>> print(f"SNR: {snr:.1f}") >>> snr_robust = estimate_snr(spec, signal_method="median_peaks") """ signal_method = SignalMethod(signal_method) df = _get_spectrum_df(spectrum) noise_mask = df["mass"].between(*noise_region) noise = df.loc[noise_mask, "intensity"] _MAX_SNR = 1e6 # finite cap to avoid inf propagating downstream if len(noise) == 0: warnings.warn( f"Noise region {noise_region} contains no data points. " f"SNR capped at {_MAX_SNR:.0e}.", UserWarning, stacklevel=2, ) return _MAX_SNR mad = np.median(np.abs(noise - np.median(noise))) # MAD-to-std conversion factor 1.4826 assumes normally distributed # noise (exact for Gaussian; see Rousseeuw & Croux 1993). noise_std = 1.4826 * mad if signal_method == "max": signal = df["intensity"].max() else: # median_peaks peaks, _ = find_peaks(df["intensity"].values) if len(peaks) == 0: signal = df["intensity"].max() else: peak_intensities = df["intensity"].values[peaks] top_n = np.sort(peak_intensities)[-n_top_peaks:] signal = np.median(top_n) if noise_std <= 0: return _MAX_SNR return min(signal / noise_std, _MAX_SNR)