Source code for maldiamrkit.similarity.pairwise

"""Pairwise spectral distance matrix computation."""

from __future__ import annotations

from typing import TYPE_CHECKING

import numpy as np
import pandas as pd
from joblib import Parallel, delayed

from .metrics import METRIC_REGISTRY, SpectralMetric, spectral_distance

if TYPE_CHECKING:
    from maldiamrkit.spectrum import MaldiSpectrum

_BINNED_METRICS = frozenset({"cosine", "spectral_contrast_angle", "pearson"})


[docs] def pairwise_distances( spectra: list[MaldiSpectrum] | pd.DataFrame, metric: str | SpectralMetric = SpectralMetric.wasserstein, n_jobs: int = 1, ) -> np.ndarray: """Compute an *n x n* symmetric distance matrix. Parameters ---------- spectra : list[MaldiSpectrum] or DataFrame If a :class:`~pandas.DataFrame` (binned feature matrix, rows are samples), row vectors are used. If a list of :class:`~maldiamrkit.spectrum.MaldiSpectrum`, raw/preprocessed data is used. metric : str or SpectralMetric, default="wasserstein" Key in :data:`~maldiamrkit.similarity.METRIC_REGISTRY`. n_jobs : int, default=1 Number of parallel jobs for pairwise computation. Returns ------- np.ndarray Symmetric distance matrix of shape ``(n, n)`` with zeros on the diagonal. Raises ------ ValueError If *metric* is not in the registry. """ metric = SpectralMetric(metric) # Fast path: binned metric on DataFrame input. if isinstance(spectra, pd.DataFrame) and metric in _BINNED_METRICS: return _pairwise_binned(spectra, metric) # General path: compute upper triangle with joblib parallelization. n = len(spectra) if isinstance(spectra, list) else len(spectra) return _pairwise_general(spectra, metric, n, n_jobs)
def _pairwise_binned(X: pd.DataFrame, metric: str) -> np.ndarray: """Fast path using sklearn for binned feature matrices.""" from sklearn.metrics import pairwise_distances as sklearn_pd metric_fn = METRIC_REGISTRY[metric] arr = X.values def _metric(a: np.ndarray, b: np.ndarray) -> float: return metric_fn(a, b) D = sklearn_pd(arr, metric=_metric) np.fill_diagonal(D, 0.0) return D def _pairwise_general( spectra: list | pd.DataFrame, metric: str, n: int, n_jobs: int, ) -> np.ndarray: """General path: upper-triangle computation with joblib.""" pairs = [(i, j) for i in range(n) for j in range(i + 1, n)] if isinstance(spectra, pd.DataFrame): rows = [spectra.iloc[i].values for i in range(n)] else: rows = spectra distances = Parallel(n_jobs=n_jobs, prefer="processes")( delayed(spectral_distance)(rows[i], rows[j], metric=metric) for i, j in pairs ) D = np.zeros((n, n), dtype=np.float64) for (i, j), d in zip(pairs, distances, strict=True): D[i, j] = d D[j, i] = d return D