Source code for maldiamrkit.susceptibility.label_encoder

"""Label encoding utilities for AMR classification.

Maps clinical resistance categories (R/I/S) to binary or numeric labels.

Examples
--------
>>> from maldiamrkit.susceptibility import LabelEncoder
>>> enc = LabelEncoder()
>>> enc.fit_transform(["R", "S", "I", "R", "S"])
array([1, 0, 0, 1, 0])

>>> enc = LabelEncoder(intermediate="resistant")
>>> enc.transform(["I"])
array([1])
"""

from __future__ import annotations

import warnings
from enum import Enum

import numpy as np
import pandas as pd
from sklearn.base import BaseEstimator, TransformerMixin



[docs]
class IntermediateHandling(str, Enum):
    """Strategy for handling intermediate (I) resistance labels.

    Attributes
    ----------
    susceptible : str
        Map intermediate to susceptible (0).
    resistant : str
        Map intermediate to resistant (1).
    drop : str
        Remove intermediate samples.
    nan : str
        Map intermediate to NaN.
    """

    susceptible = "susceptible"
    resistant = "resistant"
    drop = "drop"
    nan = "nan"




[docs]
class LabelEncoder(BaseEstimator, TransformerMixin):
    """Encode R/I/S resistance labels to binary (0/1).

    Supports configurable handling of intermediate (I) labels.
    Accepts both 1-D arrays (single drug) and 2-D DataFrames
    (multiple drugs).

    Parameters
    ----------
    intermediate : str, default="susceptible"
        How to handle intermediate ("I") labels:

        - ``"susceptible"``: treat I as susceptible (0) - conservative,
          avoids false resistance calls.
        - ``"resistant"``: treat I as resistant (1) - stricter, avoids
          missing resistance.
        - ``"drop"``: remove samples with I labels entirely.
          Note: this changes the output array length (samples with I
          labels are excluded) and is not compatible with sklearn
          pipelines that expect consistent sample counts.
        - ``"nan"``: map I to ``NaN``. Useful for multi-drug encoding
          where each drug is handled independently. Output dtype is
          ``float64`` (required to hold ``NaN``).

    Attributes
    ----------
    classes_ : ndarray
        Array of ``[0, 1]`` after fitting.

    Raises
    ------
    ValueError
        If ``intermediate`` is not one of the accepted values.
    """

    _RESISTANT = {"R", "r", "resistant", "Resistant"}
    _SUSCEPTIBLE = {"S", "s", "susceptible", "Susceptible"}
    _INTERMEDIATE = {"I", "i", "intermediate", "Intermediate"}


[docs]
    def __init__(
        self,
        intermediate: str | IntermediateHandling = IntermediateHandling.susceptible,
    ) -> None:
        self.intermediate = IntermediateHandling(intermediate)



[docs]
    def fit(self, y: np.ndarray | pd.DataFrame, **kwargs: object) -> LabelEncoder:
        """Fit the encoder (no-op, just sets ``classes_``).

        Parameters
        ----------
        y : array-like
            Labels to learn from (unused beyond validation).
        **kwargs : dict
            Additional keyword arguments (unused, accepted for sklearn
            compatibility).

        Returns
        -------
        self
        """
        self.classes_ = np.array([0, 1])
        return self



[docs]
    def transform(self, y: np.ndarray | pd.DataFrame) -> np.ndarray | pd.DataFrame:
        """Transform labels to binary.

        Parameters
        ----------
        y : array-like or pd.DataFrame
            String labels (R/I/S or resistant/intermediate/susceptible).
            If a DataFrame is passed, each column is encoded independently.

        Returns
        -------
        ndarray or pd.DataFrame
            Binary encoded labels. Returns a DataFrame when the input is
            a DataFrame (or a 2-D ndarray), preserving column names and
            index. Returns a 1-D ndarray for 1-D input.
        """
        if isinstance(y, pd.DataFrame):
            return y.apply(self._encode_column)
        arr = np.asarray(y)
        if arr.ndim == 2:
            return pd.DataFrame(
                {i: self._encode_column(arr[:, i]) for i in range(arr.shape[1])}
            )
        return self._encode_column(arr)


    def _encode_column(self, y: np.ndarray) -> np.ndarray:
        """Encode a single 1-D label array."""
        y = np.asarray(y)
        result = np.full(len(y), np.nan)

        for i, label in enumerate(y):
            result[i] = self._classify_label(str(label))

        self._validate_unrecognized(y, result)

        if self.intermediate == "drop":
            mask = ~np.isnan(result)
            n_dropped = int((~mask).sum())
            if n_dropped:
                warnings.warn(
                    f"intermediate='drop' removed {n_dropped} intermediate "
                    "samples. Output length differs from input - incompatible "
                    "with sklearn pipelines that expect consistent sample counts.",
                    UserWarning,
                    stacklevel=2,
                )
            return result[mask].astype(int)
        if self.intermediate == "nan":
            return result
        return result.astype(int)

    def _classify_label(self, label_str: str) -> float:
        """Map a single label string to its numeric value (1, 0, or NaN)."""
        if label_str in self._RESISTANT:
            return 1.0
        if label_str in self._SUSCEPTIBLE:
            return 0.0
        if label_str in self._INTERMEDIATE:
            if self.intermediate == "susceptible":
                return 0.0
            if self.intermediate == "resistant":
                return 1.0
        return float("nan")

    def _validate_unrecognized(self, y: np.ndarray, result: np.ndarray) -> None:
        """Raise ValueError if any labels are unrecognized (unexpected NaN)."""
        unrecognized_mask = np.isnan(result)
        if self.intermediate in ("drop", "nan"):
            intermediate_mask = np.array(
                [str(label) in self._INTERMEDIATE for label in y]
            )
            unexpected_nan = unrecognized_mask & ~intermediate_mask
        else:
            unexpected_nan = unrecognized_mask

        if unexpected_nan.any():
            bad_labels = [str(y[i]) for i in np.where(unexpected_nan)[0]]
            raise ValueError(
                f"Unrecognized labels: {bad_labels}. "
                f"Expected one of R/S/I (or resistant/susceptible/intermediate)."
            )


[docs]
    def fit_transform(self, y, **kwargs):
        """Fit the encoder and transform labels in one step.

        Parameters
        ----------
        y : array-like or pd.DataFrame
            String labels (R/I/S or resistant/intermediate/susceptible).
            If a DataFrame is passed, each column is encoded independently.
        **kwargs : dict
            Additional keyword arguments (unused, accepted for sklearn
            compatibility).

        Returns
        -------
        ndarray or pd.DataFrame
            Binary encoded labels. Returns a DataFrame when the input is
            a DataFrame, preserving column names and index.
        """
        return self.fit(y).transform(y)