# Source code for maldiamrkit.data.duplicates
"""Unified duplicate spectrum handling.
Provides the :class:`DuplicateStrategy` enum and helper functions for
applying duplicate-handling strategies consistently across all layout
classes, :func:`~maldiamrkit.alignment.raw_warping.create_raw_input`,
and downstream transformers.
"""
from __future__ import annotations
import logging
from enum import Enum
import pandas as pd
logger = logging.getLogger(__name__)
# [docs]
class DuplicateStrategy(str, Enum):
    """Enumerate the ways duplicated spectrum identifiers can be resolved.

    Every member is also a ``str`` whose value equals its name, so the
    plain string (for example ``"first"``) can be passed to
    ``DuplicateStrategy(...)`` to obtain the matching member.

    Attributes
    ----------
    first : str
        Of the rows sharing an ID, only the first one is kept.
    last : str
        Of the rows sharing an ID, only the last one is kept.
    drop : str
        **Every** row whose ID occurs more than once is removed.
    keep_all : str
        All replicates are retained and their IDs are made distinct
        with a suffix (e.g. ``_rep1``, ``_rep2`` or the
        target-position value).
    average : str
        All replicates are retained for later averaging. An
        ``_original_id`` column is added so that loaders /
        transformers can group the replicates and average their
        spectra.
    """

    first = "first"
    last = "last"
    drop = "drop"
    keep_all = "keep_all"
    average = "average"
def apply_metadata_strategy(
    df: pd.DataFrame,
    strategy: DuplicateStrategy,
    *,
    id_col: str = "ID",
    suffix_col: str | None = None,
) -> pd.DataFrame:
    """Resolve duplicated identifiers in a metadata table.

    Parameters
    ----------
    df : pd.DataFrame
        Metadata table; its ``id_col`` column may hold repeated values.
    strategy : DuplicateStrategy
        How to resolve the repeats. Anything accepted by
        ``DuplicateStrategy(...)`` (such as the plain string value)
        works.
    id_col : str, default ``"ID"``
        Column that holds the identifiers.
    suffix_col : str or None
        Only relevant for ``"keep_all"`` and ``"average"``. When given
        (and present in *df*), IDs are rewritten as
        ``{id}_{suffix_col_value}`` (e.g. a target-position column);
        otherwise a running ``_rep1``, ``_rep2``, ... suffix is used.

    Returns
    -------
    pd.DataFrame
        *df* itself when no ID is repeated; otherwise a new frame that
        is filtered (``first`` / ``last`` / ``drop``) or has rewritten
        IDs (``keep_all`` / ``average``). For ``average`` the
        untouched IDs are additionally stored in ``_original_id``.
    """
    strategy = DuplicateStrategy(strategy)

    repeated = df.duplicated(subset=id_col, keep=False)
    if not repeated.any():
        # Nothing to resolve: hand back the caller's frame unchanged.
        return df

    logger.warning(
        "%d duplicate ID(s) detected; applying strategy '%s'.",
        repeated.sum(),
        strategy.value,
    )

    if strategy is DuplicateStrategy.drop:
        return df.drop_duplicates(subset=id_col, keep=False)

    if strategy is DuplicateStrategy.first or strategy is DuplicateStrategy.last:
        # The enum values double as pandas' ``keep`` argument.
        return df.drop_duplicates(subset=id_col, keep=strategy.value)

    if strategy is DuplicateStrategy.average:
        # Remember the shared ID before it is suffixed so replicates
        # can still be grouped downstream.
        df = df.copy()
        df["_original_id"] = df[id_col]

    # keep_all and average both end with suffixed IDs.
    return _suffix_duplicates(df, id_col, suffix_col)
def _suffix_duplicates(
df: pd.DataFrame,
id_col: str,
suffix_col: str | None,
) -> pd.DataFrame:
"""Append disambiguating suffixes to duplicate IDs.
If *suffix_col* is provided and present in the DataFrame, the
suffix is ``_{value}``; otherwise a sequential ``_rep{N}`` suffix
is used. A final ``drop_duplicates`` removes any residual clashes.
"""
df = df.copy()
if suffix_col is not None and suffix_col in df.columns:
df[id_col] = df[id_col] + "_" + df[suffix_col].astype(str)
else:
counts: dict[str, int] = {}
new_ids: list[str] = []
for val in df[id_col]:
counts[val] = counts.get(val, 0) + 1
new_ids.append(f"{val}_rep{counts[val]}")
df[id_col] = new_ids
# Guard against suffix collisions
df = df.drop_duplicates(subset=id_col, keep="first")
return df
def apply_index_strategy(
    df: pd.DataFrame,
    strategy: DuplicateStrategy,
) -> pd.DataFrame:
    """Resolve duplicated sample IDs stored in the DataFrame **index**.

    Counterpart of the column-based variant, used by
    :func:`~maldiamrkit.alignment.raw_warping.create_raw_input`
    where sample IDs live in the index rather than a column.

    Parameters
    ----------
    df : pd.DataFrame
        Table whose index may hold repeated sample IDs.
    strategy : DuplicateStrategy
        How to resolve the repeats. Anything accepted by
        ``DuplicateStrategy(...)`` (such as the plain string value)
        works.

    Returns
    -------
    pd.DataFrame
        *df* itself when the index has no repeats; otherwise a new
        frame. ``first`` / ``last`` / ``drop`` filter rows,
        ``keep_all`` rewrites the index with ``_repN`` suffixes, and
        ``average`` leaves the index as is but adds an
        ``_original_id`` column holding the index values.
    """
    strategy = DuplicateStrategy(strategy)

    repeated = df.index.duplicated(keep=False)
    if not repeated.any():
        # Nothing to resolve: hand back the caller's frame unchanged.
        return df

    logger.warning(
        "%d duplicate sample ID(s) detected; applying strategy '%s'.",
        repeated.sum(),
        strategy.value,
    )

    if strategy is DuplicateStrategy.keep_all:
        return _suffix_index_duplicates(df)

    if strategy is DuplicateStrategy.drop:
        # Remove every row that takes part in a repeat.
        return df[~repeated]

    if strategy is DuplicateStrategy.average:
        tagged = df.copy()
        tagged["_original_id"] = tagged.index
        return tagged

    # Remaining members are first / last; their values double as
    # pandas' ``keep`` argument.
    return df[~df.index.duplicated(keep=strategy.value)]
def _suffix_index_duplicates(df: pd.DataFrame) -> pd.DataFrame:
"""Append ``_repN`` suffixes to duplicate index entries."""
df = df.copy()
counts: dict[str, int] = {}
new_index: list[str] = []
for val in df.index:
counts[val] = counts.get(val, 0) + 1
new_index.append(f"{val}_rep{counts[val]}")
df.index = pd.Index(new_index)
return df