"""Single MALDI-TOF spectrum handling."""
from __future__ import annotations
import logging
from pathlib import Path
import numpy as np
import pandas as pd
from .io.readers import _find_bruker_acqus, read_spectrum
from .preprocessing.binning import BinningMethod, bin_spectrum
from .preprocessing.pipeline import preprocess
from .preprocessing.preprocessing_pipeline import PreprocessingPipeline
logger = logging.getLogger(__name__)
def _infer_id(path: Path) -> str:
    """Infer a spectrum identifier from a file or directory path.

    For anything that is not an existing directory, use the stem (filename
    without extension).

    For directories, the identifier depends on how many directory levels
    separate ``path`` from the ``acqus`` file located by
    ``_find_bruker_acqus``:

    - ``{identifier}/{target_pos}/1/1SLin/acqus`` with ``path`` pointing at
      ``{target_pos}`` (depth 2) -> ``{identifier}_{target_pos}``
    - ``{identifier}/{target_pos}/1/1SLin/acqus`` with ``path`` pointing at
      ``{identifier}`` (depth 3) -> ``{identifier}_{target_pos}``
    - no ``acqus`` found, or any other depth -> the directory name

    Parameters
    ----------
    path : Path
        File or directory the spectrum is read from.

    Returns
    -------
    str
        The inferred identifier.
    """
    import os  # local import: only needed for lexical path normalisation

    if not path.is_dir():
        return path.stem

    # Directory names are read from a lexically absolutised copy so that
    # relative inputs such as ``Path("A1")``, ``Path(".")`` or ``Path("..")``
    # do not yield empty or ".." identifier components. ``abspath`` does not
    # resolve symlinks, so user-chosen directory names are kept. The original
    # ``path`` is still used for all filesystem lookups below.
    named = Path(os.path.abspath(path))

    acqus = _find_bruker_acqus(path)
    if acqus is None:
        return named.name

    rel_parts = acqus.parent.relative_to(path).parts
    depth = len(rel_parts)

    if depth == 2:
        # Path is at {identifier}/{target_pos} level
        return f"{named.parent.name}_{named.name}"

    if depth == 3:
        # Path is at {identifier} level; first rel part is target_pos.
        # De-duplicate so that several acquisitions under one target position
        # are not reported as "multiple target positions".
        targets = sorted(
            {a.parent.relative_to(path).parts[0] for a in path.glob("*/*/*/acqus")}
        )
        if len(targets) > 1:
            logger.warning(
                "Multiple target positions found for %s: %s. Using '%s'.",
                named.name,
                targets,
                rel_parts[0],
            )
        return f"{named.name}_{rel_parts[0]}"

    return named.name
class MaldiSpectrum:
    """
    A single MALDI-TOF spectrum.

    Provides methods for loading, preprocessing, binning, and saving
    individual mass spectra.

    Parameters
    ----------
    source : str, Path, or pd.DataFrame
        Source of the spectrum data. Can be a file path or a DataFrame
        with columns 'mass' and 'intensity'.
    pipeline : PreprocessingPipeline, optional
        Preprocessing pipeline. If None, uses the default pipeline.
    verbose : bool, default=False
        If True, log progress messages at INFO level.

    Attributes
    ----------
    path : Path or None
        Path to the source, if loaded from disk; None for in-memory data.
    id : str
        Identifier for the spectrum: inferred from ``path`` when loaded
        from disk, or 'in-memory' when built from a DataFrame.
    pipeline : PreprocessingPipeline
        Preprocessing pipeline.
    verbose : bool
        Whether progress messages are logged.

    Raises
    ------
    ValueError
        If the source DataFrame is empty or missing required columns
        ('mass', 'intensity').
    TypeError
        If the 'mass' or 'intensity' columns are not numeric, or if
        ``source`` is not a supported type.

    Examples
    --------
    >>> spec = MaldiSpectrum("raw/abc.txt")
    >>> spec.preprocess()
    >>> spec.bin(3)
    >>> from maldiamrkit.visualization import plot_spectrum
    >>> plot_spectrum(spec)
    """

    def __init__(
        self,
        source: str | Path | pd.DataFrame,
        *,
        pipeline: PreprocessingPipeline | None = None,
        verbose: bool = False,
    ) -> None:
        # Compare against None explicitly: a truthiness test would silently
        # swap a caller-supplied pipeline for the default if that pipeline
        # object happens to evaluate as falsy.
        self.pipeline = (
            PreprocessingPipeline.default() if pipeline is None else pipeline
        )
        self._raw: pd.DataFrame
        self._preprocessed: pd.DataFrame | None = None
        self._binned: pd.DataFrame | None = None
        self._bin_width: int | float | None = None
        self._bin_method: str | BinningMethod | None = None
        self._bin_metadata: pd.DataFrame | None = None
        self.verbose = verbose
        self.path: Path | None

        if isinstance(source, (str, Path)):
            self.path = Path(source)
            self._raw = read_spectrum(self.path)
            self.id = _infer_id(self.path)
        elif isinstance(source, pd.DataFrame):
            if source.empty:
                raise ValueError("Cannot create MaldiSpectrum from an empty DataFrame.")
            missing = {"mass", "intensity"} - set(source.columns)
            if missing:
                raise ValueError(
                    f"DataFrame missing required columns: {missing}. "
                    f"Expected 'mass' and 'intensity'."
                )
            if not pd.api.types.is_numeric_dtype(source["mass"]):
                raise TypeError("Column 'mass' must be numeric.")
            if not pd.api.types.is_numeric_dtype(source["intensity"]):
                raise TypeError("Column 'intensity' must be numeric.")
            self.path = None
            # Copy so later changes to the caller's frame do not leak in.
            self._raw = source.copy()
            self.id = "in-memory"
        else:
            raise TypeError("Unsupported source type for MaldiSpectrum")

    @property
    def raw(self) -> pd.DataFrame:
        """Return a copy of the raw spectrum data."""
        return self._raw.copy()

    @property
    def bin_width(self) -> int | float | None:
        """Return the bin width used for binning, or None if not binned."""
        return self._bin_width

    @property
    def bin_method(self) -> str | BinningMethod | None:
        """Return the binning method as passed to bin(), or None if not binned."""
        return self._bin_method

    @property
    def bin_metadata(self) -> pd.DataFrame:
        """
        Return a copy of the bin metadata produced by the last bin() call.

        Returns
        -------
        pd.DataFrame
            DataFrame with columns: bin_index, bin_start, bin_end, bin_width.

        Raises
        ------
        RuntimeError
            If bin() has not been called.
        """
        if self._bin_metadata is None:
            raise RuntimeError("Call .bin() before accessing this property.")
        return self._bin_metadata.copy()

    @property
    def preprocessed(self) -> pd.DataFrame:
        """
        Return a copy of the preprocessed spectrum.

        Raises
        ------
        RuntimeError
            If preprocess() has not been called.
        """
        if self._preprocessed is None:
            raise RuntimeError("Call .preprocess() before accessing this property.")
        return self._preprocessed.copy()

    @property
    def binned(self) -> pd.DataFrame:
        """
        Return a copy of the binned spectrum.

        Raises
        ------
        RuntimeError
            If bin() has not been called.
        """
        if self._binned is None:
            raise RuntimeError("Call .bin() before accessing this property.")
        return self._binned.copy()

    def preprocess(self) -> MaldiSpectrum:
        """
        Run preprocessing pipeline on the raw spectrum.

        Returns
        -------
        MaldiSpectrum
            Self, for method chaining.
        """
        self._preprocessed = preprocess(self._raw, self.pipeline)
        if self.verbose:
            logger.info("Preprocessed spectrum %s", self.id)
        return self

    def bin(
        self,
        bin_width: int | float = 3,
        method: str | BinningMethod = BinningMethod.uniform,
        custom_edges: np.ndarray | list | None = None,
        **kwargs,
    ) -> MaldiSpectrum:
        """
        Bin the spectrum into m/z intervals.

        Automatically calls preprocess() if not already done.
        Supports multiple binning strategies.

        Parameters
        ----------
        bin_width : int or float, default=3
            Width of each bin in Daltons. For 'uniform', this is the fixed width.
            For 'proportional', this is the reference width at mz_min.
            Ignored for 'adaptive' and 'custom' methods.
        method : str or BinningMethod, default='uniform'
            Binning method. One of 'uniform', 'proportional', 'adaptive', 'custom'.
        custom_edges : array-like, optional
            User-provided bin edges. Required if method='custom'.
        **kwargs : dict
            Additional parameters for specific methods:

            - adaptive_min_width : float, default=1.0
            - adaptive_max_width : float, default=10.0

        Returns
        -------
        MaldiSpectrum
            Self, for method chaining.

        Examples
        --------
        >>> spec.bin(3)  # uniform binning
        >>> spec.bin(3, method='proportional')
        >>> spec.bin(method='adaptive', adaptive_min_width=1.0, adaptive_max_width=10.0)
        >>> spec.bin(method='custom', custom_edges=[2000, 5000, 10000, 20000])
        """
        if self._preprocessed is None:
            self.preprocess()

        mz_min, mz_max = self.pipeline.mz_range
        binned, metadata = bin_spectrum(
            self._preprocessed,
            mz_min=mz_min,
            mz_max=mz_max,
            bin_width=bin_width,
            method=method,
            custom_edges=custom_edges,
            **kwargs,
        )

        # Commit all binning state together, and only once binning has
        # succeeded, so bin_width / bin_method never describe a binning
        # that failed or disagree with the stored binned data.
        self._binned = binned
        self._bin_metadata = metadata
        self._bin_width = bin_width
        self._bin_method = method

        if self.verbose:
            logger.info(
                "Binned spectrum %s (method=%s, w=%s)", self.id, method, bin_width
            )
        return self

    def save(
        self, path: str | Path, *, stage: str = "binned", fmt: str = "csv"
    ) -> None:
        """Save spectrum data to a file.

        Parameters
        ----------
        path : str or Path
            Output file path.
        stage : str, default="binned"
            Which processing stage to save. One of ``"raw"``,
            ``"preprocessed"``, ``"binned"``.
        fmt : str, default="csv"
            Output format. ``"csv"`` for comma-separated, ``"txt"`` for
            tab-separated.

        Raises
        ------
        ValueError
            If ``stage`` is not one of 'raw', 'preprocessed', or 'binned',
            or if ``fmt`` is not one of 'csv' or 'txt'.
        RuntimeError
            If the requested stage has not been computed yet.
        """
        if stage not in ("raw", "preprocessed", "binned"):
            raise ValueError(
                f"Invalid stage '{stage}'. Use 'raw', 'preprocessed', or 'binned'."
            )
        # Each stage name matches a property of the same name; the
        # 'preprocessed' and 'binned' properties raise RuntimeError when
        # that stage has not been computed.
        df = getattr(self, stage)

        if fmt == "csv":
            sep = ","
        elif fmt == "txt":
            sep = "\t"
        else:
            raise ValueError(f"Invalid fmt '{fmt}'. Use 'csv' or 'txt'.")
        df.to_csv(path, sep=sep, index=False)

    def get_data(self, prefer: str = "preprocessed") -> pd.DataFrame:
        """Return spectrum data, preferring the requested processing stage.

        Parameters
        ----------
        prefer : str, default="preprocessed"
            Preferred stage: ``"preprocessed"`` or ``"binned"``. Any value
            other than ``"binned"`` is treated as ``"preprocessed"``.
            ``"binned"`` falls back to preprocessed data when binning has
            not been done, and preprocessed falls back to raw data when
            preprocessing has not been done.

        Returns
        -------
        pd.DataFrame
            Copy of the spectrum data at the best available stage.
        """
        if prefer == "binned" and self._binned is not None:
            return self._binned.copy()
        if self._preprocessed is not None:
            return self._preprocessed.copy()
        return self._raw.copy()

    @property
    def is_binned(self) -> bool:
        """Whether the spectrum has been binned."""
        return self._binned is not None

    @property
    def is_preprocessed(self) -> bool:
        """Whether the spectrum has been preprocessed."""
        return self._preprocessed is not None

    @property
    def has_bin_metadata(self) -> bool:
        """Whether bin metadata is available (i.e. ``bin()`` has been called)."""
        return self._bin_metadata is not None

    def __repr__(self) -> str:
        """Return a short summary with the id and the stages computed so far."""
        status = []
        if self._preprocessed is not None:
            status.append("preprocessed")
        if self._binned is not None:
            n = len(self._binned)
            status.append(f"binned({n} bins)")
        state = ", ".join(status) if status else "raw"
        return f"MaldiSpectrum(id={self.id!r}, {state})"