MaldiAMRKit - Quick Start#

This notebook covers the basics of loading, preprocessing, and binning MALDI-TOF spectra.

If you haven’t installed the package yet, run:

pip install maldiamrkit

Import MaldiAMRKit#

[ ]:
from maldiamrkit import MaldiSet, MaldiSpectrum
from maldiamrkit.filters import DrugFilter, MetadataFilter, SpeciesFilter
from maldiamrkit.preprocessing import (
    ClipNegatives,
    LogTransform,
    MedianNormalizer,
    MzTrimmer,
    PreprocessingPipeline,
    SavitzkyGolaySmooth,
    SNIPBaseline,
    SpectrumQuality,
    estimate_snr,
)
from maldiamrkit.susceptibility import LabelEncoder
from maldiamrkit.visualization import plot_pseudogel, plot_spectrum

Preprocessing Pipeline#

Inspect the default composable preprocessing pipeline.

[2]:
pipe = PreprocessingPipeline.default()
pipe
[2]:
PreprocessingPipeline([
  ('clip', ClipNegatives()),
  ('sqrt', SqrtTransform()),
  ('smooth', SavitzkyGolaySmooth(window_length=20, polyorder=2)),
  ('baseline', SNIPBaseline(half_window=40)),
  ('trim', MzTrimmer(mz_min=2000, mz_max=20000)),
  ('normalize', TICNormalizer())
])

Load and Preprocess a Single Spectrum#

[ ]:
# Load, preprocess (smoothing, baseline removal, normalization), and bin
spec = MaldiSpectrum("../data/1s.txt").preprocess()
spec.bin(3)  # bin width 3 Da

# Plot the binned spectrum
_ = plot_spectrum(spec, stage="binned")

Verbose Mode#

Enable verbose mode to see processing messages.

[4]:
spec = MaldiSpectrum("../data/1s.txt", verbose=True).preprocess()
spec.bin(3).binned
[4]:
mass intensity
0 2000 0.000039
1 2003 0.000041
2 2006 0.000083
3 2009 0.000123
4 2012 0.000109
... ... ...
5995 19985 0.000087
5996 19988 0.000054
5997 19991 0.000048
5998 19994 0.000050
5999 19997 0.000012

6000 rows × 2 columns

Plot Without Binning#

[ ]:
spec = MaldiSpectrum("../data/1s.txt").preprocess()
_ = plot_spectrum(spec, stage="preprocessed")

Quality Assessment#

Signal-to-Noise Ratio (SNR)#

Use estimate_snr() to assess spectrum quality. Higher SNR indicates better signal quality.

[6]:
spec = MaldiSpectrum("../data/1s.txt").preprocess()
snr = estimate_snr(spec)
print(f"Signal-to-Noise Ratio: {snr:.1f}")
Signal-to-Noise Ratio: 67.2

Comprehensive Quality Report#

Use SpectrumQuality for a comprehensive quality assessment.

[7]:
qc = SpectrumQuality()
report = qc.assess(spec)
print(f"SNR: {report.snr:.1f}")
print(f"Total Ion Count: {report.total_ion_count:.2e}")
print(f"Peak Count: {report.peak_count}")
print(f"Baseline Fraction: {report.baseline_fraction:.2%}")
print(f"Dynamic Range: {report.dynamic_range:.2f}")
SNR: 67.2
Total Ion Count: 1.00e+00
Peak Count: 74
Baseline Fraction: 46.26%
Dynamic Range: 1.46

Binning Methods#

MaldiAMRKit supports multiple binning strategies: uniform (default), proportional, adaptive, and custom.

[8]:
import numpy as np

spec = MaldiSpectrum("../data/1s.txt").preprocess()

# Uniform binning (default)
spec.bin(bin_width=3)
print(f"Uniform: {len(spec.binned)} bins")
print(spec.bin_metadata.head())
Uniform: 6000 bins
   bin_index  bin_start  bin_end  bin_width
0          0       2000     2003          3
1          1       2003     2006          3
2          2       2006     2009          3
3          3       2009     2012          3
4          4       2012     2015          3
[ ]:
# Proportional binning (bin width scales with m/z)
spec.bin(bin_width=3, method="proportional")
print(f"Proportional: {len(spec.binned)} bins")
print(f"Width at start: {spec.bin_metadata.iloc[0]['bin_width']:.2f} Da")
print(f"Width at end: {spec.bin_metadata.iloc[-1]['bin_width']:.2f} Da")
[10]:
# Adaptive binning (smaller bins in peak-dense regions)
spec.bin(method="adaptive", adaptive_min_width=1.0, adaptive_max_width=10.0)
print(f"Adaptive: {len(spec.binned)} bins")
print(f"Min width: {spec.bin_metadata['bin_width'].min():.2f} Da")
print(f"Max width: {spec.bin_metadata['bin_width'].max():.2f} Da")
Adaptive: 3756 bins
Min width: 1.00 Da
Max width: 9.37 Da
[11]:
# Custom binning (user-defined edges)
custom_edges = np.linspace(2000, 20000, 50)  # 49 bins
spec.bin(method="custom", custom_edges=custom_edges)
print(f"Custom: {len(spec.binned)} bins")
print(spec.bin_metadata.head())
Custom: 49 bins
   bin_index    bin_start      bin_end   bin_width
0          0  2000.000000  2367.346939  367.346939
1          1  2367.346939  2734.693878  367.346939
2          2  2734.693878  3102.040816  367.346939
3          3  3102.040816  3469.387755  367.346939
4          4  3469.387755  3836.734694  367.346939

Build a Dataset from Multiple Spectra#

Use MaldiSet to load and process multiple spectra with metadata.

[12]:
data = MaldiSet.from_directory(
    "../data/",
    "../data/metadata/metadata.csv",
    aggregate_by=dict(antibiotics="Drug"),
)
X, y = data.X, data.y

print(f"Features shape: {X.shape}")
print(f"Labels shape: {y.shape}")
Features shape: (29, 6000)
Labels shape: (29, 1)
[13]:
y.head()
[13]:
Drug
10s S
11s R
12s R
13s S
14s S

Pseudogel Visualization#

[ ]:
_ = plot_pseudogel(data)
[ ]:
_ = plot_pseudogel(data, regions=[(2000, 3000), (6000, 7000)])

Label Encoding#

Use LabelEncoder to convert R/I/S clinical resistance labels to binary (0/1). The intermediate parameter controls how “I” (intermediate) labels are handled:

  • ``”susceptible”`` (default): treat I as 0 - conservative, avoids false resistance calls

  • ``”resistant”``: treat I as 1 - stricter, flags uncertain isolates

  • ``”drop”``: remove I samples entirely

[16]:
# Compare all three modes on labels with Intermediate
labels = ["R", "S", "I", "R", "S", "I"]

enc_s = LabelEncoder(intermediate="susceptible")
print("intermediate='susceptible':", enc_s.fit_transform(labels))

enc_r = LabelEncoder(intermediate="resistant")
print("intermediate='resistant': ", enc_r.fit_transform(labels))

enc_d = LabelEncoder(intermediate="drop")
print("intermediate='drop':      ", enc_d.fit_transform(labels))

# Apply to dataset labels (R/S only here, so all modes give the same result)
enc = LabelEncoder()
y_binary = enc.fit_transform(y["Drug"].values)
print(f"\nDataset labels: {y_binary[:10]}")
print(f"Resistant: {y_binary.sum()}, Susceptible: {len(y_binary) - y_binary.sum()}")
intermediate='susceptible': [1 0 0 1 0 0]
intermediate='resistant':  [1 0 1 1 0 1]
intermediate='drop':       [1 0 1 0]

Dataset labels: [0 1 1 0 0 0 0 0 0 0]
Resistant: 10, Susceptible: 19

Filtering Datasets#

MaldiAMRKit provides a composable filter system for subsetting a MaldiSet. Filters can be combined with & (and), | (or), and ~ (not) operators.

Available filters:

  • ``SpeciesFilter``: keep samples from specific species

  • ``DrugFilter``: filter by antibiotic resistance status

  • ``QualityFilter``: filter by SNR, peak count, or baseline fraction (requires enriched metadata)

  • ``MetadataFilter``: filter by any metadata column with a custom condition

[ ]:
# SpeciesFilter - keep samples from specific species
f_species = SpeciesFilter("taxon")
filtered = data.filter(f_species)
print(
    f"SpeciesFilter('taxon'): {len(filtered.spectra)} of {len(data.spectra)} spectra kept"
)

# Filtering for a species not in the data removes all samples
f_other = SpeciesFilter("Escherichia coli")
filtered_empty = data.filter(f_other)
print(f"SpeciesFilter('Escherichia coli'): {len(filtered_empty.spectra)} spectra kept")

# DrugFilter - filter by antibiotic resistance status
f_drug = DrugFilter("Drug", status="R")
resistant = data.filter(f_drug)
print(f"DrugFilter('Drug', status='R'): {len(resistant.spectra)} spectra kept")
[ ]:
# MetadataFilter - filter by any metadata column
f_resistant = MetadataFilter("Drug", lambda v: v == "R")
resistant_only = data.filter(f_resistant)
print(f"Resistant only: {len(resistant_only.spectra)} spectra")

f_susceptible = MetadataFilter("Drug", lambda v: v == "S")
susceptible_only = data.filter(f_susceptible)
print(f"Susceptible only: {len(susceptible_only.spectra)} spectra")
[19]:
# Combine filters with & (and), | (or), ~ (not)
f_combined = SpeciesFilter("taxon") & DrugFilter("Drug", status="R")
result = data.filter(f_combined)
print(f"taxon AND resistant: {len(result.spectra)} spectra")

# NOT resistant (equivalent to susceptible)
f_not_r = ~DrugFilter("Drug", status="R")
result2 = data.filter(f_not_r)
print(f"NOT resistant: {len(result2.spectra)} spectra")

# Display the composed filter
print(f"\nFilter repr: {f_combined}")
taxon AND resistant: 10 spectra
NOT resistant: 19 spectra

Filter repr: (SpeciesFilter('taxon') & DrugFilter('Drug', status='R'))

Custom Preprocessing Pipelines#

The default pipeline uses: clip negatives, sqrt transform, Savitzky-Golay smoothing, SNIP baseline correction, m/z trimming, and TIC normalization.

You can build a custom pipeline by choosing from the available transformers: ClipNegatives, SqrtTransform, LogTransform, SavitzkyGolaySmooth, SNIPBaseline, MzTrimmer, TICNormalizer, MedianNormalizer, PQNNormalizer, MzMultiTrimmer.

[20]:
# Build a custom pipeline with different parameters
custom_pipe = PreprocessingPipeline(
    [
        ("clip", ClipNegatives()),
        ("log", LogTransform()),  # log1p instead of sqrt
        (
            "smooth",
            SavitzkyGolaySmooth(window_length=15, polyorder=3),
        ),  # different smoothing
        ("baseline", SNIPBaseline(half_window=30)),  # narrower baseline window
        ("trim", MzTrimmer(mz_min=3000, mz_max=15000)),  # narrower m/z range
        ("normalize", MedianNormalizer()),  # median instead of TIC
    ]
)
custom_pipe
[20]:
PreprocessingPipeline([
  ('clip', ClipNegatives()),
  ('log', LogTransform()),
  ('smooth', SavitzkyGolaySmooth(window_length=15, polyorder=3)),
  ('baseline', SNIPBaseline(half_window=30)),
  ('trim', MzTrimmer(mz_min=3000, mz_max=15000)),
  ('normalize', MedianNormalizer())
])
[21]:
# Compare default vs custom pipeline on a single spectrum
spec_default = MaldiSpectrum("../data/1s.txt").preprocess().bin(3)
spec_custom = MaldiSpectrum("../data/1s.txt", pipeline=custom_pipe).preprocess().bin(3)

print(f"Default pipeline: {len(spec_default.binned)} bins, m/z range 2000-20000")
print(f"Custom pipeline:  {len(spec_custom.binned)} bins, m/z range 3000-15000")
Default pipeline: 6000 bins, m/z range 2000-20000
Custom pipeline:  4000 bins, m/z range 3000-15000
[22]:
# Use a custom pipeline with MaldiSet
data_custom = MaldiSet.from_directory(
    "../data/",
    "../data/metadata/metadata.csv",
    aggregate_by=dict(antibiotics="Drug"),
    pipeline=custom_pipe,
)
print(f"Custom pipeline features shape: {data_custom.X.shape}")
print(f"Default pipeline features shape: {data.X.shape}")
Custom pipeline features shape: (29, 4000)
Default pipeline features shape: (29, 6000)

Pipeline Serialization#

Save and load pipeline configurations for reproducibility. Supports JSON, YAML, and Python dict formats.

[23]:
# Serialize pipeline to a dictionary
d = custom_pipe.to_dict()
print("Pipeline as dict:")
for step in d["steps"]:
    params = {k: v for k, v in step.items() if k not in ("step_name", "name")}
    print(f"  {step['step_name']}: {step['name']}", end="")
    if params:
        print(f"  {params}")
    else:
        print()
Pipeline as dict:
  clip: ClipNegatives
  log: LogTransform
  smooth: SavitzkyGolaySmooth  {'window_length': 15, 'polyorder': 3}
  baseline: SNIPBaseline  {'half_window': 30}
  trim: MzTrimmer  {'mz_min': 3000, 'mz_max': 15000}
  normalize: MedianNormalizer
[24]:
from pprint import pprint

# Save to JSON and inspect
custom_pipe.to_json("my_pipeline.json")

custom_pipe_reloaded = PreprocessingPipeline.from_json("my_pipeline.json")
pprint(custom_pipe_reloaded.to_dict())

# Reload from JSON
reloaded = PreprocessingPipeline.from_json("my_pipeline.json")
print(f"\nReloaded pipeline steps: {reloaded.step_names}")
print(f"Same m/z range: {reloaded.mz_range == custom_pipe.mz_range}")
{'steps': [{'name': 'ClipNegatives', 'step_name': 'clip'},
           {'name': 'LogTransform', 'step_name': 'log'},
           {'name': 'SavitzkyGolaySmooth',
            'polyorder': 3,
            'step_name': 'smooth',
            'window_length': 15},
           {'half_window': 30, 'name': 'SNIPBaseline', 'step_name': 'baseline'},
           {'mz_max': 15000,
            'mz_min': 3000,
            'name': 'MzTrimmer',
            'step_name': 'trim'},
           {'name': 'MedianNormalizer', 'step_name': 'normalize'}]}

Reloaded pipeline steps: ['clip', 'log', 'smooth', 'baseline', 'trim', 'normalize']
Same m/z range: True
[25]:
# Save to YAML and reload
custom_pipe.to_yaml("my_pipeline.yaml")
reloaded_yaml = PreprocessingPipeline.from_yaml("my_pipeline.yaml")
print(f"YAML-reloaded steps: {reloaded_yaml.step_names}")
YAML-reloaded steps: ['clip', 'log', 'smooth', 'baseline', 'trim', 'normalize']
[26]:
import os

# Clean up temporary files
os.remove("my_pipeline.json")
os.remove("my_pipeline.yaml")