{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# MaldiAMRKit - Quick Start\n", "\n", "This notebook covers the basics of loading, preprocessing, and binning MALDI-TOF spectra." ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "If you haven't installed the package yet, run:\n", "```bash\n", "pip install maldiamrkit\n", "```" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Import MaldiAMRKit" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "execution": { "iopub.execute_input": "2026-02-08T15:04:26.072070Z", "iopub.status.busy": "2026-02-08T15:04:26.071970Z", "iopub.status.idle": "2026-02-08T15:04:26.684663Z", "shell.execute_reply": "2026-02-08T15:04:26.684010Z" } }, "outputs": [], "source": "from maldiamrkit import MaldiSet, MaldiSpectrum\nfrom maldiamrkit.filters import DrugFilter, MetadataFilter, SpeciesFilter\nfrom maldiamrkit.preprocessing import (\n ClipNegatives,\n LogTransform,\n MedianNormalizer,\n MzTrimmer,\n PreprocessingPipeline,\n SavitzkyGolaySmooth,\n SNIPBaseline,\n SpectrumQuality,\n estimate_snr,\n)\nfrom maldiamrkit.susceptibility import LabelEncoder\nfrom maldiamrkit.visualization import plot_pseudogel, plot_spectrum" }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Preprocessing Pipeline\n", "\n", "Inspect the default composable preprocessing pipeline." 
] }, { "cell_type": "code", "execution_count": 2, "metadata": { "execution": { "iopub.execute_input": "2026-02-08T15:04:26.686879Z", "iopub.status.busy": "2026-02-08T15:04:26.686678Z", "iopub.status.idle": "2026-02-08T15:04:26.690296Z", "shell.execute_reply": "2026-02-08T15:04:26.689889Z" } }, "outputs": [ { "data": { "text/plain": [ "PreprocessingPipeline([\n", " ('clip', ClipNegatives()),\n", " ('sqrt', SqrtTransform()),\n", " ('smooth', SavitzkyGolaySmooth(window_length=20, polyorder=2)),\n", " ('baseline', SNIPBaseline(half_window=40)),\n", " ('trim', MzTrimmer(mz_min=2000, mz_max=20000)),\n", " ('normalize', TICNormalizer())\n", "])" ] }, "execution_count": 2, "metadata": {}, "output_type": "execute_result" } ], "source": [ "pipe = PreprocessingPipeline.default()\n", "pipe" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Load and Preprocess a Single Spectrum" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "execution": { "iopub.execute_input": "2026-02-08T15:04:26.708633Z", "iopub.status.busy": "2026-02-08T15:04:26.708402Z", "iopub.status.idle": "2026-02-08T15:04:34.807856Z", "shell.execute_reply": "2026-02-08T15:04:34.807111Z" } }, "outputs": [], "source": "# Load, preprocess (smoothing, baseline removal, normalization), and bin\nspec = MaldiSpectrum(\"../data/1s.txt\").preprocess()\nspec.bin(3) # bin width 3 Da\n\n# Plot the binned spectrum\n_ = plot_spectrum(spec, stage=\"binned\")" }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Verbose Mode\n", "\n", "Enable verbose mode to see processing messages." ] }, { "cell_type": "code", "execution_count": 4, "metadata": { "execution": { "iopub.execute_input": "2026-02-08T15:04:34.809508Z", "iopub.status.busy": "2026-02-08T15:04:34.809234Z", "iopub.status.idle": "2026-02-08T15:04:34.831421Z", "shell.execute_reply": "2026-02-08T15:04:34.830726Z" } }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
massintensity
020000.000039
120030.000041
220060.000083
320090.000123
420120.000109
.........
5995199850.000087
5996199880.000054
5997199910.000048
5998199940.000050
5999199970.000012
\n", "

6000 rows × 2 columns

\n", "
" ], "text/plain": [ " mass intensity\n", "0 2000 0.000039\n", "1 2003 0.000041\n", "2 2006 0.000083\n", "3 2009 0.000123\n", "4 2012 0.000109\n", "... ... ...\n", "5995 19985 0.000087\n", "5996 19988 0.000054\n", "5997 19991 0.000048\n", "5998 19994 0.000050\n", "5999 19997 0.000012\n", "\n", "[6000 rows x 2 columns]" ] }, "execution_count": 4, "metadata": {}, "output_type": "execute_result" } ], "source": [ "spec = MaldiSpectrum(\"../data/1s.txt\", verbose=True).preprocess()\n", "spec.bin(3).binned" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Plot Without Binning" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "execution": { "iopub.execute_input": "2026-02-08T15:04:34.832721Z", "iopub.status.busy": "2026-02-08T15:04:34.832603Z", "iopub.status.idle": "2026-02-08T15:04:34.893125Z", "shell.execute_reply": "2026-02-08T15:04:34.892502Z" } }, "outputs": [], "source": "spec = MaldiSpectrum(\"../data/1s.txt\").preprocess()\n_ = plot_spectrum(spec, stage=\"preprocessed\")" }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Quality Assessment\n", "\n", "### Signal-to-Noise Ratio (SNR)\n", "\n", "Use `estimate_snr()` to assess spectrum quality. Higher SNR indicates better signal quality." ] }, { "cell_type": "code", "execution_count": 6, "metadata": { "execution": { "iopub.execute_input": "2026-02-08T15:04:34.894729Z", "iopub.status.busy": "2026-02-08T15:04:34.894622Z", "iopub.status.idle": "2026-02-08T15:04:34.906353Z", "shell.execute_reply": "2026-02-08T15:04:34.905743Z" } }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Signal-to-Noise Ratio: 67.2\n" ] } ], "source": [ "spec = MaldiSpectrum(\"../data/1s.txt\").preprocess()\n", "snr = estimate_snr(spec)\n", "print(f\"Signal-to-Noise Ratio: {snr:.1f}\")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Comprehensive Quality Report\n", "\n", "Use `SpectrumQuality` for a comprehensive quality assessment." 
] }, { "cell_type": "code", "execution_count": 7, "metadata": { "execution": { "iopub.execute_input": "2026-02-08T15:04:34.908167Z", "iopub.status.busy": "2026-02-08T15:04:34.908062Z", "iopub.status.idle": "2026-02-08T15:04:34.913273Z", "shell.execute_reply": "2026-02-08T15:04:34.912940Z" } }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "SNR: 67.2\n", "Total Ion Count: 1.00e+00\n", "Peak Count: 74\n", "Baseline Fraction: 46.26%\n", "Dynamic Range: 1.46\n" ] } ], "source": [ "qc = SpectrumQuality()\n", "report = qc.assess(spec)\n", "print(f\"SNR: {report.snr:.1f}\")\n", "print(f\"Total Ion Count: {report.total_ion_count:.2e}\")\n", "print(f\"Peak Count: {report.peak_count}\")\n", "print(f\"Baseline Fraction: {report.baseline_fraction:.2%}\")\n", "print(f\"Dynamic Range: {report.dynamic_range:.2f}\")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Binning Methods\n", "\n", "MaldiAMRKit supports multiple binning strategies: uniform (default), proportional, adaptive, and custom." 
] }, { "cell_type": "code", "execution_count": 8, "metadata": { "execution": { "iopub.execute_input": "2026-02-08T15:04:34.914758Z", "iopub.status.busy": "2026-02-08T15:04:34.914639Z", "iopub.status.idle": "2026-02-08T15:04:34.932822Z", "shell.execute_reply": "2026-02-08T15:04:34.932400Z" } }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Uniform: 6000 bins\n", " bin_index bin_start bin_end bin_width\n", "0 0 2000 2003 3\n", "1 1 2003 2006 3\n", "2 2 2006 2009 3\n", "3 3 2009 2012 3\n", "4 4 2012 2015 3\n" ] } ], "source": [ "import numpy as np\n", "\n", "spec = MaldiSpectrum(\"../data/1s.txt\").preprocess()\n", "\n", "# Uniform binning (default)\n", "spec.bin(bin_width=3)\n", "print(f\"Uniform: {len(spec.binned)} bins\")\n", "print(spec.bin_metadata.head())" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "execution": { "iopub.execute_input": "2026-02-08T15:04:34.934372Z", "iopub.status.busy": "2026-02-08T15:04:34.934264Z", "iopub.status.idle": "2026-02-08T15:04:34.942544Z", "shell.execute_reply": "2026-02-08T15:04:34.942053Z" } }, "outputs": [], "source": [ "# Proportional binning (bin width scales with m/z)\n", "spec.bin(bin_width=3, method=\"proportional\")\n", "print(f\"Proportional: {len(spec.binned)} bins\")\n", "print(f\"Width at start: {spec.bin_metadata.iloc[0]['bin_width']:.2f} Da\")\n", "print(f\"Width at end: {spec.bin_metadata.iloc[-1]['bin_width']:.2f} Da\")" ] }, { "cell_type": "code", "execution_count": 10, "metadata": { "execution": { "iopub.execute_input": "2026-02-08T15:04:34.943795Z", "iopub.status.busy": "2026-02-08T15:04:34.943691Z", "iopub.status.idle": "2026-02-08T15:04:34.963292Z", "shell.execute_reply": "2026-02-08T15:04:34.962754Z" } }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Adaptive: 3756 bins\n", "Min width: 1.00 Da\n", "Max width: 9.37 Da\n" ] } ], "source": [ "# Adaptive binning (smaller bins in peak-dense regions)\n", "spec.bin(method=\"adaptive\", 
adaptive_min_width=1.0, adaptive_max_width=10.0)\n", "print(f\"Adaptive: {len(spec.binned)} bins\")\n", "print(f\"Min width: {spec.bin_metadata['bin_width'].min():.2f} Da\")\n", "print(f\"Max width: {spec.bin_metadata['bin_width'].max():.2f} Da\")" ] }, { "cell_type": "code", "execution_count": 11, "metadata": { "execution": { "iopub.execute_input": "2026-02-08T15:04:34.964630Z", "iopub.status.busy": "2026-02-08T15:04:34.964527Z", "iopub.status.idle": "2026-02-08T15:04:34.969690Z", "shell.execute_reply": "2026-02-08T15:04:34.969329Z" } }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Custom: 49 bins\n", " bin_index bin_start bin_end bin_width\n", "0 0 2000.000000 2367.346939 367.346939\n", "1 1 2367.346939 2734.693878 367.346939\n", "2 2 2734.693878 3102.040816 367.346939\n", "3 3 3102.040816 3469.387755 367.346939\n", "4 4 3469.387755 3836.734694 367.346939\n" ] } ], "source": [ "# Custom binning (user-defined edges)\n", "custom_edges = np.linspace(2000, 20000, 50) # 49 bins\n", "spec.bin(method=\"custom\", custom_edges=custom_edges)\n", "print(f\"Custom: {len(spec.binned)} bins\")\n", "print(spec.bin_metadata.head())" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Build a Dataset from Multiple Spectra\n", "\n", "Use `MaldiSet` to load and process multiple spectra with metadata." 
] }, { "cell_type": "code", "execution_count": 12, "metadata": { "execution": { "iopub.execute_input": "2026-02-08T15:04:34.971166Z", "iopub.status.busy": "2026-02-08T15:04:34.971064Z", "iopub.status.idle": "2026-02-08T15:04:35.470153Z", "shell.execute_reply": "2026-02-08T15:04:35.469504Z" } }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Features shape: (29, 6000)\n", "Labels shape: (29, 1)\n" ] } ], "source": [ "data = MaldiSet.from_directory(\n", " \"../data/\",\n", " \"../data/metadata/metadata.csv\",\n", " aggregate_by=dict(antibiotics=\"Drug\"),\n", ")\n", "X, y = data.X, data.y\n", "\n", "print(f\"Features shape: {X.shape}\")\n", "print(f\"Labels shape: {y.shape}\")" ] }, { "cell_type": "code", "execution_count": 13, "metadata": { "execution": { "iopub.execute_input": "2026-02-08T15:04:35.472364Z", "iopub.status.busy": "2026-02-08T15:04:35.472258Z", "iopub.status.idle": "2026-02-08T15:04:35.475495Z", "shell.execute_reply": "2026-02-08T15:04:35.475112Z" } }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
Drug
10sS
11sR
12sR
13sS
14sS
\n", "
" ], "text/plain": [ " Drug\n", "10s S\n", "11s R\n", "12s R\n", "13s S\n", "14s S" ] }, "execution_count": 13, "metadata": {}, "output_type": "execute_result" } ], "source": [ "y.head()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Pseudogel Visualization" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "execution": { "iopub.execute_input": "2026-02-08T15:04:35.476634Z", "iopub.status.busy": "2026-02-08T15:04:35.476544Z", "iopub.status.idle": "2026-02-08T15:04:35.628416Z", "shell.execute_reply": "2026-02-08T15:04:35.627821Z" } }, "outputs": [], "source": "_ = plot_pseudogel(data)" }, { "cell_type": "code", "execution_count": null, "metadata": { "execution": { "iopub.execute_input": "2026-02-08T15:04:35.629856Z", "iopub.status.busy": "2026-02-08T15:04:35.629755Z", "iopub.status.idle": "2026-02-08T15:04:35.769357Z", "shell.execute_reply": "2026-02-08T15:04:35.768823Z" } }, "outputs": [], "source": "_ = plot_pseudogel(data, regions=[(2000, 3000), (6000, 7000)])" }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Label Encoding\n", "\n", "Use `LabelEncoder` to convert R/I/S clinical resistance labels to binary (0/1).\n", "The `intermediate` parameter controls how \"I\" (intermediate) labels are handled:\n", "\n", "- **`\"susceptible\"`** (default): treat I as 0 - conservative, avoids false resistance calls\n", "- **`\"resistant\"`**: treat I as 1 - stricter, flags uncertain isolates\n", "- **`\"drop\"`**: remove I samples entirely" ] }, { "cell_type": "code", "execution_count": 16, "metadata": { "execution": { "iopub.execute_input": "2026-02-08T15:04:35.770731Z", "iopub.status.busy": "2026-02-08T15:04:35.770626Z", "iopub.status.idle": "2026-02-08T15:04:35.774187Z", "shell.execute_reply": "2026-02-08T15:04:35.773697Z" } }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "intermediate='susceptible': [1 0 0 1 0 0]\n", "intermediate='resistant': [1 0 1 1 0 1]\n", "intermediate='drop': [1 0 1 0]\n", 
"\n", "Dataset labels: [0 1 1 0 0 0 0 0 0 0]\n", "Resistant: 10, Susceptible: 19\n" ] } ], "source": [ "# Compare all three modes on labels with Intermediate\n", "labels = [\"R\", \"S\", \"I\", \"R\", \"S\", \"I\"]\n", "\n", "enc_s = LabelEncoder(intermediate=\"susceptible\")\n", "print(\"intermediate='susceptible':\", enc_s.fit_transform(labels))\n", "\n", "enc_r = LabelEncoder(intermediate=\"resistant\")\n", "print(\"intermediate='resistant': \", enc_r.fit_transform(labels))\n", "\n", "enc_d = LabelEncoder(intermediate=\"drop\")\n", "print(\"intermediate='drop': \", enc_d.fit_transform(labels))\n", "\n", "# Apply to dataset labels (R/S only here, so all modes give the same result)\n", "enc = LabelEncoder()\n", "y_binary = enc.fit_transform(y[\"Drug\"].values)\n", "print(f\"\\nDataset labels: {y_binary[:10]}\")\n", "print(f\"Resistant: {y_binary.sum()}, Susceptible: {len(y_binary) - y_binary.sum()}\")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Filtering Datasets\n", "\n", "MaldiAMRKit provides a composable filter system for subsetting a `MaldiSet`.\n", "Filters can be combined with `&` (and), `|` (or), and `~` (not) operators.\n", "\n", "Available filters:\n", "- **`SpeciesFilter`**: keep samples from specific species\n", "- **`DrugFilter`**: filter by antibiotic resistance status\n", "- **`QualityFilter`**: filter by SNR, peak count, or baseline fraction (requires enriched metadata)\n", "- **`MetadataFilter`**: filter by any metadata column with a custom condition" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "execution": { "iopub.execute_input": "2026-02-08T15:04:35.775316Z", "iopub.status.busy": "2026-02-08T15:04:35.775221Z", "iopub.status.idle": "2026-02-08T15:04:35.780039Z", "shell.execute_reply": "2026-02-08T15:04:35.779481Z" } }, "outputs": [], "source": [ "# SpeciesFilter - keep samples from specific species\n", "f_species = SpeciesFilter(\"taxon\")\n", "filtered = data.filter(f_species)\n", "print(\n", " 
f\"SpeciesFilter('taxon'): {len(filtered.spectra)} of {len(data.spectra)} spectra kept\"\n", ")\n", "\n", "# Filtering for a species not in the data removes all samples\n", "f_other = SpeciesFilter(\"Escherichia coli\")\n", "filtered_empty = data.filter(f_other)\n", "print(f\"SpeciesFilter('Escherichia coli'): {len(filtered_empty.spectra)} spectra kept\")\n", "\n", "# DrugFilter - filter by antibiotic resistance status\n", "f_drug = DrugFilter(\"Drug\", status=\"R\")\n", "resistant = data.filter(f_drug)\n", "print(f\"DrugFilter('Drug', status='R'): {len(resistant.spectra)} spectra kept\")" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "execution": { "iopub.execute_input": "2026-02-08T15:04:35.781054Z", "iopub.status.busy": "2026-02-08T15:04:35.780950Z", "iopub.status.idle": "2026-02-08T15:04:35.785311Z", "shell.execute_reply": "2026-02-08T15:04:35.784844Z" } }, "outputs": [], "source": [ "# MetadataFilter - filter by any metadata column\n", "f_resistant = MetadataFilter(\"Drug\", lambda v: v == \"R\")\n", "resistant_only = data.filter(f_resistant)\n", "print(f\"Resistant only: {len(resistant_only.spectra)} spectra\")\n", "\n", "f_susceptible = MetadataFilter(\"Drug\", lambda v: v == \"S\")\n", "susceptible_only = data.filter(f_susceptible)\n", "print(f\"Susceptible only: {len(susceptible_only.spectra)} spectra\")" ] }, { "cell_type": "code", "execution_count": 19, "metadata": { "execution": { "iopub.execute_input": "2026-02-08T15:04:35.786304Z", "iopub.status.busy": "2026-02-08T15:04:35.786212Z", "iopub.status.idle": "2026-02-08T15:04:35.790636Z", "shell.execute_reply": "2026-02-08T15:04:35.790258Z" } }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "taxon AND resistant: 10 spectra\n", "NOT resistant: 19 spectra\n", "\n", "Filter repr: (SpeciesFilter('taxon') & DrugFilter('Drug', status='R'))\n" ] } ], "source": [ "# Combine filters with & (and), | (or), ~ (not)\n", "f_combined = SpeciesFilter(\"taxon\") & 
DrugFilter(\"Drug\", status=\"R\")\n", "result = data.filter(f_combined)\n", "print(f\"taxon AND resistant: {len(result.spectra)} spectra\")\n", "\n", "# NOT resistant (equivalent to susceptible here, since this dataset has only R/S labels)\n", "f_not_r = ~DrugFilter(\"Drug\", status=\"R\")\n", "result2 = data.filter(f_not_r)\n", "print(f\"NOT resistant: {len(result2.spectra)} spectra\")\n", "\n", "# Display the composed filter\n", "print(f\"\\nFilter repr: {f_combined}\")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Custom Preprocessing Pipelines\n", "\n", "The default pipeline uses: clip negatives, sqrt transform, Savitzky-Golay\n", "smoothing, SNIP baseline correction, m/z trimming, and TIC normalization.\n", "\n", "You can build a custom pipeline by choosing from the available transformers:\n", "`ClipNegatives`, `SqrtTransform`, `LogTransform`, `SavitzkyGolaySmooth`,\n", "`SNIPBaseline`, `MzTrimmer`, `TICNormalizer`, `MedianNormalizer`,\n", "`PQNNormalizer`, `MzMultiTrimmer`." ] }, { "cell_type": "code", "execution_count": 20, "metadata": { "execution": { "iopub.execute_input": "2026-02-08T15:04:35.791553Z", "iopub.status.busy": "2026-02-08T15:04:35.791461Z", "iopub.status.idle": "2026-02-08T15:04:35.794244Z", "shell.execute_reply": "2026-02-08T15:04:35.793857Z" } }, "outputs": [ { "data": { "text/plain": [ "PreprocessingPipeline([\n", " ('clip', ClipNegatives()),\n", " ('log', LogTransform()),\n", " ('smooth', SavitzkyGolaySmooth(window_length=15, polyorder=3)),\n", " ('baseline', SNIPBaseline(half_window=30)),\n", " ('trim', MzTrimmer(mz_min=3000, mz_max=15000)),\n", " ('normalize', MedianNormalizer())\n", "])" ] }, "execution_count": 20, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Build a custom pipeline with different parameters\n", "custom_pipe = PreprocessingPipeline(\n", " [\n", " (\"clip\", ClipNegatives()),\n", " (\"log\", LogTransform()), # log1p instead of sqrt\n", " (\n", " \"smooth\",\n", " SavitzkyGolaySmooth(window_length=15, polyorder=3),\n", " 
), # different smoothing\n", " (\"baseline\", SNIPBaseline(half_window=30)), # narrower baseline window\n", " (\"trim\", MzTrimmer(mz_min=3000, mz_max=15000)), # narrower m/z range\n", " (\"normalize\", MedianNormalizer()), # median instead of TIC\n", " ]\n", ")\n", "custom_pipe" ] }, { "cell_type": "code", "execution_count": 21, "metadata": { "execution": { "iopub.execute_input": "2026-02-08T15:04:35.795408Z", "iopub.status.busy": "2026-02-08T15:04:35.795315Z", "iopub.status.idle": "2026-02-08T15:04:35.822298Z", "shell.execute_reply": "2026-02-08T15:04:35.821717Z" } }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Default pipeline: 6000 bins, m/z range 2000-20000\n", "Custom pipeline: 4000 bins, m/z range 3000-15000\n" ] } ], "source": [ "# Compare default vs custom pipeline on a single spectrum\n", "spec_default = MaldiSpectrum(\"../data/1s.txt\").preprocess().bin(3)\n", "spec_custom = MaldiSpectrum(\"../data/1s.txt\", pipeline=custom_pipe).preprocess().bin(3)\n", "\n", "print(f\"Default pipeline: {len(spec_default.binned)} bins, m/z range 2000-20000\")\n", "print(f\"Custom pipeline: {len(spec_custom.binned)} bins, m/z range 3000-15000\")" ] }, { "cell_type": "code", "execution_count": 22, "metadata": { "execution": { "iopub.execute_input": "2026-02-08T15:04:35.823596Z", "iopub.status.busy": "2026-02-08T15:04:35.823493Z", "iopub.status.idle": "2026-02-08T15:04:36.235340Z", "shell.execute_reply": "2026-02-08T15:04:36.234798Z" } }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Custom pipeline features shape: (29, 4000)\n", "Default pipeline features shape: (29, 6000)\n" ] } ], "source": [ "# Use a custom pipeline with MaldiSet\n", "data_custom = MaldiSet.from_directory(\n", " \"../data/\",\n", " \"../data/metadata/metadata.csv\",\n", " aggregate_by=dict(antibiotics=\"Drug\"),\n", " pipeline=custom_pipe,\n", ")\n", "print(f\"Custom pipeline features shape: {data_custom.X.shape}\")\n", "print(f\"Default pipeline features 
shape: {data.X.shape}\")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Pipeline Serialization\n", "\n", "Save and load pipeline configurations for reproducibility. Supports JSON, YAML,\n", "and Python dict formats." ] }, { "cell_type": "code", "execution_count": 23, "metadata": { "execution": { "iopub.execute_input": "2026-02-08T15:04:36.236641Z", "iopub.status.busy": "2026-02-08T15:04:36.236540Z", "iopub.status.idle": "2026-02-08T15:04:36.239235Z", "shell.execute_reply": "2026-02-08T15:04:36.238753Z" } }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Pipeline as dict:\n", " clip: ClipNegatives\n", " log: LogTransform\n", " smooth: SavitzkyGolaySmooth {'window_length': 15, 'polyorder': 3}\n", " baseline: SNIPBaseline {'half_window': 30}\n", " trim: MzTrimmer {'mz_min': 3000, 'mz_max': 15000}\n", " normalize: MedianNormalizer\n" ] } ], "source": [ "# Serialize pipeline to a dictionary\n", "d = custom_pipe.to_dict()\n", "print(\"Pipeline as dict:\")\n", "for step in d[\"steps\"]:\n", " params = {k: v for k, v in step.items() if k not in (\"step_name\", \"name\")}\n", " print(f\" {step['step_name']}: {step['name']}\", end=\"\")\n", " if params:\n", " print(f\" {params}\")\n", " else:\n", " print()" ] }, { "cell_type": "code", "execution_count": 24, "metadata": { "execution": { "iopub.execute_input": "2026-02-08T15:04:36.240101Z", "iopub.status.busy": "2026-02-08T15:04:36.240009Z", "iopub.status.idle": "2026-02-08T15:04:36.242666Z", "shell.execute_reply": "2026-02-08T15:04:36.242185Z" } }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "{'steps': [{'name': 'ClipNegatives', 'step_name': 'clip'},\n", " {'name': 'LogTransform', 'step_name': 'log'},\n", " {'name': 'SavitzkyGolaySmooth',\n", " 'polyorder': 3,\n", " 'step_name': 'smooth',\n", " 'window_length': 15},\n", " {'half_window': 30, 'name': 'SNIPBaseline', 'step_name': 'baseline'},\n", " {'mz_max': 15000,\n", " 'mz_min': 3000,\n", " 'name': 
'MzTrimmer',\n", " 'step_name': 'trim'},\n", " {'name': 'MedianNormalizer', 'step_name': 'normalize'}]}\n", "\n", "Reloaded pipeline steps: ['clip', 'log', 'smooth', 'baseline', 'trim', 'normalize']\n", "Same m/z range: True\n" ] } ], "source": [ "from pprint import pprint\n", "\n", "# Save to JSON, then load it back and inspect the round-tripped configuration\n", "custom_pipe.to_json(\"my_pipeline.json\")\n", "\n", "custom_pipe_reloaded = PreprocessingPipeline.from_json(\"my_pipeline.json\")\n", "pprint(custom_pipe_reloaded.to_dict())\n", "\n", "# Reload from JSON again and compare against the original pipeline\n", "reloaded = PreprocessingPipeline.from_json(\"my_pipeline.json\")\n", "print(f\"\\nReloaded pipeline steps: {reloaded.step_names}\")\n", "print(f\"Same m/z range: {reloaded.mz_range == custom_pipe.mz_range}\")" ] }, { "cell_type": "code", "execution_count": 25, "metadata": { "execution": { "iopub.execute_input": "2026-02-08T15:04:36.243474Z", "iopub.status.busy": "2026-02-08T15:04:36.243385Z", "iopub.status.idle": "2026-02-08T15:04:36.253802Z", "shell.execute_reply": "2026-02-08T15:04:36.253335Z" } }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "YAML-reloaded steps: ['clip', 'log', 'smooth', 'baseline', 'trim', 'normalize']\n" ] } ], "source": [ "# Save to YAML and reload\n", "custom_pipe.to_yaml(\"my_pipeline.yaml\")\n", "reloaded_yaml = PreprocessingPipeline.from_yaml(\"my_pipeline.yaml\")\n", "print(f\"YAML-reloaded steps: {reloaded_yaml.step_names}\")" ] }, { "cell_type": "code", "execution_count": 26, "metadata": { "execution": { "iopub.execute_input": "2026-02-08T15:04:36.254788Z", "iopub.status.busy": "2026-02-08T15:04:36.254698Z", "iopub.status.idle": "2026-02-08T15:04:36.256440Z", "shell.execute_reply": "2026-02-08T15:04:36.256072Z" } }, "outputs": [], "source": [ "import os\n", "\n", "# Clean up temporary files\n", "os.remove(\"my_pipeline.json\")\n", "os.remove(\"my_pipeline.yaml\")" ] } ], "metadata": { "kernelspec": { "display_name": "maldiamrkit (3.10.12)", "language": "python", "name": "python3" }, 
"language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.10.12" } }, "nbformat": 4, "nbformat_minor": 4 }