{
 "cells": [
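  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# MaldiAMRKit - Quick Start\n",
    "\n",
    "This notebook covers the basics of loading, preprocessing, and binning MALDI-TOF spectra, followed by quality assessment, building a dataset from multiple spectra, label encoding, dataset filtering, and custom preprocessing pipelines."
   ]
  },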
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "If you haven't installed the package yet, run:\n",
    "```bash\n",
    "pip install maldiamrkit\n",
    "```"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Import MaldiAMRKit"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "execution": {
     "iopub.execute_input": "2026-02-08T15:04:26.072070Z",
     "iopub.status.busy": "2026-02-08T15:04:26.071970Z",
     "iopub.status.idle": "2026-02-08T15:04:26.684663Z",
     "shell.execute_reply": "2026-02-08T15:04:26.684010Z"
    }
   },
   "outputs": [],
   "source": "from pathlib import Path\nfrom pprint import pprint\n\nimport numpy as np\n\nfrom maldiamrkit import MaldiSet, MaldiSpectrum\nfrom maldiamrkit.filters import DrugFilter, MetadataFilter, SpeciesFilter\nfrom maldiamrkit.preprocessing import (\n    ClipNegatives,\n    LogTransform,\n    MedianNormalizer,\n    MzTrimmer,\n    PreprocessingPipeline,\n    SavitzkyGolaySmooth,\n    SNIPBaseline,\n    SpectrumQuality,\n    estimate_snr,\n)\nfrom maldiamrkit.susceptibility import LabelEncoder\nfrom maldiamrkit.visualization import plot_pseudogel, plot_spectrum"
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Preprocessing Pipeline\n",
    "\n",
    "Inspect the default composable preprocessing pipeline."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "execution": {
     "iopub.execute_input": "2026-02-08T15:04:26.686879Z",
     "iopub.status.busy": "2026-02-08T15:04:26.686678Z",
     "iopub.status.idle": "2026-02-08T15:04:26.690296Z",
     "shell.execute_reply": "2026-02-08T15:04:26.689889Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "PreprocessingPipeline([\n",
       "  ('clip', ClipNegatives()),\n",
       "  ('sqrt', SqrtTransform()),\n",
       "  ('smooth', SavitzkyGolaySmooth(window_length=20, polyorder=2)),\n",
       "  ('baseline', SNIPBaseline(half_window=40)),\n",
       "  ('trim', MzTrimmer(mz_min=2000, mz_max=20000)),\n",
       "  ('normalize', TICNormalizer())\n",
       "])"
      ]
     },
     "execution_count": 2,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Build the default pipeline; the bare last expression displays its repr (step names and parameters)\n",
    "pipe = PreprocessingPipeline.default()\n",
    "pipe"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Load and Preprocess a Single Spectrum"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "execution": {
     "iopub.execute_input": "2026-02-08T15:04:26.708633Z",
     "iopub.status.busy": "2026-02-08T15:04:26.708402Z",
     "iopub.status.idle": "2026-02-08T15:04:34.807856Z",
     "shell.execute_reply": "2026-02-08T15:04:34.807111Z"
    }
   },
   "outputs": [],
   "source": "# Load, preprocess (smoothing, baseline removal, normalization), and bin\nspec = MaldiSpectrum(\"../data/1s.txt\").preprocess()\nspec.bin(3)  # bin width 3 Da\n\n# Plot the binned spectrum; assigning to `_` keeps the return value out of the cell output\n_ = plot_spectrum(spec, stage=\"binned\")"
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Verbose Mode\n",
    "\n",
    "Enable verbose mode to see processing messages."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {
    "execution": {
     "iopub.execute_input": "2026-02-08T15:04:34.809508Z",
     "iopub.status.busy": "2026-02-08T15:04:34.809234Z",
     "iopub.status.idle": "2026-02-08T15:04:34.831421Z",
     "shell.execute_reply": "2026-02-08T15:04:34.830726Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>mass</th>\n",
       "      <th>intensity</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>2000</td>\n",
       "      <td>0.000039</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>2003</td>\n",
       "      <td>0.000041</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>2006</td>\n",
       "      <td>0.000083</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>2009</td>\n",
       "      <td>0.000123</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>2012</td>\n",
       "      <td>0.000109</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5995</th>\n",
       "      <td>19985</td>\n",
       "      <td>0.000087</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5996</th>\n",
       "      <td>19988</td>\n",
       "      <td>0.000054</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5997</th>\n",
       "      <td>19991</td>\n",
       "      <td>0.000048</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5998</th>\n",
       "      <td>19994</td>\n",
       "      <td>0.000050</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5999</th>\n",
       "      <td>19997</td>\n",
       "      <td>0.000012</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>6000 rows × 2 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "       mass  intensity\n",
       "0      2000   0.000039\n",
       "1      2003   0.000041\n",
       "2      2006   0.000083\n",
       "3      2009   0.000123\n",
       "4      2012   0.000109\n",
       "...     ...        ...\n",
       "5995  19985   0.000087\n",
       "5996  19988   0.000054\n",
       "5997  19991   0.000048\n",
       "5998  19994   0.000050\n",
       "5999  19997   0.000012\n",
       "\n",
       "[6000 rows x 2 columns]"
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# verbose=True is set on the spectrum itself, before preprocessing runs\n",
    "spec = MaldiSpectrum(\"../data/1s.txt\", verbose=True).preprocess()\n",
    "# bin() is chained here; the resulting `.binned` table (mass, intensity) is the cell output\n",
    "spec.bin(3).binned"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Plot Without Binning"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "execution": {
     "iopub.execute_input": "2026-02-08T15:04:34.832721Z",
     "iopub.status.busy": "2026-02-08T15:04:34.832603Z",
     "iopub.status.idle": "2026-02-08T15:04:34.893125Z",
     "shell.execute_reply": "2026-02-08T15:04:34.892502Z"
    }
   },
   "outputs": [],
   "source": "# Plot the preprocessed spectrum directly; no bin() call is made in this cell\nspec = MaldiSpectrum(\"../data/1s.txt\").preprocess()\n_ = plot_spectrum(spec, stage=\"preprocessed\")"
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Quality Assessment\n",
    "\n",
    "### Signal-to-Noise Ratio (SNR)\n",
    "\n",
    "Use `estimate_snr()` to assess spectrum quality. Higher SNR indicates better signal quality."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {
    "execution": {
     "iopub.execute_input": "2026-02-08T15:04:34.894729Z",
     "iopub.status.busy": "2026-02-08T15:04:34.894622Z",
     "iopub.status.idle": "2026-02-08T15:04:34.906353Z",
     "shell.execute_reply": "2026-02-08T15:04:34.905743Z"
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Signal-to-Noise Ratio: 67.2\n"
     ]
    }
   ],
   "source": [
    "# Estimate the SNR on a preprocessed spectrum and print it with one decimal place\n",
    "spec = MaldiSpectrum(\"../data/1s.txt\").preprocess()\n",
    "snr = estimate_snr(spec)\n",
    "print(f\"Signal-to-Noise Ratio: {snr:.1f}\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Comprehensive Quality Report\n",
    "\n",
    "Use `SpectrumQuality` for a comprehensive quality assessment."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {
    "execution": {
     "iopub.execute_input": "2026-02-08T15:04:34.908167Z",
     "iopub.status.busy": "2026-02-08T15:04:34.908062Z",
     "iopub.status.idle": "2026-02-08T15:04:34.913273Z",
     "shell.execute_reply": "2026-02-08T15:04:34.912940Z"
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "SNR: 67.2\n",
      "Total Ion Count: 1.00e+00\n",
      "Peak Count: 74\n",
      "Baseline Fraction: 46.26%\n",
      "Dynamic Range: 1.46\n"
     ]
    }
   ],
   "source": [
    "# Assess the preprocessed `spec` loaded in the previous cell\n",
    "qc = SpectrumQuality()\n",
    "report = qc.assess(spec)\n",
    "# Print each report field with a format suited to its scale\n",
    "print(f\"SNR: {report.snr:.1f}\")\n",
    "print(f\"Total Ion Count: {report.total_ion_count:.2e}\")\n",
    "print(f\"Peak Count: {report.peak_count}\")\n",
    "print(f\"Baseline Fraction: {report.baseline_fraction:.2%}\")\n",
    "print(f\"Dynamic Range: {report.dynamic_range:.2f}\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Binning Methods\n",
    "\n",
    "MaldiAMRKit supports multiple binning strategies: uniform (default), proportional, adaptive, and custom."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {
    "execution": {
     "iopub.execute_input": "2026-02-08T15:04:34.914758Z",
     "iopub.status.busy": "2026-02-08T15:04:34.914639Z",
     "iopub.status.idle": "2026-02-08T15:04:34.932822Z",
     "shell.execute_reply": "2026-02-08T15:04:34.932400Z"
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Uniform: 6000 bins\n",
      "   bin_index  bin_start  bin_end  bin_width\n",
      "0          0       2000     2003          3\n",
      "1          1       2003     2006          3\n",
      "2          2       2006     2009          3\n",
      "3          3       2009     2012          3\n",
      "4          4       2012     2015          3\n"
     ]
    }
   ],
   "source": [
    "# Start from a freshly preprocessed spectrum; the binning cells below re-bin this same `spec`\n",
    "spec = MaldiSpectrum(\"../data/1s.txt\").preprocess()\n",
    "\n",
    "# Uniform binning (default)\n",
    "spec.bin(bin_width=3)\n",
    "print(f\"Uniform: {len(spec.binned)} bins\")\n",
    "# bin_metadata lists one row per bin (index, start, end, width)\n",
    "print(spec.bin_metadata.head())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "execution": {
     "iopub.execute_input": "2026-02-08T15:04:34.934372Z",
     "iopub.status.busy": "2026-02-08T15:04:34.934264Z",
     "iopub.status.idle": "2026-02-08T15:04:34.942544Z",
     "shell.execute_reply": "2026-02-08T15:04:34.942053Z"
    }
   },
   "outputs": [],
   "source": [
    "# Proportional binning (bin width scales with m/z)\n",
    "spec.bin(bin_width=3, method=\"proportional\")\n",
    "print(f\"Proportional: {len(spec.binned)} bins\")\n",
    "# Compare the first and last bin_metadata rows to see how the width changes across the range\n",
    "print(f\"Width at start: {spec.bin_metadata.iloc[0]['bin_width']:.2f} Da\")\n",
    "print(f\"Width at end: {spec.bin_metadata.iloc[-1]['bin_width']:.2f} Da\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {
    "execution": {
     "iopub.execute_input": "2026-02-08T15:04:34.943795Z",
     "iopub.status.busy": "2026-02-08T15:04:34.943691Z",
     "iopub.status.idle": "2026-02-08T15:04:34.963292Z",
     "shell.execute_reply": "2026-02-08T15:04:34.962754Z"
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Adaptive: 3756 bins\n",
      "Min width: 1.00 Da\n",
      "Max width: 9.37 Da\n"
     ]
    }
   ],
   "source": [
    "# Adaptive binning (smaller bins in peak-dense regions)\n",
    "# No bin_width is passed; the printed min/max widths fall within the requested 1.0-10.0 Da limits\n",
    "spec.bin(method=\"adaptive\", adaptive_min_width=1.0, adaptive_max_width=10.0)\n",
    "print(f\"Adaptive: {len(spec.binned)} bins\")\n",
    "print(f\"Min width: {spec.bin_metadata['bin_width'].min():.2f} Da\")\n",
    "print(f\"Max width: {spec.bin_metadata['bin_width'].max():.2f} Da\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {
    "execution": {
     "iopub.execute_input": "2026-02-08T15:04:34.964630Z",
     "iopub.status.busy": "2026-02-08T15:04:34.964527Z",
     "iopub.status.idle": "2026-02-08T15:04:34.969690Z",
     "shell.execute_reply": "2026-02-08T15:04:34.969329Z"
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Custom: 49 bins\n",
      "   bin_index    bin_start      bin_end   bin_width\n",
      "0          0  2000.000000  2367.346939  367.346939\n",
      "1          1  2367.346939  2734.693878  367.346939\n",
      "2          2  2734.693878  3102.040816  367.346939\n",
      "3          3  3102.040816  3469.387755  367.346939\n",
      "4          4  3469.387755  3836.734694  367.346939\n"
     ]
    }
   ],
   "source": [
    "# Custom binning (user-defined edges)\n",
    "custom_edges = np.linspace(2000, 20000, 50)  # 50 evenly spaced edges -> 49 bins\n",
    "spec.bin(method=\"custom\", custom_edges=custom_edges)\n",
    "print(f\"Custom: {len(spec.binned)} bins\")\n",
    "print(spec.bin_metadata.head())"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Build a Dataset from Multiple Spectra\n",
    "\n",
    "Use `MaldiSet` to load and process multiple spectra with metadata."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {
    "execution": {
     "iopub.execute_input": "2026-02-08T15:04:34.971166Z",
     "iopub.status.busy": "2026-02-08T15:04:34.971064Z",
     "iopub.status.idle": "2026-02-08T15:04:35.470153Z",
     "shell.execute_reply": "2026-02-08T15:04:35.469504Z"
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Features shape: (29, 6000)\n",
      "Labels shape: (29, 1)\n"
     ]
    }
   ],
   "source": [
    "# Positional arguments: the directory of spectra, then the metadata CSV\n",
    "# aggregate_by presumably selects `Drug` as the label column (y below has a single `Drug` column) - confirm in the API docs\n",
    "data = MaldiSet.from_directory(\n",
    "    \"../data/\",\n",
    "    \"../data/metadata/metadata.csv\",\n",
    "    aggregate_by=dict(antibiotics=\"Drug\"),\n",
    ")\n",
    "X, y = data.X, data.y\n",
    "\n",
    "# X has one row per spectrum and one column per bin; y has one label column\n",
    "print(f\"Features shape: {X.shape}\")\n",
    "print(f\"Labels shape: {y.shape}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {
    "execution": {
     "iopub.execute_input": "2026-02-08T15:04:35.472364Z",
     "iopub.status.busy": "2026-02-08T15:04:35.472258Z",
     "iopub.status.idle": "2026-02-08T15:04:35.475495Z",
     "shell.execute_reply": "2026-02-08T15:04:35.475112Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Drug</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>10s</th>\n",
       "      <td>S</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>11s</th>\n",
       "      <td>R</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>12s</th>\n",
       "      <td>R</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>13s</th>\n",
       "      <td>S</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>14s</th>\n",
       "      <td>S</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "    Drug\n",
       "10s    S\n",
       "11s    R\n",
       "12s    R\n",
       "13s    S\n",
       "14s    S"
      ]
     },
     "execution_count": 13,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Peek at the first labels; the index holds sample IDs such as 10s, 11s\n",
    "y.head()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Pseudogel Visualization"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "execution": {
     "iopub.execute_input": "2026-02-08T15:04:35.476634Z",
     "iopub.status.busy": "2026-02-08T15:04:35.476544Z",
     "iopub.status.idle": "2026-02-08T15:04:35.628416Z",
     "shell.execute_reply": "2026-02-08T15:04:35.627821Z"
    }
   },
   "outputs": [],
   "source": "# Pseudogel of the whole dataset; assigning to `_` keeps the return value out of the cell output\n_ = plot_pseudogel(data)"
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "execution": {
     "iopub.execute_input": "2026-02-08T15:04:35.629856Z",
     "iopub.status.busy": "2026-02-08T15:04:35.629755Z",
     "iopub.status.idle": "2026-02-08T15:04:35.769357Z",
     "shell.execute_reply": "2026-02-08T15:04:35.768823Z"
    }
   },
   "outputs": [],
   "source": "# Restrict the plot to the listed (start, end) windows - presumably m/z ranges; confirm against the plot_pseudogel docs\n_ = plot_pseudogel(data, regions=[(2000, 3000), (6000, 7000)])"
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Label Encoding\n",
    "\n",
    "Use `LabelEncoder` to convert R/I/S clinical resistance labels to binary (0/1).\n",
    "The `intermediate` parameter controls how \"I\" (intermediate) labels are handled:\n",
    "\n",
    "- **`\"susceptible\"`** (default): treat I as 0 - conservative, avoids false resistance calls\n",
    "- **`\"resistant\"`**: treat I as 1 - stricter, flags uncertain isolates\n",
    "- **`\"drop\"`**: remove I samples entirely"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {
    "execution": {
     "iopub.execute_input": "2026-02-08T15:04:35.770731Z",
     "iopub.status.busy": "2026-02-08T15:04:35.770626Z",
     "iopub.status.idle": "2026-02-08T15:04:35.774187Z",
     "shell.execute_reply": "2026-02-08T15:04:35.773697Z"
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "intermediate='susceptible': [1 0 0 1 0 0]\n",
      "intermediate='resistant':  [1 0 1 1 0 1]\n",
      "intermediate='drop':       [1 0 1 0]\n",
      "\n",
      "Dataset labels: [0 1 1 0 0 0 0 0 0 0]\n",
      "Resistant: 10, Susceptible: 19\n"
     ]
    }
   ],
   "source": [
    "# Compare all three modes on labels with Intermediate\n",
    "labels = [\"R\", \"S\", \"I\", \"R\", \"S\", \"I\"]\n",
    "\n",
    "enc_s = LabelEncoder(intermediate=\"susceptible\")\n",
    "print(\"intermediate='susceptible':\", enc_s.fit_transform(labels))\n",
    "\n",
    "enc_r = LabelEncoder(intermediate=\"resistant\")\n",
    "print(\"intermediate='resistant': \", enc_r.fit_transform(labels))\n",
    "\n",
    "# With 'drop' the result is shorter than the input: the two I labels are removed\n",
    "enc_d = LabelEncoder(intermediate=\"drop\")\n",
    "print(\"intermediate='drop':      \", enc_d.fit_transform(labels))\n",
    "\n",
    "# Apply to dataset labels (R/S only here, so all modes give the same result)\n",
    "enc = LabelEncoder()\n",
    "y_binary = enc.fit_transform(y[\"Drug\"].values)\n",
    "print(f\"\\nDataset labels: {y_binary[:10]}\")\n",
    "# R is encoded as 1, so the sum counts resistant samples\n",
    "print(f\"Resistant: {y_binary.sum()}, Susceptible: {len(y_binary) - y_binary.sum()}\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Filtering Datasets\n",
    "\n",
    "MaldiAMRKit provides a composable filter system for subsetting a `MaldiSet`.\n",
    "Filters can be combined with `&` (and), `|` (or), and `~` (not) operators.\n",
    "\n",
    "Available filters:\n",
    "- **`SpeciesFilter`**: keep samples from specific species\n",
    "- **`DrugFilter`**: filter by antibiotic resistance status\n",
    "- **`QualityFilter`**: filter by SNR, peak count, or baseline fraction (requires enriched metadata)\n",
    "- **`MetadataFilter`**: filter by any metadata column with a custom condition"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "execution": {
     "iopub.execute_input": "2026-02-08T15:04:35.775316Z",
     "iopub.status.busy": "2026-02-08T15:04:35.775221Z",
     "iopub.status.idle": "2026-02-08T15:04:35.780039Z",
     "shell.execute_reply": "2026-02-08T15:04:35.779481Z"
    }
   },
   "outputs": [],
   "source": [
    "# SpeciesFilter - keep samples from specific species\n",
    "# filter() appears to return a new dataset: `data` is reused unfiltered throughout this cell\n",
    "f_species = SpeciesFilter(\"taxon\")\n",
    "filtered = data.filter(f_species)\n",
    "print(\n",
    "    f\"SpeciesFilter('taxon'): {len(filtered.spectra)} of {len(data.spectra)} spectra kept\"\n",
    ")\n",
    "\n",
    "# Filtering for a species not in the data removes all samples\n",
    "f_other = SpeciesFilter(\"Escherichia coli\")\n",
    "filtered_empty = data.filter(f_other)\n",
    "print(f\"SpeciesFilter('Escherichia coli'): {len(filtered_empty.spectra)} spectra kept\")\n",
    "\n",
    "# DrugFilter - filter by antibiotic resistance status\n",
    "f_drug = DrugFilter(\"Drug\", status=\"R\")\n",
    "resistant = data.filter(f_drug)\n",
    "print(f\"DrugFilter('Drug', status='R'): {len(resistant.spectra)} spectra kept\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "execution": {
     "iopub.execute_input": "2026-02-08T15:04:35.781054Z",
     "iopub.status.busy": "2026-02-08T15:04:35.780950Z",
     "iopub.status.idle": "2026-02-08T15:04:35.785311Z",
     "shell.execute_reply": "2026-02-08T15:04:35.784844Z"
    }
   },
   "outputs": [],
   "source": [
    "# MetadataFilter - filter by any metadata column\n",
    "# Takes a column name and a predicate; presumably the predicate is applied to each sample's value - confirm in the API docs\n",
    "f_resistant = MetadataFilter(\"Drug\", lambda v: v == \"R\")\n",
    "resistant_only = data.filter(f_resistant)\n",
    "print(f\"Resistant only: {len(resistant_only.spectra)} spectra\")\n",
    "\n",
    "f_susceptible = MetadataFilter(\"Drug\", lambda v: v == \"S\")\n",
    "susceptible_only = data.filter(f_susceptible)\n",
    "print(f\"Susceptible only: {len(susceptible_only.spectra)} spectra\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {
    "execution": {
     "iopub.execute_input": "2026-02-08T15:04:35.786304Z",
     "iopub.status.busy": "2026-02-08T15:04:35.786212Z",
     "iopub.status.idle": "2026-02-08T15:04:35.790636Z",
     "shell.execute_reply": "2026-02-08T15:04:35.790258Z"
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "taxon AND resistant: 10 spectra\n",
      "NOT resistant: 19 spectra\n",
      "\n",
      "Filter repr: (SpeciesFilter('taxon') & DrugFilter('Drug', status='R'))\n"
     ]
    }
   ],
   "source": [
    "# Combine filters with & (and), | (or), ~ (not)\n",
    "f_combined = SpeciesFilter(\"taxon\") & DrugFilter(\"Drug\", status=\"R\")\n",
    "result = data.filter(f_combined)\n",
    "print(f\"taxon AND resistant: {len(result.spectra)} spectra\")\n",
    "\n",
    "# NOT resistant (equivalent to susceptible here, because this dataset's labels are R/S only;\n",
    "# with Intermediate labels present, ~R would also keep the I samples)\n",
    "f_not_r = ~DrugFilter(\"Drug\", status=\"R\")\n",
    "result2 = data.filter(f_not_r)\n",
    "print(f\"NOT resistant: {len(result2.spectra)} spectra\")\n",
    "\n",
    "# Display the composed filter\n",
    "print(f\"\\nFilter repr: {f_combined}\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Custom Preprocessing Pipelines\n",
    "\n",
    "The default pipeline uses: clip negatives, sqrt transform, Savitzky-Golay\n",
    "smoothing, SNIP baseline correction, m/z trimming, and TIC normalization.\n",
    "\n",
    "You can build a custom pipeline by choosing from the available transformers:\n",
    "`ClipNegatives`, `SqrtTransform`, `LogTransform`, `SavitzkyGolaySmooth`,\n",
    "`SNIPBaseline`, `MzTrimmer`, `TICNormalizer`, `MedianNormalizer`,\n",
    "`PQNNormalizer`, `MzMultiTrimmer`."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {
    "execution": {
     "iopub.execute_input": "2026-02-08T15:04:35.791553Z",
     "iopub.status.busy": "2026-02-08T15:04:35.791461Z",
     "iopub.status.idle": "2026-02-08T15:04:35.794244Z",
     "shell.execute_reply": "2026-02-08T15:04:35.793857Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "PreprocessingPipeline([\n",
       "  ('clip', ClipNegatives()),\n",
       "  ('log', LogTransform()),\n",
       "  ('smooth', SavitzkyGolaySmooth(window_length=15, polyorder=3)),\n",
       "  ('baseline', SNIPBaseline(half_window=30)),\n",
       "  ('trim', MzTrimmer(mz_min=3000, mz_max=15000)),\n",
       "  ('normalize', MedianNormalizer())\n",
       "])"
      ]
     },
     "execution_count": 20,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Build a custom pipeline with different parameters\n",
    "# Each step is a (step_name, transformer) pair; the trailing comments note the difference from the default pipeline\n",
    "custom_pipe = PreprocessingPipeline(\n",
    "    [\n",
    "        (\"clip\", ClipNegatives()),\n",
    "        (\"log\", LogTransform()),  # log1p instead of sqrt\n",
    "        (\n",
    "            \"smooth\",\n",
    "            SavitzkyGolaySmooth(window_length=15, polyorder=3),\n",
    "        ),  # different smoothing\n",
    "        (\"baseline\", SNIPBaseline(half_window=30)),  # narrower baseline window\n",
    "        (\"trim\", MzTrimmer(mz_min=3000, mz_max=15000)),  # narrower m/z range\n",
    "        (\"normalize\", MedianNormalizer()),  # median instead of TIC\n",
    "    ]\n",
    ")\n",
    "custom_pipe"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {
    "execution": {
     "iopub.execute_input": "2026-02-08T15:04:35.795408Z",
     "iopub.status.busy": "2026-02-08T15:04:35.795315Z",
     "iopub.status.idle": "2026-02-08T15:04:35.822298Z",
     "shell.execute_reply": "2026-02-08T15:04:35.821717Z"
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Default pipeline: 6000 bins, m/z range 2000-20000\n",
      "Custom pipeline:  4000 bins, m/z range 3000-15000\n"
     ]
    }
   ],
   "source": [
    "# Compare default vs custom pipeline on a single spectrum\n",
    "spec_default = MaldiSpectrum(\"../data/1s.txt\").preprocess().bin(3)\n",
    "spec_custom = MaldiSpectrum(\"../data/1s.txt\", pipeline=custom_pipe).preprocess().bin(3)\n",
    "\n",
    "# NOTE: the m/z ranges in these messages are hard-coded to mirror each pipeline's MzTrimmer step;\n",
    "# update them by hand if the trim bounds change\n",
    "print(f\"Default pipeline: {len(spec_default.binned)} bins, m/z range 2000-20000\")\n",
    "print(f\"Custom pipeline:  {len(spec_custom.binned)} bins, m/z range 3000-15000\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "metadata": {
    "execution": {
     "iopub.execute_input": "2026-02-08T15:04:35.823596Z",
     "iopub.status.busy": "2026-02-08T15:04:35.823493Z",
     "iopub.status.idle": "2026-02-08T15:04:36.235340Z",
     "shell.execute_reply": "2026-02-08T15:04:36.234798Z"
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Custom pipeline features shape: (29, 4000)\n",
      "Default pipeline features shape: (29, 6000)\n"
     ]
    }
   ],
   "source": [
    "# Use a custom pipeline with MaldiSet\n",
    "# Same arguments as the earlier from_directory call, plus pipeline=custom_pipe\n",
    "data_custom = MaldiSet.from_directory(\n",
    "    \"../data/\",\n",
    "    \"../data/metadata/metadata.csv\",\n",
    "    aggregate_by=dict(antibiotics=\"Drug\"),\n",
    "    pipeline=custom_pipe,\n",
    ")\n",
    "# The narrower trim range of the custom pipeline yields fewer feature columns\n",
    "print(f\"Custom pipeline features shape: {data_custom.X.shape}\")\n",
    "print(f\"Default pipeline features shape: {data.X.shape}\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Pipeline Serialization\n",
    "\n",
    "Save and load pipeline configurations for reproducibility. Supports JSON, YAML,\n",
    "and Python dict formats."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "metadata": {
    "execution": {
     "iopub.execute_input": "2026-02-08T15:04:36.236641Z",
     "iopub.status.busy": "2026-02-08T15:04:36.236540Z",
     "iopub.status.idle": "2026-02-08T15:04:36.239235Z",
     "shell.execute_reply": "2026-02-08T15:04:36.238753Z"
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Pipeline as dict:\n",
      "  clip: ClipNegatives\n",
      "  log: LogTransform\n",
      "  smooth: SavitzkyGolaySmooth  {'window_length': 15, 'polyorder': 3}\n",
      "  baseline: SNIPBaseline  {'half_window': 30}\n",
      "  trim: MzTrimmer  {'mz_min': 3000, 'mz_max': 15000}\n",
      "  normalize: MedianNormalizer\n"
     ]
    }
   ],
   "source": [
    "# Serialize the pipeline to a plain dictionary and list each step on one line\n",
    "d = custom_pipe.to_dict()\n",
    "print(\"Pipeline as dict:\")\n",
    "for step in d[\"steps\"]:\n",
    "    # Everything besides the two identifying keys is shown as the step's parameters\n",
    "    extra = {key: val for key, val in step.items() if key not in {\"step_name\", \"name\"}}\n",
    "    suffix = f\"  {extra}\" if extra else \"\"\n",
    "    print(f\"  {step['step_name']}: {step['name']}{suffix}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "metadata": {
    "execution": {
     "iopub.execute_input": "2026-02-08T15:04:36.240101Z",
     "iopub.status.busy": "2026-02-08T15:04:36.240009Z",
     "iopub.status.idle": "2026-02-08T15:04:36.242666Z",
     "shell.execute_reply": "2026-02-08T15:04:36.242185Z"
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "{'steps': [{'name': 'ClipNegatives', 'step_name': 'clip'},\n",
      "           {'name': 'LogTransform', 'step_name': 'log'},\n",
      "           {'name': 'SavitzkyGolaySmooth',\n",
      "            'polyorder': 3,\n",
      "            'step_name': 'smooth',\n",
      "            'window_length': 15},\n",
      "           {'half_window': 30, 'name': 'SNIPBaseline', 'step_name': 'baseline'},\n",
      "           {'mz_max': 15000,\n",
      "            'mz_min': 3000,\n",
      "            'name': 'MzTrimmer',\n",
      "            'step_name': 'trim'},\n",
      "           {'name': 'MedianNormalizer', 'step_name': 'normalize'}]}\n",
      "\n",
      "Reloaded pipeline steps: ['clip', 'log', 'smooth', 'baseline', 'trim', 'normalize']\n",
      "Same m/z range: True\n"
     ]
    }
   ],
   "source": [
    "# Save the pipeline configuration to JSON (written to the current working directory)\n",
    "custom_pipe.to_json(\"my_pipeline.json\")\n",
    "\n",
    "# Reload it once and inspect the round-tripped configuration\n",
    "reloaded = PreprocessingPipeline.from_json(\"my_pipeline.json\")\n",
    "pprint(reloaded.to_dict())\n",
    "\n",
    "# The reloaded pipeline should expose the same steps and m/z range as the original\n",
    "print(f\"\\nReloaded pipeline steps: {reloaded.step_names}\")\n",
    "print(f\"Same m/z range: {reloaded.mz_range == custom_pipe.mz_range}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "metadata": {
    "execution": {
     "iopub.execute_input": "2026-02-08T15:04:36.243474Z",
     "iopub.status.busy": "2026-02-08T15:04:36.243385Z",
     "iopub.status.idle": "2026-02-08T15:04:36.253802Z",
     "shell.execute_reply": "2026-02-08T15:04:36.253335Z"
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "YAML-reloaded steps: ['clip', 'log', 'smooth', 'baseline', 'trim', 'normalize']\n"
     ]
    }
   ],
   "source": [
    "# Save to YAML and reload\n",
    "# The file is written to the current working directory and removed by the clean-up cell below\n",
    "custom_pipe.to_yaml(\"my_pipeline.yaml\")\n",
    "reloaded_yaml = PreprocessingPipeline.from_yaml(\"my_pipeline.yaml\")\n",
    "print(f\"YAML-reloaded steps: {reloaded_yaml.step_names}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "metadata": {
    "execution": {
     "iopub.execute_input": "2026-02-08T15:04:36.254788Z",
     "iopub.status.busy": "2026-02-08T15:04:36.254698Z",
     "iopub.status.idle": "2026-02-08T15:04:36.256440Z",
     "shell.execute_reply": "2026-02-08T15:04:36.256072Z"
    }
   },
   "outputs": [],
   "source": [
    "# Clean up the temporary files written by the serialization cells.\n",
    "# missing_ok=True keeps this cell safe to re-run, or to run after an earlier cell failed\n",
    "for tmp_name in (\"my_pipeline.json\", \"my_pipeline.yaml\"):\n",
    "    Path(tmp_name).unlink(missing_ok=True)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "maldiamrkit (3.10.12)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.12"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}