From 19bdb99faba983499b7a40819692855e89d688e1 Mon Sep 17 00:00:00 2001 From: alex-l-kong Date: Thu, 16 Nov 2023 15:20:39 -0800 Subject: [PATCH 01/28] Add internal functionality for segmenting out glycan cores --- conftest.py | 68 ++++++++++++++++++++++++++- src/maldi_tools/extraction.py | 87 +++++++++++++++++++++++++++++++++++ tests/extraction_test.py | 31 +++++++++++-- 3 files changed, 182 insertions(+), 4 deletions(-) diff --git a/conftest.py b/conftest.py index 00f7530..d68ca4b 100644 --- a/conftest.py +++ b/conftest.py @@ -1,11 +1,14 @@ """Shared Fixtures for tests.""" from pathlib import Path -from typing import Generator +from typing import Generator, List +import json import numpy as np import pandas as pd import pytest +import random +import skimage.io as io import xarray as xr from pyimzml.ImzMLParser import ImzMLParser from pyimzml.ImzMLWriter import ImzMLWriter @@ -114,3 +117,66 @@ def image_xr(rng: np.random.Generator, library: pd.DataFrame) -> Generator[xr.Da dims=["peak", "x", "y"], ) yield img_xr + + +@pytest.fixture(scope="session") +def glycan_img_path(tmp_path_factory: TempPathFactory, imz_data: ImzMLParser, + rng: np.random.Generator): + coords: np.ndarray = np.array([coord[:2] for coord in imz_data.coordinates]) + + glycan_img: np.ndarray = np.zeros((10, 10)) + glycan_img[coords[:, 1] - 1, coords[:, 0] - 1] = rng.random(coords.shape[0]) + + glycan_img_file: Path = tmp_path_factory.mktemp("glycan_imgs") / "glycan_img.tiff" + io.imsave(glycan_img_file, glycan_img) + + yield glycan_img_file + + +@pytest.fixture(scope="session") +def poslog_path(tmp_path_factory: TempPathFactory, imz_data: ImzMLParser): + columns_write: List[str] = ["Date", "Time", "Region", "PosX", "PosY", "X", "Y", "Z"] + poslog_data: pd.DataFrame = pd.DataFrame( + np.random.rand(len(imz_data.coordinates) + 2, len(columns_write)), + columns=columns_write + ) + coords: np.ndarray = np.array([coord[:2] for coord in imz_data.coordinates]) + + poslog_regions: List[str] = [] + for i in np.arange(2): + poslog_regions.append("__") + poslog_regions.extend([f"R{i}XY"] * 50) + poslog_data["Region"] = poslog_regions + + poslog_file: Path = tmp_path_factory.mktemp("poslogs") / "poslog.txt" + poslog_data.to_csv( + poslog_file, header=None, index=False, sep=" ", mode='w', columns=columns_write + ) + + yield poslog_file + + +@pytest.fixture(scope="session") +def centroid_path(tmp_path_factory: TempPathFactory, imz_data: ImzMLParser): + coords: np.ndarray = np.array([coord[:2] for coord in imz_data.coordinates]) + center_coord_indices: np.ndarray = np.arange(25, coords.shape[0], 50) + + centroid_data = {} + centroid_data["exportDateTime"] = None + centroid_data["fovs"] = [] + for i, cci in enumerate(center_coord_indices): + center_coord = coords[cci, :] + center_point_data = { + "name": f"Region{i}", + "centerPointPixels": { + "x": center_coord[0].item(), + "y": center_coord[1].item() + } + } + centroid_data["fovs"].append(center_point_data) + + centroid_file: Path = tmp_path_factory.mktemp("centroids") / "centroids.json" + with open(centroid_file, "w") as outfile: + outfile.write(json.dumps(centroid_data)) + + yield centroid_file diff --git a/src/maldi_tools/extraction.py b/src/maldi_tools/extraction.py index cab87dd..534403f 100644 --- a/src/maldi_tools/extraction.py +++ b/src/maldi_tools/extraction.py @@ -6,6 +6,7 @@ """ +import json import os from functools import partial from operator import itemgetter @@ -17,8 +18,10 @@ import xarray as xr from pyimzml.ImzMLParser import ImzMLParser from scipy import signal +from skimage.io import imread, imsave from tqdm.notebook import tqdm +from alpineer.io_utils import validate_paths from maldi_tools import plotting @@ -337,3 +340,87 @@ def library_matching( peak_df.to_csv(lib_matched_dir / "matched_peaks.csv") return peak_df + + +def generate_glycan_mask( + imz_data: ImzMLParser, + glycan_img_path: Path, +): + """Generate a mask for a provided glycan image input/ + + Args: + --- + imz_data (ImzMLParser): The imzML object, needed for coordinate identification. + glycan_img_path (Path): The path to the glycan image .tiff, needed to create the base mask. + + Returns: + ------- + np.ndarray: + The binary segmentation mask of the glycan image + """ + validate_paths([glycan_img_path]) + + glycan_img = imread(glycan_img_path) + glycan_mask = np.zeros(glycan_img.shape) + + coords = np.array([coord[:2] for coord in imz_data.coordinates]) + glycan_mask[coords[:, 1] - 1, coords[:, 0] - 1] = 255 + + return glycan_mask + + +def map_coordinates_to_core_name( + imz_data: ImzMLParser, + centroid_path: Path, + poslog_path: Path, +): + """Maps each scanned coordinate on a slide to their respective core name (created by TSAI tiler) + + Args: + --- + imz_data (ImzMLParser): The imzML object, needed for coordinate identification. + centroid_path (Path): A JSON file mapping each core name to their respective centroid. + Generated by the TSAI tiler. + poslog_path (Path): A .txt file listing all the coordinates scanned, + needed to map coordinates to their respective core. + + Returns: + ------- + pd.DataFrame: + Maps each coordinate to their core + """ + validate_paths([centroid_path, poslog_path]) + + coords = np.array([coord[:2] for coord in imz_data.coordinates]) + region_core_info = pd.read_csv( + poslog_path, + delimiter=" ", + names=["Date", "Time", "Region", "PosX", "PosY", "X", "Y", "Z"], + index_col=False, + skiprows=1 + ) + region_core_info = region_core_info[region_core_info["Region"] != "__"].copy() + + region_core_info = region_core_info[["Date", "Time", "Region", "X", "Y"]] + region_core_info["Region"] = region_core_info["Region"].str.extract(r"^(.*?)(?=X)") + region_core_info[["X", "Y"]] = coords + + with open(centroid_path, "r") as infile: + centroid_data = json.load(infile) + + core_region_mapping = {} + for core in centroid_data["fovs"]: + center_point = core["centerPointPixels"] + region_match = region_core_info.loc[ + ( + region_core_info["X"] == center_point["x"] + ) & + ( + region_core_info["Y"] == center_point["y"] + ), + "Region" + ].values[0] + core_region_mapping[region_match] = core["name"] + + region_core_info["Core"] = region_core_info["Region"].map(core_region_mapping) + return region_core_info diff --git a/tests/extraction_test.py b/tests/extraction_test.py index c1571cd..aed0ebe 100644 --- a/tests/extraction_test.py +++ b/tests/extraction_test.py @@ -29,7 +29,8 @@ def test_rolling_window(total_mass_df: pd.DataFrame) -> None: intensity_percentile: int = 99 window_setting: int = 10 log_intensities, log_int_percentile = extraction.rolling_window( - total_mass_df=total_mass_df, intensity_percentile=intensity_percentile, window_size=window_setting + total_mass_df=total_mass_df, intensity_percentile=intensity_percentile, + window_size=window_setting ) assert log_intensities.shape == (10000,) assert log_int_percentile.shape == (10000,) @@ -51,7 +52,10 @@ def test_signal_extraction( assert np.all(peak_candidates[1:] >= peak_candidates[:-1]) -def test_get_peak_widths(total_mass_df: pd.DataFrame, peak_idx_candidates: tuple[np.ndarray, np.ndarray]): +def test_get_peak_widths( + total_mass_df: pd.DataFrame, + peak_idx_candidates: tuple[np.ndarray, np.ndarray] +): peak_candidate_idxs, peak_candidates = peak_idx_candidates peak_df, l_ips_r, r_ips_r, peak_widths_height = extraction.get_peak_widths( total_mass_df=total_mass_df, @@ -119,7 +123,8 @@ def test__matching_vec(library: pd.DataFrame, obs_mz: int, true_values: pd.Serie @pytest.mark.parametrize(argnames="_ppm", argvalues=[99]) -def test_library_matching(image_xr: xr.DataArray, library: pd.DataFrame, _ppm: int, tmp_path: pathlib.Path): +def test_library_matching(image_xr: xr.DataArray, library: pd.DataFrame, _ppm: int, + tmp_path: pathlib.Path): extraction_dir = tmp_path / "extraction_dir" extraction_dir.mkdir(parents=True, exist_ok=True) @@ -137,3 +142,23 @@ def test_library_matching(image_xr: xr.DataArray, library: pd.DataFrame, _ppm: i assert row.mass_error == 0 assert row.composition in {"A", "B"} assert row.peak in {30, 45} + + +def test_generate_glycan_mask(imz_data: ImzMLParser, glycan_img_path: pathlib.Path): + glycan_mask: np.ndarray = extraction.generate_glycan_mask(imz_data, glycan_img_path) + coords: np.ndarray = np.array([coord[:2] for coord in imz_data.coordinates]) + assert np.all(glycan_mask[coords[:, 1] - 1, coords[:, 0] - 1] == 255) + + +def test_map_coordinates_to_core_name(imz_data: ImzMLParser, centroid_path: pathlib.Path, + poslog_path: pathlib.Path): + region_core_info: pd.DataFrame = extraction.map_coordinates_to_core_name( + imz_data, centroid_path, poslog_path + ) + + region_core_mapping = region_core_info[["Region", "Core"]].drop_duplicates() + region_core_dict = region_core_mapping.set_index("Region")["Core"].to_dict() + + assert region_core_dict["R0"] == "Region0" + assert region_core_dict["R1"] == "Region1" + assert len(region_core_dict) == 2 From f4b33cf1a0dcbff3d75b1da2c9f756dba9e83229 Mon Sep 17 00:00:00 2001 From: alex-l-kong Date: Fri, 17 Nov 2023 10:50:12 -0800 Subject: [PATCH 02/28] Format using pre-commit hooks --- conftest.py | 22 +++++++--------------- src/maldi_tools/extraction.py | 19 +++++++------------ tests/extraction_test.py | 16 ++++++---------- 3 files changed, 20 insertions(+), 37 deletions(-) diff --git a/conftest.py b/conftest.py index d68ca4b..8874408 100644 --- a/conftest.py +++ b/conftest.py @@ -1,13 +1,12 @@ """Shared Fixtures for tests.""" +import json from pathlib import Path from typing import Generator, List -import json import numpy as np import pandas as pd import pytest -import random import skimage.io as io import xarray as xr from pyimzml.ImzMLParser import ImzMLParser @@ -120,8 +119,7 @@ def image_xr(rng: np.random.Generator, library: pd.DataFrame) -> Generator[xr.Da @pytest.fixture(scope="session") -def glycan_img_path(tmp_path_factory: TempPathFactory, imz_data: ImzMLParser, - rng: np.random.Generator): +def glycan_img_path(tmp_path_factory: TempPathFactory, imz_data: ImzMLParser, rng: np.random.Generator): coords: np.ndarray = np.array([coord[:2] for coord in imz_data.coordinates]) glycan_img: np.ndarray = np.zeros((10, 10)) @@ -137,10 +135,9 @@ def glycan_img_path(tmp_path_factory: TempPathFactory, imz_data: ImzMLParser, def poslog_path(tmp_path_factory: TempPathFactory, imz_data: ImzMLParser): columns_write: List[str] = ["Date", "Time", "Region", "PosX", "PosY", "X", "Y", "Z"] poslog_data: pd.DataFrame = pd.DataFrame( - np.random.rand(len(imz_data.coordinates) + 2, len(columns_write)), - columns=columns_write + np.random.rand(len(imz_data.coordinates) + 2, len(columns_write)), columns=columns_write ) - coords: np.ndarray = np.array([coord[:2] for coord in imz_data.coordinates]) + np.array([coord[:2] for coord in imz_data.coordinates]) poslog_regions: List[str] = [] for i in np.arange(2): @@ -149,9 +146,7 @@ def poslog_path(tmp_path_factory: TempPathFactory, imz_data: ImzMLParser): poslog_data["Region"] = poslog_regions poslog_file: Path = tmp_path_factory.mktemp("poslogs") / "poslog.txt" - poslog_data.to_csv( - poslog_file, header=None, index=False, sep=" ", mode='w', columns=columns_write - ) + poslog_data.to_csv(poslog_file, header=None, index=False, sep=" ", mode="w", columns=columns_write) yield poslog_file @@ -161,17 +156,14 @@ def centroid_path(tmp_path_factory: TempPathFactory, imz_data: ImzMLParser): coords: np.ndarray = np.array([coord[:2] for coord in imz_data.coordinates]) center_coord_indices: np.ndarray = np.arange(25, coords.shape[0], 50) - centroid_data = {} + centroid_data: dict = {} centroid_data["exportDateTime"] = None centroid_data["fovs"] = [] for i, cci in enumerate(center_coord_indices): center_coord = coords[cci, :] center_point_data = { "name": f"Region{i}", - "centerPointPixels": { - "x": center_coord[0].item(), - "y": center_coord[1].item() - } + "centerPointPixels": {"x": center_coord[0].item(), "y": center_coord[1].item()}, } centroid_data["fovs"].append(center_point_data) diff --git a/src/maldi_tools/extraction.py b/src/maldi_tools/extraction.py index 534403f..7731a73 100644 --- a/src/maldi_tools/extraction.py +++ b/src/maldi_tools/extraction.py @@ -16,12 +16,12 @@ import numpy as np import pandas as pd import xarray as xr +from alpineer.io_utils import validate_paths from pyimzml.ImzMLParser import ImzMLParser from scipy import signal -from skimage.io import imread, imsave +from skimage.io import imread from tqdm.notebook import tqdm -from alpineer.io_utils import validate_paths from maldi_tools import plotting @@ -346,7 +346,7 @@ def generate_glycan_mask( imz_data: ImzMLParser, glycan_img_path: Path, ): - """Generate a mask for a provided glycan image input/ + """Generate a mask for a provided glycan image input/. Args: --- @@ -374,7 +374,7 @@ def map_coordinates_to_core_name( centroid_path: Path, poslog_path: Path, ): - """Maps each scanned coordinate on a slide to their respective core name (created by TSAI tiler) + """Maps each scanned coordinate on a slide to their respective core name (created by TSAI tiler). Args: --- @@ -397,7 +397,7 @@ def map_coordinates_to_core_name( delimiter=" ", names=["Date", "Time", "Region", "PosX", "PosY", "X", "Y", "Z"], index_col=False, - skiprows=1 + skiprows=1, ) region_core_info = region_core_info[region_core_info["Region"] != "__"].copy() @@ -412,13 +412,8 @@ def map_coordinates_to_core_name( for core in centroid_data["fovs"]: center_point = core["centerPointPixels"] region_match = region_core_info.loc[ - ( - region_core_info["X"] == center_point["x"] - ) & - ( - region_core_info["Y"] == center_point["y"] - ), - "Region" + (region_core_info["X"] == center_point["x"]) & (region_core_info["Y"] == center_point["y"]), + "Region", ].values[0] core_region_mapping[region_match] = core["name"] diff --git a/tests/extraction_test.py b/tests/extraction_test.py index aed0ebe..b1812ea 100644 --- a/tests/extraction_test.py +++ b/tests/extraction_test.py @@ -29,8 +29,7 @@ def test_rolling_window(total_mass_df: pd.DataFrame) -> None: intensity_percentile: int = 99 window_setting: int = 10 log_intensities, log_int_percentile = extraction.rolling_window( - total_mass_df=total_mass_df, intensity_percentile=intensity_percentile, - window_size=window_setting + total_mass_df=total_mass_df, intensity_percentile=intensity_percentile, window_size=window_setting ) assert log_intensities.shape == (10000,) assert log_int_percentile.shape == (10000,) @@ -52,10 +51,7 @@ def test_signal_extraction( assert np.all(peak_candidates[1:] >= peak_candidates[:-1]) -def test_get_peak_widths( - total_mass_df: pd.DataFrame, - peak_idx_candidates: tuple[np.ndarray, np.ndarray] -): +def test_get_peak_widths(total_mass_df: pd.DataFrame, peak_idx_candidates: tuple[np.ndarray, np.ndarray]): peak_candidate_idxs, peak_candidates = peak_idx_candidates peak_df, l_ips_r, r_ips_r, peak_widths_height = extraction.get_peak_widths( total_mass_df=total_mass_df, @@ -123,8 +119,7 @@ def test__matching_vec(library: pd.DataFrame, obs_mz: int, true_values: pd.Serie @pytest.mark.parametrize(argnames="_ppm", argvalues=[99]) -def test_library_matching(image_xr: xr.DataArray, library: pd.DataFrame, _ppm: int, - tmp_path: pathlib.Path): +def test_library_matching(image_xr: xr.DataArray, library: pd.DataFrame, _ppm: int, tmp_path: pathlib.Path): extraction_dir = tmp_path / "extraction_dir" extraction_dir.mkdir(parents=True, exist_ok=True) @@ -150,8 +145,9 @@ def test_generate_glycan_mask(imz_data: ImzMLParser, glycan_img_path: pathlib.Pa assert np.all(glycan_mask[coords[:, 1] - 1, coords[:, 0] - 1] == 255) -def test_map_coordinates_to_core_name(imz_data: ImzMLParser, centroid_path: pathlib.Path, - poslog_path: pathlib.Path): +def test_map_coordinates_to_core_name( + imz_data: ImzMLParser, centroid_path: pathlib.Path, poslog_path: pathlib.Path +): region_core_info: pd.DataFrame = extraction.map_coordinates_to_core_name( imz_data, centroid_path, poslog_path ) From 2f98ec53213949e0c7e3472785e6202378134817 Mon Sep 17 00:00:00 2001 From: alex-l-kong Date: Fri, 17 Nov 2023 12:30:42 -0800 Subject: [PATCH 03/28] Include core cropping process in Jupyter notebook, and update glycan mask function to support individual core cropping --- src/maldi_tools/extraction.py | 57 +++++++++++++++---------------- templates/maldi-pipeline.ipynb | 61 ++++++++++++++++++++++++++++++++-- tests/extraction_test.py | 34 ++++++++++++++----- 3 files changed, 114 insertions(+), 38 deletions(-) diff --git a/src/maldi_tools/extraction.py b/src/maldi_tools/extraction.py index 7731a73..8affc46 100644 --- a/src/maldi_tools/extraction.py +++ b/src/maldi_tools/extraction.py @@ -11,7 +11,7 @@ from functools import partial from operator import itemgetter from pathlib import Path -from typing import Dict, Tuple +from typing import Dict, List, Tuple import numpy as np import pandas as pd @@ -342,33 +342,6 @@ def library_matching( return peak_df -def generate_glycan_mask( - imz_data: ImzMLParser, - glycan_img_path: Path, -): - """Generate a mask for a provided glycan image input/. - - Args: - --- - imz_data (ImzMLParser): The imzML object, needed for coordinate identification. - glycan_img_path (Path): The path to the glycan image .tiff, needed to create the base mask. - - Returns: - ------- - np.ndarray: - The binary segmentation mask of the glycan image - """ - validate_paths([glycan_img_path]) - - glycan_img = imread(glycan_img_path) - glycan_mask = np.zeros(glycan_img.shape) - - coords = np.array([coord[:2] for coord in imz_data.coordinates]) - glycan_mask[coords[:, 1] - 1, coords[:, 0] - 1] = 255 - - return glycan_mask - - def map_coordinates_to_core_name( imz_data: ImzMLParser, centroid_path: Path, @@ -419,3 +392,31 @@ def map_coordinates_to_core_name( region_core_info["Core"] = region_core_info["Region"].map(core_region_mapping) return region_core_info + + +def generate_glycan_mask( + imz_data: ImzMLParser, glycan_img_path: Path, region_core_info: pd.DataFrame, cores_to_crop: List[str] +): + """Generate a mask for the specified cores, provided a glycan image input. + + Args: + --- + imz_data (ImzMLParser): The imzML object, needed for coordinate identification. + glycan_img_path (Path): The path to the glycan image .tiff, needed to create the base mask. + region_core_info (pd.DataFrame): Defines the coordinates associated with each FOV. + cores_to_crop (List[str]): Which cores to segment out. + + Returns: + ------- + np.ndarray: + The binary segmentation mask of the glycan image + """ + validate_paths([glycan_img_path]) + + glycan_img = imread(glycan_img_path) + glycan_mask = np.zeros(glycan_img.shape) + + coords = region_core_info.loc[region_core_info["Core"].isin(cores_to_crop), ["X", "Y"]].values + glycan_mask[coords[:, 1] - 1, coords[:, 0] - 1] = 255 + + return glycan_mask diff --git a/templates/maldi-pipeline.ipynb b/templates/maldi-pipeline.ipynb index 3bffb46..3d5feee 100644 --- a/templates/maldi-pipeline.ipynb +++ b/templates/maldi-pipeline.ipynb @@ -574,12 +574,69 @@ ")" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Core Cropping" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "After glycan matching, each core on the TMA should be appropriately named by the TSAI MALDI tiler. **Ensure that this step is completed before running the following section.**\n", + "\n", + "To extract FOV-level statistics, a mask will be generated to segment out individual cores on a TMA. This section first maps each acquired coordinate on the slide to their core as defined by the TSAI MALDI tiler, then generates a mask for specific cores on the TMA." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# TSAI MALDI tiler output, contains name of each core mapped to respective centroid\n", + "centroid_path = \"path/to/centroids.json\"\n", + "\n", + "# contains all coordinates in the order of acquisition\n", + "poslog_path = \"path/to/poslog.txt\"\n", + "\n", + "# define path to one glycan image, needed to find dimensions of mask\n", + "glycan_img_path = \"path/to/glycan_img.tiff\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# map coordinates to core names\n", + "region_core_info = extraction.map_coordinates_to_core_name(\n", + " imz_data=imz_data,\n", + " centroid_path=centroid_path,\n", + " poslog_path=poslog_path\n", + ")" + ] + }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], - "source": [] + "source": [ + "# define which core(s) you want to segment out\n", + "cores_to_crop = [\"FOV1\", \"FOV2\"]\n", + "\n", + "# extract a binary mask with just the cores specified\n", + "core_cropping_mask = extraction.generate_glycan_mask(\n", + " imz_data=imz_data,\n", + " glycan_img_path=glycan_img_path,\n", + " region_core_info=region_core_info,\n", + " cores_to_crop=cores_to_crop\n", + ")" + ] } ], "metadata": { @@ -598,7 +655,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.11.1" + "version": "3.11.6" }, "vscode": { "interpreter": { diff --git a/tests/extraction_test.py b/tests/extraction_test.py index b1812ea..c4632fb 100644 --- a/tests/extraction_test.py +++ b/tests/extraction_test.py @@ -2,6 +2,7 @@ import os import pathlib +from typing import List import numpy as np import pandas as pd @@ -139,12 +140,6 @@ def test_library_matching(image_xr: xr.DataArray, library: pd.DataFrame, _ppm: i assert row.peak in {30, 45} -def test_generate_glycan_mask(imz_data: ImzMLParser, glycan_img_path: pathlib.Path): - glycan_mask: np.ndarray = extraction.generate_glycan_mask(imz_data, glycan_img_path) - coords: np.ndarray = np.array([coord[:2] for coord in imz_data.coordinates]) - assert np.all(glycan_mask[coords[:, 1] - 1, coords[:, 0] - 1] == 255) - - def test_map_coordinates_to_core_name( imz_data: ImzMLParser, centroid_path: pathlib.Path, poslog_path: pathlib.Path ): @@ -152,9 +147,32 @@ def test_map_coordinates_to_core_name( imz_data, centroid_path, poslog_path ) - region_core_mapping = region_core_info[["Region", "Core"]].drop_duplicates() - region_core_dict = region_core_mapping.set_index("Region")["Core"].to_dict() + region_core_mapping: pd.DataFrame = region_core_info[["Region", "Core"]].drop_duplicates() + region_core_dict: dict = region_core_mapping.set_index("Region")["Core"].to_dict() assert region_core_dict["R0"] == "Region0" assert region_core_dict["R1"] == "Region1" assert len(region_core_dict) == 2 + + +def test_generate_glycan_mask( + imz_data: ImzMLParser, + glycan_img_path: pathlib.Path, + centroid_path: pathlib.Path, + poslog_path: pathlib.Path, +): + region_core_info: pd.DataFrame = extraction.map_coordinates_to_core_name( + imz_data, centroid_path, poslog_path + ) + core_names: List[str] = list(region_core_info.keys()) + + glycan_mask: np.ndarray = extraction.generate_glycan_mask( + imz_data, glycan_img_path, region_core_info, [core_names[0]] + ) + coords = region_core_info.loc[region_core_info["Core"] == core_names[0], ["X", "Y"]].values + assert np.all(glycan_mask[coords[:, 1] - 1, coords[:, 0] - 1] == 255) + + non_hit_mask: np.ndarray = np.ones(glycan_mask.shape, dtype=bool) + for coord in coords: + non_hit_mask &= ~np.all(glycan_mask == coord, axis=1) + assert np.all(glycan_mask[non_hit_mask] == 0) From d863f9aee450979fa45cd0ca8fb482374ceaa04e Mon Sep 17 00:00:00 2001 From: alex-l-kong Date: Mon, 20 Nov 2023 11:45:53 -0800 Subject: [PATCH 04/28] Add commented line that user can run to crop out all cores on a TMA slide --- templates/maldi-pipeline.ipynb | 35 +++++++--------------------------- 1 file changed, 7 insertions(+), 28 deletions(-) diff --git a/templates/maldi-pipeline.ipynb b/templates/maldi-pipeline.ipynb index 3d5feee..9610ea6 100644 --- a/templates/maldi-pipeline.ipynb +++ b/templates/maldi-pipeline.ipynb @@ -336,9 +336,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "tags": [] - }, + "metadata": {}, "outputs": [], "source": [ "print(f\"Candiate Peak Count: {len(peak_candidates)}\")" @@ -393,25 +391,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "peak_df, l_ips_r, r_ips_r, peak_widths_height = extraction.get_peak_widths(\n", - " total_mass_df=total_mass_df,\n", - " peak_candidate_idxs=peak_candidate_idxs,\n", - " peak_candidates=peak_candidates,\n", - " thresholds=thresholds,\n", - ")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "tags": [] - }, + "metadata": {}, "outputs": [], "source": [ "save_peak_spectra_debug = True" @@ -441,9 +421,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "tags": [] - }, + "metadata": {}, "outputs": [], "source": [ "panel_df" @@ -472,9 +450,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "tags": [] - }, + "metadata": {}, "outputs": [], "source": [ "image_data" @@ -629,6 +605,9 @@ "# define which core(s) you want to segment out\n", "cores_to_crop = [\"FOV1\", \"FOV2\"]\n", "\n", + "# alternatively, comment out the following line to crop out all cores\n", + "# cores_to_crop = region_core_info[\"Core\"].unique().tolist()\n", + "\n", "# extract a binary mask with just the cores specified\n", "core_cropping_mask = extraction.generate_glycan_mask(\n", " imz_data=imz_data,\n", From 388e623859e0b87a2e1a8244eb8f3e5a7b5f8704 Mon Sep 17 00:00:00 2001 From: alex-l-kong Date: Mon, 20 Nov 2023 11:46:34 -0800 Subject: [PATCH 05/28] Add functionality to visualize the cropping mask after completion --- templates/maldi-pipeline.ipynb | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/templates/maldi-pipeline.ipynb b/templates/maldi-pipeline.ipynb index 9610ea6..9337a3c 100644 --- a/templates/maldi-pipeline.ipynb +++ b/templates/maldi-pipeline.ipynb @@ -616,6 +616,16 @@ " cores_to_crop=cores_to_crop\n", ")" ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# visualize the mask\n", + "_ = plt.imshow(core_cropping_mask)" + ] } ], "metadata": { From 897dda9e23259ddda39019d7077b76cd30b753cd Mon Sep 17 00:00:00 2001 From: alex-l-kong Date: Mon, 20 Nov 2023 11:47:28 -0800 Subject: [PATCH 06/28] Change cores_to_crop to include names that are more TSAI-friendly --- templates/maldi-pipeline.ipynb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/templates/maldi-pipeline.ipynb b/templates/maldi-pipeline.ipynb index 9337a3c..4a96758 100644 --- a/templates/maldi-pipeline.ipynb +++ b/templates/maldi-pipeline.ipynb @@ -603,7 +603,7 @@ "outputs": [], "source": [ "# define which core(s) you want to segment out\n", - "cores_to_crop = [\"FOV1\", \"FOV2\"]\n", + "cores_to_crop = [\"R1C1\", \"R1C2\"]\n", "\n", "# alternatively, comment out the following line to crop out all cores\n", "# cores_to_crop = region_core_info[\"Core\"].unique().tolist()\n", From 3e7d11d9fc806929eaf1d900c2940d58d4ae188d Mon Sep 17 00:00:00 2001 From: alex-l-kong Date: Mon, 20 Nov 2023 16:40:01 -0800 Subject: [PATCH 07/28] PYCODESTYLE and add test in case certain cores don't map onto the poslog --- conftest.py | 26 +++++++++++++++++++++++++- src/maldi_tools/extraction.py | 11 +++++++++-- tests/extraction_test.py | 7 +++++++ 3 files changed, 41 insertions(+), 3 deletions(-) diff --git a/conftest.py b/conftest.py index 8874408..af304c0 100644 --- a/conftest.py +++ b/conftest.py @@ -28,7 +28,8 @@ def imz_data(tmp_path_factory: TempPathFactory, rng: np.random.Generator) -> Imz img_dim: int = 10 # Generate random integers n for each coordinate (10 x 10). These will be used for creating - # random m/z and intensity values of length n. Lengths n are distributed along the standard gamma. + # random m/z and intensity values of length n. + # Lengths n are distributed along the standard gamma. ns: np.ndarray = np.rint(rng.standard_gamma(shape=2.5, size=(img_dim**2)) * 100).astype(int) # Generate random masses and sample different amounts of them, so we get duplicates @@ -172,3 +173,26 @@ def centroid_path(tmp_path_factory: TempPathFactory, imz_data: ImzMLParser): outfile.write(json.dumps(centroid_data)) yield centroid_file + + +@pytest.fixture(scope="session") +def bad_centroid_path(tmp_path_factory: TempPathFactory, imz_data: ImzMLParser): + coords: np.ndarray = np.array([coord[:2] for coord in imz_data.coordinates]) + center_coord_indices: np.ndarray = np.arange(25, coords.shape[0], 50) + + centroid_data: dict = {} + centroid_data["exportDateTime"] = None + centroid_data["fovs"] = [] + for i, cci in enumerate(center_coord_indices): + center_coord = coords[cci, :] + center_point_data = { + "name": f"Region{i}", + "centerPointPixels": {"x": center_coord[0].item() + 10000, "y": center_coord[1].item() + 10000}, + } + centroid_data["fovs"].append(center_point_data) + + centroid_file: Path = tmp_path_factory.mktemp("centroids") / "centroids.json" + with open(centroid_file, "w") as outfile: + outfile.write(json.dumps(centroid_data)) + + yield centroid_file diff --git a/src/maldi_tools/extraction.py b/src/maldi_tools/extraction.py index 8affc46..d61cd22 100644 --- a/src/maldi_tools/extraction.py +++ b/src/maldi_tools/extraction.py @@ -387,8 +387,15 @@ def map_coordinates_to_core_name( region_match = region_core_info.loc[ (region_core_info["X"] == center_point["x"]) & (region_core_info["Y"] == center_point["y"]), "Region", - ].values[0] - core_region_mapping[region_match] = core["name"] + ] + if region_match.shape[0] == 0: + raise ValueError( + f"Could not find mapping of core {core['name']} to any location on the slide, " + "please verify that you positioned the central point of the core correctly " + "using the TSAI tiler, or that you've set the right poslog file." + ) + + core_region_mapping[region_match.values[0]] = core["name"] region_core_info["Core"] = region_core_info["Region"].map(core_region_mapping) return region_core_info diff --git a/tests/extraction_test.py b/tests/extraction_test.py index c4632fb..b11182b 100644 --- a/tests/extraction_test.py +++ b/tests/extraction_test.py @@ -155,6 +155,13 @@ def test_map_coordinates_to_core_name( assert len(region_core_dict) == 2 +def test_map_coordinates_to_core_name_malformed( + imz_data: ImzMLParser, bad_centroid_path: pathlib.Path, poslog_path: pathlib.Path +): + with pytest.raises(ValueError, match="Could not find mapping of core Region0"): + extraction.map_coordinates_to_core_name(imz_data, bad_centroid_path, poslog_path) + + def test_generate_glycan_mask( imz_data: ImzMLParser, glycan_img_path: pathlib.Path, From 2c0ca43d95a9fd2590f7cad41947306625bd3086 Mon Sep 17 00:00:00 2001 From: alex-l-kong Date: Tue, 21 Nov 2023 11:03:27 -0800 Subject: [PATCH 08/28] Ensure line length is 100 like the other angelolab repos (not 110) --- ruff.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ruff.toml b/ruff.toml index 091e944..0afbdfe 100644 --- a/ruff.toml +++ b/ruff.toml @@ -1,5 +1,5 @@ # Formatting and Linting -line-length = 110 +line-length = 100 select = [ "D", # pydocstyle "E", # pycodestyle Error From 6fe1ca58a05489fd20ff4767ee922064211fd4ce Mon Sep 17 00:00:00 2001 From: alex-l-kong Date: Tue, 21 Nov 2023 13:35:38 -0800 Subject: [PATCH 09/28] Use rng fixture to generate random nums --- conftest.py | 4 ++-- ruff.toml | 2 +- src/maldi_tools/extraction.py | 38 +++++++++++++++++++---------------- 3 files changed, 24 insertions(+), 20 deletions(-) diff --git a/conftest.py b/conftest.py index af304c0..25f222c 100644 --- a/conftest.py +++ b/conftest.py @@ -133,10 +133,10 @@ def glycan_img_path(tmp_path_factory: TempPathFactory, imz_data: ImzMLParser, rn @pytest.fixture(scope="session") -def poslog_path(tmp_path_factory: TempPathFactory, imz_data: ImzMLParser): +def poslog_path(tmp_path_factory: TempPathFactory, imz_data: ImzMLParser, rng: np.random.Generator): columns_write: List[str] = ["Date", "Time", "Region", "PosX", "PosY", "X", "Y", "Z"] poslog_data: pd.DataFrame = pd.DataFrame( - np.random.rand(len(imz_data.coordinates) + 2, len(columns_write)), columns=columns_write + rng.random(size=(len(imz_data.coordinates) + 2, len(columns_write))), columns=columns_write ) np.array([coord[:2] for coord in imz_data.coordinates]) diff --git a/ruff.toml b/ruff.toml index 0afbdfe..091e944 100644 --- a/ruff.toml +++ b/ruff.toml @@ -1,5 +1,5 @@ # Formatting and Linting -line-length = 100 +line-length = 110 select = [ "D", # pydocstyle "E", # pycodestyle Error diff --git a/src/maldi_tools/extraction.py b/src/maldi_tools/extraction.py index d61cd22..3323ab7 100644 --- a/src/maldi_tools/extraction.py +++ b/src/maldi_tools/extraction.py @@ -26,9 +26,9 @@ def extract_spectra(imz_data: ImzMLParser, intensity_percentile: int) -> tuple[pd.DataFrame, np.ndarray]: - """Iterates over all coordinates after opening the `imzML` data and extracts all masses, and sums the - intensities for all masses. Creates an intensity image, thresholded on `intensity_percentile` with - `np.percentile`. The masses are then sorted. + """Iterates over all coordinates after opening the `imzML` data and extracts all masses, + and sums the intensities for all masses. Creates an intensity image, thresholded on + `intensity_percentile` with `np.percentile`. The masses are then sorted. Args: ---- @@ -38,8 +38,8 @@ def extract_spectra(imz_data: ImzMLParser, intensity_percentile: int) -> tuple[p Returns: ------- tuple[pd.DataFrame, np.ndarray]: A tuple where the first element is the dataframe containing - the total masses and their intensities, and the second element is the thresholds matrix of the - image. + the total masses and their intensities, and the second element is the thresholds matrix + of the image. """ imz_coordinates: list = imz_data.coordinates @@ -73,14 +73,16 @@ def rolling_window( Args: ---- - total_mass_df (pd.DataFrame): A dataframe containing all the masses and their relative intensities. + total_mass_df (pd.DataFrame): A dataframe containing all the masses and their + relative intensities. intensity_percentile (int): The intensity for the quantile calculation. - window_size (int, optional): The sizve of the window for the rolling window method. Defaults to 5000. + window_size (int, optional): The sizve of the window for the rolling window method. + Defaults to 5000. Returns: ------- - tuple[np.ndarray, np.ndarray]: A tuple where the first element is the log intensities, and the second - element is the log intensity percentiles. + tuple[np.ndarray, np.ndarray]: A tuple where the first element is the log intensities, + and the second element is the log intensity percentiles. """ plt_range_min_ind: int = 0 plt_range_max_ind: int = len(total_mass_df["intensity"]) - 1 @@ -104,13 +106,14 @@ def signal_extraction( Args: ---- - total_mass_df (pd.DataFrame): A dataframe containing all the masses and their relative intensities. + total_mass_df (pd.DataFrame): A dataframe containing all the masses and their + relative intensities. log_int_percentile (np.ndarray): An array for the log intensity percentiles. Returns: ------- - tuple[np.ndarray, np.ndarray]: A tuple where the first element is the peak candidate indexes, and - the second is the candidate peaks. + tuple[np.ndarray, np.ndarray]: A tuple where the first element is the peak candidate + indexes, and the second is the candidate peaks. """ peak_candidate_indexes, _peak_properties = signal.find_peaks( total_mass_df["intensity"].values, prominence=np.exp(log_int_percentile) @@ -131,17 +134,18 @@ def get_peak_widths( Args: ---- - total_mass_df (pd.DataFrame): A dataframe containing all the masses and their relative intensities. + total_mass_df (pd.DataFrame): A dataframe containing all the masses and their + relative intensities. peak_candidate_idxs (np.ndarray): A list containing the indexes of the discovered peaks. peak_candidates (np.ndarray): A list containing the discovered peaks. thresholds (np.ndarray): The threshold matrix of the image. Returns: ------- - tuple[pd.DataFrame, np.ndarray, np.ndarray, np.ndarray]: A tuple where the first element is a - DataFrame of the unique peaks discovered from the signal extraction process. The second and third - elements are the lower and upper bounds, and the fourth element is the height of the contour lines - at which the peak widths were calculated from. + tuple[pd.DataFrame, np.ndarray, np.ndarray, np.ndarray]: A tuple where the first element is + a DataFrame of the unique peaks discovered from the signal extraction process. + The second and third elements are the lower and upper bounds, and the fourth element is + the height of the contour lines at which the peak widths were calculated from. """ plt_range_min_ind: int = 0 plt_range_max_ind: int = len(total_mass_df["intensity"]) - 1 From 6cc4909151d11e8a63b91b8c2fa6e68231773ff6 Mon Sep 17 00:00:00 2001 From: alex-l-kong Date: Tue, 21 Nov 2023 13:58:11 -0800 Subject: [PATCH 10/28] Simplify cropping logic for all cores, allow user to set cores_to_crop = None --- src/maldi_tools/extraction.py | 11 ++++++++--- templates/maldi-pipeline.ipynb | 5 +---- tests/extraction_test.py | 27 +++++++++++++++++++++------ 3 files changed, 30 insertions(+), 13 deletions(-) diff --git a/src/maldi_tools/extraction.py b/src/maldi_tools/extraction.py index 3323ab7..d7effc4 100644 --- a/src/maldi_tools/extraction.py +++ b/src/maldi_tools/extraction.py @@ -11,7 +11,7 @@ from functools import partial from operator import itemgetter from pathlib import Path -from typing import Dict, List, Tuple +from typing import Dict, List, Optional, Tuple import numpy as np import pandas as pd @@ -406,7 +406,10 @@ def map_coordinates_to_core_name( def generate_glycan_mask( - imz_data: ImzMLParser, glycan_img_path: Path, region_core_info: pd.DataFrame, cores_to_crop: List[str] + imz_data: ImzMLParser, + glycan_img_path: Path, + region_core_info: pd.DataFrame, + cores_to_crop: Optional[List[str]] = None, ): """Generate a mask for the specified cores, provided a glycan image input. @@ -415,7 +418,7 @@ def generate_glycan_mask( imz_data (ImzMLParser): The imzML object, needed for coordinate identification. glycan_img_path (Path): The path to the glycan image .tiff, needed to create the base mask. region_core_info (pd.DataFrame): Defines the coordinates associated with each FOV. - cores_to_crop (List[str]): Which cores to segment out. + cores_to_crop (Optional[List[str]]): Which cores to segment out. If None, use all. Returns: ------- @@ -423,6 +426,8 @@ def generate_glycan_mask( The binary segmentation mask of the glycan image """ validate_paths([glycan_img_path]) + if not cores_to_crop: + cores_to_crop = region_core_info["Core"].unique().tolist() glycan_img = imread(glycan_img_path) glycan_mask = np.zeros(glycan_img.shape) diff --git a/templates/maldi-pipeline.ipynb b/templates/maldi-pipeline.ipynb index 4a96758..2a93e98 100644 --- a/templates/maldi-pipeline.ipynb +++ b/templates/maldi-pipeline.ipynb @@ -602,12 +602,9 @@ "metadata": {}, "outputs": [], "source": [ - "# define which core(s) you want to segment out\n", + "# define which core(s) you want to crop out (set to None to crop all cores out)\n", "cores_to_crop = [\"R1C1\", \"R1C2\"]\n", "\n", - "# alternatively, comment out the following line to crop out all cores\n", - "# cores_to_crop = region_core_info[\"Core\"].unique().tolist()\n", - "\n", "# extract a binary mask with just the cores specified\n", "core_cropping_mask = extraction.generate_glycan_mask(\n", " imz_data=imz_data,\n", diff --git a/tests/extraction_test.py b/tests/extraction_test.py index b11182b..78d7e04 100644 --- a/tests/extraction_test.py +++ b/tests/extraction_test.py @@ -168,18 +168,33 @@ def test_generate_glycan_mask( centroid_path: pathlib.Path, poslog_path: pathlib.Path, ): + # test for subset of FOVs region_core_info: pd.DataFrame = extraction.map_coordinates_to_core_name( imz_data, centroid_path, poslog_path ) - core_names: List[str] = list(region_core_info.keys()) + core_names: List[str] = list(region_core_info["Core"].unique()) glycan_mask: np.ndarray = extraction.generate_glycan_mask( imz_data, glycan_img_path, region_core_info, [core_names[0]] ) - coords = region_core_info.loc[region_core_info["Core"] == core_names[0], ["X", "Y"]].values + coords: np.ndarray = region_core_info.loc[region_core_info["Core"] == core_names[0], ["X", "Y"]].values assert np.all(glycan_mask[coords[:, 1] - 1, coords[:, 0] - 1] == 255) - non_hit_mask: np.ndarray = np.ones(glycan_mask.shape, dtype=bool) - for coord in coords: - non_hit_mask &= ~np.all(glycan_mask == coord, axis=1) - assert np.all(glycan_mask[non_hit_mask] == 0) + all_coords_X, all_coords_Y = np.meshgrid(np.arange(1, 11), np.arange(1, 11)) + all_coords: np.ndarray = np.vstack((all_coords_X.ravel(), all_coords_Y.ravel())).T + coords_set: set = set(map(tuple, coords)) + non_hit_indices: np.ndarray = np.array([tuple(coord) not in coords_set for coord in all_coords]) + non_hit_coords: np.ndarray = all_coords[non_hit_indices] + + assert np.all(glycan_mask[non_hit_coords[:, 1] - 1, non_hit_coords[:, 0] - 1] == 0) + + # test for all FOVs + glycan_mask = extraction.generate_glycan_mask(imz_data, glycan_img_path, region_core_info) + coords = region_core_info.loc[:, ["X", "Y"]].values + assert np.all(glycan_mask[coords[:, 1] - 1, coords[:, 0] - 1] == 255) + + coords_set = set(map(tuple, coords)) + non_hit_indices = np.array([tuple(coord) not in coords_set for coord in all_coords]) + non_hit_coords = all_coords[non_hit_indices] + + assert np.all(glycan_mask[non_hit_coords[:, 1] - 1, non_hit_coords[:, 0] - 1] == 0) From 9d8a6a6e81c3885cb10c879edc732a11c857ac2b Mon Sep 17 00:00:00 2001 From: alex-l-kong Date: Tue, 21 Nov 2023 14:05:59 -0800 Subject: [PATCH 11/28] Only read in the columns we need for region_core_info --- src/maldi_tools/extraction.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/maldi_tools/extraction.py b/src/maldi_tools/extraction.py index d7effc4..7490db7 100644 --- a/src/maldi_tools/extraction.py +++ b/src/maldi_tools/extraction.py @@ -373,12 +373,11 @@ def map_coordinates_to_core_name( poslog_path, delimiter=" ", names=["Date", "Time", "Region", "PosX", "PosY", "X", "Y", "Z"], + usecols=["Region", "X", "Y"], index_col=False, skiprows=1, ) region_core_info = region_core_info[region_core_info["Region"] != "__"].copy() - - region_core_info = region_core_info[["Date", "Time", "Region", "X", "Y"]] region_core_info["Region"] = region_core_info["Region"].str.extract(r"^(.*?)(?=X)") region_core_info[["X", "Y"]] = coords From 55ee0afffbe8b7b4a6f23899388dd17d508a153c Mon Sep 17 00:00:00 2001 From: alex-l-kong Date: Mon, 27 Nov 2023 14:20:08 -0800 Subject: [PATCH 12/28] Update core cropping process to include mask generation prior --- src/maldi_tools/extraction.py | 43 ++++++++++++++++----- templates/maldi-pipeline.ipynb | 70 +++++++++++++++++++++++++--------- tests/extraction_test.py | 36 +++++++++++++---- 3 files changed, 114 insertions(+), 35 deletions(-) diff --git a/src/maldi_tools/extraction.py b/src/maldi_tools/extraction.py index 7490db7..4af2334 100644 --- a/src/maldi_tools/extraction.py +++ b/src/maldi_tools/extraction.py @@ -19,7 +19,7 @@ from alpineer.io_utils import validate_paths from pyimzml.ImzMLParser import ImzMLParser from scipy import signal -from skimage.io import imread +from skimage.io import imread, imsave from tqdm.notebook import tqdm from maldi_tools import plotting @@ -346,6 +346,29 @@ def library_matching( return peak_df +def generate_glycan_mask( + imz_data: ImzMLParser, + glycan_img_path: Path, + glycan_mask_path: Path, +): + """Given a glycan image, generates an equivalent mask. + + Args: + --- + imz_data (ImzMLParser): The imzML object, needed for coordinate identification. + glycan_img_path (Path): Location of the .tiff file containing the glycan scan + glycan_mask_path (Path): Location where the mask will be saved + """ + validate_paths([glycan_img_path]) + + glycan_img = imread(glycan_img_path) + glycan_mask = np.zeros(glycan_img.shape) + + coords = np.array([coord[:2] for coord in imz_data.coordinates]) + glycan_mask[coords[:, 1] - 1, coords[:, 0] - 1] = 255 + imsave(glycan_mask_path, glycan_mask) + + def map_coordinates_to_core_name( imz_data: ImzMLParser, centroid_path: Path, @@ -404,18 +427,18 @@ def map_coordinates_to_core_name( return region_core_info -def generate_glycan_mask( +def crop_glycan_cores( imz_data: ImzMLParser, - glycan_img_path: Path, + glycan_mask_path: Path, region_core_info: pd.DataFrame, cores_to_crop: Optional[List[str]] = None, ): - """Generate a mask for the specified cores, provided a glycan image input. + """Generate a mask for cropping out the specified cores. Args: --- imz_data (ImzMLParser): The imzML object, needed for coordinate identification. - glycan_img_path (Path): The path to the glycan image .tiff, needed to create the base mask. + glycan_mask_path (Path): The path to the glycan mask .tiff, needed to create the cropped mask. region_core_info (pd.DataFrame): Defines the coordinates associated with each FOV. cores_to_crop (Optional[List[str]]): Which cores to segment out. If None, use all. @@ -424,14 +447,14 @@ def generate_glycan_mask( np.ndarray: The binary segmentation mask of the glycan image """ - validate_paths([glycan_img_path]) + validate_paths([glycan_mask_path]) if not cores_to_crop: cores_to_crop = region_core_info["Core"].unique().tolist() - glycan_img = imread(glycan_img_path) - glycan_mask = np.zeros(glycan_img.shape) + glycan_mask = imread(glycan_mask_path) + core_cropped_mask = np.zeros(glycan_mask.shape) coords = region_core_info.loc[region_core_info["Core"].isin(cores_to_crop), ["X", "Y"]].values - glycan_mask[coords[:, 1] - 1, coords[:, 0] - 1] = 255 + core_cropped_mask[coords[:, 1] - 1, coords[:, 0] - 1] = 255 - return glycan_mask + return core_cropped_mask diff --git a/templates/maldi-pipeline.ipynb b/templates/maldi-pipeline.ipynb index 2a93e98..d67378d 100644 --- a/templates/maldi-pipeline.ipynb +++ b/templates/maldi-pipeline.ipynb @@ -554,16 +554,23 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "## Core Cropping" + "## Core Naming and Cropping" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "After glycan matching, each core on the TMA should be appropriately named by the TSAI MALDI tiler. **Ensure that this step is completed before running the following section.**\n", + "For TMAs, each core is extracted all at once. However, this makes it difficult to locate the exact positions of each core. Additionally, the default names assigned to each core aren't particularly useful because they don't contain any information about their position on the TMA.\n", "\n", - "To extract FOV-level statistics, a mask will be generated to segment out individual cores on a TMA. This section first maps each acquired coordinate on the slide to their core as defined by the TSAI MALDI tiler, then generates a mask for specific cores on the TMA." + "This section will help you assign informative names to each core and afterwards, segment out the locations of specific cores to generate FOV-level statistics." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "It is helpful first to create an all-encompassing mask with all the cores. This will make it clear where the TMA was scanned for the naming step. You will need to provide the path to one of your extracted glycan images first." ] }, { @@ -572,14 +579,32 @@ "metadata": {}, "outputs": [], "source": [ - "# TSAI MALDI tiler output, contains name of each core mapped to respective centroid\n", - "centroid_path = \"path/to/centroids.json\"\n", + "# define path to one glycan image, needed to properly dimension the mask\n", + "glycan_img_path = \"path/to/glycan_img.tiff\"\n", "\n", - "# contains all coordinates in the order of acquisition\n", - "poslog_path = \"path/to/poslog.txt\"\n", + "# define a save path for your mask\n", + "glycan_mask_path = \"path/to/glycan_mask.tiff\"\n", "\n", - "# define path to one glycan image, needed to find dimensions of mask\n", - "glycan_img_path = \"path/to/glycan_img.tiff\"" + "# generate and save the glycan mask\n", + "extraction.generate_glycan_mask(\n", + " imz_data=imz_data,\n", + " glycan_img_path=glycan_img_path,\n", + " glycan_mask_path=glycan_mask_path\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Each core on the TMA should be appropriately named by the TSAI MALDI tiler. You will need to provide the TIFF saved at `glycan_mask_path` as input. **Ensure that this step is completed before running the following sections.**" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The poslog file for your TMA run will contain each scanned coordinate in the exact order it was scanned. This, along with the tiler output, will be needed to map each coordinate to its respective core." ] }, { @@ -588,6 +613,15 @@ "metadata": {}, "outputs": [], "source": [ + "# TSAI MALDI tiler output, contains name of each core mapped to respective centroid\n", + "centroid_path = \"path/to/centroids.json\"\n", + "\n", + "# contains all coordinates in the order of acquisition\n", + "poslog_path = \"path/to/poslog.txt\"\n", + "\n", + "# define path to one glycan image, needed to find dimensions of mask\n", + "glycan_img_path = \"path/to/glycan_img.tiff\"\n", + "\n", "# map coordinates to core names\n", "region_core_info = extraction.map_coordinates_to_core_name(\n", " imz_data=imz_data,\n", @@ -596,6 +630,13 @@ ")" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Specify which cores you want to crop out, and visualize the resulting mask. You can use the mask at `core_cropping_mask` to subset on specific cores." + ] + }, { "cell_type": "code", "execution_count": null, @@ -611,15 +652,8 @@ " glycan_img_path=glycan_img_path,\n", " region_core_info=region_core_info,\n", " cores_to_crop=cores_to_crop\n", - ")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ + ")\n", + "\n", "# visualize the mask\n", "_ = plt.imshow(core_cropping_mask)" ] diff --git a/tests/extraction_test.py b/tests/extraction_test.py index 78d7e04..19ef566 100644 --- a/tests/extraction_test.py +++ b/tests/extraction_test.py @@ -9,6 +9,8 @@ import pytest import xarray as xr from pyimzml.ImzMLParser import ImzMLParser +from pytest import TempPathFactory +from skimage.io import imread from maldi_tools import extraction @@ -140,6 +142,26 @@ def test_library_matching(image_xr: xr.DataArray, library: pd.DataFrame, _ppm: i assert row.peak in {30, 45} +def test_generate_glycan_mask( + tmp_path_factory: TempPathFactory, imz_data: ImzMLParser, glycan_img_path: pathlib.Path +): + glycan_mask_path: pathlib.Path = tmp_path_factory.mktemp("glycan_mask") / "glycan_mask.tiff" + extraction.generate_glycan_mask(imz_data, glycan_img_path, glycan_mask_path) + assert os.path.exists(glycan_mask_path) + + glycan_mask: np.ndarray = imread(glycan_mask_path) + coords: np.ndarray = np.array([coord[:2] for coord in imz_data.coordinates]) + assert np.all(glycan_mask[coords[:, 1] - 1, coords[:, 0] - 1] == 255) + + all_coords_X, all_coords_Y = np.meshgrid(np.arange(1, 11), np.arange(1, 11)) + all_coords: np.ndarray = np.vstack((all_coords_X.ravel(), all_coords_Y.ravel())).T + coords_set: set = set(map(tuple, coords)) + non_hit_indices: np.ndarray = np.array([tuple(coord) not in coords_set for coord in all_coords]) + non_hit_coords: np.ndarray = all_coords[non_hit_indices] + + assert np.all(glycan_mask[non_hit_coords[:, 1] - 1, non_hit_coords[:, 0] - 1] == 0) + + def test_map_coordinates_to_core_name( imz_data: ImzMLParser, centroid_path: pathlib.Path, poslog_path: pathlib.Path ): @@ -162,7 +184,7 @@ def test_map_coordinates_to_core_name_malformed( extraction.map_coordinates_to_core_name(imz_data, bad_centroid_path, poslog_path) -def test_generate_glycan_mask( +def test_crop_glycan_cores( imz_data: ImzMLParser, glycan_img_path: pathlib.Path, centroid_path: pathlib.Path, @@ -174,11 +196,11 @@ def test_generate_glycan_mask( ) core_names: List[str] = list(region_core_info["Core"].unique()) - glycan_mask: np.ndarray = extraction.generate_glycan_mask( + core_cropped_mask: np.ndarray = extraction.crop_glycan_cores( imz_data, glycan_img_path, region_core_info, [core_names[0]] ) coords: np.ndarray = region_core_info.loc[region_core_info["Core"] == core_names[0], ["X", "Y"]].values - assert np.all(glycan_mask[coords[:, 1] - 1, coords[:, 0] - 1] == 255) + assert np.all(core_cropped_mask[coords[:, 1] - 1, coords[:, 0] - 1] == 255) all_coords_X, all_coords_Y = np.meshgrid(np.arange(1, 11), np.arange(1, 11)) all_coords: np.ndarray = np.vstack((all_coords_X.ravel(), all_coords_Y.ravel())).T @@ -186,15 +208,15 @@ def test_generate_glycan_mask( non_hit_indices: np.ndarray = np.array([tuple(coord) not in coords_set for coord in all_coords]) non_hit_coords: np.ndarray = all_coords[non_hit_indices] - assert np.all(glycan_mask[non_hit_coords[:, 1] - 1, non_hit_coords[:, 0] - 1] == 0) + assert np.all(core_cropped_mask[non_hit_coords[:, 1] - 1, non_hit_coords[:, 0] - 1] == 0) # test for all FOVs - glycan_mask = extraction.generate_glycan_mask(imz_data, glycan_img_path, region_core_info) + core_cropped_mask = extraction.crop_glycan_cores(imz_data, glycan_img_path, region_core_info) coords = region_core_info.loc[:, ["X", "Y"]].values - assert np.all(glycan_mask[coords[:, 1] - 1, coords[:, 0] - 1] == 255) + assert np.all(core_cropped_mask[coords[:, 1] - 1, coords[:, 0] - 1] == 255) coords_set = set(map(tuple, coords)) non_hit_indices = np.array([tuple(coord) not in coords_set for coord in all_coords]) non_hit_coords = all_coords[non_hit_indices] - assert np.all(glycan_mask[non_hit_coords[:, 1] - 1, non_hit_coords[:, 0] - 1] == 0) + assert np.all(core_cropped_mask[non_hit_coords[:, 1] - 1, non_hit_coords[:, 0] - 1] == 0) From 765fd309473677cbff8911ea31ccc927f6cc5a75 Mon Sep 17 00:00:00 2001 From: alex-l-kong Date: Tue, 28 Nov 2023 15:01:26 -0800 Subject: [PATCH 13/28] The TSAI tiler, unfortunately, does not support .tiff files --- src/maldi_tools/extraction.py | 2 +- templates/maldi-pipeline.ipynb | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/src/maldi_tools/extraction.py b/src/maldi_tools/extraction.py index 4af2334..2c84391 100644 --- a/src/maldi_tools/extraction.py +++ b/src/maldi_tools/extraction.py @@ -356,7 +356,7 @@ def generate_glycan_mask( Args: --- imz_data (ImzMLParser): The imzML object, needed for coordinate identification. - glycan_img_path (Path): Location of the .tiff file containing the glycan scan + glycan_img_path (Path): Location of the .png file containing the glycan scan glycan_mask_path (Path): Location where the mask will be saved """ validate_paths([glycan_img_path]) diff --git a/templates/maldi-pipeline.ipynb b/templates/maldi-pipeline.ipynb index d67378d..a61e875 100644 --- a/templates/maldi-pipeline.ipynb +++ b/templates/maldi-pipeline.ipynb @@ -583,7 +583,7 @@ "glycan_img_path = \"path/to/glycan_img.tiff\"\n", "\n", "# define a save path for your mask\n", - "glycan_mask_path = \"path/to/glycan_mask.tiff\"\n", + "glycan_mask_path = \"path/to/glycan_mask.png\"\n", "\n", "# generate and save the glycan mask\n", "extraction.generate_glycan_mask(\n", @@ -597,7 +597,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "Each core on the TMA should be appropriately named by the TSAI MALDI tiler. You will need to provide the TIFF saved at `glycan_mask_path` as input. **Ensure that this step is completed before running the following sections.**" + "Each core on the TMA should be appropriately named by the TSAI MALDI tiler. You will need to provide the PNG saved at `glycan_mask_path` as input. **Ensure that this step is completed before running the following sections.**" ] }, { From a505f33c145ce616ec81d57c553c632bb961ba59 Mon Sep 17 00:00:00 2001 From: alex-l-kong Date: Tue, 28 Nov 2023 15:17:21 -0800 Subject: [PATCH 14/28] Add type annotations --- src/maldi_tools/extraction.py | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/src/maldi_tools/extraction.py b/src/maldi_tools/extraction.py index 2c84391..e4e3a0c 100644 --- a/src/maldi_tools/extraction.py +++ b/src/maldi_tools/extraction.py @@ -361,10 +361,10 @@ def generate_glycan_mask( """ validate_paths([glycan_img_path]) - glycan_img = imread(glycan_img_path) - glycan_mask = np.zeros(glycan_img.shape) + glycan_img: np.ndarray = imread(glycan_img_path) + glycan_mask: np.ndarray = np.zeros(glycan_img.shape) - coords = np.array([coord[:2] for coord in imz_data.coordinates]) + coords: np.ndarray = np.array([coord[:2] for coord in imz_data.coordinates]) glycan_mask[coords[:, 1] - 1, coords[:, 0] - 1] = 255 imsave(glycan_mask_path, glycan_mask) @@ -391,8 +391,8 @@ def map_coordinates_to_core_name( """ validate_paths([centroid_path, poslog_path]) - coords = np.array([coord[:2] for coord in imz_data.coordinates]) - region_core_info = pd.read_csv( + coords: np.ndarray = np.array([coord[:2] for coord in imz_data.coordinates]) + region_core_info: pd.DataFrame = pd.read_csv( poslog_path, delimiter=" ", names=["Date", "Time", "Region", "PosX", "PosY", "X", "Y", "Z"], @@ -405,12 +405,12 @@ def map_coordinates_to_core_name( region_core_info[["X", "Y"]] = coords with open(centroid_path, "r") as infile: - centroid_data = json.load(infile) + centroid_data: dict = json.load(infile) - core_region_mapping = {} + core_region_mapping: dict = {} for core in centroid_data["fovs"]: - center_point = core["centerPointPixels"] - region_match = region_core_info.loc[ + center_point: dict = core["centerPointPixels"] + region_match: pd.Series = region_core_info.loc[ (region_core_info["X"] == center_point["x"]) & (region_core_info["Y"] == center_point["y"]), "Region", ] @@ -451,10 +451,10 @@ def crop_glycan_cores( if not cores_to_crop: cores_to_crop = region_core_info["Core"].unique().tolist() - glycan_mask = imread(glycan_mask_path) - core_cropped_mask = np.zeros(glycan_mask.shape) + glycan_mask: np.ndarray = imread(glycan_mask_path) + core_cropped_mask: np.ndarray = np.zeros(glycan_mask.shape) - coords = region_core_info.loc[region_core_info["Core"].isin(cores_to_crop), ["X", "Y"]].values + coords: np.ndarray = region_core_info.loc[region_core_info["Core"].isin(cores_to_crop), ["X", "Y"]].values core_cropped_mask[coords[:, 1] - 1, coords[:, 0] - 1] = 255 return core_cropped_mask From ba0052fad1e81517e71b99738bfe810fe2011764 Mon Sep 17 00:00:00 2001 From: alex-l-kong Date: Wed, 29 Nov 2023 14:24:22 -0800 Subject: [PATCH 15/28] Update core cropping functionality to support a list of poslog files --- conftest.py | 31 +++++++++++---------- src/maldi_tools/extraction.py | 51 +++++++++++++++++++++++----------- templates/maldi-pipeline.ipynb | 8 ++++-- tests/extraction_test.py | 25 ++++++++++------- 4 files changed, 72 insertions(+), 43 deletions(-) diff --git a/conftest.py b/conftest.py index 25f222c..f64110a 100644 --- a/conftest.py +++ b/conftest.py @@ -133,29 +133,32 @@ def glycan_img_path(tmp_path_factory: TempPathFactory, imz_data: ImzMLParser, rn @pytest.fixture(scope="session") -def poslog_path(tmp_path_factory: TempPathFactory, imz_data: ImzMLParser, rng: np.random.Generator): +def poslog_dir(tmp_path_factory: TempPathFactory, imz_data: ImzMLParser, rng: np.random.Generator): columns_write: List[str] = ["Date", "Time", "Region", "PosX", "PosY", "X", "Y", "Z"] - poslog_data: pd.DataFrame = pd.DataFrame( - rng.random(size=(len(imz_data.coordinates) + 2, len(columns_write))), columns=columns_write - ) - np.array([coord[:2] for coord in imz_data.coordinates]) + poslog_base_dir: Path = tmp_path_factory.mktemp("poslogs") - poslog_regions: List[str] = [] for i in np.arange(2): - poslog_regions.append("__") - poslog_regions.extend([f"R{i}XY"] * 50) - poslog_data["Region"] = poslog_regions + poslog_data: pd.DataFrame = pd.DataFrame( + rng.random(size=(int(len(imz_data.coordinates) / 2) + 2, len(columns_write))), + columns=columns_write, + ) + + poslog_regions: List[str] = [] + for j in np.arange(2): + poslog_regions.append("__") + poslog_regions.extend([f"R{j}XY"] * 25) + poslog_data["Region"] = poslog_regions - poslog_file: Path = tmp_path_factory.mktemp("poslogs") / "poslog.txt" - poslog_data.to_csv(poslog_file, header=None, index=False, sep=" ", mode="w", columns=columns_write) + poslog_file: Path = poslog_base_dir / f"poslog{i}.txt" + poslog_data.to_csv(poslog_file, header=None, index=False, sep=" ", mode="w", columns=columns_write) - yield poslog_file + yield poslog_base_dir @pytest.fixture(scope="session") def centroid_path(tmp_path_factory: TempPathFactory, imz_data: ImzMLParser): coords: np.ndarray = np.array([coord[:2] for coord in imz_data.coordinates]) - center_coord_indices: np.ndarray = np.arange(25, coords.shape[0], 50) + center_coord_indices: np.ndarray = np.arange(10, coords.shape[0], 25) centroid_data: dict = {} centroid_data["exportDateTime"] = None @@ -178,7 +181,7 @@ def centroid_path(tmp_path_factory: TempPathFactory, imz_data: ImzMLParser): @pytest.fixture(scope="session") def bad_centroid_path(tmp_path_factory: TempPathFactory, imz_data: ImzMLParser): coords: np.ndarray = np.array([coord[:2] for coord in imz_data.coordinates]) - center_coord_indices: np.ndarray = np.arange(25, coords.shape[0], 50) + center_coord_indices: np.ndarray = np.arange(10, coords.shape[0], 25) centroid_data: dict = {} centroid_data["exportDateTime"] = None diff --git a/src/maldi_tools/extraction.py b/src/maldi_tools/extraction.py index e4e3a0c..f351c2e 100644 --- a/src/maldi_tools/extraction.py +++ b/src/maldi_tools/extraction.py @@ -17,6 +17,7 @@ import pandas as pd import xarray as xr from alpineer.io_utils import validate_paths +from alpineer.misc_utils import verify_in_list from pyimzml.ImzMLParser import ImzMLParser from scipy import signal from skimage.io import imread, imsave @@ -372,7 +373,7 @@ def generate_glycan_mask( def map_coordinates_to_core_name( imz_data: ImzMLParser, centroid_path: Path, - poslog_path: Path, + poslog_paths: List[Path], ): """Maps each scanned coordinate on a slide to their respective core name (created by TSAI tiler). @@ -381,28 +382,43 @@ def map_coordinates_to_core_name( imz_data (ImzMLParser): The imzML object, needed for coordinate identification. centroid_path (Path): A JSON file mapping each core name to their respective centroid. Generated by the TSAI tiler. - poslog_path (Path): A .txt file listing all the coordinates scanned, - needed to map coordinates to their respective core. + poslog_paths (List[Path]): A list of .txt files listing all the coordinates scanned, + needed to map coordinates to their respective core. They must be specified + in the order of extraction. Returns: ------- pd.DataFrame: Maps each coordinate to their core """ - validate_paths([centroid_path, poslog_path]) + validate_paths([centroid_path] + poslog_paths) coords: np.ndarray = np.array([coord[:2] for coord in imz_data.coordinates]) - region_core_info: pd.DataFrame = pd.read_csv( - poslog_path, - delimiter=" ", - names=["Date", "Time", "Region", "PosX", "PosY", "X", "Y", "Z"], - usecols=["Region", "X", "Y"], - index_col=False, - skiprows=1, - ) - region_core_info = region_core_info[region_core_info["Region"] != "__"].copy() - region_core_info["Region"] = region_core_info["Region"].str.extract(r"^(.*?)(?=X)") - region_core_info[["X", "Y"]] = coords + region_core_info: pd.DataFrame = pd.DataFrame(columns=["Region", "X", "Y"]) + coord_index: int = 0 + num_regions: int = 0 + + for poslog_path in poslog_paths: + region_core_sub: pd.DataFrame = pd.read_csv( + poslog_path, + delimiter=" ", + names=["Date", "Time", "Region", "PosX", "PosY", "X", "Y", "Z"], + usecols=["Region", "X", "Y"], + index_col=False, + skiprows=1, + ) + region_core_sub = region_core_sub[region_core_sub["Region"] != "__"].copy() + extracted: pd.Series = ( + region_core_sub["Region"].str.extract(r"R(\d+)XY", expand=False).astype(int) + num_regions + ) + region_core_sub["Region"] = extracted + + coords_subset: np.ndarray = coords[coord_index : (coord_index + region_core_sub.shape[0]), :] + region_core_sub[["X", "Y"]] = coords_subset + region_core_info = pd.concat([region_core_info, region_core_sub]) + + coord_index += region_core_sub.shape[0] + num_regions += len(region_core_sub["Region"].unique()) with open(centroid_path, "r") as infile: centroid_data: dict = json.load(infile) @@ -423,6 +439,7 @@ def map_coordinates_to_core_name( core_region_mapping[region_match.values[0]] = core["name"] + region_core_info[["Region", "X", "Y"]] = region_core_info[["Region", "X", "Y"]].astype(int) region_core_info["Core"] = region_core_info["Region"].map(core_region_mapping) return region_core_info @@ -451,8 +468,10 @@ def crop_glycan_cores( if not cores_to_crop: cores_to_crop = region_core_info["Core"].unique().tolist() + verify_in_list(specified_cores=cores_to_crop, all_cores=list(region_core_info["Core"])) + glycan_mask: np.ndarray = imread(glycan_mask_path) - core_cropped_mask: np.ndarray = np.zeros(glycan_mask.shape) + core_cropped_mask: np.ndarray = np.zeros(glycan_mask.shape, dtype=np.int16) coords: np.ndarray = region_core_info.loc[region_core_info["Core"].isin(cores_to_crop), ["X", "Y"]].values core_cropped_mask[coords[:, 1] - 1, coords[:, 0] - 1] = 255 diff --git a/templates/maldi-pipeline.ipynb b/templates/maldi-pipeline.ipynb index a61e875..5272fb7 100644 --- a/templates/maldi-pipeline.ipynb +++ b/templates/maldi-pipeline.ipynb @@ -604,7 +604,9 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "The poslog file for your TMA run will contain each scanned coordinate in the exact order it was scanned. This, along with the tiler output, will be needed to map each coordinate to its respective core." + "The poslog files for your TMA run will contain each scanned coordinate in the exact order it was scanned. This, along with the tiler output, will be needed to map each coordinate to its respective core.\n", + "\n", + "**NOTE: order matters when defining `poslog_paths`! Make sure this matches up with the order of acquisition for your run.**" ] }, { @@ -617,7 +619,7 @@ "centroid_path = \"path/to/centroids.json\"\n", "\n", "# contains all coordinates in the order of acquisition\n", - "poslog_path = \"path/to/poslog.txt\"\n", + "poslog_paths = [\"path/to/poslog1.txt\", \"path/to/poslog2.txt\"]\n", "\n", "# define path to one glycan image, needed to find dimensions of mask\n", "glycan_img_path = \"path/to/glycan_img.tiff\"\n", @@ -626,7 +628,7 @@ "region_core_info = extraction.map_coordinates_to_core_name(\n", " imz_data=imz_data,\n", " centroid_path=centroid_path,\n", - " poslog_path=poslog_path\n", + " poslog_paths=poslog_paths\n", ")" ] }, diff --git a/tests/extraction_test.py b/tests/extraction_test.py index 19ef566..c235731 100644 --- a/tests/extraction_test.py +++ b/tests/extraction_test.py @@ -163,39 +163,44 @@ def test_generate_glycan_mask( def test_map_coordinates_to_core_name( - imz_data: ImzMLParser, centroid_path: pathlib.Path, poslog_path: pathlib.Path + imz_data: ImzMLParser, centroid_path: pathlib.Path, poslog_dir: pathlib.Path ): + poslog_paths: List[pathlib.Path] = [poslog_dir / pf for pf in os.listdir(poslog_dir)] region_core_info: pd.DataFrame = extraction.map_coordinates_to_core_name( - imz_data, centroid_path, poslog_path + imz_data, centroid_path, poslog_paths ) region_core_mapping: pd.DataFrame = region_core_info[["Region", "Core"]].drop_duplicates() region_core_dict: dict = region_core_mapping.set_index("Region")["Core"].to_dict() - assert region_core_dict["R0"] == "Region0" - assert region_core_dict["R1"] == "Region1" - assert len(region_core_dict) == 2 + assert len(region_core_dict) == 4 + assert region_core_dict[0] == "Region0" + assert region_core_dict[1] == "Region1" + assert region_core_dict[2] == "Region2" + assert region_core_dict[3] == "Region3" def test_map_coordinates_to_core_name_malformed( - imz_data: ImzMLParser, bad_centroid_path: pathlib.Path, poslog_path: pathlib.Path + imz_data: ImzMLParser, bad_centroid_path: pathlib.Path, poslog_dir: pathlib.Path ): + poslog_paths: List[pathlib.Path] = [poslog_dir / pf for pf in os.listdir(poslog_dir)] with pytest.raises(ValueError, match="Could not find mapping of core Region0"): - extraction.map_coordinates_to_core_name(imz_data, bad_centroid_path, poslog_path) + extraction.map_coordinates_to_core_name(imz_data, bad_centroid_path, poslog_paths) def test_crop_glycan_cores( imz_data: ImzMLParser, glycan_img_path: pathlib.Path, centroid_path: pathlib.Path, - poslog_path: pathlib.Path, + poslog_dir: pathlib.Path, ): - # test for subset of FOVs + poslog_paths: List[pathlib.Path] = [poslog_dir / pf for pf in os.listdir(poslog_dir)] region_core_info: pd.DataFrame = extraction.map_coordinates_to_core_name( - imz_data, centroid_path, poslog_path + imz_data, centroid_path, poslog_paths ) core_names: List[str] = list(region_core_info["Core"].unique()) + # test for subset of FOVs core_cropped_mask: np.ndarray = extraction.crop_glycan_cores( imz_data, glycan_img_path, region_core_info, [core_names[0]] ) From b2970ccad3a91d2e471f3c5722162647cf1c0e5d Mon Sep 17 00:00:00 2001 From: alex-l-kong Date: Wed, 29 Nov 2023 14:56:58 -0800 Subject: [PATCH 16/28] Ensure regex is R(\d+)X, not R(\d+)XY --- src/maldi_tools/extraction.py | 2 +- templates/maldi-pipeline.ipynb | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/src/maldi_tools/extraction.py b/src/maldi_tools/extraction.py index f351c2e..f879131 100644 --- a/src/maldi_tools/extraction.py +++ b/src/maldi_tools/extraction.py @@ -409,7 +409,7 @@ def map_coordinates_to_core_name( ) region_core_sub = region_core_sub[region_core_sub["Region"] != "__"].copy() extracted: pd.Series = ( - region_core_sub["Region"].str.extract(r"R(\d+)XY", expand=False).astype(int) + num_regions + region_core_sub["Region"].str.extract(r"R(\d+)X", expand=False).astype(int) + num_regions ) region_core_sub["Region"] = extracted diff --git a/templates/maldi-pipeline.ipynb b/templates/maldi-pipeline.ipynb index 5272fb7..394ddd9 100644 --- a/templates/maldi-pipeline.ipynb +++ b/templates/maldi-pipeline.ipynb @@ -649,9 +649,9 @@ "cores_to_crop = [\"R1C1\", \"R1C2\"]\n", "\n", "# extract a binary mask with just the cores specified\n", - "core_cropping_mask = extraction.generate_glycan_mask(\n", + "core_cropping_mask = extraction.crop_glycan_cores(\n", " imz_data=imz_data,\n", - " glycan_img_path=glycan_img_path,\n", + " glycan_mask_path=glycan_mask_path,\n", " region_core_info=region_core_info,\n", " cores_to_crop=cores_to_crop\n", ")\n", From d32b1338471b693e3e2b1db08fcfb430f2666fa3 Mon Sep 17 00:00:00 2001 From: alex-l-kong Date: Tue, 5 Dec 2023 12:08:48 -0800 Subject: [PATCH 17/28] Pre-generate all the individual crop masks, then load in as necessary --- conftest.py | 35 +++++++++++++++++++++--- src/maldi_tools/extraction.py | 49 +++++++++++++++++++++++----------- tests/extraction_test.py | 50 ++++++++++++++++++++++------------- 3 files changed, 96 insertions(+), 38 deletions(-) diff --git a/conftest.py b/conftest.py index f64110a..484208c 100644 --- a/conftest.py +++ b/conftest.py @@ -1,6 +1,7 @@ """Shared Fixtures for tests.""" import json +import os from pathlib import Path from typing import Generator, List @@ -120,7 +121,9 @@ def image_xr(rng: np.random.Generator, library: pd.DataFrame) -> Generator[xr.Da @pytest.fixture(scope="session") -def glycan_img_path(tmp_path_factory: TempPathFactory, imz_data: ImzMLParser, rng: np.random.Generator): +def glycan_img_path( + tmp_path_factory: TempPathFactory, imz_data: ImzMLParser, rng: np.random.Generator +) -> Generator[Path, None, None]: coords: np.ndarray = np.array([coord[:2] for coord in imz_data.coordinates]) glycan_img: np.ndarray = np.zeros((10, 10)) @@ -133,7 +136,9 @@ def glycan_img_path(tmp_path_factory: TempPathFactory, imz_data: ImzMLParser, rn @pytest.fixture(scope="session") -def poslog_dir(tmp_path_factory: TempPathFactory, imz_data: ImzMLParser, rng: np.random.Generator): +def poslog_dir( + tmp_path_factory: TempPathFactory, imz_data: ImzMLParser, rng: np.random.Generator +) -> Generator[Path, None, None]: columns_write: List[str] = ["Date", "Time", "Region", "PosX", "PosY", "X", "Y", "Z"] poslog_base_dir: Path = tmp_path_factory.mktemp("poslogs") @@ -156,7 +161,7 @@ def poslog_dir(tmp_path_factory: TempPathFactory, imz_data: ImzMLParser, rng: np @pytest.fixture(scope="session") -def centroid_path(tmp_path_factory: TempPathFactory, imz_data: ImzMLParser): +def centroid_path(tmp_path_factory: TempPathFactory, imz_data: ImzMLParser) -> Generator[Path, None, None]: coords: np.ndarray = np.array([coord[:2] for coord in imz_data.coordinates]) center_coord_indices: np.ndarray = np.arange(10, coords.shape[0], 25) @@ -179,7 +184,9 @@ def centroid_path(tmp_path_factory: TempPathFactory, imz_data: ImzMLParser): @pytest.fixture(scope="session") -def bad_centroid_path(tmp_path_factory: TempPathFactory, imz_data: ImzMLParser): +def bad_centroid_path( + tmp_path_factory: TempPathFactory, imz_data: ImzMLParser +) -> Generator[Path, None, None]: coords: np.ndarray = np.array([coord[:2] for coord in imz_data.coordinates]) center_coord_indices: np.ndarray = np.arange(10, coords.shape[0], 25) @@ -199,3 +206,23 @@ def bad_centroid_path(tmp_path_factory: TempPathFactory, imz_data: ImzMLParser): outfile.write(json.dumps(centroid_data)) yield centroid_file + + +@pytest.fixture(scope="session") +def region_core_info(imz_data: ImzMLParser, centroid_path: Path, poslog_dir: Path) -> pd.DataFrame: + poslog_paths: List[Path] = [poslog_dir / pf for pf in os.listdir(poslog_dir)] + region_core_info: pd.DataFrame = extraction.map_coordinates_to_core_name( + imz_data, centroid_path, poslog_paths + ) + + yield region_core_info + + +@pytest.fixture(scope="session") +def glycan_crop_save_dir( + tmp_path_factory: TempPathFactory, glycan_img_path: Path, region_core_info: pd.DataFrame +) -> Generator[Path, None, None]: + glycan_crop_save_dir: Path = tmp_path_factory.mktemp("glycan_crops") + extraction.generate_glycan_crop_masks(glycan_img_path, region_core_info, glycan_crop_save_dir) + + yield glycan_crop_save_dir diff --git a/src/maldi_tools/extraction.py b/src/maldi_tools/extraction.py index f879131..85465f1 100644 --- a/src/maldi_tools/extraction.py +++ b/src/maldi_tools/extraction.py @@ -16,7 +16,7 @@ import numpy as np import pandas as pd import xarray as xr -from alpineer.io_utils import validate_paths +from alpineer.io_utils import list_files, remove_file_extensions, validate_paths from alpineer.misc_utils import verify_in_list from pyimzml.ImzMLParser import ImzMLParser from scipy import signal @@ -444,19 +444,37 @@ def map_coordinates_to_core_name( return region_core_info -def crop_glycan_cores( - imz_data: ImzMLParser, +def generate_glycan_crop_masks( glycan_mask_path: Path, region_core_info: pd.DataFrame, - cores_to_crop: Optional[List[str]] = None, + glycan_crop_save_dir: Path, ): - """Generate a mask for cropping out the specified cores. + """Generates and saves masks for each core in `region_core_info` for cropping purposes. Args: --- - imz_data (ImzMLParser): The imzML object, needed for coordinate identification. glycan_mask_path (Path): The path to the glycan mask .tiff, needed to create the cropped mask. region_core_info (pd.DataFrame): Defines the coordinates associated with each FOV. + glycan_crop_save_dir (Path): The directory to save the glycan crop masks. + """ + validate_paths([glycan_mask_path]) + + glycan_mask: np.ndarray = imread(glycan_mask_path) + np.array([]) + + for core in region_core_info["Core"].unique(): + core_cropped_mask: np.ndarray = np.zeros(glycan_mask.shape, dtype=np.int16) + coords: np.ndarray = region_core_info.loc[region_core_info["Core"] == core, ["X", "Y"]].values + core_cropped_mask[coords[:, 1] - 1, coords[:, 0] - 1] = 255 + imsave(glycan_crop_save_dir / f"{core}.tiff", core_cropped_mask) + + +def load_glycan_crop_masks(glycan_crop_save_dir: Path, cores_to_crop: Optional[List[str]] = None): + """Generate a mask for cropping out the specified cores. + + Args: + --- + glycan_crop_save_dir (Path): The directory containing the glycan crop mask for each individual core. cores_to_crop (Optional[List[str]]): Which cores to segment out. If None, use all. Returns: @@ -464,16 +482,17 @@ def crop_glycan_cores( np.ndarray: The binary segmentation mask of the glycan image """ - validate_paths([glycan_mask_path]) - if not cores_to_crop: - cores_to_crop = region_core_info["Core"].unique().tolist() + validate_paths([glycan_crop_save_dir]) - verify_in_list(specified_cores=cores_to_crop, all_cores=list(region_core_info["Core"])) + all_core_masks = remove_file_extensions(list_files(glycan_crop_save_dir, substrs=".tiff")) + cores = cores_to_crop if cores_to_crop else all_core_masks + verify_in_list(specified_cores=cores_to_crop, all_cores=all_core_masks) - glycan_mask: np.ndarray = imread(glycan_mask_path) - core_cropped_mask: np.ndarray = np.zeros(glycan_mask.shape, dtype=np.int16) + test_mask: np.ndarray = imread(os.path.join(glycan_crop_save_dir, f"{cores[0]}.tiff")) + glycan_mask: np.ndarray = np.zeros(test_mask.shape) - coords: np.ndarray = region_core_info.loc[region_core_info["Core"].isin(cores_to_crop), ["X", "Y"]].values - core_cropped_mask[coords[:, 1] - 1, coords[:, 0] - 1] = 255 + for core in cores: + core_mask: np.ndarray = imread(glycan_crop_save_dir / f"{core}.tiff") + glycan_mask += core_mask - return core_cropped_mask + return glycan_mask diff --git a/tests/extraction_test.py b/tests/extraction_test.py index c235731..1751585 100644 --- a/tests/extraction_test.py +++ b/tests/extraction_test.py @@ -8,6 +8,8 @@ import pandas as pd import pytest import xarray as xr +from alpineer.io_utils import list_files, remove_file_extensions +from alpineer.misc_utils import verify_same_elements from pyimzml.ImzMLParser import ImzMLParser from pytest import TempPathFactory from skimage.io import imread @@ -188,40 +190,50 @@ def test_map_coordinates_to_core_name_malformed( extraction.map_coordinates_to_core_name(imz_data, bad_centroid_path, poslog_paths) -def test_crop_glycan_cores( - imz_data: ImzMLParser, - glycan_img_path: pathlib.Path, - centroid_path: pathlib.Path, - poslog_dir: pathlib.Path, +def test_generate_glycan_crop_masks( + tmp_path_factory: TempPathFactory, glycan_img_path: pathlib.Path, region_core_info: pd.DataFrame ): - poslog_paths: List[pathlib.Path] = [poslog_dir / pf for pf in os.listdir(poslog_dir)] - region_core_info: pd.DataFrame = extraction.map_coordinates_to_core_name( - imz_data, centroid_path, poslog_paths - ) - core_names: List[str] = list(region_core_info["Core"].unique()) + glycan_crop_save_dir: pathlib.Path = tmp_path_factory.mktemp("glycan_crops") + extraction.generate_glycan_crop_masks(glycan_img_path, region_core_info, glycan_crop_save_dir) - # test for subset of FOVs - core_cropped_mask: np.ndarray = extraction.crop_glycan_cores( - imz_data, glycan_img_path, region_core_info, [core_names[0]] - ) - coords: np.ndarray = region_core_info.loc[region_core_info["Core"] == core_names[0], ["X", "Y"]].values - assert np.all(core_cropped_mask[coords[:, 1] - 1, coords[:, 0] - 1] == 255) + core_names: List[str] = remove_file_extensions(list_files(glycan_crop_save_dir, substrs=".tiff")) + verify_same_elements(generated_core_masks=core_names, all_core_masks=region_core_info["Core"].unique()) all_coords_X, all_coords_Y = np.meshgrid(np.arange(1, 11), np.arange(1, 11)) all_coords: np.ndarray = np.vstack((all_coords_X.ravel(), all_coords_Y.ravel())).T + for core in core_names: + core_mask: np.ndarray = imread(glycan_crop_save_dir / f"{core}.tiff") + core_coords: np.ndarray = region_core_info.loc[region_core_info["Core"] == core, ["X", "Y"]].values + assert np.all(core_mask[core_coords[:, 1] - 1, core_coords[:, 0] - 1] == 255) + + coords_set: set = set(map(tuple, core_coords)) + non_hit_indices: np.ndarray = np.array([tuple(coord) not in coords_set for coord in all_coords]) + non_hit_coords: np.ndarray = all_coords[non_hit_indices] + assert np.all(core_mask[non_hit_coords[:, 1] - 1, non_hit_coords[:, 0] - 1] == 0) + + +def test_load_glycan_crop_masks(glycan_crop_save_dir: pathlib.Path, region_core_info: pd.DataFrame): + core_names: List[str] = remove_file_extensions(list_files(glycan_crop_save_dir)) + + all_coords_X, all_coords_Y = np.meshgrid(np.arange(1, 11), np.arange(1, 11)) + all_coords: np.ndarray = np.vstack((all_coords_X.ravel(), all_coords_Y.ravel())).T + + # test for a subset of FOVs + core_cropped_mask: np.ndarray = extraction.load_glycan_crop_masks(glycan_crop_save_dir, [core_names[0]]) + coords: np.ndarray = region_core_info.loc[region_core_info["Core"] == core_names[0], ["X", "Y"]].values + assert np.all(core_cropped_mask[coords[:, 1] - 1, coords[:, 0] - 1] == 255) + coords_set: set = set(map(tuple, coords)) non_hit_indices: np.ndarray = np.array([tuple(coord) not in coords_set for coord in all_coords]) non_hit_coords: np.ndarray = all_coords[non_hit_indices] - assert np.all(core_cropped_mask[non_hit_coords[:, 1] - 1, non_hit_coords[:, 0] - 1] == 0) # test for all FOVs - core_cropped_mask = extraction.crop_glycan_cores(imz_data, glycan_img_path, region_core_info) + core_cropped_mask = extraction.load_glycan_crop_masks(glycan_crop_save_dir) coords = region_core_info.loc[:, ["X", "Y"]].values assert np.all(core_cropped_mask[coords[:, 1] - 1, coords[:, 0] - 1] == 255) coords_set = set(map(tuple, coords)) non_hit_indices = np.array([tuple(coord) not in coords_set for coord in all_coords]) non_hit_coords = all_coords[non_hit_indices] - assert np.all(core_cropped_mask[non_hit_coords[:, 1] - 1, non_hit_coords[:, 0] - 1] == 0) From 2058c9e4adc8d1521acab3b09424045becf192fd Mon Sep 17 00:00:00 2001 From: alex-l-kong Date: Tue, 5 Dec 2023 12:18:58 -0800 Subject: [PATCH 18/28] Update the notebook with the new pre-generated mask workflow --- templates/maldi-pipeline.ipynb | 54 +++++++++++++++++++++++----------- 1 file changed, 37 insertions(+), 17 deletions(-) diff --git a/templates/maldi-pipeline.ipynb b/templates/maldi-pipeline.ipynb index 394ddd9..de1c35b 100644 --- a/templates/maldi-pipeline.ipynb +++ b/templates/maldi-pipeline.ipynb @@ -570,7 +570,10 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "It is helpful first to create an all-encompassing mask with all the cores. This will make it clear where the TMA was scanned for the naming step. You will need to provide the path to one of your extracted glycan images first." + "It is helpful first to create an all-encompassing mask that defines the locations of all the cores. This will make it clear where the TMA was scanned for the naming step. You will need to provide the path to one of your extracted glycan images first.\n", + "\n", + "* `glycan_img_path`: path to one glycan image, needed to properly dimension the mask\n", + "* `glycan_mask_path`: where the mask will be saved" ] }, { @@ -579,10 +582,7 @@ "metadata": {}, "outputs": [], "source": [ - "# define path to one glycan image, needed to properly dimension the mask\n", "glycan_img_path = \"path/to/glycan_img.tiff\"\n", - "\n", - "# define a save path for your mask\n", "glycan_mask_path = \"path/to/glycan_mask.png\"\n", "\n", "# generate and save the glycan mask\n", @@ -606,7 +606,8 @@ "source": [ "The poslog files for your TMA run will contain each scanned coordinate in the exact order it was scanned. This, along with the tiler output, will be needed to map each coordinate to its respective core.\n", "\n", - "**NOTE: order matters when defining `poslog_paths`! Make sure this matches up with the order of acquisition for your run.**" + "* `centroid_path`: TSAI MALDI tiler output, contains name of each core mapped to respective centroid\n", + "* `poslog_paths`: list of poslog files used for the TMA, contains all coordinates in order of acquisition. **Make sure this matches up with the order of acquisition for your run.**" ] }, { @@ -615,15 +616,9 @@ "metadata": {}, "outputs": [], "source": [ - "# TSAI MALDI tiler output, contains name of each core mapped to respective centroid\n", "centroid_path = \"path/to/centroids.json\"\n", - "\n", - "# contains all coordinates in the order of acquisition\n", "poslog_paths = [\"path/to/poslog1.txt\", \"path/to/poslog2.txt\"]\n", "\n", - "# define path to one glycan image, needed to find dimensions of mask\n", - "glycan_img_path = \"path/to/glycan_img.tiff\"\n", - "\n", "# map coordinates to core names\n", "region_core_info = extraction.map_coordinates_to_core_name(\n", " imz_data=imz_data,\n", @@ -636,7 +631,9 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "Specify which cores you want to crop out, and visualize the resulting mask. You can use the mask at `core_cropping_mask` to subset on specific cores." + "To generate FOV-level statistics, an individual mask for each core named by TSAI will be saved. They can then be loaded in as needed in the FOV-level-statistic-generating functions.\n", + "\n", + "* `glycan_crop_save_dir`: the directory where these masks will be saved" ] }, { @@ -645,14 +642,37 @@ "metadata": {}, "outputs": [], "source": [ - "# define which core(s) you want to crop out (set to None to crop all cores out)\n", - "cores_to_crop = [\"R1C1\", \"R1C2\"]\n", + "glycan_crop_save_dir = \"path/to/glycan/crops\"\n", + "if not os.path.exists(glycan_crop_save_dir):\n", + " os.makedirs(glycan_crop_save_dir)\n", "\n", - "# extract a binary mask with just the cores specified\n", - "core_cropping_mask = extraction.crop_glycan_cores(\n", - " imz_data=imz_data,\n", + "extraction.generate_glycan_crop_masks(\n", " glycan_mask_path=glycan_mask_path,\n", " region_core_info=region_core_info,\n", + " glycan_crop_save_dir=glycan_crop_save_dir\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Run the following cell to visualize the masks for certain cores for testing.\n", + "\n", + "* `cores_to_crop`: define all the cores you want to visualize their masks for. If multiple cores are specified, the individual masks are combined. Set to `None` to crop all cores out." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "cores_to_crop = [\"R1C1\", \"R1C2\"]\n", + "\n", + "# extract a binary mask with just the cores specified\n", + "core_cropping_mask = extraction.load_glycan_crop_masks(\n", + " glycan_crop_save_dir=glycan_crop_save_dir,\n", " cores_to_crop=cores_to_crop\n", ")\n", "\n", From 9ec12a93a51d5b463af9429817c7502af32f1354 Mon Sep 17 00:00:00 2001 From: alex-l-kong Date: Wed, 17 Jan 2024 11:49:43 -0800 Subject: [PATCH 19/28] Ensure masks are of dtype np.uint32 --- src/maldi_tools/extraction.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/maldi_tools/extraction.py b/src/maldi_tools/extraction.py index 85465f1..37b0ea8 100644 --- a/src/maldi_tools/extraction.py +++ b/src/maldi_tools/extraction.py @@ -363,7 +363,7 @@ def generate_glycan_mask( validate_paths([glycan_img_path]) glycan_img: np.ndarray = imread(glycan_img_path) - glycan_mask: np.ndarray = np.zeros(glycan_img.shape) + glycan_mask: np.ndarray = np.zeros(glycan_img.shape, dtype=np.uint32) coords: np.ndarray = np.array([coord[:2] for coord in imz_data.coordinates]) glycan_mask[coords[:, 1] - 1, coords[:, 0] - 1] = 255 @@ -463,7 +463,7 @@ def generate_glycan_crop_masks( np.array([]) for core in region_core_info["Core"].unique(): - core_cropped_mask: np.ndarray = np.zeros(glycan_mask.shape, dtype=np.int16) + core_cropped_mask: np.ndarray = np.zeros(glycan_mask.shape, dtype=np.uint32) coords: np.ndarray = region_core_info.loc[region_core_info["Core"] == core, ["X", "Y"]].values core_cropped_mask[coords[:, 1] - 1, coords[:, 0] - 1] = 255 imsave(glycan_crop_save_dir / f"{core}.tiff", core_cropped_mask) @@ -489,7 +489,7 @@ def load_glycan_crop_masks(glycan_crop_save_dir: Path, cores_to_crop: Optional[L verify_in_list(specified_cores=cores_to_crop, all_cores=all_core_masks) test_mask: np.ndarray = imread(os.path.join(glycan_crop_save_dir, f"{cores[0]}.tiff")) - glycan_mask: np.ndarray = np.zeros(test_mask.shape) + glycan_mask: np.ndarray = np.zeros(test_mask.shape, dtype=np.uint32) for core in cores: core_mask: np.ndarray = imread(glycan_crop_save_dir / f"{core}.tiff") From bf8c32f9f03d74da2f180659af8290f51def697f Mon Sep 17 00:00:00 2001 From: alex-l-kong Date: Wed, 17 Jan 2024 11:52:26 -0800 Subject: [PATCH 20/28] Ensure glycan_crop_save_dir is set as a pathlib.Path --- src/maldi_tools/extraction.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/src/maldi_tools/extraction.py b/src/maldi_tools/extraction.py index 37b0ea8..662c990 100644 --- a/src/maldi_tools/extraction.py +++ b/src/maldi_tools/extraction.py @@ -458,15 +458,13 @@ def generate_glycan_crop_masks( glycan_crop_save_dir (Path): The directory to save the glycan crop masks. """ validate_paths([glycan_mask_path]) - glycan_mask: np.ndarray = imread(glycan_mask_path) - np.array([]) for core in region_core_info["Core"].unique(): core_cropped_mask: np.ndarray = np.zeros(glycan_mask.shape, dtype=np.uint32) coords: np.ndarray = region_core_info.loc[region_core_info["Core"] == core, ["X", "Y"]].values core_cropped_mask[coords[:, 1] - 1, coords[:, 0] - 1] = 255 - imsave(glycan_crop_save_dir / f"{core}.tiff", core_cropped_mask) + imsave(Path(glycan_crop_save_dir) / f"{core}.tiff", core_cropped_mask) def load_glycan_crop_masks(glycan_crop_save_dir: Path, cores_to_crop: Optional[List[str]] = None): From 83e5fc6ee73b4f989940309a0f8f68f6e591045c Mon Sep 17 00:00:00 2001 From: alex-l-kong Date: Thu, 18 Jan 2024 11:46:53 -0800 Subject: [PATCH 21/28] Change from np.uint32 to np.uint8 --- src/maldi_tools/extraction.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/maldi_tools/extraction.py b/src/maldi_tools/extraction.py index 662c990..449e8cf 100644 --- a/src/maldi_tools/extraction.py +++ b/src/maldi_tools/extraction.py @@ -363,7 +363,7 @@ def generate_glycan_mask( validate_paths([glycan_img_path]) glycan_img: np.ndarray = imread(glycan_img_path) - glycan_mask: np.ndarray = np.zeros(glycan_img.shape, dtype=np.uint32) + glycan_mask: np.ndarray = np.zeros(glycan_img.shape, dtype=np.uint8) coords: np.ndarray = np.array([coord[:2] for coord in imz_data.coordinates]) glycan_mask[coords[:, 1] - 1, coords[:, 0] - 1] = 255 @@ -461,7 +461,7 @@ def generate_glycan_crop_masks( glycan_mask: np.ndarray = imread(glycan_mask_path) for core in region_core_info["Core"].unique(): - core_cropped_mask: np.ndarray = np.zeros(glycan_mask.shape, dtype=np.uint32) + core_cropped_mask: np.ndarray = np.zeros(glycan_mask.shape, dtype=np.uint8) coords: np.ndarray = region_core_info.loc[region_core_info["Core"] == core, ["X", "Y"]].values core_cropped_mask[coords[:, 1] - 1, coords[:, 0] - 1] = 255 imsave(Path(glycan_crop_save_dir) / f"{core}.tiff", core_cropped_mask) @@ -487,7 +487,7 @@ def load_glycan_crop_masks(glycan_crop_save_dir: Path, cores_to_crop: Optional[L verify_in_list(specified_cores=cores_to_crop, all_cores=all_core_masks) test_mask: np.ndarray = imread(os.path.join(glycan_crop_save_dir, f"{cores[0]}.tiff")) - glycan_mask: np.ndarray = np.zeros(test_mask.shape, dtype=np.uint32) + glycan_mask: np.ndarray = np.zeros(test_mask.shape, dtype=np.uint8) for core in cores: core_mask: np.ndarray = imread(glycan_crop_save_dir / f"{core}.tiff") From 4baab51da2dd19c61520332dc589c515380d7533 Mon Sep 17 00:00:00 2001 From: alex-l-kong Date: Tue, 23 Jan 2024 14:54:13 -0800 Subject: [PATCH 22/28] Fix path specification for load_glycan_crop_masks --- src/maldi_tools/extraction.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/maldi_tools/extraction.py b/src/maldi_tools/extraction.py index 449e8cf..561a8a7 100644 --- a/src/maldi_tools/extraction.py +++ b/src/maldi_tools/extraction.py @@ -486,11 +486,11 @@ def load_glycan_crop_masks(glycan_crop_save_dir: Path, cores_to_crop: Optional[L cores = cores_to_crop if cores_to_crop else all_core_masks verify_in_list(specified_cores=cores_to_crop, all_cores=all_core_masks) - test_mask: np.ndarray = imread(os.path.join(glycan_crop_save_dir, f"{cores[0]}.tiff")) + test_mask: np.ndarray = imread(Path(glycan_crop_save_dir) / f"{cores[0]}.tiff") glycan_mask: np.ndarray = np.zeros(test_mask.shape, dtype=np.uint8) for core in cores: - core_mask: np.ndarray = imread(glycan_crop_save_dir / f"{core}.tiff") + core_mask: np.ndarray = imread(Path(glycan_crop_save_dir) / f"{core}.tiff") glycan_mask += core_mask return glycan_mask From 0f916c3357dfdffad507855f7eaae812c5fb9603 Mon Sep 17 00:00:00 2001 From: alex-l-kong Date: Wed, 15 May 2024 13:46:23 -0700 Subject: [PATCH 23/28] Add core-cropping notebook (will be renamed) --- templates/core-cropping.ipynb | 206 ++++++++++++++++++++++++++++++++++ 1 file changed, 206 insertions(+) create mode 100644 templates/core-cropping.ipynb diff --git a/templates/core-cropping.ipynb b/templates/core-cropping.ipynb new file mode 100644 index 0000000..3143f44 --- /dev/null +++ b/templates/core-cropping.ipynb @@ -0,0 +1,206 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "e749e089-0ab2-4947-9756-fd666d1d8a90", + "metadata": {}, + "source": [ + "## Core Naming and Cropping" + ] + }, + { + "cell_type": "markdown", + "id": "93fc332e-05db-4245-8cc0-8fb39df951d3", + "metadata": {}, + "source": [ + "For TMAs, each core is extracted all at once. However, this makes it difficult to locate the exact positions of each core. Additionally, the default names assigned to each core aren't particularly useful because they don't contain any information about their position on the TMA.\n", + "\n", + "This section will help you assign informative names to each core and afterwards, segment out the locations of specific cores to generate FOV-level statistics." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e6ae2cf3-9d06-40dc-95b3-04b2d874bd1b", + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "import pathlib\n", + "\n", + "import matplotlib.pyplot as plt\n", + "from pyimzml.ImzMLParser import ImzMLParser\n", + "from maldi_tools import extraction" + ] + }, + { + "cell_type": "markdown", + "id": "a5d90873-ff08-4d52-88c7-91a08a432d04", + "metadata": {}, + "source": [ + "Load in the imzml data associated with your run.\n", + "\n", + "TODO: only the coordinates should be needed for this step, should be saved further upstream for efficient loading." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b66c027b-12d9-4b8d-94eb-cae93e1b6c28", + "metadata": {}, + "outputs": [], + "source": [ + "imzml_dir = base_dir / \"imzml\"\n", + "data_name = \"panc2055_imzML\"\n", + "data_file = pathlib.Path(data_name) / \"panc2055.imzML\"\n", + "data_path = imzml_dir / data_file\n", + "\n", + "imz_data = ImzMLParser(data_path, include_spectra_metadata=\"full\")" + ] + }, + { + "cell_type": "markdown", + "id": "86ec9754-45e1-47dd-8de1-e730df573b3e", + "metadata": {}, + "source": [ + "It is helpful first to create an all-encompassing mask that defines the locations of all the cores. This will make it clear where the TMA was scanned for the naming step. You will need to provide the path to one of your extracted glycan images first.\n", + "\n", + "* `glycan_img_path`: path to one glycan image, needed to properly dimension the mask\n", + "* `glycan_mask_path`: where the mask will be saved" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "81003123-2cf6-4251-920b-a49dc32c58b2", + "metadata": {}, + "outputs": [], + "source": [ + "glycan_img_path = \"path/to/glycan_img.tiff\"\n", + "glycan_mask_path = \"path/to/glycan_mask.png\"\n", + "\n", + "# generate and save the glycan mask\n", + "extraction.generate_glycan_mask(\n", + " imz_data=imz_data,\n", + " glycan_img_path=glycan_img_path,\n", + " glycan_mask_path=glycan_mask_path\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "3f8aa3b4-4715-40e7-b443-5084e818657a", + "metadata": {}, + "source": [ + "Each core on the TMA should be appropriately named by the TSAI MALDI tiler. You will need to provide the PNG saved at `glycan_mask_path` as input. **Ensure that this step is completed before running the following sections.**" + ] + }, + { + "cell_type": "markdown", + "id": "c9ca28b7-2eca-4129-8d4a-ca177c10b7f9", + "metadata": {}, + "source": [ + "The poslog files for your TMA run will contain each scanned coordinate in the exact order it was scanned. This, along with the tiler output, will be needed to map each coordinate to its respective core.\n", + "\n", + "* `centroid_path`: TSAI MALDI tiler output, contains name of each core mapped to respective centroid\n", + "* `poslog_paths`: list of poslog files used for the TMA, contains all coordinates in order of acquisition. **Make sure this matches up with the order of acquisition for your run.**" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c56ed5d3-f32e-4e4c-ad54-1b8bf9d8322c", + "metadata": {}, + "outputs": [], + "source": [ + "centroid_path = \"path/to/centroids.json\"\n", + "poslog_paths = [\"path/to/poslog1.txt\", \"path/to/poslog2.txt\"]\n", + "\n", + "# map coordinates to core names\n", + "region_core_info = extraction.map_coordinates_to_core_name(\n", + " imz_data=imz_data,\n", + " centroid_path=centroid_path,\n", + " poslog_paths=poslog_paths\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "adfee254-1008-46af-b521-9c711be10ff3", + "metadata": {}, + "source": [ + "To generate FOV-level statistics, an individual mask for each core named by TSAI will be saved. They can then be loaded in as needed in the FOV-level-statistic-generating functions.\n", + "\n", + "* `glycan_crop_save_dir`: the directory where these masks will be saved" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5ba07ad1-0eb2-4f66-8e36-02d1c636ccc8", + "metadata": {}, + "outputs": [], + "source": [ + "glycan_crop_save_dir = \"path/to/glycan/crops\"\n", + "if not os.path.exists(glycan_crop_save_dir):\n", + " os.makedirs(glycan_crop_save_dir)\n", + "\n", + "extraction.generate_glycan_crop_masks(\n", + " glycan_mask_path=glycan_mask_path,\n", + " region_core_info=region_core_info,\n", + " glycan_crop_save_dir=glycan_crop_save_dir\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "17337d3f-d4a3-4f34-a69f-924b1da17f3d", + "metadata": {}, + "source": [ + "Run the following cell to visualize the masks for certain cores for testing.\n", + "\n", + "* `cores_to_crop`: define all the cores you want to visualize their masks for. If multiple cores are specified, the individual masks are combined. Set to `None` to crop all cores out." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "124c7c0a-9e3b-4fad-bdd9-5f1448f26219", + "metadata": {}, + "outputs": [], + "source": [ + "cores_to_crop = [\"R1C1\", \"R1C2\"]\n", + "\n", + "# extract a binary mask with just the cores specified\n", + "core_cropping_mask = extraction.load_glycan_crop_masks(\n", + " glycan_crop_save_dir=glycan_crop_save_dir,\n", + " cores_to_crop=cores_to_crop\n", + ")\n", + "\n", + "# visualize the mask\n", + "_ = plt.imshow(core_cropping_mask)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.6" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} From 0cf5173bf020f7aea796c41b42df8c6c6a9f2045 Mon Sep 17 00:00:00 2001 From: alex-l-kong Date: Fri, 17 May 2024 09:50:15 -0700 Subject: [PATCH 24/28] Don't include full metadata --- templates/core-cropping.ipynb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/templates/core-cropping.ipynb b/templates/core-cropping.ipynb index 3143f44..98b2fc4 100644 --- a/templates/core-cropping.ipynb +++ b/templates/core-cropping.ipynb @@ -55,7 +55,7 @@ "data_file = pathlib.Path(data_name) / \"panc2055.imzML\"\n", "data_path = imzml_dir / data_file\n", "\n", - "imz_data = ImzMLParser(data_path, include_spectra_metadata=\"full\")" + "imz_data = ImzMLParser(data_path)" ] }, { From 7d487f920789e9ae2b4f993ecfc099f4d180d4f5 Mon Sep 17 00:00:00 2001 From: alex-l-kong Date: Fri, 17 May 2024 09:52:11 -0700 Subject: [PATCH 25/28] Add templates.json file for TSAI tiler --- files/template.json | 12 ++++++++++++ 1 file changed, 12 insertions(+) create mode 100644 files/template.json diff --git a/files/template.json b/files/template.json new file mode 100644 index 0000000..18105a7 --- /dev/null +++ b/files/template.json @@ -0,0 +1,12 @@ +{ + "exportDateTime": "2023-08-02T16:55:21.610Z", + "fovs": [ + { + "name": "Template", + "centerPointPixels": { + "x": 88, + "y": 88 + } + } + ] +} From 4a6ad6270d17b655e3fffed7f9c42538e710c4bb Mon Sep 17 00:00:00 2001 From: alex-l-kong Date: Fri, 17 May 2024 09:53:10 -0700 Subject: [PATCH 26/28] Document where to find the template.json file for TSAI tiler --- templates/core-cropping.ipynb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/templates/core-cropping.ipynb b/templates/core-cropping.ipynb index 98b2fc4..ff620af 100644 --- a/templates/core-cropping.ipynb +++ b/templates/core-cropping.ipynb @@ -92,7 +92,7 @@ "id": "3f8aa3b4-4715-40e7-b443-5084e818657a", "metadata": {}, "source": [ - "Each core on the TMA should be appropriately named by the TSAI MALDI tiler. You will need to provide the PNG saved at `glycan_mask_path` as input. **Ensure that this step is completed before running the following sections.**" + "Each core on the TMA should be appropriately named by the TSAI MALDI tiler. You will need to provide the PNG saved at `glycan_mask_path` as input. **Ensure that this step is completed before running the following sections.** You will find a template JSON you can use TSAI at `maldi-tools/files/template.json`." ] }, { From 7540686e72017af652554059b7c6b8f263773f9b Mon Sep 17 00:00:00 2001 From: alex-l-kong Date: Thu, 29 Aug 2024 00:06:45 -0700 Subject: [PATCH 27/28] Map to closest core in case centroid in mapping dict was "missed" by MALDI scan --- src/maldi_tools/extraction.py | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/src/maldi_tools/extraction.py b/src/maldi_tools/extraction.py index 561a8a7..99cfb6c 100644 --- a/src/maldi_tools/extraction.py +++ b/src/maldi_tools/extraction.py @@ -8,6 +8,7 @@ import json import os +import warnings from functools import partial from operator import itemgetter from pathlib import Path @@ -431,10 +432,19 @@ def map_coordinates_to_core_name( "Region", ] if region_match.shape[0] == 0: - raise ValueError( + core_centroid_distances: pd.Series = np.sqrt( + (region_core_info["x"] - center_point["x"]) ** 2 + + (region_core_info["y"] - center_point["y"]) ** 2 + ) + region_match = region_core_info.iloc[core_centroid_distances.idxmin(), 0] + + # TODO: add an error threshold of a few pixels, shouldn't naively map everything + warnings.warn( f"Could not find mapping of core {core['name']} to any location on the slide, " "please verify that you positioned the central point of the core correctly " - "using the TSAI tiler, or that you've set the right poslog file." + "using the TSAI tiler, or that you've set the right poslog file.\n\n" + f"The closest region to the core's centroid is {region_match.values[0]}, " + "using this as finalized mapping." ) core_region_mapping[region_match.values[0]] = core["name"] From 664d01bf5a8ae0e30a5f97302da802bfbfb3ec20 Mon Sep 17 00:00:00 2001 From: alex-l-kong Date: Mon, 11 Nov 2024 14:49:44 -0800 Subject: [PATCH 28/28] Fix remaining tests --- src/maldi_tools/extraction.py | 9 +++++---- tests/extraction_test.py | 3 ++- 2 files changed, 7 insertions(+), 5 deletions(-) diff --git a/src/maldi_tools/extraction.py b/src/maldi_tools/extraction.py index 99cfb6c..57675ab 100644 --- a/src/maldi_tools/extraction.py +++ b/src/maldi_tools/extraction.py @@ -433,10 +433,11 @@ def map_coordinates_to_core_name( ] if region_match.shape[0] == 0: core_centroid_distances: pd.Series = np.sqrt( - (region_core_info["x"] - center_point["x"]) ** 2 - + (region_core_info["y"] - center_point["y"]) ** 2 + np.abs(region_core_info["X"].values - center_point["x"]).astype(float) ** 2 + + np.abs(region_core_info["Y"].values - center_point["y"]).astype(float) ** 2 ) - region_match = region_core_info.iloc[core_centroid_distances.idxmin(), 0] + + region_match = region_core_info.iloc[np.argmin(core_centroid_distances), :] # TODO: add an error threshold of a few pixels, shouldn't naively map everything warnings.warn( @@ -494,7 +495,7 @@ def load_glycan_crop_masks(glycan_crop_save_dir: Path, cores_to_crop: Optional[L all_core_masks = remove_file_extensions(list_files(glycan_crop_save_dir, substrs=".tiff")) cores = cores_to_crop if cores_to_crop else all_core_masks - verify_in_list(specified_cores=cores_to_crop, all_cores=all_core_masks) + verify_in_list(specified_cores=cores, all_cores=all_core_masks) test_mask: np.ndarray = imread(Path(glycan_crop_save_dir) / f"{cores[0]}.tiff") glycan_mask: np.ndarray = np.zeros(test_mask.shape, dtype=np.uint8) diff --git a/tests/extraction_test.py b/tests/extraction_test.py index 1751585..8678cd3 100644 --- a/tests/extraction_test.py +++ b/tests/extraction_test.py @@ -185,8 +185,9 @@ def test_map_coordinates_to_core_name( def test_map_coordinates_to_core_name_malformed( imz_data: ImzMLParser, bad_centroid_path: pathlib.Path, poslog_dir: pathlib.Path ): + print(bad_centroid_path) poslog_paths: List[pathlib.Path] = [poslog_dir / pf for pf in os.listdir(poslog_dir)] - with pytest.raises(ValueError, match="Could not find mapping of core Region0"): + with pytest.warns(match="Could not find mapping of core Region0"): extraction.map_coordinates_to_core_name(imz_data, bad_centroid_path, poslog_paths)