diff --git a/regression/rt-cetsa-analysis-tool/.bumpversion.cfg b/regression/rt-cetsa-analysis-tool/.bumpversion.cfg
new file mode 100644
index 0000000..ecf7eb8
--- /dev/null
+++ b/regression/rt-cetsa-analysis-tool/.bumpversion.cfg
@@ -0,0 +1,33 @@
+[bumpversion]
+current_version = 0.5.0-dev0
+commit = True
+tag = False
+parse = (?P<major>\d+)\.(?P<minor>\d+)\.(?P<patch>\d+)(\-(?P<release>[a-z]+)(?P<dev>\d+))?
+serialize =
+    {major}.{minor}.{patch}-{release}{dev}
+    {major}.{minor}.{patch}
+
+[bumpversion:part:release]
+optional_value = _
+first_value = dev
+values =
+    dev
+    _
+
+[bumpversion:part:dev]
+
+[bumpversion:file:VERSION]
+
+[bumpversion:file:pyproject.toml]
+search = version = "{current_version}"
+replace = version = "{new_version}"
+
+[bumpversion:file:README.md]
+
+[bumpversion:file:src/polus/tabular/regression/rt_cetsa_analysis/__init__.py]
+
+[bumpversion:file:plugin.json]
+
+[bumpversion:file:ict.yml]
+
+[bumpversion:file:rt_cetsa_analysis.cwl]
diff --git a/regression/rt-cetsa-analysis-tool/.gitignore b/regression/rt-cetsa-analysis-tool/.gitignore
new file mode 100644
index 0000000..c32211a
--- /dev/null
+++ b/regression/rt-cetsa-analysis-tool/.gitignore
@@ -0,0 +1 @@
+tests/out
diff --git a/regression/rt-cetsa-analysis-tool/Dockerfile b/regression/rt-cetsa-analysis-tool/Dockerfile
new file mode 100755
index 0000000..1cf326d
--- /dev/null
+++ b/regression/rt-cetsa-analysis-tool/Dockerfile
@@ -0,0 +1,14 @@
+FROM polusai/bfio:2.1.9
+
+# environment variables defined in polusai/bfio
+ENV EXEC_DIR="/opt/executables"
+ENV POLUS_LOG="INFO"
+
+COPY env-linux.yml ${EXEC_DIR}/env-linux.yml
+# Work directory defined in the base container
+WORKDIR ${EXEC_DIR}
+
+RUN apt-get update && apt-get install -y wget
+RUN mkdir -p miniconda3 && wget https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh -O miniconda3/miniconda.sh && \
+bash miniconda3/miniconda.sh -b -u -p miniconda3 && rm miniconda3/miniconda.sh
+RUN /opt/executables/miniconda3/bin/conda env update -f env-linux.yml
diff --git a/regression/rt-cetsa-analysis-tool/Dockerfile-original b/regression/rt-cetsa-analysis-tool/Dockerfile-original
new file mode 100644
index 0000000..6311372
--- /dev/null
+++ b/regression/rt-cetsa-analysis-tool/Dockerfile-original
@@ -0,0 +1,55 @@
+FROM r-base:4.4.0
+
+
+ARG EXEC_DIR="/opt/executables"
+
+RUN apt-get update && apt-get install -y python3
+
+#Create folders
+RUN mkdir -p ${EXEC_DIR}
+
+# Install R packages external dependencies
+RUN apt-get install -y libssl-dev \
+    && apt-get install -y libcurl4-openssl-dev libfontconfig1-dev \
+    && apt-get install -y libharfbuzz-dev libfribidi-dev \
+    && apt-get install -y libxml2-dev \
+    && apt-get install -y libfreetype6-dev libpng-dev libtiff5-dev libjpeg-dev \
+    && apt-get install -y cmake
+
+# Install required R packages
+ADD requirements/Requirements_1.R ${EXEC_DIR}/Requirements_1.R
+RUN Rscript ${EXEC_DIR}/Requirements_1.R
+
+ADD requirements/Requirements_2.R ${EXEC_DIR}/Requirements_2.R
+RUN Rscript ${EXEC_DIR}/Requirements_2.R
+
+ADD requirements/Requirements_3.R ${EXEC_DIR}/Requirements_3.R
+RUN Rscript ${EXEC_DIR}/Requirements_3.R
+
+RUN apt-get install -y --allow-downgrades libxcb-render0=1.15-1 libxcb-shm0=1.15-1 libxcb1=1.15-1
+RUN apt-get install -y libcairo2-dev
+
+ADD requirements/Requirements_4.R ${EXEC_DIR}/Requirements_4.R
+RUN Rscript ${EXEC_DIR}/Requirements_4.R
+
+RUN apt-get install -y libgit2-dev
+
+ADD requirements/Requirements_6.R ${EXEC_DIR}/Requirements_6.R
+RUN Rscript ${EXEC_DIR}/Requirements_6.R
+
+ADD requirements/Requirements_5.R ${EXEC_DIR}/Requirements_5.R
+RUN Rscript ${EXEC_DIR}/Requirements_5.R
+
+COPY pyproject.toml ${EXEC_DIR}
+COPY README.md ${EXEC_DIR}
+RUN apt-get update && apt-get install -y python3
+RUN apt-get -y install python3-pip python3-venv
+COPY src/ ${EXEC_DIR}/src
+RUN pip3 install ${EXEC_DIR} --no-cache-dir --break-system-packages
+
+WORKDIR ${EXEC_DIR}
+
+# CMD ["Rscript", "main.R"]
+CMD ["python3", "-m", "polus.tabular.regression.rt_cetsa_analysis"]
+# CMD ["--help"]
+# CMD ["bash"]
diff --git a/regression/rt-cetsa-analysis-tool/README.md b/regression/rt-cetsa-analysis-tool/README.md
new file mode 100644
index 0000000..3e0f38e
--- /dev/null
+++ b/regression/rt-cetsa-analysis-tool/README.md
@@ -0,0 +1,31 @@
+# RT_CETSA Analysis Tool (v0.5.0-dev0)
+
+This WIPP plugin runs statistical analysis for the RT-CETSA pipeline.
+
+## Building
+
+To build the Docker image for the analysis plugin, run
+`./build-docker.sh`.
+
+## Install WIPP Plugin
+
+If WIPP is running, navigate to the plugins page and add a new plugin. Paste the contents of `plugin.json` into the pop-up window and submit.
+
+## Options
+
+This plugin takes the following input and output arguments:
+
+| Name | Description | I/O | Type |
+|-----------------|----------------------------------------------------|--------|-------------|
+| `--inpDir` | Input directory containing all the data files | Input | genericData |
+| `--params` | Name of the MoltenProt fit params CSV file in the input directory | Input | string |
+| `--values` | Name of the MoltenProt baseline-corrected values CSV file in the input directory | Input | string |
+| `--platemap` | Path to the platemap file | Input | genericData |
+| `--outDir` | Output directory | Output | genericData |
+| `--preview` | Generate JSON file with outputs | Input | boolean |
+
+## Build options
+
+By default, `./build-docker.sh` builds the image using `Dockerfile`, which installs R with conda.
+If regressions are noticed, this file can be swapped with `Dockerfile-original`, the original version
+of the Dockerfile, which uses apt to install all dependencies and is known to work correctly.
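For reference, a minimal sketch of a local invocation is shown below. It assumes the package has been installed (as in `Dockerfile-original`); the data directory layout and file names are placeholders, and the flags mirror the options table above. When `--params` and `--values` are omitted, the tool falls back to `params.csv` and `values.csv` in the input directory.

```bash
# Sketch only: data_dir, moltenprot_out, and platemap.xlsx are hypothetical names.
data_dir=/path/to/data

python3 -m polus.tabular.regression.rt_cetsa_analysis \
  --inpDir "${data_dir}/moltenprot_out" \
  --params params.csv \
  --values values.csv \
  --platemap "${data_dir}/platemap.xlsx" \
  --outDir "${data_dir}/out"
```

Adding `--preview` writes a `preview.json` describing the expected outputs instead of running the R analysis.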
diff --git a/regression/rt-cetsa-analysis-tool/VERSION b/regression/rt-cetsa-analysis-tool/VERSION new file mode 100644 index 0000000..412ff8c --- /dev/null +++ b/regression/rt-cetsa-analysis-tool/VERSION @@ -0,0 +1 @@ +0.5.0-dev0 diff --git a/regression/rt-cetsa-analysis-tool/build-docker.sh b/regression/rt-cetsa-analysis-tool/build-docker.sh new file mode 100755 index 0000000..93d9b45 --- /dev/null +++ b/regression/rt-cetsa-analysis-tool/build-docker.sh @@ -0,0 +1,4 @@ +#!/bin/bash + +version=$(", + "Antoine Gerardin ", + "Najib Ishaq ", +] +readme = "README.md" +packages = [{include = "polus", from = "src"}] + +[tool.poetry.dependencies] +python = ">=3.9,<3.12" +typer = "^0.7" +openpyxl = "^3.1.3" +pandas = "^2.2.2" +numpy = "^1.26.4" + +[tool.poetry.group.dev.dependencies] +bump2version = "^1.0.1" +pre-commit = "^3.1.0" +pytest = "^7.2.1" + +[build-system] +requires = ["poetry-core"] +build-backend = "poetry.core.masonry.api" + +[tool.ruff] +extend = "../../ruff.toml" +extend-ignore = [ + "RET505", # Unnecessary `else` after `return` statement + "E501", # Line too long + "ANN001", # Missing type annotation + "D102", # Missing docstring in public method + "ANN201", # Missing return type annotation + "N806", # Variable in function should be lowercase + "D205", # 1 blank line required between summary line and description + "N803", # Argument name should be lowercase + "PLR0913", # Too many arguments + "D415", # First line should end with a period, question mark, or exclamation point + "PLR2004", # Magic value used in comparison + "B006", # Do not use mutable default arguments + "D107", # Missing docstring + "D101", # Missing docstring + "E731", # Do not assign a lambda expression, use a def + "E402", # Module level import not at top of file + "PTH123", # `open()` should be replaced with `Path.open()` + "PTH118", # `os.path.join()` should be replaced with `/` operator + "PTH100", # `os.path.abspath()` should be replaced with `Path.resolve()` + "PLR0915", # Too many statements + "PLR0912", # Too many branches + "C901", # Function is too complex + "T201", # `print` used + "E722", # Do not use bare 'except' + "B904", # Within an `except` clause, raise exceptions with `raise ... from err` or `raise ... 
from None` to distinguish them from errors in exception handling + "ANN202", # Missing return type annotation for private function + "ARG002", # Unused method argument + "N802", # Function name should be lowercase + "PTH103", # `os.makedirs()` should be replaced with `Path.mkdir(parents=True)` + "ANN003", # Missing type annotation for `**kwargs` + "B007", # Loop control variable not used within the loop body + "ANN204", # Missing return type annotation for magic method + "D417", # Missing argument descriptions in the docstring + "ANN205", # Missing return type annotation for static method + "PLR5501", # Use `elif` instead of `else` following `if` condition to avoid unnecessary indentation + "EM102", # Exception must not use an f-string literal + "D414", # Section has no content + "RUF012", # Mutable class attributes should be annotated with `typing.ClassVar` + "A001", # Variable `input` is shadowing a Python builtin + "A002", # Argument `input` is shadowing a Python builtin + "E741", # Ambiguous variable name: `l` + "PTH120", # `os.path.dirname()` should be replaced by `Path.parent` + "N816", # Variable `cfFilename` in global scope should not be mixedCase + "PTH109", # `os.getcwd()` should be replaced by `Path.cwd()` +] diff --git a/regression/rt-cetsa-analysis-tool/requirements/Requirements_1.R b/regression/rt-cetsa-analysis-tool/requirements/Requirements_1.R new file mode 100644 index 0000000..cfd173c --- /dev/null +++ b/regression/rt-cetsa-analysis-tool/requirements/Requirements_1.R @@ -0,0 +1,3 @@ +install.packages('argparse') +install.packages('logging') +install.packages('tidyverse') diff --git a/regression/rt-cetsa-analysis-tool/requirements/Requirements_2.R b/regression/rt-cetsa-analysis-tool/requirements/Requirements_2.R new file mode 100644 index 0000000..60ee622 --- /dev/null +++ b/regression/rt-cetsa-analysis-tool/requirements/Requirements_2.R @@ -0,0 +1 @@ +install.packages('drc') diff --git a/regression/rt-cetsa-analysis-tool/requirements/Requirements_3.R b/regression/rt-cetsa-analysis-tool/requirements/Requirements_3.R new file mode 100644 index 0000000..cba1975 --- /dev/null +++ b/regression/rt-cetsa-analysis-tool/requirements/Requirements_3.R @@ -0,0 +1,6 @@ +install.packages('readxl') +install.packages('stringr') +install.packages('ggthemes') +install.packages('cowplot') +install.packages('ggpubr') +install.packages('MESS') diff --git a/regression/rt-cetsa-analysis-tool/requirements/Requirements_4.R b/regression/rt-cetsa-analysis-tool/requirements/Requirements_4.R new file mode 100644 index 0000000..a779a3a --- /dev/null +++ b/regression/rt-cetsa-analysis-tool/requirements/Requirements_4.R @@ -0,0 +1 @@ +install.packages('hrbrthemes') diff --git a/regression/rt-cetsa-analysis-tool/requirements/Requirements_5.R b/regression/rt-cetsa-analysis-tool/requirements/Requirements_5.R new file mode 100644 index 0000000..f55228f --- /dev/null +++ b/regression/rt-cetsa-analysis-tool/requirements/Requirements_5.R @@ -0,0 +1 @@ +install.packages('devtools') diff --git a/regression/rt-cetsa-analysis-tool/requirements/Requirements_6.R b/regression/rt-cetsa-analysis-tool/requirements/Requirements_6.R new file mode 100644 index 0000000..caae61f --- /dev/null +++ b/regression/rt-cetsa-analysis-tool/requirements/Requirements_6.R @@ -0,0 +1 @@ +install.packages('gert') diff --git a/regression/rt-cetsa-analysis-tool/rt_cetsa_analysis.cwl b/regression/rt-cetsa-analysis-tool/rt_cetsa_analysis.cwl new file mode 100644 index 0000000..0684991 --- /dev/null +++ 
b/regression/rt-cetsa-analysis-tool/rt_cetsa_analysis.cwl @@ -0,0 +1,44 @@ +class: CommandLineTool +cwlVersion: v1.2 +baseCommand: ["python3", "-m", "polus.tabular.regression.rt_cetsa_analysis"] +inputs: + inpDir: + inputBinding: + prefix: --inpDir + type: Directory + params: + inputBinding: + prefix: --params + type: string? + values: + inputBinding: + prefix: --values + type: string? + platemap: + inputBinding: + prefix: --platemap + type: File + preview: + inputBinding: + prefix: --preview + type: boolean? + outDir: + inputBinding: + prefix: --outDir + type: Directory +outputs: + outDir: + outputBinding: + glob: $(inputs.outDir.basename) + type: Directory +requirements: + EnvVarRequirement: + envDef: + WORKDIR: /opt/executables/ + DockerRequirement: + dockerPull: polusai/rt-cetsa-analysis-tool:0.5.0-dev0 + InitialWorkDirRequirement: + listing: + - entry: $(inputs.outDir) + writable: true + InlineJavascriptRequirement: {} diff --git a/regression/rt-cetsa-analysis-tool/run-plugin.sh b/regression/rt-cetsa-analysis-tool/run-plugin.sh new file mode 100755 index 0000000..64468bc --- /dev/null +++ b/regression/rt-cetsa-analysis-tool/run-plugin.sh @@ -0,0 +1,20 @@ +#!/bin/bash +version=$( None: + """CLI for rt-cetsa-analysis tool.""" + logger.info("Starting the CLI for rt-cetsa-analysis tool.") + + logger.info(f"Input directory: {inp_dir}") + logger.info(f"params: {params_filename}") + logger.info(f"values: {values_filename}") + logger.info(f"platemap path: {platemap_path}") + logger.info(f"Output directory: {out_dir}") + + if params_filename: + params_path = inp_dir / params_filename + elif (inp_dir / "params.csv").exists(): + params_path = inp_dir / "params.csv" + else: + raise ValueError( + f"No 'params.csv' moltenprot parameters file found in {inp_dir}.", + ) + + if values_filename: + values_path = inp_dir / values_filename + elif (inp_dir / "values.csv").exists(): + values_path = inp_dir / "values.csv" + else: + raise ValueError(f"No 'values.csv' moltenprot values file found in {inp_dir}.") + + if not params_path.exists(): + raise FileNotFoundError(f"params file not found : {params_path}") + if not values_path.exists(): + raise FileNotFoundError(f"values file not found : {values_path}") + + data_path = preprocess_data(platemap_path, values_path, params_path, out_dir) + + logger.info(f"combined data csv file: {data_path}") + + if preview: + outputs: list[str] = ["signif_df.csv"] + out_json = {"files": outputs} + with (out_dir / "preview.json").open("w") as f: + json.dump(out_json, f, indent=2) + return + + run_rscript(data_path, out_dir) + + +if __name__ == "__main__": + app() diff --git a/regression/rt-cetsa-analysis-tool/src/polus/tabular/regression/rt_cetsa_analysis/functions.R b/regression/rt-cetsa-analysis-tool/src/polus/tabular/regression/rt_cetsa_analysis/functions.R new file mode 100644 index 0000000..cb2be4d --- /dev/null +++ b/regression/rt-cetsa-analysis-tool/src/polus/tabular/regression/rt_cetsa_analysis/functions.R @@ -0,0 +1,1559 @@ +# # @@@@@@@ @@@@@@@ @@@@@@@ @@@@@@@@ @@@@@@@ @@@@@@ @@@@@@ +# # @@@@@@@@ @@@@@@@ @@@@@@@@ @@@@@@@@ @@@@@@@ @@@@@@@ @@@@@@@@ +# # @@! @@@ @@! !@@ @@! @@! !@@ @@! @@@ +# # !@! @!@ !@! !@! !@! !@! !@! !@! @!@ +# # @!@!!@! @!! @!@!@!@!@ !@! @!!!:! @!! !!@@!! @!@!@!@! +# # !!@!@! !!! !!!@!@!!! !!! !!!!!: !!! !!@!!! !!!@!!!! +# # !!: :!! !!: :!! !!: !!: !:! !!: !!! +# # :!: !:! :!: :!: :!: :!: !:! :!: !:! 
+# # :: ::: :: ::: ::: :: :::: :: :::: :: :: ::: +# # Isothermal Analysis of CETSA/RT-CETSA Experimental Sets +# # +# # Plate assignment, data cleanup, and functions +# # Patents: PCT/US21/45184, HHS E-022-2022-0-US-01 + +print("####### loading all functions used in the analysis...") + +library(tidyverse) +library(readxl) +library(stringr) +library(drc) +library(ggthemes) +library(cowplot) +library(hrbrthemes) +library(ggpubr) +library(MESS) +library(devtools) +# load_all("."); +#' construct_grid +#' Construct a grid with compatable headers for MoltenProt file prep +#' +#' @param row_num Number of rows in microplate +#' @param col_num Number of columns in microplate +#' @param pad_num Add padding 0 to well address? +#' +#' @return df containing the grid +#' @export +#' +#' @examples +#' construct_grid(16,24) +#' construct_grid(32,48,TRUE) +construct_grid <- + function(row_num = 16, + col_num = 24, + pad_num = FALSE) { + if (pad_num == FALSE) { + grid <- + expand.grid(row = LETTERS[1:(row_num)], col = c(1:(col_num))) %>% + arrange(row) %>% + mutate(address = paste(row, col, sep = '')) %>% + dplyr::select(-c('row', 'col')) + } else { + letter <- LETTERS[1:(row_num)] + number <- c(1:(col_num)) + number <- str_pad(number, 2, pad = '0') + col_by_row <- + expand.grid(row = sprintf('%.2d', 1:16), + col = sprintf('%.2d', 1:24)) %>% + arrange(., row) + } + return(grid) + } + +#' prepMatLabforMolt +#' +#' Need to provide location of file, usually in ./data/, and sheet location +#' +#' @param file_loc Location of raw Matlab data file +#' @param sheet What sheet in the .xlsx is the data located +#' @param col_names Are there column names in the data sheet +#' @param start_temp Start temp for the experiment +#' @param end_temp End temp for the experiment +#' +#' @return df containing the raw values for the RT-CETSA experiment +#' @export +#' +#' @examples +#' prepMatLabforMolt(file_loc = './data/210318_plate3.xlsx', +#' start_temp = startTemp, +#' end_temp = endTemp) +prepMatLabforMolt <- function(file_loc = './data/rtcetsa_raw.xlsx', + sheet = 'Sheet1', + col_names = FALSE, + start_temp = 37, + end_temp = 90) { + if (file.exists(file_loc) == FALSE) { + stop('File does not exist at path supplied.') + } + df <- + read_excel( + path = file_loc, + sheet = sheet, + col_names = col_names, + .name_repair = 'unique' + ) + if (nrow(df) == 0 || ncol(df) == 0) { + stop('Imported file is empty. 
Please navigate to correct RT-CETSA file') + } + df <- df %>% + dplyr::select(-c('...1', '...2')) %>% + rownames_to_column() %>% + rename('well' = 'rowname') + + # Construct temperature index (t_n) and pivot around the data to tidy + tracker <- 1 + for (val in 2:ncol(df) - 1) { + names(df)[val + 1] <- paste('t_', val, sep = '') + tracker <- tracker + 1 + } + df <- df %>% + pivot_longer(., cols = 2:ncol(df)) %>% + pivot_wider(names_from = well) %>% + rename(., 'Temperature' = 'name') %>% + mutate(., Temperature = as.integer(gsub("[^0-9.]", "", Temperature))) + + #Create temperature index in line with experimental parameters supplied in main script + temperature_df <- + seq(start_temp, end_temp, by = ((end_temp - start_temp) / (nrow(df) - + 1))) %>% + round(., digits = 1) + for (i in 1:length(temperature_df)) + df$Temperature[i] <- temperature_df[i] + + # Assemble data for moltenprot analysis by splitting 384-well plate to 96-well plate with appropriate index + grid_96w <- construct_grid(row_num = 8, col_num = 12) + q1 <- df %>% + dplyr::select(., 1, 2:97) + tracker <- 1 + for (val in 1:nrow(grid_96w)) { + colnames(q1)[val + 1] <- grid_96w$address[val] + tracker <- tracker + 1 + } + q2 <- df %>% + dplyr::select(., 1, 98:193) + tracker <- 1 + for (val in 1:nrow(grid_96w)) { + colnames(q2)[val + 1] <- grid_96w$address[val] + tracker <- tracker + 1 + } + q3 <- df %>% + dplyr::select(., 1, 194:289) + tracker <- 1 + for (val in 1:nrow(grid_96w)) { + colnames(q3)[val + 1] <- grid_96w$address[val] + tracker <- tracker + 1 + } + q4 <- df %>% + dplyr::select(., 1, 290:385) + tracker <- 1 + for (val in 1:nrow(grid_96w)) { + colnames(q4)[val + 1] <- grid_96w$address[val] + tracker <- tracker + 1 + } + write.csv(q1, './data/cleaned_expt1.csv', row.names = FALSE) + write.csv(q2, './data/cleaned_expt2.csv', row.names = FALSE) + write.csv(q3, './data/cleaned_expt3.csv', row.names = FALSE) + write.csv(q4, './data/cleaned_expt4.csv', row.names = FALSE) + + return(df) +} + +# Read in MoltenProt readout, with different column identities for different models +retrieveMoltenData <- + function(model = 'standard', + plate_format = 384) { + # Retrieve experimental data from processed file folders + col_by_row <- + expand.grid(row = sprintf('%.2d', 1:16), col = sprintf('%.2d', 1:24)) %>% + arrange(., row) + if (model == 'standard') { + exp1_param <- + read_excel('./data/cleaned_expt1/Signal_resources/Signal_results.xlsx', + sheet = 'Fit parameters') %>% + dplyr::select(-c('Condition')) + exp2_param <- + read_excel('./data/cleaned_expt2/Signal_resources/Signal_results.xlsx', + sheet = 'Fit parameters') %>% + dplyr::select(-c('Condition')) + exp3_param <- + read_excel('./data/cleaned_expt3/Signal_resources/Signal_results.xlsx', + sheet = 'Fit parameters') %>% + dplyr::select(-c('Condition')) + exp4_param <- + read_excel('./data/cleaned_expt4/Signal_resources/Signal_results.xlsx', + sheet = 'Fit parameters') %>% + dplyr::select(-c('Condition')) + # Reformat ID column in each exp from MoltenProt format (A1, not A01) to arrange + exp1_param$ID <- + gsub('([A-Z])(\\d)(?!\\d)', '\\10\\2\\3', exp1_param$ID, perl = TRUE) + exp1_param <- exp1_param %>% arrange(ID) + exp2_param$ID <- + gsub('([A-Z])(\\d)(?!\\d)', '\\10\\2\\3', exp2_param$ID, perl = TRUE) + exp2_param <- exp2_param %>% arrange(ID) + exp3_param$ID <- + gsub('([A-Z])(\\d)(?!\\d)', '\\10\\2\\3', exp3_param$ID, perl = TRUE) + exp3_param <- exp3_param %>% arrange(ID) + exp4_param$ID <- + gsub('([A-Z])(\\d)(?!\\d)', '\\10\\2\\3', exp4_param$ID, perl = TRUE) + 
exp4_param <- exp4_param %>% arrange(ID) + # Combine all experiments and add identifiers + exp_param_full <- + exp1_param %>% rbind(., exp2_param, exp3_param, exp4_param) %>% + rownames_to_column() %>% rename('well' = 'rowname') %>% + dplyr::select( + -c( + 'ID', + 'kN_init', + 'bN_init', + 'kU_init', + 'bU_init', + 'dHm_init', + 'Tm_init', + 'kN_fit', + 'bN_fit', + 'kU_fit', + 'bU_fit', + 'S', + 'dCp_component' + ) + ) %>% + bind_cols(col_by_row) %>% + relocate(c('row', 'col'), .after = well) %>% + dplyr::select(-'well') + exp_param_full <- well_assignment(exp_param_full, 384) + return(exp_param_full) + } + if (model == 'irrev') { + exp1_param <- + read_excel('./data/cleaned_expt1/Signal_resources/Signal_results.xlsx', + sheet = 'Fit parameters') %>% + dplyr::select(-c('Condition')) + exp2_param <- + read_excel('./data/cleaned_expt2/Signal_resources/Signal_results.xlsx', + sheet = 'Fit parameters') %>% + dplyr::select(-c('Condition')) + exp3_param <- + read_excel('./data/cleaned_expt3/Signal_resources/Signal_results.xlsx', + sheet = 'Fit parameters') %>% + dplyr::select(-c('Condition')) + exp4_param <- + read_excel('./data/cleaned_expt4/Signal_resources/Signal_results.xlsx', + sheet = 'Fit parameters') %>% + dplyr::select(-c('Condition')) + # Reformat ID column in each exp from MoltenProt format (A1, not A01) to arrange + exp1_param$ID <- + gsub('([A-Z])(\\d)(?!\\d)', '\\10\\2\\3', exp1_param$ID, perl = TRUE) + exp1_param <- exp1_param %>% arrange(ID) + exp2_param$ID <- + gsub('([A-Z])(\\d)(?!\\d)', '\\10\\2\\3', exp2_param$ID, perl = TRUE) + exp2_param <- exp2_param %>% arrange(ID) + exp3_param$ID <- + gsub('([A-Z])(\\d)(?!\\d)', '\\10\\2\\3', exp3_param$ID, perl = TRUE) + exp3_param <- exp3_param %>% arrange(ID) + exp4_param$ID <- + gsub('([A-Z])(\\d)(?!\\d)', '\\10\\2\\3', exp4_param$ID, perl = TRUE) + exp4_param <- exp4_param %>% arrange(ID) + # Combine all experiments and add identifiers + exp_param_full <- + exp1_param %>% rbind(., exp2_param, exp3_param, exp4_param) %>% + rownames_to_column() %>% rename('well' = 'rowname') %>% + dplyr::select(c( + 'well', + 'Ea_fit', + 'Tf_fit', + 'kN_fit', + 'bN_fit', + 'kU_fit', + 'bU_fit', + 'S' + )) %>% + bind_cols(col_by_row) %>% + relocate(c('row', 'col'), .after = well) %>% + dplyr::select(-'well') + exp_param_full <- well_assignment(exp_param_full, 384) + return(exp_param_full) + } + } + +# Gather base-line corrected fit curves for the 384-well plate and pivot plate +retrieve_FittedCurves <- + function(model = 'baseline-fit', + start_temp = 37, + end_temp = 90) { + col_by_row <- + expand.grid(row = sprintf('%.2d', 1:16), col = sprintf('%.2d', 1:24)) %>% + arrange(., row) + if (model == 'baseline-fit') { + exp1_curve <- + read_excel('./data/cleaned_expt1/Signal_resources/Signal_results.xlsx', + sheet = 'Baseline-corrected') + exp2_curve <- + read_excel('./data/cleaned_expt2/Signal_resources/Signal_results.xlsx', + sheet = 'Baseline-corrected') %>% + dplyr::select(-c('Temperature')) + exp3_curve <- + read_excel('./data/cleaned_expt3/Signal_resources/Signal_results.xlsx', + sheet = 'Baseline-corrected') %>% + dplyr::select(-c('Temperature')) + exp4_curve <- + read_excel('./data/cleaned_expt4/Signal_resources/Signal_results.xlsx', + sheet = 'Baseline-corrected') %>% + dplyr::select(-c('Temperature')) + exp_curve_all <- + cbind( + xp1 = exp1_curve, + xp2 = exp2_curve, + xp3 = exp3_curve, + xp4 = exp4_curve + ) %>% + rename(., Temperature = xp1.Temperature) %>% + mutate(., Temperature = paste('val_t_', Temperature, sep = '')) + exp_curve_all <- 
exp_curve_all %>% + pivot_longer(cols = 2:ncol(exp_curve_all)) %>% + pivot_wider(names_from = Temperature) %>% + rownames_to_column() %>% rename('well' = 'rowname') %>% + bind_cols(col_by_row) %>% + dplyr::select(-c('name', 'well', 'row', 'col')) %>% + add_tempheaders(., start_temp, end_temp) + message('Fit curves retrieved.') + return(exp_curve_all) + } + if (model == 'fit_curves') { + exp1_curve <- + read_excel('./data/cleaned_expt1/Signal_resources/Signal_results.xlsx', + sheet = 'Fit curves') + exp2_curve <- + read_excel('./data/cleaned_expt2/Signal_resources/Signal_results.xlsx', + sheet = 'Fit curves') %>% + dplyr::select(-c('Temperature')) + exp3_curve <- + read_excel('./data/cleaned_expt3/Signal_resources/Signal_results.xlsx', + sheet = 'Fit curves') %>% + dplyr::select(-c('Temperature')) + exp4_curve <- + read_excel('./data/cleaned_expt4/Signal_resources/Signal_results.xlsx', + sheet = 'Fit curves') %>% + dplyr::select(-c('Temperature')) + exp_curve_all <- + cbind( + xp1 = exp1_curve, + xp2 = exp2_curve, + xp3 = exp3_curve, + xp4 = exp4_curve + ) %>% + rename(., Temperature = xp1.Temperature) %>% + mutate(., Temperature = paste('val_t_', Temperature, sep = '')) + exp_curve_all <- exp_curve_all %>% + pivot_longer(cols = 2:ncol(exp_curve_all)) %>% + pivot_wider(names_from = Temperature) %>% + rownames_to_column() %>% rename('well' = 'rowname') %>% + bind_cols(col_by_row) %>% + dplyr::select(-c('name', 'well', 'row', 'col')) %>% + add_tempheaders(., start_temp, end_temp) + message('Fit curves retrieved.') + return(exp_curve_all) + } + } + +# Construct full data frame with curve fit and parameters for analysis +bind_fulldf <- function(param_df, curve_df) { + df <- cbind(param_df, curve_df) + return(df) +} + +#Convert any columns containing Kelvin values from MoltenProt to Celsius +kelToCel <- function(df) { + df <- df %>% + mutate(Tm_fit = Tm_fit - 273.15) %>% + mutate(T_onset = T_onset - 273.15) +} +# # +# ISO-CETSA Functions (new) +# # + +# Add temperature headers to df +add_tempheaders <- function(df, + start_temp = 37, + end_temp = 90) { + temperature_df <- + seq(start_temp, end_temp, by = ((end_temp - start_temp) / (ncol(df) - 1))) %>% + round(., digits = 1) + for (i in 1:ncol(df)) { + colnames(df)[i] <- paste('t_', temperature_df[i], sep = '') + } + message('Temperature assignments changed for ', + ncol(df), + ' points.') + return(df) +} + +# Add row and column to a tidy dataframe (columns are each temperatures, rows are wells/conditions) +add_rowcol <- function(df, well_num) { + if (well_num == 96) { + col_by_row <- + expand.grid(row = sprintf('%.2d', 1:8), col = sprintf('%.2d', 1:12)) %>% + arrange(., row) + } + else if (well_num == 384) { + col_by_row <- + expand.grid(row = sprintf('%.2d', 1:16), + col = sprintf('%.2d', 1:24)) %>% + arrange(., row) + } + message('Row + Column assignments created for ', + well_num, + '-well plate') + df <- cbind(col_by_row, df) + return(df) +} + +# Add well assignmnets for each plate +well_assignment <- function(df, well_num) { + if (well_num == 96) { + letter <- LETTERS[1:8] + number <- c(1:12) + number <- str_pad(number, 2, pad = '0') + tracker <- 1 + temp_df <- tibble(well = c(1:384)) + for (val in letter) { + for (num in number) { + temp_df$well[tracker] <- paste(val, num, sep = '') + tracker <- tracker + 1 + } + } + } + else if (well_num == 384) { + letter <- LETTERS[1:16] + number <- c(1:24) + number <- str_pad(number, 2, pad = '0') + tracker <- 1 + temp_df <- tibble(well = c(1:384)) + for (val in letter) { + for (num in number) { + 
temp_df$well[tracker] <- paste(val, num, sep = '') + tracker <- tracker + 1 + } + } + } + message('Well assignments created for ', well_num, '-well plate.') + df <- cbind(temp_df, df) + return(df) +} + +# Assign compound ids and concentration from platemap +plate_assignment <- function(df, platemap_file) { + id_df <- read_excel(platemap_file, sheet = 'sample') %>% + dplyr::select(-1) %>% + pivot_longer(., cols = 1:ncol(.)) %>% + rename(ncgc_id = value) %>% + dplyr::select(-c('name')) + id_df$ncgc_id <- gsub('empty', 'vehicle', id_df$ncgc_id) + conc_df <- read_excel(platemap_file, sheet = 'conc') %>% + dplyr::select(-1) %>% + pivot_longer(., cols = 1:ncol(.)) %>% + rename(conc = value) %>% + dplyr::select(-c('name')) + df <- cbind(id_df, conc_df, df) + message('Plate assignment attached to dataframe.') + df$row <- as.numeric(df$row) + df$col <- as.numeric(df$col) + return(df) +} + +# Calculate AUC for each well +print("####### loading calculate_auc...") +calculate_auc <- function(df) { + #Retrieve temperatures to be used for AUC determination. + auc.df <- df %>% + dplyr::select(matches('t_\\d')) + + #Initialize the AUC column + df$auc <- NA + + # Pivot and clean each row for AUC model + for (i in 1:nrow(auc.df)) { + curveVals <- auc.df[i,] %>% + pivot_longer(cols = everything(), + names_to = 'temp', + values_to = 'response') + curveVals$temp <- curveVals$temp %>% + sub('t_', '', .) + curveVals$temp <- as.numeric(curveVals$temp) + df$auc[i] <- auc(x = curveVals$temp, y = curveVals$response) + } + message('AUC Values calculated for ', nrow(auc.df), ' wells.') + return(df) +} + +control_grouping <- function(df, control = 'DMSO', pc = 'control') { + control_df <- filter(df, ncgc_id == control | ncgc_id == pc) + if (nrow(control_df) == 0) { + message('No control wells found. 
Review control input to function.') + } else + if (nrow(control_df) > 0) { + control_df <- control_df %>% + dplyr::select(-'conc') + return(control_df) + } +} + +control_variability <- + function(df, nc = 'vehicle', pc = 'control') { + #Filter out positive and negative controls into their own df + nc.controls.df <- df %>% + filter(ncgc_id == nc) %>% + dplyr::select(-c('ncgc_id', 'well', 'row', 'col')) + pc.controls.df <- df %>% + filter(ncgc_id == pc) %>% + dplyr::select(-c('ncgc_id', 'well', 'row', 'col')) + + #Calculate means, sd, and %CV + nc.mean.df <- + apply(nc.controls.df[1:ncol(nc.controls.df)], 2, mean) + nc.sd.df <- apply(nc.controls.df[1:ncol(nc.controls.df)], 2, sd) + pc.mean.df <- + apply(pc.controls.df[1:ncol(pc.controls.df)], 2, mean) + pc.sd.df <- apply(pc.controls.df[1:ncol(pc.controls.df)], 2, sd) + + #Calculate %CV + nc.var.df <- tibble(nc.mean = nc.mean.df, nc.sd = nc.sd.df) %>% + mutate(nc.cv = (nc.sd / nc.mean) * 100) + pc.var.df <- tibble(pc.mean = pc.mean.df, pc.sd = pc.sd.df) %>% + mutate(pc.cv = (pc.sd / pc.mean) * 100) + analysis_method <- colnames(nc.controls.df) + var_df <- cbind(analysis_method, nc.var.df, pc.var.df) + message('Control group variability analyzed.') + return(var_df) + } + +# Returns thermogram with mean/sd of DMSO curve across temps +control_thermogram <- function(df, pcTm, ncTm) { + subset_df <- subset(df, grepl('t_', analysis_method)) %>% + mutate(temp = as.numeric(gsub('t_', '', analysis_method))) %>% + dplyr::select(-'analysis_method') + therm_plot <- ggplot(subset_df, aes(x = temp)) + + geom_line(aes(y = nc.mean), + size = 1.5, + alpha = 0.75, + color = '#88CCEE') + + geom_errorbar(aes(ymin = nc.mean - nc.sd, ymax = nc.mean + nc.sd), + size = 0.5, + width = 1) + + geom_point( + aes(y = nc.mean), + size = 3.25, + shape = 21, + color = 'black', + fill = '#88CCEE' + ) + + geom_line(aes(y = pc.mean), + size = 1.5, + alpha = 0.75, + color = '#882255') + + geom_errorbar(aes(ymin = pc.mean - pc.sd, ymax = pc.mean + pc.sd), + size = 0.5, + width = 1) + + geom_point( + aes(y = pc.mean), + size = 3.25, + shape = 21, + color = 'black', + fill = '#EE3377' + ) + + theme_minimal() + + labs(title = 'Control Thermograms', + x = 'Temperature [C]', + y = 'Fraction Unfolded') + print(therm_plot) + return(therm_plot) +} + +# Controls analysis and z' output for groups +# Possible outputs: +# output = 'plot': Cowplot of controls +# output = 'df': Control dataframe +control_analysis <- + function(df, + nc = 'vehicle', + pc = 'control', + output = '', + controlDF) { + controls.df <- df %>% + filter(ncgc_id == nc | ncgc_id == pc) + + #Calculate Z' from controls for each parameter + test_params <- + c('Tm_fit', + 'auc') + Tm.nc.mean <- + mean(controls.df$Tm_fit[controls.df$ncgc_id == nc]) + Tm.nc.sd <- sd(controls.df$Tm_fit[controls.df$ncgc_id == nc]) + Tm.pc.mean <- + mean(controls.df$Tm_fit[controls.df$ncgc_id == pc]) + Tm.pc.sd <- sd(controls.df$Tm_fit[controls.df$ncgc_id == pc]) + Tm.z <- + 1 - (((3 * Tm.pc.sd) + (3 * Tm.nc.sd)) / abs(Tm.pc.mean - Tm.nc.mean)) + + message('Z\' for Tm: ', signif(Tm.z)) + auc.nc.mean <- mean(controls.df$auc[controls.df$ncgc_id == nc]) + auc.nc.sd <- sd(controls.df$auc[controls.df$ncgc_id == nc]) + auc.pc.mean <- mean(controls.df$auc[controls.df$ncgc_id == pc]) + auc.pc.sd <- sd(controls.df$auc[controls.df$ncgc_id == pc]) + auc.z <- + 1 - (((3 * auc.pc.sd) + (3 * auc.nc.sd)) / abs(auc.pc.mean - auc.nc.mean)) + message('Z\' for AUC: ', signif(auc.z)) + + if (output == 'plot') { + Tm.plot <- + ggplot(controls.df, aes(x = ncgc_id, y 
= Tm_fit, fill = ncgc_id)) + + geom_boxplot(outlier.alpha = 0, size = 0.75) + + geom_jitter(shape = 21, size = 3) + + theme_minimal() + + scale_fill_hue() + + labs(title = 'Controls | Tagg', + subtitle = paste('Z\': ', signif(Tm.z), sep = '')) + + theme( + legend.position = 'none', + axis.title.x = element_blank(), + axis.text.x = element_text(size = 12, face = 'bold'), + axis.text.y = element_text(size = 10), + axis.title.y = element_text(size = 12, face = 'bold'), + plot.title = element_text(size = 12, face = 'bold') + ) + auc.plot <- + ggplot(controls.df, aes(x = ncgc_id, y = auc, fill = ncgc_id)) + + geom_boxplot(outlier.alpha = 0, size = 0.75) + + geom_jitter(shape = 21, size = 3) + + theme_minimal() + + scale_fill_hue() + + labs(title = 'Controls | AUC', + subtitle = paste('Z\': ', signif(auc.z), sep = '')) + + theme( + legend.position = 'none', + axis.title.x = element_blank(), + axis.text.x = element_text(size = 12, face = 'bold'), + axis.text.y = element_text(size = 10), + axis.title.y = element_text(size = 12, face = 'bold'), + plot.title = element_text(size = 12, face = 'bold') + ) + right.grid <- + plot_grid(Tm.plot, auc.plot, ncol = 1) + control.grid <- + plot_grid( + control_thermogram(controlDF, ncTm = Tm.nc.mean, pcTm = Tm.pc.mean), + right.grid, + ncol = 2, + nrow = 1 + ) + out <- paste(outdir,'controls.png',sep="/") + ggsave(out, dpi = 'retina', scale = 1.5) + return(control.grid) + } + if (output == 'df') { + means <- + c(Tm.nc.mean, + auc.nc.mean) + parameters <- c('Tm_fit', 'auc') + output.df <- tibble(parameters, means) + return(output.df) + } + } + + +# Dose-response curve fit with LL.4 log-logistic fit +# otrace = TRUE; output from optim method is displayed. Good for diagnostic +# trace = TRUE; trace from optim displayed +# robust fitting +# robust = 'lms': doesn't handle outlier/noisy data well +dr_fit <- function(df) { + try(expr = { + drm( + resp ~ conc, + data = df, + type = 'continuous', + fct = LL.4(), + control = drmc( + errorm = FALSE, + maxIt = 10000, + noMessage = TRUE, + ) + ) + }) +} + +# Deprecated for now... +# Perform drm on each compound at each temperature +dr_analysis <- + function(df, + control = 'DMSO', + export_label = '', + plot = TRUE) { + # Construct df from the unique compound ids (less control) with empty analysis parameters + model_df <- + tibble(compound = (unique(filter( + df, ncgc_id != control + )$ncgc_id))) %>% + filter(compound != 'control') + for (i in 6:ncol(df)) { + col.nm <- colnames(df)[i] + model_df[, col.nm] <- NA + } + + # Make a long df with the parameters (colnames above) + modelfit_df <- tibble(colnames(model_df)[2:ncol(model_df)]) + names(modelfit_df)[1] <- 'analysis' + + # Loop through each column in every row of modelfit_df and create a drm model for each and + # add statistics and readouts to a temp df that is bound to modelfit_df + for (i in 1:nrow(model_df)) { + # Create a working df with the raw data from compound[i] + temp_df <- + filter(df, df$ncgc_id == model_df$compound[(i)]) %>% + dplyr::select(-c('well', 'row', 'col')) + print(paste('Analyzing: ', model_df$compound[i]), sep = '') + + # This temp df will hold the statistics that we read out from each model, and is reset every time. 
+ # Parameters to include: + # ec50: EC50 reading of the curve fit + # pval: curve fit pvalue + # noEffect: p-value of the noEffect test of the dose-response + # hill: LL4 parameter B + # ec50: LL4 parameter A + # lowerlim: LL4 parameter C + # upperlim: LL4 parameter D + temp_modelfit_df <- modelfit_df[1] %>% + mutate( + ec50 = 0, + noEffect = 0, + hill = 0, + lowerlim = 0, + upperlim = 0, + ec50 = 0 + ) + # Iterate through columns + for (n in 3:ncol(temp_df)) { + #Make df for drm model by selecting concentration and appropriate column + dr_df <- temp_df %>% dplyr::select(c(2, n)) + colnames(dr_df)[1] <- 'conc' + colnames(dr_df)[2] <- 'resp' + temp.model <- + drm( + resp ~ conc, + data = dr_df, + fct = LL.4(), + control = drmc( + errorm = FALSE, + maxIt = 500, + noMessage = TRUE + ) + ) + # Construct fitted curve for plotting + pred.fit <- + expand.grid(pr.x = exp(seq(log(min( + dr_df[1] + )), log(max( + dr_df[1] + )), length = 1000))) + # Seems necessary to make loop continue through curves that can't be fit... NEED TO STUDY + if ("convergence" %in% names(temp.model) == FALSE) { + pm <- + predict(object = temp.model, + newdata = pred.fit, + interval = 'confidence') + pred.fit$p <- pm[, 1] + pred.fit$pmin <- pm[, 2] + pred.fit$pmax <- pm[, 3] + # Plot out dose response curve if conditional met in function + if (plot == TRUE) { + dr_plot <- ggplot(dr_df, aes(x = conc, y = resp)) + + geom_line( + data = pred.fit, + aes(x = pr.x, y = p), + size = 1.5, + color = 'black' + ) + + geom_point( + size = 4, + shape = 21, + fill = 'orange', + color = 'black' + ) + + scale_x_log10() + + theme_cowplot() + + labs( + title = paste( + 'Analysis of ', + model_df$compound[i], + ' by ', + colnames(temp_df)[n], + sep = '' + ), + subtitle = paste( + 'EC50: ', + signif(temp.model$coefficients[4], 3), + ' nM', + '\n', + 'Significance of noEffect Test: ', + signif(noEffect(temp.model)[3], 3), + sep = '' + ), + x = 'Concentration' + ) + print(dr_plot) + + out <- paste(outdir,'dr_curves', export_label, sep="/") + png( + filename = paste( + out, + '_', + model_df$compound[i], + colnames(temp_df)[n], + '.png', + sep = '' + ), + width = 3200, + height = 1800, + res = 300 + ) + print(dr_plot) + dev.off() + } + print(n) + # Extract fit parameters for the dr model + temp_modelfit_df$ec50[(n - 2)] <- + signif(temp.model$coefficients[4], 3) + temp_modelfit_df$noEffect[(n - 2)] <- + signif(noEffect(temp.model)[3], 3) + temp_modelfit_df$hill[(n - 2)] <- + signif(temp.model$coefficients[1], 3) + temp_modelfit_df$lowerlim[(n - 2)] <- + signif(temp.model$coefficients[2], 3) + temp_modelfit_df$upperlim[(n - 2)] <- + signif(temp.model$coefficients[3], 3) + } + } + modelfit_df <- modelfit_df %>% cbind(., temp_modelfit_df[2:6]) + names(modelfit_df)[names(modelfit_df) == 'ec50'] <- + paste('ec50_', model_df$compound[i], sep = '') + names(modelfit_df)[names(modelfit_df) == 'noEffect'] <- + paste('noEffect_', model_df$compound[i], sep = '') + names(modelfit_df)[names(modelfit_df) == 'hill'] <- + paste('hill_', model_df$compound[i], sep = '') + names(modelfit_df)[names(modelfit_df) == 'lowerlim'] <- + paste('lowerlim_', model_df$compound[i], sep = '') + names(modelfit_df)[names(modelfit_df) == 'upperlim'] <- + paste('upperlim_', model_df$compound[i], sep = '') + } + return(modelfit_df) + } + + + +# Extract residual sum of squares for dmso columns +# Returns df with all dmso values ready for model fit +dmso_rss <- function(df, control = 'DMSO') { + df_rss <- df %>% + dplyr::select(starts_with("t_")) %>% + pivot_longer(cols = 
everything()) + colnames(df_rss)[1] <- 'conc' + colnames(df_rss)[2] <- 'resp' + df_rss$conc <- as.integer(gsub('t_', '', df_rss$conc)) + message('Fitting DMSO thermogram...') + rss_model <- dr_fit(df_rss) + rss_dmso <- sum(residuals(rss_model) ^ 2) + message('DMSO RSS: ', signif(rss_dmso, 6)) + plot( + rss_model, + type = 'all', + cex = 0.5, + main = paste('DMSO Thermogram Fit\n', 'RSS: ', signif(rss_dmso, 5), sep = + ''), + sub = paste('DMSO RSS: ', signif(rss_dmso, 5), sep = ''), + xlab = 'Temperature', + ylab = 'Fraction Unfolded', + ylim = c(-0.25, 1.25) + ) + return(df_rss) +} + +compare_models <- function(df, dmso.rss.df, plot = FALSE) { + temp_df <- df %>% dplyr::select(-one_of( + 'Ea_fit', + 'Tf_fit', + 'kN_fit', + 'bN_fit', + 'kU_fit', + 'bU_fit', + 'S' + )) %>% + filter(ncgc_id != 'DMSO' & ncgc_id != 'ignore') + rss_df <- temp_df %>% + dplyr::select(-starts_with('t_')) + rss_df$null.rss <- NA + rss_df$alt.rss <- NA + rss_df$rss.diff <- NA + + dmso.model <- dr_fit(dmso.rss.df) + dmso.rss <- sum(residuals(dmso.model) ^ 2) + for (i in 1:nrow(temp_df)) { + cmpnd.df <- temp_df[i, ] %>% + dplyr::select(starts_with('t_')) %>% + pivot_longer(cols = everything()) + colnames(cmpnd.df)[1] <- 'conc' + colnames(cmpnd.df)[2] <- 'resp' + cmpnd.df$conc <- as.integer(gsub('t_', '', cmpnd.df$conc)) + + # Fitting the null model + null.model <- bind_rows(dmso.rss.df, cmpnd.df) + null.drm <- dr_fit(null.model) + null.rss <- sum(residuals(null.drm) ^ 2) + rss_df$null.rss[i] <- null.rss + message( + 'Null model for ', + temp_df$ncgc_id[i], + ' at concentration \' ', + temp_df$conc[i], + '\': ', + signif(null.rss, 6) + ) + if (plot == TRUE) { + plot(null.drm, + type = 'all', + cex = 0.5, + main = 'Null model fit') + } + # Fitting the alternate model + cmpnd.drm <- dr_fit(cmpnd.df) + cmpnd.rss <- sum(residuals(cmpnd.drm) ^ 2) + alt.rss <- sum(cmpnd.rss, dmso.rss) + rss_df$alt.rss[i] <- alt.rss + message ( + 'Alternate Model for ', + temp_df$ncgc_id[i], + ' at concentration \' ', + temp_df$conc[i], + ' \': ', + signif(alt.rss, 6) + ) + rss.diff <- null.rss - alt.rss + message('RSS.0 - RSS.1: ', signif(rss.diff, 6)) + rss_df$rss.diff[i] <- rss.diff + } + return(rss_df) +} + +# Calculate the traditional melting parameters from the full + rss model and output df +# of the parameters for each compound +calculate_meltingparams <- function (df, control = 'vehicle') { + #Standard parameters to test: + test_params <- c('dHm_fit', 'Tm_fit', 'dG_std', 'T_onset') + + #Set up to loop through entire dataframe for each of the above params + for (i in 1:length(test_params)) { + #Initialize the column name + current_param <- test_params[i] + df[, paste(current_param, '_diff', sep = '')] <- NA + + #First, calculate the mean of control columns + mean_control <- mean(df[[current_param]][df$ncgc_id == control]) + + #Then, subtract this mean value from each well in the plate in a new column. + #Can't figure out how to mutate with a pasted column name... + for (i in 1:nrow(df)) { + df[i, paste(current_param, '_diff', sep = '')] <- + df[i, current_param] - mean_control + } + + #Print out mean and stdev of vehicle for each condition + std_control <- sd(df[[current_param]][df$ncgc_id == control]) + message('Vehicle mean for ', + current_param, + ': ', + mean_control, + ' (SD: ', + std_control, + ')') + } + return(df) +} + +# Print out the volcano plots for each parameter and RSS vs. 
p-val +plot_volcanos <- function(df, save = TRUE) { + test_params <- + c('Tm_fit.maxDiff', + 'auc.maxDiff') + test_pval <- + c('Tm_fit.maxDiff', + 'auc.maxDiff') + + # Plot out RSS Difference(x) vs. Parameter Difference(y) + # Conditional fill: grey/alpha if not significant in either + # grey/alpha if not significant in either #DDDDDD + # teal if by parameter only #009988 + # orange if by NPARC only #EE7733 + # wine if by both #882255 + # NEED TO CODE THIS BETTER WTF + for (i in 1:length(test_params)) { + current_param <- test_params[i] + current_pval <- test_pval[i] + plot.df <- df %>% + dplyr::select(compound, + rss.diff, + mannwhit.pval, + one_of(current_param), + one_of(current_pval)) + # Assign significance testing outcomes + plot.df$sigVal <- + case_when((plot.df$mannwhit.pval < 0.05 & + plot.df[, current_pval] < 0.05) ~ 'Both', + (plot.df$mannwhit.pval < 0.05 & + plot.df[, current_pval] >= 0.05) ~ 'RSS NPARC', + (plot.df$mannwhit.pval >= 0.05 & + plot.df[, current_pval] < 0.05) ~ 'Parameter', + (plot.df$mannwhit.pval >= 0.05 & + plot.df[, current_pval] >= 0.05) ~ 'Insignificant' + ) + + fillvalues <- + c('Both', 'RSS NPARC', 'Parameter', 'Insignificant') + colors <- c('#882255', '#EE7733', '#009988', '#DDDDDD') + volcano_plot <- + ggplot(plot.df, + aes(x = rss.diff, + y = plot.df[, current_param], + label = compound)) + + geom_point(shape = 21, + aes(fill = sigVal), + size = 5) + + theme_minimal() + + labs( + title = paste('Residual Variance vs. ', current_param, sep = ''), + y = paste(current_param, ' Experimental - Vehicle Mean', sep = ''), + x = 'RSS0 - RSS1 NPARC', + fill = 'Significance Detected' + ) + + scale_fill_manual(breaks = fillvalues, values = colors) + + theme(legend.position = 'bottom') + print(volcano_plot) + out <- paste(outdir,"/",current_param, sep="") + ggsave( + paste(out, '_volcano.png', sep = ''), + dpi = 'retina', + scale = 1.25 + ) + } +} + +# Plot out RSS Difference by p-value for the MannWhitney +rss.pval.plot <- function (df, savePlot = FALSE) { + plot.df <- df %>% + dplyr::select(compound, rss.diff, mannwhit.pval, mannwhit.ec50) + plot.df$mannwhit.pval <- log2(plot.df$mannwhit.pval) + + rss.plot <- + ggplot(plot.df, + aes(x = rss.diff, y = mannwhit.pval, fill = mannwhit.ec50)) + + geom_point(shape = 21, size = 3.5) + + theme_minimal() + + scale_fill_gradient(low = '#EE3377', + high = '#88CCEE', + na.value = 'grey20') + + labs( + title = 'RSS vs. Mann Whitney P-val', + x = 'RSS0 - RSS1', + y = 'Log2 Mann Whitney P-val', + fill = 'NPARC EC50' + ) + print(rss.plot) + if (savePlot == TRUE) { + out <- paste(outdir,'/rssPvalcomp.png',sep="") + ggsave(out, + dpi = 'retina', + scale = 1.25 + ) + } +} + +# +parameter_doseresponse <- + function(df, control = 'vehicle', plot = TRUE) { + #First calculate the mean for the control/vehicle condition, and construct df with this. 
+ parameter_df <- + tibble(compound = (unique(filter( + df, ncgc_id != control + )$ncgc_id))) + for (i in 1:nrow(parameter_df)) { + + } + } + +calculate_zscore <- + function(df, control = 'vehicle', plot = FALSE) { + test_params <- + c('Tm_fit') + for (i in 1:length(test_params)) { + current_param <- test_params[i] + mean_control <- + mean(df[[current_param]][df$ncgc_id == control]) + std_control <- sd(df[[current_param]][df$ncgc_id == control]) + df[, paste(current_param, '_zscore', sep = '')] <- NA + for (i in 1:nrow(df)) { + df[i, paste(current_param, '_zscore', sep = '')] <- + (df[i, current_param] - mean_control) / std_control + } + } + return(df) + } + +convert_zscore <- function(df, control = 'vehicle', plot = FALSE) { + test_params <- + c('Tm_fit_zscore') + for (i in 1:length(test_params)) { + current_param <- test_params[i] + mean_control <- mean(df[[current_param]][df$ncgc_id == control]) + std_control <- sd(df[[current_param]][df$ncgc_id == control]) + df[, paste(current_param, '_pval', sep = '')] <- NA + + #Calculate p value for normal distribution from z score for each column. + for (i in 1:nrow(df)) { + df[i, paste(current_param, '_pval', sep = '')] <- + 2 * pnorm(-abs(df[i, current_param])) + } + } + return(df) +} + +# Fit the null model to a set of conc ~ resp values. +# Returns: RSS values +# Requires; df with concentration and response values +# df[1]: concentration values +# df[2]: response values +fit_nullmodel <- function(df, + plot.model, + graphTitle = '') { + null.model <- lm(resp ~ 1, data = df) + #Plot if TRUE. For diagnostic use mostly... + if (plot.model == TRUE) { + out <- paste(outdir,'models', sep="/") + try(jpeg(filename = paste(out, graphTitle, '.jpg', sep = + ''))) + try(plot( + df$conc, + df$resp, + main = graphTitle, + pch = 21, + cex = 3, + col = 'black', + bg = 'orange', + lwd = 3 + )) + try(abline(null.model, col = 'black', lwd = 3)) + try(dev.off() + ) + } + #Return squared residuals for null model + # message('NUll Model RSS: ', + # (sum(residuals(null.model) ^ 2))) + return(sum(residuals(null.model) ^ 2)) +} + +# Fit the alternate model log logistic to a set of conc ~ resp values. 
+# Returns: RSS values +# Requires: same as null_model +fit_altmodel <- function(df, + plot.model, + graphTitle = '') { + alt.model <- dr_fit(df) + if (plot.model == TRUE) { + out <- paste(outdir,'models', sep="/") + try(jpeg(filename = paste(out, graphTitle, '.jpg', sep = + ''))) + try(plot( + alt.model, + main = graphTitle, + pch = 21, + cex = 3, + col = 'black', + bg = 'cyan', + lwd = 3 + )) + try(dev.off() + ) + } + # message('Alternate Model RSS: ', + # (sum(residuals(alt.model) ^ 2))) + return(sum(residuals(alt.model) ^ 2)) +} + +# Derive RSS values for null and alternate model for each compound from full_df +compute.rss.models <- + function(df, + control = 'DMSO', + plotModel = TRUE, + rssPlot = TRUE, + drPlot = TRUE) { + #Construct tibble of unique compounds names + rss.df <- tibble(compound = (unique( + filter(df, ncgc_id != control | ncgc_id != 'vehicle')$ncgc_id + ))) %>% + filter(compound != 'control') %>% + filter(compound != 'vehicle') + rss.df$null.model.n <- NA + rss.df$alt.model.n <- NA + rss.df$null.model.sum <- NA + rss.df$alt.model.sum <- NA + rss.df$null.model.sd <- NA + rss.df$alt.model.sd <- NA + rss.df$rss.diff <- NA + rss.df$mannwhit.pval <- NA + rss.df$mannwhit.ec50 <- NA + + for (i in 1:nrow(rss.df)) { + #Construct df for current compound + fit.df <- df %>% filter(ncgc_id == toString(rss.df[i, 1])) %>% + dplyr::select(ncgc_id, conc, starts_with('t_')) %>% + dplyr::select(!contains('onset')) + + #Plot out dose-response thermogram here? + if (drPlot == TRUE) { + dr.thermogram(fit.df, target = rss.df$compound[i]) + } + + #Construct a df to hold the rss values until final calculations of mean,sd,N + cmpnd.fit.df <- fit.df %>% + dplyr::select(starts_with('t_')) + cmpnd.fit.df <- tibble(temp = colnames(cmpnd.fit.df)) + cmpnd.fit.df$null <- NA + cmpnd.fit.df$alt <- NA + + #Iterate through each temperature, construct df, perform rss analysis, and add to cmpnd.fit.df + for (t in 3:ncol(fit.df)) { + current.fit.df <- fit.df %>% + dplyr::select(1:2, colnames(fit.df)[t]) + colnames(current.fit.df)[3] <- 'resp' + cmpnd.fit.df$null[t - 2] <- + fit_nullmodel(current.fit.df, + plot.model = plotModel, + graphTitle = as.character(paste( + current.fit.df[1, 1], ' Null Model at ', colnames(fit.df)[t], sep = '' + ))) + cmpnd.fit.df$alt[t - 2] <- + fit_altmodel(current.fit.df, + plot.model = plotModel, + graphTitle = as.character(paste( + current.fit.df[1, 1], ' Alternate Model at ', colnames(fit.df)[t], sep = '' + ))) + } + # RSS0-RSS1 + cmpnd.fit.df <- cmpnd.fit.df %>% + mutate(diff = null - alt) + #Now, we calculate and assign rss values for both models in the rss.df for this compound. + rss.df$null.model.n[i] <- length(na.omit(cmpnd.fit.df$null)) + rss.df$alt.model.n[i] <- length(na.omit(cmpnd.fit.df$alt)) + rss.df$null.model.sum[i] <- sum(cmpnd.fit.df$null) + rss.df$alt.model.sum[i] <- sum(cmpnd.fit.df$alt) + rss.df$null.model.sd[i] <- sd(cmpnd.fit.df$null) + rss.df$alt.model.sd[i] <- sd(cmpnd.fit.df$alt) + rss.df$rss.diff[i] <- + sum(cmpnd.fit.df$null) - sum(cmpnd.fit.df$alt) + + #Perform Mann-Whitney iU test on alternative vs. null model dataframe for compound. 
+ mann.whit <- + wilcox.test(x = cmpnd.fit.df$null, + y = cmpnd.fit.df$alt, + exact = TRUE) + rss.df$mannwhit.pval[i] <- mann.whit$p.value + + #Message out RSS0-RSS1 and p value + message('RSS Difference for ', + rss.df[i, 1], + ': ', + rss.df$rss.diff[i]) + message('Mann-Whitney U Test p-val: ', + rss.df$mannwhit.pval[i]) + + # Construct drc model and derive ec50 if p-val is significant + if (rss.df$mannwhit.pval[i] <= 0.05) { + #Find what temperature is the max point + rss.max.temp <- + cmpnd.fit.df$temp[cmpnd.fit.df$diff == max(cmpnd.fit.df$diff)] + #Construct df ready for drc at max temperature + rss.drc.df <- fit.df %>% + dplyr::select(conc, one_of(rss.max.temp)) + colnames(rss.drc.df)[2] <- 'resp' + rss.drc.model <- dr_fit(rss.drc.df) + ec50.temp <- rss.drc.model$coefficients[4] + if (length(ec50.temp != 0)) { + rss.df$mannwhit.ec50[i] <- signif(ec50.temp, 3) + } + } + + #Plot the RSS values across the temperature range if true + if (rssPlot == TRUE) { + # First, clean up the temperatures + cmpnd.fit.df$temp <- sub('t_', '', cmpnd.fit.df$temp) + cmpnd.fit.df$temp <- as.numeric(cmpnd.fit.df$temp) + + #Plot RSS as + rss.plot <- ggplot(cmpnd.fit.df, aes(x = temp, y = diff)) + + geom_point(shape = 21, + size = 4, + fill = '#AA4499') + + theme_minimal() + + labs( + title = paste(current.fit.df[1, 1], ' RSS Difference', sep = ''), + subtitle = paste('Mann-Whitney U pval: ', signif(rss.df$mannwhit.pval[i])), + sep = '', + y = 'RSS0-RSS1', + x = 'Temperature [C]' + ) + print(rss.plot) + out <- paste(outdir,'models', current.fit.df[1, 1], sep="/") + ggsave( + filename = paste(out, '_rss.png', sep =''), + scale = 1.25, + dpi = 'retina' + ) + } + } + return(rss.df) + } + +# Compute the rss difference and significance for each of the parameters +compute_parameter.rssmodel <- function(df, plotModel = FALSE) { + #Test parameters for standard model + test_params <- + c('Tm_fit', + 'auc') + + #Construct df of unique compounds and initialize parameter readouts. + param.rss.df <- tibble(compound = (unique( + filter(df, ncgc_id != 'control' & ncgc_id != 'vehicle')$ncgc_id + ))) + param.rss.df$Tm_fit.ec50 <- as.numeric(NA) + param.rss.df$Tm_fit.pval <- as.numeric(NA) + param.rss.df$Tm_fit.maxDiff <- as.numeric(NA) + param.rss.df$auc.ec50 <- as.numeric(NA) + param.rss.df$auc.pval <- as.numeric(NA) + param.rss.df$auc.maxDiff <- as.numeric(NA) + + control.means <- control_analysis(df, output = 'df') + + for (i in 1:nrow(param.rss.df)) { + cmpnd.fit.df <- df %>% + filter(ncgc_id == param.rss.df$compound[i]) + #Now iterate through columns in test_params + for (p in 1:length(test_params)) { + current_param <- test_params[p] + current.fit.df <- cmpnd.fit.df %>% + dplyr::select(ncgc_id, conc, I(test_params[p])) + colnames(current.fit.df)[3] <- 'resp' + current.model <- dr_fit(current.fit.df) + + #Workaround to avoid drm that can't converge + if (class(current.model) != 'list') { + param.rss.df[i, paste(current_param, '.pval', sep = '')] <- + noEffect(current.model)[3] + + param.rss.df[i, paste(current_param, '.ec50', sep = '')] <- + summary(current.model)$coefficients[4] + + #Calculate the maximum difference in param and subtract negative control mean from it. 
+ current.fit.df$absDiff <- + abs(current.fit.df$resp - control.means$means[control.means$parameters == + current_param]) + param.rss.df[i, paste(current_param, '.maxDiff', sep = '')] <- + current.fit.df$resp[current.fit.df$absDiff == max(current.fit.df$absDiff)] - control.means$means[control.means$parameters == + current_param] + + message('Analyzing Compound ', param.rss.df[i, 1], '...') + message(current_param) + message('EC50: ', param.rss.df[i, paste(current_param, '.ec50', sep = + '')]) + message('No Effect ANOVA p-val: ', signif(param.rss.df[i, paste(current_param, '.pval', sep = + '')]), 1) + if (plotModel == TRUE) { + out <- paste(outdir,'models', param.rss.df[i, 1], sep="/") + png( + filename = paste( + out, + '_', + current_param, + '.png', + sep = '' + ), + bg = 'transparent' + ) + plot( + current.model, + main = paste( + param.rss.df[i, 1], + '\n', + ' NoEffect pval: ', + signif(param.rss.df[i, paste(current_param, '.pval', sep = + '')]), + '\n', + 'EC50: ', + signif(param.rss.df[i, paste(current_param, '.ec50', sep = + '')]), + '\n', + current_param + ) + ) + dev.off() + } + } + } + } + return(param.rss.df) +} + +# Creates a thermogram with all concentrations of a target for plotting +# Must match exact ncgc_id in well assignment.. +dr.thermogram <- function(df, target = '') { + # Create df with the compound, conc, and temperature columns + # df <- df %>% + # dplyr::select(ncgc_id, conc, matches('t_\\d')) %>% + # filter(., ncgc_id == target) + df <- df %>% + pivot_longer(cols = 3:ncol(df), + names_to = 'temp', + values_to = 'resp') + df$temp <- as.numeric(sub('t_', '', df$temp)) + + dr.plot <- ggplot(df, aes( + y = resp, + x = temp, + fill = as.factor(signif(conc)), + group_by(signif(conc)) + )) + + geom_line(color = 'black', + alpha = 0.8, + size = 1) + + geom_point(shape = 21, size = 3) + + theme_minimal() + + scale_color_viridis_d() + + labs( + title = paste('Dose-Response Thermogram for ', target, sep = ''), + x = 'Temperature [C]', + y = 'Response', + fill = 'Concentration' + ) + + theme() + print(dr.plot) + out <- paste(outdir,'models', 'dr_', sep="/") + ggsave( + filename = paste(out, target, '.png', sep = ''), + scale = 1.25, + dpi = 'retina' + ) + return(dr.plot) +} + +# Export heatmaps of EC50 and P-values across analysis parameters +# Pass in parameters df +parameter_heatmaps <- function(df, plotHeat = FALSE) { + ec50.heat.df <- df %>% + dplyr::select(compound, contains('ec50')) %>% + pivot_longer(cols = !compound, + names_to = 'parameter', + values_to = 'ec50') %>% + mutate(ec50 = log10(ec50)) + ec50.heat.df$parameter <- ec50.heat.df$parameter %>% + sub('.ec50', '', .) + + ec50.heat.plot <- + ggplot(ec50.heat.df, + aes( + x = parameter, + y = compound, + fill = ec50, + label = signif(ec50) + )) + + geom_tile(color = 'black') + + geom_text(alpha = 0.85, size = 2.5) + + theme_minimal() + + scale_fill_gradientn(colors = c('#EE3377', '#DDCC77', '#88CCEE'), ) + + labs(title = 'EC50 Parameter Comparison', + fill = 'Log EC50') + + theme( + axis.title.y = element_blank(), + axis.title.x = element_blank(), + axis.text.x = element_text(size = 12, face = 'bold') + ) + + pval.heat.df <- df %>% + dplyr::select(compound, contains('pval')) %>% + pivot_longer(cols = !compound, + names_to = 'parameter', + values_to = 'pval') %>% + mutate(sigVal = ifelse(pval < (0.05 / length(unique( + df + ))), 'Significant', 'Insignificant')) + pval.heat.df$parameter <- pval.heat.df$parameter %>% + sub('.pval', '', .) 
+ pval.heat.plot <- + ggplot(pval.heat.df, + aes( + x = parameter, + y = compound, + fill = sigVal, + label = signif(pval) + )) + + geom_tile(color = 'black') + + geom_text(alpha = 0.85, size = 2.5) + + theme_minimal() + + labs(title = 'P-Value Parameter Comparison', + fill = 'P-Value') + + theme( + axis.title.y = element_blank(), + axis.title.x = element_blank(), + axis.text.x = element_text(size = 12, face = 'bold'), + + ) + if (plotHeat == TRUE) { + print(pval.heat.plot) + out <- paste(outdir, 'pval_heatmap.png' , sep="/") + ggsave(out, + dpi = 'retina', + scale = 1.25) + print(ec50.heat.plot) + out <- paste(outdir, 'ec50_heatmap.png' , sep="/") + ggsave(out, + dpi = 'retina', + scale = 1.25) + } +} + +# Mutates a binary variable testing each analysis method for significance +# 0 if insignificant +# 1 if significant +determineSig <- function(df, alpha = 0.05) { + analysisMethods <- dplyr::select(df, contains('pval')) + analysisMethods <- colnames(analysisMethods) + analysisMethodsNames <- sub('pval', 'pval.sig', analysisMethods) + sigVal <- alpha / nrow(df) + for (i in 1:length(analysisMethods)) { + df[, analysisMethodsNames[i]] <- + ifelse(df[, analysisMethods[i]] < sigVal, 1, 0) + df[is.na(df)] <- 0 + } + return(df) +} + +# Use after determineSig from above +rankOrder <- function(df) { + methodSig <- dplyr::select(df, contains('pval.sig')) + methodSig <- colnames(methodSig) + methodRank <- sub('pval.sig', 'rankOrder', methodSig) + methods <- sub('.rankOrder', '', methodRank) + methodsEC <- paste(methods, '.ec50', sep = '') + + for (i in 1:length(methods)) { + rank.df <- filter(df, df[, (methodSig[i])] == 1) + rank.df[, methodRank[i]] <- + as.integer(rank(rank.df[, methodsEC[i]])) + df <- left_join(df, rank.df) + } + return(df) +} diff --git a/regression/rt-cetsa-analysis-tool/src/polus/tabular/regression/rt_cetsa_analysis/main.R b/regression/rt-cetsa-analysis-tool/src/polus/tabular/regression/rt_cetsa_analysis/main.R new file mode 100644 index 0000000..2d5ee87 --- /dev/null +++ b/regression/rt-cetsa-analysis-tool/src/polus/tabular/regression/rt_cetsa_analysis/main.R @@ -0,0 +1,16 @@ +suppressWarnings(library(logging)) +library(tidyverse) + + +# Initialize the logger +basicConfig() + +args = commandArgs(trailingOnly=TRUE) + +data <- args[1] +outdir <- args[2] + +loginfo('data = %s', data) +loginfo('outdir = %s', outdir) + +source('./main_analysis.R') diff --git a/regression/rt-cetsa-analysis-tool/src/polus/tabular/regression/rt_cetsa_analysis/main_analysis.R b/regression/rt-cetsa-analysis-tool/src/polus/tabular/regression/rt_cetsa_analysis/main_analysis.R new file mode 100644 index 0000000..7deea6e --- /dev/null +++ b/regression/rt-cetsa-analysis-tool/src/polus/tabular/regression/rt_cetsa_analysis/main_analysis.R @@ -0,0 +1,89 @@ +# # @@@@@@@ @@@@@@@ @@@@@@@ @@@@@@@@ @@@@@@@ @@@@@@ @@@@@@ +# # @@@@@@@@ @@@@@@@ @@@@@@@@ @@@@@@@@ @@@@@@@ @@@@@@@ @@@@@@@@ +# # @@! @@@ @@! !@@ @@! @@! !@@ @@! @@@ +# # !@! @!@ !@! !@! !@! !@! !@! !@! @!@ +# # @!@!!@! @!! @!@!@!@!@ !@! @!!!:! @!! !!@@!! @!@!@!@! +# # !!@!@! !!! !!!@!@!!! !!! !!!!!: !!! !!@!!! !!!@!!!! +# # !!: :!! !!: :!! !!: !!: !:! !!: !!! +# # :!: !:! :!: :!: :!: :!: !:! :!: !:! 
+# # :: ::: :: ::: ::: :: :::: :: :::: :: :: ::: +# # NonParametric Multiparameter Analysis of CETSA/RT-CETSA Experimental Sets +# # +# # Written by: Michael Ronzetti {NIH/NCATS/UMD} +# # Patents: PCT/US21/45184, HHS E-022-2022-0-US-01 +# # Main Analysis + +library(tidyverse) +library(readxl) +library(stringr) +library(drc) +library(ggthemes) +library(cowplot) +library(hrbrthemes) +library(ggpubr) +library(MESS) +library(devtools) + +# read in input data +full_df <- read_csv(data, show_col_types = FALSE) + +# BECAUSE OF BUG +pdf(file = NULL) + +# EXPERIMENTAL PARAMETERS AND SETUP +# +# Input experiment parameters here +startTemp <- 37 +endTemp <- 95 +plate_format <- 384 +control <- 'vehicle' +pc <- 'control' + +source('./functions.R') + +full_df <- calculate_auc(full_df) + +# Perform some preliminary control group analysis of variability +control_df <- + control_grouping(full_df, control, pc) # Pull out control compound datapoints +control_var <- + control_variability(control_df) # Read out the control group variability +controlPlot <- + control_analysis( + full_df, + nc = 'vehicle', + pc = 'control', + output = 'plot', + controlDF = control_var + ) + +#Calculate melting parameter difference for each well from MoltenProt +# full_df <- calculate_meltingparams(full_df) %>% +# calculate_zscore() %>% +# convert_zscore + +#Derive RSS values for null and alternate model for each compound from full_df +rss <- compute.rss.models(full_df, rssPlot = FALSE, drPlot = FALSE, plotModel = FALSE) + +#Perform dose-response for each thermogram parameter +parameters <- compute_parameter.rssmodel(full_df, plotModel = FALSE) + +#Merge these plots for further analysis +signif.df <- merge(rss, parameters) +colnames(signif.df)[9] <- 'mannwhit.pval' +signif.df <- determineSig(signif.df) +signif.df <- rankOrder(signif.df) + +# Volcano plots comparing the different parameters of analysis against the NPARC RSS Difference +# Colored by significance test and whether the compound passes any. +plot_volcanos(signif.df, save = FALSE) + +# Plot of RSS Differences vs. p-values for NPARC +rss.pval.plot(signif.df, savePlot = TRUE) + +#Heatmap of compounds vs. different measurement styles. 
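+# (plotHeat = TRUE also writes ec50_heatmap.png and pval_heatmap.png to outdir)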
+parameter_heatmaps(signif.df, plotHeat = TRUE) + +#Write out signif.df and full_df +write.csv(x = full_df, file = paste(outdir,'full_df.csv',sep="/")) +write.csv(x = signif.df, file = paste(outdir,'signif_df.csv',sep="/")) diff --git a/regression/rt-cetsa-analysis-tool/src/polus/tabular/regression/rt_cetsa_analysis/preprocess_data.py b/regression/rt-cetsa-analysis-tool/src/polus/tabular/regression/rt_cetsa_analysis/preprocess_data.py new file mode 100644 index 0000000..ac4dccb --- /dev/null +++ b/regression/rt-cetsa-analysis-tool/src/polus/tabular/regression/rt_cetsa_analysis/preprocess_data.py @@ -0,0 +1,134 @@ +"""Preprocess Data.""" + +from pathlib import Path + +import openpyxl +import pandas as pd + + +def preprocess_data(platemap_path, values_path, params_path, out_dir): + """Preprocessing all data before running analysis.""" + prepared_platemap = prepare_platemap(platemap_path, out_dir) + sample = preprocess_platemap_sample(prepared_platemap, out_dir) + conc = preprocess_platemap_conc(prepared_platemap, out_dir) + values = preprocess_values(values_path, out_dir) + params = preprocess_params(params_path, out_dir) + df = pd.concat([sample, conc, params, values], axis=1) + df = df.astype({"row": int, "col": int}) + first_column = df.pop("well") + df.insert(0, "well", first_column) + data_path = out_dir / "data.csv" + df.to_csv(data_path, index=True) + return data_path + + +def prepare_platemap(platemap_path: Path, out_dir: Path): + """Preprocess platemap to normalize inputs for downstream tasks.""" + platemap = openpyxl.load_workbook(platemap_path) + for name in platemap.sheetnames: + if "sample" in name.lower(): + sample = platemap[name] + sample.title = "sample" + if "conc" in name.lower(): + conc = platemap[name] + conc.title = "conc" + + prepared_platemap_path = out_dir / platemap_path.name + platemap.save(prepared_platemap_path) + return prepared_platemap_path + + +def preprocess_platemap_sample(platemap_path: Path, out_dir: Path): + """Preprocess platemap sample sheet. + + Returns: + dataframe with row, col and ncgc_id columns. + """ + df = pd.read_excel(platemap_path, "sample") + df.drop(columns=df.columns[0], axis=1, inplace=True) + df = df.stack() + df = df.reset_index(level=[0, 1], name="ncgc_id") + df.rename(columns={"level_0": "row", "level_1": "col"}, inplace=True) + df["row"] += 1 + df.index += 1 + + df = df.replace("empty", "vehicle") # empty are relabed as vehicle + + processed_platemap_path = out_dir / (platemap_path.stem + "_sample.csv") + df.to_csv(processed_platemap_path, index=True) + return df + + +def preprocess_platemap_conc(platemap_path: Path, out_dir: Path): + """Preprocess platemap conc sheet. + + Returns: + dataframe with conc column. + """ + df = pd.read_excel(platemap_path, "conc") + df.drop(columns=df.columns[0], axis=1, inplace=True) + df = df.stack() + df = df.reset_index(level=[0, 1], name="conc") + df.drop(columns=["level_0", "level_1"], inplace=True) + df.index += 1 + processed_platemap_path = out_dir / (platemap_path.stem + "_conc.csv") + df.to_csv(processed_platemap_path, index=True) + return df + + +def preprocess_params(params_path: Path, out_dir: Path): + """Preprocess moltenprot params. + + Returns: + dataframe subselection of columns. + All values are converted to celsius. 
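+ Specifically, Tm_fit and T_onset are shifted from Kelvin to Celsius (value - 273.15).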
+ """ + df = pd.read_csv(params_path) + df = df[["dHm_fit", "Tm_fit", "BS_factor", "T_onset", "dG_std"]] + df["Tm_fit"] = df["Tm_fit"] - 273.15 + df["T_onset"] = df["T_onset"] - 273.15 + df.index += 1 + processed_params_path = out_dir / params_path.name + df.to_csv(processed_params_path, index=True) + return df + + +def preprocess_values(values_path: Path, out_dir: Path): + """Preprocess moltenprot values. + + Returns: + dataframe measurement series for each well. + All temperature are converted to celsius + """ + df = pd.read_csv(values_path) + # update temp to plug to the rest of R code. + df["Temperature"] = df["Temperature"] - 273.15 + df["Temperature"] = df["Temperature"].round(2) + df["Temperature"] = df["Temperature"].apply(lambda t: "t_" + str(t)) + # the only step that is really necessary + df = df.transpose() + df.columns = df.iloc[0] # make the first row the column names + df = df.drop(df.index[0]) # drop first row + df = df.reset_index(names=["well"]) # remove well names and use seq index + df.index += 1 # should start at 1 + processed_values_path = out_dir / values_path.name + df.to_csv(processed_values_path, index=True) + return df + + +def preprocess_values_R(values_path: Path, out_dir: Path): + """NOTE: this should be removed.""" + df = pd.read_csv(values_path) + # update temp to plug to the rest of R code. + df["Temperature"] = df["Temperature"] - 273.15 + df["Temperature"] = df["Temperature"].round(2) + df["Temperature"] = df["Temperature"].apply(lambda t: "t_" + str(t)) + # the only step that is really necessary + df = df.transpose() + df.columns = df.iloc[0] # make the first row the column names + df = df.drop(df.index[0]) # drop first row + df = df.reset_index(drop=True) # remove well names and use seq index + df.index += 1 # should start at 1 + processed_values_path = out_dir / values_path.name + df.to_csv(processed_values_path, index=True) + return processed_values_path diff --git a/regression/rt-cetsa-analysis-tool/src/polus/tabular/regression/rt_cetsa_analysis/run_rscript.py b/regression/rt-cetsa-analysis-tool/src/polus/tabular/regression/rt_cetsa_analysis/run_rscript.py new file mode 100644 index 0000000..f136741 --- /dev/null +++ b/regression/rt-cetsa-analysis-tool/src/polus/tabular/regression/rt_cetsa_analysis/run_rscript.py @@ -0,0 +1,41 @@ +"""Run R scripts.""" + +import logging +import os +import subprocess +from pathlib import Path + +POLUS_LOG = os.environ.get("POLUS_LOG", logging.INFO) +WORKDIR = os.environ.get("WORKDIR", "") + +logger = logging.getLogger("rt_cetsa_analysis") +logger.setLevel(POLUS_LOG) + + +def run_rscript( + data_filepath: Path, + out_dir: Path, +): + """Run R script.""" + cwd = Path(__file__).parent + + if WORKDIR: + cwd = ( + Path(WORKDIR) + / "src" + / "polus" + / "tabular" + / "regression" + / "rt_cetsa_analysis/" + ) + + logger.info(f"current working directory : {cwd.as_posix()}") + + cmd = [ + "Rscript", + "./main.R", + data_filepath.as_posix(), + out_dir.as_posix(), + ] + + subprocess.run(args=cmd, cwd=cwd) diff --git a/regression/rt-cetsa-analysis-tool/tests/__init__.py b/regression/rt-cetsa-analysis-tool/tests/__init__.py new file mode 100644 index 0000000..d420712 --- /dev/null +++ b/regression/rt-cetsa-analysis-tool/tests/__init__.py @@ -0,0 +1 @@ +"""Tests.""" diff --git a/regression/rt-cetsa-analysis-tool/tests/conftest.py b/regression/rt-cetsa-analysis-tool/tests/conftest.py new file mode 100644 index 0000000..4983315 --- /dev/null +++ b/regression/rt-cetsa-analysis-tool/tests/conftest.py @@ -0,0 +1,13 @@ +"""Set up.""" 
+import pytest + + +def pytest_addoption(parser: pytest.Parser) -> None: + """Add options to pytest.""" + parser.addoption( + "--slow", + action="store_true", + dest="slow", + default=False, + help="run slow tests", + ) diff --git a/regression/rt-cetsa-analysis-tool/tests/test_run_rscript.py b/regression/rt-cetsa-analysis-tool/tests/test_run_rscript.py new file mode 100644 index 0000000..fb8cb0b --- /dev/null +++ b/regression/rt-cetsa-analysis-tool/tests/test_run_rscript.py @@ -0,0 +1,20 @@ +"""Tests.""" +from pathlib import Path + +import pytest +from polus.tabular.regression.rt_cetsa_analysis.run_rscript import run_rscript + + +@pytest.mark.skipif("not config.getoption('slow')") +def test_run_rscript(): + """Run R script.""" + inpDir = Path.cwd() / "tests" / "data" + params = "plate_(1-59)_moltenprot_params.csv" + values = "plate_(1-59)_moltenprot_curves.csv" + platemap = Path.cwd() / "tests" / "data" / "platemap.xlsx" + outDir = Path.cwd() / "tests" / "out" + + params = inpDir / params + values = inpDir / values + + run_rscript(params, values, platemap, outDir) diff --git a/regression/rt-cetsa-moltenprot-tool/.bumpversion.cfg b/regression/rt-cetsa-moltenprot-tool/.bumpversion.cfg new file mode 100644 index 0000000..8f4a97e --- /dev/null +++ b/regression/rt-cetsa-moltenprot-tool/.bumpversion.cfg @@ -0,0 +1,33 @@ +[bumpversion] +current_version = 0.5.0-dev0 +commit = True +tag = False +parse = (?P\d+)\.(?P\d+)\.(?P\d+)(\-(?P[a-z]+)(?P\d+))? +serialize = + {major}.{minor}.{patch}-{release}{dev} + {major}.{minor}.{patch} + +[bumpversion:part:release] +optional_value = _ +first_value = dev +values = + dev + _ + +[bumpversion:part:dev] + +[bumpversion:file:VERSION] + +[bumpversion:file:pyproject.toml] +search = version = "{current_version}" +replace = version = "{new_version}" + +[bumpversion:file:README.md] + +[bumpversion:file:src/polus/tabular/regression/rt_cetsa_moltenprot/__init__.py] + +[bumpversion:file:plugin.json] + +[bumpversion:file:ict.yml] + +[bumpversion:file:rt_cetsa_moltenprot.cwl] diff --git a/regression/rt-cetsa-moltenprot-tool/.gitignore b/regression/rt-cetsa-moltenprot-tool/.gitignore new file mode 100644 index 0000000..e69de29 diff --git a/regression/rt-cetsa-moltenprot-tool/Dockerfile b/regression/rt-cetsa-moltenprot-tool/Dockerfile new file mode 100755 index 0000000..4e6d5bc --- /dev/null +++ b/regression/rt-cetsa-moltenprot-tool/Dockerfile @@ -0,0 +1,21 @@ +FROM polusai/bfio:2.1.9 + +# environment variables defined in polusai/bfio +ENV EXEC_DIR="/opt/executables" +ENV POLUS_IMG_EXT=".ome.tif" +ENV POLUS_TAB_EXT=".csv" +ENV POLUS_LOG="INFO" + +# Work directory defined in the base container +WORKDIR ${EXEC_DIR} + +COPY pyproject.toml ${EXEC_DIR} +COPY VERSION ${EXEC_DIR} +COPY README.md ${EXEC_DIR} +COPY src ${EXEC_DIR}/src + +RUN pip3 install ${EXEC_DIR} --no-cache-dir +RUN pip3 install . + +ENTRYPOINT ["python3", "-m", "polus.tabular.regression.rt_cetsa_moltenprot"] +CMD ["--help"] diff --git a/regression/rt-cetsa-moltenprot-tool/README.md b/regression/rt-cetsa-moltenprot-tool/README.md new file mode 100644 index 0000000..4ea0d90 --- /dev/null +++ b/regression/rt-cetsa-moltenprot-tool/README.md @@ -0,0 +1,23 @@ +# RT_CETSA MoltenProt (v0.5.0-dev0) + +This WIPP plugin runs moltenprot regression for the RT-CETSA pipeline. + +## Building + +To build the Docker image for the conversion plugin, run +`./build-docker.sh`. + +## Install WIPP Plugin + +If WIPP is running, navigate to the plugins page and add a new plugin. 
Paste the contents of `plugin.json` into the pop-up window and submit. + +## Options + +This plugin takes eight input argument and one output argument: + +| Name | Description | I/O | Type | +|-----------------|----------------------------------------------------|--------|-------------| +| `--inpDir` | Input data collection to be processed by this tool | Input | genericData | +| `--filePattern` | File Pattern to parse input files | Input | string | +| `--outDir` | Output file | Output | genericData | +| `--preview` | Generate JSON file with outputs | Output | JSON | diff --git a/regression/rt-cetsa-moltenprot-tool/VERSION b/regression/rt-cetsa-moltenprot-tool/VERSION new file mode 100644 index 0000000..412ff8c --- /dev/null +++ b/regression/rt-cetsa-moltenprot-tool/VERSION @@ -0,0 +1 @@ +0.5.0-dev0 diff --git a/regression/rt-cetsa-moltenprot-tool/build-docker.sh b/regression/rt-cetsa-moltenprot-tool/build-docker.sh new file mode 100755 index 0000000..80539db --- /dev/null +++ b/regression/rt-cetsa-moltenprot-tool/build-docker.sh @@ -0,0 +1,4 @@ +#!/bin/bash + +version=$(", + "Antoine Gerardin ", + "Najib Ishaq ", +] +readme = "README.md" +packages = [{include = "polus", from = "src"}] + +[tool.poetry.dependencies] +python = ">=3.9,<3.12" +typer = "^0.7.0" +filepattern = "^2.0.5" +pandas = "^2.2.2" +matplotlib = "^3.8.4" +scipy = "^1.13.0" + +[tool.poetry.group.dev.dependencies] +bump2version = "^1.0.1" +pre-commit = "^3.1.0" +pytest = "^7.2.1" + +[build-system] +requires = ["poetry-core"] +build-backend = "poetry.core.masonry.api" + +[tool.ruff] +extend = "../../ruff.toml" +extend-ignore = [ + "RET505", # Unnecessary `else` after `return` statement + "E501", # Line too long + "ANN001", # Missing type annotation + "D102", # Missing docstring in public method + "ANN201", # Missing return type annotation + "N806", # Variable in function should be lowercase + "D205", # 1 blank line required between summary line and description + "N803", # Argument name should be lowercase + "PLR0913", # Too many arguments + "D415", # First line should end with a period, question mark, or exclamation point + "PLR2004", # Magic value used in comparison + "B006", # Do not use mutable default arguments + "D107", # Missing docstring + "D101", # Missing docstring + "E731", # Do not assign a lambda expression, use a def + "E402", # Module level import not at top of file + "PTH123", # `open()` should be replaced with `Path.open()` + "PTH118", # `os.path.join()` should be replaced with `/` operator + "PTH100", # `os.path.abspath()` should be replaced with `Path.resolve()` + "PLR0915", # Too many statements + "PLR0912", # Too many branches + "C901", # Function is too complex + "T201", # `print` used + "E722", # Do not use bare 'except' + "B904", # Within an `except` clause, raise exceptions with `raise ... from err` or `raise ... 
from None` to distinguish them from errors in exception handling + "ANN202", # Missing return type annotation for private function + "ARG002", # Unused method argument + "N802", # Function name should be lowercase + "PTH103", # `os.makedirs()` should be replaced with `Path.mkdir(parents=True)` + "ANN003", # Missing type annotation for `**kwargs` + "B007", # Loop control variable not used within the loop body + "ANN204", # Missing return type annotation for magic method + "D417", # Missing argument descriptions in the docstring + "ANN205", # Missing return type annotation for static method + "PLR5501", # Use `elif` instead of `else` following `if` condition to avoid unnecessary indentation + "EM102", # Exception must not use an f-string literal + "D414", # Section has no content + "RUF012", # Mutable class attributes should be annotated with `typing.ClassVar` + "A001", # Variable `input` is shadowing a Python builtin + "A002", # Argument `input` is shadowing a Python builtin + "E741", # Ambiguous variable name: `l` + "PTH120", # `os.path.dirname()` should be replaced by `Path.parent` + "N816", # Variable `cfFilename` in global scope should not be mixedCase + "PTH109", # `os.getcwd()` should be replaced by `Path.cwd()` +] diff --git a/regression/rt-cetsa-moltenprot-tool/rt_cetsa_moltenprot.cwl b/regression/rt-cetsa-moltenprot-tool/rt_cetsa_moltenprot.cwl new file mode 100644 index 0000000..272ae56 --- /dev/null +++ b/regression/rt-cetsa-moltenprot-tool/rt_cetsa_moltenprot.cwl @@ -0,0 +1,32 @@ +class: CommandLineTool +cwlVersion: v1.2 +inputs: + inpDir: + inputBinding: + prefix: --inpDir + type: Directory + filePattern: + inputBinding: + prefix: --intensities + type: string? + preview: + inputBinding: + prefix: --preview + type: boolean? + outDir: + inputBinding: + prefix: --outDir + type: Directory +outputs: + outDir: + outputBinding: + glob: $(inputs.outDir.basename) + type: Directory +requirements: + DockerRequirement: + dockerPull: polusai/rt-cetsa-moltenprot-tool:0.5.0-dev0 + InitialWorkDirRequirement: + listing: + - entry: $(inputs.outDir) + writable: true + InlineJavascriptRequirement: {} diff --git a/regression/rt-cetsa-moltenprot-tool/run-plugin.sh b/regression/rt-cetsa-moltenprot-tool/run-plugin.sh new file mode 100755 index 0000000..948a180 --- /dev/null +++ b/regression/rt-cetsa-moltenprot-tool/run-plugin.sh @@ -0,0 +1,20 @@ +#!/bin/bash +version=$( tuple[pandas.DataFrame, pandas.DataFrame]: + """Run moltenprot. + + Args: + file_path : path to intensities file. + + Returns: + tuple of dataframe containing the fit_params and the curve values. 
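+
+ Note: moltenprot_params is expected to provide the keys baseline_fit,
+ baseline_bounds, savgol, trim_max and trim_min (they are passed through to fit_data).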
+ """ + fit = fit_data(file_path, moltenprot_params) + + # sort fit_params by row/column + fit_params = fit.plate_results + fit_params["_index"] = fit_params.index + fit_params["letter"] = fit_params.apply(lambda row: row._index[:1], axis=1) + fit_params["number"] = fit_params.apply( + lambda row: row._index[1:], + axis=1, + ).astype(int) + fit_params = fit_params.drop(columns="_index") + fit_params = fit_params.sort_values(["letter", "number"]) + fit_params = fit_params.drop(columns=["letter", "number"]) + + # keep only 2 signicant digits for temperature index + fit_curves = fit.plate_raw_corr + fit_curves.index = fit_curves.index.map(lambda t: round(t, 2)) + + return fit_params, fit_curves + + +def fit_data( + file_path: pathlib.Path, + moltenprot_params: dict[str, int], +) -> core.MoltenProtFit: + """Fit data to a model using Moltprot.""" + fit = core.MoltenProtFit( + filename=file_path, + input_type="csv", + ) + + fit.SetAnalysisOptions( + model="santoro1988", + baseline_fit=moltenprot_params["baseline_fit"], + baseline_bounds=moltenprot_params["baseline_bounds"], + dCp=0, + onset_threshold=0.01, + savgol=moltenprot_params["savgol"], + blanks=[], + exclude=[], + invert=False, + mfilt=None, + shrink=None, + trim_max=moltenprot_params["trim_max"], + trim_min=moltenprot_params["trim_min"], + ) + + fit.PrepareData() + fit.ProcessData() + + return fit diff --git a/regression/rt-cetsa-moltenprot-tool/src/polus/tabular/regression/rt_cetsa_moltenprot/__main__.py b/regression/rt-cetsa-moltenprot-tool/src/polus/tabular/regression/rt_cetsa_moltenprot/__main__.py new file mode 100644 index 0000000..6eabd8e --- /dev/null +++ b/regression/rt-cetsa-moltenprot-tool/src/polus/tabular/regression/rt_cetsa_moltenprot/__main__.py @@ -0,0 +1,130 @@ +"""CLI for rt-cetsa-moltprot-tool.""" + +import json +import logging +import os +import pathlib + +import typer +from polus.tabular.regression.rt_cetsa_moltenprot import run_moltenprot_fit + +# Initialize the logger +logging.basicConfig( + format="%(asctime)s - %(name)-8s - %(levelname)-8s - %(message)s", + datefmt="%d-%b-%y %H:%M:%S", +) +logger = logging.getLogger(__file__) +logger.setLevel(os.environ.get("POLUS_LOG", logging.INFO)) + +POLUS_TAB_EXT = os.environ.get("POLUS_TAB_EXT", ".csv") + +app = typer.Typer() + + +@app.command() +def main( + inp_dir: pathlib.Path = typer.Option( + ..., + "--inpDir", + help="Input directory containing the data files.", + exists=True, + dir_okay=True, + readable=True, + resolve_path=True, + ), + intensities: str = typer.Option( + None, + "--intensities", + help="name of the intensities file (optional).", + ), + out_dir: pathlib.Path = typer.Option( + ..., + "--outDir", + help="Output directory to save the results.", + exists=True, + dir_okay=True, + writable=True, + resolve_path=True, + ), + preview: bool = typer.Option( + False, + "--preview", + help="Preview the files that will be processed.", + ), + savgol: int = typer.Option( + 10, + "--savgol", + help="molten prot savgol parameter.", + ), + trim_min: int = typer.Option( + 0, + "--trim_min", + help="molten prot trim_min parameter.", + ), + trim_max: int = typer.Option( + 0, + "--trim_max", + help="molten prot trim_max parameter.", + ), + baseline_fit: int = typer.Option( + 3, + "--baseline_fit", + help="molten prot baseline_fit parameter.", + ), + baseline_bounds: int = typer.Option( + 3, + "--baseline_bounds", + help="molten prot baseline_bounds parameter.", + ), +) -> None: + """CLI for rt-cetsa-moltprot-tool.""" + logger.info("Starting the CLI for 
rt-cetsa-moltprot-tool.") + + logger.info(f"Input directory: {inp_dir}") + logger.info(f"Output directory: {out_dir}") + + moltenprot_params = { + "savgol": savgol, + "trim_max": trim_max, + "trim_min": trim_min, + "baseline_fit": baseline_fit, + "baseline_bounds": baseline_bounds, + } + + logger.info(f"Moltenprot params {moltenprot_params}") + + # NOTE we may eventually deal with other types. + if POLUS_TAB_EXT != ".csv": + msg = "this tool can currently only process csv files." + raise ValueError(msg) + + if intensities is not None: + intensities_file = inp_dir / intensities + if not intensities_file.exists(): + raise FileNotFoundError(intensities_file) + else: + if len(list(inp_dir.iterdir())) != 1: + raise FileExistsError( + f"There should be a single intensities file in {inp_dir}", + ) + intensities_file = next(inp_dir.iterdir()) + logger.info(f"Using intensities file: {intensities_file}") + + if preview: + outputs = ["params" + POLUS_TAB_EXT, "values" + POLUS_TAB_EXT] + out_json = {"files": outputs} + with (out_dir / "preview.json").open("w") as f: + json.dump(out_json, f, indent=2) + return + + fit_params, fit_curves = run_moltenprot_fit(intensities_file, moltenprot_params) + + fit_params_path = out_dir / ("params" + POLUS_TAB_EXT) + fit_curves_path = out_dir / ("values" + POLUS_TAB_EXT) + + fit_params.to_csv(fit_params_path, index=True) + fit_curves.to_csv(fit_curves_path, index=True) + + +if __name__ == "__main__": + app() diff --git a/regression/rt-cetsa-moltenprot-tool/src/polus/tabular/regression/rt_cetsa_moltenprot/core.py b/regression/rt-cetsa-moltenprot-tool/src/polus/tabular/regression/rt_cetsa_moltenprot/core.py new file mode 100644 index 0000000..79d51f7 --- /dev/null +++ b/regression/rt-cetsa-moltenprot-tool/src/polus/tabular/regression/rt_cetsa_moltenprot/core.py @@ -0,0 +1,3780 @@ +"""Copyright 2018-2021 Vadim Kotov, Thomas C. Marlovits. + +This file is part of MoltenProt. + +MoltenProt is free software: you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation, either version 3 of the License, or +(at your option) any later version. + +MoltenProt is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with MoltenProt. If not, see . +""" + +### Citation +# a simple dict with strings that provide different citation formatting +citation = { + "long": "\nIf you found MoltenProt helpful in your work, please cite:\nKotov et al., Protein Science (2021)\ndoi: 10.1002/pro.3986\n", + "html": """

+If you found MoltenProt helpful in your work, please cite:
+Kotov et al., Protein Science (2021)
+doi: 10.1002/pro.3986
""", + "short": "Citation: Kotov et al., Protein Science (2021) doi: 10.1002/pro.3986", +} + + +### Modules +# some useful mathematical functions +# For printing line number and file name +from inspect import currentframe +from inspect import getframeinfo + +# for generating htmls +from string import Template + +# plotting +import matplotlib.pyplot as plt +import numpy as np +from matplotlib import gridspec + +# for color conversion +from matplotlib.colors import rgb2hex + +# interpolation +from scipy.interpolate import interp1d + +# fitting routine from scipy +from scipy.optimize import curve_fit + +# median filtering +from scipy.signal import medfilt + +cf = currentframe() +cfFilename = getframeinfo(cf).filename # type: ignore[arg-type] + +# handling exceptions +# saving class instances to JSON format +import json + +# creating folders +import os +import sys + +# function to recognize module versions +from distutils.version import LooseVersion + +# for compression of output JSON +# for timestamps +from time import strftime + +# data processing +import pandas as pd + +# import the fitting models +from . import models + +# A variable for reliable access to other resources of MoltenProt (e.g. report template) +__location__ = os.path.realpath(os.path.join(os.getcwd(), os.path.dirname(__file__))) + +# MoltenProt is stored in a plain-text file VERSION (also used by setup.py) +# extract and save it to a variable +with open(os.path.join(__location__, "MOLTENPROT_VERSION")) as version_file: + __version__ = version_file.read().strip() + +# get scipy version (some methods may not be available in earlier versions) +scipy_version = sys.modules["scipy"].__version__ + +# check if running from a PyInstaller bundle +if hasattr(sys, "frozen") and hasattr(sys, "_MEIPASS"): + from_pyinstaller = True +else: + from_pyinstaller = False + +# for parallel processing of some for loops using joblib (may hang up) +try: + from joblib import Parallel + from joblib import delayed + + # this is only needed if we want to auto-estimate the available cores - currently done in MoltenProtMain + # access module version without importing the whole module: + joblib_version = sys.modules["joblib"].__version__ + # check joblib version: only 0.12 and above can pickle instance methods (which is used in mp) + if LooseVersion(joblib_version) >= LooseVersion("0.12"): + parallelization = True + else: + print( + "Warning: available joblib version ({}) is incompatible with current code parallelization".format( + joblib_version, + ), + ) + print("Information: Consider updating to joblib 0.12 or above") + parallelization = False +except ImportError: + print("Warning: joblib module not found, parallelization of code is not possible.") + joblib_version = "None" # only for printing version info + parallelization = False + +# NOTE MoltenProtFit and MoltenProtFitMultiple have different parallelization approaches: +# MoltenProtFit - can only parallelize figure plotting and n_jobs=3 works well +# MoltenProtFitMultiple - reads and runs several MoltenProtFit instances in parallel (F330, F350 etc), +# but each of them gets only one job (i.e. 
MoltenProtFit.n_jobs is always 1) + +### Constants imported from models.py +R = models.R +T_std = models.T_std + +# Standard plate index +# NOTE currently the software requires the layout to contain A1-H12 index, even if only a small part is used +alphanumeric_index = [] +for k in ["A", "B", "C", "D", "E", "F", "G", "H"]: + for l in range(1, 13): + alphanumeric_index.append(k + str(l)) +# convert index to pandas Series and set its name +alphanumeric_index = pd.Series(data=alphanumeric_index) +alphanumeric_index.name = "ID" # type: ignore[attr-defined] + +# dictionary holding the default values and and their description for CLI interface/tooltips +# dictionary key is the name of the option, each entry contains a tuple of default parameter value and its descriptions +# NOTE for mfilt the default value is only used when the option is supplied in the CLI + +# default data preparation parameters +prep_defaults = { + "blanks": [], + "blanks_h": "Input the sample ID's with buffer-only control", # subtract blanks in prep step + "exclude": [], + "exclude_h": 'Specify the well(s) to omit during analysis; this option is intended for simple removal of some bad wells; if many samples must be excluded, use a layout and add "Ignore" to the annotation of the sample', + "invert": False, # DELETE? + "invert_h": "Invert the curve", + "mfilt": None, + "mfilt_h": "Apply median filter with specificed window size (in temperature units) to remove spikes; 4 degrees is a good starting value", + "shrink": None, + "shrink_h": "Average the data points to a given degree step;\n may help to make trends more apparent and speeds up the computation;\n typical values 0.5-3.0", + "trim_max": 0, + "trim_max_h": "Decrease the finishing temperature by this value", + "trim_min": 0, + "trim_min_h": "Increase the starting temperature by this value", +} + +# default analysis parameters +analysis_defaults = { + "model": "santoro1988", + "model_h": "Select a model for describing the experimental data", + "baseline_fit": 10, + "baseline_fit_h": "The length of the input data (in temperature degrees) for initial estimation of pre- and post-transition baselines", + "baseline_bounds": 3, + "baseline_bounds_h": "Baseline bounds are set as multiples of stdev of baseline parameters obtained in the pre-fitting routine; this should stabilize the fit and speed up convergence; set to 0 to remove any bounds for baselines", + "dCp": 0, + "dCp_h": "Heat capacity change of unfolding for all samples (in J/mol/K), used only in equilibrium models; this value overrides the respective column in the layout", + "onset_threshold": 0.01, + "onset_threshold_h": "Percent unfolded to define the onset of unfolding", + "savgol": 10, + "savgol_h": "Set window size (in temperature units) for Savitzky-Golay filter used to calculate the derivative", +} + +# all other settings for MoltenProtFit +defaults = { + "debug": False, # currently not exposed in the CLI + "debug_h": "Print developer information to the console", + "dec": ".", + "dec_h": "CSV decimal separator, enclosed in quotes", + "denaturant": "C", + "denaturant_h": "For plain CSV input only; specify temperature scale that drives denaturation, in K or C", + "j": 1, # TODO not related to the core functions, supply to respective methods (output etc) + "j_h": "Number of jobs to be spawned by parallelized parts of the code; should not be higher than the amount of CPU's in the computer; for most recent laptops a value of 3 is recommended", + "layout": None, # TODO should not be set in SetAnalysisOptions, but 
rather in __init__ + "layout_h": "CSV file with layout", + "sep": ",", + "sep_h": "CSV separator, enclosed in quotes", + "readout": "Signal", + "readout_h": "For plain CSV input only; specify type of input signal", + "spectrum": False, + "spectrum_h": "If true, columns in the input CSV will be treated as separate wavelengths of a spectrum", + "heatmap_cmap": "coolwarm_r", # a color-safe heatmap color with red being "bad" (low value) + "heatmap_cmap_h": "Matplotlib code for colormap that would be used to color-code heatmaps in reports or images", +} + +# dictionary with avialble models +# NOTE to be automatically identified the model has to be subclass or subsubclass of MoltenProtModel +avail_models = {} +for model in models.MoltenProtModel.__subclasses__(): + avail_models[model.short_name] = model # subclass of MoltenProtModel + for submodel in model.__subclasses__(): # subsubclass of MoltenProtModel + avail_models[submodel.short_name] = submodel + +# add a dummy model to indicate that the dataset should not be analysed +avail_models["skip"] = "skip" # type: ignore[assignment] + +### Utility functions + + +def normalize(input, new_min=0, new_max=1, from_input=False): + """Helper function to normalize a pandas Series. + + Make the data occupy a specified range (defined by new_min and new_max) + + Parameters + ---------- + input : pd.Series + input Series to normalize + new_min, new_max : float + range for normalization (default [0-1]) + from_input : bool + use absolute vales of min/max of the input series to create normalization range + this is used to perform inversion of curves (mirror around X-axis) + + References: + ---------- + The general formula for normalization is from here: + https://en.wikipedia.org/wiki/Normalization_(image_processing) + """ + if from_input: + new_min = min(abs(input)) + new_max = max(abs(input)) + + return (input - input.min()) * (new_max - new_min) / ( + input.max() - input.min() + ) + new_min + + +def to_odd(input, step): + """Helper function to convert the window length in temperature units to an odd number of datapoints. + + NOTE for user's simplicity the window size is specified in temperature units + for filters, however, an odd integer is needed + + Parameters + ---------- + input : float + number in degrees to convert to a number of datapoints + step : float + temperature step in the dataset + + Returns: + ------- + float + an odd number of datapoints which should more or less fit in the + requested temperature range + """ + # calculate the approximate amount of datapoints corresponding to the range + input = input / step + # convert input to an integer + input = int(input) + # if it can be divided by 2 without the remainder (Euclidean division) + # than it is even and needs to become odd + # % computes the remainder from Euclidean division + # the complementary operator is // + if not input % 2: + input = input + 1 + return input + + +def analysis_kwargs(input_dict): + """Takes a dict and returns a valid argument dict for SetAnalysisOptions + Unknown options are removed, default values are supplied. + """ + output = {} + for i, j in input_dict.items(): + if i in list(analysis_defaults.keys()) or i in list(prep_defaults.keys()): + output[i] = j + return output + + +def showVersionInformation(): + """Print dependency version info.""" + print(cfFilename, currentframe().f_lineno) + print(f"MoltenProt v. 
{__version__}") + if from_pyinstaller: + print("(PyInstaller bundle)") + pd.show_versions(as_json=False) # matplotlib is also there + print(f"joblib : {joblib_version}") + + +### Wrappers + + +def mp_read_excel(filename, sheetname, index_col): + """Read XLSX files to pandas DataFrames and return None if errors occur. + + Parameters + ---------- + filename + filename for reading + sheetname + which sheet to read from file + index_col + which column to use as index + + Returns: + ------- + pd.DataFrame from Excel file or None if reading failed + """ + try: + # NOTE in pandas below 0.24 even though the data should be considered index-free (index_col=None) + # the first column was sometimes used as index (corresponds to Time in Prometheus XLSX) + # for now explicitly pass some column to use as index + return pd.read_excel(filename, sheetname, index_col=index_col) + except: + # NOTE XLSX parsing module has its own error naming system, so individual errors cannot be catched unless xlrd module is imported + print( + 'Warning: sheet "{}" does not exist in the input file {}'.format( + sheetname, + filename, + ), + ) + return None + + +### JSON I/O functions and variables + + +def serialize(obj): + """Helper function for serialization of DataFrames and other non-standard objects.""" + if isinstance(obj, pd.core.frame.DataFrame): + # NOTE original data contains 12 decimal digits, while default encoding parameter is 10 + # pandas calculates with 16 decimal digits (or more?) and in this part there will be certain discrepancy + # between the runs (i.e. run analysis>save to json>run analysis again) + # in any case the impact on the ultimate result is not measurable + return { + "DataFrame": obj.to_json( + force_ascii=False, + double_precision=15, + orient="split", + ), + } + elif isinstance(obj, MoltenProtFit): + output = obj.__dict__ + # delete some method descriptions (only needed for parallel processing) + if "plotfig" in output: + del output["plotfig"] + + # delete layout attribute, it will be set from parent MPFM when loading back + if "layout" in output: + del output["layout"] + + return {"MoltenProtFit": output} + elif isinstance(obj, MoltenProtFitMultiple): + output = obj.__dict__ + # a better way to query a dict, see: + # https://docs.quantifiedcode.com/python-anti-patterns/correctness/not_using_get_to_return_a_default_value_from_a_dictionary.html + # TODO this deletion is probably not needed? + if output.get("PrepareAndAnalyseSingle"): + del output["PrepareAndAnalyseSingle"] + if output.get("WriteOutputSingle"): + del output["WriteOutputSingle"] + # add version info and timestamp + return { + "MoltenProtFitMultiple": output, + "version": __version__, + "timestamp": strftime("%c"), + } + return None + + +def deserialize(input_dict): + """Helper function to deserialize MoltenProtFit instances and DataFrames from JSON files (read into a dict). + + If the object contains any plate_ entries, cycle through them and convert to DataFrames + also applies to the layout entry (again, a DataFrame) + """ + if input_dict.get("MoltenProtFit"): + # read a MPF instance from the dict + # plate_ variables conversion occurs during intitialization. + # the downstream part of the json (e.g. 
a single MPFIT instance) + output = MoltenProtFit(None, input_type="from_dict") + output.__dict__.update(input_dict["MoltenProtFit"]) + return output + elif input_dict.get("MoltenProtFitMultiple"): + # this level also has version info and timestamp + # NOTE if needed, this section can be used to discard too old JSON sessions + if input_dict.get("version") and input_dict.get("timestamp"): + print( + "Information: JSON session was created with MoltenProt v. {} on {}".format( + input_dict["version"], + input_dict["timestamp"], + ), + ) + else: + print( + "Warning: JSON session contains no version and timestamp, is it an old one?", + ) + # create an empty instance and update its dict + output = MoltenProtFitMultiple() + output.__dict__.update(input_dict["MoltenProtFitMultiple"]) + + # set layouts in all datasets + output.UpdateLayout() + return output + elif input_dict.get("DataFrame"): + return pd.read_json(input_dict["DataFrame"], precise_float=True, orient="split") + return input_dict + + +### Classes + + +class MoltenProtFit: + """Stores and analyses a single dataset. + + Attributes: + ---------- + defaults : dict + dictionary holding the default values and and their description + hm_dic : dict + dictionary for important heatmap parameters + plate - the workhorse variable, contains the values currently being procesed + plate_raw - initially imported data, but without invalid wells or the ones excluded by user + plate_results - fit values and sort score for individual wells + plate_derivative - the derivative curve for raw (?) data + plate_fit - the curves computed based on the fit parameters + resultfolder - a subfolder created from the file name to write the output (assigned by MoltenProtMain.py) + dT - step of temperature + xlim - the range for x-axis on RFU(T) plots + filename - the name of the input file + Lists holding removed wells: + bad_fit - ID of wells that could not be fit + blanks - blank wells (subtracted prior to analysis) -> now part of SetAnalysisOptions + exclude - supplied by user through --exclude option -> now part of SetAnalysisOptions + bad_Tm - the melting temperature is out of range of T + readout_type - the signal in the assay + + Notes: + ----- + This is an internal class, do not create it manually. + All official messages must be sent through print_message() method (enforces selecting message type) + Output information style: + "Fatal: " - error that causes the program to stop + "Warning: " - something may be messed up, but the program can proceed + "Information: " any other message, that has nothing to do with the program flow + """ + + # dictionary for important heatmap parameters: + # > which values are good high or low + # > what is the title for plot + # > what are the tick labels + # TODO for Tm, dHm and the like, substitute text info to the min and max numbers + # TODO use this as is a general hm_dic for all types of analysis + hm_dic = { + "S": { + "lowerbetter": True, + "title": "Std. 
Error of Estimate Heatmap", + "tick_labels": ["Bad fit", "Reference", "Good fit"], + }, + "dHm_fit": { + "lowerbetter": False, + "title": "Unfolding Enthalpy Heatmap", + "tick_labels": ["Low dHm", "Reference", "High dHm"], + }, + "Tm_fit": { + "lowerbetter": False, + "title": "Melting Temperature Heatmap", + "tick_labels": ["Low Tm", "Reference", "High Tm"], + }, + "d_fit": { + "lowerbetter": True, + "title": "Slope Heatmap", + "tick_labels": ["Flat", "Reference", "Steep"], + }, + "Tm_init": { + "lowerbetter": False, + "title": "Melting Temperature Heatmap", + "tick_labels": ["Low Tm", "Reference", "High Tm"], + }, + "T_onset": { + "lowerbetter": False, + "title": "Onset Temperature Heatmap", + "tick_labels": ["Low T_ons", "Reference", "High T_ons"], + }, + "a_fit": { + "lowerbetter": True, + "title": "Slope Heatmap", + "tick_labels": ["Flat", "Reference", "Steep"], + }, + "Tagg_fit": { + "lowerbetter": False, + "title": "Aggregation Temperature Heatmap", + "tick_labels": ["Low Tagg", "Reference", "High Tagg"], + }, + "Tagg_init": { + "lowerbetter": False, + "title": "Aggregation Temperature Heatmap", + "tick_labels": ["Low Tagg", "Reference", "High Tagg"], + }, + "dG_std": { + "lowerbetter": False, + "title": "Standard Gibbs Free Energy of Unfolding", + "tick_labels": ["na", "na", "na"], + }, + "BS_factor": { + "lowerbetter": False, + "title": "Dimesionless Signal Window", + "tick_labels": ["Narrow", "Reference", "Wide"], + }, + } + + def __init__( + self, + filename, + scan_rate=None, + denaturant=defaults["denaturant"], + sep=defaults["sep"], + dec=defaults["dec"], + debug=defaults["debug"], + input_type="csv", + parent_filename="", + readout_type="Signal", + ) -> None: + """Parameters + ---------- + filename + for csv files the file to parse, when MPF is made by MPFM this will be substituted to the filename used to create MPFM + scan_rate + scan rate in degrees per min (required for kinetic models) + denaturant + temperature (C or K) or chemical (under construction) + sep,dec + csv import parameters + debug + print additional info + input_type + defines where the data is coming from: + > csv - default value, corresponds to the standard csv file + > from_xlsx - means that the instance of MoltenProtFit is being created by MoltenProtFitMultiple, input will be a ready-to-use DataFrame + parent_filename + the filename of the original xlsx file + readout_type + the name of the readout (for plot Y-axis) + + Notes: + ----- + """ + # set debugging mode: if the __version__ is "git", then this is a development file + # and debugging is done by default. In all other cases, it is False + # TODO add debug to CLI interface + if __version__[:3] == "git": + self.debug = True + else: + self.debug = debug + + if input_type == "from_dict": + # this would return an empty object + # all manipulations are done by deserialize function + pass + else: + # NOTE the logic of overwriting etc is handled by CLI/GUI code + self.resultfolder = None + # attribute to hold diagnostic messages. 
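+ # (print_message() appends its warning and information messages to this string)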
+ self.protocolString = "" + self.scan_rate = scan_rate + self.fixed_params = None + # load the dataset into the plate variable + if input_type == "csv": + try: + self.plate = pd.read_csv( + filename, + sep=sep, + decimal=dec, + index_col="Temperature", + encoding="utf-8", + ) + except ValueError: + self.print_message("Input *.csv file is invalid!", "f") + print( + "Please check if column called 'Temperature' exists and separators are specified correctly.", + ) + # the filename is needed for report title + self.filename = filename + elif input_type == "from_xlsx": + self.plate = filename + # BUG this is only needed for report generation, a dummy value + # Ideally, it should be the name of parent xlsx file + self.filename = parent_filename + + # If requested, convert temperature of input files to Kelvins + if denaturant == "K": + # input temperature is already in internal units + self.denaturant = "K" + elif denaturant == "C": + # convert Celsius to Kelvins + self.plate.index = self.plate.index + 273.15 + self.denaturant = "K" + elif denaturant == "F": + msg = "NO Fahrenheit, please!" + raise ValueError(msg) + else: + # if temperature was not recognized, set denaturant to chemical type + # NOTE currently not implemented in MoltenProtFit + print(f"Assuming that {denaturant} is a chemical denaturant") + self.denaturant = denaturant + msg = "Chemical denaturation not implemented yet" + raise NotImplementedError(msg) + + # save a copy of raw data + self.plate_raw = self.plate.copy() + + # compute the (average) step of Temperature (required for conversion of temperature ranges to sets of datapoints) + self.dT = (max(self.plate.index) - min(self.plate.index)) / ( + len(self.plate.index) - 1 + ) + + # for plots only: compute xlim from Temperature range + self.xlim = [min(self.plate.index) - 5, max(self.plate.index) + 5] + + # currently only for plots: give a proper label to Y-axis + self.readout_type = readout_type + + self.bad_fit = [] # type: ignore[var-annotated] + self.bad_Tm = [] # type: ignore[var-annotated] + + def __getstate__(self): + """A special method to enable pickling of class methods (for parallel exectution).""" + output = self.__dict__ + output["plotfig"] = self.plotfig + return output + + def converter96(self, use_column, reference=None): + """Reads self.plate_results with well ID in the index (A1, A2 etc) and some values in columns (Tm_fit, etc) and returns a DataFrame emulating a 96-well plate where each well has a normalized respective column value. + + Parameters + ---------- + use_column + which column to use in self.plate_results + reference + well to use as reference + + Notes: + ----- + Normalization is done based on lowerbetter information from self.hm_dic: + 1 - means the highest possible value and the best value as well + 0 - means the worst and the lowest possible value + + If a reference is supplied, it is subtracted from the data + Reference code _was not maintained_ for a while and is probably faulty! 
+ """ + # create a DataFrame emulating a 96-well plate + output = pd.DataFrame( + index=["A", "B", "C", "D", "E", "F", "G", "H"], + columns=list(range(1, 13)), + ) + + # check if the use_column value is a valid one + self.print_message( + "Creating heatmap for column {}".format( + self.plate_results[use_column].name, + ), + "i", + ) + for i in output.index: + for j in output.columns: + a = i + str(j) + if (use_column in self.plate_results.columns) and ( + a in self.plate_results.index + ): + output.loc[[i], [j]] = self.plate_results.loc[a, use_column] + + # if reference value is supplied, subtract it from the values + if reference is not None: + try: + # probe if the well is OK (not NaN), that's easier to do with the input DataFrame + output = output - self.plate_results[use_column][reference] + except KeyError: + self.print_message( + "Supplied reference is invalid, creating a reference-free heatmap", + "w", + ) + + # invert the values if lowerbetter=true + # just a precaution against outdated self.hm_dic + try: + if self.hm_dic[use_column]["lowerbetter"]: + output = output - max(output.max()) + output = output * -1 + except KeyError: + self.print_message( + "{} was not found in hm_dic, the colors may be wrong!".format( + use_column, + ), + "w", + ) + + # bring everything to range 0-1 + # normalize by max value (this way we make sure that Nans changed to 1000 are out of range) + # NOTE in some rare cases, when there is only one sample left and it has value 0 in plate96 + min_val = output.min().min() + max_val = output.max().max() + if min_val == max_val: + self.print_message( + "Only one sample left after pre-processing and fitting!", + "w", + ) + output = output * 0 + else: + output = (output - min_val) / (max_val - min_val) + # make all Nan's equal to 1000 + output.fillna(1000, inplace=True) + return output + + def heatmap( + self, + output_path, + plate96, + use_column, + heatmap_cmap=defaults["heatmap_cmap"], + lowerbetter=False, + title="Result Heatmap", + tick_labels=["Unstable", "Reference", "Stable"], + pdf_report=False, + save=True, + ): + """Create a heatmap. + + Parameters + ---------- + output_path + where to save the image + plate96 + dataframe created with method converter96 + use_column + which curve parameter to use for heatmap + heatmap_cmap + matplotlib colormap for heatmap + lowerbetter + indicated if lower values of a parameters correspond to higher stability (e.g. S) + title + the title of the heatmap + tick_labels + how to label the colorbar + pdf_report + if True, will only return a figure object (and size will be adjusted to meet A4) + save + if True, save image to disk + """ + # tweak colormap to have outside values colored as gray + cmap = plt.get_cmap(heatmap_cmap).copy() + cmap.set_over("0.5") + + # check if there are negative values + # (plate96_sort<0).any() returns a Series with True/False indicating presence of negative values in respective columns + # we then use any() for this series to get the True statement. Very weird way to do it indeed. 
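+ # ((plate96 < 0).any() gives per-column booleans; the outer .any() collapses them to a single True/False)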
+ if ((plate96 < 0).any()).any(): + # negative values are present, so we're dealing with ref-based data set + + # now check if we are dealing with higherbetter or lowerbetter values + if lowerbetter: + # we have to invert sign so that all values that are less than the reference are positive + plate96 = plate96 * -1 + + # Normalize positive values by the highest value and negative values by the lowest + # this is not fully correct because the slope of color(value) dependence line is different for values + # below zero and above zero. Other ways to do it are too hacky anyway + plate96[plate96 > 0] = plate96[plate96 > 0] / max(plate96.max()) + plate96[plate96 < 0] = plate96[plate96 < 0] / abs(min(plate96.min())) + vmin = -1 + tick_values = [-1, 0, 1] + else: + vmin = min(plate96.min()) + tick_values = [vmin, 1] + # this format means how many decimal digits is allowed + # set tick labels from use_column + col_min = f"{self.plate_results[use_column].min():.3f}" + col_max = f"{self.plate_results[use_column].max():.3f}" + tick_labels = [col_max, col_min] if lowerbetter else [col_min, col_max] + + # Making not available values gray (e.g. bad fit or blanks) + # either convert pd to numpy array and make a mask, or + # create figure canvas + # A4 is 8.3 x 11.7 inches, for report the whole page is needed + # for other outputs we need half the hight (i.e. A5 in landscape orientation) + if pdf_report: + fig, ax = plt.subplots(3, 1, figsize=(8.3, 11.7)) + cbar_shrink = 0.3 + cbar_orient = "horizontal" + heatmap_axis = ax[0] + axis_aspect = ["auto"] + else: + fig = plt.figure(figsize=(8.3, 11.7 / 2), tight_layout=True) + cbar_shrink = 8 / 12 + cbar_orient = "vertical" + heatmap_axis = fig.gca() + axis_aspect = ["equal", "box"] + fig.suptitle( + title, + fontweight="bold", + ) + # create the heatmap + # NOTE in some rare cases when the heatmap consists of a single sample matplotlib will raise a warning: + """ + RuntimeWarning: invalid value encountered in less_equal + b = b[(b <= intv[1] + eps) & (b >= intv[0] - eps)] + RuntimeWarning: invalid value encountered in greater_equal + b = b[(b <= intv[1] + eps) & (b >= intv[0] - eps)] + """ + c = heatmap_axis.pcolor(plate96, edgecolors="k", cmap=cmap, vmin=vmin, vmax=1) + + # cycle through all wells and write there the ID + for i in plate96.index: + for j in plate96.columns: + x = plate96.columns.get_loc(j) + y = plate96.index.get_loc(i) + heatmap_axis.text( + x + 0.5, + y + 0.5, + i + str(j), + horizontalalignment="center", + verticalalignment="center", + ) + + # y axis has to be inverted so that well A1 is in top left corner + heatmap_axis.invert_yaxis() + # addtional hacks: enforce square size of wells, hide axes and ticks + heatmap_axis.set_aspect(*axis_aspect) + heatmap_axis.axis("off") + # create a colorbar with text labels + cbar = fig.colorbar( + c, + ax=heatmap_axis, + ticks=tick_values, + shrink=cbar_shrink, + orientation=cbar_orient, + ) + # set colorbar ticks depending on the requested orientation + if cbar_orient == "horizontal": + cbar.ax.set_xticklabels(tick_labels) + elif cbar_orient == "vertical": + cbar.ax.set_yticklabels(tick_labels) + # label is not needed, because it will be in the figure title + + if pdf_report: + # return the figure object for subsequent manipulations + return (fig, ax) + + if save: + plt.savefig( + os.path.join(output_path, "heatmap_" + str(use_column) + ".png"), + dpi=(200), + tight_layout=True, + ) + # clean up after plotting so that no parameters are carried over to genpics + plt.close("all") + return None + + def 
print_message(self, text, message_type): + """Prints messages and saves them to protocolString. + + Parameters + ---------- + text + message text + message_type : i/w/f + type of message (Information, Warning, Fatal) + """ + # line No and file name are only printed in debug mode and only for Fatals and Warnings + if self.debug and (message_type != "i"): + cf = currentframe() + cfFilename = getframeinfo(cf).filename + print(f"Line {cf.f_back.f_lineno}, in file {cfFilename}") + if message_type == "f": + print("Fatal: " + text + f" ({self.readout_type})") + sys.exit(1) + elif message_type == "w": + msg = "Warning: " + text + f" ({self.readout_type})" + print(msg) + msg = msg + "\n" + self.protocolString = self.protocolString + msg + elif message_type == "i": + msg = "Information: " + text + f" ({self.readout_type})" + print(msg) + msg = msg + "\n" + self.protocolString = self.protocolString + msg + else: + raise ValueError(f"Unknown message type '{message_type}'") + + def _calculate_raw_corr(self): + """Compute baseline-corrected raw data; requires presence of kN, bN, kU, bU in plate_results. + + Notes: + ----- + calculate fraction unfolded (funf) + How to derive the formula: + K_eq(T) = (Fn(T) - F(T)) / (F(T) - Fu(T)) + K_eq() = funf/fnat = funf / (1-funf) + funf = (Fn(T) - F(T)) / (Fn(T) - Fu(T)) + where T is temperature, F(T) assay signal, Fu and Fn - baseline signal + funf - fraction unfolded; fnat - fraction native + whichever formula is used for calculation, the result is the same + + TODO: the same calculation using plate_fit can be done to yield "fit" variant of funf + """ + # initiate an empty DataFrame + self.plate_raw_corr = pd.DataFrame(index=self.plate.index) + + # a helper function for pandas.apply() + def calculate_raw_corr_series(input_series): + # NOTE the badly fitted samples will be present in the column names of self.plate + # but they are not present in the index of self.plate_results, thus we need to check for it + # Obviously the calculation cannot be done for the curves that could not be fit + if input_series.name in self.plate_results.index: + # here we need transposed plate_results to have sample ID's in the columns + output = ( + input_series.index * self.plate_results["kN_fit"][input_series.name] + + self.plate_results["bN_fit"][input_series.name] + - input_series + ) / ( + input_series.index * self.plate_results["kN_fit"][input_series.name] + + self.plate_results["bN_fit"][input_series.name] + - input_series.index + * self.plate_results["kU_fit"][input_series.name] + - self.plate_results["bU_fit"][input_series.name] + ) + # calculation above produces an index object, which we have to convert to a pd.Series + output = pd.Series( + output, + index=self.plate.index, + name=input_series.name, + ) + self.plate_raw_corr = pd.concat( + [self.plate_raw_corr, pd.Series(output)], + axis=1, + ) + + self.plate.apply(calculate_raw_corr_series) + + def _estimate_baseline(self, input_series, fit_length, estimate_Tm=False): + """Estimates pre- and post-transition baselines for a series + function for a single Series (index is Temperature, name is sample ID, values are RFU's). 
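+ The fitted slopes/intercepts are stored in plate_results as kN_init, bN_init,
+ kU_init and bU_init, with their standard deviations in plate_results_stdev.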
+
+ Parameters
+ ----------
+ input_series
+ pd.Series with Temperature in index, name is sample ID, values are the signal
+ fit_length
+ number of degrees to be used from the start or end of data for fitting
+ estimate_Tm
+ additionally estimate melting temperature Tm with a heuristic
+ """
+ # convert fit length in temperature degrees to datapoint number (using self.dT)
+ fit_datapoints = int(fit_length / self.dT)
+ # NOTE do not use Nan's to prevent issues during fitting
+ pre_fit, pre_covm = np.polyfit(
+ input_series.dropna().iloc[:fit_datapoints].index,
+ input_series.dropna().iloc[:fit_datapoints],
+ 1,
+ cov=True,
+ )
+ post_fit, post_covm = np.polyfit(
+ input_series.dropna().iloc[-fit_datapoints:].index,
+ input_series.dropna().iloc[-fit_datapoints:],
+ 1,
+ cov=True,
+ )
+ self.plate_results.loc[["kN_init"], [input_series.name]] = pre_fit[0]
+ self.plate_results.loc[["bN_init"], [input_series.name]] = pre_fit[1]
+ self.plate_results.loc[["kU_init"], [input_series.name]] = post_fit[0]
+ self.plate_results.loc[["bU_init"], [input_series.name]] = post_fit[1]
+ pre_stdev = np.sqrt(np.diagonal(pre_covm))
+ post_stdev = np.sqrt(np.diagonal(post_covm))
+ # estimate stdev of each parameter and write it to plate_results_stdev (used to set bounds)
+ self.plate_results_stdev.loc[["kN_init"], [input_series.name]] = pre_stdev[0]
+ self.plate_results_stdev.loc[["bN_init"], [input_series.name]] = pre_stdev[1]
+ self.plate_results_stdev.loc[["kU_init"], [input_series.name]] = post_stdev[0]
+ self.plate_results_stdev.loc[["bU_init"], [input_series.name]] = post_stdev[1]
+
+ if estimate_Tm:
+ # now the Tm part - find the maximum of the smoothed derivative for S-shaped curves (low-to-high)
+ # or the minimum for Z-shaped ones
+ # intersection of the difference line with zero -> intersection of baselines
+ dintersect = -(post_fit[1] - pre_fit[1]) / (post_fit[0] - pre_fit[0])
+ # min/max/middle temperature range
+ tmin = min(input_series.index)
+ tmax = max(input_series.index)
+ tmid = (tmin + tmax) / 2
+ # value of baseline difference at tmid
+ b_diff = (post_fit[0] - pre_fit[0]) * tmid + (post_fit[1] - pre_fit[1])
+
+ if (dintersect > tmin + fit_length) and (dintersect < tmax - fit_length):
+ # NOTE rule out intersecting baselines - in this case post-baseline is not always
+ # above or below the pre-baseline
+ self.plate_results.loc["Tm_init", input_series.name] = tmid
+ else:
+ if b_diff > 0:
+ # low-to-high curve - use max of the deriv as Tm_init
+ self.plate_results.loc[
+ "Tm_init",
+ input_series.name,
+ ] = self.plate_derivative[input_series.name].idxmax()
+ elif b_diff < 0:
+ # high-to-low curve - use min of the deriv as Tm_init
+ self.plate_results.loc[
+ "Tm_init",
+ input_series.name,
+ ] = self.plate_derivative[input_series.name].idxmin()
+ else:
+ # rare case - baselines are identical; raise a warning and use mid-range
+ self.print_message(
+ "Baselines are identical in sample {}".format(
+ input_series.name,
+ ),
+ "w",
+ )
+ self.print_message(
+ "Using the middle of temperature range as Tm_init",
+ "i",
+ )
+ self.plate_results.loc["Tm_init", input_series.name] = tmid
+
+ def _calc_Tons(self, Tm_col, dHm_col, onset_threshold):
+ """Computes onset temperature Tons based on supplied column names with dHm and Tm
+ and adds the value to plate_results.
+
+ Parameters
+ ----------
+ Tm_col
+ column with Tm values (Tm_fit, Tm1_fit, etc)
+ dHm_col
+ column with dHm values
+ onset_threshold
+ fraction unfolded that corresponds to the onset of unfolding (e.g.
0.01 - 1% must be unfolded) + """ + self.plate_results["T_onset"] = 1 / ( + 1 / self.plate_results[Tm_col] + - R + / self.plate_results[dHm_col] + * np.log(onset_threshold / (1 - onset_threshold)) + ) + + # also calculate stdev for T_onset using error propagation from Tm_fit and dHm_fit + self.plate_results_stdev["T_onset"] = ( + np.sqrt( + (self.plate_results_stdev[Tm_col] / self.plate_results[Tm_col]) ** 2 + + (self.plate_results_stdev[dHm_col] / self.plate_results[dHm_col]) ** 2 + + ( + np.sqrt( + self.plate_results_stdev[dHm_col] ** 2 + + ( + R + * np.log(onset_threshold / (1 - onset_threshold)) + * self.plate_results_stdev[Tm_col] + ) + ** 2, + ) + / ( + self.plate_results[Tm_col] + - R + * np.log(onset_threshold / (1 - onset_threshold)) + * self.plate_results[Tm_col] + ) + ) + ** 2, + ) + * self.plate_results.T_onset + ) + + def plotfig( + self, + output_path, + wellID, + datatype="overview", + save=True, + show=False, + data_ax=None, + vline_legend=False, + ): + """Plot the curves from individual wells. + Creates two subplots - top the fit + data, lower - derivative. + + Parameters + ---------- + output_path + where to write the file (can be also a dummy value) + wellID + from which well to plot + datatype + what is being plotted: + > overview - plot experimental data , fit data and some fit parameters + > very_raw - a workaround parameter plotting from plate attribute; made for the GUI when the data is loaded, but not processed yet + > raw - plot unprocessed data + > normalized - data after preprocessing + > derivative - first derivative + > fitted - based on the equation + save + actually save the file + show + show image instead + data_ax + instead of creating new ones, plot in these axes (for PDF reports: disables derivative plot and legend plot) + vline_legend + if True then all vlines will be added to the legend (looks bad when individual images are saved) + """ + if data_ax is None: + # create the figure object + fig = plt.figure(1, figsize=(8, 7)) + # create a specification for the relative plot sizes + gs = gridspec.GridSpec(3, 1, height_ratios=[4, 2, 0.05], figure=fig) + # get objects of individual subplots + data_ax = fig.add_subplot(gs[0]) # experimental data, fit, etc + deriv_ax = fig.add_subplot(gs[1]) # the derivative + else: + data_ax = data_ax + deriv_ax = None + + # NOTE currently all internal manipulations are done in K + # and conversion back to original scale is not done + if self.denaturant == "K" or self.denaturant == "C": + degree_sign = "K" + else: + # NOTE this is a placeholder for chemical denaturant scale + pass + + # a carry-over from the cycle + i = wellID + + if datatype == "overview": + # plot the fit + # plot the experimental data + # TODO use markevery=n to plot every n-th datapoint + # format used to be kx, however, for bigger datasets this doesn't look good + data_ax.plot( + self.plate[i].index.values, + self.plate[i], + "k.", + mew=1, + label="Experiment", + ) # , markevery=40) + + data_ax.plot( + self.plate[i].index.values, + self.plate_fit[i], + label="Fit", + ) + + # a label for Y-axis + data_ax.set_ylabel(self.readout_type) + + # calculate the offsets for the plot based on the overall length + max_val = max(self.plate[i].dropna()) + min_val = min(self.plate[i].dropna()) + y_range = max_val - min_val + data_ax.set_ylim([min_val - 0.1 * y_range, max_val + 0.1 * y_range]) + + # force specific range for x-axis + data_ax.set_xlim(self.xlim) + else: + # HACK hide the derivative plot + if deriv_ax is not None: + deriv_ax.set_visible(False) + # set 
the source dataframe based on the supplied option + if datatype == "very_raw": + sourcedf = self.plate + ylabel = self.readout_type + elif datatype == "raw": + sourcedf = self.plate_raw + ylabel = self.readout_type + elif datatype == "normalized": + # after all processing normalized curves are in the plate variable + sourcedf = self.plate + ylabel = self.readout_type + elif datatype == "derivative": + sourcedf = self.plate_derivative + ylabel = "dRFU/dT" + elif datatype == "fitted": + sourcedf = self.plate_fit + ylabel = "Fitted RFU" + else: + self.print_message("Invalid plotting source requested", "w") + + if datatype == "derivative": + # for derivative plots plot all values + plt.plot(sourcedf[i].index.values, sourcedf[i], label=i) + else: + data_ax.plot( + sourcedf[i].index.values, + sourcedf[i], + "k.", + mew=1, + label="Experiment", + ) + data_ax.set_ylabel(ylabel) + + data_ax.set_xlabel(f"Temperature, {degree_sign}") + data_ax.set_title("Sample " + str(i), fontsize=12, y=1.05) + data_ax.grid(True, which="both") + + # commands specific only to overview mode: + if datatype == "overview": + # plot the determined baselines + # create np.poly1d objects with respective fit parameters + # TODO for line just 2 points are needed... + poly_pre = np.poly1d( + *self.plate_results.loc[[i], ["kN_fit", "bN_fit"]].values, + ) + poly_post = np.poly1d( + *self.plate_results.loc[[i], ["kU_fit", "bU_fit"]].values, + ) + data_ax.plot( + self.plate[i].index.values, + poly_post(self.plate[i].index.values), + label="Post- baseline", + linestyle="--", + ) + data_ax.plot( + self.plate[i].index.values, + poly_pre(self.plate[i].index.values), + label="Pre- baseline", + linestyle="--", + ) + + # visualization of the lines requested (specific to different model types) + # NOTE this would not print the value/stdev on the plot, has to be done separately + for parameter_name in self.plotlines: + if vline_legend: + data_ax.axvline( + self.plate_results[parameter_name][i], + ls="dotted", + c="b", + lw=3, + label=parameter_name, + ) + else: + # NOTE in this case lines are not labeled so that they are not listed in the legend + data_ax.axvline( + self.plate_results[parameter_name][i], + ls="dotted", + c="b", + lw=3, + ) + # add text with the parameter used to generate the line - doesn't look nice in some caes + data_ax.text( + self.plate_results[parameter_name][i], + data_ax.get_ylim()[0] + 0.05 * y_range, + " " + parameter_name, + fontsize=12, + ) + if deriv_ax is not None: + fig.legend(loc="lower center", ncol=4, fontsize=12) + # commands for derivative plot (used only in overview mode) + deriv_ax.plot( + self.plate[i].index.values, + self.plate_derivative[i], + color="k", + ) + deriv_ax.set_xlabel(f"Temperature, {degree_sign}") + deriv_ax.set_ylabel(f"d({self.readout_type})/dT") + + # delete the X-label on the data axes + data_ax.set_xlabel("") + + # xlim for derivative plot and data plot must be the same! + deriv_ax.set_xlim(self.xlim) + deriv_ax.grid(True, which="both") + + if deriv_ax is None: + # if data_ax was provided externally, then showing/saving should not be done + return + if show: + plt.show() + elif save: + plt.savefig(output_path + "/" + str(i) + ".png", dpi=(100)) + plt.close("all") + + ## internal methods for creating HTML report elements + def html_heatmap(self, heatmap_cmap, display): + """Returns a div-block with a heatmap of the sortby parameter (the last column in self.plate_results). 
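+ Cell colors are obtained by normalizing that column to the 0-1 range and mapping it
+ through heatmap_cmap (converted to HEX); wells without a fit result are drawn gray.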
+ + Parameters + ---------- + heatmap_cmap + matplotlib colormap for heatmap + display + whether the heatmap is shown or not in the final HTML: + > table (standard view) + > none (not visible) + > block (compact view) + + Notes: + ----- + The heatmap that the user sees when opening the HTML will have display=table, all other will start as display=none + """ + # this string template corresponds to a single sample entry (has to be wrapped within rows) + # NOTE when gray cells are clicked a window still pops up but says that there is no such image + # due to limitations of possible attributes within html, the possible layout info is stored in the + # title attribute which also gets displayed as a tooltip (and will be also shown in the bottom of the heatmap) + sample = '
$ID
\n' + sample = Template(sample) + # row template + # TODO add extra spaces for readability of output HTML file + row = '
\n$SAMPLES
' + row = Template(row) + + # extract the heatmap from matplotlib + cmap = plt.get_cmap(heatmap_cmap) + + # by convention the model-specific final sorting parameter is stored in the last column + # TODO add support creating HTML heatmaps for an arbitrary column + colors = self.plate_results.iloc[:, -1] + colors = normalize(colors) + + # convert numbers to HEX colors + colors = colors.apply(lambda x: rgb2hex(cmap(x))) + + # outer cycle creates rows, inner cycle creates lines for 1-12 + row_output = "" + for i in ["A", "B", "C", "D", "E", "F", "G", "H"]: + line = "" + for j in range(1, 13): + # if a respective color exists then colorise the
+ # if not, then set it to "lightgray" + sample_id = str(i) + str(j) + # NOTE modify this line to add additional information (e.g. Tm) to the text under heatmap + CONDITION = self.layout["Condition"][sample_id] + + if sample_id in colors.index: + line = line + sample.substitute( + ID=sample_id, + COLOR=colors[sample_id], + CONDITION=CONDITION, + ) + else: + # lightgray blends with mid-range of coolwarm, so change to gray + line = line + sample.substitute( + ID=sample_id, + COLOR="gray", + CONDITION=CONDITION, + ) + row_output = row_output + row.substitute(ROWNAME="Row_" + i, SAMPLES=line) + + # once the heatmap itself is created, it is wrapped around in the additional table + # that would control if the hm is shown, define the title, etc + output_template = '
\n
\n

$TITLE_TEXT

\n Click on the wells to open plots in a separate window \n
\n $HEATMAP\n
' + output_template = Template(output_template) + title_text = "{}: heatmap of {} (model {})".format( + self.readout_type, + self.plate_results.iloc[:, -1].name, + self.model, + ) + return output_template.substitute( + DISPLAY=display, + IDENTIFIER=self.readout_type, + TITLE_TEXT=title_text, + HEATMAP=row_output, + ) + + def html_button(self): + """Returns an HTML button string with the readout name on top.""" + button_template = '\n' + button_template = Template(button_template) + return button_template.substitute(IDENTIFIER=self.readout_type) + + ## Methods for GUI communication + def printAnalysisSettings(self): + """Prints current analysis settings.""" + for setting, def_value in analysis_defaults.items(): + if setting[-2:] == "_h": + # filter out the help message entries + pass + else: + print( + "{} = {} (default: {})".format( + setting, + getattr(self, setting), + def_value, + ), + ) + print("\n") + + def analysisHasBeenDone(self): + """Check if analysis compleded and self.plate_results is created.""" + return "plate_results" in self.__dict__ + + def testWellID(self, wellID, ignore_results=False): + """Check if a well exists in self.plate_results (return True), otherwise return False. + + Parameters + ---------- + wellID + sample ID to check + ignore_results: bool + if True, will look for the ID's in plate_raw, even if self.plate_results exists + """ + # check if analysis was done, and depending on that choose the index to check + if "plate_results" in self.__dict__: + index_tocheck = self.plate_results.index + else: + index_tocheck = self.plate_raw.columns + + if ignore_results: + index_tocheck = self.plate_raw.columns + + return wellID in index_tocheck + + def getResultsColumns(self): + """Returns a tuple of columns from self.plate_results that are relevant for GUI.""" + if "plate_results" in self.__dict__: + output = [self.plate_results.iloc[:, -1].name, *self.plotlines] + + # BS-factor is more useful than S, but not always available + if "BS_factor" in self.plate_results.columns: + output = [*output, "BS_factor"] + else: + output = [*output, "S"] + return output + else: + self.print_message("No plate_results attribute found", "w") + self.print_message("Please perform analysis first", "i") + return None + + ## Big methods for data input, processing and output + def SetAnalysisOptions( + self, + model=analysis_defaults["model"], + baseline_fit=analysis_defaults["baseline_fit"], + baseline_bounds=analysis_defaults["baseline_bounds"], + dCp=analysis_defaults["dCp"], + onset_threshold=analysis_defaults["onset_threshold"], + savgol=analysis_defaults["savgol"], + # these are pre-processing options (defaults stored in a different dict) + blanks=prep_defaults["blanks"], # TODO set in layout instead? + exclude=prep_defaults["exclude"], # TODO set in layout instead? + invert=prep_defaults["invert"], + mfilt=prep_defaults["mfilt"], + shrink=prep_defaults["shrink"], + trim_max=prep_defaults["trim_max"], + trim_min=prep_defaults["trim_min"], + # TODO layouts must be handled somewhere else... + layout=None, + layout_input_type="csv", + ): + """Sets in the MoltenProt instance all analysis-related parameters that will then be used by methods PrepareData() and ProcessData(). For parameter description see analysis_defaults and prep_defaults dicts. 
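+ Illustrative call (argument values are examples only; `mpf` stands for a MoltenProtFit instance):
+ mpf.SetAnalysisOptions(model="santoro1988", savgol=10, blanks=["A1"], trim_min=5)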
+
+ References:
+ ----------
+ https://homepages.inf.ed.ac.uk/rbf/HIPR2/median.htm
+ the median filter: how it works and why it may be better than the mean
+ """
+ self.model = model
+ self.baseline_fit = baseline_fit
+ self.baseline_bounds = baseline_bounds
+ # current value for onset_threshold (used in santoro1988(d) to obtain values of T_onset)
+ self.onset_threshold = onset_threshold
+ # the value for savgol window to compute the derivative
+ self.savgol = savgol
+
+ self.blanks = blanks
+ self.exclude = exclude
+ self.invert = invert
+ # to avoid confusion, medfilt is the imported method, mfilt is the respective analysis flag
+ self.mfilt = mfilt
+ self.shrink = shrink
+ # NOTE changes made by trim_min/trim_max are saved directly to self.plate
+ self.trim_min = trim_min
+ self.trim_max = trim_max
+
+ # NOTE to prevent carry-over from previous run (e.g. in JSON) reset the bad fit list
+ self.bad_fit = []
+
+ # TESTING setting layout and dCp moved to a separate method
+ self.SetLayout(layout=layout, layout_input_type=layout_input_type, dCp=dCp)
+
+ def SetLayout(
+ self,
+ layout=None,
+ layout_input_type="csv",
+ dCp=analysis_defaults["dCp"],
+ ):
+ """Sets layout and dCp."""
+ if layout is not None:
+ if layout_input_type == "csv":
+ try:
+ # the format for layout is more strict: it is always a csv with commas as separators
+ # and index column called ID; more restrictions will follow
+ # TODO add check for the size of layout DataFrame - should be always 96
+ # NOTE it may be better to use ";" as csv separator, because it would be easier to write stuff
+ self.layout = pd.read_csv(layout, index_col="ID", encoding="utf_8")
+ except:
+ self.print_message(
+ "Unsupported layout format! No layout info will be available",
+ "w",
+ )
+ self.layout = None
+ elif layout_input_type == "from_xlsx":
+ self.layout = layout
+ # Read blanks and skipped samples from the layout information
+ self.blanks = list(
+ self.layout[self.layout["Condition"].astype(str) == "Blank"].index,
+ )
+ self.exclude = list(
+ self.layout[self.layout["Condition"].astype(str) == "Ignore"].index,
+ )
+ else:
+ # in all other cases set the instance attribute to None
+ self.layout = None
+
+ # heat capacity change values for the whole plate
+ # self.dCp can be one of the following: "from_layout" or value specified by the user
+ # TODO overwrite layout value instead
+ if dCp >= 0:
+ # user-set dCp from CLI overrides dCp supplied in the layout
+ self.print_message(f"dCp for all samples is set to {dCp}", "i")
+ self.dCp = dCp
+ elif self.layout is not None:
+ # if there is a layout, there may or may not be dCp values
+ self.print_message(
+ "Using per-sample dCp values as provided in the layout (invalid values will be turned to 0)",
+ "i",
+ )
+ # ensure that dCp values are numbers, but not something else
+ # if an invalid value occurs, set it to the default value of 0
+ self.layout["dCp"] = pd.to_numeric(self.layout["dCp"], errors="coerce")
+ self.layout["dCp"].fillna(0, inplace=True)
+ # also negative values must be turned to 0
+ self.layout.loc[self.layout["dCp"] < 0, ["dCp"]] = 0
+ self.dCp = "from_layout"
+ else:
+ msg = f"Incorrect value for dCp ({dCp})!"
+ raise ValueError(msg)
+
+ def SetFixedParameters(self, fixed_params):
+ """Takes a dataframe with alphanumeric columns (A1-H12) and index being names of
+ the parameters (e.g. Tf_fit, Ea_fit) and sets it as the attribute self.fixed_params.
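+ For example (illustrative), a DataFrame with columns "A1".."H12" and index
+ ["Tf_fit", "Ea_fit"] would supply fixed Tf and Ea values to the model for every well.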
+ + Notes: + ----- + No sanity checks of the input are currently done + """ + self.fixed_params = fixed_params + + def PrepareData(self): + """Prepares input data for processing.""" + # copy raw data to the main plate + # NOTE this is primarily needed for json i/o, to ensure that the analysis runs + # are the same after save/load cycle + # also the dT step size must be reset + self.plate = self.plate_raw.copy() + self.dT = (max(self.plate.index) - min(self.plate.index)) / ( + len(self.plate.index) - 1 + ) + + # Remove all-Nan columns + self.plate = self.plate.dropna(how="all", axis=1) + + # remove the user-specified unneeded wells + if self.exclude: + self.plate = self.plate.drop( + self.plate.columns.intersection(self.exclude), + axis=1, + ) + + # trim data from the beggining or end + if self.trim_min: + self.plate = self.plate[ + self.plate.index >= min(self.plate.index) + self.trim_min + ] + if self.trim_max: + self.plate = self.plate[ + self.plate.index <= max(self.plate.index) - self.trim_max + ] + + # invert the curves + if self.invert: + # reflect the curve relative to the x-axis + self.plate = self.plate * -1 + # normalize + self.plate = self.plate.apply(normalize, from_input=True) + + # if blank wells were specified, average them and subtract from the remaining data + if self.blanks: + self.print_message("Subtracting background...", "i") + try: + bg = self.plate[self.blanks].mean(axis=1) + # remove (drop) buffer-only columns from the dataset + self.plate = self.plate.drop( + self.plate.columns.intersection(self.blanks), + axis=1, + ) + # subtract the background + self.plate = self.plate.sub(bg, axis=0) + except KeyError: + self.print_message("Same well was supplied as Blank and Ignore", "f") + print( + "Please check parameters for --blank --exclude and the layout file", + ) + + # apply median filter + if self.mfilt: + # convert the temperature range to an odd integer window size + self.mfilt = to_odd(self.mfilt, self.dT) + + # check if the window is bigger than the whole dataset + if len(self.plate.index) < self.mfilt: + # NOTE the output may be confusing to the user, because user gives degrees, but gets datapoints + msg = f"Specified medfilt window size ({self.mfilt} datapoints) is bigger than the dataset length ({len(self.plate.index)} datapoints)!" 
+ raise ValueError( + msg, + ) + + self.plate = self.plate.apply(medfilt, kernel_size=self.mfilt) + + # NOTE this must be done after median filtering (spikes are bad for averaging) + if self.shrink is not None: + if self.shrink > self.dT: + # create the range for Temperature binning (will be also used as the index) + bin_range = np.arange( + np.floor(min(self.plate.index)), + np.ceil(max(self.plate.index)) + self.shrink, + self.shrink, + ) + # temporary DataFrame for binning + self.plate_binned = pd.DataFrame( + index=bin_range[:-1], + columns=self.plate.columns, + dtype="float64", + ) + + # average values using the temperature range (plate_binned index, plate_binned index + bin step) + for i in self.plate_binned.index: + self.plate_binned.loc[i, :] = self.plate[ + (self.plate.index > i) & (self.plate.index < i + self.shrink) + ].mean() + + self.plate = self.plate_binned + # remove possible empty rows + self.print_message( + f"Input data binned down to {self.shrink} degree step", + "i", + ) + # required for proper derivative computation + self.dT = self.shrink + else: + msg = f"Requested shrinking step ({self.shrink} degrees) is less than the average temperature step ({self.dT} degrees)" + raise ValueError( + msg, + ) + + # compute the derivative + # NOTE the best way to do it for any type of data - savgol filtering + # which is not present in older scipy versions. In this case use a less correct approach + # NOTE it is not smart to save converted values of savgol to self.savgol, rather use internal window_length var + try: + from scipy.signal import savgol_filter + + # convert window size in temperature units to an odd integer + window_length = to_odd(self.savgol, self.dT) + # check if the window is bigger than the whole dataset + if len(self.plate.index) < window_length: + # NOTE the output may be confusing to the user, because user gives degrees, but gets datapoints + msg = f"Specified savgol window size ({window_length} datapoints) is bigger than the dataset length ({len(self.plate.index)} datapoints)!" + raise ValueError( + msg, + ) + + # NOTE an additional check for savgol requirements (4 is polyorder used by default) + if window_length < 4: + msg = f"Specified savgol window size ({window_length} datapoints) is smaller than the polynomial order of the filter (4); increase savgol window size, or do not shrink the data" + raise ValueError( + msg, + ) + """ + NOTE default mode for savgol is interp, which just uses polynomial fit of the last window to + create the data values beyond the end of the dataset. This may create tails in the derivative + that interfere with peak detection. "nearest" mode just takes the first value and uses it + for the missing parts of the window at data edges; see here for other modes: + https://docs.scipy.org/doc/scipy-0.16.1/reference/generated/scipy.signal.savgol_filter.html + """ + self.plate_derivative = self.plate.apply( + savgol_filter, + window_length=window_length, + polyorder=4, + deriv=1, + mode="nearest", + ) + except: + """ + NOTE usually, this exception would happen with old scipy version + However, this also happened in pyinstaller bundle for Windows: + LinAlgError: SVD did not converge in Linear Least Squares + The derivate is not a crucial element (fits usually converge even with a random Tm guess), + so it should be OK to proceed. 
+ """ + self.print_message( + "Cannot compute a smoothened derivative, fit results can be suboptimal", + "w", + ) + self.print_message("Falling back to the difference method", "i") + self.plate_derivative = (self.plate - self.plate.shift(periods=1)) / self.dT + + def ProcessData(self): + """Performs curve fitting and creates results dataframes. + + Notes: + ----- + The names for the parameters and the contents of plate_results variable are different for various methods, so they are set up based on the selected analysis + """ + if self.model == "skip": + self.print_message("Dataset was omitted from analysis", "i") + # check if there are plate_results attributes from previous analysis and delete them + # NOTE there are more leftover attributes which will be saved to JSON, but without plate_results the dataset should not be recognized as a processed one + if self.analysisHasBeenDone(): + del self.plate_results + del self.plate_results_stdev + # return statement allows to silently terminate the ProcessData method + return + + # check if model name is correct + if self.model not in avail_models.keys(): + msg = f"Unknown model '{self.model}'" + raise ValueError(msg) + + model = avail_models[self.model](scan_rate=self.scan_rate) + + self.print_message("Processing data...", "i") + + # Set here scan rate for the model + # None - not specified (e.g. for CSV), doesn't matter for equilibrium and empirical models; can be set in CLI + # XLSX - parsed from Excel and always available + # kinetic models should raise a ValueError + model.scan_rate = self.scan_rate + + # the function to do curve fitting + f = model.fun + # generate parameter names + result_index = [] + for i in model.param_names(): + result_index.append(i + "_init") + for i in model.param_names(): + result_index.append(i + "_fit") + result_index.append("S") + + # select dataframe to be fit (e.g. 
can be changed to plate_derivative) + df_for_fitting = self.plate + # create dataframe to store results and populate with initial parameter values + # NOTE by default empty dataframes get "object" data type, which can cause problems when using numpy + # if the datatype is specified explicitly, the problem should not exist + self.plate_results = pd.DataFrame( + index=result_index, + columns=self.plate.columns.values, + dtype="float64", + ) + + # create a dataframe to store stdev for fit parameters + # for simplicity the dataframe will also have initial parameters included + # they will be dropped in the end of processing + self.plate_results_stdev = pd.DataFrame( + index=result_index, + columns=self.plate.columns.values, + dtype="float64", + ) + + # an empty p0 variable + p0 = [] + # run a cycle through all columns and calculate fits + self.print_message("Fitting curves...", "i") + + for i in df_for_fitting.columns.values: + # drop Nan values to prevent crashes of fitting + data = df_for_fitting[i].dropna() + T = np.float64(data.index.values) + # guess initial parameters + p0 = model.param_init(data) + self.plate_results[i][0 : len(p0)] = p0 + # get parameter bounds + param_bounds = model.param_bounds(data) + # if the model has these parameters, then run a more precise initial parameter estimation + if {"kN", "bN", "kU", "bU"}.issubset(model.param_names()): + # furthermore, if there is a Tm than run smart Tm pre-estimation + estimate_Tm = "Tm" in model.param_names() + self._estimate_baseline( + data, + fit_length=self.baseline_fit, + estimate_Tm=estimate_Tm, + ) + + # Set bounds for kN, bN, kU, bU to be a multiple of stdev of baseline-prefitting + # NOTE this assumes that the parameters are the first four in the list of bounds/p0 + # originally param_bounds is a tuple, so it has to be converted to a list... 
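+ # e.g. (illustrative) with baseline_bounds=3 the allowed range for kN becomes
+ # kN_init - 3*stdev(kN_init) ... kN_init + 3*stdev(kN_init), and likewise for bN, kU and bU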
+ if self.baseline_bounds > 0: + param_bounds = list(param_bounds) + param_bounds[0] = list(param_bounds[0]) + param_bounds[1] = list(param_bounds[1]) + param_bounds[0][:4] = list( + self.plate_results[i][:4] + - self.plate_results_stdev[i][:4] * self.baseline_bounds, + ) + param_bounds[1][:4] = list( + self.plate_results[i][:4] + + self.plate_results_stdev[i][:4] * self.baseline_bounds, + ) + elif self.baseline_bounds < 0: + msg = f"Expected a non-negative int for baseline_bounds, but got {self.baseline_bounds}" + raise ValueError( + msg, + ) + + # if this MoltenProtFit instance has any fixed parameter dataframe, + # they will be supplied to the model + if self.fixed_params is not None: + model.set_fixed(list(self.fixed_params[i])) + try: + # for some reason, pandas Series for initial parameters have to be converted to a list + # the fit gives two arrays: fit parameters (p) and covariance matrix (covm for stdev estimation) + + # depending on scipy version enforce the parameter limits or not + if (LooseVersion(scipy_version) >= LooseVersion("0.17")) and ( + param_bounds is not None + ): + # NOTE adding ftol=0.01 and xtol=0.01 may in some cases speed up the fitting (loosens the convergence criteria) + # default values 1e-8 are a bit too conservative; in preliminary tests the speedup was marginal + p, covm = curve_fit( + f, + T, + data, + list(self.plate_results[i][0 : len(p0)]), + bounds=param_bounds, + ) + else: + p, covm = curve_fit( + f, + T, + data.values, + list(self.plate_results[i][0 : len(p0) + 1]), + ) + self.plate_results[i][len(p0) : (len(p0)) * 2] = p + # this is the official way to compute stdev error (from scipy docs) + self.plate_results_stdev[i][len(p0) : (len(p0)) * 2] = np.sqrt( + np.diagonal(covm), + ) + except RuntimeError: + # these probably correspond to bad fitting + self.print_message( + f"Curve fit for {i} failed, probably invalid transition.", + "w", + ) + # generate a list of bad fits + self.bad_fit.append(i) + except ValueError as e: + self.print_message(e.__str__(), "w") + # catch some problems of fitting with santoro1988d + self.print_message( + f"Curve fit for {i} failed unexpectedly (ValueError)", + "w", + ) + # generate a list of bad fits + self.bad_fit.append(i) + + # supply sqrt(n) for S calculation + # to calculate RMSE we don't care about the amount of parameters, + # however, they must be included for standrard error of estimate + # (more info here: http://people.duke.edu/~rnau/compare.htm) + try: + self.plate_results[i].loc["S"] = np.sqrt(len(T) - len(p0)) + except TypeError: + self.print_message( + f"Curve for sample {i} has just one value!", + "w", + ) + # add to bad samples list + self.bad_fit.append(i) + + # drop the wells that could not be fit + self.plate_results.drop(self.bad_fit, axis=1, inplace=True) + self.plate_results_stdev.drop(self.bad_fit, axis=1, inplace=True) + + # empty DataFrame with temperature index to store computed fitted curves + self.plate_fit = pd.DataFrame(index=self.plate.index) + + def calculate_fit(input_series): + """Helper function to compute fit curves + input DataFrame is self.plate_results. 
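+ (applied column-wise via plate_results.apply(); for each sample the *_fit parameters
+ are passed to the model function f over the temperature index of plate_fit)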
+ """ + # use the fit parameters from plate_results and the index of plate_fit to compute fit curves + self.plate_fit = pd.concat( + [ + self.plate_fit, + pd.Series( + f( + self.plate_fit.index, + *list(input_series[len(p0) : (len(p0)) * 2]), + ), + index=self.plate_fit.index, + name=input_series.name, + ), + ], + axis=1, + ) + + # apply the helper function + self.plate_results.apply(calculate_fit) + + # calculate S and put it into plate_results + self.print_message("Estimating S...\n", "i") + + # compute S using df_for_fitting + plate_S = (df_for_fitting - self.plate_fit) ** 2 + plate_S = np.sqrt(plate_S.sum()) + self.plate_results.loc["S", :] = plate_S / self.plate_results.loc["S", :] + + # transpose the DataFrame to have sample wells in rows + self.plate_results = self.plate_results.T + self.plate_results_stdev = self.plate_results_stdev.T + + # compute BS-factor to assess how wide is the signal window relative to the noise + # this requires knowledge of baseline parameters and Tm + if {"kN", "bN", "kU", "bU", "Tm"}.issubset(model.param_names()): + # Use S as an overall proxy for assay noise + self.plate_results["BS_factor"] = 1 - 6 * self.plate_results.S / abs( + self.plate_results.kU_fit * self.plate_results.Tm_fit + + self.plate_results.bU_fit + - ( + self.plate_results.kN_fit * self.plate_results.Tm_fit + + self.plate_results.bN_fit + ), + ) + + # compute baseline-corrected raw data if standard baseline parameters are available + if {"kN", "bN", "kU", "bU"}.issubset(model.param_names()): + self._calculate_raw_corr() + + # add layout information to the results + if self.layout is not None: + self.plate_results = pd.concat( + [self.layout, self.plate_results], + join="inner", + axis=1, + ) + + # remove *_init columns and S from self.plate_results_stdev + self.plate_results_stdev.drop( + labels=result_index[: len(p0)], + axis=1, + inplace=True, + ) + self.plate_results_stdev.drop(labels="S", axis=1, inplace=True) + + # based on the model, calculate additional curve characteristics and set the vertical lines for plots + if model.sortby == "dG_std": + # Calculate T_onset + self._calc_Tons("Tm_fit", "dHm_fit", self.onset_threshold) + # dG and dCp component + self.CalculateThermodynamic() + # list of vertical lines to be plotted + self.plotlines = ["T_onset", "Tm_fit"] + elif model.sortby == "dG_comb_std": + # Calculate T_onset + self._calc_Tons("T1_fit", "dHm1_fit", self.onset_threshold) + # Calculate T2 from dT2_1_fit + # TODO error propagation + self.plate_results["T2_fit"] = ( + self.plate_results["T1_fit"] + self.plate_results["dT2_1_fit"] + ) + # NOTE dCp is hard to determine for the intermediate, so it is completely neglected + # dG is calculated for each reaction (N<->I and I<->U) and then combined following the principle of thermodynamic coupling + self.plate_results["dG_comb_std"] = self.plate_results["dHm1_fit"] * ( + 1 - T_std / self.plate_results["T1_fit"] + ) + self.plate_results["dHm2_fit"] * ( + 1 - T_std / self.plate_results["T2_fit"] + ) + # list of vertical lines to be plotted + self.plotlines = ["T_onset", "T1_fit", "T2_fit"] + elif model.sortby == "T_eucl": + # computes Euclidean distance for Tm/T_onset + # Tm and T_ons are on the same scale and are orthogonal characteristics of the sigmoidal curve + # thus, a sample with the most optimal combination is the one that is most far away from T=0 + self.plate_results["T_eucl"] = np.sqrt( + self.plate_results["Tm_fit"] ** 2 + + self.plate_results["T_onset_fit"] ** 2, + ) + # list of vertical lines to be plotted + 
self.plotlines = ["T_onset_fit", "Tm_fit"] + elif model.sortby == "T_eucl_comb": + self.plate_results["T_eucl_comb"] = np.sqrt( + self.plate_results["T1_fit"] ** 2 + + self.plate_results["T_onset1_fit"] ** 2, + ) + np.sqrt( + self.plate_results["T2_fit"] ** 2 + + self.plate_results["T_onset2_fit"] ** 2, + ) + self.plotlines = [ + "T_onset1_fit", + "T1_fit", + "T_onset2_fit", + "T2_fit", + ] + elif model.sortby == "pk_ratio_std": + # the kF/kR at std temperature; take as -log10 to have higher values for higher stability + self.plate_results["pk_ratio_std"] = -np.log10( + model.arrhenius( + T_std, + self.plate_results.TfF_fit, + self.plate_results.EaF_fit, + ) + / model.arrhenius( + T_std, + self.plate_results.TfR_fit, + self.plate_results.EaR_fit, + ), + ) + self.plotlines = ["TfF_fit", "TfR_fit"] + elif model.sortby == "pk_std": + # For irreversible reactions calculate the unfolding rate constant at std temperature + # based on Tf and Ea values + # NOTE by convention higher sorting value indicates higher stability, but it is not the case + # for rate constant of reaction N -> U; a common trick in chemistry is to calculate the + # negative log10 + self.plate_results["pk_std"] = -np.log10( + model.arrhenius( + T_std, + self.plate_results.Tf_fit, + self.plate_results.Ea_fit, + ), + ) + self.plotlines = ["Tf_fit"] + else: + self.print_message( + "Model {} contains no measure for final sorting".format( + model.short_name, + ), + "w", + ) + self.plotlines = [] + + # sort_values based on the model's sortby property + if model.sortby is not None: + self.plate_results.sort_values( + by=model.sortby, + inplace=True, + ascending=False, + ) + + # for proper json i/o + self.plate_results.index.name = "ID" + self.plate_results_stdev.index.name = "ID" + + def CalculateThermodynamic(self): + """Calculates some thermodynamic characteristics of data and append them to self.plate_results: + > dG_std - Gibbs free energy of unfolding at standard temperature (298 K), extrapolated using the values of dCp + > dCp - heat capacity change of unfolding (supplied by user either in the layout or in the command line) + > dHm - enthalpy of unfolding at Tm (calculated during fitting). 
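+
+ dG_std follows the Becktel-Schellman extrapolation with Tm as the reference temperature:
+ dG_std = dHm_fit * (1 - T_std/Tm_fit) - dCp * (Tm_fit - T_std + T_std * ln(T_std/Tm_fit))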
+ """ + # create temporary dataframe to store td results + column_names = ["dCp", "dG_std"] + # NOTE by default empty dataframes get "object" data type, which can cause problems when using numpy + # if the datatype is specified explicitly, the problem should not exist + # NOTE since dCp is set externally, it will be just copy-pasted and no stdev can be computed + td_results = pd.DataFrame( + index=self.plate_results.index, + columns=column_names, + dtype="float64", + ) + # create a separate dataframe for storing stdev + td_results_stdev = pd.DataFrame( + index=self.plate_results.index, + columns=[column_names[1]], + dtype="float64", + ) + + if self.dCp == "from_layout": + td_results["dCp"] = self.layout["dCp"] + else: + td_results["dCp"] = self.dCp + + # issue a warning about inaccuracy of extrapolated dG with dCp=0 + if any(td_results["dCp"] == 0): + self.print_message( + "One or more dCp values are set to 0; this results in an overestimate of dG_std", + "w", + ) + + # dG_std is extrapolated to standard temperature using the model described in Becktel and Schellman, 1987 + # Tm is chosen as the reference temperature for equation (4), which also means that dS(Tm) = dH(Tm)/Tm + td_results["dG_std"] = self.plate_results["dHm_fit"] * ( + 1 - T_std / self.plate_results["Tm_fit"] + ) - td_results["dCp"] * ( + self.plate_results["Tm_fit"] + - T_std + + T_std * np.log(T_std / self.plate_results["Tm_fit"]) + ) + td_results_stdev["dG_std"] = np.sqrt( + self.plate_results_stdev["dHm_fit"] ** 2 + + (self.plate_results["dHm_fit"] * T_std / self.plate_results["Tm_fit"]) + ** 2 + * ( + (self.plate_results_stdev["dHm_fit"] / self.plate_results["dHm_fit"]) + ** 2 + + (self.plate_results_stdev["Tm_fit"] / self.plate_results["Tm_fit"]) + ** 2 + ) + + (td_results["dCp"] * T_std) ** 2 + * ( + (self.plate_results_stdev["Tm_fit"] / T_std) ** 2 + + (self.plate_results_stdev["Tm_fit"] / self.plate_results["Tm_fit"]) + ** 2 + ), + ) + + # calculate contribution of dCp to the real dG + td_results["dCp_component"] = ( + T_std + - self.plate_results["Tm_fit"] + - T_std * np.log(T_std / self.plate_results["Tm_fit"]) + ) + td_results_stdev["dCp_component"] = np.sqrt( + (self.plate_results_stdev["Tm_fit"]) ** 2 + + ( + T_std + * self.plate_results_stdev["Tm_fit"] + / self.plate_results["Tm_fit"] + ) + ** 2, + ) + + # append td_results to self.plate_results and self.plate_results_stdev + self.plate_results = pd.concat( + [self.plate_results, td_results.loc[:, ["dCp_component", "dG_std"]]], + axis=1, + sort=True, + ) + + self.plate_results_stdev = pd.concat( + [self.plate_results_stdev, td_results_stdev], + axis=1, + sort=True, + ) + + def CombineResults(self, tm_stdev_filt, bs_filt, merge_dup, tm_key): + """A helper method to filter results and optionally average duplicates. 
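+ Filtering drops samples whose Tm stdev exceeds tm_stdev_filt or whose BS_factor is
+ below bs_filt; duplicate averaging groups rows by the "Condition" column of the layout.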
+ + Parameters + ---------- + tm_stdev_filt + samples with stdev for Tm above this value will be discarded + tm_key + the name for column with Tm + bs_filt + samples with BS-factor above this value will be kept + merge_dup + whether to merge duplicates (based on annotations in the layout) + + Returns: + ------- + results - joined plate_results/stdev DataFrame with optional filtering and duplicate averaging + + Notes: + ----- + * not all plate_results can contain BS-factor, so this part of filtering will be skipped + """ + # NOTE to prevent irreversible changes to plate_results* attributes, force a copy + results = self.plate_results.copy() + + # we may have 2 types of stdev, one from fitting and another one from duplicate averaging + stdev_fit = self.plate_results_stdev.copy() + + # add suffix _stdev to prevent duplicate column names in joined df's + stdev_fit.columns = stdev_fit.columns + "_stdev" + + # NOTE previously, merge_dup was run _before_ stdev filtering, but this messes up + # subsequent description of the procedure. Also, this operation joins 2 processes + # (removal of bad fits and removal of invalid dups) in one, which is bad + # a better way: first filter to remove crappy fits + # then merge dups and use stdev that comes from them. + # This approach also obsoletes the methods for stdev "joining" + + # use stdev_fit dataframe to get ID's of samples that have improper values + if tm_stdev_filt > 0: + drop_tm_stdev = stdev_fit[tm_key + "_fit_stdev"][ + stdev_fit[tm_key + "_fit_stdev"] > tm_stdev_filt + ].index + results = results.drop(drop_tm_stdev, axis=0) + if bs_filt > 0: + try: + results = results[results["BS_factor"] >= bs_filt] + except KeyError: + self.print_message( + "Column 'BS_factor' not found in the results DataFrame, cannot filter", + "w", + ) + + if merge_dup: + # TODO add an option for discarding samples that do not have a duplicate + # averaging capillary numbers doesn't make sense, convert them to strings + if "Capillary" in results.columns: + results.Capillary = results.Capillary.apply(str) + + # processing steps: remove duplicates with different aggregation functions: + # - text data (ID, Capillary) - create a comma-separated string of values + # - result data - compute mean, stdev and n (amount of duplicated values) + + # prepare DataFrames: + # split result data to text and numeric + text_data = results.select_dtypes(include=["object"]) + numeric_data = results.select_dtypes(include=["float64", "int64"]) + + # for text data convert ID index to a new column (a bit hacky) + text_data.reset_index(inplace=True) + text_data.set_index("ID", inplace=True, drop=False) + + # since the column for de-duplication is text, it has to be re-created for numeric_data and stdev_data + numeric_data = pd.concat([numeric_data, text_data["Condition"]], axis=1) + + # perform deduplication (grouping) + + # for text data we declare a lambda function + text_data = text_data.groupby("Condition").agg(lambda x: ",".join(list(x))) + + # results data - count of each sample + numeric_data_count = numeric_data.groupby("Condition").agg("count") + # now also stdev (add a proper suffix to index) + numeric_data_stdev = numeric_data.groupby("Condition").agg(np.std) + numeric_data_stdev.columns = numeric_data_stdev.columns + "_stdev" + # and finally average + numeric_data = numeric_data.groupby("Condition").agg(np.mean) + + # add n information to the text_data + # BUG if after filtering the dataset is empty, this operation produces an error + # this is caught with try/except + try: + 
text_data["n"] = numeric_data_count.iloc[:, [0]] + except ValueError as e: + self.print_message(e.__str__(), "w") + self.print_message( + "No data left after filtering in CombineResults()", + "w", + ) + + # update the results dataframe + results = pd.concat([text_data, numeric_data, numeric_data_stdev], axis=1) + else: + results = pd.concat([results, stdev_fit], axis=1) + return results + + def RenameResults(self, datatype="sct"): + """#HACK rename Tm to Tagg in for scattering data + # also chemical denaturation renaming can be done here. + """ + if datatype == "sct": + rename_dict = {"Tm_init": "Tagg_init", "Tm_fit": "Tagg_fit"} + elif datatype == "chem": + rename_dict = {"Tm_init": "dGH2O_init", "Tm_fit": "dGH2O_fit"} + msg = "Chemical denaturation renaming is not there yet" + raise NotImplementedError(msg) + + self.plate_results.rename(columns=rename_dict, inplace=True) + self.plate_results_stdev.rename(columns=rename_dict, inplace=True) + + @staticmethod + def _trim_string(string, length=30, symmetric=True): + """Helper method to trim a string to a specific length by removing the middle part. + + Arguments: + --------- + string + string to be trimmed + length + length of the data to keep + symmetric + if true, also show the end of the string + """ + # ensure that input is a str + string = str(string) + # if too short, return as is + if len(string) <= length: + return string + if symmetric: + return string[: length // 2] + " ... " + string[-length // 2 :] + else: + return string[:length] + "..." + + def _plotfig_pdf(self, samples, failed=False): + """A helper method for smart packing of individual sample plots into figures. + + Parameters + ---------- + samples + a valid list of samples to be plotted + failed + indicate if samples are from failed fits + + Returns: + ------- + A list of Figure objects to be added to pages list + """ + if failed: + suptitle = "Excluded/failed samples" + datatype = "raw" + else: + suptitle = "Successful fits" + datatype = "overview" + pages = [] + n_results = len(samples) + # intialize figures, axes, and plot counter + plot_fig, plot_axs = plt.subplots( + 4, + 3, + sharex=False, + sharey=False, + figsize=(8.3, 11.7), + ) + plot_fig.suptitle(suptitle, fontweight="bold", fontsize="x-large") + plot_axs = list(plot_axs.flat) + plot_counter = 0 + for sample in samples: + if plot_counter < 11: + self.plotfig( + "dummy_output", + sample, + datatype=datatype, + save=False, + show=False, + data_ax=plot_axs[plot_counter], + vline_legend=True, + ) + plot_counter += 1 + # if the 11th plot was plotted, append old figure and initialize a new one + # this should be also triggered if less than 11 is plotted + if (plot_counter == 11) or (plot_counter == n_results): + # make legend in the next axes + plot_axs[plot_counter].legend( + handles=plot_axs[plot_counter - 1].get_lines(), + mode="expand", + ncol=1, + ) + # hide remaining unused axes + while plot_counter <= 11: + plot_axs[plot_counter].set_axis_off() + plot_counter += 1 + plot_fig.tight_layout( + rect=(0.02, 0.05, 0.98, 0.95), + ) # rect leaves some margins empty + pages.append(plot_fig) + plot_fig, plot_axs = plt.subplots( + 4, + 3, + sharex=False, + sharey=False, + figsize=(8.3, 11.7), + ) + plot_fig.suptitle( + suptitle + " (continued)", + fontweight="bold", + fontsize="x-large", + ) + plot_axs = list(plot_axs.flat) + plot_counter = 0 + return pages + + def PdfReport(self, outfile): + """Generate and write a multi-page PDF report. 
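+ Page 1 combines a heatmap of the sorting parameter, a top-15 result table and run info;
+ overflow result tables and per-sample plots (11 per page plus a shared legend) follow.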
+ + Parameters + ---------- + outfile + location of the output, overwrite without confirmation + + Notes: + ----- + * heatmap and converter96 are ancient methods, so they are just carefully wrapped around + * each page is a figure object + * multi-page pdf as per mpl [docs](https://matplotlib.org/stable/gallery/misc/multipage_pdf.html) + """ + from matplotlib.backends.backend_pdf import PdfPages + from matplotlib.table import table as mpl_table + + result_columns = ( + self.getResultsColumns() + ) # the first value should be the recommended sorting parameter followed by vlines and BS/S + # add condition column + result_columns = ["Condition", *result_columns] + sort_parameter = result_columns[1] + # preprocess the result df + result_table = self.plate_results.loc[ + :, + result_columns, + ].copy() # to prevent edits to the original DF + result_table = np.round(result_table, 2) # round the numeric data + result_table["Condition"] = result_table["Condition"].apply( + self._trim_string, + length=10, + symmetric=False, + ) + result_table_colors = result_table.copy() # color table 1.0 is white 0 is black + result_table_colors.loc[:, :] = "1.0" + result_table_colors.iloc[::2] = "0.75" + result_table_colors = result_table_colors.values + result_index = result_table.index + result_table = result_table.values # convert to a list of lists + n_results = len(result_table) # total number of results + pages = [] + + ## Page 1: Heatmap of the respective sortby parameter, top 15 results, run info + plate96 = self.converter96(sort_parameter, reference=None) + page1, page1_ax = self.heatmap( + "dummy_output", + plate96, + sort_parameter, + save=False, + pdf_report=True, + ) + page1_ax[0].set_title( + f"Heatmap of {sort_parameter}", + loc="left", + fontweight="bold", + ) + # mpl tables cannot do word wrapping, so trim the file name + filename = self._trim_string(self.filename) + mp_version = __version__ + if from_pyinstaller: + mp_version += " (PyInstaller bundle)" + timestamp = strftime("%c") + # failed fits and user-excluded samples + excluded = self._get_failed_samples() + if len(excluded) > 0: + excluded_str = self._trim_string(", ".join(list(excluded))) + else: + excluded_str = "None" + + info_table = [ + ["Timestamp", timestamp], + ["Input file", filename], + ["Scan rate, degrees/min", self.scan_rate], + ["MoltenProt version", mp_version], + ["Analysis model", self.model], + ["Excluded/failed samples", excluded_str], + ] + info_table_ax = page1_ax[2] + info_table = mpl_table( + info_table_ax, + info_table, + loc="upper left", + edges="open", + cellLoc="left", + ) + + # using the solution from here to set proper font size in the table: + # https://stackoverflow.com/questions/15514005/how-to-change-the-tables-fontsize-with-matplotlib-pyplot + info_table.auto_set_font_size(False) + info_table_ax.set_title("Run info", loc="left", fontweight="bold") + top10_table_ax = page1_ax[1] + mpl_table( + top10_table_ax, + result_table[:15, :], + loc="upper left", + colLabels=result_columns, + cellLoc="left", + rowLabels=result_index[:15], + cellColours=result_table_colors[:15, :], + ) + top10_table_ax.set_title("Top 15 results", loc="left", fontweight="bold") + + info_table_ax.set_axis_off() + top10_table_ax.set_axis_off() + + # finalize page 1 + page1.suptitle( + f"MoltenProt Report: {self.readout_type}", + fontweight="bold", + fontsize="x-large", + ) + # add citation to the bottom of the page + page1.text(0.5, 0.05, citation["short"], ha="center") # fontstyle='italic', + pages.append(page1) + + ## Full result table 
- create if more than 15 results (but less than 48) + if n_results > 15: + page2, page2_ax = plt.subplots(1, 1, figsize=(8.3, 11.7)) + mpl_table( + page2_ax, + result_table[:48], + loc="upper left", + colLabels=result_columns, + cellLoc="left", + rowLabels=result_index[:48], + cellColours=result_table_colors[:48], + ) + page2_ax.set_axis_off() + page2_ax.set_title( + f"Result table (sorted by {sort_parameter})", + loc="left", + fontweight="bold", + ) + pages.append(page2) + + # if more than 48 samples are present + if n_results > 48: + page3, page3_ax = plt.subplots(1, 1, figsize=(8.3, 11.7)) + mpl_table( + page3_ax, + result_table[48:], + loc="upper left", + colLabels=result_columns, + cellLoc="left", + rowLabels=result_index[48:], + cellColours=result_table_colors[48:], + ) + page3_ax.set_axis_off() + page3_ax.set_title( + "Result table (continued)", + loc="left", + fontweight="bold", + ) + pages.append(page3) + + ## pages with plots of individual curves + pages += self._plotfig_pdf(self.plate_results.index) + ## same as above, but for failed fits + pages += self._plotfig_pdf(excluded, failed=True) + + # write output + page_no = 1 + page_count = len(pages) + with PdfPages(outfile) as pdf_file: + for i in pages: + # add page number and save + i.text( + 0.5, + 0.025, + f"-- Page {page_no} of {page_count} --", + fontstyle="italic", + ha="center", + ) + page_no += 1 + pdf_file.savefig(i) + + # clean up mpl objects + plt.close("all") + + def _get_failed_samples(self): + """Return a list of samples that were either excluded or not fit. + + Notes: + ----- + * will raise a value error if the analysis is not done + """ + return self.plate_raw.columns.difference(self.plate_results.index) + + def WriteOutput( + self, + print10=False, + xlsx=False, + genpics=False, + heatmaps=[], + hm_ref=None, + heatmap_cmap=defaults["heatmap_cmap"], + resources_prefix="", + n_jobs=1, + no_data=False, + pdf=False, + ): + """Write the results to the disk. + + Parameters + ---------- + print10 + print 10 best samples to stdout + xlsx + write output in XLSX rather than CSV + genpics + generate figures for all samples + heatmaps + generate heatmaps for selected plate_results columns + hm_ref + reference well for heatmap + heatmap_cmap + matplotlib colormap for heatmap + resources_prefix + extra string to add when saving files (e.g. to prevent overwrites in multi-dataset case) + n_jobs + number of subprocesses to run figure plotting in parallel + no_data + do not output any data + pdf + write a report in PDF format + """ + # estimate how many samples could not go through the processing by finding the + # difference between the indexes of original dataset and the final results + # BUG if the sample is registered in xlsx annotations, but not present in the raw data plate + # (i.e. 
it was marked as bad capillary by Prometheus), then it will not be shown here + # such samples can be easily diagnosed through reports: even the raw-only signal is not plotted + # BUG failed samples have only ID's, but not layout info + failed_samples = self._get_failed_samples() + + # if there is a least one empty sample create respective results dataframes for it + # and temporarily append them to the main results (will not persist as a class attribute) + # NOTE index objects are immutable, so a new empty df has to be created and concatenated + if len(failed_samples) > 0: + failed_results = pd.DataFrame( + index=failed_samples, + columns=self.plate_results.columns, + ) + failed_results_stdev = pd.DataFrame( + index=failed_samples, + columns=self.plate_results_stdev.columns, + ) + output_results = pd.concat([self.plate_results, failed_results]) + output_results_stdev = pd.concat( + [self.plate_results_stdev, failed_results_stdev], + ) + else: + output_results = self.plate_results + output_results_stdev = self.plate_results_stdev + + output_path = self.resultfolder + + if print10: + # print the 10 best conditions to the console + self.print_message("\n\nBest 10 conditions (best->worst):", "i") + print(", ".join(list(self.plate_results[:10].index)) + "\n") + + # the checks for the folder existence are done in the main script + self.print_message("Writing results...", "i") + if not no_data: + if xlsx: + # create a holding object for *.xlsx export + writer = pd.ExcelWriter( + os.path.join(output_path, resources_prefix + "_Results.xlsx"), + ) + + self.plate_raw.to_excel(writer, "Raw data") + self.plate.to_excel(writer, "Preprocessed data") + self.plate_fit.to_excel(writer, "Fit curves") + self.plate_raw_corr.to_excel(writer, "Baseline-corrected") + output_results.to_excel(writer, "Fit parameters") + output_results_stdev.to_excel(writer, "Standard deviations") + + # this is the longest part in XLSX saving procedure + writer.save() + else: + # convert plate_results* dataframes to *.csv's + output_results.to_csv( + os.path.join(output_path, resources_prefix + "_results.csv"), + sep=",", + index_label="Parameters", + encoding="utf-8", + ) + output_results_stdev.to_csv( + os.path.join(output_path, resources_prefix + "_results_stdev.csv"), + sep=",", + index_label="Parameters", + encoding="utf-8", + ) + self.plate_fit.to_csv( + os.path.join(output_path, resources_prefix + "_fit.csv"), + sep=",", + encoding="utf-8", + ) + self.plate.to_csv( + os.path.join(output_path, resources_prefix + "_preproc_curves.csv"), + sep=",", + encoding="utf-8", + ) + self.plate_raw_corr.to_csv( + os.path.join(output_path, resources_prefix + "_raw_corr.csv"), + sep=",", + encoding="utf-8", + ) + # PDF report + if pdf: + self.PdfReport(os.path.join(output_path, resources_prefix + "_report.pdf")) + + # generate heatmaps + if len(heatmaps) > 0: + if "all" in heatmaps: + # columns shared (presumably) between all models + heatmaps = ["S", *self.plotlines] + for i in heatmaps: + try: + plate96 = self.converter96(i, hm_ref) + # check if the column name is in the hm_dic, if not use default options + try: + self.heatmap( + output_path, + plate96, + i, + lowerbetter=self.hm_dic[i]["lowerbetter"], + title=self.hm_dic[i]["title"], + tick_labels=self.hm_dic[i]["tick_labels"], + heatmap_cmap=heatmap_cmap, + ) + except KeyError: + try: + # print that lowerbetter for this heatmap can be wrong + self.print_message( + "{} was not found in hm_dic, the colors may be wrong!".format( + i, + ), + "w", + ) + self.heatmap( + output_path, + 
plate96, + i, + title="Heatmap of " + str(i), + ) + except: + self.print_message( + "Re-check the parameters for --heatmap option", + "w", + ) + except ValueError: + # NOTE a temporary fix for cases when a wrong heatmap parameter was supplied + self.print_message( + f'"{i}" is an invalid option for heatmap', + "w", + ) + print("Valid options are:") + print(", ".join(list(self.plate_results.columns))) + + # clean up after plotting so that no parameters are carried over to downstream plottings + plt.close("all") + + # generate plots of individual samples + # BUG for multi-dataset instance using --genfigs will overwrite images + if genpics: + self.print_message("Generating figures... This may take a while...", "i") + # depending on the value of parallelization either run a single or multi-processor command + if parallelization and n_jobs > 1: + # with picklable plotfig method: + Parallel(n_jobs=n_jobs)( + delayed(self.plotfig)(output_path=output_path, wellID=i) + for i in self.plate_fit.columns.values + ) + # plot raw data of failed samples + if len(failed_samples) > 0: + Parallel(n_jobs=n_jobs)( + delayed(self.plotfig)( + output_path=output_path, + datatype="raw", + wellID=i, + ) + for i in failed_samples + ) + else: + for i in self.plate_fit.columns.values: + # save to default path + self.plotfig(output_path, i) + + # plot failed samples, if any + # TODO add the derivative curve so that the plots look nicer + if len(failed_samples) > 0: + for i in failed_samples: + self.plotfig(output_path=output_path, datatype="raw", wellID=i) + + +class MoltenProtFitMultiple: + """The main class in MoltenProt; contains one or more datasets (i.e. MoltenProtFit instances) + and coordinates their processing. + """ + + def __init__( + self, + scan_rate=None, + denaturant=None, + layout=None, + source=None, + ) -> None: + """Only core settings are defined at the level of initialization: + + layout:DF - annotation for all samples + denaturant:str - can be either a unit of temperature (C, K) or denaturant name (GuHCl, Urea) + all internal processing is done with temperature in Kelvins + source: string - the name of the file that was used to get the data (currently not the real path) + debug level? + + All datasets must be added using a dedicated method. + Defaults to None, which is useful to re-create instances from JSON files + """ + self.layout = layout + if layout is not None: + self.layout_raw = layout.copy() # a backup copy of the original layout + else: + self.layout_raw = None + self.source = source + self.scan_rate = scan_rate + self.denaturant = denaturant + self.datasets = {} # type: ignore[var-annotated] + + def __getstate__(self): + """A special method to enable pickling of class methods.""" + output = self.__dict__ + output["PrepareAndAnalyseSingle"] = self.PrepareAndAnalyseSingle + output["WriteOutputSingle"] = self.WriteOutputSingle + return output + + def print_message(self, text, message_type): + """Print a message and add it to the protocolString of all datasets.""" + if message_type == "w": + prefix = "Warning: " + elif message_type == "i": + prefix = "Information: " + else: + msg = f"Unknown message type '{message_type}'" + raise ValueError(msg) + message = f"{prefix}{text} (All)" + print(message) + message += "\n" + for i, j in self.datasets.items(): + j.protocolString += message + + def GetAnalysisSettings(self): + """Get information on the analysis settings (shared settings and per-dataset model settings). 
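+
+        Example (illustrative; `fits` stands for an already analysed MoltenProtFitMultiple instance):
+            analysis_settings, model_settings = fits.GetAnalysisSettings()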
+ + Returns: + ------- + a tuple of two variables: + > dict with current analysis settings + > dataframe with dataset/model settings (compatible with QtTableWidget) + """ + # NOTE currently all settings except for model are the same for all datasets + # start from default settings + analysis_settings = analysis_kwargs(analysis_defaults) + model_settings = pd.Series( + index=self.GetDatasets(), + name="Model", + data="santoro1988", + ) + model_settings.index.name = "Dataset" + + for dataset_name, dataset in self.datasets.items(): + if dataset.analysisHasBeenDone(): + analysis_settings = analysis_kwargs(dataset.__dict__) + model_settings[dataset_name] = dataset.model + # unprocessed MPF may not have the model attribute + if hasattr(dataset, "model") and dataset.model == "skip": + model_settings[dataset_name] = dataset.model + + # reset index to comply with QtTableWidget + model_settings = model_settings.reset_index() + + return (analysis_settings, model_settings) + + def AddDataset(self, data, readout): + """Add an individual Dataset (as MP_fit instance in datasets attribute). + + Parameters + ---------- + data: pd.DataFrame + a ready-togo DataFrame with Index called "Temperature" or "Denaturant" (not implemented yet) + readout: str + a name for the dataset to be added; will be used to call internal data and thus has to be unique; spaces will be substituted with underscore + + Notes: + ----- + TODO add an option to indicate that the data is about aggregation (to rename to Tagg) + TODO set layouts here as well? + """ + if readout in list(self.datasets.keys()): + msg = "Readout '{}' is already present in this MoltenProtFitMultiple instance!" + raise RuntimeError( + msg, + ) + + # in readout name convert spaces to underscores + readout = readout.replace(" ", "_") + # NOTE parent_filename is only used in reports + self.datasets[readout] = MoltenProtFit( + data, + scan_rate=self.scan_rate, + input_type="from_xlsx", + parent_filename=self.source, + denaturant=self.denaturant, + readout_type=readout, + ) + + def DelDataset(self, todelete): + """Remove a dataset from the instance. + + Parameters + ---------- + todelete + which dataset to delete + + Notes: + ----- + This permanently removes a dataset, so the data cannot be recovered. If the data should be retained, then set model to 'skip' in Analysis + """ + if todelete in self.GetDatasets(): + del self.datasets[todelete] + else: + self.print_message(f"Dataset '{todelete}' not found", "w") + + def GetDatasets(self, no_skip=False): + """Returns available dataset names. + + Parameters + ---------- + no_skip + if True, then datasets with model 'skip' will not be included + """ + if no_skip: + output = [] + for dataset_name, dataset in self.datasets.items(): + # NOTE model attribute is only added after processing + if hasattr(dataset, "model") and dataset.model == "skip": + continue + output.append(dataset_name) + return tuple(output) + else: + return tuple(self.datasets.keys()) + + def UpdateLayout(self): + """For GUI communication only, update the layout of the datasets after the "master" layout in MPFM was changed. 
+ + Notes: + ----- + BUG this method does not use the MPF.SetLayout, which makes code redundant + layouts are set only at SetAnalysisOptions state + """ + for dataset_id, mp_fit in self.datasets.items(): + mp_fit.layout = self.layout + # also update the layout info in plate_results (if present) + if hasattr(mp_fit, "plate_results"): + mp_fit.plate_results["Condition"] = self.layout["Condition"] + + def ResetLayout(self): + """Change the master layout in MPFM to layout_raw (recorded during parsing of XLSX in newer versions of moltenprot) and update all MPF instances.""" + if self.layout_raw is not None: + self.layout = ( + self.layout_raw.copy() + ) # need to copy, because all edits to the layout will propagate to layout_raw, and it will not be "original" + self.UpdateLayout() + else: + self.print_message("Attribute layout_raw is None, nothing to reset", "w") + + def SetScanRate(self, scan_rate): + """Sets a new scan rate (degC/min) to all datasets.""" + # too high precision is not relevant + self.scan_rate = round(scan_rate, 2) + for dataset_id, mp_fit in self.datasets.items(): + mp_fit.scan_rate = self.scan_rate + + def RenameResultsColumns(self, which, mapping): + """E.g. in scattering data, the output is not Tm, but Tagg + Also, for chemical denaturation the columns can be completely different + NOTE currently done via RenameResults of MoltenProtFit. + + Parameters + ---------- + which : str + to which dataset to apply + mapping : dict + a dictionary with keys being original names and values being new names + """ + self.print_message( + f"Renaming results columns in dataset {which}\n{mapping}", + "i", + ) + self.datasets[which].plate_results.rename(columns=mapping, inplace=True) + self.datasets[which].plate_results_stdev.rename(columns=mapping, inplace=True) + + def SetAnalysisOptions(self, which="all", printout=False, **kwargs): + """Set analysis options for a single dataset. + + Parameters + ---------- + which : str + To which dataset the options are applied; all means apply same settings for all datasets + printout : bool + print the settings + **kwargs + args for MoltenProtFit.SetAnalysisOptions() + """ + # Add layout-related options to kwargs + if kwargs.get("blanks"): + # check if the current layout has any blanks listed and remove those + self.layout.Condition = self.layout.Condition.replace("Blank", "") + try: + self.layout.loc[kwargs["blanks"], "Condition"] = "Blank" + except KeyError: + self.print_message( + "One or more blank samples have invalid IDs, no blank info will be available", + "w", + ) + # add to all datasets' protocol string + + # NOTE if the same sample is listed as blank and exclude, it will have only exclude in the end + # i.e. 
exclusion by user has higher priority + if kwargs.get("exclude"): + # check if the current layout has any excluded samples and remove those + self.layout.Condition = self.layout.Condition.replace("Ignore", "") + try: + self.layout.loc[kwargs["exclude"], "Condition"] = "Ignore" + except KeyError: + self.print_message( + "One or more excluded samples have invalid IDs, no exclusion done", + "w", + ) + + kwargs["layout"] = self.layout + kwargs["layout_input_type"] = "from_xlsx" + # NOTE the differences from 1x processing: + # the layout is shared, so we use self.layout, and also tell MoltenProtFit instance that the layout is already a DataFrame (using layout_input_type) + if which == "all": + for i, j in self.datasets.items(): + j.SetAnalysisOptions(**kwargs) + else: + self.datasets[which].SetAnalysisOptions(**kwargs) + + if printout: + print(f"Data type is {i}") + self.datasets[which].printAnalysisSettings() + + def PrepareAndAnalyseSingle(self, which): + """Run data processing pipeline on a single dataset. + + Parameters + ---------- + which + dataset name + """ + self.datasets[which].PrepareData() + self.datasets[which].ProcessData() + + # NOTE return statement is only needed for parallelized code (MoltenProtFitMultiple instance + # gets overwritten and computed results are not stored) + return self.datasets[which] + + def PrepareAndAnalyseAll(self, n_jobs=1): + """Run analysis on all datasets. + + Parameters + ---------- + n_jobs : int + how many parallel processes to start + """ + analysis_tuple = self.GetDatasets() + + # parallelization of analysis routine + if parallelization and n_jobs > 1: + results_tuple = Parallel(n_jobs=n_jobs)( + delayed(self.PrepareAndAnalyseSingle)(i) for i in analysis_tuple + ) + for i, j in zip(analysis_tuple, results_tuple): + self.datasets[i] = j + else: + for i in analysis_tuple: + self.PrepareAndAnalyseSingle(i) + + def CombineResults(self, outfile, tm_stdev_filt=-1, bs_filt=-1, merge_dup=False): + """Join all plate_results/stdev DataFrames and write to a single XLSX file. + + Parameters + ---------- + outfile : str + where to write the output (a full path) + tm_stdev_filt : float + samples with Tm stdev above this value will be discarded + bs_filt : float + samples with BS-factor below this value will be discarded + merge_dup : bool + whether to aggregate samples with identical annotations in layout + + Notes: + ----- + TODO: add more generic filtering options + """ + analysis_tuple = self.GetDatasets() + + # initiate xlsx writer objects + writer = pd.ExcelWriter(outfile) + + for i in analysis_tuple: + tm_key = "Tagg" if i == "Scattering" else "Tm" + + # skip non-processed datasets (model=skip) + if self.datasets[i].model != "skip": + output = self.datasets[i].CombineResults( + tm_stdev_filt=tm_stdev_filt, + bs_filt=bs_filt, + merge_dup=merge_dup, + # merge_stdev=merge_stdev, DEPRECATED? + tm_key=tm_key, + ) + # NOTE Excel sheets are now named identically to input dataset names + output.to_excel(writer, i) + + # write XLSX files to the result folder: + writer.save() + + def GenerateReport(self, heatmap_cmap, template_path=None): + """Creates an interactive HTML report (as a string). 
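+
+        Example (illustrative; `fits` stands for an already analysed MoltenProtFitMultiple instance,
+        "coolwarm" is just one of the standard matplotlib colormaps):
+            report_html = fits.GenerateReport(heatmap_cmap="coolwarm")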
+ + Parameters + ---------- + heatmap_cmap + matplotlib colormap for heatmap + template_path + the HTML template + + Returns: + ------- + A string made from report template where placeholders were substituted to actual HTML code + """ + # use default template if none provided + if template_path is None: + template_path = os.path.join(__location__, "resources/report.template") + + # open the html template + with open(template_path) as template_file: + template = template_file.read() + + # convert it to a string.Template instance + template = Template(template) + + heatmap_table = "" + buttons = "" + + # for the first heatmap display is table, for the rest it is none + display_heatmap = "table" + + # for a single-dataset MPFm do not show buttons + display_buttons = "none" if len(self.datasets) == 1 else "" + + # cycle through all datasets and get glue up strings to the starting button or heatmap string + for dataset_name, dataset in self.datasets.items(): + # skip non-processed datasets + if dataset.model != "skip": + heatmap_table += dataset.html_heatmap(heatmap_cmap, display_heatmap) + if display_heatmap == "table": + display_heatmap = "none" + first_heatmap = dataset_name + buttons += dataset.html_button() + + return template.substitute( + FILE=self.source, + FIRST_HEATMAP=first_heatmap, + HEATMAP_TABLE=heatmap_table, + DISPLAY_BUTTONS=display_buttons, + BUTTONS=buttons, + VERSION=__version__, + TIMESTAMP=strftime("%c"), + ) + + def WriteOutputSingle( + self, + which, + outfolder, + subfolder=False, + **kwargs, # keyword args for WriteOutput + ): + """Write output to disc for a single dataset. + + Parameters + ---------- + which : dataset to process + outfolder : folder for output + heatmap_cmap : str + matplotlib colormap for heatmap + xlsx : bool + write output in XLSX format (default is CSV) + genpics : bool + create figures for samples + heatmaps : list + create heatmaps for the column in list + subfolder : bool + write output in outfolder (default) or create a subfolder called "which_resources" + n_jobs : int + how many parallel processes can be spawned + no_data : bool + no data output + + Notes: + ----- + * No output generated for datasets with model "skip" + * Do not use this method directly, use WriteOutputAll instead + """ + if self.datasets[which].model == "skip": + pass + else: + if subfolder: + outfolder = os.path.join(outfolder, which + "_resources") + os.makedirs(outfolder, exist_ok=True) + + # HACK to minimize edits to MoltenProtFit assingment of outfolder is done via the attribute + self.datasets[which].resultfolder = outfolder + self.datasets[which].WriteOutput( + resources_prefix=which, + **kwargs, + ) + # delete the attribute completely + del self.datasets[which].resultfolder + + def WriteOutputAll( + self, + outfolder, + # report, + xlsx=False, + genpics=False, + heatmaps=[], + report_format=None, + heatmap_cmap=defaults["heatmap_cmap"], + n_jobs=1, + no_data=False, + session=False, + ): + """Write output to disc for all associated datasets. 
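+
+        Example (illustrative; hypothetical output folder, `fits` stands for an already analysed instance):
+            fits.WriteOutputAll("results", xlsx=True, report_format="html", n_jobs=2)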
+ + Parameters + ---------- + outfolder : str + the folder where report.html will be placed and per-dataset subfolders + xlsx : bool + write output in XLSX format (default is CSV) + report : bool DEPRECATED + generate HTML report + summary : bool DEPRECATED + create a compact summary XLSX file + report_format : None or str 'pdf', 'html', 'xlsx' + n_jobs : int + how many output processes to run + genpics : bool + create figures for samples + heatmaps : list + create heatmaps for the column in list + heatmap_cmap : str + matplotlib colormap for heatmap + session : bool + save MP session in JSON format + """ + # generate and populate a dict of output settings + output_kwargs = {} + + if len(self.datasets) == 1: + # for single-dataset instances (and no reports planned) + # write everything to outdir + output_kwargs["subfolder"] = False + + if xlsx: + output_kwargs["xlsx"] = True + if heatmaps: + # heatmaps should be a list with one or more elements + output_kwargs["heatmaps"] = heatmaps + if genpics: + output_kwargs["genpics"] = True + + output_kwargs["heatmap_cmap"] = heatmap_cmap + output_kwargs["no_data"] = no_data + + # NOTE since reports are pre-defined data bundles, they may override some of the previous settings + if report_format == "html": + # generate a reporthtml string + reporthtml = self.GenerateReport(heatmap_cmap=heatmap_cmap) + # write all datatets to dedicated subfolders + output_kwargs["subfolder"] = True + # write the HTML of the report to outdir + with open(os.path.join(outfolder, "report.html"), "w") as file: + file.write(reporthtml) + output_kwargs["xlsx"] = True + output_kwargs["genpics"] = True + output_kwargs["no_data"] = False + elif report_format == "xlsx": + self.CombineResults(os.path.join(outfolder, "report.xlsx"), -1, -1, False) + elif report_format == "pdf": + output_kwargs["pdf"] = True + + # write output in parallel or serially + if parallelization and n_jobs > 1: + if len(self.GetDatasets()) == 1: + # if there is only a single associated dataset, it makes sense to enable parallel figure plotting + self.WriteOutputSingle( + self.GetDatasets()[0], + outfolder, + n_jobs=n_jobs, + **output_kwargs, + ) + else: + Parallel(n_jobs=n_jobs)( + delayed(self.WriteOutputSingle)(i, outfolder, **output_kwargs) + for i in self.GetDatasets() + ) + else: + # resultfolder was cleaned previously or created fresh so we just have to supply a proper prefix + for i in self.GetDatasets(): + self.WriteOutputSingle(i, outfolder, **output_kwargs) + + # NOTE JSON dumping must be done _AFTER_ all parallelized jobs! + if session: + mp_to_json(self, output=os.path.join(outfolder, "MP_session.json")) + + +class MoltenProtFitMultipleLE(MoltenProtFitMultiple): + """A special class to handle the lumry_eyring model + The main difference is that Scattering signal is required and + the datasets are processed sequentially. + """ + + def PrepareAndAnalyseAll(self, n_jobs=1): + """First the scattering data is fit to get Ea and Tf for reaction U->A + Then they are supplied as fixed parameters to fit reaction N <-kF, kR -> U + This fit is done in all other datasets. 
+ """ + # rename dataset if refolded data was used + if "Scattering (Unfolding)" in self.GetDatasets(): + self.datasets["Scattering"] = self.datasets.pop("Scattering (Unfolding)") + + # check if Scattering signal is present + if "Scattering" not in self.GetDatasets(): + msg = "lumry_eyring model requires a Scattering dataset" + raise ValueError(msg) + + # run analysis of Scattering data with irrev model (has to be changed from whatever was supplied) + self.datasets["Scattering"].model = "irrev" + self.PrepareAndAnalyseSingle("Scattering") + # cycle through all other datasets and add fixed parameters + for dataset in self.GetDatasets(): + if dataset != "Scattering": + # NOTE in GUI it is possible that some datasets are not skipped, are not Scattering and + # do not have lumry_eyring model, for those SetFixedParameters will cause AttributeError, + # because non-LE models do not use this feature + if self.datasets[dataset].model == "lumry_eyring": + self.datasets[dataset].SetFixedParameters( + self.datasets["Scattering"] + .plate_results.loc[:, ["Tf_fit", "Ea_fit"]] + .T, + ) + self.PrepareAndAnalyseSingle(dataset) + + +class MoltenProtFitMultipleRefold(MoltenProtFitMultiple): + """Contains special tweaks to work with refolding data from Prometheus NT.48. + + * Analyse all datasets, but not the refolding datasets (addDataset for normal unfolding, addRefoldDataset + for refolding datasets, maintain them separately, but use the same keys in datasets dict) + * take the baseline parameters from unfolding datasets and draw fraction refolded + * for fraction refolded generate a separate report HTML file + * how to show this in GUI? + + Eventually: + * add fraction_irreversibly_unfolded to the equation and fit refolding data + using the parameters fro unfolding as starting values + """ + + pass + + +### Data parsers +""" +Functions to create a MoltenProtFitMultiple instance from a specific +experimental data file. +""" + + +def _csv_helper(filename, sep, dec): + """Pre-processing steps for reading CSV files. + + Returns: + ------- + pd.DataFrame with Temperature as index + """ + try: + data = pd.read_csv( + filename, + sep=sep, + decimal=dec, + index_col="Temperature", + encoding="utf-8", + ) + except ValueError as e: + print(e) + msg = "Input *.csv file is invalid!\nCheck if column called 'Temperature' exists and separators are specified correctly" + raise ValueError( + msg, + ) + + # check if index contains duplicates and drop those + if data.index.duplicated().any(): + print( + "Warning: Temperature scale contains duplicates, all but first occurence are dropped", + ) + data = data.loc[data.index.drop_duplicates(), :] + return data + + +def parse_plain_csv( + filename, + scan_rate=None, + sep=defaults["sep"], + dec=defaults["dec"], + denaturant=defaults["denaturant"], + readout=defaults["readout"], + layout=defaults["layout"], +): + """Parse a standard CSV file with columns Temperature, A1, A2, ... + + Parameters + ---------- + filename : str + path to csv file + sep,dec - csv import parameters + denaturant : str + temperature in input file assumed to be Celsius (default value C), but could be also in K + readout : str + name for the experimental technique (e.g. CD or F330), will be used as key in dataset dict + layout : str or None + specify a special *.csv file which defines the plate layout (i.e. 
what conditions are in each sample) + + Returns: + ------- + MoltenProtFitMultiple instance + + #TODO add removal of zeros in A01, A02, etc + """ + # read the CSV into a DataFrame + data = _csv_helper(filename, sep, dec) + + # read layout (if provided) + if layout is not None: + try: + layout = pd.read_csv(layout, index_col="ID", encoding="utf_8") + except: + print( + "Warning: unsupported layout format! No layout info will be available", + ) + layout = None + + if layout is None: + # if no layout provided or could not be read, create an empty layout DataFrame + layout = pd.DataFrame(index=alphanumeric_index, columns=["Condition"]) + + # initialize and return a MoltenProtFitMultiple instance + output = MoltenProtFitMultiple( + scan_rate=scan_rate, + denaturant=denaturant, + layout=layout, + source=filename, + ) + output.AddDataset(data, readout) + return output + + +def parse_spectrum_csv( + filename, + scan_rate=None, + denaturant=defaults["denaturant"], + sep=defaults["sep"], + dec=defaults["dec"], + readout=defaults["readout"], +): + """Parse CSV file with columns Temperature,wavelengths... + + Parameters + ---------- + filename : str + path to csv file + sep,dec - csv import parameters + denaturant : str + temperature in input file assumed to be Celsius (default value C), but could be also in K + readout : str + name for the experimental technique (e.g. CD or F330), will be used as key in dataset dict + + Returns: + ------- + MoltenProtFitMultiple instance + + Notes: + ----- + * Temperature axis is not sorted + * Layouts are generated automatically from column names (assumed to be respective wavelengths) + """ + data = _csv_helper(filename, sep, dec) + + # if data is too big, take a random subset + if len(data.columns) > 96: + print( + f"Warning: too many wavelengths in the spectrum ({len(data.columns)}), selecting random 96", + ) + data = data.sample(n=96, axis=1) + # to be on the safe side, sort columns ascending + data = data.loc[:, sorted(data.columns)] + # apply the alphanumeric index + data = data.T + data["ID"] = list(alphanumeric_index[: len(data)]) + data.index.name = "Condition" + data.reset_index(inplace=True) + + # set ID as index + data.set_index("ID", inplace=True) + # extract layout info and drop from the main df + # initialize the layout dataframe + layout = pd.DataFrame(index=alphanumeric_index, columns=["Condition"]) + layout.index.name = "ID" + layout.loc[data.index, "Condition"] = data.loc[:, "Condition"].copy() + data.drop(["Condition"], axis=1, inplace=True) + data = data.T + + # initialize and return a MoltenProtFitMultiple instance + output = MoltenProtFitMultiple( + scan_rate=scan_rate, + denaturant=denaturant, + layout=layout, + source=filename, + ) + output.AddDataset(data, readout) + return output + + +def parse_prom_xlsx(filename, raw=False, refold=False, LE=False, deltaF=True): + """Parse a processed file from Prometheus NT.48. In these files temperature + is always in Celsius and the readouts are more or less known. Layout is read + from the overview sheet. 
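+
+    Example (illustrative; hypothetical file name):
+        fits = parse_prom_xlsx("prometheus_export.xlsx")
+        fits.PrepareAndAnalyseAll(n_jobs=4)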
+ + Parameters + ---------- + filename : str + path to xlsx file + raw : bool + if the data is "raw" or "processed" in terms of the manufacturer's GUI + refold : bool + if refolding ramp was used (default False) + LE : bool + instead of standard instance, create the one with Lumry-Eyring model + deltaF : bool + compute an alternative signal-enhanced readout: F350-F330 difference + it is an extensive readout (proportional to protein conc, like F330 or F350), + which also makes the transitions more pronounced (like Ratio) + + Returns: + ------- + MoltenProtFitMultiple instance + + Notes: + ----- + * Parsing relies on sheet names in English + * Current implementation can successfully parse raw XLSX as long as there are less than 96 data columns (which is 3 times the number of capillaries). The data will be contaminated with straight lines of temperature and time + + Todo: + * the sheets have a standardized order in the file: Overview, Readout1_(unfolding), Readout1_(unfolding)_derivative ..., Readout1_(refolding), Readout1_(refolding)_derivative ... ; this can be used to parse data independently of sheet labels; also, the sheets containing "deriv" can be auto-excluded + """ + # force the input file to have absolute path (to be stored in JSON session) + filename = os.path.abspath(filename) + + # read the whole Excel file - get an Ordered Dict + input_xlsx = pd.read_excel(filename, None) + + # parse the layout + # layout contains 3 columns: Condition, Capillary and dCp + # the capillary info can be appended during report generation, but not earlier (needed for Blanks/References etc) + # NOTE if the user manipulated the Overview sheet, additional non-data rows can be read in and produce a messy layout DF. This doesn't seem to affect the processing + if "Overview" not in input_xlsx: + msg = f"Input file {filename} contains no overview sheet" + raise ValueError(msg) + + layout = input_xlsx["Overview"] + layout.reset_index(inplace=True) + # read scan rate from first row in column "Temperature Slope" + # NOTE without conversion to float scan_rate (even if 1) will be saved to JSON as "null"! 
+ scan_rate = float(layout["Temperature Slope"].iloc[0]) + + layout = layout.reindex(["Capillary", "Sample ID", "dCp"], axis=1) + layout.rename(columns={"Sample ID": "Condition"}, inplace=True) + # concatenate A1-H12 and description, then use the "ID" column as the new index + layout = pd.concat([layout, alphanumeric_index], axis=1) + layout.set_index("ID", inplace=True) + + # initialize a MoltenProtFitMultiple instance + if LE: + output = MoltenProtFitMultipleLE( + scan_rate=scan_rate, + layout=layout, + denaturant="C", + source=filename, + ) + else: + output = MoltenProtFitMultiple( + scan_rate=scan_rate, + layout=layout, + denaturant="C", + source=filename, + ) + + # cycle through available readouts and add them to MPMultiple + if refold: + # Full list of readouts, currently only unfolding can be processed + # TODO add a class to process refolding data in conjunction with unfolding + readouts = ( + "Ratio (Unfolding)", + "330nm (Unfolding)", + "350nm (Unfolding)", + "Scattering (Unfolding)", + "Ratio (Refolding)", + "330nm (Refolding)", + "350nm (Refolding)", + "Scattering (Refolding)", + ) + output.print_message( + "Currently refolding data is treated separately from unfolding data", + "w", + ) + else: + readouts = ("Ratio", "330nm", "350nm", "Scattering") + + # NOTE to avoid multiple checks of the scan rate (temp and time scale are the same for all readouts) + refined_scan_rate = None + for i in readouts: + if i in list(input_xlsx.keys()): + data = input_xlsx[i] + """ + Convert the read sheet from *.xlsx + The first column ("Unnamed: 0") contains several rows with NaN values that correspond to one or more columns of the annotations; there is at least one Called Sample ID, and then additional user-defined names. Those have to be removed + The next row contains the value "u'Time [s]'", and it becomes the first row once the previous operation is done. + """ + data = data[data.iloc[:, 0].notna()] + data = data.iloc[1:, :] + + if raw: + # warn the user that there is a potentially harmful data modification + output.print_message( + "Import of raw data requires interpolation to have all readings on the same temperature scale, i.e. the data gets irreversibly modified", + "w", + ) + # in proc data currently there will be: shared time, shared temp, readings + # in raw data there are 3 columns for each sample: time, temperature, readings + # NOTE in some older versions of the raw data the time and temperature are actually the same! 
+ # care must be taken if scan_rate is determined from such files + + # extract readings, temperatures and times + readings = data.iloc[:, 2::3].copy() + temps = data.iloc[:, 1::3] + times = data.iloc[:, 0::3] + # the time and temperature of the first sample will be used in the final scale + time_scale = times.iloc[:, 0].astype(float) + temp_scale = temps.iloc[:, 0].astype(float) + # cycle through all samples and perform interpolation + for col_ix in range(len(readings.columns)): + r_col = readings.columns[col_ix] + t_col = temps.columns[col_ix] + # interpolation function + interpolator = interp1d( + temps[t_col], + readings[r_col], + bounds_error=False, + ) + # interpolated values for readings' + readings.loc[:, r_col] = interpolator(temp_scale) + # add time and temperature of the first sample to the output data + data = pd.concat([time_scale, temp_scale, readings], axis=1) + + # determine true scan rate by running a linear fit of temperature vs time + if refined_scan_rate is None: + temp_vs_time = data.iloc[:, :2].astype(float).dropna() + slope, intercept = np.polyfit( + temp_vs_time.iloc[:, 0], + temp_vs_time.iloc[:, 1], + 1, + ) + refined_scan_rate = slope * 60 # convert degC/sec to degC/min + + # remove the first column with time + data.drop(data.columns[0], inplace=True, axis=1) + # rename the first column to "Temperature" + data.rename(columns={data.columns[0]: "Temperature"}, inplace=True) + # set Temperature as the index column + data.set_index("Temperature", inplace=True) + # convert column names to A1-D12 + data.columns = list(alphanumeric_index.iloc[0 : len(data.columns)]) + # for compatibility with future pandas versions we must make sure that data type is float32 + data = data.apply(pd.to_numeric, errors="coerce") + + # Create a MoltenProtFit instance with this DataFrame as data source + output.AddDataset(data, i) + else: + output.print_message(f"Readout {i} not found", "w") + + if deltaF: + # check if F330 and F350 are available + if {"330nm", "350nm"}.issubset(output.GetDatasets()): + readout_ids = ("330nm", "350nm") + elif {"330nm_(Unfolding)", "350nm_(Unfolding)"}.issubset( + output.GetDatasets(), + ): + readout_ids = ("330nm_(Unfolding)", "350nm_(Unfolding)") + else: + output.print_message( + "Cannot compute readout deltaF: either F330 or F350 is missing in the input data", + "w", + ) + readout_ids = None + + if readout_ids is not None: + # HACK Temperature in other datasets is already in Kelvins, but denaturant is set to C + # to prevent adding extra 273.15 to index temporarily set denaturant to K + output.denaturant = "K" + output.AddDataset( + output.datasets[readout_ids[1]].plate + - output.datasets[readout_ids[0]].plate, + "deltaF", + ) + output.denaturant = "C" + + # Check if any datasets could be properly added + if len(output.datasets) < 1: + msg = f"Input file {filename} contains no data" + raise ValueError(msg) + + # assign refined scan_rate + if abs(refined_scan_rate - output.scan_rate) > 0.2: + # in range 1-7 degC/min the difference between nominal and true rate is less than 0.2 deg/min + output.print_message( + f"The difference between nominal ({output.scan_rate}) and estimated ({refined_scan_rate}) scan rate >0.2 degrees/min", + "w", + ) + output.SetScanRate(refined_scan_rate) + + return output + + +def mp_from_json(input_file): + """Read a json file and if successful return a ready-to-use MoltenProtFit instance. 
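+
+    Example (illustrative; assumes a session file previously written by mp_to_json):
+        fits = mp_from_json("MP_session.json")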
+ + Parameters + ---------- + input_file + input file in JSON format + + Notes: + ----- + BUG column ordering is messed up after JSON I/O + """ + with open(input_file) as file: + return json.load(file, object_hook=deserialize) + + +def mp_to_json(object_inst, output=None): + """Convert an MoltenProtFit/MPFMultiple instance to a JSON file. + + Parameters + ---------- + object_inst : + MP instance to be converted to JSON + output : string or None + if None, return a JSON string + if output=='self', then use object_inst.resultfolder attribute + otherwise use str as a location where to write + + Returns: + ------- + string + only if output parameter is None + + Notes: + ----- + BUG Column sorting is usually A1 A2..., but after json i/o it is A1 A10... + """ + if output is None: + # indent=4 makes the output JSON more human-readable + return json.dumps(object_inst, default=serialize, indent=4) + + # if output is some kind of string we can use it to write the output + if output == "self": + # DELETE only useful for old MPFMultiple + # a special case is when self.resultfolder is used + # in all other just use the user-provided string + output = os.path.join(object_inst.resultfolder, "MP_session.json") + + with open(output, "w") as file: + json.dump(object_inst, file, default=serialize, indent=4) + + """ + # add compression: output file is smaller, but the process itself is slower + with gzip.GzipFile(self.resultfolder+'/MP_session.json.gz', 'w') as fout: + fout.write(json.dumps(self, default=serialize, indent=4)) + """ + return None diff --git a/regression/rt-cetsa-moltenprot-tool/src/polus/tabular/regression/rt_cetsa_moltenprot/models.py b/regression/rt-cetsa-moltenprot-tool/src/polus/tabular/regression/rt_cetsa_moltenprot/models.py new file mode 100644 index 0000000..ac1ec99 --- /dev/null +++ b/regression/rt-cetsa-moltenprot-tool/src/polus/tabular/regression/rt_cetsa_moltenprot/models.py @@ -0,0 +1,658 @@ +"""Copyright 2020,2021 Vadim Kotov, Thomas C. Marlovits. + +This file is part of MoltenProt. + +MoltenProt is free software: you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation, either version 3 of the License, or +(at your option) any later version. + +MoltenProt is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with MoltenProt. If not, see . 
+""" +""" +NOTE +This file formalizes models used in MoltenProt fitting (MPModel class) +User-defined models can be added here +""" + +# finding number of arguments in a function +from inspect import signature + +import numpy as np + +# numeric integration +from scipy.integrate import solve_ivp + +### Constants +R = 8.314 # universtal gas constant +T_std = 298.15 # standard temperature, in Kelvins + + +class MoltenProtModel: + # dummy function is defined here, because it does not make sense for the + # instances to be able to have a different function + # the first argument HAS to be the X-axis (independent var) + fun = lambda x, k, b: k * x + b + + # same applies for the model description - no point to make an instance-specific value + _description = "A dummy MoltenProt model" + + # since a single class is created for each model, then the long name is already in the class name + # here we can also add a short name used in the main module + short_name = "model_template" + + # what measure to use in final sorting - calculated in MoltenProtFit + # NOTE all measures are selected in such a way that higher values correspond to higher stability + # if None, then final sorting is skipped + sortby = None + + def __init__(self, scan_rate=None) -> None: + """In a general case scan rate is not relevant, so it is set to None + For kinetic data it needs to be set for sure. + """ + self.scan_rate = scan_rate + + def __repr__(self) -> str: + """For interactive prompts or print() - show the model description.""" + return self._description + + def __str__(self) -> str: + """When converting to a string return the short name.""" + return self.short_name + + def param_names(self): + """Return parameter names encoded in the function declaration as a list.""" + params = signature(self.fun).parameters + # skip parameter self + return list(params)[1:] + + def param_init(self, input_data=None): + """Return starting parameters based on the input values + input_data must be a pd.Series with an index (e.g. Temperature) + returns None if the parameters should be guessed by curve_fit. + """ + if input_data is None: + return + + def param_bounds(self, input_data=None): + """Return the bounds based on the input data. + input_data must be a pd.Series with an index (e.g. Temperature) + returns (-np.inf, np.inf) if no bounds are to be set. + """ + if input_data is None: + return (-np.inf, np.inf) + return None + + +class EquilibriumTwoState(MoltenProtModel): + short_name = "santoro1988" + _description = "N <-> U" + sortby = "dG_std" # type: ignore[assignment] + + # original function + # d -> dHm + # NOTE parameter names do matter. 
If kN,bN,kU,bU and Tm are present, then _estimate_baseline routine + # will be run by MoltenProtFit instance to get the best possible starting values + def fun(self, T, kN, bN, kU, bU, dHm, Tm): + return (kN * T + bN + (kU * T + bU) * np.exp(dHm / R * (1 / Tm - 1 / T))) / ( + 1 + np.exp(dHm / R * (1 / Tm - 1 / T)) + ) + + def param_bounds(self, input_data=None): + # if no data supplied, run the default action from the master class + # otherwise compute bounds from plate index or hard-coded + if input_data is None: + return super().param_bounds(None) + else: + return ( + (-np.inf, -np.inf, -np.inf, -np.inf, 60000, min(input_data.index)), + (np.inf, np.inf, np.inf, np.inf, 4000000, max(input_data.index)), + ) + + def param_init(self, input_data=None): + # Initial parameters - pre baseline has no intercept and 45 degree slope + # original implementation did not put values for Tm, which was computed dynamically in main code + # here a good starting Tm is just the middle of th range (or 0 if no data provided) + # NOTE for custom models any parameter initialization code should be implemented here + if input_data is None: + return (1, 0, 2, 0, 100000, 0) + else: + return ( + 1, + 0, + 2, + 0, + 100000, + min(input_data.index) + + (min(input_data.index) + max(input_data.index)) / 2.0, + ) + + +class EquilibriumThreeState(MoltenProtModel): + short_name = "santoro1988i" + _description = "N <-> I <-> U" + # in theory total stability of the protein is the sum of stabilities of N and I + sortby = "dG_comb_std" # type: ignore[assignment] + + def fun(self, T, kN, bN, kU, bU, kI, dHm1, T1, dHm2, dT2_1): + # dT2_1 = T2 - T1, i.e. the distance between the two transitions + return ( + kN * T + + bN + + kI * np.exp(dHm1 / R * (1 / T1 - 1 / T)) + + (kU * T + bU) + * np.exp(dHm1 / R * (1 / T1 - 1 / T)) + * np.exp(dHm2 / R * (1 / (T1 + dT2_1) - 1 / T)) + ) / ( + 1 + + np.exp(dHm1 / R * (1 / T1 - 1 / T)) + + np.exp(dHm1 / R * (1 / T1 - 1 / T)) + * np.exp(dHm2 / R * (1 / (T1 + dT2_1) - 1 / T)) + ) + + def param_bounds(self, input_data=None): + # TESTING preliminary results show that no limits for dHm are better in intermediate mode + if input_data is None: + return super().param_bounds(None) + else: + # by definition T2 follows T1, so dT2_1 is > 0 + # the upper bound for dT2_1 is 1/2 of the full temperature range (i.e. the limit on max distance between the two Tms) + return ( + ( + -np.inf, + -np.inf, + -np.inf, + -np.inf, + -np.inf, + -np.inf, + min(input_data.index), + -np.inf, + 0, + # allow dT2_1 to be +/- + ), + ( + np.inf, + np.inf, + np.inf, + np.inf, + np.inf, + np.inf, + max(input_data.index), + np.inf, + (max(input_data.index) - min(input_data.index)) / 2, + ), + ) + + def param_init(self, input_data=None): + # Initial parameters - pre baseline has no intercept and are 45 degree slope + # For dT2_1 we start from the assumption that dT2_1=0, i.e. there is no 2nd transition + if input_data is None: + return (1, 0, 2, 0, 1, 100000, 0, 100000, 0) + else: + # T1 is heuristically placed in the middle of the temp range + temp_range = max(input_data.index) - min(input_data.index) + return ( + 1, + 0, + 2, + 0, + 1, + 100000, + min(input_data.index) + 0.5 * temp_range, + 100000, + 0, + ) + + +class EmpiricalTwoState(MoltenProtModel): + short_name = "santoro1988d" + _description = "Same as santoro1988, but fits Tm and T_onset" + sortby = "T_eucl" # type: ignore[assignment] + # NOTE onset threshold is hard-coded to 0.01, i.e. 
onset point is 1% unfolded + onset_threshold = 0.01 + + def fun(self, T, kN, bN, kU, bU, T_onset, Tm): + return ( + kN * T + + bN + + (kU * T + bU) + * np.exp( + (T - Tm) + * np.log(self.onset_threshold / (1 - self.onset_threshold)) + / (T_onset - Tm), + ) + ) / ( + 1 + + np.exp( + (T - Tm) + * np.log(self.onset_threshold / (1 - self.onset_threshold)) + / (T_onset - Tm), + ) + ) + + def param_bounds(self, input_data=None): + if input_data is None: + return super().param_bounds(None) + else: + return ( + ( + -np.inf, + -np.inf, + -np.inf, + -np.inf, + min(input_data.index), + min(input_data.index), + ), + ( + np.inf, + np.inf, + np.inf, + np.inf, + max(input_data.index) - 2, + max(input_data.index), + ), + ) + + def param_init(self, input_data=None): + # Initial parameters - pre baseline has no intercept and are 45 degree slope + if input_data is None: + return (1, 0, 2, 0, 1, 2) + else: + return ( + 1, + 0, + 2, + 0, + min(input_data.index) + 10, + (min(input_data.index) + max(input_data.index)) / 2.0, + ) + + +class EmpiricalThreeState(MoltenProtModel): + short_name = "santoro1988di" + _description = "Same as santoro1988i, but fits Tm and T_onset" + # similar to thermodynamic 3-state model: sum up Euclidean temperature distance + # for both reaction steps + sortby = "T_eucl_comb" # type: ignore[assignment] + # NOTE onset threshold is hard-coded to 0.01, i.e. onset point is 1% unfolded + onset_threshold = 0.01 + + def fun(self, T, kN, bN, kU, bU, kI, T_onset1, T1, T_onset2, T2): + return ( + kN * T + + bN + + kI + * np.exp( + (T - T1) + * np.log(self.onset_threshold / (1 - self.onset_threshold)) + / (T_onset1 - T1), + ) + + (kU * T + bU) + * np.exp( + (T - T2) + * np.log(self.onset_threshold / (1 - self.onset_threshold)) + / (T_onset2 - T2), + ) + * np.exp( + (T - T1) + * np.log(self.onset_threshold / (1 - self.onset_threshold)) + / (T_onset1 - T1), + ) + ) / ( + 1 + + np.exp( + (T - T1) + * np.log(self.onset_threshold / (1 - self.onset_threshold)) + / (T_onset1 - T1), + ) + + np.exp( + (T - T2) + * np.log(self.onset_threshold / (1 - self.onset_threshold)) + / (T_onset2 - T2), + ) + * np.exp( + (T - T1) + * np.log(self.onset_threshold / (1 - self.onset_threshold)) + / (T_onset1 - T1), + ) + ) + + def param_bounds(self, input_data=None): + if input_data is None: + return super().param_bounds(None) + else: + return ( + [ + -np.inf, + -np.inf, + -np.inf, + -np.inf, + -np.inf, + min(input_data.index), + min(input_data.index), + min(input_data.index), + min(input_data.index), + ], + [ + np.inf, + np.inf, + np.inf, + np.inf, + np.inf, + max(input_data.index), + max(input_data.index), + max(input_data.index), + max(input_data.index), + ], + ) + + def param_init(self, input_data=None): + # Initial parameters - pre baseline has no intercept and are 45 degree slope + if input_data is None: + return (1, 0, 2, 0, 1, 1, 1, 1, 1) + else: + temp_range = max(input_data.index) - min(input_data.index) + # with these starting values first transition is supposed to start in the beginning of the curve + # and the second transition starts in the end of the curve. 
Then they should meet + return ( + 1, + 0, + 2, + 0, + 1, + min(input_data.index) + 0.2 * temp_range, + min(input_data.index) + 0.4 * temp_range, + max(input_data.index) - 0.4 * temp_range, + max(input_data.index) - 0.2 * temp_range, + ) + + +class IrreversibleTwoState(MoltenProtModel): + short_name = "irrev" + _description = "N -> U" + sortby = "pk_std" # type: ignore[assignment] + xn = 1 # y0 (starting condition) for differential equation + + def __init__(self, scan_rate) -> None: + # scan rate is an essential parameter and must thus be set explicitly + if scan_rate is not None: + self.scan_rate = scan_rate + else: + msg = f"{self.short_name} model requires scan_rate to be set" + raise ValueError( + msg, + ) + + def arrhenius(self, t, Tf, Ea): + """Arrhenius equiation: defines dependence of reaction rate constant k on temperature + In this version of the equation we use Tf (a temperature of k=1) + to get rid of instead of pre-exponential constant A. + """ + return np.exp(-Ea / R * (1 / t - 1 / Tf)) + + def ode(self, t, xn, Tf, Ea): + """Ordinary differential equation for fraction native versus temperature + dxn/dT = -1/v*k(T)*xn. + + start_value xn - should be always 1 because at the start of assay we assume everything is folded + v - scan rate to convert minutes of scan rate to degrees (default 1) + xn - fraction native (xn + xagg = 1) + k(T) - temperature-dependent rate constant of aggregation + """ + return -1 / self.scan_rate * self.arrhenius(t, Tf, Ea) * xn + + def fun(self, t, kN, bN, kU, bU, Tf, Ea): + """Returns aggregation signal at given temperature + Signal(T) = (kN*T + bN)*xn +(kU*T + bU)*xu + k, b - baseline parameters (N or U state) + xn, xu - fraciton native/unfolded, xn + xu = 1 + in other words: + Signal(T) = kU*T + bU + (kN*T + bN - kU*T - bU) * xn. 
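+
+        Illustrative fitting sketch (hypothetical variable names; shows how a model instance can be
+        handed to scipy.optimize.curve_fit, the actual MoltenProt fitting code may differ).
+        Here `series` would be a pd.Series of the readout indexed by temperature in Kelvin:
+            from scipy.optimize import curve_fit
+            model = IrreversibleTwoState(scan_rate=1.0)
+            p_opt, p_cov = curve_fit(
+                model.fun,
+                series.index.values,   # temperatures (K)
+                series.values,         # measured signal
+                p0=model.param_init(series),
+                bounds=model.param_bounds(series),
+            )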
+ """ + # step 1: numerically integrate agg_ode for given parameters - gives xn(T) + ivp_result = solve_ivp( + self.ode, + t_span=[min(t), max(t)], + t_eval=t, + y0=[self.xn], + args=(Tf, Ea), + method="BDF", + ) + + # step 2: return the result of the signal + return kU * t + bU + (kN * t + bN - kU * t - bU) * ivp_result.y[0, :] + + def param_init(self, input_data=None): + if input_data is None: + # without input data it's hard to guess starting values + # but it seems that starting with a high Tf may help + # kN, bN, kU, bU, Tf, Ea + return (0, 1, 0, 1, 400, 100000) + else: + # the baselines will have a better guess in MoltenProt + # since Tf may or may not coincide with the derivative peak + # it is taken as the middle of the temp range + return ( + 0, + 1, + 0, + 1, + min(input_data.index) + + (max(input_data.index) - min(input_data.index)) / 2.0, + 50000, + ) + + def param_bounds(self, input_data=None): + # NOTE it may happen that Tf is ouside the temperature range, but the curve is still OK + # also, MoltenProt calculations are always done in Kelvins + # thus, the default bounds are quite relaxed + if input_data is None: + return ( + (-np.inf, -np.inf, -np.inf, -np.inf, 1, 0), + (np.inf, np.inf, np.inf, np.inf, np.inf, np.inf), + ) + else: + return ( + (-np.inf, -np.inf, -np.inf, -np.inf, min(input_data.index) - 100, 0), + (np.inf, np.inf, np.inf, np.inf, max(input_data.index) + 100, np.inf), + ) + + +""" +NOTE given the current computational workload even of a simple kinetic equiation implementation of more complex +equations makes little sense + +class IrreversibleThreeState(IrreversibleTwoState): + short_name = 'irrev' + _description = "N -> I -> U" +""" + +# TODO another (potentially useful) variant could be to implement N <-kF,kR-> U without any A +# the class can be called ReversibleTwoState, because there are only U and N, and U can sometimes become N + + +class LumryEyring(IrreversibleTwoState): + short_name = "lumry_eyring" + _description = "N <- kF,kR -> U -> A" + + # the kF/kR at std temperature; take as -log10 to have higher values for higher stability + sortby = "pk_ratio_std" # type: ignore[assignment] + + # fmt: off + # ~_~ + z0 = (0,0) # starting values for system of 2x ode + # /,www,\ + # ||wWw|| an owl?! + # \|||/ + # ~~~m'm~~~ + # fmt: on + def __init__(self, scan_rate, tfea=[None, None]) -> None: + # scan rate is an essential parameter and must thus be set explicitly + if scan_rate is not None: + self.scan_rate = scan_rate + else: + msg = f"{self.short_name} model requires scan_rate to be set" + raise ValueError( + msg, + ) + + # tfea are characteristics of the irreversible aggregation reaction (Tf and Ea) + # and are obtained separately; they are fixed during fitting + self.tfea = tfea + + def set_fixed(self, tfea): + """Set parameters that are needed in the fit equation, but not being fit + tfea must be a list with two values [Tf, Ea] (floats) + NOTE only the length of the input is checked, but not the type of list elements. 
+ """ + if len(tfea) != 2: + msg = "LE model requires tfea to be a list of two floats" + raise ValueError(msg) + self.tfea = tfea + + def ode( + self, + t, + z0, + TfF, + EaF, + TfR, + EaR, + Tf2, + Ea2, + ): + """A function to process a system of differential equations x and y packaged into array z = [x,y] + Implements a general case of Lumry-Eyring equation (case D in Mazurenko2017) + N <- kF, kR -> U - k2 -> A + there is an quasi-equilibrium between N and U, but irreversible conversion to A + As described in the paper: + + fN + fU + fA = 1 + dfAdT = 1/v * k2(T) * fU(T) ...................................... (eq. y) + dfUdT = 1/v* ( kF(T)*fN - (kR(T)+k2(T))*fU ) + + since fN = 1 - fU - fA, can rewrite dfUdT like this: + + dfUdT = 1/v * ( kF(T) * (1 - fU - fA) - ( kR(T)+k2(T) )*fU ) ..... (eq. x) + + the formula for rate in reaction i would then be: + ki(T) = exp(-Eai/R * (1/T - 1/Tfi)) + i can be F or R for N<->U and 2 for U->A + + t - temperature scale (converted to kinetic time using v, the scan rate (global constant) + z0 - starting values for equations in the system [fU=0, fA=0] + """ + # unpack initial values + x0, y0 = z0 + + dxdt = ( + 1 + / self.scan_rate + * ( + self.arrhenius(t, TfF, EaF) * (1 - x0 - y0) + - (self.arrhenius(t, TfR, EaR) + self.arrhenius(t, Tf2, Ea2)) * x0 + ) + ) + dydt = 1 / self.scan_rate * self.arrhenius(t, Tf2, Ea2) * x0 + + return (dxdt, dydt) + + # def fun(self, T, TfF, EaF, TfR, EaR, kNF, bNF, kUF, kAF, bAF): + def fun(self, T, kN, bN, kU, bU, kI, TfF, EaF, TfR, EaR): + """Uses pre-computed Tf and Ea for scattering data to model fluorescence signal + NOTE the original order and naming of the parameters is as follows: + T, TfF, EaF, TfR, EaR, kNF, bNF, kUF, kAF, bAF + T - temperature/time + TfF - temperature at which the rate constant kF=1 (reaction N->U) + EaF - activation energy for N->U + TfR,EaR - Tf and Ea for reaction U->N + kNF, bNF - slope and intercept for the baseline of state N + kUF - slope for the fluorescence of state U (assumed to be short-lived and not abundant, see Bedouelle2016) + kAF, bAF - slope/intercept for the baseline of state A, which ultimately makes up the post-transition baseline. 
+ + For MoltenProt to recognize pre- and post- baseline parameters the have to be placed first + and renamed as follows: + kNF, bNF - kN, bN + kAF, bAF - kU, bU + kUF - kI (similar to three-state cases above) + + NOTE this is not the only way to define the law of signal; for instance, we can assume + that the fluorescence has a similar time dependence for states U and A; then kI (aka kUF) is not needed + and the law of signal will be: + (kN * T + bN) * fN + (kU*T + bU) * (fA + fU) + """ + ivp_result = solve_ivp( + self.ode, + t_span=[min(T), max(T)], + y0=self.z0, + args=(TfF, EaF, TfR, EaR, self.tfea[0], self.tfea[1]), + t_eval=T, + method="BDF", + ) + + # based on diff eqn compute fractions of each state + fU = ivp_result.y[0, :] + fA = ivp_result.y[1, :] + fN = 1 - fU - fA + # return modelled fluorescence signal + return (kN * T + bN) * fN + kI * fU + (kU * T + bU) * fA + + def param_init(self, input_data=None): + if input_data is None: + # without input data it's hard to guess starting values + # kN, bN, kU, bU, kI, TfF, EaF, TfR, EaR + return (0, 1, 0, 1, 0, 400, 100000, 400, 100000) + else: + # the baselines will have a better guess in MoltenProt + # Tf is probably not similar to Tm, however, it makes sense to try the middle of the range + temp_range = max(input_data.index) - min(input_data.index) + return ( + 0, + 1, + 0, + 1, + 0, + min(input_data.index) + temp_range / 2.0, + 50000, + min(input_data.index) + temp_range / 2.0, + 50000, + ) + + def param_bounds(self, input_data=None): + # NOTE it may happen that Tf is ouside the temperature range, but the curve is still OK + # also, MoltenProt calculations are always done in Kelvins + # thus, the default bounds are quite relaxed + # NOTE Tf should not be zero, see the Arrhenius formula + if input_data is None: + return ((-np.inf, -np.inf, -np.inf, -np.inf, -np.inf, 1, 0, 1, 0), np.inf) + else: + return ( + ( + -np.inf, + -np.inf, + -np.inf, + -np.inf, + -np.inf, + min(input_data.index) - 100, + 0, + min(input_data.index) - 100, + 0, + ), + ( + np.inf, + np.inf, + np.inf, + np.inf, + np.inf, + max(input_data.index) + 100, + np.inf, + max(input_data.index) + 100, + np.inf, + ), + ) diff --git a/regression/rt-cetsa-moltenprot-tool/tests/__init__.py b/regression/rt-cetsa-moltenprot-tool/tests/__init__.py new file mode 100644 index 0000000..d420712 --- /dev/null +++ b/regression/rt-cetsa-moltenprot-tool/tests/__init__.py @@ -0,0 +1 @@ +"""Tests.""" diff --git a/regression/rt-cetsa-moltenprot-tool/tests/conftest.py b/regression/rt-cetsa-moltenprot-tool/tests/conftest.py new file mode 100644 index 0000000..4983315 --- /dev/null +++ b/regression/rt-cetsa-moltenprot-tool/tests/conftest.py @@ -0,0 +1,13 @@ +"""Set up.""" +import pytest + + +def pytest_addoption(parser: pytest.Parser) -> None: + """Add options to pytest.""" + parser.addoption( + "--slow", + action="store_true", + dest="slow", + default=False, + help="run slow tests", + ) diff --git a/regression/rt-cetsa-moltenprot-tool/tests/test_moltenprot.py b/regression/rt-cetsa-moltenprot-tool/tests/test_moltenprot.py new file mode 100644 index 0000000..80816cc --- /dev/null +++ b/regression/rt-cetsa-moltenprot-tool/tests/test_moltenprot.py @@ -0,0 +1,17 @@ +"""Tests.""" +from polus.tabular.regression.rt_cetsa_moltenprot import run_moltenprot_fit +from pathlib import Path + +import pytest + + +@pytest.mark.skipif("not config.getoption('slow')") +def test_moltenprot(): + path = Path(__file__).parent / "data" / "plate_(1-58).csv" + params, values = run_moltenprot_fit(path) + assert params is 
not None
+    assert values is not None
+
+
+def test_dummy_test():
+    pass
diff --git a/transforms/rt-cetsa-metadata-tool/.bumpversion.cfg b/transforms/rt-cetsa-metadata-tool/.bumpversion.cfg
new file mode 100644
index 0000000..1ac2f14
--- /dev/null
+++ b/transforms/rt-cetsa-metadata-tool/.bumpversion.cfg
@@ -0,0 +1,29 @@
+[bumpversion]
+current_version = 0.5.0-dev0
+commit = True
+tag = False
+parse = (?P<major>\d+)\.(?P<minor>\d+)\.(?P<patch>\d+)(\-(?P<release>[a-z]+)(?P<dev>\d+))?
+serialize =
+    {major}.{minor}.{patch}-{release}{dev}
+    {major}.{minor}.{patch}
+
+[bumpversion:part:release]
+optional_value = _
+first_value = dev
+values =
+    dev
+    _
+
+[bumpversion:part:dev]
+
+[bumpversion:file:VERSION]
+
+[bumpversion:file:pyproject.toml]
+search = version = "{current_version}"
+replace = version = "{new_version}"
+
+[bumpversion:file:README.md]
+
+[bumpversion:file:src/polus/tabular/transforms/rt_cetsa_metadata/__init__.py]
+
+[bumpversion:file:plugin.json]
diff --git a/transforms/rt-cetsa-metadata-tool/.gitignore b/transforms/rt-cetsa-metadata-tool/.gitignore
new file mode 100644
index 0000000..248d769
--- /dev/null
+++ b/transforms/rt-cetsa-metadata-tool/.gitignore
@@ -0,0 +1,2 @@
+out
+~*
diff --git a/transforms/rt-cetsa-metadata-tool/Dockerfile b/transforms/rt-cetsa-metadata-tool/Dockerfile
new file mode 100755
index 0000000..7106150
--- /dev/null
+++ b/transforms/rt-cetsa-metadata-tool/Dockerfile
@@ -0,0 +1,21 @@
+FROM polusai/bfio:2.1.9
+
+# environment variables defined in polusai/bfio
+ENV EXEC_DIR="/opt/executables"
+ENV POLUS_IMG_EXT=".ome.tif"
+ENV POLUS_TAB_EXT=".csv"
+ENV POLUS_LOG="INFO"
+
+# Work directory defined in the base container
+WORKDIR ${EXEC_DIR}
+
+COPY pyproject.toml ${EXEC_DIR}
+COPY VERSION ${EXEC_DIR}
+COPY README.md ${EXEC_DIR}
+COPY src ${EXEC_DIR}/src
+
+RUN pip3 install ${EXEC_DIR} --no-cache-dir
+RUN pip3 install .
+
+ENTRYPOINT ["python3", "-m", "polus.tabular.transforms.rt_cetsa_metadata"]
+CMD ["--help"]
diff --git a/transforms/rt-cetsa-metadata-tool/README.md b/transforms/rt-cetsa-metadata-tool/README.md
new file mode 100644
index 0000000..9fd1ac8
--- /dev/null
+++ b/transforms/rt-cetsa-metadata-tool/README.md
@@ -0,0 +1,26 @@
+# RT_CETSA metadata (v0.5.0-dev0)
+
+This WIPP plugin preprocesses data for the RT-CETSA pipeline.
+It generates temperature metadata and writes new image files whose
+names encode that metadata (`1.tif` becomes `1_37.0.tif`).
+The metadata can be generated from a range provided as parameters or
+extracted from a camera data file.
+
+## Building
+
+To build the Docker image for this plugin, run
+`./build-docker.sh`.
+
+## Install WIPP Plugin
+
+If WIPP is running, navigate to the plugins page and add a new plugin. Paste the contents of `plugin.json` into the pop-up window and submit.
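+
+## Temperature encoding example
+
+The sketch below is only an illustration (hypothetical helper name, assumed linear temperature ramp)
+of how a temperature range can be spread over the input images to build the new file names described
+above; the tool's actual implementation may differ.
+
+```python
+from pathlib import Path
+import numpy as np
+
+def rename_with_temperatures(files, t_min=37.0, t_max=90.0):
+    """Append a linearly interpolated temperature to each image name."""
+    temps = np.linspace(t_min, t_max, len(files))
+    return [f"{Path(f).stem}_{t:.1f}.tif" for f, t in zip(sorted(files), temps)]
+
+# rename_with_temperatures(["1.tif", "2.tif", "3.tif"])
+# -> ['1_37.0.tif', '2_63.5.tif', '3_90.0.tif']
+```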
+ +## Options + +This plugin takes the following input and output arguments: + +| Name | Description | I/O | Type | +|-----------------|----------------------------------------------------|--------|-------------| +| `--inpDir` | Input data collection to be processed by this tool | Input | genericData | +| `--outDir` | Output directory | Output | genericData | +| `--preview` | Generate JSON file with outputs | Output | JSON | diff --git a/transforms/rt-cetsa-metadata-tool/VERSION b/transforms/rt-cetsa-metadata-tool/VERSION new file mode 100644 index 0000000..412ff8c --- /dev/null +++ b/transforms/rt-cetsa-metadata-tool/VERSION @@ -0,0 +1 @@ +0.5.0-dev0 diff --git a/transforms/rt-cetsa-metadata-tool/build-docker.sh b/transforms/rt-cetsa-metadata-tool/build-docker.sh new file mode 100755 index 0000000..0e8d5b0 --- /dev/null +++ b/transforms/rt-cetsa-metadata-tool/build-docker.sh @@ -0,0 +1,4 @@ +#!/bin/bash + +version=$(", + "Antoine Gerardin ", + "Najib Ishaq ", +] +readme = "README.md" +packages = [{include = "polus", from = "src"}] + +[tool.poetry.dependencies] +python = ">=3.9,<3.12" +typer = "^0.7.0" +filepattern = "^2.0.5" +pandas = "^2.2.2" +openpyxl = "^3.1.3" + +[tool.poetry.group.dev.dependencies] +bump2version = "^1.0.1" +pre-commit = "^3.1.0" +pytest = "^7.2.1" + +[build-system] +requires = ["poetry-core"] +build-backend = "poetry.core.masonry.api" + +[tool.ruff] +extend = "../../ruff.toml" +extend-ignore = [ + "RET505", # Unnecessary `else` after `return` statement + "E501", # Line too long + "ANN001", # Missing type annotation + "D102", # Missing docstring in public method + "ANN201", # Missing return type annotation + "N806", # Variable in function should be lowercase + "D205", # 1 blank line required between summary line and description + "N803", # Argument name should be lowercase + "PLR0913", # Too many arguments + "D415", # First line should end with a period, question mark, or exclamation point + "PLR2004", # Magic value used in comparison + "B006", # Do not use mutable default arguments + "D107", # Missing docstring + "D101", # Missing docstring + "E731", # Do not assign a lambda expression, use a def + "E402", # Module level import not at top of file + "PTH123", # `open()` should be replaced with `Path.open()` + "PTH118", # `os.path.join()` should be replaced with `/` operator + "PTH100", # `os.path.abspath()` should be replaced with `Path.resolve()` + "PLR0915", # Too many statements + "PLR0912", # Too many branches + "C901", # Function is too complex + "T201", # `print` used + "E722", # Do not use bare 'except' + "B904", # Within an `except` clause, raise exceptions with `raise ... from err` or `raise ...
from None` to distinguish them from errors in exception handling + "ANN202", # Missing return type annotation for private function + "ARG002", # Unused method argument + "N802", # Function name should be lowercase + "PTH103", # `os.makedirs()` should be replaced with `Path.mkdir(parents=True)` + "ANN003", # Missing type annotation for `**kwargs` + "B007", # Loop control variable not used within the loop body + "ANN204", # Missing return type annotation for magic method + "D417", # Missing argument descriptions in the docstring + "ANN205", # Missing return type annotation for static method + "PLR5501", # Use `elif` instead of `else` following `if` condition to avoid unnecessary indentation + "EM102", # Exception must not use an f-string literal + "D414", # Section has no content + "RUF012", # Mutable class attributes should be annotated with `typing.ClassVar` + "A001", # Variable `input` is shadowing a Python builtin + "A002", # Argument `input` is shadowing a Python builtin + "E741", # Ambiguous variable name: `l` + "PTH120", # `os.path.dirname()` should be replaced by `Path.parent` + "N816", # Variable `cfFilename` in global scope should not be mixedCase + "PTH109", # `os.getcwd()` should be replaced by `Path.cwd()` +] diff --git a/transforms/rt-cetsa-metadata-tool/run-plugin.sh b/transforms/rt-cetsa-metadata-tool/run-plugin.sh new file mode 100755 index 0000000..da6c3bf --- /dev/null +++ b/transforms/rt-cetsa-metadata-tool/run-plugin.sh @@ -0,0 +1,20 @@ +#!/bin/bash +version=$( None: + """CLI for rt-cetsa-metadata-tool.""" + logger.info("Starting the CLI for rt-cetsa-metadata-tool.") + + logger.info(f"Input directory: {inp_dir}") + logger.info(f"Output directory: {out_dir}") + + # NOTE we may eventually deal with other types. + if POLUS_TAB_EXT != ".csv": + msg = "this tool can currently only process csv files." 
+ raise ValueError(msg) + + if metadata: + metadata_file = inp_dir / metadata + if not metadata_file.exists(): + raise FileNotFoundError(metadata_file) + logger.info(f"Using metadata file: {metadata_file}") + return preprocess_metadata(inp_dir, out_dir, metadata_file) + + if range: + logger.info(f"Interpolating temp values in : {range}") + return preprocess_from_range(inp_dir, out_dir, range) + return None + + +if __name__ == "__main__": + app() diff --git a/transforms/rt-cetsa-metadata-tool/tests/__init__.py b/transforms/rt-cetsa-metadata-tool/tests/__init__.py new file mode 100644 index 0000000..d420712 --- /dev/null +++ b/transforms/rt-cetsa-metadata-tool/tests/__init__.py @@ -0,0 +1 @@ +"""Tests.""" diff --git a/transforms/rt-cetsa-metadata-tool/tests/conftest.py b/transforms/rt-cetsa-metadata-tool/tests/conftest.py new file mode 100644 index 0000000..4983315 --- /dev/null +++ b/transforms/rt-cetsa-metadata-tool/tests/conftest.py @@ -0,0 +1,13 @@ +"""Set up.""" +import pytest + + +def pytest_addoption(parser: pytest.Parser) -> None: + """Add options to pytest.""" + parser.addoption( + "--slow", + action="store_true", + dest="slow", + default=False, + help="run slow tests", + ) diff --git a/transforms/rt-cetsa-metadata-tool/tests/test_metadata.py b/transforms/rt-cetsa-metadata-tool/tests/test_metadata.py new file mode 100644 index 0000000..aa69413 --- /dev/null +++ b/transforms/rt-cetsa-metadata-tool/tests/test_metadata.py @@ -0,0 +1,21 @@ +"""Tests.""" +from pathlib import Path +from polus.tabular.transforms.rt_cetsa_metadata import ( + preprocess_metadata, + preprocess_from_range, +) + +RAW_DIR = Path(__file__).parent / "data" / "images" +PREPROCESSED_DIR = Path(__file__).parent / "out" +PREPROCESSED_DIR.mkdir(exist_ok=True) +CAMERA_FILE = Path(__file__).parent / "data" / "CameraData.xlsx" + + +def test_preprocess_from_range(): + # preprocess_from_range(RAW_DIR, PREPROCESSED_DIR, range_t=(37, 90)) + pass + + +def test_preprocess_metadata(): + pass + # preprocess_metadata(RAW_DIR, PREPROCESSED_DIR, CAMERA_FILE)
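The two metadata tests above are stubs whose intended calls are left in comments. Below is a minimal sketch of how they might be fleshed out, assuming the sample data under `tests/data` is present; it reuses the `--slow` option registered in `conftest.py`, mirroring `tests/test_moltenprot.py`, and the closing assertions are illustrative checks rather than part of the original tool.

```python
"""Sketch of enabled metadata tests (assumes tests/data is populated)."""
from pathlib import Path

import pytest

from polus.tabular.transforms.rt_cetsa_metadata import (
    preprocess_from_range,
    preprocess_metadata,
)

RAW_DIR = Path(__file__).parent / "data" / "images"
PREPROCESSED_DIR = Path(__file__).parent / "out"
PREPROCESSED_DIR.mkdir(exist_ok=True)
CAMERA_FILE = Path(__file__).parent / "data" / "CameraData.xlsx"


@pytest.mark.skipif("not config.getoption('slow')")
def test_preprocess_from_range():
    # Interpolate temperatures from 37 to 90 across the image series;
    # the keyword argument follows the commented-out call in the original stub.
    preprocess_from_range(RAW_DIR, PREPROCESSED_DIR, range_t=(37, 90))
    assert any(PREPROCESSED_DIR.iterdir())  # illustrative check: files were written


@pytest.mark.skipif("not config.getoption('slow')")
def test_preprocess_metadata():
    # Derive temperatures from the camera data file instead of a range.
    preprocess_metadata(RAW_DIR, PREPROCESSED_DIR, CAMERA_FILE)
    assert any(PREPROCESSED_DIR.iterdir())  # illustrative check: files were written
```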