From db4f9e2b6335d30b73b7d1bc44f81571096bbd0a Mon Sep 17 00:00:00 2001 From: Givanna Putri Date: Tue, 21 Jan 2025 10:55:54 +1100 Subject: [PATCH 1/8] Update file description (#15) * make file description better * Update CHANGELOG.md * updated readme and file validation description --- CHANGELOG.md | 2 + README.md | 75 +++++++++++++++---------- src/api/file_integrated.yaml | 2 +- src/api/file_unintegrated.yaml | 8 ++- src/api/file_unintegrated_censored.yaml | 10 +++- src/api/file_validation.yaml | 15 +++-- 6 files changed, 72 insertions(+), 40 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 52f160e..3823e64 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -27,5 +27,7 @@ * Added integrated test resource (PR #5). +* Updated file description in yaml file (PR #15). + ## BUGFIXES diff --git a/README.md b/README.md index 019b536..7845a40 100644 --- a/README.md +++ b/README.md @@ -118,18 +118,23 @@ Arguments: | Name | Type | Description | |:---|:---|:---| | `--input` | `file` | A subset of the common dataset. | -| `--output_unintegrated_censored` | `file` | (*Output*) Unintegrated dataset. | -| `--output_unintegrated` | `file` | (*Output*) Unintegrated dataset. | +| `--output_unintegrated_censored` | `file` | (*Output*) An unintegrated dataset with certain columns (cells metadata), such as the donor information, hidden. These columns are intentionally hidden to prevent bias. The batch correction algorithm should not have to rely on these information to properly integrate different batches. This dataset is used as the input for the batch correction algorithm. The cells therein are identical to those in the unintegrated dataset. | +| `--output_unintegrated` | `file` | (*Output*) The complete unintegrated dataset, including all cells’ metadata (columns) from the unintegrated_censored dataset. The cells in this dataset are the same to those in the unintegrated_censored dataset. | | `--output_validation` | `file` | (*Output*) Hold-out dataset for validation. 
| ## File format: Unintegrated Censored -Unintegrated dataset +An unintegrated dataset with certain columns (cells metadata), such as +the donor information, hidden. These columns are intentionally hidden to +prevent bias. The batch correction algorithm should not have to rely on +these information to properly integrate different batches. This dataset +is used as the input for the batch correction algorithm. The cells +therein are identical to those in the unintegrated dataset. Example file: -`resources_test/task_cyto_batch_integration/cxg_mouse_pancreas_atlas/train.h5ad` +`resources_test/task_cyto_batch_integration/starter_file/unintegrated_censored.h5ad` Format: @@ -170,10 +175,12 @@ Data structure: ## File format: Unintegrated -Unintegrated dataset +The complete unintegrated dataset, including all cells’ metadata +(columns) from the unintegrated_censored dataset. The cells in this +dataset are the same to those in the unintegrated_censored dataset. Example file: -`resources_test/task_cyto_batch_integration/cxg_mouse_pancreas_atlas/train.h5ad` +`resources_test/task_cyto_batch_integration/starter_file/unintegrated.h5ad` Format: @@ -219,14 +226,22 @@ Data structure: Hold-out dataset for validation. Example file: -`resources_test/task_cyto_batch_integration/cxg_mouse_pancreas_atlas/solution.h5ad` +`resources_test/task_cyto_batch_integration/starter_file/validation.h5ad` Description: -Samples that were held out and will later be used only to assess whether -the batch integration was successful. E.g. if a donor from batch 2 was -corrected towards batch 1, but also actually measured in batch 1 -(without being used as input to the algorithm). +Dataset containing cells from samples that were held out for evaluating +batch integration output. The cells that are in this dataset belong to +samples which are not included in the unintegrated or +unintegrated_censored datasets. 
For example, if samples from donor A are +present in batch 1 and 2, the sample from batch 1 may be used as input +for the batch correction algorithm (and thus present in unintegrated and +unintegrated_censored datasets). The sample from batch 2, may not be +included as an input for the batch correction algorithm, but is needed +to validate whether whether the algorithm managed to correct the batch +effect in batch 2 towards batch 1. This sample will then be included in +this dataset (but not in unintegrated and unintegrated_censored +datasets). Format: @@ -269,16 +284,16 @@ Data structure: ## Component type: Method -A method. +A method for integrating batch effects in cytometry data. Arguments:
-| Name | Type | Description | -|:-----------|:-------|:-------------------------------| -| `--input` | `file` | Unintegrated dataset. | -| `--output` | `file` | (*Output*) Integrated dataset. | +| Name | Type | Description | +|:---|:---|:---| +| `--input` | `file` | An unintegrated dataset with certain columns (cells metadata), such as the donor information, hidden. These columns are intentionally hidden to prevent bias. The batch correction algorithm should not have to rely on these information to properly integrate different batches. This dataset is used as the input for the batch correction algorithm. The cells therein are identical to those in the unintegrated dataset. | +| `--output` | `file` | (*Output*) Integrated dataset which batch effect was corrected by an algorithm. |
@@ -290,11 +305,11 @@ Arguments:
-| Name | Type | Description | -|:-----------------------|:-------|:---------------------------------| -| `--input_unintegrated` | `file` | Unintegrated dataset. | -| `--input_validation` | `file` | Hold-out dataset for validation. | -| `--output` | `file` | (*Output*) Integrated dataset. | +| Name | Type | Description | +|:---|:---|:---| +| `--input_unintegrated` | `file` | The complete unintegrated dataset, including all cells’ metadata (columns) from the unintegrated_censored dataset. The cells in this dataset are the same to those in the unintegrated_censored dataset. | +| `--input_validation` | `file` | Hold-out dataset for validation. | +| `--output` | `file` | (*Output*) Integrated dataset which batch effect was corrected by an algorithm. |
@@ -309,18 +324,18 @@ Arguments: | Name | Type | Description | |:---|:---|:---| | `--input_validation` | `file` | Hold-out dataset for validation. | -| `--input_unintegrated` | `file` | Unintegrated dataset. | -| `--input_integrated` | `file` | Integrated dataset. | +| `--input_unintegrated` | `file` | The complete unintegrated dataset, including all cells’ metadata (columns) from the unintegrated_censored dataset. The cells in this dataset are the same to those in the unintegrated_censored dataset. | +| `--input_integrated` | `file` | Integrated dataset which batch effect was corrected by an algorithm. | | `--output` | `file` | (*Output*) File indicating the score of a metric. | ## File format: Integrated -Integrated dataset +Integrated dataset which batch effect was corrected by an algorithm Example file: -`resources_test/task_cyto_batch_integration/cxg_mouse_pancreas_atlas/prediction.h5ad` +`resources_test/task_cyto_batch_integration/starter_file/integrated.h5ad` Format: @@ -350,14 +365,14 @@ Data structure: File indicating the score of a metric. Example file: -`resources_test/task_cyto_batch_integration/cxg_mouse_pancreas_atlas/score.h5ad` +`resources_test/task_cyto_batch_integration/starter_file/score.h5ad` Format:
AnnData object - uns: 'dataset_id', 'normalization_id', 'method_id', 'metric_ids', 'metric_values' + uns: 'dataset_id', 'method_id', 'sample_ids', 'metric_ids', 'metric_values'
@@ -368,10 +383,10 @@ Data structure: | Slot | Type | Description | |:---|:---|:---| | `uns["dataset_id"]` | `string` | A unique identifier for the dataset. | -| `uns["normalization_id"]` | `string` | Which normalization was used. | -| `uns["method_id"]` | `string` | A unique identifier for the method. | +| `uns["method_id"]` | `string` | A unique identifier for the batch correction method. | +| `uns["sample_ids"]` | `string` | The samples assessed by the metric. | | `uns["metric_ids"]` | `string` | One or more unique metric identifiers. | -| `uns["metric_values"]` | `double` | The metric values obtained for the given prediction. Must be of same length as ‘metric_ids’. | +| `uns["metric_values"]` | `double` | The metric values obtained. Must be of same length as ‘metric_ids’. | diff --git a/src/api/file_integrated.yaml b/src/api/file_integrated.yaml index 9882901..0ef5380 100644 --- a/src/api/file_integrated.yaml +++ b/src/api/file_integrated.yaml @@ -1,7 +1,7 @@ type: file example: "resources_test/task_cyto_batch_integration/starter_file/integrated.h5ad" label: Integrated -summary: "Integrated dataset" +summary: "Integrated dataset which batch effect was corrected by an algorithm" info: format: type: h5ad diff --git a/src/api/file_unintegrated.yaml b/src/api/file_unintegrated.yaml index f8e9b00..33f27a0 100644 --- a/src/api/file_unintegrated.yaml +++ b/src/api/file_unintegrated.yaml @@ -1,9 +1,11 @@ #TODO: Change to the required and/or optional fields of the anndata type: file example: "resources_test/task_cyto_batch_integration/starter_file/unintegrated.h5ad" -label: "Unintegrated" -summary: "Unintegrated dataset" - +label: Unintegrated +summary: | + The complete unintegrated dataset, including all cells' metadata (columns) from the + unintegrated_censored dataset. + The cells in this dataset are the same to those in the unintegrated_censored dataset. 
info: format: type: h5ad diff --git a/src/api/file_unintegrated_censored.yaml b/src/api/file_unintegrated_censored.yaml index 20d9f7b..0a51fc5 100644 --- a/src/api/file_unintegrated_censored.yaml +++ b/src/api/file_unintegrated_censored.yaml @@ -1,8 +1,14 @@ #TODO: Change to the required and/or optional fields of the anndata type: file example: "resources_test/task_cyto_batch_integration/starter_file/unintegrated_censored.h5ad" -label: "Unintegrated Censored" -summary: "Unintegrated dataset" +label: Unintegrated Censored +summary: | + An unintegrated dataset with certain columns (cells metadata), such as the donor information, hidden. + These columns are intentionally hidden to prevent bias. + The batch correction algorithm should not have to rely on this information + to properly integrate different batches. + This dataset is used as the input for the batch correction algorithm. + The cells therein are identical to those in the unintegrated dataset. info: format: type: h5ad diff --git a/src/api/file_validation.yaml b/src/api/file_validation.yaml index ad29b2e..ac6af85 100644 --- a/src/api/file_validation.yaml +++ b/src/api/file_validation.yaml @@ -3,10 +3,17 @@ example: "resources_test/task_cyto_batch_integration/starter_file/validation.h5a label: Validation summary: Hold-out dataset for validation. description: | - Samples that were held out and will later be used only to assess whether - the batch integration was successful. E.g. if a donor from batch 2 was corrected towards batch 1, - but also actually measured in batch 1 (without being used as input to the algorithm). - + Dataset containing cells from samples that were held out for evaluating batch integration output. + The cells that are in this dataset belong to samples which are not included in the unintegrated + or unintegrated_censored datasets. 
+ For example, if samples from donor A are present in batch 1 and 2, the sample from batch 1 + may be used as input for the batch correction algorithm (and thus present in unintegrated + and unintegrated_censored datasets). + The sample from batch 2 may not be included as an input for the batch correction algorithm, + but is needed to validate whether the algorithm managed to correct the batch effect + in batch 2 towards batch 1. + This sample will then be included in this dataset (but not in unintegrated + and unintegrated_censored datasets). info: format: type: h5ad From 1460c34719881c996380eafae0fdfc58ca7f0904 Mon Sep 17 00:00:00 2001 From: Givanna Putri Date: Tue, 4 Feb 2025 21:45:05 +1100 Subject: [PATCH 2/8] Update anndata schema (#18) * update yaml for files * Update file_integrated.yaml * add is_control to common dataset and clean up * update input file format --- CHANGELOG.md | 5 +++ README.md | 42 ++++++++++++++----------- src/api/file_common_dataset.yaml | 22 ++++++++++--- src/api/file_unintegrated.yaml | 20 ++++++------ src/api/file_unintegrated_censored.yaml | 24 +++++++------- src/api/file_validation.yaml | 20 ++++++------ 6 files changed, 78 insertions(+), 55 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 3823e64..e433173 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -29,5 +29,10 @@ * Updated file description in yaml file (PR #15). +* Updated file schema (PR #18): + * Add is_control obs to indicate whether a cell should be used as control when correcting batch effect. + * Removed donor_id obs from unintegrated censored. + * Removed to_correct var from everything except common_dataset. All datasets now will only contain markers that need to be corrected. + ## BUGFIXES diff --git a/README.md b/README.md index 7845a40..9f81ddb 100644 --- a/README.md +++ b/README.md @@ -73,7 +73,7 @@ Format:
AnnData object - obs: 'cell_type', 'batch', 'sample', 'donor', 'group' + obs: 'cell_type', 'batch', 'sample', 'donor', 'group', 'is_control', 'is_validation' var: 'numeric_id', 'channel', 'marker', 'marker_type', 'to_correct' layers: 'preprocessed' uns: 'dataset_id', 'dataset_name', 'dataset_url', 'dataset_reference', 'dataset_summary', 'dataset_description', 'dataset_organism' @@ -91,6 +91,8 @@ Data structure: | `obs["sample"]` | `string` | Sample ID. | | `obs["donor"]` | `string` | Donor ID. | | `obs["group"]` | `string` | Biological group of the donor. | +| `obs["is_control"]` | `integer` | Whether the sample the cell came from can be used as a control for batch effect correction. 0: cannot be used as a control. \>= 1: can be used as a control. For cells with \>= 1: cells with the same value come from the same donor. Different values indicate different donors. | +| `obs["is_validation"]` | `boolean` | Whether the cell will be used as validation data or not. If FALSE, then the cell will only be included in unintegrated and unintegrated_censored. If TRUE, then the cell will only be included in validation. | | `var["numeric_id"]` | `integer` | Numeric ID associated with each marker. | | `var["channel"]` | `string` | The channel / detector of the instrument. | | `var["marker"]` | `string` | (*Optional*) The marker name associated with the channel. | @@ -118,8 +120,8 @@ Arguments: | Name | Type | Description | |:---|:---|:---| | `--input` | `file` | A subset of the common dataset. | -| `--output_unintegrated_censored` | `file` | (*Output*) An unintegrated dataset with certain columns (cells metadata), such as the donor information, hidden. These columns are intentionally hidden to prevent bias. The batch correction algorithm should not have to rely on these information to properly integrate different batches. This dataset is used as the input for the batch correction algorithm. The cells therein are identical to those in the unintegrated dataset. 
| -| `--output_unintegrated` | `file` | (*Output*) The complete unintegrated dataset, including all cells’ metadata (columns) from the unintegrated_censored dataset. The cells in this dataset are the same to those in the unintegrated_censored dataset. | +| `--output_unintegrated_censored` | `file` | (*Output*) An unintegrated dataset with certain columns (cells metadata), such as the donor information, hidden. These columns are intentionally hidden to prevent bias. The batch correction algorithm should not have to rely on these information to properly integrate different batches. This dataset is used as the input for the batch correction algorithm. The cells therein are identical to those in the unintegrated dataset. Only markers that need to be batch corrected are present. | +| `--output_unintegrated` | `file` | (*Output*) The complete unintegrated dataset, including all cells’ metadata (columns) from the unintegrated_censored dataset. The cells in this dataset are the same to those in the unintegrated_censored dataset. Only markers that need to be batch corrected are present. | | `--output_validation` | `file` | (*Output*) Hold-out dataset for validation. |
@@ -131,7 +133,8 @@ the donor information, hidden. These columns are intentionally hidden to prevent bias. The batch correction algorithm should not have to rely on these information to properly integrate different batches. This dataset is used as the input for the batch correction algorithm. The cells -therein are identical to those in the unintegrated dataset. +therein are identical to those in the unintegrated dataset. Only markers +that need to be batch corrected are present. Example file: `resources_test/task_cyto_batch_integration/starter_file/unintegrated_censored.h5ad` @@ -141,8 +144,8 @@ Format:
AnnData object - obs: 'batch', 'sample', 'donor' - var: 'numeric_id', 'channel', 'marker', 'marker_type', 'to_correct' + obs: 'batch', 'sample', 'is_control' + var: 'numeric_id', 'channel', 'marker', 'marker_type' layers: 'preprocessed' uns: 'dataset_id', 'dataset_name', 'dataset_url', 'dataset_reference', 'dataset_summary', 'dataset_description', 'dataset_organism' @@ -156,12 +159,11 @@ Data structure: |:---|:---|:---| | `obs["batch"]` | `string` | Batch information. | | `obs["sample"]` | `string` | Sample ID. | -| `obs["donor"]` | `string` | (*Optional*) Donor ID. | +| `obs["is_control"]` | `integer` | Whether the sample the cell came from can be used as a control for batch effect correction. 0: cannot be used as a control. \>= 1: can be used as a control. For cells with \>= 1: cells with the same value come from the same donor. Different values indicate different donors. | | `var["numeric_id"]` | `integer` | Numeric ID associated with each marker. | | `var["channel"]` | `string` | The channel / detector of the instrument. | | `var["marker"]` | `string` | (*Optional*) The marker name associated with the channel. | | `var["marker_type"]` | `string` | Whether the marker is a functional or lineage marker. | -| `var["to_correct"]` | `boolean` | Whether the marker will be batch corrected. | | `layers["preprocessed"]` | `double` | preprocessed data, e.g. already compensated, transformed and debris/doublets removed. | | `uns["dataset_id"]` | `string` | A unique identifier for the dataset. | | `uns["dataset_name"]` | `string` | Nicely formatted name. | @@ -177,7 +179,8 @@ Data structure: The complete unintegrated dataset, including all cells’ metadata (columns) from the unintegrated_censored dataset. The cells in this -dataset are the same to those in the unintegrated_censored dataset. +dataset are the same to those in the unintegrated_censored dataset. Only +markers that need to be batch corrected are present. 
Example file: `resources_test/task_cyto_batch_integration/starter_file/unintegrated.h5ad` @@ -187,8 +190,8 @@ Format:
AnnData object - obs: 'cell_type', 'batch', 'sample', 'donor', 'group' - var: 'numeric_id', 'channel', 'marker', 'marker_type', 'to_correct' + obs: 'cell_type', 'batch', 'sample', 'donor', 'group', 'is_control' + var: 'numeric_id', 'channel', 'marker', 'marker_type' layers: 'preprocessed' uns: 'dataset_id', 'dataset_name', 'dataset_url', 'dataset_reference', 'dataset_summary', 'dataset_description', 'dataset_organism' @@ -205,11 +208,11 @@ Data structure: | `obs["sample"]` | `string` | Sample ID. | | `obs["donor"]` | `string` | Donor ID. | | `obs["group"]` | `string` | Biological group of the donor. | +| `obs["is_control"]` | `integer` | Whether the sample the cell came from can be used as a control for batch effect correction. 0: cannot be used as a control. \>= 1: can be used as a control. For cells with \>= 1: cells with the same value come from the same donor. Different values indicate different donors. | | `var["numeric_id"]` | `integer` | Numeric ID associated with each marker. | | `var["channel"]` | `string` | The channel / detector of the instrument. | | `var["marker"]` | `string` | (*Optional*) The marker name associated with the channel. | | `var["marker_type"]` | `string` | Whether the marker is a functional or lineage marker. | -| `var["to_correct"]` | `boolean` | Whether the marker will be batch corrected. | | `layers["preprocessed"]` | `double` | preprocessed data, e.g. already compensated, transformed and debris/doublets removed. | | `uns["dataset_id"]` | `string` | A unique identifier for the dataset. | | `uns["dataset_name"]` | `string` | Nicely formatted name. | @@ -241,15 +244,16 @@ included as an input for the batch correction algorithm, but is needed to validate whether whether the algorithm managed to correct the batch effect in batch 2 towards batch 1. This sample will then be included in this dataset (but not in unintegrated and unintegrated_censored -datasets). +datasets). +Only markers that need to be batch corrected are present. Format:
AnnData object - obs: 'cell_type', 'batch', 'sample', 'donor', 'group' - var: 'numeric_id', 'channel', 'marker', 'marker_type', 'to_correct' + obs: 'cell_type', 'batch', 'sample', 'donor', 'group', 'is_control' + var: 'numeric_id', 'channel', 'marker', 'marker_type' layers: 'preprocessed' uns: 'dataset_id', 'dataset_name', 'dataset_url', 'dataset_reference', 'dataset_summary', 'dataset_description', 'dataset_organism' @@ -266,11 +270,11 @@ Data structure: | `obs["sample"]` | `string` | Sample ID. | | `obs["donor"]` | `string` | Donor ID. | | `obs["group"]` | `string` | Biological group of the donor. | +| `obs["is_control"]` | `integer` | Whether the sample the cell came from can be used as a control for batch effect correction. 0: cannot be used as a control. \>= 1: can be used as a control. For cells with \>= 1: cells with the same value come from the same donor. Different values indicate different donors. | | `var["numeric_id"]` | `integer` | Numeric ID associated with each marker. | | `var["channel"]` | `string` | The channel / detector of the instrument. | | `var["marker"]` | `string` | (*Optional*) The marker name associated with the channel. | | `var["marker_type"]` | `string` | Whether the marker is a functional or lineage marker. | -| `var["to_correct"]` | `boolean` | Whether the marker will be batch corrected. | | `layers["preprocessed"]` | `double` | preprocessed data, e.g. already compensated, transformed and debris/doublets removed. | | `uns["dataset_id"]` | `string` | A unique identifier for the dataset. | | `uns["dataset_name"]` | `string` | Nicely formatted name. | @@ -292,7 +296,7 @@ Arguments: | Name | Type | Description | |:---|:---|:---| -| `--input` | `file` | An unintegrated dataset with certain columns (cells metadata), such as the donor information, hidden. These columns are intentionally hidden to prevent bias. The batch correction algorithm should not have to rely on these information to properly integrate different batches. 
This dataset is used as the input for the batch correction algorithm. The cells therein are identical to those in the unintegrated dataset. | +| `--input` | `file` | An unintegrated dataset with certain columns (cells metadata), such as the donor information, hidden. These columns are intentionally hidden to prevent bias. The batch correction algorithm should not have to rely on these information to properly integrate different batches. This dataset is used as the input for the batch correction algorithm. The cells therein are identical to those in the unintegrated dataset. Only markers that need to be batch corrected are present. | | `--output` | `file` | (*Output*) Integrated dataset which batch effect was corrected by an algorithm. |
@@ -307,7 +311,7 @@ Arguments: | Name | Type | Description | |:---|:---|:---| -| `--input_unintegrated` | `file` | The complete unintegrated dataset, including all cells’ metadata (columns) from the unintegrated_censored dataset. The cells in this dataset are the same to those in the unintegrated_censored dataset. | +| `--input_unintegrated` | `file` | The complete unintegrated dataset, including all cells’ metadata (columns) from the unintegrated_censored dataset. The cells in this dataset are the same to those in the unintegrated_censored dataset. Only markers that need to be batch corrected are present. | | `--input_validation` | `file` | Hold-out dataset for validation. | | `--output` | `file` | (*Output*) Integrated dataset which batch effect was corrected by an algorithm. | @@ -324,7 +328,7 @@ Arguments: | Name | Type | Description | |:---|:---|:---| | `--input_validation` | `file` | Hold-out dataset for validation. | -| `--input_unintegrated` | `file` | The complete unintegrated dataset, including all cells’ metadata (columns) from the unintegrated_censored dataset. The cells in this dataset are the same to those in the unintegrated_censored dataset. | +| `--input_unintegrated` | `file` | The complete unintegrated dataset, including all cells’ metadata (columns) from the unintegrated_censored dataset. The cells in this dataset are the same to those in the unintegrated_censored dataset. Only markers that need to be batch corrected are present. | | `--input_integrated` | `file` | Integrated dataset which batch effect was corrected by an algorithm. | | `--output` | `file` | (*Output*) File indicating the score of a metric. 
| diff --git a/src/api/file_common_dataset.yaml b/src/api/file_common_dataset.yaml index d9746a2..87795cd 100644 --- a/src/api/file_common_dataset.yaml +++ b/src/api/file_common_dataset.yaml @@ -31,6 +31,23 @@ info: name: group description: Biological group of the donor required: true + - type: integer + name: is_control + description: | + Whether the sample the cell came from can be used as a control for batch + effect correction. + 0: cannot be used as a control. + >= 1: can be used as a control. + For cells with >= 1: cells with the same value come from the same donor. + Different values indicate different donors. + required: true + - type: boolean + name: is_validation + description: | + Whether the cell will be used as validation data or not. + If FALSE, then the cell will only be included in unintegrated and unintegrated_censored. + If TRUE, then the cell will only be included in validation. + required: true var: - type: integer name: numeric_id @@ -52,11 +69,6 @@ info: name: to_correct description: Whether the marker will be batch corrected required: true - # obsm: - # - type: double - # name: X_pca - # description: The resulting PCA embedding. - # required: true uns: - type: string name: dataset_id diff --git a/src/api/file_unintegrated.yaml b/src/api/file_unintegrated.yaml index 33f27a0..cae1be7 100644 --- a/src/api/file_unintegrated.yaml +++ b/src/api/file_unintegrated.yaml @@ -6,6 +6,7 @@ summary: | The complete unintegrated dataset, including all cells' metadata (columns) from the unintegrated_censored dataset. The cells in this dataset are the same to those in the unintegrated_censored dataset. + Only markers that need to be batch corrected are present. info: format: type: h5ad @@ -35,6 +36,16 @@ info: name: group description: Biological group of the donor required: true + - type: integer + name: is_control + description: | + Whether the sample the cell came from can be used as a control for batch + effect correction. + 0: cannot be used as a control. 
+ >= 1: can be used as a control. + For cells with >= 1: cells with the same value come from the same donor. + Different values indicate different donors. + required: true var: - type: integer name: numeric_id @@ -52,15 +63,6 @@ info: name: marker_type description: Whether the marker is a functional or lineage marker required: true - - type: boolean - name: to_correct - description: Whether the marker will be batch corrected - required: true - # obsm: - # - type: double - # name: X_pca - # description: The resulting PCA embedding. - # required: true uns: - type: string name: dataset_id diff --git a/src/api/file_unintegrated_censored.yaml b/src/api/file_unintegrated_censored.yaml index 0a51fc5..f841421 100644 --- a/src/api/file_unintegrated_censored.yaml +++ b/src/api/file_unintegrated_censored.yaml @@ -9,6 +9,7 @@ summary: | to properly integrate different batches. This dataset is used as the input for the batch correction algorithm. The cells therein are identical to those in the unintegrated dataset. + Only markers that need to be batch corrected are present. info: format: type: h5ad @@ -26,10 +27,16 @@ info: name: sample description: Sample ID required: true - - type: string - name: donor - description: Donor ID - required: false + - type: integer + name: is_control + description: | + Whether the sample the cell came from can be used as a control for batch + effect correction. + 0: cannot be used as a control. + >= 1: can be used as a control. + For cells with >= 1: cells with the same value come from the same donor. + Different values indicate different donors. + required: true var: - type: integer name: numeric_id @@ -47,15 +54,6 @@ info: name: marker_type description: Whether the marker is a functional or lineage marker required: true - - type: boolean - name: to_correct - description: Whether the marker will be batch corrected - required: true - # obsm: - # - type: double - # name: X_pca - # description: The resulting PCA embedding. 
- # required: true uns: - type: string name: dataset_id diff --git a/src/api/file_validation.yaml b/src/api/file_validation.yaml index ac6af85..6dea033 100644 --- a/src/api/file_validation.yaml +++ b/src/api/file_validation.yaml @@ -14,6 +14,7 @@ description: | in batch 2 towards batch 1. This sample will then be included in this dataset (but not in unintegrated and unintegrated_censored datasets). + Only markers that need to be batch corrected are present. info: format: type: h5ad @@ -43,6 +44,16 @@ info: name: group description: Biological group of the donor required: true + - type: integer + name: is_control + description: | + Whether the sample the cell came from can be used as a control for batch + effect correction. + 0: cannot be used as a control. + >= 1: can be used as a control. + For cells with >= 1: cells with the same value come from the same donor. + Different values indicate different donors. + required: true var: - type: integer name: numeric_id @@ -60,15 +71,6 @@ info: name: marker_type description: Whether the marker is a functional or lineage marker required: true - - type: boolean - name: to_correct - description: Whether the marker will be batch corrected - required: true - # obsm: - # - type: double - # name: X_pca - # description: The resulting PCA embedding. - # required: true uns: - type: string name: dataset_id From a011db4c8d941b0d2a485486988963413fdef6a1 Mon Sep 17 00:00:00 2001 From: Givanna Putri Date: Wed, 5 Feb 2025 12:32:45 +1100 Subject: [PATCH 3/8] Revert "Update anndata schema (#18)" This reverts commit 1460c34719881c996380eafae0fdfc58ca7f0904. 
--- CHANGELOG.md | 5 --- README.md | 42 +++++++++++-------------- src/api/file_common_dataset.yaml | 22 +++---------- src/api/file_unintegrated.yaml | 20 ++++++------ src/api/file_unintegrated_censored.yaml | 24 +++++++------- src/api/file_validation.yaml | 20 ++++++------ 6 files changed, 55 insertions(+), 78 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index e433173..3823e64 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -29,10 +29,5 @@ * Updated file description in yaml file (PR #15). -* Updated file schema (PR #18): - * Add is_control obs to indicate whether a cell should be used as control when correcting batch effect. - * Removed donor_id obs from unintegrated censored. - * Removed to_correct var from everything except common_dataset. All datasets now will only contain markers that need to be corrected. - ## BUGFIXES diff --git a/README.md b/README.md index 9f81ddb..7845a40 100644 --- a/README.md +++ b/README.md @@ -73,7 +73,7 @@ Format:
AnnData object - obs: 'cell_type', 'batch', 'sample', 'donor', 'group', 'is_control', 'is_validation' + obs: 'cell_type', 'batch', 'sample', 'donor', 'group' var: 'numeric_id', 'channel', 'marker', 'marker_type', 'to_correct' layers: 'preprocessed' uns: 'dataset_id', 'dataset_name', 'dataset_url', 'dataset_reference', 'dataset_summary', 'dataset_description', 'dataset_organism' @@ -91,8 +91,6 @@ Data structure: | `obs["sample"]` | `string` | Sample ID. | | `obs["donor"]` | `string` | Donor ID. | | `obs["group"]` | `string` | Biological group of the donor. | -| `obs["is_control"]` | `integer` | Whether the sample the cell came from can be used as a control for batch effect correction. 0: cannot be used as a control. \>= 1: can be used as a control. For cells with \>= 1: cells with the same value come from the same donor. Different values indicate different donors. | -| `obs["is_validation"]` | `boolean` | Whether the cell will be used as validation data or not. If FALSE, then the cell will only be included in unintegrated and unintegrated_censored. If TRUE, then the cell will only be included in validation. | | `var["numeric_id"]` | `integer` | Numeric ID associated with each marker. | | `var["channel"]` | `string` | The channel / detector of the instrument. | | `var["marker"]` | `string` | (*Optional*) The marker name associated with the channel. | @@ -120,8 +118,8 @@ Arguments: | Name | Type | Description | |:---|:---|:---| | `--input` | `file` | A subset of the common dataset. | -| `--output_unintegrated_censored` | `file` | (*Output*) An unintegrated dataset with certain columns (cells metadata), such as the donor information, hidden. These columns are intentionally hidden to prevent bias. The batch correction algorithm should not have to rely on these information to properly integrate different batches. This dataset is used as the input for the batch correction algorithm. The cells therein are identical to those in the unintegrated dataset. 
Only markers that need to be batch corrected are present. | -| `--output_unintegrated` | `file` | (*Output*) The complete unintegrated dataset, including all cells’ metadata (columns) from the unintegrated_censored dataset. The cells in this dataset are the same to those in the unintegrated_censored dataset. Only markers that need to be batch corrected are present. | +| `--output_unintegrated_censored` | `file` | (*Output*) An unintegrated dataset with certain columns (cells metadata), such as the donor information, hidden. These columns are intentionally hidden to prevent bias. The batch correction algorithm should not have to rely on these information to properly integrate different batches. This dataset is used as the input for the batch correction algorithm. The cells therein are identical to those in the unintegrated dataset. | +| `--output_unintegrated` | `file` | (*Output*) The complete unintegrated dataset, including all cells’ metadata (columns) from the unintegrated_censored dataset. The cells in this dataset are the same to those in the unintegrated_censored dataset. | | `--output_validation` | `file` | (*Output*) Hold-out dataset for validation. |
@@ -133,8 +131,7 @@ the donor information, hidden. These columns are intentionally hidden to prevent bias. The batch correction algorithm should not have to rely on these information to properly integrate different batches. This dataset is used as the input for the batch correction algorithm. The cells -therein are identical to those in the unintegrated dataset. Only markers -that need to be batch corrected are present. +therein are identical to those in the unintegrated dataset. Example file: `resources_test/task_cyto_batch_integration/starter_file/unintegrated_censored.h5ad` @@ -144,8 +141,8 @@ Format:
AnnData object - obs: 'batch', 'sample', 'is_control' - var: 'numeric_id', 'channel', 'marker', 'marker_type' + obs: 'batch', 'sample', 'donor' + var: 'numeric_id', 'channel', 'marker', 'marker_type', 'to_correct' layers: 'preprocessed' uns: 'dataset_id', 'dataset_name', 'dataset_url', 'dataset_reference', 'dataset_summary', 'dataset_description', 'dataset_organism' @@ -159,11 +156,12 @@ Data structure: |:---|:---|:---| | `obs["batch"]` | `string` | Batch information. | | `obs["sample"]` | `string` | Sample ID. | -| `obs["is_control"]` | `integer` | Whether the sample the cell came from can be used as a control for batch effect correction. 0: cannot be used as a control. \>= 1: can be used as a control. For cells with \>= 1: cells with the same value come from the same donor. Different values indicate different donors. | +| `obs["donor"]` | `string` | (*Optional*) Donor ID. | | `var["numeric_id"]` | `integer` | Numeric ID associated with each marker. | | `var["channel"]` | `string` | The channel / detector of the instrument. | | `var["marker"]` | `string` | (*Optional*) The marker name associated with the channel. | | `var["marker_type"]` | `string` | Whether the marker is a functional or lineage marker. | +| `var["to_correct"]` | `boolean` | Whether the marker will be batch corrected. | | `layers["preprocessed"]` | `double` | preprocessed data, e.g. already compensated, transformed and debris/doublets removed. | | `uns["dataset_id"]` | `string` | A unique identifier for the dataset. | | `uns["dataset_name"]` | `string` | Nicely formatted name. | @@ -179,8 +177,7 @@ Data structure: The complete unintegrated dataset, including all cells’ metadata (columns) from the unintegrated_censored dataset. The cells in this -dataset are the same to those in the unintegrated_censored dataset. Only -markers that need to be batch corrected are present. +dataset are the same to those in the unintegrated_censored dataset. 
Example file: `resources_test/task_cyto_batch_integration/starter_file/unintegrated.h5ad` @@ -190,8 +187,8 @@ Format:
AnnData object - obs: 'cell_type', 'batch', 'sample', 'donor', 'group', 'is_control' - var: 'numeric_id', 'channel', 'marker', 'marker_type' + obs: 'cell_type', 'batch', 'sample', 'donor', 'group' + var: 'numeric_id', 'channel', 'marker', 'marker_type', 'to_correct' layers: 'preprocessed' uns: 'dataset_id', 'dataset_name', 'dataset_url', 'dataset_reference', 'dataset_summary', 'dataset_description', 'dataset_organism' @@ -208,11 +205,11 @@ Data structure: | `obs["sample"]` | `string` | Sample ID. | | `obs["donor"]` | `string` | Donor ID. | | `obs["group"]` | `string` | Biological group of the donor. | -| `obs["is_control"]` | `integer` | Whether the sample the cell came from can be used as a control for batch effect correction. 0: cannot be used as a control. \>= 1: can be used as a control. For cells with \>= 1: cells with the same value come from the same donor. Different values indicate different donors. | | `var["numeric_id"]` | `integer` | Numeric ID associated with each marker. | | `var["channel"]` | `string` | The channel / detector of the instrument. | | `var["marker"]` | `string` | (*Optional*) The marker name associated with the channel. | | `var["marker_type"]` | `string` | Whether the marker is a functional or lineage marker. | +| `var["to_correct"]` | `boolean` | Whether the marker will be batch corrected. | | `layers["preprocessed"]` | `double` | preprocessed data, e.g. already compensated, transformed and debris/doublets removed. | | `uns["dataset_id"]` | `string` | A unique identifier for the dataset. | | `uns["dataset_name"]` | `string` | Nicely formatted name. | @@ -244,16 +241,15 @@ included as an input for the batch correction algorithm, but is needed to validate whether whether the algorithm managed to correct the batch effect in batch 2 towards batch 1. This sample will then be included in this dataset (but not in unintegrated and unintegrated_censored -datasets). -Only markers that need to be batch corrected are present. +datasets). Format:
AnnData object - obs: 'cell_type', 'batch', 'sample', 'donor', 'group', 'is_control' - var: 'numeric_id', 'channel', 'marker', 'marker_type' + obs: 'cell_type', 'batch', 'sample', 'donor', 'group' + var: 'numeric_id', 'channel', 'marker', 'marker_type', 'to_correct' layers: 'preprocessed' uns: 'dataset_id', 'dataset_name', 'dataset_url', 'dataset_reference', 'dataset_summary', 'dataset_description', 'dataset_organism' @@ -270,11 +266,11 @@ Data structure: | `obs["sample"]` | `string` | Sample ID. | | `obs["donor"]` | `string` | Donor ID. | | `obs["group"]` | `string` | Biological group of the donor. | -| `obs["is_control"]` | `integer` | Whether the sample the cell came from can be used as a control for batch effect correction. 0: cannot be used as a control. \>= 1: can be used as a control. For cells with \>= 1: cells with the same value come from the same donor. Different values indicate different donors. | | `var["numeric_id"]` | `integer` | Numeric ID associated with each marker. | | `var["channel"]` | `string` | The channel / detector of the instrument. | | `var["marker"]` | `string` | (*Optional*) The marker name associated with the channel. | | `var["marker_type"]` | `string` | Whether the marker is a functional or lineage marker. | +| `var["to_correct"]` | `boolean` | Whether the marker will be batch corrected. | | `layers["preprocessed"]` | `double` | preprocessed data, e.g. already compensated, transformed and debris/doublets removed. | | `uns["dataset_id"]` | `string` | A unique identifier for the dataset. | | `uns["dataset_name"]` | `string` | Nicely formatted name. | @@ -296,7 +292,7 @@ Arguments: | Name | Type | Description | |:---|:---|:---| -| `--input` | `file` | An unintegrated dataset with certain columns (cells metadata), such as the donor information, hidden. These columns are intentionally hidden to prevent bias. The batch correction algorithm should not have to rely on these information to properly integrate different batches. 
This dataset is used as the input for the batch correction algorithm. The cells therein are identical to those in the unintegrated dataset. Only markers that need to be batch corrected are present. | +| `--input` | `file` | An unintegrated dataset with certain columns (cells metadata), such as the donor information, hidden. These columns are intentionally hidden to prevent bias. The batch correction algorithm should not have to rely on these information to properly integrate different batches. This dataset is used as the input for the batch correction algorithm. The cells therein are identical to those in the unintegrated dataset. | | `--output` | `file` | (*Output*) Integrated dataset which batch effect was corrected by an algorithm. |
@@ -311,7 +307,7 @@ Arguments: | Name | Type | Description | |:---|:---|:---| -| `--input_unintegrated` | `file` | The complete unintegrated dataset, including all cells’ metadata (columns) from the unintegrated_censored dataset. The cells in this dataset are the same to those in the unintegrated_censored dataset. Only markers that need to be batch corrected are present. | +| `--input_unintegrated` | `file` | The complete unintegrated dataset, including all cells’ metadata (columns) from the unintegrated_censored dataset. The cells in this dataset are the same to those in the unintegrated_censored dataset. | | `--input_validation` | `file` | Hold-out dataset for validation. | | `--output` | `file` | (*Output*) Integrated dataset which batch effect was corrected by an algorithm. | @@ -328,7 +324,7 @@ Arguments: | Name | Type | Description | |:---|:---|:---| | `--input_validation` | `file` | Hold-out dataset for validation. | -| `--input_unintegrated` | `file` | The complete unintegrated dataset, including all cells’ metadata (columns) from the unintegrated_censored dataset. The cells in this dataset are the same to those in the unintegrated_censored dataset. Only markers that need to be batch corrected are present. | +| `--input_unintegrated` | `file` | The complete unintegrated dataset, including all cells’ metadata (columns) from the unintegrated_censored dataset. The cells in this dataset are the same to those in the unintegrated_censored dataset. | | `--input_integrated` | `file` | Integrated dataset which batch effect was corrected by an algorithm. | | `--output` | `file` | (*Output*) File indicating the score of a metric. 
| diff --git a/src/api/file_common_dataset.yaml b/src/api/file_common_dataset.yaml index 87795cd..d9746a2 100644 --- a/src/api/file_common_dataset.yaml +++ b/src/api/file_common_dataset.yaml @@ -31,23 +31,6 @@ info: name: group description: Biological group of the donor required: true - - type: integer - name: is_control - description: | - Whether the sample the cell came from can be used as a control for batch - effect correction. - 0: cannot be used as a control. - >= 1: can be used as a control. - For cells with >= 1: cells with the same value come from the same donor. - Different values indicate different donors. - required: true - - type: boolean - name: is_validation - description: | - Whether the cell will be used as validation data or not. - If FALSE, then the cell will only be included in unintegrated and unintegrated_censored. - If TRUE, then the cell will only be included in validation. - required: true var: - type: integer name: numeric_id @@ -69,6 +52,11 @@ info: name: to_correct description: Whether the marker will be batch corrected required: true + # obsm: + # - type: double + # name: X_pca + # description: The resulting PCA embedding. + # required: true uns: - type: string name: dataset_id diff --git a/src/api/file_unintegrated.yaml b/src/api/file_unintegrated.yaml index cae1be7..33f27a0 100644 --- a/src/api/file_unintegrated.yaml +++ b/src/api/file_unintegrated.yaml @@ -6,7 +6,6 @@ summary: | The complete unintegrated dataset, including all cells' metadata (columns) from the unintegrated_censored dataset. The cells in this dataset are the same to those in the unintegrated_censored dataset. - Only markers that need to be batch corrected are present. info: format: type: h5ad @@ -36,16 +35,6 @@ info: name: group description: Biological group of the donor required: true - - type: integer - name: is_control - description: | - Whether the sample the cell came from can be used as a control for batch - effect correction. - 0: cannot be used as a control. 
- >= 1: can be used as a control. - For cells with >= 1: cells with the same value come from the same donor. - Different values indicate different donors. - required: true var: - type: integer name: numeric_id @@ -63,6 +52,15 @@ info: name: marker_type description: Whether the marker is a functional or lineage marker required: true + - type: boolean + name: to_correct + description: Whether the marker will be batch corrected + required: true + # obsm: + # - type: double + # name: X_pca + # description: The resulting PCA embedding. + # required: true uns: - type: string name: dataset_id diff --git a/src/api/file_unintegrated_censored.yaml b/src/api/file_unintegrated_censored.yaml index f841421..0a51fc5 100644 --- a/src/api/file_unintegrated_censored.yaml +++ b/src/api/file_unintegrated_censored.yaml @@ -9,7 +9,6 @@ summary: | to properly integrate different batches. This dataset is used as the input for the batch correction algorithm. The cells therein are identical to those in the unintegrated dataset. - Only markers that need to be batch corrected are present. info: format: type: h5ad @@ -27,16 +26,10 @@ info: name: sample description: Sample ID required: true - - type: integer - name: is_control - description: | - Whether the sample the cell came from can be used as a control for batch - effect correction. - 0: cannot be used as a control. - >= 1: can be used as a control. - For cells with >= 1: cells with the same value come from the same donor. - Different values indicate different donors. - required: true + - type: string + name: donor + description: Donor ID + required: false var: - type: integer name: numeric_id @@ -54,6 +47,15 @@ info: name: marker_type description: Whether the marker is a functional or lineage marker required: true + - type: boolean + name: to_correct + description: Whether the marker will be batch corrected + required: true + # obsm: + # - type: double + # name: X_pca + # description: The resulting PCA embedding. 
+ # required: true uns: - type: string name: dataset_id diff --git a/src/api/file_validation.yaml b/src/api/file_validation.yaml index 6dea033..ac6af85 100644 --- a/src/api/file_validation.yaml +++ b/src/api/file_validation.yaml @@ -14,7 +14,6 @@ description: | in batch 2 towards batch 1. This sample will then be included in this dataset (but not in unintegrated and unintegrated_censored datasets). - Only markers that need to be batch corrected are present. info: format: type: h5ad @@ -44,16 +43,6 @@ info: name: group description: Biological group of the donor required: true - - type: integer - name: is_control - description: | - Whether the sample the cell came from can be used as a control for batch - effect correction. - 0: cannot be used as a control. - >= 1: can be used as a control. - For cells with >= 1: cells with the same value come from the same donor. - Different values indicate different donors. - required: true var: - type: integer name: numeric_id @@ -71,6 +60,15 @@ info: name: marker_type description: Whether the marker is a functional or lineage marker required: true + - type: boolean + name: to_correct + description: Whether the marker will be batch corrected + required: true + # obsm: + # - type: double + # name: X_pca + # description: The resulting PCA embedding. 
+ # required: true uns: - type: string name: dataset_id From 2943780e4e2ee24206552d3ec176ba291ced5545 Mon Sep 17 00:00:00 2001 From: Givanna Putri Date: Thu, 6 Feb 2025 10:44:12 +1100 Subject: [PATCH 4/8] Update anndata schema (#19) * update yaml for files * Update file_integrated.yaml * add is_control to common dataset and clean up * update input file format * revert changes to to_correct var * Update README.md * update script * trigger ci on all components [ci force] * Add data processor (#21) * Update anndata schema (#18) * update yaml for files * Update file_integrated.yaml * add is_control to common dataset and clean up * update input file format * Revert "Update anndata schema (#18)" This reverts commit 1460c34719881c996380eafae0fdfc58ca7f0904. * modified src/data_processors/process_dataset/script.py. The validation dataset now contains cells with obs.is_validation == True, while unintegrated and unintegrated censored contain cells with obs.is_validation == False * CHANGELOG.md updated * Update CHANGELOG.md --------- Co-authored-by: Givanna Putri * trigger ci on all components [ci force] * Removed emd metric It'll be updated by the changes in add-emd-per-matching-sample metric anyway. Plus it is breaking.
--------- Co-authored-by: Robrecht Cannoodt Co-authored-by: Luca Leomazzi <76624347+LuLeom@users.noreply.github.com> --- CHANGELOG.md | 14 +++ README.md | 14 ++- scripts/create_resources/test_resources.sh | 2 +- src/api/file_common_dataset.yaml | 22 ++++- src/api/file_unintegrated.yaml | 15 ++- src/api/file_unintegrated_censored.yaml | 19 ++-- src/api/file_validation.yaml | 15 ++- src/data_processors/process_dataset/script.py | 16 ++-- src/metrics/emd_per_samples/config.vsh.yaml | 91 ------------------- src/metrics/emd_per_samples/script.py | 58 ------------ 10 files changed, 80 insertions(+), 186 deletions(-) delete mode 100644 src/metrics/emd_per_samples/config.vsh.yaml delete mode 100644 src/metrics/emd_per_samples/script.py diff --git a/CHANGELOG.md b/CHANGELOG.md index 3823e64..e5c027b 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -21,6 +21,20 @@ ## MAJOR CHANGES +* Updated file schema (PR #18): + * Add is_control obs to indicate whether a cell should be used as control when correcting batch effect. + * Removed donor_id obs from unintegrated censored. + * Removed to_correct var from everything except common_dataset. + All datasets now will only contain markers that need to be corrected. + +* Reupdated the file schema (PR #19): + * Included changes in PR #21: data Processor component partitions cells between unintegrated(censored) + and validation. + * Add back to_correct var to every file except integrated to reflect the real world + batch correction workflow better. + * Reverted PR #18 to retain only the 1st two changes (add is_control and remove + donor_id from unintegrated_censored). + ## MINOR CHANGES * Enabled unit tests (PR #2). diff --git a/README.md b/README.md index 7845a40..a584f77 100644 --- a/README.md +++ b/README.md @@ -73,7 +73,7 @@ Format:
AnnData object - obs: 'cell_type', 'batch', 'sample', 'donor', 'group' + obs: 'cell_type', 'batch', 'sample', 'donor', 'group', 'is_control', 'is_validation' var: 'numeric_id', 'channel', 'marker', 'marker_type', 'to_correct' layers: 'preprocessed' uns: 'dataset_id', 'dataset_name', 'dataset_url', 'dataset_reference', 'dataset_summary', 'dataset_description', 'dataset_organism' @@ -91,6 +91,8 @@ Data structure: | `obs["sample"]` | `string` | Sample ID. | | `obs["donor"]` | `string` | Donor ID. | | `obs["group"]` | `string` | Biological group of the donor. | +| `obs["is_control"]` | `integer` | Whether the sample the cell came from can be used as a control for batch effect correction. 0: cannot be used as a control. \>= 1: can be used as a control. For cells with \>= 1: cells with the same value come from the same donor. Different values indicate different donors. | +| `obs["is_validation"]` | `boolean` | Whether the cell will be used as validation data or not. If FALSE, then the cell will only be included in unintegrated and unintegrated_censored. If TRUE, then the cell will only be included in validation. | | `var["numeric_id"]` | `integer` | Numeric ID associated with each marker. | | `var["channel"]` | `string` | The channel / detector of the instrument. | | `var["marker"]` | `string` | (*Optional*) The marker name associated with the channel. | @@ -141,7 +143,7 @@ Format:
AnnData object - obs: 'batch', 'sample', 'donor' + obs: 'batch', 'sample', 'is_control' var: 'numeric_id', 'channel', 'marker', 'marker_type', 'to_correct' layers: 'preprocessed' uns: 'dataset_id', 'dataset_name', 'dataset_url', 'dataset_reference', 'dataset_summary', 'dataset_description', 'dataset_organism' @@ -156,7 +158,7 @@ Data structure: |:---|:---|:---| | `obs["batch"]` | `string` | Batch information. | | `obs["sample"]` | `string` | Sample ID. | -| `obs["donor"]` | `string` | (*Optional*) Donor ID. | +| `obs["is_control"]` | `integer` | Whether the sample the cell came from can be used as a control for batch effect correction. 0: cannot be used as a control. \>= 1: can be used as a control. For cells with \>= 1: cells with the same value come from the same donor. Different values indicate different donors. | | `var["numeric_id"]` | `integer` | Numeric ID associated with each marker. | | `var["channel"]` | `string` | The channel / detector of the instrument. | | `var["marker"]` | `string` | (*Optional*) The marker name associated with the channel. | @@ -187,7 +189,7 @@ Format:
AnnData object - obs: 'cell_type', 'batch', 'sample', 'donor', 'group' + obs: 'cell_type', 'batch', 'sample', 'donor', 'group', 'is_control' var: 'numeric_id', 'channel', 'marker', 'marker_type', 'to_correct' layers: 'preprocessed' uns: 'dataset_id', 'dataset_name', 'dataset_url', 'dataset_reference', 'dataset_summary', 'dataset_description', 'dataset_organism' @@ -205,6 +207,7 @@ Data structure: | `obs["sample"]` | `string` | Sample ID. | | `obs["donor"]` | `string` | Donor ID. | | `obs["group"]` | `string` | Biological group of the donor. | +| `obs["is_control"]` | `integer` | Whether the sample the cell came from can be used as a control for batch effect correction. 0: cannot be used as a control. \>= 1: can be used as a control. For cells with \>= 1: cells with the same value come from the same donor. Different values indicate different donors. | | `var["numeric_id"]` | `integer` | Numeric ID associated with each marker. | | `var["channel"]` | `string` | The channel / detector of the instrument. | | `var["marker"]` | `string` | (*Optional*) The marker name associated with the channel. | @@ -248,7 +251,7 @@ Format:
AnnData object - obs: 'cell_type', 'batch', 'sample', 'donor', 'group' + obs: 'cell_type', 'batch', 'sample', 'donor', 'group', 'is_control' var: 'numeric_id', 'channel', 'marker', 'marker_type', 'to_correct' layers: 'preprocessed' uns: 'dataset_id', 'dataset_name', 'dataset_url', 'dataset_reference', 'dataset_summary', 'dataset_description', 'dataset_organism' @@ -266,6 +269,7 @@ Data structure: | `obs["sample"]` | `string` | Sample ID. | | `obs["donor"]` | `string` | Donor ID. | | `obs["group"]` | `string` | Biological group of the donor. | +| `obs["is_control"]` | `integer` | Whether the sample the cell came from can be used as a control for batch effect correction. 0: cannot be used as a control. \>= 1: can be used as a control. For cells with \>= 1: cells with the same value come from the same donor. Different values indicate different donors. | | `var["numeric_id"]` | `integer` | Numeric ID associated with each marker. | | `var["channel"]` | `string` | The channel / detector of the instrument. | | `var["marker"]` | `string` | (*Optional*) The marker name associated with the channel. 
| diff --git a/scripts/create_resources/test_resources.sh b/scripts/create_resources/test_resources.sh index 191cbe4..ff9b7f4 100755 --- a/scripts/create_resources/test_resources.sh +++ b/scripts/create_resources/test_resources.sh @@ -36,7 +36,7 @@ mkdir -p $DATASET_DIR python << HERE import anndata as ad -adata = ad.read_h5ad("resources_test/task_cyto_batch_integration/starter_file/original_dataset.h5ad") +adata = ad.read_h5ad("resources_test/task_cyto_batch_integration/starter_file/common_dataset.h5ad") channelsofinterest = ['UV379-A', 'UV515-A', diff --git a/src/api/file_common_dataset.yaml b/src/api/file_common_dataset.yaml index d9746a2..87795cd 100644 --- a/src/api/file_common_dataset.yaml +++ b/src/api/file_common_dataset.yaml @@ -31,6 +31,23 @@ info: name: group description: Biological group of the donor required: true + - type: integer + name: is_control + description: | + Whether the sample the cell came from can be used as a control for batch + effect correction. + 0: cannot be used as a control. + >= 1: can be used as a control. + For cells with >= 1: cells with the same value come from the same donor. + Different values indicate different donors. + required: true + - type: boolean + name: is_validation + description: | + Whether the cell will be used as validation data or not. + If FALSE, then the cell will only be included in unintegrated and unintegrated_censored. + If TRUE, then the cell will only be included in validation. + required: true var: - type: integer name: numeric_id @@ -52,11 +69,6 @@ info: name: to_correct description: Whether the marker will be batch corrected required: true - # obsm: - # - type: double - # name: X_pca - # description: The resulting PCA embedding. 
- # required: true uns: - type: string name: dataset_id diff --git a/src/api/file_unintegrated.yaml b/src/api/file_unintegrated.yaml index 33f27a0..c81705b 100644 --- a/src/api/file_unintegrated.yaml +++ b/src/api/file_unintegrated.yaml @@ -35,6 +35,16 @@ info: name: group description: Biological group of the donor required: true + - type: integer + name: is_control + description: | + Whether the sample the cell came from can be used as a control for batch + effect correction. + 0: cannot be used as a control. + >= 1: can be used as a control. + For cells with >= 1: cells with the same value come from the same donor. + Different values indicate different donors. + required: true var: - type: integer name: numeric_id @@ -56,11 +66,6 @@ info: name: to_correct description: Whether the marker will be batch corrected required: true - # obsm: - # - type: double - # name: X_pca - # description: The resulting PCA embedding. - # required: true uns: - type: string name: dataset_id diff --git a/src/api/file_unintegrated_censored.yaml b/src/api/file_unintegrated_censored.yaml index 0a51fc5..874482e 100644 --- a/src/api/file_unintegrated_censored.yaml +++ b/src/api/file_unintegrated_censored.yaml @@ -26,10 +26,16 @@ info: name: sample description: Sample ID required: true - - type: string - name: donor - description: Donor ID - required: false + - type: integer + name: is_control + description: | + Whether the sample the cell came from can be used as a control for batch + effect correction. + 0: cannot be used as a control. + >= 1: can be used as a control. + For cells with >= 1: cells with the same value come from the same donor. + Different values indicate different donors. + required: true var: - type: integer name: numeric_id @@ -51,11 +57,6 @@ info: name: to_correct description: Whether the marker will be batch corrected required: true - # obsm: - # - type: double - # name: X_pca - # description: The resulting PCA embedding. 
- # required: true uns: - type: string name: dataset_id diff --git a/src/api/file_validation.yaml b/src/api/file_validation.yaml index ac6af85..dda4365 100644 --- a/src/api/file_validation.yaml +++ b/src/api/file_validation.yaml @@ -43,6 +43,16 @@ info: name: group description: Biological group of the donor required: true + - type: integer + name: is_control + description: | + Whether the sample the cell came from can be used as a control for batch + effect correction. + 0: cannot be used as a control. + >= 1: can be used as a control. + For cells with >= 1: cells with the same value come from the same donor. + Different values indicate different donors. + required: true var: - type: integer name: numeric_id @@ -64,11 +74,6 @@ info: name: to_correct description: Whether the marker will be batch corrected required: true - # obsm: - # - type: double - # name: X_pca - # description: The resulting PCA embedding. - # required: true uns: - type: string name: dataset_id diff --git a/src/data_processors/process_dataset/script.py b/src/data_processors/process_dataset/script.py index 2ec82c0..e41720d 100644 --- a/src/data_processors/process_dataset/script.py +++ b/src/data_processors/process_dataset/script.py @@ -27,13 +27,12 @@ adata = ad.read_h5ad(par["input"]) print("input:", adata) -validation_names = par["validation_sample_names"] or [] -is_validation = adata.obs["sample"].isin(validation_names) +print(">> Creating unintegrated data", flush=True) +adata_unintegrated = adata[adata.obs.is_validation==False] -print(">> Creating train data", flush=True) output_unintegrated = subset_h5ad_by_format( - adata[[not x for x in is_validation]], + adata_unintegrated, config, "output_unintegrated" ) @@ -41,15 +40,18 @@ print(">> Creating test data", flush=True) output_unintegrated_censored = subset_h5ad_by_format( - adata[[not x for x in is_validation]], + adata_unintegrated, config, "output_unintegrated_censored" ) print(f"output_unintegrated_censored: 
{output_unintegrated_censored}") -print(">> Creating solution data", flush=True) +print(">> Creating validation data", flush=True) + +adata_validation = adata[adata.obs.is_validation==True] + output_validation = subset_h5ad_by_format( - adata[is_validation], + adata_validation, config, "output_validation" ) diff --git a/src/metrics/emd_per_samples/config.vsh.yaml b/src/metrics/emd_per_samples/config.vsh.yaml deleted file mode 100644 index e5f8214..0000000 --- a/src/metrics/emd_per_samples/config.vsh.yaml +++ /dev/null @@ -1,91 +0,0 @@ -# The API specifies which type of component this is. -# It contains specifications for: -# - The input/output files -# - Common parameters -# - A unit test -__merge__: ../../api/comp_metric.yaml - -# A unique identifier for your component (required). -# Can contain only lowercase letters or underscores. -name: emd_per_samples - -# Metadata for your component -info: - metrics: - # A unique identifier for your metric (required). - # Can contain only lowercase letters or underscores. - - name: emd_per_samples - # A relatively short label, used when rendering visualisarions (required) - label: EMD Per Samples - # A one sentence summary of how this metric works (required). Used when - # rendering summary tables. - summary: "Earth Mover Distance to compute differences in marker expression across two samples." - # A multi-line description of how this component works (required). Used - # when rendering reference documentation. - description: | - Earth Mover Distance (EMD) is a metric designed for comparing two distributions. - It is also known as the Wasserstein metric. - references: - doi: - - 10.1023/A:1026543900054 - links: - # URL to the documentation for this metric (required). - documentation: https://cytonormpy.readthedocs.io/en/latest/generated/cytonormpy.emd_comparison_from_anndata.html - # URL to the code repository for this metric (required). 
- repository: https://github.com/TarikExner/CytoNormPy - # The minimum possible value for this metric (required) - min: 0 - # The maximum possible value for this metric (required) - max: .inf - # Whether a higher value represents a 'better' solution (required) - maximize: false - # Note: need this if we have component specific argument with no default. - # When running the actual command, either split the sample name by ; - # so Tube1_Batch1_WT;Tube1_Batch2_WT - # or repeat the flag twice. So --samples_to_compare Tube1_Batch1_WT - # --samples_to_compare Tube1_Batch2_WT - test_setup: - starter_file: - samples_to_compare: - - Tube1_Batch1_WT - - Tube1_Batch2_WT - -# Component-specific parameters (optional) -arguments: - - name: "--samples_to_compare" - type: "string" - description: 2 samples to compare. - required: true - multiple: true - - name: "--layer" - type: "string" - default: "integrated" - description: The layer in input anndata containing the marker expression - -# Resources required to run the component -resources: - # The script of your component (required) - - type: python_script - path: script.py - # Additional resources your script needs (optional) - # - type: file - # path: weights.pt - -engines: - # Specifications for the Docker image for this component. - - type: docker - image: openproblems/base_python:1.0.0 - # Add custom dependencies here (optional). For more information, see - # https://viash.io/reference/config/engines/docker/#setup . - setup: - - type: python - packages: [anndata] - github: [TarikExner/CytoNormPy] - -runners: - # This platform allows running the component natively - - type: executable - # Allows turning the component into a Nextflow module / pipeline. 
- - type: nextflow - directives: - label: [midtime,midmem,midcpu] diff --git a/src/metrics/emd_per_samples/script.py b/src/metrics/emd_per_samples/script.py deleted file mode 100644 index e21bf7a..0000000 --- a/src/metrics/emd_per_samples/script.py +++ /dev/null @@ -1,58 +0,0 @@ -import anndata as ad -import cytonormpy as cnp - -## VIASH START -# Note: this section is auto-generated by viash at runtime. To edit it, make changes -# in config.vsh.yaml and then run `viash config inject config.vsh.yaml`. -par = { - "input_integrated": "resources_test/task_cyto_batch_integration/starter_file/integrated.h5ad", - "output": "output.h5ad", - "samples_to_compare": "Tube1_Batch1_WT,Tube1_Batch2_WT", - "layer": "integrated", -} -meta = {"name": "emd_per_samples"} -## VIASH END - -print("Reading input files", flush=True) - -input_integrated = ad.read_h5ad(par["input_integrated"]) -input_unintegrated = ad.read_h5ad(par["input_unintegrated"]) - -samples_to_compare = [x.strip() for x in par["samples_to_compare"]] - -layer = par["layer"] - -markers_to_assess = input_integrated.var[input_integrated.var["to_correct"]].index.to_numpy() - -print("Compute metrics", flush=True) - -# have to change the "sample" column to file_name for emd_comparison_from_anndata to work. -# Otherwise the _calculate_emd_per_frame used in cytonormpy will error because they -# harcoded the column file_name and use it in assert. 
-# See line 176 of https://github.com/TarikExner/CytoNormPy/blob/main/cytonormpy/_evaluation/_emd_utils.py#L173 -input_integrated.obs["file_name"] = input_integrated.obs["sample"] - -df = cnp.emd_from_anndata( - adata=input_integrated, - file_list=samples_to_compare, - channels=markers_to_assess, - layer=layer, - sample_identifier_column="file_name", -) - -uns_metric_ids = [f"EMD_per_samples_{x}" for x in df.columns] -uns_metric_values = df.loc["all_cells"].to_numpy() -uns_method_id = input_integrated.uns["method_id"] if "method_id" in input_integrated.uns else "unintegrated" - - -print("Write output AnnData to file", flush=True) -output = ad.AnnData( - uns={ - "dataset_id": input_integrated.uns["dataset_id"], - "method_id": uns_method_id, - "sample_ids": samples_to_compare, - "metric_ids": uns_metric_ids, - "metric_values": uns_metric_values, - } -) -output.write_h5ad(par["output"], compression="gzip") From 511c48bdb9a2cc0d73c299cbb4cefedc379ed90f Mon Sep 17 00:00:00 2001 From: Givanna Putri Date: Thu, 6 Feb 2025 10:56:53 +1100 Subject: [PATCH 5/8] accidentally broke the run benchmark workflow. shouldn't have completely removed the metric. added a dummy in for now that will be replaced in another branch. --- src/metrics/emd_per_samples/config.vsh.yaml | 84 +++++++++++++++++++++ src/metrics/emd_per_samples/script.py | 28 +++++++ 2 files changed, 112 insertions(+) create mode 100644 src/metrics/emd_per_samples/config.vsh.yaml create mode 100644 src/metrics/emd_per_samples/script.py diff --git a/src/metrics/emd_per_samples/config.vsh.yaml b/src/metrics/emd_per_samples/config.vsh.yaml new file mode 100644 index 0000000..4d7807d --- /dev/null +++ b/src/metrics/emd_per_samples/config.vsh.yaml @@ -0,0 +1,84 @@ +# The API specifies which type of component this is. +# It contains specifications for: +# - The input/output files +# - Common parameters +# - A unit test +__merge__: ../../api/comp_metric.yaml + +# A unique identifier for your component (required). 
+# Can contain only lowercase letters or underscores. +name: emd_per_samples + + + +# Metadata for your component +info: + metrics: + # A unique identifier for your metric (required). + # Can contain only lowercase letters or underscores. + - name: emd_per_samples + # A relatively short label, used when rendering visualisarions (required) + label: Emd Per Samples + # A one sentence summary of how this metric works (required). Used when + # rendering summary tables. + summary: "FILL IN: A one sentence summary of this metric." + # A multi-line description of how this component works (required). Used + # when rendering reference documentation. + description: | + FILL IN: A (multi-line) description of how this metric works. + # references: + # doi: + # - 10.1000/xx.123456.789 + # bibtex: + # - | + # @article{foo, + # title={Foo}, + # author={Bar}, + # journal={Baz}, + # year={2024} + # } + links: + # URL to the documentation for this metric (required). + documentation: https://url.to/the/documentation + # URL to the code repository for this metric (required). + repository: https://github.com/organisation/repository + # The minimum possible value for this metric (required) + min: 0 + # The maximum possible value for this metric (required) + max: 1 + # Whether a higher value represents a 'better' solution (required) + maximize: true + +# Component-specific parameters (optional) +# arguments: +# - name: "--n_neighbors" +# type: "integer" +# default: 5 +# description: Number of neighbors to use. + +# Resources required to run the component +resources: + # The script of your component (required) + - type: python_script + path: script.py + # Additional resources your script needs (optional) + # - type: file + # path: weights.pt + +engines: + # Specifications for the Docker image for this component. + - type: docker + image: openproblems/base_python:1.0.0 + # Add custom dependencies here (optional). 
For more information, see + # https://viash.io/reference/config/engines/docker/#setup . + # setup: + # - type: python + # packages: numpy<2 + +runners: + # This platform allows running the component natively + - type: executable + # Allows turning the component into a Nextflow module / pipeline. + - type: nextflow + directives: + label: [midtime,midmem,midcpu] diff --git a/src/metrics/emd_per_samples/script.py b/src/metrics/emd_per_samples/script.py new file mode 100644 index 0000000..916d3df --- /dev/null +++ b/src/metrics/emd_per_samples/script.py @@ -0,0 +1,28 @@ +import anndata as ad + +## VIASH START +# Note: this section is auto-generated by viash at runtime. To edit it, make changes +# in config.vsh.yaml and then run `viash config inject config.vsh.yaml`. +par = { + "input_validation": "resources_test/.../validation.h5ad", + "input_unintegrated": "resources_test/.../unintegrated.h5ad", + "input_integrated": "resources_test/.../integrated.h5ad", + "output": "output.h5ad", +} +meta = {"name": "emd_per_samples"} +## VIASH END + +print("Reading input files", flush=True) +input_validation = ad.read_h5ad(par["input_validation"]) +input_unintegrated = ad.read_h5ad(par["input_unintegrated"]) +input_integrated = ad.read_h5ad(par["input_integrated"]) + +print("Compute metrics", flush=True) +# metric_ids and metric_values can have length > 1 +# but should be of equal length +uns_metric_ids = ["emd_per_samples"] +uns_metric_values = [0.5] + +print("Write output AnnData to file", flush=True) +output = ad.AnnData() +output.write_h5ad(par["output"], compression="gzip") From 78d46f36f087dc25597d6f772ec64744bdf22daa Mon Sep 17 00:00:00 2001 From: Givanna Putri Date: Thu, 6 Feb 2025 12:21:24 +1100 Subject: [PATCH 6/8] Revert "accidentally broke the run benchmark workflow." This reverts commit 511c48bdb9a2cc0d73c299cbb4cefedc379ed90f. 
--- src/metrics/emd_per_samples/config.vsh.yaml | 84 --------------------- src/metrics/emd_per_samples/script.py | 28 ------- 2 files changed, 112 deletions(-) delete mode 100644 src/metrics/emd_per_samples/config.vsh.yaml delete mode 100644 src/metrics/emd_per_samples/script.py diff --git a/src/metrics/emd_per_samples/config.vsh.yaml b/src/metrics/emd_per_samples/config.vsh.yaml deleted file mode 100644 index 4d7807d..0000000 --- a/src/metrics/emd_per_samples/config.vsh.yaml +++ /dev/null @@ -1,84 +0,0 @@ -# The API specifies which type of component this is. -# It contains specifications for: -# - The input/output files -# - Common parameters -# - A unit test -__merge__: ../../api/comp_metric.yaml - -# A unique identifier for your component (required). -# Can contain only lowercase letters or underscores. -name: emd_per_samples - - - -# Metadata for your component -info: - metrics: - # A unique identifier for your metric (required). - # Can contain only lowercase letters or underscores. - - name: emd_per_samples - # A relatively short label, used when rendering visualisarions (required) - label: Emd Per Samples - # A one sentence summary of how this metric works (required). Used when - # rendering summary tables. - summary: "FILL IN: A one sentence summary of this metric." - # A multi-line description of how this component works (required). Used - # when rendering reference documentation. - description: | - FILL IN: A (multi-line) description of how this metric works. - # references: - # doi: - # - 10.1000/xx.123456.789 - # bibtex: - # - | - # @article{foo, - # title={Foo}, - # author={Bar}, - # journal={Baz}, - # year={2024} - # } - links: - # URL to the documentation for this metric (required). - documentation: https://url.to/the/documentation - # URL to the code repository for this metric (required). 
- repository: https://github.com/organisation/repository - # The minimum possible value for this metric (required) - min: 0 - # The maximum possible value for this metric (required) - max: 1 - # Whether a higher value represents a 'better' solution (required) - maximize: true - -# Component-specific parameters (optional) -# arguments: -# - name: "--n_neighbors" -# type: "integer" -# default: 5 -# description: Number of neighbors to use. - -# Resources required to run the component -resources: - # The script of your component (required) - - type: python_script - path: script.py - # Additional resources your script needs (optional) - # - type: file - # path: weights.pt - -engines: - # Specifications for the Docker image for this component. - - type: docker - image: openproblems/base_python:1.0.0 - # Add custom dependencies here (optional). For more information, see - # https://viash.io/reference/config/engines/docker/#setup . - # setup: - # - type: python - # packages: numpy<2 - -runners: - # This platform allows running the component natively - - type: executable - # Allows turning the component into a Nextflow module / pipeline. - - type: nextflow - directives: - label: [midtime,midmem,midcpu] diff --git a/src/metrics/emd_per_samples/script.py b/src/metrics/emd_per_samples/script.py deleted file mode 100644 index 916d3df..0000000 --- a/src/metrics/emd_per_samples/script.py +++ /dev/null @@ -1,28 +0,0 @@ -import anndata as ad - -## VIASH START -# Note: this section is auto-generated by viash at runtime. To edit it, make changes -# in config.vsh.yaml and then run `viash config inject config.vsh.yaml`. 
-par = { - "input_validation": "resources_test/.../validation.h5ad", - "input_unintegrated": "resources_test/.../unintegrated.h5ad", - "input_integrated": "resources_test/.../integrated.h5ad", - "output": "output.h5ad", -} -meta = {"name": "emd_per_samples"} -## VIASH END - -print("Reading input files", flush=True) -input_validation = ad.read_h5ad(par["input_validation"]) -input_unintegrated = ad.read_h5ad(par["input_unintegrated"]) -input_integrated = ad.read_h5ad(par["input_integrated"]) - -print("Compute metrics", flush=True) -# metric_ids and metric_values can have length > 1 -# but should be of equal length -uns_metric_ids = ["emd_per_samples"] -uns_metric_values = [0.5] - -print("Write output AnnData to file", flush=True) -output = ad.AnnData() -output.write_h5ad(par["output"], compression="gzip") From e05ab56eaaeed50973ac46fedc17ab15040a3162 Mon Sep 17 00:00:00 2001 From: Givanna Putri Date: Thu, 6 Feb 2025 12:21:33 +1100 Subject: [PATCH 7/8] Revert "Update anndata schema (#19)" This reverts commit 2943780e4e2ee24206552d3ec176ba291ced5545. --- CHANGELOG.md | 14 --- README.md | 14 +-- scripts/create_resources/test_resources.sh | 2 +- src/api/file_common_dataset.yaml | 22 +---- src/api/file_unintegrated.yaml | 15 +-- src/api/file_unintegrated_censored.yaml | 19 ++-- src/api/file_validation.yaml | 15 +-- src/data_processors/process_dataset/script.py | 16 ++-- src/metrics/emd_per_samples/config.vsh.yaml | 91 +++++++++++++++++++ src/metrics/emd_per_samples/script.py | 58 ++++++++++++ 10 files changed, 186 insertions(+), 80 deletions(-) create mode 100644 src/metrics/emd_per_samples/config.vsh.yaml create mode 100644 src/metrics/emd_per_samples/script.py diff --git a/CHANGELOG.md b/CHANGELOG.md index e5c027b..3823e64 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -21,20 +21,6 @@ ## MAJOR CHANGES -* Updated file schema (PR #18): - * Add is_control obs to indicate whether a cell should be used as control when correcting batch effect. 
- * Removed donor_id obs from unintegrated censored. - * Removed to_correct var from everything except common_dataset. - All datasets now will only contain markers that need to be corrected. - -* Reupdated the file schema (PR #19): - * Included changes in PR #21: data Processor component partitions cells between unintegrated(censored) - and validation. - * Add back to_correct var to every file except integrated to reflect the real world - batch correction workflow better. - * Reverted PR #18 to retain only the 1st two changes (add is_control and remove - donor_id from unintegrated_censored). - ## MINOR CHANGES * Enabled unit tests (PR #2). diff --git a/README.md b/README.md index a584f77..7845a40 100644 --- a/README.md +++ b/README.md @@ -73,7 +73,7 @@ Format:
AnnData object - obs: 'cell_type', 'batch', 'sample', 'donor', 'group', 'is_control', 'is_validation' + obs: 'cell_type', 'batch', 'sample', 'donor', 'group' var: 'numeric_id', 'channel', 'marker', 'marker_type', 'to_correct' layers: 'preprocessed' uns: 'dataset_id', 'dataset_name', 'dataset_url', 'dataset_reference', 'dataset_summary', 'dataset_description', 'dataset_organism' @@ -91,8 +91,6 @@ Data structure: | `obs["sample"]` | `string` | Sample ID. | | `obs["donor"]` | `string` | Donor ID. | | `obs["group"]` | `string` | Biological group of the donor. | -| `obs["is_control"]` | `integer` | Whether the sample the cell came from can be used as a control for batch effect correction. 0: cannot be used as a control. \>= 1: can be used as a control. For cells with \>= 1: cells with the same value come from the same donor. Different values indicate different donors. | -| `obs["is_validation"]` | `boolean` | Whether the cell will be used as validation data or not. If FALSE, then the cell will only be included in unintegrated and unintegrated_censored. If TRUE, then the cell will only be included in validation. | | `var["numeric_id"]` | `integer` | Numeric ID associated with each marker. | | `var["channel"]` | `string` | The channel / detector of the instrument. | | `var["marker"]` | `string` | (*Optional*) The marker name associated with the channel. | @@ -143,7 +141,7 @@ Format:
AnnData object - obs: 'batch', 'sample', 'is_control' + obs: 'batch', 'sample', 'donor' var: 'numeric_id', 'channel', 'marker', 'marker_type', 'to_correct' layers: 'preprocessed' uns: 'dataset_id', 'dataset_name', 'dataset_url', 'dataset_reference', 'dataset_summary', 'dataset_description', 'dataset_organism' @@ -158,7 +156,7 @@ Data structure: |:---|:---|:---| | `obs["batch"]` | `string` | Batch information. | | `obs["sample"]` | `string` | Sample ID. | -| `obs["is_control"]` | `integer` | Whether the sample the cell came from can be used as a control for batch effect correction. 0: cannot be used as a control. \>= 1: can be used as a control. For cells with \>= 1: cells with the same value come from the same donor. Different values indicate different donors. | +| `obs["donor"]` | `string` | (*Optional*) Donor ID. | | `var["numeric_id"]` | `integer` | Numeric ID associated with each marker. | | `var["channel"]` | `string` | The channel / detector of the instrument. | | `var["marker"]` | `string` | (*Optional*) The marker name associated with the channel. | @@ -189,7 +187,7 @@ Format:
AnnData object - obs: 'cell_type', 'batch', 'sample', 'donor', 'group', 'is_control' + obs: 'cell_type', 'batch', 'sample', 'donor', 'group' var: 'numeric_id', 'channel', 'marker', 'marker_type', 'to_correct' layers: 'preprocessed' uns: 'dataset_id', 'dataset_name', 'dataset_url', 'dataset_reference', 'dataset_summary', 'dataset_description', 'dataset_organism' @@ -207,7 +205,6 @@ Data structure: | `obs["sample"]` | `string` | Sample ID. | | `obs["donor"]` | `string` | Donor ID. | | `obs["group"]` | `string` | Biological group of the donor. | -| `obs["is_control"]` | `integer` | Whether the sample the cell came from can be used as a control for batch effect correction. 0: cannot be used as a control. \>= 1: can be used as a control. For cells with \>= 1: cells with the same value come from the same donor. Different values indicate different donors. | | `var["numeric_id"]` | `integer` | Numeric ID associated with each marker. | | `var["channel"]` | `string` | The channel / detector of the instrument. | | `var["marker"]` | `string` | (*Optional*) The marker name associated with the channel. | @@ -251,7 +248,7 @@ Format:
AnnData object - obs: 'cell_type', 'batch', 'sample', 'donor', 'group', 'is_control' + obs: 'cell_type', 'batch', 'sample', 'donor', 'group' var: 'numeric_id', 'channel', 'marker', 'marker_type', 'to_correct' layers: 'preprocessed' uns: 'dataset_id', 'dataset_name', 'dataset_url', 'dataset_reference', 'dataset_summary', 'dataset_description', 'dataset_organism' @@ -269,7 +266,6 @@ Data structure: | `obs["sample"]` | `string` | Sample ID. | | `obs["donor"]` | `string` | Donor ID. | | `obs["group"]` | `string` | Biological group of the donor. | -| `obs["is_control"]` | `integer` | Whether the sample the cell came from can be used as a control for batch effect correction. 0: cannot be used as a control. \>= 1: can be used as a control. For cells with \>= 1: cells with the same value come from the same donor. Different values indicate different donors. | | `var["numeric_id"]` | `integer` | Numeric ID associated with each marker. | | `var["channel"]` | `string` | The channel / detector of the instrument. | | `var["marker"]` | `string` | (*Optional*) The marker name associated with the channel. 
| diff --git a/scripts/create_resources/test_resources.sh b/scripts/create_resources/test_resources.sh index ff9b7f4..191cbe4 100755 --- a/scripts/create_resources/test_resources.sh +++ b/scripts/create_resources/test_resources.sh @@ -36,7 +36,7 @@ mkdir -p $DATASET_DIR python << HERE import anndata as ad -adata = ad.read_h5ad("resources_test/task_cyto_batch_integration/starter_file/common_dataset.h5ad") +adata = ad.read_h5ad("resources_test/task_cyto_batch_integration/starter_file/original_dataset.h5ad") channelsofinterest = ['UV379-A', 'UV515-A', diff --git a/src/api/file_common_dataset.yaml b/src/api/file_common_dataset.yaml index 87795cd..d9746a2 100644 --- a/src/api/file_common_dataset.yaml +++ b/src/api/file_common_dataset.yaml @@ -31,23 +31,6 @@ info: name: group description: Biological group of the donor required: true - - type: integer - name: is_control - description: | - Whether the sample the cell came from can be used as a control for batch - effect correction. - 0: cannot be used as a control. - >= 1: can be used as a control. - For cells with >= 1: cells with the same value come from the same donor. - Different values indicate different donors. - required: true - - type: boolean - name: is_validation - description: | - Whether the cell will be used as validation data or not. - If FALSE, then the cell will only be included in unintegrated and unintegrated_censored. - If TRUE, then the cell will only be included in validation. - required: true var: - type: integer name: numeric_id @@ -69,6 +52,11 @@ info: name: to_correct description: Whether the marker will be batch corrected required: true + # obsm: + # - type: double + # name: X_pca + # description: The resulting PCA embedding. 
+ # required: true uns: - type: string name: dataset_id diff --git a/src/api/file_unintegrated.yaml b/src/api/file_unintegrated.yaml index c81705b..33f27a0 100644 --- a/src/api/file_unintegrated.yaml +++ b/src/api/file_unintegrated.yaml @@ -35,16 +35,6 @@ info: name: group description: Biological group of the donor required: true - - type: integer - name: is_control - description: | - Whether the sample the cell came from can be used as a control for batch - effect correction. - 0: cannot be used as a control. - >= 1: can be used as a control. - For cells with >= 1: cells with the same value come from the same donor. - Different values indicate different donors. - required: true var: - type: integer name: numeric_id @@ -66,6 +56,11 @@ info: name: to_correct description: Whether the marker will be batch corrected required: true + # obsm: + # - type: double + # name: X_pca + # description: The resulting PCA embedding. + # required: true uns: - type: string name: dataset_id diff --git a/src/api/file_unintegrated_censored.yaml b/src/api/file_unintegrated_censored.yaml index 874482e..0a51fc5 100644 --- a/src/api/file_unintegrated_censored.yaml +++ b/src/api/file_unintegrated_censored.yaml @@ -26,16 +26,10 @@ info: name: sample description: Sample ID required: true - - type: integer - name: is_control - description: | - Whether the sample the cell came from can be used as a control for batch - effect correction. - 0: cannot be used as a control. - >= 1: can be used as a control. - For cells with >= 1: cells with the same value come from the same donor. - Different values indicate different donors. - required: true + - type: string + name: donor + description: Donor ID + required: false var: - type: integer name: numeric_id @@ -57,6 +51,11 @@ info: name: to_correct description: Whether the marker will be batch corrected required: true + # obsm: + # - type: double + # name: X_pca + # description: The resulting PCA embedding. 
+ # required: true uns: - type: string name: dataset_id diff --git a/src/api/file_validation.yaml b/src/api/file_validation.yaml index dda4365..ac6af85 100644 --- a/src/api/file_validation.yaml +++ b/src/api/file_validation.yaml @@ -43,16 +43,6 @@ info: name: group description: Biological group of the donor required: true - - type: integer - name: is_control - description: | - Whether the sample the cell came from can be used as a control for batch - effect correction. - 0: cannot be used as a control. - >= 1: can be used as a control. - For cells with >= 1: cells with the same value come from the same donor. - Different values indicate different donors. - required: true var: - type: integer name: numeric_id @@ -74,6 +64,11 @@ info: name: to_correct description: Whether the marker will be batch corrected required: true + # obsm: + # - type: double + # name: X_pca + # description: The resulting PCA embedding. + # required: true uns: - type: string name: dataset_id diff --git a/src/data_processors/process_dataset/script.py b/src/data_processors/process_dataset/script.py index e41720d..2ec82c0 100644 --- a/src/data_processors/process_dataset/script.py +++ b/src/data_processors/process_dataset/script.py @@ -27,12 +27,13 @@ adata = ad.read_h5ad(par["input"]) print("input:", adata) -print(">> Creating unintegrated data", flush=True) +validation_names = par["validation_sample_names"] or [] +is_validation = adata.obs["sample"].isin(validation_names) -adata_unintegrated = adata[adata.obs.is_validation==False] +print(">> Creating train data", flush=True) output_unintegrated = subset_h5ad_by_format( - adata_unintegrated, + adata[[not x for x in is_validation]], config, "output_unintegrated" ) @@ -40,18 +41,15 @@ print(">> Creating test data", flush=True) output_unintegrated_censored = subset_h5ad_by_format( - adata_unintegrated, + adata[[not x for x in is_validation]], config, "output_unintegrated_censored" ) print(f"output_unintegrated_censored: 
{output_unintegrated_censored}") -print(">> Creating validation data", flush=True) - -adata_validation = adata[adata.obs.is_validation==True] - +print(">> Creating solution data", flush=True) output_validation = subset_h5ad_by_format( - adata_validation, + adata[is_validation], config, "output_validation" ) diff --git a/src/metrics/emd_per_samples/config.vsh.yaml b/src/metrics/emd_per_samples/config.vsh.yaml new file mode 100644 index 0000000..e5f8214 --- /dev/null +++ b/src/metrics/emd_per_samples/config.vsh.yaml @@ -0,0 +1,91 @@ +# The API specifies which type of component this is. +# It contains specifications for: +# - The input/output files +# - Common parameters +# - A unit test +__merge__: ../../api/comp_metric.yaml + +# A unique identifier for your component (required). +# Can contain only lowercase letters or underscores. +name: emd_per_samples + +# Metadata for your component +info: + metrics: + # A unique identifier for your metric (required). + # Can contain only lowercase letters or underscores. + - name: emd_per_samples + # A relatively short label, used when rendering visualisarions (required) + label: EMD Per Samples + # A one sentence summary of how this metric works (required). Used when + # rendering summary tables. + summary: "Earth Mover Distance to compute differences in marker expression across two samples." + # A multi-line description of how this component works (required). Used + # when rendering reference documentation. + description: | + Earth Mover Distance (EMD) is a metric designed for comparing two distributions. + It is also known as the Wasserstein metric. + references: + doi: + - 10.1023/A:1026543900054 + links: + # URL to the documentation for this metric (required). + documentation: https://cytonormpy.readthedocs.io/en/latest/generated/cytonormpy.emd_comparison_from_anndata.html + # URL to the code repository for this metric (required). 
+ repository: https://github.com/TarikExner/CytoNormPy + # The minimum possible value for this metric (required) + min: 0 + # The maximum possible value for this metric (required) + max: .inf + # Whether a higher value represents a 'better' solution (required) + maximize: false + # Note: need this if we have component specific argument with no default. + # When running the actual command, either split the sample name by ; + # so Tube1_Batch1_WT;Tube1_Batch2_WT + # or repeat the flag twice. So --samples_to_compare Tube1_Batch1_WT + # --samples_to_compare Tube1_Batch2_WT + test_setup: + starter_file: + samples_to_compare: + - Tube1_Batch1_WT + - Tube1_Batch2_WT + +# Component-specific parameters (optional) +arguments: + - name: "--samples_to_compare" + type: "string" + description: 2 samples to compare. + required: true + multiple: true + - name: "--layer" + type: "string" + default: "integrated" + description: The layer in input anndata containing the marker expression + +# Resources required to run the component +resources: + # The script of your component (required) + - type: python_script + path: script.py + # Additional resources your script needs (optional) + # - type: file + # path: weights.pt + +engines: + # Specifications for the Docker image for this component. + - type: docker + image: openproblems/base_python:1.0.0 + # Add custom dependencies here (optional). For more information, see + # https://viash.io/reference/config/engines/docker/#setup . + setup: + - type: python + packages: [anndata] + github: [TarikExner/CytoNormPy] + +runners: + # This platform allows running the component natively + - type: executable + # Allows turning the component into a Nextflow module / pipeline. 
+ - type: nextflow + directives: + label: [midtime,midmem,midcpu] diff --git a/src/metrics/emd_per_samples/script.py b/src/metrics/emd_per_samples/script.py new file mode 100644 index 0000000..e21bf7a --- /dev/null +++ b/src/metrics/emd_per_samples/script.py @@ -0,0 +1,58 @@ +import anndata as ad +import cytonormpy as cnp + +## VIASH START +# Note: this section is auto-generated by viash at runtime. To edit it, make changes +# in config.vsh.yaml and then run `viash config inject config.vsh.yaml`. +par = { + "input_integrated": "resources_test/task_cyto_batch_integration/starter_file/integrated.h5ad", + "output": "output.h5ad", + "samples_to_compare": "Tube1_Batch1_WT,Tube1_Batch2_WT", + "layer": "integrated", +} +meta = {"name": "emd_per_samples"} +## VIASH END + +print("Reading input files", flush=True) + +input_integrated = ad.read_h5ad(par["input_integrated"]) +input_unintegrated = ad.read_h5ad(par["input_unintegrated"]) + +samples_to_compare = [x.strip() for x in par["samples_to_compare"]] + +layer = par["layer"] + +markers_to_assess = input_integrated.var[input_integrated.var["to_correct"]].index.to_numpy() + +print("Compute metrics", flush=True) + +# have to change the "sample" column to file_name for emd_comparison_from_anndata to work. +# Otherwise the _calculate_emd_per_frame used in cytonormpy will error because they +# harcoded the column file_name and use it in assert. 
+# See line 176 of https://github.com/TarikExner/CytoNormPy/blob/main/cytonormpy/_evaluation/_emd_utils.py#L173 +input_integrated.obs["file_name"] = input_integrated.obs["sample"] + +df = cnp.emd_from_anndata( + adata=input_integrated, + file_list=samples_to_compare, + channels=markers_to_assess, + layer=layer, + sample_identifier_column="file_name", +) + +uns_metric_ids = [f"EMD_per_samples_{x}" for x in df.columns] +uns_metric_values = df.loc["all_cells"].to_numpy() +uns_method_id = input_integrated.uns["method_id"] if "method_id" in input_integrated.uns else "unintegrated" + + +print("Write output AnnData to file", flush=True) +output = ad.AnnData( + uns={ + "dataset_id": input_integrated.uns["dataset_id"], + "method_id": uns_method_id, + "sample_ids": samples_to_compare, + "metric_ids": uns_metric_ids, + "metric_values": uns_metric_values, + } +) +output.write_h5ad(par["output"], compression="gzip") From fdcac67c651431e1959dfc6a5f9cb894df7067b9 Mon Sep 17 00:00:00 2001 From: Givanna Putri Date: Thu, 6 Feb 2025 15:42:27 +1100 Subject: [PATCH 8/8] Update anndata schema (#22) * update yaml for files * Update file_integrated.yaml * add is_control to common dataset and clean up * update input file format * revert changes to to_correct var * Update README.md * update script * trigger ci on all components [ci force] * Add data processor (#21) * Update anndata schema (#18) * update yaml for files * Update file_integrated.yaml * add is_control to common dataset and clean up * update input file format * Revert "Update anndata schema (#18)" This reverts commit 1460c34719881c996380eafae0fdfc58ca7f0904. * modified src/data_processors/process_dataset/script.py. 
The validation dataset now contains cells with obs.is_validation == True, while unitegrated and unintegrated censored contain cells with obs.is_validation == False * CHANGELOG.md updated * Update CHANGELOG.md --------- Co-authored-by: Givanna Putri * trigger ci on all components [ci force] * Removed emd metric It'll be updated by the changes in add-emd-per-matching-sample metric anyway. Plus it is breaking. * added emd back in --------- Co-authored-by: Robrecht Cannoodt Co-authored-by: Luca Leomazzi <76624347+LuLeom@users.noreply.github.com> --- CHANGELOG.md | 14 ++++++++++++ README.md | 14 +++++++----- scripts/create_resources/test_resources.sh | 2 +- src/api/file_common_dataset.yaml | 22 ++++++++++++++----- src/api/file_unintegrated.yaml | 15 ++++++++----- src/api/file_unintegrated_censored.yaml | 19 ++++++++-------- src/api/file_validation.yaml | 15 ++++++++----- src/data_processors/process_dataset/script.py | 16 ++++++++------ src/metrics/emd_per_samples/config.vsh.yaml | 6 ++--- src/metrics/emd_per_samples/script.py | 18 +++++++++++---- 10 files changed, 97 insertions(+), 44 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 3823e64..e5c027b 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -21,6 +21,20 @@ ## MAJOR CHANGES +* Updated file schema (PR #18): + * Add is_control obs to indicate whether a cell should be used as control when correcting batch effect. + * Removed donor_id obs from unintegrated censored. + * Removed to_correct var from everything except common_dataset. + All datasets now will only contain markers that need to be corrected. + +* Reupdated the file schema (PR #19): + * Included changes in PR #21: data Processor component partitions cells between unintegrated(censored) + and validation. + * Add back to_correct var to every file except integrated to reflect the real world + batch correction workflow better. + * Reverted PR #18 to retain only the 1st two changes (add is_control and remove + donor_id from unintegrated_censored). 
+ ## MINOR CHANGES * Enabled unit tests (PR #2). diff --git a/README.md b/README.md index 7845a40..a584f77 100644 --- a/README.md +++ b/README.md @@ -73,7 +73,7 @@ Format:
AnnData object - obs: 'cell_type', 'batch', 'sample', 'donor', 'group' + obs: 'cell_type', 'batch', 'sample', 'donor', 'group', 'is_control', 'is_validation' var: 'numeric_id', 'channel', 'marker', 'marker_type', 'to_correct' layers: 'preprocessed' uns: 'dataset_id', 'dataset_name', 'dataset_url', 'dataset_reference', 'dataset_summary', 'dataset_description', 'dataset_organism' @@ -91,6 +91,8 @@ Data structure: | `obs["sample"]` | `string` | Sample ID. | | `obs["donor"]` | `string` | Donor ID. | | `obs["group"]` | `string` | Biological group of the donor. | +| `obs["is_control"]` | `integer` | Whether the sample the cell came from can be used as a control for batch effect correction. 0: cannot be used as a control. \>= 1: can be used as a control. For cells with \>= 1: cells with the same value come from the same donor. Different values indicate different donors. | +| `obs["is_validation"]` | `boolean` | Whether the cell will be used as validation data or not. If FALSE, then the cell will only be included in unintegrated and unintegrated_censored. If TRUE, then the cell will only be included in validation. | | `var["numeric_id"]` | `integer` | Numeric ID associated with each marker. | | `var["channel"]` | `string` | The channel / detector of the instrument. | | `var["marker"]` | `string` | (*Optional*) The marker name associated with the channel. | @@ -141,7 +143,7 @@ Format:
AnnData object - obs: 'batch', 'sample', 'donor' + obs: 'batch', 'sample', 'is_control' var: 'numeric_id', 'channel', 'marker', 'marker_type', 'to_correct' layers: 'preprocessed' uns: 'dataset_id', 'dataset_name', 'dataset_url', 'dataset_reference', 'dataset_summary', 'dataset_description', 'dataset_organism' @@ -156,7 +158,7 @@ Data structure: |:---|:---|:---| | `obs["batch"]` | `string` | Batch information. | | `obs["sample"]` | `string` | Sample ID. | -| `obs["donor"]` | `string` | (*Optional*) Donor ID. | +| `obs["is_control"]` | `integer` | Whether the sample the cell came from can be used as a control for batch effect correction. 0: cannot be used as a control. \>= 1: can be used as a control. For cells with \>= 1: cells with the same value come from the same donor. Different values indicate different donors. | | `var["numeric_id"]` | `integer` | Numeric ID associated with each marker. | | `var["channel"]` | `string` | The channel / detector of the instrument. | | `var["marker"]` | `string` | (*Optional*) The marker name associated with the channel. | @@ -187,7 +189,7 @@ Format:
AnnData object - obs: 'cell_type', 'batch', 'sample', 'donor', 'group' + obs: 'cell_type', 'batch', 'sample', 'donor', 'group', 'is_control' var: 'numeric_id', 'channel', 'marker', 'marker_type', 'to_correct' layers: 'preprocessed' uns: 'dataset_id', 'dataset_name', 'dataset_url', 'dataset_reference', 'dataset_summary', 'dataset_description', 'dataset_organism' @@ -205,6 +207,7 @@ Data structure: | `obs["sample"]` | `string` | Sample ID. | | `obs["donor"]` | `string` | Donor ID. | | `obs["group"]` | `string` | Biological group of the donor. | +| `obs["is_control"]` | `integer` | Whether the sample the cell came from can be used as a control for batch effect correction. 0: cannot be used as a control. \>= 1: can be used as a control. For cells with \>= 1: cells with the same value come from the same donor. Different values indicate different donors. | | `var["numeric_id"]` | `integer` | Numeric ID associated with each marker. | | `var["channel"]` | `string` | The channel / detector of the instrument. | | `var["marker"]` | `string` | (*Optional*) The marker name associated with the channel. | @@ -248,7 +251,7 @@ Format:
AnnData object - obs: 'cell_type', 'batch', 'sample', 'donor', 'group' + obs: 'cell_type', 'batch', 'sample', 'donor', 'group', 'is_control' var: 'numeric_id', 'channel', 'marker', 'marker_type', 'to_correct' layers: 'preprocessed' uns: 'dataset_id', 'dataset_name', 'dataset_url', 'dataset_reference', 'dataset_summary', 'dataset_description', 'dataset_organism' @@ -266,6 +269,7 @@ Data structure: | `obs["sample"]` | `string` | Sample ID. | | `obs["donor"]` | `string` | Donor ID. | | `obs["group"]` | `string` | Biological group of the donor. | +| `obs["is_control"]` | `integer` | Whether the sample the cell came from can be used as a control for batch effect correction. 0: cannot be used as a control. \>= 1: can be used as a control. For cells with \>= 1: cells with the same value come from the same donor. Different values indicate different donors. | | `var["numeric_id"]` | `integer` | Numeric ID associated with each marker. | | `var["channel"]` | `string` | The channel / detector of the instrument. | | `var["marker"]` | `string` | (*Optional*) The marker name associated with the channel. 
| diff --git a/scripts/create_resources/test_resources.sh b/scripts/create_resources/test_resources.sh index 191cbe4..ff9b7f4 100755 --- a/scripts/create_resources/test_resources.sh +++ b/scripts/create_resources/test_resources.sh @@ -36,7 +36,7 @@ mkdir -p $DATASET_DIR python << HERE import anndata as ad -adata = ad.read_h5ad("resources_test/task_cyto_batch_integration/starter_file/original_dataset.h5ad") +adata = ad.read_h5ad("resources_test/task_cyto_batch_integration/starter_file/common_dataset.h5ad") channelsofinterest = ['UV379-A', 'UV515-A', diff --git a/src/api/file_common_dataset.yaml b/src/api/file_common_dataset.yaml index d9746a2..87795cd 100644 --- a/src/api/file_common_dataset.yaml +++ b/src/api/file_common_dataset.yaml @@ -31,6 +31,23 @@ info: name: group description: Biological group of the donor required: true + - type: integer + name: is_control + description: | + Whether the sample the cell came from can be used as a control for batch + effect correction. + 0: cannot be used as a control. + >= 1: can be used as a control. + For cells with >= 1: cells with the same value come from the same donor. + Different values indicate different donors. + required: true + - type: boolean + name: is_validation + description: | + Whether the cell will be used as validation data or not. + If FALSE, then the cell will only be included in unintegrated and unintegrated_censored. + If TRUE, then the cell will only be included in validation. + required: true var: - type: integer name: numeric_id @@ -52,11 +69,6 @@ info: name: to_correct description: Whether the marker will be batch corrected required: true - # obsm: - # - type: double - # name: X_pca - # description: The resulting PCA embedding. 
- # required: true uns: - type: string name: dataset_id diff --git a/src/api/file_unintegrated.yaml b/src/api/file_unintegrated.yaml index 33f27a0..c81705b 100644 --- a/src/api/file_unintegrated.yaml +++ b/src/api/file_unintegrated.yaml @@ -35,6 +35,16 @@ info: name: group description: Biological group of the donor required: true + - type: integer + name: is_control + description: | + Whether the sample the cell came from can be used as a control for batch + effect correction. + 0: cannot be used as a control. + >= 1: can be used as a control. + For cells with >= 1: cells with the same value come from the same donor. + Different values indicate different donors. + required: true var: - type: integer name: numeric_id @@ -56,11 +66,6 @@ info: name: to_correct description: Whether the marker will be batch corrected required: true - # obsm: - # - type: double - # name: X_pca - # description: The resulting PCA embedding. - # required: true uns: - type: string name: dataset_id diff --git a/src/api/file_unintegrated_censored.yaml b/src/api/file_unintegrated_censored.yaml index 0a51fc5..874482e 100644 --- a/src/api/file_unintegrated_censored.yaml +++ b/src/api/file_unintegrated_censored.yaml @@ -26,10 +26,16 @@ info: name: sample description: Sample ID required: true - - type: string - name: donor - description: Donor ID - required: false + - type: integer + name: is_control + description: | + Whether the sample the cell came from can be used as a control for batch + effect correction. + 0: cannot be used as a control. + >= 1: can be used as a control. + For cells with >= 1: cells with the same value come from the same donor. + Different values indicate different donors. + required: true var: - type: integer name: numeric_id @@ -51,11 +57,6 @@ info: name: to_correct description: Whether the marker will be batch corrected required: true - # obsm: - # - type: double - # name: X_pca - # description: The resulting PCA embedding. 
- # required: true uns: - type: string name: dataset_id diff --git a/src/api/file_validation.yaml b/src/api/file_validation.yaml index ac6af85..dda4365 100644 --- a/src/api/file_validation.yaml +++ b/src/api/file_validation.yaml @@ -43,6 +43,16 @@ info: name: group description: Biological group of the donor required: true + - type: integer + name: is_control + description: | + Whether the sample the cell came from can be used as a control for batch + effect correction. + 0: cannot be used as a control. + >= 1: can be used as a control. + For cells with >= 1: cells with the same value come from the same donor. + Different values indicate different donors. + required: true var: - type: integer name: numeric_id @@ -64,11 +74,6 @@ info: name: to_correct description: Whether the marker will be batch corrected required: true - # obsm: - # - type: double - # name: X_pca - # description: The resulting PCA embedding. - # required: true uns: - type: string name: dataset_id diff --git a/src/data_processors/process_dataset/script.py b/src/data_processors/process_dataset/script.py index 2ec82c0..e41720d 100644 --- a/src/data_processors/process_dataset/script.py +++ b/src/data_processors/process_dataset/script.py @@ -27,13 +27,12 @@ adata = ad.read_h5ad(par["input"]) print("input:", adata) -validation_names = par["validation_sample_names"] or [] -is_validation = adata.obs["sample"].isin(validation_names) +print(">> Creating unintegrated data", flush=True) +adata_unintegrated = adata[adata.obs.is_validation==False] -print(">> Creating train data", flush=True) output_unintegrated = subset_h5ad_by_format( - adata[[not x for x in is_validation]], + adata_unintegrated, config, "output_unintegrated" ) @@ -41,15 +40,18 @@ print(">> Creating test data", flush=True) output_unintegrated_censored = subset_h5ad_by_format( - adata[[not x for x in is_validation]], + adata_unintegrated, config, "output_unintegrated_censored" ) print(f"output_unintegrated_censored: 
{output_unintegrated_censored}") -print(">> Creating solution data", flush=True) +print(">> Creating validation data", flush=True) + +adata_validation = adata[adata.obs.is_validation==True] + output_validation = subset_h5ad_by_format( - adata[is_validation], + adata_validation, config, "output_validation" ) diff --git a/src/metrics/emd_per_samples/config.vsh.yaml b/src/metrics/emd_per_samples/config.vsh.yaml index e5f8214..0e98642 100644 --- a/src/metrics/emd_per_samples/config.vsh.yaml +++ b/src/metrics/emd_per_samples/config.vsh.yaml @@ -47,8 +47,8 @@ info: test_setup: starter_file: samples_to_compare: - - Tube1_Batch1_WT - - Tube1_Batch2_WT + - Mouse1_Batch1_WT + - Mouse1_Batch2_WT # Component-specific parameters (optional) arguments: @@ -88,4 +88,4 @@ runners: # Allows turning the component into a Nextflow module / pipeline. - type: nextflow directives: - label: [midtime,midmem,midcpu] + label: [midtime,midmem,midcpu] \ No newline at end of file diff --git a/src/metrics/emd_per_samples/script.py b/src/metrics/emd_per_samples/script.py index e21bf7a..81b0d0e 100644 --- a/src/metrics/emd_per_samples/script.py +++ b/src/metrics/emd_per_samples/script.py @@ -6,8 +6,9 @@ # in config.vsh.yaml and then run `viash config inject config.vsh.yaml`. 
par = { "input_integrated": "resources_test/task_cyto_batch_integration/starter_file/integrated.h5ad", + "input_unintegrated": "resources_test/task_cyto_batch_integration/starter_file/unintegrated.h5ad", "output": "output.h5ad", - "samples_to_compare": "Tube1_Batch1_WT,Tube1_Batch2_WT", + "samples_to_compare": ["Mouse1_Batch1_WT", "Mouse1_Batch2_WT"], "layer": "integrated", } meta = {"name": "emd_per_samples"} @@ -22,7 +23,11 @@ layer = par["layer"] -markers_to_assess = input_integrated.var[input_integrated.var["to_correct"]].index.to_numpy() +# markers_to_assess = input_integrated.var[ +# input_integrated.var["to_correct"] +# ].index.to_numpy() + +markers_to_assess = input_integrated.var.index.to_numpy() print("Compute metrics", flush=True) @@ -30,7 +35,8 @@ # Otherwise the _calculate_emd_per_frame used in cytonormpy will error because they # harcoded the column file_name and use it in assert. # See line 176 of https://github.com/TarikExner/CytoNormPy/blob/main/cytonormpy/_evaluation/_emd_utils.py#L173 -input_integrated.obs["file_name"] = input_integrated.obs["sample"] +# stop gap for now. This script will be overridden in the branch that handles emd anyway. +input_integrated.obs["file_name"] = input_unintegrated.obs["sample"] df = cnp.emd_from_anndata( adata=input_integrated, @@ -42,7 +48,11 @@ uns_metric_ids = [f"EMD_per_samples_{x}" for x in df.columns] uns_metric_values = df.loc["all_cells"].to_numpy() -uns_method_id = input_integrated.uns["method_id"] if "method_id" in input_integrated.uns else "unintegrated" +uns_method_id = ( + input_integrated.uns["method_id"] + if "method_id" in input_integrated.uns + else "unintegrated" +) print("Write output AnnData to file", flush=True)