From aa964c9ff78f5291e4c3bd7228c2003200587761 Mon Sep 17 00:00:00 2001
From: Bonita Lam
Date: Fri, 27 Aug 2021 10:45:46 -0700
Subject: [PATCH] ENCD-6099-audit-index-reads-no-read-structure (#4071)

---
Review notes (this section is ignored by `git am`):
* Line breaks and indentation in the audit/file.py and changelogs/file.md
  hunks were rebuilt from a copy of this patch whose whitespace had been
  collapsed. Context-line indentation is a best guess; check it against
  the target tree before applying.
* audit_index_reads_read_structure was simplified (single truthiness
  check, guard clause, docstring), so the diffstat and hunk headers for
  audit/file.py were recomputed and the post-image blob id on its index
  line is stale.

 src/encoded/audit/file.py                | 15 +++++++++++++++
 src/encoded/schemas/changelogs/file.md   |  1 +
 src/encoded/schemas/file.json            |  2 ++
 src/encoded/tests/data/inserts/file.json | 10 ++++++++++
 src/encoded/tests/test_audit_file.py     | 22 ++++++++++++++++++++++
 5 files changed, 50 insertions(+)

diff --git a/src/encoded/audit/file.py b/src/encoded/audit/file.py
index b0a58a96840..8a7bffb100e 100644
--- a/src/encoded/audit/file.py
+++ b/src/encoded/audit/file.py
@@ -629,6 +629,20 @@ def audit_file_index_of(value, system):
             )
             yield AuditFailure('inconsistent index file', detail, level='ERROR')
 
+
+def audit_index_reads_read_structure(value, system):
+    """Flag 'index reads' files whose read_structure is missing or empty.
+
+    Yields one 'missing read structure' AuditFailure at the INTERNAL_ACTION
+    level for such files and nothing otherwise. `system` is unused here.
+    """
+    # An absent key and an empty list are both falsy, so one check covers both.
+    if value['output_type'] != 'index reads' or value.get('read_structure'):
+        return
+    link = audit_link(path_to_text(value['@id']), value['@id'])
+    detail = f'Index file {link} is missing read structure information.'
+    yield AuditFailure('missing read structure', detail, level='INTERNAL_ACTION')
+
 function_dispatcher = {
     'audit_step_run': audit_file_processed_step_run,
     'audit_derived_from': audit_file_processed_derived_from,
@@ -643,6 +657,7 @@ def audit_file_index_of(value, system):
     'audit_read_structure': audit_read_structure,
     'audit_file_matching_md5sum': audit_file_matching_md5sum,
     'audit_file_index_of': audit_file_index_of,
+    'audit_index_reads_read_structure': audit_index_reads_read_structure,
 }
 
 
diff --git a/src/encoded/schemas/changelogs/file.md b/src/encoded/schemas/changelogs/file.md
index 8daa20c784c..cc4935293e7 100644
--- a/src/encoded/schemas/changelogs/file.md
+++ b/src/encoded/schemas/changelogs/file.md
@@ -8,6 +8,7 @@
 * Added *fragments* to the enum list for *output_type*, the new output type is to be submitted as tar.gz only
 * Added *sparse peak count matrix* to the enum list for *output_type*
 * Added *functional conservation quantifications* to the enum list for *output_type*
+* Added *cell barcode* and *element barcode* to the enum list for *sequence_element* of the *read_structure* property
 
 ### Schema version 29
 * Added Ultima Genomics platform to dependency that enforces that *read_length* property should not be specified for FASTQ files produced by Oxford Nanopore and PacBio platforms.
diff --git a/src/encoded/schemas/file.json b/src/encoded/schemas/file.json
index 94def647032..c8ed9852744 100644
--- a/src/encoded/schemas/file.json
+++ b/src/encoded/schemas/file.json
@@ -1996,6 +1996,8 @@
                         "enum": [
                             "adapter",
                             "barcode",
+                            "cell barcode",
+                            "element barcode",
                             "UMI"
                         ]
                     },
diff --git a/src/encoded/tests/data/inserts/file.json b/src/encoded/tests/data/inserts/file.json
index a761581dfaf..c8d53bdc666 100644
--- a/src/encoded/tests/data/inserts/file.json
+++ b/src/encoded/tests/data/inserts/file.json
@@ -2808,6 +2808,16 @@
                 "sequence_element": "UMI",
                 "start": 13,
                 "end": 18
+            },
+            {
+                "sequence_element": "cell barcode",
+                "start": 19,
+                "end": 25
+            },
+            {
+                "sequence_element": "element barcode",
+                "start": 26,
+                "end": 33
             }
         ]
     },
diff --git a/src/encoded/tests/test_audit_file.py b/src/encoded/tests/test_audit_file.py
index b7ddd852c4c..1df86a070fb 100644
--- a/src/encoded/tests/test_audit_file.py
+++ b/src/encoded/tests/test_audit_file.py
@@ -1127,3 +1127,25 @@ def test_audit_incorrect_index(testapp,
     assert any(error['category'] == 'inconsistent index file' and
                'multiple non-Illumina fastq' in error['detail']
                for error in errors_list)
+
+
+def test_audit_index_reads_read_structure(testapp, fastq_index):
+    """Audit fires on the fixture and clears once read_structure is set."""
+    category = 'missing read structure'
+    index_data_url = fastq_index['@id'] + '@@index-data'
+    barcode = {'sequence_element': 'cell barcode', 'start': 1, 'end': 20}
+
+    # The test expects the unpatched fixture to be flagged.
+    audits = testapp.get(index_data_url).json['audit']
+    assert any(
+        err['category'] == category
+        for errs in audits.values() for err in errs
+    )
+
+    # After read_structure is populated the audit must no longer appear.
+    testapp.patch_json(fastq_index['@id'], {'read_structure': [barcode]})
+    audits = testapp.get(index_data_url).json['audit']
+    assert not any(
+        err['category'] == category
+        for errs in audits.values() for err in errs
+    )