Skip to content

Commit

Permalink
add maskprimers extract option
Browse files Browse the repository at this point in the history
  • Loading branch information
ggabernet committed Aug 20, 2024
1 parent ec62649 commit d223b8a
Show file tree
Hide file tree
Showing 13 changed files with 351 additions and 110 deletions.
1 change: 1 addition & 0 deletions .github/workflows/ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -64,6 +64,7 @@ jobs:
"test_nebnext_umi",
"test_rnaseq_bulk",
"test_rnaseq_sc",
"test_maskprimers_extract"
]
fail-fast: false
steps:
Expand Down
19 changes: 18 additions & 1 deletion CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,24 @@
The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/)
and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.html).

## [4.1.0] -
## [4.2.0] - dev

### `Added`

- [#334](https://github.com/nf-core/airrflow/pull/334) Added TRUST4 support.
- Added option to extract V and C primers when sequence is unknown.

### `Fixed`

- Avoid saving pRESTO intermediate fastq files in results directory.
- Simplified pRESTO Maskprimers score and Maskprimers extract processes.

### `Dependencies`

### `Deprecated parameters`


## [4.1.0] - Avenseguim

### `Added`

Expand Down
10 changes: 5 additions & 5 deletions assets/repertoire_comparison.Rmd
Original file line number Diff line number Diff line change
Expand Up @@ -74,13 +74,13 @@ tryCatch( {
write.table(tab_seqs, file=paste0(seq_dir,"/Table_sequences_assembly.tsv"), sep="\t", quote=F, row.names=F)
plot_table <- tidyr::pivot_longer(tab_seqs,
cols=Sequences_R1:Igblast,
cols=Sequences:Igblast,
names_to = "steps",
values_to ="count")
plot_table$steps <- factor(plot_table$steps, levels=c("Sequences_R1", "Sequences_R2", "Filtered_quality_R1",
"Filtered_quality_R2", "Mask_primers_R1", "Mask_primers_R2",
"Paired", "Build_consensus", "Assemble_pairs", "Unique",
"Representative_2", "Igblast"))
firstcol = which(colnames(tab_seqs) == "Sequences")
lastcol = which(colnames(tab_seqs) == "Igblast")
plot_table$steps <- factor(plot_table$steps, levels=colnames(tab_seqs)[firstcol:lastcol])
seqs_plot <- ggplot(data=plot_table,
aes(x=steps, y=count, group=sample_id)) +
geom_line(aes(colour=sample_id)) +
Expand Down
4 changes: 1 addition & 3 deletions bin/log_parsing.py
Original file line number Diff line number Diff line change
Expand Up @@ -369,8 +369,7 @@

colnames = [
"Sample",
"Sequences_R1",
"Sequences_R2",
"Sequences",
"Filtered_quality_R1",
"Filtered_quality_R2",
"Mask_primers_R1",
Expand All @@ -388,7 +387,6 @@
values = [
df_process_list[2].sort_values(by=["Sample"]).iloc[:, 0].tolist(),
df_process_list[0].sort_values(by=["Sample"]).pivot(index="Sample", columns="readtype")["start"]["R1"].tolist(),
df_process_list[0].sort_values(by=["Sample"]).pivot(index="Sample", columns="readtype")["start"]["R2"].tolist(),
df_process_list[0].sort_values(by=["Sample"]).pivot(index="Sample", columns="readtype")["pass"]["R1"].tolist(),
df_process_list[0].sort_values(by=["Sample"]).pivot(index="Sample", columns="readtype")["pass"]["R2"].tolist(),
df_process_list[1].sort_values(by=["Sample"]).pivot(index="Sample", columns="readtype")["pass"]["R1"].tolist(),
Expand Down
104 changes: 62 additions & 42 deletions bin/log_parsing_no-umi.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
#!/usr/bin/env python3
# Written by David Ladd and released under the MIT license (2020).
# Written by David Ladd, Gisela Gabernet and released under the MIT license (2020).

# log_parsing_no-umi.py
# Parsing log files for each of the steps for QC analysis.
Expand All @@ -23,7 +23,7 @@
df_process_list = []

for process in processes:
find = subprocess.check_output(["find", process, "-name", "*command_log.txt"])
find = subprocess.check_output(["find", process, "-name", "*command_log*"])
log_files = find.decode().split("\n")
log_files = list(filter(None, log_files))

Expand Down Expand Up @@ -59,49 +59,67 @@

df_process_list.append(df_process)

elif process in ["mask_primers", "filter_by_sequence_quality"]:
if process in ["filter_by_sequence_quality"]:
s_code = []
pass_pairs = []
fail_pairs = []
process_name = []

for logfile in log_files:
with open(logfile, "r") as f:
for line in f:
if " START>" in line:
s_code.append(logfile.split("/")[1].split("_command_log")[0])
process_name.append(process)
elif "PASS>" in line:
pass_pairs.append(line.strip().removeprefix("PASS> "))
elif "FAIL>" in line:
fail_pairs.append(line.strip().removeprefix("FAIL> "))

df_process = pd.DataFrame.from_dict(
{
"Sample": s_code,
"pass_pairs": pass_pairs,
"fail_pairs": fail_pairs,
"process": process_name,
}
)

df_process_list.append(df_process)

elif process in ["mask_primers"]:
s_code = []
s_readtype = []
output_file = []
seqs_R1 = []
seqs_R2 = []
pass_R1 = []
pass_R2 = []
fail_R1 = []
fail_R2 = []
n_seqs = []
n_pass = []
n_fail = []
process_name = []

for logfile in log_files:
c = 0
if "_R1" in logfile:
s_readtype.append("R1")
elif "_R2" in logfile:
s_readtype.append("R2")
with open(logfile, "r") as f:
# print(f.read())
for line in f:
if " START>" in line:
if c < 1:
s_code.append(logfile.split("/")[1].split("_command_log")[0])
process_name.append(process)
s_code.append(logfile.split("/")[1].split("_command_log")[0])
process_name.append(process)
elif "SEQUENCES>" in line:
if c < 1:
seqs_R1.append(line.strip().removeprefix("SEQUENCES> "))
else:
seqs_R2.append(line.strip().removeprefix("SEQUENCES> "))
n_seqs.append(line.strip().removeprefix("SEQUENCES> "))
elif "PASS>" in line:
if c < 1:
pass_R1.append(line.strip().removeprefix("PASS> "))
else:
pass_R2.append(line.strip().removeprefix("PASS> "))
n_pass.append(line.strip().removeprefix("PASS> "))
elif "FAIL>" in line:
if c < 1:
fail_R1.append(line.strip().removeprefix("FAIL> "))
c += 1
else:
fail_R2.append(line.strip().removeprefix("FAIL> "))
n_fail.append(line.strip().removeprefix("FAIL> "))

df_process = pd.DataFrame.from_dict(
{
"Sample": s_code,
"start_R1": seqs_R1,
"pass_R1": pass_R1,
"fail_R1": fail_R1,
"readtype": s_readtype,
"start": n_seqs,
"pass": n_pass,
"fail": n_fail,
"process": process_name,
}
)
Expand Down Expand Up @@ -282,23 +300,13 @@
"Sequences",
"Assemble_pairs",
"Filtered_quality",
"Mask_primers",
"Mask_primers_R1",
"Mask_primers_R2",
"Unique",
"Representative_2",
"Igblast",
]

values = [
df_process_list[0].sort_values(by=["Sample"]).iloc[:, 0].tolist(),
df_process_list[0].sort_values(by=["Sample"]).loc[:, "start_pairs"].tolist(),
df_process_list[0].sort_values(by=["Sample"]).loc[:, "pass_pairs"].tolist(),
df_process_list[1].sort_values(by=["Sample"]).loc[:, "pass_R1"].tolist(),
df_process_list[2].sort_values(by=["Sample"]).loc[:, "pass_R1"].tolist(),
df_process_list[3].sort_values(by=["Sample"]).loc[:, "unique"].tolist(),
df_process_list[4].sort_values(by=["Sample"]).loc[:, "repres_2"].tolist(),
df_process_list[4].sort_values(by=["Sample"]).loc[:, "pass_igblast"].tolist(),
]

# Tables provide extra info and help debugging
df_process_list[0].to_csv(
path_or_buf="Table_all_details_assemble_mates.tsv",
Expand All @@ -316,6 +324,18 @@
df_process_list[3].to_csv(path_or_buf="Table_all_details_deduplicate.tsv", sep="\t", header=True, index=False)
df_process_list[4].to_csv(path_or_buf="Table_all_details_igblast.tsv", sep="\t", header=True, index=False)

values = [
df_process_list[0].sort_values(by=["Sample"]).iloc[:, 0].tolist(),
df_process_list[0].sort_values(by=["Sample"]).loc[:, "start_pairs"].tolist(),
df_process_list[0].sort_values(by=["Sample"]).loc[:, "pass_pairs"].tolist(),
df_process_list[1].sort_values(by=["Sample"]).loc[:, "pass_pairs"].tolist(),
df_process_list[2].sort_values(by=["Sample"]).pivot(index="Sample", columns="readtype")["pass"]["R1"].tolist(),
df_process_list[2].sort_values(by=["Sample"]).pivot(index="Sample", columns="readtype")["pass"]["R2"].tolist(),
df_process_list[3].sort_values(by=["Sample"]).loc[:, "unique"].tolist(),
df_process_list[4].sort_values(by=["Sample"]).loc[:, "repres_2"].tolist(),
df_process_list[4].sort_values(by=["Sample"]).loc[:, "pass_igblast"].tolist(),
]

final_table = dict(zip(colnames, values))
print(final_table)
df_final_table = pd.DataFrame.from_dict(final_table)
Expand Down
3 changes: 2 additions & 1 deletion conf/modules.config
Original file line number Diff line number Diff line change
Expand Up @@ -54,7 +54,7 @@ process {
ext.args = [ "--disable_quality_filtering --disable_length_filtering",
params.trim_fastq ?: "--disable_adapter_trimming",
params.clip_r1 > 0 ? "--trim_front1 ${params.clip_r1}" : "", // Remove bp from the 5' end of read 1
params.clip_r2 > 0 ? "--trim_front2 ${params.clip_r2}" : "", // Remove bp from the 5' end of read 2
params.clip_r2 > 0 ? "--trim_front2 ${params.clip_r2}" : "", // Remove bp from the 5' end of read 2
params.three_prime_clip_r1 > 0 ? "--trim_tail1 ${params.three_prime_clip_r1}" : "", // Remove bp from the 3' end of read 1 AFTER adapter/quality trimming has been performed
params.three_prime_clip_r2 > 0 ? "--trim_tail2 ${params.three_prime_clip_r2}" : "", // Remove bp from the 3' end of read 2 AFTER adapter/quality trimming has been performed
params.trim_nextseq ? "--trim_poly_g" : "", // Apply the --nextseq=X option, to trim based on quality after removing poly-G tails
Expand Down Expand Up @@ -136,6 +136,7 @@ process {
mode: params.publish_dir_mode,
pattern: "*{txt,log,tab}"
]
ext.args2 = "-f ID PRIMER ERROR"
}

withName: PRESTO_MASKPRIMERS_ALIGN {
Expand Down
52 changes: 52 additions & 0 deletions conf/test_maskprimers_extract.config
Original file line number Diff line number Diff line change
@@ -0,0 +1,52 @@
/*
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Nextflow config file for running minimal tests
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Defines input files and everything required to run a fast and simple pipeline test.
Use as follows:
nextflow run nf-core/airrflow -profile test,<docker/singularity> --outdir <OUTDIR>
----------------------------------------------------------------------------------------
*/

params {
config_profile_name = 'Test extract primers profile'
config_profile_description = 'Minimal test dataset to check pipeline function'

// Limit resources so that this can run on GitHub Actions
max_cpus = 2
max_memory = '6.GB'
max_time = '6.h'

// Input data
input = pipelines_testdata_base_path + 'testdata-bcr/Metadata_test_airr.tsv'
cprimers = pipelines_testdata_base_path + 'testdata-bcr/C_primers.fasta'
vprimers = pipelines_testdata_base_path + 'testdata-bcr/V_primers.fasta'
reference_fasta = pipelines_testdata_base_path + 'database-cache/imgtdb_base.zip'
reference_igblast = pipelines_testdata_base_path + 'database-cache/igblast_base.zip'

mode = 'fastq'

library_generation_method = 'specific_pcr_umi'
maskprimers_extract = true
cprimer_position = 'R1'
umi_length = 8
umi_start = 6
umi_position = 'R1'
index_file = true
isotype_column = 'c_primer'
lineage_trees = false
primer_r1_mask_mode = 'trim'
primer_r2_mask_mode = 'cut'
primer_r1_extract_len = 28
primer_r2_extract_len = 29
}

process{
withName:"DEFINE_CLONES*"{
ext.args = ['outname':'', 'model':'hierarchical',
'method':'nt', 'linkage':'single',
'outputby':'sample_id', 'min_n':10]
}
}
27 changes: 16 additions & 11 deletions modules/local/presto/presto_maskprimers_extract.nf
Original file line number Diff line number Diff line change
Expand Up @@ -9,31 +9,36 @@ process PRESTO_MASKPRIMERS_EXTRACT {
'biocontainers/presto:0.7.1--pyhdfd78af_0' }"

input:
tuple val(meta), path(R2)
val(extract_start),
val(extract_length),
tuple val(meta), path(read)
val(extract_start)
val(extract_length)
val(extract_mode)
val(barcode)
val(suffix)

output:
tuple val(meta), path("*_R2_primers-pass.fastq") , emit: reads
path "*_command_log_R2.txt", emit: logs
path "*_R2.log"
tuple val(meta), path("*_primers-pass.fastq") , emit: reads
path "*.txt", emit: logs
path "*.log"
path "*.tab", emit: log_tab
path "versions.yml" , emit: versions

script:
def args = task.ext.args?: ''
def args2 = task.ext.args2?: ''
def barcode = barcode ? '--barcode' : ''
"""
MaskPrimers.py extract --nproc ${task.cpus} \\
-s $R2 \\
MaskPrimers.py extract \\
--nproc ${task.cpus} \\
-s $read \\
--start ${extract_start} \\
--len ${extract_length} \\
$barcode \\
$args \\
--mode ${extract_mode} \\
--outname ${meta.id}_R2 \\
--log ${meta.id}_R2.log >> ${meta.id}_command_log_R2.txt
ParseLog.py -l ${meta.id}_R2.log $args2
--outname ${meta.id}_${suffix} \\
--log ${meta.id}_${suffix}.log >> ${meta.id}_command_log_${suffix}.txt
ParseLog.py -l *.log $args2
cat <<-END_VERSIONS > versions.yml
"${task.process}":
Expand Down
24 changes: 19 additions & 5 deletions modules/local/presto/presto_maskprimers_score.nf
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
process PRESTO_MASKPRIMERS {
process PRESTO_MASKPRIMERS_SCORE {
tag "$meta.id"
label "process_high"
label 'immcantation'
Expand All @@ -11,6 +11,7 @@ process PRESTO_MASKPRIMERS {
input:
tuple val(meta), path(read)
path(primers)
val(primer_start)
val(barcode)
val(primer_maxerror)
val(primer_mask_mode)
Expand All @@ -19,17 +20,30 @@ process PRESTO_MASKPRIMERS {

output:
tuple val(meta), path("*_primers-pass.fastq"), emit: reads
path "*_command_log.txt", emit: logs
path "*.txt", emit: logs
path "*.log"
path "*.tab", emit: log_tab
path "versions.yml" , emit: versions


script:
def revpr = params.reverse_primers ? '--revpr' : ''
def args = task.ext.args?: ''
def args2 = task.ext.args2?: ''
def revpr = reverse_primers ? '--revpr' : ''
def barcode = barcode ? '--barcode' : ''
"""
MaskPrimers.py score --nproc ${task.cpus} -s $read -p ${primers} $barcode $revpr --maxerror ${primer_maxerror} --mode ${primer_mask_mode} --outname ${meta.id}_${suffix} --log ${meta.id}_${suffix}.log > ${meta.id}_${suffix}_command_log.txt
ParseLog.py -l *.log -f ID PRIMER ERROR
MaskPrimers.py score \\
--nproc ${task.cpus} \\
-s $read \\
-p ${primers} \\
--maxerror ${primer_maxerror} \\
--mode ${primer_mask_mode} \\
--start ${primer_start} \\
$barcode $revpr \\
$args \\
--outname ${meta.id}_${suffix} \\
--log ${meta.id}_${suffix}.log > ${meta.id}_command_log_${suffix}.txt
ParseLog.py -l *.log $args2
cat <<-END_VERSIONS > versions.yml
"${task.process}":
Expand Down
Loading

0 comments on commit d223b8a

Please sign in to comment.