Skip to content

Commit

Permalink
Add new IgDiscover test cases. Try to handle alignment of IgDiscover
Browse files Browse the repository at this point in the history
output better
  • Loading branch information
williamdlees committed May 12, 2023
1 parent 69c6fe6 commit ec774fa
Show file tree
Hide file tree
Showing 3 changed files with 49 additions and 1 deletion.
11 changes: 10 additions & 1 deletion R/genotype_statistics.R
Original file line number Diff line number Diff line change
Expand Up @@ -411,7 +411,11 @@ read_input_sequences = function(filename, segment, chain_type) {
}

if ('CDR3_START' %in% col_names) {
s$CDR3_START = s$CDR3_START - s$FWR1_START + 1 # make them indeces into the V sequence
s$CDR3_START = s$CDR3_START - s$FWR1_START # make them indeces into the V sequence
if (dimnames(sort(table(substring(s$SEQUENCE_IMGT, s$CDR3_START, s$CDR3_START+2)),decreasing=TRUE))[[1]][1] == 'TGT') {
# we've actually found the junction not the CDR3!
s$CDR3_START = s$CDR3_START + 3
}
s$JUNCTION_START = s$CDR3_START - 3
names(s)[names(s) == 'SEQUENCE_IMGT'] = 'SEQUENCE'
}
Expand Down Expand Up @@ -572,6 +576,11 @@ calc_haplo_details = function(segment, input_sequences) {
'X'
}
})

if (nrow(sa) == 0) {
return(list())
}

sa$a_gene = factor(sa$a_gene, sort_alleles(unique(sa$a_gene)))

su = select(sa, A_CALL, a_gene, a_allele)
Expand Down
7 changes: 7 additions & 0 deletions R/sequence_stats.R
Original file line number Diff line number Diff line change
Expand Up @@ -96,6 +96,13 @@ apply_gaps = function(seq, tem) {
# It will not handle indels, but it will try to spot them and set the aligned sequence to NA
# junction_start is the location of the first nucleotide of the cysteine preceding the CRD3
# this is location 310 in the IMGT numbering scheme
#
# NOTE - as at v0.15.1, IgDiscover does not always find the junction accurately because
# it is using regular expression matching rather than IgBLAST junction analysis. This can
# drive the alignment awry. IgDiscover-inferred alleles that have no unmutated reads
# according to ogrdbstats are a symptom of this. Workaround is to re-annotate using IgBLAST
# and run ogrdbstats on the IgBLAST output rather than running it on IgDiscover output.

imgt_gap = function(sequence, cdr3_sequence, junction_start, ref_gene) {
if(is.na(ref_gene) || is.na(cdr3_sequence) || is.na(sequence) || is.na(junction_start)) {
return(NA)
Expand Down
32 changes: 32 additions & 0 deletions testdata/run_tests.R
Original file line number Diff line number Diff line change
Expand Up @@ -188,6 +188,8 @@ genotype_statistics_test = function(testdir, full=F) {
# VH - IgDiscover 1.51
# Recent versions of IgDiscover use IgBLAST output in filtered.tsv

# VH - IgDiscover 1.51
# Recent versions of IgDiscover use IgBLAST output in filtered.tsv
report('VK_igdiscover_151')
setwd('VK_igdiscover_151')
ref_filename = 'V_ref_gapped.fasta'
Expand All @@ -203,5 +205,35 @@ genotype_statistics_test = function(testdir, full=F) {
generate_ogrdb_report(ref_filename, inferred_filename, species, filename, chain, hap_gene, segment, chain_type, FALSE)
setwd('..')


report('VL_igdiscover_151')
setwd('VL_igdiscover_151')
ref_filename = 'V_gapped.fasta'
species = 'Homosapiens'
#inferred_filename = 'V_head.fasta'
#filename = 'filtered_1_01.tsv'
inferred_filename = 'V.fasta'
filename = 'filtered.tsv'
chain = 'IGLV'
hap_gene = ''
segment = 'V'
chain_type = 'L'
generate_ogrdb_report(ref_filename, inferred_filename, species, filename, chain, hap_gene, segment, chain_type, FALSE)
setwd('..')

report('VL_mouse_igblast')
setwd('VL_mouse_igblast')
ref_filename = 'V_gapped.fasta'
species = 'Mouse'
filename = 'annotated_head.tsv'
#filename = 'annotated_full.tsv'
inferred_filename = '-'
chain = 'IGLV'
hap_gene = NA
segment = 'V'
chain_type = 'L'
generate_ogrdb_report(ref_filename, inferred_filename, species, filename, chain, hap_gene, segment, chain_type, TRUE)
setwd('..')

}
}

0 comments on commit ec774fa

Please sign in to comment.