diff --git a/ChangeLog.txt b/ChangeLog.txt index 15728d3d..42f6b11a 100644 --- a/ChangeLog.txt +++ b/ChangeLog.txt @@ -1,3 +1,5 @@ +v0.29.4 +- Add a python script to identify de novo calls - MANTA-285 add option to keep all temp files to support workflow debug - MANTA-276 mv configure to top-level, mv guides to docs dir, add methods docs - MANTA-284 improve windows shell support by shortening very long cmdlines diff --git a/docs/userGuide/README.md b/docs/userGuide/README.md index 515bed4f..743e8db3 100644 --- a/docs/userGuide/README.md +++ b/docs/userGuide/README.md @@ -606,6 +606,23 @@ minEdgeObservations = 2 minCandidateSpanningCount = 2 ``` +### De novo calling + +Manta can be used for de novo calling, following a two-step procedure: + +1) Manta can take multiple input bams for the normal +sample, each bam file being treated as a separate sample as part of a +joint diploid sample analysis, and then output a multi-sample vcf, where the sample order follows that of the input bams. + +2) A post-processing script, provided as $MANTA_INSTALL_FOLDER/libexec/denovo_scoring.py, can be applied to the multi-sample vcf and detects SVs that have inheritance conflicts among a trio sample set. + + The script usage is + denovo_scoring.py + It will ignore any samples in the vcf that are not specified at the commandline. + + Under the same folder of the input vcf, the script outputs a new vcf file and a text file of stats for the de novo calls. Currently, all SVs with inheritance conflicts are labled with "DQ=60" inside the INFO, while all SVs without any conflict are labled with "DQ=0". + + [1]: http://www.1000genomes.org/wiki/Analysis/Variant%20Call%20Format/vcf-variant-call-format-version-41 -[2]: http://sequencing.github.io/pyflow/ +[2]: http://sequencing.github.io/pyflow/ \ No newline at end of file diff --git a/src/python/libexec/denovo_scoring.py b/src/python/libexec/denovo_scoring.py new file mode 100755 index 00000000..f12c0611 --- /dev/null +++ b/src/python/libexec/denovo_scoring.py @@ -0,0 +1,181 @@ +#!/usr/bin/env python +# +# +# Manta - Structural Variant and Indel Caller +# Copyright (c) 2013-2015 Illumina, Inc. +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see . +# +# + + +import sys +from os import path +from os.path import exists, abspath, dirname, basename, splitext, join + + +def check_genotype(probandGT, fatherGT, motherGT): + isConsistent = False + + fatherGTItems = fatherGT.split('/') + motherGTItems = motherGT.split('/') + for it1 in fatherGTItems: + for it2 in motherGTItems: + temp = [it1, it2] + temp.sort() + GT = temp[0]+'/'+temp[1] + if GT == probandGT: + isConsistent = True + break + + return isConsistent + + + +def process_vcf(vcfFile, probandID, + fatherID, motherID): + + vcfFile = abspath(vcfFile) + dataDir = dirname(vcfFile) + filePrefix = splitext(basename(vcfFile))[0] + outFile = join(dataDir, filePrefix+".de_novo.vcf") + statsFile = join(dataDir, filePrefix+".de_novo.stats.txt") + + fpOut = open(outFile, 'wb') + fpStats = open(statsFile, 'wb') + + countPassed = 0 + countFiltered = 0 + consistencyDict = {} + + # parser + isInfoAdded = False + isIxFound = False + colNameLine = "" + probandIx = -1 + fatherIx = -1 + motherIx = -1 + + fpVcf = open(vcfFile, 'rb') + for line in fpVcf: + if line[0] == '#': + if not(isInfoAdded) and (line[:8] == "##FORMAT"): + fpOut.write("##INFO=\n") + isInfoAdded = True + fpOut.write(line) + colNameLine = line + continue + elif not(isIxFound): + # parse format line to get the columns of proband & parents + tokens = colNameLine.split() + for ix in xrange(len(tokens)): + if tokens[ix] == probandID: + probandIx = ix + elif tokens[ix] == fatherID: + fatherIx = ix + elif tokens[ix] == motherID: + motherIx = ix + + wrongID = "" + if probandIx == -1: + wrongID = probandID + if fatherIx == -1: + wrongID += (',%s' % fatherID) + if motherIx == -1: + wrongID += (',%s' % motherID) + + if wrongID: + errMsg = ('The sample ID %s does not exist in the vcf.' + % wrongID) + sys.stderr.write(errMsg + '\nProgram exits.') + sys.exit(1) + + tokens = line.split() + info = tokens[7] + format = tokens[8] + + items = format.split(':') + GTix = -1 + for ix in xrange(len(items)): + if items[ix] == "GT": + GTix = ix + + items = tokens[probandIx].split(':') + probandGT = items[GTix] + + items = tokens[fatherIx].split(':') + fatherGT = items[GTix] + + items = tokens[motherIx].split(':') + motherGT = items[GTix] + + isConsistent = check_genotype(probandGT, fatherGT, motherGT) + if not(isConsistent): + info += ";PG=60" + + # stats + filter = tokens[6] + if filter.upper() == "PASS": + countPassed += 1 + else: + countFiltered += 1 + + GTstring = probandGT + '-' + fatherGT + '-' + motherGT + if not(GTstring in consistencyDict): + consistencyDict[GTstring] = 0 + consistencyDict[GTstring] += 1 + else: + info += ";PG=0" + + newLine = "" + for i in xrange(7): + newLine += tokens[i] + "\t" + newLine += info + for i in xrange(8, len(tokens)): + newLine += "\t" + tokens[i] + fpOut.write(newLine+"\n") + + fpVcf.close() + fpOut.close() + + fpStats.write("# of passed SVs: %s\n" % (countPassed)) + fpStats.write("# of filtered SVs: %s\n" % (countFiltered)) + fpStats.write("probandGT-fatherGT-motherGT\tcounts\n") + genotypes = consistencyDict.keys() + genotypes.sort() + for gt in genotypes: + fpStats.write("%s\t%s\n" % (gt, consistencyDict[gt])) + fpStats.close() + + +if __name__=='__main__': + + usage = "denovo_scoring.py \n" + if len(sys.argv) <= 4: + sys.stderr.write(usage) + sys.exit(1) + + vcfFile = sys.argv[1] + probandID = sys.argv[2] + fatherID = sys.argv[3] + motherID = sys.argv[4] + + if not(exists(vcfFile)): + errMsg = ('The file %s does not exist.' + % vcfFile) + sys.stderr.write(errMsg + '\nProgram exits.') + sys.exit(1) + + process_vcf(vcfFile, probandID, + fatherID, motherID)