From cfed81128559e2028509ce464bcbe1e0e6eedddf Mon Sep 17 00:00:00 2001 From: alumi Date: Fri, 20 Dec 2024 23:33:38 +0900 Subject: [PATCH] feat: Add a function to read mate alignments --- src/cljam/io/bam/reader.clj | 78 +++++++++++++++++++++++++- src/cljam/io/bam_index.clj | 12 ++++ src/cljam/io/protocols.clj | 2 + src/cljam/io/sam.clj | 44 ++++++++++++++- src/cljam/io/util/bin.clj | 46 +++++++++++++++- test/cljam/io/bam/reader_test.clj | 92 +++++++++++++++++++++++++++++++ test/cljam/io/sam_test.clj | 11 ++++ test/cljam/io/util/bin_test.clj | 13 +++++ 8 files changed, 293 insertions(+), 5 deletions(-) create mode 100644 test/cljam/io/bam/reader_test.clj diff --git a/src/cljam/io/bam/reader.clj b/src/cljam/io/bam/reader.clj index be337582..956b279b 100644 --- a/src/cljam/io/bam/reader.clj +++ b/src/cljam/io/bam/reader.clj @@ -3,15 +3,18 @@ (:require [cljam.io.protocols :as protocols] [cljam.io.sam.util.refs :as refs] [cljam.io.sam.util.header :as header] - [cljam.io.bam-index.core :as bai] + [cljam.io.bam-index :as bai] [cljam.io.bam.decoder :as decoder] - [cljam.io.util.lsb.data-io :as lsb]) + [cljam.io.util.lsb.data-io :as lsb] + [cljam.io.sam.util.flag :as flag] + [cljam.io.util.byte-buffer :as bb]) (:import [java.io Closeable FileNotFoundException] [cljam.io.bam.decoder BAMRawBlock] [bgzf4j BGZFInputStream])) (declare read-blocks-sequentially* - read-blocks-randomly*) + read-blocks-randomly* + read-mate-alignments*) ;; BAMReader ;; --------- @@ -47,6 +50,10 @@ (if (nil? chr) (read-blocks-sequentially* this 64 decoder) (read-blocks-randomly* this chr start end 64 decoder)))) + (read-mate-alignments [this alignments] + (protocols/read-mate-alignments this {} alignments)) + (read-mate-alignments [this options alignments] + (read-mate-alignments* this options alignments)) (read-blocks [this] (protocols/read-blocks this {} {})) (read-blocks [this region] @@ -163,3 +170,68 @@ :len l-ref})))))] {:header header :refs refs})) + +(defn- mate?-fn + "Returns a predicate that checks if an alignment `block` is a mate of an + alignment in the given `alignments`." + [alignments] + (let [query (transduce + (map (fn [{:keys [pnext qname flag]}] + [[(flag/r1? flag) pnext (count qname)] qname])) + (completing (fn [r [ks v]] (update-in r ks (fnil conj #{}) v))) + {} + alignments)] + (fn mate? + [^BAMRawBlock block] + (let [b (bb/make-lsb-byte-buffer (.data block)) + flag (bit-and 0xFFFF (.getShort b 14))] + (when (flag/primary? flag) + (when-let [q' (query (not (flag/r1? flag)))] ; check R1/R2 + (when-let [q'' (q' (inc (.getInt b 4)))] ; check POS + (let [l-read-name (dec (bit-and 0xFF (.get b 8)))] + (when-let [q''' (q'' l-read-name)] ; check length of QNAME + (.position b 32) ; offset of qname + (q''' (String. ^bytes (bb/read-bytes b l-read-name)))))))))))) + +(defn- read-mate-alignments-1 + "Reads mate alignments in a specific chr." + [^BAMReader rdr {:keys [chunk-size] :or {chunk-size 64}} [chr xs]] + (let [refs (.refs rdr) + rid (refs/ref-id refs chr)] + (if (= chr "*") + ;; unmapped + (do (.seek ^BGZFInputStream (.reader rdr) + (ffirst (bai/get-unplaced-spans @(.index-delay rdr)))) + (eduction (filter (mate?-fn xs)) (read-to-finish rdr chunk-size))) + ;; mapped + (->> xs + (map (comp (juxt identity identity) :pnext)) + set + (bai/get-multi-spans @(.index-delay rdr) rid) + (eduction + (comp + (mapcat + (fn [[begin finish]] + (read-to-finish rdr begin finish chunk-size))) + (filter (mate?-fn xs)))))))) + +(defn- ref-order [refs] + (into {} + (map-indexed (fn [i m] [(:name m) i])) + (concat refs [{:name "*"}]))) + +(defn- read-mate-alignments* + "Implementation for `cljam.io.sam/read-mate-alignments`. + + The following options are availabe: + + - :chunk-size A size of chunks used for the internal block sequence" + ([^BAMReader rdr alignments] + (read-mate-alignments* rdr {} alignments)) + ([^BAMReader rdr options alignments] + (->> alignments + (group-by (fn [{:keys [rname rnext]}] (case rnext "=" rname rnext))) + (sort-by (comp (ref-order (.refs rdr)) key)) + (eduction + (comp (mapcat (partial read-mate-alignments-1 rdr options)) + (map (partial decoder/decode-alignment (.refs rdr)))))))) diff --git a/src/cljam/io/bam_index.clj b/src/cljam/io/bam_index.clj index 443eaa6b..67d0f802 100644 --- a/src/cljam/io/bam_index.clj +++ b/src/cljam/io/bam_index.clj @@ -19,6 +19,18 @@ [bai ref-idx beg end] (util-bin/get-spans bai ref-idx beg end)) +(defn get-multi-spans + "Returns regions of a BAM file that may contain an alignment for the given + `ranges` which is a sequence of pairs of integers, begin and end." + [bai ^long ref-idx ranges] + (util-bin/get-multi-spans bai ref-idx ranges)) + +(defn get-unplaced-spans + "Returns a sequence of [start end) pairs of virtual file offsets that may + contain alignments that don't have RNAME." + [bai] + (bai-core/get-unplaced-spans bai)) + (defn bam-index "Returns a cljam.bam-index.core.BAMIndex." [f] diff --git a/src/cljam/io/protocols.clj b/src/cljam/io/protocols.clj index eef87248..cbf8d02a 100644 --- a/src/cljam/io/protocols.clj +++ b/src/cljam/io/protocols.clj @@ -31,6 +31,8 @@ "Returns references of the SAM/BAM file.") (read-alignments [this] [this region] "Reads alignments of the SAM/BAM file, returning the alignments as an eduction.") + (read-mate-alignments [this alignments] [this option alignments] + "Reads mate alignments of given `alignments`.") (read-blocks [this] [this region] [this region option] "Reads alignment blocks of the SAM/BAM file, returning the blocks as an eduction.")) diff --git a/src/cljam/io/sam.clj b/src/cljam/io/sam.clj index 5517daaf..fa0a614c 100644 --- a/src/cljam/io/sam.clj +++ b/src/cljam/io/sam.clj @@ -7,7 +7,8 @@ [cljam.io.sam.writer :as sam-writer] [cljam.io.bam.core :as bam-core] [cljam.io.protocols :as protocols] - [cljam.io.util :as io-util]) + [cljam.io.util :as io-util] + [cljam.io.sam.util.flag :as flag]) (:import java.io.Closeable cljam.io.sam.reader.SAMReader cljam.io.sam.writer.SAMWriter @@ -69,6 +70,47 @@ ([rdr] (protocols/read-alignments rdr)) ([rdr region] (protocols/read-alignments rdr region))) +(defn read-mate-alignments + "Returns a sequence of the mate alignments corresponding to the given + `alignments` from the `rdr`. A mate alignment here refers to a primary + alignment that has the same QNAME as the given alignment and is the opposite + R1/R2 pair. The `rdr` must be indexed." + ([rdr alignments] + (protocols/read-mate-alignments rdr alignments)) + ([rdr options alignments] + (protocols/read-mate-alignments rdr options alignments))) + +(defn make-pairs + "Pairs mate alignments from the given `alignments` and `rdr`. + + This is a convenient wrapper for `read-mate-alignments`. This function + processes the alignments provided in the `alignments` sequence, pairing mate + alignments based on their QNAMEs. If a mate alignment is missing from + `alignments`, it attempts to read the mate alignment from the reader `rdr`. + Non-primary alignments are ignored. The order of the paired alignments in the + output is not guaranteed. + + The following options are available: + + - :chunk-size A size of chunks used for the internal block sequence" + ([rdr alignments] + (make-pairs rdr {} alignments)) + ([rdr options alignments] + (letfn [(has-mate? [{:keys [flag]}] + (and (flag/multiple? flag) + (flag/primary? flag)))] + (let [{solos true, + pairs false} (->> alignments + (group-by :qname) + (keep + (fn [[_ vs]] (not-empty (filterv has-mate? vs)))) + (group-by #(= 1 (count %)))) + solos (mapv first solos) + mates (->> solos + (read-mate-alignments rdr options) + (into {} (map (juxt :qname identity))))] + (concat pairs (map (juxt identity (comp mates :qname)) solos)))))) + (defn read-blocks "Reads alignment blocks of the SAM/BAM file, returning the blocks as an eduction." ([rdr] (protocols/read-blocks rdr)) diff --git a/src/cljam/io/util/bin.clj b/src/cljam/io/util/bin.clj index 81114653..110a4c17 100644 --- a/src/cljam/io/util/bin.clj +++ b/src/cljam/io/util/bin.clj @@ -1,5 +1,6 @@ (ns cljam.io.util.bin - (:require [cljam.io.util.chunk :as util-chunk])) + (:require [cljam.io.util.chunk :as util-chunk] + [cljam.util.region :as reg])) (defn max-pos "Returns a maximum position of a binning index. The value is identical to the @@ -101,3 +102,46 @@ min-offset (get-min-offset index-data ref-idx beg)] (->> (util-chunk/optimize-chunks chunks min-offset) (map vals)))) + +(defn get-multi-spans-merge + "Calculates span information for multiple regions. + + Merge query regions before computing bins." + [index-data ^long ref-idx regions] + (when (seq regions) + (let [min-shift (get-min-shift index-data) + depth (get-depth index-data)] + (->> regions + sort + (map (fn [[b e]] {:start b :end e})) + (reg/merge-regions 1000) + (mapcat + (fn [{beg :start end :end}] + (util-chunk/optimize-chunks + (get-chunks index-data ref-idx + (reg->bins beg end min-shift depth)) + (get-min-offset index-data ref-idx beg)))) + (#(util-chunk/optimize-chunks % 0)) + (map vals))))) + +(defn get-multi-spans + "Calculates span information for multiple regions." + [index-data ^long ref-idx regions] + (when (seq regions) + (let [min-shift (get-min-shift index-data) + depth (get-depth index-data)] + (->> regions + (transduce + (map + (fn [[^long beg ^long end]] + [(reg->bins beg end min-shift depth) + (get-min-offset index-data ref-idx beg)])) + (fn + ([[bins min-offset]] + [(get-chunks index-data ref-idx bins) min-offset]) + ([[bins min-offset] [chunks' min-offset']] + [(into bins chunks') + (min (long min-offset) (long min-offset'))])) + [#{} Long/MAX_VALUE]) + (apply util-chunk/optimize-chunks) + (map vals))))) diff --git a/test/cljam/io/bam/reader_test.clj b/test/cljam/io/bam/reader_test.clj new file mode 100644 index 00000000..e53eebe3 --- /dev/null +++ b/test/cljam/io/bam/reader_test.clj @@ -0,0 +1,92 @@ +(ns cljam.io.bam.reader-test + (:require [clojure.java.io :as io] + [clojure.test :refer [deftest is testing]] + [cljam.util :as util] + [cljam.util.chromosome :as chr] + [cljam.io.protocols :as prot] + [cljam.io.bam.core :as bam.core] + [cljam.io.sam.util.flag :as flag])) + +(defn- conj-flag [flag & xs] + (flag/encode (apply conj (flag/decode flag) xs))) + +(defn- unmap [xs] + (let [[unmapped mapped] (if (< (double (rand)) 0.5) + [0 1] + [1 0])] + (-> xs + (update unmapped assoc + :rname "*" :pos 0 :end 0 + :mapq 0 :cigar "" :tlen 0) + (update-in [unmapped :flag] conj-flag :unmapped) + (update-in [unmapped :rnext] #(if (= "=" %) + (:rname (first xs)) + %)) + (update mapped assoc + :rnext "*" :pnext 0 :tlen 0) + (update-in [mapped :flag] conj-flag :next-unmapped)))) + +(defn- generate-paired-bam [n] + (let [refs (->> + (range 24) + (map (fn [i] {:SN (str "chr" (inc (long i))), + :LN (+ 100000000 (long (rand-int 50000000)))})) + (sort-by (comp chr/chromosome-order-key :SN)))] + (->> + (range n) + (map + (fn [i] + (let [{:keys [SN LN] :as r} (rand-nth refs) + pos (int (rand-int (- (long LN) 300))) + chimeric? (< (double (rand)) 0.3) + unmapped? (< (double (rand)) 0.2) + rnext (if chimeric? + (rand-nth (filterv (complement #{r}) refs)) + r) + pnext (int (if chimeric? + (rand-int (- (long (:LN rnext)) 300)) + (+ pos (int 70) (int (rand-int 200))))) + qname (str "read" (inc (long i)))] + (cond-> [(prot/->SAMAlignment + qname (flag/encode #{:multiple :first}) + SN pos (int (+ pos 150)) 60 "151M" + (if chimeric? (:SN rnext) "=") pnext 0 "*" "*" []) + (prot/->SAMAlignment + qname (flag/encode #{:multiple :last}) + (:SN rnext) pnext (int (+ pnext 150)) 60 "151M" + (if chimeric? SN "=") pos 0 "*" "*" [])] + unmapped? unmap)))) + (apply concat) + (sort-by (juxt (comp chr/chromosome-order-key :rname) :pos)) + (vector {:HD {:VN "1.4", :SO "coordinate"}, :SQ refs})))) + +(defn- generate-test-case [alignments] + (let [in (->> alignments + shuffle + (take 10) + (group-by :qname) + vals + (map first)) ; take either R1 or R2 + mate? (into #{} (map (juxt :qname (comp not flag/r1? :flag)) in))] + [in (filter (comp mate? (juxt :qname (comp flag/r1? :flag))) alignments)])) + +(deftest read-mate-alignments-test + (util/with-temp-dir [d "read-pairs-test"] + (let [temp-bam (io/file d "tmp.bam") + [header alignments] (generate-paired-bam 10000)] + + ;; write a BAM with a BAI + (with-open [w (bam.core/writer temp-bam true)] + (doto w + (prot/write-header header) + (prot/write-refs header) + (prot/write-alignments alignments header))) + + ;; read and compare + (with-open [r (bam.core/reader temp-bam)] + (doseq [[i [input expected]] (->> #(generate-test-case alignments) + (repeatedly 10) + (map vector (range)))] + (testing (inc (long i)) + (is (= expected + (prot/read-mate-alignments r input))))))))) diff --git a/test/cljam/io/sam_test.clj b/test/cljam/io/sam_test.clj index cca8ccfd..29f8419e 100644 --- a/test/cljam/io/sam_test.clj +++ b/test/cljam/io/sam_test.clj @@ -468,3 +468,14 @@ temp-bam-file (cio/file temp-bam-file) (cio/as-url (cio/file temp-bam-file))))) + +(deftest make-pairs-test + (testing "BAM" + (with-open [r (sam/reader test-sorted-bam-file)] + (let [[r001 r002 r003 r004 r003' r001'] (vec (sam/read-alignments r))] + (is (= [[r001 r001']] + (sam/make-pairs r [r001]))) + (is (= [[r001 r001']] + (sam/make-pairs r [r001 r001']))) + (is (= [[r001 r001']] ;; r003 is not flagged as paired + (sam/make-pairs r [r001 r002 r003 r003' r004]))))))) diff --git a/test/cljam/io/util/bin_test.clj b/test/cljam/io/util/bin_test.clj index dff2ff47..9022c341 100644 --- a/test/cljam/io/util/bin_test.clj +++ b/test/cljam/io/util/bin_test.clj @@ -197,3 +197,16 @@ (number? (first %)) (number? (second %))) (util-bin/get-spans csi* 0 1 100000))))) + +(deftest get-multi-spans + (testing "tabix" + (let [tabix (tabix/read-index test-tabix-file)] + (is (= [[0 50872]] + (util-bin/get-multi-spans tabix 0 [[1 100]]))) + (is (= [[0 50872]] + (util-bin/get-multi-spans + tabix 0 [[1 10] [10 20] [30 40] [40 50] [50 100]]))))) + (testing "csi" + (let [csi (csi/read-index test-csi-file)] + (is (= [[3904 3973]] + (util-bin/get-multi-spans csi 0 [[1 100000]]))))))