From 1168f6b27f0ed77751e16ea4aaeae52519b6b03f Mon Sep 17 00:00:00 2001 From: Shogo Ohta Date: Mon, 21 Oct 2024 11:17:56 +0900 Subject: [PATCH] Support encoding non-detached mate records --- src/cljam/io/cram/encode/mate_records.clj | 20 ++++ src/cljam/io/cram/encode/record.clj | 88 +++++++++++---- .../io/cram/encode/mate_records_test.clj | 27 +++++ test/cljam/io/cram/encode/record_test.clj | 106 ++++++++++++------ 4 files changed, 183 insertions(+), 58 deletions(-) create mode 100644 src/cljam/io/cram/encode/mate_records.clj create mode 100644 test/cljam/io/cram/encode/mate_records_test.clj diff --git a/src/cljam/io/cram/encode/mate_records.clj b/src/cljam/io/cram/encode/mate_records.clj new file mode 100644 index 00000000..c1d55c6c --- /dev/null +++ b/src/cljam/io/cram/encode/mate_records.clj @@ -0,0 +1,20 @@ +(ns cljam.io.cram.encode.mate-records + (:require [cljam.io.sam.util.flag :as util.flag]) + (:import [java.util HashMap])) + +(defprotocol IMateResolver + (resolve-mate! [this i record])) + +(defn make-mate-resolver + "Creates a new mate resolver." + [] + (let [record-indices (HashMap.)] + (reify IMateResolver + (resolve-mate! [_ i {:keys [flag qname]}] + (when (= (bit-and (long flag) + (util.flag/encoded #{:multiple :secondary :supplementary})) + (util.flag/encoded #{:multiple})) + (if-let [mate-index (.get record-indices qname)] + (do (.remove record-indices mate-index) + mate-index) + (.put record-indices qname i))))))) diff --git a/src/cljam/io/cram/encode/record.clj b/src/cljam/io/cram/encode/record.clj index 032b92d3..e347b36b 100644 --- a/src/cljam/io/cram/encode/record.clj +++ b/src/cljam/io/cram/encode/record.clj @@ -1,5 +1,6 @@ (ns cljam.io.cram.encode.record (:require [cljam.io.cram.encode.alignment-stats :as stats] + [cljam.io.cram.encode.mate-records :as mate] [cljam.io.cram.encode.subst-matrix :as subst-mat] [cljam.io.cram.encode.tag-dict :as tag-dict] [cljam.io.cram.seq-resolver.protocol :as resolver] @@ -8,6 +9,11 @@ [cljam.io.sam.util.option :as sam.option]) (:import [java.util Arrays List])) +(def ^:private ^:const CF_PRESERVED_QUAL 0x01) +(def ^:private ^:const CF_DETACHED 0x02) +(def ^:private ^:const CF_MATE_DOWNSTREAM 0x04) +(def ^:private ^:const CF_NO_SEQ 0x08) + (defn- ref-index [rname->idx rname] (if (= rname "*") -1 @@ -35,20 +41,23 @@ (fn [record] (RN (.getBytes ^String (:qname record))))) -(defn- build-mate-read-encoder [{:keys [rname->idx]} {:keys [MF NS NP TS]}] +(defn- build-mate-read-encoder [{:keys [rname->idx]} {:keys [NF MF NS NP TS]}] (fn [{:keys [^long flag rnext] :as record}] - (let [mate-flag (cond-> 0 - (pos? (bit-and flag (sam.flag/encoded #{:next-reversed}))) - (bit-or 0x01) - - (pos? (bit-and flag (sam.flag/encoded #{:next-unmapped}))) - (bit-or 0x02))] - (MF mate-flag) - (NS (if (= rnext "=") - (::ref-index record) - (ref-index rname->idx rnext))) - (NP (:pnext record)) - (TS (:tlen record))))) + (if (zero? (bit-and (long (::flag record)) CF_DETACHED)) + (when-let [nf (::next-fragment record)] + (NF nf)) + (let [mate-flag (cond-> 0 + (pos? (bit-and flag (sam.flag/encoded #{:next-reversed}))) + (bit-or 0x01) + + (pos? (bit-and flag (sam.flag/encoded #{:next-unmapped}))) + (bit-or 0x02))] + (MF mate-flag) + (NS (if (= rnext "=") + (::ref-index record) + (ref-index rname->idx rnext))) + (NP (:pnext record)) + (TS (:tlen record)))))) (defn- build-auxiliary-tags-encoder [{:keys [tag-dict tag-encoders]} {:keys [TL]}] (let [tag-encoder (fn [{:keys [tag] :as item}] @@ -198,26 +207,61 @@ \P (recur more rpos spos (conj! fs {:code :padding :pos pos :len n})))) [(persistent! fs) (dec rpos)]))))) +(defn- mate-consistent? [record mate] + (and (or (and (= (:rnext mate) "=") + (= (:rname record) (:rname mate))) + (= (:rname record) (:rnext mate))) + (= (long (:pos record)) (long (:pnext mate))) + (= (bit-and (long (:flag record)) + (sam.flag/encoded #{:unmapped :reversed})) + ;; takes advantages of the fact that: + ;; - :unmapped == :next-unmapped >> 1 + ;; - :reversed == :next-reversed >> 1 + (-> (long (:flag mate)) + (bit-and (sam.flag/encoded #{:next-unmapped :next-reversed})) + (unsigned-bit-shift-right 1))))) + +(defn- resolve-mate! [mate-resolver ^List records ^long i record] + (when-let [^long mate-index (mate/resolve-mate! mate-resolver i record)] + (let [upstream-mate (.get records mate-index)] + (when (and (mate-consistent? record upstream-mate) + (mate-consistent? upstream-mate record) + (let [{^long tlen1 :tlen, ^long s1 :pos, ^long e1 ::end} record + {^long tlen2 :tlen, ^long s2 :pos, ^long e2 ::end} upstream-mate] + (or (and (zero? tlen1) (zero? tlen2) + (zero? s1) (zero? s2)) + (if (<= s1 s2) + (= tlen1 (- tlen2) (inc (- e2 s1))) + (= (- tlen1) tlen2 (inc (- e1 s2))))))) + (.set records mate-index + (assoc upstream-mate + ::flag (-> (long (::flag upstream-mate)) + (bit-and (bit-not CF_DETACHED)) + (bit-or CF_MATE_DOWNSTREAM)) + ::next-fragment (dec (- i mate-index)))) + mate-index)))) + (defn preprocess-slice-records "Preprocesses slice records to calculate some record fields prior to record encoding that are necessary for the CRAM writer to generate some header components." [{:keys [rname->idx seq-resolver subst-mat-builder tag-dict-builder]} ^List records] - (let [stats-builder (stats/make-alignment-stats-builder)] + (let [mate-resolver (mate/make-mate-resolver) + stats-builder (stats/make-alignment-stats-builder)] (dotimes [i (.size records)] (let [record (.get records i) - ;; these flag bits of CF are hard-coded at the moment: - ;; - 0x01: quality scores stored as array (true) - ;; - 0x02: detached (true) - ;; - 0x04: has mate downstream (false) - cf (cond-> 0x03 - (= (:seq record) "*") (bit-or 0x08)) ri (ref-index rname->idx (:rname record)) tags-id (tag-dict/assign-tags-id! tag-dict-builder (:options record)) [fs end] (calculate-read-features&end seq-resolver subst-mat-builder record) + cf (cond-> (bit-or CF_PRESERVED_QUAL CF_DETACHED) + (= (:seq record) "*") (bit-or CF_NO_SEQ)) record' (assoc record ::flag cf ::ref-index ri ::end end - ::features fs ::tags-index tags-id)] + ::features fs ::tags-index tags-id) + mate-index (resolve-mate! mate-resolver records i record')] (stats/update! stats-builder ri (:pos record) end (count (:seq record)) 1) - (.set records i record'))) + (.set records i + (cond-> record' + mate-index + (update ::flag bit-and (bit-not CF_DETACHED)))))) (stats/build stats-builder))) diff --git a/test/cljam/io/cram/encode/mate_records_test.clj b/test/cljam/io/cram/encode/mate_records_test.clj new file mode 100644 index 00000000..0b3dfd98 --- /dev/null +++ b/test/cljam/io/cram/encode/mate_records_test.clj @@ -0,0 +1,27 @@ +(ns cljam.io.cram.encode.mate-records-test + (:require [cljam.io.cram.encode.mate-records :as mate] + [cljam.io.sam.util.flag :as flag] + [clojure.test :refer [deftest is]])) + +(deftest make-mate-resolver-test + (let [mate-resolver (mate/make-mate-resolver)] + (is (nil? (mate/resolve-mate! mate-resolver 1 + {:qname "q001" + :flag (flag/encoded #{:properly-aligned :multiple + :first :next-reversed})}))) + (is (nil? (mate/resolve-mate! mate-resolver 2 + {:qname "q002" + :flag (flag/encoded #{:multiple :first :next-unmapped})}))) + (is (nil? (mate/resolve-mate! mate-resolver 3 + {:qname "q003" + :flag (flag/encoded #{:multiple :first :supplementary})}))) + (is (= 2 (mate/resolve-mate! mate-resolver 5 + {:qname "q002" + :flag (flag/encoded #{:multiple :last :unmapped})}))) + (is (= 1 (mate/resolve-mate! mate-resolver 4 + {:qname "q001" + :flag (flag/encoded #{:properly-aligned :multiple + :last :reversed})}))) + (is (nil? (mate/resolve-mate! mate-resolver 6 + {:qname "q003" + :flag (flag/encoded #{:multiple :first :reversed})}))))) diff --git a/test/cljam/io/cram/encode/record_test.clj b/test/cljam/io/cram/encode/record_test.clj index 9b7678ff..4610104d 100644 --- a/test/cljam/io/cram/encode/record_test.clj +++ b/test/cljam/io/cram/encode/record_test.clj @@ -94,57 +94,95 @@ c->a (subst-mat/->MutableInt 0) subst-mat-init {\C {\A c->a}} records (ArrayList. - [{:rname "ref", :pos 1, :cigar "5M", :seq "AGAAT", :qual "HFHHH" + [{:qname "q001", :flag 99, :rname "ref", :pos 1, :cigar "5M", + :rnext "=", :pnext 151, :tlen 150, :seq "AGAAT", :qual "HFHHH", :options [{:RG {:type "Z", :value "rg001"}} {:MD {:type "Z", :value "2C2"}} {:NM {:type "c", :value 1}}]} - {:rname "ref", :pos 5, :cigar "2S3M", :seq "CCTGT", :qual "##AAC" + {:qname "q002", :flag 99, :rname "ref", :pos 5, :cigar "2S3M", + :rnext "=", :pnext 15, :tlen 15, :seq "CCTGT", :qual "##AAC" :options [{:RG {:type "Z", :value "rg001"}} {:MD {:type "Z", :value "3"}} {:NM {:type "c", :value 0}}]} - {:rname "ref", :pos 10, :cigar "5M", :seq "GATAA", :qual "CCCFF" + {:qname "q003", :flag 177, :rname "ref", :pos 10, :cigar "5M", + :rnext "ref2", :pnext 100, :tlen 0, :seq "GATAA", :qual "CCCFF" :options [{:RG {:type "Z", :value "rg002"}} {:MD {:type "Z", :value "5"}} {:NM {:type "c", :value 0}}]} - {:rname "ref", :pos 15, :cigar "1M1I1M1D2M", :seq "GAAAG", :qual "EBBFF" + {:qname "q002", :flag 147, :rname "ref", :pos 15, :cigar "1M1I1M1D2M", + :rnext "=", :pnext 5, :tlen -15, :seq "GAAAG", :qual "EBBFF" :options [{:RG {:type "Z", :value "rg002"}} {:MD {:type "Z", :value "3^T2"}} {:NM {:type "c", :value 2}}]} - {:rname "*", :pos 0, :cigar "*", :seq "CTGTG", :qual "AEEEE" + {:qname "q004", :flag 65, :rname "ref", :pos 20, :cigar "5M", + :rnext "=", :pnext 20, :tlen 0, :seq "CTGTG", :qual "DBBDD" + :options [{:RG {:type "Z", :value "rg001"}} + {:MD {:type "Z", :value "5"}} + {:NM {:type "c", :value 0}}]} + {:qname "q004", :flag 129, :rname "ref", :pos 20, :cigar "5M", + :rnext "=", :pnext 20, :tlen 0, :seq "CTGTG", :qual "DBBDD" + :options [{:RG {:type "Z", :value "rg001"}} + {:MD {:type "Z", :value "5"}} + {:NM {:type "c", :value 0}}]} + {:qname "q005", :flag 77, :rname "*", :pos 0, :cigar "*", + :rnext "*", :pnext 0, :tlen 0, :seq "ATGCA", :qual "AEEEE" :options []} - {:rname "*", :pos 10, :cigar "*", :seq "*", :qual "*" + {:qname "q005", :flag 141, :rname "*", :pos 0, :cigar "*", + :rnext "*", :pnext 0, :tlen 0, :seq "*", :qual "*" :options []}]) container-ctx (preprocess-slice-records cram-header subst-mat-init records)] - (is (= [{:rname "ref", :pos 1, :cigar "5M", :seq "AGAAT", :qual "HFHHH" + (is (= [{:qname "q001", :flag 99, :rname "ref", :pos 1, :cigar "5M", + :rnext "=", :pnext 151, :tlen 150, :seq "AGAAT", :qual "HFHHH" :options [{:RG {:type "Z", :value "rg001"}} {:MD {:type "Z", :value "2C2"}} {:NM {:type "c", :value 1}}] - ::record/flag 0x03, ::record/ref-index 0, ::record/end 5, ::record/tags-index 0 + ::record/flag 0x03, ::record/ref-index 0, ::record/end 5, ::record/tags-index 0, ::record/features [{:code :subst, :pos 3 :subst c->a}]} - {:rname "ref", :pos 5, :cigar "2S3M", :seq "CCTGT", :qual "##AAC" + {:qname "q002", :flag 99, :rname "ref", :pos 5, :cigar "2S3M", + :rnext "=", :pnext 15, :tlen 15, :seq "CCTGT", :qual "##AAC" :options [{:RG {:type "Z", :value "rg001"}} {:MD {:type "Z", :value "3"}} {:NM {:type "c", :value 0}}] - ::record/flag 0x03, ::record/ref-index 0, ::record/end 7, ::record/tags-index 0 + ::record/flag 0x05, ::record/ref-index 0, ::record/end 7, + ::record/tags-index 0, ::record/next-fragment 1, ::record/features [{:code :softclip, :pos 1, :bases [(int \C) (int \C)]}]} - {:rname "ref", :pos 10, :cigar "5M", :seq "GATAA", :qual "CCCFF" + {:qname "q003", :flag 177, :rname "ref", :pos 10, :cigar "5M", + :rnext "ref2", :pnext 100, :tlen 0, :seq "GATAA", :qual "CCCFF" :options [{:RG {:type "Z", :value "rg002"}} {:MD {:type "Z", :value "5"}} {:NM {:type "c", :value 0}}] ::record/flag 0x03, ::record/ref-index 0, ::record/end 14, ::record/tags-index 0 ::record/features []} - {:rname "ref", :pos 15, :cigar "1M1I1M1D2M", :seq "GAAAG", :qual "EBBFF" + {:qname "q002", :flag 147, :rname "ref", :pos 15, :cigar "1M1I1M1D2M", + :rnext "=", :pnext 5, :tlen -15, :seq "GAAAG", :qual "EBBFF" :options [{:RG {:type "Z", :value "rg002"}} {:MD {:type "Z", :value "3^T2"}} {:NM {:type "c", :value 2}}] - ::record/flag 0x03, ::record/ref-index 0, ::record/end 19, ::record/tags-index 0 + ::record/flag 0x01, ::record/ref-index 0, ::record/end 19, ::record/tags-index 0 ::record/features [{:code :insertion, :pos 2, :bases [(int \A)]} {:code :deletion, :pos 4, :len 1}]} - {:rname "*", :pos 0, :cigar "*", :seq "CTGTG", :qual "AEEEE", :options [] - ::record/flag 0x03, ::record/ref-index -1, ::record/end 0, ::record/tags-index 1 + {:qname "q004", :flag 65, :rname "ref", :pos 20, :cigar "5M", + :rnext "=", :pnext 20, :tlen 0, :seq "CTGTG", :qual "DBBDD" + :options [{:RG {:type "Z", :value "rg001"}} + {:MD {:type "Z", :value "5"}} + {:NM {:type "c", :value 0}}] + ::record/flag 0x03, ::record/ref-index 0, ::record/end 24, + ::record/tags-index 0, ::record/features []} + {:qname "q004", :flag 129, :rname "ref", :pos 20, :cigar "5M", + :rnext "=", :pnext 20, :tlen 0, :seq "CTGTG", :qual "DBBDD" + :options [{:RG {:type "Z", :value "rg001"}} + {:MD {:type "Z", :value "5"}} + {:NM {:type "c", :value 0}}] + ::record/flag 0x03, ::record/ref-index 0, ::record/end 24, + ::record/tags-index 0, ::record/features []} + {:qname "q005", :flag 77, :rname "*", :pos 0, :cigar "*", + :rnext "*", :pnext 0, :tlen 0, :seq "ATGCA", :qual "AEEEE", :options [] + ::record/flag 0x05, ::record/ref-index -1, ::record/end 0, + ::record/tags-index 1, ::record/next-fragment 0, ::record/features []} - {:rname "*", :pos 10, :cigar "*", :seq "*", :qual "*", :options [] - ::record/flag 0x0b, ::record/ref-index -1, ::record/end 10, ::record/tags-index 1 + {:qname "q005", :flag 141, :rname "*", :pos 0, :cigar "*", + :rnext "*", :pnext 0, :tlen 0, :seq "*", :qual "*", :options [] + ::record/flag 0x09, ::record/ref-index -1, ::record/end 0, ::record/tags-index 1 ::record/features []}] (walk/prewalk #(if (.isArray (class %)) (vec %) %) (vec records)))) @@ -193,12 +231,12 @@ :options [{:RG {:type "Z", :value "rg002"}} {:MD {:type "Z", :value "5"}} {:NM {:type "c", :value 0}}]} - {:qname "q004", :flag 147, :rname "ref", :pos 15, :end 19, :mapq 15, + {:qname "q002", :flag 147, :rname "ref", :pos 15, :end 19, :mapq 15, :cigar "1M1I1M1D2M", :rnext "=", :pnext 5, :tlen -15, :seq "GAAAG", :qual "EBBFF" :options [{:RG {:type "Z", :value "rg002"}} {:MD {:type "Z", :value "3^T2"}} {:NM {:type "c", :value 2}}]} - {:qname "q005", :flag 73, :rname "ref", :pos 20, :end 24, :mapq 0, + {:qname "q004", :flag 73, :rname "ref", :pos 20, :end 24, :mapq 0, :cigar "5M", :rnext "*", :pnext 0, :tlen 0, :seq "CTGTG", :qual "AEEEE" :options []}]) slice-ctx (-> (preprocess-slice-records cram-header subst-mat-init records) @@ -216,7 +254,7 @@ (is (= 1 (count (get ds-res :CF)))) (is (= 3 (get-in ds-res [:CF 0 :content-id]))) - (is (= [3 3 3 3 3] (seq (get-in ds-res [:CF 0 :data])))) + (is (= [3 5 3 1 3] (seq (get-in ds-res [:CF 0 :data])))) (is (= 1 (count (get ds-res :RI)))) (is (= 4 (get-in ds-res [:RI 0 :content-id]))) @@ -237,30 +275,30 @@ (is (= 1 (count (get ds-res :RN)))) (is (= 8 (get-in ds-res [:RN 0 :content-id]))) - (is (= "q001\tq002\tq003\tq004\tq005\t" (String. ^bytes (get-in ds-res [:RN 0 :data])))) + (is (= "q001\tq002\tq003\tq002\tq004\t" (String. ^bytes (get-in ds-res [:RN 0 :data])))) (is (= 1 (count (get ds-res :MF)))) (is (= 9 (get-in ds-res [:MF 0 :content-id]))) - (is (= [1 1 1 0 2] (seq (get-in ds-res [:MF 0 :data])))) + (is (= [1 1 2] (seq (get-in ds-res [:MF 0 :data])))) (is (= 1 (count (get ds-res :NS)))) (is (= 10 (get-in ds-res [:NS 0 :content-id]))) - (is (= [0 0 1 0 0xff 0xff 0xff 0xff 0x0f] + (is (= [0 1 0xff 0xff 0xff 0xff 0x0f] (map #(bit-and % 0xff) (get-in ds-res [:NS 0 :data])))) (is (= 1 (count (get ds-res :NP)))) (is (= 11 (get-in ds-res [:NP 0 :content-id]))) - (is (= [0x80 0x97 0x0f 0x64 0x05 0x00] + (is (= [0x80 0x97 0x64 0x00] (map #(bit-and % 0xff) (get-in ds-res [:NP 0 :data])))) (is (= 1 (count (get ds-res :TS)))) (is (= 12 (get-in ds-res [:TS 0 :content-id]))) - (is (= [0x80 0x96 0x0f 0x00 0xff 0xff 0xff 0xff 0x01 0x00] + (is (= [0x80 0x96 0x00 0x00] (map #(bit-and % 0xff) (get-in ds-res [:TS 0 :data])))) (is (= 1 (count (get ds-res :NF)))) (is (= 13 (get-in ds-res [:NF 0 :content-id]))) - (is (zero? (count (get-in ds-res [:NF 0 :data])))) + (is (= [1] (seq (get-in ds-res [:NF 0 :data])))) (is (= 1 (count (get ds-res :TL)))) (is (= 14 (get-in ds-res [:TL 0 :content-id]))) @@ -388,7 +426,7 @@ (is (= 1 (count (get ds-res :CF)))) (is (= 3 (get-in ds-res [:CF 0 :content-id]))) - (is (= [3 3 3 3 3] (seq (get-in ds-res [:CF 0 :data])))) + (is (= [5 1 5 1 3] (seq (get-in ds-res [:CF 0 :data])))) (is (= 1 (count (get ds-res :RI)))) (is (= 4 (get-in ds-res [:RI 0 :content-id]))) @@ -422,28 +460,24 @@ (is (= 1 (count (get ds-res :MF)))) (is (= 9 (get-in ds-res [:MF 0 :content-id]))) - (is (= [2 2 2 2 2] (seq (get-in ds-res [:MF 0 :data])))) + (is (= [2] (seq (get-in ds-res [:MF 0 :data])))) (is (= 1 (count (get ds-res :NS)))) (is (= 10 (get-in ds-res [:NS 0 :content-id]))) - (is (= [0xff 0xff 0xff 0xff 0x0f - 0xff 0xff 0xff 0xff 0x0f - 0xff 0xff 0xff 0xff 0x0f - 0xff 0xff 0xff 0xff 0x0f - 0xff 0xff 0xff 0xff 0x0f] + (is (= [0xff 0xff 0xff 0xff 0x0f] (map #(bit-and % 0xff) (get-in ds-res [:NS 0 :data])))) (is (= 1 (count (get ds-res :NP)))) (is (= 11 (get-in ds-res [:NP 0 :content-id]))) - (is (= [0 0 0 0 0] (seq (get-in ds-res [:NP 0 :data])))) + (is (= [0] (seq (get-in ds-res [:NP 0 :data])))) (is (= 1 (count (get ds-res :TS)))) (is (= 12 (get-in ds-res [:TS 0 :content-id]))) - (is (= [0 0 0 0 0] (seq (get-in ds-res [:TS 0 :data])))) + (is (= [0] (seq (get-in ds-res [:TS 0 :data])))) (is (= 1 (count (get ds-res :NF)))) (is (= 13 (get-in ds-res [:NF 0 :content-id]))) - (is (zero? (count (get-in ds-res [:NF 0 :data])))) + (is (= [0 0] (seq (get-in ds-res [:NF 0 :data])))) (is (= 1 (count (get ds-res :TL)))) (is (= 14 (get-in ds-res [:TL 0 :content-id])))