Skip to content

Commit

Permalink
Support encoding non-detached mate records
Browse files Browse the repository at this point in the history
  • Loading branch information
athos committed Nov 5, 2024
1 parent fb6193b commit 5a96d6b
Show file tree
Hide file tree
Showing 4 changed files with 160 additions and 58 deletions.
20 changes: 20 additions & 0 deletions src/cljam/io/cram/encode/mate_records.clj
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
(ns cljam.io.cram.encode.mate-records
(:require [cljam.io.sam.util.flag :as util.flag])
(:import [java.util HashMap]))

(defprotocol IMateResolver
(resolve-mate! [this i record]))

(defn make-mate-resolver
"Creates a new mate resolver."
[]
(let [record-indices (HashMap.)]
(reify IMateResolver
(resolve-mate! [_ i {:keys [flag qname]}]
(when (= (bit-and (long flag)
(util.flag/encoded #{:multiple :secondary :supplementary}))
(util.flag/encoded #{:multiple}))
(if-let [mate-index (.get record-indices qname)]
(do (.remove record-indices mate-index)
mate-index)
(.put record-indices qname i)))))))
88 changes: 66 additions & 22 deletions src/cljam/io/cram/encode/record.clj
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
(ns cljam.io.cram.encode.record
(:require [cljam.io.cram.encode.alignment-stats :as stats]
[cljam.io.cram.encode.mate-records :as mate]
[cljam.io.cram.encode.subst-matrix :as subst-mat]
[cljam.io.cram.encode.tag-dict :as tag-dict]
[cljam.io.cram.seq-resolver.protocol :as resolver]
Expand All @@ -8,6 +9,11 @@
[cljam.io.sam.util.option :as sam.option])
(:import [java.util Arrays List]))

(def ^:private ^:const CF_PRESERVED_QUAL 0x01)
(def ^:private ^:const CF_DETACHED 0x02)
(def ^:private ^:const CF_MATE_DOWNSTREAM 0x04)
(def ^:private ^:const CF_NO_SEQ 0x08)

(defn- ref-index [rname->idx rname]
(if (= rname "*")
-1
Expand Down Expand Up @@ -35,20 +41,23 @@
(fn [record]
(RN (.getBytes ^String (:qname record)))))

(defn- build-mate-read-encoder [{:keys [rname->idx]} {:keys [MF NS NP TS]}]
(defn- build-mate-read-encoder [{:keys [rname->idx]} {:keys [NF MF NS NP TS]}]
(fn [{:keys [^long flag rnext] :as record}]
(let [mate-flag (cond-> 0
(pos? (bit-and flag (sam.flag/encoded #{:next-reversed})))
(bit-or 0x01)

(pos? (bit-and flag (sam.flag/encoded #{:next-unmapped})))
(bit-or 0x02))]
(MF mate-flag)
(NS (if (= rnext "=")
(::ref-index record)
(ref-index rname->idx rnext)))
(NP (:pnext record))
(TS (:tlen record)))))
(if (zero? (bit-and (long (::flag record)) CF_DETACHED))
(when-let [nf (::next-fragment record)]
(NF nf))
(let [mate-flag (cond-> 0
(pos? (bit-and flag (sam.flag/encoded #{:next-reversed})))
(bit-or 0x01)

(pos? (bit-and flag (sam.flag/encoded #{:next-unmapped})))
(bit-or 0x02))]
(MF mate-flag)
(NS (if (= rnext "=")
(::ref-index record)
(ref-index rname->idx rnext)))
(NP (:pnext record))
(TS (:tlen record))))))

(defn- build-auxiliary-tags-encoder [{:keys [tag-dict tag-encoders]} {:keys [TL]}]
(let [tag-encoder (fn [{:keys [tag] :as item}]
Expand Down Expand Up @@ -198,26 +207,61 @@
\P (recur more rpos spos (conj! fs {:code :padding :pos pos :len n}))))
[(persistent! fs) (dec rpos)])))))

(defn- mate-consistent? [record mate]
(and (or (and (= (:rnext mate) "=")
(= (:rname record) (:rname mate)))
(= (:rname record) (:rnext mate)))
(= (long (:pos record)) (long (:pnext mate)))
(= (bit-and (long (:flag record))
(sam.flag/encoded #{:unmapped :reversed}))
;; takes advantages of the fact that:
;; - :unmapped == :next-unmapped >> 1
;; - :reversed == :next-reversed >> 1
(-> (long (:flag mate))
(bit-and (sam.flag/encoded #{:next-unmapped :next-reversed}))
(unsigned-bit-shift-right 1)))))

(defn- resolve-mate! [mate-resolver ^List records ^long i record]
(when-let [^long mate-index (mate/resolve-mate! mate-resolver i record)]
(let [upstream-mate (.get records mate-index)]
(when (and (mate-consistent? record upstream-mate)
(mate-consistent? upstream-mate record)
(let [{^long tlen1 :tlen, ^long s1 :pos, ^long e1 ::end} record
{^long tlen2 :tlen, ^long s2 :pos, ^long e2 ::end} upstream-mate]
(or (and (zero? tlen1) (zero? tlen2)
(zero? s1) (zero? s2))
(if (<= s1 s2)
(= tlen1 (- tlen2) (inc (- e2 s1)))
(= (- tlen1) tlen2 (inc (- e1 s2)))))))
(.set records mate-index
(assoc upstream-mate
::flag (-> (long (::flag upstream-mate))
(bit-and (bit-not CF_DETACHED))
(bit-or CF_MATE_DOWNSTREAM))
::next-fragment (dec (- i mate-index))))
mate-index))))

(defn preprocess-slice-records
"Preprocesses slice records to calculate some record fields prior to record
encoding that are necessary for the CRAM writer to generate some header
components."
[{:keys [rname->idx seq-resolver subst-mat-builder tag-dict-builder]} ^List records]
(let [stats-builder (stats/make-alignment-stats-builder)]
(let [mate-resolver (mate/make-mate-resolver)
stats-builder (stats/make-alignment-stats-builder)]
(dotimes [i (.size records)]
(let [record (.get records i)
;; these flag bits of CF are hard-coded at the moment:
;; - 0x01: quality scores stored as array (true)
;; - 0x02: detached (true)
;; - 0x04: has mate downstream (false)
cf (cond-> 0x03
(= (:seq record) "*") (bit-or 0x08))
ri (ref-index rname->idx (:rname record))
tags-id (tag-dict/assign-tags-id! tag-dict-builder (:options record))
[fs end] (calculate-read-features&end seq-resolver subst-mat-builder record)
cf (cond-> (bit-or CF_PRESERVED_QUAL CF_DETACHED)
(= (:seq record) "*") (bit-or CF_NO_SEQ))
record' (assoc record
::flag cf ::ref-index ri ::end end
::features fs ::tags-index tags-id)]
::features fs ::tags-index tags-id)
mate-index (resolve-mate! mate-resolver records i record')]
(stats/update! stats-builder ri (:pos record) end (count (:seq record)) 1)
(.set records i record')))
(.set records i
(cond-> record'
mate-index
(update ::flag bit-and (bit-not CF_DETACHED))))))
(stats/build stats-builder)))
27 changes: 27 additions & 0 deletions test/cljam/io/cram/encode/mate_records_test.clj
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
(ns cljam.io.cram.encode.mate-records-test
(:require [cljam.io.cram.encode.mate-records :as mate]
[cljam.io.sam.util.flag :as flag]
[clojure.test :refer [deftest is]]))

(deftest make-mate-resolver-test
(let [mate-resolver (mate/make-mate-resolver)]
(is (nil? (mate/resolve-mate! mate-resolver 1
{:qname "q001"
:flag (flag/encoded #{:properly-aligned :multiple
:first :next-reversed})})))
(is (nil? (mate/resolve-mate! mate-resolver 2
{:qname "q002"
:flag (flag/encoded #{:multiple :first :next-unmapped})})))
(is (nil? (mate/resolve-mate! mate-resolver 3
{:qname "q003"
:flag (flag/encoded #{:multiple :first :supplementary})})))
(is (= 2 (mate/resolve-mate! mate-resolver 5
{:qname "q002"
:flag (flag/encoded #{:multiple :last :unmapped})})))
(is (= 1 (mate/resolve-mate! mate-resolver 4
{:qname "q001"
:flag (flag/encoded #{:properly-aligned :multiple
:last :reversed})})))
(is (nil? (mate/resolve-mate! mate-resolver 6
{:qname "q003"
:flag (flag/encoded #{:multiple :first :reversed})})))))
83 changes: 47 additions & 36 deletions test/cljam/io/cram/encode/record_test.clj
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
(ns cljam.io.cram.encode.record-test
(:require [cljam.io.cram.encode.context :as context]
[cljam.io.cram.encode.mate-records :as mate]

Check warning on line 3 in test/cljam/io/cram/encode/record_test.clj

View workflow job for this annotation

GitHub Actions / lint

[clj-kondo] namespace cljam.io.cram.encode.mate-records is required but never used
[cljam.io.cram.encode.record :as record]
[cljam.io.cram.encode.subst-matrix :as subst-mat]
[cljam.io.cram.encode.tag-dict :as tag-dict]
Expand Down Expand Up @@ -94,57 +95,71 @@
c->a (subst-mat/->MutableInt 0)
subst-mat-init {\C {\A c->a}}
records (ArrayList.
[{:rname "ref", :pos 1, :cigar "5M", :seq "AGAAT", :qual "HFHHH"
[{:qname "q001", :flag 99, :rname "ref", :pos 1, :cigar "5M",
:rnext "=", :pnext 151, :tlen 150, :seq "AGAAT", :qual "HFHHH",
:options [{:RG {:type "Z", :value "rg001"}}
{:MD {:type "Z", :value "2C2"}}
{:NM {:type "c", :value 1}}]}
{:rname "ref", :pos 5, :cigar "2S3M", :seq "CCTGT", :qual "##AAC"
{:qname "q002", :flag 99, :rname "ref", :pos 5, :cigar "2S3M",
:rnext "=", :pnext 15, :tlen 15, :seq "CCTGT", :qual "##AAC"
:options [{:RG {:type "Z", :value "rg001"}}
{:MD {:type "Z", :value "3"}}
{:NM {:type "c", :value 0}}]}
{:rname "ref", :pos 10, :cigar "5M", :seq "GATAA", :qual "CCCFF"
{:qname "q003", :flag 177, :rname "ref", :pos 10, :cigar "5M",
:rnext "ref2", :pnext 100, :tlen 0, :seq "GATAA", :qual "CCCFF"
:options [{:RG {:type "Z", :value "rg002"}}
{:MD {:type "Z", :value "5"}}
{:NM {:type "c", :value 0}}]}
{:rname "ref", :pos 15, :cigar "1M1I1M1D2M", :seq "GAAAG", :qual "EBBFF"
{:qname "q002", :flag 147, :rname "ref", :pos 15, :cigar "1M1I1M1D2M",
:rnext "=", :pnext 5, :tlen -15, :seq "GAAAG", :qual "EBBFF"
:options [{:RG {:type "Z", :value "rg002"}}
{:MD {:type "Z", :value "3^T2"}}
{:NM {:type "c", :value 2}}]}
{:rname "*", :pos 0, :cigar "*", :seq "CTGTG", :qual "AEEEE"
{:qname "q004", :flag 77, :rname "*", :pos 0, :cigar "*",
:rnext "*", :pnext 0, :tlen 0, :seq "CTGTG", :qual "AEEEE"
:options []}
{:rname "*", :pos 10, :cigar "*", :seq "*", :qual "*"
{:qname "q004", :flag 141, :rname "*", :pos 0, :cigar "*",
:rnext "*", :pnext 0, :tlen 0, :seq "*", :qual "*"
:options []}])
container-ctx (preprocess-slice-records cram-header subst-mat-init records)]
(is (= [{:rname "ref", :pos 1, :cigar "5M", :seq "AGAAT", :qual "HFHHH"
(is (= [{:qname "q001", :flag 99, :rname "ref", :pos 1, :cigar "5M",
:rnext "=", :pnext 151, :tlen 150, :seq "AGAAT", :qual "HFHHH"
:options [{:RG {:type "Z", :value "rg001"}}
{:MD {:type "Z", :value "2C2"}}
{:NM {:type "c", :value 1}}]
::record/flag 0x03, ::record/ref-index 0, ::record/end 5, ::record/tags-index 0
::record/flag 0x03, ::record/ref-index 0, ::record/end 5, ::record/tags-index 0,
::record/features [{:code :subst, :pos 3 :subst c->a}]}
{:rname "ref", :pos 5, :cigar "2S3M", :seq "CCTGT", :qual "##AAC"
{:qname "q002", :flag 99, :rname "ref", :pos 5, :cigar "2S3M",
:rnext "=", :pnext 15, :tlen 15, :seq "CCTGT", :qual "##AAC"
:options [{:RG {:type "Z", :value "rg001"}}
{:MD {:type "Z", :value "3"}}
{:NM {:type "c", :value 0}}]
::record/flag 0x03, ::record/ref-index 0, ::record/end 7, ::record/tags-index 0
::record/flag 0x05, ::record/ref-index 0, ::record/end 7,
::record/tags-index 0, ::record/next-fragment 1,
::record/features [{:code :softclip, :pos 1, :bases [(int \C) (int \C)]}]}
{:rname "ref", :pos 10, :cigar "5M", :seq "GATAA", :qual "CCCFF"
{:qname "q003", :flag 177, :rname "ref", :pos 10, :cigar "5M",
:rnext "ref2", :pnext 100, :tlen 0, :seq "GATAA", :qual "CCCFF"
:options [{:RG {:type "Z", :value "rg002"}}
{:MD {:type "Z", :value "5"}}
{:NM {:type "c", :value 0}}]
::record/flag 0x03, ::record/ref-index 0, ::record/end 14, ::record/tags-index 0
::record/features []}
{:rname "ref", :pos 15, :cigar "1M1I1M1D2M", :seq "GAAAG", :qual "EBBFF"
{:qname "q002", :flag 147, :rname "ref", :pos 15, :cigar "1M1I1M1D2M",
:rnext "=", :pnext 5, :tlen -15, :seq "GAAAG", :qual "EBBFF"
:options [{:RG {:type "Z", :value "rg002"}}
{:MD {:type "Z", :value "3^T2"}}
{:NM {:type "c", :value 2}}]
::record/flag 0x03, ::record/ref-index 0, ::record/end 19, ::record/tags-index 0
::record/flag 0x01, ::record/ref-index 0, ::record/end 19, ::record/tags-index 0
::record/features [{:code :insertion, :pos 2, :bases [(int \A)]}
{:code :deletion, :pos 4, :len 1}]}
{:rname "*", :pos 0, :cigar "*", :seq "CTGTG", :qual "AEEEE", :options []
::record/flag 0x03, ::record/ref-index -1, ::record/end 0, ::record/tags-index 1
{:qname "q004", :flag 77, :rname "*", :pos 0, :cigar "*",
:rnext "*", :pnext 0, :tlen 0, :seq "CTGTG", :qual "AEEEE", :options []
::record/flag 0x05, ::record/ref-index -1, ::record/end 0,
::record/tags-index 1, ::record/next-fragment 0,
::record/features []}
{:rname "*", :pos 10, :cigar "*", :seq "*", :qual "*", :options []
::record/flag 0x0b, ::record/ref-index -1, ::record/end 10, ::record/tags-index 1
{:qname "q004", :flag 141, :rname "*", :pos 0, :cigar "*",
:rnext "*", :pnext 0, :tlen 0, :seq "*", :qual "*", :options []
::record/flag 0x09, ::record/ref-index -1, ::record/end 0, ::record/tags-index 1
::record/features []}]
(walk/prewalk #(if (.isArray (class %)) (vec %) %)
(vec records))))
Expand Down Expand Up @@ -193,12 +208,12 @@
:options [{:RG {:type "Z", :value "rg002"}}
{:MD {:type "Z", :value "5"}}
{:NM {:type "c", :value 0}}]}
{:qname "q004", :flag 147, :rname "ref", :pos 15, :end 19, :mapq 15,
{:qname "q002", :flag 147, :rname "ref", :pos 15, :end 19, :mapq 15,
:cigar "1M1I1M1D2M", :rnext "=", :pnext 5, :tlen -15, :seq "GAAAG", :qual "EBBFF"
:options [{:RG {:type "Z", :value "rg002"}}
{:MD {:type "Z", :value "3^T2"}}
{:NM {:type "c", :value 2}}]}
{:qname "q005", :flag 73, :rname "ref", :pos 20, :end 24, :mapq 0,
{:qname "q004", :flag 73, :rname "ref", :pos 20, :end 24, :mapq 0,
:cigar "5M", :rnext "*", :pnext 0, :tlen 0, :seq "CTGTG", :qual "AEEEE"
:options []}])
slice-ctx (-> (preprocess-slice-records cram-header subst-mat-init records)
Expand All @@ -216,7 +231,7 @@

(is (= 1 (count (get ds-res :CF))))
(is (= 3 (get-in ds-res [:CF 0 :content-id])))
(is (= [3 3 3 3 3] (seq (get-in ds-res [:CF 0 :data]))))
(is (= [3 5 3 1 3] (seq (get-in ds-res [:CF 0 :data]))))

(is (= 1 (count (get ds-res :RI))))
(is (= 4 (get-in ds-res [:RI 0 :content-id])))
Expand All @@ -237,30 +252,30 @@

(is (= 1 (count (get ds-res :RN))))
(is (= 8 (get-in ds-res [:RN 0 :content-id])))
(is (= "q001\tq002\tq003\tq004\tq005\t" (String. ^bytes (get-in ds-res [:RN 0 :data]))))
(is (= "q001\tq002\tq003\tq002\tq004\t" (String. ^bytes (get-in ds-res [:RN 0 :data]))))

(is (= 1 (count (get ds-res :MF))))
(is (= 9 (get-in ds-res [:MF 0 :content-id])))
(is (= [1 1 1 0 2] (seq (get-in ds-res [:MF 0 :data]))))
(is (= [1 1 2] (seq (get-in ds-res [:MF 0 :data]))))

(is (= 1 (count (get ds-res :NS))))
(is (= 10 (get-in ds-res [:NS 0 :content-id])))
(is (= [0 0 1 0 0xff 0xff 0xff 0xff 0x0f]
(is (= [0 1 0xff 0xff 0xff 0xff 0x0f]
(map #(bit-and % 0xff) (get-in ds-res [:NS 0 :data]))))

(is (= 1 (count (get ds-res :NP))))
(is (= 11 (get-in ds-res [:NP 0 :content-id])))
(is (= [0x80 0x97 0x0f 0x64 0x05 0x00]
(is (= [0x80 0x97 0x64 0x00]
(map #(bit-and % 0xff) (get-in ds-res [:NP 0 :data]))))

(is (= 1 (count (get ds-res :TS))))
(is (= 12 (get-in ds-res [:TS 0 :content-id])))
(is (= [0x80 0x96 0x0f 0x00 0xff 0xff 0xff 0xff 0x01 0x00]
(is (= [0x80 0x96 0x00 0x00]
(map #(bit-and % 0xff) (get-in ds-res [:TS 0 :data]))))

(is (= 1 (count (get ds-res :NF))))
(is (= 13 (get-in ds-res [:NF 0 :content-id])))
(is (zero? (count (get-in ds-res [:NF 0 :data]))))
(is (= [1] (seq (get-in ds-res [:NF 0 :data]))))

(is (= 1 (count (get ds-res :TL))))
(is (= 14 (get-in ds-res [:TL 0 :content-id])))
Expand Down Expand Up @@ -388,7 +403,7 @@

(is (= 1 (count (get ds-res :CF))))
(is (= 3 (get-in ds-res [:CF 0 :content-id])))
(is (= [3 3 3 3 3] (seq (get-in ds-res [:CF 0 :data]))))
(is (= [5 1 5 1 3] (seq (get-in ds-res [:CF 0 :data]))))

(is (= 1 (count (get ds-res :RI))))
(is (= 4 (get-in ds-res [:RI 0 :content-id])))
Expand Down Expand Up @@ -422,28 +437,24 @@

(is (= 1 (count (get ds-res :MF))))
(is (= 9 (get-in ds-res [:MF 0 :content-id])))
(is (= [2 2 2 2 2] (seq (get-in ds-res [:MF 0 :data]))))
(is (= [2] (seq (get-in ds-res [:MF 0 :data]))))

(is (= 1 (count (get ds-res :NS))))
(is (= 10 (get-in ds-res [:NS 0 :content-id])))
(is (= [0xff 0xff 0xff 0xff 0x0f
0xff 0xff 0xff 0xff 0x0f
0xff 0xff 0xff 0xff 0x0f
0xff 0xff 0xff 0xff 0x0f
0xff 0xff 0xff 0xff 0x0f]
(is (= [0xff 0xff 0xff 0xff 0x0f]
(map #(bit-and % 0xff) (get-in ds-res [:NS 0 :data]))))

(is (= 1 (count (get ds-res :NP))))
(is (= 11 (get-in ds-res [:NP 0 :content-id])))
(is (= [0 0 0 0 0] (seq (get-in ds-res [:NP 0 :data]))))
(is (= [0] (seq (get-in ds-res [:NP 0 :data]))))

(is (= 1 (count (get ds-res :TS))))
(is (= 12 (get-in ds-res [:TS 0 :content-id])))
(is (= [0 0 0 0 0] (seq (get-in ds-res [:TS 0 :data]))))
(is (= [0] (seq (get-in ds-res [:TS 0 :data]))))

(is (= 1 (count (get ds-res :NF))))
(is (= 13 (get-in ds-res [:NF 0 :content-id])))
(is (zero? (count (get-in ds-res [:NF 0 :data]))))
(is (= [0 0] (seq (get-in ds-res [:NF 0 :data]))))

(is (= 1 (count (get ds-res :TL))))
(is (= 14 (get-in ds-res [:TL 0 :content-id])))
Expand Down

0 comments on commit 5a96d6b

Please sign in to comment.