Skip to content

Commit

Permalink
Merge pull request #326 from chrovis/feature/non-detached-mate-records
Browse files Browse the repository at this point in the history
Support for encoding non-detached mate records
  • Loading branch information
ToshimitsuArai authored Nov 20, 2024
2 parents b1748fd + 1168f6b commit a8966d9
Show file tree
Hide file tree
Showing 4 changed files with 183 additions and 58 deletions.
20 changes: 20 additions & 0 deletions src/cljam/io/cram/encode/mate_records.clj
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
(ns cljam.io.cram.encode.mate-records
(:require [cljam.io.sam.util.flag :as util.flag])
(:import [java.util HashMap]))

(defprotocol IMateResolver
(resolve-mate! [this i record]))

(defn make-mate-resolver
"Creates a new mate resolver."
[]
(let [record-indices (HashMap.)]
(reify IMateResolver
(resolve-mate! [_ i {:keys [flag qname]}]
(when (= (bit-and (long flag)
(util.flag/encoded #{:multiple :secondary :supplementary}))
(util.flag/encoded #{:multiple}))
(if-let [mate-index (.get record-indices qname)]
(do (.remove record-indices mate-index)
mate-index)
(.put record-indices qname i)))))))
88 changes: 66 additions & 22 deletions src/cljam/io/cram/encode/record.clj
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
(ns cljam.io.cram.encode.record
(:require [cljam.io.cram.encode.alignment-stats :as stats]
[cljam.io.cram.encode.mate-records :as mate]
[cljam.io.cram.encode.subst-matrix :as subst-mat]
[cljam.io.cram.encode.tag-dict :as tag-dict]
[cljam.io.cram.seq-resolver.protocol :as resolver]
Expand All @@ -8,6 +9,11 @@
[cljam.io.sam.util.option :as sam.option])
(:import [java.util Arrays List]))

(def ^:private ^:const CF_PRESERVED_QUAL 0x01)
(def ^:private ^:const CF_DETACHED 0x02)
(def ^:private ^:const CF_MATE_DOWNSTREAM 0x04)
(def ^:private ^:const CF_NO_SEQ 0x08)

(defn- ref-index [rname->idx rname]
(if (= rname "*")
-1
Expand Down Expand Up @@ -35,20 +41,23 @@
(fn [record]
(RN (.getBytes ^String (:qname record)))))

(defn- build-mate-read-encoder [{:keys [rname->idx]} {:keys [MF NS NP TS]}]
(defn- build-mate-read-encoder [{:keys [rname->idx]} {:keys [NF MF NS NP TS]}]
(fn [{:keys [^long flag rnext] :as record}]
(let [mate-flag (cond-> 0
(pos? (bit-and flag (sam.flag/encoded #{:next-reversed})))
(bit-or 0x01)

(pos? (bit-and flag (sam.flag/encoded #{:next-unmapped})))
(bit-or 0x02))]
(MF mate-flag)
(NS (if (= rnext "=")
(::ref-index record)
(ref-index rname->idx rnext)))
(NP (:pnext record))
(TS (:tlen record)))))
(if (zero? (bit-and (long (::flag record)) CF_DETACHED))
(when-let [nf (::next-fragment record)]
(NF nf))
(let [mate-flag (cond-> 0
(pos? (bit-and flag (sam.flag/encoded #{:next-reversed})))
(bit-or 0x01)

(pos? (bit-and flag (sam.flag/encoded #{:next-unmapped})))
(bit-or 0x02))]
(MF mate-flag)
(NS (if (= rnext "=")
(::ref-index record)
(ref-index rname->idx rnext)))
(NP (:pnext record))
(TS (:tlen record))))))

(defn- build-auxiliary-tags-encoder [{:keys [tag-dict tag-encoders]} {:keys [TL]}]
(let [tag-encoder (fn [{:keys [tag] :as item}]
Expand Down Expand Up @@ -198,26 +207,61 @@
\P (recur more rpos spos (conj! fs {:code :padding :pos pos :len n}))))
[(persistent! fs) (dec rpos)])))))

(defn- mate-consistent? [record mate]
(and (or (and (= (:rnext mate) "=")
(= (:rname record) (:rname mate)))
(= (:rname record) (:rnext mate)))
(= (long (:pos record)) (long (:pnext mate)))
(= (bit-and (long (:flag record))
(sam.flag/encoded #{:unmapped :reversed}))
;; takes advantages of the fact that:
;; - :unmapped == :next-unmapped >> 1
;; - :reversed == :next-reversed >> 1
(-> (long (:flag mate))
(bit-and (sam.flag/encoded #{:next-unmapped :next-reversed}))
(unsigned-bit-shift-right 1)))))

(defn- resolve-mate! [mate-resolver ^List records ^long i record]
(when-let [^long mate-index (mate/resolve-mate! mate-resolver i record)]
(let [upstream-mate (.get records mate-index)]
(when (and (mate-consistent? record upstream-mate)
(mate-consistent? upstream-mate record)
(let [{^long tlen1 :tlen, ^long s1 :pos, ^long e1 ::end} record
{^long tlen2 :tlen, ^long s2 :pos, ^long e2 ::end} upstream-mate]
(or (and (zero? tlen1) (zero? tlen2)
(zero? s1) (zero? s2))
(if (<= s1 s2)
(= tlen1 (- tlen2) (inc (- e2 s1)))
(= (- tlen1) tlen2 (inc (- e1 s2)))))))
(.set records mate-index
(assoc upstream-mate
::flag (-> (long (::flag upstream-mate))
(bit-and (bit-not CF_DETACHED))
(bit-or CF_MATE_DOWNSTREAM))
::next-fragment (dec (- i mate-index))))
mate-index))))

(defn preprocess-slice-records
"Preprocesses slice records to calculate some record fields prior to record
encoding that are necessary for the CRAM writer to generate some header
components."
[{:keys [rname->idx seq-resolver subst-mat-builder tag-dict-builder]} ^List records]
(let [stats-builder (stats/make-alignment-stats-builder)]
(let [mate-resolver (mate/make-mate-resolver)
stats-builder (stats/make-alignment-stats-builder)]
(dotimes [i (.size records)]
(let [record (.get records i)
;; these flag bits of CF are hard-coded at the moment:
;; - 0x01: quality scores stored as array (true)
;; - 0x02: detached (true)
;; - 0x04: has mate downstream (false)
cf (cond-> 0x03
(= (:seq record) "*") (bit-or 0x08))
ri (ref-index rname->idx (:rname record))
tags-id (tag-dict/assign-tags-id! tag-dict-builder (:options record))
[fs end] (calculate-read-features&end seq-resolver subst-mat-builder record)
cf (cond-> (bit-or CF_PRESERVED_QUAL CF_DETACHED)
(= (:seq record) "*") (bit-or CF_NO_SEQ))
record' (assoc record
::flag cf ::ref-index ri ::end end
::features fs ::tags-index tags-id)]
::features fs ::tags-index tags-id)
mate-index (resolve-mate! mate-resolver records i record')]
(stats/update! stats-builder ri (:pos record) end (count (:seq record)) 1)
(.set records i record')))
(.set records i
(cond-> record'
mate-index
(update ::flag bit-and (bit-not CF_DETACHED))))))
(stats/build stats-builder)))
27 changes: 27 additions & 0 deletions test/cljam/io/cram/encode/mate_records_test.clj
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
(ns cljam.io.cram.encode.mate-records-test
(:require [cljam.io.cram.encode.mate-records :as mate]
[cljam.io.sam.util.flag :as flag]
[clojure.test :refer [deftest is]]))

(deftest make-mate-resolver-test
(let [mate-resolver (mate/make-mate-resolver)]
(is (nil? (mate/resolve-mate! mate-resolver 1
{:qname "q001"
:flag (flag/encoded #{:properly-aligned :multiple
:first :next-reversed})})))
(is (nil? (mate/resolve-mate! mate-resolver 2
{:qname "q002"
:flag (flag/encoded #{:multiple :first :next-unmapped})})))
(is (nil? (mate/resolve-mate! mate-resolver 3
{:qname "q003"
:flag (flag/encoded #{:multiple :first :supplementary})})))
(is (= 2 (mate/resolve-mate! mate-resolver 5
{:qname "q002"
:flag (flag/encoded #{:multiple :last :unmapped})})))
(is (= 1 (mate/resolve-mate! mate-resolver 4
{:qname "q001"
:flag (flag/encoded #{:properly-aligned :multiple
:last :reversed})})))
(is (nil? (mate/resolve-mate! mate-resolver 6
{:qname "q003"
:flag (flag/encoded #{:multiple :first :reversed})})))))
Loading

0 comments on commit a8966d9

Please sign in to comment.