Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Support for encoding non-detached mate records #326

Merged
merged 1 commit into from
Nov 20, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
20 changes: 20 additions & 0 deletions src/cljam/io/cram/encode/mate_records.clj
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
(ns cljam.io.cram.encode.mate-records
(:require [cljam.io.sam.util.flag :as util.flag])
(:import [java.util HashMap]))

(defprotocol IMateResolver
(resolve-mate! [this i record]))

(defn make-mate-resolver
"Creates a new mate resolver."
[]
(let [record-indices (HashMap.)]
(reify IMateResolver
(resolve-mate! [_ i {:keys [flag qname]}]
(when (= (bit-and (long flag)
(util.flag/encoded #{:multiple :secondary :supplementary}))
(util.flag/encoded #{:multiple}))
(if-let [mate-index (.get record-indices qname)]
(do (.remove record-indices mate-index)
mate-index)
(.put record-indices qname i)))))))
88 changes: 66 additions & 22 deletions src/cljam/io/cram/encode/record.clj
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
(ns cljam.io.cram.encode.record
(:require [cljam.io.cram.encode.alignment-stats :as stats]
[cljam.io.cram.encode.mate-records :as mate]
[cljam.io.cram.encode.subst-matrix :as subst-mat]
[cljam.io.cram.encode.tag-dict :as tag-dict]
[cljam.io.cram.seq-resolver.protocol :as resolver]
Expand All @@ -8,6 +9,11 @@
[cljam.io.sam.util.option :as sam.option])
(:import [java.util Arrays List]))

(def ^:private ^:const CF_PRESERVED_QUAL 0x01)
(def ^:private ^:const CF_DETACHED 0x02)
(def ^:private ^:const CF_MATE_DOWNSTREAM 0x04)
(def ^:private ^:const CF_NO_SEQ 0x08)

(defn- ref-index [rname->idx rname]
(if (= rname "*")
-1
Expand Down Expand Up @@ -35,20 +41,23 @@
(fn [record]
(RN (.getBytes ^String (:qname record)))))

(defn- build-mate-read-encoder [{:keys [rname->idx]} {:keys [MF NS NP TS]}]
(defn- build-mate-read-encoder [{:keys [rname->idx]} {:keys [NF MF NS NP TS]}]
(fn [{:keys [^long flag rnext] :as record}]
(let [mate-flag (cond-> 0
(pos? (bit-and flag (sam.flag/encoded #{:next-reversed})))
(bit-or 0x01)

(pos? (bit-and flag (sam.flag/encoded #{:next-unmapped})))
(bit-or 0x02))]
(MF mate-flag)
(NS (if (= rnext "=")
(::ref-index record)
(ref-index rname->idx rnext)))
(NP (:pnext record))
(TS (:tlen record)))))
(if (zero? (bit-and (long (::flag record)) CF_DETACHED))
(when-let [nf (::next-fragment record)]
(NF nf))
(let [mate-flag (cond-> 0
(pos? (bit-and flag (sam.flag/encoded #{:next-reversed})))
(bit-or 0x01)

(pos? (bit-and flag (sam.flag/encoded #{:next-unmapped})))
(bit-or 0x02))]
(MF mate-flag)
(NS (if (= rnext "=")
(::ref-index record)
(ref-index rname->idx rnext)))
(NP (:pnext record))
(TS (:tlen record))))))

(defn- build-auxiliary-tags-encoder [{:keys [tag-dict tag-encoders]} {:keys [TL]}]
(let [tag-encoder (fn [{:keys [tag] :as item}]
Expand Down Expand Up @@ -198,26 +207,61 @@
\P (recur more rpos spos (conj! fs {:code :padding :pos pos :len n}))))
[(persistent! fs) (dec rpos)])))))

(defn- mate-consistent? [record mate]
(and (or (and (= (:rnext mate) "=")
(= (:rname record) (:rname mate)))
(= (:rname record) (:rnext mate)))
(= (long (:pos record)) (long (:pnext mate)))
(= (bit-and (long (:flag record))
(sam.flag/encoded #{:unmapped :reversed}))
;; takes advantages of the fact that:
;; - :unmapped == :next-unmapped >> 1
;; - :reversed == :next-reversed >> 1
(-> (long (:flag mate))
(bit-and (sam.flag/encoded #{:next-unmapped :next-reversed}))
(unsigned-bit-shift-right 1)))))

(defn- resolve-mate! [mate-resolver ^List records ^long i record]
(when-let [^long mate-index (mate/resolve-mate! mate-resolver i record)]
(let [upstream-mate (.get records mate-index)]
(when (and (mate-consistent? record upstream-mate)
(mate-consistent? upstream-mate record)
(let [{^long tlen1 :tlen, ^long s1 :pos, ^long e1 ::end} record
{^long tlen2 :tlen, ^long s2 :pos, ^long e2 ::end} upstream-mate]
(or (and (zero? tlen1) (zero? tlen2)
(zero? s1) (zero? s2))
(if (<= s1 s2)
(= tlen1 (- tlen2) (inc (- e2 s1)))
(= (- tlen1) tlen2 (inc (- e1 s2)))))))
(.set records mate-index
(assoc upstream-mate
::flag (-> (long (::flag upstream-mate))
(bit-and (bit-not CF_DETACHED))
(bit-or CF_MATE_DOWNSTREAM))
::next-fragment (dec (- i mate-index))))
mate-index))))

(defn preprocess-slice-records
"Preprocesses slice records to calculate some record fields prior to record
encoding that are necessary for the CRAM writer to generate some header
components."
[{:keys [rname->idx seq-resolver subst-mat-builder tag-dict-builder]} ^List records]
(let [stats-builder (stats/make-alignment-stats-builder)]
(let [mate-resolver (mate/make-mate-resolver)
stats-builder (stats/make-alignment-stats-builder)]
(dotimes [i (.size records)]
(let [record (.get records i)
;; these flag bits of CF are hard-coded at the moment:
;; - 0x01: quality scores stored as array (true)
;; - 0x02: detached (true)
;; - 0x04: has mate downstream (false)
cf (cond-> 0x03
(= (:seq record) "*") (bit-or 0x08))
ri (ref-index rname->idx (:rname record))
tags-id (tag-dict/assign-tags-id! tag-dict-builder (:options record))
[fs end] (calculate-read-features&end seq-resolver subst-mat-builder record)
cf (cond-> (bit-or CF_PRESERVED_QUAL CF_DETACHED)
(= (:seq record) "*") (bit-or CF_NO_SEQ))
record' (assoc record
::flag cf ::ref-index ri ::end end
::features fs ::tags-index tags-id)]
::features fs ::tags-index tags-id)
mate-index (resolve-mate! mate-resolver records i record')]
(stats/update! stats-builder ri (:pos record) end (count (:seq record)) 1)
(.set records i record')))
(.set records i
(cond-> record'
mate-index
(update ::flag bit-and (bit-not CF_DETACHED))))))
(stats/build stats-builder)))
27 changes: 27 additions & 0 deletions test/cljam/io/cram/encode/mate_records_test.clj
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
(ns cljam.io.cram.encode.mate-records-test
(:require [cljam.io.cram.encode.mate-records :as mate]
[cljam.io.sam.util.flag :as flag]
[clojure.test :refer [deftest is]]))

(deftest make-mate-resolver-test
(let [mate-resolver (mate/make-mate-resolver)]
(is (nil? (mate/resolve-mate! mate-resolver 1
{:qname "q001"
:flag (flag/encoded #{:properly-aligned :multiple
:first :next-reversed})})))
(is (nil? (mate/resolve-mate! mate-resolver 2
{:qname "q002"
:flag (flag/encoded #{:multiple :first :next-unmapped})})))
(is (nil? (mate/resolve-mate! mate-resolver 3
{:qname "q003"
:flag (flag/encoded #{:multiple :first :supplementary})})))
(is (= 2 (mate/resolve-mate! mate-resolver 5
{:qname "q002"
:flag (flag/encoded #{:multiple :last :unmapped})})))
(is (= 1 (mate/resolve-mate! mate-resolver 4
{:qname "q001"
:flag (flag/encoded #{:properly-aligned :multiple
:last :reversed})})))
(is (nil? (mate/resolve-mate! mate-resolver 6
{:qname "q003"
:flag (flag/encoded #{:multiple :first :reversed})})))))
Loading