Skip to content

Commit

Permalink
Omit read names if :omit-read-names is specified
Browse files Browse the repository at this point in the history
  • Loading branch information
athos committed Nov 21, 2024
1 parent a8966d9 commit bc9566c
Show file tree
Hide file tree
Showing 7 changed files with 203 additions and 49 deletions.
10 changes: 9 additions & 1 deletion src/cljam/io/cram.clj
Original file line number Diff line number Diff line change
Expand Up @@ -81,7 +81,15 @@
sequences into a CRAM file, the CRAM writer, by default, checks if
the header specifies `SO:coordinate` and raises an error if it does not.
If this option is set to true, the CRAM writer will skip this header
check and proceed regardless of the header's sort order declaration."
check and proceed regardless of the header's sort order declaration.
- omit-read-names?: If set to true, the CRAM writer omits read names (QNAME)
from records while preserving consistency across mate records.
Read name omission does not apply to single-end reads, detached mate
records or mate records that involve secondary or supplementary alignments.
Whether or not secondary/supplementary alignments are involved in a set
of mate records is determined by the tags TC (for secondary alignments)
and SA (for supplementary alignments). Make sure that these tags are
appropriately attached to the records when such alignments are present."
(^CRAMWriter [f] (writer f {}))
(^CRAMWriter [f option] (cram/writer f option)))

Expand Down
3 changes: 3 additions & 0 deletions src/cljam/io/cram/encode/context.clj
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,9 @@
(map-indexed (fn [i {:keys [SN]}] [SN i]))
(:SQ cram-header))
preservation-map (cond-> {:RN true, :AP false, :RR true}
(:omit-read-names? options)
(assoc :RN false)

(= (sam.header/sort-order cram-header)
sam.header/order-coordinate)
(assoc :AP true))
Expand Down
76 changes: 50 additions & 26 deletions src/cljam/io/cram/encode/record.clj
Original file line number Diff line number Diff line change
Expand Up @@ -37,27 +37,35 @@
(AP' (:pos record))
(RG (if rg (get rg-id->idx rg) -1))))))

(defn- build-read-name-encoder [{:keys [RN]}]
(fn [record]
(RN (.getBytes ^String (:qname record)))))
(defn- build-read-name-encoder [{:keys [options]} {:keys [RN]}]
(if (:omit-read-names? options)
(constantly nil)
(fn [record]
(RN (.getBytes ^String (:qname record))))))

(defn- build-mate-read-encoder [{:keys [rname->idx]} {:keys [NF MF NS NP TS]}]
(fn [{:keys [^long flag rnext] :as record}]
(if (zero? (bit-and (long (::flag record)) CF_DETACHED))
(when-let [nf (::next-fragment record)]
(NF nf))
(let [mate-flag (cond-> 0
(pos? (bit-and flag (sam.flag/encoded #{:next-reversed})))
(bit-or 0x01)
(defn- build-mate-read-encoder [{:keys [rname->idx options]} {:keys [RN NF MF NS NP TS]}]
(let [RN' (if (:omit-read-names? options)
;; read names must be preserved for detached mate records even when
;; :omit-read-names? is true
#(RN (.getBytes ^String %))
(constantly nil))]
(fn [{:keys [^long flag rnext] :as record}]
(if (zero? (bit-and (long (::flag record)) CF_DETACHED))
(when-let [nf (::next-fragment record)]
(NF nf))
(let [mate-flag (cond-> 0
(pos? (bit-and flag (sam.flag/encoded #{:next-reversed})))
(bit-or 0x01)

(pos? (bit-and flag (sam.flag/encoded #{:next-unmapped})))
(bit-or 0x02))]
(MF mate-flag)
(NS (if (= rnext "=")
(::ref-index record)
(ref-index rname->idx rnext)))
(NP (:pnext record))
(TS (:tlen record))))))
(pos? (bit-and flag (sam.flag/encoded #{:next-unmapped})))
(bit-or 0x02))]
(MF mate-flag)
(RN' (:qname record))
(NS (if (= rnext "=")
(::ref-index record)
(ref-index rname->idx rnext)))
(NP (:pnext record))
(TS (:tlen record)))))))

(defn- build-auxiliary-tags-encoder [{:keys [tag-dict tag-encoders]} {:keys [TL]}]
(let [tag-encoder (fn [{:keys [tag] :as item}]
Expand Down Expand Up @@ -132,7 +140,7 @@

(defn- build-cram-record-encoder [{:keys [ds-encoders] :as slice-ctx}]
(let [pos-encoder (build-positional-data-encoder slice-ctx ds-encoders)
name-encoder (build-read-name-encoder ds-encoders)
name-encoder (build-read-name-encoder slice-ctx ds-encoders)
mate-encoder (build-mate-read-encoder slice-ctx ds-encoders)
tags-encoder (build-auxiliary-tags-encoder slice-ctx ds-encoders)
mapped-encoder (build-mapped-read-encoder ds-encoders)
Expand Down Expand Up @@ -221,7 +229,17 @@
(bit-and (sam.flag/encoded #{:next-unmapped :next-reversed}))
(unsigned-bit-shift-right 1)))))

(defn- resolve-mate! [mate-resolver ^List records ^long i record]
(defn- has-only-primary-mates? [record mate]
(and (if-let [tc (sam.option/value-for-tag :TC record)]
(= tc 2)
true)
(if-let [tc (sam.option/value-for-tag :TC mate)]
(= tc 2)
true)
(nil? (sam.option/value-for-tag :SA record))
(nil? (sam.option/value-for-tag :SA mate))))

(defn- resolve-mate! [mate-resolver omit-read-names? ^List records i record]
(when-let [^long mate-index (mate/resolve-mate! mate-resolver i record)]
(let [upstream-mate (.get records mate-index)]
(when (and (mate-consistent? record upstream-mate)
Expand All @@ -232,22 +250,28 @@
(zero? s1) (zero? s2))
(if (<= s1 s2)
(= tlen1 (- tlen2) (inc (- e2 s1)))
(= (- tlen1) tlen2 (inc (- e1 s2)))))))
(= (- tlen1) tlen2 (inc (- e1 s2))))))
(or (not omit-read-names?)
;; to omit the read name from these mate records, they
;; must not have any secondary/supplementary alignments
(has-only-primary-mates? record upstream-mate)))
(.set records mate-index
(assoc upstream-mate
::flag (-> (long (::flag upstream-mate))
(bit-and (bit-not CF_DETACHED))
(bit-or CF_MATE_DOWNSTREAM))
::next-fragment (dec (- i mate-index))))
::next-fragment (dec (- (long i) mate-index))))
mate-index))))

(defn preprocess-slice-records
"Preprocesses slice records to calculate some record fields prior to record
encoding that are necessary for the CRAM writer to generate some header
components."
[{:keys [rname->idx seq-resolver subst-mat-builder tag-dict-builder]} ^List records]
[{:keys [rname->idx seq-resolver subst-mat-builder tag-dict-builder options]}
^List records]
(let [mate-resolver (mate/make-mate-resolver)
stats-builder (stats/make-alignment-stats-builder)]
stats-builder (stats/make-alignment-stats-builder)
omit-read-names? (:omit-read-names? options)]
(dotimes [i (.size records)]
(let [record (.get records i)
ri (ref-index rname->idx (:rname record))
Expand All @@ -258,7 +282,7 @@
record' (assoc record
::flag cf ::ref-index ri ::end end
::features fs ::tags-index tags-id)
mate-index (resolve-mate! mate-resolver records i record')]
mate-index (resolve-mate! mate-resolver omit-read-names? records i record')]
(stats/update! stats-builder ri (:pos record) end (count (:seq record)) 1)
(.set records i
(cond-> record'
Expand Down
Binary file added test-resources/cram/paired.sorted.cram
Binary file not shown.
86 changes: 78 additions & 8 deletions test/cljam/io/cram/encode/record_test.clj
Original file line number Diff line number Diff line change
Expand Up @@ -81,10 +81,13 @@
(is (= 1 @c->t))
(is (= 1 @g->c))))

(defn- preprocess-slice-records [cram-header subst-mat-init records]
(let [opts {:ds-compressor-overrides (constantly :raw)
:tag-compressor-overrides (constantly (constantly (constantly {:external :raw})))}
container-ctx (-> (context/make-container-context cram-header test-seq-resolver opts)
(defn- preprocess-slice-records [cram-header subst-mat-init opts records]
(let [opts' (merge opts
{:ds-compressor-overrides (constantly :raw)
:tag-compressor-overrides (constantly
(constantly
(constantly {:external :raw})))})
container-ctx (-> (context/make-container-context cram-header test-seq-resolver opts')
(assoc :subst-mat-builder (test-subst-mat-builder subst-mat-init)))
stats (record/preprocess-slice-records container-ctx records)]
(context/finalize-container-context container-ctx [stats])))
Expand Down Expand Up @@ -130,7 +133,7 @@
{:qname "q005", :flag 141, :rname "*", :pos 0, :cigar "*",
:rnext "*", :pnext 0, :tlen 0, :seq "*", :qual "*"
:options []}])
container-ctx (preprocess-slice-records cram-header subst-mat-init records)]
container-ctx (preprocess-slice-records cram-header subst-mat-init {} records)]
(is (= [{:qname "q001", :flag 99, :rname "ref", :pos 1, :cigar "5M",
:rnext "=", :pnext 151, :tlen 150, :seq "AGAAT", :qual "HFHHH"
:options [{:RG {:type "Z", :value "rg001"}}
Expand Down Expand Up @@ -239,7 +242,7 @@
{:qname "q004", :flag 73, :rname "ref", :pos 20, :end 24, :mapq 0,
:cigar "5M", :rnext "*", :pnext 0, :tlen 0, :seq "CTGTG", :qual "AEEEE"
:options []}])
slice-ctx (-> (preprocess-slice-records cram-header subst-mat-init records)
slice-ctx (-> (preprocess-slice-records cram-header subst-mat-init {} records)
(context/make-slice-context 0))
_ (record/encode-slice-records slice-ctx records)
ds-res (walk/prewalk #(if (fn? %) (%) %) (:ds-encoders slice-ctx))
Expand Down Expand Up @@ -411,7 +414,7 @@
{:qname "q003", :flag 77, :rname "*", :pos 0, :end 0, :mapq 0,
:cigar "*", :rnext "*", :pnext 0, :tlen 0, :seq "GCACA", :qual "BCCFD"
:options []}])
slice-ctx (-> (preprocess-slice-records cram-header {} records)
slice-ctx (-> (preprocess-slice-records cram-header {} {} records)
(context/make-slice-context 0))
_ (record/encode-slice-records slice-ctx records)
ds-res (walk/prewalk #(if (fn? %) (%) %) (:ds-encoders slice-ctx))
Expand Down Expand Up @@ -555,5 +558,72 @@
(map #(+ (long %) 33))
byte-array
String.)))
(is (= {} tag-res))))
(testing "omit read names"
(let [cram-header {:SQ
[{:SN "ref"}
{:SN "ref2"}]}
opts {:omit-read-names? true}
records (ArrayList.
[{:qname "q001", :flag 99, :rname "ref", :pos 1, :end 5, :mapq 60,
:cigar "5M", :rnext "=", :pnext 11, :tlen 15, :seq "AGAAT", :qual "AAAAA"
:options []}
{:qname "q002", :flag 97, :rname "ref", :pos 6, :end 10, :mapq 15,
:cigar "5M", :rnext "=", :pnext 16, :tlen 15, :seq "GTTAG", :qual "AAAAA"
:options [{:TC {:type "i", :value 3}}]}
{:qname "q003", :flag 97, :rname "ref", :pos 11, :end 15, :mapq 30,
:cigar "4M1S", :rnext "=", :pnext 21, :tlen 15, :seq "ATAAA", :qual "AAAAA"
:options [{:SA {:type "Z", :value "ref2,12345,+,4S1M,10,0"}}]}
{:qname "q004", :flag 73, :rname "ref", :pos 16, :end 20, :mapq 60,
:cigar "5M", :rnext "=", :pnext 19, :tlen 15, :seq "ATAGC", :qual "AAAAA"
:options []}
{:qname "q001", :flag 147, :rname "ref", :pos 11, :end 15, :mapq 60,
:cigar "5M", :rnext "=", :pnext 1, :tlen -15, :seq "ATAAG", :qual "AAAAA"
:options []}
{:qname "q002", :flag 145, :rname "ref", :pos 16, :end 20, :mapq 15,
:cigar "5M", :rnext "=", :pnext 6, :tlen -15, :seq "ATAGC", :qual "AAAAA"
:options [{:TC {:type "i", :value 3}}]}
{:qname "q003", :flag 145, :rname "ref", :pos 21, :end 25, :mapq 30,
:cigar "5M", :rnext "=", :pnext 11, :tlen -15, :seq "TGTGC", :qual "AAAAA"
:options []}
{:qname "q005", :flag 77, :rname "*", :pos 0, :end 0, :mapq 0,
:cigar "*", :rnext "*", :pnext 0, :tlen 0, :seq "ATGCA", :qual "AAAAA"
:options []}
{:qname "q005", :flag 141, :rname "*", :pos 0, :end 4, :mapq 0,
:cigar "*", :rnext "*", :pnext 0, :tlen 0, :seq "TGCAT", :qual "AAAAA"
:options []}])
slice-ctx (-> (preprocess-slice-records cram-header {} opts records)
(context/make-slice-context 0))
_ (record/encode-slice-records slice-ctx records)
ds-res (walk/prewalk #(if (fn? %) (%) %) (:ds-encoders slice-ctx))]
(is (= 1 (count (get ds-res :CF))))
(is (= 3 (get-in ds-res [:CF 0 :content-id])))
(is (= [5 3 3 3 1 3 3 5 1] (seq (get-in ds-res [:CF 0 :data]))))

(is (= 1 (count (get ds-res :RN))))
(is (= 8 (get-in ds-res [:RN 0 :content-id])))
(is (= "q002\tq003\tq004\tq002\tq003\t" (String. ^bytes (get-in ds-res [:RN 0 :data]))))

(is (= 1 (count (get ds-res :MF))))
(is (= 9 (get-in ds-res [:MF 0 :content-id])))
(is (= [1 1 2 0 0] (seq (get-in ds-res [:MF 0 :data]))))

(is (= 1 (count (get ds-res :NS))))
(is (= 10 (get-in ds-res [:NS 0 :content-id])))
(is (= [0 0 0 0 0]
(map #(bit-and % 0xff) (get-in ds-res [:NS 0 :data]))))

(is (= 1 (count (get ds-res :NP))))
(is (= 11 (get-in ds-res [:NP 0 :content-id])))
(is (= [16 21 19 6 11]
(map #(bit-and % 0xff) (get-in ds-res [:NP 0 :data]))))

(is (= 1 (count (get ds-res :TS))))
(is (= 12 (get-in ds-res [:TS 0 :content-id])))
(is (= [15 15 15 0xff 0xff 0xff 0xff 0x01 0xff 0xff 0xff 0xff 0x01]
(map #(bit-and % 0xff) (get-in ds-res [:TS 0 :data]))))

(is (= 1 (count (get ds-res :NF))))
(is (= 13 (get-in ds-res [:NF 0 :content-id])))
(is (= [3 0] (seq (get-in ds-res [:NF 0 :data])))))))

(is (= {} tag-res)))))
72 changes: 58 additions & 14 deletions test/cljam/io/cram_test.clj
Original file line number Diff line number Diff line change
Expand Up @@ -3,11 +3,10 @@
[cljam.io.cram.decode.structure :as struct]
[cljam.io.cram.reader :as reader]
[cljam.io.sam :as sam]
[cljam.test-common :as common :refer [clean-cache! deftest-remote
prepare-cache!
prepare-cavia!
with-before-after]]
[cljam.io.sam.util.flag :as flag]
[cljam.test-common :as common :refer [deftest-remote deftest-slow]]
[clojure.java.io :as io]
[clojure.set :as set]
[clojure.test :refer [are deftest is testing]])
(:import [cljam.io.cram.reader CRAMReader]
[java.nio.channels FileChannel]))
Expand Down Expand Up @@ -44,7 +43,7 @@
{:chr "ref2", :start 35, :end 35} 2)))

(deftest-remote reader-with-multiple-containers-test
(with-before-after {:before (prepare-cavia!)}
(common/with-before-after {:before (common/prepare-cavia!)}
(testing "read all the alignments"
(with-open [bam-rdr (sam/reader common/medium-bam-file)
cram-rdr (cram/reader common/medium-cram-file
Expand Down Expand Up @@ -77,8 +76,8 @@
{:chr "chr19", :start 54000000, :end 55000000} 12)))))

(deftest writer-test
(with-before-after {:before (prepare-cache!)
:after (clean-cache!)}
(common/with-before-after {:before (common/prepare-cache!)
:after (common/clean-cache!)}
(testing "unsorted"
(with-open [r (cram/reader common/test-cram-file
{:reference common/test-fa-file})
Expand Down Expand Up @@ -111,9 +110,9 @@
(cram/read-alignments r')))))))

(deftest-remote writer-with-multiple-containers-test
(with-before-after {:before (do (prepare-cavia!)
(prepare-cache!))
:after (clean-cache!)}
(common/with-before-after {:before (do (common/prepare-cavia!)
(common/prepare-cache!))
:after (common/clean-cache!)}
(with-open [r (cram/reader common/medium-cram-file
{:reference common/hg19-twobit-file})
w (cram/writer temp-cram-file
Expand Down Expand Up @@ -141,8 +140,8 @@
(step))))

(deftest writer-with-reference-embedding-test
(with-before-after {:before (prepare-cache!)
:after (clean-cache!)}
(common/with-before-after {:before (common/prepare-cache!)
:after (common/clean-cache!)}
(testing "Reference embedding will be enabled if `:embed-reference?` is set to true"
(with-open [r (cram/reader common/test-sorted-cram-file
{:reference common/test-fa-file})
Expand Down Expand Up @@ -184,8 +183,8 @@
(is (common/not-throw? (cram/write-header w (cram/read-header r))))))))

(deftest writer-index-options-test
(with-before-after {:before (prepare-cache!)
:after (clean-cache!)}
(common/with-before-after {:before (common/prepare-cache!)
:after (common/clean-cache!)}
(testing "A CRAM index file won't be created by default"
(with-open [r (cram/reader common/test-sorted-cram-file
{:reference common/test-fa-file})
Expand Down Expand Up @@ -221,3 +220,48 @@
(cram/write-header w (cram/read-header r))
(cram/write-alignments w (cram/read-alignments r) (cram/read-header r)))
(is (.exists (io/file (str temp-cram-file-3 ".crai")))))))

(deftest-slow writer-with-read-name-omission-test
(common/with-before-after {:before (do (common/prepare-cavia!)
(common/prepare-cache!))
:after (common/clean-cache!)}
(with-open [r (cram/reader common/paired-sorted-cram-file
{:reference common/hg38-twobit-file})
w (cram/writer temp-cram-file
{:reference common/hg38-twobit-file
:omit-read-names? true
:min-single-ref-slice-size 1})]
(cram/write-header w (cram/read-header r))
(cram/write-alignments w (cram/read-alignments r) (cram/read-header r)))
(with-open [r (cram/reader common/paired-sorted-cram-file
{:reference common/hg38-twobit-file})
r' (cram/reader temp-cram-file
{:reference common/hg38-twobit-file})]
(is (= (cram/read-header r)
(cram/read-header r')))
(let [alns (vec (cram/read-alignments r))
alns' (vec (cram/read-alignments r'))
mates (->> alns
(map-indexed #(assoc %2 :idx %1))
(group-by :qname)
(into {} (map (fn [[qname xs]]
[qname (into #{} (map :idx) xs)]))))
mates' (->> alns'
(map-indexed #(assoc %2 :idx %1))
(group-by :qname)
(into {} (map (fn [[qname xs]]
[qname (into #{} (map :idx) xs)]))))
qnames (set (keys mates))
qnames' (set (keys mates'))]
(is (= (map #(dissoc % :qname) alns)
(map #(dissoc % :qname) alns')))
(is (not= qnames qnames'))
(is (every? #(= (get mates %) (get mates' %))
(set/intersection qnames qnames')))
(is (every? (fn [qname]
(let [indices (get mates' qname)]
(and (= (count indices) 2)
(every? #(flag/primary? (:flag (nth alns' %)))
indices))))
(set/difference qnames' qnames)))
(is (= (set (vals mates)) (set (vals mates'))))))))
Loading

0 comments on commit bc9566c

Please sign in to comment.