diff --git a/src/cljam/io/cram.clj b/src/cljam/io/cram.clj index 267fb2f6..47ab52dd 100644 --- a/src/cljam/io/cram.clj +++ b/src/cljam/io/cram.clj @@ -81,7 +81,15 @@ sequences into a CRAM file, the CRAM writer, by default, checks if the header specifies `SO:coordinate` and raises an error if it does not. If this option is set to true, the CRAM writer will skip this header - check and proceed regardless of the header's sort order declaration." + check and proceed regardless of the header's sort order declaration. + - omit-read-names?: If set to true, the CRAM writer omits read names (QNAME) + from records while preserving consistency across mate records. + Read name omission does not apply to single-end reads, detached mate + records or mate records that involve secondary or supplementary alignments. + Whether or not secondary/supplementary alignments are involved in a set + of mate records is determined by the tags TC (for secondary alignments) + and SA (for supplementary alignments). Make sure that these tags are + appropriately attached to the records when such alignments are present." (^CRAMWriter [f] (writer f {})) (^CRAMWriter [f option] (cram/writer f option))) diff --git a/src/cljam/io/cram/encode/context.clj b/src/cljam/io/cram/encode/context.clj index edb3b28e..7e392055 100644 --- a/src/cljam/io/cram/encode/context.clj +++ b/src/cljam/io/cram/encode/context.clj @@ -11,6 +11,9 @@ (map-indexed (fn [i {:keys [SN]}] [SN i])) (:SQ cram-header)) preservation-map (cond-> {:RN true, :AP false, :RR true} + (:omit-read-names? options) + (assoc :RN false) + (= (sam.header/sort-order cram-header) sam.header/order-coordinate) (assoc :AP true)) diff --git a/src/cljam/io/cram/encode/record.clj b/src/cljam/io/cram/encode/record.clj index e347b36b..bfffba07 100644 --- a/src/cljam/io/cram/encode/record.clj +++ b/src/cljam/io/cram/encode/record.clj @@ -37,27 +37,35 @@ (AP' (:pos record)) (RG (if rg (get rg-id->idx rg) -1)))))) -(defn- build-read-name-encoder [{:keys [RN]}] - (fn [record] - (RN (.getBytes ^String (:qname record))))) +(defn- build-read-name-encoder [{:keys [options]} {:keys [RN]}] + (if (:omit-read-names? options) + (constantly nil) + (fn [record] + (RN (.getBytes ^String (:qname record)))))) -(defn- build-mate-read-encoder [{:keys [rname->idx]} {:keys [NF MF NS NP TS]}] - (fn [{:keys [^long flag rnext] :as record}] - (if (zero? (bit-and (long (::flag record)) CF_DETACHED)) - (when-let [nf (::next-fragment record)] - (NF nf)) - (let [mate-flag (cond-> 0 - (pos? (bit-and flag (sam.flag/encoded #{:next-reversed}))) - (bit-or 0x01) +(defn- build-mate-read-encoder [{:keys [rname->idx options]} {:keys [RN NF MF NS NP TS]}] + (let [RN' (if (:omit-read-names? options) + ;; read names must be preserved for detached mate records even when + ;; :omit-read-names? is true + #(RN (.getBytes ^String %)) + (constantly nil))] + (fn [{:keys [^long flag rnext] :as record}] + (if (zero? (bit-and (long (::flag record)) CF_DETACHED)) + (when-let [nf (::next-fragment record)] + (NF nf)) + (let [mate-flag (cond-> 0 + (pos? (bit-and flag (sam.flag/encoded #{:next-reversed}))) + (bit-or 0x01) - (pos? (bit-and flag (sam.flag/encoded #{:next-unmapped}))) - (bit-or 0x02))] - (MF mate-flag) - (NS (if (= rnext "=") - (::ref-index record) - (ref-index rname->idx rnext))) - (NP (:pnext record)) - (TS (:tlen record)))))) + (pos? (bit-and flag (sam.flag/encoded #{:next-unmapped}))) + (bit-or 0x02))] + (MF mate-flag) + (RN' (:qname record)) + (NS (if (= rnext "=") + (::ref-index record) + (ref-index rname->idx rnext))) + (NP (:pnext record)) + (TS (:tlen record))))))) (defn- build-auxiliary-tags-encoder [{:keys [tag-dict tag-encoders]} {:keys [TL]}] (let [tag-encoder (fn [{:keys [tag] :as item}] @@ -132,7 +140,7 @@ (defn- build-cram-record-encoder [{:keys [ds-encoders] :as slice-ctx}] (let [pos-encoder (build-positional-data-encoder slice-ctx ds-encoders) - name-encoder (build-read-name-encoder ds-encoders) + name-encoder (build-read-name-encoder slice-ctx ds-encoders) mate-encoder (build-mate-read-encoder slice-ctx ds-encoders) tags-encoder (build-auxiliary-tags-encoder slice-ctx ds-encoders) mapped-encoder (build-mapped-read-encoder ds-encoders) @@ -221,7 +229,17 @@ (bit-and (sam.flag/encoded #{:next-unmapped :next-reversed})) (unsigned-bit-shift-right 1))))) -(defn- resolve-mate! [mate-resolver ^List records ^long i record] +(defn- has-only-primary-mates? [record mate] + (and (if-let [tc (sam.option/value-for-tag :TC record)] + (= tc 2) + true) + (if-let [tc (sam.option/value-for-tag :TC mate)] + (= tc 2) + true) + (nil? (sam.option/value-for-tag :SA record)) + (nil? (sam.option/value-for-tag :SA mate)))) + +(defn- resolve-mate! [mate-resolver omit-read-names? ^List records i record] (when-let [^long mate-index (mate/resolve-mate! mate-resolver i record)] (let [upstream-mate (.get records mate-index)] (when (and (mate-consistent? record upstream-mate) @@ -232,22 +250,28 @@ (zero? s1) (zero? s2)) (if (<= s1 s2) (= tlen1 (- tlen2) (inc (- e2 s1))) - (= (- tlen1) tlen2 (inc (- e1 s2))))))) + (= (- tlen1) tlen2 (inc (- e1 s2)))))) + (or (not omit-read-names?) + ;; to omit the read name from these mate records, they + ;; must not have any secondary/supplementary alignments + (has-only-primary-mates? record upstream-mate))) (.set records mate-index (assoc upstream-mate ::flag (-> (long (::flag upstream-mate)) (bit-and (bit-not CF_DETACHED)) (bit-or CF_MATE_DOWNSTREAM)) - ::next-fragment (dec (- i mate-index)))) + ::next-fragment (dec (- (long i) mate-index)))) mate-index)))) (defn preprocess-slice-records "Preprocesses slice records to calculate some record fields prior to record encoding that are necessary for the CRAM writer to generate some header components." - [{:keys [rname->idx seq-resolver subst-mat-builder tag-dict-builder]} ^List records] + [{:keys [rname->idx seq-resolver subst-mat-builder tag-dict-builder options]} + ^List records] (let [mate-resolver (mate/make-mate-resolver) - stats-builder (stats/make-alignment-stats-builder)] + stats-builder (stats/make-alignment-stats-builder) + omit-read-names? (:omit-read-names? options)] (dotimes [i (.size records)] (let [record (.get records i) ri (ref-index rname->idx (:rname record)) @@ -258,7 +282,7 @@ record' (assoc record ::flag cf ::ref-index ri ::end end ::features fs ::tags-index tags-id) - mate-index (resolve-mate! mate-resolver records i record')] + mate-index (resolve-mate! mate-resolver omit-read-names? records i record')] (stats/update! stats-builder ri (:pos record) end (count (:seq record)) 1) (.set records i (cond-> record' diff --git a/test-resources/cram/paired.sorted.cram b/test-resources/cram/paired.sorted.cram new file mode 100644 index 00000000..029c2671 Binary files /dev/null and b/test-resources/cram/paired.sorted.cram differ diff --git a/test/cljam/io/cram/encode/record_test.clj b/test/cljam/io/cram/encode/record_test.clj index 4610104d..06b2b2bc 100644 --- a/test/cljam/io/cram/encode/record_test.clj +++ b/test/cljam/io/cram/encode/record_test.clj @@ -81,10 +81,13 @@ (is (= 1 @c->t)) (is (= 1 @g->c)))) -(defn- preprocess-slice-records [cram-header subst-mat-init records] - (let [opts {:ds-compressor-overrides (constantly :raw) - :tag-compressor-overrides (constantly (constantly (constantly {:external :raw})))} - container-ctx (-> (context/make-container-context cram-header test-seq-resolver opts) +(defn- preprocess-slice-records [cram-header subst-mat-init opts records] + (let [opts' (merge opts + {:ds-compressor-overrides (constantly :raw) + :tag-compressor-overrides (constantly + (constantly + (constantly {:external :raw})))}) + container-ctx (-> (context/make-container-context cram-header test-seq-resolver opts') (assoc :subst-mat-builder (test-subst-mat-builder subst-mat-init))) stats (record/preprocess-slice-records container-ctx records)] (context/finalize-container-context container-ctx [stats]))) @@ -130,7 +133,7 @@ {:qname "q005", :flag 141, :rname "*", :pos 0, :cigar "*", :rnext "*", :pnext 0, :tlen 0, :seq "*", :qual "*" :options []}]) - container-ctx (preprocess-slice-records cram-header subst-mat-init records)] + container-ctx (preprocess-slice-records cram-header subst-mat-init {} records)] (is (= [{:qname "q001", :flag 99, :rname "ref", :pos 1, :cigar "5M", :rnext "=", :pnext 151, :tlen 150, :seq "AGAAT", :qual "HFHHH" :options [{:RG {:type "Z", :value "rg001"}} @@ -239,7 +242,7 @@ {:qname "q004", :flag 73, :rname "ref", :pos 20, :end 24, :mapq 0, :cigar "5M", :rnext "*", :pnext 0, :tlen 0, :seq "CTGTG", :qual "AEEEE" :options []}]) - slice-ctx (-> (preprocess-slice-records cram-header subst-mat-init records) + slice-ctx (-> (preprocess-slice-records cram-header subst-mat-init {} records) (context/make-slice-context 0)) _ (record/encode-slice-records slice-ctx records) ds-res (walk/prewalk #(if (fn? %) (%) %) (:ds-encoders slice-ctx)) @@ -411,7 +414,7 @@ {:qname "q003", :flag 77, :rname "*", :pos 0, :end 0, :mapq 0, :cigar "*", :rnext "*", :pnext 0, :tlen 0, :seq "GCACA", :qual "BCCFD" :options []}]) - slice-ctx (-> (preprocess-slice-records cram-header {} records) + slice-ctx (-> (preprocess-slice-records cram-header {} {} records) (context/make-slice-context 0)) _ (record/encode-slice-records slice-ctx records) ds-res (walk/prewalk #(if (fn? %) (%) %) (:ds-encoders slice-ctx)) @@ -555,5 +558,72 @@ (map #(+ (long %) 33)) byte-array String.))) + (is (= {} tag-res)))) + (testing "omit read names" + (let [cram-header {:SQ + [{:SN "ref"} + {:SN "ref2"}]} + opts {:omit-read-names? true} + records (ArrayList. + [{:qname "q001", :flag 99, :rname "ref", :pos 1, :end 5, :mapq 60, + :cigar "5M", :rnext "=", :pnext 11, :tlen 15, :seq "AGAAT", :qual "AAAAA" + :options []} + {:qname "q002", :flag 97, :rname "ref", :pos 6, :end 10, :mapq 15, + :cigar "5M", :rnext "=", :pnext 16, :tlen 15, :seq "GTTAG", :qual "AAAAA" + :options [{:TC {:type "i", :value 3}}]} + {:qname "q003", :flag 97, :rname "ref", :pos 11, :end 15, :mapq 30, + :cigar "4M1S", :rnext "=", :pnext 21, :tlen 15, :seq "ATAAA", :qual "AAAAA" + :options [{:SA {:type "Z", :value "ref2,12345,+,4S1M,10,0"}}]} + {:qname "q004", :flag 73, :rname "ref", :pos 16, :end 20, :mapq 60, + :cigar "5M", :rnext "=", :pnext 19, :tlen 15, :seq "ATAGC", :qual "AAAAA" + :options []} + {:qname "q001", :flag 147, :rname "ref", :pos 11, :end 15, :mapq 60, + :cigar "5M", :rnext "=", :pnext 1, :tlen -15, :seq "ATAAG", :qual "AAAAA" + :options []} + {:qname "q002", :flag 145, :rname "ref", :pos 16, :end 20, :mapq 15, + :cigar "5M", :rnext "=", :pnext 6, :tlen -15, :seq "ATAGC", :qual "AAAAA" + :options [{:TC {:type "i", :value 3}}]} + {:qname "q003", :flag 145, :rname "ref", :pos 21, :end 25, :mapq 30, + :cigar "5M", :rnext "=", :pnext 11, :tlen -15, :seq "TGTGC", :qual "AAAAA" + :options []} + {:qname "q005", :flag 77, :rname "*", :pos 0, :end 0, :mapq 0, + :cigar "*", :rnext "*", :pnext 0, :tlen 0, :seq "ATGCA", :qual "AAAAA" + :options []} + {:qname "q005", :flag 141, :rname "*", :pos 0, :end 4, :mapq 0, + :cigar "*", :rnext "*", :pnext 0, :tlen 0, :seq "TGCAT", :qual "AAAAA" + :options []}]) + slice-ctx (-> (preprocess-slice-records cram-header {} opts records) + (context/make-slice-context 0)) + _ (record/encode-slice-records slice-ctx records) + ds-res (walk/prewalk #(if (fn? %) (%) %) (:ds-encoders slice-ctx))] + (is (= 1 (count (get ds-res :CF)))) + (is (= 3 (get-in ds-res [:CF 0 :content-id]))) + (is (= [5 3 3 3 1 3 3 5 1] (seq (get-in ds-res [:CF 0 :data])))) + + (is (= 1 (count (get ds-res :RN)))) + (is (= 8 (get-in ds-res [:RN 0 :content-id]))) + (is (= "q002\tq003\tq004\tq002\tq003\t" (String. ^bytes (get-in ds-res [:RN 0 :data])))) + + (is (= 1 (count (get ds-res :MF)))) + (is (= 9 (get-in ds-res [:MF 0 :content-id]))) + (is (= [1 1 2 0 0] (seq (get-in ds-res [:MF 0 :data])))) + + (is (= 1 (count (get ds-res :NS)))) + (is (= 10 (get-in ds-res [:NS 0 :content-id]))) + (is (= [0 0 0 0 0] + (map #(bit-and % 0xff) (get-in ds-res [:NS 0 :data])))) + + (is (= 1 (count (get ds-res :NP)))) + (is (= 11 (get-in ds-res [:NP 0 :content-id]))) + (is (= [16 21 19 6 11] + (map #(bit-and % 0xff) (get-in ds-res [:NP 0 :data])))) + + (is (= 1 (count (get ds-res :TS)))) + (is (= 12 (get-in ds-res [:TS 0 :content-id]))) + (is (= [15 15 15 0xff 0xff 0xff 0xff 0x01 0xff 0xff 0xff 0xff 0x01] + (map #(bit-and % 0xff) (get-in ds-res [:TS 0 :data])))) + + (is (= 1 (count (get ds-res :NF)))) + (is (= 13 (get-in ds-res [:NF 0 :content-id]))) + (is (= [3 0] (seq (get-in ds-res [:NF 0 :data]))))))) - (is (= {} tag-res))))) diff --git a/test/cljam/io/cram_test.clj b/test/cljam/io/cram_test.clj index b04b9f7c..e6c1a8e7 100644 --- a/test/cljam/io/cram_test.clj +++ b/test/cljam/io/cram_test.clj @@ -3,11 +3,10 @@ [cljam.io.cram.decode.structure :as struct] [cljam.io.cram.reader :as reader] [cljam.io.sam :as sam] - [cljam.test-common :as common :refer [clean-cache! deftest-remote - prepare-cache! - prepare-cavia! - with-before-after]] + [cljam.io.sam.util.flag :as flag] + [cljam.test-common :as common :refer [deftest-remote deftest-slow]] [clojure.java.io :as io] + [clojure.set :as set] [clojure.test :refer [are deftest is testing]]) (:import [cljam.io.cram.reader CRAMReader] [java.nio.channels FileChannel])) @@ -44,7 +43,7 @@ {:chr "ref2", :start 35, :end 35} 2))) (deftest-remote reader-with-multiple-containers-test - (with-before-after {:before (prepare-cavia!)} + (common/with-before-after {:before (common/prepare-cavia!)} (testing "read all the alignments" (with-open [bam-rdr (sam/reader common/medium-bam-file) cram-rdr (cram/reader common/medium-cram-file @@ -77,8 +76,8 @@ {:chr "chr19", :start 54000000, :end 55000000} 12))))) (deftest writer-test - (with-before-after {:before (prepare-cache!) - :after (clean-cache!)} + (common/with-before-after {:before (common/prepare-cache!) + :after (common/clean-cache!)} (testing "unsorted" (with-open [r (cram/reader common/test-cram-file {:reference common/test-fa-file}) @@ -111,9 +110,9 @@ (cram/read-alignments r'))))))) (deftest-remote writer-with-multiple-containers-test - (with-before-after {:before (do (prepare-cavia!) - (prepare-cache!)) - :after (clean-cache!)} + (common/with-before-after {:before (do (common/prepare-cavia!) + (common/prepare-cache!)) + :after (common/clean-cache!)} (with-open [r (cram/reader common/medium-cram-file {:reference common/hg19-twobit-file}) w (cram/writer temp-cram-file @@ -141,8 +140,8 @@ (step)))) (deftest writer-with-reference-embedding-test - (with-before-after {:before (prepare-cache!) - :after (clean-cache!)} + (common/with-before-after {:before (common/prepare-cache!) + :after (common/clean-cache!)} (testing "Reference embedding will be enabled if `:embed-reference?` is set to true" (with-open [r (cram/reader common/test-sorted-cram-file {:reference common/test-fa-file}) @@ -184,8 +183,8 @@ (is (common/not-throw? (cram/write-header w (cram/read-header r)))))))) (deftest writer-index-options-test - (with-before-after {:before (prepare-cache!) - :after (clean-cache!)} + (common/with-before-after {:before (common/prepare-cache!) + :after (common/clean-cache!)} (testing "A CRAM index file won't be created by default" (with-open [r (cram/reader common/test-sorted-cram-file {:reference common/test-fa-file}) @@ -221,3 +220,48 @@ (cram/write-header w (cram/read-header r)) (cram/write-alignments w (cram/read-alignments r) (cram/read-header r))) (is (.exists (io/file (str temp-cram-file-3 ".crai"))))))) + +(deftest-slow writer-with-read-name-omission-test + (common/with-before-after {:before (do (common/prepare-cavia!) + (common/prepare-cache!)) + :after (common/clean-cache!)} + (with-open [r (cram/reader common/paired-sorted-cram-file + {:reference common/hg38-twobit-file}) + w (cram/writer temp-cram-file + {:reference common/hg38-twobit-file + :omit-read-names? true + :min-single-ref-slice-size 1})] + (cram/write-header w (cram/read-header r)) + (cram/write-alignments w (cram/read-alignments r) (cram/read-header r))) + (with-open [r (cram/reader common/paired-sorted-cram-file + {:reference common/hg38-twobit-file}) + r' (cram/reader temp-cram-file + {:reference common/hg38-twobit-file})] + (is (= (cram/read-header r) + (cram/read-header r'))) + (let [alns (vec (cram/read-alignments r)) + alns' (vec (cram/read-alignments r')) + mates (->> alns + (map-indexed #(assoc %2 :idx %1)) + (group-by :qname) + (into {} (map (fn [[qname xs]] + [qname (into #{} (map :idx) xs)])))) + mates' (->> alns' + (map-indexed #(assoc %2 :idx %1)) + (group-by :qname) + (into {} (map (fn [[qname xs]] + [qname (into #{} (map :idx) xs)])))) + qnames (set (keys mates)) + qnames' (set (keys mates'))] + (is (= (map #(dissoc % :qname) alns) + (map #(dissoc % :qname) alns'))) + (is (not= qnames qnames')) + (is (every? #(= (get mates %) (get mates' %)) + (set/intersection qnames qnames'))) + (is (every? (fn [qname] + (let [indices (get mates' qname)] + (and (= (count indices) 2) + (every? #(flag/primary? (:flag (nth alns' %))) + indices)))) + (set/difference qnames' qnames))) + (is (= (set (vals mates)) (set (vals mates')))))))) diff --git a/test/cljam/test_common.clj b/test/cljam/test_common.clj index 91524567..9a00ba61 100644 --- a/test/cljam/test_common.clj +++ b/test/cljam/test_common.clj @@ -35,6 +35,9 @@ {:resources [{:id "hg19.2bit" :url "https://test.chrov.is/data/refs/hg19.2bit" :sha1 "95e5806aee9ecc092d30a482aaa008ef66cbc468"} + {:id "hg38.2bit" + :url "https://test.chrov.is/data/refs/hg38.2bit" + :sha1 "6fb20ba4de0b49247b78e08c2394d0c4f8594148"} {:id "large.bam" :url "https://test.chrov.is/data/GSM721144_H3K36me3.nodup.bam" :sha1 "ad282c3779120057abc274ad8fad1910a4ad867b"} @@ -168,6 +171,7 @@ (def test-cram-file "test-resources/cram/test.cram") (def test-sorted-cram-file "test-resources/cram/test.sorted.cram") (def test-sorted-with-unknown-so-cram-file "test-resources/cram/test.sorted_with_unknown_so.cram") +(def paired-sorted-cram-file "test-resources/cram/paired.sorted.cram") (def medium-cram-file "test-resources/cram/medium.cram") (def medium-with-standard-tags-cram-file "test-resources/cram/medium_with_standard_tags.cram") (def medium-without-index-cram-file "test-resources/cram/medium_without_index.cram") @@ -200,6 +204,7 @@ (def test-twobit-be-n-file "test-resources/twobit/be-test-n.2bit") (def medium-twobit-file "test-resources/twobit/medium.2bit") (def hg19-twobit-file (cavia/resource mycavia "hg19.2bit")) +(def hg38-twobit-file (cavia/resource mycavia "hg38.2bit")) ;; ### FASTQ files