diff --git a/src/cljam/io/cram/encode/context.clj b/src/cljam/io/cram/encode/context.clj index a5c4a8d2..4f926eeb 100644 --- a/src/cljam/io/cram/encode/context.clj +++ b/src/cljam/io/cram/encode/context.clj @@ -1,13 +1,18 @@ (ns cljam.io.cram.encode.context (:require [cljam.io.cram.data-series :as ds] - [cljam.io.cram.encode.tag-dict :as tag-dict])) + [cljam.io.cram.encode.tag-dict :as tag-dict] + [cljam.io.sam.util.header :as sam.header])) (defn make-container-context "Creates a new container context." - [cram-header preservation-map seq-resolver] + [cram-header seq-resolver] (let [rname->idx (into {} (map-indexed (fn [i {:keys [SN]}] [SN i])) (:SQ cram-header)) + preservation-map (cond-> {:RN true, :AP false, :RR true} + (= (sam.header/sort-order cram-header) + sam.header/order-coordinate) + (assoc :AP true)) subst-mat {\A {\T 0, \G 1, \C 2, \N 3} \T {\A 0, \G 1, \C 2, \N 3} \G {\A 0, \T 1, \C 2, \N 3} @@ -25,24 +30,26 @@ "Finalizes the builders in the container context and returns a new container context containing those builders' results. This operation must be done before creating a slice context." - [container-ctx ds-compressor-overrides tag-compressor-overrides] + [container-ctx alignment-stats ds-compressor-overrides tag-compressor-overrides] (let [ds-encodings (-> ds/default-data-series-encodings (ds/apply-ds-compressor-overrides ds-compressor-overrides)) tag-dict (tag-dict/build-tag-dict (:tag-dict-builder container-ctx)) tag-encodings (-> (tag-dict/build-tag-encodings tag-dict) (ds/apply-tag-compressor-overrides tag-compressor-overrides))] (assoc container-ctx + :alignment-stats alignment-stats :ds-encodings ds-encodings :tag-dict tag-dict :tag-encodings tag-encodings))) (defn make-slice-context - "Creates a slice context from the given container context. Note that the container - context must be finalized with `finalize-container-context`." - [{:keys [ds-encodings tag-encodings] :as container-ctx}] + "Creates a slice context for the ith slice from the given container context. + Note that the container context must be finalized with `finalize-container-context`." + [{:keys [alignment-stats ds-encodings tag-encodings] :as container-ctx} i] (let [ds-encoders (ds/build-data-series-encoders ds-encodings) tag-encoders (ds/build-tag-encoders tag-encodings)] (assoc container-ctx + :alignment-stats (nth alignment-stats i) :ds-encoders ds-encoders :tag-encoders tag-encoders))) diff --git a/src/cljam/io/cram/encode/record.clj b/src/cljam/io/cram/encode/record.clj index bcd10e4a..fc5d4fec 100644 --- a/src/cljam/io/cram/encode/record.clj +++ b/src/cljam/io/cram/encode/record.clj @@ -12,15 +12,22 @@ -1 (get rname->idx rname))) -(defn- build-positional-data-encoder [{:keys [cram-header]} {:keys [RI RL AP RG]}] +(defn- build-positional-data-encoder + [{:keys [cram-header preservation-map alignment-stats]} {:keys [RI RL AP RG]}] (let [rg-id->idx (into {} (map-indexed (fn [i {:keys [ID]}] [ID i])) - (:RG cram-header))] + (:RG cram-header)) + AP' (if (:AP preservation-map) + (let [pos (volatile! (:start alignment-stats))] + (fn [^long pos'] + (AP (- pos' (long @pos))) + (vreset! pos pos'))) + AP)] (fn [record] (let [rg (sam.option/value-for-tag :RG record)] (RI (::ref-index record)) (RL (count (:seq record))) - (AP (:pos record)) + (AP' (:pos record)) (RG (if rg (get rg-id->idx rg) -1)))))) (defn- build-read-name-encoder [{:keys [RN]}] @@ -113,8 +120,7 @@ (BA (aget bs i))) (encode-qual record QS)))) -(defn- build-cram-record-encoder - [{:keys [ds-encoders] :as slice-ctx} stats-builder] +(defn- build-cram-record-encoder [{:keys [ds-encoders] :as slice-ctx}] (let [pos-encoder (build-positional-data-encoder slice-ctx ds-encoders) name-encoder (build-read-name-encoder ds-encoders) mate-encoder (build-mate-read-encoder slice-ctx ds-encoders) @@ -125,9 +131,6 @@ (fn [record] (let [bf (bit-and (long (:flag record)) (bit-not (sam.flag/encoded #{:next-reversed :next-unmapped})))] - (stats/update! stats-builder - (::ref-index record) (:pos record) (::end record) - (count (:seq record)) 1) (BF bf) (CF (::flag record)) (pos-encoder record) @@ -142,10 +145,8 @@ "Encodes CRAM records in a slice all at once using the given slice context. Returns the alignment stats for this slice." [slice-ctx records] - (let [stats-builder (stats/make-alignment-stats-builder) - record-encoder (build-cram-record-encoder slice-ctx stats-builder)] - (run! record-encoder records) - (stats/build stats-builder))) + (let [record-encoder (build-cram-record-encoder slice-ctx)] + (run! record-encoder records))) (defn- add-mismatches [n subst-mat ^bytes ref-bases rpos ^bytes read-bases ^bytes qs spos fs] @@ -203,18 +204,21 @@ encoding that are necessary for the CRAM writer to generate some header components." [{:keys [rname->idx subst-mat seq-resolver tag-dict-builder]} ^List records] - (dotimes [i (.size records)] - (let [record (.get records i) - ;; these flag bits of CF are hard-coded at the moment: - ;; - 0x01: quality scores stored as array (true) - ;; - 0x02: detached (true) - ;; - 0x04: has mate downstream (false) - cf (cond-> 0x03 - (= (:seq record) "*") (bit-or 0x08)) - ri (ref-index rname->idx (:rname record)) - tags-id (tag-dict/assign-tags-id! tag-dict-builder (:options record)) - [fs end] (calculate-read-features&end seq-resolver subst-mat record) - record' (assoc record - ::flag cf ::ref-index ri ::end end - ::features fs ::tags-index tags-id)] - (.set records i record')))) + (let [stats-builder (stats/make-alignment-stats-builder)] + (dotimes [i (.size records)] + (let [record (.get records i) + ;; these flag bits of CF are hard-coded at the moment: + ;; - 0x01: quality scores stored as array (true) + ;; - 0x02: detached (true) + ;; - 0x04: has mate downstream (false) + cf (cond-> 0x03 + (= (:seq record) "*") (bit-or 0x08)) + ri (ref-index rname->idx (:rname record)) + tags-id (tag-dict/assign-tags-id! tag-dict-builder (:options record)) + [fs end] (calculate-read-features&end seq-resolver subst-mat record) + record' (assoc record + ::flag cf ::ref-index ri ::end end + ::features fs ::tags-index tags-id)] + (stats/update! stats-builder ri (:pos record) end (count (:seq record)) 1) + (.set records i record'))) + (stats/build stats-builder))) diff --git a/src/cljam/io/cram/writer.clj b/src/cljam/io/cram/writer.clj index 66b2dd92..1a8b7bd7 100644 --- a/src/cljam/io/cram/writer.clj +++ b/src/cljam/io/cram/writer.clj @@ -50,13 +50,13 @@ (struct/encode-cram-header-container (.-stream wtr) header)) (defn- preprocess-records - [cram-header preservation-map seq-resolver options ^objects container-records] - (let [container-ctx (context/make-container-context cram-header - preservation-map - seq-resolver) - {:keys [ds-compressor-overrides tag-compressor-overrides]} options] - (run! (partial record/preprocess-slice-records container-ctx) container-records) + [cram-header seq-resolver options ^objects container-records] + (let [container-ctx (context/make-container-context cram-header seq-resolver) + {:keys [ds-compressor-overrides tag-compressor-overrides]} options + stats (mapv (partial record/preprocess-slice-records container-ctx) + container-records)] (context/finalize-container-context container-ctx + stats ds-compressor-overrides tag-compressor-overrides))) @@ -91,9 +91,9 @@ :bases nbases :records nrecords}) -(defn- generate-slice [container-ctx counter slice-records] - (let [slice-ctx (context/make-slice-context container-ctx) - stats (record/encode-slice-records slice-ctx slice-records) +(defn- generate-slice [slice-ctx counter slice-records] + (record/encode-slice-records slice-ctx slice-records) + (let [stats (:alignment-stats slice-ctx) blocks (generate-blocks slice-ctx) ref-md5 (reference-md5 slice-ctx stats) header (assoc (stats->header-base stats) @@ -103,7 +103,6 @@ header-block (struct/generate-slice-header-block header blocks) block-data (mapv :data blocks)] {:header header - :stats stats :header-block header-block :data-blocks block-data :counter (+ (long counter) (long (:nrecords stats))) @@ -112,10 +111,11 @@ (apply + (alength header-block)))})) (defn- generate-slices [container-ctx counter container-records] - (loop [[slice-records & more] container-records, counter counter, acc []] + (loop [i 0, [slice-records & more] container-records, counter counter, acc []] (if slice-records - (let [slice (generate-slice container-ctx counter slice-records)] - (recur more (:counter slice) (conj acc slice))) + (let [slice-ctx (context/make-slice-context container-ctx i) + slice (generate-slice slice-ctx counter slice-records)] + (recur (inc i) more (:counter slice) (conj acc slice))) acc))) (defn- generate-compression-header-block @@ -123,8 +123,8 @@ (struct/generate-compression-header-block preservation-map subst-mat tag-dict ds-encodings tag-encodings)) -(defn- generate-container-header [^bytes compression-header-block slices] - (let [stats (stats/merge-stats (map :stats slices)) +(defn- generate-container-header [container-ctx ^bytes compression-header-block slices] + (let [stats (stats/merge-stats (:alignment-stats container-ctx)) container-len (->> slices (map (fn [{:keys [^bytes header-block data-blocks]}] (apply + (alength header-block) @@ -172,12 +172,13 @@ (crai/write-index-entries index-writer entries))) (defn- write-container [^CRAMWriter wtr cram-header counter container-records] - (let [preservation-map {:RN true, :AP false, :RR true} - container-ctx (preprocess-records cram-header preservation-map (.-seq-resolver wtr) + (let [container-ctx (preprocess-records cram-header (.-seq-resolver wtr) (.-options wtr) container-records) slices (generate-slices container-ctx counter container-records) compression-header-block (generate-compression-header-block container-ctx) - container-header (generate-container-header compression-header-block slices) + container-header (generate-container-header container-ctx + compression-header-block + slices) ^DataOutputStream out (.-stream wtr) container-offset (.size out)] (struct/encode-container-header out (assoc container-header :counter counter)) diff --git a/test-resources/cram/medium.cram b/test-resources/cram/medium.cram index 6a0541c5..0de7918d 100644 Binary files a/test-resources/cram/medium.cram and b/test-resources/cram/medium.cram differ diff --git a/test-resources/cram/medium.cram.crai b/test-resources/cram/medium.cram.crai index 2b4b9472..39462835 100644 Binary files a/test-resources/cram/medium.cram.crai and b/test-resources/cram/medium.cram.crai differ diff --git a/test/cljam/algo/cram_indexer_test.clj b/test/cljam/algo/cram_indexer_test.clj index b155f5ed..2536d54b 100644 --- a/test/cljam/algo/cram_indexer_test.clj +++ b/test/cljam/algo/cram_indexer_test.clj @@ -12,11 +12,11 @@ (doall (#'crai/read-index-entries r)))) (deftest create-index-test - (let [f (io/file common/temp-dir "medium.cram.crai")] + (let [f (io/file common/temp-dir "medium_without_index.cram.crai")] (common/with-before-after {:before (common/prepare-cache!) :after (common/clean-cache!)} (is (thrown-with-msg? Exception #"Cannot create CRAM index file .*" - (indexer/create-index common/medium-cram-file f))) + (indexer/create-index common/medium-without-index-cram-file f))) (indexer/create-index common/medium-cram-file f :skip-sort-order-check? true) (is (= (read-index-entries common/medium-crai-file) diff --git a/test/cljam/io/crai_test.clj b/test/cljam/io/crai_test.clj index 197e68b3..17dd3af0 100644 --- a/test/cljam/io/crai_test.clj +++ b/test/cljam/io/crai_test.clj @@ -17,121 +17,121 @@ [{:chr "chr1" :start 546609 :end (+ 546609 205262429) - :container-offset 324 + :container-offset 327 :slice-offset 563 - :size 22007} + :size 21475} {:chr "chr1" :start 206547069 :end (+ 206547069 42644506) - :container-offset 324 - :slice-offset 22570 - :size 7349}] + :container-offset 327 + :slice-offset 22038 + :size 7234}] "chr1" 550000 600000 [{:chr "chr1" :start 546609 :end (+ 546609 205262429) - :container-offset 324 + :container-offset 327 :slice-offset 563 - :size 22007}] + :size 21475}] "chr1" 210000000 240000000 [{:chr "chr1" :start 206547069 :end (+ 206547069 42644506) - :container-offset 324 - :slice-offset 22570 - :size 7349}] + :container-offset 327 + :slice-offset 22038 + :size 7234}] "chr1" 200000000 210000000 [{:chr "chr1" :start 546609 :end (+ 546609 205262429) - :container-offset 324 + :container-offset 327 :slice-offset 563 - :size 22007} + :size 21475} {:chr "chr1" :start 206547069 :end (+ 206547069 42644506) - :container-offset 324 - :slice-offset 22570 - :size 7349}] + :container-offset 327 + :slice-offset 22038 + :size 7234}] "*" 0 0 [{:chr "*" :start 0 :end 0 - :container-offset 354657 - :slice-offset 563 - :size 23119} + :container-offset 368626 + :slice-offset 541 + :size 15676} {:chr "*" :start 0 :end 0 - :container-offset 378365 + :container-offset 384869 :slice-offset 171 - :size 23494} + :size 23422} {:chr "*" :start 0 :end 0 - :container-offset 378365 - :slice-offset 23665 - :size 23213} + :container-offset 384869 + :slice-offset 23593 + :size 23258} {:chr "*" :start 0 :end 0 - :container-offset 378365 - :slice-offset 46878 - :size 23051} + :container-offset 384869 + :slice-offset 46851 + :size 23200} {:chr "*" :start 0 :end 0 - :container-offset 378365 - :slice-offset 69929 - :size 23563} + :container-offset 384869 + :slice-offset 70051 + :size 23643} {:chr "*" :start 0 :end 0 - :container-offset 378365 - :slice-offset 93492 - :size 24231} + :container-offset 384869 + :slice-offset 93694 + :size 24213} {:chr "*" :start 0 :end 0 - :container-offset 378365 - :slice-offset 117723 - :size 24078} + :container-offset 384869 + :slice-offset 117907 + :size 24066} {:chr "*" :start 0 :end 0 - :container-offset 378365 - :slice-offset 141801 - :size 23871} + :container-offset 384869 + :slice-offset 141973 + :size 23886} {:chr "*" :start 0 :end 0 - :container-offset 378365 - :slice-offset 165672 - :size 24365} + :container-offset 384869 + :slice-offset 165859 + :size 24339} {:chr "*" :start 0 :end 0 - :container-offset 378365 - :slice-offset 190037 - :size 12326}]))) + :container-offset 384869 + :slice-offset 190198 + :size 11639}]))) (deftest write-index-entries-test (let [entries [{:ref-seq-id 0, :start 546609, :span 205262429, - :container-offset 324, :slice-offset 563, :size 22007} + :container-offset 327, :slice-offset 563, :size 21475} {:ref-seq-id 0, :start 206547069, :span 42644506, - :container-offset 324, :slice-offset 22570, :size 7349} + :container-offset 327, :slice-offset 22038, :size 7234} {:ref-seq-id 1, :start 67302, :span 231638920, - :container-offset 30272, :slice-offset 563, :size 21618} + :container-offset 29628, :slice-offset 563, :size 21098} {:ref-seq-id -1, :start 0, :span 0, - :container-offset 354657, :slice-offset 563, :size 23119} + :container-offset 368626, :slice-offset 541, :size 15676} {:ref-seq-id -1, :start 0, :span 0, - :container-offset 378365, :slice-offset 171, :size 23494} + :container-offset 384869, :slice-offset 171, :size 23422} {:ref-seq-id -1, :start 0, :span 0, - :container-offset 378365, :slice-offset 23665, :size 23213}] + :container-offset 384869, :slice-offset 23593, :size 23258}] f (io/file common/temp-dir "test.cram.crai")] (common/with-before-after {:before (common/prepare-cache!) :after (common/clean-cache!)} diff --git a/test/cljam/io/cram/encode/record_test.clj b/test/cljam/io/cram/encode/record_test.clj index b00f0503..9e7d4a59 100644 --- a/test/cljam/io/cram/encode/record_test.clj +++ b/test/cljam/io/cram/encode/record_test.clj @@ -71,9 +71,9 @@ 4])) (defn- preprocess-slice-records [cram-header records] - (let [container-ctx (context/make-container-context cram-header {} test-seq-resolver)] - (record/preprocess-slice-records container-ctx records) - (context/finalize-container-context container-ctx + (let [container-ctx (context/make-container-context cram-header test-seq-resolver) + stats (record/preprocess-slice-records container-ctx records)] + (context/finalize-container-context container-ctx [stats] (constantly :raw) (constantly (constantly (constantly {:external :raw})))))) @@ -153,7 +153,8 @@ (deftest encode-slice-records-test (testing "mapped reads" - (let [cram-header {:SQ + (let [cram-header {:HD {:SO "coordinate"} + :SQ [{:SN "ref"} {:SN "ref2"}] :RG @@ -183,12 +184,12 @@ {:qname "q005", :flag 73, :rname "ref", :pos 20, :end 24, :mapq 0, :cigar "5M", :rnext "*", :pnext 0, :tlen 0, :seq "CTGTG", :qual "AEEEE" :options []}]) - slice-ctx (context/make-slice-context (preprocess-slice-records cram-header records)) - stats (record/encode-slice-records slice-ctx records) + slice-ctx (context/make-slice-context (preprocess-slice-records cram-header records) 0) + _ (record/encode-slice-records slice-ctx records) ds-res (walk/prewalk #(if (fn? %) (%) %) (:ds-encoders slice-ctx)) tag-res (walk/prewalk #(if (fn? %) (%) %) (:tag-encoders slice-ctx))] (is (= {:ri 0, :start 1, :end 24, :nbases 25, :nrecords 5} - (into {} stats))) + (into {} (:alignment-stats slice-ctx)))) (is (= 1 (count (get ds-res :BF)))) (is (= 1 (get-in ds-res [:BF 0 :content-id]))) @@ -209,7 +210,7 @@ (is (= 1 (count (get ds-res :AP)))) (is (= 5 (get-in ds-res [:AP 0 :content-id]))) - (is (= [1 5 10 15 20] (seq (get-in ds-res [:AP 0 :data])))) + (is (= [0 4 5 5 5] (seq (get-in ds-res [:AP 0 :data])))) (is (= 1 (count (get ds-res :RG)))) (is (= 6 (get-in ds-res [:RG 0 :content-id]))) @@ -354,12 +355,12 @@ {:qname "q003", :flag 77, :rname "*", :pos 0, :end 0, :mapq 0, :cigar "*", :rnext "*", :pnext 0, :tlen 0, :seq "GCACA", :qual "BCCFD" :options []}]) - slice-ctx (context/make-slice-context (preprocess-slice-records cram-header records)) - stats (record/encode-slice-records slice-ctx records) + slice-ctx (context/make-slice-context (preprocess-slice-records cram-header records) 0) + _ (record/encode-slice-records slice-ctx records) ds-res (walk/prewalk #(if (fn? %) (%) %) (:ds-encoders slice-ctx)) tag-res (walk/prewalk #(if (fn? %) (%) %) (:tag-encoders slice-ctx))] (is (= {:ri -1, :start 0, :end 0, :nbases 25, :nrecords 5} - (into {} stats))) + (into {} (:alignment-stats slice-ctx)))) (is (= 1 (count (get ds-res :BF)))) (is (= 1 (get-in ds-res [:BF 0 :content-id]))) diff --git a/test/cljam/io/cram_test.clj b/test/cljam/io/cram_test.clj index 7e8d6269..68086d16 100644 --- a/test/cljam/io/cram_test.clj +++ b/test/cljam/io/cram_test.clj @@ -11,6 +11,7 @@ (def ^:private temp-cram-file (io/file common/temp-dir "test.cram")) (def ^:private temp-cram-file-2 (io/file common/temp-dir "test2.cram")) (def ^:private temp-cram-file-3 (io/file common/temp-dir "test3.cram")) +(def ^:private temp-sorted-cram-file (io/file common/temp-dir "test.sorted.cram")) (defn- fixup-bam-aln [aln] (-> (into {} aln) @@ -74,20 +75,36 @@ (deftest writer-test (with-before-after {:before (prepare-cache!) :after (clean-cache!)} - (with-open [r (cram/reader common/test-cram-file - {:reference common/test-fa-file}) - w (cram/writer temp-cram-file - {:reference common/test-fa-file})] - (cram/write-header w (cram/read-header r)) - (cram/write-alignments w (cram/read-alignments r) (cram/read-header r))) - (with-open [r (cram/reader common/test-cram-file - {:reference common/test-fa-file}) - r' (cram/reader temp-cram-file - {:reference common/test-fa-file})] - (is (= (cram/read-header r) - (cram/read-header r'))) - (is (= (cram/read-alignments r) - (cram/read-alignments r')))))) + (testing "unsorted" + (with-open [r (cram/reader common/test-cram-file + {:reference common/test-fa-file}) + w (cram/writer temp-cram-file + {:reference common/test-fa-file})] + (cram/write-header w (cram/read-header r)) + (cram/write-alignments w (cram/read-alignments r) (cram/read-header r))) + (with-open [r (cram/reader common/test-cram-file + {:reference common/test-fa-file}) + r' (cram/reader temp-cram-file + {:reference common/test-fa-file})] + (is (= (cram/read-header r) + (cram/read-header r'))) + (is (= (cram/read-alignments r) + (cram/read-alignments r'))))) + (testing "sorted by coordinate" + (with-open [r (cram/reader common/test-sorted-cram-file + {:reference common/test-fa-file}) + w (cram/writer temp-sorted-cram-file + {:reference common/test-fa-file})] + (cram/write-header w (cram/read-header r)) + (cram/write-alignments w (cram/read-alignments r) (cram/read-header r))) + (with-open [r (cram/reader common/test-sorted-cram-file + {:reference common/test-fa-file}) + r' (cram/reader temp-sorted-cram-file + {:reference common/test-fa-file})] + (is (= (cram/read-header r) + (cram/read-header r'))) + (is (= (cram/read-alignments r) + (cram/read-alignments r'))))))) (deftest-remote writer-with-multiple-containers-test (with-before-after {:before (do (prepare-cavia!)