Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Support for delta encoding of AP data series for sorted CRAM files #323

Merged
merged 2 commits into from
Oct 3, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
19 changes: 13 additions & 6 deletions src/cljam/io/cram/encode/context.clj
Original file line number Diff line number Diff line change
@@ -1,13 +1,18 @@
(ns cljam.io.cram.encode.context
(:require [cljam.io.cram.data-series :as ds]
[cljam.io.cram.encode.tag-dict :as tag-dict]))
[cljam.io.cram.encode.tag-dict :as tag-dict]
[cljam.io.sam.util.header :as sam.header]))

(defn make-container-context
"Creates a new container context."
[cram-header preservation-map seq-resolver]
[cram-header seq-resolver]
(let [rname->idx (into {}
(map-indexed (fn [i {:keys [SN]}] [SN i]))
(:SQ cram-header))
preservation-map (cond-> {:RN true, :AP false, :RR true}
(= (sam.header/sort-order cram-header)
sam.header/order-coordinate)
(assoc :AP true))
subst-mat {\A {\T 0, \G 1, \C 2, \N 3}
\T {\A 0, \G 1, \C 2, \N 3}
\G {\A 0, \T 1, \C 2, \N 3}
Expand All @@ -25,24 +30,26 @@
"Finalizes the builders in the container context and returns a new container
context containing those builders' results. This operation must be done before
creating a slice context."
[container-ctx ds-compressor-overrides tag-compressor-overrides]
[container-ctx alignment-stats ds-compressor-overrides tag-compressor-overrides]
(let [ds-encodings (-> ds/default-data-series-encodings
(ds/apply-ds-compressor-overrides ds-compressor-overrides))
tag-dict (tag-dict/build-tag-dict (:tag-dict-builder container-ctx))
tag-encodings (-> (tag-dict/build-tag-encodings tag-dict)
(ds/apply-tag-compressor-overrides tag-compressor-overrides))]
(assoc container-ctx
:alignment-stats alignment-stats
:ds-encodings ds-encodings
:tag-dict tag-dict
:tag-encodings tag-encodings)))

(defn make-slice-context
"Creates a slice context from the given container context. Note that the container
context must be finalized with `finalize-container-context`."
[{:keys [ds-encodings tag-encodings] :as container-ctx}]
"Creates a slice context for the ith slice from the given container context.
Note that the container context must be finalized with `finalize-container-context`."
[{:keys [alignment-stats ds-encodings tag-encodings] :as container-ctx} i]
(let [ds-encoders (ds/build-data-series-encoders ds-encodings)
tag-encoders (ds/build-tag-encoders tag-encodings)]
(assoc container-ctx
:alignment-stats (nth alignment-stats i)
:ds-encoders ds-encoders
:tag-encoders tag-encoders)))

Expand Down
58 changes: 31 additions & 27 deletions src/cljam/io/cram/encode/record.clj
Original file line number Diff line number Diff line change
Expand Up @@ -12,15 +12,22 @@
-1
(get rname->idx rname)))

(defn- build-positional-data-encoder [{:keys [cram-header]} {:keys [RI RL AP RG]}]
(defn- build-positional-data-encoder
[{:keys [cram-header preservation-map alignment-stats]} {:keys [RI RL AP RG]}]
(let [rg-id->idx (into {}
(map-indexed (fn [i {:keys [ID]}] [ID i]))
(:RG cram-header))]
(:RG cram-header))
AP' (if (:AP preservation-map)
(let [pos (volatile! (:start alignment-stats))]
(fn [^long pos']
(AP (- pos' (long @pos)))
(vreset! pos pos')))
AP)]
(fn [record]
(let [rg (sam.option/value-for-tag :RG record)]
(RI (::ref-index record))
(RL (count (:seq record)))
(AP (:pos record))
(AP' (:pos record))
(RG (if rg (get rg-id->idx rg) -1))))))

(defn- build-read-name-encoder [{:keys [RN]}]
Expand Down Expand Up @@ -113,8 +120,7 @@
(BA (aget bs i)))
(encode-qual record QS))))

(defn- build-cram-record-encoder
[{:keys [ds-encoders] :as slice-ctx} stats-builder]
(defn- build-cram-record-encoder [{:keys [ds-encoders] :as slice-ctx}]
(let [pos-encoder (build-positional-data-encoder slice-ctx ds-encoders)
name-encoder (build-read-name-encoder ds-encoders)
mate-encoder (build-mate-read-encoder slice-ctx ds-encoders)
Expand All @@ -125,9 +131,6 @@
(fn [record]
(let [bf (bit-and (long (:flag record))
(bit-not (sam.flag/encoded #{:next-reversed :next-unmapped})))]
(stats/update! stats-builder
(::ref-index record) (:pos record) (::end record)
(count (:seq record)) 1)
(BF bf)
(CF (::flag record))
(pos-encoder record)
Expand All @@ -142,10 +145,8 @@
"Encodes CRAM records in a slice all at once using the given slice context.
Returns the alignment stats for this slice."
[slice-ctx records]
(let [stats-builder (stats/make-alignment-stats-builder)
record-encoder (build-cram-record-encoder slice-ctx stats-builder)]
(run! record-encoder records)
(stats/build stats-builder)))
(let [record-encoder (build-cram-record-encoder slice-ctx)]
(run! record-encoder records)))

(defn- add-mismatches
[n subst-mat ^bytes ref-bases rpos ^bytes read-bases ^bytes qs spos fs]
Expand Down Expand Up @@ -203,18 +204,21 @@
encoding that are necessary for the CRAM writer to generate some header
components."
[{:keys [rname->idx subst-mat seq-resolver tag-dict-builder]} ^List records]
(dotimes [i (.size records)]
(let [record (.get records i)
;; these flag bits of CF are hard-coded at the moment:
;; - 0x01: quality scores stored as array (true)
;; - 0x02: detached (true)
;; - 0x04: has mate downstream (false)
cf (cond-> 0x03
(= (:seq record) "*") (bit-or 0x08))
ri (ref-index rname->idx (:rname record))
tags-id (tag-dict/assign-tags-id! tag-dict-builder (:options record))
[fs end] (calculate-read-features&end seq-resolver subst-mat record)
record' (assoc record
::flag cf ::ref-index ri ::end end
::features fs ::tags-index tags-id)]
(.set records i record'))))
(let [stats-builder (stats/make-alignment-stats-builder)]
(dotimes [i (.size records)]
(let [record (.get records i)
;; these flag bits of CF are hard-coded at the moment:
;; - 0x01: quality scores stored as array (true)
;; - 0x02: detached (true)
;; - 0x04: has mate downstream (false)
cf (cond-> 0x03
(= (:seq record) "*") (bit-or 0x08))
ri (ref-index rname->idx (:rname record))
tags-id (tag-dict/assign-tags-id! tag-dict-builder (:options record))
[fs end] (calculate-read-features&end seq-resolver subst-mat record)
record' (assoc record
::flag cf ::ref-index ri ::end end
::features fs ::tags-index tags-id)]
(stats/update! stats-builder ri (:pos record) end (count (:seq record)) 1)
(.set records i record')))
(stats/build stats-builder)))
37 changes: 19 additions & 18 deletions src/cljam/io/cram/writer.clj
Original file line number Diff line number Diff line change
Expand Up @@ -50,13 +50,13 @@
(struct/encode-cram-header-container (.-stream wtr) header))

(defn- preprocess-records
[cram-header preservation-map seq-resolver options ^objects container-records]
(let [container-ctx (context/make-container-context cram-header
preservation-map
seq-resolver)
{:keys [ds-compressor-overrides tag-compressor-overrides]} options]
(run! (partial record/preprocess-slice-records container-ctx) container-records)
[cram-header seq-resolver options ^objects container-records]
(let [container-ctx (context/make-container-context cram-header seq-resolver)
{:keys [ds-compressor-overrides tag-compressor-overrides]} options
stats (mapv (partial record/preprocess-slice-records container-ctx)
container-records)]
(context/finalize-container-context container-ctx
stats
ds-compressor-overrides
tag-compressor-overrides)))

Expand Down Expand Up @@ -91,9 +91,9 @@
:bases nbases
:records nrecords})

(defn- generate-slice [container-ctx counter slice-records]
(let [slice-ctx (context/make-slice-context container-ctx)
stats (record/encode-slice-records slice-ctx slice-records)
(defn- generate-slice [slice-ctx counter slice-records]
(record/encode-slice-records slice-ctx slice-records)
(let [stats (:alignment-stats slice-ctx)
blocks (generate-blocks slice-ctx)
ref-md5 (reference-md5 slice-ctx stats)
header (assoc (stats->header-base stats)
Expand All @@ -103,7 +103,6 @@
header-block (struct/generate-slice-header-block header blocks)
block-data (mapv :data blocks)]
{:header header
:stats stats
:header-block header-block
:data-blocks block-data
:counter (+ (long counter) (long (:nrecords stats)))
Expand All @@ -112,19 +111,20 @@
(apply + (alength header-block)))}))

(defn- generate-slices [container-ctx counter container-records]
(loop [[slice-records & more] container-records, counter counter, acc []]
(loop [i 0, [slice-records & more] container-records, counter counter, acc []]
(if slice-records
(let [slice (generate-slice container-ctx counter slice-records)]
(recur more (:counter slice) (conj acc slice)))
(let [slice-ctx (context/make-slice-context container-ctx i)
slice (generate-slice slice-ctx counter slice-records)]
(recur (inc i) more (:counter slice) (conj acc slice)))
acc)))

(defn- generate-compression-header-block
^bytes [{:keys [preservation-map subst-mat tag-dict ds-encodings tag-encodings]}]
(struct/generate-compression-header-block preservation-map subst-mat tag-dict
ds-encodings tag-encodings))

(defn- generate-container-header [^bytes compression-header-block slices]
(let [stats (stats/merge-stats (map :stats slices))
(defn- generate-container-header [container-ctx ^bytes compression-header-block slices]
(let [stats (stats/merge-stats (:alignment-stats container-ctx))
container-len (->> slices
(map (fn [{:keys [^bytes header-block data-blocks]}]
(apply + (alength header-block)
Expand Down Expand Up @@ -172,12 +172,13 @@
(crai/write-index-entries index-writer entries)))

(defn- write-container [^CRAMWriter wtr cram-header counter container-records]
(let [preservation-map {:RN true, :AP false, :RR true}
container-ctx (preprocess-records cram-header preservation-map (.-seq-resolver wtr)
(let [container-ctx (preprocess-records cram-header (.-seq-resolver wtr)
(.-options wtr) container-records)
slices (generate-slices container-ctx counter container-records)
compression-header-block (generate-compression-header-block container-ctx)
container-header (generate-container-header compression-header-block slices)
container-header (generate-container-header container-ctx
compression-header-block
slices)
^DataOutputStream out (.-stream wtr)
container-offset (.size out)]
(struct/encode-container-header out (assoc container-header :counter counter))
Expand Down
Binary file modified test-resources/cram/medium.cram
Binary file not shown.
Binary file modified test-resources/cram/medium.cram.crai
Binary file not shown.
4 changes: 2 additions & 2 deletions test/cljam/algo/cram_indexer_test.clj
Original file line number Diff line number Diff line change
Expand Up @@ -12,11 +12,11 @@
(doall (#'crai/read-index-entries r))))

(deftest create-index-test
(let [f (io/file common/temp-dir "medium.cram.crai")]
(let [f (io/file common/temp-dir "medium_without_index.cram.crai")]
(common/with-before-after {:before (common/prepare-cache!)
:after (common/clean-cache!)}
(is (thrown-with-msg? Exception #"Cannot create CRAM index file .*"
(indexer/create-index common/medium-cram-file f)))
(indexer/create-index common/medium-without-index-cram-file f)))
(indexer/create-index common/medium-cram-file f
:skip-sort-order-check? true)
(is (= (read-index-entries common/medium-crai-file)
Expand Down
Loading