From 918aa86e0d313c4db4b48f79b7cfba9c19c5a483 Mon Sep 17 00:00:00 2001 From: Shogo Ohta Date: Wed, 4 Sep 2024 10:56:56 +0900 Subject: [PATCH] wip: Embed ref seqs if :embed-reference? is set to true --- src/cljam/io/cram/data_series.clj | 65 ++++++----- src/cljam/io/cram/encode/context.clj | 13 ++- src/cljam/io/cram/encode/structure.clj | 7 +- src/cljam/io/cram/writer.clj | 97 +++++++++------ test/cljam/io/cram/encode/record_test.clj | 136 +++++++++++----------- test/cljam/io/cram_test.clj | 42 ++++++- 6 files changed, 219 insertions(+), 141 deletions(-) diff --git a/src/cljam/io/cram/data_series.clj b/src/cljam/io/cram/data_series.clj index 3f427d0a..58dad9c0 100644 --- a/src/cljam/io/cram/data_series.clj +++ b/src/cljam/io/cram/data_series.clj @@ -147,42 +147,43 @@ (def ^{:doc "Default encodings for all the data series"} default-data-series-encodings - {:BF {:content-id 1, :codec :external, :compressor :gzip} - :CF {:content-id 2, :codec :external, :compressor :gzip} - :RI {:content-id 3, :codec :external, :compressor :gzip} - :RL {:content-id 4, :codec :external, :compressor :gzip} - :AP {:content-id 5, :codec :external, :compressor :gzip} - :RG {:content-id 6, :codec :external, :compressor :gzip} - :RN {:content-id 7, :codec :byte-array-stop, :stop-byte (int \tab), :compressor :gzip} - :MF {:content-id 8, :codec :external, :compressor :gzip} - :NS {:content-id 9, :codec :external, :compressor :gzip} - :NP {:content-id 10, :codec :external, :compressor :gzip} - :TS {:content-id 11, :codec :external, :compressor :gzip} - :NF {:content-id 12, :codec :external, :compressor :gzip} - :TL {:content-id 13, :codec :external, :compressor :gzip} - :FN {:content-id 14, :codec :external, :compressor :gzip} - :FC {:content-id 15, :codec :external, :compressor :gzip} - :FP {:content-id 16, :codec :external, :compressor :gzip} - :DL {:content-id 17, :codec :external, :compressor :gzip} + {:embedded-ref {::content-id 1, :codec :external, :compressor :gzip} + :BF {:content-id 2, :codec :external, :compressor :gzip} + :CF {:content-id 3, :codec :external, :compressor :gzip} + :RI {:content-id 4, :codec :external, :compressor :gzip} + :RL {:content-id 5, :codec :external, :compressor :gzip} + :AP {:content-id 6, :codec :external, :compressor :gzip} + :RG {:content-id 7, :codec :external, :compressor :gzip} + :RN {:content-id 8, :codec :byte-array-stop, :stop-byte (int \tab), :compressor :gzip} + :MF {:content-id 9, :codec :external, :compressor :gzip} + :NS {:content-id 10, :codec :external, :compressor :gzip} + :NP {:content-id 11, :codec :external, :compressor :gzip} + :TS {:content-id 12, :codec :external, :compressor :gzip} + :NF {:content-id 13, :codec :external, :compressor :gzip} + :TL {:content-id 14, :codec :external, :compressor :gzip} + :FN {:content-id 15, :codec :external, :compressor :gzip} + :FC {:content-id 16, :codec :external, :compressor :gzip} + :FP {:content-id 17, :codec :external, :compressor :gzip} + :DL {:content-id 18, :codec :external, :compressor :gzip} :BB {:codec :byte-array-len - :len-encoding {:codec :external, :content-id 18, :compressor :gzip} - :val-encoding {:codec :external, :content-id 19, :compressor :gzip}} + :len-encoding {:codec :external, :content-id 19, :compressor :gzip} + :val-encoding {:codec :external, :content-id 20, :compressor :gzip}} :QQ {:codec :byte-array-len - :len-encoding {:codec :external, :content-id 20, :compressor :gzip} - :val-encoding {:codec :external, :content-id 21, :compressor :gzip}} - :BS {:content-id 22, :codec :external, :compressor :gzip} + :len-encoding {:codec :external, :content-id 21, :compressor :gzip} + :val-encoding {:codec :external, :content-id 22, :compressor :gzip}} + :BS {:content-id 23, :codec :external, :compressor :gzip} :IN {:codec :byte-array-len - :len-encoding {:codec :external, :content-id 23, :compressor :gzip} - :val-encoding {:codec :external, :content-id 24, :compressor :gzip}} - :RS {:content-id 25, :codec :external, :compressor :gzip} - :PD {:content-id 26, :codec :external, :compressor :gzip} - :HC {:content-id 27, :codec :external, :compressor :gzip} + :len-encoding {:codec :external, :content-id 24, :compressor :gzip} + :val-encoding {:codec :external, :content-id 25, :compressor :gzip}} + :RS {:content-id 26, :codec :external, :compressor :gzip} + :PD {:content-id 27, :codec :external, :compressor :gzip} + :HC {:content-id 28, :codec :external, :compressor :gzip} :SC {:codec :byte-array-len - :len-encoding {:codec :external, :content-id 28, :compressor :gzip} - :val-encoding {:codec :external, :content-id 29, :compressor :gzip}} - :MQ {:content-id 30, :codec :external, :compressor :gzip} - :BA {:content-id 31, :codec :external, :compressor :gzip} - :QS {:content-id 32, :codec :external, :compressor :gzip}}) + :len-encoding {:codec :external, :content-id 29, :compressor :gzip} + :val-encoding {:codec :external, :content-id 30, :compressor :gzip}} + :MQ {:content-id 31, :codec :external, :compressor :gzip} + :BA {:content-id 32, :codec :external, :compressor :gzip} + :QS {:content-id 33, :codec :external, :compressor :gzip}}) (defn- build-codec-encoder [{:keys [codec content-id compressor] :as params} data-type content-id->state] diff --git a/src/cljam/io/cram/encode/context.clj b/src/cljam/io/cram/encode/context.clj index 4f926eeb..fb48aa03 100644 --- a/src/cljam/io/cram/encode/context.clj +++ b/src/cljam/io/cram/encode/context.clj @@ -5,7 +5,7 @@ (defn make-container-context "Creates a new container context." - [cram-header seq-resolver] + [cram-header seq-resolver options] (let [rname->idx (into {} (map-indexed (fn [i {:keys [SN]}] [SN i])) (:SQ cram-header)) @@ -24,14 +24,17 @@ :preservation-map preservation-map :subst-mat subst-mat :seq-resolver seq-resolver - :tag-dict-builder tag-dict-builder})) + :tag-dict-builder tag-dict-builder + :options options})) (defn finalize-container-context "Finalizes the builders in the container context and returns a new container context containing those builders' results. This operation must be done before creating a slice context." - [container-ctx alignment-stats ds-compressor-overrides tag-compressor-overrides] - (let [ds-encodings (-> ds/default-data-series-encodings + [container-ctx alignment-stats] + (let [{:keys [ds-compressor-overrides + tag-compressor-overrides]} (:options container-ctx) + ds-encodings (-> ds/default-data-series-encodings (ds/apply-ds-compressor-overrides ds-compressor-overrides)) tag-dict (tag-dict/build-tag-dict (:tag-dict-builder container-ctx)) tag-encodings (-> (tag-dict/build-tag-encodings tag-dict) @@ -46,7 +49,7 @@ "Creates a slice context for the ith slice from the given container context. Note that the container context must be finalized with `finalize-container-context`." [{:keys [alignment-stats ds-encodings tag-encodings] :as container-ctx} i] - (let [ds-encoders (ds/build-data-series-encoders ds-encodings) + (let [ds-encoders (ds/build-data-series-encoders (dissoc ds-encodings :embedded-ref)) tag-encoders (ds/build-tag-encoders tag-encodings)] (assoc container-ctx :alignment-stats (nth alignment-stats i) diff --git a/src/cljam/io/cram/encode/structure.clj b/src/cljam/io/cram/encode/structure.clj index fbb84ef9..0c10ed6c 100644 --- a/src/cljam/io/cram/encode/structure.clj +++ b/src/cljam/io/cram/encode/structure.clj @@ -210,10 +210,13 @@ (defn encode-compression-header-block "Encodes a compression header block to the given OutputStream." [out preservation-map subst-mat tag-dict ds-encodings tag-encodings] - (let [bs (with-out-byte-array + ;; ensure that ds-encodings does not contain an encoding for :embedded-ref + ;; pseudo data series + (let [ds-encodings' (dissoc ds-encodings :embedded-ref) + bs (with-out-byte-array (fn [out'] (encode-preservation-map out' preservation-map subst-mat tag-dict) - (encode-data-series-encodings out' ds-encodings) + (encode-data-series-encodings out' ds-encodings') (encode-tag-encoding-map out' tag-encodings)))] (encode-block out :raw 1 0 (alength bs) bs))) diff --git a/src/cljam/io/cram/writer.clj b/src/cljam/io/cram/writer.clj index 1a8b7bd7..8b6f24cb 100644 --- a/src/cljam/io/cram/writer.clj +++ b/src/cljam/io/cram/writer.clj @@ -1,6 +1,7 @@ (ns cljam.io.cram.writer (:require [cljam.io.crai :as crai] [cljam.io.cram.encode.alignment-stats :as stats] + [cljam.io.cram.encode.compressor :as compressor] [cljam.io.cram.encode.context :as context] [cljam.io.cram.encode.partitioning :as partition] [cljam.io.cram.encode.record :as record] @@ -51,36 +52,51 @@ (defn- preprocess-records [cram-header seq-resolver options ^objects container-records] - (let [container-ctx (context/make-container-context cram-header seq-resolver) - {:keys [ds-compressor-overrides tag-compressor-overrides]} options + (let [container-ctx (context/make-container-context cram-header seq-resolver options) stats (mapv (partial record/preprocess-slice-records container-ctx) container-records)] - (context/finalize-container-context container-ctx - stats - ds-compressor-overrides - tag-compressor-overrides))) - -(defn- generate-blocks [slice-ctx] - (->> (context/encoding-results slice-ctx) - (keep (fn [{:keys [^long raw-size] :as block}] - (when (pos? raw-size) - (update block :data - (fn [^bytes data] - (struct/generate-block (:compressor block) 4 - (:content-id block) raw-size - data)))))) - ;; sort + dedupe by :content-id - (into (sorted-map) (map (juxt :content-id identity))) - vals - (cons {:content-id 0 - :data (struct/generate-block :raw 5 0 0 (byte-array 0))}))) - -(defn- reference-md5 - [{:keys [seq-resolver cram-header]} {:keys [^long ri ^long start ^long end]}] + (context/finalize-container-context container-ctx stats))) + +(def ^:private ^:const embedded-ref-content-id 1) + +(defn- generate-embedded-ref-block [slice-ctx ^bytes embedded-ref] + (let [raw-size (alength embedded-ref) + compr (compressor/compressor (get-in slice-ctx [:ds-encodings :embedded-ref :compressor])) + _ (with-open [^OutputStream os (compressor/compressor-output-stream compr)] + (.write os embedded-ref)) + {:keys [compressor data]} (compressor/->compressed-result compr)] + {:content-id embedded-ref-content-id + :data (struct/generate-block compressor 4 embedded-ref-content-id raw-size data)})) + +(defn- generate-blocks [slice-ctx ^bytes embedded-ref] + (letfn [(inject-embedded-ref-block [blocks] + (cond->> blocks + embedded-ref + (cons (generate-embedded-ref-block slice-ctx embedded-ref))))] + (->> (context/encoding-results slice-ctx) + (keep (fn [{:keys [^long raw-size] :as block}] + (when (pos? raw-size) + (update block :data + (fn [^bytes data] + (struct/generate-block (:compressor block) 4 + (:content-id block) raw-size + data)))))) + ;; sort + dedupe by :content-id + (into (sorted-map) (map (juxt :content-id identity))) + vals + inject-embedded-ref-block + (cons {:content-id 0 + :data (struct/generate-block :raw 5 0 0 (byte-array 0))})))) + +(defn- resolve-reference-seq + ^bytes [{:keys [seq-resolver cram-header]} {:keys [^long ri ^long start ^long end]}] + (let [chr (:SN (nth (:SQ cram-header) ri))] + (resolver/resolve-sequence seq-resolver chr start end))) + +(defn- reference-md5 [slice-ctx ^bytes embedded-ref {:keys [^long ri] :as stats}] (if (neg? ri) (byte-array 16) - (let [chr (:SN (nth (:SQ cram-header) ri)) - ref-bases (resolver/resolve-sequence seq-resolver chr start end) + (let [ref-bases (or embedded-ref (resolve-reference-seq slice-ctx stats)) md5 (MessageDigest/getInstance "md5")] (.digest md5 ref-bases)))) @@ -94,11 +110,14 @@ (defn- generate-slice [slice-ctx counter slice-records] (record/encode-slice-records slice-ctx slice-records) (let [stats (:alignment-stats slice-ctx) - blocks (generate-blocks slice-ctx) - ref-md5 (reference-md5 slice-ctx stats) + embedded-ref (when (and (:embed-reference? (:options slice-ctx)) + (>= (long (:ri stats)) 0)) + (resolve-reference-seq slice-ctx stats)) + blocks (generate-blocks slice-ctx embedded-ref) + ref-md5 (reference-md5 slice-ctx embedded-ref stats) header (assoc (stats->header-base stats) :counter counter - :embedded-reference -1 + :embedded-reference (if embedded-ref embedded-ref-content-id -1) :reference-md5 ref-md5) header-block (struct/generate-slice-header-block header blocks) block-data (mapv :data blocks)] @@ -119,9 +138,20 @@ acc))) (defn- generate-compression-header-block - ^bytes [{:keys [preservation-map subst-mat tag-dict ds-encodings tag-encodings]}] - (struct/generate-compression-header-block preservation-map subst-mat tag-dict - ds-encodings tag-encodings)) + ^bytes + [{:keys [preservation-map subst-mat tag-dict ds-encodings tag-encodings]} slices] + (let [preservation-map' (cond-> preservation-map + ;; A reference file is not required when either of + ;; the following holds for every slice in the container: + ;; - the slice has an embedded reference block + ;; - the slice only contains unmapped records + (every? (fn [{:keys [header]}] + (or (>= (long (:embedded-reference header)) 0) + (= (long (:ref-seq-id header)) -1))) + slices) + (assoc :RR false))] + (struct/generate-compression-header-block preservation-map' subst-mat tag-dict + ds-encodings tag-encodings))) (defn- generate-container-header [container-ctx ^bytes compression-header-block slices] (let [stats (stats/merge-stats (:alignment-stats container-ctx)) @@ -175,7 +205,8 @@ (let [container-ctx (preprocess-records cram-header (.-seq-resolver wtr) (.-options wtr) container-records) slices (generate-slices container-ctx counter container-records) - compression-header-block (generate-compression-header-block container-ctx) + compression-header-block (generate-compression-header-block container-ctx + slices) container-header (generate-container-header container-ctx compression-header-block slices) diff --git a/test/cljam/io/cram/encode/record_test.clj b/test/cljam/io/cram/encode/record_test.clj index 9e7d4a59..f00ed6e1 100644 --- a/test/cljam/io/cram/encode/record_test.clj +++ b/test/cljam/io/cram/encode/record_test.clj @@ -71,11 +71,11 @@ 4])) (defn- preprocess-slice-records [cram-header records] - (let [container-ctx (context/make-container-context cram-header test-seq-resolver) + (let [opts {:ds-compressor-overrides (constantly :raw) + :tag-compressor-overrides (constantly (constantly (constantly {:external :raw})))} + container-ctx (context/make-container-context cram-header test-seq-resolver opts) stats (record/preprocess-slice-records container-ctx records)] - (context/finalize-container-context container-ctx [stats] - (constantly :raw) - (constantly (constantly (constantly {:external :raw})))))) + (context/finalize-container-context container-ctx [stats]))) (deftest preprocess-slice-records-test (let [cram-header {:SQ [{:SN "ref"}]} @@ -192,129 +192,129 @@ (into {} (:alignment-stats slice-ctx)))) (is (= 1 (count (get ds-res :BF)))) - (is (= 1 (get-in ds-res [:BF 0 :content-id]))) + (is (= 2 (get-in ds-res [:BF 0 :content-id]))) (is (= [0x43 0x43 0x80 0x91 0x80 0x93 0x41] (map #(bit-and % 0xff) (get-in ds-res [:BF 0 :data])))) (is (= 1 (count (get ds-res :CF)))) - (is (= 2 (get-in ds-res [:CF 0 :content-id]))) + (is (= 3 (get-in ds-res [:CF 0 :content-id]))) (is (= [3 3 3 3 3] (seq (get-in ds-res [:CF 0 :data])))) (is (= 1 (count (get ds-res :RI)))) - (is (= 3 (get-in ds-res [:RI 0 :content-id]))) + (is (= 4 (get-in ds-res [:RI 0 :content-id]))) (is (= [0 0 0 0 0] (seq (get-in ds-res [:RI 0 :data])))) (is (= 1 (count (get ds-res :RL)))) - (is (= 4 (get-in ds-res [:RL 0 :content-id]))) + (is (= 5 (get-in ds-res [:RL 0 :content-id]))) (is (= [5 5 5 5 5] (seq (get-in ds-res [:RL 0 :data])))) (is (= 1 (count (get ds-res :AP)))) - (is (= 5 (get-in ds-res [:AP 0 :content-id]))) + (is (= 6 (get-in ds-res [:AP 0 :content-id]))) (is (= [0 4 5 5 5] (seq (get-in ds-res [:AP 0 :data])))) (is (= 1 (count (get ds-res :RG)))) - (is (= 6 (get-in ds-res [:RG 0 :content-id]))) + (is (= 7 (get-in ds-res [:RG 0 :content-id]))) (is (= [0 0 1 1 0xff 0xff 0xff 0xff 0x0f] (map #(bit-and % 0xff) (get-in ds-res [:RG 0 :data])))) (is (= 1 (count (get ds-res :RN)))) - (is (= 7 (get-in ds-res [:RN 0 :content-id]))) + (is (= 8 (get-in ds-res [:RN 0 :content-id]))) (is (= "q001\tq002\tq003\tq004\tq005\t" (String. ^bytes (get-in ds-res [:RN 0 :data])))) (is (= 1 (count (get ds-res :MF)))) - (is (= 8 (get-in ds-res [:MF 0 :content-id]))) + (is (= 9 (get-in ds-res [:MF 0 :content-id]))) (is (= [1 1 1 0 2] (seq (get-in ds-res [:MF 0 :data])))) (is (= 1 (count (get ds-res :NS)))) - (is (= 9 (get-in ds-res [:NS 0 :content-id]))) + (is (= 10 (get-in ds-res [:NS 0 :content-id]))) (is (= [0 0 1 0 0xff 0xff 0xff 0xff 0x0f] (map #(bit-and % 0xff) (get-in ds-res [:NS 0 :data])))) (is (= 1 (count (get ds-res :NP)))) - (is (= 10 (get-in ds-res [:NP 0 :content-id]))) + (is (= 11 (get-in ds-res [:NP 0 :content-id]))) (is (= [0x80 0x97 0x0f 0x64 0x05 0x00] (map #(bit-and % 0xff) (get-in ds-res [:NP 0 :data])))) (is (= 1 (count (get ds-res :TS)))) - (is (= 11 (get-in ds-res [:TS 0 :content-id]))) + (is (= 12 (get-in ds-res [:TS 0 :content-id]))) (is (= [0x80 0x96 0x0f 0x00 0xff 0xff 0xff 0xff 0x01 0x00] (map #(bit-and % 0xff) (get-in ds-res [:TS 0 :data])))) (is (= 1 (count (get ds-res :NF)))) - (is (= 12 (get-in ds-res [:NF 0 :content-id]))) + (is (= 13 (get-in ds-res [:NF 0 :content-id]))) (is (zero? (count (get-in ds-res [:NF 0 :data])))) (is (= 1 (count (get ds-res :TL)))) - (is (= 13 (get-in ds-res [:TL 0 :content-id]))) + (is (= 14 (get-in ds-res [:TL 0 :content-id]))) (is (= [0 0 0 0 1] (seq (get-in ds-res [:TL 0 :data])))) (is (= 1 (count (get ds-res :FN)))) - (is (= 14 (get-in ds-res [:FN 0 :content-id]))) + (is (= 15 (get-in ds-res [:FN 0 :content-id]))) (is (= [1 1 0 2 0] (seq (get-in ds-res [:FN 0 :data])))) (is (= 1 (count (get ds-res :FC)))) - (is (= 15 (get-in ds-res [:FC 0 :content-id]))) + (is (= 16 (get-in ds-res [:FC 0 :content-id]))) (is (= [(int \X) (int \S) (int \I) (int \D)] (seq (get-in ds-res [:FC 0 :data])))) (is (= 1 (count (get ds-res :FP)))) - (is (= 16 (get-in ds-res [:FP 0 :content-id]))) + (is (= 17 (get-in ds-res [:FP 0 :content-id]))) (is (= [3 1 2 2] (seq (get-in ds-res [:FP 0 :data])))) (is (= 1 (count (get ds-res :DL)))) - (is (= 17 (get-in ds-res [:DL 0 :content-id]))) + (is (= 18 (get-in ds-res [:DL 0 :content-id]))) (is (= [1] (seq (get-in ds-res [:DL 0 :data])))) (is (= 2 (count (get ds-res :BB)))) - (is (= 18 (get-in ds-res [:BB 0 :content-id]))) + (is (= 19 (get-in ds-res [:BB 0 :content-id]))) (is (zero? (count (get-in ds-res [:BB 0 :data])))) - (is (= 19 (get-in ds-res [:BB 1 :content-id]))) + (is (= 20 (get-in ds-res [:BB 1 :content-id]))) (is (zero? (count (get-in ds-res [:BB 1 :data])))) (is (= 2 (count (get ds-res :QQ)))) - (is (= 20 (get-in ds-res [:QQ 0 :content-id]))) + (is (= 21 (get-in ds-res [:QQ 0 :content-id]))) (is (zero? (count (get-in ds-res [:QQ 0 :data])))) - (is (= 21 (get-in ds-res [:QQ 1 :content-id]))) + (is (= 22 (get-in ds-res [:QQ 1 :content-id]))) (is (zero? (count (get-in ds-res [:QQ 1 :data])))) (is (= 1 (count (get ds-res :BS)))) - (is (= 22 (get-in ds-res [:BS 0 :content-id]))) + (is (= 23 (get-in ds-res [:BS 0 :content-id]))) (is (= [0] (seq (get-in ds-res [:BS 0 :data])))) (is (= 2 (count (get ds-res :IN)))) - (is (= 23 (get-in ds-res [:IN 0 :content-id]))) + (is (= 24 (get-in ds-res [:IN 0 :content-id]))) (is (= [1] (seq (get-in ds-res [:IN 0 :data])))) - (is (= 24 (get-in ds-res [:IN 1 :content-id]))) + (is (= 25 (get-in ds-res [:IN 1 :content-id]))) (is (= "A" (String. ^bytes (get-in ds-res [:IN 1 :data])))) (is (= 1 (count (get ds-res :RS)))) - (is (= 25 (get-in ds-res [:RS 0 :content-id]))) + (is (= 26 (get-in ds-res [:RS 0 :content-id]))) (is (zero? (count (get-in ds-res [:RS 0 :data])))) (is (= 1 (count (get ds-res :PD)))) - (is (= 26 (get-in ds-res [:PD 0 :content-id]))) + (is (= 27 (get-in ds-res [:PD 0 :content-id]))) (is (zero? (count (get-in ds-res [:PD 0 :data])))) (is (= 1 (count (get ds-res :HC)))) - (is (= 27 (get-in ds-res [:HC 0 :content-id]))) + (is (= 28 (get-in ds-res [:HC 0 :content-id]))) (is (zero? (count (get-in ds-res [:HC 0 :data])))) (is (= 2 (count (get ds-res :SC)))) - (is (= 28 (get-in ds-res [:SC 0 :content-id]))) + (is (= 29 (get-in ds-res [:SC 0 :content-id]))) (is (= [2] (seq (get-in ds-res [:SC 0 :data])))) - (is (= 29 (get-in ds-res [:SC 1 :content-id]))) + (is (= 30 (get-in ds-res [:SC 1 :content-id]))) (is (= "CC" (String. ^bytes (get-in ds-res [:SC 1 :data])))) (is (= 1 (count (get ds-res :MQ)))) - (is (= 30 (get-in ds-res [:MQ 0 :content-id]))) + (is (= 31 (get-in ds-res [:MQ 0 :content-id]))) (is (= [0 15 60 15 0] (seq (get-in ds-res [:MQ 0 :data])))) (is (= 1 (count (get ds-res :BA)))) - (is (= 31 (get-in ds-res [:BA 0 :content-id]))) + (is (= 32 (get-in ds-res [:BA 0 :content-id]))) (is (zero? (count (get-in ds-res [:BA 0 :data])))) (is (= 1 (count (get ds-res :QS)))) - (is (= 32 (get-in ds-res [:QS 0 :content-id]))) + (is (= 33 (get-in ds-res [:QS 0 :content-id]))) (is (= "HFHHH##AACCCCFFEBBFFAEEEE" (->> (get-in ds-res [:QS 0 :data]) (map #(+ (long %) 33)) @@ -363,16 +363,16 @@ (into {} (:alignment-stats slice-ctx)))) (is (= 1 (count (get ds-res :BF)))) - (is (= 1 (get-in ds-res [:BF 0 :content-id]))) + (is (= 2 (get-in ds-res [:BF 0 :content-id]))) (is (= [0x45 0x80 0x85 0x45 0x80 0x85 0x45] (map #(bit-and % 0xff) (get-in ds-res [:BF 0 :data])))) (is (= 1 (count (get ds-res :CF)))) - (is (= 2 (get-in ds-res [:CF 0 :content-id]))) + (is (= 3 (get-in ds-res [:CF 0 :content-id]))) (is (= [3 3 3 3 3] (seq (get-in ds-res [:CF 0 :data])))) (is (= 1 (count (get ds-res :RI)))) - (is (= 3 (get-in ds-res [:RI 0 :content-id]))) + (is (= 4 (get-in ds-res [:RI 0 :content-id]))) (is (= [0xff 0xff 0xff 0xff 0x0f 0xff 0xff 0xff 0xff 0x0f 0xff 0xff 0xff 0xff 0x0f @@ -381,15 +381,15 @@ (map #(bit-and % 0xff) (get-in ds-res [:RI 0 :data])))) (is (= 1 (count (get ds-res :RL)))) - (is (= 4 (get-in ds-res [:RL 0 :content-id]))) + (is (= 5 (get-in ds-res [:RL 0 :content-id]))) (is (= [5 5 5 5 5] (seq (get-in ds-res [:RL 0 :data])))) (is (= 1 (count (get ds-res :AP)))) - (is (= 5 (get-in ds-res [:AP 0 :content-id]))) + (is (= 6 (get-in ds-res [:AP 0 :content-id]))) (is (= [0 0 0 0 0] (seq (get-in ds-res [:AP 0 :data])))) (is (= 1 (count (get ds-res :RG)))) - (is (= 6 (get-in ds-res [:RG 0 :content-id]))) + (is (= 7 (get-in ds-res [:RG 0 :content-id]))) (is (= [0xff 0xff 0xff 0xff 0x0f 0xff 0xff 0xff 0xff 0x0f 0xff 0xff 0xff 0xff 0x0f @@ -398,15 +398,15 @@ (map #(bit-and % 0xff) (get-in ds-res [:RG 0 :data])))) (is (= 1 (count (get ds-res :RN)))) - (is (= 7 (get-in ds-res [:RN 0 :content-id]))) + (is (= 8 (get-in ds-res [:RN 0 :content-id]))) (is (= "q001\tq001\tq002\tq002\tq003\t" (String. ^bytes (get-in ds-res [:RN 0 :data])))) (is (= 1 (count (get ds-res :MF)))) - (is (= 8 (get-in ds-res [:MF 0 :content-id]))) + (is (= 9 (get-in ds-res [:MF 0 :content-id]))) (is (= [2 2 2 2 2] (seq (get-in ds-res [:MF 0 :data])))) (is (= 1 (count (get ds-res :NS)))) - (is (= 9 (get-in ds-res [:NS 0 :content-id]))) + (is (= 10 (get-in ds-res [:NS 0 :content-id]))) (is (= [0xff 0xff 0xff 0xff 0x0f 0xff 0xff 0xff 0xff 0x0f 0xff 0xff 0xff 0xff 0x0f @@ -415,88 +415,88 @@ (map #(bit-and % 0xff) (get-in ds-res [:NS 0 :data])))) (is (= 1 (count (get ds-res :NP)))) - (is (= 10 (get-in ds-res [:NP 0 :content-id]))) + (is (= 11 (get-in ds-res [:NP 0 :content-id]))) (is (= [0 0 0 0 0] (seq (get-in ds-res [:NP 0 :data])))) (is (= 1 (count (get ds-res :TS)))) - (is (= 11 (get-in ds-res [:TS 0 :content-id]))) + (is (= 12 (get-in ds-res [:TS 0 :content-id]))) (is (= [0 0 0 0 0] (seq (get-in ds-res [:TS 0 :data])))) (is (= 1 (count (get ds-res :NF)))) - (is (= 12 (get-in ds-res [:NF 0 :content-id]))) + (is (= 13 (get-in ds-res [:NF 0 :content-id]))) (is (zero? (count (get-in ds-res [:NF 0 :data])))) (is (= 1 (count (get ds-res :TL)))) - (is (= 13 (get-in ds-res [:TL 0 :content-id]))) + (is (= 14 (get-in ds-res [:TL 0 :content-id]))) (is (= [0 0 0 0 0] (seq (get-in ds-res [:TL 0 :data])))) (is (= 1 (count (get ds-res :FN)))) - (is (= 14 (get-in ds-res [:FN 0 :content-id]))) + (is (= 15 (get-in ds-res [:FN 0 :content-id]))) (is (zero? (count (get-in ds-res [:FN 0 :data])))) (is (= 1 (count (get ds-res :FC)))) - (is (= 15 (get-in ds-res [:FC 0 :content-id]))) + (is (= 16 (get-in ds-res [:FC 0 :content-id]))) (is (zero? (count (get-in ds-res [:FC 0 :data])))) (is (= 1 (count (get ds-res :FP)))) - (is (= 16 (get-in ds-res [:FP 0 :content-id]))) + (is (= 17 (get-in ds-res [:FP 0 :content-id]))) (is (zero? (count (get-in ds-res [:FP 0 :data])))) (is (= 1 (count (get ds-res :DL)))) - (is (= 17 (get-in ds-res [:DL 0 :content-id]))) + (is (= 18 (get-in ds-res [:DL 0 :content-id]))) (is (zero? (count (get-in ds-res [:DL 0 :data])))) (is (= 2 (count (get ds-res :BB)))) - (is (= 18 (get-in ds-res [:BB 0 :content-id]))) + (is (= 19 (get-in ds-res [:BB 0 :content-id]))) (is (zero? (count (get-in ds-res [:BB 0 :data])))) - (is (= 19 (get-in ds-res [:BB 1 :content-id]))) + (is (= 20 (get-in ds-res [:BB 1 :content-id]))) (is (zero? (count (get-in ds-res [:BB 1 :data])))) (is (= 2 (count (get ds-res :QQ)))) - (is (= 20 (get-in ds-res [:QQ 0 :content-id]))) + (is (= 21 (get-in ds-res [:QQ 0 :content-id]))) (is (zero? (count (get-in ds-res [:QQ 0 :data])))) - (is (= 21 (get-in ds-res [:QQ 1 :content-id]))) + (is (= 22 (get-in ds-res [:QQ 1 :content-id]))) (is (zero? (count (get-in ds-res [:QQ 1 :data])))) (is (= 1 (count (get ds-res :BS)))) - (is (= 22 (get-in ds-res [:BS 0 :content-id]))) + (is (= 23 (get-in ds-res [:BS 0 :content-id]))) (is (zero? (count (get-in ds-res [:BS 0 :data])))) (is (= 2 (count (get ds-res :IN)))) - (is (= 23 (get-in ds-res [:IN 0 :content-id]))) + (is (= 24 (get-in ds-res [:IN 0 :content-id]))) (is (zero? (count (get-in ds-res [:IN 0 :data])))) - (is (= 24 (get-in ds-res [:IN 1 :content-id]))) + (is (= 25 (get-in ds-res [:IN 1 :content-id]))) (is (zero? (count (get-in ds-res [:IN 1 :data])))) (is (= 1 (count (get ds-res :RS)))) - (is (= 25 (get-in ds-res [:RS 0 :content-id]))) + (is (= 26 (get-in ds-res [:RS 0 :content-id]))) (is (zero? (count (get-in ds-res [:RS 0 :data])))) (is (= 1 (count (get ds-res :PD)))) - (is (= 26 (get-in ds-res [:PD 0 :content-id]))) + (is (= 27 (get-in ds-res [:PD 0 :content-id]))) (is (zero? (count (get-in ds-res [:PD 0 :data])))) (is (= 1 (count (get ds-res :HC)))) - (is (= 27 (get-in ds-res [:HC 0 :content-id]))) + (is (= 28 (get-in ds-res [:HC 0 :content-id]))) (is (zero? (count (get-in ds-res [:HC 0 :data])))) (is (= 2 (count (get ds-res :SC)))) - (is (= 28 (get-in ds-res [:SC 0 :content-id]))) + (is (= 29 (get-in ds-res [:SC 0 :content-id]))) (is (zero? (count (get-in ds-res [:SC 0 :data])))) - (is (= 29 (get-in ds-res [:SC 1 :content-id]))) + (is (= 30 (get-in ds-res [:SC 1 :content-id]))) (is (zero? (count (get-in ds-res [:SC 1 :data])))) (is (= 1 (count (get ds-res :MQ)))) - (is (= 30 (get-in ds-res [:MQ 0 :content-id]))) + (is (= 31 (get-in ds-res [:MQ 0 :content-id]))) (is (zero? (count (get-in ds-res [:MQ 0 :data])))) (is (= 1 (count (get ds-res :BA)))) - (is (= 31 (get-in ds-res [:BA 0 :content-id]))) + (is (= 32 (get-in ds-res [:BA 0 :content-id]))) (is (= "AATCCATTGTTGGTATCTTGGCACA" (String. ^bytes (get-in ds-res [:BA 0 :data])))) (is (= 1 (count (get ds-res :QS)))) - (is (= 32 (get-in ds-res [:QS 0 :content-id]))) + (is (= 33 (get-in ds-res [:QS 0 :content-id]))) (is (= "CCFFFBDFADADDHFDDDFDBCCFD" (->> (get-in ds-res [:QS 0 :data]) (map #(+ (long %) 33)) diff --git a/test/cljam/io/cram_test.clj b/test/cljam/io/cram_test.clj index 68086d16..85872c60 100644 --- a/test/cljam/io/cram_test.clj +++ b/test/cljam/io/cram_test.clj @@ -1,12 +1,16 @@ (ns cljam.io.cram-test (:require [cljam.io.cram :as cram] + [cljam.io.cram.decode.structure :as struct] + [cljam.io.cram.reader :as reader] [cljam.io.sam :as sam] [cljam.test-common :as common :refer [clean-cache! deftest-remote prepare-cache! prepare-cavia! with-before-after]] [clojure.java.io :as io] - [clojure.test :refer [are deftest is testing]])) + [clojure.test :refer [are deftest is testing]]) + (:import [cljam.io.cram.reader CRAMReader] + [java.nio.channels FileChannel])) (def ^:private temp-cram-file (io/file common/temp-dir "test.cram")) (def ^:private temp-cram-file-2 (io/file common/temp-dir "test2.cram")) @@ -125,6 +129,42 @@ (is (= (cram/read-alignments r) (cram/read-alignments r')))))) +(defn- all-compression-headers [^CRAMReader r] + (let [^FileChannel ch (.-channel r)] + (letfn [(read1 [container-header bb] + (when-not (struct/eof-container? container-header) + (struct/decode-compression-header-block bb))) + (step [] + (when (< (.position ch) (.size ch)) + (when-let [header (#'reader/read-container-with r read1)] + (cons header (lazy-seq (step))))))] + (step)))) + +(deftest writer-with-reference-embedding-test + (with-before-after {:before (prepare-cache!) + :after (clean-cache!)} + (with-open [r (cram/reader common/test-cram-file + {:reference common/test-fa-file}) + w (cram/writer temp-cram-file + {:reference common/test-fa-file + :embed-reference? true + ;; to avoid generating multi-ref slices + :min-single-ref-slice-size 0 + :ds-compressor-overrides {:embedded-ref :raw}})] + (cram/write-header w (cram/read-header r)) + (cram/write-alignments w (cram/read-alignments r) (cram/read-header r))) + (with-open [r (cram/reader common/test-cram-file + {:reference common/test-fa-file}) + r' (cram/reader temp-cram-file)] + (is (= (cram/read-header r) + (cram/read-header r'))) + (is (= (cram/read-alignments r) + (cram/read-alignments r')))) + (with-open [r (cram/reader temp-cram-file)] + (let [headers (all-compression-headers r)] + (is (seq headers)) + (is (every? #(false? (get-in % [:preservation-map :RR])) headers)))))) + (deftest writer-index-options-test (with-before-after {:before (prepare-cache!) :after (clean-cache!)}