From c8a00e30ce3e1fb5bb998608942a06cfd8562345 Mon Sep 17 00:00:00 2001 From: Shogo Ohta Date: Wed, 3 Jul 2024 08:21:05 +0900 Subject: [PATCH 01/10] Implement data series encoders --- src/cljam/io/cram/data_series.clj | 285 ++++++++++++++++++ src/cljam/io/cram/decode/data_series.clj | 143 --------- src/cljam/io/cram/reader.clj | 2 +- .../io/cram/{decode => }/data_series_test.clj | 16 +- 4 files changed, 295 insertions(+), 151 deletions(-) create mode 100644 src/cljam/io/cram/data_series.clj delete mode 100644 src/cljam/io/cram/decode/data_series.clj rename test/cljam/io/cram/{decode => }/data_series_test.clj (98%) diff --git a/src/cljam/io/cram/data_series.clj b/src/cljam/io/cram/data_series.clj new file mode 100644 index 00000000..b634b52a --- /dev/null +++ b/src/cljam/io/cram/data_series.clj @@ -0,0 +1,285 @@ +(ns cljam.io.cram.data-series + (:require [cljam.io.cram.bit-stream :as bs] + [cljam.io.cram.itf8 :as itf8] + [cljam.io.util.byte-buffer :as bb] + [cljam.io.util.lsb.io-stream :as lsb] + [clojure.string :as str]) + (:import [java.io ByteArrayOutputStream OutputStream] + [java.nio Buffer ByteBuffer])) + +(defn- data-series-type [ds] + (case ds + (:BF :CF :RI :RL :AP :RG :MF :NS :NP :TS :NF :TL :FN :FP :DL :RS :PD :HC :MQ) + :int + + (:FC :BS :BA :QS) + :byte + + (:RN :BB :QQ :IN :SC) + :bytes)) + +(defn- build-codec-decoder + [{:keys [codec] :as params} data-type bs-decoder content-id->block-data] + (case codec + :external + (let [^ByteBuffer block (get content-id->block-data (:content-id params))] + (case data-type + :byte #(.get block) + :int #(itf8/decode-itf8 block))) + + :huffman + (let [{:keys [alphabet bit-len]} params] + (assert (and (= (count alphabet) 1) + (zero? (long (first bit-len)))) + "Huffman coding for more than one word is not supported yet.") + (constantly (first alphabet))) + + :byte-array-len + (let [{:keys [len-encoding val-encoding]} params + len-decoder (build-codec-decoder len-encoding :int bs-decoder content-id->block-data) + val-decoder (build-codec-decoder val-encoding :byte bs-decoder content-id->block-data)] + (fn [] + (let [len (len-decoder) + bb (bb/allocate-lsb-byte-buffer len)] + (dotimes [_ len] + (.put bb (byte (val-decoder)))) + (.array bb)))) + + :byte-array-stop + (let [{:keys [stop-byte content-id]} params + ^ByteBuffer block (get content-id->block-data content-id)] + (fn [] + (.mark ^Buffer block) + (let [start (.position block) + end (long + (loop [] + (if (= (.get block) (byte stop-byte)) + (.position block) + (recur)))) + len (dec (- end start)) + _ (.reset ^Buffer block) + ret (bb/read-bytes block len)] + (.get block) + ret))) + + :beta + (let [{:keys [offset length]} params] + (fn [] + (+ (long (bs/read-bits bs-decoder (long length))) + (long offset)))))) + +(defn build-data-series-decoders + "Builds decoders for data series based on the encodings specified in the given + compression header and block data. + + `ds-encodings` is a map { } and the return value is + a map { }, where: + - : a keyword representing the data series name + - : a map representing the encoding of the data series + - : a function with no arguments that returns a value decoded from + the data series upon each call" + [{ds-encodings :data-series} bs-decoder blocks] + (let [content-id->block-data (into {} (map (juxt :content-id :data)) blocks)] + (reduce-kv (fn [decoders ds params] + (let [dt (data-series-type ds) + decoder (build-codec-decoder params dt bs-decoder content-id->block-data)] + (assoc decoders ds decoder))) + {} ds-encodings))) + +(defn- tag-value-coercer [tag-type] + (case tag-type + \A #(char (.get ^ByteBuffer %)) + \c #(.get ^ByteBuffer %) + \C bb/read-ubyte + \s #(.getShort ^ByteBuffer %) + \S bb/read-ushort + \i #(.getInt ^ByteBuffer %) + \I bb/read-uint + \f #(.getFloat ^ByteBuffer %) + \Z bb/read-null-terminated-string + \H (fn [^ByteBuffer bb] + (let [s (.getBytes ^String (bb/read-null-terminated-string bb)) + n (quot (alength s) 2) + arr (byte-array n)] + (dotimes [i n] + (let [b (bit-or (bit-shift-left (Character/digit (aget s (* 2 i)) 16) 4) + (Character/digit (aget s (inc (* 2 i))) 16))] + (aset arr i (byte b)))) + arr)) + \B (fn [^ByteBuffer bb] + (let [tag-type' (char (.get bb)) + len (.getInt bb) + coercer (tag-value-coercer tag-type') + vs (repeatedly len (partial coercer bb))] + (str/join \, (cons tag-type' vs)))))) + +(defn- build-tag-decoder [tag-encoding tag-type bs-decoder content-id->block-data] + (let [decoder (build-codec-decoder tag-encoding :bytes bs-decoder content-id->block-data) + coercer (tag-value-coercer tag-type)] + (fn [] + (let [bb (bb/make-lsb-byte-buffer (decoder))] + (coercer bb))))) + +(defn build-tag-decoders + "Builds decoders for tags based on the encodings specified in the given + compression header and block data. + + `tags` is a map { { }} and the return + value is a map { { }}, where: + - : a keyword representing the tag name + - : a character representing a type of the tag + - : a map representing the encoding of the tag and type + - : a function with no arguments that returns a value decoded from + the data series for the tag upon each call" + [{:keys [tags]} bs-decoder blocks] + (let [content-id->block-data (into {} (map (juxt :content-id :data)) blocks)] + (reduce-kv + (fn [decoders tag m] + (reduce-kv + (fn [decoders tag-type encoding] + (let [decoder (build-tag-decoder encoding tag-type bs-decoder content-id->block-data) + tag-type' (str (if (#{\c \C \s \S \i \I} tag-type) \i tag-type))] + (assoc-in decoders [tag tag-type] + (fn [] {:type tag-type' :value (decoder)})))) + decoders m)) + {} tags))) + +(defn- build-codec-encoder + [{:keys [codec content-id] :as params} data-type content-id->state] + (letfn [(out-for-encoder [] + (or (get-in @content-id->state [content-id :out]) + (let [out (ByteArrayOutputStream.)] + (vswap! content-id->state assoc-in [content-id :out] out) + out))) + (data-for-encoder [] + (let [state (get @content-id->state content-id)] + (or (get state :data) + (let [^ByteArrayOutputStream out (:out state) + data (.toByteArray out)] + (vswap! content-id->state assoc-in [content-id :data] data) + data))))] + (case codec + :external + (let [^OutputStream out (out-for-encoder)] + (case data-type + :byte (fn + ([] [{:content-id content-id, :data (data-for-encoder)}]) + ([v] (.write out (int v)))) + :int (fn + ([] [{:content-id content-id, :data (data-for-encoder)}]) + ([v] (itf8/encode-itf8 out v))))) + + :huffman + (let [{:keys [alphabet bit-len]} params] + (assert (and (= (count alphabet) 1) + (zero? (long (first bit-len)))) + "Huffman coding for more than one word is not supported yet.") + (fn + ([] []) + ([_]))) + + :byte-array-len + (let [{:keys [len-encoding val-encoding]} params + len-encoder (build-codec-encoder len-encoding :int content-id->state) + val-encoder (build-codec-encoder val-encoding :byte content-id->state)] + (fn + ([] (into (len-encoder) (val-encoder))) + ([^bytes bs] + (let [len (alength bs)] + (len-encoder len) + (dotimes [i len] + (val-encoder (aget bs i))))))) + + :byte-array-stop + (let [{:keys [stop-byte]} params + ^OutputStream out (out-for-encoder)] + (fn + ([] [{:content-id content-id, :data (data-for-encoder)}]) + ([^bytes bs] + (.write out bs) + (.write out (int stop-byte)))))))) + +(defn build-data-series-encoders + "Builds encoders for data series based on the given encodings. + + `ds-encodings` is a map { } and the return value is + a map { }, where: + - : a keyword representing the data series name + - : a map representing the encoding of the data series + - : a function that takes one argument to encode" + [ds-encoding] + (let [content-id->state (volatile! {})] + (reduce-kv (fn [encoders ds params] + (let [dt (data-series-type ds) + encoder (build-codec-encoder params dt content-id->state)] + (assoc encoders ds encoder))) + {} ds-encoding))) + +(def ^:private digit->char + (let [bs (.getBytes "0123456789ABCDEF")] + (fn [^long i] + (aget bs i)))) + +(defn- tag-value-converter [tag-type] + (case tag-type + \A (fn [^OutputStream out c] (.write out (byte (int c)))) + \c (fn [^OutputStream out b] (.write out (byte b))) + \C lsb/write-ubyte + \s lsb/write-short + \S lsb/write-ushort + \i lsb/write-int + \I lsb/write-uint + \f lsb/write-float + \Z (fn [^OutputStream out ^String s] + (.write out (.getBytes s)) + (.write out (byte 0))) + \H (fn [^OutputStream out ^"[B" bs] + (let [n (alength bs)] + (dotimes [i n] + (let [b (aget bs i)] + (.write out (byte (digit->char (bit-and (bit-shift-right b 4) 0x0f)))) + (.write out (byte (digit->char (bit-and b 0x0f)))))) + (.write out (byte 0)))) + \B (fn [^OutputStream out s] + (let [[t & vs] (str/split s #",") + t' (first t) + n (count vs) + conv (tag-value-converter t') + vs' (mapv (if (= t' \f) + #(Float/parseFloat %) + #(Long/parseLong %)) + vs)] + (.write out (byte (int t'))) + (lsb/write-int out n) + (dotimes [i n] + (conv out (nth vs' i))))))) + +(defn- build-tag-encoder [tag-encoding tag-type content-id->state] + (let [encoder (build-codec-encoder tag-encoding :bytes content-id->state) + converter (tag-value-converter tag-type)] + (fn + ([] (encoder)) + ([v] + (let [out (ByteArrayOutputStream.)] + (converter out v) + (encoder (.toByteArray out))))))) + +(defn build-tag-encoders + "Builds encoders for tags based on the given encodings. + + `tags` is a map { { }} and the return + value is a map { { }}, where: + - : a keyword representing the tag name + - : a character representing a type of the tag + - : a map representing the encoding of the tag and type + - : a function that takes one argument to encode" + [tag-encodings] + (let [content-id->state (volatile! {})] + (reduce-kv + (fn [encoders tag m] + (reduce-kv + (fn [encoders tag-type encoding] + (assoc-in encoders [tag tag-type] + (build-tag-encoder encoding tag-type content-id->state))) + encoders m)) + {} tag-encodings))) diff --git a/src/cljam/io/cram/decode/data_series.clj b/src/cljam/io/cram/decode/data_series.clj deleted file mode 100644 index 1390ab7b..00000000 --- a/src/cljam/io/cram/decode/data_series.clj +++ /dev/null @@ -1,143 +0,0 @@ -(ns cljam.io.cram.decode.data-series - (:require [cljam.io.cram.itf8 :as itf8] - [cljam.io.cram.bit-stream :as bs] - [cljam.io.util.byte-buffer :as bb] - [clojure.string :as str]) - (:import [java.nio Buffer ByteBuffer])) - -(defn- data-series-type [ds] - (case ds - (:BF :CF :RI :RL :AP :RG :MF :NS :NP :TS :NF :TL :FN :FP :DL :RS :PD :HC :MQ) - :int - - (:FC :BS :BA :QS) - :byte - - (:RN :BB :QQ :IN :SC) - :bytes)) - -(defn- build-codec-decoder - [{:keys [codec] :as params} data-type bs-decoder content-id->block-data] - (case codec - :external - (let [^ByteBuffer block (get content-id->block-data (:content-id params))] - (case data-type - :byte #(.get block) - :int #(itf8/decode-itf8 block))) - - :huffman - (let [{:keys [alphabet bit-len]} params] - (assert (and (= (count alphabet) 1) - (zero? (long (first bit-len)))) - "Huffman coding for more than one word is not supported yet.") - (constantly (first alphabet))) - - :byte-array-len - (let [{:keys [len-encoding val-encoding]} params - len-decoder (build-codec-decoder len-encoding :int bs-decoder content-id->block-data) - val-decoder (build-codec-decoder val-encoding :byte bs-decoder content-id->block-data)] - (fn [] - (let [len (len-decoder) - bb (bb/allocate-lsb-byte-buffer len)] - (dotimes [_ len] - (.put bb (byte (val-decoder)))) - (.array bb)))) - - :byte-array-stop - (let [{:keys [stop-byte external-id]} params - ^ByteBuffer block (get content-id->block-data external-id)] - (fn [] - (.mark ^Buffer block) - (let [start (.position block) - end (long - (loop [] - (if (= (.get block) (byte stop-byte)) - (.position block) - (recur)))) - len (dec (- end start)) - _ (.reset ^Buffer block) - ret (bb/read-bytes block len)] - (.get block) - ret))) - - :beta - (let [{:keys [offset length]} params] - (fn [] - (+ (long (bs/read-bits bs-decoder (long length))) - (long offset)))))) - -(defn build-data-series-decoders - "Builds decoders for data series based on the encodings specified in the given - compression header and block data. - - `ds-encodings` is a map { } and the return value is - a map { }, where: - - : a keyword representing the data series name - - : a map representing the encoding of the data series - - : a function with no arguments that returns a value decoded from - the data series upon each call" - [{ds-encodings :data-series} bs-decoder blocks] - (let [content-id->block-data (into {} (map (juxt :content-id :data)) blocks)] - (reduce-kv (fn [decoders ds params] - (let [dt (data-series-type ds) - decoder (build-codec-decoder params dt bs-decoder content-id->block-data)] - (assoc decoders ds decoder))) - {} ds-encodings))) - -(defn- tag-value-coercer [tag-type] - (case tag-type - \A #(char (.get ^ByteBuffer %)) - \c #(.get ^ByteBuffer %) - \C bb/read-ubyte - \s #(.getShort ^ByteBuffer %) - \S bb/read-ushort - \i #(.getInt ^ByteBuffer %) - \I bb/read-uint - \f #(.getFloat ^ByteBuffer %) - \Z bb/read-null-terminated-string - \H (fn [^ByteBuffer bb] - (let [s (.getBytes ^String (bb/read-null-terminated-string bb)) - n (quot (alength s) 2) - arr (byte-array n)] - (dotimes [i n] - (let [b (bit-or (bit-shift-left (Character/digit (aget s (* 2 i)) 16) 4) - (Character/digit (aget s (inc (* 2 i))) 16))] - (aset arr i (byte b)))) - arr)) - \B (fn [^ByteBuffer bb] - (let [tag-type' (char (.get bb)) - len (.getInt bb) - coercer (tag-value-coercer tag-type') - vs (repeatedly len (partial coercer bb))] - (str/join \, (cons tag-type' vs)))))) - -(defn- build-tag-decoder [tag-encoding tag-type bs-decoder content-id->block-data] - (let [decoder (build-codec-decoder tag-encoding :bytes bs-decoder content-id->block-data) - coercer (tag-value-coercer tag-type)] - (fn [] - (let [bb (bb/make-lsb-byte-buffer (decoder))] - (coercer bb))))) - -(defn build-tag-decoders - "Builds decoders for tags based on the encodings specified in the given - compression header and block data. - - `tags` is a map { { }} and the return - value is a map { { }}, where: - - : a keyword representing the tag name - - : a character representing a type of the tag - - : a map representing the encoding of the tag and type - - : a function with no arguments that returns a value decoded from - the data series for the tag upon each call" - [{:keys [tags]} bs-decoder blocks] - (let [content-id->block-data (into {} (map (juxt :content-id :data)) blocks)] - (reduce-kv - (fn [decoders tag m] - (reduce-kv - (fn [decoders tag-type encoding] - (let [decoder (build-tag-decoder encoding tag-type bs-decoder content-id->block-data) - tag-type' (str (if (#{\c \C \s \S \i \I} tag-type) \i tag-type))] - (assoc-in decoders [tag tag-type] - (fn [] {:type tag-type' :value (decoder)})))) - decoders m)) - {} tags))) diff --git a/src/cljam/io/cram/reader.clj b/src/cljam/io/cram/reader.clj index e40c6136..a0c40eef 100644 --- a/src/cljam/io/cram/reader.clj +++ b/src/cljam/io/cram/reader.clj @@ -1,6 +1,6 @@ (ns cljam.io.cram.reader (:require [cljam.io.cram.bit-stream :as bs] - [cljam.io.cram.decode.data-series :as ds] + [cljam.io.cram.data-series :as ds] [cljam.io.cram.decode.record :as record] [cljam.io.cram.decode.structure :as struct] [cljam.io.cram.seq-resolver.protocol :as resolver] diff --git a/test/cljam/io/cram/decode/data_series_test.clj b/test/cljam/io/cram/data_series_test.clj similarity index 98% rename from test/cljam/io/cram/decode/data_series_test.clj rename to test/cljam/io/cram/data_series_test.clj index 26d0737f..29212568 100644 --- a/test/cljam/io/cram/decode/data_series_test.clj +++ b/test/cljam/io/cram/data_series_test.clj @@ -1,10 +1,13 @@ -(ns cljam.io.cram.decode.data-series-test +(ns cljam.io.cram.data-series-test (:require [cljam.io.bam.decoder :as decoder] - [cljam.io.cram.decode.data-series :as ds] [cljam.io.cram.bit-stream :as bs] + [cljam.io.cram.data-series :as ds] [cljam.io.sam.util.cigar :as cigar] [cljam.io.util.byte-buffer :as bb] - [clojure.test :refer [deftest is testing]])) + [cljam.io.util.lsb.io-stream :as lsb] + [clojure.string :as str] + [clojure.test :refer [deftest is testing]]) + (:import [java.io ByteArrayOutputStream])) (deftest build-data-series-decoders-test (let [encodings {:BA {:codec :external, :content-id 1} @@ -14,7 +17,7 @@ :BB {:codec :byte-array-len :len-encoding {:codec :external, :content-id 3} :val-encoding {:codec :external, :content-id 4}} - :RN {:codec :byte-array-stop, :stop-byte 0, :external-id 5}} + :RN {:codec :byte-array-stop, :stop-byte 0, :content-id 5}} blocks [{:content-id 0 :data (bb/make-lsb-byte-buffer (byte-array [2r01001110]))} {:content-id 1 @@ -212,7 +215,7 @@ (testing "strings" (let [encodings {:MC {\Z {:codec :byte-array-stop :stop-byte 9 - :external-id 5063514}} + :content-id 5063514}} :hx {\H {:codec :byte-array-len :len-encoding {:codec :huffman :alphabet [9] @@ -247,7 +250,6 @@ {:type "H" :value [0xde 0xad 0xbe 0xef]}] (map (fn [m] (update m :value (partial map #(bit-and (long %) 0xff)))) [(hx) (hx) (hx) (hx)])))))) - (testing "array values" (testing "byte arrays" (let [encodings {:sb {\B {:codec :byte-array-len @@ -397,7 +399,7 @@ "18S76M1D57M"] encodings {:CG {\B {:codec :byte-array-stop :stop-byte -1 - :external-id 4409154}}} + :content-id 4409154}}} blocks [{:content-id 4409154 :data (let [encoded (map cigar/encode-cigar vs) bb (bb/allocate-lsb-byte-buffer (+ (->> (apply concat encoded) From 0be45819df912b7edb4d31aa0ff84467507fa478 Mon Sep 17 00:00:00 2001 From: Shogo Ohta Date: Wed, 3 Jul 2024 08:29:24 +0900 Subject: [PATCH 02/10] Implement CRAM record encoder --- src/cljam/io/cram/encode/alignment_stats.clj | 53 +++++ src/cljam/io/cram/encode/record.clj | 214 +++++++++++++++++++ src/cljam/io/cram/seq_resolver.clj | 8 +- src/cljam/io/cram/seq_resolver/protocol.clj | 4 +- test/cljam/io/cram/seq_resolver_test.clj | 16 +- 5 files changed, 289 insertions(+), 6 deletions(-) create mode 100644 src/cljam/io/cram/encode/alignment_stats.clj create mode 100644 src/cljam/io/cram/encode/record.clj diff --git a/src/cljam/io/cram/encode/alignment_stats.clj b/src/cljam/io/cram/encode/alignment_stats.clj new file mode 100644 index 00000000..dbf3d139 --- /dev/null +++ b/src/cljam/io/cram/encode/alignment_stats.clj @@ -0,0 +1,53 @@ +(ns cljam.io.cram.encode.alignment-stats) + +(defprotocol IAlignmentStatsBuilder + (update! [_ ri pos end nbases nrecords]) + (build [_])) + +(defrecord AlignmentStats [^long ri ^long start ^long end ^long nbases ^long nrecords]) + +(deftype AlignmentStatsBuilder + [^:unsynchronized-mutable first? + ^:unsynchronized-mutable ^long ref-index + ^:unsynchronized-mutable ^long start + ^:unsynchronized-mutable ^long end + ^:unsynchronized-mutable ^long nbases + ^:unsynchronized-mutable ^long nrecords] + IAlignmentStatsBuilder + (build [_] + (let [ri (if first? -1 ref-index)] + (->AlignmentStats ri + (if (neg? ri) 0 start) + (if (neg? ri) 0 end) + nbases nrecords))) + (update! [this ri pos end nbases nrecords] + (let [ri (long ri) + pos (long pos) + end (long end)] + (cond first? + (do (set! ref-index ri) + (set! first? false)) + + (not= ri ref-index) + (set! ref-index -2)) + (when (and (>= ref-index 0) (pos? pos)) + (when (or (zero? (.-start this)) (< pos (.-start this))) + (set! (.-start this) pos)) + (when (< (.-end this) end) + (set! (.-end this) end))) + (set! (.-nbases this) (+ (.-nbases this) (long nbases))) + (set! (.-nrecords this) (+ (.-nrecords this) (long nrecords)))))) + +(defn make-alignment-stats-builder + "Creates a new alignment stats builder." + [] + (->AlignmentStatsBuilder true 0 0 0 0 0)) + +(defn merge-stats + "Merges multiple alignment stats into one." + [ss] + (let [builder (make-alignment-stats-builder)] + (run! (fn [{:keys [ri start end nbases nrecords]}] + (update! builder ri start end nbases nrecords)) + ss) + (build builder))) diff --git a/src/cljam/io/cram/encode/record.clj b/src/cljam/io/cram/encode/record.clj new file mode 100644 index 00000000..6b80a3c7 --- /dev/null +++ b/src/cljam/io/cram/encode/record.clj @@ -0,0 +1,214 @@ +(ns cljam.io.cram.encode.record + (:require [cljam.io.cram.encode.alignment-stats :as stats] + [cljam.io.sam.util.cigar :as sam.cigar] + [cljam.io.sam.util.flag :as sam.flag] + [cljam.io.sam.util.option :as sam.option] + [cljam.io.cram.seq-resolver.protocol :as resolver]) + (:import [java.util Arrays])) + +(defn- ref-index [rname->idx rname] + (if (= rname "*") + -1 + (get rname->idx rname))) + +(defn- build-positional-data-encoder [cram-header {:keys [RI RL AP RG]}] + (let [rg-id->idx (into {} + (map-indexed (fn [i {:keys [ID]}] [ID i])) + (:RG cram-header))] + (fn [record] + (let [rg (sam.option/value-for-tag :RG record)] + (RI (::ref-index record)) + (RL (count (:seq record))) + (AP (:pos record)) + (RG (if rg (get rg-id->idx rg) -1)))))) + +(defn- build-read-name-encoder [{:keys [RN]}] + (fn [record] + (RN (.getBytes ^String (:qname record))))) + +(defn- build-mate-read-encoder [rname->idx {:keys [MF NS NP TS]}] + (fn [{:keys [^long flag rnext] :as record}] + (let [mate-flag (cond-> 0 + (pos? (bit-and flag (sam.flag/encoded #{:next-reversed}))) + (bit-or 0x01) + + (pos? (bit-and flag (sam.flag/encoded #{:next-unmapped}))) + (bit-or 0x02))] + (MF mate-flag) + (NS (if (= rnext "=") + (::ref-index record) + (ref-index rname->idx rnext))) + (NP (:pnext record)) + (TS (:tlen record))))) + +(defn- build-auxiliary-tags-encoder [tag-dict {:keys [TL]} tag-encoders] + (let [tag-encoder (fn [{:keys [tag] :as item}] + (let [tag&type [tag (:type item)] + encoder (get-in tag-encoders tag&type)] + (fn [opts] + (encoder (get opts tag&type))))) + encoders (mapv (fn [entry] + (let [encoders (mapv tag-encoder entry)] + (fn [opts] + (run! #(% opts) encoders)))) + tag-dict)] + (fn [record] + (let [tl (::tags-index record) + encoder (nth encoders tl) + opts (into {} + (map (fn [opt] + (let [[tag m] (first opt)] + [[tag (first (:type m))] (:value m)]))) + (:options record))] + (TL tl) + (encoder opts))))) + +(defn- build-read-features-encoder [{:keys [FN FP FC BA QS BS IN DL SC HC RS PD]}] + (fn [record] + (let [fs (::features record)] + (FN (count fs)) + (reduce (fn [^long prev-pos {:keys [^long pos] :as f}] + (FP (- pos prev-pos)) + (case (:code f) + :read-base (do (FC (int \B)) + (BA (:base f)) + (QS (:qual f))) + :subst (do (FC (int \X)) + (BS (:subst f))) + :insertion (do (FC (int \I)) + (IN (:bases f))) + :deletion (do (FC (int \D)) + (DL (:len f))) + :softclip (do (FC (int \S)) + (SC (:bases f))) + :hardclip (do (FC (int \H)) + (HC (:len f))) + :ref-skip (do (FC (int \N)) + (RS (:len f))) + :padding (do (FC (int \P)) + (PD (:len f)))) + pos) + 0 fs)))) + +(defn- encode-qual [{:keys [qual] :as record} qs-encoder] + (if (= qual "*") + (dotimes [_ (count (:seq record))] + (qs-encoder -1)) + (let [bs (.getBytes ^String qual)] + (dotimes [i (alength bs)] + (qs-encoder (bit-and (long (- (aget bs i) 33)) 0xff)))))) + +(defn- build-mapped-read-encoder [{:keys [MQ QS] :as encoders}] + (let [features-encoder (build-read-features-encoder encoders)] + (fn [record] + (features-encoder record) + (MQ (:mapq record)) + (encode-qual record QS)))) + +(defn- build-unmapped-read-encoder [{:keys [BA QS]}] + (fn [record] + (let [bs (.getBytes ^String (:seq record))] + (dotimes [i (alength bs)] + (BA (aget bs i))) + (encode-qual record QS)))) + +(defn- add-mismatches + [n subst-mat ^bytes ref-bases rpos ^bytes read-bases ^bytes qs spos fs] + (loop [i (long n), rpos (long rpos), spos (long spos), fs fs] + (if (zero? i) + fs + (let [ref-base (aget ref-bases (dec rpos)) + read-base (aget read-bases spos)] + (if (= ref-base read-base) + (recur (dec i) (inc rpos) (inc spos) fs) + (let [pos (inc spos) + f (if (or (neg? (.indexOf "ATGCN" ref-base)) + (neg? (.indexOf "ATGCN" read-base))) + {:code :read-base :pos pos + :base read-base + :qual (if qs (- (aget qs spos) 33) -1)} + {:code :subst :pos pos + :subst (-> subst-mat + (get (char ref-base)) + (get (char read-base)))})] + (recur (dec i) (inc rpos) (inc spos) (conj! fs f)))))))) + +(defn- calculate-read-features&end + [seq-resolver subst-mat {:keys [rname ^long pos qual cigar] :as record}] + (if (or (zero? pos) (= (:seq record) "*")) + [[] pos] + (let [ref-bases ^bytes (resolver/resolve-sequence seq-resolver rname) + read-bases (.getBytes ^String (:seq record)) + qs (when-not (= qual "*") + (.getBytes ^String qual))] + (loop [[[^long n op] & more] (sam.cigar/parse cigar) + rpos pos + spos 0 + fs (transient [])] + (if op + (let [pos (inc spos)] + (case op + (\M \X \=) (recur more (+ rpos n) (+ spos n) + (add-mismatches n subst-mat ref-bases rpos + read-bases qs spos fs)) + \I (let [spos' (+ spos n) + bs (Arrays/copyOfRange read-bases spos spos')] + (recur more rpos spos' (conj! fs {:code :insertion :pos pos :bases bs}))) + \D (recur more (+ rpos n) spos (conj! fs {:code :deletion :pos pos :len n})) + \N (recur more (+ rpos n) spos (conj! fs {:code :ref-skip :pos pos :len n})) + \S (let [spos' (+ spos n) + bs (Arrays/copyOfRange read-bases spos spos')] + (recur more rpos spos' (conj! fs {:code :softclip :pos pos :bases bs}))) + \H (recur more rpos spos (conj! fs {:code :hardclip :pos pos :len n})) + \P (recur more rpos spos (conj! fs {:code :padding :pos pos :len n})))) + [(persistent! fs) (dec rpos)]))))) + +(defn- build-cram-record-encoder + [seq-resolver cram-header tag-dict subst-mat {:keys [BF CF] :as ds-encoders} + tag-encoders stats-builder] + (let [rname->idx (into {} + (map-indexed (fn [i {:keys [SN]}] [SN i])) + (:SQ cram-header)) + pos-encoder (build-positional-data-encoder cram-header ds-encoders) + name-encoder (build-read-name-encoder ds-encoders) + mate-encoder (build-mate-read-encoder rname->idx ds-encoders) + tags-encoder (build-auxiliary-tags-encoder tag-dict ds-encoders tag-encoders) + mapped-encoder (build-mapped-read-encoder ds-encoders) + unmapped-encoder (build-unmapped-read-encoder ds-encoders)] + (fn [record] + (let [bf (bit-and (long (:flag record)) + (bit-not (sam.flag/encoded #{:next-reversed :next-unmapped}))) + ;; these flag bits of CF are hard-coded at the moment: + ;; - 0x01: quality scores stored as array (true) + ;; - 0x02: detached (true) + ;; - 0x04: has mate downstream (false) + cf (cond-> 0x03 + (= (:seq record) "*") (bit-or 0x08)) + ri (ref-index rname->idx (:rname record)) + [fs end] (calculate-read-features&end seq-resolver subst-mat record) + record' (assoc record ::flag cf ::ref-index ri ::features fs)] + (stats/update! stats-builder ri (:pos record') end (count (:seq record')) 1) + (BF bf) + (CF cf) + (pos-encoder record') + (name-encoder record') + (mate-encoder record') + (tags-encoder record') + (if (sam.flag/unmapped? (:flag record')) + (unmapped-encoder record') + (mapped-encoder record')))))) + +(defn encode-slice-records + "Encodes CRAM records in a slice all at once using the given data series encoders + and tag encoders. Returns the alignment stats for this slice." + [seq-resolver cram-header tag-dict subst-mat ds-encoders tag-encoders records] + (let [stats-builder (stats/make-alignment-stats-builder) + record-encoder (build-cram-record-encoder seq-resolver + cram-header + tag-dict + subst-mat + ds-encoders + tag-encoders + stats-builder)] + (run! record-encoder records) + (stats/build stats-builder))) diff --git a/src/cljam/io/cram/seq_resolver.clj b/src/cljam/io/cram/seq_resolver.clj index 0fa95ead..faa9f949 100644 --- a/src/cljam/io/cram/seq_resolver.clj +++ b/src/cljam/io/cram/seq_resolver.clj @@ -1,13 +1,17 @@ (ns cljam.io.cram.seq-resolver (:require [cljam.io.cram.seq-resolver.protocol :as proto] - [cljam.io.sequence :as cseq]) - (:import [java.io Closeable])) + [cljam.io.sequence :as cseq] + [clojure.core.cache.wrapped :as cache]) + (:import [java.io Closeable] + [java.util Arrays])) (deftype SeqResolver [seq-reader] java.io.Closeable (close [_] (.close ^Closeable seq-reader)) proto/ISeqResolver + (resolve-sequence [this chr] + (proto/resolve-sequence this chr nil nil)) (resolve-sequence [_ chr start end] (when-let [s (cseq/read-sequence seq-reader {:chr chr :start start :end end})] (.getBytes ^String s)))) diff --git a/src/cljam/io/cram/seq_resolver/protocol.clj b/src/cljam/io/cram/seq_resolver/protocol.clj index f9bf4829..05f697ea 100644 --- a/src/cljam/io/cram/seq_resolver/protocol.clj +++ b/src/cljam/io/cram/seq_resolver/protocol.clj @@ -1,10 +1,12 @@ (ns cljam.io.cram.seq-resolver.protocol) (defprotocol ISeqResolver - (resolve-sequence [this chr start end])) + (resolve-sequence [this chr] [this chr start end])) (extend-protocol ISeqResolver nil + (resolve-sequence [this chr] + (resolve-sequence this chr nil nil)) (resolve-sequence [_ chr start end] (throw (ex-info "reference was not specified, but tried to resolve sequence" diff --git a/test/cljam/io/cram/seq_resolver_test.clj b/test/cljam/io/cram/seq_resolver_test.clj index 984d108d..f3006890 100644 --- a/test/cljam/io/cram/seq_resolver_test.clj +++ b/test/cljam/io/cram/seq_resolver_test.clj @@ -1,13 +1,22 @@ (ns cljam.io.cram.seq-resolver-test (:require [cljam.io.cram.seq-resolver :as resolver] [cljam.io.cram.seq-resolver.protocol :as proto] + [cljam.io.sequence :as cseq] [cljam.test-common :as common] [clojure.test :refer [deftest is testing]])) (deftest resolve-sequence-test - (testing "seq resolver resolves sequence for specified region" - (let [resolver (resolver/seq-resolver common/test-fa-file) - resolver' (resolver/clone-seq-resolver resolver)] + (with-open [resolver (resolver/seq-resolver common/test-fa-file) + resolver' (resolver/clone-seq-resolver resolver)] + (testing "seq resolver resolves sequence for chr" + (is (= (with-open [fasta-rdr (cseq/reader common/test-fa-file)] + (cseq/read-sequence fasta-rdr {:chr "ref"})) + (String. (proto/resolve-sequence resolver "ref")) + (String. (proto/resolve-sequence resolver' "ref")))) + (is (= nil + (proto/resolve-sequence resolver "unknown") + (proto/resolve-sequence resolver' "unknown")))) + (testing "seq resolver resolves sequence for specified region" (is (= "AGCAT" (String. (proto/resolve-sequence resolver "ref" 1 5)) (String. (proto/resolve-sequence resolver' "ref" 1 5)))) @@ -15,4 +24,5 @@ (proto/resolve-sequence resolver "unknown" 1 5) (proto/resolve-sequence resolver' "unknown" 1 5))))) (testing "resolver can be omitted, but in that case, trying to resolve seq will end up with error" + (is (thrown? Exception (proto/resolve-sequence nil "ref"))) (is (thrown? Exception (proto/resolve-sequence nil "ref" 1 5))))) From f5f513bcd1fd1f0bcae202e68b6fbfc7763904c7 Mon Sep 17 00:00:00 2001 From: Shogo Ohta Date: Wed, 3 Jul 2024 08:31:31 +0900 Subject: [PATCH 03/10] Implement encoders for various headers and blocks --- src/cljam/io/cram/encode/structure.clj | 280 +++++++++++++++++++++++++ 1 file changed, 280 insertions(+) create mode 100644 src/cljam/io/cram/encode/structure.clj diff --git a/src/cljam/io/cram/encode/structure.clj b/src/cljam/io/cram/encode/structure.clj new file mode 100644 index 00000000..99c2ddd3 --- /dev/null +++ b/src/cljam/io/cram/encode/structure.clj @@ -0,0 +1,280 @@ +(ns cljam.io.cram.encode.structure + (:require [cljam.io.cram.itf8 :as itf8] + [cljam.io.sam.util.header :as sam.header] + [cljam.io.util.byte-buffer :as bb] + [cljam.io.util.lsb.io-stream :as lsb]) + (:import [java.io ByteArrayOutputStream OutputStream] + [java.util.zip CRC32 CheckedOutputStream] + [org.apache.commons.compress.compressors CompressorStreamFactory])) + +(defn- encode-itf8-array [out vs] + (itf8/encode-itf8 out (count vs)) + (run! #(itf8/encode-itf8 out %) vs)) + +(defn- with-out-byte-array ^bytes [f] + (let [out (ByteArrayOutputStream.)] + (f out) + (.toByteArray out))) + +(defn- with-size-prefixed-out [out f] + (let [bs (with-out-byte-array f)] + (itf8/encode-itf8 out (alength bs)) + (lsb/write-bytes out bs))) + +(defn- encode-encoding [^OutputStream out encoding] + (case (:codec encoding) + :null + (do (itf8/encode-itf8 out 0) + (itf8/encode-itf8 out 0)) + + :external + (do (itf8/encode-itf8 out 1) + (with-size-prefixed-out out + (fn [out'] + (itf8/encode-itf8 out' (:content-id encoding))))) + + :huffman + (let [{:keys [alphabet bit-len]} encoding] + (itf8/encode-itf8 out 3) + (with-size-prefixed-out out + (fn [out'] + (encode-itf8-array out' alphabet) + (encode-itf8-array out' bit-len)))) + + :byte-array-len + (let [{:keys [len-encoding val-encoding]} encoding] + (itf8/encode-itf8 out 4) + (with-size-prefixed-out out + (fn [out'] + (encode-encoding out' len-encoding) + (encode-encoding out' val-encoding)))) + + :byte-array-stop + (let [{:keys [stop-byte content-id]} encoding] + (itf8/encode-itf8 out 5) + (with-size-prefixed-out out + (fn [^OutputStream out'] + (.write out' (byte stop-byte)) + (itf8/encode-itf8 out' content-id)))) + + (throw (ex-info (str "codec " (:codec encoding) " not supported") + {:encoding encoding})))) + +(defn encode-file-definition + "Encodes the CRAM file definition to the given OutputStream." + [^OutputStream out version file-id] + (lsb/write-bytes out (.getBytes "CRAM")) + (.write out (byte (:major version))) + (.write out (byte (:minor version))) + (let [bb (bb/allocate-lsb-byte-buffer 20)] + (bb/write-string bb file-id) + (.write out (.array bb)))) + +(defn- with-crc-suffixed ^bytes [out f] + (let [crc (CRC32.) + out' (CheckedOutputStream. out crc) + bb (bb/allocate-lsb-byte-buffer 4)] + (f out') + (.putInt bb (int (.getValue crc))) + (lsb/write-bytes out (.array bb)))) + +(defn encode-container-header + "Encodes a container header to the given OutputStream." + [out container-header] + (with-crc-suffixed out + (fn [out'] + (lsb/write-int out' (:length container-header)) + (itf8/encode-itf8 out' (:ref-seq-id container-header)) + (itf8/encode-itf8 out' (:start container-header)) + (itf8/encode-itf8 out' (:span container-header)) + (itf8/encode-itf8 out' (:records container-header)) + (itf8/encode-ltf8 out' (:counter container-header)) + (itf8/encode-itf8 out' (:bases container-header)) + (itf8/encode-itf8 out' (:blocks container-header)) + (encode-itf8-array out' (:landmarks container-header))))) + +(def ^:private encode-block-data + (let [factory (CompressorStreamFactory.)] + (fn [^OutputStream out method data] + (case method + :raw (lsb/write-bytes out data) + (let [compressor (case method + :gzip CompressorStreamFactory/GZIP + :bzip CompressorStreamFactory/BZIP2 + :lzma CompressorStreamFactory/LZMA + (ex-info (str "compression method " method + " not supported") + {:method method})) + out' (.createCompressorOutputStream factory compressor out)] + (lsb/write-bytes out' data)))))) + +(defn encode-block + "Encodes a block to the given OutputStream." + [^OutputStream out method content-type content-id ^bytes block-data] + (let [method' (case method :raw 0 :gzip 1 :bzip 2 :lzma 3) + size (alength block-data)] + (with-crc-suffixed out + (fn [out'] + (lsb/write-ubyte out' method') + (lsb/write-ubyte out' content-type) + (itf8/encode-itf8 out' content-id) + (itf8/encode-itf8 out' size) + (itf8/encode-itf8 out' size) + (encode-block-data out' method block-data))))) + +(defn generate-block + "Encodes a block and returns the encoded result as a byte array." + ^bytes [method content-type content-id block-data] + (with-out-byte-array + (fn [out] + (encode-block out method content-type content-id block-data)))) + +(defn- encode-cram-header-block [out header] + (let [^String s (sam.header/stringify-header header) + bs (.getBytes s) + size (alength bs) + bb (bb/allocate-lsb-byte-buffer (+ 4 size))] + (.putInt bb size) + (.put bb bs) + (encode-block out :raw 0 0 (.array bb)))) + +(defn encode-cram-header-container + "Encodes a CRAM header container to the given OutputStream." + [out header] + (let [bs (with-out-byte-array #(encode-cram-header-block % header)) + container-header {:length (alength bs) + :ref-seq-id 0 + :start 0 + :span 0 + :records 0 + :counter 0 + :bases 0 + :blocks 1 + :landmarks [0]}] + (encode-container-header out container-header) + (lsb/write-bytes out bs))) + +(defn- encode-substitution-matrix [out m] + (let [all-bases [\A \C \G \T \N] + ret (byte-array 5)] + (dotimes [i 5] + (let [r (nth all-bases i) + codes (get m r)] + (aset ret i + (byte + (loop [j 0, k 0, acc 0] + (if (< j 5) + (if (= i j) + (recur (inc j) k acc) + (let [a (nth all-bases j) + code (long (get codes a))] + (recur (inc j) (inc k) + (bit-or acc (bit-shift-left code (* (- 3 k) 2)))))) + acc)))))) + (lsb/write-bytes out ret))) + +(defn- encode-tag-dictionary [out dict] + (with-size-prefixed-out out + (fn [out'] + (run! (fn [entry] + (run! (fn [{:keys [tag] :as item}] + (let [tag' (name tag)] + (lsb/write-ubyte out' (int (nth tag' 0))) + (lsb/write-ubyte out' (int (nth tag' 1))) + (lsb/write-ubyte out' (int (:type item))))) + entry) + (lsb/write-ubyte out' 0)) + dict)))) + +(defn- encode-preservation-map [out preservation-map subst-mat tag-dict] + (with-size-prefixed-out out + (fn [^OutputStream out'] + (itf8/encode-itf8 out' (+ (count preservation-map) 2)) + (run! (fn [[k v]] + (lsb/write-bytes out' (.getBytes (name k))) + (.write out' (byte (if v 1 0)))) + preservation-map) + (lsb/write-bytes out' (.getBytes "SM")) + (encode-substitution-matrix out' subst-mat) + (lsb/write-bytes out' (.getBytes "TD")) + (encode-tag-dictionary out' tag-dict)))) + +(defn- encode-data-series-encodings [out ds-encodings] + (with-size-prefixed-out out + (fn [out'] + (itf8/encode-itf8 out' (count ds-encodings)) + (run! (fn [[ds encoding]] + (lsb/write-string out' (name ds)) + (encode-encoding out' encoding)) + ds-encodings)))) + +(defn- encode-tag-encoding-map [out tag-encodings] + (with-size-prefixed-out out + (fn [out'] + (itf8/encode-itf8 out' (count tag-encodings)) + (run! (fn [[tag v]] + (run! (fn [[t encoding]] + (let [tag' (name tag) + k (bit-or (bit-shift-left (int (nth tag' 0)) 16) + (bit-shift-left (int (nth tag' 1)) 8) + (int t))] + (itf8/encode-itf8 out' k) + (encode-encoding out' encoding))) + v)) + tag-encodings)))) + +(defn encode-compression-header-block + "Encodes a compression header block to the given OutputStream." + [out preservation-map subst-mat tag-dict ds-encodings tag-encodings] + (let [bs (with-out-byte-array + (fn [out'] + (encode-preservation-map out' preservation-map subst-mat tag-dict) + (encode-data-series-encodings out' ds-encodings) + (encode-tag-encoding-map out' tag-encodings)))] + (encode-block out :raw 1 0 bs))) + +(defn generate-compression-header-block + "Encodes a compression header block and returns the encoded result as a byte array." + ^bytes [preservation-map subst-mat tag-dict ds-encodings tag-encodings] + (with-out-byte-array + (fn [out] + (encode-compression-header-block + out preservation-map subst-mat tag-dict ds-encodings tag-encodings)))) + +(defn encode-slice-header-block + "Encodes a slice header block to the given OutputStream." + [out slice-header blocks] + (let [bs (with-out-byte-array + (fn [out'] + (itf8/encode-itf8 out' (:ref-seq-id slice-header)) + (itf8/encode-itf8 out' (:start slice-header)) + (itf8/encode-itf8 out' (:span slice-header)) + (itf8/encode-itf8 out' (:records slice-header)) + (itf8/encode-ltf8 out' (:counter slice-header)) + (itf8/encode-itf8 out' (count blocks)) + (encode-itf8-array out' (map :content-id blocks)) + (itf8/encode-itf8 out' (:embedded-reference slice-header)) + (lsb/write-bytes out' (:reference-md5 slice-header))))] + (encode-block out :raw 2 0 bs))) + +(defn generate-slice-header-block + "Encodes a slice header block and returns the encoded result as a byte array." + ^bytes [slice-header blocks] + (with-out-byte-array + (fn [out] + (encode-slice-header-block out slice-header blocks)))) + +(defn encode-eof-container + "Encodes an EOF container to the given OutputStream." + [out] + (encode-container-header out {:length 15 + :ref-seq-id -1 + :start 4542278 + :span 0 + :records 0 + :counter 0 + :bases 0 + :blocks 1 + :landmarks []}) + (lsb/write-bytes out (byte-array [0 1 0 6 6])) + (lsb/write-bytes out (byte-array [1 0 1 0 1 0 0xee 0x63 0x01 0x4b]))) From efccfd40b056fb403aa995feea482b7661282c9e Mon Sep 17 00:00:00 2001 From: Shogo Ohta Date: Wed, 3 Jul 2024 08:34:04 +0900 Subject: [PATCH 04/10] Implement CRAM writer --- src/cljam/io/cram/data_series.clj | 39 ++++++ src/cljam/io/cram/writer.clj | 207 ++++++++++++++++++++++++++++++ 2 files changed, 246 insertions(+) create mode 100644 src/cljam/io/cram/writer.clj diff --git a/src/cljam/io/cram/data_series.clj b/src/cljam/io/cram/data_series.clj index b634b52a..f1aa4318 100644 --- a/src/cljam/io/cram/data_series.clj +++ b/src/cljam/io/cram/data_series.clj @@ -144,6 +144,45 @@ decoders m)) {} tags))) +(def ^{:doc "Default encodings for all the data series"} + default-data-series-encodings + {:BF {:content-id 1, :codec :external} + :CF {:content-id 2, :codec :external} + :RI {:content-id 3, :codec :external} + :RL {:content-id 4, :codec :external} + :AP {:content-id 5, :codec :external} + :RG {:content-id 6, :codec :external} + :RN {:content-id 7, :codec :byte-array-stop, :stop-byte (int \tab)} + :MF {:content-id 8, :codec :external} + :NS {:content-id 9, :codec :external} + :NP {:content-id 10, :codec :external} + :TS {:content-id 11, :codec :external} + :NF {:content-id 12, :codec :external} + :TL {:content-id 13, :codec :external} + :FN {:content-id 14, :codec :external} + :FC {:content-id 15, :codec :external} + :FP {:content-id 16, :codec :external} + :DL {:content-id 17, :codec :external} + :BB {:codec :byte-array-len + :len-encoding {:codec :external, :content-id 18} + :val-encoding {:codec :external, :content-id 19}} + :QQ {:codec :byte-array-len + :len-encoding {:codec :external, :content-id 20} + :val-encoding {:codec :external, :content-id 21}} + :BS {:content-id 22, :codec :external} + :IN {:codec :byte-array-len + :len-encoding {:codec :external, :content-id 23} + :val-encoding {:codec :external, :content-id 24}} + :RS {:content-id 25, :codec :external} + :PD {:content-id 26, :codec :external} + :HC {:content-id 27, :codec :external} + :SC {:codec :byte-array-len + :len-encoding {:codec :external, :content-id 28} + :val-encoding {:codec :external, :content-id 29}} + :MQ {:content-id 30, :codec :external} + :BA {:content-id 31, :codec :external} + :QS {:content-id 32, :codec :external}}) + (defn- build-codec-encoder [{:keys [codec content-id] :as params} data-type content-id->state] (letfn [(out-for-encoder [] diff --git a/src/cljam/io/cram/writer.clj b/src/cljam/io/cram/writer.clj new file mode 100644 index 00000000..bcc96b56 --- /dev/null +++ b/src/cljam/io/cram/writer.clj @@ -0,0 +1,207 @@ +(ns cljam.io.cram.writer + (:require [cljam.io.cram.data-series :as ds] + [cljam.io.cram.encode.alignment-stats :as stats] + [cljam.io.cram.encode.record :as record] + [cljam.io.cram.encode.structure :as struct] + [cljam.io.cram.seq-resolver.protocol :as resolver] + [cljam.io.protocols :as protocols]) + (:import [java.io Closeable OutputStream] + [java.security MessageDigest])) + +(declare write-header write-alignments) + +(deftype CRAMWriter [url stream seq-resolver] + Closeable + (close [_] + (struct/encode-eof-container stream) + (.close ^OutputStream stream) + (when seq-resolver + (.close ^Closeable seq-resolver))) + protocols/IWriter + (writer-url [_] url) + protocols/IAlignmentWriter + (write-header [this header] + (write-header this header)) + (write-refs [_ _]) + (write-alignments [this alignments header] + (write-alignments this alignments header) + nil) + #_(write-blocks [this blocks])) + +(defn write-file-definition + "Writes the CRAM file definition." + [^CRAMWriter wtr file-id] + (struct/encode-file-definition (.-stream wtr) {:major 3 :minor 1} file-id)) + +(defn write-header + "Writes the CRAM header." + [^CRAMWriter wtr header] + (struct/encode-cram-header-container (.-stream wtr) header)) + +(defn- reference-md5 [seq-resolver header {:keys [^long ri ^long start ^long end]}] + (if (neg? ri) + (byte-array 16) + (let [chr (:SN (nth (:SQ header) ri)) + ref-bases (resolver/resolve-sequence seq-resolver chr start end) + md5 (MessageDigest/getInstance "md5")] + (.digest md5 ref-bases)))) + +(defn- build-tag-dictionary [^objects container-records] + (let [tags->index (volatile! {})] + (dotimes [i (alength container-records)] + (let [^objects slice-records (aget container-records i)] + (dotimes [j (alength slice-records)] + (let [record (aget slice-records j) + tags (into [] + (keep (fn [opt] + (let [[tag m] (first opt)] + (when-not (= tag :RG) + [tag (first (:type m))])))) + (:options record)) + idx (or (get @tags->index tags) + (let [idx (count @tags->index)] + (vswap! tags->index assoc tags idx) + idx))] + (aset slice-records j + (assoc record ::record/tags-index idx)))))) + (->> (sort-by val @tags->index) + (mapv (fn [[tags _]] + (mapv (fn [[tag tag-type]] + {:tag tag :type tag-type}) + tags)))))) + +(defn- build-tag-encoding [item] + (letfn [(tag-id [item] + (let [tag' (name (:tag item))] + (bit-or (bit-shift-left (int (nth tag' 0)) 16) + (bit-shift-left (int (nth tag' 1)) 8) + (int (:type item))))) + (tag-encoding-for-fixed-size [item size] + {:codec :byte-array-len + :len-encoding {:codec :huffman, :alphabet [size], :bit-len [0]} + :val-encoding {:codec :external, :content-id (tag-id item)}})] + (case (:type item) + (\A \c \C) (tag-encoding-for-fixed-size item 1) + (\s \S) (tag-encoding-for-fixed-size item 2) + (\i \I \f) (tag-encoding-for-fixed-size item 4) + (let [content-id (tag-id item)] + {:codec :byte-array-len + :len-encoding {:codec :external, :content-id content-id} + :val-encoding {:codec :external, :content-id content-id}})))) + +(defn- build-tag-encodings [tag-dict] + (reduce + (fn [m entry] + (reduce + (fn [m {tag-type :type :as item}] + (update-in m [(:tag item) tag-type] #(or % (build-tag-encoding item)))) + m entry)) + {} tag-dict)) + +(defn- stats->header-base [{:keys [ri ^long start ^long end nbases nrecords]}] + {:ref-seq-id ri + :start start + :span (if (zero? start) 0 (inc (- end start))) + :bases nbases + :records nrecords}) + +(defn- generate-slice + [seq-resolver header counter tag-dict subst-mat tag-encodings slice-records] + (let [ds-encoders (ds/build-data-series-encoders ds/default-data-series-encodings) + tag-encoders (ds/build-tag-encoders tag-encodings) + stats (record/encode-slice-records seq-resolver header tag-dict subst-mat + ds-encoders tag-encoders slice-records) + ds-results (mapcat #(%) (vals ds-encoders)) + tag-results (for [[_tag v] tag-encoders + [_type encoder] v + res (encoder)] + res) + blocks (->> (concat ds-results tag-results) + (keep (fn [{:keys [content-id ^bytes data] :as block}] + (when (pos? (alength data)) + (update block :data + #(struct/generate-block :raw 4 content-id %))))) + ;; sort + dedupe by :content-id + (into (sorted-map) (map (juxt :content-id identity))) + vals + (cons {:content-id 0 + :data (struct/generate-block :raw 5 0 (byte-array 0))})) + ref-md5 (reference-md5 seq-resolver header stats) + header-block (struct/generate-slice-header-block + (assoc (stats->header-base stats) + :counter counter + :embedded-reference -1 + :reference-md5 ref-md5) + blocks) + block-data (mapv :data blocks)] + {:header-block header-block + :data-blocks block-data + :stats stats + :counter (+ (long counter) (long (:nrecords stats))) + :size (->> block-data + (map #(alength ^bytes %)) + (apply + (alength header-block)))})) + +(defn- generate-slices + [seq-resolver header counter tag-dict subst-mat tag-encodings container-records] + (loop [[slice-records & more] container-records, counter counter, acc []] + (if slice-records + (let [slice (generate-slice seq-resolver header counter tag-dict subst-mat + tag-encodings slice-records)] + (recur more (:counter slice) (conj acc slice))) + acc))) + +(defn- generate-container-header [^bytes compression-header-block slices] + (let [stats (stats/merge-stats (map :stats slices)) + container-len (->> slices + (map (fn [{:keys [^bytes header-block data-blocks]}] + (apply + (alength header-block) + (map #(alength ^bytes %) data-blocks)))) + (apply + (alength compression-header-block))) + landmarks (->> slices + (reductions #(+ (long %1) (long (:size %2))) + (alength compression-header-block)) + butlast)] + (assoc (stats->header-base stats) + :length container-len + :blocks (->> slices + (map (comp inc count :data-blocks)) + (apply + 1)) + :landmarks landmarks))) + +(defn- write-container [^CRAMWriter wtr header counter container-records] + (let [^OutputStream out (.-stream wtr) + ds-encodings ds/default-data-series-encodings + subst-mat {\A {\T 0, \G 1, \C 2, \N 3} + \T {\A 0, \G 1, \C 2, \N 3} + \G {\A 0, \T 1, \C 2, \N 3} + \C {\A 0, \T 1, \G 2, \N 3} + \N {\A 0, \T 1, \G 2, \C 3}} + tag-dict (build-tag-dictionary container-records) + tag-encodings (build-tag-encodings tag-dict) + slices (generate-slices (.-seq-resolver wtr) header counter tag-dict + subst-mat tag-encodings container-records) + compression-header-block (struct/generate-compression-header-block + {:RN true, :AP false, :RR true} + subst-mat tag-dict ds-encodings tag-encodings) + container-header (generate-container-header compression-header-block slices) + counter' (:counter (peek slices))] + (struct/encode-container-header out (assoc container-header :counter counter)) + (.write out compression-header-block) + (run! (fn [{:keys [^bytes header-block data-blocks]}] + (.write out header-block) + (run! #(.write out ^bytes %) data-blocks)) + slices) + counter')) + +(defn- partition-alignments [slices-per-container records-per-slice alns] + (->> alns + (partition-all records-per-slice) + (map object-array) + (partition-all slices-per-container) + (map object-array))) + +(defn write-alignments + "Writes all the given alignments, which is a sequence of alignment maps." + [wtr alns header] + (reduce (partial write-container wtr header) 0 (partition-alignments 1 10000 alns))) From 24a5d0ab2fae21fbd1f78b03f298561fb05fae4d Mon Sep 17 00:00:00 2001 From: Shogo Ohta Date: Wed, 3 Jul 2024 08:39:07 +0900 Subject: [PATCH 05/10] Add public APIs for CRAM writer --- src/cljam/io/cram.clj | 33 ++++++++++++++++++++++++++++-- src/cljam/io/cram/core.clj | 18 ++++++++++++++++ src/cljam/io/cram/seq_resolver.clj | 19 +++++++++++++++++ src/cljam/io/util.clj | 6 ++++++ 4 files changed, 74 insertions(+), 2 deletions(-) diff --git a/src/cljam/io/cram.clj b/src/cljam/io/cram.clj index 59a423d1..0c8596ff 100644 --- a/src/cljam/io/cram.clj +++ b/src/cljam/io/cram.clj @@ -1,10 +1,11 @@ (ns cljam.io.cram - "Alpha - subject to change. Provides functions for reading from a CRAM file." + "Alpha - subject to change. Provides functions for reading and writing a CRAM file." (:refer-clojure :exclude [indexed?]) (:require [cljam.io.cram.core :as cram] [cljam.io.protocols :as protocols] [cljam.io.util :as io-util]) - (:import [cljam.io.cram.reader CRAMReader])) + (:import [cljam.io.cram.reader CRAMReader] + [cljam.io.cram.writer CRAMWriter])) (defn reader "Creates a CRAM reader depending on the argument f: If f is a file or a string @@ -47,3 +48,31 @@ function immediately realizes a delayed index." [rdr] (protocols/indexed? rdr)) + +(defn writer + "Creates a new CRAM writer that writes to a CRAM file f. + + The function also takes an optional argument `option`, which is a map that + consists of: + - reference: A string representing the path to the reference file, or + a sequence reader that reads sequences from the reference file. + This may be omitted only when the CRAM file to be read does not + require a reference file." + (^CRAMWriter [f] (writer f {})) + (^CRAMWriter [f option] (cram/writer f option))) + +(defn write-header + "Writes header to the CRAM file." + [wtr header] + (protocols/write-header wtr header)) + +(defn write-refs + "Does nothing. This exists only for the sake of compatibility with other + alignment writers." + [wtr refs] + (protocols/write-refs wtr refs)) + +(defn write-alignments + "Writes alignments to the CRAM file." + [wtr alns header] + (protocols/write-alignments wtr alns header)) diff --git a/src/cljam/io/cram/core.clj b/src/cljam/io/cram/core.clj index 6d3cc7d3..f89b770c 100644 --- a/src/cljam/io/cram/core.clj +++ b/src/cljam/io/cram/core.clj @@ -2,12 +2,14 @@ (:require [cljam.io.crai :as crai] [cljam.io.cram.reader :as reader] [cljam.io.cram.seq-resolver :as resolver] + [cljam.io.cram.writer :as writer] [cljam.io.sam.util.refs :as util.refs] [cljam.io.util.byte-buffer :as bb] [cljam.util :as util] [clojure.java.io :as cio] [clojure.string :as str]) (:import [cljam.io.cram.reader CRAMReader] + [cljam.io.cram.writer CRAMWriter] [java.io FileNotFoundException] [java.net URL] [java.nio.channels FileChannel] @@ -71,3 +73,19 @@ (reader/read-file-definition rdr') (reader/skip-container rdr') rdr')) + +(defn writer + "Creates a new CRAM writer that writes to a CRAM file f. + + Takes an option map as the second argument. An option map consists of: + - reference: a string representing the path to a reference file" + ^CRAMWriter [f {:keys [reference]}] + (let [file (cio/file f) + url (cio/as-url file) + url' (str url) + file-id (subs url' 0 (min 20 (count url'))) + out (cio/output-stream file) + seq-resolver (some-> reference resolver/seq-resolver resolver/cached-resolver) + wtr (writer/->CRAMWriter url out seq-resolver)] + (writer/write-file-definition wtr file-id) + wtr)) diff --git a/src/cljam/io/cram/seq_resolver.clj b/src/cljam/io/cram/seq_resolver.clj index faa9f949..5951ed50 100644 --- a/src/cljam/io/cram/seq_resolver.clj +++ b/src/cljam/io/cram/seq_resolver.clj @@ -21,6 +21,25 @@ [seq-file] (->SeqResolver (cseq/reader seq-file))) +(defn cached-resolver + "Creates a new cached sequence resolver based on the given sequence resolver. + + It will cache the resulting sequence for a whole contig sequence query. + For region queries, it will return a copy of the specified region of the cached + sequence if available." + [resolver] + (let [cache (cache/lu-cache-factory {} :threshold 3)] + (reify + Closeable + (close [_] + (.close ^Closeable resolver)) + proto/ISeqResolver + (resolve-sequence [_ chr] + (cache/lookup-or-miss cache chr (partial proto/resolve-sequence resolver))) + (resolve-sequence [this chr start end] + (some-> ^bytes (proto/resolve-sequence this chr) + (Arrays/copyOfRange (dec (long start)) (long end))))))) + (defn clone-seq-resolver "Creates a cloned sequence resolver based on the given resolver." [^SeqResolver resolver] diff --git a/src/cljam/io/util.clj b/src/cljam/io/util.clj index d0a5c322..59b54b99 100644 --- a/src/cljam/io/util.clj +++ b/src/cljam/io/util.clj @@ -6,6 +6,7 @@ cljam.io.bam.reader cljam.io.bam.writer cljam.io.cram.reader + cljam.io.cram.writer cljam.io.vcf.reader cljam.io.vcf.writer cljam.io.bcf.reader @@ -56,6 +57,11 @@ [rdr] (instance? cljam.io.cram.reader.CRAMReader rdr)) +(defn cram-writer? + "Checks if given object is an instance of CRAMWriter." + [wtr] + (instance? cljam.io.cram.writer.CRAMWriter wtr)) + (defn variant-reader? "Checks if given object implements protocol IVariantReader." [rdr] From ae93877d4a1788746226a7b4117737a13fe54a0b Mon Sep 17 00:00:00 2001 From: Shogo Ohta Date: Wed, 3 Jul 2024 08:39:56 +0900 Subject: [PATCH 06/10] Rename :external-id to :content-id for consistency with encoder --- src/cljam/io/cram/decode/structure.clj | 4 ++-- test/cljam/io/cram/decode/structure_test.clj | 8 ++++---- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/src/cljam/io/cram/decode/structure.clj b/src/cljam/io/cram/decode/structure.clj index d4d00cd9..25f38d31 100644 --- a/src/cljam/io/cram/decode/structure.clj +++ b/src/cljam/io/cram/decode/structure.clj @@ -32,8 +32,8 @@ val-encoding (decode-encoding bb)] {:codec :byte-array-len, :len-encoding len-encoding, :val-encoding val-encoding}) 5 (let [stop-byte (.get bb) - external-id (itf8/decode-itf8 bb)] - {:codec :byte-array-stop, :stop-byte stop-byte, :external-id external-id}) + content-id (itf8/decode-itf8 bb)] + {:codec :byte-array-stop, :stop-byte stop-byte, :content-id content-id}) 6 (let [offset (itf8/decode-itf8 bb) length (itf8/decode-itf8 bb)] {:codec :beta, :offset offset, :length length}) diff --git a/test/cljam/io/cram/decode/structure_test.clj b/test/cljam/io/cram/decode/structure_test.clj index 490abc84..498f70d5 100644 --- a/test/cljam/io/cram/decode/structure_test.clj +++ b/test/cljam/io/cram/decode/structure_test.clj @@ -71,7 +71,7 @@ :RL {:codec :external, :content-id 4} :AP {:codec :external, :content-id 5} :RG {:codec :external, :content-id 6} - :RN {:codec :byte-array-stop, :stop-byte 9, :external-id 7} + :RN {:codec :byte-array-stop, :stop-byte 9, :content-id 7} :NF {:codec :external, :content-id 8} :MF {:codec :external, :content-id 9} :NS {:codec :external, :content-id 10} @@ -85,15 +85,15 @@ :BA {:codec :external, :content-id 22} :QS {:codec :external, :content-id 23} :BS {:codec :external, :content-id 24} - :IN {:codec :byte-array-stop, :stop-byte 9, :external-id 25} + :IN {:codec :byte-array-stop, :stop-byte 9, :content-id 25} :DL {:codec :external, :content-id 26} :RS {:codec :external, :content-id 27} - :SC {:codec :byte-array-stop, :stop-byte 9, :external-id 28} + :SC {:codec :byte-array-stop, :stop-byte 9, :content-id 28} :PD {:codec :external, :content-id 29} :HC {:codec :external, :content-id 30}} :tags {:MD - {\Z {:codec :byte-array-stop, :stop-byte 9, :external-id 5063770}} + {\Z {:codec :byte-array-stop, :stop-byte 9, :content-id 5063770}} :NM {\c {:codec :byte-array-len From 93a8ff48aa185638a47cf5ca0ed4b5359008432b Mon Sep 17 00:00:00 2001 From: Shogo Ohta Date: Wed, 3 Jul 2024 08:41:14 +0900 Subject: [PATCH 07/10] Add missing :bases case --- src/cljam/io/cram/decode/record.clj | 1 + 1 file changed, 1 insertion(+) diff --git a/src/cljam/io/cram/decode/record.clj b/src/cljam/io/cram/decode/record.clj index b673e64d..b75e819a 100644 --- a/src/cljam/io/cram/decode/record.clj +++ b/src/cljam/io/cram/decode/record.clj @@ -196,6 +196,7 @@ (recur fs p (conj! acc [gap \M])) (if-let [[op ^long len] (case (:code f) (:read-base :subst) [\M 1] + :bases [\M (count (:bases f))] :insertion [\I (count (:bases f))] :softclip [\S (count (:bases f))] :hardclip [\H (:len f)] From a66f44ade0e3d15ee03a04e3a41ec8243c294823 Mon Sep 17 00:00:00 2001 From: Shogo Ohta Date: Wed, 3 Jul 2024 08:50:37 +0900 Subject: [PATCH 08/10] Omit null characters at the end of CRAM file ID --- src/cljam/io/cram/decode/structure.clj | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/src/cljam/io/cram/decode/structure.clj b/src/cljam/io/cram/decode/structure.clj index 25f38d31..078d0a2e 100644 --- a/src/cljam/io/cram/decode/structure.clj +++ b/src/cljam/io/cram/decode/structure.clj @@ -2,7 +2,8 @@ (:require [cljam.io.cram.codecs.rans4x8 :as rans] [cljam.io.cram.itf8 :as itf8] [cljam.io.sam.util.header :as sam.header] - [cljam.io.util.byte-buffer :as bb]) + [cljam.io.util.byte-buffer :as bb] + [clojure.string :as str]) (:import [java.io ByteArrayInputStream IOException] [java.nio Buffer ByteBuffer ByteOrder] [java.util Arrays] @@ -51,7 +52,8 @@ (throw (IOException. "Invalid CRAM file"))) (let [major (bb/read-ubyte bb) minor (bb/read-ubyte bb) - file-id (String. ^bytes (bb/read-bytes bb 20))] + file-id (-> (String. ^bytes (bb/read-bytes bb 20)) + (str/replace #"\000+$" ""))] {:version {:major major :minor minor}, :id file-id})) (defn decode-container-header From ffc77b969b9cddbb75288fbe6e34e49eb809359b Mon Sep 17 00:00:00 2001 From: Shogo Ohta Date: Wed, 3 Jul 2024 08:51:37 +0900 Subject: [PATCH 09/10] Add tests for CRAM writer --- test/cljam/io/cram/data_series_test.clj | 331 +++++++++++++ .../io/cram/encode/alignment_stats_test.clj | 51 ++ test/cljam/io/cram/encode/record_test.clj | 434 ++++++++++++++++++ test/cljam/io/cram/encode/structure_test.clj | 158 +++++++ test/cljam/io/cram/seq_resolver_test.clj | 16 + test/cljam/io/cram_test.clj | 43 +- test/cljam/io/util_test.clj | 27 ++ 7 files changed, 1059 insertions(+), 1 deletion(-) create mode 100644 test/cljam/io/cram/encode/alignment_stats_test.clj create mode 100644 test/cljam/io/cram/encode/record_test.clj create mode 100644 test/cljam/io/cram/encode/structure_test.clj diff --git a/test/cljam/io/cram/data_series_test.clj b/test/cljam/io/cram/data_series_test.clj index 29212568..2354ba43 100644 --- a/test/cljam/io/cram/data_series_test.clj +++ b/test/cljam/io/cram/data_series_test.clj @@ -444,3 +444,334 @@ {:type "B" :value "f,0.0,-0.5"} {:type "B" :value "f,-0.75,-1.0"}] [(fl) (fl) (fl) (fl)])))))) + +(deftest build-data-series-encoders-test + (let [encodings {:BF {:codec :external, :content-id 1} + :BA {:codec :external, :content-id 2} + :BB {:codec :byte-array-len + :len-encoding {:codec :external, :content-id 3} + :val-encoding {:codec :external, :content-id 4}} + :RN {:codec :byte-array-stop, :stop-byte (int \tab), :content-id 5}} + {:keys [BF BA BB RN]} + (ds/build-data-series-encoders encodings)] + (doseq [[bf ba bb rn] (->> [[0xa1 0x63 0xa3 0x63] + (map int "ATGC") + (map #(.getBytes ^String %) + ["CAT" "CGAAC" "AACT" "ACT"]) + (map #(.getBytes ^String %) + ["qname001" "qname002" "qname003" "qname004"])] + (apply map vector))] + (BF bf) + (BA ba) + (BB bb) + (RN rn)) + (let [[{:keys [content-id data]}] (BF)] + (is (= 1 content-id)) + (is (= [0x80 0xa1 0x63 0x80 0xa3 0x63] (map #(bit-and % 0xff) data)))) + (let [[{:keys [content-id data]}] (BA)] + (is (= 2 content-id)) + (is (= "ATGC" (String. data)))) + (let [[{content-id1 :content-id, data1 :data} + {content-id2 :content-id, data2 :data}] (BB)] + (is (= 3 content-id1)) + (is (= [3 5 4 3] (map #(bit-and % 0xff) data1))) + (is (= 4 content-id2)) + (is (= "CATCGAACAACTACT" (String. data2)))) + (let [[{:keys [content-id data]}] (RN)] + (is (= 5 content-id)) + (is (= "qname001\tqname002\tqname003\tqname004\t" (String. data)))))) + +(deftest build-tag-encoders-test + (testing "single values" + (testing "bytes" + (let [encodings {:sb {\c {:codec :byte-array-len + :len-encoding {:codec :huffman + :alphabet [1] + :bit-len [0]} + :val-encoding {:codec :external + :content-id 7561827}}} + :ub {\C {:codec :byte-array-len + :len-encoding {:codec :huffman + :alphabet [1] + :bit-len [0]} + :val-encoding {:codec :external + :content-id 7692867}}}} + encoders (ds/build-tag-encoders encodings) + sb (get-in encoders [:sb \c]) + ub (get-in encoders [:ub \C])] + (doseq [[v1 v2] (->> [[0xde 0xed 0xbe 0xef] + [0xca 0xfe 0xba 0xbe]] + (apply map vector))] + (sb (unchecked-byte v1)) + (ub v2)) + (let [[{:keys [content-id data]}] (sb)] + (is (= 7561827 content-id)) + (is (= [0xde 0xed 0xbe 0xef] (map #(bit-and % 0xff) data)))) + (let [[{:keys [content-id data]}] (ub)] + (is (= 7692867 content-id)) + (is (= [0xca 0xfe 0xba 0xbe] (map #(bit-and % 0xff) data)))))) + (testing "shorts" + (let [encodings {:ss {\s {:codec :byte-array-len + :len-encoding {:codec :huffman + :alphabet [2] + :bit-len [0]} + :val-encoding {:codec :external + :content-id 7566195}}} + :us {\S {:codec :byte-array-len + :len-encoding {:codec :huffman + :alphabet [2] + :bit-len [0]} + :val-encoding {:codec :external + :content-id 7697235}}}} + encoders (ds/build-tag-encoders encodings) + ss (get-in encoders [:ss \s]) + us (get-in encoders [:us \S])] + (doseq [v [0x0123 0x4567 0x89ab 0xcdef]] + (ss (unchecked-short v)) + (us v)) + (let [[{:keys [content-id data]}] (ss)] + (is (= 7566195 content-id)) + (is (= [0x23 0x01 0x67 0x45 0xab 0x89 0xef 0xcd] (map #(bit-and % 0xff) data)))) + (let [[{:keys [content-id data]}] (us)] + (is (= 7697235 content-id)) + (is (= [0x23 0x01 0x67 0x45 0xab 0x89 0xef 0xcd] (map #(bit-and % 0xff) data)))))) + (testing "ints" + (let [encodings {:si {\i {:codec :byte-array-len + :len-encoding {:codec :huffman + :alphabet [4] + :bit-len [0]} + :val-encoding {:codec :external + :content-id 7563625}}} + :ui {\I {:codec :byte-array-len + :len-encoding {:codec :huffman + :alphabet [4] + :bit-len [0]} + :val-encoding {:codec :external + :content-id 7694665}}}} + encoders (ds/build-tag-encoders encodings) + si (get-in encoders [:si \i]) + ui (get-in encoders [:ui \I])] + (doseq [v [0 0x01234567 0x89abcdef 0xffffffff]] + (si (unchecked-int v)) + (ui v)) + (let [[{:keys [content-id data]}] (si)] + (is (= 7563625 content-id)) + (is (= [0x00 0x00 0x00 0x00 0x67 0x45 0x23 0x01 + 0xef 0xcd 0xab 0x89 0xff 0xff 0xff 0xff] + (map #(bit-and % 0xff) data)))) + (let [[{:keys [content-id data]}] (ui)] + (is (= 7694665 content-id)) + (is (= [0x00 0x00 0x00 0x00 0x67 0x45 0x23 0x01 + 0xef 0xcd 0xab 0x89 0xff 0xff 0xff 0xff] + (map #(bit-and % 0xff) data)))))) + (testing "floats" + (let [encodings {:fl {\f {:codec :byte-array-len + :len-encoding {:codec :huffman + :alphabet [4] + :bit-len [0]} + :val-encoding {:codec :external + :content-id 6712422}}}} + encoders (ds/build-tag-encoders encodings) + fl (get-in encoders [:fl \f])] + (doseq [v [1.0 0.75 -0.5 0.0]] + (fl v)) + (let [[{:keys [content-id data]}] (fl) + bb (doto (bb/allocate-lsb-byte-buffer 16) + (.putFloat 1.0) + (.putFloat 0.75) + (.putFloat -0.5) + (.putFloat 0.0))] + (is (= 6712422 content-id)) + (is (= (seq (.array bb)) (seq data)))))) + (testing "strings" + (let [encodings {:MC {\Z {:codec :byte-array-stop + :stop-byte 9 + :content-id 5063514}} + :hx {\H {:codec :byte-array-len + :len-encoding {:codec :huffman + :alphabet [9] + :bit-len [0]} + :val-encoding {:codec :external + :content-id 6846536}}}} + encoders (ds/build-tag-encoders encodings) + MC (get-in encoders [:MC \Z]) + hx (get-in encoders [:hx \H])] + (doseq [[v1 v2] (->> [["151M" "20S131M" "16S74M1D58M2S" "151M"] + [[0x01 0x23 0x45 0x67] + [0x89 0xab 0xcd 0xef] + [0xca 0xfe 0xba 0xbe] + [0xde 0xad 0xbe 0xef]]] + (apply map vector))] + (MC v1) + (hx (byte-array v2))) + (let [[{:keys [content-id data]}] (MC)] + (is (= 5063514 content-id)) + (is (= "151M\000\t20S131M\000\t16S74M1D58M2S\000\t151M\000\t" + (String. data)))) + (let [[{:keys [content-id data]}] (hx)] + (is (= 6846536 content-id)) + (is (= "01234567\00089ABCDEF\000CAFEBABE\000DEADBEEF\000" + (String. data))))))) + (testing "array values" + (testing "byte arrays" + (let [encodings {:sb {\B {:codec :byte-array-len + :len-encoding {:codec :huffman + :alphabet [9] + :bit-len [0]} + :val-encoding {:codec :external + :content-id 7561794}}} + :ub {\B {:codec :byte-array-len + :len-encoding {:codec :huffman + :alphabet [9] + :bit-len [0]} + :val-encoding {:codec :external + :content-id 7692866}}}} + encoders (ds/build-tag-encoders encodings) + sb (get-in encoders [:sb \B]) + ub (get-in encoders [:ub \B])] + (doseq [[v1 v2] (->> [["c,0,1,2,3" + "c,124,125,126,127" + "c,-128,-127,-126,-125" + "c,-4,-3,-2,-1"] + ["C,0,1,2,3" + "C,124,125,126,127" + "C,128,129,130,131" + "C,252,253,254,255"]] + (apply map vector))] + (sb v1) + (ub v2)) + (let [[{:keys [content-id data]}] (sb)] + (is (= 7561794 content-id)) + (is (= [0x63 0x04 0x00 0x00 0x00 0x00 0x01 0x02 0x03 + 0x63 0x04 0x00 0x00 0x00 0x7c 0x7d 0x7e 0x7f + 0x63 0x04 0x00 0x00 0x00 0x80 0x81 0x82 0x83 + 0x63 0x04 0x00 0x00 0x00 0xfc 0xfd 0xfe 0xff] + (map #(bit-and % 0xff) data)))) + (let [[{:keys [content-id data]}] (ub)] + (is (= 7692866 content-id)) + (is (= [0x43 0x04 0x00 0x00 0x00 0x00 0x01 0x02 0x03 + 0x43 0x04 0x00 0x00 0x00 0x7c 0x7d 0x7e 0x7f + 0x43 0x04 0x00 0x00 0x00 0x80 0x81 0x82 0x83 + 0x43 0x04 0x00 0x00 0x00 0xfc 0xfd 0xfe 0xff] + (map #(bit-and % 0xff) data)))))) + (testing "short arrays" + (let [encodings {:ss {\B {:codec :byte-array-len + :len-encoding {:codec :huffman + :alphabet [9] + :bit-len [0]} + :val-encoding {:codec :external + :content-id 7566146}}} + :us {\B {:codec :byte-array-len + :len-encoding {:codec :huffman + :alphabet [9] + :bit-len [0]} + :val-encoding {:codec :external + :content-id 7697218}}}} + encoders (ds/build-tag-encoders encodings) + ss (get-in encoders [:ss \B]) + us (get-in encoders [:us \B])] + (doseq [[v1 v2] [[0x0123 0x4567] + [0x7ffe 0x7fff] + [0x8000 0x8001] + [0xfffe 0xffff]]] + (ss (str "s," (unchecked-short v1) "," (unchecked-short v2))) + (us (str "S," v1 "," v2))) + (let [[{:keys [content-id data]}] (ss)] + (is (= 7566146 content-id)) + (is (= [0x73 0x02 0x00 0x00 0x00 0x23 0x01 0x67 0x45 + 0x73 0x02 0x00 0x00 0x00 0xfe 0x7f 0xff 0x7f + 0x73 0x02 0x00 0x00 0x00 0x00 0x80 0x01 0x80 + 0x73 0x02 0x00 0x00 0x00 0xfe 0xff 0xff 0xff] + (map #(bit-and % 0xff) data)))) + (let [[{:keys [content-id data]}] (us)] + (is (= 7697218 content-id)) + (is (= [0x53 0x02 0x00 0x00 0x00 0x23 0x01 0x67 0x45 + 0x53 0x02 0x00 0x00 0x00 0xfe 0x7f 0xff 0x7f + 0x53 0x02 0x00 0x00 0x00 0x00 0x80 0x01 0x80 + 0x53 0x02 0x00 0x00 0x00 0xfe 0xff 0xff 0xff] + (map #(bit-and % 0xff) data)))))) + (testing "int arrays" + (let [encodings {:si {\B {:codec :byte-array-len + :len-encoding {:codec :huffman + :alphabet [13] + :bit-len [0]} + :val-encoding {:codec :external + :content-id 7563586}}} + :ui {\B {:codec :byte-array-len + :len-encoding {:codec :huffman + :alphabet [13] + :bit-len [0]} + :val-encoding {:codec :external + :content-id 7694658}}}} + encoders (ds/build-tag-encoders encodings) + si (get-in encoders [:si \B]) + ui (get-in encoders [:ui \B])] + (doseq [[v1 v2] [[0x01234567 0x76543210] + [0x7ffffffe 0x7fffffff] + [0x80000000 0x80000001] + [0xfffffffe 0xffffffff]]] + (si (str "i," (unchecked-int v1) "," (unchecked-int v2))) + (ui (str "I," v1 "," v2))) + (let [[{:keys [content-id data]}] (si)] + (is (= 7563586 content-id)) + (is (= [0x69 0x02 0x00 0x00 0x00 0x67 0x45 0x23 0x01 0x10 0x32 0x54 0x76 + 0x69 0x02 0x00 0x00 0x00 0xfe 0xff 0xff 0x7f 0xff 0xff 0xff 0x7f + 0x69 0x02 0x00 0x00 0x00 0x00 0x00 0x00 0x80 0x01 0x00 0x00 0x80 + 0x69 0x02 0x00 0x00 0x00 0xfe 0xff 0xff 0xff 0xff 0xff 0xff 0xff] + (map #(bit-and % 0xff) data)))) + (let [[{:keys [content-id data]}] (ui)] + (is (= 7694658 content-id)) + (is (= [0x49 0x02 0x00 0x00 0x00 0x67 0x45 0x23 0x01 0x10 0x32 0x54 0x76 + 0x49 0x02 0x00 0x00 0x00 0xfe 0xff 0xff 0x7f 0xff 0xff 0xff 0x7f + 0x49 0x02 0x00 0x00 0x00 0x00 0x00 0x00 0x80 0x01 0x00 0x00 0x80 + 0x49 0x02 0x00 0x00 0x00 0xfe 0xff 0xff 0xff 0xff 0xff 0xff 0xff] + (map #(bit-and % 0xff) data))))) + (let [vs ["42M1D7M1D74M1D28M" + "44M2D45M1D58M4S" + "65M1D9M1D75M2S" + "18S76M1D57M"] + encodings {:CG {\B {:codec :byte-array-stop + :stop-byte -1 + :content-id 4409154}}} + encoders (ds/build-tag-encoders encodings) + CG (get-in encoders [:CG \B])] + (doseq [v vs] + (CG (str/join \, (cons \I (cigar/encode-cigar v))))) + (let [[{:keys [content-id data]}] (CG)] + (is (= 4409154 content-id)) + (is (= (mapcat (fn [v] + (let [encoded (cigar/encode-cigar v) + bb (bb/allocate-lsb-byte-buffer (+ 6 (* 4 (count encoded))))] + (.put bb (byte (int \I))) + (.putInt bb (count encoded)) + (run! #(.putInt bb %) encoded) + (.put bb (byte 0xff)) + (.array bb))) + vs) + (seq data)))))) + (testing "float arrays" + (let [encodings {:fl {\B {:codec :byte-array-len + :len-encoding {:codec :external + :content-id 6712386} + :val-encoding {:codec :external + :content-id 6712386}}}} + encoders (ds/build-tag-encoders encodings) + fl (get-in encoders [:fl \B])] + (doseq [v ["f,0.0,0.25" "f,0.5" "f,0.0,-0.25" "f,-0.5,-0.75,-1.0"]] + (fl v)) + (let [[{content-id1 :content-id data1 :data} + {content-id2 :content-id data2 :data}] (fl)] + (is (= 6712386 content-id1 content-id2)) + (is (identical? data1 data2)) + (is (= (mapcat (fn [vs] + (let [out (ByteArrayOutputStream.)] + (.write out (+ 5 (* 4 (count vs)))) + (.write out (int \f)) + (lsb/write-int out (count vs)) + (run! #(lsb/write-float out %) vs) + (.toByteArray out))) + [[0.0 0.25] + [0.5] + [0.0 -0.25] + [-0.5 -0.75 -1.0]]) + (seq data1)))))))) diff --git a/test/cljam/io/cram/encode/alignment_stats_test.clj b/test/cljam/io/cram/encode/alignment_stats_test.clj new file mode 100644 index 00000000..00551aa9 --- /dev/null +++ b/test/cljam/io/cram/encode/alignment_stats_test.clj @@ -0,0 +1,51 @@ +(ns cljam.io.cram.encode.alignment-stats-test + (:require [cljam.io.cram.encode.alignment-stats :as stats] + [clojure.test :refer [deftest is testing]])) + +(deftest alignment-stats-builder-test + (testing "all the reads are mapped to the same reference" + (let [builder (stats/make-alignment-stats-builder)] + (stats/update! builder 0 1 10 10 1) + (stats/update! builder 0 5 14 10 1) + (stats/update! builder 0 21 24 5 1) + (is (= {:ri 0, :start 1, :end 24, :nbases 25, :nrecords 3} + (into {} (stats/build builder)))))) + (testing "some of the reads are mapped to a different reference" + (let [builder (stats/make-alignment-stats-builder)] + (stats/update! builder 0 1 10 10 1) + (stats/update! builder 1 1 7 7 1) + (stats/update! builder 0 11 20 10 1) + (is (= {:ri -2, :start 0, :end 0, :nbases 27 :nrecords 3} + (into {} (stats/build builder)))))) + (testing "all the reads are unmapped" + (let [builder (stats/make-alignment-stats-builder)] + (stats/update! builder -1 0 0 10 1) + (stats/update! builder -1 0 0 7 1) + (stats/update! builder -1 11 20 10 1) + (is (= {:ri -1, :start 0, :end 0, :nbases 27 :nrecords 3} + (into {} (stats/build builder))))))) + +(deftest merge-stats-test + (testing "all the alignments stats are for the same reference" + (is (= {:ri 1, :start 1, :end 150, :nbases 25000, :nrecords 50} + (into {} + (stats/merge-stats [{:ri 1, :start 1, :end 100, :nbases 10000, :nrecords 20} + {:ri 1, :start 50, :end 150, :nbases 15000, :nrecords 30}]))))) + (testing "some alignments stats are for a different reference" + (is (= {:ri -2, :start 0, :end 0, :nbases 25000, :nrecords 50} + (into {} + (stats/merge-stats [{:ri 0, :start 1, :end 100, :nbases 10000, :nrecords 20} + {:ri 1, :start 50, :end 150, :nbases 15000, :nrecords 30}])))) + (is (= {:ri -2, :start 0, :end 0, :nbases 25000, :nrecords 50} + (into {} + (stats/merge-stats [{:ri -2, :start 0, :end 0, :nbases 10000, :nrecords 20} + {:ri 1, :start 50, :end 150, :nbases 15000, :nrecords 30}])))) + (is (= {:ri -2, :start 0, :end 0, :nbases 25000, :nrecords 50} + (into {} + (stats/merge-stats [{:ri -2, :start 0, :end 0, :nbases 10000, :nrecords 20} + {:ri -1, :start 0, :end 0, :nbases 15000, :nrecords 30}]))))) + (testing "all the alignments stats are for unmapped slices" + (is (= {:ri -1, :start 0, :end 0, :nbases 25000, :nrecords 50} + (into {} + (stats/merge-stats [{:ri -1, :start 0, :end 0, :nbases 10000, :nrecords 20} + {:ri -1, :start 0, :end 0, :nbases 15000, :nrecords 30}])))))) diff --git a/test/cljam/io/cram/encode/record_test.clj b/test/cljam/io/cram/encode/record_test.clj new file mode 100644 index 00000000..87664a7d --- /dev/null +++ b/test/cljam/io/cram/encode/record_test.clj @@ -0,0 +1,434 @@ +(ns cljam.io.cram.encode.record-test + (:require [cljam.io.cram.data-series :as ds] + [cljam.io.cram.encode.record :as record] + [cljam.io.cram.seq-resolver.protocol :as resolver] + [cljam.io.sequence :as cseq] + [cljam.test-common :as common] + [clojure.test :refer [are deftest is testing]] + [clojure.walk :as walk])) + +(def ^:private test-seq-resolver + (let [seqs (with-open [r (cseq/reader common/test-fa-file)] + (into {} (map (juxt :name :sequence)) (cseq/read-all-sequences r)))] + (reify resolver/ISeqResolver + (resolve-sequence [_ chr] + (.getBytes ^bytes (get seqs chr))) + (resolve-sequence [_ chr start end] + (let [s (get seqs chr)] + (.getBytes (subs s (dec (long start)) end))))))) + +(def subst-mat + {\A {\T 0, \G 1, \C 2, \N 3} + \T {\A 0, \G 1, \C 2, \N 3} + \G {\A 0, \T 1, \C 2, \N 3} + \C {\A 0, \T 1, \G 2, \N 3} + \N {\A 0, \T 1, \G 2, \C 3}}) + +(deftest calculate-read-features&end + (are [?record ?expected] + (= ?expected + (walk/prewalk + #(if (instance? (Class/forName "[B") %) (vec %) %) + (#'record/calculate-read-features&end test-seq-resolver subst-mat ?record))) + {:rname "ref", :pos 2, :seq "GCATG", :qual "ABCDE", :cigar "5M"} + [[] + 6] + + {:rname "ref", :pos 2, :seq "GCCTG", :qual "ABCDE", :cigar "5M"} + [[{:code :subst, :pos 3, :subst 2}] + 6] + + {:rname "ref", :pos 2, :seq "GCRTG", :qual "ABCDE", :cigar "5M"} + [[{:code :read-base, :pos 3, :base (int \R), :qual (- (int \C) 33)}] + 6] + + {:rname "ref", :pos 2, :seq "GCCAA", :qual "ABCDE", :cigar "2M2I1M"} + [[{:code :insertion, :pos 3, :bases (mapv int "CA")}] + 4] + + {:rname "ref", :pos 2, :seq "GTCAA", :qual "ABCDE", :cigar "2M2I1M"} + [[{:code :subst, :pos 2, :subst 1} + {:code :insertion, :pos 3, :bases (mapv int "CA")}] + 4] + + {:rname "ref", :pos 2, :seq "GCGTT", :qual "ABCDE", :cigar "2M2D3M"} + [[{:code :deletion, :pos 3, :len 2}] + 8] + + {:rname "ref", :pos 2, :seq "GCCTT", :qual "ABCDE", :cigar "2M2D3M"} + [[{:code :deletion, :pos 3, :len 2} + {:code :subst, :pos 3, :subst 2}] + 8] + + {:rname "ref", :pos 2, :seq "GCATT", :qual "ABCDE", :cigar "3M2S"} + [[{:code :softclip, :pos 4, :bases (mapv int "TT")}] + 4] + + {:rname "ref", :pos 2, :seq "TTGCA", :qual "ABCDE", :cigar "2S3M"} + [[{:code :softclip, :pos 1, :bases (mapv int "TT")}] + 4])) + +(deftest encode-slice-records-test + (testing "mapped reads" + (let [cram-header {:SQ + [{:SN "ref"} + {:SN "ref2"}] + :RG + [{:ID "rg001"} + {:ID "rg002"}]} + tag-dict [[] + [{:tag :MD, :type \Z} + {:tag :NM, :type \c}]] + ds-encoders (ds/build-data-series-encoders ds/default-data-series-encodings) + tag-encoders (ds/build-tag-encoders + {:MD {\Z {:codec :byte-array-len + :len-encoding {:codec :external, :content-id 5063770} + :val-encoding {:codec :external, :content-id 5063770}}} + :NM {\c {:codec :byte-array-len + :len-encoding {:codec :huffman, :alphabet [1], :bit-len [0]} + :val-encoding {:codec :external, :content-id 5131619}}}}) + records [{:qname "q001", :flag 99, :rname "ref", :pos 1, :end 5, :mapq 0, + :cigar "5M", :rnext "=", :pnext 151, :tlen 150, :seq "AGAAT", :qual "HFHHH" + :options [{:RG {:type "Z", :value "rg001"}} + {:MD {:type "Z", :value "2C2"}} + {:NM {:type "c", :value 1}}] + ::record/tags-index 1} + {:qname "q002", :flag 99, :rname "ref", :pos 5, :end 7, :mapq 15, + :cigar "2S3M", :rnext "=", :pnext 15, :tlen 15, :seq "CCTGT", :qual "##AAC" + :options [{:RG {:type "Z", :value "rg001"}} + {:MD {:type "Z", :value "3"}} + {:NM {:type "c", :value 0}}] + ::record/tags-index 1} + {:qname "q003", :flag 177, :rname "ref", :pos 10, :end 14, :mapq 60, + :cigar "5M", :rnext "ref2", :pnext 100, :tlen 0, :seq "GATAA", :qual "CCCFF" + :options [{:RG {:type "Z", :value "rg002"}} + {:MD {:type "Z", :value "5"}} + {:NM {:type "c", :value 0}}] + ::record/tags-index 1} + {:qname "q004", :flag 147, :rname "ref", :pos 15, :end 19, :mapq 15, + :cigar "1M1I1M1D2M", :rnext "=", :pnext 5, :tlen -15, :seq "GAAAG", :qual "EBBFF" + :options [{:RG {:type "Z", :value "rg002"}} + {:MD {:type "Z", :value "3^T2"}} + {:NM {:type "c", :value 2}}] + ::record/tags-index 1} + {:qname "q005", :flag 73, :rname "ref", :pos 20, :end 24, :mapq 0, + :cigar "5M", :rnext "*", :pnext 0, :tlen 0, :seq "CTGTG", :qual "AEEEE" + :options [], ::record/tags-index 0}] + stats (record/encode-slice-records test-seq-resolver cram-header tag-dict subst-mat + ds-encoders tag-encoders records) + ds-res (walk/prewalk #(if (fn? %) (%) %) ds-encoders) + tag-res (walk/prewalk #(if (fn? %) (%) %) tag-encoders)] + (is (= {:ri 0, :start 1, :end 24, :nbases 25, :nrecords 5} + (into {} stats))) + + (is (= 1 (count (get ds-res :BF)))) + (is (= 1 (get-in ds-res [:BF 0 :content-id]))) + (is (= [0x43 0x43 0x80 0x91 0x80 0x93 0x41] + (map #(bit-and % 0xff) (get-in ds-res [:BF 0 :data])))) + + (is (= 1 (count (get ds-res :CF)))) + (is (= 2 (get-in ds-res [:CF 0 :content-id]))) + (is (= [3 3 3 3 3] (seq (get-in ds-res [:CF 0 :data])))) + + (is (= 1 (count (get ds-res :RI)))) + (is (= 3 (get-in ds-res [:RI 0 :content-id]))) + (is (= [0 0 0 0 0] (seq (get-in ds-res [:RI 0 :data])))) + + (is (= 1 (count (get ds-res :RL)))) + (is (= 4 (get-in ds-res [:RL 0 :content-id]))) + (is (= [5 5 5 5 5] (seq (get-in ds-res [:RL 0 :data])))) + + (is (= 1 (count (get ds-res :AP)))) + (is (= 5 (get-in ds-res [:AP 0 :content-id]))) + (is (= [1 5 10 15 20] (seq (get-in ds-res [:AP 0 :data])))) + + (is (= 1 (count (get ds-res :RG)))) + (is (= 6 (get-in ds-res [:RG 0 :content-id]))) + (is (= [0 0 1 1 0xff 0xff 0xff 0xff 0x0f] + (map #(bit-and % 0xff) (get-in ds-res [:RG 0 :data])))) + + (is (= 1 (count (get ds-res :RN)))) + (is (= 7 (get-in ds-res [:RN 0 :content-id]))) + (is (= "q001\tq002\tq003\tq004\tq005\t" (String. ^bytes (get-in ds-res [:RN 0 :data])))) + + (is (= 1 (count (get ds-res :MF)))) + (is (= 8 (get-in ds-res [:MF 0 :content-id]))) + (is (= [1 1 1 0 2] (seq (get-in ds-res [:MF 0 :data])))) + + (is (= 1 (count (get ds-res :NS)))) + (is (= 9 (get-in ds-res [:NS 0 :content-id]))) + (is (= [0 0 1 0 0xff 0xff 0xff 0xff 0x0f] + (map #(bit-and % 0xff) (get-in ds-res [:NS 0 :data])))) + + (is (= 1 (count (get ds-res :NP)))) + (is (= 10 (get-in ds-res [:NP 0 :content-id]))) + (is (= [0x80 0x97 0x0f 0x64 0x05 0x00] + (map #(bit-and % 0xff) (get-in ds-res [:NP 0 :data])))) + + (is (= 1 (count (get ds-res :TS)))) + (is (= 11 (get-in ds-res [:TS 0 :content-id]))) + (is (= [0x80 0x96 0x0f 0x00 0xff 0xff 0xff 0xff 0x01 0x00] + (map #(bit-and % 0xff) (get-in ds-res [:TS 0 :data])))) + + (is (= 1 (count (get ds-res :NF)))) + (is (= 12 (get-in ds-res [:NF 0 :content-id]))) + (is (zero? (count (get-in ds-res [:NF 0 :data])))) + + (is (= 1 (count (get ds-res :TL)))) + (is (= 13 (get-in ds-res [:TL 0 :content-id]))) + (is (= [1 1 1 1 0] (seq (get-in ds-res [:TL 0 :data])))) + + (is (= 1 (count (get ds-res :FN)))) + (is (= 14 (get-in ds-res [:FN 0 :content-id]))) + (is (= [1 1 0 2 0] (seq (get-in ds-res [:FN 0 :data])))) + + (is (= 1 (count (get ds-res :FC)))) + (is (= 15 (get-in ds-res [:FC 0 :content-id]))) + (is (= [(int \X) (int \S) (int \I) (int \D)] + (seq (get-in ds-res [:FC 0 :data])))) + + (is (= 1 (count (get ds-res :FP)))) + (is (= 16 (get-in ds-res [:FP 0 :content-id]))) + (is (= [3 1 2 2] (seq (get-in ds-res [:FP 0 :data])))) + + (is (= 1 (count (get ds-res :DL)))) + (is (= 17 (get-in ds-res [:DL 0 :content-id]))) + (is (= [1] (seq (get-in ds-res [:DL 0 :data])))) + + (is (= 2 (count (get ds-res :BB)))) + (is (= 18 (get-in ds-res [:BB 0 :content-id]))) + (is (zero? (count (get-in ds-res [:BB 0 :data])))) + (is (= 19 (get-in ds-res [:BB 1 :content-id]))) + (is (zero? (count (get-in ds-res [:BB 1 :data])))) + + (is (= 2 (count (get ds-res :QQ)))) + (is (= 20 (get-in ds-res [:QQ 0 :content-id]))) + (is (zero? (count (get-in ds-res [:QQ 0 :data])))) + (is (= 21 (get-in ds-res [:QQ 1 :content-id]))) + (is (zero? (count (get-in ds-res [:QQ 1 :data])))) + + (is (= 1 (count (get ds-res :BS)))) + (is (= 22 (get-in ds-res [:BS 0 :content-id]))) + (is (= [0] (seq (get-in ds-res [:BS 0 :data])))) + + (is (= 2 (count (get ds-res :IN)))) + (is (= 23 (get-in ds-res [:IN 0 :content-id]))) + (is (= [1] (seq (get-in ds-res [:IN 0 :data])))) + (is (= 24 (get-in ds-res [:IN 1 :content-id]))) + (is (= "A" (String. ^bytes (get-in ds-res [:IN 1 :data])))) + + (is (= 1 (count (get ds-res :RS)))) + (is (= 25 (get-in ds-res [:RS 0 :content-id]))) + (is (zero? (count (get-in ds-res [:RS 0 :data])))) + + (is (= 1 (count (get ds-res :PD)))) + (is (= 26 (get-in ds-res [:PD 0 :content-id]))) + (is (zero? (count (get-in ds-res [:PD 0 :data])))) + + (is (= 1 (count (get ds-res :HC)))) + (is (= 27 (get-in ds-res [:HC 0 :content-id]))) + (is (zero? (count (get-in ds-res [:HC 0 :data])))) + + (is (= 2 (count (get ds-res :SC)))) + (is (= 28 (get-in ds-res [:SC 0 :content-id]))) + (is (= [2] (seq (get-in ds-res [:SC 0 :data])))) + (is (= 29 (get-in ds-res [:SC 1 :content-id]))) + (is (= "CC" (String. ^bytes (get-in ds-res [:SC 1 :data])))) + + (is (= 1 (count (get ds-res :MQ)))) + (is (= 30 (get-in ds-res [:MQ 0 :content-id]))) + (is (= [0 15 60 15 0] (seq (get-in ds-res [:MQ 0 :data])))) + + (is (= 1 (count (get ds-res :BA)))) + (is (= 31 (get-in ds-res [:BA 0 :content-id]))) + (is (zero? (count (get-in ds-res [:BA 0 :data])))) + + (is (= 1 (count (get ds-res :QS)))) + (is (= 32 (get-in ds-res [:QS 0 :content-id]))) + (is (= "HFHHH##AACCCCFFEBBFFAEEEE" + (->> (get-in ds-res [:QS 0 :data]) + (map #(+ (long %) 33)) + byte-array + String.))) + + (is (= 2 (count (get-in tag-res [:MD \Z])))) + (is (= 5063770 + (get-in tag-res [:MD \Z 0 :content-id]) + (get-in tag-res [:MD \Z 1 :content-id]))) + (is (= `[4 ~@(map int "2C2\0") + 2 ~@(map int "3\0") + 2 ~@(map int "5\0") + 5 ~@(map int "3^T2\0")] + (seq (get-in tag-res [:MD \Z 0 :data])) + (seq (get-in tag-res [:MD \Z 1 :data])))) + + (is (= 1 (count (get-in tag-res [:NM \c])))) + (is (= 5131619 (get-in tag-res [:NM \c 0 :content-id]))) + (is (= [1 0 0 2] (seq (get-in tag-res [:NM \c 0 :data])))))) + (testing "unmapped reads" + (let [cram-header {:SQ + [{:SN "ref"} + {:SN "ref2"}]} + tag-dict [[]] + ds-encoders (ds/build-data-series-encoders ds/default-data-series-encodings) + records [{:qname "q001", :flag 77, :rname "*", :pos 0, :end 0, :mapq 0, + :cigar "*", :rnext "*", :pnext 0, :tlen 0, :seq "AATCC", :qual "CCFFF" + :options [], ::record/tags-index 0} + {:qname "q001", :flag 141, :rname "*", :pos 0, :end 0, :mapq 0, + :cigar "*", :rnext "*", :pnext 0, :tlen 0, :seq "ATTGT", :qual "BDFAD" + :options [], ::record/tags-index 0} + {:qname "q002", :flag 77, :rname "*", :pos 0, :end 0, :mapq 0, + :cigar "*", :rnext "*", :pnext 0, :tlen 0, :seq "TGGTA", :qual "ADDHF" + :options [], ::record/tags-index 0} + {:qname "q002", :flag 141, :rname "*", :pos 0, :end 0, :mapq 0, + :cigar "*", :rnext "*", :pnext 0, :tlen 0, :seq "TCTTG", :qual "DDDFD" + :options [], ::record/tags-index 0} + {:qname "q003", :flag 77, :rname "*", :pos 0, :end 0, :mapq 0, + :cigar "*", :rnext "*", :pnext 0, :tlen 0, :seq "GCACA", :qual "BCCFD" + :options [], ::record/tags-index 0}] + stats (record/encode-slice-records test-seq-resolver cram-header tag-dict + subst-mat ds-encoders {} records) + ds-res (walk/prewalk #(if (fn? %) (%) %) ds-encoders)] + (is (= {:ri -1, :start 0, :end 0, :nbases 25, :nrecords 5} + (into {} stats))) + + (is (= 1 (count (get ds-res :BF)))) + (is (= 1 (get-in ds-res [:BF 0 :content-id]))) + (is (= [0x45 0x80 0x85 0x45 0x80 0x85 0x45] + (map #(bit-and % 0xff) (get-in ds-res [:BF 0 :data])))) + + (is (= 1 (count (get ds-res :CF)))) + (is (= 2 (get-in ds-res [:CF 0 :content-id]))) + (is (= [3 3 3 3 3] (seq (get-in ds-res [:CF 0 :data])))) + + (is (= 1 (count (get ds-res :RI)))) + (is (= 3 (get-in ds-res [:RI 0 :content-id]))) + (is (= [0xff 0xff 0xff 0xff 0x0f + 0xff 0xff 0xff 0xff 0x0f + 0xff 0xff 0xff 0xff 0x0f + 0xff 0xff 0xff 0xff 0x0f + 0xff 0xff 0xff 0xff 0x0f] + (map #(bit-and % 0xff) (get-in ds-res [:RI 0 :data])))) + + (is (= 1 (count (get ds-res :RL)))) + (is (= 4 (get-in ds-res [:RL 0 :content-id]))) + (is (= [5 5 5 5 5] (seq (get-in ds-res [:RL 0 :data])))) + + (is (= 1 (count (get ds-res :AP)))) + (is (= 5 (get-in ds-res [:AP 0 :content-id]))) + (is (= [0 0 0 0 0] (seq (get-in ds-res [:AP 0 :data])))) + + (is (= 1 (count (get ds-res :RG)))) + (is (= 6 (get-in ds-res [:RG 0 :content-id]))) + (is (= [0xff 0xff 0xff 0xff 0x0f + 0xff 0xff 0xff 0xff 0x0f + 0xff 0xff 0xff 0xff 0x0f + 0xff 0xff 0xff 0xff 0x0f + 0xff 0xff 0xff 0xff 0x0f] + (map #(bit-and % 0xff) (get-in ds-res [:RG 0 :data])))) + + (is (= 1 (count (get ds-res :RN)))) + (is (= 7 (get-in ds-res [:RN 0 :content-id]))) + (is (= "q001\tq001\tq002\tq002\tq003\t" (String. ^bytes (get-in ds-res [:RN 0 :data])))) + + (is (= 1 (count (get ds-res :MF)))) + (is (= 8 (get-in ds-res [:MF 0 :content-id]))) + (is (= [2 2 2 2 2] (seq (get-in ds-res [:MF 0 :data])))) + + (is (= 1 (count (get ds-res :NS)))) + (is (= 9 (get-in ds-res [:NS 0 :content-id]))) + (is (= [0xff 0xff 0xff 0xff 0x0f + 0xff 0xff 0xff 0xff 0x0f + 0xff 0xff 0xff 0xff 0x0f + 0xff 0xff 0xff 0xff 0x0f + 0xff 0xff 0xff 0xff 0x0f] + (map #(bit-and % 0xff) (get-in ds-res [:NS 0 :data])))) + + (is (= 1 (count (get ds-res :NP)))) + (is (= 10 (get-in ds-res [:NP 0 :content-id]))) + (is (= [0 0 0 0 0] (seq (get-in ds-res [:NP 0 :data])))) + + (is (= 1 (count (get ds-res :TS)))) + (is (= 11 (get-in ds-res [:TS 0 :content-id]))) + (is (= [0 0 0 0 0] (seq (get-in ds-res [:TS 0 :data])))) + + (is (= 1 (count (get ds-res :NF)))) + (is (= 12 (get-in ds-res [:NF 0 :content-id]))) + (is (zero? (count (get-in ds-res [:NF 0 :data])))) + + (is (= 1 (count (get ds-res :TL)))) + (is (= 13 (get-in ds-res [:TL 0 :content-id]))) + (is (= [0 0 0 0 0] (seq (get-in ds-res [:TL 0 :data])))) + + (is (= 1 (count (get ds-res :FN)))) + (is (= 14 (get-in ds-res [:FN 0 :content-id]))) + (is (zero? (count (get-in ds-res [:FN 0 :data])))) + + (is (= 1 (count (get ds-res :FC)))) + (is (= 15 (get-in ds-res [:FC 0 :content-id]))) + (is (zero? (count (get-in ds-res [:FC 0 :data])))) + + (is (= 1 (count (get ds-res :FP)))) + (is (= 16 (get-in ds-res [:FP 0 :content-id]))) + (is (zero? (count (get-in ds-res [:FP 0 :data])))) + + (is (= 1 (count (get ds-res :DL)))) + (is (= 17 (get-in ds-res [:DL 0 :content-id]))) + (is (zero? (count (get-in ds-res [:DL 0 :data])))) + + (is (= 2 (count (get ds-res :BB)))) + (is (= 18 (get-in ds-res [:BB 0 :content-id]))) + (is (zero? (count (get-in ds-res [:BB 0 :data])))) + (is (= 19 (get-in ds-res [:BB 1 :content-id]))) + (is (zero? (count (get-in ds-res [:BB 1 :data])))) + + (is (= 2 (count (get ds-res :QQ)))) + (is (= 20 (get-in ds-res [:QQ 0 :content-id]))) + (is (zero? (count (get-in ds-res [:QQ 0 :data])))) + (is (= 21 (get-in ds-res [:QQ 1 :content-id]))) + (is (zero? (count (get-in ds-res [:QQ 1 :data])))) + + (is (= 1 (count (get ds-res :BS)))) + (is (= 22 (get-in ds-res [:BS 0 :content-id]))) + (is (zero? (count (get-in ds-res [:BS 0 :data])))) + + (is (= 2 (count (get ds-res :IN)))) + (is (= 23 (get-in ds-res [:IN 0 :content-id]))) + (is (zero? (count (get-in ds-res [:IN 0 :data])))) + (is (= 24 (get-in ds-res [:IN 1 :content-id]))) + (is (zero? (count (get-in ds-res [:IN 1 :data])))) + + (is (= 1 (count (get ds-res :RS)))) + (is (= 25 (get-in ds-res [:RS 0 :content-id]))) + (is (zero? (count (get-in ds-res [:RS 0 :data])))) + + (is (= 1 (count (get ds-res :PD)))) + (is (= 26 (get-in ds-res [:PD 0 :content-id]))) + (is (zero? (count (get-in ds-res [:PD 0 :data])))) + + (is (= 1 (count (get ds-res :HC)))) + (is (= 27 (get-in ds-res [:HC 0 :content-id]))) + (is (zero? (count (get-in ds-res [:HC 0 :data])))) + + (is (= 2 (count (get ds-res :SC)))) + (is (= 28 (get-in ds-res [:SC 0 :content-id]))) + (is (zero? (count (get-in ds-res [:SC 0 :data])))) + (is (= 29 (get-in ds-res [:SC 1 :content-id]))) + (is (zero? (count (get-in ds-res [:SC 1 :data])))) + + (is (= 1 (count (get ds-res :MQ)))) + (is (= 30 (get-in ds-res [:MQ 0 :content-id]))) + (is (zero? (count (get-in ds-res [:MQ 0 :data])))) + + (is (= 1 (count (get ds-res :BA)))) + (is (= 31 (get-in ds-res [:BA 0 :content-id]))) + (is (= "AATCCATTGTTGGTATCTTGGCACA" + (String. ^bytes (get-in ds-res [:BA 0 :data])))) + + (is (= 1 (count (get ds-res :QS)))) + (is (= 32 (get-in ds-res [:QS 0 :content-id]))) + (is (= "CCFFFBDFADADDHFDDDFDBCCFD" + (->> (get-in ds-res [:QS 0 :data]) + (map #(+ (long %) 33)) + byte-array + String.)))))) diff --git a/test/cljam/io/cram/encode/structure_test.clj b/test/cljam/io/cram/encode/structure_test.clj new file mode 100644 index 00000000..bc615254 --- /dev/null +++ b/test/cljam/io/cram/encode/structure_test.clj @@ -0,0 +1,158 @@ +(ns cljam.io.cram.encode.structure-test + (:require [cljam.io.cram.decode.structure :as decode.struct] + [cljam.io.cram.encode.structure :as struct] + [cljam.io.util.byte-buffer :as bb] + [clojure.set :as set] + [clojure.test :refer [deftest is]]) + (:import [java.io ByteArrayOutputStream] + [java.nio ByteBuffer] + [java.util Arrays] + [java.util.zip CRC32])) + +(defn- with-encoding-result [encode-fn & args] + (let [cont (last args) + out (ByteArrayOutputStream.) + _ (apply encode-fn out (butlast args)) + bs (.toByteArray out)] + (cont (bb/make-lsb-byte-buffer bs) bs))) + +(defn- calculate-crc ^bytes [^bytes bs ^long start ^long end] + (let [crc (CRC32.)] + (.update crc bs start end) + (-> (bb/allocate-lsb-byte-buffer 4) + (.putInt (.getValue crc)) + .array))) + +(deftest encode-file-definition-test + (let [version {:major 3, :minor 0} + file-id "test.cram"] + (with-encoding-result struct/encode-file-definition version file-id + (fn [bb _] + (is (= {:version version, :id file-id} + (decode.struct/decode-file-definition bb))))))) + +(deftest encode-container-header-test + (let [header {:length 362828 + :ref-seq-id 1 + :start 100 + :span 500 + :records 100 + :counter 0 + :bases 15000 + :blocks 40 + :landmarks [207]}] + (with-encoding-result struct/encode-container-header header + (fn [bb ^bytes encoded] + (let [decoded (decode.struct/decode-container-header bb) + crc (calculate-crc encoded 0 (- (alength encoded) 4))] + (is (= header (dissoc decoded :crc))) + (is (Arrays/equals crc ^bytes (:crc decoded)))))))) + +(deftest encode-block-test + (let [content (.getBytes "This is the block content.") + size (alength content) + {:keys [method content-type content-id] :as block} {:method :raw + :content-type 4 + :content-id 42}] + (with-encoding-result struct/encode-block method content-type content-id content + (fn [bb ^bytes encoded] + (let [decoded (decode.struct/decode-block bb) + crc (calculate-crc encoded 0 (- (alength encoded) 4))] + (is (= (assoc block :method 0 :size size :raw-size size) + (dissoc decoded :crc :data))) + (is (= (seq content) + (seq (bb/read-bytes (:data decoded) (:size decoded))))) + (is (Arrays/equals crc ^bytes (:crc decoded)))))))) + +(deftest encode-cram-header-container-test + (let [cram-header {:SQ + [{:SN "ref"} + {:SN "ref2"}] + :RG + [{:ID "rg001"} + {:ID "rg002"}]}] + (with-encoding-result struct/encode-cram-header-container cram-header + (fn [^ByteBuffer bb ^bytes encoded] + (let [container-header (decode.struct/decode-container-header bb) + container-header-size (.position bb) + crc (calculate-crc encoded 0 (- container-header-size 4)) + decoded (decode.struct/decode-cram-header-block bb)] + (is (= {:length (- (alength encoded) container-header-size) + :ref-seq-id 0, :start 0, :span 0, :records 0 + :counter 0, :bases 0, :blocks 1, :landmarks [0]} + (dissoc container-header :crc))) + (is (Arrays/equals crc ^bytes (:crc container-header))) + (is (= cram-header decoded))))))) + +(deftest encode-compression-header-block-test + (let [preservation-map {:RN true, :AP false, :RR true} + subst-mat {\A {\T 0, \G 1, \C 2, \N 3} + \T {\A 0, \G 1, \C 2, \N 3} + \G {\A 0, \T 1, \C 2, \N 3} + \C {\A 0, \T 1, \G 2, \N 3} + \N {\A 0, \T 1, \G 2, \C 3}} + tag-dict [[] + [{:tag :MD, :type \Z}] + [{:tag :NM, :type \c}] + [{:tag :MD, :type \Z} + {:tag :NM, :type \c}]] + ds-encodings {:BF {:content-id 1, :codec :external} + :RN {:content-id 2, :codec :byte-array-stop, :stop-byte (int \tab)} + :BB {:codec :byte-array-len + :len-encoding {:codec :huffman, :alphabet [151], :bit-len [0]} + :val-encoding {:codec :external, :content-id 3}} + :QQ {:codec :byte-array-len + :len-encoding {:codec :external, :content-id 4} + :val-encoding {:codec :external, :content-id 5}}} + tag-encodings {:MD {\Z {:codec :byte-array-len + :len-encoding {:codec :external, :content-id 5063770} + :val-encoding {:codec :external, :content-id 5063770}}} + :NM {\c {:codec :byte-array-len + :len-encoding {:codec :huffman, :alphabet [1], :bit-len [0]} + :val-encoding {:codec :external, :content-id 5131619}}}}] + (with-encoding-result + struct/encode-compression-header-block + preservation-map subst-mat tag-dict ds-encodings tag-encodings + (fn [bb _] + (let [decoded (decode.struct/decode-compression-header-block bb)] + (is (= {:preservation-map (assoc preservation-map :TD tag-dict) + :data-series ds-encodings + :tags tag-encodings} + (update decoded :preservation-map dissoc :SM))) + (is (= (into {} (map (fn [[k v]] [k (set/map-invert v)])) + subst-mat) + (get-in decoded [:preservation-map :SM])))))))) + +(deftest encode-slice-header-block-test + (let [header {:ref-seq-id 1 + :start 100 + :span 500 + :records 1000 + :counter 12345 + :embedded-reference -1 + :reference-md5 (byte-array (range 16)) + :tags []} + blocks [{:content-id 1} + {:content-id 2} + {:content-id 3}]] + (with-encoding-result struct/encode-slice-header-block header blocks + (fn [bb _] + (let [decoded (decode.struct/decode-slice-header-block bb)] + (is (= (-> header + (assoc :blocks (count blocks) + :content-ids (mapv :content-id blocks)) + (dissoc :reference-md5)) + (dissoc decoded :reference-md5))) + (is (Arrays/equals ^bytes (:reference-md5 header) + ^bytes (:reference-md5 decoded)))))))) + +(deftest encode-eof-container-test + (with-encoding-result struct/encode-eof-container + (fn [bb _] + (let [container-header (decode.struct/decode-container-header bb) + compression-header (decode.struct/decode-compression-header-block bb)] + (is (decode.struct/eof-container? container-header)) + (is (= {:preservation-map {:RN true, :AP true, :RR true} + :data-series {} + :tags {}} + compression-header)))))) diff --git a/test/cljam/io/cram/seq_resolver_test.clj b/test/cljam/io/cram/seq_resolver_test.clj index f3006890..0a1f6d91 100644 --- a/test/cljam/io/cram/seq_resolver_test.clj +++ b/test/cljam/io/cram/seq_resolver_test.clj @@ -26,3 +26,19 @@ (testing "resolver can be omitted, but in that case, trying to resolve seq will end up with error" (is (thrown? Exception (proto/resolve-sequence nil "ref"))) (is (thrown? Exception (proto/resolve-sequence nil "ref" 1 5))))) + +(deftest cached-seq-resolver-test + (with-open [resolver (resolver/seq-resolver common/test-fa-file) + resolver' (resolver/cached-resolver resolver)] + (testing "cached seq resolver returns equivalent sequences as the original seq resolver does" + (is (= (String. (proto/resolve-sequence resolver "ref")) + (String. (proto/resolve-sequence resolver' "ref")))) + (is (= (String. (proto/resolve-sequence resolver "ref" 5 10)) + (String. (proto/resolve-sequence resolver' "ref" 5 10)))) + (is (= (proto/resolve-sequence resolver "unknown") + (proto/resolve-sequence resolver' "unknown"))) + (is (= (proto/resolve-sequence resolver "unknown" 5 10) + (proto/resolve-sequence resolver' "unknown" 5 10)))) + (testing "cached seq resolver returns the identical sequence for the cached chr" + (is (identical? (proto/resolve-sequence resolver' "ref") + (proto/resolve-sequence resolver' "ref")))))) diff --git a/test/cljam/io/cram_test.clj b/test/cljam/io/cram_test.clj index de1ff416..8ff2902e 100644 --- a/test/cljam/io/cram_test.clj +++ b/test/cljam/io/cram_test.clj @@ -1,11 +1,15 @@ (ns cljam.io.cram-test (:require [cljam.io.cram :as cram] [cljam.io.sam :as sam] - [cljam.test-common :as common :refer [deftest-remote + [cljam.test-common :as common :refer [clean-cache! deftest-remote + prepare-cache! prepare-cavia! with-before-after]] + [clojure.java.io :as io] [clojure.test :refer [are deftest is testing]])) +(def ^:private temp-cram-file (io/file common/temp-dir "test.cram")) + (defn- fixup-bam-aln [aln] (-> (into {} aln) (update :cigar #(if (= % "") "*" %)) @@ -64,3 +68,40 @@ {:chr "chr14", :start 105661859} 10 ;; region crosses over container boundary {:chr "chr19", :start 54000000, :end 55000000} 12))))) + +(deftest writer-test + (with-before-after {:before (prepare-cache!) + :after (clean-cache!)} + (with-open [r (cram/reader common/test-cram-file + {:reference common/test-fa-file}) + w (cram/writer temp-cram-file + {:reference common/test-fa-file})] + (cram/write-header w (cram/read-header r)) + (cram/write-alignments w (cram/read-alignments r) (cram/read-header r))) + (with-open [r (cram/reader common/test-cram-file + {:reference common/test-fa-file}) + r' (cram/reader temp-cram-file + {:reference common/test-fa-file})] + (is (= (cram/read-header r) + (cram/read-header r'))) + (is (= (cram/read-alignments r) + (cram/read-alignments r')))))) + +(deftest-remote writer-with-multiple-containers-test + (with-before-after {:before (do (prepare-cavia!) + (prepare-cache!)) + :after (clean-cache!)} + (with-open [r (cram/reader common/medium-cram-file + {:reference common/hg19-twobit-file}) + w (cram/writer temp-cram-file + {:reference common/hg19-twobit-file})] + (cram/write-header w (cram/read-header r)) + (cram/write-alignments w (cram/read-alignments r) (cram/read-header r))) + (with-open [r (cram/reader common/medium-cram-file + {:reference common/hg19-twobit-file}) + r' (cram/reader temp-cram-file + {:reference common/hg19-twobit-file})] + (is (= (cram/read-header r) + (cram/read-header r'))) + (is (= (cram/read-alignments r) + (cram/read-alignments r')))))) diff --git a/test/cljam/io/util_test.clj b/test/cljam/io/util_test.clj index 250c6bf3..5a0c45ee 100644 --- a/test/cljam/io/util_test.clj +++ b/test/cljam/io/util_test.clj @@ -394,6 +394,7 @@ io-util/alignment-writer? true io-util/sam-writer? true io-util/bam-writer? false + io-util/cram-writer? false io-util/sequence-writer? false io-util/fasta-writer? false io-util/twobit-writer? false @@ -411,6 +412,25 @@ io-util/alignment-writer? true io-util/sam-writer? false io-util/bam-writer? true + io-util/cram-writer? false + io-util/sequence-writer? false + io-util/fasta-writer? false + io-util/twobit-writer? false + io-util/variant-writer? false + io-util/vcf-writer? false + io-util/bcf-writer? false + io-util/fastq-writer? false + io-util/bed-writer? false + io-util/wig-writer? false)))) + (testing "cram writer" + (with-before-after {:before (prepare-cache!) + :after (clean-cache!)} + (with-open [r (cram/writer (.getAbsolutePath (cio/file temp-dir "temp.cram")))] + (are [?pred ?expected] (= (?pred r) ?expected) + io-util/alignment-writer? true + io-util/sam-writer? false + io-util/bam-writer? false + io-util/cram-writer? true io-util/sequence-writer? false io-util/fasta-writer? false io-util/twobit-writer? false @@ -428,6 +448,7 @@ io-util/alignment-writer? false io-util/sam-writer? false io-util/bam-writer? false + io-util/cram-writer? false io-util/sequence-writer? true io-util/fasta-writer? true io-util/twobit-writer? false @@ -445,6 +466,7 @@ io-util/alignment-writer? false io-util/sam-writer? false io-util/bam-writer? false + io-util/cram-writer? false io-util/sequence-writer? true io-util/fasta-writer? false io-util/twobit-writer? true @@ -462,6 +484,7 @@ io-util/alignment-writer? false io-util/sam-writer? false io-util/bam-writer? false + io-util/cram-writer? false io-util/sequence-writer? false io-util/fasta-writer? false io-util/twobit-writer? false @@ -479,6 +502,7 @@ io-util/alignment-writer? false io-util/sam-writer? false io-util/bam-writer? false + io-util/cram-writer? false io-util/sequence-writer? false io-util/fasta-writer? false io-util/twobit-writer? false @@ -496,6 +520,7 @@ io-util/alignment-writer? false io-util/sam-writer? false io-util/bam-writer? false + io-util/cram-writer? false io-util/sequence-writer? false io-util/fasta-writer? false io-util/twobit-writer? false @@ -513,6 +538,7 @@ io-util/alignment-writer? false io-util/sam-writer? false io-util/bam-writer? false + io-util/cram-writer? false io-util/sequence-writer? false io-util/fasta-writer? false io-util/twobit-writer? false @@ -530,6 +556,7 @@ io-util/alignment-writer? false io-util/sam-writer? false io-util/bam-writer? false + io-util/cram-writer? false io-util/sequence-writer? false io-util/fasta-writer? false io-util/twobit-writer? false From 800eecf2938f9d6bd14e1f0085249327b612929c Mon Sep 17 00:00:00 2001 From: Shogo Ohta Date: Tue, 16 Jul 2024 14:03:21 +0900 Subject: [PATCH 10/10] Add descriptions for arity 0 overload of data series/tag encoders --- src/cljam/io/cram/data_series.clj | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/src/cljam/io/cram/data_series.clj b/src/cljam/io/cram/data_series.clj index f1aa4318..43005f6a 100644 --- a/src/cljam/io/cram/data_series.clj +++ b/src/cljam/io/cram/data_series.clj @@ -245,7 +245,9 @@ a map { }, where: - : a keyword representing the data series name - : a map representing the encoding of the data series - - : a function that takes one argument to encode" + - : a function that has two arities + - arity 1: take a value to encode for the data series + - arity 0: finalize and return the encoding result" [ds-encoding] (let [content-id->state (volatile! {})] (reduce-kv (fn [encoders ds params] @@ -311,7 +313,9 @@ - : a keyword representing the tag name - : a character representing a type of the tag - : a map representing the encoding of the tag and type - - : a function that takes one argument to encode" + - : a function that has two arities + - arity 1: take a value to encode for the tag + - arity 0: finalize and return the encoding result" [tag-encodings] (let [content-id->state (volatile! {})] (reduce-kv