diff --git a/src/cljam/io/cram/bit_stream.clj b/src/cljam/io/cram/bit_stream.clj new file mode 100644 index 00000000..c74bb042 --- /dev/null +++ b/src/cljam/io/cram/bit_stream.clj @@ -0,0 +1,31 @@ +(ns cljam.io.cram.bit-stream + (:import [java.nio ByteBuffer])) + +(defprotocol IBitStreamDecoder + (read-bits [_ m])) + +(definline ^:private right-nbits-of [x nbits] + `(bit-and (long ~x) (dec (bit-shift-left 1 (long ~nbits))))) + +(deftype BitStreamDecoder + [^ByteBuffer bb + ^:unsynchronized-mutable ^long buffer + ^:unsynchronized-mutable ^long nbits] + IBitStreamDecoder + (read-bits [_ m] + (let [m (long m)] + (if (zero? m) + m + (loop [m m, buf buffer, n nbits, acc 0] + (if (<= m n) + (do (set! buffer buf) + (set! nbits (- n m)) + (bit-or acc (right-nbits-of (unsigned-bit-shift-right buf nbits) m))) + (let [m' (- m n) + acc' (bit-or acc (bit-shift-left (right-nbits-of buf n) m'))] + (recur m' (long (.get bb)) 8 acc')))))))) + +(defn make-bit-stream-decoder + "Creates a new bit stream decoder based on the given byte buffer." + [bb] + (->BitStreamDecoder bb 0 0)) diff --git a/src/cljam/io/cram/decode/data_series.clj b/src/cljam/io/cram/decode/data_series.clj index 38446c97..1390ab7b 100644 --- a/src/cljam/io/cram/decode/data_series.clj +++ b/src/cljam/io/cram/decode/data_series.clj @@ -1,5 +1,6 @@ (ns cljam.io.cram.decode.data-series (:require [cljam.io.cram.itf8 :as itf8] + [cljam.io.cram.bit-stream :as bs] [cljam.io.util.byte-buffer :as bb] [clojure.string :as str]) (:import [java.nio Buffer ByteBuffer])) @@ -16,7 +17,7 @@ :bytes)) (defn- build-codec-decoder - [{:keys [codec] :as params} data-type content-id->block-data] + [{:keys [codec] :as params} data-type bs-decoder content-id->block-data] (case codec :external (let [^ByteBuffer block (get content-id->block-data (:content-id params))] @@ -33,8 +34,8 @@ :byte-array-len (let [{:keys [len-encoding val-encoding]} params - len-decoder (build-codec-decoder len-encoding :int content-id->block-data) - val-decoder (build-codec-decoder val-encoding :byte content-id->block-data)] + len-decoder (build-codec-decoder len-encoding :int bs-decoder content-id->block-data) + val-decoder (build-codec-decoder val-encoding :byte bs-decoder content-id->block-data)] (fn [] (let [len (len-decoder) bb (bb/allocate-lsb-byte-buffer len)] @@ -57,7 +58,13 @@ _ (.reset ^Buffer block) ret (bb/read-bytes block len)] (.get block) - ret))))) + ret))) + + :beta + (let [{:keys [offset length]} params] + (fn [] + (+ (long (bs/read-bits bs-decoder (long length))) + (long offset)))))) (defn build-data-series-decoders "Builds decoders for data series based on the encodings specified in the given @@ -69,11 +76,11 @@ - : a map representing the encoding of the data series - : a function with no arguments that returns a value decoded from the data series upon each call" - [{ds-encodings :data-series} blocks] + [{ds-encodings :data-series} bs-decoder blocks] (let [content-id->block-data (into {} (map (juxt :content-id :data)) blocks)] (reduce-kv (fn [decoders ds params] (let [dt (data-series-type ds) - decoder (build-codec-decoder params dt content-id->block-data)] + decoder (build-codec-decoder params dt bs-decoder content-id->block-data)] (assoc decoders ds decoder))) {} ds-encodings))) @@ -104,8 +111,8 @@ vs (repeatedly len (partial coercer bb))] (str/join \, (cons tag-type' vs)))))) -(defn- build-tag-decoder [tag-encoding tag-type content-id->block-data] - (let [decoder (build-codec-decoder tag-encoding :bytes content-id->block-data) +(defn- build-tag-decoder [tag-encoding tag-type bs-decoder content-id->block-data] + (let [decoder (build-codec-decoder tag-encoding :bytes bs-decoder content-id->block-data) coercer (tag-value-coercer tag-type)] (fn [] (let [bb (bb/make-lsb-byte-buffer (decoder))] @@ -122,13 +129,13 @@ - : a map representing the encoding of the tag and type - : a function with no arguments that returns a value decoded from the data series for the tag upon each call" - [{:keys [tags]} blocks] + [{:keys [tags]} bs-decoder blocks] (let [content-id->block-data (into {} (map (juxt :content-id :data)) blocks)] (reduce-kv (fn [decoders tag m] (reduce-kv (fn [decoders tag-type encoding] - (let [decoder (build-tag-decoder encoding tag-type content-id->block-data) + (let [decoder (build-tag-decoder encoding tag-type bs-decoder content-id->block-data) tag-type' (str (if (#{\c \C \s \S \i \I} tag-type) \i tag-type))] (assoc-in decoders [tag tag-type] (fn [] {:type tag-type' :value (decoder)})))) diff --git a/src/cljam/io/cram/reader.clj b/src/cljam/io/cram/reader.clj index e9d09e5d..49512cb4 100644 --- a/src/cljam/io/cram/reader.clj +++ b/src/cljam/io/cram/reader.clj @@ -1,5 +1,6 @@ (ns cljam.io.cram.reader - (:require [cljam.io.cram.decode.data-series :as ds] + (:require [cljam.io.cram.bit-stream :as bs] + [cljam.io.cram.decode.data-series :as ds] [cljam.io.cram.decode.record :as record] [cljam.io.cram.decode.structure :as struct] [cljam.io.protocols :as protocols] @@ -66,8 +67,11 @@ (let [slice-header (struct/decode-slice-header-block bb) blocks (into [] (map (fn [_] (struct/decode-block bb))) (range (:blocks slice-header))) - ds-decoders (ds/build-data-series-decoders compression-header blocks) - tag-decoders (ds/build-tag-decoders compression-header blocks)] + core-block (first (filter #(zero? (long (:content-id %))) blocks)) + bs-decoder (when core-block + (bs/make-bit-stream-decoder (:data core-block))) + ds-decoders (ds/build-data-series-decoders compression-header bs-decoder blocks) + tag-decoders (ds/build-tag-decoders compression-header bs-decoder blocks)] (record/decode-slice-records (.-seq-resolver rdr) @(.-header rdr) compression-header diff --git a/test/cljam/io/cram/bit_stream_test.clj b/test/cljam/io/cram/bit_stream_test.clj new file mode 100644 index 00000000..ea46b75c --- /dev/null +++ b/test/cljam/io/cram/bit_stream_test.clj @@ -0,0 +1,12 @@ +(ns cljam.io.cram.bit-stream-test + (:require [cljam.io.cram.bit-stream :as bs] + [cljam.io.util.byte-buffer :as bb] + [clojure.test :refer [deftest is]])) + +(deftest read-bits-test + (let [bb (bb/make-lsb-byte-buffer (byte-array [0x31 0x41 0x59 0x26])) + bs-decoder (bs/make-bit-stream-decoder bb)] + (is (= [3 1 4 1 5 9 2 6] (repeatedly 8 #(bs/read-bits bs-decoder 4))))) + (let [bb (bb/make-lsb-byte-buffer (byte-array [2r11001110 2r10101110])) + bs-decoder (bs/make-bit-stream-decoder bb)] + (is (= [6 3 5 2 7] (repeatedly 5 #(bs/read-bits bs-decoder 3)))))) diff --git a/test/cljam/io/cram/decode/data_series_test.clj b/test/cljam/io/cram/decode/data_series_test.clj index 12048883..26d0737f 100644 --- a/test/cljam/io/cram/decode/data_series_test.clj +++ b/test/cljam/io/cram/decode/data_series_test.clj @@ -1,6 +1,7 @@ (ns cljam.io.cram.decode.data-series-test (:require [cljam.io.bam.decoder :as decoder] [cljam.io.cram.decode.data-series :as ds] + [cljam.io.cram.bit-stream :as bs] [cljam.io.sam.util.cigar :as cigar] [cljam.io.util.byte-buffer :as bb] [clojure.test :refer [deftest is testing]])) @@ -8,12 +9,15 @@ (deftest build-data-series-decoders-test (let [encodings {:BA {:codec :external, :content-id 1} :BF {:codec :external, :content-id 2} - :RL {:codec :huffman, :alphabet [151], :bit-len [0]} + :MQ {:codec :huffman, :alphabet [60], :bit-len [0]} + :RL {:codec :beta, :offset 149, :length 2} :BB {:codec :byte-array-len :len-encoding {:codec :external, :content-id 3} :val-encoding {:codec :external, :content-id 4}} :RN {:codec :byte-array-stop, :stop-byte 0, :external-id 5}} - blocks [{:content-id 1 + blocks [{:content-id 0 + :data (bb/make-lsb-byte-buffer (byte-array [2r01001110]))} + {:content-id 1 :data (bb/make-lsb-byte-buffer (byte-array (.getBytes "ATGC")))} {:content-id 2 :data (->> (byte-array [0x80 0xa1 0x80 0x63 0x80 0xa3 0x80 0x63]) @@ -26,30 +30,45 @@ :data (->> "qname001\000qname002\000qname003\000qname004\000" .getBytes bb/make-lsb-byte-buffer)}] - {:keys [BA BF RL BB RN]} - (ds/build-data-series-decoders {:data-series encodings} blocks)] + bs-decoder (bs/make-bit-stream-decoder + (:data (first (filter #(zero? (:content-id %)) blocks)))) + {:keys [BA BF MQ RL BB RN]} + (ds/build-data-series-decoders {:data-series encodings} bs-decoder blocks)] (is (= (map int "ATGC") [(BA) (BA) (BA) (BA)])) (is (= [0xa1 0x63 0xa3 0x63] [(BF) (BF) (BF) (BF)])) - (is (= [151 151 151 151] [(RL) (RL) (RL) (RL)])) + (is (= [60 60 60 60] [(MQ) (MQ) (MQ) (MQ)])) + (is (= [150 149 152 151] [(RL) (RL) (RL) (RL)])) (is (= ["CAT" "CGAAC" "AACT" "ACT"] (map #(String. ^bytes %) [(BB) (BB) (BB) (BB)]))) (is (= ["qname001" "qname002" "qname003" "qname004"] (map #(String. ^bytes %) [(RN) (RN) (RN) (RN)])))) (testing "block content interleaves when associated with multiple data series" - (let [encodings {:BB {:codec :byte-array-len + (let [encodings {:RI {:codec :beta, :offset 0, :length 1} + :RL {:codec :beta, :offset 150, :length 1} + :BB {:codec :byte-array-len :len-encoding {:codec :external, :content-id 1} :val-encoding {:codec :external, :content-id 1}} :DL {:codec :external, :content-id 2} :HC {:codec :external, :content-id 2}} - blocks [{:content-id 1 + blocks [{:content-id 0 + :data (bb/make-lsb-byte-buffer (byte-array [2r11100100]))} + {:content-id 1 :data (->> (byte-array [3 (int \A) (int \B) (int \C) 2 (int \D) (int \E) 1 (int \F)]) bb/make-lsb-byte-buffer)} {:content-id 2 :data (bb/make-lsb-byte-buffer (byte-array [1 2 3 4 5 6]))}] - {:keys [BB DL HC]} - (ds/build-data-series-decoders {:data-series encodings} blocks)] + bs-decoder (bs/make-bit-stream-decoder + (:data (first (filter #(zero? (:content-id %)) blocks)))) + {:keys [RI RL BB DL HC]} + (ds/build-data-series-decoders {:data-series encodings} bs-decoder blocks)] + (is (= [[1 151] + [1 150] + [0 151]] + [[(RI) (RL)] + [(RI) (RL)] + [(RI) (RL)]])) (is (= ["ABC" "DE" "F"] (map #(String. ^bytes %) [(BB) (BB) (BB)]))) (is (= [[1 2] [3 4] [5 6]] @@ -76,7 +95,7 @@ :data (bb/make-lsb-byte-buffer (byte-array [0xde 0xed 0xbe 0xef]))} {:content-id 7692867 :data (bb/make-lsb-byte-buffer (byte-array [0xca 0xfe 0xba 0xbe]))}] - decoders (ds/build-tag-decoders {:tags encodings} blocks) + decoders (ds/build-tag-decoders {:tags encodings} nil blocks) sb (get-in decoders [:sb \c]) ub (get-in decoders [:ub \C])] (is (= [{:type "i" :value (unchecked-byte 0xde)} @@ -116,7 +135,7 @@ (.putShort 0x89ab) (.putShort 0xcdef) .flip)}] - decoders (ds/build-tag-decoders {:tags encodings} blocks) + decoders (ds/build-tag-decoders {:tags encodings} nil blocks) ss (get-in decoders [:ss \s]) us (get-in decoders [:us \S])] (is (= [{:type "i" :value 0x0123} @@ -156,7 +175,7 @@ (.putInt 0x89abcdef) (.putInt 0xffffffff) .flip)}] - decoders (ds/build-tag-decoders {:tags encodings} blocks) + decoders (ds/build-tag-decoders {:tags encodings} nil blocks) si (get-in decoders [:si \i]) ui (get-in decoders [:ui \I])] (is (= [{:type "i" :value 0} @@ -183,7 +202,7 @@ (.putFloat -0.5) (.putFloat 0.0) .flip)}] - decoders (ds/build-tag-decoders {:tags encodings} blocks) + decoders (ds/build-tag-decoders {:tags encodings} nil blocks) fl (get-in decoders [:fl \f])] (is (= [{:type "f" :value 1.0} {:type "f" :value 0.75} @@ -214,7 +233,7 @@ "deadbeef\000") .getBytes bb/make-lsb-byte-buffer)}] - decoders (ds/build-tag-decoders {:tags encodings} blocks) + decoders (ds/build-tag-decoders {:tags encodings} nil blocks) MC (get-in decoders [:MC \Z]) hx (get-in decoders [:hx \H])] (is (= [{:type "Z" :value "151M"} @@ -263,7 +282,7 @@ (.putInt bb 4) (doseq [v vs] (.put bb (byte v)))) (.flip bb))}] - decoders (ds/build-tag-decoders {:tags encodings} blocks) + decoders (ds/build-tag-decoders {:tags encodings} nil blocks) sb (get-in decoders [:sb \B]) ub (get-in decoders [:ub \B])] (is (= [{:type "B" :value "c,0,1,2,3"} @@ -309,7 +328,7 @@ (.putInt bb 2) (doseq [v vs] (.putShort bb v))) (.flip bb))}] - decoders (ds/build-tag-decoders {:tags encodings} blocks) + decoders (ds/build-tag-decoders {:tags encodings} nil blocks) ss (get-in decoders [:ss \B]) us (get-in decoders [:us \B])] (is (= [{:type "B" :value (str "s," 0x0123 "," 0x4567)} @@ -357,7 +376,7 @@ (.putInt bb 2) (doseq [v vs] (.putInt bb v))) (.flip bb))}] - decoders (ds/build-tag-decoders {:tags encodings} blocks) + decoders (ds/build-tag-decoders {:tags encodings} nil blocks) si (get-in decoders [:si \B]) ui (get-in decoders [:ui \B])] (is (= [{:type "B" :value (str "i," 0x01234567 "," 0x76543210)} @@ -394,7 +413,7 @@ (.put bb (byte -1))) (.flip bb) bb)}] - decoders (ds/build-tag-decoders {:tags encodings} blocks) + decoders (ds/build-tag-decoders {:tags encodings} nil blocks) CG (get-in decoders [:CG \B])] (is (= (map #(array-map :type "B" :value %) vs) (map #(update % :value #'decoder/B-I-type-cigar-str->cigar-str) @@ -416,7 +435,7 @@ (.putInt bb 2) (doseq [v vs] (.putFloat bb v))) (.flip bb))}] - decoders (ds/build-tag-decoders {:tags encodings} blocks) + decoders (ds/build-tag-decoders {:tags encodings} nil blocks) fl (get-in decoders [:fl \B])] (is (= [{:type "B" :value "f,0.0,0.25"} {:type "B" :value "f,0.5,1.0"}