Skip to content

Commit

Permalink
Merge pull request #309 from chrovis/feature/cram-bit-encoding-support
Browse files Browse the repository at this point in the history
Support for CRAM beta coding
  • Loading branch information
r6eve authored May 14, 2024
2 parents e170871 + e00b3fa commit 90227f3
Show file tree
Hide file tree
Showing 5 changed files with 105 additions and 32 deletions.
31 changes: 31 additions & 0 deletions src/cljam/io/cram/bit_stream.clj
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
(ns cljam.io.cram.bit-stream
(:import [java.nio ByteBuffer]))

(defprotocol IBitStreamDecoder
(read-bits [_ m]))

(definline ^:private right-nbits-of [x nbits]
`(bit-and (long ~x) (dec (bit-shift-left 1 (long ~nbits)))))

(deftype BitStreamDecoder
[^ByteBuffer bb
^:unsynchronized-mutable ^long buffer
^:unsynchronized-mutable ^long nbits]
IBitStreamDecoder
(read-bits [_ m]
(let [m (long m)]
(if (zero? m)
m
(loop [m m, buf buffer, n nbits, acc 0]
(if (<= m n)
(do (set! buffer buf)
(set! nbits (- n m))
(bit-or acc (right-nbits-of (unsigned-bit-shift-right buf nbits) m)))
(let [m' (- m n)
acc' (bit-or acc (bit-shift-left (right-nbits-of buf n) m'))]
(recur m' (long (.get bb)) 8 acc'))))))))

(defn make-bit-stream-decoder
"Creates a new bit stream decoder based on the given byte buffer."
[bb]
(->BitStreamDecoder bb 0 0))
27 changes: 17 additions & 10 deletions src/cljam/io/cram/decode/data_series.clj
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
(ns cljam.io.cram.decode.data-series
(:require [cljam.io.cram.itf8 :as itf8]
[cljam.io.cram.bit-stream :as bs]
[cljam.io.util.byte-buffer :as bb]
[clojure.string :as str])
(:import [java.nio Buffer ByteBuffer]))
Expand All @@ -16,7 +17,7 @@
:bytes))

(defn- build-codec-decoder
[{:keys [codec] :as params} data-type content-id->block-data]
[{:keys [codec] :as params} data-type bs-decoder content-id->block-data]
(case codec
:external
(let [^ByteBuffer block (get content-id->block-data (:content-id params))]
Expand All @@ -33,8 +34,8 @@

:byte-array-len
(let [{:keys [len-encoding val-encoding]} params
len-decoder (build-codec-decoder len-encoding :int content-id->block-data)
val-decoder (build-codec-decoder val-encoding :byte content-id->block-data)]
len-decoder (build-codec-decoder len-encoding :int bs-decoder content-id->block-data)
val-decoder (build-codec-decoder val-encoding :byte bs-decoder content-id->block-data)]
(fn []
(let [len (len-decoder)
bb (bb/allocate-lsb-byte-buffer len)]
Expand All @@ -57,7 +58,13 @@
_ (.reset ^Buffer block)
ret (bb/read-bytes block len)]
(.get block)
ret)))))
ret)))

:beta
(let [{:keys [offset length]} params]
(fn []
(+ (long (bs/read-bits bs-decoder (long length)))
(long offset))))))

(defn build-data-series-decoders
"Builds decoders for data series based on the encodings specified in the given
Expand All @@ -69,11 +76,11 @@
- <encoding>: a map representing the encoding of the data series
- <decoder>: a function with no arguments that returns a value decoded from
the data series upon each call"
[{ds-encodings :data-series} blocks]
[{ds-encodings :data-series} bs-decoder blocks]
(let [content-id->block-data (into {} (map (juxt :content-id :data)) blocks)]
(reduce-kv (fn [decoders ds params]
(let [dt (data-series-type ds)
decoder (build-codec-decoder params dt content-id->block-data)]
decoder (build-codec-decoder params dt bs-decoder content-id->block-data)]
(assoc decoders ds decoder)))
{} ds-encodings)))

Expand Down Expand Up @@ -104,8 +111,8 @@
vs (repeatedly len (partial coercer bb))]
(str/join \, (cons tag-type' vs))))))

(defn- build-tag-decoder [tag-encoding tag-type content-id->block-data]
(let [decoder (build-codec-decoder tag-encoding :bytes content-id->block-data)
(defn- build-tag-decoder [tag-encoding tag-type bs-decoder content-id->block-data]
(let [decoder (build-codec-decoder tag-encoding :bytes bs-decoder content-id->block-data)
coercer (tag-value-coercer tag-type)]
(fn []
(let [bb (bb/make-lsb-byte-buffer (decoder))]
Expand All @@ -122,13 +129,13 @@
- <encoding>: a map representing the encoding of the tag and type
- <decoder>: a function with no arguments that returns a value decoded from
the data series for the tag upon each call"
[{:keys [tags]} blocks]
[{:keys [tags]} bs-decoder blocks]
(let [content-id->block-data (into {} (map (juxt :content-id :data)) blocks)]
(reduce-kv
(fn [decoders tag m]
(reduce-kv
(fn [decoders tag-type encoding]
(let [decoder (build-tag-decoder encoding tag-type content-id->block-data)
(let [decoder (build-tag-decoder encoding tag-type bs-decoder content-id->block-data)
tag-type' (str (if (#{\c \C \s \S \i \I} tag-type) \i tag-type))]
(assoc-in decoders [tag tag-type]
(fn [] {:type tag-type' :value (decoder)}))))
Expand Down
10 changes: 7 additions & 3 deletions src/cljam/io/cram/reader.clj
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
(ns cljam.io.cram.reader
(:require [cljam.io.cram.decode.data-series :as ds]
(:require [cljam.io.cram.bit-stream :as bs]
[cljam.io.cram.decode.data-series :as ds]
[cljam.io.cram.decode.record :as record]
[cljam.io.cram.decode.structure :as struct]
[cljam.io.protocols :as protocols]
Expand Down Expand Up @@ -66,8 +67,11 @@
(let [slice-header (struct/decode-slice-header-block bb)
blocks (into [] (map (fn [_] (struct/decode-block bb)))
(range (:blocks slice-header)))
ds-decoders (ds/build-data-series-decoders compression-header blocks)
tag-decoders (ds/build-tag-decoders compression-header blocks)]
core-block (first (filter #(zero? (long (:content-id %))) blocks))
bs-decoder (when core-block
(bs/make-bit-stream-decoder (:data core-block)))
ds-decoders (ds/build-data-series-decoders compression-header bs-decoder blocks)
tag-decoders (ds/build-tag-decoders compression-header bs-decoder blocks)]
(record/decode-slice-records (.-seq-resolver rdr)
@(.-header rdr)
compression-header
Expand Down
12 changes: 12 additions & 0 deletions test/cljam/io/cram/bit_stream_test.clj
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
(ns cljam.io.cram.bit-stream-test
(:require [cljam.io.cram.bit-stream :as bs]
[cljam.io.util.byte-buffer :as bb]
[clojure.test :refer [deftest is]]))

(deftest read-bits-test
(let [bb (bb/make-lsb-byte-buffer (byte-array [0x31 0x41 0x59 0x26]))
bs-decoder (bs/make-bit-stream-decoder bb)]
(is (= [3 1 4 1 5 9 2 6] (repeatedly 8 #(bs/read-bits bs-decoder 4)))))
(let [bb (bb/make-lsb-byte-buffer (byte-array [2r11001110 2r10101110]))
bs-decoder (bs/make-bit-stream-decoder bb)]
(is (= [6 3 5 2 7] (repeatedly 5 #(bs/read-bits bs-decoder 3))))))
57 changes: 38 additions & 19 deletions test/cljam/io/cram/decode/data_series_test.clj
Original file line number Diff line number Diff line change
@@ -1,19 +1,23 @@
(ns cljam.io.cram.decode.data-series-test
(:require [cljam.io.bam.decoder :as decoder]
[cljam.io.cram.decode.data-series :as ds]
[cljam.io.cram.bit-stream :as bs]
[cljam.io.sam.util.cigar :as cigar]
[cljam.io.util.byte-buffer :as bb]
[clojure.test :refer [deftest is testing]]))

(deftest build-data-series-decoders-test
(let [encodings {:BA {:codec :external, :content-id 1}
:BF {:codec :external, :content-id 2}
:RL {:codec :huffman, :alphabet [151], :bit-len [0]}
:MQ {:codec :huffman, :alphabet [60], :bit-len [0]}
:RL {:codec :beta, :offset 149, :length 2}
:BB {:codec :byte-array-len
:len-encoding {:codec :external, :content-id 3}
:val-encoding {:codec :external, :content-id 4}}
:RN {:codec :byte-array-stop, :stop-byte 0, :external-id 5}}
blocks [{:content-id 1
blocks [{:content-id 0
:data (bb/make-lsb-byte-buffer (byte-array [2r01001110]))}
{:content-id 1
:data (bb/make-lsb-byte-buffer (byte-array (.getBytes "ATGC")))}
{:content-id 2
:data (->> (byte-array [0x80 0xa1 0x80 0x63 0x80 0xa3 0x80 0x63])
Expand All @@ -26,30 +30,45 @@
:data (->> "qname001\000qname002\000qname003\000qname004\000"
.getBytes
bb/make-lsb-byte-buffer)}]
{:keys [BA BF RL BB RN]}
(ds/build-data-series-decoders {:data-series encodings} blocks)]
bs-decoder (bs/make-bit-stream-decoder
(:data (first (filter #(zero? (:content-id %)) blocks))))
{:keys [BA BF MQ RL BB RN]}
(ds/build-data-series-decoders {:data-series encodings} bs-decoder blocks)]
(is (= (map int "ATGC") [(BA) (BA) (BA) (BA)]))
(is (= [0xa1 0x63 0xa3 0x63] [(BF) (BF) (BF) (BF)]))
(is (= [151 151 151 151] [(RL) (RL) (RL) (RL)]))
(is (= [60 60 60 60] [(MQ) (MQ) (MQ) (MQ)]))
(is (= [150 149 152 151] [(RL) (RL) (RL) (RL)]))
(is (= ["CAT" "CGAAC" "AACT" "ACT"]
(map #(String. ^bytes %) [(BB) (BB) (BB) (BB)])))
(is (= ["qname001" "qname002" "qname003" "qname004"]
(map #(String. ^bytes %) [(RN) (RN) (RN) (RN)]))))
(testing "block content interleaves when associated with multiple data series"
(let [encodings {:BB {:codec :byte-array-len
(let [encodings {:RI {:codec :beta, :offset 0, :length 1}
:RL {:codec :beta, :offset 150, :length 1}
:BB {:codec :byte-array-len
:len-encoding {:codec :external, :content-id 1}
:val-encoding {:codec :external, :content-id 1}}
:DL {:codec :external, :content-id 2}
:HC {:codec :external, :content-id 2}}
blocks [{:content-id 1
blocks [{:content-id 0
:data (bb/make-lsb-byte-buffer (byte-array [2r11100100]))}
{:content-id 1
:data (->> (byte-array [3 (int \A) (int \B) (int \C)
2 (int \D) (int \E)
1 (int \F)])
bb/make-lsb-byte-buffer)}
{:content-id 2
:data (bb/make-lsb-byte-buffer (byte-array [1 2 3 4 5 6]))}]
{:keys [BB DL HC]}
(ds/build-data-series-decoders {:data-series encodings} blocks)]
bs-decoder (bs/make-bit-stream-decoder
(:data (first (filter #(zero? (:content-id %)) blocks))))
{:keys [RI RL BB DL HC]}
(ds/build-data-series-decoders {:data-series encodings} bs-decoder blocks)]
(is (= [[1 151]
[1 150]
[0 151]]
[[(RI) (RL)]
[(RI) (RL)]
[(RI) (RL)]]))
(is (= ["ABC" "DE" "F"]
(map #(String. ^bytes %) [(BB) (BB) (BB)])))
(is (= [[1 2] [3 4] [5 6]]
Expand All @@ -76,7 +95,7 @@
:data (bb/make-lsb-byte-buffer (byte-array [0xde 0xed 0xbe 0xef]))}
{:content-id 7692867
:data (bb/make-lsb-byte-buffer (byte-array [0xca 0xfe 0xba 0xbe]))}]
decoders (ds/build-tag-decoders {:tags encodings} blocks)
decoders (ds/build-tag-decoders {:tags encodings} nil blocks)
sb (get-in decoders [:sb \c])
ub (get-in decoders [:ub \C])]
(is (= [{:type "i" :value (unchecked-byte 0xde)}
Expand Down Expand Up @@ -116,7 +135,7 @@
(.putShort 0x89ab)
(.putShort 0xcdef)
.flip)}]
decoders (ds/build-tag-decoders {:tags encodings} blocks)
decoders (ds/build-tag-decoders {:tags encodings} nil blocks)
ss (get-in decoders [:ss \s])
us (get-in decoders [:us \S])]
(is (= [{:type "i" :value 0x0123}
Expand Down Expand Up @@ -156,7 +175,7 @@
(.putInt 0x89abcdef)
(.putInt 0xffffffff)
.flip)}]
decoders (ds/build-tag-decoders {:tags encodings} blocks)
decoders (ds/build-tag-decoders {:tags encodings} nil blocks)
si (get-in decoders [:si \i])
ui (get-in decoders [:ui \I])]
(is (= [{:type "i" :value 0}
Expand All @@ -183,7 +202,7 @@
(.putFloat -0.5)
(.putFloat 0.0)
.flip)}]
decoders (ds/build-tag-decoders {:tags encodings} blocks)
decoders (ds/build-tag-decoders {:tags encodings} nil blocks)
fl (get-in decoders [:fl \f])]
(is (= [{:type "f" :value 1.0}
{:type "f" :value 0.75}
Expand Down Expand Up @@ -214,7 +233,7 @@
"deadbeef\000")
.getBytes
bb/make-lsb-byte-buffer)}]
decoders (ds/build-tag-decoders {:tags encodings} blocks)
decoders (ds/build-tag-decoders {:tags encodings} nil blocks)
MC (get-in decoders [:MC \Z])
hx (get-in decoders [:hx \H])]
(is (= [{:type "Z" :value "151M"}
Expand Down Expand Up @@ -263,7 +282,7 @@
(.putInt bb 4)
(doseq [v vs] (.put bb (byte v))))
(.flip bb))}]
decoders (ds/build-tag-decoders {:tags encodings} blocks)
decoders (ds/build-tag-decoders {:tags encodings} nil blocks)
sb (get-in decoders [:sb \B])
ub (get-in decoders [:ub \B])]
(is (= [{:type "B" :value "c,0,1,2,3"}
Expand Down Expand Up @@ -309,7 +328,7 @@
(.putInt bb 2)
(doseq [v vs] (.putShort bb v)))
(.flip bb))}]
decoders (ds/build-tag-decoders {:tags encodings} blocks)
decoders (ds/build-tag-decoders {:tags encodings} nil blocks)
ss (get-in decoders [:ss \B])
us (get-in decoders [:us \B])]
(is (= [{:type "B" :value (str "s," 0x0123 "," 0x4567)}
Expand Down Expand Up @@ -357,7 +376,7 @@
(.putInt bb 2)
(doseq [v vs] (.putInt bb v)))
(.flip bb))}]
decoders (ds/build-tag-decoders {:tags encodings} blocks)
decoders (ds/build-tag-decoders {:tags encodings} nil blocks)
si (get-in decoders [:si \B])
ui (get-in decoders [:ui \B])]
(is (= [{:type "B" :value (str "i," 0x01234567 "," 0x76543210)}
Expand Down Expand Up @@ -394,7 +413,7 @@
(.put bb (byte -1)))
(.flip bb)
bb)}]
decoders (ds/build-tag-decoders {:tags encodings} blocks)
decoders (ds/build-tag-decoders {:tags encodings} nil blocks)
CG (get-in decoders [:CG \B])]
(is (= (map #(array-map :type "B" :value %) vs)
(map #(update % :value #'decoder/B-I-type-cigar-str->cigar-str)
Expand All @@ -416,7 +435,7 @@
(.putInt bb 2)
(doseq [v vs] (.putFloat bb v)))
(.flip bb))}]
decoders (ds/build-tag-decoders {:tags encodings} blocks)
decoders (ds/build-tag-decoders {:tags encodings} nil blocks)
fl (get-in decoders [:fl \B])]
(is (= [{:type "B" :value "f,0.0,0.25"}
{:type "B" :value "f,0.5,1.0"}
Expand Down

0 comments on commit 90227f3

Please sign in to comment.