-
Notifications
You must be signed in to change notification settings - Fork 12
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Showing
1 changed file
with
285 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,285 @@ | ||
(ns cljam.io.cram | ||
{:clj-kondo/ignore [:missing-docstring]} | ||
(:require [cljam.io.util.byte-buffer :as bb]) | ||
(:import [java.io ByteArrayInputStream IOException] | ||
[java.nio Buffer ByteBuffer ByteOrder] | ||
[java.util Arrays] | ||
[org.apache.commons.compress.compressors CompressorStreamFactory])) | ||
|
||
(def ^:const cram-magic "CRAM") | ||
|
||
(defn decode-itf8 ^long [bb] | ||
(let [b (long (bb/read-ubyte bb))] | ||
(cond (zero? (bit-and b 0x80)) | ||
b | ||
|
||
(zero? (bit-and b 0x40)) | ||
(bit-or (bit-shift-left (bit-and b 0x7f) 8) | ||
(long (bb/read-ubyte bb))) | ||
|
||
(zero? (bit-and b 0x20)) | ||
(bit-or (bit-shift-left (bit-and b 0x3f) 16) | ||
(bit-shift-left (long (bb/read-ubyte bb)) 8) | ||
(long (bb/read-ubyte bb))) | ||
|
||
(zero? (bit-and b 0x10)) | ||
(bit-or (bit-shift-left (bit-and b 0x1f) 24) | ||
(bit-shift-left (long (bb/read-ubyte bb)) 16) | ||
(bit-shift-left (long (bb/read-ubyte bb)) 8) | ||
(long (bb/read-ubyte bb))) | ||
|
||
:else | ||
(bit-or (bit-shift-left (bit-and b 0x0f) 28) | ||
(bit-shift-left (long (bb/read-ubyte bb)) 20) | ||
(bit-shift-left (long (bb/read-ubyte bb)) 12) | ||
(bit-shift-left (long (bb/read-ubyte bb)) 4) | ||
(bit-and (long (bb/read-ubyte bb)) 0x0f))))) | ||
|
||
(defn decode-ltf8 ^long [bb] | ||
(let [b (long (bb/read-ubyte bb))] | ||
(cond (zero? (bit-and b 0x80)) | ||
b | ||
|
||
(zero? (bit-and b 0x40)) | ||
(bit-or (bit-shift-left (bit-and b 0x7f) 8) | ||
(long (bb/read-ubyte bb))) | ||
|
||
(zero? (bit-and b 0x20)) | ||
(bit-or (bit-shift-left (bit-and b 0x3f) 16) | ||
(bit-shift-left (long (bb/read-ubyte bb)) 8) | ||
(long (bb/read-ubyte bb))) | ||
|
||
(zero? (bit-and b 0x10)) | ||
(bit-or (bit-shift-left (bit-and b 0x1f) 24) | ||
(bit-shift-left (long (bb/read-ubyte bb)) 16) | ||
(bit-shift-left (long (bb/read-ubyte bb)) 8) | ||
(long (bb/read-ubyte bb))) | ||
|
||
(zero? (bit-and b 0x08)) | ||
(bit-or (bit-shift-left (bit-and b 0x07) 32) | ||
(bit-shift-left (long (bb/read-ubyte bb)) 24) | ||
(bit-shift-left (long (bb/read-ubyte bb)) 16) | ||
(bit-shift-left (long (bb/read-ubyte bb)) 8) | ||
(long (bb/read-ubyte bb))) | ||
|
||
(zero? (bit-and b 0x04)) | ||
(bit-or (bit-shift-left (bit-and b 0x03) 40) | ||
(bit-shift-left (long (bb/read-ubyte bb)) 32) | ||
(bit-shift-left (long (bb/read-ubyte bb)) 24) | ||
(bit-shift-left (long (bb/read-ubyte bb)) 16) | ||
(bit-shift-left (long (bb/read-ubyte bb)) 8) | ||
(long (bb/read-ubyte bb))) | ||
|
||
(zero? (bit-and b 0x02)) | ||
(bit-or (bit-shift-left (bit-and b 0x01) 48) | ||
(bit-shift-left (long (bb/read-ubyte bb)) 40) | ||
(bit-shift-left (long (bb/read-ubyte bb)) 32) | ||
(bit-shift-left (long (bb/read-ubyte bb)) 24) | ||
(bit-shift-left (long (bb/read-ubyte bb)) 16) | ||
(bit-shift-left (long (bb/read-ubyte bb)) 8) | ||
(long (bb/read-ubyte bb))) | ||
|
||
(zero? (bit-and b 0x01)) | ||
(bit-or (bit-shift-left (long (bb/read-ubyte bb)) 48) | ||
(bit-shift-left (long (bb/read-ubyte bb)) 40) | ||
(bit-shift-left (long (bb/read-ubyte bb)) 32) | ||
(bit-shift-left (long (bb/read-ubyte bb)) 24) | ||
(bit-shift-left (long (bb/read-ubyte bb)) 16) | ||
(bit-shift-left (long (bb/read-ubyte bb)) 8) | ||
(long (bb/read-ubyte bb))) | ||
|
||
:else | ||
(bit-or (bit-shift-left (long (bb/read-ubyte bb)) 56) | ||
(bit-shift-left (long (bb/read-ubyte bb)) 48) | ||
(bit-shift-left (long (bb/read-ubyte bb)) 40) | ||
(bit-shift-left (long (bb/read-ubyte bb)) 32) | ||
(bit-shift-left (long (bb/read-ubyte bb)) 24) | ||
(bit-shift-left (long (bb/read-ubyte bb)) 16) | ||
(bit-shift-left (long (bb/read-ubyte bb)) 8) | ||
(long (bb/read-ubyte bb)))))) | ||
|
||
(defn decode-array [^ByteBuffer bb elem-fn] | ||
(let [start (.position ^Buffer bb) | ||
size (decode-itf8 bb)] | ||
(loop [pos start, acc (transient [])] | ||
(if (< (- pos start) size) | ||
(let [e (elem-fn bb)] | ||
(recur (.position ^Buffer bb) (conj! acc e))) | ||
(persistent! acc))))) | ||
|
||
(defn decode-encoding [^ByteBuffer bb] | ||
(let [codec-id (decode-itf8 bb) | ||
_n (decode-itf8 bb)] | ||
(case codec-id | ||
0 {:codec :null} | ||
1 (let [content-id (decode-itf8 bb)] | ||
{:codec :external | ||
:content-id content-id}) | ||
3 (let [alphabet (decode-array bb decode-itf8) | ||
bit-len (decode-array bb decode-itf8)] | ||
{:codec :huffman, :alphabet alphabet, :bit-len bit-len}) | ||
4 (let [len-encoding (decode-encoding bb) | ||
val-encoding (decode-encoding bb)] | ||
{:codec :byte-array-len, :len-encoding len-encoding, :val-encoding val-encoding}) | ||
5 (let [stop-byte (.get bb) | ||
external-id (decode-itf8 bb)] | ||
{:codec :byte-array-stop, :stop-byte stop-byte, :external-id external-id}) | ||
6 (let [offset (decode-itf8 bb) | ||
length (decode-itf8 bb)] | ||
{:codec :beta, :offset offset, :length length}) | ||
7 (let [offset (decode-itf8 bb) | ||
k (decode-itf8 bb)] | ||
{:codec :subexp, :offset offset, :k k}) | ||
9 (let [offset (decode-itf8 bb)] | ||
{:codec :gamma, :offset offset}) | ||
(throw (ex-info (str "codec " codec-id " not supported") {}))))) | ||
|
||
(defn decode-file-definition [bb] | ||
(when-not (Arrays/equals ^bytes (bb/read-bytes bb 4) (.getBytes cram-magic)) | ||
(throw (IOException. "Invalid CRAM file"))) | ||
(let [major (bb/read-ubyte bb) | ||
minor (bb/read-ubyte bb) | ||
file-id (bb/read-bytes bb 20)] | ||
{:version {:major major :minor minor}, :id file-id})) | ||
|
||
(defn decode-container-header [^ByteBuffer bb] | ||
(let [len (.getInt bb) | ||
ref-seq-id (decode-itf8 bb) | ||
start-pos (decode-itf8 bb) | ||
span (decode-itf8 bb) | ||
n-records (decode-itf8 bb) | ||
counter (decode-ltf8 bb) | ||
n-bases (decode-ltf8 bb) | ||
n-blocks (decode-itf8 bb) | ||
landmarks (decode-array bb decode-itf8) | ||
crc (bb/read-bytes bb 4)] | ||
{:length len | ||
:ref ref-seq-id | ||
:start start-pos | ||
:span span | ||
:records n-records | ||
:counter counter | ||
:bases n-bases | ||
:blocks n-blocks | ||
:landmarks landmarks | ||
:crc crc})) | ||
|
||
(def decode-block-data | ||
(let [factory (CompressorStreamFactory.)] | ||
(fn [^ByteBuffer bb method size raw-size] | ||
(if (zero? (long method)) | ||
(let [^Buffer bb' (.order (.slice bb) ByteOrder/LITTLE_ENDIAN)] | ||
(bb/skip bb raw-size) | ||
(.limit bb' raw-size)) | ||
(let [compressed (bb/read-bytes bb size) | ||
bais (ByteArrayInputStream. compressed) | ||
uncompressed (byte-array raw-size) | ||
bb' (bb/make-lsb-byte-buffer uncompressed)] | ||
;; TODO: create appropriate compressor input stream for method | ||
(with-open [is (.createCompressorInputStream factory bais)] | ||
(.read is uncompressed) | ||
bb')))))) | ||
|
||
(defn decode-block [bb] | ||
(let [method (bb/read-ubyte bb) | ||
content-type-id (bb/read-ubyte bb) | ||
content-id (decode-itf8 bb) | ||
size (decode-itf8 bb) | ||
raw-size (decode-itf8 bb) | ||
data (decode-block-data bb method size raw-size) | ||
crc (bb/read-bytes bb 4)] | ||
{:method method | ||
:content-type content-type-id | ||
:content-id content-id | ||
:size size | ||
:raw-size raw-size | ||
:data data | ||
:crc crc})) | ||
|
||
(defn decode-cram-header-block [bb] | ||
(let [{bb' :data} (decode-block bb) | ||
size (bb/read-uint bb')] | ||
(bb/read-bytes bb' size))) | ||
|
||
(defn decode-tag-dictionary [bb] | ||
(letfn [(decode-tags [bb] | ||
(loop [acc (transient [])] | ||
(let [c1 (long (bb/read-ubyte bb))] | ||
(if (zero? c1) | ||
(persistent! acc) | ||
(let [c2 (bb/read-ubyte bb) | ||
t (bb/read-ubyte bb) | ||
tag (keyword (str (char c1) (char c2)))] | ||
(recur (conj! acc {:tag tag, :type (char t)})))))))] | ||
(decode-array bb decode-tags))) | ||
|
||
(defn decode-preservation-map [^ByteBuffer bb] | ||
(let [_size (decode-itf8 bb) | ||
n (decode-itf8 bb)] | ||
(loop [n n, acc (transient {:RN true, :AP true, :RR true})] | ||
(if (zero? n) | ||
(persistent! acc) | ||
(let [k (keyword (String. ^bytes (bb/read-bytes bb 2))) | ||
v (case k | ||
(:RN :AP :RR) (pos? (.get bb)) | ||
:SM (bb/read-bytes bb 5) | ||
:TD (decode-tag-dictionary bb))] | ||
(recur (dec n) (assoc! acc k v))))))) | ||
|
||
(defn decode-data-series-encodings [bb] | ||
(let [_size (decode-itf8 bb) | ||
n (decode-itf8 bb)] | ||
(loop [n n, acc (transient {})] | ||
(if (zero? n) | ||
(persistent! acc) | ||
(let [k (keyword (String. ^bytes (bb/read-bytes bb 2))) | ||
v (decode-encoding bb)] | ||
(recur (dec n) (assoc! acc k v))))))) | ||
|
||
(defn decode-tag-encoding-map [bb] | ||
(let [_size (decode-itf8 bb) | ||
n (decode-itf8 bb)] | ||
(loop [n n, acc (transient {})] | ||
(if (zero? n) | ||
(persistent! acc) | ||
(let [k (decode-itf8 bb) | ||
c1 (char (bit-and (bit-shift-right k 16) 0xff)) | ||
c2 (char (bit-and (bit-shift-right k 8) 0xff)) | ||
t (char (bit-and k 0xff)) | ||
v (decode-encoding bb) | ||
tag (keyword (str c1 c2))] | ||
(recur (dec n) (assoc! acc tag {:type t :encoding v}))))))) | ||
|
||
(defn decode-compression-header-block [bb] | ||
(let [{bb' :data} (decode-block bb) | ||
preservation-map (decode-preservation-map bb') | ||
data-series-encodings (decode-data-series-encodings bb') | ||
tag-encoding-map (decode-tag-encoding-map bb')] | ||
{:preservation-map preservation-map | ||
:data-series data-series-encodings | ||
:tags tag-encoding-map})) | ||
|
||
(defn decode-slice-header-block [bb] | ||
(let [{bb :data} (decode-block bb) | ||
ref-seq-id (decode-itf8 bb) | ||
start (decode-itf8 bb) | ||
span (decode-itf8 bb) | ||
n-records (decode-itf8 bb) | ||
counter (decode-ltf8 bb) | ||
n-blocks (decode-itf8 bb) | ||
content-ids (into [] (map (fn [_] (decode-itf8 bb))) | ||
(range n-blocks)) | ||
embedded-reference (decode-itf8 bb) | ||
reference-md5 (bb/read-bytes bb 16) | ||
tags [(bb/read-ubyte bb) (bb/read-ubyte bb)]] | ||
{:ref-seq-id ref-seq-id | ||
:start start | ||
:span span | ||
:records n-records | ||
:counter counter | ||
:blocks n-blocks | ||
:content-ids content-ids | ||
:embedded-reference embedded-reference | ||
:reference-md5 reference-md5 | ||
:tags tags})) | ||
|