-
Notifications
You must be signed in to change notification settings - Fork 12
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Showing
1 changed file
with
284 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,284 @@ | ||
(ns cljam.io.cram | ||
{:clj-kondo/ignore [:missing-docstring]} | ||
(:require [cljam.io.util.byte-buffer :as bb] | ||
[cljam.io.util.lsb.data-io :as lsb] | ||
[cljam.io.util.lsb.io-stream :as lsb.stream]) | ||
(:import [java.io | ||
ByteArrayInputStream | ||
DataInputStream | ||
IOException | ||
RandomAccessFile] | ||
[java.util Arrays] | ||
[org.apache.commons.compress.compressors CompressorStreamFactory])) | ||
|
||
(def ^:const cram-magic "CRAM") | ||
|
||
(defn decode-itf8 ^long [in] | ||
(let [b (long (lsb/read-ubyte in))] | ||
(cond (zero? (bit-and b 0x80)) | ||
b | ||
|
||
(zero? (bit-and b 0x40)) | ||
(bit-or (bit-shift-left (bit-and b 0x7f) 8) | ||
(long (lsb/read-ubyte in))) | ||
|
||
(zero? (bit-and b 0x20)) | ||
(bit-or (bit-shift-left (bit-and b 0x3f) 16) | ||
(bit-shift-left (long (lsb/read-ubyte in)) 8) | ||
(long (lsb/read-ubyte in))) | ||
|
||
(zero? (bit-and b 0x10)) | ||
(bit-or (bit-shift-left (bit-and b 0x1f) 24) | ||
(bit-shift-left (long (lsb/read-ubyte in)) 16) | ||
(bit-shift-left (long (lsb/read-ubyte in)) 8) | ||
(long (lsb/read-ubyte in))) | ||
|
||
:else | ||
(bit-or (bit-shift-left (bit-and b 0x0f) 28) | ||
(bit-shift-left (long (lsb/read-ubyte in)) 20) | ||
(bit-shift-left (long (lsb/read-ubyte in)) 12) | ||
(bit-shift-left (long (lsb/read-ubyte in)) 4) | ||
(bit-and (long (lsb/read-ubyte in)) 0x0f))))) | ||
|
||
(defn decode-ltf8 ^long [in] | ||
(let [b (long (lsb/read-ubyte in))] | ||
(cond (zero? (bit-and b 0x80)) | ||
b | ||
|
||
(zero? (bit-and b 0x40)) | ||
(bit-or (bit-shift-left (bit-and b 0x7f) 8) | ||
(long (lsb/read-ubyte in))) | ||
|
||
(zero? (bit-and b 0x20)) | ||
(bit-or (bit-shift-left (bit-and b 0x3f) 16) | ||
(bit-shift-left (long (lsb/read-ubyte in)) 8) | ||
(long (lsb/read-ubyte in))) | ||
|
||
(zero? (bit-and b 0x10)) | ||
(bit-or (bit-shift-left (bit-and b 0x1f) 24) | ||
(bit-shift-left (long (lsb/read-ubyte in)) 16) | ||
(bit-shift-left (long (lsb/read-ubyte in)) 8) | ||
(long (lsb/read-ubyte in))) | ||
|
||
(zero? (bit-and b 0x08)) | ||
(bit-or (bit-shift-left (bit-and b 0x07) 32) | ||
(bit-shift-left (long (lsb/read-ubyte in)) 24) | ||
(bit-shift-left (long (lsb/read-ubyte in)) 16) | ||
(bit-shift-left (long (lsb/read-ubyte in)) 8) | ||
(long (lsb/read-ubyte in))) | ||
|
||
(zero? (bit-and b 0x04)) | ||
(bit-or (bit-shift-left (bit-and b 0x03) 40) | ||
(bit-shift-left (long (lsb/read-ubyte in)) 32) | ||
(bit-shift-left (long (lsb/read-ubyte in)) 24) | ||
(bit-shift-left (long (lsb/read-ubyte in)) 16) | ||
(bit-shift-left (long (lsb/read-ubyte in)) 8) | ||
(long (lsb/read-ubyte in))) | ||
|
||
(zero? (bit-and b 0x02)) | ||
(bit-or (bit-shift-left (bit-and b 0x01) 48) | ||
(bit-shift-left (long (lsb/read-ubyte in)) 40) | ||
(bit-shift-left (long (lsb/read-ubyte in)) 32) | ||
(bit-shift-left (long (lsb/read-ubyte in)) 24) | ||
(bit-shift-left (long (lsb/read-ubyte in)) 16) | ||
(bit-shift-left (long (lsb/read-ubyte in)) 8) | ||
(long (lsb/read-ubyte in))) | ||
|
||
(zero? (bit-and b 0x01)) | ||
(bit-or (bit-shift-left (long (lsb/read-ubyte in)) 48) | ||
(bit-shift-left (long (lsb/read-ubyte in)) 40) | ||
(bit-shift-left (long (lsb/read-ubyte in)) 32) | ||
(bit-shift-left (long (lsb/read-ubyte in)) 24) | ||
(bit-shift-left (long (lsb/read-ubyte in)) 16) | ||
(bit-shift-left (long (lsb/read-ubyte in)) 8) | ||
(long (lsb/read-ubyte in))) | ||
|
||
:else | ||
(bit-or (bit-shift-left (long (lsb/read-ubyte in)) 56) | ||
(bit-shift-left (long (lsb/read-ubyte in)) 48) | ||
(bit-shift-left (long (lsb/read-ubyte in)) 40) | ||
(bit-shift-left (long (lsb/read-ubyte in)) 32) | ||
(bit-shift-left (long (lsb/read-ubyte in)) 24) | ||
(bit-shift-left (long (lsb/read-ubyte in)) 16) | ||
(bit-shift-left (long (lsb/read-ubyte in)) 8) | ||
(long (lsb/read-ubyte in)))))) | ||
|
||
(defn decode-array [^RandomAccessFile in elem-fn] | ||
(let [start (.getFilePointer in) | ||
size (decode-itf8 in)] | ||
(loop [pos start, acc (transient [])] | ||
(if (< (- pos start) size) | ||
(let [e (elem-fn in)] | ||
(recur (.getFilePointer in) (conj! acc e))) | ||
(persistent! acc))))) | ||
|
||
(defn decode-encoding [in] | ||
(let [codec-id (decode-itf8 in) | ||
_n (decode-itf8 in)] | ||
(case codec-id | ||
0 {:codec :null} | ||
1 (let [content-id (decode-itf8 in)] | ||
{:codec :external | ||
:content-id content-id}) | ||
3 (let [alphabet (decode-array in decode-itf8) | ||
bit-len (decode-array in decode-itf8)] | ||
{:codec :huffman, :alphabet alphabet, :bit-len bit-len}) | ||
4 (let [len-encoding (decode-encoding in) | ||
val-encoding (decode-encoding in)] | ||
{:codec :byte-array-len, :len-encoding len-encoding, :val-encoding val-encoding}) | ||
5 (let [stop-byte (lsb/read-byte in) | ||
external-id (decode-itf8 in)] | ||
{:codec :byte-array-stop, :stop-byte stop-byte, :external-id external-id}) | ||
6 (let [offset (decode-itf8 in) | ||
length (decode-itf8 in)] | ||
{:codec :beta, :offset offset, :length length}) | ||
7 (let [offset (decode-itf8 in) | ||
k (decode-itf8 in)] | ||
{:codec :subexp, :offset offset, :k k}) | ||
9 (let [offset (decode-itf8 in)] | ||
{:codec :gamma, :offset offset}) | ||
(throw (ex-info (str "codec " codec-id " not supported") {}))))) | ||
|
||
(defn decode-file-definition [in] | ||
(when-not (Arrays/equals ^bytes (lsb/read-bytes in 4) (.getBytes cram-magic)) | ||
(throw (IOException. "Invalid CRAM file"))) | ||
(let [major (lsb/read-ubyte in) | ||
minor (lsb/read-ubyte in) | ||
file-id (lsb/read-bytes in 20)] | ||
{:version {:major major :minor minor}, :id file-id})) | ||
|
||
(defn decode-container-header [in] | ||
(let [len (lsb/read-int in) | ||
ref-seq-id (decode-itf8 in) | ||
start-pos (decode-itf8 in) | ||
span (decode-itf8 in) | ||
n-records (decode-itf8 in) | ||
counter (decode-ltf8 in) | ||
n-bases (decode-ltf8 in) | ||
n-blocks (decode-itf8 in) | ||
landmarks (decode-array in decode-itf8) | ||
crc (lsb/read-bytes in 4)] | ||
{:length len | ||
:ref ref-seq-id | ||
:start start-pos | ||
:span span | ||
:records n-records | ||
:counter counter | ||
:bases n-bases | ||
:blocks n-blocks | ||
:landmarks landmarks | ||
:crc crc})) | ||
|
||
(def decode-block-data | ||
(let [factory (CompressorStreamFactory.)] | ||
(fn [in method size raw-size] | ||
(let [raw-data (lsb/read-bytes in size)] | ||
(if (zero? method) | ||
raw-data | ||
(let [bais (ByteArrayInputStream. raw-data)] | ||
;; TODO: create appropriate compressor input stream for method | ||
(with-open [is (DataInputStream. (.createCompressorInputStream factory bais))] | ||
(lsb.stream/read-bytes is raw-size)))))))) | ||
|
||
(defn decode-block [in] | ||
(let [method (lsb/read-byte in) | ||
content-type-id (lsb/read-byte in) | ||
content-id (decode-itf8 in) | ||
size (decode-itf8 in) | ||
raw-size (decode-itf8 in) | ||
data (decode-block-data in method size raw-size) | ||
crc (lsb/read-bytes in 4)] | ||
{:method method | ||
:content-type content-type-id | ||
:content-id content-id | ||
:size size | ||
:raw-size raw-size | ||
:data data | ||
:crc crc})) | ||
|
||
(defn decode-cram-header [in] | ||
(let [{:keys [data]} (decode-block in) | ||
bb (bb/make-lsb-byte-buffer data) | ||
size (bb/read-uint bb)] | ||
(bb/read-bytes bb size))) | ||
|
||
(defn decode-tag-dictionary [in] | ||
(letfn [(decode-tags [in] | ||
(loop [acc (transient [])] | ||
(let [c1 (lsb/read-ubyte in)] | ||
(if (zero? c1) | ||
(persistent! acc) | ||
(let [c2 (lsb/read-ubyte in) | ||
t (lsb/read-ubyte in) | ||
tag (keyword (str (char c1) (char c2)))] | ||
(recur (conj! acc {:tag tag, :type (char t)})))))))] | ||
(decode-array in decode-tags))) | ||
|
||
(defn decode-preservation-map [in] | ||
(let [_size (decode-itf8 in) | ||
n (decode-itf8 in)] | ||
(loop [n n, acc (transient {:RN true, :AP true, :RR true})] | ||
(if (zero? n) | ||
(persistent! acc) | ||
(let [k (keyword (String. (lsb/read-bytes in 2))) | ||
v (case k | ||
(:RN :AP :RR) (pos? (lsb/read-byte in)) | ||
:SM (lsb/read-bytes in 5) | ||
:TD (decode-tag-dictionary in))] | ||
(recur (dec n) (assoc! acc k v))))))) | ||
|
||
(defn decode-data-series-encodings [in] | ||
(let [_size (decode-itf8 in) | ||
n (decode-itf8 in)] | ||
(loop [n n, acc (transient {})] | ||
(if (zero? n) | ||
(persistent! acc) | ||
(let [k (keyword (String. (lsb/read-bytes in 2))) | ||
v (decode-encoding in)] | ||
(recur (dec n) (assoc! acc k v))))))) | ||
|
||
(defn decode-tag-encoding-map [in] | ||
(let [_size (decode-itf8 in) | ||
n (decode-itf8 in)] | ||
(loop [n n, acc (transient {})] | ||
(if (zero? n) | ||
(persistent! acc) | ||
(let [k (decode-itf8 in) | ||
c1 (char (bit-and (bit-shift-right k 16) 0xff)) | ||
c2 (char (bit-and (bit-shift-right k 8) 0xff)) | ||
t (char (bit-and k 0xff)) | ||
v (decode-encoding in) | ||
tag (keyword (str c1 c2))] | ||
(recur (dec n) (assoc! acc tag {:type t :encoding v}))))))) | ||
|
||
(defn decode-compression-header [in] | ||
(let [preservation-map (decode-preservation-map in) | ||
data-series-encodings (decode-data-series-encodings in) | ||
tag-encoding-map (decode-tag-encoding-map in)] | ||
{:preservation-map preservation-map | ||
:data-series data-series-encodings | ||
:tags tag-encoding-map})) | ||
|
||
(defn decode-slice-header [in] | ||
(let [ref-seq-id (decode-itf8 in) | ||
start (decode-itf8 in) | ||
span (decode-itf8 in) | ||
n-records (decode-itf8 in) | ||
counter (decode-ltf8 in) | ||
n-blocks (decode-itf8 in) | ||
content-ids (into [] (map (fn [_] (decode-itf8 in))) | ||
(range n-blocks)) | ||
embedded-reference (decode-itf8 in) | ||
reference-md5 (lsb/read-bytes in 16) | ||
tags [(lsb/read-ubyte in) (lsb/read-ubyte in)]] | ||
{:ref-seq-id ref-seq-id | ||
:start start | ||
:span span | ||
:records n-records | ||
:counter counter | ||
:blocks n-blocks | ||
:content-ids content-ids | ||
:embedded-reference embedded-reference | ||
:reference-md5 reference-md5 | ||
:tags tags})) | ||
|