diff --git a/src/cljam/io/cram.clj b/src/cljam/io/cram.clj new file mode 100644 index 00000000..f99a2406 --- /dev/null +++ b/src/cljam/io/cram.clj @@ -0,0 +1,284 @@ +(ns cljam.io.cram + {:clj-kondo/ignore [:missing-docstring]} + (:require [cljam.io.util.byte-buffer :as bb] + [cljam.io.util.lsb.data-io :as lsb] + [cljam.io.util.lsb.io-stream :as lsb.stream]) + (:import [java.io + ByteArrayInputStream + DataInputStream + IOException + RandomAccessFile] + [java.util Arrays] + [org.apache.commons.compress.compressors CompressorStreamFactory])) + +(def ^:const cram-magic "CRAM") + +(defn decode-itf8 ^long [in] + (let [b (long (lsb/read-ubyte in))] + (cond (zero? (bit-and b 0x80)) + b + + (zero? (bit-and b 0x40)) + (bit-or (bit-shift-left (bit-and b 0x7f) 8) + (long (lsb/read-ubyte in))) + + (zero? (bit-and b 0x20)) + (bit-or (bit-shift-left (bit-and b 0x3f) 16) + (bit-shift-left (long (lsb/read-ubyte in)) 8) + (long (lsb/read-ubyte in))) + + (zero? (bit-and b 0x10)) + (bit-or (bit-shift-left (bit-and b 0x1f) 24) + (bit-shift-left (long (lsb/read-ubyte in)) 16) + (bit-shift-left (long (lsb/read-ubyte in)) 8) + (long (lsb/read-ubyte in))) + + :else + (bit-or (bit-shift-left (bit-and b 0x0f) 28) + (bit-shift-left (long (lsb/read-ubyte in)) 20) + (bit-shift-left (long (lsb/read-ubyte in)) 12) + (bit-shift-left (long (lsb/read-ubyte in)) 4) + (bit-and (long (lsb/read-ubyte in)) 0x0f))))) + +(defn decode-ltf8 ^long [in] + (let [b (long (lsb/read-ubyte in))] + (cond (zero? (bit-and b 0x80)) + b + + (zero? (bit-and b 0x40)) + (bit-or (bit-shift-left (bit-and b 0x7f) 8) + (long (lsb/read-ubyte in))) + + (zero? (bit-and b 0x20)) + (bit-or (bit-shift-left (bit-and b 0x3f) 16) + (bit-shift-left (long (lsb/read-ubyte in)) 8) + (long (lsb/read-ubyte in))) + + (zero? (bit-and b 0x10)) + (bit-or (bit-shift-left (bit-and b 0x1f) 24) + (bit-shift-left (long (lsb/read-ubyte in)) 16) + (bit-shift-left (long (lsb/read-ubyte in)) 8) + (long (lsb/read-ubyte in))) + + (zero? (bit-and b 0x08)) + (bit-or (bit-shift-left (bit-and b 0x07) 32) + (bit-shift-left (long (lsb/read-ubyte in)) 24) + (bit-shift-left (long (lsb/read-ubyte in)) 16) + (bit-shift-left (long (lsb/read-ubyte in)) 8) + (long (lsb/read-ubyte in))) + + (zero? (bit-and b 0x04)) + (bit-or (bit-shift-left (bit-and b 0x03) 40) + (bit-shift-left (long (lsb/read-ubyte in)) 32) + (bit-shift-left (long (lsb/read-ubyte in)) 24) + (bit-shift-left (long (lsb/read-ubyte in)) 16) + (bit-shift-left (long (lsb/read-ubyte in)) 8) + (long (lsb/read-ubyte in))) + + (zero? (bit-and b 0x02)) + (bit-or (bit-shift-left (bit-and b 0x01) 48) + (bit-shift-left (long (lsb/read-ubyte in)) 40) + (bit-shift-left (long (lsb/read-ubyte in)) 32) + (bit-shift-left (long (lsb/read-ubyte in)) 24) + (bit-shift-left (long (lsb/read-ubyte in)) 16) + (bit-shift-left (long (lsb/read-ubyte in)) 8) + (long (lsb/read-ubyte in))) + + (zero? (bit-and b 0x01)) + (bit-or (bit-shift-left (long (lsb/read-ubyte in)) 48) + (bit-shift-left (long (lsb/read-ubyte in)) 40) + (bit-shift-left (long (lsb/read-ubyte in)) 32) + (bit-shift-left (long (lsb/read-ubyte in)) 24) + (bit-shift-left (long (lsb/read-ubyte in)) 16) + (bit-shift-left (long (lsb/read-ubyte in)) 8) + (long (lsb/read-ubyte in))) + + :else + (bit-or (bit-shift-left (long (lsb/read-ubyte in)) 56) + (bit-shift-left (long (lsb/read-ubyte in)) 48) + (bit-shift-left (long (lsb/read-ubyte in)) 40) + (bit-shift-left (long (lsb/read-ubyte in)) 32) + (bit-shift-left (long (lsb/read-ubyte in)) 24) + (bit-shift-left (long (lsb/read-ubyte in)) 16) + (bit-shift-left (long (lsb/read-ubyte in)) 8) + (long (lsb/read-ubyte in)))))) + +(defn decode-array [^RandomAccessFile in elem-fn] + (let [start (.getFilePointer in) + size (decode-itf8 in)] + (loop [pos start, acc (transient [])] + (if (< (- pos start) size) + (let [e (elem-fn in)] + (recur (.getFilePointer in) (conj! acc e))) + (persistent! acc))))) + +(defn decode-encoding [in] + (let [codec-id (decode-itf8 in) + _n (decode-itf8 in)] + (case codec-id + 0 {:codec :null} + 1 (let [content-id (decode-itf8 in)] + {:codec :external + :content-id content-id}) + 3 (let [alphabet (decode-array in decode-itf8) + bit-len (decode-array in decode-itf8)] + {:codec :huffman, :alphabet alphabet, :bit-len bit-len}) + 4 (let [len-encoding (decode-encoding in) + val-encoding (decode-encoding in)] + {:codec :byte-array-len, :len-encoding len-encoding, :val-encoding val-encoding}) + 5 (let [stop-byte (lsb/read-byte in) + external-id (decode-itf8 in)] + {:codec :byte-array-stop, :stop-byte stop-byte, :external-id external-id}) + 6 (let [offset (decode-itf8 in) + length (decode-itf8 in)] + {:codec :beta, :offset offset, :length length}) + 7 (let [offset (decode-itf8 in) + k (decode-itf8 in)] + {:codec :subexp, :offset offset, :k k}) + 9 (let [offset (decode-itf8 in)] + {:codec :gamma, :offset offset}) + (throw (ex-info (str "codec " codec-id " not supported") {}))))) + +(defn decode-file-definition [in] + (when-not (Arrays/equals ^bytes (lsb/read-bytes in 4) (.getBytes cram-magic)) + (throw (IOException. "Invalid CRAM file"))) + (let [major (lsb/read-ubyte in) + minor (lsb/read-ubyte in) + file-id (lsb/read-bytes in 20)] + {:version {:major major :minor minor}, :id file-id})) + +(defn decode-container-header [in] + (let [len (lsb/read-int in) + ref-seq-id (decode-itf8 in) + start-pos (decode-itf8 in) + span (decode-itf8 in) + n-records (decode-itf8 in) + counter (decode-ltf8 in) + n-bases (decode-ltf8 in) + n-blocks (decode-itf8 in) + landmarks (decode-array in decode-itf8) + crc (lsb/read-bytes in 4)] + {:length len + :ref ref-seq-id + :start start-pos + :span span + :records n-records + :counter counter + :bases n-bases + :blocks n-blocks + :landmarks landmarks + :crc crc})) + +(def decode-block-data + (let [factory (CompressorStreamFactory.)] + (fn [in method size raw-size] + (let [raw-data (lsb/read-bytes in size)] + (if (zero? method) + raw-data + (let [bais (ByteArrayInputStream. raw-data)] + ;; TODO: create appropriate compressor input stream for method + (with-open [is (DataInputStream. (.createCompressorInputStream factory bais))] + (lsb.stream/read-bytes is raw-size)))))))) + +(defn decode-block [in] + (let [method (lsb/read-byte in) + content-type-id (lsb/read-byte in) + content-id (decode-itf8 in) + size (decode-itf8 in) + raw-size (decode-itf8 in) + data (decode-block-data in method size raw-size) + crc (lsb/read-bytes in 4)] + {:method method + :content-type content-type-id + :content-id content-id + :size size + :raw-size raw-size + :data data + :crc crc})) + +(defn decode-cram-header [in] + (let [{:keys [data]} (decode-block in) + bb (bb/make-lsb-byte-buffer data) + size (bb/read-uint bb)] + (bb/read-bytes bb size))) + +(defn decode-tag-dictionary [in] + (letfn [(decode-tags [in] + (loop [acc (transient [])] + (let [c1 (lsb/read-ubyte in)] + (if (zero? c1) + (persistent! acc) + (let [c2 (lsb/read-ubyte in) + t (lsb/read-ubyte in) + tag (keyword (str (char c1) (char c2)))] + (recur (conj! acc {:tag tag, :type (char t)})))))))] + (decode-array in decode-tags))) + +(defn decode-preservation-map [in] + (let [_size (decode-itf8 in) + n (decode-itf8 in)] + (loop [n n, acc (transient {:RN true, :AP true, :RR true})] + (if (zero? n) + (persistent! acc) + (let [k (keyword (String. (lsb/read-bytes in 2))) + v (case k + (:RN :AP :RR) (pos? (lsb/read-byte in)) + :SM (lsb/read-bytes in 5) + :TD (decode-tag-dictionary in))] + (recur (dec n) (assoc! acc k v))))))) + +(defn decode-data-series-encodings [in] + (let [_size (decode-itf8 in) + n (decode-itf8 in)] + (loop [n n, acc (transient {})] + (if (zero? n) + (persistent! acc) + (let [k (keyword (String. (lsb/read-bytes in 2))) + v (decode-encoding in)] + (recur (dec n) (assoc! acc k v))))))) + +(defn decode-tag-encoding-map [in] + (let [_size (decode-itf8 in) + n (decode-itf8 in)] + (loop [n n, acc (transient {})] + (if (zero? n) + (persistent! acc) + (let [k (decode-itf8 in) + c1 (char (bit-and (bit-shift-right k 16) 0xff)) + c2 (char (bit-and (bit-shift-right k 8) 0xff)) + t (char (bit-and k 0xff)) + v (decode-encoding in) + tag (keyword (str c1 c2))] + (recur (dec n) (assoc! acc tag {:type t :encoding v}))))))) + +(defn decode-compression-header [in] + (let [preservation-map (decode-preservation-map in) + data-series-encodings (decode-data-series-encodings in) + tag-encoding-map (decode-tag-encoding-map in)] + {:preservation-map preservation-map + :data-series data-series-encodings + :tags tag-encoding-map})) + +(defn decode-slice-header [in] + (let [ref-seq-id (decode-itf8 in) + start (decode-itf8 in) + span (decode-itf8 in) + n-records (decode-itf8 in) + counter (decode-ltf8 in) + n-blocks (decode-itf8 in) + content-ids (into [] (map (fn [_] (decode-itf8 in))) + (range n-blocks)) + embedded-reference (decode-itf8 in) + reference-md5 (lsb/read-bytes in 16) + tags [(lsb/read-ubyte in) (lsb/read-ubyte in)]] + {:ref-seq-id ref-seq-id + :start start + :span span + :records n-records + :counter counter + :blocks n-blocks + :content-ids content-ids + :embedded-reference embedded-reference + :reference-md5 reference-md5 + :tags tags})) +