diff --git a/src/cljam/io/bam/decoder.clj b/src/cljam/io/bam/decoder.clj index 6f1c86da..9278b131 100644 --- a/src/cljam/io/bam/decoder.clj +++ b/src/cljam/io/bam/decoder.clj @@ -3,6 +3,7 @@ (:require [clojure.string :as cstr] [proton.core :as proton] [cljam.util :as util] + cljam.io.protocols [cljam.io.sam.util.quality :as qual] [cljam.io.sam.util.sequence :as sam-seq] [cljam.io.sam.util.refs :as refs] diff --git a/src/cljam/io/cram.clj b/src/cljam/io/cram.clj new file mode 100644 index 00000000..304ecd18 --- /dev/null +++ b/src/cljam/io/cram.clj @@ -0,0 +1,40 @@ +(ns cljam.io.cram + "Alpha - subject to change. Provides functions for reading from a CRAM file." + (:require [cljam.io.cram.core :as cram] + [cljam.io.protocols :as protocols] + [cljam.io.util :as io-util]) + (:import [cljam.io.cram.reader CRAMReader])) + +(defn reader + "Creates a CRAM reader depending on the argument f: If f is a file or a string + that representing the path to a CRAM file, returns a new reader that reads + that CRAM file. If f is a CRAM reader, creates and returns a cloned CRAM reader + from it. + + The function also takes an optional argument `option`, which is a map that + consists of: + - reference: A string representing the path to the reference file, or + a sequence reader that reads sequences from the reference file. + This may be omitted only when the CRAM file to be read does not + require a reference file." + (^CRAMReader [f] (reader f {})) + (^CRAMReader [f option] + (if (io-util/cram-reader? f) + (cram/clone-reader f) + (cram/reader f option)))) + +(defn read-header + "Returns the header of the CRAM file." + [rdr] + (protocols/read-header rdr)) + +(defn read-refs + "Returns the references of the CRAM file." + [rdr] + (protocols/read-refs rdr)) + +(defn read-alignments + "Reads all the alignments from the CRAM file and returns them as a lazy sequence + of record maps." + [rdr] + (protocols/read-alignments rdr)) diff --git a/src/cljam/io/cram/core.clj b/src/cljam/io/cram/core.clj new file mode 100644 index 00000000..7937391d --- /dev/null +++ b/src/cljam/io/cram/core.clj @@ -0,0 +1,46 @@ +(ns cljam.io.cram.core + (:require [cljam.io.cram.seq-resolver :as resolver] + [cljam.io.cram.reader :as reader.core] + [cljam.io.sam.util.refs :as util.refs] + [cljam.io.util.byte-buffer :as bb] + [cljam.util :as util] + [clojure.java.io :as cio]) + (:import [cljam.io.cram.reader CRAMReader] + [java.nio.channels FileChannel] + [java.nio.file OpenOption StandardOpenOption])) + +(defn reader + "Creates a new CRAM reader that reads a CRAM file f. + + Takes an option map as the second argument. An option map consists of: + - reference: a string representing the path to a reference file" + ^CRAMReader [f {:keys [reference]}] + (let [file (cio/file f) + url (util/as-url (.getAbsolutePath file)) + ch (FileChannel/open (.toPath file) + (into-array OpenOption [StandardOpenOption/READ])) + bb (bb/allocate-lsb-byte-buffer 256) + seq-resolver (some-> reference resolver/seq-resolver) + header (volatile! nil) + refs (delay (util.refs/make-refs @header)) + rdr (reader.core/->CRAMReader url ch bb header refs seq-resolver)] + (reader.core/read-file-definition rdr) + (vreset! header (reader.core/read-header rdr)) + rdr)) + +(defn clone-reader + "Creates a cloned CRAM reader based on the given CRAM reader." + ^CRAMReader [^CRAMReader rdr] + (let [url (.-url rdr) + file (cio/as-file url) + ch (FileChannel/open (.toPath file) + (into-array OpenOption [StandardOpenOption/READ])) + bb (bb/allocate-lsb-byte-buffer 256) + seq-resolver (some-> (.-seq-resolver rdr) resolver/clone-seq-resolver) + rdr' (reader.core/->CRAMReader url ch bb + (delay @(.-header rdr)) + (delay @(.-refs rdr)) + seq-resolver)] + (reader.core/read-file-definition rdr') + (reader.core/skip-container rdr') + rdr')) diff --git a/src/cljam/io/cram/decode/data_series.clj b/src/cljam/io/cram/decode/data_series.clj new file mode 100644 index 00000000..38446c97 --- /dev/null +++ b/src/cljam/io/cram/decode/data_series.clj @@ -0,0 +1,136 @@ +(ns cljam.io.cram.decode.data-series + (:require [cljam.io.cram.itf8 :as itf8] + [cljam.io.util.byte-buffer :as bb] + [clojure.string :as str]) + (:import [java.nio Buffer ByteBuffer])) + +(defn- data-series-type [ds] + (case ds + (:BF :CF :RI :RL :AP :RG :MF :NS :NP :TS :NF :TL :FN :FP :DL :RS :PD :HC :MQ) + :int + + (:FC :BS :BA :QS) + :byte + + (:RN :BB :QQ :IN :SC) + :bytes)) + +(defn- build-codec-decoder + [{:keys [codec] :as params} data-type content-id->block-data] + (case codec + :external + (let [^ByteBuffer block (get content-id->block-data (:content-id params))] + (case data-type + :byte #(.get block) + :int #(itf8/decode-itf8 block))) + + :huffman + (let [{:keys [alphabet bit-len]} params] + (assert (and (= (count alphabet) 1) + (zero? (long (first bit-len)))) + "Huffman coding for more than one word is not supported yet.") + (constantly (first alphabet))) + + :byte-array-len + (let [{:keys [len-encoding val-encoding]} params + len-decoder (build-codec-decoder len-encoding :int content-id->block-data) + val-decoder (build-codec-decoder val-encoding :byte content-id->block-data)] + (fn [] + (let [len (len-decoder) + bb (bb/allocate-lsb-byte-buffer len)] + (dotimes [_ len] + (.put bb (byte (val-decoder)))) + (.array bb)))) + + :byte-array-stop + (let [{:keys [stop-byte external-id]} params + ^ByteBuffer block (get content-id->block-data external-id)] + (fn [] + (.mark ^Buffer block) + (let [start (.position block) + end (long + (loop [] + (if (= (.get block) (byte stop-byte)) + (.position block) + (recur)))) + len (dec (- end start)) + _ (.reset ^Buffer block) + ret (bb/read-bytes block len)] + (.get block) + ret))))) + +(defn build-data-series-decoders + "Builds decoders for data series based on the encodings specified in the given + compression header and block data. + + `ds-encodings` is a map { } and the return value is + a map { }, where: + - : a keyword representing the data series name + - : a map representing the encoding of the data series + - : a function with no arguments that returns a value decoded from + the data series upon each call" + [{ds-encodings :data-series} blocks] + (let [content-id->block-data (into {} (map (juxt :content-id :data)) blocks)] + (reduce-kv (fn [decoders ds params] + (let [dt (data-series-type ds) + decoder (build-codec-decoder params dt content-id->block-data)] + (assoc decoders ds decoder))) + {} ds-encodings))) + +(defn- tag-value-coercer [tag-type] + (case tag-type + \A #(char (.get ^ByteBuffer %)) + \c #(.get ^ByteBuffer %) + \C bb/read-ubyte + \s #(.getShort ^ByteBuffer %) + \S bb/read-ushort + \i #(.getInt ^ByteBuffer %) + \I bb/read-uint + \f #(.getFloat ^ByteBuffer %) + \Z bb/read-null-terminated-string + \H (fn [^ByteBuffer bb] + (let [s (.getBytes ^String (bb/read-null-terminated-string bb)) + n (quot (alength s) 2) + arr (byte-array n)] + (dotimes [i n] + (let [b (bit-or (bit-shift-left (Character/digit (aget s (* 2 i)) 16) 4) + (Character/digit (aget s (inc (* 2 i))) 16))] + (aset arr i (byte b)))) + arr)) + \B (fn [^ByteBuffer bb] + (let [tag-type' (char (.get bb)) + len (.getInt bb) + coercer (tag-value-coercer tag-type') + vs (repeatedly len (partial coercer bb))] + (str/join \, (cons tag-type' vs)))))) + +(defn- build-tag-decoder [tag-encoding tag-type content-id->block-data] + (let [decoder (build-codec-decoder tag-encoding :bytes content-id->block-data) + coercer (tag-value-coercer tag-type)] + (fn [] + (let [bb (bb/make-lsb-byte-buffer (decoder))] + (coercer bb))))) + +(defn build-tag-decoders + "Builds decoders for tags based on the encodings specified in the given + compression header and block data. + + `tags` is a map { { }} and the return + value is a map { { }}, where: + - : a keyword representing the tag name + - : a character representing a type of the tag + - : a map representing the encoding of the tag and type + - : a function with no arguments that returns a value decoded from + the data series for the tag upon each call" + [{:keys [tags]} blocks] + (let [content-id->block-data (into {} (map (juxt :content-id :data)) blocks)] + (reduce-kv + (fn [decoders tag m] + (reduce-kv + (fn [decoders tag-type encoding] + (let [decoder (build-tag-decoder encoding tag-type content-id->block-data) + tag-type' (str (if (#{\c \C \s \S \i \I} tag-type) \i tag-type))] + (assoc-in decoders [tag tag-type] + (fn [] {:type tag-type' :value (decoder)})))) + decoders m)) + {} tags))) diff --git a/src/cljam/io/cram/decode/record.clj b/src/cljam/io/cram/decode/record.clj new file mode 100644 index 00000000..cf741ca2 --- /dev/null +++ b/src/cljam/io/cram/decode/record.clj @@ -0,0 +1,356 @@ +(ns cljam.io.cram.decode.record + (:require [cljam.io.cram.seq-resolver.protocol :as resolver] + [cljam.io.sam.util.cigar :as sam.cigar] + [cljam.io.sam.util.flag :as sam.flag] + [cljam.io.util.byte-buffer :as bb])) + +(defn- ref-name [cram-header ^long ref-id] + (or (when (<= 0 ref-id) + (get-in cram-header [:SQ ref-id :SN])) + "*")) + +(defn- read-group-id [cram-header ^long rg-index] + (when (<= 0 rg-index) + (get-in cram-header [:RG rg-index :ID]))) + +(defn- build-positional-data-decoder + [cram-header {:keys [preservation-map]} slice-header {:keys [RI RL AP RG]}] + (let [AP' (if (:AP preservation-map) + (let [pos (volatile! nil)] + (fn [] + (let [pos' (+ (long (or @pos (:start slice-header))) + (long (AP)))] + (vreset! pos pos') + pos'))) + AP) + RI' (if (= (:ref-seq-id slice-header) -2) + RI + (constantly (:ref-seq-id slice-header)))] + (fn [record] + (let [rg (read-group-id cram-header (RG))] + (assoc record + :rname (ref-name cram-header (RI')) + ::len (RL) + :pos (AP') + :options (cond-> [] + rg + (conj {:RG {:type "Z" :value rg}}))))))) + +(defn- build-read-name-decoder [{:keys [preservation-map]} {:keys [RN]}] + (if (:RN preservation-map) + #(assoc % :qname (String. ^bytes (RN))) + (throw (ex-info "Omitted read names are not supported yet." {})))) + +(defn- build-mate-read-decoder [cram-header {:keys [MF NS NP TS NF]}] + (fn [{:keys [rname] :as record}] + (let [flag (long (::flag record))] + (if (pos? (bit-and flag 0x02)) + (let [mate-flag (long (MF)) + bam-flag (cond-> (long (:flag record)) + (pos? (bit-and mate-flag 0x01)) + (bit-or (sam.flag/encoded #{:next-reversed})) + + (pos? (bit-and mate-flag 0x02)) + (bit-or (sam.flag/encoded #{:next-unmapped}))) + rnext (ref-name cram-header (NS))] + (assoc record + :flag bam-flag + :rnext (if (and (= rname rnext) (not= rname "*")) "=" rnext) + :pnext (NP) + :tlen (TS))) + (cond-> record + (pos? (bit-and flag 0x04)) + (assoc ::next-fragment (NF))))))) + +(defn- build-auxiliary-tags-decoder [{:keys [preservation-map]} {:keys [TL]} tag-decoders] + (let [tag-decoder (fn [{tag-type :type :keys [tag]}] + (let [decoder (get-in tag-decoders [tag tag-type])] + (fn [] + {tag (decoder)}))) + decoders (mapv (fn [tags] + (let [decoders (mapv tag-decoder tags)] + (fn [] + (into [] (map #(%)) decoders)))) + (:TD preservation-map))] + (fn [record] + (let [tl (TL) + decoder (nth decoders tl)] + (update record :options into (decoder)))))) + +(defn- record-end [record features] + (->> features + ^long (reduce + (fn [^long ret f] + (case (:code f) + (:insertion :softclip) + (- ret (alength ^bytes (:bases f))) + + (:deletion :ref-skip) + (+ ret (long (:len f))) + + :insert-base + (dec ret) + + ret)) + (::len record)) + dec + (+ (long (:pos record))))) + +(defn- record-seq + [seq-resolver {:keys [preservation-map]} {:keys [rname pos] :as record} features] + (let [region {:chr rname :start pos :end (::end record)} + ref-bases (.getBytes ^String (resolver/resolve-sequence seq-resolver region)) + len (long (::len record)) + bs (byte-array len (byte (int \N))) + subst (:SM preservation-map)] + (loop [[f & more :as fs] features, rpos 0, spos 1] + (if f + (let [fpos (long (:pos f)) + gap (- fpos spos)] + (if (pos? gap) + (do (System/arraycopy ref-bases rpos bs (dec spos) gap) + (recur fs (+ rpos gap) fpos)) + (case (:code f) + :subst + (let [b (char (aget ref-bases rpos)) + b' (get-in subst [b (:subst f)])] + (aset bs (dec fpos) (byte (int b'))) + (recur more (inc rpos) (inc fpos))) + + (:insertion :softclip) + (let [^bytes bs' (:bases f) + n (alength bs')] + (System/arraycopy bs' 0 bs (dec fpos) n) + (recur more rpos (+ fpos n))) + + (:deletion :ref-skip) + (recur more (+ rpos (long (:len f))) fpos) + + :insert-base + (do (aset bs (dec fpos) (byte (:base f))) + (recur more rpos (inc fpos))) + + (recur more rpos spos)))) + (let [gap (- len (dec spos))] + (when (pos? gap) + (System/arraycopy ref-bases rpos bs (dec spos) gap)) + (run! (fn [f] + (case (:code f) + :read-base (aset bs (dec (long (:pos f))) (byte (:base f))) + :bases (let [^bytes bs' (:bases f) + n (alength bs')] + (System/arraycopy bs' 0 bs (dec (long (:pos f))) n)) + nil)) + features) + (String. bs)))))) + +(defn- qual-score ^long [^long qual] + (+ qual 33)) + +(defn- decode-qual ^String [^long len qs-decoder] + (let [bb (bb/allocate-lsb-byte-buffer len)] + (loop [i len, miss 0] + (if (zero? i) + (if (= len miss) + "*" + (String. (.array bb))) + (let [q (qs-decoder)] + (.put bb (byte (qual-score q))) + (recur (dec i) (cond-> miss (= q -1) inc))))))) + +(defn- record-qual [record features {:keys [QS]}] + (let [flag (long (::flag record)) + len (long (::len record))] + (if (pos? (bit-and flag 0x01)) + (decode-qual len QS) + (let [qs (byte-array len) + missing? (reduce + (fn [missing? {:keys [^long pos] :as f}] + (case (:code f) + (:read-base :score) + (do (aset qs (dec pos) (byte (:qual f))) + false) + + :scores + (let [^bytes scores (:scores f)] + (System/arraycopy ^bytes scores 0 qs (dec pos) (alength scores)) + false) + missing?)) + true + features)] + (if missing? + "*" + (do (dotimes [i len] + (aset qs i (byte (qual-score (aget qs i))))) + (String. qs))))))) + +(defn- record-cigar [record features] + (let [read-len (long (::len record))] + (loop [[f & more :as fs] features + pos 0 + acc (transient [])] + (if f + (let [p (long (:pos f)) + gap (- p (inc pos))] + (if (pos? gap) + (recur fs p (conj! acc [gap \M])) + (if-let [[op ^long len] (case (:code f) + (:read-base :subst) [\M 1] + :insertion [\I (count (:bases f))] + :softclip [\S (count (:bases f))] + :hardclip [\H (:len f)] + :padding [\P (:len f)] + :deletion [\D (:len f)] + :ref-skip [\N (:len f)] + :insert-base [\I 1] + nil)] + (let [pos' (if (#{\M \I \S} op) + (+ p (dec len)) + (dec p))] + (recur more pos' (cond-> acc (pos? len) (conj! [len op])))) + (recur more p acc)))) + (let [acc' (cond-> acc + (< pos read-len) + (conj! [(- read-len pos) \M]))] + (some->> (not-empty (persistent! acc')) sam.cigar/simplify)))))) + +(defn- build-read-features-decoder [{:keys [FN FC FP BA QS BS IN SC HC PD DL RS BB QQ]}] + (fn [] + (loop [i (long (FN)) + prev-pos 0 + fs (transient [])] + (if (zero? i) + (persistent! fs) + (let [code (char (FC)) + pos (+ prev-pos (long (FP))) + data (case code + \B {:code :read-base :base (BA) :qual (QS)} + \X {:code :subst :subst (BS)} + \I {:code :insertion :bases (IN)} + \S {:code :softclip :bases (SC)} + \H {:code :hardclip :len (HC)} + \P {:code :padding :len (PD)} + \D {:code :deletion :len (DL)} + \N {:code :ref-skip :len (RS)} + \i {:code :insert-base :base (BA)} + \b {:code :bases :bases (BB)} + \q {:code :scores :scores (QQ)} + \Q {:code :score :score (QS)})] + (recur (dec i) pos (conj! fs (assoc data :pos pos)))))))) + +(defn- build-mapped-read-decoder [seq-resolver compression-header {:keys [MQ] :as decoders}] + (let [features-decoder (build-read-features-decoder decoders)] + (fn [record] + (let [fs (features-decoder) + end (record-end record fs) + record' (assoc record ::end end)] + (assoc record' + :mapq (MQ) + :seq (record-seq seq-resolver compression-header record' fs) + :qual (record-qual record' fs decoders) + :cigar (->> (record-cigar record' fs) + (map (fn [[n op]] (str n op))) + (apply str))))))) + +(defn- build-unmapped-read-decoder [{:keys [BA QS]}] + (fn [record] + (let [len (long (::len record)) + flag (long (::flag record)) + bb (bb/allocate-lsb-byte-buffer len) + _ (dotimes [_ len] + (.put bb (byte (BA)))) + record' (assoc record + :seq (String. (.array bb)) + :mapq 0 + :cigar "*")] + (if (zero? (bit-and flag 0x01)) + record' + (assoc record' :qual (decode-qual len QS)))))) + +(defn build-cram-record-decoder + "Builds a CRAM record decoder. + + Returns a function with no arguments that returns a map representing a decoded + CRAM record upon each call." + [seq-resolver cram-header compression-header slice-header ds-decoders tag-decoders] + (let [{:keys [BF CF]} ds-decoders + pos-decoder (build-positional-data-decoder cram-header + compression-header + slice-header + ds-decoders) + name-decoder (build-read-name-decoder compression-header ds-decoders) + mate-decoder (build-mate-read-decoder cram-header ds-decoders) + tags-decoder (build-auxiliary-tags-decoder compression-header ds-decoders tag-decoders) + mapped-decoder (build-mapped-read-decoder seq-resolver compression-header ds-decoders) + unmapped-decoder (build-unmapped-read-decoder ds-decoders)] + (fn [] + (let [bf (BF) + cf (CF) + record' (->> {:flag bf ::flag cf} + (pos-decoder) + (name-decoder) + (mate-decoder) + (tags-decoder))] + (if (sam.flag/unmapped? bf) + (unmapped-decoder record') + (mapped-decoder record')))))) + +(defn- update-next-mate + [{:keys [^long flag] :as record} {^long mate-flag :flag :as mate}] + (assoc record + :flag (cond-> flag + (sam.flag/reversed? mate-flag) + (bit-or (sam.flag/encoded #{:next-reversed})) + + (sam.flag/unmapped? mate-flag) + (bit-or (sam.flag/encoded #{:next-unmapped}))) + :rnext (if (= (:rname record) (:rname mate)) + "=" + (:rname mate)) + :pnext (:pos mate))) + +(defn- update-mate-records + [{^long s1 :pos :as r1} {^long s2 :pos :as r2}] + (let [e1 (long (::end r1)) + e2 (long (::end r2)) + r1' (update-next-mate r1 r2) + r2' (update-next-mate r2 r1)] + (if (or (sam.flag/unmapped? (:flag r1)) + (sam.flag/unmapped? (:flag r2)) + (not= (:rname r1') (:rname r2'))) + [(assoc r1' :tlen 0) (assoc r2' :tlen 0)] + (if (<= s1 s2) + (let [tlen (inc (- e2 s1))] + [(assoc r1' :tlen tlen) + (assoc r2' :tlen (- tlen))]) + (let [tlen (inc (- e1 s2))] + [(assoc r1' :tlen (- tlen)) + (assoc r2' :tlen tlen)]))))) + +(defn resolve-mate-records + "Resolves mate records in the same slice by updating their flags, tlen, etc." + [^objects records] + (dotimes [i (alength records)] + (let [record (aget records i)] + (when-let [nf (::next-fragment record)] + (let [j (inc (+ i (long nf))) + mate (aget records j) + [record' mate'] (update-mate-records record mate)] + (aset records i record') + (aset records j mate')))))) + +(defn decode-slice-records + "Decodes CRAM records in a slice all at once and returns them as a sequence of maps." + [seq-resolver cram-header compression-header slice-header ds-decoders tag-decoders] + (let [record-decoder (build-cram-record-decoder seq-resolver + cram-header + compression-header + slice-header + ds-decoders + tag-decoders) + n (:records slice-header) + records (object-array n)] + (dotimes [i n] + (aset records i (record-decoder))) + (resolve-mate-records records) + (map #(dissoc % ::flag ::len ::end ::next-fragment) records))) diff --git a/src/cljam/io/cram/decode/structure.clj b/src/cljam/io/cram/decode/structure.clj new file mode 100644 index 00000000..b9940cef --- /dev/null +++ b/src/cljam/io/cram/decode/structure.clj @@ -0,0 +1,250 @@ +(ns cljam.io.cram.decode.structure + (:require [cljam.io.cram.codecs.rans4x8 :as rans] + [cljam.io.cram.itf8 :as itf8] + [cljam.io.sam.util.header :as sam.header] + [cljam.io.util.byte-buffer :as bb]) + (:import [java.io ByteArrayInputStream IOException] + [java.nio Buffer ByteBuffer ByteOrder] + [java.util Arrays] + [org.apache.commons.compress.compressors CompressorStreamFactory])) + +(def ^:private ^:const cram-magic "CRAM") + +(defn- decode-itf8-array [bb] + (let [n (itf8/decode-itf8 bb)] + (loop [i n, acc (transient [])] + (if (zero? i) + (persistent! acc) + (recur (dec i) (conj! acc (itf8/decode-itf8 bb))))))) + +(defn- decode-encoding [^ByteBuffer bb] + (let [codec-id (itf8/decode-itf8 bb) + _n (itf8/decode-itf8 bb)] + (case codec-id + 0 {:codec :null} + 1 (let [content-id (itf8/decode-itf8 bb)] + {:codec :external + :content-id content-id}) + 3 (let [alphabet (decode-itf8-array bb) + bit-len (decode-itf8-array bb)] + {:codec :huffman, :alphabet alphabet, :bit-len bit-len}) + 4 (let [len-encoding (decode-encoding bb) + val-encoding (decode-encoding bb)] + {:codec :byte-array-len, :len-encoding len-encoding, :val-encoding val-encoding}) + 5 (let [stop-byte (.get bb) + external-id (itf8/decode-itf8 bb)] + {:codec :byte-array-stop, :stop-byte stop-byte, :external-id external-id}) + 6 (let [offset (itf8/decode-itf8 bb) + length (itf8/decode-itf8 bb)] + {:codec :beta, :offset offset, :length length}) + 7 (let [offset (itf8/decode-itf8 bb) + k (itf8/decode-itf8 bb)] + {:codec :subexp, :offset offset, :k k}) + 9 (let [offset (itf8/decode-itf8 bb)] + {:codec :gamma, :offset offset}) + (throw (ex-info (str "codec " codec-id " not supported") {}))))) + +(defn decode-file-definition + "Decodes the CRAM file definition from the given byte buffer." + [bb] + (when-not (Arrays/equals ^bytes (bb/read-bytes bb 4) (.getBytes cram-magic)) + (throw (IOException. "Invalid CRAM file"))) + (let [major (bb/read-ubyte bb) + minor (bb/read-ubyte bb) + file-id (String. ^bytes (bb/read-bytes bb 20))] + {:version {:major major :minor minor}, :id file-id})) + +(defn decode-container-header + "Decodes a container header from the given byte buffer." + [^ByteBuffer bb] + (let [len (.getInt bb) + ref-seq-id (itf8/decode-itf8 bb) + start-pos (itf8/decode-itf8 bb) + span (itf8/decode-itf8 bb) + n-records (itf8/decode-itf8 bb) + counter (itf8/decode-ltf8 bb) + n-bases (itf8/decode-ltf8 bb) + n-blocks (itf8/decode-itf8 bb) + landmarks (decode-itf8-array bb) + crc (bb/read-bytes bb 4)] + {:length len + :ref ref-seq-id + :start start-pos + :span span + :records n-records + :counter counter + :bases n-bases + :blocks n-blocks + :landmarks landmarks + :crc crc})) + +(defn eof-container? + "Returns true iff the given container header represents an EOF container." + [container-header] + (and (= (:length container-header) 15) + (= (:ref container-header) -1) + (= (:start container-header) 4542278) + (= (:span container-header) 0) + (= (:records container-header) 0) + (= (:counter container-header) 0) + (= (:bases container-header) 0) + (= (:blocks container-header) 1) + (= (:landmarks container-header) []))) + +(defn- split-buffer ^ByteBuffer [^ByteBuffer bb size] + (let [^Buffer bb' (.order (.slice bb) ByteOrder/LITTLE_ENDIAN)] + (bb/skip bb size) + (.limit bb' size))) + +(def ^:private decode-block-data + (let [factory (CompressorStreamFactory.)] + (fn [^ByteBuffer bb ^long method ^long size ^long raw-size] + (if (zero? size) + (bb/allocate-lsb-byte-buffer 0) + (case method + 0 (split-buffer bb size) + 4 (->> (split-buffer bb size) + rans/decode + bb/make-lsb-byte-buffer) + (let [compressed (bb/read-bytes bb size) + bais (ByteArrayInputStream. compressed) + uncompressed (byte-array raw-size) + bb' (bb/make-lsb-byte-buffer uncompressed) + compressor (case method + 1 CompressorStreamFactory/GZIP + 2 CompressorStreamFactory/BZIP2 + 3 CompressorStreamFactory/LZMA + (throw + (ex-info (str "compression method " method + " not supported") + {:method method})))] + (with-open [is (.createCompressorInputStream factory compressor bais)] + (.read is uncompressed) + bb'))))))) + +(defn decode-block + "Decodes a block from the given byte buffer." + [bb] + (let [method (bb/read-ubyte bb) + content-type-id (bb/read-ubyte bb) + content-id (itf8/decode-itf8 bb) + size (itf8/decode-itf8 bb) + raw-size (itf8/decode-itf8 bb) + data (decode-block-data bb method size raw-size) + crc (bb/read-bytes bb 4)] + {:method method + :content-type content-type-id + :content-id content-id + :size size + :raw-size raw-size + :data data + :crc crc})) + +(defn decode-cram-header-block + "Decodes a CRAM header block from the given byte buffer." + [bb] + (let [{bb' :data} (decode-block bb) + size (bb/read-uint bb')] + (sam.header/parse-header (String. ^bytes (bb/read-bytes bb' size))))) + +(defn- decode-substitution-matrix [bb] + (let [bs (bb/read-bytes bb 5) + all-bases [\A \C \G \T \N]] + (into {} + (map (fn [r ^long b] + [r (zipmap [(bit-and (bit-shift-right b 6) 0x3) + (bit-and (bit-shift-right b 4) 0x3) + (bit-and (bit-shift-right b 2) 0x3) + (bit-and b 0x3)] + (remove #{r} all-bases))]) + all-bases bs)))) + +(defn- decode-tag-dictionary [^ByteBuffer bb] + (let [n (itf8/decode-itf8 bb) + bb' (split-buffer bb n) + decode-tags (fn [bb] + (loop [acc (transient [])] + (let [c1 (long (bb/read-ubyte bb))] + (if (zero? c1) + (persistent! acc) + (let [c2 (bb/read-ubyte bb) + t (bb/read-ubyte bb) + tag (keyword (str (char c1) (char c2)))] + (recur (conj! acc {:tag tag, :type (char t)})))))))] + (loop [acc (transient [])] + (if (.hasRemaining bb') + (recur (conj! acc (decode-tags bb'))) + (persistent! acc))))) + +(defn- decode-preservation-map [^ByteBuffer bb] + (let [_size (itf8/decode-itf8 bb) + n (itf8/decode-itf8 bb)] + (loop [i n, acc (transient {:RN true, :AP true, :RR true})] + (if (zero? i) + (persistent! acc) + (let [k (keyword (String. ^bytes (bb/read-bytes bb 2))) + v (case k + (:RN :AP :RR) (pos? (.get bb)) + :SM (decode-substitution-matrix bb) + :TD (decode-tag-dictionary bb))] + (recur (dec i) (assoc! acc k v))))))) + +(defn- decode-data-series-encodings [bb] + (let [_size (itf8/decode-itf8 bb) + n (itf8/decode-itf8 bb)] + (loop [n n, acc (transient {})] + (if (zero? n) + (persistent! acc) + (let [k (keyword (String. ^bytes (bb/read-bytes bb 2))) + v (decode-encoding bb)] + (recur (dec n) (assoc! acc k v))))))) + +(defn- decode-tag-encoding-map [bb] + (let [_size (itf8/decode-itf8 bb) + n (itf8/decode-itf8 bb)] + (loop [i n, acc {}] + (if (zero? i) + acc + (let [k (itf8/decode-itf8 bb) + c1 (char (bit-and (bit-shift-right k 16) 0xff)) + c2 (char (bit-and (bit-shift-right k 8) 0xff)) + t (char (bit-and k 0xff)) + v (decode-encoding bb) + tag (keyword (str c1 c2))] + (recur (dec i) (assoc-in acc [tag t] v))))))) + +(defn decode-compression-header-block + "Decodes a compression header block from the given byte buffer." + [bb] + (let [{bb' :data} (decode-block bb) + preservation-map (decode-preservation-map bb') + data-series-encodings (decode-data-series-encodings bb') + tag-encoding-map (decode-tag-encoding-map bb')] + {:preservation-map preservation-map + :data-series data-series-encodings + :tags tag-encoding-map})) + +(defn decode-slice-header-block + "Decodes a slice header block from the given byte buffer." + [bb] + (let [{bb' :data} (decode-block bb) + ref-seq-id (itf8/decode-itf8 bb') + start (itf8/decode-itf8 bb') + span (itf8/decode-itf8 bb') + n-records (itf8/decode-itf8 bb') + counter (itf8/decode-ltf8 bb') + n-blocks (itf8/decode-itf8 bb') + content-ids (decode-itf8-array bb') + embedded-reference (itf8/decode-itf8 bb') + reference-md5 (bb/read-bytes bb' 16) + tags []] + {:ref-seq-id ref-seq-id + :start start + :span span + :records n-records + :counter counter + :blocks n-blocks + :content-ids content-ids + :embedded-reference embedded-reference + :reference-md5 reference-md5 + :tags tags})) diff --git a/src/cljam/io/cram/reader.clj b/src/cljam/io/cram/reader.clj new file mode 100644 index 00000000..32d2fba2 --- /dev/null +++ b/src/cljam/io/cram/reader.clj @@ -0,0 +1,121 @@ +(ns cljam.io.cram.reader + (:require [cljam.io.cram.decode.data-series :as ds] + [cljam.io.cram.decode.record :as record] + [cljam.io.cram.decode.structure :as struct] + [cljam.io.protocols :as protocols]) + (:import [java.io Closeable] + [java.nio Buffer ByteBuffer ByteOrder] + [java.nio.channels FileChannel FileChannel$MapMode])) + +(declare read-alignments) + +(deftype CRAMReader [url channel buffer header refs seq-resolver] + Closeable + (close [_] + (when seq-resolver + (.close ^Closeable seq-resolver)) + (.close ^FileChannel channel)) + protocols/IReader + (reader-url [_] url) + (read [this] + (protocols/read-alignments this)) + #_(read [_ option]) + (indexed? [_] false) + protocols/IAlignmentReader + (read-header [_] @header) + (read-refs [_] @refs) + (read-alignments [this] + (read-alignments this)) + #_(read-alignments [_ region]) + #_(read-blocks [_]) + #_(read-blocks [_ region]) + #_(read-blocks [_ region option]) + #_protocols/IRegionReader + #_(read-in-region [_ region]) + #_(read-in-region [_ region option])) + +(defn- read-to-buffer + ([rdr] (read-to-buffer rdr nil)) + ([^CRAMReader rdr limit] + (let [^FileChannel ch (.-channel rdr) + ^Buffer bb (.-buffer rdr)] + (.clear bb) + (.limit bb (or limit (.capacity bb))) + (while (and (.hasRemaining bb) + (< (.position ch) (.size ch))) + (.read ch ^ByteBuffer bb)) + (.flip bb)))) + +(defn read-file-definition + "Reads the CRAM file definition." + [^CRAMReader rdr] + (read-to-buffer rdr 26) + (struct/decode-file-definition (.-buffer rdr))) + +(defn- read-slice-records [^CRAMReader rdr bb compression-header] + (let [slice-header (struct/decode-slice-header-block bb) + blocks (into [] (map (fn [_] (struct/decode-block bb))) + (range (:blocks slice-header))) + ds-decoders (ds/build-data-series-decoders compression-header blocks) + tag-decoders (ds/build-tag-decoders compression-header blocks)] + (record/decode-slice-records (.-seq-resolver rdr) + @(.-header rdr) + compression-header + slice-header + ds-decoders + tag-decoders))) + +(defn- read-container-records [^CRAMReader rdr ^ByteBuffer bb container-header] + (let [container-header-end (.position bb) + compression-header (struct/decode-compression-header-block bb)] + (->> (:landmarks container-header) + (mapcat + (fn [^long landmark] + (.position ^Buffer bb (+ container-header-end landmark)) + (read-slice-records rdr bb compression-header)))))) + +(defn- with-next-container-header [^CRAMReader rdr f] + (let [^FileChannel ch (.-channel rdr) + pos (.position ch) + _ (read-to-buffer rdr) + ^ByteBuffer bb (.-buffer rdr) + container-header (struct/decode-container-header bb) + container-start (+ pos (.position bb)) + container-size (long (:length container-header)) + ret (f container-header container-start)] + (.position ch (+ container-start container-size)) + ret)) + +(defn- read-container-with [^CRAMReader rdr f] + (letfn [(f' [container-header container-start] + (let [container-size (long (:length container-header)) + ^FileChannel ch (.-channel rdr) + bb (-> ch + (.map FileChannel$MapMode/READ_ONLY container-start container-size) + (.order ByteOrder/LITTLE_ENDIAN))] + (f container-header bb)))] + (with-next-container-header rdr f'))) + +(defn skip-container + "Skips the next container." + [rdr] + (with-next-container-header rdr (constantly nil))) + +(defn read-header + "Reads the CRAM header from the CRAM header block." + [^CRAMReader rdr] + (read-container-with rdr (fn [_ bb] (struct/decode-cram-header-block bb)))) + +(defn read-alignments + "Reads all the alignments from the CRAM file and returns them as a lazy + sequence of alignment maps." + [^CRAMReader rdr] + (let [^FileChannel ch (.-channel rdr)] + (letfn [(read1 [container-header bb] + (when-not (struct/eof-container? container-header) + (read-container-records rdr bb container-header))) + (step [] + (when (< (.position ch) (.size ch)) + (when-let [alns (read-container-with rdr read1)] + (concat alns (lazy-seq (step))))))] + (step)))) diff --git a/src/cljam/io/cram/seq_resolver.clj b/src/cljam/io/cram/seq_resolver.clj new file mode 100644 index 00000000..3dee9f99 --- /dev/null +++ b/src/cljam/io/cram/seq_resolver.clj @@ -0,0 +1,22 @@ +(ns cljam.io.cram.seq-resolver + (:require [cljam.io.cram.seq-resolver.protocol :as proto] + [cljam.io.sequence :as cseq]) + (:import [java.io Closeable])) + +(deftype SeqResolver [seq-reader] + java.io.Closeable + (close [_] + (.close ^Closeable seq-reader)) + proto/ISeqResolver + (resolve-sequence [_ region] + (cseq/read-sequence seq-reader region))) + +(defn seq-resolver + "Creates e new sequence resolver from the given sequence file." + [seq-file] + (->SeqResolver (cseq/reader seq-file))) + +(defn clone-seq-resolver + "Creates a cloned sequence resolver based on the given resolver." + [^SeqResolver resolver] + (->SeqResolver (cseq/reader (.-seq-reader resolver)))) diff --git a/src/cljam/io/cram/seq_resolver/protocol.clj b/src/cljam/io/cram/seq_resolver/protocol.clj new file mode 100644 index 00000000..ddebb14d --- /dev/null +++ b/src/cljam/io/cram/seq_resolver/protocol.clj @@ -0,0 +1,11 @@ +(ns cljam.io.cram.seq-resolver.protocol) + +(defprotocol ISeqResolver + (resolve-sequence [this region])) + +(extend-protocol ISeqResolver + nil + (resolve-sequence [_ region] + (throw + (ex-info "reference was not specified, but tried to resolve sequence" + region)))) diff --git a/src/cljam/io/util.clj b/src/cljam/io/util.clj index c8241dba..fb06b0fa 100644 --- a/src/cljam/io/util.clj +++ b/src/cljam/io/util.clj @@ -5,6 +5,7 @@ cljam.io.sam.writer cljam.io.bam.reader cljam.io.bam.writer + cljam.io.cram.reader cljam.io.vcf.reader cljam.io.vcf.writer cljam.io.bcf.reader @@ -50,6 +51,11 @@ [wtr] (instance? cljam.io.bam.writer.BAMWriter wtr)) +(defn cram-reader? + "Checks if given object is an instance of CRAMReader." + [rdr] + (instance? cljam.io.cram.reader.CRAMReader rdr)) + (defn variant-reader? "Checks if given object implements protocol IVariantReader." [rdr] @@ -153,6 +159,7 @@ #"(?i)\.sam$" :sam #"(?i)\.bai$" :bai #"(?i)\.bam$" :bam + #"(?i)\.cram$" :cram #"(?i)\.f(ast)?q" :fastq #"(?i)\.fai$" :fai #"(?i)\.(fa|fasta|fas|fsa|seq|fna|faa|ffn|frn|mpfa)" :fasta @@ -183,6 +190,7 @@ (condp re-find s #"^BAM\01" :bam #"^BAI\01" :bai + #"^CRAM" :cram #"^BCF\02" :bcf #"^TBI\01" :tbi #"^##fileformat=VCF" :vcf diff --git a/test-resources/cram/medium.cram b/test-resources/cram/medium.cram new file mode 100644 index 00000000..f1d64f70 Binary files /dev/null and b/test-resources/cram/medium.cram differ diff --git a/test-resources/cram/medium_with_standard_tags.cram b/test-resources/cram/medium_with_standard_tags.cram new file mode 100644 index 00000000..d7c7525e Binary files /dev/null and b/test-resources/cram/medium_with_standard_tags.cram differ diff --git a/test-resources/cram/test.cram b/test-resources/cram/test.cram new file mode 100644 index 00000000..2e9185e1 Binary files /dev/null and b/test-resources/cram/test.cram differ diff --git a/test/cljam/io/cram/decode/data_series_test.clj b/test/cljam/io/cram/decode/data_series_test.clj new file mode 100644 index 00000000..12048883 --- /dev/null +++ b/test/cljam/io/cram/decode/data_series_test.clj @@ -0,0 +1,425 @@ +(ns cljam.io.cram.decode.data-series-test + (:require [cljam.io.bam.decoder :as decoder] + [cljam.io.cram.decode.data-series :as ds] + [cljam.io.sam.util.cigar :as cigar] + [cljam.io.util.byte-buffer :as bb] + [clojure.test :refer [deftest is testing]])) + +(deftest build-data-series-decoders-test + (let [encodings {:BA {:codec :external, :content-id 1} + :BF {:codec :external, :content-id 2} + :RL {:codec :huffman, :alphabet [151], :bit-len [0]} + :BB {:codec :byte-array-len + :len-encoding {:codec :external, :content-id 3} + :val-encoding {:codec :external, :content-id 4}} + :RN {:codec :byte-array-stop, :stop-byte 0, :external-id 5}} + blocks [{:content-id 1 + :data (bb/make-lsb-byte-buffer (byte-array (.getBytes "ATGC")))} + {:content-id 2 + :data (->> (byte-array [0x80 0xa1 0x80 0x63 0x80 0xa3 0x80 0x63]) + bb/make-lsb-byte-buffer)} + {:content-id 3 + :data (bb/make-lsb-byte-buffer (byte-array [3 5 4 3]))} + {:content-id 4 + :data (bb/make-lsb-byte-buffer (.getBytes "CATCGAACAACTACT"))} + {:content-id 5 + :data (->> "qname001\000qname002\000qname003\000qname004\000" + .getBytes + bb/make-lsb-byte-buffer)}] + {:keys [BA BF RL BB RN]} + (ds/build-data-series-decoders {:data-series encodings} blocks)] + (is (= (map int "ATGC") [(BA) (BA) (BA) (BA)])) + (is (= [0xa1 0x63 0xa3 0x63] [(BF) (BF) (BF) (BF)])) + (is (= [151 151 151 151] [(RL) (RL) (RL) (RL)])) + (is (= ["CAT" "CGAAC" "AACT" "ACT"] + (map #(String. ^bytes %) [(BB) (BB) (BB) (BB)]))) + (is (= ["qname001" "qname002" "qname003" "qname004"] + (map #(String. ^bytes %) [(RN) (RN) (RN) (RN)])))) + (testing "block content interleaves when associated with multiple data series" + (let [encodings {:BB {:codec :byte-array-len + :len-encoding {:codec :external, :content-id 1} + :val-encoding {:codec :external, :content-id 1}} + :DL {:codec :external, :content-id 2} + :HC {:codec :external, :content-id 2}} + blocks [{:content-id 1 + :data (->> (byte-array [3 (int \A) (int \B) (int \C) + 2 (int \D) (int \E) + 1 (int \F)]) + bb/make-lsb-byte-buffer)} + {:content-id 2 + :data (bb/make-lsb-byte-buffer (byte-array [1 2 3 4 5 6]))}] + {:keys [BB DL HC]} + (ds/build-data-series-decoders {:data-series encodings} blocks)] + (is (= ["ABC" "DE" "F"] + (map #(String. ^bytes %) [(BB) (BB) (BB)]))) + (is (= [[1 2] [3 4] [5 6]] + [[(DL) (HC)] + [(DL) (HC)] + [(DL) (HC)]]))))) + +(deftest build-tag-decoders-test + (testing "single values" + (testing "bytes" + (let [encodings {:sb {\c {:codec :byte-array-len + :len-encoding {:codec :huffman + :alphabet [1] + :bit-len [0]} + :val-encoding {:codec :external + :content-id 7561827}}} + :ub {\C {:codec :byte-array-len + :len-encoding {:codec :huffman + :alphabet [1] + :bit-len [0]} + :val-encoding {:codec :external + :content-id 7692867}}}} + blocks [{:content-id 7561827 + :data (bb/make-lsb-byte-buffer (byte-array [0xde 0xed 0xbe 0xef]))} + {:content-id 7692867 + :data (bb/make-lsb-byte-buffer (byte-array [0xca 0xfe 0xba 0xbe]))}] + decoders (ds/build-tag-decoders {:tags encodings} blocks) + sb (get-in decoders [:sb \c]) + ub (get-in decoders [:ub \C])] + (is (= [{:type "i" :value (unchecked-byte 0xde)} + {:type "i" :value (unchecked-byte 0xed)} + {:type "i" :value (unchecked-byte 0xbe)} + {:type "i" :value (unchecked-byte 0xef)}] + [(sb) (sb) (sb) (sb)])) + (is (= [{:type "i" :value 0xca} + {:type "i" :value 0xfe} + {:type "i" :value 0xba} + {:type "i" :value 0xbe}] + [(ub) (ub) (ub) (ub)])))) + (testing "shorts" + (let [encodings {:ss {\s {:codec :byte-array-len + :len-encoding {:codec :huffman + :alphabet [2] + :bit-len [0]} + :val-encoding {:codec :external + :content-id 7566195}}} + :us {\S {:codec :byte-array-len + :len-encoding {:codec :huffman + :alphabet [2] + :bit-len [0]} + :val-encoding {:codec :external + :content-id 7697235}}}} + blocks [{:content-id 7566195 + :data (doto (bb/allocate-lsb-byte-buffer 8) + (.putShort 0x0123) + (.putShort 0x4567) + (.putShort 0x89ab) + (.putShort 0xcdef) + .flip)} + {:content-id 7697235 + :data (doto (bb/allocate-lsb-byte-buffer 8) + (.putShort 0x0123) + (.putShort 0x4567) + (.putShort 0x89ab) + (.putShort 0xcdef) + .flip)}] + decoders (ds/build-tag-decoders {:tags encodings} blocks) + ss (get-in decoders [:ss \s]) + us (get-in decoders [:us \S])] + (is (= [{:type "i" :value 0x0123} + {:type "i" :value 0x4567} + {:type "i" :value (unchecked-short 0x89ab)} + {:type "i" :value (unchecked-short 0xcdef)}] + [(ss) (ss) (ss) (ss)])) + (is (= [{:type "i" :value 0x0123} + {:type "i" :value 0x4567} + {:type "i" :value 0x89ab} + {:type "i" :value 0xcdef}] + [(us) (us) (us) (us)])))) + (testing "ints" + (let [encodings {:si {\i {:codec :byte-array-len + :len-encoding {:codec :huffman + :alphabet [4] + :bit-len [0]} + :val-encoding {:codec :external + :content-id 7563625}}} + :ui {\I {:codec :byte-array-len + :len-encoding {:codec :huffman + :alphabet [4] + :bit-len [0]} + :val-encoding {:codec :external + :content-id 7694665}}}} + blocks [{:content-id 7563625 + :data (doto (bb/allocate-lsb-byte-buffer 16) + (.putInt 0) + (.putInt 0x01234567) + (.putInt 0x89abcdef) + (.putInt 0xffffffff) + .flip)} + {:content-id 7694665 + :data (doto (bb/allocate-lsb-byte-buffer 16) + (.putInt 0) + (.putInt 0x01234567) + (.putInt 0x89abcdef) + (.putInt 0xffffffff) + .flip)}] + decoders (ds/build-tag-decoders {:tags encodings} blocks) + si (get-in decoders [:si \i]) + ui (get-in decoders [:ui \I])] + (is (= [{:type "i" :value 0} + {:type "i" :value 0x01234567} + {:type "i" :value (unchecked-int 0x89abcdef)} + {:type "i" :value -1}] + [(si) (si) (si) (si)])) + (is (= [{:type "i" :value 0} + {:type "i" :value 0x01234567} + {:type "i" :value 0x89abcdef} + {:type "i" :value 0xffffffff}] + [(ui) (ui) (ui) (ui)])))) + (testing "floats" + (let [encodings {:fl {\f {:codec :byte-array-len + :len-encoding {:codec :huffman + :alphabet [4] + :bit-len [0]} + :val-encoding {:codec :external + :content-id 6712422}}}} + blocks [{:content-id 6712422 + :data (doto (bb/allocate-lsb-byte-buffer 16) + (.putFloat 1.0) + (.putFloat 0.75) + (.putFloat -0.5) + (.putFloat 0.0) + .flip)}] + decoders (ds/build-tag-decoders {:tags encodings} blocks) + fl (get-in decoders [:fl \f])] + (is (= [{:type "f" :value 1.0} + {:type "f" :value 0.75} + {:type "f" :value -0.5} + {:type "f" :value 0.0}] + [(fl) (fl) (fl) (fl)])))) + (testing "strings" + (let [encodings {:MC {\Z {:codec :byte-array-stop + :stop-byte 9 + :external-id 5063514}} + :hx {\H {:codec :byte-array-len + :len-encoding {:codec :huffman + :alphabet [9] + :bit-len [0]} + :val-encoding {:codec :external + :content-id 6846536}}}} + blocks [{:content-id 5063514 + :data (->> (str "151M\000\011" + "20S131M\000\011" + "16S74M1D58M2S\000\011" + "151M\000\011") + .getBytes + bb/make-lsb-byte-buffer)} + {:content-id 6846536 + :data (->> (str "01234567\000" + "89abcdef\000" + "cafebabe\000" + "deadbeef\000") + .getBytes + bb/make-lsb-byte-buffer)}] + decoders (ds/build-tag-decoders {:tags encodings} blocks) + MC (get-in decoders [:MC \Z]) + hx (get-in decoders [:hx \H])] + (is (= [{:type "Z" :value "151M"} + {:type "Z" :value "20S131M"} + {:type "Z" :value "16S74M1D58M2S"} + {:type "Z" :value "151M"}] + [(MC) (MC) (MC) (MC)])) + (is (= [{:type "H" :value [0x01 0x23 0x45 0x67]} + {:type "H" :value [0x89 0xab 0xcd 0xef]} + {:type "H" :value [0xca 0xfe 0xba 0xbe]} + {:type "H" :value [0xde 0xad 0xbe 0xef]}] + (map (fn [m] (update m :value (partial map #(bit-and (long %) 0xff)))) + [(hx) (hx) (hx) (hx)])))))) + + (testing "array values" + (testing "byte arrays" + (let [encodings {:sb {\B {:codec :byte-array-len + :len-encoding {:codec :huffman + :alphabet [9] + :bit-len [0]} + :val-encoding {:codec :external + :content-id 7561794}}} + :ub {\B {:codec :byte-array-len + :len-encoding {:codec :huffman + :alphabet [9] + :bit-len [0]} + :val-encoding {:codec :external + :content-id 7692866}}}} + blocks [{:content-id 7561794 + :data (let [bb (bb/allocate-lsb-byte-buffer 36)] + (doseq [vs [[0x00 0x01 0x02 0x03] + [0x7c 0x7d 0x7e 0x7f] + [0x80 0x81 0x82 0x83] + [0xfc 0xfd 0xfe 0xff]]] + (.put bb (byte (int \c))) + (.putInt bb 4) + (doseq [v vs] (.put bb (byte v)))) + (.flip bb))} + {:content-id 7692866 + :data (let [bb (bb/allocate-lsb-byte-buffer 36)] + (doseq [vs [[0x00 0x01 0x02 0x03] + [0x7c 0x7d 0x7e 0x7f] + [0x80 0x81 0x82 0x83] + [0xfc 0xfd 0xfe 0xff]]] + (.put bb (byte (int \C))) + (.putInt bb 4) + (doseq [v vs] (.put bb (byte v)))) + (.flip bb))}] + decoders (ds/build-tag-decoders {:tags encodings} blocks) + sb (get-in decoders [:sb \B]) + ub (get-in decoders [:ub \B])] + (is (= [{:type "B" :value "c,0,1,2,3"} + {:type "B" :value "c,124,125,126,127"} + {:type "B" :value "c,-128,-127,-126,-125"} + {:type "B" :value "c,-4,-3,-2,-1"}] + [(sb) (sb) (sb) (sb)])) + (is (= [{:type "B" :value "C,0,1,2,3"} + {:type "B" :value "C,124,125,126,127"} + {:type "B" :value "C,128,129,130,131"} + {:type "B" :value "C,252,253,254,255"}] + [(ub) (ub) (ub) (ub)])))) + (testing "short arrays" + (let [encodings {:ss {\B {:codec :byte-array-len + :len-encoding {:codec :huffman + :alphabet [9] + :bit-len [0]} + :val-encoding {:codec :external + :content-id 7566146}}} + :us {\B {:codec :byte-array-len + :len-encoding {:codec :huffman + :alphabet [9] + :bit-len [0]} + :val-encoding {:codec :external + :content-id 7697218}}}} + blocks [{:content-id 7566146 + :data (let [bb (bb/allocate-lsb-byte-buffer 36)] + (doseq [vs [[0x0123 0x4567] + [0x7ffe 0x7fff] + [0x8000 0x8001] + [0xfffe 0xffff]]] + (.put bb (byte (int \s))) + (.putInt bb 2) + (doseq [v vs] (.putShort bb v))) + (.flip bb))} + {:content-id 7697218 + :data (let [bb (bb/allocate-lsb-byte-buffer 36)] + (doseq [vs [[0x0123 0x4567] + [0x7ffe 0x7fff] + [0x8000 0x8001] + [0xfffe 0xffff]]] + (.put bb (byte (int \S))) + (.putInt bb 2) + (doseq [v vs] (.putShort bb v))) + (.flip bb))}] + decoders (ds/build-tag-decoders {:tags encodings} blocks) + ss (get-in decoders [:ss \B]) + us (get-in decoders [:us \B])] + (is (= [{:type "B" :value (str "s," 0x0123 "," 0x4567)} + {:type "B" :value (str "s," 0x7ffe "," 0x7fff)} + {:type "B" :value (str "s," (unchecked-short 0x8000) + "," (unchecked-short 0x8001))} + {:type "B" :value (str "s," (unchecked-short 0xfffe) + "," (unchecked-short 0xffff))}] + [(ss) (ss) (ss) (ss)])) + (is (= [{:type "B" :value (str "S," 0x0123 "," 0x4567)} + {:type "B" :value (str "S," 0x7ffe "," 0x7fff)} + {:type "B" :value (str "S," 0x8000 "," 0x8001)} + {:type "B" :value (str "S," 0xfffe "," 0xffff)}] + [(us) (us) (us) (us)])))) + (testing "int arrays" + (let [encodings {:si {\B {:codec :byte-array-len + :len-encoding {:codec :huffman + :alphabet [13] + :bit-len [0]} + :val-encoding {:codec :external + :content-id 7563586}}} + :ui {\B {:codec :byte-array-len + :len-encoding {:codec :huffman + :alphabet [13] + :bit-len [0]} + :val-encoding {:codec :external + :content-id 7694658}}}} + blocks [{:content-id 7563586 + :data (let [bb (bb/allocate-lsb-byte-buffer 52)] + (doseq [vs [[0x01234567 0x76543210] + [0x7ffffffe 0x7fffffff] + [0x80000000 0x80000001] + [0xfffffffe 0xffffffff]]] + (.put bb (byte (int \i))) + (.putInt bb 2) + (doseq [v vs] (.putInt bb v))) + (.flip bb))} + {:content-id 7694658 + :data (let [bb (bb/allocate-lsb-byte-buffer 52)] + (doseq [vs [[0x01234567 0x76543210] + [0x7ffffffe 0x7fffffff] + [0x80000000 0x80000001] + [0xfffffffe 0xffffffff]]] + (.put bb (byte (int \I))) + (.putInt bb 2) + (doseq [v vs] (.putInt bb v))) + (.flip bb))}] + decoders (ds/build-tag-decoders {:tags encodings} blocks) + si (get-in decoders [:si \B]) + ui (get-in decoders [:ui \B])] + (is (= [{:type "B" :value (str "i," 0x01234567 "," 0x76543210)} + {:type "B" :value (str "i," 0x7ffffffe "," 0x7fffffff)} + {:type "B" :value (str "i," (unchecked-int 0x80000000) + "," (unchecked-int 0x80000001))} + {:type "B" :value (str "i," (unchecked-int 0xfffffffe) + "," (unchecked-int 0xffffffff))}] + [(si) (si) (si) (si)])) + (is (= [{:type "B" :value (str "I," 0x01234567 "," 0x76543210)} + {:type "B" :value (str "I," 0x7ffffffe "," 0x7fffffff)} + {:type "B" :value (str "I," 0x80000000 "," 0x80000001)} + {:type "B" :value (str "I," 0xfffffffe "," 0xffffffff)}] + [(ui) (ui) (ui) (ui)]))) + (let [vs ["42M1D7M1D74M1D28M" + "44M2D45M1D58M4S" + "65M1D9M1D75M2S" + "18S76M1D57M"] + encodings {:CG {\B {:codec :byte-array-stop + :stop-byte -1 + :external-id 4409154}}} + blocks [{:content-id 4409154 + :data (let [encoded (map cigar/encode-cigar vs) + bb (bb/allocate-lsb-byte-buffer (+ (->> (apply concat encoded) + count + (* 4)) + (* (count vs) + (+ 1 4 1))))] + (doseq [e encoded] + (.put bb (byte (int \I))) + (.putInt bb (count e)) + (doseq [x e] + (.putInt bb (int x))) + (.put bb (byte -1))) + (.flip bb) + bb)}] + decoders (ds/build-tag-decoders {:tags encodings} blocks) + CG (get-in decoders [:CG \B])] + (is (= (map #(array-map :type "B" :value %) vs) + (map #(update % :value #'decoder/B-I-type-cigar-str->cigar-str) + [(CG) (CG) (CG) (CG)]))))) + (testing "float arrays" + (let [encodings {:fl {\B {:codec :byte-array-len + :len-encoding {:codec :huffman + :alphabet [13] + :bit-len [0]} + :val-encoding {:codec :external + :content-id 6712386}}}} + blocks [{:content-id 6712386 + :data (let [bb (bb/allocate-lsb-byte-buffer 52)] + (doseq [vs [[0.0 0.25] + [0.5 1.0] + [0.0 -0.5] + [-0.75 -1.0]]] + (.put bb (byte (int \f))) + (.putInt bb 2) + (doseq [v vs] (.putFloat bb v))) + (.flip bb))}] + decoders (ds/build-tag-decoders {:tags encodings} blocks) + fl (get-in decoders [:fl \B])] + (is (= [{:type "B" :value "f,0.0,0.25"} + {:type "B" :value "f,0.5,1.0"} + {:type "B" :value "f,0.0,-0.5"} + {:type "B" :value "f,-0.75,-1.0"}] + [(fl) (fl) (fl) (fl)])))))) diff --git a/test/cljam/io/cram/decode/record_test.clj b/test/cljam/io/cram/decode/record_test.clj new file mode 100644 index 00000000..1e30a9c5 --- /dev/null +++ b/test/cljam/io/cram/decode/record_test.clj @@ -0,0 +1,490 @@ +(ns cljam.io.cram.decode.record-test + (:require [cljam.io.cram.decode.record :as record] + [cljam.io.cram.seq-resolver.protocol :as resolver] + [cljam.io.sequence :as cseq] + [cljam.test-common :as common] + [clojure.test :refer [are deftest is testing]])) + +(defn- build-stub-decoders + ([data] (build-stub-decoders data (fn [_ v] v))) + ([data wrap-fn] + (into {} + (map (fn [[k xs]] + (let [xs' (volatile! xs) + f (fn [] + (when (seq @xs') + (let [v (first @xs')] + (vswap! xs' next) + (or (some->> v (wrap-fn k)) + ;; nil in test data is only used for readability + ;; and has no meaning, so just ignore it + (recur)))))] + [k f]))) + data))) + +(defn- build-stub-tag-decoders [data] + (into {} + (map (fn [[k v]] + [k (build-stub-decoders v #(array-map :type (str %1) :value %2))])) + data)) + +(deftest ref-name-test + (let [cram-header {:SQ + [{:SN "chr1"} + {:SN "chr2"} + {:SN "chr3"}]}] + (is (= "chr2" (#'record/ref-name cram-header 1))) + (is (= "*" (#'record/ref-name cram-header -1))))) + +(deftest read-group-id-test + (let [cram-header {:RG + [{:ID "rg001"} + {:ID "rg002"} + {:ID "rg003"}]}] + (is (= "rg002" (#'record/read-group-id cram-header 1))) + (is (nil? (#'record/read-group-id cram-header -1))))) + +(def ^:private test-seq-resolver + (let [seqs (with-open [r (cseq/reader common/test-fa-file)] + (into {} (map (juxt :name :sequence)) (cseq/read-all-sequences r)))] + (reify resolver/ISeqResolver + (resolve-sequence [_ {:keys [chr start end]}] + (let [s (get seqs chr)] + (subs s (dec (long start)) end)))))) + +(deftest record-end-test + (are [?record ?features ?expected] + (= ?expected (#'record/record-end ?record ?features)) + {:pos 2, ::record/len 5} + [] + 6 + + {:pos 2, ::record/len 5} + [{:code :subst, :pos 3, :subst 2}] + 6 + + {:pos 2, ::record/len 5} + [{:code :subst, :pos 2, :subst 3} + {:code :subst, :pos 3, :subst 2}] + 6 + + {:pos 2, ::record/len 5} + [{:code :insert-base, :pos 3, :base (int \A)}] + 5 + + {:pos 2, ::record/len 5} + [{:code :insert-base, :pos 3, :base (int \A)} + {:code :subst, :pos 4, :subst 1}] + 5 + + {:pos 2, ::record/len 5} + [{:code :insertion, :pos 3, :bases (.getBytes "CA")}] + 4 + + {:pos 2, ::record/len 5} + [{:code :subst, :pos 2, :subst 3} + {:code :insertion, :pos 3, :bases (.getBytes "CA")}] + 4 + + {:pos 2, ::record/len 5} + [{:code :deletion, :pos 3, :len 2}] + 8 + + {:pos 2, ::record/len 5} + [{:code :deletion, :pos 3, :len 2} + {:code :subst, :pos 3, :subst 0}] + 8 + + {:pos 2, ::record/len 5} + [{:code :softclip, :pos 4, :bases (.getBytes "TT")}] + 4 + + {:pos 2, ::record/len 5} + [{:code :softclip, :pos 1, :bases (.getBytes "TT")}] + 4)) + +(deftest record-seq-test + (let [compression-header {:preservation-map + {:SM + {\A {0 \C, 1 \G, 2 \T, 3 \N} + \C {0 \G, 1 \T, 2 \N, 3 \A} + \G {0 \T, 1 \N, 2 \A, 3 \C} + \T {0 \N, 1 \A, 2 \C, 3 \G} + \N {0 \A, 1 \C, 2 \G, 3 \T}}}}] + (are [?record ?features ?expected] + (= ?expected (#'record/record-seq test-seq-resolver compression-header ?record ?features)) + {:rname "ref", :pos 2, ::record/len 5, ::record/end 6} + [] + "GCATG" + + {:rname "ref", :pos 2, ::record/len 5, ::record/end 6} + [{:code :subst, :pos 3, :subst 2}] + "GCTTG" + + {:rname "ref", :pos 2, ::record/len 5, ::record/end 6} + [{:code :subst, :pos 2, :subst 3} + {:code :subst, :pos 3, :subst 2}] + "GATTG" + + {:rname "ref", :pos 2, ::record/len 5, ::record/end 5} + [{:code :insert-base, :pos 3, :base (int \A)}] + "GCAAT" + + {:rname "ref", :pos 2, ::record/len 5, ::record/end 5} + [{:code :insert-base, :pos 3, :base (int \A)} + {:code :subst, :pos 4, :subst 1}] + "GCAGT" + + {:rname "ref", :pos 2, ::record/len 5, ::record/end 4} + [{:code :insertion, :pos 3, :bases (.getBytes "CA")}] + "GCCAA" + + {:rname "ref", :pos 2, ::record/len 5, ::record/end 4} + [{:code :subst, :pos 2, :subst 3} + {:code :insertion, :pos 3, :bases (.getBytes "CA")}] + "GACAA" + + {:rname "ref", :pos 2, ::record/len 5, ::record/end 8} + [{:code :deletion, :pos 3, :len 2}] + "GCGTT" + + {:rname "ref", :pos 2, ::record/len 5, ::record/end 8} + [{:code :deletion, :pos 3, :len 2} + {:code :subst, :pos 3, :subst 0}] + "GCTTT" + + {:rname "ref", :pos 2, ::record/len 5, ::record/end 4} + [{:code :softclip, :pos 4, :bases (.getBytes "TT")}] + "GCATT" + + {:rname "ref", :pos 2, ::record/len 5, ::record/end 4} + [{:code :softclip, :pos 1, :bases (.getBytes "TT")}] + "TTGCA"))) + +(deftest record-qual-test + (are [?record ?features ?qual ?expected] + (= ?expected + (#'record/record-qual ?record ?features (build-stub-decoders {:QS ?qual}))) + {::record/flag 0x01, ::record/len 5} + [] + (->> ["ABCDE"] + (mapcat #(.getBytes ^String %)) + (map #(- (long %) 33))) + "ABCDE" + + {::record/flag 0x02, ::record/len 5} + (->> [{:code :read-base, :pos 1, :qual \A} + {:code :read-base, :pos 2, :qual \B} + {:code :read-base, :pos 3, :qual \C} + {:code :read-base, :pos 4, :qual \D} + {:code :read-base, :pos 5, :qual \E}] + (map (fn [f] (update f :qual #(- (int %) 33))))) + [] + "ABCDE" + + {::record/flag 0x04, ::record/len 5} + (->> [{:code :score, :pos 1, :qual \A} + {:code :score, :pos 2, :qual \B} + {:code :score, :pos 3, :qual \C} + {:code :score, :pos 4, :qual \D} + {:code :score, :pos 5, :qual \E}] + (map (fn [f] (update f :qual #(- (int %) 33))))) + [] + "ABCDE" + + {::record/flag 0x08, ::record/len 5} + (->> [{:code :scores, :pos 1, :scores "ABC"} + {:code :scores, :pos 4, :scores "DE"}] + (map (fn [f] + (update f :scores + (fn [qs] + (->> (.getBytes qs) + (map #(- (long %) 33)) + byte-array)))))) + [] + "ABCDE")) + +(deftest record-cigar-test + (are [?features ?expected] + (= ?expected + (->> (#'record/record-cigar {::record/len 5} ?features) + (map (fn [[n op]] (str n op))) + (apply str))) + [] + "5M" + + [{:code :subst, :pos 3, :subst 2}] + "5M" + + [{:code :subst, :pos 2, :subst 3} + {:code :subst, :pos 3, :subst 2}] + "5M" + + [{:code :insert-base, :pos 3, :base (int \A)}] + "2M1I2M" + + [{:code :insert-base, :pos 3, :base (int \A)} + {:code :subst, :pos 4, :subst 1}] + "2M1I2M" + + [{:code :insertion, :pos 3, :bases (.getBytes "CA")}] + "2M2I1M" + + [{:code :subst, :pos 2, :subst 3} + {:code :insertion, :pos 3, :bases (.getBytes "CA")}] + "2M2I1M" + + [{:code :deletion, :pos 3, :len 2}] + "2M2D3M" + + [{:code :deletion, :pos 3, :len 2} + {:code :subst, :pos 3, :subst 0}] + "2M2D3M" + + [{:code :softclip, :pos 4, :bases (.getBytes "TT")}] + "3M2S" + + [{:code :softclip, :pos 1, :bases (.getBytes "TT")}] + "2S3M")) + +(deftest build-read-features-decoder-test + (are [?decoders ?expected] + (let [fs-decoder (#'record/build-read-features-decoder + (build-stub-decoders ?decoders))] + (= ?expected (fs-decoder))) + {:FN [0]} + [] + + {:FN [2] + :FC [(int \B) (int \B)] + :FP [5 10] + :BA [(int \A) (int \T)] + :QS [33 0]} + [{:code :read-base, :pos 5, :base (int \A), :qual 33} + {:code :read-base, :pos 15, :base (int \T), :qual 0}] + + {:FN [2] + :FC [(int \X) (int \X)] + :FP [5 10] + :BS [0 3]} + [{:code :subst, :pos 5, :subst 0} + {:code :subst, :pos 15, :subst 3}] + + {:FN [2] + :FC [(int \I) (int \I)] + :FP [5 10] + :IN ["CA" "TT"]} + [{:code :insertion, :pos 5, :bases "CA"} + {:code :insertion, :pos 15, :bases "TT"}] + + {:FN [2] + :FC [(int \S) (int \S)] + :FP [1 10] + :SC ["CA" "TT"]} + [{:code :softclip, :pos 1, :bases "CA"} + {:code :softclip, :pos 11, :bases "TT"}] + + {:FN [2] + :FC [(int \H) (int \H)] + :FP [1 10] + :HC [2 5]} + [{:code :hardclip, :pos 1, :len 2} + {:code :hardclip, :pos 11, :len 5}] + + {:FN [2] + :FC [(int \P) (int \P)] + :FP [1 10] + :PD [2 5]} + [{:code :padding, :pos 1, :len 2} + {:code :padding, :pos 11, :len 5}] + + {:FN [2] + :FC [(int \D) (int \D)] + :FP [5 10] + :DL [2 5]} + [{:code :deletion, :pos 5, :len 2} + {:code :deletion, :pos 15, :len 5}] + + {:FN [2] + :FC [(int \N) (int \N)] + :FP [5 10] + :RS [2 5]} + [{:code :ref-skip, :pos 5, :len 2} + {:code :ref-skip, :pos 15, :len 5}] + + {:FN [2] + :FC [(int \i) (int \i)] + :FP [5 10] + :BA [(int \A) (int \T)]} + [{:code :insert-base, :pos 5, :base (int \A)} + {:code :insert-base, :pos 15, :base (int \T)}] + + {:FN [2] + :FC [(int \b) (int \b)] + :FP [5 10] + :BB ["CAT" "TAG"]} + [{:code :bases, :pos 5, :bases "CAT"} + {:code :bases, :pos 15, :bases "TAG"}] + + {:FN [2] + :FC [(int \q) (int \q)] + :FP [5 10] + :QQ ["??@" "###"]} + [{:code :scores, :pos 5, :scores "??@"} + {:code :scores, :pos 15, :scores "###"}] + + {:FN [2] + :FC [(int \Q) (int \Q)] + :FP [5 10] + :QS [33 0]} + [{:code :score, :pos 5, :score 33} + {:code :score, :pos 15, :score 0}])) + +(deftest decode-slice-records-test + (testing "mapped reads" + (let [cram-header {:SQ + [{:SN "ref"} + {:SN "ref2"}] + :RG + [{:ID "rg001"} + {:ID "rg002"}]} + compression-header {:preservation-map + {:RN true + :AP true + :RR true + :SM {\A {0 \C, 1 \G, 2 \T, 3 \N} + \C {0 \A, 1 \G, 2 \T, 3 \N} + \G {0 \A, 1 \C, 2 \T, 3 \N} + \T {0 \A, 1 \C, 2 \G, 3 \N} + \N {0 \A, 1 \C, 2 \G, 3 \T}} + :TD [[] + [{:tag :MD, :type \Z} + {:tag :NM, :type \c}]]}} + slice-header {:ref-seq-id 0 + :start 1 + :records 5} + ds-decoders (build-stub-decoders + {:BF [67 67 145 147 65] + :CF [3 5 3 1 3] + :RI [] + :RL [5 5 5 5 5] + :AP [0 4 5 5 5] + :RG [0 0 1 1 -1] + :RN (->> ["q001" "q002" "q003" "q004" "q005"] + (map #(.getBytes ^String %))) + :MF [1 nil 1 nil 2] + :NF [nil 1 nil nil nil] + :NS [0 nil 1 nil -1] + :NP [151 nil 100 nil 0] + :TS [150 nil 0 nil 0] + :TL [1 1 1 1 0] + :FN [1 1 0 2 0] + :FC [(int \X) + (int \S) + nil + (int \I) (int \D) + nil] + :FP [3 + 1 + nil + 2 2 + nil] + :BS [0 nil nil nil nil] + :IN (->> [nil nil nil "A" nil] + (keep #(some-> ^String % (.getBytes)))) + :SC (->> [nil "CC" nil nil nil] + (keep #(some-> ^String % (.getBytes)))) + :DL [nil nil nil 1 nil] + :QS (->> ["HFHHH" "##AAC" "CCCFF" "EBBFF" "AEEEE"] + (mapcat #(.getBytes ^String %)) + (map #(- (long %) 33))) + :MQ [0 15 60 15 0]}) + tag-decoders (build-stub-tag-decoders + {:MD {\Z ["2C2" "3" "5" "3^T2" nil]} + :NM {\c [1 0 0 2 nil]}})] + (is (= [{:qname "q001", :flag 99, :rname "ref", :pos 1, :mapq 0, :cigar "5M" + :rnext "=", :pnext 151, :tlen 150, :seq "AGAAT", :qual "HFHHH" + :options [{:RG {:type "Z", :value "rg001"}} + {:MD {:type "Z", :value "2C2"}} + {:NM {:type "c", :value 1}}]} + {:qname "q002", :flag 99, :rname "ref", :pos 5, :mapq 15, :cigar "2S3M" + :rnext "=", :pnext 15, :tlen 15, :seq "CCTGT", :qual "##AAC" + :options [{:RG {:type "Z", :value "rg001"}} + {:MD {:type "Z", :value "3"}} + {:NM {:type "c", :value 0}}]} + {:qname "q003", :flag 177, :rname "ref", :pos 10, :mapq 60, :cigar "5M" + :rnext "ref2", :pnext 100, :tlen 0, :seq "GATAA", :qual "CCCFF" + :options [{:RG {:type "Z", :value "rg002"}} + {:MD {:type "Z", :value "5"}} + {:NM {:type "c", :value 0}}]} + {:qname "q004", :flag 147, :rname "ref", :pos 15, :mapq 15, :cigar "1M1I1M1D2M" + :rnext "=", :pnext 5, :tlen -15, :seq "GAAAG", :qual "EBBFF" + :options [{:RG {:type "Z", :value "rg002"}} + {:MD {:type "Z", :value "3^T2"}} + {:NM {:type "c", :value 2}}]} + {:qname "q005", :flag 73, :rname "ref", :pos 20, :mapq 0, :cigar "5M" + :rnext "*", :pnext 0, :tlen 0, :seq "CTGTG", :qual "AEEEE" + :options []}] + (record/decode-slice-records test-seq-resolver + cram-header + compression-header + slice-header + ds-decoders + tag-decoders))))) + (testing "unmapped reads" + (let [cram-header {:SQ + [{:SN "ref"} + {:SN "ref2"}]} + compression-header {:preservation-map + {:RN true + :AP true + :RR true + :SM {\A {0 \C, 1 \G, 2 \T, 3 \N} + \C {0 \A, 1 \G, 2 \T, 3 \N} + \G {0 \A, 1 \C, 2 \T, 3 \N} + \T {0 \A, 1 \C, 2 \G, 3 \N} + \N {0 \A, 1 \C, 2 \G, 3 \T}} + :TD [[]]}} + slice-header {:ref-seq-id -1 + :start 0 + :records 5} + ds-decoders (build-stub-decoders + {:BF [77 141 77 141 77] + :CF [3 3 3 3 3] + :RL [5 5 5 5 5] + :AP [0 0 0 0 0] + :RG [-1 -1 -1 -1 -1] + :RN (->> ["q001" "q001" "q002" "q002" "q003"] + (map #(.getBytes ^String %))) + :MF [2 2 2 2 2] + :NS [-1 -1 -1 -1 -1] + :NP [0 0 0 0 0] + :TS [0 0 0 0 0] + :TL [0 0 0 0 0] + :BA (->> ["AATCC" "ATTGT" "TGGTA" "TCTTG" "GCACA"] + (mapcat #(.getBytes ^String %))) + :QS (->> ["CCFFF" "BDFAD" "ADDHF" "DDDFD" "BCCFD"] + (mapcat #(.getBytes ^String %)) + (map #(- (long %) 33)))}) + tag-decoders (build-stub-tag-decoders {})] + (is (= [{:qname "q001", :flag 77, :rname "*", :pos 0, :mapq 0, :cigar "*" + :rnext "*", :pnext 0, :tlen 0, :seq "AATCC", :qual "CCFFF" + :options []} + {:qname "q001", :flag 141, :rname "*", :pos 0, :mapq 0, :cigar "*" + :rnext "*", :pnext 0, :tlen 0, :seq "ATTGT", :qual "BDFAD" + :options []} + {:qname "q002", :flag 77, :rname "*", :pos 0, :mapq 0, :cigar "*" + :rnext "*", :pnext 0, :tlen 0, :seq "TGGTA", :qual "ADDHF" + :options []} + {:qname "q002", :flag 141, :rname "*", :pos 0, :mapq 0, :cigar "*" + :rnext "*", :pnext 0, :tlen 0, :seq "TCTTG", :qual "DDDFD" + :options []} + {:qname "q003", :flag 77, :rname "*", :pos 0, :mapq 0, :cigar "*" + :rnext "*", :pnext 0, :tlen 0, :seq "GCACA", :qual "BCCFD" + :options []}] + (record/decode-slice-records test-seq-resolver + cram-header + compression-header + slice-header + ds-decoders + tag-decoders)))))) diff --git a/test/cljam/io/cram/decode/structure_test.clj b/test/cljam/io/cram/decode/structure_test.clj new file mode 100644 index 00000000..ab1269fa --- /dev/null +++ b/test/cljam/io/cram/decode/structure_test.clj @@ -0,0 +1,164 @@ +(ns cljam.io.cram.decode.structure-test + (:require [cljam.io.cram.decode.structure :as struct] + [cljam.io.sam :as sam] + [cljam.io.util.byte-buffer :as bb] + [cljam.test-common :as common] + [clojure.java.io :as cio] + [clojure.test :refer [deftest is]]) + (:import [java.io FileInputStream] + [java.nio ByteOrder] + [java.nio.channels FileChannel$MapMode])) + +(defn- bytes->vec [bs] + (mapv #(bit-and (long %) 0xff) bs)) + +(defn- decode-container-header [bb] + (-> (struct/decode-container-header bb) + (update :landmarks bytes->vec) + (update :crc bytes->vec))) + +(deftest cram-file-structure-decode-test + (with-open [fis (FileInputStream. (cio/file common/medium-with-standard-tags-cram-file)) + ch (.getChannel fis)] + (let [bb (-> ch + (.map FileChannel$MapMode/READ_ONLY 0 (.size ch)) + (.order ByteOrder/LITTLE_ENDIAN))] + (is (= {:version {:major 3 :minor 0} + :id "file:///tmp/cljam/me"} + (struct/decode-file-definition bb))) + (is (= {:length 278 + :ref -1 + :start 0 + :span 0 + :records 0 + :counter 0 + :bases 0 + :blocks 1 + :landmarks [] + :crc [240 201 140 91]} + (decode-container-header bb))) + (is (= (with-open [r (sam/reader common/medium-bam-file)] + (merge {:HD {:VN "1.6" :SO "unsorted"}} + (sam/read-header r))) + (struct/decode-cram-header-block bb))) + (is (= {:length 362828 + :ref -2 + :start 0 + :span 0 + :records 10000 + :counter 0 + :bases 760000 + :blocks 31 + :landmarks [207] + :crc [125 112 223 241]} + (decode-container-header bb))) + (is (= {:preservation-map + {:RN true + :AP false + :RR true + :SM + {\A {0 \G, 1 \C, 2 \T, 3 \N} + \C {0 \G, 1 \T, 2 \A, 3 \N} + \G {0 \A, 1 \C, 2 \T, 3 \N} + \T {0 \C, 1 \G, 2 \A, 3 \N} + \N {0 \A, 1 \C, 2 \G, 3 \T}} + :TD [[] + [{:tag :MD, :type \Z} {:tag :NM, :type \c}]]} + :data-series + {:BF {:codec :external, :content-id 1} + :CF {:codec :external, :content-id 2} + :RI {:codec :external, :content-id 3} + :RL {:codec :external, :content-id 4} + :AP {:codec :external, :content-id 5} + :RG {:codec :external, :content-id 6} + :RN {:codec :byte-array-stop, :stop-byte 9, :external-id 7} + :NF {:codec :external, :content-id 8} + :MF {:codec :external, :content-id 9} + :NS {:codec :external, :content-id 10} + :NP {:codec :external, :content-id 11} + :TS {:codec :external, :content-id 12} + :TL {:codec :external, :content-id 13} + :MQ {:codec :external, :content-id 16} + :FN {:codec :external, :content-id 17} + :FP {:codec :external, :content-id 18} + :FC {:codec :external, :content-id 19} + :BA {:codec :external, :content-id 22} + :QS {:codec :external, :content-id 23} + :BS {:codec :external, :content-id 24} + :IN {:codec :byte-array-stop, :stop-byte 9, :external-id 25} + :DL {:codec :external, :content-id 26} + :RS {:codec :external, :content-id 27} + :SC {:codec :byte-array-stop, :stop-byte 9, :external-id 28} + :PD {:codec :external, :content-id 29} + :HC {:codec :external, :content-id 30}} + :tags + {:MD + {\Z {:codec :byte-array-stop, :stop-byte 9, :external-id 5063770}} + :NM + {\c + {:codec :byte-array-len + :len-encoding {:codec :huffman, :alphabet [1], :bit-len [0]} + :val-encoding {:codec :external, :content-id 5131619}}}}} + (struct/decode-compression-header-block bb))) + (is (= {:tags [] + :content-ids + [1 2 3 4 5 6 7 8 9 10 11 12 13 16 17 18 19 + 22 23 24 25 26 27 28 29 30 5063770 5131619] + :start 0 + :reference-md5 [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0] + :records 10000 + :counter 0 + :blocks 29 + :embedded-reference -1 + :ref-seq-id -2 + :span 0} + (-> (struct/decode-slice-header-block bb) + (update :reference-md5 bytes->vec)))) + (is (= [{:method 0, :content-type 5, :content-id 0, :size 0, :raw-size 0, :crc [47 7 252 241]} + {:method 4, :content-type 4, :content-id 1, :size 1039, :raw-size 10000, :crc [202 95 54 132]} + {:method 4, :content-type 4, :content-id 2, :size 36, :raw-size 10000, :crc [87 22 235 242]} + {:method 4, :content-type 4, :content-id 3, :size 6684, :raw-size 18308, :crc [1 15 226 78]} + {:method 4, :content-type 4, :content-id 4, :size 36, :raw-size 10000, :crc [166 191 130 64]} + {:method 4, :content-type 4, :content-id 5, :size 31125, :raw-size 33411, :crc [179 8 47 242]} + {:method 4, :content-type 4, :content-id 6, :size 36, :raw-size 50000, :crc [249 83 125 26]} + {:method 1, :content-type 4, :content-id 7, :size 77500, :raw-size 411343, :crc [1 19 161 227]} + {:method 1, :content-type 4, :content-id 8, :size 20, :raw-size 0, :crc [100 132 79 154]} + {:method 1, :content-type 4, :content-id 9, :size 45, :raw-size 10000, :crc [90 129 139 84]} + {:method 4, :content-type 4, :content-id 10, :size 36, :raw-size 50000, :crc [243 129 185 33]} + {:method 1, :content-type 4, :content-id 11, :size 45, :raw-size 10000, :crc [61 217 74 145]} + {:method 4, :content-type 4, :content-id 12, :size 31, :raw-size 10000, :crc [46 244 148 239]} + {:method 1, :content-type 4, :content-id 13, :size 48, :raw-size 10000, :crc [239 168 203 254]} + {:method 1, :content-type 4, :content-id 16, :size 1977, :raw-size 7923, :crc [248 27 134 197]} + {:method 1, :content-type 4, :content-id 17, :size 2046, :raw-size 7923, :crc [61 142 4 22]} + {:method 1, :content-type 4, :content-id 18, :size 3458, :raw-size 4616, :crc [61 182 157 145]} + {:method 1, :content-type 4, :content-id 19, :size 271, :raw-size 4616, :crc [88 186 74 54]} + {:method 4, :content-type 4, :content-id 22, :size 37161, :raw-size 157901, :crc [226 220 219 56]} + {:method 4, :content-type 4, :content-id 23, :size 187340, :raw-size 760000, :crc [227 117 208 184]} + {:method 1, :content-type 4, :content-id 24, :size 1377, :raw-size 4493, :crc [28 234 48 60]} + {:method 1, :content-type 4, :content-id 25, :size 20, :raw-size 0, :crc [252 232 78 0]} + {:method 1, :content-type 4, :content-id 26, :size 47, :raw-size 73, :crc [87 78 224 195]} + {:method 1, :content-type 4, :content-id 27, :size 20, :raw-size 0, :crc [21 59 216 237]} + {:method 1, :content-type 4, :content-id 28, :size 23, :raw-size 3, :crc [186 219 248 202]} + {:method 1, :content-type 4, :content-id 29, :size 20, :raw-size 0, :crc [111 73 18 0]} + {:method 1, :content-type 4, :content-id 30, :size 20, :raw-size 0, :crc [82 112 247 118]} + {:method 4, :content-type 4, :content-id 5063770, :size 10053, :raw-size 43243, :crc [246 219 153 220]} + {:method 4, :content-type 4, :content-id 5131619, :size 1492, :raw-size 7923, :crc [161 9 102 22]}] + (mapv (fn [_] + (-> (struct/decode-block bb) + (update :crc bytes->vec) + (dissoc :data))) + (range 29)))) + (let [container-header (decode-container-header bb)] + (is (= {:length 98171 + :ref -1 + :start 0 + :span 0 + :records 2271 + :counter 10000 + :bases 172596 + :blocks 31 + :landmarks [171] + :crc [111 230 105 210]} + container-header)) + (bb/skip bb (:length container-header)) + (is (struct/eof-container? (decode-container-header bb))))))) diff --git a/test/cljam/io/cram/seq_resolver_test.clj b/test/cljam/io/cram/seq_resolver_test.clj new file mode 100644 index 00000000..25cffb13 --- /dev/null +++ b/test/cljam/io/cram/seq_resolver_test.clj @@ -0,0 +1,21 @@ +(ns cljam.io.cram.seq-resolver-test + (:require [cljam.io.cram.seq-resolver :as resolver] + [cljam.io.cram.seq-resolver.protocol :as proto] + [cljam.test-common :as common] + [clojure.test :refer [are deftest is testing]])) + +(deftest resolve-sequence-test + (testing "seq resolver resolves sequence for specified region" + (let [resolver (resolver/seq-resolver common/test-fa-file) + resolver' (resolver/clone-seq-resolver resolver)] + (are [?region ?expected] + (= ?expected + (proto/resolve-sequence resolver ?region) + (proto/resolve-sequence resolver' ?region)) + {:chr "ref" :start 1 :end 5} + "AGCAT" + + {:chr "unknown" :start 1 :end 5} + nil))) + (testing "resolver can be omitted, but in that case, trying to resolve seq will end up with error" + (is (thrown? Exception (proto/resolve-sequence nil {:chr "ref" :start 1 :end 5}))))) diff --git a/test/cljam/io/cram_test.clj b/test/cljam/io/cram_test.clj new file mode 100644 index 00000000..22f3c374 --- /dev/null +++ b/test/cljam/io/cram_test.clj @@ -0,0 +1,40 @@ +(ns cljam.io.cram-test + (:require [cljam.io.cram :as cram] + [cljam.io.sam :as sam] + [cljam.test-common :as common :refer [deftest-remote + prepare-cavia! + with-before-after]] + [clojure.test :refer [deftest is]])) + +(defn- fixup-bam-aln [aln] + (-> aln + (dissoc :end) + (update :cigar #(if (= % "") "*" %)) + (update :options #(sort-by (comp name key first) %)))) + +(deftest reader-test + (with-open [bam-rdr (sam/reader common/test-bam-file) + cram-rdr (cram/reader common/test-cram-file + {:reference common/test-fa-file}) + cram-rdr' (cram/reader cram-rdr)] + (is (= (sam/read-header bam-rdr) + (dissoc (cram/read-header cram-rdr) :HD) + (dissoc (cram/read-header cram-rdr') :HD))) + (is (= (sam/read-refs bam-rdr) + (cram/read-refs cram-rdr) + (cram/read-refs cram-rdr'))) + (is (= (map fixup-bam-aln (sam/read-alignments bam-rdr)) + (cram/read-alignments cram-rdr) + (cram/read-alignments cram-rdr'))))) + +(deftest-remote reader-with-multiple-containers-test + (with-before-after {:before (prepare-cavia!)} + (with-open [bam-rdr (sam/reader common/medium-bam-file) + cram-rdr (cram/reader common/medium-cram-file + {:reference common/hg19-twobit-file})] + (is (= (sam/read-header bam-rdr) + (dissoc (cram/read-header cram-rdr) :HD))) + (is (= (sam/read-refs bam-rdr) + (cram/read-refs cram-rdr))) + (is (= (map fixup-bam-aln (sam/read-alignments bam-rdr)) + (cram/read-alignments cram-rdr)))))) diff --git a/test/cljam/io/util_test.clj b/test/cljam/io/util_test.clj index 56eb2a57..66d6f2f2 100644 --- a/test/cljam/io/util_test.clj +++ b/test/cljam/io/util_test.clj @@ -75,6 +75,8 @@ "foo.bai" :bai "foo.sam" :sam "foo.SAM" :sam + "foo.cram" :cram + "foo.CRAM" :cram "foo.fa" :fasta "foo.fasta" :fasta "foo.fa.gz" :fasta @@ -125,8 +127,6 @@ (are [?path] (thrown? Exception (io-util/file-type (cio/file ?dir ?path))) "foo.bam.gz" "foo.SAM.gz" - "foo.cram" - "foo.CRAM" "foo.2bit.gz" "foo.bcf.gz") "" @@ -193,6 +193,7 @@ io-util/alignment-reader? true io-util/sam-reader? true io-util/bam-reader? false + io-util/cram-reader? false io-util/variant-reader? false io-util/vcf-reader? false io-util/bcf-reader? false @@ -209,6 +210,7 @@ io-util/alignment-reader? true io-util/sam-reader? false io-util/bam-reader? true + io-util/cram-reader? false io-util/variant-reader? false io-util/vcf-reader? false io-util/bcf-reader? false @@ -230,6 +232,7 @@ io-util/alignment-reader? false io-util/sam-reader? false io-util/bam-reader? false + io-util/cram-reader? false io-util/variant-reader? true io-util/vcf-reader? true io-util/bcf-reader? false @@ -246,6 +249,7 @@ io-util/alignment-reader? false io-util/sam-reader? false io-util/bam-reader? false + io-util/cram-reader? false io-util/variant-reader? true io-util/vcf-reader? false io-util/bcf-reader? true @@ -262,6 +266,7 @@ io-util/alignment-reader? false io-util/sam-reader? false io-util/bam-reader? false + io-util/cram-reader? false io-util/variant-reader? false io-util/vcf-reader? false io-util/bcf-reader? false @@ -278,6 +283,7 @@ io-util/alignment-reader? false io-util/sam-reader? false io-util/bam-reader? false + io-util/cram-reader? false io-util/variant-reader? false io-util/vcf-reader? false io-util/bcf-reader? false @@ -294,6 +300,7 @@ io-util/alignment-reader? false io-util/sam-reader? false io-util/bam-reader? false + io-util/cram-reader? false io-util/variant-reader? false io-util/vcf-reader? false io-util/bcf-reader? false @@ -310,6 +317,7 @@ io-util/alignment-reader? false io-util/sam-reader? false io-util/bam-reader? false + io-util/cram-reader? false io-util/variant-reader? false io-util/vcf-reader? false io-util/bcf-reader? false @@ -326,6 +334,7 @@ io-util/alignment-reader? false io-util/sam-reader? false io-util/bam-reader? false + io-util/cram-reader? false io-util/variant-reader? false io-util/vcf-reader? false io-util/bcf-reader? false @@ -342,6 +351,7 @@ io-util/alignment-reader? false io-util/sam-reader? false io-util/bam-reader? false + io-util/cram-reader? false io-util/variant-reader? false io-util/vcf-reader? false io-util/bcf-reader? false diff --git a/test/cljam/test_common.clj b/test/cljam/test_common.clj index eca6a88d..aeb896d4 100644 --- a/test/cljam/test_common.clj +++ b/test/cljam/test_common.clj @@ -32,7 +32,10 @@ (expand-deftest (with-meta sym {:remote true}) args body)) (defprofile mycavia - {:resources [{:id "large.bam" + {:resources [{:id "hg19.2bit" + :url "https://test.chrov.is/data/refs/hg19.2bit" + :sha1 "95e5806aee9ecc092d30a482aaa008ef66cbc468"} + {:id "large.bam" :url "https://test.chrov.is/data/GSM721144_H3K36me3.nodup.bam" :sha1 "ad282c3779120057abc274ad8fad1910a4ad867b"} {:id "large.bai" @@ -160,6 +163,12 @@ (def test-bai-file "test-resources/bam/test.sorted.bam.bai") (def test-large-bai-file (cavia/resource mycavia "large.bai")) +;; ### CRAM file + +(def test-cram-file "test-resources/cram/test.cram") +(def medium-cram-file "test-resources/cram/medium.cram") +(def medium-with-standard-tags-cram-file "test-resources/cram/medium_with_standard_tags.cram") + ;; ### FASTA files (def test-fa-file "test-resources/fasta/test.fa") @@ -183,6 +192,7 @@ (def test-twobit-be-file "test-resources/twobit/be-test.2bit") (def test-twobit-be-n-file "test-resources/twobit/be-test-n.2bit") (def medium-twobit-file "test-resources/twobit/medium.2bit") +(def hg19-twobit-file (cavia/resource mycavia "hg19.2bit")) ;; ### FASTQ files