-
Notifications
You must be signed in to change notification settings - Fork 12
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #302 from chrovis/feature/cram-support
Add CRAM reader (alpha)
- Loading branch information
Showing
20 changed files
with
2,154 additions
and
3 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,40 @@ | ||
(ns cljam.io.cram | ||
"Alpha - subject to change. Provides functions for reading from a CRAM file." | ||
(:require [cljam.io.cram.core :as cram] | ||
[cljam.io.protocols :as protocols] | ||
[cljam.io.util :as io-util]) | ||
(:import [cljam.io.cram.reader CRAMReader])) | ||
|
||
(defn reader
  "Returns a CRAM reader for f. When f is a file or a string naming the path to
  a CRAM file, a new reader over that file is created. When f is already a CRAM
  reader, a cloned CRAM reader is created from it and returned instead.

  An optional second argument `option` may be given. It is a map that consists
  of:
    - reference: A string representing the path to the reference file, or
                 a sequence reader that reads sequences from the reference file.
                 This may be omitted only when the CRAM file to be read does not
                 require a reference file.

  `option` is only passed on when a new reader is created; it is not consulted
  when f is an existing CRAM reader."
  (^CRAMReader [f]
   (reader f {}))
  (^CRAMReader [f option]
   (let [existing-reader? (io-util/cram-reader? f)]
     (if-not existing-reader?
       (cram/reader f option)
       (cram/clone-reader f)))))
|
||
(defn read-header
  "Returns the header of the CRAM file read by `rdr`. Delegates to
  `protocols/read-header`."
  [rdr]
  (protocols/read-header rdr))
|
||
(defn read-refs
  "Returns the references of the CRAM file read by `rdr`. Delegates to
  `protocols/read-refs`."
  [rdr]
  (protocols/read-refs rdr))
|
||
(defn read-alignments
  "Reads all the alignments from the CRAM file behind `rdr` and returns them as
  a lazy sequence of record maps. Delegates to `protocols/read-alignments`."
  [rdr]
  (protocols/read-alignments rdr))
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,46 @@ | ||
(ns cljam.io.cram.core | ||
(:require [cljam.io.cram.seq-resolver :as resolver] | ||
[cljam.io.cram.reader :as reader.core] | ||
[cljam.io.sam.util.refs :as util.refs] | ||
[cljam.io.util.byte-buffer :as bb] | ||
[cljam.util :as util] | ||
[clojure.java.io :as cio]) | ||
(:import [cljam.io.cram.reader CRAMReader] | ||
[java.nio.channels FileChannel] | ||
[java.nio.file OpenOption StandardOpenOption])) | ||
|
||
(defn reader
  "Creates a new CRAM reader that reads a CRAM file f.

  Takes an option map as the second argument. An option map consists of:
    - reference: a string representing the path to a reference file. It is
                 handed to `resolver/seq-resolver`; when it is nil the reader
                 is created without a sequence resolver.

  The file definition and the header are read before the reader is returned.
  If anything throws after the file channel has been opened, the channel is
  closed before the exception is rethrown, so a failed open does not leak the
  file handle."
  ^CRAMReader [f {:keys [reference]}]
  (let [file (cio/file f)
        url (util/as-url (.getAbsolutePath file))
        ^FileChannel ch (FileChannel/open (.toPath file)
                                          (into-array OpenOption [StandardOpenOption/READ]))]
    (try
      (let [bb (bb/allocate-lsb-byte-buffer 256)
            seq-resolver (some-> reference resolver/seq-resolver)
            ;; The header can only be read through the reader itself, so the
            ;; reader is built around an empty volatile that is filled in below.
            header (volatile! nil)
            ;; refs are derived from the header lazily, on first deref.
            refs (delay (util.refs/make-refs @header))
            rdr (reader.core/->CRAMReader url ch bb header refs seq-resolver)]
        (reader.core/read-file-definition rdr)
        (vreset! header (reader.core/read-header rdr))
        rdr)
      (catch Throwable e
        ;; NOTE(review): a seq-resolver created before the failure is not
        ;; released here -- confirm whether it owns resources of its own.
        (try
          (.close ch)
          (catch Throwable e'
            ;; Keep the original failure as the primary exception.
            (.addSuppressed e e')))
        (throw e)))))
|
||
(defn clone-reader
  "Creates a cloned CRAM reader based on the given CRAM reader.

  The clone opens its own file channel and byte buffer on the same URL, and
  clones the sequence resolver of `rdr` when it has one. Header and refs are
  not re-read: the clone holds delays that dereference the ones held by `rdr`.
  After reading the file definition, one container is skipped.

  If anything throws after the file channel has been opened, the channel is
  closed before the exception is rethrown, so a failed clone does not leak the
  file handle."
  ^CRAMReader [^CRAMReader rdr]
  (let [url (.-url rdr)
        file (cio/as-file url)
        ^FileChannel ch (FileChannel/open (.toPath file)
                                          (into-array OpenOption [StandardOpenOption/READ]))]
    (try
      (let [bb (bb/allocate-lsb-byte-buffer 256)
            seq-resolver (some-> (.-seq-resolver rdr) resolver/clone-seq-resolver)
            rdr' (reader.core/->CRAMReader url ch bb
                                           (delay @(.-header rdr))
                                           (delay @(.-refs rdr))
                                           seq-resolver)]
        (reader.core/read-file-definition rdr')
        ;; NOTE(review): presumably the skipped container is the one holding
        ;; the header, which is shared from `rdr` instead -- confirm.
        (reader.core/skip-container rdr')
        rdr')
      (catch Throwable e
        ;; NOTE(review): a cloned seq-resolver created before the failure is
        ;; not released here -- confirm whether it owns resources of its own.
        (try
          (.close ch)
          (catch Throwable e'
            ;; Keep the original failure as the primary exception.
            (.addSuppressed e e')))
        (throw e)))))
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,136 @@ | ||
(ns cljam.io.cram.decode.data-series | ||
(:require [cljam.io.cram.itf8 :as itf8] | ||
[cljam.io.util.byte-buffer :as bb] | ||
[clojure.string :as str]) | ||
(:import [java.nio Buffer ByteBuffer])) | ||
|
||
(defn- data-series-type
  "Maps a data series keyword `ds` to the kind of value it is decoded as:
  `:byte`, `:bytes` or `:int`. There is no default clause, so a keyword that
  is not listed makes `case` throw an IllegalArgumentException."
  [ds]
  (case ds
    ;; single-byte values
    (:FC :BS :BA :QS)
    :byte

    ;; byte-array values
    (:RN :BB :QQ :IN :SC)
    :bytes

    ;; integer values
    (:BF :CF :RI :RL :AP :RG :MF :NS :NP :TS :NF :TL :FN :FP :DL :RS :PD :HC :MQ)
    :int))
|
||
(defn- build-codec-decoder
  "Builds a zero-argument decoder function for one encoding.

  `params` is the encoding map; its `:codec` selects the decoding strategy and
  the remaining keys are codec specific. `data-type` is only consulted by the
  `:external` codec. `content-id->block-data` maps a block content id to the
  ByteBuffer holding that block's data.

  Supported codecs:
    - :external        reads from the block found under `:content-id`; one byte
                       per call for `:byte`, one ITF8 integer per call for
                       `:int` (no other data type is handled)
    - :huffman         only the degenerate single-symbol form with a zero bit
                       length; the decoder always returns that symbol
    - :byte-array-len  decodes a length with `:len-encoding`, then that many
                       bytes one at a time with `:val-encoding`, and returns
                       them as a byte array
    - :byte-array-stop reads bytes from the block found under `:external-id`
                       up to, and not including, `:stop-byte`; the stop byte
                       itself is consumed

  Any other codec makes `case` throw, since there is no default clause."
  [{:keys [codec] :as params} data-type content-id->block-data]
  (case codec
    :external
    (let [^ByteBuffer src (get content-id->block-data (:content-id params))]
      (case data-type
        :byte (fn [] (.get src))
        :int (fn [] (itf8/decode-itf8 src))))

    :huffman
    (let [{:keys [alphabet bit-len]} params]
      (assert (and (= (count alphabet) 1)
                   (zero? (long (first bit-len))))
              "Huffman coding for more than one word is not supported yet.")
      (constantly (first alphabet)))

    :byte-array-len
    (let [{:keys [len-encoding val-encoding]} params
          decode-len (build-codec-decoder len-encoding :int content-id->block-data)
          decode-val (build-codec-decoder val-encoding :byte content-id->block-data)]
      (fn []
        (let [n (decode-len)
              out (bb/allocate-lsb-byte-buffer n)]
          (dotimes [_ n]
            (.put out (byte (decode-val))))
          (.array out))))

    :byte-array-stop
    (let [{:keys [stop-byte external-id]} params
          ^ByteBuffer src (get content-id->block-data external-id)]
      (fn []
        ;; Remember where the value starts so that, once its length is known,
        ;; the buffer can be rewound and the value read in one go.
        (.mark ^Buffer src)
        (let [start (.position src)
              ;; Scan forward until the stop byte has just been consumed.
              _ (while (not= (.get src) (byte stop-byte)))
              end (long (.position src))
              ;; `end` is one past the stop byte, hence the extra decrement.
              len (dec (- end start))]
          (.reset ^Buffer src)
          (let [ret (bb/read-bytes src len)]
            ;; Step over the stop byte so the next call starts at a fresh value.
            (.get src)
            ret))))))
|
||
(defn build-data-series-decoders
  "Builds decoders for data series based on the encodings specified in the given
  compression header and block data.

  The first argument is a map whose `:data-series` entry is a map
  {<data series name> <encoding>}; `blocks` is a collection of maps with
  `:content-id` and `:data` entries. The return value is a map
  {<data series name> <decoder>}, where:
    - <data series name>: a keyword representing the data series name
    - <encoding>: a map representing the encoding of the data series
    - <decoder>: a function with no arguments that returns a value decoded from
                 the data series upon each call"
  [{ds-encodings :data-series} blocks]
  (let [block-data-by-id (into {} (map (juxt :content-id :data)) blocks)
        add-decoder (fn [acc ds params]
                      (->> (build-codec-decoder params (data-series-type ds) block-data-by-id)
                           (assoc acc ds)))]
    (reduce-kv add-decoder {} ds-encodings)))
|
||
(defn- tag-value-coercer
  "Returns a function that takes a ByteBuffer and reads one tag value of the
  type denoted by the character `tag-type` from it:
    - \\A          one byte, returned as a char
    - \\c \\s \\i    signed byte / short / int
    - \\C \\S \\I    unsigned byte / short / int (via the byte-buffer helpers)
    - \\f          float
    - \\Z          null-terminated string
    - \\H          null-terminated hex string, returned as a byte array with one
                  byte per pair of hex digits (a trailing unpaired digit is
                  ignored)
    - \\B          array: a subtype character, an int element count, then that
                  many elements of the subtype; returned as a comma-joined
                  string that starts with the subtype character

  Any other type character makes `case` throw, since there is no default
  clause."
  [tag-type]
  (case tag-type
    \A #(char (.get ^ByteBuffer %))
    \c #(.get ^ByteBuffer %)
    \C bb/read-ubyte
    \s #(.getShort ^ByteBuffer %)
    \S bb/read-ushort
    \i #(.getInt ^ByteBuffer %)
    \I bb/read-uint
    \f #(.getFloat ^ByteBuffer %)
    \Z bb/read-null-terminated-string
    \H (fn [^ByteBuffer bb]
         (let [s (.getBytes ^String (bb/read-null-terminated-string bb))
               n (quot (alength s) 2)
               arr (byte-array n)]
           (dotimes [i n]
             (let [b (bit-or (bit-shift-left (Character/digit (aget s (* 2 i)) 16) 4)
                             (Character/digit (aget s (inc (* 2 i))) 16))]
               ;; b ranges over 0..255. `byte` is range-checked and throws for
               ;; anything above 127, so hex pairs from "80" upwards must be
               ;; narrowed with `unchecked-byte` instead.
               (aset arr i (unchecked-byte b))))
           arr))
    \B (fn [^ByteBuffer bb]
         (let [tag-type' (char (.get bb))
               len (.getInt bb)
               coercer (tag-value-coercer tag-type')
               ;; Lazy, but fully realized by `str/join` below before returning.
               vs (repeatedly len (partial coercer bb))]
           (str/join \, (cons tag-type' vs))))))
|
||
(defn- build-tag-decoder
  "Builds a zero-argument decoder for one tag: each call decodes the raw bytes
  of the next value with the codec described by `tag-encoding`, wraps them in
  a byte buffer and converts them with the coercer for `tag-type`."
  [tag-encoding tag-type content-id->block-data]
  (let [read-raw (build-codec-decoder tag-encoding :bytes content-id->block-data)
        coerce (tag-value-coercer tag-type)]
    (fn []
      (-> (read-raw)
          bb/make-lsb-byte-buffer
          coerce))))
|
||
(defn build-tag-decoders
  "Builds decoders for tags based on the encodings specified in the given
  compression header and block data.

  The first argument is a map whose `:tags` entry is a map
  {<tag name> {<type character> <encoding>}}; `blocks` is a collection of maps
  with `:content-id` and `:data` entries. The return value is a map
  {<tag name> {<type character> <decoder>}}, where:
    - <tag name>: a keyword representing the tag name
    - <type character>: a character representing a type of the tag
    - <encoding>: a map representing the encoding of the tag and type
    - <decoder>: a function with no arguments that returns a map
                 {:type <type string> :value <decoded value>} upon each call

  In the returned `:type`, all integer type characters (c, C, s, S, i, I) are
  reported as \"i\"; every other type is reported as its own character."
  [{:keys [tags]} blocks]
  (let [block-data-by-id (into {} (map (juxt :content-id :data)) blocks)
        integer-types #{\c \C \s \S \i \I}
        add-decoder (fn [tag acc tag-type encoding]
                      (let [decode (build-tag-decoder encoding tag-type block-data-by-id)
                            reported-type (str (if (integer-types tag-type) \i tag-type))]
                        (assoc-in acc [tag tag-type]
                                  (fn [] {:type reported-type :value (decode)}))))]
    (reduce-kv (fn [acc tag type->encoding]
                 (reduce-kv (partial add-decoder tag) acc type->encoding))
               {} tags)))
Oops, something went wrong.