Skip to content

Commit

Permalink
Merge pull request #302 from chrovis/feature/cram-support
Browse files Browse the repository at this point in the history
Add CRAM reader (alpha)
  • Loading branch information
alumi authored Mar 28, 2024
2 parents ec9c199 + 5d3d289 commit 43df52d
Show file tree
Hide file tree
Showing 20 changed files with 2,154 additions and 3 deletions.
1 change: 1 addition & 0 deletions src/cljam/io/bam/decoder.clj
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
(:require [clojure.string :as cstr]
[proton.core :as proton]
[cljam.util :as util]
cljam.io.protocols
[cljam.io.sam.util.quality :as qual]
[cljam.io.sam.util.sequence :as sam-seq]
[cljam.io.sam.util.refs :as refs]
Expand Down
40 changes: 40 additions & 0 deletions src/cljam/io/cram.clj
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
(ns cljam.io.cram
"Alpha - subject to change. Provides functions for reading from a CRAM file."
(:require [cljam.io.cram.core :as cram]
[cljam.io.protocols :as protocols]
[cljam.io.util :as io-util])
(:import [cljam.io.cram.reader CRAMReader]))

(defn reader
"Creates a CRAM reader depending on the argument f: If f is a file or a string
that representing the path to a CRAM file, returns a new reader that reads
that CRAM file. If f is a CRAM reader, creates and returns a cloned CRAM reader
from it.
The function also takes an optional argument `option`, which is a map that
consists of:
- reference: A string representing the path to the reference file, or
a sequence reader that reads sequences from the reference file.
This may be omitted only when the CRAM file to be read does not
require a reference file."
(^CRAMReader [f] (reader f {}))
(^CRAMReader [f option]
(if (io-util/cram-reader? f)
(cram/clone-reader f)
(cram/reader f option))))

(defn read-header
"Returns the header of the CRAM file."
[rdr]
(protocols/read-header rdr))

(defn read-refs
"Returns the references of the CRAM file."
[rdr]
(protocols/read-refs rdr))

(defn read-alignments
"Reads all the alignments from the CRAM file and returns them as a lazy sequence
of record maps."
[rdr]
(protocols/read-alignments rdr))
46 changes: 46 additions & 0 deletions src/cljam/io/cram/core.clj
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
(ns cljam.io.cram.core
(:require [cljam.io.cram.seq-resolver :as resolver]
[cljam.io.cram.reader :as reader.core]
[cljam.io.sam.util.refs :as util.refs]
[cljam.io.util.byte-buffer :as bb]
[cljam.util :as util]
[clojure.java.io :as cio])
(:import [cljam.io.cram.reader CRAMReader]
[java.nio.channels FileChannel]
[java.nio.file OpenOption StandardOpenOption]))

(defn reader
"Creates a new CRAM reader that reads a CRAM file f.
Takes an option map as the second argument. An option map consists of:
- reference: a string representing the path to a reference file"
^CRAMReader [f {:keys [reference]}]
(let [file (cio/file f)
url (util/as-url (.getAbsolutePath file))
ch (FileChannel/open (.toPath file)
(into-array OpenOption [StandardOpenOption/READ]))
bb (bb/allocate-lsb-byte-buffer 256)
seq-resolver (some-> reference resolver/seq-resolver)
header (volatile! nil)
refs (delay (util.refs/make-refs @header))
rdr (reader.core/->CRAMReader url ch bb header refs seq-resolver)]
(reader.core/read-file-definition rdr)
(vreset! header (reader.core/read-header rdr))
rdr))

(defn clone-reader
"Creates a cloned CRAM reader based on the given CRAM reader."
^CRAMReader [^CRAMReader rdr]
(let [url (.-url rdr)
file (cio/as-file url)
ch (FileChannel/open (.toPath file)
(into-array OpenOption [StandardOpenOption/READ]))
bb (bb/allocate-lsb-byte-buffer 256)
seq-resolver (some-> (.-seq-resolver rdr) resolver/clone-seq-resolver)
rdr' (reader.core/->CRAMReader url ch bb
(delay @(.-header rdr))
(delay @(.-refs rdr))
seq-resolver)]
(reader.core/read-file-definition rdr')
(reader.core/skip-container rdr')
rdr'))
136 changes: 136 additions & 0 deletions src/cljam/io/cram/decode/data_series.clj
Original file line number Diff line number Diff line change
@@ -0,0 +1,136 @@
(ns cljam.io.cram.decode.data-series
(:require [cljam.io.cram.itf8 :as itf8]
[cljam.io.util.byte-buffer :as bb]
[clojure.string :as str])
(:import [java.nio Buffer ByteBuffer]))

(defn- data-series-type [ds]
(case ds
(:BF :CF :RI :RL :AP :RG :MF :NS :NP :TS :NF :TL :FN :FP :DL :RS :PD :HC :MQ)
:int

(:FC :BS :BA :QS)
:byte

(:RN :BB :QQ :IN :SC)
:bytes))

(defn- build-codec-decoder
[{:keys [codec] :as params} data-type content-id->block-data]
(case codec
:external
(let [^ByteBuffer block (get content-id->block-data (:content-id params))]
(case data-type
:byte #(.get block)
:int #(itf8/decode-itf8 block)))

:huffman
(let [{:keys [alphabet bit-len]} params]
(assert (and (= (count alphabet) 1)
(zero? (long (first bit-len))))
"Huffman coding for more than one word is not supported yet.")
(constantly (first alphabet)))

:byte-array-len
(let [{:keys [len-encoding val-encoding]} params
len-decoder (build-codec-decoder len-encoding :int content-id->block-data)
val-decoder (build-codec-decoder val-encoding :byte content-id->block-data)]
(fn []
(let [len (len-decoder)
bb (bb/allocate-lsb-byte-buffer len)]
(dotimes [_ len]
(.put bb (byte (val-decoder))))
(.array bb))))

:byte-array-stop
(let [{:keys [stop-byte external-id]} params
^ByteBuffer block (get content-id->block-data external-id)]
(fn []
(.mark ^Buffer block)
(let [start (.position block)
end (long
(loop []
(if (= (.get block) (byte stop-byte))
(.position block)
(recur))))
len (dec (- end start))
_ (.reset ^Buffer block)
ret (bb/read-bytes block len)]
(.get block)
ret)))))

(defn build-data-series-decoders
"Builds decoders for data series based on the encodings specified in the given
compression header and block data.
`ds-encodings` is a map {<data series name> <encoding>} and the return value is
a map {<data series name> <decoder>}, where:
- <data series name>: a keyword representing the data series name
- <encoding>: a map representing the encoding of the data series
- <decoder>: a function with no arguments that returns a value decoded from
the data series upon each call"
[{ds-encodings :data-series} blocks]
(let [content-id->block-data (into {} (map (juxt :content-id :data)) blocks)]
(reduce-kv (fn [decoders ds params]
(let [dt (data-series-type ds)
decoder (build-codec-decoder params dt content-id->block-data)]
(assoc decoders ds decoder)))
{} ds-encodings)))

(defn- tag-value-coercer [tag-type]
(case tag-type
\A #(char (.get ^ByteBuffer %))
\c #(.get ^ByteBuffer %)
\C bb/read-ubyte
\s #(.getShort ^ByteBuffer %)
\S bb/read-ushort
\i #(.getInt ^ByteBuffer %)
\I bb/read-uint
\f #(.getFloat ^ByteBuffer %)
\Z bb/read-null-terminated-string
\H (fn [^ByteBuffer bb]
(let [s (.getBytes ^String (bb/read-null-terminated-string bb))
n (quot (alength s) 2)
arr (byte-array n)]
(dotimes [i n]
(let [b (bit-or (bit-shift-left (Character/digit (aget s (* 2 i)) 16) 4)
(Character/digit (aget s (inc (* 2 i))) 16))]
(aset arr i (byte b))))
arr))
\B (fn [^ByteBuffer bb]
(let [tag-type' (char (.get bb))
len (.getInt bb)
coercer (tag-value-coercer tag-type')
vs (repeatedly len (partial coercer bb))]
(str/join \, (cons tag-type' vs))))))

(defn- build-tag-decoder [tag-encoding tag-type content-id->block-data]
(let [decoder (build-codec-decoder tag-encoding :bytes content-id->block-data)
coercer (tag-value-coercer tag-type)]
(fn []
(let [bb (bb/make-lsb-byte-buffer (decoder))]
(coercer bb)))))

(defn build-tag-decoders
"Builds decoders for tags based on the encodings specified in the given
compression header and block data.
`tags` is a map {<tag name> {<type character> <encoding>}} and the return
value is a map {<tag name> {<type character> <decoder>}}, where:
- <tag name>: a keyword representing the tag name
- <type character>: a character representing a type of the tag
- <encoding>: a map representing the encoding of the tag and type
- <decoder>: a function with no arguments that returns a value decoded from
the data series for the tag upon each call"
[{:keys [tags]} blocks]
(let [content-id->block-data (into {} (map (juxt :content-id :data)) blocks)]
(reduce-kv
(fn [decoders tag m]
(reduce-kv
(fn [decoders tag-type encoding]
(let [decoder (build-tag-decoder encoding tag-type content-id->block-data)
tag-type' (str (if (#{\c \C \s \S \i \I} tag-type) \i tag-type))]
(assoc-in decoders [tag tag-type]
(fn [] {:type tag-type' :value (decoder)}))))
decoders m))
{} tags)))
Loading

0 comments on commit 43df52d

Please sign in to comment.