Skip to content

Commit

Permalink
wip
Browse files Browse the repository at this point in the history
  • Loading branch information
athos committed Feb 27, 2024
1 parent c4b8f23 commit ecea33f
Show file tree
Hide file tree
Showing 3 changed files with 108 additions and 87 deletions.
5 changes: 2 additions & 3 deletions src/cljam/io/cram/core.clj
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
{:clj-kondo/ignore [:missing-docstring]}
(:require [cljam.io.cram.seq-resolver :as resolver]
[cljam.io.cram.reader :as reader.core]
[cljam.io.sam.util.refs :as util.refs]
[cljam.io.util.byte-buffer :as bb]
[cljam.util :as util]
[clojure.java.io :as cio])
Expand All @@ -17,9 +18,7 @@
bb (bb/allocate-lsb-byte-buffer 256)
seq-resolver (some-> reference resolver/seq-resolver)
header (volatile! nil)
refs (delay
(mapv (fn [{:keys [SN LN]}] {:name SN :len LN})
(:SQ @header)))
refs (delay (util.refs/make-refs @header))
rdr (reader.core/->CRAMReader url ch bb header refs seq-resolver)]
(reader.core/read-file-definition rdr)
(vreset! header (reader.core/read-header rdr))
Expand Down
173 changes: 94 additions & 79 deletions src/cljam/io/cram/decode/record.clj
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,15 @@
[cljam.io.cram.seq-resolver.protocol :as resolver]
[cljam.io.util.byte-buffer :as bb]))

(defn- ref-name [cram-header ^long ref-id]
(or (when (<= 0 ref-id)
(get-in cram-header [:SQ ref-id :SN]))
"*"))

(defn- read-group-id [cram-header ^long rg-index]
(when (<= 0 rg-index)
(get-in cram-header [:RG rg-index :ID])))

(defn- build-positional-data-decoder
[cram-header {:keys [preservation-map]} slice-header {:keys [RI RL AP RG]}]
(let [AP' (if (:AP preservation-map)
Expand All @@ -19,13 +28,13 @@
RI
(constantly (:ref-seq-id slice-header)))]
(fn [record]
(let [rg (RG)]
(let [rg (read-group-id cram-header (RG))]
(assoc record
:rname (get-in cram-header [:SQ (RI') :SN])
:rname (ref-name cram-header (RI'))
::len (RL)
:pos (AP')
:options (cond-> []
(not= rg -1)
rg
(conj {:RG {:type "Z" :value rg}})))))))

(defn- build-read-name-decoder [{:keys [preservation-map]} {:keys [RN]}]
Expand All @@ -34,7 +43,7 @@
(throw (ex-info "Omitted read names are not supported yet." {}))))

(defn- build-mate-read-decoder [cram-header {:keys [MF NS NP TS NF]}]
(fn [record]
(fn [{:keys [rname] :as record}]
(let [flag (long (::flag record))]
(if (pos? (bit-and flag 0x02))
(let [mate-flag (long (MF))
Expand All @@ -44,10 +53,10 @@

(pos? (bit-and mate-flag 0x02))
(bit-or (sam.flag/encoded #{:next-unmapped})))
rnext (get-in cram-header [:SQ (NS) :SN])]
rnext (ref-name cram-header (NS))]
(assoc record
:flag bam-flag
:rnext (if (= (:rname record) rnext) "=" rnext)
:rnext (if (and (= rname rnext) (not= rname "*")) "=" rnext)
:pnext (NP)
:tlen (TS)))
(cond-> record
Expand All @@ -69,6 +78,25 @@
decoder (nth decoders tl)]
(assoc record :options (decoder))))))

(defn- record-end [record features]
(->> features
^long (reduce
(fn [^long ret f]
(case (:code f)
(:insertion :softclip)
(- ret (alength ^bytes (:bases f)))

(:deletion :ref-skip)
(+ ret (long (:len f)))

:insert-base
(dec ret)

ret))
(::len record))
dec
(+ (long (:pos record)))))

(defn- record-seq
[seq-resolver {:keys [preservation-map]} {:keys [rname pos] :as record} features]
(let [region {:chr rname :start pos :end (::end record)}
Expand Down Expand Up @@ -117,14 +145,20 @@
features)
(String. bs))))))

(defn- qual-score ^long [^long qual]
(+ qual 33))

(defn- decode-qual ^String [len qs-decoder]
(let [bb (bb/allocate-lsb-byte-buffer len)]
(dotimes [_ len]
(.put bb (byte (qual-score (qs-decoder)))))
(String. (.array bb))))

(defn- record-qual [record features {:keys [QS]}]
(let [flag (long (::flag record))
len (long (::len record))]
(if (pos? (bit-and flag 0x01))
(let [bb (bb/allocate-lsb-byte-buffer len)]
(dotimes [_ len]
(.put bb (byte (+ 33 (long (QS))))))
(String. (.array bb)))
(decode-qual len QS)
(let [qs (byte-array len)]
(run! (fn [{:keys [^long pos] :as f}]
(case (:code f)
Expand All @@ -133,77 +167,47 @@
(System/arraycopy ^bytes scores 0 qs (dec pos) (alength scores)))
nil))
features)
(dotimes [i len]
(aset qs i (byte (qual-score (aget qs i)))))
(String. qs)))))

(defn- features->cigar [^long read-len features]
(loop [[f & more :as fs] features
pos 0
acc (transient [])]
(if f
(let [p (long (:pos f))
gap (- p (inc pos))]
(if (pos? gap)
(recur fs p (conj! acc [gap \M]))
(if-let [[op ^long len] (case (:code f)
(:read-base :subst) [\M 1]
:insertion [\I (count (:bases f))]
:softclip [\S (count (:bases f))]
:hardclip [\H (:len f)]
:padding [\P (:len f)]
:deletion [\D (:len f)]
:ref-skip [\N (:len f)]
:insert-base [\I 1]
nil)]
(let [pos' (if (#{\M \I \S} op)
(+ p (dec len))
(dec p))]
(recur more pos' (cond-> acc (pos? len) (conj! [len op]))))
(recur more p acc))))
(let [acc' (cond-> acc
(< pos read-len)
(conj! [(- read-len pos) \M]))]
(some->> (not-empty (persistent! acc')) sam.cigar/simplify)))))

(defn- record-end [record features]
(->> features
^long (reduce
(fn [^long ret f]
(case (:code f)
(:insertion :softclip)
(- ret (alength ^bytes (:bases f)))

(:deletion :ref-skip)
(+ ret (long (:len f)))

:insert-base
(dec ret)

ret))
(::len record))
dec
(+ (long (:pos record)))))
(defn- record-cigar [record features]
(let [read-len (long (::len record))]
(loop [[f & more :as fs] features
pos 0
acc (transient [])]
(if f
(let [p (long (:pos f))
gap (- p (inc pos))]
(if (pos? gap)
(recur fs p (conj! acc [gap \M]))
(if-let [[op ^long len] (case (:code f)
(:read-base :subst) [\M 1]
:insertion [\I (count (:bases f))]
:softclip [\S (count (:bases f))]
:hardclip [\H (:len f)]
:padding [\P (:len f)]
:deletion [\D (:len f)]
:ref-skip [\N (:len f)]
:insert-base [\I 1]
nil)]
(let [pos' (if (#{\M \I \S} op)
(+ p (dec len))
(dec p))]
(recur more pos' (cond-> acc (pos? len) (conj! [len op]))))
(recur more p acc))))
(let [acc' (cond-> acc
(< pos read-len)
(conj! [(- read-len pos) \M]))]
(some->> (not-empty (persistent! acc')) sam.cigar/simplify))))))

(defn- build-mapped-read-decoder
[seq-resolver
compression-header
{:keys [FN MQ QS FC FP BA BS IN SC HC PD DL RS BB QQ] :as decoders}]
(fn [record]
(defn- build-read-features-decoder [{:keys [FN FC FP BA QS BS IN SC HC PD DL RS BB QQ]}]
(fn []
(loop [i (long (FN))
prev-pos 0
fs (transient [])]
(if (zero? i)
(let [features (persistent! fs)
len (::len record)
end (record-end record features)
record' (assoc record ::end end)]
(assoc record'
:seq (record-seq seq-resolver compression-header record' features)
:qual (record-qual record' features decoders)
:mapq (MQ)
:cigar (->> (or (features->cigar len features)
[[len \M]])
(map (fn [[n op]] (str n op)))
(apply str))))
(persistent! fs)
(let [code (char (FC))
pos (+ prev-pos (long (FP)))
data (case code
Expand All @@ -221,6 +225,20 @@
\Q {:code :score :score (QS)})]
(recur (dec i) pos (conj! fs (assoc data :pos pos))))))))

(defn- build-mapped-read-decoder [seq-resolver compression-header {:keys [MQ] :as decoders}]
(let [features-decoder (build-read-features-decoder decoders)]
(fn [record]
(let [fs (features-decoder)
end (record-end record fs)
record' (assoc record ::end end)]
(assoc record'
:mapq (MQ)
:seq (record-seq seq-resolver compression-header record' fs)
:qual (record-qual record' fs decoders)
:cigar (->> (record-cigar record' fs)
(map (fn [[n op]] (str n op)))
(apply str)))))))

(defn- build-unmapped-read-decoder [{:keys [BA QS]}]
(fn [record]
(let [len (long (::len record))
Expand All @@ -231,13 +249,10 @@
record' (assoc record
:seq (String. (.array bb))
:mapq 0
:cigar "")]
:cigar "*")]
(if (zero? (bit-and flag 0x01))
record'
(let [bb (bb/allocate-lsb-byte-buffer len)]
(dotimes [_ len]
(.put bb (byte (+ 33 (long (QS))))))
(assoc record' :qual (String. (.array bb))))))))
(assoc record' :qual (decode-qual len QS))))))

(defn build-cram-record-decoder
[seq-resolver cram-header compression-header slice-header ds-decoders tag-decoders]
Expand Down
17 changes: 12 additions & 5 deletions src/cljam/io/cram/decode/structure.clj
Original file line number Diff line number Diff line change
Expand Up @@ -93,20 +93,27 @@

(def ^:private decode-block-data
(let [factory (CompressorStreamFactory.)]
(fn [^ByteBuffer bb method ^long size ^long raw-size]
(fn [^ByteBuffer bb ^long method ^long size ^long raw-size]
(if (zero? size)
(bb/allocate-lsb-byte-buffer 0)
;; TODO: decompress data with specified method
(case (long method)
(case method
0 (split-buffer bb size)
4 (->> (split-buffer bb size)
rans/decode
bb/make-lsb-byte-buffer)
(let [compressed (bb/read-bytes bb size)
bais (ByteArrayInputStream. compressed)
uncompressed (byte-array raw-size)
bb' (bb/make-lsb-byte-buffer uncompressed)]
(with-open [is (.createCompressorInputStream factory bais)]
bb' (bb/make-lsb-byte-buffer uncompressed)
compressor (case method
1 CompressorStreamFactory/GZIP
2 CompressorStreamFactory/BZIP2
3 CompressorStreamFactory/LZMA
(throw
(ex-info (str "compression method " method
" not supported")
{:method method})))]
(with-open [is (.createCompressorInputStream factory compressor bais)]
(.read is uncompressed)
bb')))))))

Expand Down

0 comments on commit ecea33f

Please sign in to comment.