Skip to content

Commit

Permalink
Merge pull request #311 from chrovis/feature/cram-region-read-without…
Browse files Browse the repository at this point in the history
…-index

Support for reading alignments in specific region without CRAI index file
  • Loading branch information
niyarin authored May 17, 2024
2 parents 90227f3 + 3623b37 commit 11d39b0
Show file tree
Hide file tree
Showing 7 changed files with 111 additions and 48 deletions.
34 changes: 21 additions & 13 deletions src/cljam/io/cram/core.clj
Original file line number Diff line number Diff line change
@@ -1,12 +1,13 @@
(ns cljam.io.cram.core
(:require [cljam.io.crai :as crai]
[cljam.io.cram.reader :as reader]
[cljam.io.cram.seq-resolver :as resolver]
[cljam.io.cram.reader :as reader.core]
[cljam.io.sam.util.refs :as util.refs]
[cljam.io.util.byte-buffer :as bb]
[cljam.util :as util]
[clojure.java.io :as cio])
(:import [cljam.io.cram.reader CRAMReader]
[java.io FileNotFoundException]
[java.nio.channels FileChannel]
[java.nio.file OpenOption StandardOpenOption]))

Expand All @@ -23,11 +24,17 @@
bb (bb/allocate-lsb-byte-buffer 256)
seq-resolver (some-> reference resolver/seq-resolver)
header (volatile! nil)
refs (delay (util.refs/make-refs @header))
idx (delay (crai/read-index (str f ".crai") @refs))
rdr (reader.core/->CRAMReader url ch bb header refs idx seq-resolver)]
(reader.core/read-file-definition rdr)
(vreset! header (reader.core/read-header rdr))
refs (delay (vec (util.refs/make-refs @header)))
offset (volatile! nil)
idx (delay
(try
(crai/read-index (str f ".crai") @refs)
(catch FileNotFoundException _
nil)))
rdr (reader/->CRAMReader url ch bb header refs offset idx seq-resolver)]
(reader/read-file-definition rdr)
(vreset! header (reader/read-header rdr))
(vreset! offset (.position ch))
rdr))

(defn clone-reader
Expand All @@ -39,11 +46,12 @@
(into-array OpenOption [StandardOpenOption/READ]))
bb (bb/allocate-lsb-byte-buffer 256)
seq-resolver (some-> (.-seq-resolver rdr) resolver/clone-seq-resolver)
rdr' (reader.core/->CRAMReader url ch bb
(delay @(.-header rdr))
(delay @(.-refs rdr))
(delay @(.-index rdr))
seq-resolver)]
(reader.core/read-file-definition rdr')
(reader.core/skip-container rdr')
rdr' (reader/->CRAMReader url ch bb
(delay @(.-header rdr))
(delay @(.-refs rdr))
(delay @(.-offset rdr))
(delay @(.-index rdr))
seq-resolver)]
(reader/read-file-definition rdr')
(reader/skip-container rdr')
rdr'))
4 changes: 2 additions & 2 deletions src/cljam/io/cram/decode/structure.clj
Original file line number Diff line number Diff line change
Expand Up @@ -68,7 +68,7 @@
landmarks (decode-itf8-array bb)
crc (bb/read-bytes bb 4)]
{:length len
:ref ref-seq-id
:ref-seq-id ref-seq-id
:start start-pos
:span span
:records n-records
Expand All @@ -82,7 +82,7 @@
"Returns true iff the given container header represents an EOF container."
[container-header]
(and (= (:length container-header) 15)
(= (:ref container-header) -1)
(= (:ref-seq-id container-header) -1)
(= (:start container-header) 4542278)
(= (:span container-header) 0)
(= (:records container-header) 0)
Expand Down
96 changes: 70 additions & 26 deletions src/cljam/io/cram/reader.clj
Original file line number Diff line number Diff line change
Expand Up @@ -5,13 +5,13 @@
[cljam.io.cram.decode.structure :as struct]
[cljam.io.protocols :as protocols]
[cljam.util.intervals :as intervals])
(:import [java.io Closeable FileNotFoundException]
(:import [java.io Closeable]
[java.nio Buffer ByteBuffer ByteOrder]
[java.nio.channels FileChannel FileChannel$MapMode]))

(declare read-alignments read-alignments-in-region)

(deftype CRAMReader [url channel buffer header refs index seq-resolver]
(deftype CRAMReader [url channel buffer header refs offset index seq-resolver]
Closeable
(close [_]
(when seq-resolver
Expand All @@ -24,11 +24,7 @@
(read [this region]
(protocols/read-alignments this region))
(indexed? [_]
(try
@index
true
(catch FileNotFoundException _
false)))
(boolean @index))
protocols/IAlignmentReader
(read-header [_] @header)
(read-refs [_] @refs)
Expand Down Expand Up @@ -63,9 +59,8 @@
(read-to-buffer rdr 26)
(struct/decode-file-definition (.-buffer rdr)))

(defn- read-slice-records [^CRAMReader rdr bb compression-header]
(let [slice-header (struct/decode-slice-header-block bb)
blocks (into [] (map (fn [_] (struct/decode-block bb)))
(defn- read-slice-records [^CRAMReader rdr bb compression-header slice-header]
(let [blocks (into [] (map (fn [_] (struct/decode-block bb)))
(range (:blocks slice-header)))
core-block (first (filter #(zero? (long (:content-id %))) blocks))
bs-decoder (when core-block
Expand All @@ -79,19 +74,24 @@
ds-decoders
tag-decoders)))

(defn- with-each-slice-header [^ByteBuffer bb f slice-offsets]
(let [container-header-end (.position bb)
compression-header (struct/decode-compression-header-block bb)]
(mapcat (fn [^long offset]
(.position ^Buffer bb (+ container-header-end offset))
(f compression-header (struct/decode-slice-header-block bb)))
slice-offsets)))

(defn- read-container-records
([rdr bb container-header]
(read-container-records rdr bb container-header nil))
([^CRAMReader rdr ^ByteBuffer bb container-header idx-entries]
(let [container-header-end (.position bb)
compression-header (struct/decode-compression-header-block bb)]
(->> (if (seq idx-entries)
(map :slice-offset idx-entries)
(:landmarks container-header))
(mapcat
(fn [^long landmark]
(.position ^Buffer bb (+ container-header-end landmark))
(read-slice-records rdr bb compression-header)))))))
([^CRAMReader rdr bb container-header idx-entries]
(->> (if (seq idx-entries)
(map :slice-offset idx-entries)
(:landmarks container-header))
(with-each-slice-header bb
(fn [compression-header slice-header]
(read-slice-records rdr bb compression-header slice-header))))))

(defn- with-next-container-header [^CRAMReader rdr f]
(let [^FileChannel ch (.-channel rdr)
Expand Down Expand Up @@ -139,8 +139,13 @@
(concat alns (lazy-seq (step))))))]
(step))))

(defn- read-alignments-in-region
[^CRAMReader rdr {:keys [chr start end] :or {start 0 end Long/MAX_VALUE}}]
(defn- filter-overlapping-records [chr ^long start ^long end alns]
(filter #(and (= (:rname %) chr)
(<= (long (:pos %)) end)
(<= start (long (:end %))))
alns))

(defn- read-alignments-in-region-with-index [^CRAMReader rdr chr ^long start ^long end]
(let [^FileChannel ch (.-channel rdr)
idx @(.-index rdr)
offset->entries (->> (intervals/find-overlap-intervals idx chr start end)
Expand All @@ -154,7 +159,46 @@
(.position ch offset)
(concat (read-container-with rdr (read-fn entries))
(lazy-seq (step more)))))]
(filter #(and (= (:rname %) chr)
(<= (long (:pos %)) (long end))
(<= (long start) (long (:end %))))
(step offset->entries)))))
(filter-overlapping-records chr start end (step offset->entries)))))

(defn- overlaps? [container-or-slice-header refs chr start end]
(let [seq-id (long (:ref-seq-id container-or-slice-header))]
(case seq-id
-1 (= chr "*")
;; ref-seq-id = -2 means that the container or slice contains multiple
;; references, and the decoder can't tell in advance what reference
;; it actually has in it
-2 true
(let [chr' (get (nth refs seq-id) :name)
start' (long (:start container-or-slice-header))
end' (+ start' (long (:span container-or-slice-header)))]
(and (= chr' chr)
(<= start' (long end))
(<= (long start) end'))))))

(defn- read-alignments-in-region-without-index [^CRAMReader rdr chr ^long start ^long end]
(let [^FileChannel ch (.-channel rdr)
refs @(.-refs rdr)]
(letfn [(read1 [container-header bb]
(when-not (struct/eof-container? container-header)
(if (overlaps? container-header refs chr start end)
(with-each-slice-header bb
(fn [compression-header slice-header]
(when (overlaps? slice-header refs chr start end)
(read-slice-records rdr bb compression-header slice-header)))
(:landmarks container-header))
::skipped)))
(step []
(when (< (.position ch) (.size ch))
(when-let [alns (read-container-with rdr read1)]
(if (identical? alns ::skipped)
(recur)
(concat alns (lazy-seq (step)))))))]
(.position ch (long @(.-offset rdr)))
(filter-overlapping-records chr start end (step)))))

(defn- read-alignments-in-region
[^CRAMReader rdr {:keys [chr ^long start ^long end] :or {start 0 end Long/MAX_VALUE}}]
(if @(.-index rdr)
(read-alignments-in-region-with-index rdr chr start end)
(read-alignments-in-region-without-index rdr chr start end)))
Binary file added test-resources/cram/medium_without_index.cram
Binary file not shown.
6 changes: 3 additions & 3 deletions test/cljam/io/cram/decode/structure_test.clj
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@
:id "file:///tmp/cljam/me"}
(struct/decode-file-definition bb)))
(is (= {:length 278
:ref -1
:ref-seq-id -1
:start 0
:span 0
:records 0
Expand All @@ -42,7 +42,7 @@
(sam/read-header r)))
(struct/decode-cram-header-block bb)))
(is (= {:length 362828
:ref -2
:ref-seq-id -2
:start 0
:span 0
:records 10000
Expand Down Expand Up @@ -150,7 +150,7 @@
(range 29))))
(let [container-header (decode-container-header bb)]
(is (= {:length 98171
:ref -1
:ref-seq-id -1
:start 0
:span 0
:records 2271
Expand Down
18 changes: 14 additions & 4 deletions test/cljam/io/cram_test.clj
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,12 @@
(cram/read-refs cram-rdr')))
(is (= (map fixup-bam-aln (sam/read-alignments bam-rdr))
(cram/read-alignments cram-rdr)
(cram/read-alignments cram-rdr')))))
(cram/read-alignments cram-rdr')))
(are [?region ?count] (= ?count
(count (cram/read-alignments cram-rdr ?region))
(count (cram/read-alignments cram-rdr' ?region)))
{:chr "ref"} 6
{:chr "ref2", :start 35, :end 35} 2)))

(deftest-remote reader-with-multiple-containers-test
(with-before-after {:before (prepare-cavia!)}
Expand All @@ -39,11 +44,16 @@
(cram/read-refs cram-rdr)))
(is (= (map fixup-bam-aln (sam/read-alignments bam-rdr))
(cram/read-alignments cram-rdr)))))
(testing "read alignments in specified regions"
(testing "read alignments in specified regions (with and without index file)"
(with-open [cram-rdr (cram/reader common/medium-cram-file
{:reference common/hg19-twobit-file})]
{:reference common/hg19-twobit-file})
cram-rdr' (cram/reader common/medium-without-index-cram-file
{:reference common/hg19-twobit-file})]
(is (cram/indexed? cram-rdr))
(are [?region ?count] (= ?count (count (cram/read-alignments cram-rdr ?region)))
(is (not (cram/indexed? cram-rdr')))
(are [?region ?count] (= ?count
(count (cram/read-alignments cram-rdr ?region))
(count (cram/read-alignments cram-rdr' ?region)))
{:chr "chr1"} 615
{:chr "*"} 4348
{:chr "chr1", :start 546610, :end 546610} 1
Expand Down
1 change: 1 addition & 0 deletions test/cljam/test_common.clj
Original file line number Diff line number Diff line change
Expand Up @@ -168,6 +168,7 @@
(def test-cram-file "test-resources/cram/test.cram")
(def medium-cram-file "test-resources/cram/medium.cram")
(def medium-with-standard-tags-cram-file "test-resources/cram/medium_with_standard_tags.cram")
(def medium-without-index-cram-file "test-resources/cram/medium_without_index.cram")

;; ### CRAM index files

Expand Down

0 comments on commit 11d39b0

Please sign in to comment.