diff --git a/src/cljam/io/cram/core.clj b/src/cljam/io/cram/core.clj index 2e3322d0..beb739e1 100644 --- a/src/cljam/io/cram/core.clj +++ b/src/cljam/io/cram/core.clj @@ -1,12 +1,13 @@ (ns cljam.io.cram.core (:require [cljam.io.crai :as crai] + [cljam.io.cram.reader :as reader] [cljam.io.cram.seq-resolver :as resolver] - [cljam.io.cram.reader :as reader.core] [cljam.io.sam.util.refs :as util.refs] [cljam.io.util.byte-buffer :as bb] [cljam.util :as util] [clojure.java.io :as cio]) (:import [cljam.io.cram.reader CRAMReader] + [java.io FileNotFoundException] [java.nio.channels FileChannel] [java.nio.file OpenOption StandardOpenOption])) @@ -23,11 +24,17 @@ bb (bb/allocate-lsb-byte-buffer 256) seq-resolver (some-> reference resolver/seq-resolver) header (volatile! nil) - refs (delay (util.refs/make-refs @header)) - idx (delay (crai/read-index (str f ".crai") @refs)) - rdr (reader.core/->CRAMReader url ch bb header refs idx seq-resolver)] - (reader.core/read-file-definition rdr) - (vreset! header (reader.core/read-header rdr)) + refs (delay (vec (util.refs/make-refs @header))) + offset (volatile! nil) + idx (delay + (try + (crai/read-index (str f ".crai") @refs) + (catch FileNotFoundException _ + nil))) + rdr (reader/->CRAMReader url ch bb header refs offset idx seq-resolver)] + (reader/read-file-definition rdr) + (vreset! header (reader/read-header rdr)) + (vreset! offset (.position ch)) rdr)) (defn clone-reader @@ -39,11 +46,12 @@ (into-array OpenOption [StandardOpenOption/READ])) bb (bb/allocate-lsb-byte-buffer 256) seq-resolver (some-> (.-seq-resolver rdr) resolver/clone-seq-resolver) - rdr' (reader.core/->CRAMReader url ch bb - (delay @(.-header rdr)) - (delay @(.-refs rdr)) - (delay @(.-index rdr)) - seq-resolver)] - (reader.core/read-file-definition rdr') - (reader.core/skip-container rdr') + rdr' (reader/->CRAMReader url ch bb + (delay @(.-header rdr)) + (delay @(.-refs rdr)) + (delay @(.-offset rdr)) + (delay @(.-index rdr)) + seq-resolver)] + (reader/read-file-definition rdr') + (reader/skip-container rdr') rdr')) diff --git a/src/cljam/io/cram/decode/structure.clj b/src/cljam/io/cram/decode/structure.clj index b9940cef..d4d00cd9 100644 --- a/src/cljam/io/cram/decode/structure.clj +++ b/src/cljam/io/cram/decode/structure.clj @@ -68,7 +68,7 @@ landmarks (decode-itf8-array bb) crc (bb/read-bytes bb 4)] {:length len - :ref ref-seq-id + :ref-seq-id ref-seq-id :start start-pos :span span :records n-records @@ -82,7 +82,7 @@ "Returns true iff the given container header represents an EOF container." [container-header] (and (= (:length container-header) 15) - (= (:ref container-header) -1) + (= (:ref-seq-id container-header) -1) (= (:start container-header) 4542278) (= (:span container-header) 0) (= (:records container-header) 0) diff --git a/src/cljam/io/cram/reader.clj b/src/cljam/io/cram/reader.clj index 49512cb4..6a5734d9 100644 --- a/src/cljam/io/cram/reader.clj +++ b/src/cljam/io/cram/reader.clj @@ -5,13 +5,13 @@ [cljam.io.cram.decode.structure :as struct] [cljam.io.protocols :as protocols] [cljam.util.intervals :as intervals]) - (:import [java.io Closeable FileNotFoundException] + (:import [java.io Closeable] [java.nio Buffer ByteBuffer ByteOrder] [java.nio.channels FileChannel FileChannel$MapMode])) (declare read-alignments read-alignments-in-region) -(deftype CRAMReader [url channel buffer header refs index seq-resolver] +(deftype CRAMReader [url channel buffer header refs offset index seq-resolver] Closeable (close [_] (when seq-resolver @@ -24,11 +24,7 @@ (read [this region] (protocols/read-alignments this region)) (indexed? [_] - (try - @index - true - (catch FileNotFoundException _ - false))) + (boolean @index)) protocols/IAlignmentReader (read-header [_] @header) (read-refs [_] @refs) @@ -63,9 +59,8 @@ (read-to-buffer rdr 26) (struct/decode-file-definition (.-buffer rdr))) -(defn- read-slice-records [^CRAMReader rdr bb compression-header] - (let [slice-header (struct/decode-slice-header-block bb) - blocks (into [] (map (fn [_] (struct/decode-block bb))) +(defn- read-slice-records [^CRAMReader rdr bb compression-header slice-header] + (let [blocks (into [] (map (fn [_] (struct/decode-block bb))) (range (:blocks slice-header))) core-block (first (filter #(zero? (long (:content-id %))) blocks)) bs-decoder (when core-block @@ -79,19 +74,24 @@ ds-decoders tag-decoders))) +(defn- with-each-slice-header [^ByteBuffer bb f slice-offsets] + (let [container-header-end (.position bb) + compression-header (struct/decode-compression-header-block bb)] + (mapcat (fn [^long offset] + (.position ^Buffer bb (+ container-header-end offset)) + (f compression-header (struct/decode-slice-header-block bb))) + slice-offsets))) + (defn- read-container-records ([rdr bb container-header] (read-container-records rdr bb container-header nil)) - ([^CRAMReader rdr ^ByteBuffer bb container-header idx-entries] - (let [container-header-end (.position bb) - compression-header (struct/decode-compression-header-block bb)] - (->> (if (seq idx-entries) - (map :slice-offset idx-entries) - (:landmarks container-header)) - (mapcat - (fn [^long landmark] - (.position ^Buffer bb (+ container-header-end landmark)) - (read-slice-records rdr bb compression-header))))))) + ([^CRAMReader rdr bb container-header idx-entries] + (->> (if (seq idx-entries) + (map :slice-offset idx-entries) + (:landmarks container-header)) + (with-each-slice-header bb + (fn [compression-header slice-header] + (read-slice-records rdr bb compression-header slice-header)))))) (defn- with-next-container-header [^CRAMReader rdr f] (let [^FileChannel ch (.-channel rdr) @@ -139,8 +139,13 @@ (concat alns (lazy-seq (step))))))] (step)))) -(defn- read-alignments-in-region - [^CRAMReader rdr {:keys [chr start end] :or {start 0 end Long/MAX_VALUE}}] +(defn- filter-overlapping-records [chr ^long start ^long end alns] + (filter #(and (= (:rname %) chr) + (<= (long (:pos %)) end) + (<= start (long (:end %)))) + alns)) + +(defn- read-alignments-in-region-with-index [^CRAMReader rdr chr ^long start ^long end] (let [^FileChannel ch (.-channel rdr) idx @(.-index rdr) offset->entries (->> (intervals/find-overlap-intervals idx chr start end) @@ -154,7 +159,46 @@ (.position ch offset) (concat (read-container-with rdr (read-fn entries)) (lazy-seq (step more)))))] - (filter #(and (= (:rname %) chr) - (<= (long (:pos %)) (long end)) - (<= (long start) (long (:end %)))) - (step offset->entries))))) + (filter-overlapping-records chr start end (step offset->entries))))) + +(defn- overlaps? [container-or-slice-header refs chr start end] + (let [seq-id (long (:ref-seq-id container-or-slice-header))] + (case seq-id + -1 (= chr "*") + ;; ref-seq-id = -2 means that the container or slice contains multiple + ;; references, and the decoder can't tell in advance what reference + ;; it actually has in it + -2 true + (let [chr' (get (nth refs seq-id) :name) + start' (long (:start container-or-slice-header)) + end' (+ start' (long (:span container-or-slice-header)))] + (and (= chr' chr) + (<= start' (long end)) + (<= (long start) end')))))) + +(defn- read-alignments-in-region-without-index [^CRAMReader rdr chr ^long start ^long end] + (let [^FileChannel ch (.-channel rdr) + refs @(.-refs rdr)] + (letfn [(read1 [container-header bb] + (when-not (struct/eof-container? container-header) + (if (overlaps? container-header refs chr start end) + (with-each-slice-header bb + (fn [compression-header slice-header] + (when (overlaps? slice-header refs chr start end) + (read-slice-records rdr bb compression-header slice-header))) + (:landmarks container-header)) + ::skipped))) + (step [] + (when (< (.position ch) (.size ch)) + (when-let [alns (read-container-with rdr read1)] + (if (identical? alns ::skipped) + (recur) + (concat alns (lazy-seq (step)))))))] + (.position ch (long @(.-offset rdr))) + (filter-overlapping-records chr start end (step))))) + +(defn- read-alignments-in-region + [^CRAMReader rdr {:keys [chr ^long start ^long end] :or {start 0 end Long/MAX_VALUE}}] + (if @(.-index rdr) + (read-alignments-in-region-with-index rdr chr start end) + (read-alignments-in-region-without-index rdr chr start end))) diff --git a/test-resources/cram/medium_without_index.cram b/test-resources/cram/medium_without_index.cram new file mode 100644 index 00000000..6a0541c5 Binary files /dev/null and b/test-resources/cram/medium_without_index.cram differ diff --git a/test/cljam/io/cram/decode/structure_test.clj b/test/cljam/io/cram/decode/structure_test.clj index ab1269fa..490abc84 100644 --- a/test/cljam/io/cram/decode/structure_test.clj +++ b/test/cljam/io/cram/decode/structure_test.clj @@ -27,7 +27,7 @@ :id "file:///tmp/cljam/me"} (struct/decode-file-definition bb))) (is (= {:length 278 - :ref -1 + :ref-seq-id -1 :start 0 :span 0 :records 0 @@ -42,7 +42,7 @@ (sam/read-header r))) (struct/decode-cram-header-block bb))) (is (= {:length 362828 - :ref -2 + :ref-seq-id -2 :start 0 :span 0 :records 10000 @@ -150,7 +150,7 @@ (range 29)))) (let [container-header (decode-container-header bb)] (is (= {:length 98171 - :ref -1 + :ref-seq-id -1 :start 0 :span 0 :records 2271 diff --git a/test/cljam/io/cram_test.clj b/test/cljam/io/cram_test.clj index ce56d8cf..de1ff416 100644 --- a/test/cljam/io/cram_test.clj +++ b/test/cljam/io/cram_test.clj @@ -25,7 +25,12 @@ (cram/read-refs cram-rdr'))) (is (= (map fixup-bam-aln (sam/read-alignments bam-rdr)) (cram/read-alignments cram-rdr) - (cram/read-alignments cram-rdr'))))) + (cram/read-alignments cram-rdr'))) + (are [?region ?count] (= ?count + (count (cram/read-alignments cram-rdr ?region)) + (count (cram/read-alignments cram-rdr' ?region))) + {:chr "ref"} 6 + {:chr "ref2", :start 35, :end 35} 2))) (deftest-remote reader-with-multiple-containers-test (with-before-after {:before (prepare-cavia!)} @@ -39,11 +44,16 @@ (cram/read-refs cram-rdr))) (is (= (map fixup-bam-aln (sam/read-alignments bam-rdr)) (cram/read-alignments cram-rdr))))) - (testing "read alignments in specified regions" + (testing "read alignments in specified regions (with and without index file)" (with-open [cram-rdr (cram/reader common/medium-cram-file - {:reference common/hg19-twobit-file})] + {:reference common/hg19-twobit-file}) + cram-rdr' (cram/reader common/medium-without-index-cram-file + {:reference common/hg19-twobit-file})] (is (cram/indexed? cram-rdr)) - (are [?region ?count] (= ?count (count (cram/read-alignments cram-rdr ?region))) + (is (not (cram/indexed? cram-rdr'))) + (are [?region ?count] (= ?count + (count (cram/read-alignments cram-rdr ?region)) + (count (cram/read-alignments cram-rdr' ?region))) {:chr "chr1"} 615 {:chr "*"} 4348 {:chr "chr1", :start 546610, :end 546610} 1 diff --git a/test/cljam/test_common.clj b/test/cljam/test_common.clj index b7c97b83..5eb2ac4f 100644 --- a/test/cljam/test_common.clj +++ b/test/cljam/test_common.clj @@ -168,6 +168,7 @@ (def test-cram-file "test-resources/cram/test.cram") (def medium-cram-file "test-resources/cram/medium.cram") (def medium-with-standard-tags-cram-file "test-resources/cram/medium_with_standard_tags.cram") +(def medium-without-index-cram-file "test-resources/cram/medium_without_index.cram") ;; ### CRAM index files