Skip to content

Commit

Permalink
Add tests for CRAM index generation
Browse files Browse the repository at this point in the history
  • Loading branch information
athos committed Jul 23, 2024
1 parent 2c7d255 commit 94c7d75
Show file tree
Hide file tree
Showing 9 changed files with 277 additions and 57 deletions.
33 changes: 22 additions & 11 deletions src/cljam/io/crai.clj
Original file line number Diff line number Diff line change
Expand Up @@ -5,22 +5,33 @@
[clojure.string :as str])
(:import [java.io Closeable OutputStreamWriter PrintWriter]))

(defn- read-index-entries [rdr]
(->> (line-seq rdr)
(map (fn [line]
(let [[^long seq-id start span container-offset slice-offset size]
(map #(Long/parseLong %) (str/split line #"\t"))
unmapped? (neg? seq-id)]
{:ref-seq-id seq-id
:start (if unmapped? 0 start)
:span (if unmapped? 0 span)
:container-offset container-offset
:slice-offset slice-offset
:size size})))))

(defn read-index
"Reads a CRAI file `f` and creates an index."
[f refs]
(let [refs (vec refs)]
(with-open [rdr (io/reader (util/compressor-input-stream f))]
(->> (line-seq rdr)
(map (fn [line]
(let [[^long seq-id ^long start ^long span container-offset slice-offset size]
(map #(Long/parseLong %) (str/split line #"\t"))
unmapped? (neg? seq-id)]
{:chr (if unmapped? "*" (:name (nth refs seq-id)))
:start (if unmapped? 0 start)
:end (if unmapped? 0 (+ start span))
:container-offset container-offset
:slice-offset slice-offset
:size size})))
(->> (read-index-entries rdr)
(map (fn [{:keys [^long ref-seq-id ^long start ^long span] :as entry}]
(-> entry
(assoc :chr
(if (neg? ref-seq-id)
"*"
(:name (nth refs ref-seq-id)))
:end (+ start span))
(dissoc :ref-seq-id :span))))
intervals/index-intervals))))

(defn find-overlapping-entries
Expand Down
Binary file added test-resources/cram/test.sorted.cram
Binary file not shown.
Binary file not shown.
35 changes: 35 additions & 0 deletions test/cljam/algo/cram_indexer_test.clj
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
(ns cljam.algo.cram-indexer-test
(:require [cljam.algo.cram-indexer :as indexer]
[cljam.io.crai :as crai]
[cljam.io.cram :as cram]
[cljam.test-common :as common]
[cljam.util :as util]
[clojure.java.io :as io]
[clojure.test :refer [deftest is]]))

(defn- read-index-entries [f]
(with-open [r (io/reader (util/compressor-input-stream f))]
(doall (#'crai/read-index-entries r))))

(deftest create-index-test
(let [f (io/file common/temp-dir "medium.cram.crai")]
(common/with-before-after {:before (common/prepare-cache!)
:after (common/clean-cache!)}
(is (thrown-with-msg? Exception #"Cannot create CRAM index file .*"
(indexer/create-index common/medium-cram-file f)))
(indexer/create-index common/medium-cram-file f
:skip-sort-order-check? true)
(is (= (read-index-entries common/medium-crai-file)
(read-index-entries f))))))

(deftest cram-without-alignments-test
(common/with-before-after {:before (common/prepare-cache!)
:after (common/clean-cache!)}
(let [header {:HD {:SO "coordinate"}
:SQ [{:SN "chr1", :LN 100}]}
target (io/file common/temp-dir "no_aln.cram")
target-crai (io/file common/temp-dir "no_aln.cram.crai")]
(with-open [w (cram/writer target)]
(cram/write-header w header)
(cram/write-refs w header))
(is (common/not-throw? (indexer/create-index target target-crai))))))
25 changes: 24 additions & 1 deletion test/cljam/io/crai_test.clj
Original file line number Diff line number Diff line change
@@ -1,7 +1,9 @@
(ns cljam.io.crai-test
(:require [cljam.io.crai :as crai]
[cljam.test-common :as common]
[clojure.test :refer [deftest are]]))
[cljam.util :as util]
[clojure.java.io :as io]
[clojure.test :refer [are deftest is]]))

(def ^:private test-refs
(->> (concat (range 1 23) ["X" "Y"])
Expand Down Expand Up @@ -116,3 +118,24 @@
:container-offset 378365
:slice-offset 190037
:size 12326}])))

(deftest write-index-entries-test
(let [entries [{:ref-seq-id 0, :start 546609, :span 205262429,
:container-offset 324, :slice-offset 563, :size 22007}
{:ref-seq-id 0, :start 206547069, :span 42644506,
:container-offset 324, :slice-offset 22570, :size 7349}
{:ref-seq-id 1, :start 67302, :span 231638920,
:container-offset 30272, :slice-offset 563, :size 21618}
{:ref-seq-id -1, :start 0, :span 0,
:container-offset 354657, :slice-offset 563, :size 23119}
{:ref-seq-id -1, :start 0, :span 0,
:container-offset 378365, :slice-offset 171, :size 23494}
{:ref-seq-id -1, :start 0, :span 0,
:container-offset 378365, :slice-offset 23665, :size 23213}]
f (io/file common/temp-dir "test.cram.crai")]
(common/with-before-after {:before (common/prepare-cache!)
:after (common/clean-cache!)}
(with-open [w (crai/writer f)]
(crai/write-index-entries w entries))
(with-open [r (io/reader (util/compressor-input-stream f))]
(is (= entries (#'crai/read-index-entries r)))))))
128 changes: 83 additions & 45 deletions test/cljam/io/cram/encode/record_test.clj
Original file line number Diff line number Diff line change
Expand Up @@ -68,6 +68,34 @@
[[{:code :softclip, :pos 1, :bases (mapv int "TT")}]
4]))

(deftest preprocess-slice-records-test
(let [rname->idx {"ref" 0}
records (object-array
[{:rname "ref", :pos 1, :cigar "5M", :seq "AGAAT", :qual "HFHHH"}
{:rname "ref", :pos 5, :cigar "2S3M",:seq "CCTGT", :qual "##AAC"}
{:rname "ref", :pos 10, :cigar "5M", :seq "GATAA", :qual "CCCFF"}
{:rname "ref", :pos 15, :cigar "1M1I1M1D2M", :seq "GAAAG", :qual "EBBFF"}
{:rname "*", :pos 0, :cigar "*", :seq "CTGTG", :qual "AEEEE"}])]
(record/preprocess-slice-records test-seq-resolver rname->idx subst-mat records)
(is (= [{:rname "ref", :pos 1, :cigar "5M", :seq "AGAAT", :qual "HFHHH"
::record/flag 0x03, ::record/ref-index 0, ::record/end 5
::record/features [{:code :subst, :pos 3 :subst 0}]}
{:rname "ref", :pos 5, :cigar "2S3M",:seq "CCTGT", :qual "##AAC"
::record/flag 0x03, ::record/ref-index 0, ::record/end 7
::record/features [{:code :softclip, :pos 1, :bases [(int \C) (int \C)]}]}
{:rname "ref", :pos 10, :cigar "5M", :seq "GATAA", :qual "CCCFF"
::record/flag 0x03, ::record/ref-index 0, ::record/end 14
::record/features []}
{:rname "ref", :pos 15, :cigar "1M1I1M1D2M", :seq "GAAAG", :qual "EBBFF"
::record/flag 0x03, ::record/ref-index 0, ::record/end 19
::record/features [{:code :insertion, :pos 2, :bases [(int \A)]}
{:code :deletion, :pos 4, :len 1}]}
{:rname "*", :pos 0, :cigar "*", :seq "CTGTG", :qual "AEEEE"
::record/flag 0x03, ::record/ref-index -1, ::record/end 0
::record/features []}]
(walk/prewalk #(if (.isArray (class %)) (vec %) %)
records)))))

(deftest encode-slice-records-test
(testing "mapped reads"
(let [cram-header {:SQ
Expand All @@ -76,6 +104,9 @@
:RG
[{:ID "rg001"}
{:ID "rg002"}]}
rname->idx (into {}
(map-indexed (fn [i {:keys [SN]}] [SN i]))
(:SQ cram-header))
tag-dict [[]
[{:tag :MD, :type \Z}
{:tag :NM, :type \c}]]
Expand All @@ -87,34 +118,36 @@
:NM {\c {:codec :byte-array-len
:len-encoding {:codec :huffman, :alphabet [1], :bit-len [0]}
:val-encoding {:codec :external, :content-id 5131619}}}})
records [{:qname "q001", :flag 99, :rname "ref", :pos 1, :end 5, :mapq 0,
:cigar "5M", :rnext "=", :pnext 151, :tlen 150, :seq "AGAAT", :qual "HFHHH"
:options [{:RG {:type "Z", :value "rg001"}}
{:MD {:type "Z", :value "2C2"}}
{:NM {:type "c", :value 1}}]
::record/tags-index 1}
{:qname "q002", :flag 99, :rname "ref", :pos 5, :end 7, :mapq 15,
:cigar "2S3M", :rnext "=", :pnext 15, :tlen 15, :seq "CCTGT", :qual "##AAC"
:options [{:RG {:type "Z", :value "rg001"}}
{:MD {:type "Z", :value "3"}}
{:NM {:type "c", :value 0}}]
::record/tags-index 1}
{:qname "q003", :flag 177, :rname "ref", :pos 10, :end 14, :mapq 60,
:cigar "5M", :rnext "ref2", :pnext 100, :tlen 0, :seq "GATAA", :qual "CCCFF"
:options [{:RG {:type "Z", :value "rg002"}}
{:MD {:type "Z", :value "5"}}
{:NM {:type "c", :value 0}}]
::record/tags-index 1}
{:qname "q004", :flag 147, :rname "ref", :pos 15, :end 19, :mapq 15,
:cigar "1M1I1M1D2M", :rnext "=", :pnext 5, :tlen -15, :seq "GAAAG", :qual "EBBFF"
:options [{:RG {:type "Z", :value "rg002"}}
{:MD {:type "Z", :value "3^T2"}}
{:NM {:type "c", :value 2}}]
::record/tags-index 1}
{:qname "q005", :flag 73, :rname "ref", :pos 20, :end 24, :mapq 0,
:cigar "5M", :rnext "*", :pnext 0, :tlen 0, :seq "CTGTG", :qual "AEEEE"
:options [], ::record/tags-index 0}]
stats (record/encode-slice-records test-seq-resolver cram-header tag-dict subst-mat
records (object-array
[{:qname "q001", :flag 99, :rname "ref", :pos 1, :end 5, :mapq 0,
:cigar "5M", :rnext "=", :pnext 151, :tlen 150, :seq "AGAAT", :qual "HFHHH"
:options [{:RG {:type "Z", :value "rg001"}}
{:MD {:type "Z", :value "2C2"}}
{:NM {:type "c", :value 1}}]
::record/tags-index 1}
{:qname "q002", :flag 99, :rname "ref", :pos 5, :end 7, :mapq 15,
:cigar "2S3M", :rnext "=", :pnext 15, :tlen 15, :seq "CCTGT", :qual "##AAC"
:options [{:RG {:type "Z", :value "rg001"}}
{:MD {:type "Z", :value "3"}}
{:NM {:type "c", :value 0}}]
::record/tags-index 1}
{:qname "q003", :flag 177, :rname "ref", :pos 10, :end 14, :mapq 60,
:cigar "5M", :rnext "ref2", :pnext 100, :tlen 0, :seq "GATAA", :qual "CCCFF"
:options [{:RG {:type "Z", :value "rg002"}}
{:MD {:type "Z", :value "5"}}
{:NM {:type "c", :value 0}}]
::record/tags-index 1}
{:qname "q004", :flag 147, :rname "ref", :pos 15, :end 19, :mapq 15,
:cigar "1M1I1M1D2M", :rnext "=", :pnext 5, :tlen -15, :seq "GAAAG", :qual "EBBFF"
:options [{:RG {:type "Z", :value "rg002"}}
{:MD {:type "Z", :value "3^T2"}}
{:NM {:type "c", :value 2}}]
::record/tags-index 1}
{:qname "q005", :flag 73, :rname "ref", :pos 20, :end 24, :mapq 0,
:cigar "5M", :rnext "*", :pnext 0, :tlen 0, :seq "CTGTG", :qual "AEEEE"
:options [], ::record/tags-index 0}])
_ (record/preprocess-slice-records test-seq-resolver rname->idx subst-mat records)
stats (record/encode-slice-records cram-header rname->idx tag-dict
ds-encoders tag-encoders records)
ds-res (walk/prewalk #(if (fn? %) (%) %) ds-encoders)
tag-res (walk/prewalk #(if (fn? %) (%) %) tag-encoders)]
Expand Down Expand Up @@ -269,25 +302,30 @@
(let [cram-header {:SQ
[{:SN "ref"}
{:SN "ref2"}]}
rname->idx (into {}
(map-indexed (fn [i {:keys [SN]}] [SN i]))
(:SQ cram-header))
tag-dict [[]]
ds-encoders (ds/build-data-series-encoders ds/default-data-series-encodings)
records [{:qname "q001", :flag 77, :rname "*", :pos 0, :end 0, :mapq 0,
:cigar "*", :rnext "*", :pnext 0, :tlen 0, :seq "AATCC", :qual "CCFFF"
:options [], ::record/tags-index 0}
{:qname "q001", :flag 141, :rname "*", :pos 0, :end 0, :mapq 0,
:cigar "*", :rnext "*", :pnext 0, :tlen 0, :seq "ATTGT", :qual "BDFAD"
:options [], ::record/tags-index 0}
{:qname "q002", :flag 77, :rname "*", :pos 0, :end 0, :mapq 0,
:cigar "*", :rnext "*", :pnext 0, :tlen 0, :seq "TGGTA", :qual "ADDHF"
:options [], ::record/tags-index 0}
{:qname "q002", :flag 141, :rname "*", :pos 0, :end 0, :mapq 0,
:cigar "*", :rnext "*", :pnext 0, :tlen 0, :seq "TCTTG", :qual "DDDFD"
:options [], ::record/tags-index 0}
{:qname "q003", :flag 77, :rname "*", :pos 0, :end 0, :mapq 0,
:cigar "*", :rnext "*", :pnext 0, :tlen 0, :seq "GCACA", :qual "BCCFD"
:options [], ::record/tags-index 0}]
stats (record/encode-slice-records test-seq-resolver cram-header tag-dict
subst-mat ds-encoders {} records)
records (object-array
[{:qname "q001", :flag 77, :rname "*", :pos 0, :end 0, :mapq 0,
:cigar "*", :rnext "*", :pnext 0, :tlen 0, :seq "AATCC", :qual "CCFFF"
:options [], ::record/tags-index 0}
{:qname "q001", :flag 141, :rname "*", :pos 0, :end 0, :mapq 0,
:cigar "*", :rnext "*", :pnext 0, :tlen 0, :seq "ATTGT", :qual "BDFAD"
:options [], ::record/tags-index 0}
{:qname "q002", :flag 77, :rname "*", :pos 0, :end 0, :mapq 0,
:cigar "*", :rnext "*", :pnext 0, :tlen 0, :seq "TGGTA", :qual "ADDHF"
:options [], ::record/tags-index 0}
{:qname "q002", :flag 141, :rname "*", :pos 0, :end 0, :mapq 0,
:cigar "*", :rnext "*", :pnext 0, :tlen 0, :seq "TCTTG", :qual "DDDFD"
:options [], ::record/tags-index 0}
{:qname "q003", :flag 77, :rname "*", :pos 0, :end 0, :mapq 0,
:cigar "*", :rnext "*", :pnext 0, :tlen 0, :seq "GCACA", :qual "BCCFD"
:options [], ::record/tags-index 0}])
_ (record/preprocess-slice-records test-seq-resolver rname->idx subst-mat records)
stats (record/encode-slice-records cram-header rname->idx tag-dict
ds-encoders {} records)
ds-res (walk/prewalk #(if (fn? %) (%) %) ds-encoders)]
(is (= {:ri -1, :start 0, :end 0, :nbases 25, :nrecords 5}
(into {} stats)))
Expand Down
70 changes: 70 additions & 0 deletions test/cljam/io/cram/writer_test.clj
Original file line number Diff line number Diff line change
@@ -0,0 +1,70 @@
(ns cljam.io.cram.writer-test
(:require [cljam.io.cram.encode.record :as record]
[cljam.io.cram.writer :as writer]
[clojure.test :refer [deftest is testing]]))

(deftest container-index-entries-test
(testing "no multiple reference slices"
(is (= [{:ref-seq-id 0, :start 123, :span 456
:container-offset 100, :slice-offset 200, :size 1000}
{:ref-seq-id 0, :start 987, :span 654
:container-offset 100, :slice-offset 1200, :size 900}
{:ref-seq-id 1, :start 1234, :span 5678
:container-offset 100, :slice-offset 2100, :size 1100}]
(#'writer/container-index-entries
100
{:landmarks [200 1200 2100]}
[{:header {:ref-seq-id 0, :start 123, :span 456}
:size 1000}
{:header {:ref-seq-id 0, :start 987, :span 654}
:size 900}
{:header {:ref-seq-id 1, :start 1234, :span 5678}
:size 1100}]
[[{::record/ref-index 0, :pos 123 ::record/end 273}
{::record/ref-index 0, :pos 428 ::record/end 578}]
[{::record/ref-index 0, :pos 987 ::record/end 1137}
{::record/ref-index 0, :pos 1490 ::record/end 1640}]
[{::record/ref-index 1, :pos 1234 ::record/end 1384}
{::record/ref-index 1, :pos 6761 ::record/end 6911}]])))
(is (= [{:ref-seq-id -1, :start 0, :span 0
:container-offset 100, :slice-offset 200, :size 1000}
{:ref-seq-id -1, :start 0, :span 0
:container-offset 100, :slice-offset 1200, :size 900}]
(#'writer/container-index-entries
100
{:landmarks [200 1200]}
[{:header {:ref-seq-id -1, :start 0, :span 0}
:size 1000}
{:header {:ref-seq-id -1, :start 0, :span 0}
:size 900}]
[[{::record/ref-index -1, :pos 0 ::record/end 150}
{::record/ref-index -1, :pos 0 ::record/end 150}]
[{::record/ref-index -1, :pos 0 ::record/end 150}
{::record/ref-index -1, :pos 0 ::record/end 150}]]))))
(testing "some multiple reference slices"
(is (= [{:ref-seq-id 0, :start 123, :span 456
:container-offset 100, :slice-offset 200, :size 1000}
{:ref-seq-id 0, :start 987, :span 654
:container-offset 100, :slice-offset 1200, :size 900}
{:ref-seq-id 1, :start 1234, :span 5678
:container-offset 100, :slice-offset 1200, :size 900}
;; Both of :start/:span should be zero for unmapped slices in respect of
;; the CRAM specification, but it's guaranteed by the CRAI writer,
;; so it doesn't matter what values these keys actually take here
{:ref-seq-id -1, :start 0, :span 1
:container-offset 100, :slice-offset 1200, :size 900}]
(#'writer/container-index-entries
100
{:landmarks [200 1200]}
[{:header {:ref-seq-id 0, :start 123, :span 456}
:size 1000}
{:header {:ref-seq-id -2, :start 0, :span 0}
:size 900}]
[[{::record/ref-index 0, :pos 123 ::record/end 273}
{::record/ref-index 0, :pos 428 ::record/end 578}]
[{::record/ref-index 0, :pos 987 ::record/end 1138}
{::record/ref-index 0, :pos 1490 ::record/end 1640}
{::record/ref-index 1, :pos 1234 ::record/end 1384}
{::record/ref-index 1, :pos 6761 ::record/end 6911}
{::record/ref-index -1, :pos 0 ::record/end 150}
{::record/ref-index -1, :pos 0 ::record/end 150}]])))))
Loading

0 comments on commit 94c7d75

Please sign in to comment.