From c2df3314379d0a214ecf972afb71b9d1cd61c012 Mon Sep 17 00:00:00 2001 From: Shogo Ohta Date: Thu, 29 Aug 2024 18:10:49 +0900 Subject: [PATCH] Support delta-encoded AP in sorted CRAM files --- src/cljam/io/cram/encode/context.clj | 9 ++++- src/cljam/io/cram/encode/record.clj | 13 +++++-- src/cljam/io/cram/writer.clj | 9 ++--- test/cljam/io/cram/encode/record_test.clj | 7 ++-- test/cljam/io/cram_test.clj | 45 ++++++++++++++++------- 5 files changed, 55 insertions(+), 28 deletions(-) diff --git a/src/cljam/io/cram/encode/context.clj b/src/cljam/io/cram/encode/context.clj index a5c4a8d2..21708365 100644 --- a/src/cljam/io/cram/encode/context.clj +++ b/src/cljam/io/cram/encode/context.clj @@ -1,13 +1,18 @@ (ns cljam.io.cram.encode.context (:require [cljam.io.cram.data-series :as ds] - [cljam.io.cram.encode.tag-dict :as tag-dict])) + [cljam.io.cram.encode.tag-dict :as tag-dict] + [cljam.io.sam.util.header :as sam.header])) (defn make-container-context "Creates a new container context." - [cram-header preservation-map seq-resolver] + [cram-header seq-resolver] (let [rname->idx (into {} (map-indexed (fn [i {:keys [SN]}] [SN i])) (:SQ cram-header)) + preservation-map (cond-> {:RN true, :AP false, :RR true} + (= (sam.header/sort-order cram-header) + sam.header/order-coordinate) + (assoc :AP true)) subst-mat {\A {\T 0, \G 1, \C 2, \N 3} \T {\A 0, \G 1, \C 2, \N 3} \G {\A 0, \T 1, \C 2, \N 3} diff --git a/src/cljam/io/cram/encode/record.clj b/src/cljam/io/cram/encode/record.clj index bcd10e4a..94a75474 100644 --- a/src/cljam/io/cram/encode/record.clj +++ b/src/cljam/io/cram/encode/record.clj @@ -12,15 +12,22 @@ -1 (get rname->idx rname))) -(defn- build-positional-data-encoder [{:keys [cram-header]} {:keys [RI RL AP RG]}] +(defn- build-positional-data-encoder + [{:keys [cram-header preservation-map]} {:keys [RI RL AP RG]}] (let [rg-id->idx (into {} (map-indexed (fn [i {:keys [ID]}] [ID i])) - (:RG cram-header))] + (:RG cram-header)) + AP' (if (:AP preservation-map) + (let [pos (volatile! nil)] + (fn [^long pos'] + (AP (- pos' (long (or @pos 0)))) + (vreset! pos pos'))) + AP)] (fn [record] (let [rg (sam.option/value-for-tag :RG record)] (RI (::ref-index record)) (RL (count (:seq record))) - (AP (:pos record)) + (AP' (:pos record)) (RG (if rg (get rg-id->idx rg) -1)))))) (defn- build-read-name-encoder [{:keys [RN]}] diff --git a/src/cljam/io/cram/writer.clj b/src/cljam/io/cram/writer.clj index 66b2dd92..0027298e 100644 --- a/src/cljam/io/cram/writer.clj +++ b/src/cljam/io/cram/writer.clj @@ -50,10 +50,8 @@ (struct/encode-cram-header-container (.-stream wtr) header)) (defn- preprocess-records - [cram-header preservation-map seq-resolver options ^objects container-records] - (let [container-ctx (context/make-container-context cram-header - preservation-map - seq-resolver) + [cram-header seq-resolver options ^objects container-records] + (let [container-ctx (context/make-container-context cram-header seq-resolver) {:keys [ds-compressor-overrides tag-compressor-overrides]} options] (run! (partial record/preprocess-slice-records container-ctx) container-records) (context/finalize-container-context container-ctx @@ -172,8 +170,7 @@ (crai/write-index-entries index-writer entries))) (defn- write-container [^CRAMWriter wtr cram-header counter container-records] - (let [preservation-map {:RN true, :AP false, :RR true} - container-ctx (preprocess-records cram-header preservation-map (.-seq-resolver wtr) + (let [container-ctx (preprocess-records cram-header (.-seq-resolver wtr) (.-options wtr) container-records) slices (generate-slices container-ctx counter container-records) compression-header-block (generate-compression-header-block container-ctx) diff --git a/test/cljam/io/cram/encode/record_test.clj b/test/cljam/io/cram/encode/record_test.clj index b00f0503..dcc8cfb4 100644 --- a/test/cljam/io/cram/encode/record_test.clj +++ b/test/cljam/io/cram/encode/record_test.clj @@ -71,7 +71,7 @@ 4])) (defn- preprocess-slice-records [cram-header records] - (let [container-ctx (context/make-container-context cram-header {} test-seq-resolver)] + (let [container-ctx (context/make-container-context cram-header test-seq-resolver)] (record/preprocess-slice-records container-ctx records) (context/finalize-container-context container-ctx (constantly :raw) @@ -153,7 +153,8 @@ (deftest encode-slice-records-test (testing "mapped reads" - (let [cram-header {:SQ + (let [cram-header {:HD {:SO "coordinate"} + :SQ [{:SN "ref"} {:SN "ref2"}] :RG @@ -209,7 +210,7 @@ (is (= 1 (count (get ds-res :AP)))) (is (= 5 (get-in ds-res [:AP 0 :content-id]))) - (is (= [1 5 10 15 20] (seq (get-in ds-res [:AP 0 :data])))) + (is (= [1 4 5 5 5] (seq (get-in ds-res [:AP 0 :data])))) (is (= 1 (count (get ds-res :RG)))) (is (= 6 (get-in ds-res [:RG 0 :content-id]))) diff --git a/test/cljam/io/cram_test.clj b/test/cljam/io/cram_test.clj index 7e8d6269..709be92c 100644 --- a/test/cljam/io/cram_test.clj +++ b/test/cljam/io/cram_test.clj @@ -9,6 +9,7 @@ [clojure.test :refer [are deftest is testing]])) (def ^:private temp-cram-file (io/file common/temp-dir "test.cram")) +(def ^:private temp-sorted-cram-file (io/file common/temp-dir "test.sorted.cram")) (def ^:private temp-cram-file-2 (io/file common/temp-dir "test2.cram")) (def ^:private temp-cram-file-3 (io/file common/temp-dir "test3.cram")) @@ -74,20 +75,36 @@ (deftest writer-test (with-before-after {:before (prepare-cache!) :after (clean-cache!)} - (with-open [r (cram/reader common/test-cram-file - {:reference common/test-fa-file}) - w (cram/writer temp-cram-file - {:reference common/test-fa-file})] - (cram/write-header w (cram/read-header r)) - (cram/write-alignments w (cram/read-alignments r) (cram/read-header r))) - (with-open [r (cram/reader common/test-cram-file - {:reference common/test-fa-file}) - r' (cram/reader temp-cram-file - {:reference common/test-fa-file})] - (is (= (cram/read-header r) - (cram/read-header r'))) - (is (= (cram/read-alignments r) - (cram/read-alignments r')))))) + (testing "unsorted" + (with-open [r (cram/reader common/test-cram-file + {:reference common/test-fa-file}) + w (cram/writer temp-cram-file + {:reference common/test-fa-file})] + (cram/write-header w (cram/read-header r)) + (cram/write-alignments w (cram/read-alignments r) (cram/read-header r))) + (with-open [r (cram/reader common/test-cram-file + {:reference common/test-fa-file}) + r' (cram/reader temp-cram-file + {:reference common/test-fa-file})] + (is (= (cram/read-header r) + (cram/read-header r'))) + (is (= (cram/read-alignments r) + (cram/read-alignments r'))))) + (testing "sorted by coordinate" + (with-open [r (cram/reader common/test-sorted-cram-file + {:reference common/test-fa-file}) + w (cram/writer temp-sorted-cram-file + {:reference common/test-fa-file})] + (cram/write-header w (cram/read-header r)) + (cram/write-alignments w (cram/read-alignments r) (cram/read-header r))) + (with-open [r (cram/reader common/test-sorted-cram-file + {:reference common/test-fa-file}) + r' (cram/reader temp-sorted-cram-file + {:reference common/test-fa-file})] + (is (= (cram/read-header r) + (cram/read-header r'))) + (is (= (cram/read-alignments r) + (cram/read-alignments r'))))))) (deftest-remote writer-with-multiple-containers-test (with-before-after {:before (do (prepare-cavia!)