From b79ae91b74b7ae5eebcbdad48f424633c4ed1134 Mon Sep 17 00:00:00 2001 From: Shogo Ohta Date: Tue, 15 Oct 2024 10:58:42 +0900 Subject: [PATCH] Construct subst matrix based on snv frequencies --- src/cljam/io/cram/encode/context.clj | 11 +- src/cljam/io/cram/encode/record.clj | 17 ++- src/cljam/io/cram/encode/subst_matrix.clj | 60 ++++++++ test/cljam/io/cram/encode/record_test.clj | 131 ++++++++++-------- .../io/cram/encode/subst_matrix_test.clj | 27 ++++ 5 files changed, 175 insertions(+), 71 deletions(-) create mode 100644 src/cljam/io/cram/encode/subst_matrix.clj create mode 100644 test/cljam/io/cram/encode/subst_matrix_test.clj diff --git a/src/cljam/io/cram/encode/context.clj b/src/cljam/io/cram/encode/context.clj index fb48aa03..edb3b28e 100644 --- a/src/cljam/io/cram/encode/context.clj +++ b/src/cljam/io/cram/encode/context.clj @@ -1,5 +1,6 @@ (ns cljam.io.cram.encode.context (:require [cljam.io.cram.data-series :as ds] + [cljam.io.cram.encode.subst-matrix :as subst-mat] [cljam.io.cram.encode.tag-dict :as tag-dict] [cljam.io.sam.util.header :as sam.header])) @@ -13,17 +14,13 @@ (= (sam.header/sort-order cram-header) sam.header/order-coordinate) (assoc :AP true)) - subst-mat {\A {\T 0, \G 1, \C 2, \N 3} - \T {\A 0, \G 1, \C 2, \N 3} - \G {\A 0, \T 1, \C 2, \N 3} - \C {\A 0, \T 1, \G 2, \N 3} - \N {\A 0, \T 1, \G 2, \C 3}} + subst-mat-builder (subst-mat/make-subst-matrix-builder) tag-dict-builder (tag-dict/make-tag-dict-builder)] {:cram-header cram-header :rname->idx rname->idx :preservation-map preservation-map - :subst-mat subst-mat :seq-resolver seq-resolver + :subst-mat-builder subst-mat-builder :tag-dict-builder tag-dict-builder :options options})) @@ -36,12 +33,14 @@ tag-compressor-overrides]} (:options container-ctx) ds-encodings (-> ds/default-data-series-encodings (ds/apply-ds-compressor-overrides ds-compressor-overrides)) + subst-mat (subst-mat/build-subst-matrix (:subst-mat-builder container-ctx)) tag-dict (tag-dict/build-tag-dict (:tag-dict-builder container-ctx)) tag-encodings (-> (tag-dict/build-tag-encodings tag-dict) (ds/apply-tag-compressor-overrides tag-compressor-overrides))] (assoc container-ctx :alignment-stats alignment-stats :ds-encodings ds-encodings + :subst-mat subst-mat :tag-dict tag-dict :tag-encodings tag-encodings))) diff --git a/src/cljam/io/cram/encode/record.clj b/src/cljam/io/cram/encode/record.clj index fc5d4fec..032b92d3 100644 --- a/src/cljam/io/cram/encode/record.clj +++ b/src/cljam/io/cram/encode/record.clj @@ -1,5 +1,6 @@ (ns cljam.io.cram.encode.record (:require [cljam.io.cram.encode.alignment-stats :as stats] + [cljam.io.cram.encode.subst-matrix :as subst-mat] [cljam.io.cram.encode.tag-dict :as tag-dict] [cljam.io.cram.seq-resolver.protocol :as resolver] [cljam.io.sam.util.cigar :as sam.cigar] @@ -82,7 +83,7 @@ (BA (:base f)) (QS (:qual f))) :subst (do (FC (int \X)) - (BS (:subst f))) + (BS @(:subst f))) :insertion (do (FC (int \I)) (IN (:bases f))) :deletion (do (FC (int \D)) @@ -149,7 +150,7 @@ (run! record-encoder records))) (defn- add-mismatches - [n subst-mat ^bytes ref-bases rpos ^bytes read-bases ^bytes qs spos fs] + [n subst-mat-builder ^bytes ref-bases rpos ^bytes read-bases ^bytes qs spos fs] (loop [i (long n), rpos (long rpos), spos (long spos), fs fs] (if (zero? i) fs @@ -164,13 +165,11 @@ :base read-base :qual (if qs (- (aget qs spos) 33) -1)} {:code :subst :pos pos - :subst (-> subst-mat - (get (char ref-base)) - (get (char read-base)))})] + :subst (subst-mat/assign-code! subst-mat-builder ref-base read-base)})] (recur (dec i) (inc rpos) (inc spos) (conj! fs f)))))))) (defn- calculate-read-features&end - [seq-resolver subst-mat {:keys [rname ^long pos qual cigar] :as record}] + [seq-resolver subst-mat-builder {:keys [rname ^long pos qual cigar] :as record}] (if (or (zero? pos) (= (:seq record) "*")) [[] pos] (let [ref-bases ^bytes (resolver/resolve-sequence seq-resolver rname) @@ -185,7 +184,7 @@ (let [pos (inc spos)] (case op (\M \X \=) (recur more (+ rpos n) (+ spos n) - (add-mismatches n subst-mat ref-bases rpos + (add-mismatches n subst-mat-builder ref-bases rpos read-bases qs spos fs)) \I (let [spos' (+ spos n) bs (Arrays/copyOfRange read-bases spos spos')] @@ -203,7 +202,7 @@ "Preprocesses slice records to calculate some record fields prior to record encoding that are necessary for the CRAM writer to generate some header components." - [{:keys [rname->idx subst-mat seq-resolver tag-dict-builder]} ^List records] + [{:keys [rname->idx seq-resolver subst-mat-builder tag-dict-builder]} ^List records] (let [stats-builder (stats/make-alignment-stats-builder)] (dotimes [i (.size records)] (let [record (.get records i) @@ -215,7 +214,7 @@ (= (:seq record) "*") (bit-or 0x08)) ri (ref-index rname->idx (:rname record)) tags-id (tag-dict/assign-tags-id! tag-dict-builder (:options record)) - [fs end] (calculate-read-features&end seq-resolver subst-mat record) + [fs end] (calculate-read-features&end seq-resolver subst-mat-builder record) record' (assoc record ::flag cf ::ref-index ri ::end end ::features fs ::tags-index tags-id)] diff --git a/src/cljam/io/cram/encode/subst_matrix.clj b/src/cljam/io/cram/encode/subst_matrix.clj new file mode 100644 index 00000000..3702595f --- /dev/null +++ b/src/cljam/io/cram/encode/subst_matrix.clj @@ -0,0 +1,60 @@ +(ns cljam.io.cram.encode.subst-matrix) + +(defprotocol ICounter + (inc! [this])) + +(defprotocol IMutableInt + (set-val! [this v])) + +(deftype MutableInt [^:unsynchronized-mutable ^long n] + ICounter + (inc! [_] + (set! n (inc n))) + IMutableInt + (set-val! [_ v] + (set! n (long v))) + clojure.lang.IDeref + (deref [_] n)) + +(defprotocol ISubstMatrixBuilder + (assign-code! [this ref alt]) + (build-subst-matrix [this])) + +(definline ^:private base->index [b] + `(case (long ~b) + ~(int \A) 0 + ~(int \C) 1 + ~(int \G) 2 + ~(int \T) 3 + ~(int \N) 4)) + +(deftype SubstMatrixBuilder [^objects freqs] + ISubstMatrixBuilder + (assign-code! [_ r a] + (let [mut (aget ^objects (aget freqs (base->index r)) (base->index a))] + (inc! mut) + mut)) + (build-subst-matrix [_] + (let [all-bases [\A \C \G \T \N]] + (reduce + (fn [m r] + (let [^objects freqs (aget freqs (base->index (int r)))] + (->> all-bases + (keep (fn [a] + (when (not= r a) + [a (aget freqs (base->index (int a)))]))) + (sort-by (comp - deref second)) + (map-indexed vector) + (reduce (fn [m [i [a mut]]] + (set-val! mut i) + (assoc-in m [r a] i)) + m)))) + {} all-bases)))) + +(defn make-subst-matrix-builder + "Creates a new substitution matrix builder." + [] + (->> (fn [] (into-array (repeatedly 5 #(->MutableInt 0)))) + (repeatedly 5) + into-array + ->SubstMatrixBuilder)) diff --git a/test/cljam/io/cram/encode/record_test.clj b/test/cljam/io/cram/encode/record_test.clj index f00ed6e1..9b7678ff 100644 --- a/test/cljam/io/cram/encode/record_test.clj +++ b/test/cljam/io/cram/encode/record_test.clj @@ -1,6 +1,7 @@ (ns cljam.io.cram.encode.record-test (:require [cljam.io.cram.encode.context :as context] [cljam.io.cram.encode.record :as record] + [cljam.io.cram.encode.subst-matrix :as subst-mat] [cljam.io.cram.encode.tag-dict :as tag-dict] [cljam.io.cram.seq-resolver.protocol :as resolver] [cljam.io.sequence :as cseq] @@ -19,66 +20,79 @@ (let [s (get seqs chr)] (.getBytes (subs s (dec (long start)) end))))))) -(def subst-mat - {\A {\T 0, \G 1, \C 2, \N 3} - \T {\A 0, \G 1, \C 2, \N 3} - \G {\A 0, \T 1, \C 2, \N 3} - \C {\A 0, \T 1, \G 2, \N 3} - \N {\A 0, \T 1, \G 2, \C 3}}) +(defn- test-subst-mat-builder [m] + (let [^objects arr (make-array Object 5 5) + bs (map-indexed vector "ACGTN")] + (doseq [[i r] bs + [j a] bs + :when (not= i j)] + (aset ^objects (aget arr i) j + (or (get-in m [r a]) (subst-mat/->MutableInt 0)))) + (subst-mat/->SubstMatrixBuilder arr))) (deftest calculate-read-features&end - (are [?record ?expected] - (= ?expected - (walk/prewalk - #(if (instance? (Class/forName "[B") %) (vec %) %) - (#'record/calculate-read-features&end test-seq-resolver subst-mat ?record))) - {:rname "ref", :pos 2, :seq "GCATG", :qual "ABCDE", :cigar "5M"} - [[] - 6] - - {:rname "ref", :pos 2, :seq "GCCTG", :qual "ABCDE", :cigar "5M"} - [[{:code :subst, :pos 3, :subst 2}] - 6] - - {:rname "ref", :pos 2, :seq "GCRTG", :qual "ABCDE", :cigar "5M"} - [[{:code :read-base, :pos 3, :base (int \R), :qual (- (int \C) 33)}] - 6] - - {:rname "ref", :pos 2, :seq "GCCAA", :qual "ABCDE", :cigar "2M2I1M"} - [[{:code :insertion, :pos 3, :bases (mapv int "CA")}] - 4] - - {:rname "ref", :pos 2, :seq "GTCAA", :qual "ABCDE", :cigar "2M2I1M"} - [[{:code :subst, :pos 2, :subst 1} - {:code :insertion, :pos 3, :bases (mapv int "CA")}] - 4] - - {:rname "ref", :pos 2, :seq "GCGTT", :qual "ABCDE", :cigar "2M2D3M"} - [[{:code :deletion, :pos 3, :len 2}] - 8] - - {:rname "ref", :pos 2, :seq "GCCTT", :qual "ABCDE", :cigar "2M2D3M"} - [[{:code :deletion, :pos 3, :len 2} - {:code :subst, :pos 3, :subst 2}] - 8] - - {:rname "ref", :pos 2, :seq "GCATT", :qual "ABCDE", :cigar "3M2S"} - [[{:code :softclip, :pos 4, :bases (mapv int "TT")}] - 4] - - {:rname "ref", :pos 2, :seq "TTGCA", :qual "ABCDE", :cigar "2S3M"} - [[{:code :softclip, :pos 1, :bases (mapv int "TT")}] - 4])) - -(defn- preprocess-slice-records [cram-header records] + (let [a->c (subst-mat/->MutableInt 0) + c->t (subst-mat/->MutableInt 0) + g->c (subst-mat/->MutableInt 0) + subst-mat-builder (test-subst-mat-builder {\A {\C a->c}, \C {\T c->t}, \G {\C g->c}})] + (are [?record ?expected] + (= ?expected + (walk/prewalk + #(if (instance? (Class/forName "[B") %) (vec %) %) + (#'record/calculate-read-features&end test-seq-resolver subst-mat-builder ?record))) + {:rname "ref", :pos 2, :seq "GCATG", :qual "ABCDE", :cigar "5M"} + [[] + 6] + + {:rname "ref", :pos 2, :seq "GCCTG", :qual "ABCDE", :cigar "5M"} + [[{:code :subst, :pos 3, :subst a->c}] + 6] + + {:rname "ref", :pos 2, :seq "GCRTG", :qual "ABCDE", :cigar "5M"} + [[{:code :read-base, :pos 3, :base (int \R), :qual (- (int \C) 33)}] + 6] + + {:rname "ref", :pos 2, :seq "GCCAA", :qual "ABCDE", :cigar "2M2I1M"} + [[{:code :insertion, :pos 3, :bases (mapv int "CA")}] + 4] + + {:rname "ref", :pos 2, :seq "GTCAA", :qual "ABCDE", :cigar "2M2I1M"} + [[{:code :subst, :pos 2, :subst c->t} + {:code :insertion, :pos 3, :bases (mapv int "CA")}] + 4] + + {:rname "ref", :pos 2, :seq "GCGTT", :qual "ABCDE", :cigar "2M2D3M"} + [[{:code :deletion, :pos 3, :len 2}] + 8] + + {:rname "ref", :pos 2, :seq "GCCTT", :qual "ABCDE", :cigar "2M2D3M"} + [[{:code :deletion, :pos 3, :len 2} + {:code :subst, :pos 3, :subst g->c}] + 8] + + {:rname "ref", :pos 2, :seq "GCATT", :qual "ABCDE", :cigar "3M2S"} + [[{:code :softclip, :pos 4, :bases (mapv int "TT")}] + 4] + + {:rname "ref", :pos 2, :seq "TTGCA", :qual "ABCDE", :cigar "2S3M"} + [[{:code :softclip, :pos 1, :bases (mapv int "TT")}] + 4]) + (is (= 1 @a->c)) + (is (= 1 @c->t)) + (is (= 1 @g->c)))) + +(defn- preprocess-slice-records [cram-header subst-mat-init records] (let [opts {:ds-compressor-overrides (constantly :raw) :tag-compressor-overrides (constantly (constantly (constantly {:external :raw})))} - container-ctx (context/make-container-context cram-header test-seq-resolver opts) + container-ctx (-> (context/make-container-context cram-header test-seq-resolver opts) + (assoc :subst-mat-builder (test-subst-mat-builder subst-mat-init))) stats (record/preprocess-slice-records container-ctx records)] (context/finalize-container-context container-ctx [stats]))) (deftest preprocess-slice-records-test (let [cram-header {:SQ [{:SN "ref"}]} + c->a (subst-mat/->MutableInt 0) + subst-mat-init {\C {\A c->a}} records (ArrayList. [{:rname "ref", :pos 1, :cigar "5M", :seq "AGAAT", :qual "HFHHH" :options [{:RG {:type "Z", :value "rg001"}} @@ -100,13 +114,13 @@ :options []} {:rname "*", :pos 10, :cigar "*", :seq "*", :qual "*" :options []}]) - container-ctx (preprocess-slice-records cram-header records)] + container-ctx (preprocess-slice-records cram-header subst-mat-init records)] (is (= [{:rname "ref", :pos 1, :cigar "5M", :seq "AGAAT", :qual "HFHHH" :options [{:RG {:type "Z", :value "rg001"}} {:MD {:type "Z", :value "2C2"}} {:NM {:type "c", :value 1}}] ::record/flag 0x03, ::record/ref-index 0, ::record/end 5, ::record/tags-index 0 - ::record/features [{:code :subst, :pos 3 :subst 0}]} + ::record/features [{:code :subst, :pos 3 :subst c->a}]} {:rname "ref", :pos 5, :cigar "2S3M", :seq "CCTGT", :qual "##AAC" :options [{:RG {:type "Z", :value "rg001"}} {:MD {:type "Z", :value "3"}} @@ -134,6 +148,7 @@ ::record/features []}] (walk/prewalk #(if (.isArray (class %)) (vec %) %) (vec records)))) + (is (= 0 @c->a)) (is (= [[{:tag :MD, :type \Z} {:tag :NM, :type \c}] []] (:tag-dict container-ctx))) @@ -160,6 +175,8 @@ :RG [{:ID "rg001"} {:ID "rg002"}]} + c->a (subst-mat/->MutableInt 0) + subst-mat-init {\C {\A c->a}} records (ArrayList. [{:qname "q001", :flag 99, :rname "ref", :pos 1, :end 5, :mapq 0, :cigar "5M", :rnext "=", :pnext 151, :tlen 150, :seq "AGAAT", :qual "HFHHH" @@ -184,7 +201,8 @@ {:qname "q005", :flag 73, :rname "ref", :pos 20, :end 24, :mapq 0, :cigar "5M", :rnext "*", :pnext 0, :tlen 0, :seq "CTGTG", :qual "AEEEE" :options []}]) - slice-ctx (context/make-slice-context (preprocess-slice-records cram-header records) 0) + slice-ctx (-> (preprocess-slice-records cram-header subst-mat-init records) + (context/make-slice-context 0)) _ (record/encode-slice-records slice-ctx records) ds-res (walk/prewalk #(if (fn? %) (%) %) (:ds-encoders slice-ctx)) tag-res (walk/prewalk #(if (fn? %) (%) %) (:tag-encoders slice-ctx))] @@ -279,7 +297,7 @@ (is (= 1 (count (get ds-res :BS)))) (is (= 23 (get-in ds-res [:BS 0 :content-id]))) - (is (= [0] (seq (get-in ds-res [:BS 0 :data])))) + (is (= [@c->a] (seq (get-in ds-res [:BS 0 :data])))) (is (= 2 (count (get ds-res :IN)))) (is (= 24 (get-in ds-res [:IN 0 :content-id]))) @@ -355,7 +373,8 @@ {:qname "q003", :flag 77, :rname "*", :pos 0, :end 0, :mapq 0, :cigar "*", :rnext "*", :pnext 0, :tlen 0, :seq "GCACA", :qual "BCCFD" :options []}]) - slice-ctx (context/make-slice-context (preprocess-slice-records cram-header records) 0) + slice-ctx (-> (preprocess-slice-records cram-header {} records) + (context/make-slice-context 0)) _ (record/encode-slice-records slice-ctx records) ds-res (walk/prewalk #(if (fn? %) (%) %) (:ds-encoders slice-ctx)) tag-res (walk/prewalk #(if (fn? %) (%) %) (:tag-encoders slice-ctx))] diff --git a/test/cljam/io/cram/encode/subst_matrix_test.clj b/test/cljam/io/cram/encode/subst_matrix_test.clj new file mode 100644 index 00000000..d7725983 --- /dev/null +++ b/test/cljam/io/cram/encode/subst_matrix_test.clj @@ -0,0 +1,27 @@ +(ns cljam.io.cram.encode.subst-matrix-test + (:require [cljam.io.cram.encode.subst-matrix :as subst-mat] + [clojure.test :refer [deftest is]])) + +(deftest make-subst-matrix-builder-test + (let [builder (subst-mat/make-subst-matrix-builder) + a->t (subst-mat/assign-code! builder (int \A) (int \T)) + a->c (subst-mat/assign-code! builder (int \A) (int \C)) + a->g (subst-mat/assign-code! builder (int \A) (int \G)) + a->t' (subst-mat/assign-code! builder (int \A) (int \T)) + g->c (subst-mat/assign-code! builder (int \G) (int \C)) + g->t (subst-mat/assign-code! builder (int \G) (int \T)) + t->n (subst-mat/assign-code! builder (int \T) (int \N)) + n->c (subst-mat/assign-code! builder (int \N) (int \C))] + (is (= {\A {\C 1, \G 2, \T 0, \N 3} + \C {\A 0, \G 1, \T 2, \N 3} + \G {\A 2, \C 0, \T 1, \N 3} + \T {\A 1, \C 2, \G 3, \N 0} + \N {\A 1, \C 0, \G 2, \T 3}} + (subst-mat/build-subst-matrix builder))) + (is (= 0 @a->t @a->t')) + (is (= 1 @a->c)) + (is (= 2 @a->g)) + (is (= 0 @g->c)) + (is (= 1 @g->t)) + (is (= 0 @t->n)) + (is (= 0 @n->c))))