Skip to content

Commit

Permalink
Move tag dict builder to dedicated namespace
Browse files Browse the repository at this point in the history
  • Loading branch information
athos committed Jul 25, 2024
1 parent e21c777 commit 08e3063
Show file tree
Hide file tree
Showing 4 changed files with 135 additions and 100 deletions.
10 changes: 7 additions & 3 deletions src/cljam/io/cram/encode/record.clj
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,8 @@
[cljam.io.sam.util.cigar :as sam.cigar]
[cljam.io.sam.util.flag :as sam.flag]
[cljam.io.sam.util.option :as sam.option]
[cljam.io.cram.seq-resolver.protocol :as resolver])
[cljam.io.cram.seq-resolver.protocol :as resolver]
[cljam.io.cram.encode.tag-dict :as tag-dict])
(:import [java.util Arrays]))

(defn- ref-index [rname->idx rname]
Expand Down Expand Up @@ -201,7 +202,7 @@
"Preprocesses slice records to calculate some record fields prior to record
encoding that are necessary for the CRAM writer to generate some header
components."
[seq-resolver rname->idx subst-mat ^objects records]
[seq-resolver rname->idx tag-dict-builder subst-mat ^objects records]
(dotimes [i (alength records)]
(let [record (aget records i)
;; these flag bits of CF are hard-coded at the moment:
Expand All @@ -211,6 +212,9 @@
cf (cond-> 0x03
(= (:seq record) "*") (bit-or 0x08))
ri (ref-index rname->idx (:rname record))
tags-id (tag-dict/assign-tags-id! tag-dict-builder (:options record))
[fs end] (calculate-read-features&end seq-resolver subst-mat record)
record' (assoc record ::flag cf ::ref-index ri ::end end ::features fs)]
record' (assoc record
::flag cf ::ref-index ri ::end end
::features fs ::tags-index tags-id)]
(aset records i record'))))
58 changes: 58 additions & 0 deletions src/cljam/io/cram/encode/tag_dict.clj
Original file line number Diff line number Diff line change
@@ -0,0 +1,58 @@
(ns cljam.io.cram.encode.tag-dict)

(defprotocol ITagDictionaryBuilder
(assign-tags-id! [this tags])
(build-tag-dict [this]))

(defn make-tag-dict-builder
"TODO"
[]
(let [tags->id (volatile! {})]
(reify ITagDictionaryBuilder
(assign-tags-id! [_ tags]
(let [tags' (into []
(keep (fn [opt]
(let [[tag m] (first opt)]
(when-not (= tag :RG)
[tag (first (:type m))]))))
tags)]
(or (get @tags->id tags')
(let [id (count @tags->id)]
(vswap! tags->id assoc tags' id)
id))))
(build-tag-dict [_]
(->> (sort-by val @tags->id)
(mapv (fn [[tags _]]
(mapv (fn [[tag tag-type]]
{:tag tag :type tag-type})
tags))))))))

(defn- build-tag-encoding [item]
(letfn [(tag-id [item]
(let [tag' (name (:tag item))]
(bit-or (bit-shift-left (int (nth tag' 0)) 16)
(bit-shift-left (int (nth tag' 1)) 8)
(int (:type item)))))
(tag-encoding-for-fixed-size [item size]
{:codec :byte-array-len
:len-encoding {:codec :huffman, :alphabet [size], :bit-len [0]}
:val-encoding {:codec :external, :content-id (tag-id item)}})]
(case (:type item)
(\A \c \C) (tag-encoding-for-fixed-size item 1)
(\s \S) (tag-encoding-for-fixed-size item 2)
(\i \I \f) (tag-encoding-for-fixed-size item 4)
(let [content-id (tag-id item)]
{:codec :byte-array-len
:len-encoding {:codec :external, :content-id content-id}
:val-encoding {:codec :external, :content-id content-id}}))))

(defn build-tag-encodings
"TODO"
[tag-dict]
(reduce
(fn [m entry]
(reduce
(fn [m {tag-type :type :as item}]
(update-in m [(:tag item) tag-type] #(or % (build-tag-encoding item))))
m entry))
{} tag-dict))
66 changes: 9 additions & 57 deletions src/cljam/io/cram/writer.clj
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
[cljam.io.cram.encode.alignment-stats :as stats]
[cljam.io.cram.encode.record :as record]
[cljam.io.cram.encode.structure :as struct]
[cljam.io.cram.encode.tag-dict :as tag-dict]
[cljam.io.cram.seq-resolver.protocol :as resolver]
[cljam.io.protocols :as protocols]
[cljam.io.sam.util.header :as sam.header])
Expand Down Expand Up @@ -49,62 +50,11 @@
(struct/encode-cram-header-container (.-stream wtr) header))

(defn- preprocess-records
[seq-resolver rname->idx subst-mat ^objects container-records]
[seq-resolver rname->idx tag-dict-builder subst-mat ^objects container-records]
(dotimes [i (alength container-records)]
(let [slice-records (aget container-records i)]
(record/preprocess-slice-records seq-resolver rname->idx subst-mat slice-records))))

(defn- build-tag-dictionary [^objects container-records]
(let [tags->index (volatile! {})]
(dotimes [i (alength container-records)]
(let [^objects slice-records (aget container-records i)]
(dotimes [j (alength slice-records)]
(let [record (aget slice-records j)
tags (into []
(keep (fn [opt]
(let [[tag m] (first opt)]
(when-not (= tag :RG)
[tag (first (:type m))]))))
(:options record))
idx (or (get @tags->index tags)
(let [idx (count @tags->index)]
(vswap! tags->index assoc tags idx)
idx))]
(aset slice-records j
(assoc record ::record/tags-index idx))))))
(->> (sort-by val @tags->index)
(mapv (fn [[tags _]]
(mapv (fn [[tag tag-type]]
{:tag tag :type tag-type})
tags))))))

(defn- build-tag-encoding [item]
(letfn [(tag-id [item]
(let [tag' (name (:tag item))]
(bit-or (bit-shift-left (int (nth tag' 0)) 16)
(bit-shift-left (int (nth tag' 1)) 8)
(int (:type item)))))
(tag-encoding-for-fixed-size [item size]
{:codec :byte-array-len
:len-encoding {:codec :huffman, :alphabet [size], :bit-len [0]}
:val-encoding {:codec :external, :content-id (tag-id item)}})]
(case (:type item)
(\A \c \C) (tag-encoding-for-fixed-size item 1)
(\s \S) (tag-encoding-for-fixed-size item 2)
(\i \I \f) (tag-encoding-for-fixed-size item 4)
(let [content-id (tag-id item)]
{:codec :byte-array-len
:len-encoding {:codec :external, :content-id content-id}
:val-encoding {:codec :external, :content-id content-id}}))))

(defn- build-tag-encodings [tag-dict]
(reduce
(fn [m entry]
(reduce
(fn [m {tag-type :type :as item}]
(update-in m [(:tag item) tag-type] #(or % (build-tag-encoding item))))
m entry))
{} tag-dict))
(record/preprocess-slice-records seq-resolver rname->idx tag-dict-builder
subst-mat slice-records))))

(defn- reference-md5 [seq-resolver cram-header {:keys [^long ri ^long start ^long end]}]
(if (neg? ri)
Expand Down Expand Up @@ -222,14 +172,16 @@
rname->idx (into {}
(map-indexed (fn [i {:keys [SN]}] [SN i]))
(:SQ cram-header))
tag-dict-builder (tag-dict/make-tag-dict-builder)
subst-mat {\A {\T 0, \G 1, \C 2, \N 3}
\T {\A 0, \G 1, \C 2, \N 3}
\G {\A 0, \T 1, \C 2, \N 3}
\C {\A 0, \T 1, \G 2, \N 3}
\N {\A 0, \T 1, \G 2, \C 3}}
_ (preprocess-records seq-resolver rname->idx subst-mat container-records)
tag-dict (build-tag-dictionary container-records)
tag-encodings (build-tag-encodings tag-dict)
_ (preprocess-records seq-resolver rname->idx tag-dict-builder
subst-mat container-records)
tag-dict (tag-dict/build-tag-dict tag-dict-builder)
tag-encodings (tag-dict/build-tag-encodings tag-dict)
slices (generate-slices seq-resolver cram-header rname->idx
counter tag-dict tag-encodings container-records)
compression-header-block (struct/generate-compression-header-block
Expand Down
101 changes: 61 additions & 40 deletions test/cljam/io/cram/encode/record_test.clj
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
(ns cljam.io.cram.encode.record-test
(:require [cljam.io.cram.data-series :as ds]
[cljam.io.cram.encode.record :as record]
[cljam.io.cram.encode.tag-dict :as tag-dict]
[cljam.io.cram.seq-resolver.protocol :as resolver]
[cljam.io.sequence :as cseq]
[cljam.test-common :as common]
Expand Down Expand Up @@ -70,28 +71,55 @@

(deftest preprocess-slice-records-test
(let [rname->idx {"ref" 0}
tag-dict-builder (tag-dict/make-tag-dict-builder)
records (object-array
[{:rname "ref", :pos 1, :cigar "5M", :seq "AGAAT", :qual "HFHHH"}
{:rname "ref", :pos 5, :cigar "2S3M",:seq "CCTGT", :qual "##AAC"}
{:rname "ref", :pos 10, :cigar "5M", :seq "GATAA", :qual "CCCFF"}
{:rname "ref", :pos 15, :cigar "1M1I1M1D2M", :seq "GAAAG", :qual "EBBFF"}
{:rname "*", :pos 0, :cigar "*", :seq "CTGTG", :qual "AEEEE"}])]
(record/preprocess-slice-records test-seq-resolver rname->idx subst-mat records)
[{:rname "ref", :pos 1, :cigar "5M", :seq "AGAAT", :qual "HFHHH"
:options [{:RG {:type "Z", :value "rg001"}}
{:MD {:type "Z", :value "2C2"}}
{:NM {:type "c", :value 1}}]}
{:rname "ref", :pos 5, :cigar "2S3M",:seq "CCTGT", :qual "##AAC"
:options [{:RG {:type "Z", :value "rg001"}}
{:MD {:type "Z", :value "3"}}
{:NM {:type "c", :value 0}}]}
{:rname "ref", :pos 10, :cigar "5M", :seq "GATAA", :qual "CCCFF"
:options [{:RG {:type "Z", :value "rg002"}}
{:MD {:type "Z", :value "5"}}
{:NM {:type "c", :value 0}}]}
{:rname "ref", :pos 15, :cigar "1M1I1M1D2M", :seq "GAAAG", :qual "EBBFF"
:options [{:RG {:type "Z", :value "rg002"}}
{:MD {:type "Z", :value "3^T2"}}
{:NM {:type "c", :value 2}}]}
{:rname "*", :pos 0, :cigar "*", :seq "CTGTG", :qual "AEEEE"
:options []}])]
(record/preprocess-slice-records test-seq-resolver rname->idx
tag-dict-builder subst-mat records)
(is (= [{:rname "ref", :pos 1, :cigar "5M", :seq "AGAAT", :qual "HFHHH"
::record/flag 0x03, ::record/ref-index 0, ::record/end 5
:options [{:RG {:type "Z", :value "rg001"}}
{:MD {:type "Z", :value "2C2"}}
{:NM {:type "c", :value 1}}]
::record/flag 0x03, ::record/ref-index 0, ::record/end 5, ::record/tags-index 0
::record/features [{:code :subst, :pos 3 :subst 0}]}
{:rname "ref", :pos 5, :cigar "2S3M",:seq "CCTGT", :qual "##AAC"
::record/flag 0x03, ::record/ref-index 0, ::record/end 7
:options [{:RG {:type "Z", :value "rg001"}}
{:MD {:type "Z", :value "3"}}
{:NM {:type "c", :value 0}}]
::record/flag 0x03, ::record/ref-index 0, ::record/end 7, ::record/tags-index 0
::record/features [{:code :softclip, :pos 1, :bases [(int \C) (int \C)]}]}
{:rname "ref", :pos 10, :cigar "5M", :seq "GATAA", :qual "CCCFF"
::record/flag 0x03, ::record/ref-index 0, ::record/end 14
:options [{:RG {:type "Z", :value "rg002"}}
{:MD {:type "Z", :value "5"}}
{:NM {:type "c", :value 0}}]
::record/flag 0x03, ::record/ref-index 0, ::record/end 14, ::record/tags-index 0
::record/features []}
{:rname "ref", :pos 15, :cigar "1M1I1M1D2M", :seq "GAAAG", :qual "EBBFF"
::record/flag 0x03, ::record/ref-index 0, ::record/end 19
:options [{:RG {:type "Z", :value "rg002"}}
{:MD {:type "Z", :value "3^T2"}}
{:NM {:type "c", :value 2}}]
::record/flag 0x03, ::record/ref-index 0, ::record/end 19, ::record/tags-index 0
::record/features [{:code :insertion, :pos 2, :bases [(int \A)]}
{:code :deletion, :pos 4, :len 1}]}
{:rname "*", :pos 0, :cigar "*", :seq "CTGTG", :qual "AEEEE"
::record/flag 0x03, ::record/ref-index -1, ::record/end 0
{:rname "*", :pos 0, :cigar "*", :seq "CTGTG", :qual "AEEEE", :options []
::record/flag 0x03, ::record/ref-index -1, ::record/end 0, ::record/tags-index 1
::record/features []}]
(walk/prewalk #(if (.isArray (class %)) (vec %) %)
records)))))
Expand All @@ -107,46 +135,37 @@
rname->idx (into {}
(map-indexed (fn [i {:keys [SN]}] [SN i]))
(:SQ cram-header))
tag-dict [[]
[{:tag :MD, :type \Z}
{:tag :NM, :type \c}]]
tag-dict-builder (tag-dict/make-tag-dict-builder)
ds-encoders (ds/build-data-series-encoders ds/default-data-series-encodings)
tag-encoders (ds/build-tag-encoders
{:MD {\Z {:codec :byte-array-len
:len-encoding {:codec :external, :content-id 5063770}
:val-encoding {:codec :external, :content-id 5063770}}}
:NM {\c {:codec :byte-array-len
:len-encoding {:codec :huffman, :alphabet [1], :bit-len [0]}
:val-encoding {:codec :external, :content-id 5131619}}}})
records (object-array
[{:qname "q001", :flag 99, :rname "ref", :pos 1, :end 5, :mapq 0,
:cigar "5M", :rnext "=", :pnext 151, :tlen 150, :seq "AGAAT", :qual "HFHHH"
:options [{:RG {:type "Z", :value "rg001"}}
{:MD {:type "Z", :value "2C2"}}
{:NM {:type "c", :value 1}}]
::record/tags-index 1}
{:NM {:type "c", :value 1}}]}
{:qname "q002", :flag 99, :rname "ref", :pos 5, :end 7, :mapq 15,
:cigar "2S3M", :rnext "=", :pnext 15, :tlen 15, :seq "CCTGT", :qual "##AAC"
:options [{:RG {:type "Z", :value "rg001"}}
{:MD {:type "Z", :value "3"}}
{:NM {:type "c", :value 0}}]
::record/tags-index 1}
{:NM {:type "c", :value 0}}]}
{:qname "q003", :flag 177, :rname "ref", :pos 10, :end 14, :mapq 60,
:cigar "5M", :rnext "ref2", :pnext 100, :tlen 0, :seq "GATAA", :qual "CCCFF"
:options [{:RG {:type "Z", :value "rg002"}}
{:MD {:type "Z", :value "5"}}
{:NM {:type "c", :value 0}}]
::record/tags-index 1}
{:NM {:type "c", :value 0}}]}
{:qname "q004", :flag 147, :rname "ref", :pos 15, :end 19, :mapq 15,
:cigar "1M1I1M1D2M", :rnext "=", :pnext 5, :tlen -15, :seq "GAAAG", :qual "EBBFF"
:options [{:RG {:type "Z", :value "rg002"}}
{:MD {:type "Z", :value "3^T2"}}
{:NM {:type "c", :value 2}}]
::record/tags-index 1}
{:NM {:type "c", :value 2}}]}
{:qname "q005", :flag 73, :rname "ref", :pos 20, :end 24, :mapq 0,
:cigar "5M", :rnext "*", :pnext 0, :tlen 0, :seq "CTGTG", :qual "AEEEE"
:options [], ::record/tags-index 0}])
_ (record/preprocess-slice-records test-seq-resolver rname->idx subst-mat records)
:options []}])
_ (record/preprocess-slice-records test-seq-resolver rname->idx
tag-dict-builder subst-mat records)
tag-dict (tag-dict/build-tag-dict tag-dict-builder)
tag-encodings (tag-dict/build-tag-encodings tag-dict)
tag-encoders (ds/build-tag-encoders tag-encodings)
stats (record/encode-slice-records cram-header rname->idx tag-dict
ds-encoders tag-encoders records)
ds-res (walk/prewalk #(if (fn? %) (%) %) ds-encoders)
Expand Down Expand Up @@ -209,7 +228,7 @@

(is (= 1 (count (get ds-res :TL))))
(is (= 13 (get-in ds-res [:TL 0 :content-id])))
(is (= [1 1 1 1 0] (seq (get-in ds-res [:TL 0 :data]))))
(is (= [0 0 0 0 1] (seq (get-in ds-res [:TL 0 :data]))))

(is (= 1 (count (get ds-res :FN))))
(is (= 14 (get-in ds-res [:FN 0 :content-id])))
Expand Down Expand Up @@ -305,25 +324,27 @@
rname->idx (into {}
(map-indexed (fn [i {:keys [SN]}] [SN i]))
(:SQ cram-header))
tag-dict [[]]
tag-dict-builder (tag-dict/make-tag-dict-builder)
ds-encoders (ds/build-data-series-encoders ds/default-data-series-encodings)
records (object-array
[{:qname "q001", :flag 77, :rname "*", :pos 0, :end 0, :mapq 0,
:cigar "*", :rnext "*", :pnext 0, :tlen 0, :seq "AATCC", :qual "CCFFF"
:options [], ::record/tags-index 0}
:options []}
{:qname "q001", :flag 141, :rname "*", :pos 0, :end 0, :mapq 0,
:cigar "*", :rnext "*", :pnext 0, :tlen 0, :seq "ATTGT", :qual "BDFAD"
:options [], ::record/tags-index 0}
:options []}
{:qname "q002", :flag 77, :rname "*", :pos 0, :end 0, :mapq 0,
:cigar "*", :rnext "*", :pnext 0, :tlen 0, :seq "TGGTA", :qual "ADDHF"
:options [], ::record/tags-index 0}
:options []}
{:qname "q002", :flag 141, :rname "*", :pos 0, :end 0, :mapq 0,
:cigar "*", :rnext "*", :pnext 0, :tlen 0, :seq "TCTTG", :qual "DDDFD"
:options [], ::record/tags-index 0}
:options []}
{:qname "q003", :flag 77, :rname "*", :pos 0, :end 0, :mapq 0,
:cigar "*", :rnext "*", :pnext 0, :tlen 0, :seq "GCACA", :qual "BCCFD"
:options [], ::record/tags-index 0}])
_ (record/preprocess-slice-records test-seq-resolver rname->idx subst-mat records)
:options []}])
_ (record/preprocess-slice-records test-seq-resolver rname->idx
tag-dict-builder subst-mat records)
tag-dict (tag-dict/build-tag-dict tag-dict-builder)
stats (record/encode-slice-records cram-header rname->idx tag-dict
ds-encoders {} records)
ds-res (walk/prewalk #(if (fn? %) (%) %) ds-encoders)]
Expand Down

0 comments on commit 08e3063

Please sign in to comment.