From 07a16f7d83651eb7047f716ba807b2d6560dcb61 Mon Sep 17 00:00:00 2001 From: Shogo Ohta Date: Mon, 13 Nov 2023 07:36:28 +0900 Subject: [PATCH] Replace some LSB protocol uses with byte buffer wrapper --- src/cljam/io/bam/decoder.clj | 208 +++++++++++++++++------------------ src/cljam/io/bcf/reader.clj | 61 +++++----- 2 files changed, 135 insertions(+), 134 deletions(-) diff --git a/src/cljam/io/bam/decoder.clj b/src/cljam/io/bam/decoder.clj index 9589890c..6f1c86da 100644 --- a/src/cljam/io/bam/decoder.clj +++ b/src/cljam/io/bam/decoder.clj @@ -8,7 +8,7 @@ [cljam.io.sam.util.refs :as refs] [cljam.io.sam.util.cigar :as cigar] [cljam.io.bam.common :as common] - [cljam.io.util.lsb :as lsb]) + [cljam.io.util.byte-buffer :as bb]) (:import [java.util Arrays] [java.nio Buffer ByteBuffer ByteOrder CharBuffer] [cljam.io.protocols SAMAlignment SAMRegionBlock SAMCoordinateBlock SAMQuerynameBlock])) @@ -28,7 +28,7 @@ "Parses a tag according to `tag-type`." [tag-type ^ByteBuffer bb] `(case (long ~tag-type) - ~(long \Z) (lsb/read-null-terminated-string ~bb) + ~(long \Z) (bb/read-null-terminated-string ~bb) ~(long \A) (char (.get ~bb)) ~(long \I) (bit-and (.getInt ~bb) 0xffffffff) ~(long \i) (.getInt ~bb) @@ -37,7 +37,7 @@ ~(long \c) (int (.get ~bb)) ~(long \C) (bit-and (int (.get ~bb)) 0xff) ~(long \f) (.getFloat ~bb) - ~(long \H) (proton/hex->bytes (lsb/read-null-terminated-string ~bb)) + ~(long \H) (proton/hex->bytes (bb/read-null-terminated-string ~bb)) (throw (Exception. "Unrecognized tag type")))) (defn- parse-tag-array [^ByteBuffer bb] @@ -129,28 +129,28 @@ When called with start and end, this function may return nil if any base of the block is not included in the range." ([refs block] - (let [buffer (ByteBuffer/wrap (:data block)) - ref-id ^int (lsb/read-int buffer) + (let [buffer (bb/make-lsb-byte-buffer (:data block)) + ref-id (.getInt buffer) rname (or (refs/ref-name refs ref-id) "*") - pos (inc (int (lsb/read-int buffer))) - l-read-name (short (lsb/read-ubyte buffer)) - mapq (short (lsb/read-ubyte buffer)) - _ (lsb/skip buffer 2) ; bin - n-cigar-op (int (lsb/read-ushort buffer)) - flag (int (lsb/read-ushort buffer)) - l-seq (int (lsb/read-int buffer)) - next-ref-id (int (lsb/read-int buffer)) + pos (inc (.getInt buffer)) + l-read-name (short (bb/read-ubyte buffer)) + mapq (short (bb/read-ubyte buffer)) + _ (bb/skip buffer 2) ; bin + n-cigar-op (int (bb/read-ushort buffer)) + flag (int (bb/read-ushort buffer)) + l-seq (.getInt buffer) + next-ref-id (.getInt buffer) rnext (decode-next-ref-id refs ref-id next-ref-id) - pnext (inc (int (lsb/read-int buffer))) - tlen (int (lsb/read-int buffer)) - qname (lsb/read-string buffer (dec l-read-name)) - _ (lsb/skip buffer 1) - cigar-bytes (lsb/read-bytes buffer (* n-cigar-op 4)) + pnext (inc (.getInt buffer)) + tlen (.getInt buffer) + qname (bb/read-string buffer (dec l-read-name)) + _ (bb/skip buffer 1) + cigar-bytes (bb/read-bytes buffer (* n-cigar-op 4)) [cigar len] (cigar/decode-cigar-and-ref-length cigar-bytes) ref-end (if (zero? (long len)) pos (dec (+ pos (long len)))) - seq' (decode-seq (lsb/read-bytes buffer (quot (inc l-seq) 2)) l-seq) - qual (decode-qual (lsb/read-bytes buffer l-seq)) - rest' (lsb/read-bytes buffer (options-size (alength ^bytes (:data block)) l-read-name n-cigar-op l-seq)) + seq' (decode-seq (bb/read-bytes buffer (quot (inc l-seq) 2)) l-seq) + qual (decode-qual (bb/read-bytes buffer l-seq)) + rest' (bb/read-bytes buffer (options-size (alength ^bytes (:data block)) l-read-name n-cigar-op l-seq)) options (decode-options rest') [cigar* options*] (if-let [cg (and (cigar/placeholder? cigar-bytes) @@ -160,31 +160,31 @@ (SAMAlignment. qname (int flag) rname (int pos) ref-end (int mapq) cigar* rnext (int pnext) (int tlen) seq' qual options*))) ([refs block ^long start ^long end] - (let [buffer (ByteBuffer/wrap (:data block)) - ref-id (int (lsb/read-int buffer)) - pos (inc (int (lsb/read-int buffer)))] + (let [buffer (bb/make-lsb-byte-buffer (:data block)) + ref-id (.getInt buffer) + pos (inc (.getInt buffer))] (when (<= pos end) - (let [l-read-name (short (lsb/read-ubyte buffer)) - mapq (short (lsb/read-ubyte buffer)) - _ (lsb/skip buffer 2) ; bin - n-cigar-op (int (lsb/read-ushort buffer)) - flag (int (lsb/read-ushort buffer)) - l-seq (int (lsb/read-int buffer)) - next-ref-id (int (lsb/read-int buffer)) + (let [l-read-name (short (bb/read-ubyte buffer)) + mapq (short (bb/read-ubyte buffer)) + _ (bb/skip buffer 2) ; bin + n-cigar-op (int (bb/read-ushort buffer)) + flag (int (bb/read-ushort buffer)) + l-seq (.getInt buffer) + next-ref-id (.getInt buffer) rnext (decode-next-ref-id refs ref-id next-ref-id) - pnext (inc (int (lsb/read-int buffer))) - tlen (int (lsb/read-int buffer)) - qname (lsb/read-string buffer (dec l-read-name)) - _ (lsb/skip buffer 1) - cigar-bytes (lsb/read-bytes buffer (* n-cigar-op 4)) + pnext (inc (.getInt buffer)) + tlen (.getInt buffer) + qname (bb/read-string buffer (dec l-read-name)) + _ (bb/skip buffer 1) + cigar-bytes (bb/read-bytes buffer (* n-cigar-op 4)) [cigar len] (cigar/decode-cigar-and-ref-length cigar-bytes) ref-end (int (if (zero? (long len)) pos (dec (+ pos (long len)))))] (when (<= start ref-end) - (let [seq' (decode-seq (lsb/read-bytes buffer (quot (inc l-seq) 2)) l-seq) - qual (decode-qual (lsb/read-bytes buffer l-seq)) - rest' (lsb/read-bytes buffer (options-size (alength ^bytes (:data block)) l-read-name n-cigar-op l-seq)) + (let [seq' (decode-seq (bb/read-bytes buffer (quot (inc l-seq) 2)) l-seq) + qual (decode-qual (bb/read-bytes buffer l-seq)) + rest' (bb/read-bytes buffer (options-size (alength ^bytes (:data block)) l-read-name n-cigar-op l-seq)) rname (or (refs/ref-name refs ref-id) "*") options (decode-options rest') [cigar* options*] @@ -198,27 +198,27 @@ (defn decode-region-block "Decodes BAM block and returns a SAMRegionBlock instance containing covering range of the alignment." ([^BAMRawBlock block] - (let [buffer (ByteBuffer/wrap (.data block)) - ref-id (int (lsb/read-int buffer)) - pos (inc (int (lsb/read-int buffer))) - l-read-name (short (lsb/read-ubyte buffer)) - _ (lsb/skip buffer 3) ;; MAPQ, bin - n-cigar-op (int (lsb/read-ushort buffer)) - _ (lsb/skip buffer (+ 18 l-read-name)) ;; flag, l_seq, rnext, pnext, tlen, qname - cigar-bytes (lsb/read-bytes buffer (* n-cigar-op 4)) + (let [buffer (bb/make-lsb-byte-buffer (.data block)) + ref-id (.getInt buffer) + pos (inc (.getInt buffer)) + l-read-name (short (bb/read-ubyte buffer)) + _ (bb/skip buffer 3) ;; MAPQ, bin + n-cigar-op (int (bb/read-ushort buffer)) + _ (bb/skip buffer (+ 18 l-read-name)) ;; flag, l_seq, rnext, pnext, tlen, qname + cigar-bytes (bb/read-bytes buffer (* n-cigar-op 4)) ref-length (cigar/count-ref-bytes cigar-bytes) ref-end (int (if (zero? ref-length) pos (dec (+ pos ref-length))))] (SAMRegionBlock. (.data block) ref-id pos ref-end))) ([^BAMRawBlock block ^long start ^long end] - (let [buffer (ByteBuffer/wrap (.data block)) - ref-id (int (lsb/read-int buffer)) - pos (inc (int (lsb/read-int buffer)))] + (let [buffer (bb/make-lsb-byte-buffer (.data block)) + ref-id (.getInt buffer) + pos (inc (.getInt buffer))] (when (<= pos end) - (let [l-read-name (short (lsb/read-ubyte buffer)) - _ (lsb/skip buffer 3) ;; MAPQ, bin - n-cigar-op (int (lsb/read-ushort buffer)) - _ (lsb/skip buffer (+ 18 l-read-name)) ;; flag, l_seq, rnext, pnext, tlen, qname - cigar-bytes (lsb/read-bytes buffer (* n-cigar-op 4)) + (let [l-read-name (short (bb/read-ubyte buffer)) + _ (bb/skip buffer 3) ;; MAPQ, bin + n-cigar-op (int (bb/read-ushort buffer)) + _ (bb/skip buffer (+ 18 l-read-name)) ;; flag, l_seq, rnext, pnext, tlen, qname + cigar-bytes (bb/read-bytes buffer (* n-cigar-op 4)) ref-length (cigar/count-ref-bytes cigar-bytes) ref-end (if (zero? ref-length) pos (dec (+ pos ref-length)))] (when (<= start ref-end) @@ -227,23 +227,23 @@ (defn decode-coordinate-block "Decodes BAM block and returns a SAMCoordinateBlock instance containing ref-id, pos and flag." ([^BAMRawBlock block] - (let [buffer (ByteBuffer/wrap (.data block)) - ref-id (int (lsb/read-int buffer)) - pos (inc (int (lsb/read-int buffer))) - _ (lsb/skip buffer 6) ;; l_read_name, MAPQ, bin, n_cigar_op - flag (int (lsb/read-ushort buffer))] ;; l_seq, rnext, pnext, tlen, qname + (let [buffer (bb/make-lsb-byte-buffer (.data block)) + ref-id (.getInt buffer) + pos (inc (.getInt buffer)) + _ (bb/skip buffer 6) ;; l_read_name, MAPQ, bin, n_cigar_op + flag (int (bb/read-ushort buffer))] ;; l_seq, rnext, pnext, tlen, qname (SAMCoordinateBlock. (.data block) (int ref-id) (int pos) (int flag)))) ([^BAMRawBlock block ^long start ^long end] - (let [buffer (ByteBuffer/wrap (.data block)) - ref-id (int (lsb/read-int buffer)) - pos (inc (int (lsb/read-int buffer)))] + (let [buffer (bb/make-lsb-byte-buffer (.data block)) + ref-id (.getInt buffer) + pos (inc (.getInt buffer))] (when (<= pos end) - (let [l-read-name (short (lsb/read-ubyte buffer)) - _ (lsb/skip buffer 3) ;; MAPQ, bin - n-cigar-op (int (lsb/read-ushort buffer)) - flag (int (lsb/read-ushort buffer)) - _ (lsb/skip buffer (+ 16 l-read-name)) ;; l_seq, rnext, pnext, tlen, qname - cigar-bytes (lsb/read-bytes buffer (* n-cigar-op 4)) + (let [l-read-name (short (bb/read-ubyte buffer)) + _ (bb/skip buffer 3) ;; MAPQ, bin + n-cigar-op (int (bb/read-ushort buffer)) + flag (int (bb/read-ushort buffer)) + _ (bb/skip buffer (+ 16 l-read-name)) ;; l_seq, rnext, pnext, tlen, qname + cigar-bytes (bb/read-bytes buffer (* n-cigar-op 4)) ref-length (cigar/count-ref-bytes cigar-bytes) ref-end (if (zero? ref-length) pos (dec (+ pos ref-length)))] (when (<= start ref-end) @@ -252,26 +252,26 @@ (defn decode-queryname-block "Decodes BAM block and returns a SAMQuerynameBlock instance containing qname and flag." ([^BAMRawBlock block] - (let [buffer (ByteBuffer/wrap (.data block)) - _ (lsb/skip buffer 8) ;; ref-id, pos - l-read-name (short (lsb/read-ubyte buffer)) - _ (lsb/skip buffer 5) ;; MAPQ, bin, n_cigar_op - flag (int (lsb/read-ushort buffer)) - _ (lsb/skip buffer 16) ;; l_seq, rnext, pnext, tlen - qname (lsb/read-string buffer l-read-name)] + (let [buffer (bb/make-lsb-byte-buffer (.data block)) + _ (bb/skip buffer 8) ;; ref-id, pos + l-read-name (short (bb/read-ubyte buffer)) + _ (bb/skip buffer 5) ;; MAPQ, bin, n_cigar_op + flag (int (bb/read-ushort buffer)) + _ (bb/skip buffer 16) ;; l_seq, rnext, pnext, tlen + qname (bb/read-string buffer l-read-name)] (SAMQuerynameBlock. (.data block) qname (int flag)))) ([^BAMRawBlock block ^long start ^long end] - (let [buffer (ByteBuffer/wrap (.data block)) - _ (lsb/skip buffer 4) ;; ref-id - pos (inc (int (lsb/read-int buffer)))] + (let [buffer (bb/make-lsb-byte-buffer (.data block)) + _ (bb/skip buffer 4) ;; ref-id + pos (inc (.getInt buffer))] (when (<= pos end) - (let [l-read-name (short (lsb/read-ubyte buffer)) - _ (lsb/skip buffer 3) ;; MAPQ, bin - n-cigar-op (int (lsb/read-ushort buffer)) - flag (int (lsb/read-ushort buffer)) - _ (lsb/skip buffer 16) ;; l_seq, rnext, pnext, tlen - qname (lsb/read-string buffer l-read-name) - cigar-bytes (lsb/read-bytes buffer (* n-cigar-op 4)) + (let [l-read-name (short (bb/read-ubyte buffer)) + _ (bb/skip buffer 3) ;; MAPQ, bin + n-cigar-op (int (bb/read-ushort buffer)) + flag (int (bb/read-ushort buffer)) + _ (bb/skip buffer 16) ;; l_seq, rnext, pnext, tlen + qname (bb/read-string buffer l-read-name) + cigar-bytes (bb/read-bytes buffer (* n-cigar-op 4)) ref-length (cigar/count-ref-bytes cigar-bytes) ref-end (if (zero? ref-length) pos (dec (+ pos ref-length)))] (when (<= start ref-end) @@ -281,29 +281,29 @@ (defn decode-pointer-block "Decodes BAM block and returns a BAMPointerBlock instance containing region, flag and block pointers." ([^BAMRawBlock block] - (let [buffer (ByteBuffer/wrap (.data block)) - ref-id (int (lsb/read-int buffer)) - pos (inc (int (lsb/read-int buffer))) - l-read-name (short (lsb/read-ubyte buffer)) - _ (lsb/skip buffer 3) ;; MAPQ, bin - n-cigar-op (int (lsb/read-ushort buffer)) - flag (int (lsb/read-ushort buffer)) - _ (lsb/skip buffer (+ 16 l-read-name)) ;; l_seq, rnext, pnext, tlen, qname - cigar-bytes (lsb/read-bytes buffer (* n-cigar-op 4)) + (let [buffer (bb/make-lsb-byte-buffer (.data block)) + ref-id (.getInt buffer) + pos (inc (.getInt buffer)) + l-read-name (short (bb/read-ubyte buffer)) + _ (bb/skip buffer 3) ;; MAPQ, bin + n-cigar-op (int (bb/read-ushort buffer)) + flag (int (bb/read-ushort buffer)) + _ (bb/skip buffer (+ 16 l-read-name)) ;; l_seq, rnext, pnext, tlen, qname + cigar-bytes (bb/read-bytes buffer (* n-cigar-op 4)) ref-length (cigar/count-ref-bytes cigar-bytes) ref-end (if (zero? ref-length) pos (dec (+ pos ref-length)))] (BAMPointerBlock. (.data block) ref-id pos ref-end (int flag) (.pointer-beg block) (.pointer-end block)))) ([^BAMRawBlock block ^long start ^long end] - (let [buffer (ByteBuffer/wrap (.data block)) - ref-id (int (lsb/read-int buffer)) - pos (inc (int (lsb/read-int buffer)))] + (let [buffer (bb/make-lsb-byte-buffer (.data block)) + ref-id (.getInt buffer) + pos (inc (.getInt buffer))] (when (<= pos end) - (let [l-read-name (short (lsb/read-ubyte buffer)) - _ (lsb/skip buffer 3) ;; MAPQ, bin - n-cigar-op (int (lsb/read-ushort buffer)) - flag (int (lsb/read-ushort buffer)) - _ (lsb/skip buffer (+ 16 l-read-name)) ;; l_seq, rnext, pnext, tlen, qname - cigar-bytes (lsb/read-bytes buffer (* n-cigar-op 4)) + (let [l-read-name (short (bb/read-ubyte buffer)) + _ (bb/skip buffer 3) ;; MAPQ, bin + n-cigar-op (int (bb/read-ushort buffer)) + flag (int (bb/read-ushort buffer)) + _ (bb/skip buffer (+ 16 l-read-name)) ;; l_seq, rnext, pnext, tlen, qname + cigar-bytes (bb/read-bytes buffer (* n-cigar-op 4)) ref-length (cigar/count-ref-bytes cigar-bytes) ref-end (if (zero? ref-length) pos (dec (+ pos ref-length)))] (when (<= start ref-end) diff --git a/src/cljam/io/bcf/reader.clj b/src/cljam/io/bcf/reader.clj index 9c625800..9449543b 100644 --- a/src/cljam/io/bcf/reader.clj +++ b/src/cljam/io/bcf/reader.clj @@ -3,6 +3,7 @@ [clojure.tools.logging :as logging] [cljam.io.protocols :as protocols] [cljam.io.util.bgzf :as bgzf] + [cljam.io.util.byte-buffer :as bb] [cljam.io.util.lsb :as lsb] [cljam.io.vcf.reader :as vcf-reader] [cljam.io.vcf.util :as vcf-util] @@ -110,18 +111,18 @@ (defn- read-typed-atomic-value "Reads an atomic value, which is typed as either integer(8,16,32 bit) or float or character." - [r ^long type-id] + [^ByteBuffer bb ^long type-id] (case type-id - 1 (let [i (byte (lsb/read-byte r))] + 1 (let [i (.get bb)] (case (bit-and 0xFF i) 0x80 nil 0x81 :eov i)) - 2 (let [i (short (lsb/read-short r))] + 2 (let [i (.getShort bb)] (case (bit-and 0xFFFF i) 0x8000 nil 0x8001 :eov i)) - 3 (let [i (int (lsb/read-int r))] + 3 (let [i (.getInt bb)] (case (bit-and 0xFFFFFFFF i) 0x80000000 nil 0x80000001 :eov i)) - 5 (let [i (int (lsb/read-int r))] + 5 (let [i (.getInt bb)] (case (bit-and 0xFFFFFFFF i) 0x7F800001 nil 0x7F800002 :eov (Float/intBitsToFloat i))) - 7 (lsb/read-byte r))) + 7 (.get bb))) (defn- bytes->strs [ba] @@ -132,20 +133,20 @@ (cstr/split (String. (byte-array ba)) #","))) (defn- read-typed-value - "Reads typed value from BCF file. n-sample is a number of values repeated + "Reads typed value from byte buffer. n-sample is a number of values repeated after type specifier byte." - ([rdr] - (first (read-typed-value rdr 1))) - ([rdr ^long n-sample] - (let [type-byte (int (lsb/read-byte rdr)) + ([bb] + (first (read-typed-value bb 1))) + ([^ByteBuffer bb ^long n-sample] + (let [type-byte (int (.get bb)) len (unsigned-bit-shift-right (bit-and 0xF0 type-byte) 4) - total-len (if (= len 15) (long (first (read-typed-value rdr))) len) + total-len (if (= len 15) (long (first (read-typed-value bb))) len) type-id (bit-and 0x0F type-byte)] (case type-id 0 (repeat n-sample nil) 7 (doall (repeatedly n-sample - #(bytes->strs (lsb/read-bytes rdr total-len)))) - (->> #(read-typed-atomic-value rdr type-id) + #(bytes->strs (bb/read-bytes bb total-len)))) + (->> #(read-typed-atomic-value bb type-id) (repeatedly (* n-sample total-len)) (partition total-len) (map (fn [xs] (take-while #(not= % :eov) xs))) @@ -153,19 +154,19 @@ (defn- read-typed-kv "Reads a key-value pair." - ([rdr] - (let [[k [v]] (read-typed-kv rdr 1)] + ([bb] + (let [[k [v]] (read-typed-kv bb 1)] [k v])) - ([rdr n-sample] - [(first (read-typed-value rdr)) (read-typed-value rdr n-sample)])) + ([bb n-sample] + [(first (read-typed-value bb)) (read-typed-value bb n-sample)])) (defn- read-data-line-buffer "Reads a single record of variant and store to the ByteBuffer objects." [rdr] (let [l-shared (lsb/read-uint rdr) l-indv (lsb/read-uint rdr) - shared-bb (ByteBuffer/allocate l-shared) - indv-bb (ByteBuffer/allocate l-indv)] + shared-bb (bb/allocate-lsb-byte-buffer l-shared) + indv-bb (bb/allocate-lsb-byte-buffer l-indv)] (lsb/read-bytes rdr (.array shared-bb) 0 l-shared) (lsb/read-bytes rdr (.array indv-bb) 0 l-indv) {:l-shared l-shared @@ -176,23 +177,23 @@ (defn- parse-data-line-shallow "Parses only chromosome, position and ref-length. Can be used for position-based querying." [contigs {:keys [^ByteBuffer shared] :as m}] - (let [chrom-id (lsb/read-int shared) - pos (inc (int (lsb/read-int shared))) - rlen (lsb/read-int shared)] + (let [chrom-id (.getInt shared) + pos (inc (.getInt shared)) + rlen (.getInt shared)] (.position ^Buffer shared 0) (assoc m :chr (:id (contigs chrom-id)) :pos pos :rlen rlen))) (defn- parse-data-line-deep "Parses full data of a variant. Returns a map containing indices for meta-info." [{:keys [^ByteBuffer shared ^ByteBuffer individual]}] - (let [chrom-id (lsb/read-int shared) - pos (inc (int (lsb/read-int shared))) - rlen (lsb/read-int shared) - qual (lsb/read-int shared) - n-allele-info (int (lsb/read-int shared)) + (let [chrom-id (.getInt shared) + pos (inc (.getInt shared)) + rlen (.getInt shared) + qual (.getInt shared) + n-allele-info (.getInt shared) n-allele (unsigned-bit-shift-right n-allele-info 16) n-info (bit-and n-allele-info 0xFFFF) - n-fmt-sample (long (lsb/read-uint shared)) + n-fmt-sample (long (bb/read-uint shared)) n-fmt (bit-and 0xFF (unsigned-bit-shift-right n-fmt-sample 24)) n-sample (bit-and n-fmt-sample 0xFFFFFF) id (let [i (read-typed-value shared)] (if (sequential? i) (first i) i)) @@ -293,7 +294,7 @@ :vcf VCF-style map. FORMAT, FILTER, INFO and samples columns are strings. :bcf BCF-style map. CHROM, FILTER, INFO and :genotype contains indices to meta-info. :shallow Only CHROM, POS and ref-length are parsed. - :raw Raw map of ByteBufers." + :raw Raw map of ByteBuffers." ([rdr] (read-variants rdr {})) ([^BCFReader rdr {:keys [depth] :or {depth :deep}}]