From 8617d433de7f8d23c1c03307e70e5221319e5012 Mon Sep 17 00:00:00 2001 From: Shogo Ohta Date: Tue, 30 Jan 2024 17:38:28 +0900 Subject: [PATCH] wip --- src/cljam/io/cram.clj | 30 +++++---------------- src/cljam/io/cram/core.clj | 41 +++++++++++++++++++++++++++++ src/cljam/io/cram/decode/record.clj | 26 +++++++++--------- src/cljam/io/cram/reader.clj | 37 ++++++++++++++++---------- src/cljam/io/cram/seq_resolver.clj | 23 ++++++++++++++++ src/cljam/io/util.clj | 8 ++++++ 6 files changed, 115 insertions(+), 50 deletions(-) create mode 100644 src/cljam/io/cram/core.clj create mode 100644 src/cljam/io/cram/seq_resolver.clj diff --git a/src/cljam/io/cram.clj b/src/cljam/io/cram.clj index 12e496bc..a426d277 100644 --- a/src/cljam/io/cram.clj +++ b/src/cljam/io/cram.clj @@ -1,32 +1,16 @@ (ns cljam.io.cram {:clj-kondo/ignore [:missing-docstring]} - (:require [cljam.io.cram.reader :as reader] + (:require [cljam.io.cram.core :as cram] [cljam.io.protocols :as protocols] - [cljam.io.sequence :as cseq] - [cljam.io.util.byte-buffer :as bb] - [cljam.util :as util] - [clojure.java.io :as cio]) - (:import [cljam.io.cram.reader CRAMReader] - [java.nio.channels FileChannel] - [java.nio.file OpenOption StandardOpenOption])) + [cljam.io.util :as io-util]) + (:import [cljam.io.cram.reader CRAMReader])) (defn reader (^CRAMReader [f] (reader f {})) - (^CRAMReader [f {:keys [reference]}] - (let [file (cio/file f) - url (util/as-url (.getAbsolutePath file)) - ch (FileChannel/open (.toPath file) - (into-array OpenOption [StandardOpenOption/READ])) - bb (bb/allocate-lsb-byte-buffer 256) - seq-rdr (some-> reference cseq/reader) - header (volatile! nil) - refs (delay - (mapv (fn [{:keys [SN LN]}] {:name SN :len LN}) - (:SQ @header))) - rdr (reader/->CRAMReader url ch bb header refs seq-rdr)] - (reader/read-file-definition rdr) - (vreset! header (reader/read-header rdr)) - rdr))) + (^CRAMReader [f option] + (if (io-util/cram-reader? f) + (cram/clone-reader f) + (cram/reader f option)))) (defn read-header [rdr] (protocols/read-header rdr)) diff --git a/src/cljam/io/cram/core.clj b/src/cljam/io/cram/core.clj new file mode 100644 index 00000000..74a85b24 --- /dev/null +++ b/src/cljam/io/cram/core.clj @@ -0,0 +1,41 @@ +(ns cljam.io.cram.core + {:clj-kondo/ignore [:missing-docstring]} + (:require [cljam.io.cram.seq-resolver :as resolver] + [cljam.io.cram.reader :as reader.core] + [cljam.io.util.byte-buffer :as bb] + [cljam.util :as util] + [clojure.java.io :as cio]) + (:import [cljam.io.cram.reader CRAMReader] + [java.nio.channels FileChannel] + [java.nio.file OpenOption StandardOpenOption])) + +(defn reader ^CRAMReader [f {:keys [reference]}] + (let [file (cio/file f) + url (util/as-url (.getAbsolutePath file)) + ch (FileChannel/open (.toPath file) + (into-array OpenOption [StandardOpenOption/READ])) + bb (bb/allocate-lsb-byte-buffer 256) + seq-resolver (some-> reference resolver/seq-resolver) + header (volatile! nil) + refs (delay + (mapv (fn [{:keys [SN LN]}] {:name SN :len LN}) + (:SQ @header))) + rdr (reader.core/->CRAMReader url ch bb header refs seq-resolver)] + (reader.core/read-file-definition rdr) + (vreset! header (reader.core/read-header rdr)) + rdr)) + +(defn clone-reader ^CRAMReader [^CRAMReader rdr] + (let [url (.-url rdr) + file (cio/as-file url) + ch (FileChannel/open (.toPath file) + (into-array OpenOption [StandardOpenOption/READ])) + bb (bb/allocate-lsb-byte-buffer 256) + seq-resolver (some-> (.-seq-resolver rdr) resolver/clone-seq-resolver) + rdr' (reader.core/->CRAMReader url ch bb + (delay @(.-header rdr)) + (delay @(.-refs rdr)) + seq-resolver)] + (reader.core/read-file-definition rdr') + (reader.core/skip-container rdr') + rdr')) diff --git a/src/cljam/io/cram/decode/record.clj b/src/cljam/io/cram/decode/record.clj index 1bbc327a..b6094d9c 100644 --- a/src/cljam/io/cram/decode/record.clj +++ b/src/cljam/io/cram/decode/record.clj @@ -1,7 +1,6 @@ (ns cljam.io.cram.decode.record {:clj-kondo/ignore [:missing-docstring]} (:require [cljam.io.sam.util.cigar :as sam.cigar] - [cljam.io.sequence :as cseq] [cljam.io.util.byte-buffer :as bb])) (defn- build-positional-data-decoder @@ -62,9 +61,9 @@ (assoc record :options)))))) (defn- record-seq - [seq-reader {:keys [preservation-map]} {:keys [rname pos end ::len]} features] + [seq-resolver {:keys [preservation-map]} {:keys [rname pos end ::len]} features] (let [region {:chr rname :start pos :end end} - ref-bases (.getBytes ^String (cseq/read-sequence seq-reader region)) + ref-bases (.getBytes ^String (seq-resolver region)) bs (byte-array len (byte (int \N))) subst (:SM preservation-map)] (loop [[f & more :as fs] features, rpos 0, spos 1] @@ -173,7 +172,7 @@ (+ (long pos)))) (defn- build-mapped-read-decoder - [seq-reader + [seq-resolver compression-header {:keys [FN MQ QS FC FP BA BS IN SC HC PD DL RS BB QQ] :as decoders}] (fn [{::keys [len] :as record}] @@ -185,7 +184,7 @@ end (record-end record features) record' (assoc record :end end)] (assoc record' - :seq (record-seq seq-reader compression-header record' features) + :seq (record-seq seq-resolver compression-header record' features) :qual (record-qual record' features decoders) :mapq (MQ) :cigar (->> (or (features->cigar len features) @@ -227,16 +226,17 @@ (assoc record' :qual (String. (.array bb)))))))) (defn build-cram-record-decoder - [seq-reader cram-header compression-header slice-header {:keys [BF CF] :as decoders} tag-decoders] - (let [pd-decoder (build-positional-data-decoder cram-header + [seq-resolver cram-header compression-header slice-header ds-decoders tag-decoders] + (let [{:keys [BF CF]} ds-decoders + pd-decoder (build-positional-data-decoder cram-header compression-header slice-header - decoders) - rn-decoder (build-read-name-decoder compression-header decoders) - mt-decoder (build-mate-read-decoder cram-header decoders) - at-decoder (build-auxiliary-tags-decoder compression-header decoders tag-decoders) - mr-decoder (build-mapped-read-decoder seq-reader compression-header decoders) - ur-decoder (build-unmapped-read-decoder decoders)] + ds-decoders) + rn-decoder (build-read-name-decoder compression-header ds-decoders) + mt-decoder (build-mate-read-decoder cram-header ds-decoders) + at-decoder (build-auxiliary-tags-decoder compression-header ds-decoders tag-decoders) + mr-decoder (build-mapped-read-decoder seq-resolver compression-header ds-decoders) + ur-decoder (build-unmapped-read-decoder ds-decoders)] (fn [] (let [bf (BF) cf (CF) diff --git a/src/cljam/io/cram/reader.clj b/src/cljam/io/cram/reader.clj index 54d88183..724a2182 100644 --- a/src/cljam/io/cram/reader.clj +++ b/src/cljam/io/cram/reader.clj @@ -8,13 +8,13 @@ [java.nio Buffer ByteBuffer ByteOrder] [java.nio.channels FileChannel FileChannel$MapMode])) -(declare read-header read-alignments) +(declare read-alignments) -(deftype CRAMReader [url channel buffer header refs seq-reader] +(deftype CRAMReader [url channel buffer header refs seq-resolver] Closeable (close [_] - (when seq-reader - (.close ^Closeable seq-reader)) + (when seq-resolver + (.close ^Closeable seq-resolver)) (.close ^FileChannel channel)) protocols/IReader (reader-url [_] url) @@ -95,10 +95,9 @@ (let [slice-header (struct/decode-slice-header-block bb) blocks (into [] (map (fn [_] (struct/decode-block bb))) (range (:blocks slice-header))) - ds-decoders (ds/build-data-series-decoders compression-header blocks) tag-decoders (ds/build-tag-decoders compression-header blocks) - record-decoder (record/build-cram-record-decoder (.-seq-reader rdr) + record-decoder (record/build-cram-record-decoder (.-seq-resolver rdr) @(.-header rdr) compression-header slice-header @@ -121,23 +120,33 @@ (.position ^Buffer bb (+ container-header-end landmark)) (read-slice-records rdr bb compression-header)))))) -(defn- read-container-with [^CRAMReader rdr f] +(defn- with-next-container-header [^CRAMReader rdr f] (let [^FileChannel ch (.-channel rdr) pos (.position ch) _ (read-to-buffer rdr) ^ByteBuffer bb (.-buffer rdr) container-header (struct/decode-container-header bb) - header-size (.position bb) + container-start (+ pos (.position bb)) container-size (long (:length container-header)) - bb' (-> ch - (.map FileChannel$MapMode/READ_ONLY (+ pos header-size) container-size) - (.order ByteOrder/LITTLE_ENDIAN)) - ret (f container-header bb')] - (.position ch (+ pos header-size container-size)) + ret (f container-header container-start)] + (.position ch (+ container-start container-size)) ret)) +(defn- read-container-with [^CRAMReader rdr f] + (letfn [(f' [container-header container-start] + (let [container-size (long (:length container-header)) + ^FileChannel ch (.-channel rdr) + bb (-> ch + (.map FileChannel$MapMode/READ_ONLY container-start container-size) + (.order ByteOrder/LITTLE_ENDIAN))] + (f container-header bb)))] + (with-next-container-header rdr f'))) + +(defn skip-container [rdr] + (with-next-container-header rdr (constantly nil))) + (defn read-header [^CRAMReader rdr] - (read-container-with rdr #(struct/decode-cram-header-block %2))) + (read-container-with rdr (fn [_ bb] (struct/decode-cram-header-block bb)))) (defn read-alignments [^CRAMReader rdr] (let [^FileChannel ch (.-channel rdr)] diff --git a/src/cljam/io/cram/seq_resolver.clj b/src/cljam/io/cram/seq_resolver.clj new file mode 100644 index 00000000..e7d8ebd7 --- /dev/null +++ b/src/cljam/io/cram/seq_resolver.clj @@ -0,0 +1,23 @@ +(ns cljam.io.cram.seq-resolver + {:clj-kondo/ignore [:missing-docstring]} + (:require [cljam.io.sequence :as cseq]) + (:import [java.io Closeable])) + +(declare seq-resolver) + +(deftype SeqResolver [seq-reader] + Object + (clone [_] + (seq-resolver seq-reader)) + java.io.Closeable + (close [_] + (.close ^Closeable seq-reader)) + clojure.lang.IFn + (invoke [_ region] + (cseq/read-sequence seq-reader region))) + +(defn seq-resolver [seq-file] + (->SeqResolver (cseq/reader seq-file))) + +(defn clone-seq-resolver [^SeqResolver resolver] + (.clone resolver)) diff --git a/src/cljam/io/util.clj b/src/cljam/io/util.clj index c8241dba..fb06b0fa 100644 --- a/src/cljam/io/util.clj +++ b/src/cljam/io/util.clj @@ -5,6 +5,7 @@ cljam.io.sam.writer cljam.io.bam.reader cljam.io.bam.writer + cljam.io.cram.reader cljam.io.vcf.reader cljam.io.vcf.writer cljam.io.bcf.reader @@ -50,6 +51,11 @@ [wtr] (instance? cljam.io.bam.writer.BAMWriter wtr)) +(defn cram-reader? + "Checks if given object is an instance of CRAMReader." + [rdr] + (instance? cljam.io.cram.reader.CRAMReader rdr)) + (defn variant-reader? "Checks if given object implements protocol IVariantReader." [rdr] @@ -153,6 +159,7 @@ #"(?i)\.sam$" :sam #"(?i)\.bai$" :bai #"(?i)\.bam$" :bam + #"(?i)\.cram$" :cram #"(?i)\.f(ast)?q" :fastq #"(?i)\.fai$" :fai #"(?i)\.(fa|fasta|fas|fsa|seq|fna|faa|ffn|frn|mpfa)" :fasta @@ -183,6 +190,7 @@ (condp re-find s #"^BAM\01" :bam #"^BAI\01" :bai + #"^CRAM" :cram #"^BCF\02" :bcf #"^TBI\01" :tbi #"^##fileformat=VCF" :vcf