diff --git a/src/cljam/io/cram/encode/context.clj b/src/cljam/io/cram/encode/context.clj index a5c4a8d2..4f926eeb 100644 --- a/src/cljam/io/cram/encode/context.clj +++ b/src/cljam/io/cram/encode/context.clj @@ -1,13 +1,18 @@ (ns cljam.io.cram.encode.context (:require [cljam.io.cram.data-series :as ds] - [cljam.io.cram.encode.tag-dict :as tag-dict])) + [cljam.io.cram.encode.tag-dict :as tag-dict] + [cljam.io.sam.util.header :as sam.header])) (defn make-container-context "Creates a new container context." - [cram-header preservation-map seq-resolver] + [cram-header seq-resolver] (let [rname->idx (into {} (map-indexed (fn [i {:keys [SN]}] [SN i])) (:SQ cram-header)) + preservation-map (cond-> {:RN true, :AP false, :RR true} + (= (sam.header/sort-order cram-header) + sam.header/order-coordinate) + (assoc :AP true)) subst-mat {\A {\T 0, \G 1, \C 2, \N 3} \T {\A 0, \G 1, \C 2, \N 3} \G {\A 0, \T 1, \C 2, \N 3} @@ -25,24 +30,26 @@ "Finalizes the builders in the container context and returns a new container context containing those builders' results. This operation must be done before creating a slice context." - [container-ctx ds-compressor-overrides tag-compressor-overrides] + [container-ctx alignment-stats ds-compressor-overrides tag-compressor-overrides] (let [ds-encodings (-> ds/default-data-series-encodings (ds/apply-ds-compressor-overrides ds-compressor-overrides)) tag-dict (tag-dict/build-tag-dict (:tag-dict-builder container-ctx)) tag-encodings (-> (tag-dict/build-tag-encodings tag-dict) (ds/apply-tag-compressor-overrides tag-compressor-overrides))] (assoc container-ctx + :alignment-stats alignment-stats :ds-encodings ds-encodings :tag-dict tag-dict :tag-encodings tag-encodings))) (defn make-slice-context - "Creates a slice context from the given container context. Note that the container - context must be finalized with `finalize-container-context`." - [{:keys [ds-encodings tag-encodings] :as container-ctx}] + "Creates a slice context for the ith slice from the given container context. + Note that the container context must be finalized with `finalize-container-context`." + [{:keys [alignment-stats ds-encodings tag-encodings] :as container-ctx} i] (let [ds-encoders (ds/build-data-series-encoders ds-encodings) tag-encoders (ds/build-tag-encoders tag-encodings)] (assoc container-ctx + :alignment-stats (nth alignment-stats i) :ds-encoders ds-encoders :tag-encoders tag-encoders))) diff --git a/src/cljam/io/cram/encode/record.clj b/src/cljam/io/cram/encode/record.clj index bcd10e4a..fc5d4fec 100644 --- a/src/cljam/io/cram/encode/record.clj +++ b/src/cljam/io/cram/encode/record.clj @@ -12,15 +12,22 @@ -1 (get rname->idx rname))) -(defn- build-positional-data-encoder [{:keys [cram-header]} {:keys [RI RL AP RG]}] +(defn- build-positional-data-encoder + [{:keys [cram-header preservation-map alignment-stats]} {:keys [RI RL AP RG]}] (let [rg-id->idx (into {} (map-indexed (fn [i {:keys [ID]}] [ID i])) - (:RG cram-header))] + (:RG cram-header)) + AP' (if (:AP preservation-map) + (let [pos (volatile! (:start alignment-stats))] + (fn [^long pos'] + (AP (- pos' (long @pos))) + (vreset! pos pos'))) + AP)] (fn [record] (let [rg (sam.option/value-for-tag :RG record)] (RI (::ref-index record)) (RL (count (:seq record))) - (AP (:pos record)) + (AP' (:pos record)) (RG (if rg (get rg-id->idx rg) -1)))))) (defn- build-read-name-encoder [{:keys [RN]}] @@ -113,8 +120,7 @@ (BA (aget bs i))) (encode-qual record QS)))) -(defn- build-cram-record-encoder - [{:keys [ds-encoders] :as slice-ctx} stats-builder] +(defn- build-cram-record-encoder [{:keys [ds-encoders] :as slice-ctx}] (let [pos-encoder (build-positional-data-encoder slice-ctx ds-encoders) name-encoder (build-read-name-encoder ds-encoders) mate-encoder (build-mate-read-encoder slice-ctx ds-encoders) @@ -125,9 +131,6 @@ (fn [record] (let [bf (bit-and (long (:flag record)) (bit-not (sam.flag/encoded #{:next-reversed :next-unmapped})))] - (stats/update! stats-builder - (::ref-index record) (:pos record) (::end record) - (count (:seq record)) 1) (BF bf) (CF (::flag record)) (pos-encoder record) @@ -142,10 +145,8 @@ "Encodes CRAM records in a slice all at once using the given slice context. Returns the alignment stats for this slice." [slice-ctx records] - (let [stats-builder (stats/make-alignment-stats-builder) - record-encoder (build-cram-record-encoder slice-ctx stats-builder)] - (run! record-encoder records) - (stats/build stats-builder))) + (let [record-encoder (build-cram-record-encoder slice-ctx)] + (run! record-encoder records))) (defn- add-mismatches [n subst-mat ^bytes ref-bases rpos ^bytes read-bases ^bytes qs spos fs] @@ -203,18 +204,21 @@ encoding that are necessary for the CRAM writer to generate some header components." [{:keys [rname->idx subst-mat seq-resolver tag-dict-builder]} ^List records] - (dotimes [i (.size records)] - (let [record (.get records i) - ;; these flag bits of CF are hard-coded at the moment: - ;; - 0x01: quality scores stored as array (true) - ;; - 0x02: detached (true) - ;; - 0x04: has mate downstream (false) - cf (cond-> 0x03 - (= (:seq record) "*") (bit-or 0x08)) - ri (ref-index rname->idx (:rname record)) - tags-id (tag-dict/assign-tags-id! tag-dict-builder (:options record)) - [fs end] (calculate-read-features&end seq-resolver subst-mat record) - record' (assoc record - ::flag cf ::ref-index ri ::end end - ::features fs ::tags-index tags-id)] - (.set records i record')))) + (let [stats-builder (stats/make-alignment-stats-builder)] + (dotimes [i (.size records)] + (let [record (.get records i) + ;; these flag bits of CF are hard-coded at the moment: + ;; - 0x01: quality scores stored as array (true) + ;; - 0x02: detached (true) + ;; - 0x04: has mate downstream (false) + cf (cond-> 0x03 + (= (:seq record) "*") (bit-or 0x08)) + ri (ref-index rname->idx (:rname record)) + tags-id (tag-dict/assign-tags-id! tag-dict-builder (:options record)) + [fs end] (calculate-read-features&end seq-resolver subst-mat record) + record' (assoc record + ::flag cf ::ref-index ri ::end end + ::features fs ::tags-index tags-id)] + (stats/update! stats-builder ri (:pos record) end (count (:seq record)) 1) + (.set records i record'))) + (stats/build stats-builder))) diff --git a/src/cljam/io/cram/writer.clj b/src/cljam/io/cram/writer.clj index 66b2dd92..1a8b7bd7 100644 --- a/src/cljam/io/cram/writer.clj +++ b/src/cljam/io/cram/writer.clj @@ -50,13 +50,13 @@ (struct/encode-cram-header-container (.-stream wtr) header)) (defn- preprocess-records - [cram-header preservation-map seq-resolver options ^objects container-records] - (let [container-ctx (context/make-container-context cram-header - preservation-map - seq-resolver) - {:keys [ds-compressor-overrides tag-compressor-overrides]} options] - (run! (partial record/preprocess-slice-records container-ctx) container-records) + [cram-header seq-resolver options ^objects container-records] + (let [container-ctx (context/make-container-context cram-header seq-resolver) + {:keys [ds-compressor-overrides tag-compressor-overrides]} options + stats (mapv (partial record/preprocess-slice-records container-ctx) + container-records)] (context/finalize-container-context container-ctx + stats ds-compressor-overrides tag-compressor-overrides))) @@ -91,9 +91,9 @@ :bases nbases :records nrecords}) -(defn- generate-slice [container-ctx counter slice-records] - (let [slice-ctx (context/make-slice-context container-ctx) - stats (record/encode-slice-records slice-ctx slice-records) +(defn- generate-slice [slice-ctx counter slice-records] + (record/encode-slice-records slice-ctx slice-records) + (let [stats (:alignment-stats slice-ctx) blocks (generate-blocks slice-ctx) ref-md5 (reference-md5 slice-ctx stats) header (assoc (stats->header-base stats) @@ -103,7 +103,6 @@ header-block (struct/generate-slice-header-block header blocks) block-data (mapv :data blocks)] {:header header - :stats stats :header-block header-block :data-blocks block-data :counter (+ (long counter) (long (:nrecords stats))) @@ -112,10 +111,11 @@ (apply + (alength header-block)))})) (defn- generate-slices [container-ctx counter container-records] - (loop [[slice-records & more] container-records, counter counter, acc []] + (loop [i 0, [slice-records & more] container-records, counter counter, acc []] (if slice-records - (let [slice (generate-slice container-ctx counter slice-records)] - (recur more (:counter slice) (conj acc slice))) + (let [slice-ctx (context/make-slice-context container-ctx i) + slice (generate-slice slice-ctx counter slice-records)] + (recur (inc i) more (:counter slice) (conj acc slice))) acc))) (defn- generate-compression-header-block @@ -123,8 +123,8 @@ (struct/generate-compression-header-block preservation-map subst-mat tag-dict ds-encodings tag-encodings)) -(defn- generate-container-header [^bytes compression-header-block slices] - (let [stats (stats/merge-stats (map :stats slices)) +(defn- generate-container-header [container-ctx ^bytes compression-header-block slices] + (let [stats (stats/merge-stats (:alignment-stats container-ctx)) container-len (->> slices (map (fn [{:keys [^bytes header-block data-blocks]}] (apply + (alength header-block) @@ -172,12 +172,13 @@ (crai/write-index-entries index-writer entries))) (defn- write-container [^CRAMWriter wtr cram-header counter container-records] - (let [preservation-map {:RN true, :AP false, :RR true} - container-ctx (preprocess-records cram-header preservation-map (.-seq-resolver wtr) + (let [container-ctx (preprocess-records cram-header (.-seq-resolver wtr) (.-options wtr) container-records) slices (generate-slices container-ctx counter container-records) compression-header-block (generate-compression-header-block container-ctx) - container-header (generate-container-header compression-header-block slices) + container-header (generate-container-header container-ctx + compression-header-block + slices) ^DataOutputStream out (.-stream wtr) container-offset (.size out)] (struct/encode-container-header out (assoc container-header :counter counter))