Skip to content

Commit

Permalink
Support delta-encoded AP in sorted CRAM files
Browse files Browse the repository at this point in the history
  • Loading branch information
athos committed Sep 3, 2024
1 parent b70109a commit 6b3b4ca
Show file tree
Hide file tree
Showing 3 changed files with 63 additions and 51 deletions.
19 changes: 13 additions & 6 deletions src/cljam/io/cram/encode/context.clj
Original file line number Diff line number Diff line change
@@ -1,13 +1,18 @@
(ns cljam.io.cram.encode.context
(:require [cljam.io.cram.data-series :as ds]
[cljam.io.cram.encode.tag-dict :as tag-dict]))
[cljam.io.cram.encode.tag-dict :as tag-dict]
[cljam.io.sam.util.header :as sam.header]))

(defn make-container-context
"Creates a new container context."
[cram-header preservation-map seq-resolver]
[cram-header seq-resolver]
(let [rname->idx (into {}
(map-indexed (fn [i {:keys [SN]}] [SN i]))
(:SQ cram-header))
preservation-map (cond-> {:RN true, :AP false, :RR true}
(= (sam.header/sort-order cram-header)
sam.header/order-coordinate)
(assoc :AP true))
subst-mat {\A {\T 0, \G 1, \C 2, \N 3}
\T {\A 0, \G 1, \C 2, \N 3}
\G {\A 0, \T 1, \C 2, \N 3}
Expand All @@ -25,24 +30,26 @@
"Finalizes the builders in the container context and returns a new container
context containing those builders' results. This operation must be done before
creating a slice context."
[container-ctx ds-compressor-overrides tag-compressor-overrides]
[container-ctx alignment-stats ds-compressor-overrides tag-compressor-overrides]
(let [ds-encodings (-> ds/default-data-series-encodings
(ds/apply-ds-compressor-overrides ds-compressor-overrides))
tag-dict (tag-dict/build-tag-dict (:tag-dict-builder container-ctx))
tag-encodings (-> (tag-dict/build-tag-encodings tag-dict)
(ds/apply-tag-compressor-overrides tag-compressor-overrides))]
(assoc container-ctx
:alignment-stats alignment-stats
:ds-encodings ds-encodings
:tag-dict tag-dict
:tag-encodings tag-encodings)))

(defn make-slice-context
"Creates a slice context from the given container context. Note that the container
context must be finalized with `finalize-container-context`."
[{:keys [ds-encodings tag-encodings] :as container-ctx}]
"Creates a slice context for the ith slice from the given container context.
Note that the container context must be finalized with `finalize-container-context`."
[{:keys [alignment-stats ds-encodings tag-encodings] :as container-ctx} i]
(let [ds-encoders (ds/build-data-series-encoders ds-encodings)
tag-encoders (ds/build-tag-encoders tag-encodings)]
(assoc container-ctx
:alignment-stats (nth alignment-stats i)
:ds-encoders ds-encoders
:tag-encoders tag-encoders)))

Expand Down
58 changes: 31 additions & 27 deletions src/cljam/io/cram/encode/record.clj
Original file line number Diff line number Diff line change
Expand Up @@ -12,15 +12,22 @@
-1
(get rname->idx rname)))

(defn- build-positional-data-encoder [{:keys [cram-header]} {:keys [RI RL AP RG]}]
(defn- build-positional-data-encoder
[{:keys [cram-header preservation-map alignment-stats]} {:keys [RI RL AP RG]}]
(let [rg-id->idx (into {}
(map-indexed (fn [i {:keys [ID]}] [ID i]))
(:RG cram-header))]
(:RG cram-header))
AP' (if (:AP preservation-map)
(let [pos (volatile! (:start alignment-stats))]
(fn [^long pos']
(AP (- pos' (long @pos)))
(vreset! pos pos')))
AP)]
(fn [record]
(let [rg (sam.option/value-for-tag :RG record)]
(RI (::ref-index record))
(RL (count (:seq record)))
(AP (:pos record))
(AP' (:pos record))
(RG (if rg (get rg-id->idx rg) -1))))))

(defn- build-read-name-encoder [{:keys [RN]}]
Expand Down Expand Up @@ -113,8 +120,7 @@
(BA (aget bs i)))
(encode-qual record QS))))

(defn- build-cram-record-encoder
[{:keys [ds-encoders] :as slice-ctx} stats-builder]
(defn- build-cram-record-encoder [{:keys [ds-encoders] :as slice-ctx}]
(let [pos-encoder (build-positional-data-encoder slice-ctx ds-encoders)
name-encoder (build-read-name-encoder ds-encoders)
mate-encoder (build-mate-read-encoder slice-ctx ds-encoders)
Expand All @@ -125,9 +131,6 @@
(fn [record]
(let [bf (bit-and (long (:flag record))
(bit-not (sam.flag/encoded #{:next-reversed :next-unmapped})))]
(stats/update! stats-builder
(::ref-index record) (:pos record) (::end record)
(count (:seq record)) 1)
(BF bf)
(CF (::flag record))
(pos-encoder record)
Expand All @@ -142,10 +145,8 @@
"Encodes CRAM records in a slice all at once using the given slice context.
Returns the alignment stats for this slice."
[slice-ctx records]
(let [stats-builder (stats/make-alignment-stats-builder)
record-encoder (build-cram-record-encoder slice-ctx stats-builder)]
(run! record-encoder records)
(stats/build stats-builder)))
(let [record-encoder (build-cram-record-encoder slice-ctx)]
(run! record-encoder records)))

(defn- add-mismatches
[n subst-mat ^bytes ref-bases rpos ^bytes read-bases ^bytes qs spos fs]
Expand Down Expand Up @@ -203,18 +204,21 @@
encoding that are necessary for the CRAM writer to generate some header
components."
[{:keys [rname->idx subst-mat seq-resolver tag-dict-builder]} ^List records]
(dotimes [i (.size records)]
(let [record (.get records i)
;; these flag bits of CF are hard-coded at the moment:
;; - 0x01: quality scores stored as array (true)
;; - 0x02: detached (true)
;; - 0x04: has mate downstream (false)
cf (cond-> 0x03
(= (:seq record) "*") (bit-or 0x08))
ri (ref-index rname->idx (:rname record))
tags-id (tag-dict/assign-tags-id! tag-dict-builder (:options record))
[fs end] (calculate-read-features&end seq-resolver subst-mat record)
record' (assoc record
::flag cf ::ref-index ri ::end end
::features fs ::tags-index tags-id)]
(.set records i record'))))
(let [stats-builder (stats/make-alignment-stats-builder)]
(dotimes [i (.size records)]
(let [record (.get records i)
;; these flag bits of CF are hard-coded at the moment:
;; - 0x01: quality scores stored as array (true)
;; - 0x02: detached (true)
;; - 0x04: has mate downstream (false)
cf (cond-> 0x03
(= (:seq record) "*") (bit-or 0x08))
ri (ref-index rname->idx (:rname record))
tags-id (tag-dict/assign-tags-id! tag-dict-builder (:options record))
[fs end] (calculate-read-features&end seq-resolver subst-mat record)
record' (assoc record
::flag cf ::ref-index ri ::end end
::features fs ::tags-index tags-id)]
(stats/update! stats-builder ri (:pos record) end (count (:seq record)) 1)
(.set records i record')))
(stats/build stats-builder)))
37 changes: 19 additions & 18 deletions src/cljam/io/cram/writer.clj
Original file line number Diff line number Diff line change
Expand Up @@ -50,13 +50,13 @@
(struct/encode-cram-header-container (.-stream wtr) header))

(defn- preprocess-records
[cram-header preservation-map seq-resolver options ^objects container-records]
(let [container-ctx (context/make-container-context cram-header
preservation-map
seq-resolver)
{:keys [ds-compressor-overrides tag-compressor-overrides]} options]
(run! (partial record/preprocess-slice-records container-ctx) container-records)
[cram-header seq-resolver options ^objects container-records]
(let [container-ctx (context/make-container-context cram-header seq-resolver)
{:keys [ds-compressor-overrides tag-compressor-overrides]} options
stats (mapv (partial record/preprocess-slice-records container-ctx)
container-records)]
(context/finalize-container-context container-ctx
stats
ds-compressor-overrides
tag-compressor-overrides)))

Expand Down Expand Up @@ -91,9 +91,9 @@
:bases nbases
:records nrecords})

(defn- generate-slice [container-ctx counter slice-records]
(let [slice-ctx (context/make-slice-context container-ctx)
stats (record/encode-slice-records slice-ctx slice-records)
(defn- generate-slice [slice-ctx counter slice-records]
(record/encode-slice-records slice-ctx slice-records)
(let [stats (:alignment-stats slice-ctx)
blocks (generate-blocks slice-ctx)
ref-md5 (reference-md5 slice-ctx stats)
header (assoc (stats->header-base stats)
Expand All @@ -103,7 +103,6 @@
header-block (struct/generate-slice-header-block header blocks)
block-data (mapv :data blocks)]
{:header header
:stats stats
:header-block header-block
:data-blocks block-data
:counter (+ (long counter) (long (:nrecords stats)))
Expand All @@ -112,19 +111,20 @@
(apply + (alength header-block)))}))

(defn- generate-slices [container-ctx counter container-records]
(loop [[slice-records & more] container-records, counter counter, acc []]
(loop [i 0, [slice-records & more] container-records, counter counter, acc []]
(if slice-records
(let [slice (generate-slice container-ctx counter slice-records)]
(recur more (:counter slice) (conj acc slice)))
(let [slice-ctx (context/make-slice-context container-ctx i)
slice (generate-slice slice-ctx counter slice-records)]
(recur (inc i) more (:counter slice) (conj acc slice)))
acc)))

(defn- generate-compression-header-block
^bytes [{:keys [preservation-map subst-mat tag-dict ds-encodings tag-encodings]}]
(struct/generate-compression-header-block preservation-map subst-mat tag-dict
ds-encodings tag-encodings))

(defn- generate-container-header [^bytes compression-header-block slices]
(let [stats (stats/merge-stats (map :stats slices))
(defn- generate-container-header [container-ctx ^bytes compression-header-block slices]
(let [stats (stats/merge-stats (:alignment-stats container-ctx))
container-len (->> slices
(map (fn [{:keys [^bytes header-block data-blocks]}]
(apply + (alength header-block)
Expand Down Expand Up @@ -172,12 +172,13 @@
(crai/write-index-entries index-writer entries)))

(defn- write-container [^CRAMWriter wtr cram-header counter container-records]
(let [preservation-map {:RN true, :AP false, :RR true}
container-ctx (preprocess-records cram-header preservation-map (.-seq-resolver wtr)
(let [container-ctx (preprocess-records cram-header (.-seq-resolver wtr)
(.-options wtr) container-records)
slices (generate-slices container-ctx counter container-records)
compression-header-block (generate-compression-header-block container-ctx)
container-header (generate-container-header compression-header-block slices)
container-header (generate-container-header container-ctx
compression-header-block
slices)
^DataOutputStream out (.-stream wtr)
container-offset (.size out)]
(struct/encode-container-header out (assoc container-header :counter counter))
Expand Down

0 comments on commit 6b3b4ca

Please sign in to comment.