Expose DiskArrays.cache (#417)

* forward DiskArrays.cache * add some docs * bump version
JuliaDataCubes · Jul 26, 2024 · 3b39252 · 3b39252 · meggart · Jul 26, 2024
1 parent 664590f
commit 3b39252
Show file tree

Hide file tree

Showing 5 changed files with 179 additions and 133 deletions.
diff --git a/Project.toml b/Project.toml
@@ -1,7 +1,7 @@
 name = "YAXArrays"
 uuid = "c21b50f5-aa40-41ea-b809-c0f5e47bfa5c"
 authors = ["Fabian Gans <fgans@bgc-jena.mpg.de>"]
-version = "0.5.9"
+version = "0.5.10"
 
 [deps]
 CFTime = "179af706-886a-5703-950a-314cd64e0468"

diff --git a/docs/src/UserGuide/cache.md b/docs/src/UserGuide/cache.md
@@ -0,0 +1,18 @@
+# Caching YAXArrays
+
+For some applications, such as interactive plotting of large datasets, it cannot be avoided that the same data is accessed several times. In these cases it can be useful to store recently accessed data in a cache. In YAXArrays this can be easily achieved using the `cache` function. For example, to open a large dataset from a remote source and keep data in a cache of size 500MB, one can use:
+
+````julia
+using YAXArrays, Zarr
+ds = open_dataset("path/to/source")
+cachesize = 500 #MB
+cache(ds,maxsize = cachesize)
+````
+
+The above returns a new dataset in which every array is wrapped into its own cache, where the 500MB are distributed equally across the arrays of the dataset.
+Alternatively, individual caches can be applied to single `YAXArray`s:
+
+````julia
+yax = ds.avariable
+cache(yax,maxsize = 1000)
+````
diff --git a/src/Cubes/Cubes.jl b/src/Cubes/Cubes.jl
@@ -3,7 +3,7 @@ The functions provided by YAXArrays are supposed to work on different types of c
 Data types that
 """
 module Cubes
-using DiskArrays: DiskArrays, eachchunk, approx_chunksize, max_chunksize, grid_offset, GridChunks
+using DiskArrays: DiskArrays, eachchunk, approx_chunksize, max_chunksize, grid_offset, GridChunks, cache
 using Distributed: myid
 using Dates: TimeType, Date
 using IntervalSets: Interval, (..)
@@ -17,7 +17,7 @@ using Tables: istable, schema, columns
 using DimensionalData: DimensionalData as DD, AbstractDimArray, NoName
 import DimensionalData: name
 
-export concatenatecubes, caxes, subsetcube, readcubedata, renameaxis!, YAXArray, setchunks
+export concatenatecubes, caxes, subsetcube, readcubedata, renameaxis!, YAXArray, setchunks, cache
 
 """
 This function calculates a subset of a cube's data
@@ -179,6 +179,7 @@ function Base.permutedims(c::YAXArray, p)
     newchunks = DiskArrays.GridChunks(eachchunk(c).chunks[collect(dimnums)])
     YAXArray(newdims, newdata, c.properties, newchunks, c.cleaner)
 end
+"""
+    DiskArrays.cache(a::YAXArray; maxsize=1000)
+
+Wrap the data of `a` with `DiskArrays.cache`, forwarding `maxsize`, and return the
+array obtained by rebuilding `a` around the wrapped data with `DD.rebuild`.
+"""
+function DiskArrays.cache(a::YAXArray; maxsize=1000)
+    cacheddata = DiskArrays.cache(a.data; maxsize=maxsize)
+    return DD.rebuild(a, cacheddata)
+end
 
 # DimensionalData overloads
 

diff --git a/src/DatasetAPI/Datasets.jl b/src/DatasetAPI/Datasets.jl
@@ -145,6 +145,15 @@ function Base.getindex(x::Dataset, i::Vector{Symbol})
     cubesnew = [j => x.cubes[j] for j in i]
     Dataset(; cubesnew...)
 end
+"""
+    DiskArrays.cache(ds::Dataset; maxsize=1000)
+
+Return a new `Dataset` in which every cube of `ds` is wrapped in its own cache.
+
+`maxsize` is the cache budget for the whole dataset. It is split evenly between the
+cubes, so each cube is cached with `maxsize ÷ length(ds.cubes)`. The axes and
+properties of `ds` are passed on unchanged. A dataset without cubes is returned as
+is, since there is nothing to wrap and the budget cannot be divided.
+"""
+function DiskArrays.cache(ds::Dataset; maxsize=1000)
+    # Nothing to wrap; this also avoids dividing the budget by zero below.
+    length(ds.cubes) == 0 && return ds
+    # Distribute cache size equally across cubes
+    percube = maxsize ÷ length(ds.cubes)
+    cachedcubes = OrderedDict{Symbol,YAXArray}(
+        k => DiskArrays.cache(ds.cubes[k]; maxsize=percube) for k in keys(ds.cubes)
+    )
+    return Dataset(cachedcubes, ds.axes, ds.properties)
+end
+
 
 function fuzzyfind(s::String, comp::Vector{String})
     sl = lowercase(s)