From 621ecc69c6a5691cacb9d93778d2f1bcc5dcdf97 Mon Sep 17 00:00:00 2001 From: Fabian Gans Date: Thu, 25 Jul 2024 14:50:01 +0200 Subject: [PATCH 1/3] forward DiskArrays.cache --- src/Cubes/Cubes.jl | 3 +- src/DatasetAPI/Datasets.jl | 9 ++ test/Datasets/datasets.jl | 274 +++++++++++++++++++------------------ 3 files changed, 155 insertions(+), 131 deletions(-) diff --git a/src/Cubes/Cubes.jl b/src/Cubes/Cubes.jl index 0dbff4ba..e2cf5fe2 100644 --- a/src/Cubes/Cubes.jl +++ b/src/Cubes/Cubes.jl @@ -3,7 +3,7 @@ The functions provided by YAXArrays are supposed to work on different types of c Data types that """ module Cubes -using DiskArrays: DiskArrays, eachchunk, approx_chunksize, max_chunksize, grid_offset, GridChunks +using DiskArrays: DiskArrays, eachchunk, approx_chunksize, max_chunksize, grid_offset, GridChunks, cache using Distributed: myid using Dates: TimeType, Date using IntervalSets: Interval, (..) @@ -179,6 +179,7 @@ function Base.permutedims(c::YAXArray, p) newchunks = DiskArrays.GridChunks(eachchunk(c).chunks[collect(dimnums)]) YAXArray(newdims, newdata, c.properties, newchunks, c.cleaner) end +DiskArrays.cache(a::YAXArray;maxsize=1000) = DD.rebuild(a,cache(a.data;maxsize)) # DimensionalData overloads diff --git a/src/DatasetAPI/Datasets.jl b/src/DatasetAPI/Datasets.jl index d65a27ae..9e7c2b5e 100644 --- a/src/DatasetAPI/Datasets.jl +++ b/src/DatasetAPI/Datasets.jl @@ -145,6 +145,15 @@ function Base.getindex(x::Dataset, i::Vector{Symbol}) cubesnew = [j => x.cubes[j] for j in i] Dataset(; cubesnew...) 
end +function DiskArrays.cache(ds::Dataset;maxsize=1000) + #Distribute cache size equally across cubes + maxsize = maxsize ÷ length(ds.cubes) + cachedcubes = OrderedDict{Symbol,YAXArray}( + k => DiskArrays.cache(ds.cubes[k];maxsize) for k in keys(ds.cubes) + ) + Dataset(cachedcubes,ds.axes,ds.properties) +end + function fuzzyfind(s::String, comp::Vector{String}) sl = lowercase(s) diff --git a/test/Datasets/datasets.jl b/test/Datasets/datasets.jl index 22e1516f..423177d5 100644 --- a/test/Datasets/datasets.jl +++ b/test/Datasets/datasets.jl @@ -5,51 +5,51 @@ using Dates @testset "Datasets axes Ti" begin - using Zarr, NetCDF - - ## first example - data = [rand(4, 5, 12), rand(4, 5, 12), rand(4, 5)] - # dim_time = DD.Dim{:Time}(Date(2001, 1, 15):Month(1):Date(2001, 12, 15)) - dim_time = Ti(Date(2001, 1, 15):Month(1):Date(2001, 12, 15)) - axlist1 = ( - DD.Dim{:XVals}(1.0:4.0), - DD.Dim{:YVals}([1, 2, 3, 4, 5]), - dim_time - ) - axlist2 = (DD.Dim{:XVals}(1.0:4.0), DD.Dim{:YVals}([1, 2, 3, 4, 5])) - props = [Dict("att$i" => i) for i = 1:3] - c1, c2, c3 = ( - YAXArray(axlist1, data[1], props[1]), - YAXArray(axlist1, data[2], props[2]), - YAXArray(axlist2, data[3], props[3]), - ) - ds = Dataset(avar=c1, something=c2, smaller=c3) - # previous version will throw this error: `KeyError: key :Ti not found` - f = "./temp.zarr" - @test_nowarn savedataset(ds; path=f) - rm(f, recursive=true, force=true) - - - ## second example - using Downloads - path2file = "https://www.unidata.ucar.edu/software/netcdf/examples/sresa1b_ncar_ccsm3-example.nc" - filename = Downloads.download(path2file, "sresa1b_ncar_ccsm3-example.nc") - ds = open_dataset(filename) - f = "./temp.zarr" - savedataset(ds, path=f, driver=:zarr, overwrite=true) - rm(f, recursive=true, force=true) - rm(filename) - - ## third example - # using EarthDataLab - # using DimensionalData - # using Zarr, YAXArrays, ra = esdc(res="tiny") - # ra_tair = ra[variable=At("air_temperature_2m")] - # ra_resp = 
ra[variable=At("terrestrial_ecosystem_respiration")] - # ds = Dataset(tair=ra_tair, resp=ra_resp) - # f = "./temp.zarr" - # savedataset(ds, path=f, driver=:zarr, overwrite=true) - # rm(f, recursive=true, force=true) + using Zarr, NetCDF + + ## first example + data = [rand(4, 5, 12), rand(4, 5, 12), rand(4, 5)] + # dim_time = DD.Dim{:Time}(Date(2001, 1, 15):Month(1):Date(2001, 12, 15)) + dim_time = Ti(Date(2001, 1, 15):Month(1):Date(2001, 12, 15)) + axlist1 = ( + DD.Dim{:XVals}(1.0:4.0), + DD.Dim{:YVals}([1, 2, 3, 4, 5]), + dim_time + ) + axlist2 = (DD.Dim{:XVals}(1.0:4.0), DD.Dim{:YVals}([1, 2, 3, 4, 5])) + props = [Dict("att$i" => i) for i = 1:3] + c1, c2, c3 = ( + YAXArray(axlist1, data[1], props[1]), + YAXArray(axlist1, data[2], props[2]), + YAXArray(axlist2, data[3], props[3]), + ) + ds = Dataset(avar=c1, something=c2, smaller=c3) + # previous version will throw this error: `KeyError: key :Ti not found` + f = "./temp.zarr" + @test_nowarn savedataset(ds; path=f) + rm(f, recursive=true, force=true) + + + ## second example + using Downloads + path2file = "https://www.unidata.ucar.edu/software/netcdf/examples/sresa1b_ncar_ccsm3-example.nc" + filename = Downloads.download(path2file, "sresa1b_ncar_ccsm3-example.nc") + ds = open_dataset(filename) + f = "./temp.zarr" + savedataset(ds, path=f, driver=:zarr, overwrite=true) + rm(f, recursive=true, force=true) + rm(filename) + + ## third example + # using EarthDataLab + # using DimensionalData + # using Zarr, YAXArrays, ra = esdc(res="tiny") + # ra_tair = ra[variable=At("air_temperature_2m")] + # ra_resp = ra[variable=At("terrestrial_ecosystem_respiration")] + # ds = Dataset(tair=ra_tair, resp=ra_resp) + # f = "./temp.zarr" + # savedataset(ds, path=f, driver=:zarr, overwrite=true) + # rm(f, recursive=true, force=true) end @@ -67,7 +67,7 @@ end YAXArray(axlist1, data[2], props[2]), YAXArray(axlist2, data[3], props[3]), ) - ds = Dataset(avar = c1, something = c2, smaller = c3) + ds = Dataset(avar=c1, something=c2, 
smaller=c3) @testset "Basic functions" begin b = IOBuffer() show(b, ds) @@ -80,7 +80,7 @@ end Variables: """ s2 = split(s2, "\n") - @test s[[1]] == s2[[1]] + @test s[[1]] == s2[[1]] # @test all(i->in(i,s2), s[3:5]) for n in [:avar, :something, :smaller, :XVals, :Time, :YVals] @test n in propertynames(ds) @@ -108,13 +108,13 @@ end #@test length(ds3[Time=(Date(2001,2,1),Date(2001,8,1))].Time) == 6 end @testset "Subsetting datasets" begin - dssub = ds[Time=DD.At(Date(2001,2,15))] + dssub = ds[Time=DD.At(Date(2001, 2, 15))] @test dssub isa Dataset @test sort(collect(keys(dssub.axes))) == [:XVals, :YVals] - @test ndims(dssub.avar)==2 - dssub2 = ds[var=[:avar,:something], Time=Date(2001,1,15)..Date(2001,6,15)] - @test length(dssub2.cubes)==2 - @test size(dssub2.avar)==(4,5,6) + @test ndims(dssub.avar) == 2 + dssub2 = ds[var=[:avar, :something], Time=Date(2001, 1, 15) .. Date(2001, 6, 15)] + @test length(dssub2.cubes) == 2 + @test size(dssub2.avar) == (4, 5, 6) end @testset "Dataset interface" begin struct MockDataset @@ -175,8 +175,8 @@ end "d3" => ["d3"], ), Dict( - "global_att1"=>5, - "global_att2"=>"Hi", + "global_att1" => 5, + "global_att2" => "Hi", ), Dict( "Var1" => att1, @@ -240,12 +240,12 @@ end newds, newds2 = YAXArrays.Datasets.createdataset( MockDataset, al, - path = fn, - persist = false, - chunksize = (4, 2, 4), - chunkoffset = (2, 0, 3), - properties = Dict("att1" => 5), - datasetaxis = "A", + path=fn, + persist=false, + chunksize=(4, 2, 4), + chunkoffset=(2, 0, 3), + properties=Dict("att1" => 5), + datasetaxis="A", ) @test size(newds.data) == (12, 2, 10) @test size(newds.data.a.parent) == (14, 2, 13) @@ -259,8 +259,8 @@ end newds = YAXArrays.Datasets.createdataset( MockDataset, al, - path = fn, - datasetaxis = "A", + path=fn, + datasetaxis="A", ) end end @@ -271,8 +271,8 @@ end x = rand(10, 5) ax1 = Dim{:Ax1}(string.(1:10)) ax2 = Dim{:Ax2}(1:5) - p = string(tempname(),".zarr") - savecube(YAXArray((ax1, ax2), x), p, backend = :zarr) + p = string(tempname(), 
".zarr") + savecube(YAXArray((ax1, ax2), x), p, backend=:zarr) @test ispath(p) cube1 = Cube(p) @test cube1.Ax1 == ax1 @@ -280,7 +280,7 @@ end @test eltype(cube1.Ax2) <: Int64 @test cube1.data == x p2 = string(tempname(), ".nc") - savecube(cube1, p2, backend = :netcdf) + savecube(cube1, p2, backend=:netcdf) @test ispath(p2) cube2 = Cube(p2) @test cube2.Ax1 == ax1 @@ -291,26 +291,26 @@ end @testset "Saving, loading and appending" begin using YAXArrays, Zarr, NetCDF, DiskArrays - x,y,z = rand(10,20),rand(10),rand(10,20,5) - a,b,c = YAXArray.((x,y,z)) - f = tempname()*".zarr" - savecube(a,f,backend=:zarr) + x, y, z = rand(10, 20), rand(10), rand(10, 20, 5) + a, b, c = YAXArray.((x, y, z)) + f = tempname() * ".zarr" + savecube(a, f, backend=:zarr) cube = Cube(f) @test cube.axes == a.axes @test cube.data == x @test cube.chunks == a.chunks - f = tempname()*".nc"; - savecube(a,f,backend=:netcdf) + f = tempname() * ".nc" + savecube(a, f, backend=:netcdf) cube = Cube(f) @test cube.axes == a.axes @test cube.data == x @test cube.chunks == a.chunks - ds = Dataset(;a,b); - f = tempname(); - savedataset(ds,path=f,driver=:zarr) - ds = open_dataset(f,driver=:zarr) + ds = Dataset(; a, b) + f = tempname() + savedataset(ds, path=f, driver=:zarr) + ds = open_dataset(f, driver=:zarr) @test ds.a.axes == a.axes @test ds.a.data == x @test ds.a.chunks == a.chunks @@ -319,8 +319,8 @@ end @test ds.b.data == y @test ds.b.chunks == b.chunks - ds2 = Dataset(c = c); - savedataset(ds2,path=f,backend=:zarr,append=true); + ds2 = Dataset(c=c) + savedataset(ds2, path=f, backend=:zarr, append=true) ds = open_dataset(f, driver=:zarr) @test ds.a.axes == a.axes @@ -332,109 +332,123 @@ end @test ds.b.chunks == b.chunks @test ds.c.axes == c.axes - @test ds.c.data[:,:,:] == z + @test ds.c.data[:, :, :] == z @test ds.c.chunks == c.chunks - d = YAXArray(zeros(Union{Missing, Int32},10,20)) + d = YAXArray(zeros(Union{Missing,Int32}, 10, 20)) f = tempname() - r = savecube(d,f,driver=:zarr,skeleton=true) - @test 
all(ismissing,r[:,:]) + r = savecube(d, f, driver=:zarr, skeleton=true) + @test all(ismissing, r[:, :]) - d = YAXArray(zeros(Int32,10,20)) + d = YAXArray(zeros(Int32, 10, 20)) f = tempname() - r = savecube(d,f,driver=:zarr,skeleton=true) - @test all(==(YAXArrayBase.defaultfillval(Int32)),r[:,:]) + r = savecube(d, f, driver=:zarr, skeleton=true) + @test all(==(YAXArrayBase.defaultfillval(Int32)), r[:, :]) - f = tempname()*".zarr" - a_chunked = setchunks(a,(5,10)) - savecube(a_chunked,f,backend=:zarr) - @test Cube(f).chunks == DiskArrays.GridChunks(size(a),(5,10)) + f = tempname() * ".zarr" + a_chunked = setchunks(a, (5, 10)) + savecube(a_chunked, f, backend=:zarr) + @test Cube(f).chunks == DiskArrays.GridChunks(size(a), (5, 10)) - ds = Dataset(;a,b,c); - dschunked = setchunks(ds,Dict("Dim_1"=>5, "Dim_2"=>10, "Dim_3"=>2)); - f = tempname(); - savedataset(dschunked,path=f,driver=:zarr) + ds = Dataset(; a, b, c) + dschunked = setchunks(ds, Dict("Dim_1" => 5, "Dim_2" => 10, "Dim_3" => 2)) + f = tempname() + savedataset(dschunked, path=f, driver=:zarr) ds = open_dataset(f, driver=:zarr) @test ds.a.axes == a.axes - @test ds.a.data[:,:] == x - @test ds.a.chunks == DiskArrays.GridChunks(size(a),(5,10)) + @test ds.a.data[:, :] == x + @test ds.a.chunks == DiskArrays.GridChunks(size(a), (5, 10)) @test ds.b.axes == b.axes @test ds.b.data[:] == y - @test ds.b.chunks == DiskArrays.GridChunks(size(b),(5,)) + @test ds.b.chunks == DiskArrays.GridChunks(size(b), (5,)) @test ds.c.axes == c.axes - @test ds.c.data[:,:,:] == z - @test ds.c.chunks == DiskArrays.GridChunks(size(c),(5,10,2)) + @test ds.c.data[:, :, :] == z + @test ds.c.chunks == DiskArrays.GridChunks(size(c), (5, 10, 2)) - ds = Dataset(;a,b,c); - dschunked = setchunks(ds,(a = (5,10), b = Dict("Dim_1"=>5), c = (Dim_1 = 5, Dim_2 = 10, Dim_3 = 2))); - f = tempname(); - savedataset(dschunked,path=f,driver=:zarr) - ds = open_dataset(f,driver=:zarr) + ds = Dataset(; a, b, c) + dschunked = setchunks(ds, (a=(5, 10), b=Dict("Dim_1" 
=> 5), c=(Dim_1=5, Dim_2=10, Dim_3=2))) + f = tempname() + savedataset(dschunked, path=f, driver=:zarr) + ds = open_dataset(f, driver=:zarr) @test ds.a.axes == a.axes - @test ds.a.data[:,:] == x - @test ds.a.chunks == DiskArrays.GridChunks(size(a),(5,10)) + @test ds.a.data[:, :] == x + @test ds.a.chunks == DiskArrays.GridChunks(size(a), (5, 10)) @test ds.b.axes == b.axes @test ds.b.data[:] == y - @test ds.b.chunks == DiskArrays.GridChunks(size(b),(5,)) + @test ds.b.chunks == DiskArrays.GridChunks(size(b), (5,)) @test ds.c.axes == c.axes - @test ds.c.data[:,:,:] == z - @test ds.c.chunks == DiskArrays.GridChunks(size(c),(5,10,2)) + @test ds.c.data[:, :, :] == z + @test ds.c.chunks == DiskArrays.GridChunks(size(c), (5, 10, 2)) - ds = Dataset(a = YAXArray(rand(10,20)), b = YAXArray(rand(10,20)), c = YAXArray(rand(10,20))); - dschunked = setchunks(ds,(5,10)); - f = tempname(); - savedataset(dschunked,path=f,driver=:zarr) - ds2 = open_dataset(f,driver=:zarr) + ds = Dataset(a=YAXArray(rand(10, 20)), b=YAXArray(rand(10, 20)), c=YAXArray(rand(10, 20))) + dschunked = setchunks(ds, (5, 10)) + f = tempname() + savedataset(dschunked, path=f, driver=:zarr) + ds2 = open_dataset(f, driver=:zarr) @test ds2.a.axes == ds.a.axes - @test ds2.a.data[:,:] == ds.a.data - @test ds2.a.chunks == DiskArrays.GridChunks(size(a),(5,10)) + @test ds2.a.data[:, :] == ds.a.data + @test ds2.a.chunks == DiskArrays.GridChunks(size(a), (5, 10)) @test ds2.b.axes == ds.b.axes - @test ds2.b.data[:,:] == ds.b.data - @test ds2.b.chunks == DiskArrays.GridChunks(size(a),(5,10)) + @test ds2.b.data[:, :] == ds.b.data + @test ds2.b.chunks == DiskArrays.GridChunks(size(a), (5, 10)) @test ds2.c.axes == ds.c.axes - @test ds2.c.data[:,:] == ds.c.data - @test ds2.c.chunks == DiskArrays.GridChunks(size(a),(5,10)) + @test ds2.c.data[:, :] == ds.c.data + @test ds2.c.chunks == DiskArrays.GridChunks(size(a), (5, 10)) + +end +@testset "Caching" begin + a = AccessCountDiskArray(reshape(1:100, 5, 20), chunksize=(2, 10)) + ds 
= Dataset(; ar=YAXArray((DD.X(1:5), DD.Y(1:20)), a)) + dscached = DiskArrays.cache(ds) + @test isa(dscached.ar.data, DiskArrays.CachedDiskArray) + @test dscached.ar[1, 1] == 1 + @test dscached.ar[2, 1:10] == 2:5:47 + @test getindex_count(a) == 1 + dssub = dscached[X=DD.Between(3, 5), Y=DD.Between(19, 20)] + @test dssub.ar[:, :].data == [93 98; 94 99; 95 100] + @test getindex_count(a) == 3 + @test length(dscached.ar.data.cache) == 3 end -@testset "Mapslices" begin +@testset "Mapslices" begin using YAXArrays, StatsBase - a = ones(10,20,5) + a = ones(10, 20, 5) cube = YAXArray(a) mean_slice = mapslices(mean, cube; dims="Dim_1") - @test mean_slice[:,:] == ones(20,5) + @test mean_slice[:, :] == ones(20, 5) end @testset "Making Cubes from heterogemous data types" begin - a1 = YAXArray(rand(Int8,10,10)) - a2 = YAXArray(rand(Float32,10,10)) - a3 = YAXArray(rand(Int16,10,10)) - a4 = YAXArray(rand(Float64,10,10)) - a5 = YAXArray(fill("hello",10,10)) - ds = Dataset(a=a1, b=a2,c=a3,d=a4) + a1 = YAXArray(rand(Int8, 10, 10)) + a2 = YAXArray(rand(Float32, 10, 10)) + a3 = YAXArray(rand(Int16, 10, 10)) + a4 = YAXArray(rand(Float64, 10, 10)) + a5 = YAXArray(fill("hello", 10, 10)) + ds = Dataset(a=a1, b=a2, c=a3, d=a4) c = Cube(ds) - @test size(c) == (10,10,4) + @test size(c) == (10, 10, 4) @test eltype(c) <: Float64 - x = c[var=At("c")][:,:] + x = c[var=At("c")][:, :] @test eltype(x) <: Float64 @test x == Float64.(a3.data) - ds = Dataset(a=a1, b=a2,c=a3,d=a4,e=a5) + ds = Dataset(a=a1, b=a2, c=a3, d=a4, e=a5) @test_throws ArgumentError Cube(ds) end From baba914599ceb9fc30d4bae37d1b6c54f64db2bd Mon Sep 17 00:00:00 2001 From: Fabian Gans Date: Thu, 25 Jul 2024 14:59:37 +0200 Subject: [PATCH 2/3] add some docs --- docs/src/UserGuide/cache.md | 18 ++++++++++++++++++ src/Cubes/Cubes.jl | 2 +- test/Datasets/datasets.jl | 4 ++++ 3 files changed, 23 insertions(+), 1 deletion(-) create mode 100644 docs/src/UserGuide/cache.md diff --git a/docs/src/UserGuide/cache.md 
b/docs/src/UserGuide/cache.md new file mode 100644 index 00000000..67b12c2a --- /dev/null +++ b/docs/src/UserGuide/cache.md @@ -0,0 +1,18 @@ +# Caching YAXArrays + +For some applications, like interactive plotting of large datasets, it cannot be avoided that the same data must be accessed several times. In these cases it can be useful to store recently accessed data in a cache. In YAXArrays this can be easily achieved using the `cache` function. For example, if we open a large dataset from a remote source and want to keep data in a cache of size 500MB, one can use: + +````julia +using YAXArrays, Zarr +ds = open_dataset("path/to/source") +cachesize = 500 #MB +cache(ds,maxsize = cachesize) +```` + +The above will wrap every array in the dataset into its own cache, where the 500MB are distributed equally across the arrays in the dataset. +Alternatively, individual caches can be applied to single `YAXArray`s: + +````julia +yax = ds.avariable +cache(yax,maxsize = 1000) +```` diff --git a/src/Cubes/Cubes.jl b/src/Cubes/Cubes.jl index e2cf5fe2..6666cde6 100644 --- a/src/Cubes/Cubes.jl +++ b/src/Cubes/Cubes.jl @@ -17,7 +17,7 @@ using Tables: istable, schema, columns using DimensionalData: DimensionalData as DD, AbstractDimArray, NoName import DimensionalData: name -export concatenatecubes, caxes, subsetcube, readcubedata, renameaxis!, YAXArray, setchunks +export concatenatecubes, caxes, subsetcube, readcubedata, renameaxis!, YAXArray, setchunks, cache """ This function calculates a subset of a cube's data diff --git a/test/Datasets/datasets.jl b/test/Datasets/datasets.jl index 423177d5..c6dba655 100644 --- a/test/Datasets/datasets.jl +++ b/test/Datasets/datasets.jl @@ -411,6 +411,10 @@ end end @testset "Caching" begin + using YAXArrays.Cubes.DiskArrays.TestTypes + using YAXArrays.Cubes: DiskArrays + using Test + import DimensionalData as DD a = AccessCountDiskArray(reshape(1:100, 5, 20), chunksize=(2, 10)) ds = Dataset(; ar=YAXArray((DD.X(1:5), DD.Y(1:20)), a)) dscached = DiskArrays.cache(ds) 
From 3643367d5d2c9fb4a59145bfa641805a7940240f Mon Sep 17 00:00:00 2001 From: Fabian Gans Date: Thu, 25 Jul 2024 15:33:28 +0200 Subject: [PATCH 3/3] bump version --- Project.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Project.toml b/Project.toml index 8cb69324..a891b303 100644 --- a/Project.toml +++ b/Project.toml @@ -1,7 +1,7 @@ name = "YAXArrays" uuid = "c21b50f5-aa40-41ea-b809-c0f5e47bfa5c" authors = ["Fabian Gans "] -version = "0.5.9" +version = "0.5.10" [deps] CFTime = "179af706-886a-5703-950a-314cd64e0468"