From 2ebac481ecbdbf5929544d955eb5e176b5e73148 Mon Sep 17 00:00:00 2001 From: Fabian Gans Date: Fri, 22 Nov 2024 15:30:55 +0100 Subject: [PATCH] Add option to persist handle to NetCDF files (#31) * add interface for keeping handles open for faster dataset opening * update tests * test on lts instead of 1.9 * Add dependabot * test 1.10 since workflows are too old --- .github/dependabot.yml | 7 ++++ .github/workflows/CI.yml | 2 +- Project.toml | 2 +- ext/ArchGDALExt/archgdaldataset.jl | 2 +- ext/NetCDFExt.jl | 47 +++++++++++++++++++++------ src/datasets/datasetinterface.jl | 7 +++- test/datasets.jl | 51 ++++++++++++++++++++++-------- 7 files changed, 91 insertions(+), 27 deletions(-) create mode 100644 .github/dependabot.yml diff --git a/.github/dependabot.yml b/.github/dependabot.yml new file mode 100644 index 0000000..ff6499d --- /dev/null +++ b/.github/dependabot.yml @@ -0,0 +1,7 @@ +# https://docs.github.com/github/administering-a-repository/configuration-options-for-dependency-updates +version: 2 +updates: + - package-ecosystem: "github-actions" + directory: "/" # Location of package manifests + schedule: + interval: "weekly" \ No newline at end of file diff --git a/.github/workflows/CI.yml b/.github/workflows/CI.yml index bea5530..1616ce1 100644 --- a/.github/workflows/CI.yml +++ b/.github/workflows/CI.yml @@ -13,7 +13,7 @@ jobs: fail-fast: false matrix: version: - - '1.9' + - '1.10' - '1' - 'nightly' os: diff --git a/Project.toml b/Project.toml index 4618228..17e4b37 100644 --- a/Project.toml +++ b/Project.toml @@ -1,7 +1,7 @@ name = "YAXArrayBase" uuid = "90b8fcef-0c2d-428d-9c56-5f86629e9d14" authors = ["Fabian Gans "] -version = "0.7.4" +version = "0.7.5" [deps] DataStructures = "864edb3b-99cc-5e75-8d2d-829cb0a9cfe8" diff --git a/ext/ArchGDALExt/archgdaldataset.jl b/ext/ArchGDALExt/archgdaldataset.jl index 3f2cd3d..7bca56d 100644 --- a/ext/ArchGDALExt/archgdaldataset.jl +++ b/ext/ArchGDALExt/archgdaldataset.jl @@ -71,7 +71,7 @@ function GDALDataset(filename; mode="r") end Base.haskey(ds::GDALDataset, k) = in(k, ("X", "Y")) || haskey(ds.bands, k) #Implement Dataset interface -function YAB.get_var_handle(ds::GDALDataset, name) +function YAB.get_var_handle(ds::GDALDataset, name; persist=true) if name == "X" range(ds.trans[1], length = ds.bandsize[1], step = ds.trans[2]) elseif name == "Y" diff --git a/ext/NetCDFExt.jl b/ext/NetCDFExt.jl index 5390ac9..156aa4a 100644 --- a/ext/NetCDFExt.jl +++ b/ext/NetCDFExt.jl @@ -15,8 +15,33 @@ as a data sink: struct NetCDFDataset filename::String mode::UInt16 + handle::Base.RefValue{Union{Nothing, NcFile}} end -NetCDFDataset(filename;mode="r") = mode == "r" ? NetCDFDataset(filename,NC_NOWRITE) : NetCDFDataset(filename,NC_WRITE) +function NetCDFDataset(filename;mode="r") + m = mode == "r" ? NC_NOWRITE : NC_WRITE + NetCDFDataset(filename,m,Ref{Union{Nothing, NcFile}}(nothing)) +end +function dsopen(f,ds::NetCDFDataset) + if ds.handle[] === nothing + NetCDF.open(f, ds.filename) + else + f(ds.handle[]) + end +end +function YAB.open_dataset_handle(f, ds::NetCDFDataset) + if ds.handle[] === nothing + try + ds.handle[] = NetCDF.open(ds.filename, mode=ds.mode) + f(ds) + finally + ds.handle[]=nothing + end + else + f(ds) + end +end + + import .NetCDF: AbstractDiskArray, readblock!, writeblock!, haschunks, eachchunk @@ -49,15 +74,19 @@ YAB.iscompressed(v::NetCDFVariable) = NetCDF.open(v->v.compress > 0, v.filename, Base.size(v::NetCDFVariable) = v.size -YAB.get_var_dims(ds::NetCDFDataset,name) = NetCDF.open(v->map(i->i.name,v[name].dim),ds.filename) -YAB.get_varnames(ds::NetCDFDataset) = NetCDF.open(v->collect(keys(v.vars)),ds.filename) -YAB.get_var_attrs(ds::NetCDFDataset, name) = NetCDF.open(v->v[name].atts,ds.filename) -YAB.get_global_attrs(ds::NetCDFDataset) = NetCDF.open(nc->nc.gatts, ds.filename) -function Base.getindex(ds::NetCDFDataset, i) - s,et = NetCDF.open(j->(size(j),eltype(j)),ds.filename,i) - NetCDFVariable{et,length(s)}(ds.filename, i, s) +YAB.get_var_dims(ds::NetCDFDataset,name) = dsopen(v->map(i->i.name,v[name].dim),ds) +YAB.get_varnames(ds::NetCDFDataset) = dsopen(v->collect(keys(v.vars)),ds) +YAB.get_var_attrs(ds::NetCDFDataset, name) = dsopen(v->v[name].atts,ds) +YAB.get_global_attrs(ds::NetCDFDataset) = dsopen(nc->nc.gatts, ds) +function YAB.get_var_handle(ds::NetCDFDataset, i; persist = true) + if persist || ds.handle[] === nothing + s,et = NetCDF.open(j->(size(j),eltype(j)),ds.filename,i) + NetCDFVariable{et,length(s)}(ds.filename, i, s) + else + ds.handle[][i] + end end -Base.haskey(ds::NetCDFDataset,k) = NetCDF.open(nc->haskey(nc.vars,k),ds.filename) +Base.haskey(ds::NetCDFDataset,k) = dsopen(nc->haskey(nc.vars,k),ds) function YAB.add_var(p::NetCDFDataset, T::Type, varname, s, dimnames, attr; chunksize=s, compress = -1) diff --git a/src/datasets/datasetinterface.jl b/src/datasets/datasetinterface.jl index 4439263..1f58ef8 100644 --- a/src/datasets/datasetinterface.jl +++ b/src/datasets/datasetinterface.jl @@ -1,6 +1,6 @@ #Functions to be implemented for Dataset sources: "Return a DiskArray handle to a dataset" -get_var_handle(ds, name) = ds[name] +get_var_handle(ds, name; persist=true) = ds[name] "Return a list of variable names" function get_varnames end @@ -18,6 +18,11 @@ function get_global_attrs end "Initialize and return a handle to a new empty dataset" function create_empty end +"Apply a function `f` on a dataset `ds` while keeping possible file handles open during the operations" +function open_dataset_handle(f, ds) + f(ds) +end + """ add_var(ds, T, name, s, dimlist, atts) diff --git a/test/datasets.jl b/test/datasets.jl index 684a536..4c40582 100644 --- a/test/datasets.jl +++ b/test/datasets.jl @@ -38,6 +38,29 @@ h = get_var_handle(ds_nc, "tas") @test all(isapprox.(h[1:2,1:2], [215.893 217.168; 215.805 217.03])) @test allow_parallel_write(ds_nc) == false @test allow_missings(ds_nc) == false +#Repeat the same test with an open get_var_handle +ds_nc2 = YAXArrayBase.to_dataset(p2) +YAXArrayBase.open_dataset_handle(ds_nc2) do ds_nc + @test ds_nc.handle[] !== nothing + vn = get_varnames(ds_nc) + @test sort(vn) == ["area", "lat", "lat_bnds", "lon", "lon_bnds", "msk_rgn", + "plev", "pr", "tas", "time", "time_bnds", "ua"] + @test get_var_dims(ds_nc, "tas") == ["lon", "lat", "time"] + @test get_var_dims(ds_nc, "area") == ["lon", "lat"] + @test get_var_dims(ds_nc, "time") == ["time"] + @test get_var_dims(ds_nc, "time_bnds") == ["bnds", "time"] + @test get_var_attrs(ds_nc,"tas")["long_name"] == "air_temperature" + h1 = get_var_handle(ds_nc, "tas",persist=true) + @test !(h1 isa NetCDF.NcVar) + @test !YAXArrayBase.iscompressed(h1) + @test all(isapprox.(h1[1:2,1:2], [215.893 217.168; 215.805 217.03])) + h2 = get_var_handle(ds_nc, "tas",persist=false) + @test h2 isa NetCDF.NcVar + @test !YAXArrayBase.iscompressed(h2) + @test all(isapprox.(h2[1:2,1:2], [215.893 217.168; 215.805 217.03])) + @test allow_parallel_write(ds_nc) == false + @test allow_missings(ds_nc) == false +end end @testset "Reading Zarr" begin @@ -71,22 +94,22 @@ end @test allow_missings(ds_tif) == true end function test_write(T) - p = tempname() - ds = create_empty(T, p) -add_var(ds, 0.5:1:9.5, "lon", ("lon",), Dict("units"=>"degrees_east")) -add_var(ds, 20:-1.0:1, "lat", ("lat",), Dict("units"=>"degrees_north")) -v = add_var(ds, Float32, "tas", (10,20), ("lon", "lat"), Dict{String,Any}("units"=>"Celsius")) + p = tempname() + ds = create_empty(T, p) + add_var(ds, 0.5:1:9.5, "lon", ("lon",), Dict("units"=>"degrees_east")) + add_var(ds, 20:-1.0:1, "lat", ("lat",), Dict("units"=>"degrees_north")) + v = add_var(ds, Float32, "tas", (10,20), ("lon", "lat"), Dict{String,Any}("units"=>"Celsius")) -v[:,:] = collect(reshape(1:200, 10, 20)) + v[:,:] = collect(reshape(1:200, 10, 20)) -@test sort(get_varnames(ds)) == ["lat","lon","tas"] -@test get_var_dims(ds, "tas") == ["lon", "lat"] -@test get_var_dims(ds, "lon") == ["lon"] -@test get_var_attrs(ds,"tas")["units"] == "Celsius" -h = get_var_handle(ds, "lon") -@test h[:] == 0.5:1:9.5 -v = get_var_handle(ds, "tas") -@test v[1:2,1:2] == [1 11; 2 12] + @test sort(get_varnames(ds)) == ["lat","lon","tas"] + @test get_var_dims(ds, "tas") == ["lon", "lat"] + @test get_var_dims(ds, "lon") == ["lon"] + @test get_var_attrs(ds,"tas")["units"] == "Celsius" + h = get_var_handle(ds, "lon") + @test h[:] == 0.5:1:9.5 + v = get_var_handle(ds, "tas") + @test v[1:2,1:2] == [1 11; 2 12] end @testset "Writing NetCDF" begin