Estimating statistics per month #217
Thanks for this code! Very useful. I'm trying to do something similar to get daily maximum/minimum/mean values for a 6-hourly dataset, and it works great! However, I have a 50-member ensemble (so I have an additional dimension, number), and I'm trying to end up with something like this (181 time elements):

YAXArray with the following dimensions
longitude Axis with 101 Elements from -80.0 to -55.0
latitude Axis with 101 Elements from 65.0 to 40.0
number Axis with 50 Elements from 1 to 50
time Axis with 181 Elements from 2019-05-01T00:00:00 to 2019-10-28T00:00:00
Variable Axis with 16 elements: fg10 mn2t24 .. tclw mx2t24
units: K
Total size: 43.84 GB

My guess is that I need something using InDims/OutDims, but I must admit I don't yet understand everything. Or can I just add a loop inside the function? Any help would be much welcome! Cheers!
Have you tried to apply the function as is? YAXArrays should loop over the number dimension as well. If you want to keep min, max, and mean, you could add a Stats dimension to the outdims.
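For example, something along these lines might work. This is only a sketch: it reuses the dates_builder, new_dates, and index_in_cube helpers shown in the next comment, assumes the RangeAxis/CategoricalAxis constructors from the YAXArrays version used in this thread, and stats_by_index is a made-up name.

```julia
using Statistics  # for mean

# Sketch: one output cube with an extra "Stats" axis holding min, max and mean
function stats_by_index(xout, xin; index_list = index_in_cube)
    xout .= NaN
    for (i, idx) in enumerate(index_list)
        data = collect(skipmissing(view(xin, idx)))
        if !isempty(data)
            xout[i, 1] = minimum(data)
            xout[i, 2] = maximum(data)
            xout[i, 3] = mean(data)
        end
    end
end

indims  = InDims("time")
outdims = OutDims(RangeAxis("time", dates_builder(new_dates)),
                  CategoricalAxis("Stats", ["min", "max", "mean"]))

daily_stats = mapCube(stats_by_index, lst;
                      indims = indims, outdims = outdims,
                      index_list = index_in_cube, showprog = true)
```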
Thanks! Yes, I tried, and I get an error that I have difficulty understanding. Here's my code and the error message:

# Dataset "lst"
lst = ds[Variable="t2m"]
YAXArray with the following dimensions
longitude Axis with 101 Elements from -80.0 to -55.0
latitude Axis with 101 Elements from 65.0 to 40.0
number Axis with 50 Elements from 1 to 50
time Axis with 721 Elements from 2019-05-01T00:00:00 to 2019-10-28T00:00:00
units: K
Total size: 2.74 GB
# Time handling
time_to_index = getAxis("time", lst)
time_index = yearmonthday.(time_to_index)
new_dates = unique(time_index)
index_in_cube = [findall(==(i), time_index) for i in unique(time_index)]
# Functions
function maximum_by_index(xout, xin; index_list = time_to_index)
#@show size(xin)
#@show typeof(xin)
xout .= NaN
if !all(isnan, xin)
for i in eachindex(index_list)
if !all(isnan, xin[index_list[i]])
xout[i] = maximum(filter(!isnan, xin[index_list[i]]))
end
end
end
end
function dates_builder(x)
out = Date[]
for i in eachindex(x)
push!(out, Date(x[i][1], x[i][2]))
end
return out
end
Indims = InDims("time")
outdims = OutDims(RangeAxis("time", dates_builder(new_dates)))
t2m_daily_high = mapCube(maximum_by_index, lst, indims = Indims, outdims = outdims; index_list = index_in_cube, showprog = true)

This gives an error about a Zarr key, even though the file is a netCDF:

┌ Warning: There are still cache misses
└ @ YAXArrays.DAT ~/.julia/packages/YAXArrays/au5n4/src/DAT/DAT.jl:1070
KeyError: key :zarr not found
Stacktrace:
[1] getindex(h::OrderedCollections.OrderedDict{Symbol, Any}, key::Symbol)
@ OrderedCollections ~/.julia/packages/OrderedCollections/PRayh/src/ordered_dict.jl:380
[2] getbackend(oc::YAXArrays.DAT.OutputCube, ispar::Base.RefValue{Bool}, max_cache::Float64)
@ YAXArrays.DAT ~/.julia/packages/YAXArrays/au5n4/src/DAT/DAT.jl:784
[3] generateOutCube(oc::YAXArrays.DAT.OutputCube, ispar::Base.RefValue{Bool}, max_cache::Float64, loopcachesize::Tuple{Int64, Int64, Int64}, co::Tuple{Int64, Int64, Int64})
@ YAXArrays.DAT ~/.julia/packages/YAXArrays/au5n4/src/DAT/DAT.jl:846
[4] (::YAXArrays.DAT.var"#131#132"{YAXArrays.DAT.DATConfig{1, 1}, Tuple{Int64, Int64, Int64}, Tuple{Int64, Int64, Int64}})(c::YAXArrays.DAT.OutputCube)
@ YAXArrays.DAT ~/.julia/packages/YAXArrays/au5n4/src/DAT/DAT.jl:842
[5] foreach(f::YAXArrays.DAT.var"#131#132"{YAXArrays.DAT.DATConfig{1, 1}, Tuple{Int64, Int64, Int64}, Tuple{Int64, Int64, Int64}}, itr::Tuple{YAXArrays.DAT.OutputCube})
@ Base ./abstractarray.jl:2694
[6] generateOutCubes(dc::YAXArrays.DAT.DATConfig{1, 1})
@ YAXArrays.DAT ~/.julia/packages/YAXArrays/au5n4/src/DAT/DAT.jl:841
[7] mapCube(::typeof(maximum_by_index), ::Tuple{YAXArray{Union{Missing, Float64}, 4, DiskArrays.SubDiskArray{Union{Missing, Float64}, 4}, Vector{CubeAxis}}}; max_cache::Float64, indims::InDims, outdims::OutDims, inplace::Bool, ispar::Bool, debug::Bool, include_loopvars::Bool, showprog::Bool, irregular_loopranges::Bool, nthreads::Vector{Int64}, loopchunksize::Dict{Any, Any}, kwargs::Base.Pairs{Symbol, Vector{Vector{Int64}}, Tuple{Symbol}, NamedTuple{(:index_list,), Tuple{Vector{Vector{Int64}}}}})
@ YAXArrays.DAT ~/.julia/packages/YAXArrays/au5n4/src/DAT/DAT.jl:472
[8] #mapCube#36
@ ~/.julia/packages/YAXArrays/au5n4/src/DAT/DAT.jl:303 [inlined]
[9] top-level scope
@ In[13]:1
[10] eval
@ ./boot.jl:373 [inlined]
[11] include_string(mapexpr::typeof(REPL.softscope), mod::Module, code::String, filename::String)
@ Base ./loading.jl:1196

Edit - The code works if I extract a single member of the ensemble (e.g. number=21):

t2m_daily_high = mapCube(maximum_by_index, ds[Variable="t2m", number=21], indims = Indims, outdims = outdims; index_list = index_in_cube, showprog = true)
YAXArray with the following dimensions
time Axis with 181 Elements from 2019-05-01 to 2019-10-01
longitude Axis with 101 Elements from -80.0 to -55.0
latitude Axis with 101 Elements from 65.0 to 40.0
Total size: 42.26 MB
This is only a problem of the size of the array. If you extract a single member, the computation can be handled in memory, but when you try to do the computation for all 50 ensemble members it has to save the results somewhere, and it tries to use the Zarr backend as a default, which is not loaded. To circumvent this you can either load the Zarr package (using Zarr) or set the output backend to netCDF in the outdims:

outdims = OutDims(RangeAxis("time", dates_builder(new_dates)), backend=:netcdf)
Admittedly, this is a bad error message, so I opened a new issue to track it specifically.
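Concretely, the two workarounds would look like this (just a sketch; the :netcdf backend assumes the corresponding NetCDF backend package is available):

```julia
# Option 1: load the Zarr package so the default on-disk output backend works
using Zarr

# Option 2: write the output as netCDF instead by setting the backend explicitly
outdims = OutDims(RangeAxis("time", dates_builder(new_dates)), backend=:netcdf)
```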
Ah, thanks! Indeed, it was simply a matter of loading the Zarr library. There was another problem that seems linked to the data: I currently have 16 variables in the Cube and some of them fail when I try to calculate the daily maximum values. For instance:

@time fg10_daily_high = mapCube(maximum_by_index, lst[Variable="fg10"], indims = indims, outdims = outdims; index_list = index_in_cube, showprog = true, max_cache=1.0e9)

returns the following error:

┌ Warning: There are still cache misses
└ @ YAXArrays.DAT /gpfs/home/dl2594/.julia/packages/YAXArrays/au5n4/src/DAT/DAT.jl:1070
TaskFailedException
nested task error: TypeError: non-boolean (Missing) used in boolean context
Stacktrace:
[1] maximum_by_index(xout::Vector{Union{Missing, Float64}}, xin::Vector{Union{Missing, Float64}}; index_list::Vector{Vector{Int64}})
@ Main ./In[23]:7
[2] innercode(f::Function, cI::CartesianIndex{3}, xinBC::Tuple{YAXArrays.YAXTools.PickAxisArray{Union{Missing, Float64}, 4, Array{Union{Missing, Float64}, 4}, (1, 2, 3, Colon()), nothing}}, xoutBC::Tuple{YAXArrays.YAXTools.PickAxisArray{Union{Missing, Float64}, 4, Array{Union{Missing, Float64}, 4}, (Colon(), 1, 2, 3), nothing}}, filters::Tuple{Tuple{YAXArrays.DAT.AllMissing}}, inwork::Tuple{Vector{Vector{Union{Missing, Float64}}}}, outwork::Tuple{Vector{Vector{Union{Missing, Float64}}}}, axvalcreator::YAXArrays.DAT.NoLoopAxes, offscur::Tuple{Int64, Int64, Int64}, addargs::Tuple{}, kwargs::Base.Pairs{Symbol, Vector{Vector{Int64}}, Tuple{Symbol}, NamedTuple{(:index_list,), Tuple{Vector{Vector{Int64}}}}})
@ YAXArrays.DAT ~/.julia/packages/YAXArrays/au5n4/src/DAT/DAT.jl:1123
[3] macro expansion
@ ~/.julia/packages/YAXArrays/au5n4/src/DAT/DAT.jl:1162 [inlined]
[4] (::YAXArrays.DAT.var"#395#threadsfor_fun#175"{typeof(maximum_by_index), Tuple{YAXArrays.YAXTools.PickAxisArray{Union{Missing, Float64}, 4, Array{Union{Missing, Float64}, 4}, (1, 2, 3, Colon()), nothing}}, Tuple{YAXArrays.YAXTools.PickAxisArray{Union{Missing, Float64}, 4, Array{Union{Missing, Float64}, 4}, (Colon(), 1, 2, 3), nothing}}, Tuple{Tuple{YAXArrays.DAT.AllMissing}}, Tuple{Vector{Vector{Union{Missing, Float64}}}}, Tuple{Vector{Vector{Union{Missing, Float64}}}}, YAXArrays.DAT.NoLoopAxes, Tuple{}, Base.Pairs{Symbol, Vector{Vector{Int64}}, Tuple{Symbol}, NamedTuple{(:index_list,), Tuple{Vector{Vector{Int64}}}}}, Tuple{Int64, Int64, Int64}, CartesianIndices{3, Tuple{UnitRange{Int64}, UnitRange{Int64}, UnitRange{Int64}}}})(onethread::Bool)
@ YAXArrays.DAT ./threadingconstructs.jl:85
[5] (::YAXArrays.DAT.var"#395#threadsfor_fun#175"{typeof(maximum_by_index), Tuple{YAXArrays.YAXTools.PickAxisArray{Union{Missing, Float64}, 4, Array{Union{Missing, Float64}, 4}, (1, 2, 3, Colon()), nothing}}, Tuple{YAXArrays.YAXTools.PickAxisArray{Union{Missing, Float64}, 4, Array{Union{Missing, Float64}, 4}, (Colon(), 1, 2, 3), nothing}}, Tuple{Tuple{YAXArrays.DAT.AllMissing}}, Tuple{Vector{Vector{Union{Missing, Float64}}}}, Tuple{Vector{Vector{Union{Missing, Float64}}}}, YAXArrays.DAT.NoLoopAxes, Tuple{}, Base.Pairs{Symbol, Vector{Vector{Int64}}, Tuple{Symbol}, NamedTuple{(:index_list,), Tuple{Vector{Vector{Int64}}}}}, Tuple{Int64, Int64, Int64}, CartesianIndices{3, Tuple{UnitRange{Int64}, UnitRange{Int64}, UnitRange{Int64}}}})()
@ YAXArrays.DAT ./threadingconstructs.jl:52
Stacktrace:
[1] wait
@ ./task.jl:334 [inlined]
[2] threading_run(func::Function)
@ Base.Threads ./threadingconstructs.jl:38
[3] macro expansion
@ ./threadingconstructs.jl:97 [inlined]
[4] innerLoop(loopRanges::Tuple{UnitRange{Int64}, UnitRange{Int64}, UnitRange{Int64}}, f::Function, xinBC::Tuple{YAXArrays.YAXTools.PickAxisArray{Union{Missing, Float64}, 4, Array{Union{Missing, Float64}, 4}, (1, 2, 3, Colon()), nothing}}, xoutBC::Tuple{YAXArrays.YAXTools.PickAxisArray{Union{Missing, Float64}, 4, Array{Union{Missing, Float64}, 4}, (Colon(), 1, 2, 3), nothing}}, filters::Tuple{Tuple{YAXArrays.DAT.AllMissing}}, inwork::Tuple{Vector{Vector{Union{Missing, Float64}}}}, outwork::Tuple{Vector{Vector{Union{Missing, Float64}}}}, axvalcreator::YAXArrays.DAT.NoLoopAxes, addargs::Tuple{}, kwargs::Base.Pairs{Symbol, Vector{Vector{Int64}}, Tuple{Symbol}, NamedTuple{(:index_list,), Tuple{Vector{Vector{Int64}}}}})
@ YAXArrays.DAT ~/.julia/packages/YAXArrays/au5n4/src/DAT/DAT.jl:1161
[5] (::YAXArrays.DAT.var"#108#112"{YAXArrays.DAT.DATConfig{1, 1}})(r::Tuple{UnitRange{Int64}, UnitRange{Int64}, UnitRange{Int64}})
@ YAXArrays.DAT ~/.julia/packages/YAXArrays/au5n4/src/DAT/DAT.jl:709
[6] (::ProgressMeter.var"#56#59"{Distributed.RemoteChannel{Channel{Bool}}, YAXArrays.DAT.var"#108#112"{YAXArrays.DAT.DATConfig{1, 1}}})(x::Tuple{UnitRange{Int64}, UnitRange{Int64}, UnitRange{Int64}})
@ ProgressMeter ~/.julia/packages/ProgressMeter/sN2xr/src/ProgressMeter.jl:1016
[7] iterate
@ ./generator.jl:47 [inlined]
[8] _collect(c::DiskArrays.GridChunks{3}, itr::Base.Generator{DiskArrays.GridChunks{3}, ProgressMeter.var"#56#59"{Distributed.RemoteChannel{Channel{Bool}}, YAXArrays.DAT.var"#108#112"{YAXArrays.DAT.DATConfig{1, 1}}}}, #unused#::Base.EltypeUnknown, isz::Base.HasShape{3})
@ Base ./array.jl:744
[9] collect_similar(cont::DiskArrays.GridChunks{3}, itr::Base.Generator{DiskArrays.GridChunks{3}, ProgressMeter.var"#56#59"{Distributed.RemoteChannel{Channel{Bool}}, YAXArrays.DAT.var"#108#112"{YAXArrays.DAT.DATConfig{1, 1}}}})
@ Base ./array.jl:653
[10] map(f::Function, A::DiskArrays.GridChunks{3})
@ Base ./abstractarray.jl:2849
[11] macro expansion
@ ~/.julia/packages/ProgressMeter/sN2xr/src/ProgressMeter.jl:1015 [inlined]
[12] macro expansion
@ ./task.jl:399 [inlined]
[13] macro expansion
@ ~/.julia/packages/ProgressMeter/sN2xr/src/ProgressMeter.jl:1014 [inlined]
[14] macro expansion
@ ./task.jl:399 [inlined]
[15] progress_map(::Function, ::Vararg{Any}; mapfun::Function, progress::ProgressMeter.Progress, channel_bufflen::Int64, kwargs::Base.Pairs{Symbol, Union{}, Tuple{}, NamedTuple{(), Tuple{}}})
@ ProgressMeter ~/.julia/packages/ProgressMeter/sN2xr/src/ProgressMeter.jl:1007
[16] progress_map(::Function, ::Vararg{Any})
@ ProgressMeter ~/.julia/packages/ProgressMeter/sN2xr/src/ProgressMeter.jl:1003
[17] runLoop(dc::YAXArrays.DAT.DATConfig{1, 1}, showprog::Bool)
@ YAXArrays.DAT ~/.julia/packages/YAXArrays/au5n4/src/DAT/DAT.jl:707
[18] mapCube(::typeof(maximum_by_index), ::Tuple{YAXArray{Union{Missing, Float64}, 4, DiskArrays.SubDiskArray{Union{Missing, Float64}, 4}, Vector{CubeAxis}}}; max_cache::Float64, indims::InDims, outdims::OutDims, inplace::Bool, ispar::Bool, debug::Bool, include_loopvars::Bool, showprog::Bool, irregular_loopranges::Bool, nthreads::Vector{Int64}, loopchunksize::Dict{Any, Any}, kwargs::Base.Pairs{Symbol, Vector{Vector{Int64}}, Tuple{Symbol}, NamedTuple{(:index_list,), Tuple{Vector{Vector{Int64}}}}})
@ YAXArrays.DAT ~/.julia/packages/YAXArrays/au5n4/src/DAT/DAT.jl:475
[19] #mapCube#36
@ ~/.julia/packages/YAXArrays/au5n4/src/DAT/DAT.jl:303 [inlined]
[20] top-level scope
@ ./timing.jl:220 [inlined]
[21] top-level scope
@ ./In[51]:0
[22] eval
@ ./boot.jl:373 [inlined]
[23] include_string(mapexpr::typeof(REPL.softscope), mod::Module, code::String, filename::String)
@ Base ./loading.jl:1196

If I remove the "bad" data, I can use the function as-is and do the calculations over all variables and members. That is like magic!
The problem seems related to the presence of missing values. Is there a function or option in YAXArrays to handle them?
Found my solution. The function was basically filtering for NaN; I just modified it to test for missing values instead. I tried some benchmarks on a sub-sample of 1 month of 6-hourly data and performance is similar to what I can get from Python. Sadly, when using the whole dataset (~720 days, 50 members, 5 variables), the function is not scaling well: 45 sec for Python vs 365 sec for the current implementation, using the same computing power for both processes. There are a lot of allocations that I need to take care of!

@time daily_high = mapCube(maximum_by_index, ds, indims = indims, outdims = outdims; index_list = index_in_cube, showprog = true, max_cache=1.0e9)
8.219816 seconds (193.34 M allocations: 20.026 GiB, 20.80% gc time)

Is there a better resampling approach?

edit - Current function:

function maximum_by_index(xout, xin; index_list = time_to_index)
xout .= NaN
if !all(ismissing, xin)
for i in eachindex(index_list)
data = xin[index_list[i]]
if !all(ismissing, data)
xout[i] = maximum(filter(!ismissing, data))
end
end
end
end
function dates_builder(x)
out = Date[]
for i in eachindex(x)
push!(out, Date(x[i][1], x[i][2]))
end
return out
end
Hi @Balinus, glad that my script was useful. I'm still learning Julia, so my original implementation is not very efficient in terms of memory allocation. But I played around a bit with optimizing your function:

using EarthDataLab
using YAXArrays
using Statistics
using Zarr
using Dates
using BenchmarkTools
earth_dataset = esdd()
###################################
lst = earth_dataset.land_surface_temperature[time = (Date("2002-05-21"),Date("2011-12-31"))]
# let's estimate the median per month:
time_to_index = getAxis("time", lst)
time_index = yearmonth.(time_to_index)
new_dates = unique(time_index)
index_in_cube = [findall(==(i), time_index) for i in unique(time_index)]
# original function
function maximum_by_index(xout, xin; index_list = time_to_index)
xout .= NaN
if !all(ismissing, xin)
for i in eachindex(index_list)
data = xin[index_list[i]]
if !all(ismissing, data)
xout[i] = maximum(filter(!ismissing, data))
end
end
end
end
lst
dummy_data =rand(442)
output_1 = zeros(length(index_in_cube))
@btime maximum_by_index(output_1, dummy_data; index_list = index_in_cube)
Your function, after running it twice: 8.738 μs (234 allocations: 21.12 KiB)

Your function + some tricks:

function maximum_by_index(xout, xin; index_list = time_to_index)
xout .= NaN
if !all(ismissing, xin)
for i in eachindex(index_list)
data = view(xin, index_list[i])
xout[i] = maximum(skipmissing(data))
end
end
end
output_2 = zeros(length(index_in_cube))  # needed for the comparison below
@btime maximum_by_index(output_2, dummy_data; index_list = index_in_cube)

produces: 1.546 μs (2 allocations: 32 bytes)

Just to double-check: output_1 == output_2
true

Maybe there is room for even more optimization, but optimizing functions in Julia can sometimes be a rabbit hole.
Thanks @dpabon! I will give your functions a shot and see how it goes, cheers!
Way better than the initial function! I added a check on the daily values (I have 2-3 days that are all missing):

function maximum_by_index_view(xout, xin; index_list = time_to_index)
xout .= NaN
if !all(ismissing, xin)
for i in eachindex(index_list)
data = view(xin, index_list[i])
if !all(ismissing, data)
xout[i] = maximum(skipmissing(data))
end
end
end
end

The benchmark is now (for a subset of the initial data): 420 ms vs 120 ms. So Python still has a "speedup" of about 3-4x, versus an initial speedup of 8x. Anyway, I think that might be fast enough to continue with the current implementation and profit from Julia's flexibility down the road, thanks!
Is there a way to generalize this function under a "resampling" umbrella? I think aggregating data over a general time grouping represents a lot of what climate scientists need to do, e.g. seasonal/daily/monthly statistics (max, min, mean, or any arbitrary function such as a climate indicator).
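Something along these lines, for example. This is a rough sketch building on the pattern above; temporal_aggregate and its by keyword are hypothetical names, not an existing YAXArrays API:

```julia
using YAXArrays, Dates, Statistics

# Hypothetical generic "resample over time groups" wrapper: group the time axis
# by an arbitrary key function (yearmonthday, yearmonth, ...) and reduce each
# group with an arbitrary statistic f.
function temporal_aggregate(cube, f; by = yearmonth)
    tax         = getAxis("time", cube)
    group_keys  = by.(tax)
    unique_keys = unique(group_keys)
    groups      = [findall(==(k), group_keys) for k in unique_keys]
    new_dates   = [Date(k...) for k in unique_keys]  # works for yearmonth/yearmonthday tuples

    function aggregate(xout, xin; index_list = groups)
        xout .= NaN
        for (i, idx) in enumerate(index_list)
            data = view(xin, idx)
            all(ismissing, data) || (xout[i] = f(skipmissing(data)))
        end
    end

    return mapCube(aggregate, cube;
                   indims = InDims("time"),
                   outdims = OutDims(RangeAxis("time", new_dates)),
                   index_list = groups)
end

# Hypothetical usage:
# daily_max      = temporal_aggregate(lst, maximum; by = yearmonthday)
# monthly_median = temporal_aggregate(lst, median;  by = yearmonth)
```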
I'm not certain how you are benchmarking Julia vs Python, but be aware that the first time you run a function, Julia compiles the code, so it takes a bit longer; the second run should be considered the real time. Yes, I can write a meta-function that runs YAXArrays' mapCube underneath and lets you pass the parameters you suggest, but it's going to take some days. I'm planning to release a "YAXArrayToolbox.jl" or "YAXArrayRecipes.jl" package (name not definite yet) to perform different operations and spatial analyses using YAXArrays.
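To illustrate the compilation effect with a toy (made-up) function:

```julia
using BenchmarkTools

f(x) = sum(abs2, x)
x = rand(10_000)

@time f(x)    # first call: includes compilation time
@time f(x)    # second call: closer to the real runtime
@btime f($x)  # BenchmarkTools runs many samples after warm-up
```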
No worries! I will certainly check your package when it's ready; I think it will be very useful 😄 About benchmarking, I use BenchmarkTools and the @btime macro.
Hi @Balinus, you can now use the YAXArraysToolbox.jl package.
Nice! I will take a look! |
Hi @TabeaW, could you please open a new issue in the YAXArraysToolbox.jl package?
We should update the example to YAXArrays 0.5 and add it to the docs.
See maybe #306 for monthly mean. |
Hi Felix, which one exactly? The one to compute the mean per month?
Yes, that one. I tried to update it yesterday but I ran into some Ti (time axis) issues.
Hi Felix, here is my implementation (already working with YAXArrays v0.5): https://github.com/dpabon/YAXArraysToolbox.jl/blob/migration_v0.5_YAXArrays/src/aggregate_time.jl
In case someone else needs it, here is my implementation for estimating the median per month:
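A minimal sketch of that monthly-median computation, following the same mapCube pattern used in the comments above (the variable names here are illustrative, not the exact original code):

```julia
using YAXArrays, Statistics, Dates

time_axis   = getAxis("time", lst)
month_keys  = yearmonth.(time_axis)
groups      = [findall(==(k), month_keys) for k in unique(month_keys)]
month_dates = [Date(k[1], k[2]) for k in unique(month_keys)]

function median_by_index(xout, xin; index_list = groups)
    xout .= NaN
    for (i, idx) in enumerate(index_list)
        data = view(xin, idx)
        all(ismissing, data) || (xout[i] = median(skipmissing(data)))
    end
end

monthly_median = mapCube(median_by_index, lst;
                         indims = InDims("time"),
                         outdims = OutDims(RangeAxis("time", month_dates)),
                         index_list = groups, showprog = true)
```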