diff --git a/Project.toml b/Project.toml index 2d36218..f14a9f4 100644 --- a/Project.toml +++ b/Project.toml @@ -1,7 +1,7 @@ name = "NaNStatistics" uuid = "b946abbf-3ea7-4610-9019-9858bfdeaf2d" authors = ["C. Brenhin Keller"] -version = "0.6.40" +version = "0.6.41" [deps] PrecompileTools = "aea7be01-6a6a-4083-8856-8a6e6704d82a" diff --git a/README.md b/README.md index b044666..7d6e878 100644 --- a/README.md +++ b/README.md @@ -35,6 +35,10 @@ Summary statistics exported by NaNStatistics are generally named the same as the * `nanpctile`   percentile * `nanpctile!`   as `nanpctile` but quicksorts in-place for efficiency +##### Other summary statistics +* `nanskewness`   skewness +* `nankurtosis`   excess kurtosis + Note that, regardless of implementation, functions involving medians or percentiles are generally significantly slower than other summary statistics, since calculating a median or percentile requires a quicksort or quickselect of the input array; if not done in-place as in `nanmedian!` and `nanpctile!` then a copy of the entire array must also be made. These functions will generally support the same `dims` keyword argument as their normal Julia counterparts (though are most efficient when operating on an entire collection). diff --git a/src/ArrayStats/nankurtosis.jl b/src/ArrayStats/nankurtosis.jl new file mode 100644 index 0000000..b9677a5 --- /dev/null +++ b/src/ArrayStats/nankurtosis.jl @@ -0,0 +1,243 @@ +""" +```julia +nankurtosis(A; dims=:, mean=nothing) +``` +Compute the kurtosis of all non-`NaN` elements in `A`, optionally over dimensions specified by `dims`. +As `StatsBase.kurtosis`, but ignoring `NaN`s. + +A precomputed `mean` may optionally be provided, which results in a somewhat faster +calculation. If `corrected` is `true`, then _Bessel's correction_ is applied, such +that the sum is divided by `n-1` rather than `n`. + +As an alternative to `dims`, `nankurtosis` also supports the `dim` keyword, which +behaves identically to `dims`, but also drops any singleton dimensions that have +been reduced over (as is the convention in some other languages). + + +## Examples +```julia +julia> using NaNStatistics + +julia> A = [1 2 3 ; 4 5 6; 7 8 9] +3×3 Matrix{Int64}: + 1 2 3 + 4 5 6 + 7 8 9 + +julia> nankurtosis(A, dims=1) +1×3 Matrix{Float64}: + -1.5 -1.5 -1.5 + +julia> nankurtosis(A, dims=2) +3-element Vector{Float64}: + -1.5 + -1.5 + -1.5 +``` +""" +nankurtosis(A; dims=:, dim=:, mean=nothing, corrected=false) = __nankurtosis(mean, corrected, A, dims, dim) +__nankurtosis(mean, corrected, A, ::Colon, ::Colon) = _nankurtosis(mean, corrected, A, :) +__nankurtosis(mean, corrected, A, region, ::Colon) = _nankurtosis(mean, corrected, A, region) +__nankurtosis(mean, corrected, A, ::Colon, region) = reducedims(_nankurtosis(mean, corrected, A, region), region) +export nankurtosis + +# If dims is an integer, wrap it in a tuple +_nankurtosis(μ, corrected::Bool, A, dims::Int) = _nankurtosis(μ, corrected, A, (dims,)) + +# If the mean isn't known, compute it +_nankurtosis(::Nothing, corrected::Bool, A, dims::Tuple) = _nankurtosis!(_nanmean(A, dims), corrected, A, dims) +# Reduce all the dims! +function _nankurtosis(::Nothing, corrected::Bool, A, ::Colon) + T = eltype(A) + Tₒ = Base.promote_op(/, T, Int) + n = 0 + Σ = ∅ = zero(T) + @inbounds @simd ivdep for i ∈ eachindex(A) + Aᵢ = A[i] + notnan = Aᵢ==Aᵢ + n += notnan + Σ += ifelse(notnan, Aᵢ, ∅) + end + μ = Σ / n + μ₄ = μ₂ = ∅ₒ = zero(typeof(μ)) + @inbounds @simd ivdep for i ∈ eachindex(A) + δ = A[i] - μ + notnan = δ==δ + δ² = δ * δ + μ₂ += ifelse(notnan, δ², ∅ₒ) + μ₄ += ifelse(notnan, δ² * δ², ∅ₒ) + end + σ = sqrt(μ₂ / max(n-corrected,0)) + return (μ₄/n)/σ^4 - 3 +end +function _nankurtosis(::Nothing, corrected::Bool, A::AbstractArray{T}, ::Colon) where T<:Integer + Tₒ = Base.promote_op(/, T, Int) + n = length(A) + Σ = zero(Tₒ) + @inbounds @simd ivdep for i ∈ eachindex(A) + Σ += A[i] + end + μ = Σ / n + μ₄ = μ₂ = zero(typeof(μ)) + @inbounds @simd ivdep for i ∈ eachindex(A) + δ = A[i] - μ + δ² = δ * δ + μ₂ += δ² + μ₄ += δ² * δ² + end + σ = sqrt(μ₂ / max(n-corrected,0)) + return (μ₄/n)/σ^4 - 3 +end + + +# If the mean is known, pass it on in the appropriate form +_nankurtosis(μ, corrected::Bool, A, dims::Tuple) = _nankurtosis!(collect(μ), corrected, A, dims) +_nankurtosis(μ::Array, corrected::Bool, A, dims::Tuple) = _nankurtosis!(copy(μ), corrected, A, dims) +_nankurtosis(μ::Number, corrected::Bool, A, dims::Tuple) = _nankurtosis!([μ], corrected, A, dims) +# Reduce all the dims! +function _nankurtosis(μ::Number, corrected::Bool, A, ::Colon) + n = 0 + μ₄ = μ₂ = ∅ₒ = zero(typeof(μ)) + @inbounds @simd ivdep for i ∈ eachindex(A) + δ = A[i] - μ + notnan = δ==δ + n += notnan + δ² = δ * δ + μ₂ += ifelse(notnan, δ², ∅ₒ) + μ₄ += ifelse(notnan, δ² * δ², ∅ₒ) + end + σ = sqrt(μ₂ / max(n-corrected,0)) + return (μ₄/n)/σ^4 - 3 +end +function _nankurtosis(μ::Number, corrected::Bool, A::AbstractArray{T}, ::Colon) where T<:Integer + μ₄ = μ₂ = zero(typeof(μ)) + if μ==μ + @inbounds @simd ivdep for i ∈ eachindex(A) + δ = A[i] - μ + δ² = δ * δ + μ₂ += δ² + μ₄ += δ² * δ² + end + n = length(A) + else + n = 0 + end + σ = sqrt(μ₂ / max(n-corrected,0)) + return (μ₄/n)/σ^4 - 3 +end + +# Metaprogramming magic adapted from Chris Elrod example: +# Generate customized set of loops for a given ndims and a vector +# `static_dims` of dimensions to reduce over +function staticdim_nankurtosis_quote(static_dims::Vector{Int}, N::Int) + M = length(static_dims) + # `static_dims` now contains every dim we're taking the var over. + Bᵥ = Expr(:call, :view, :B) + reduct_inds = Int[] + nonreduct_inds = Int[] + # Firstly, build our expressions for indexing each array + Aind = :(A[]) + Bind = :(Bᵥ[]) + inds = Vector{Symbol}(undef, N) + for n ∈ 1:N + ind = Symbol(:i_,n) + inds[n] = ind + push!(Aind.args, ind) + if n ∈ static_dims + push!(reduct_inds, n) + push!(Bᵥ.args, :(firstindex(B,$n))) + else + push!(nonreduct_inds, n) + push!(Bᵥ.args, :) + push!(Bind.args, ind) + end + end + firstn = first(nonreduct_inds) + # Secondly, build up our set of loops + block = Expr(:block) + loops = Expr(:for, :($(inds[firstn]) = indices((A,B),$firstn)), block) + if length(nonreduct_inds) > 1 + for n ∈ @view(nonreduct_inds[2:end]) + newblock = Expr(:block) + push!(block.args, Expr(:for, :($(inds[n]) = indices((A,B),$n)), newblock)) + block = newblock + end + end + rblock = block + # Push more things here if you want them at the beginning of the reduction loop + push!(rblock.args, :(μ = $Bind)) + push!(rblock.args, :(n = 0)) + push!(rblock.args, :(μ₄ = μ₂ = ∅)) + # Build the reduction loop + for n ∈ reduct_inds + newblock = Expr(:block) + if n==last(reduct_inds) + push!(block.args, Expr(:macrocall, Symbol("@simd"), :ivdep, Expr(:for, :($(inds[n]) = axes(A,$n)), newblock))) + else + push!(block.args, Expr(:for, :($(inds[n]) = axes(A,$n)), newblock)) + end + block = newblock + end + # Push more things here if you want them in the innermost loop + push!(block.args, :(δ = $Aind - μ)) + push!(block.args, :(notnan = δ==δ)) + push!(block.args, :(n += notnan)) + push!(block.args, :(δ² = δ * δ)) + push!(block.args, :(μ₂ += ifelse(notnan, δ², ∅))) + push!(block.args, :(μ₄ += ifelse(notnan, δ² * δ², ∅))) + + # Push more things here if you want them at the end of the reduction loop + push!(rblock.args, :(σ = sqrt(μ₂ * inv(max(n-corrected,0))) )) + push!(rblock.args, :($Bind = (μ₄/n)/σ^4 - 3)) + + # Put it all together + quote + ∅ = zero(eltype(B)) + Bᵥ = $Bᵥ + @inbounds $loops + return B + end +end + +# Turn non-static integers in `dims` tuple into `StaticInt`s +# so we can construct `static_dims` vector within @generated code +function branches_nankurtosis_quote(N::Int, M::Int, D) + static_dims = Int[] + for m ∈ 1:M + param = D.parameters[m] + if param <: StaticInt + new_dim = _dim(param)::Int + @assert new_dim ∉ static_dims + push!(static_dims, new_dim) + else + t = Expr(:tuple) + for n ∈ static_dims + push!(t.args, :(StaticInt{$n}())) + end + q = Expr(:block, :(dimm = dims[$m])) + qold = q + ifsym = :if + for n ∈ 1:N + n ∈ static_dims && continue + tc = copy(t) + push!(tc.args, :(StaticInt{$n}())) + qnew = Expr(ifsym, :(dimm == $n), :(return _nankurtosis!(B, corrected, A, $tc))) + for r ∈ m+1:M + push!(tc.args, :(dims[$r])) + end + push!(qold.args, qnew) + qold = qnew + ifsym = :elseif + end + push!(qold.args, Expr(:block, :(throw("Dimension `$dimm` not found.")))) + return q + end + end + staticdim_nankurtosis_quote(static_dims, N) +end + +# Efficient @generated in-place var +@generated function _nankurtosis!(B::AbstractArray{Tₒ,N}, corrected::Bool, A::AbstractArray{T,N}, dims::D) where {Tₒ,T,N,M,D<:Tuple{Vararg{IntOrStaticInt,M}}} + N == M && return :(B[1] = _nankurtosis(B[1], corrected, A, :); B) + branches_nankurtosis_quote(N, M, D) +end diff --git a/src/ArrayStats/nanskewness.jl b/src/ArrayStats/nanskewness.jl new file mode 100644 index 0000000..bde02f8 --- /dev/null +++ b/src/ArrayStats/nanskewness.jl @@ -0,0 +1,243 @@ +""" +```julia +nanskewness(A; dims=:, mean=nothing) +``` +Compute the skewness of all non-`NaN` elements in `A`, optionally over dimensions specified by `dims`. +As `StatsBase.skewness`, but ignoring `NaN`s. + +A precomputed `mean` may optionally be provided, which results in a somewhat faster +calculation. + +As an alternative to `dims`, `nanskewness` also supports the `dim` keyword, which +behaves identically to `dims`, but also drops any singleton dimensions that have +been reduced over (as is the convention in some other languages). + + +## Examples +```julia +julia> using NaNStatistics + +julia> A = [1 2 3 ; 4 5 6; 7 8 9] +3×3 Matrix{Int64}: + 1 2 3 + 4 5 6 + 7 8 9 + +julia> nanskewness(A, dims=1) +1×3 Matrix{Float64}: + 0.0 0.0 0.0 + + +julia> nanskewness(A, dims=2) +3-element Vector{Float64}: + 0.0 + 0.0 + 0.0 +``` +""" +nanskewness(A; dims=:, dim=:, mean=nothing, corrected=false) = __nanskewness(mean, corrected, A, dims, dim) +__nanskewness(mean, corrected, A, ::Colon, ::Colon) = _nanskewness(mean, corrected, A, :) +__nanskewness(mean, corrected, A, region, ::Colon) = _nanskewness(mean, corrected, A, region) +__nanskewness(mean, corrected, A, ::Colon, region) = reducedims(_nanskewness(mean, corrected, A, region), region) +export nanskewness + +# If dims is an integer, wrap it in a tuple +_nanskewness(μ, corrected::Bool, A, dims::Int) = _nanskewness(μ, corrected, A, (dims,)) + +# If the mean isn't known, compute it +_nanskewness(::Nothing, corrected::Bool, A, dims::Tuple) = _nanskewness!(_nanmean(A, dims), corrected, A, dims) +# Reduce all the dims! +function _nanskewness(::Nothing, corrected::Bool, A, ::Colon) + T = eltype(A) + Tₒ = Base.promote_op(/, T, Int) + n = 0 + Σ = ∅ = zero(T) + @inbounds @simd ivdep for i ∈ eachindex(A) + Aᵢ = A[i] + notnan = Aᵢ==Aᵢ + n += notnan + Σ += ifelse(notnan, Aᵢ, ∅) + end + μ = Σ / n + μ₃ = μ₂ = ∅ₒ = zero(typeof(μ)) + @inbounds @simd ivdep for i ∈ eachindex(A) + δ = A[i] - μ + notnan = δ==δ + δ² = δ * δ + μ₂ += ifelse(notnan, δ², ∅ₒ) + μ₃ += ifelse(notnan, δ² * δ, ∅ₒ) + end + σ = sqrt(μ₂ / max(n-corrected,0)) + return (μ₃/n)/σ^3 +end +function _nanskewness(::Nothing, corrected::Bool, A::AbstractArray{T}, ::Colon) where T<:Integer + Tₒ = Base.promote_op(/, T, Int) + n = length(A) + Σ = zero(Tₒ) + @inbounds @simd ivdep for i ∈ eachindex(A) + Σ += A[i] + end + μ = Σ / n + μ₃ = μ₂ = zero(typeof(μ)) + @inbounds @simd ivdep for i ∈ eachindex(A) + δ = A[i] - μ + δ² = δ * δ + μ₂ += δ² + μ₃ += δ² * δ + end + σ = sqrt(μ₂ / max(n-corrected,0)) + return (μ₃/n)/σ^3 +end + + +# If the mean is known, pass it on in the appropriate form +_nanskewness(μ, corrected::Bool, A, dims::Tuple) = _nanskewness!(collect(μ), corrected, A, dims) +_nanskewness(μ::Array, corrected::Bool, A, dims::Tuple) = _nanskewness!(copy(μ), corrected, A, dims) +_nanskewness(μ::Number, corrected::Bool, A, dims::Tuple) = _nanskewness!([μ], corrected, A, dims) +# Reduce all the dims! +function _nanskewness(μ::Number, corrected::Bool, A, ::Colon) + n = 0 + μ₃ = μ₂ = ∅ₒ = zero(typeof(μ)) + @inbounds @simd ivdep for i ∈ eachindex(A) + δ = A[i] - μ + notnan = δ==δ + n += notnan + δ² = δ * δ + μ₂ += ifelse(notnan, δ², ∅ₒ) + μ₃ += ifelse(notnan, δ² * δ, ∅ₒ) + end + σ = sqrt(μ₂ / max(n-corrected,0)) + return (μ₃/n)/σ^3 +end +function _nanskewness(μ::Number, corrected::Bool, A::AbstractArray{T}, ::Colon) where T<:Integer + μ₃ = μ₂ = zero(typeof(μ)) + if μ==μ + @inbounds @simd ivdep for i ∈ eachindex(A) + δ = A[i] - μ + δ² = δ * δ + μ₂ += δ² + μ₃ += δ² * δ + end + n = length(A) + else + n = 0 + end + σ = sqrt(μ₂ / max(n-corrected,0)) + return (μ₃/n)/σ^3 +end + +# Metaprogramming magic adapted from Chris Elrod example: +# Generate customized set of loops for a given ndims and a vector +# `static_dims` of dimensions to reduce over +function staticdim_nanskewness_quote(static_dims::Vector{Int}, N::Int) + M = length(static_dims) + # `static_dims` now contains every dim we're taking the var over. + Bᵥ = Expr(:call, :view, :B) + reduct_inds = Int[] + nonreduct_inds = Int[] + # Firstly, build our expressions for indexing each array + Aind = :(A[]) + Bind = :(Bᵥ[]) + inds = Vector{Symbol}(undef, N) + for n ∈ 1:N + ind = Symbol(:i_,n) + inds[n] = ind + push!(Aind.args, ind) + if n ∈ static_dims + push!(reduct_inds, n) + push!(Bᵥ.args, :(firstindex(B,$n))) + else + push!(nonreduct_inds, n) + push!(Bᵥ.args, :) + push!(Bind.args, ind) + end + end + firstn = first(nonreduct_inds) + # Secondly, build up our set of loops + block = Expr(:block) + loops = Expr(:for, :($(inds[firstn]) = indices((A,B),$firstn)), block) + if length(nonreduct_inds) > 1 + for n ∈ @view(nonreduct_inds[2:end]) + newblock = Expr(:block) + push!(block.args, Expr(:for, :($(inds[n]) = indices((A,B),$n)), newblock)) + block = newblock + end + end + rblock = block + # Push more things here if you want them at the beginning of the reduction loop + push!(rblock.args, :(μ = $Bind)) + push!(rblock.args, :(n = 0)) + push!(rblock.args, :(μ₃ = μ₂ = ∅)) + # Build the reduction loop + for n ∈ reduct_inds + newblock = Expr(:block) + if n==last(reduct_inds) + push!(block.args, Expr(:macrocall, Symbol("@simd"), :ivdep, Expr(:for, :($(inds[n]) = axes(A,$n)), newblock))) + else + push!(block.args, Expr(:for, :($(inds[n]) = axes(A,$n)), newblock)) + end + block = newblock + end + # Push more things here if you want them in the innermost loop + push!(block.args, :(δ = $Aind - μ)) + push!(block.args, :(notnan = δ==δ)) + push!(block.args, :(n += notnan)) + push!(block.args, :(δ² = δ * δ)) + push!(block.args, :(μ₂ += ifelse(notnan, δ², ∅))) + push!(block.args, :(μ₃ += ifelse(notnan, δ² * δ, ∅))) + + # Push more things here if you want them at the end of the reduction loop + push!(rblock.args, :(σ = sqrt(μ₂ * inv(max(n-corrected,0))) )) + push!(rblock.args, :($Bind = (μ₃/n)/σ^3 )) + + # Put it all together + quote + ∅ = zero(eltype(B)) + Bᵥ = $Bᵥ + @inbounds $loops + return B + end +end + +# Turn non-static integers in `dims` tuple into `StaticInt`s +# so we can construct `static_dims` vector within @generated code +function branches_nanskewness_quote(N::Int, M::Int, D) + static_dims = Int[] + for m ∈ 1:M + param = D.parameters[m] + if param <: StaticInt + new_dim = _dim(param)::Int + @assert new_dim ∉ static_dims + push!(static_dims, new_dim) + else + t = Expr(:tuple) + for n ∈ static_dims + push!(t.args, :(StaticInt{$n}())) + end + q = Expr(:block, :(dimm = dims[$m])) + qold = q + ifsym = :if + for n ∈ 1:N + n ∈ static_dims && continue + tc = copy(t) + push!(tc.args, :(StaticInt{$n}())) + qnew = Expr(ifsym, :(dimm == $n), :(return _nanskewness!(B, corrected, A, $tc))) + for r ∈ m+1:M + push!(tc.args, :(dims[$r])) + end + push!(qold.args, qnew) + qold = qnew + ifsym = :elseif + end + push!(qold.args, Expr(:block, :(throw("Dimension `$dimm` not found.")))) + return q + end + end + staticdim_nanskewness_quote(static_dims, N) +end + +# Efficient @generated in-place var +@generated function _nanskewness!(B::AbstractArray{Tₒ,N}, corrected::Bool, A::AbstractArray{T,N}, dims::D) where {Tₒ,T,N,M,D<:Tuple{Vararg{IntOrStaticInt,M}}} + N == M && return :(B[1] = _nanskewness(B[1], corrected, A, :); B) + branches_nanskewness_quote(N, M, D) +end diff --git a/src/Histograms.jl b/src/Histograms.jl index 44e7695..b676614 100644 --- a/src/Histograms.jl +++ b/src/Histograms.jl @@ -116,16 +116,11 @@ export histvar """ ```julia -histskewness(counts, bincenters; corrected::Bool=true) +histskewness(counts, bincenters) ``` Estimate the skewness of the data represented by a histogram, specified as `counts` in equally spaced bins centered at `bincenters`. -If `counts` have been normalized, or represent an analytical estimate of a PDF -rather than a histogram representing counts of a dataset, Bessel's correction -to the variance should likely not be performed - i.e., set the -`corrected` keyword argument to `false`. - ## Examples ```julia julia> binedges = -10:0.01:10; @@ -134,12 +129,12 @@ julia> counts = histcounts(randn(10000), binedges); julia> bincenters = (binedges[1:end-1] + binedges[2:end])/2 -9.995:0.01:9.995 -t + julia> histskewness(counts, bincenters) -0.02609968854343211 +0.011075369240851738 ``` """ -function histskewness(counts, bincenters; corrected::Bool=true) +function histskewness(counts, bincenters; corrected::Bool=false) N = ∅ₙ = zero(eltype(counts)) Σ = ∅ = zero(Base.promote_op(*, eltype(counts), eltype(bincenters))) @inbounds @simd ivdep for i in eachindex(counts, bincenters) @@ -148,35 +143,26 @@ function histskewness(counts, bincenters; corrected::Bool=true) N += ifelse(isnan(pᵢ), ∅ₙ, counts[i]) end μ = Σ/N - Σ = ∅ - @inbounds @simd ivdep for i in eachindex(counts, bincenters) - dᵢ = bincenters[i] - μ - pᵢ = counts[i] * dᵢ * dᵢ - Σ += ifelse(isnan(pᵢ), ∅, pᵢ) - end - σ = sqrt(Σ / max(N-corrected, ∅ₙ)) - Σ = ∅ + μ₂ = μ₃ = ∅ @inbounds @simd ivdep for i in eachindex(counts, bincenters) dᵢ = bincenters[i] - μ - pᵢ = counts[i] * dᵢ^3 - Σ += ifelse(isnan(pᵢ), ∅, pᵢ) + δ² = dᵢ * dᵢ + pᵢ = counts[i] * δ² + μ₂ += ifelse(isnan(pᵢ), ∅, pᵢ) + μ₃ += ifelse(isnan(pᵢ), ∅, pᵢ*dᵢ) end - return (Σ / N)/σ^3 + σ = sqrt(μ₂ / max(N-corrected, ∅ₙ)) + return (μ₃ / N)/σ^3 end export histskewness """ ```julia -histkurtosis(counts, bincenters; corrected::Bool=true) +histkurtosis(counts, bincenters) ``` Estimate the excess kurtosis [1] of the data represented by a histogram, specified as `counts` in equally spaced bins centered at `bincenters`. -If `counts` have been normalized, or represent an analytical estimate of a PDF -rather than a histogram representing counts of a dataset, Bessel's correction -to the variance should likely not be performed - i.e., set the -`corrected` keyword argument to `false`. - [1] We follow Distributions.jl in returning excess kurtosis rather than raw kurtosis. Excess kurtosis is defined as as kurtosis - 3, such that a Normal distribution has zero excess kurtosis. @@ -194,7 +180,7 @@ julia> histkurtosis(counts, bincenters) 0.028863400305099596 ``` """ -function histkurtosis(counts, bincenters; corrected::Bool=true) +function histkurtosis(counts, bincenters; corrected::Bool=false) N = ∅ₙ = zero(eltype(counts)) Σ = ∅ = zero(Base.promote_op(*, eltype(counts), eltype(bincenters))) @inbounds @simd ivdep for i in eachindex(counts, bincenters) @@ -203,20 +189,16 @@ function histkurtosis(counts, bincenters; corrected::Bool=true) N += ifelse(isnan(pᵢ), ∅ₙ, counts[i]) end μ = Σ/N - Σ = ∅ + μ₂ = μ₄ = ∅ @inbounds @simd ivdep for i in eachindex(counts, bincenters) dᵢ = bincenters[i] - μ - pᵢ = counts[i] * dᵢ * dᵢ - Σ += ifelse(isnan(pᵢ), ∅, pᵢ) - end - σ = sqrt(Σ / max(N-corrected, ∅ₙ)) - Σ = ∅ - @inbounds @simd ivdep for i in eachindex(counts, bincenters) - dᵢ = bincenters[i] - μ - pᵢ = counts[i] * dᵢ^4 - Σ += ifelse(isnan(pᵢ), ∅, pᵢ) + δ² = dᵢ * dᵢ + pᵢ = counts[i] * δ² + μ₂ += ifelse(isnan(pᵢ), ∅, pᵢ) + μ₄ += ifelse(isnan(pᵢ), ∅, pᵢ*δ²) end - return (Σ / N)/σ^4 - 3 + σ = sqrt(μ₂ / max(N-corrected, ∅ₙ)) + return (μ₄ / N)/σ^4 - 3 end export histkurtosis diff --git a/src/NaNStatistics.jl b/src/NaNStatistics.jl index b1bff95..c4ac1b7 100644 --- a/src/NaNStatistics.jl +++ b/src/NaNStatistics.jl @@ -13,6 +13,8 @@ module NaNStatistics include("ArrayStats/nanlogsumexp.jl") include("ArrayStats/nanvar.jl") include("ArrayStats/nanstd.jl") + include("ArrayStats/nanskewness.jl") + include("ArrayStats/nankurtosis.jl") include("ArrayStats/nansem.jl") include("ArrayStats/nancov.jl") include("Sorting/quicksort.jl") diff --git a/test/testArrayStats.jl b/test/testArrayStats.jl index cb64f69..88e444e 100644 --- a/test/testArrayStats.jl +++ b/test/testArrayStats.jl @@ -51,6 +51,8 @@ @test nanpctile([0:100...,NaN],99) === 99.0 @test nanmin(1.,2.) === 1.0 @test nanmax(1.,2.) === 2.0 + @test nanskewness([1,2,3,NaN]) ≈ 0.0 + @test nankurtosis([1,2,3,NaN]) ≈ -1.5 ## --- Non-monotonic arrays @@ -73,6 +75,8 @@ @test nanaad(A) ≈ 2.5 @test nanmedian(A) ≈ 5.5 @test nanpctile(A,50) ≈ 5.5 + @test nanskewness(A) ≈ 0.0 + @test nankurtosis(A) ≈ -1.2242424242424244 ## --- Arrays containing only NaNs should yield NaN (or 0 for sums) @@ -98,6 +102,8 @@ @test isnan(nanmedian(A)) @test isnan(nanpctile(A, 90)) @test isnan(nanquantile(A, 0.9)) + @test isnan(nanskewness(A)) + @test isnan(nankurtosis(A)) @test isnan(nanmin(NaN,NaN)) @test isnan(nanmax(NaN,NaN)) @@ -121,6 +127,8 @@ @test isnan(nanmedian(A)) @test isnan(nanpctile(A, 90)) @test isnan(nanquantile(A, 0.9)) + @test isnan(nanskewness(A)) + @test isnan(nankurtosis(A)) @test nanargmin(A) === 1 @test nanargmax(A) === 1 @@ -150,6 +158,8 @@ @test nanaad([1,2,3]) ≈ 2/3 @test nanmedian([1,2,3]) === 2.0 @test nanpctile([0:100...],99) === 99.0 + @test nanskewness([1,2,3]) ≈ 0.0 + @test nankurtosis([1,2,3]) ≈ -1.5 @test nanmin(1,2) === 1 @test nanmax(1,2) === 2 @@ -178,6 +188,8 @@ @test nanaad(1:3) ≈ 2/3 @test nanmedian(1:3) === 2.0 @test nanpctile(0:100,99) === 99.0 + @test nanskewness(1:3) ≈ 0.0 + @test nankurtosis(1:3) ≈ -1.5 A = 1:10. @test nansum(A) === 55.0 @@ -202,6 +214,8 @@ @test nanaad(1:3.) ≈ 2/3 @test nanmedian(1:3.) === 2.0 @test nanpctile(0:100.,99) === 99.0 + @test nanskewness(1:3.) ≈ 0.0 + @test nankurtosis(1:3.) ≈ -1.5 ## --- Summary statistics: dimensional tests, Int64 @@ -239,7 +253,10 @@ @test nanmedian(A, dims=2) == median(A, dims=2) @test nanpctile(A, 10, dims=1) ≈ [10.9 110.9 210.9] @test nanpctile(A, 10, dims=2) ≈ 21:120 - + @test vec(nanskewness(A, dims=1)) ≈ skewness.(eachcol(A)) + @test vec(nanskewness(A, dims=2)) ≈ skewness.(eachrow(A)) + @test vec(nankurtosis(A, dims=1)) ≈ kurtosis.(eachcol(A)) + @test vec(nankurtosis(A, dims=2)) ≈ kurtosis.(eachrow(A)) ## --- Summary statistics: dimensional tests, Float64 @@ -291,6 +308,10 @@ @test nanmad(A, dims=2) == fill(100.0, 100, 1) @test nanaad(A, dims=1) == [25.0 25.0 25.0] @test nanaad(A, dims=2) ≈ fill(200/3, 100, 1) + @test vec(nanskewness(A, dims=1)) ≈ skewness.(eachcol(A)) + @test vec(nanskewness(A, dims=2)) ≈ skewness.(eachrow(A)) + @test vec(nankurtosis(A, dims=1)) ≈ kurtosis.(eachcol(A)) + @test vec(nankurtosis(A, dims=2)) ≈ kurtosis.(eachrow(A)) ## --- Summary statistics: dimensional tests, Float64, dim @@ -338,6 +359,10 @@ @test nanmad(A, dim=2) == fill(100.0, 100) @test nanaad(A, dim=1) == [25.0, 25.0, 25.0] @test nanaad(A, dim=2) ≈ fill(200/3, 100) + @test nanskewness(A, dim=1) ≈ skewness.(eachcol(A)) + @test nanskewness(A, dim=2) ≈ skewness.(eachrow(A)) + @test nankurtosis(A, dim=1) ≈ kurtosis.(eachcol(A)) + @test nankurtosis(A, dims=2) ≈ kurtosis.(eachrow(A)) ## --- Summary statistics: dimensional tests, Float64, nonstandard array type @@ -389,6 +414,10 @@ @test nanmad(A, dims=2) == fill(100.0, 100, 1) @test nanaad(A, dims=1) == [25.0 25.0 25.0] @test nanaad(A, dims=2) ≈ fill(200/3, 100, 1) + @test vec(nanskewness(A, dims=1)) ≈ skewness.(eachcol(A)) + @test vec(nanskewness(A, dims=2)) ≈ skewness.(eachrow(A)) + @test vec(nankurtosis(A, dims=1)) ≈ kurtosis.(eachcol(A)) + @test vec(nankurtosis(A, dims=2)) ≈ kurtosis.(eachrow(A)) ## --- Test fallbacks for complex reductions diff --git a/test/testHistograms.jl b/test/testHistograms.jl index 3c2957e..f820cd4 100644 --- a/test/testHistograms.jl +++ b/test/testHistograms.jl @@ -37,14 +37,20 @@ ## --- Statistics on histograms a = randn(10000) * 1.2 + @test nanmean(a) ≈ mean(a) + @test nanvar(a) ≈ var(a) + @test nanstd(a) ≈ std(a) + @test nanskewness(a) ≈ skewness(a) + @test nankurtosis(a) ≈ kurtosis(a) + binedges = -10:0.1:10 - bincenters = (binedges[1:end-1] + binedges[2:end])/2 + bincenters = (binedges[1:end-1] + binedges[2:end])/2 h = histcounts(a, binedges) - @test histmean(h, bincenters) ≈ nanmean(a) atol = 0.02 - @test histvar(h, bincenters) ≈ nanvar(a) atol = 0.02 - @test histstd(h, bincenters) ≈ nanstd(a) atol = 0.02 - @test histskewness(h, bincenters) ≈ 0 atol = 0.2 - @test histkurtosis(h, bincenters) ≈ 0 atol = 0.2 + @test histmean(h, bincenters) ≈ mean(a) atol = 0.02 + @test histvar(h, bincenters) ≈ var(a) atol = 0.02 + @test histstd(h, bincenters) ≈ std(a) atol = 0.02 + @test histskewness(h, bincenters) ≈ skewness(a) atol = 0.02 + @test histkurtosis(h, bincenters) ≈ kurtosis(a) atol = 0.02 n = pdf.(Normal(0,1), bincenters) @test histmean(n, bincenters) ≈ 0 atol = 1e-6