From f6c798223e6920725787095cb6b2de86a0f916ed Mon Sep 17 00:00:00 2001 From: Charles Dufour <34485907+dufourc1@users.noreply.github.com> Date: Tue, 28 Nov 2023 10:35:09 +0100 Subject: [PATCH] Hamming distance starting node labels (#47) Refactor and add original method to find initial node labels from paper --- Project.toml | 7 +- src/NetworkHistogram.jl | 6 +- src/assignment.jl | 109 +++++++++++++++++- src/config_rules/accept_rule.jl | 4 +- src/config_rules/bandwidth_selection_rule.jl | 61 ++++++++++ src/config_rules/starting_assignment_rule.jl | 8 ++ src/group_numbering.jl | 4 +- src/histogram.jl | 26 ++++- src/optimize.jl | 103 ++++++----------- src/proposal.jl | 40 ++++++- src/utils.jl | 104 ++++++++++++++++- .../starting_assigment_rule_test.jl | 37 ++++-- test/pipeline_test.jl | 60 ++++++++-- test/runtests.jl | 1 + test/simple_test_example.jl | 19 +++ test/test_multilayer.jl | 10 ++ 16 files changed, 492 insertions(+), 107 deletions(-) create mode 100644 src/config_rules/bandwidth_selection_rule.jl create mode 100644 test/test_multilayer.jl diff --git a/Project.toml b/Project.toml index db81c1e..7820e7d 100644 --- a/Project.toml +++ b/Project.toml @@ -1,14 +1,16 @@ name = "NetworkHistogram" uuid = "7806f430-7229-459c-b2e6-df35e8e4eb5d" authors = ["Charles Dufour", "Jake Grainger"] -version = "0.5.0" +version = "0.5.1" [deps] +ArnoldiMethod = "ec485272-7323-5ecc-a04f-4719b315124d" Arpack = "7d9fca2a-8960-54d3-9f78-7d1dccf2cb97" BenchmarkTools = "6e4b80f9-dd63-53aa-95a3-0cdb28fa8baf" CodecZstd = "6b39b394-51ab-5f42-8807-6242bab2b4c2" HTTP = "cd3eb016-35fb-5094-929b-558a96fad6f3" JLD = "4138dd39-2aa7-5051-a626-17a0bb65d9c8" +Kronecker = "2c470bb0-bcc8-11e8-3dad-c9649493f05e" LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e" ProgressMeter = "92933f4c-e287-5a05-a399-4b506db050ca" Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c" @@ -17,7 +19,6 @@ TranscodingStreams = "3bb67fe8-82b1-5028-8e26-92a6c54297fa" ValueHistories = "98cad3c8-aec3-5f06-8e41-884608649ab7" [compat] -julia = "1.8" Arpack = "0.5.4" BenchmarkTools = "1.3.2" CodecZstd = "0.7.2" @@ -27,7 +28,7 @@ ProgressMeter = "1.7.2" StatsBase = "0.33.21" TranscodingStreams = "0.9.11" ValueHistories = "0.5.4" - +julia = "1.8" [extras] Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40" diff --git a/src/NetworkHistogram.jl b/src/NetworkHistogram.jl index b72d37a..1272972 100644 --- a/src/NetworkHistogram.jl +++ b/src/NetworkHistogram.jl @@ -1,11 +1,12 @@ module NetworkHistogram -using ValueHistories, StatsBase, Random, LinearAlgebra +using ValueHistories, StatsBase, Random, LinearAlgebra, Kronecker using Arpack: eigs +using ArnoldiMethod: partialschur, partialeigen, SR, LR export graphhist, PreviousBestValue, Strict, RandomNodeSwap -export OrderedStart, RandomStart, EigenStart +export OrderedStart, RandomStart, EigenStart, DistStart include("group_numbering.jl") include("assignment.jl") @@ -15,6 +16,7 @@ include("config_rules/starting_assignment_rule.jl") include("config_rules/swap_rule.jl") include("config_rules/accept_rule.jl") include("config_rules/stop_rule.jl") +include("config_rules/bandwidth_selection_rule.jl") include("optimize.jl") include("histogram.jl") diff --git a/src/assignment.jl b/src/assignment.jl index b191acc..ea06117 100644 --- a/src/assignment.jl +++ b/src/assignment.jl @@ -1,16 +1,17 @@ -mutable struct Assignment{T} +mutable struct Assignment{T, M} const group_size::GroupSize{T} const node_labels::Vector{Int} const counts::Matrix{Int} - const realized::Matrix{Float64} - const estimated_theta::Matrix{Float64} + const realized::Array{Float64, M} + const estimated_theta::Array{Float64, M} + const number_layers::Int likelihood::Float64 function Assignment(A, node_labels, group_size::GroupSize{T}) where {T} + M = ndims(A) number_groups = length(group_size) - estimated_theta = zeros(Float64, size(A, 1), number_groups) counts = zeros(Int64, number_groups, number_groups) realized = zeros(Int64, number_groups, number_groups) @@ -33,15 +34,84 @@ mutable struct Assignment{T} likelihood = compute_log_likelihood(number_groups, estimated_theta, counts, size(A, 1)) - new{T}(group_size, + new{T, M}(group_size, node_labels, counts, realized, estimated_theta, + 1, + likelihood) + end + + function Assignment(A::Array{I, 3}, + node_labels, + group_size::GroupSize{T}) where {I, T} + M = ndims(A) + number_groups = length(group_size) + + counts = zeros(Int64, number_groups, number_groups) + realized = zeros(Int64, number_groups, number_groups, 2^size(A, 3)) + + A_updated = zeros(Int64, size(A, 1), size(A, 2)) + for i in 1:size(A, 1) + for j in (i + 1):size(A, 2) + A_updated[i, j] = _binary_to_index(A[i, j, :]) + A_updated[j, i] = A_updated[i, j] + end + end + + @inbounds @simd for m in 1:size(realized, 3) + for k in 1:number_groups + for l in k:number_groups + realized[k, l, m] = sum(A_updated[node_labels .== k, + node_labels .== l] .== m) + realized[l, k, m] = realized[k, l, m] + counts[k, l] = group_size[k] * group_size[l] + counts[l, k] = counts[k, l] + end + end + end + + @inbounds @simd for m in 1:size(realized, 3) + for k in 1:number_groups + counts[k, k] = group_size[k] * (group_size[k] - 1) ÷ 2 + realized[k, k, m] = sum(A_updated[node_labels .== k, node_labels .== k] .== + m) ÷ 2 + end + end + estimated_theta = realized ./ counts + likelihood = compute_multivariate_log_likelihood(number_groups, + estimated_theta, + realized) + + new{T, M}(group_size, + node_labels, + counts, + realized, + estimated_theta, + size(A, 3), likelihood) end end +function _binary_to_index(binary_vector::Vector{Int}) + total = 1 + for i in 1:length(binary_vector) + total += binary_vector[i] * 2^(i - 1) + end + return total +end + +function _index_to_binary(index::Int, M) + binary_vector = zeros(Int, M) + index -= 1 + for i in 1:M + binary_vector[i] = index % 2 + index = index ÷ 2 + end + return binary_vector +end + """ compute_log_likelihood(number_groups, estimated_theta, counts, number_nodes) @@ -66,6 +136,22 @@ function compute_log_likelihood(number_groups, estimated_theta, counts, number_n return loglik end +function compute_multivariate_log_likelihood(number_groups, estimated_theta, realized) + loglik = 0.0 + @inbounds @simd for i in 1:number_groups + for j in i:number_groups + for m in 1:size(realized, 3) + if realized[i, j, m] != 0 + θ = estimated_theta[i, j, m] + θ_c = θ <= 0 ? 1e-14 : (θ >= 1 ? 1 - 1e-14 : θ) + loglik += log(θ_c) * realized[i, j, m] + end + end + end + end + return loglik +end + """ compute_log_likelihood(assignment::Assignment) @@ -82,13 +168,24 @@ where ``\\hat{\\theta}_{ab}`` is the estimated probability of an edge between co \\hat{\\theta}_{ab} = \\frac{\\sum\\limits_{i current.likelihood deepcopy!(current, proposal) end diff --git a/src/config_rules/bandwidth_selection_rule.jl b/src/config_rules/bandwidth_selection_rule.jl new file mode 100644 index 0000000..fc0eb40 --- /dev/null +++ b/src/config_rules/bandwidth_selection_rule.jl @@ -0,0 +1,61 @@ + +function select_bandwidth(A::Array{T, 2}; type = "degs", alpha = 1, c = 1)::Int where {T} + h = oracle_bandwidth(A, type, alpha, c) + return max(2, min(size(A)[1], round(Int, h))) +end + +function select_bandwidth(A::Array{T, 3}; type = "degs", alpha = 1, c = 1)::Int where {T} + hs = [select_bandwidth(A[:, :, i]; type, alpha, c) for i in 1:size(A, 3)] + @warn "Naive bandwidth selection for multilayer graph histogram: using minimum over layers" + h = max(2, min(size(A, 1), round(Int, minimum(hs)))) + return h +end + +""" + oracle_bandwidth(A, type = "degs", alpha = 1, c = min(4, sqrt(size(A, 1)) / 8)) + +Oracle bandwidth selection for graph histogram, using + +```math +\\widehat{h^*}=\\left(2\\left(\\left(d^T d\\right)^{+}\\right)^2 d^T A d \\cdot \\hat{m} \\hat{b}\\right)^{-\\frac{1}{2}} \\hat{\\rho}_n^{\\frac{1}{4}}, +``` + +where ``d`` is the vector of degree sorted in increasing order,``\\hat{\\rho}_n`` is the empirical edge density, and ``m``, ``b`` are the slope and intercept fitted on ``d[n/2-c\\sqrt{n}:n/2+c\\sqrt{n}]`` for some ``c``. +""" +function oracle_bandwidth(A, type = "degs", alpha = 1, c = min(4, sqrt(size(A, 1)) / 8)) + if type ∉ ["eigs", "degs"] + error("Invalid input type $(type)") + end + + if alpha != 1 + error("Currently only supports alpha = 1") + end + + n = size(A, 1) + midPt = collect(max(1, round(Int, (n ÷ 2 - c * sqrt(n)))):round(Int, + (n ÷ 2 + c * sqrt(n)))) + rhoHat_inv = inv(sum(A) / (n * (n - 1))) + + # Rank-1 graphon estimate via fhat(x,y) = mult*u(x)*u(y)*pinv(rhoHat); + if type == "eigs" + eig_res = eigs(A, nev = 1, which = :LM) + u = eig_res.vectors + mult = eig_res.values[1] + elseif type == "degs" + u = sum(A, dims = 2) + mult = (u' * A * u) / (sum(u .^ 2))^2 + else + error("Invalid input type $(type)") + end + + # Calculation bandwidth + u = sort(u, dims = 1) + uMid = u[midPt] + lmfit_coef = hcat(ones(length(uMid)), 1:length(uMid)) \ uMid + + h = (2^(alpha + 1) * alpha * mult^2 * + (lmfit_coef[2] * length(uMid) / 2 + lmfit_coef[1])^2 * lmfit_coef[2]^2 * + rhoHat_inv)^(-1 / (2 * (alpha + 1))) + #estMSqrd = 2*mult^2*(lmfit_coef[2]*length(uMid)/2+lmfit_coef[1])^2*lmfit_coef[2]^2*rhoHat_inv^2*(n+1)^2 + return h[1] +end diff --git a/src/config_rules/starting_assignment_rule.jl b/src/config_rules/starting_assignment_rule.jl index a162532..1ee0261 100644 --- a/src/config_rules/starting_assignment_rule.jl +++ b/src/config_rules/starting_assignment_rule.jl @@ -2,6 +2,7 @@ abstract type StartingAssignment end struct OrderedStart <: StartingAssignment end struct RandomStart <: StartingAssignment end struct EigenStart <: StartingAssignment end +struct DistStart <: StartingAssignment end """ initialize_node_labels(A, h, starting_assignment_rule::StartingAssignment) @@ -13,6 +14,7 @@ node labels and a `GroupSize` object. - `OrderedStart()`: Sequentially assign nodes to groups based on the ordering of `A`. - `RandomStart()`: Randomly assign nodes to groups. - `EigenStart()`: Assign nodes to groups based on the second eigenvector of the normalized Laplacian. +- `DistStart()`: Assign nodes to groups based on the Hamming distance between rows of `A`. """ initialize_node_labels @@ -47,3 +49,9 @@ function initialize_node_labels(A, h, ::EigenStart) end return node_labels, group_size end + +function initialize_node_labels(A, h, ::DistStart) + group_size = GroupSize(size(A, 1), h) + node_labels = spectral_clustering(A, h) + return node_labels, group_size +end diff --git a/src/group_numbering.jl b/src/group_numbering.jl index 7380574..12d9691 100644 --- a/src/group_numbering.jl +++ b/src/group_numbering.jl @@ -19,11 +19,11 @@ struct GroupSize{T} <: AbstractVector{Int} else remainder_group = number_nodes - number_groups * standard_group if remainder_group == 1 - @warn "h has to be changes as only one node in remainder group" + @warn "h has to be changed, only one node in remainder group" standard_group -= 1 remainder_group = number_groups + 1 # because equal to 1+number_groups because we take 1 from each standard group, and there are number_groups of them if standard_group == 1 - error("Standard group size now 1, please choose a new h value.") + error("Standard group size now 1, please choose a new value for h.") end end new{Tuple{Int, Int}}((standard_group, remainder_group), number_groups + 1) diff --git a/src/histogram.jl b/src/histogram.jl index 79339ba..126d1ad 100644 --- a/src/histogram.jl +++ b/src/histogram.jl @@ -1,10 +1,11 @@ -struct GraphHist{T} - θ::Matrix{T} +struct GraphHist{T, M} + θ::Array{T, M} node_labels::Vector{Int} - function GraphHist(a::Assignment) + num_layers::Int + function GraphHist(a::Assignment{T, M}) where {T, M} θ = a.estimated_theta node_labels = a.node_labels - new{typeof(θ[1])}(θ, node_labels) + new{typeof(θ[1]), M}(θ, node_labels, a.number_layers) end end @@ -23,3 +24,20 @@ Contains the estimated network histogram and the node labels. blockmodel approximation." Proceedings of the National Academy of Sciences 111.41 (2014): 14722-14727. """ GraphHist + +function get_moment_representation(g::GraphHist{T, 2}) where {T} + return g.θ +end + +function get_moment_representation(g::GraphHist{T, 3}) where {T} + moments = zeros(size(g.θ, 1), size(g.θ, 2), 2^g.num_layers - 1) + transition = collect(kronecker([1 1; 0 1], g.num_layers)) + for i in 1:size(g.θ, 1) + for j in 1:size(g.θ, 2) + moments[i, j, :] .= (transition * g.θ[i, j, :])[2:end] + end + end + indices_for_moments = [findall(x -> x == 1, _index_to_binary(e, g.num_layers)) + for e in 2:size(g.θ, 3)] + return moments, indices_for_moments +end diff --git a/src/optimize.jl b/src/optimize.jl index f432c0d..751ffcf 100644 --- a/src/optimize.jl +++ b/src/optimize.jl @@ -3,9 +3,35 @@ function checkadjacency(A) if !(eltype(A) === Bool) @assert all(a ∈ [zero(eltype(A)), one(eltype(A))] for a in A) "All elements of the ajacency matrix should be zero or one." end + check_symmetry_and_diag(A) + return nothing +end + +function check_symmetry_and_diag(A) @assert issymmetric(A) @assert all(A[i, i] == zero(eltype(A)) for i in 1:size(A, 1)) "The diagonal of the adjacency matrix should all be zeros." - return nothing +end + +function check_symmetry_and_diag(A::Array{T, 3}) where {T} + for layer in eachslice(A, dims = 3) + check_symmetry_and_diag(layer) + @assert all(layer[i, i] == zero(eltype(layer)) for i in 1:size(layer, 1)) "The diagonal of the adjacency matrix should all be zeros." + end +end + +function update_adj(A::Array{T, 2}) where {T} + return A +end + +function update_adj(A::Array{T, 3}) where {T} + A_updated = zeros(Int64, size(A, 1), size(A, 2)) + for i in 1:size(A, 1) + for j in (i + 1):size(A, 2) + A_updated[i, j] = _binary_to_index(A[i, j, :]) + A_updated[j, i] = A_updated[i, j] + end + end + return A_updated end """ @@ -54,13 +80,14 @@ julia> loglikelihood = out.likelihood -22.337057781338277 ``` """ -function graphhist(A; h = select_bandwidth(A), maxitr = 1000, - swap_rule::NodeSwapRule = RandomNodeSwap(), - starting_assignment_rule::StartingAssignment = RandomStart(), - accept_rule::AcceptRule = Strict(), - stop_rule::StopRule = PreviousBestValue(3), record_trace = true) +function graphhist(A; h = select_bandwidth(A), maxitr = 10000, + swap_rule::NodeSwapRule = RandomNodeSwap(), + starting_assignment_rule::StartingAssignment = EigenStart(), + accept_rule::AcceptRule = Strict(), + stop_rule::StopRule = PreviousBestValue(100), record_trace = true) checkadjacency(A) @assert maxitr > 0 + A = drop_disconnected_components(A) return _graphhist(A, Val{record_trace}(), h = h, maxitr = maxitr, swap_rule = swap_rule, starting_assignment_rule = starting_assignment_rule, @@ -74,8 +101,8 @@ end Internal version of `graphhist` which is type stable. """ function _graphhist(A, record_trace = Val{true}(); h, maxitr, swap_rule, - starting_assignment_rule, accept_rule, stop_rule) - best, current, proposal, history = initialize(A, h, starting_assignment_rule, + starting_assignment_rule, accept_rule, stop_rule) + best, current, proposal, history, A = initialize(A, h, starting_assignment_rule, record_trace) for i in 1:maxitr @@ -103,8 +130,8 @@ function graphhist_format_output(best, history::NoTraceHistory) end function update_best!(history::GraphOptimizationHistory, iteration::Int, - current::Assignment, - best::Assignment) + current::Assignment, + best::Assignment) if current.likelihood > best.likelihood update_best!(history, iteration, current.likelihood) deepcopy!(best, current) @@ -123,59 +150,5 @@ function initialize(A, h, starting_assignment_rule, record_trace) current = deepcopy(proposal) best = deepcopy(proposal) history = initialize_history(best, current, proposal, record_trace) - return best, current, proposal, history -end - -function select_bandwidth(A, type = "degs", alpha = 1, c = 1)::Int - h = oracle_bandwidth(A, type, alpha, c) - return max(2, min(size(A)[1], round(Int, h))) -end - -""" - oracle_bandwidth(A, type = "degs", alpha = 1, c = min(4, sqrt(size(A, 1)) / 8)) - -Oracle bandwidth selection for graph histogram, using - -```math -\\widehat{h^*}=\\left(2\\left(\\left(d^T d\\right)^{+}\\right)^2 d^T A d \\cdot \\hat{m} \\hat{b}\\right)^{-\\frac{1}{2}} \\hat{\\rho}_n^{\\frac{1}{4}}, -``` - -where ``d`` is the vector of degree sorted in increasing order,``\\hat{\\rho}_n`` is the empirical edge density, and ``m``, ``b`` are the slope and intercept fitted on ``d[n/2-c\\sqrt{n}:n/2+c\\sqrt{n}]`` for some ``c``. -""" -function oracle_bandwidth(A, type = "degs", alpha = 1, c = min(4, sqrt(size(A, 1)) / 8)) - if type ∉ ["eigs", "degs"] - error("Invalid input type $(type)") - end - - if alpha != 1 - error("Currently only supports alpha = 1") - end - - n = size(A, 1) - midPt = collect(max(1, round(Int, (n ÷ 2 - c * sqrt(n)))):round(Int, - (n ÷ 2 + c * sqrt(n)))) - rhoHat_inv = inv(sum(A) / (n * (n - 1))) - - # Rank-1 graphon estimate via fhat(x,y) = mult*u(x)*u(y)*pinv(rhoHat); - if type == "eigs" - eig_res = eigs(A, nev = 1, which = :LM) - u = eig_res.vectors - mult = eig_res.values[1] - elseif type == "degs" - u = sum(A, dims = 2) - mult = (u' * A * u) / (sum(u .^ 2))^2 - else - error("Invalid input type $(type)") - end - - # Calculation bandwidth - u = sort(u, dims = 1) - uMid = u[midPt] - lmfit_coef = hcat(ones(length(uMid)), 1:length(uMid)) \ uMid - - h = (2^(alpha + 1) * alpha * mult^2 * - (lmfit_coef[2] * length(uMid) / 2 + lmfit_coef[1])^2 * lmfit_coef[2]^2 * - rhoHat_inv)^(-1 / (2 * (alpha + 1))) - #estMSqrd = 2*mult^2*(lmfit_coef[2]*length(uMid)/2+lmfit_coef[1])^2*lmfit_coef[2]^2*rhoHat_inv^2*(n+1)^2 - return h[1] + return best, current, proposal, history, update_adj(A) end diff --git a/src/proposal.jl b/src/proposal.jl index f4fb594..848a2fb 100644 --- a/src/proposal.jl +++ b/src/proposal.jl @@ -12,8 +12,8 @@ proposal is stored in the history. The `proposal` assignment is modified in place to avoid unnecessary memory allocation. """ function create_proposal!(history::GraphOptimizationHistory, iteration::Int, - proposal::Assignment, - current::Assignment, A, swap_rule) + proposal::Assignment, + current::Assignment, A, swap_rule) swap = select_swap(current, A, swap_rule) make_proposal!(proposal, current, swap, A) update_proposal!(history, iteration, proposal.likelihood) @@ -66,7 +66,8 @@ swap of the nodes specified in `swap`. NOTE labels of the nodes before the swap """ -function update_observed!(proposal::Assignment, swap::Tuple{Int, Int}, A) + +function update_observed!(proposal::Assignment{T, 2}, swap::Tuple{Int, Int}, A) where {T} group_node_1 = proposal.node_labels[swap[1]] group_node_2 = proposal.node_labels[swap[2]] @@ -99,3 +100,36 @@ function update_observed!(proposal::Assignment, swap::Tuple{Int, Int}, A) return nothing end + +function update_observed!(proposal::Assignment{T, 3}, swap::Tuple{Int, Int}, A) where {T} + group_node_1 = proposal.node_labels[swap[1]] + group_node_2 = proposal.node_labels[swap[2]] + if group_node_1 == group_node_2 + return nothing + end + + for i in axes(A, 1) + if i == swap[1] || i == swap[2] || A[swap[1], i] == A[swap[2], i] + continue + end + group_i = proposal.node_labels[i] + + proposal.realized[group_node_1, group_i, A[i, swap[1]]] -= 1 + proposal.realized[group_i, group_node_1, A[i, swap[1]]] = proposal.realized[group_node_1, + group_i, A[i, swap[1]]] + proposal.realized[group_node_2, group_i, A[i, swap[1]]] += 1 + proposal.realized[group_i, group_node_2, A[i, swap[1]]] = proposal.realized[group_node_2, + group_i, A[i, swap[1]]] + + proposal.realized[group_node_1, group_i, A[i, swap[2]]] += 1 + proposal.realized[group_i, group_node_1, A[i, swap[2]]] = proposal.realized[group_node_1, + group_i, A[i, swap[2]]] + proposal.realized[group_node_2, group_i, A[i, swap[2]]] -= 1 + proposal.realized[group_i, group_node_2, A[i, swap[2]]] = proposal.realized[group_node_2, + group_i, A[i, swap[2]]] + end + + @. proposal.estimated_theta = proposal.realized / proposal.counts + + return nothing +end diff --git a/src/utils.jl b/src/utils.jl index a33154c..3b85a6f 100644 --- a/src/utils.jl +++ b/src/utils.jl @@ -4,6 +4,106 @@ function laplacian(A) end function normalized_laplacian(A) - D_scale = vec(sum(A; dims = 1)) .^ -0.5 - return I(size(A, 1)) - Diagonal(D_scale) * (A * Diagonal(D_scale)) + L = zeros(size(A)) + degrees = vec(sum(A, dims = 1)) + for j in 1:size(A, 1) + for i in 1:size(A, 2) + if i == j + L[i, j] = 1 + else + L[i, j] = A[i, j] / sqrt(degrees[i] * degrees[j]) + end + end + end + return L +end + +function normalized_laplacian(A::AbstractArray{T, 3}) where {T} + L = zeros(size(A, 1), size(A, 2)) + for layer in eachslice(A, dims = 3) + L .+= normalized_laplacian(layer) + end + return L ./ size(A, 3) +end + +function drop_disconnected_components(A::AbstractArray{T, 2}) where {T} + indices = findall(x -> x != 0, vec(sum(A, dims = 1))) + return A[indices, indices] +end + +function drop_disconnected_components(A::AbstractArray{T, 3}) where {T} + indices = findall(x -> x != 0, vec(sum(A, dims = (1, 3)))) + return A[indices, indices, :] +end + +""" + hamming_distance(x, y) + +Compute the normalized Hamming distance between two vectors `x` and `y`. +""" +function hamming_distance(x::Vector{T}, y::Vector{T}) where {T} + return sum(x .!= y) / length(x) +end + +""" + pairwise_hamming_distance(A) + +Compute the pairwise Hamming distance between all rows of `A`. If `A` is a 3D +array, then the average Hamming distance for each layer of the array is returned. +""" +pairwise_hamming_distance + +function pairwise_hamming_distance(matrix::AbstractArray{T, 2}) where {T} + n = size(matrix, 1) + dist_matrix = zeros(n, n) + for i in 1:n, j in (i + 1):n + dist_matrix[i, j] = hamming_distance(matrix[i, :], matrix[j, :]) + dist_matrix[j, i] = dist_matrix[i, j] # Symmetric matrix + end + return dist_matrix +end + +function pairwise_hamming_distance(matrix::AbstractArray{T, 3}) where {T} + n = size(matrix, 1) + dist_matrix = zeros(n, n) + for layer in eachslice(matrix, dims = 3) + dist_matrix .+= pairwise_hamming_distance(layer) + end + return dist_matrix ./ size(matrix, 3) +end + +function spectral_clustering(A, h) + n = size(A, 1) + + L = 1 .- pairwise_hamming_distance(A) ./ n + + # Compute the degree matrix + d = sum(L, dims = 2) + + # Compute the normalized Laplacian + normalized_L = sum(1.0 ./ d) .* L .- sum(d) / sqrt(sum(d .^ 2)) + + # Compute eigenvalues and eigenvectors of the normalized Laplacian + + decomp, history = partialschur(normalized_L, nev = 2, which = LR()) + _, eigen_vecs = partialeigen(decomp) + + # Extract the second eigenvector + u = real.(eigen_vecs[:, 1]) + u = u .* sign(u[1]) # Set the first coordinate >= 0 wlog + + # Sort based on the embedding + ind = sortperm(u, alg = QuickSort, rev = false) + + # Determine the number of clusters + k = ceil(Int, n / h) + + # Initialize cluster assignments + idxInit = zeros(Int, n) + for i in 1:k + for j in ((i - 1) * h + 1):min(n, i * h) + idxInit[ind[j]] = i + end + end + return idxInit end diff --git a/test/config_rules/starting_assigment_rule_test.jl b/test/config_rules/starting_assigment_rule_test.jl index e867e54..5ff2617 100644 --- a/test/config_rules/starting_assigment_rule_test.jl +++ b/test/config_rules/starting_assigment_rule_test.jl @@ -1,12 +1,33 @@ import NetworkHistogram: initialize_node_labels -@testset "starting assigment rule" begin - A, _, _, _ = make_simple_example() - for method in (OrderedStart(), RandomStart(), EigenStart()) - node_labels, group_size = initialize_node_labels(A, 4, method) - if method isa OrderedStart - @test sort(node_labels) == node_labels + +@testset "starting assignment rules" begin + @testset "starting assigment rule simple graphs" begin + A, _, _, _ = make_simple_example() + for method in (OrderedStart(), + RandomStart(), + EigenStart(), + DistStart()) + node_labels, group_size = initialize_node_labels(A, 4, method) + if method isa OrderedStart + @test sort(node_labels) == node_labels + end + @test all(sum(n -> n == j, node_labels) == group_size[j] + for j in unique(node_labels)) + end + end + + @testset "starting assigment rule multilayer graphs" begin + A, _, _, _ = make_multivariate_example() + for method in (OrderedStart(), + RandomStart(), + EigenStart(), + DistStart()) + node_labels, group_size = initialize_node_labels(A, 4, method) + if method isa OrderedStart + @test sort(node_labels) == node_labels + end + @test all(sum(n -> n == j, node_labels) == group_size[j] + for j in unique(node_labels)) end - @test all(sum(n -> n == j, node_labels) == group_size[j] - for j in unique(node_labels)) end end diff --git a/test/pipeline_test.jl b/test/pipeline_test.jl index 6470297..7f86664 100644 --- a/test/pipeline_test.jl +++ b/test/pipeline_test.jl @@ -1,15 +1,15 @@ @testset "Pipeline" begin + A = [0 0 1 0 1 0 1 1 0 1 + 0 0 1 1 1 1 1 1 0 0 + 1 1 0 1 0 0 0 0 1 0 + 0 1 1 0 1 0 1 0 0 0 + 1 1 0 1 0 0 1 0 0 1 + 0 1 0 0 0 0 0 1 0 0 + 1 1 0 1 1 0 0 1 0 1 + 1 1 0 0 0 1 1 0 0 1 + 0 0 1 0 0 0 0 0 0 1 + 1 0 0 0 1 0 1 1 1 0] @testset "dummy run" begin - A = [0 0 1 0 1 0 1 1 0 1 - 0 0 1 1 1 1 1 1 0 0 - 1 1 0 1 0 0 0 0 1 0 - 0 1 1 0 1 0 1 0 0 0 - 1 1 0 1 0 0 1 0 0 1 - 0 1 0 0 0 0 0 1 0 0 - 1 1 0 1 1 0 0 1 0 1 - 1 1 0 0 0 1 1 0 0 1 - 0 0 1 0 0 0 0 0 0 1 - 1 0 0 0 1 0 1 1 1 0] @testset "run bandwidth float" begin estimated = graphhist(A; h = 0.5) @test all(estimated.graphhist.θ .>= 0.0) @@ -22,6 +22,11 @@ @test all(estimated.graphhist.θ .<= 1.0) @test size(estimated.graphhist.θ) == (2, 2) end + @testset "run with automatic bandwidth" begin + estimated = graphhist(A) + @test all(estimated.graphhist.θ .>= 0.0) + @test all(estimated.graphhist.θ .<= 1.0) + end end @testset "associative stochastic block model" begin @@ -41,4 +46,39 @@ end end end + + @testset "multilayer run" begin + @testset "2 layers perfectly correlated" begin + A_2 = cat(A, A, dims = 3) + estimated, history = graphhist(A_2; h = 0.5) + @test all(estimated.θ .>= 0.0) + @test all(estimated.θ .<= 1.0) + @test size(estimated.θ) == (2, 2, 4) + end + @testset "run with automatic bandwidth" begin + A_2 = cat(A, A, dims = 3) + estimated, history = graphhist(A_2) + @test all(estimated.θ .>= 0.0) + @test all(estimated.θ .<= 1.0) + end + + @testset "2 layers perfectly anti-correlated" begin + A_2 = cat(A, abs.(A .- 1), dims = 3) + for i in 1:size(A, 1) + A_2[i, i, 2] = 0 + end + estimated, history = graphhist(A_2; h = 0.5) + @test all(estimated.θ .>= 0.0) + @test all(estimated.θ .<= 1.0) + @test size(estimated.θ) == (2, 2, 4) + end + + @testset "3 layers" begin + A_3 = cat(A, A, A, dims = 3) + estimated, history = graphhist(A_3; h = 0.5) + @test all(estimated.θ .>= 0.0) + @test all(estimated.θ .<= 1.0) + @test size(estimated.θ) == (2, 2, 8) + end + end end diff --git a/test/runtests.jl b/test/runtests.jl index 8759e16..643bfc7 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -6,6 +6,7 @@ include("simple_test_example.jl") @testset "NetworkHistogram.jl" begin include("pipeline_test.jl") + include("test_multilayer.jl") include("proposal_test.jl") include("starting_labels_test.jl") include("oracle_bandwidth_test.jl") diff --git a/test/simple_test_example.jl b/test/simple_test_example.jl index cc40bc8..010f511 100644 --- a/test/simple_test_example.jl +++ b/test/simple_test_example.jl @@ -18,3 +18,22 @@ function make_simple_example() assignment = NetworkHistogram.Assignment(A, node_labels, group_size) return A, node_labels, group_size, assignment end + +function make_multivariate_example() + A = [0 1 1 1 0 0 1 0 + 1 0 1 1 0 0 0 0 + 1 1 0 0 0 0 0 0 + 1 1 0 0 0 0 0 1 + 0 0 0 0 0 1 1 1 + 0 0 0 0 1 0 1 1 + 1 0 0 0 1 1 0 0 + 0 0 0 1 1 1 0 0] + A = cat(A, abs.(A .- 1), dims = 3) + for i in size(A, 1) + A[i, i, :] .= 0 + end + node_labels = [1, 1, 1, 1, 2, 2, 2, 2] + group_size = NetworkHistogram.GroupSize(8, 4) + assignment = NetworkHistogram.Assignment(A, node_labels, group_size) + return A, node_labels, group_size, assignment +end diff --git a/test/test_multilayer.jl b/test/test_multilayer.jl new file mode 100644 index 0000000..aba8e04 --- /dev/null +++ b/test/test_multilayer.jl @@ -0,0 +1,10 @@ +@testset "multilayer" begin + @testset "test initialisation of assignments" begin + A, labels, group_size, assignment = make_simple_example() + A2, _, _, assignment2 = make_multivariate_example() + @test all(assignment.estimated_theta .== assignment2.estimated_theta[:, :, 2]) + @test all(assignment.realized .== assignment2.realized[:, :, 2]) + @test assignment.likelihood == assignment2.likelihood + @test sum(assignment2.estimated_theta) ≈ size(assignment2.estimated_theta, 1)^2 + end +end