From c6a4a3019e9d17c3be01c6422fccb016ef9ddf5e Mon Sep 17 00:00:00 2001 From: awildturtok <1553491+awildturtok@users.noreply.github.com> Date: Wed, 3 Jan 2024 13:06:43 +0100 Subject: [PATCH] use custom DynamicHistogram instead --- .../query/statistics/DynamicHistogram.java | 148 ++++++++++++++++++ .../NumberColumnStatsCollector.java | 95 ++--------- 2 files changed, 157 insertions(+), 86 deletions(-) create mode 100644 backend/src/main/java/com/bakdata/conquery/models/query/statistics/DynamicHistogram.java diff --git a/backend/src/main/java/com/bakdata/conquery/models/query/statistics/DynamicHistogram.java b/backend/src/main/java/com/bakdata/conquery/models/query/statistics/DynamicHistogram.java new file mode 100644 index 0000000000..b52edaf166 --- /dev/null +++ b/backend/src/main/java/com/bakdata/conquery/models/query/statistics/DynamicHistogram.java @@ -0,0 +1,148 @@ +package com.bakdata.conquery.models.query.statistics; + +import java.util.ArrayList; +import java.util.Collections; +import java.util.List; + +import com.google.common.math.Stats; +import it.unimi.dsi.fastutil.doubles.DoubleArrayList; +import it.unimi.dsi.fastutil.doubles.DoubleList; +import lombok.Data; + +@Data +public class DynamicHistogram { + private final Node[] nodes; + private final double min; + private final double width; + + public static DynamicHistogram create(double min, double max, int expectedBins) { + return new DynamicHistogram(new Node[expectedBins], min, (max - min) / expectedBins); + } + + public void add(double value) { + final int index = (int) Math.floor((min - value) / width); + + if (nodes[index] == null) { + nodes[index] = new Node(new DoubleArrayList()); + } + + nodes[index].add(value); + } + + public List balanced(int expectedBins, int total) { + + final List merged = mergeLeft(total, nodes); + + final List split = splitRight(expectedBins, merged); + + return split; + + } + + private static List mergeLeft(int total, Node[] nodes) { + final List bins = new ArrayList<>(); + + Node prior = null; + + for (Node bin : nodes) { + + if (prior == null) { + prior = bin; + continue; + } + + // If the bin is too small, we merge-left + if ((double) prior.getCount() / total <= (1d / total)) { + prior = prior.merge(bin); + continue; + } + + // Only emit bin, if we cannot merge left. + bins.add(prior); + prior = null; + } + + if (prior != null) { + bins.add(prior); + } + + // since we're merging from right, we need to reverse + Collections.reverse(bins); + + return bins; + } + + private static List splitRight(int expectedBins, List nodes) { + + if ((double) nodes.size() / (double) expectedBins >= 0.7d) { + return nodes; + } + + final List bins = new ArrayList<>(); + + final Stats stats = nodes.stream().mapToDouble(node -> (double) node.getCount()).boxed().collect(Stats.toStats()); + + final double stdDev = stats.sampleStandardDeviation(); + final double mean = stats.mean(); + + + for (Node node : nodes) { + if (node.getCount() < mean + stdDev) { + bins.add(node); + continue; + } + + bins.addAll(node.split()); + } + + return bins; + } + + @Data + public static final class Node { + private final DoubleList entries; + private double min = Double.MAX_VALUE; + private double max = Double.MIN_VALUE; + + public Node merge(Node other) { + final Node out = new Node(new DoubleArrayList(getCount() + other.getCount())); + + out.max = Math.max(max, other.getMax()); + out.min = Math.min(min, other.getMin()); + + out.entries.addAll(other.getEntries()); + out.entries.addAll(getEntries()); + + return out; + } + + public int getCount() { + return entries.size(); + } + + public List split() { + final double mean = entries.doubleStream().average().getAsDouble(); + final Node lower = new Node(new DoubleArrayList()); + final Node higher = new Node(new DoubleArrayList()); + + + for (double entry : entries) { + if (entry <= mean) { + lower.add(entry); + } + else { + higher.add(entry); + } + } + + return List.of(lower, higher); + } + + public void add(double value) { + max = Math.max(max, value); + min = Math.min(min, value); + entries.add(value); + } + } + +} diff --git a/backend/src/main/java/com/bakdata/conquery/models/query/statistics/NumberColumnStatsCollector.java b/backend/src/main/java/com/bakdata/conquery/models/query/statistics/NumberColumnStatsCollector.java index 0728c6a9e0..f50f4d1b30 100644 --- a/backend/src/main/java/com/bakdata/conquery/models/query/statistics/NumberColumnStatsCollector.java +++ b/backend/src/main/java/com/bakdata/conquery/models/query/statistics/NumberColumnStatsCollector.java @@ -10,17 +10,11 @@ import java.util.List; import java.util.Map; import java.util.concurrent.atomic.AtomicLong; -import java.util.stream.DoubleStream; import c10n.C10N; import com.bakdata.conquery.models.i18n.I18n; import com.bakdata.conquery.models.query.PrintSettings; import com.bakdata.conquery.models.types.ResultType; -import com.dynatrace.dynahist.Histogram; -import com.dynatrace.dynahist.bin.Bin; -import com.dynatrace.dynahist.layout.CustomLayout; -import com.dynatrace.dynahist.layout.Layout; -import com.dynatrace.dynahist.layout.LogLinearLayout; import lombok.Getter; import org.apache.commons.math3.stat.descriptive.DescriptiveStatistics; import org.jetbrains.annotations.NotNull; @@ -92,34 +86,32 @@ public ResultColumnStatistics describe() { return new StringColumnStatsCollector.ColumnDescription(getName(), getLabel(), getDescription(), Collections.emptyList(), Collections.emptyMap()); } - final List bins = createBins(getStatistics().getN(), 15d); + final List bins = createBins(((int) getStatistics().getN()), 15); final Map extras = getExtras(); return new StringColumnStatsCollector.ColumnDescription(getName(), getLabel(), getDescription(), bins, extras); } @NotNull - private List createBins(long total, double expectedBins) { - final Layout layout = getLayout(); + private List createBins(int total, int expectedBins) { + final DynamicHistogram histogram = DynamicHistogram.create(getStatistics().getMin(), getStatistics().getMax(), expectedBins); - final Histogram histogram = Histogram.createDynamic(layout); + Arrays.stream(getStatistics().getValues()).forEach(histogram::add); - Arrays.stream(getStatistics().getValues()).forEach(histogram::addValue); - - final List bins = postProcessBins((double) total, expectedBins, histogram); + final List balanced = histogram.balanced(expectedBins, total); final List entries = new ArrayList<>(); - for (Count bin : bins) { - final String lower = printValue(bin.min()); - final String upper = printValue(bin.max()); + for (DynamicHistogram.Node bin : balanced) { + final String lower = printValue(bin.getMin()); + final String upper = printValue(bin.getMax()); final String binLabel = String.format("%s - %s", lower, upper); - entries.add(new StringColumnStatsCollector.ColumnDescription.Entry(binLabel, bin.count())); + entries.add(new StringColumnStatsCollector.ColumnDescription.Entry(binLabel, bin.getCount())); } return entries; } @@ -148,83 +140,14 @@ private Map getExtras() { return out; } - @NotNull - private Layout getLayout() { - if (getType() instanceof ResultType.IntegerT) { - return getIntegerLayout(); - } - - final Layout initial = LogLinearLayout.create(getStatistics().getStandardDeviation() / 2, 1 / 10d, getStatistics().getMin(), getStatistics().getMax()); - return initial; - } - - /** - * Merge small bins from left to right order - */ - @NotNull - private static List postProcessBins(double total, double expectedBins, Histogram histogram) { - final List bins = new ArrayList<>(); - - Count prior = null; - - for (Bin bin : histogram.nonEmptyBinsDescending()) { - final Count current = Count.fromBin(bin); - - if (prior == null) { - prior = current; - continue; - } - - // If the bin is too small, we merge-left - if ((double) prior.count() / total <= (1 / expectedBins)) { - prior = prior.merge(current); - continue; - } - - // Only emit bin, if we cannot merge left. - bins.add(prior); - prior = null; - } - - if(prior != null) { - bins.add(prior); - } - Collections.reverse(bins); - - return bins; - } private String printValue(Number value) { return formatter.format(value.doubleValue()); } - private Layout getIntegerLayout() { - final int expectedBins = 15; - - final int min = (int) Math.round(getStatistics().getMin()); - final int max = (int) Math.round(getStatistics().getMax()); - - final int nBins = Math.min(1 + min - max, expectedBins); - - final int width = Math.max(1, (min - max) / nBins); - - - return CustomLayout.create(DoubleStream.iterate(min, cur -> cur <= max, cur -> cur + width).toArray()); - } - - private record Count(double min, double max, long count) { - - public static Count fromBin(Bin bin) { - return new Count(bin.getLowerBound(), bin.getUpperBound(), bin.getBinCount()); - } - public Count merge(Count other) { - return new Count(Math.min(min(), other.min()), Math.max(max(), other.max()), count() + other.count()); - } - - } }