Skip to content

Commit

Permalink
Merge remote-tracking branch 'origin/feature/query-stats-followup' in…
Browse files Browse the repository at this point in the history
…to staging/query-statistics
  • Loading branch information
awildturtok committed Jan 3, 2024
2 parents 1c46e55 + c6a4a30 commit cd5027b
Show file tree
Hide file tree
Showing 2 changed files with 157 additions and 84 deletions.
Original file line number Diff line number Diff line change
@@ -0,0 +1,148 @@
package com.bakdata.conquery.models.query.statistics;

import java.util.ArrayList;
import java.util.Collections;
import java.util.List;

import com.google.common.math.Stats;
import it.unimi.dsi.fastutil.doubles.DoubleArrayList;
import it.unimi.dsi.fastutil.doubles.DoubleList;
import lombok.Data;

@Data
public class DynamicHistogram {
private final Node[] nodes;
private final double min;
private final double width;

public static DynamicHistogram create(double min, double max, int expectedBins) {
return new DynamicHistogram(new Node[expectedBins], min, (max - min) / expectedBins);
}

public void add(double value) {
final int index = (int) Math.floor((min - value) / width);

if (nodes[index] == null) {
nodes[index] = new Node(new DoubleArrayList());
}

nodes[index].add(value);
}

public List<Node> balanced(int expectedBins, int total) {

final List<Node> merged = mergeLeft(total, nodes);

final List<Node> split = splitRight(expectedBins, merged);

return split;

}

private static List<Node> mergeLeft(int total, Node[] nodes) {
final List<Node> bins = new ArrayList<>();

Node prior = null;

for (Node bin : nodes) {

if (prior == null) {
prior = bin;
continue;
}

// If the bin is too small, we merge-left
if ((double) prior.getCount() / total <= (1d / total)) {
prior = prior.merge(bin);
continue;
}

// Only emit bin, if we cannot merge left.
bins.add(prior);
prior = null;
}

if (prior != null) {
bins.add(prior);
}

// since we're merging from right, we need to reverse
Collections.reverse(bins);

return bins;
}

private static List<Node> splitRight(int expectedBins, List<Node> nodes) {

if ((double) nodes.size() / (double) expectedBins >= 0.7d) {
return nodes;
}

final List<Node> bins = new ArrayList<>();

final Stats stats = nodes.stream().mapToDouble(node -> (double) node.getCount()).boxed().collect(Stats.toStats());

final double stdDev = stats.sampleStandardDeviation();
final double mean = stats.mean();


for (Node node : nodes) {
if (node.getCount() < mean + stdDev) {
bins.add(node);
continue;
}

bins.addAll(node.split());
}

return bins;
}

@Data
public static final class Node {
private final DoubleList entries;
private double min = Double.MAX_VALUE;
private double max = Double.MIN_VALUE;

public Node merge(Node other) {
final Node out = new Node(new DoubleArrayList(getCount() + other.getCount()));

out.max = Math.max(max, other.getMax());
out.min = Math.min(min, other.getMin());

out.entries.addAll(other.getEntries());
out.entries.addAll(getEntries());

return out;
}

public int getCount() {
return entries.size();
}

public List<Node> split() {
final double mean = entries.doubleStream().average().getAsDouble();
final Node lower = new Node(new DoubleArrayList());
final Node higher = new Node(new DoubleArrayList());


for (double entry : entries) {
if (entry <= mean) {
lower.add(entry);
}
else {
higher.add(entry);
}
}

return List.of(lower, higher);
}

public void add(double value) {
max = Math.max(max, value);
min = Math.min(min, value);
entries.add(value);
}
}

}
Original file line number Diff line number Diff line change
Expand Up @@ -10,17 +10,11 @@
import java.util.List;
import java.util.Map;
import java.util.concurrent.atomic.AtomicLong;
import java.util.stream.DoubleStream;

import c10n.C10N;
import com.bakdata.conquery.models.i18n.I18n;
import com.bakdata.conquery.models.query.PrintSettings;
import com.bakdata.conquery.models.types.ResultType;
import com.dynatrace.dynahist.Histogram;
import com.dynatrace.dynahist.bin.Bin;
import com.dynatrace.dynahist.layout.CustomLayout;
import com.dynatrace.dynahist.layout.Layout;
import com.dynatrace.dynahist.layout.LogLinearLayout;
import lombok.Getter;
import org.apache.commons.math3.stat.descriptive.DescriptiveStatistics;
import org.jetbrains.annotations.NotNull;
Expand Down Expand Up @@ -92,34 +86,32 @@ public ResultColumnStatistics describe() {
return new StringColumnStatsCollector.ColumnDescription(getName(), getLabel(), getDescription(), Collections.emptyList(), Collections.emptyMap());
}

final List<StringColumnStatsCollector.ColumnDescription.Entry> bins = createBins(getStatistics().getN(), 15d);
final List<StringColumnStatsCollector.ColumnDescription.Entry> bins = createBins(((int) getStatistics().getN()), 15);
final Map<String, String> extras = getExtras();

return new StringColumnStatsCollector.ColumnDescription(getName(), getLabel(), getDescription(), bins, extras);
}

@NotNull
private List<StringColumnStatsCollector.ColumnDescription.Entry> createBins(long total, double expectedBins) {
final Layout layout = getLayout();
private List<StringColumnStatsCollector.ColumnDescription.Entry> createBins(int total, int expectedBins) {
final DynamicHistogram histogram = DynamicHistogram.create(getStatistics().getMin(), getStatistics().getMax(), expectedBins);

final Histogram histogram = Histogram.createDynamic(layout);
Arrays.stream(getStatistics().getValues()).forEach(histogram::add);

Arrays.stream(getStatistics().getValues()).forEach(histogram::addValue);

final List<Count> bins = postProcessBins((double) total, expectedBins, histogram);
final List<DynamicHistogram.Node> balanced = histogram.balanced(expectedBins, total);


final List<StringColumnStatsCollector.ColumnDescription.Entry> entries = new ArrayList<>();


for (Count bin : bins) {
final String lower = printValue(bin.min());
final String upper = printValue(bin.max());
for (DynamicHistogram.Node bin : balanced) {
final String lower = printValue(bin.getMin());
final String upper = printValue(bin.getMax());

final String binLabel = String.format("%s - %s", lower, upper);


entries.add(new StringColumnStatsCollector.ColumnDescription.Entry(binLabel, bin.count()));
entries.add(new StringColumnStatsCollector.ColumnDescription.Entry(binLabel, bin.getCount()));
}
return entries;
}
Expand Down Expand Up @@ -148,81 +140,14 @@ private Map<String, String> getExtras() {
return out;
}

@NotNull
private Layout getLayout() {
if (getType() instanceof ResultType.IntegerT) {
return getIntegerLayout();
}

final Layout initial = LogLinearLayout.create(getStatistics().getStandardDeviation() / 2, 1 / 10d, getStatistics().getMin(), getStatistics().getMax());


return initial;
}

/**
* Merge small bins from left to right order
*/
@NotNull
private static List<Count> postProcessBins(double total, double expectedBins, Histogram histogram) {
final List<Count> bins = new ArrayList<>();

Count prior = null;

for (Bin bin : histogram.nonEmptyBinsDescending()) {
final Count current = Count.fromBin(bin);

if (prior == null) {
prior = current;
continue;
}

// If the bin is too small, we merge-left
if ((double) prior.count() / total <= (1 / expectedBins)) {
prior = prior.merge(current);
continue;
}

// Only emit bin, if we cannot merge left.
bins.add(prior);
prior = null;
}

if(prior != null) {
bins.add(prior);
}

return bins;
}

private String printValue(Number value) {
return formatter.format(value.doubleValue());
}

private Layout getIntegerLayout() {
final int expectedBins = 15;

final int min = (int) Math.round(getStatistics().getMin());
final int max = (int) Math.round(getStatistics().getMax());

final int nBins = Math.min(1 + min - max, expectedBins);

final int width = Math.max(1, (min - max) / nBins);


return CustomLayout.create(DoubleStream.iterate(min, cur -> cur <= max, cur -> cur + width).toArray());
}

private record Count(double min, double max, long count) {

public static Count fromBin(Bin bin) {
return new Count(bin.getLowerBound(), bin.getUpperBound(), bin.getBinCount());
}

public Count merge(Count other) {
return new Count(Math.min(min(), other.min()), Math.max(max(), other.max()), count() + other.count());
}

}

}

0 comments on commit cd5027b

Please sign in to comment.