Skip to content

Commit

Permalink
Merge remote-tracking branch 'origin/feature/query-stats-followup' into staging/query-statistics
Browse files: browse the repository at this point in the history
  • Loading branch information
awildturtok committed Jan 8, 2024
2 parents 9ed660f + f17fdb3 commit eefe738
Show file tree
Hide file tree
Showing 3 changed files with 186 additions and 6 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -2,11 +2,16 @@

import java.util.ArrayDeque;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.Deque;
import java.util.List;
import java.util.Objects;

import groovy.lang.Tuple;
import groovy.lang.Tuple2;
import it.unimi.dsi.fastutil.doubles.DoubleArrayList;
import it.unimi.dsi.fastutil.doubles.DoubleIterator;
import it.unimi.dsi.fastutil.doubles.DoubleList;
import lombok.Data;
import lombok.ToString;
Expand All @@ -23,8 +28,18 @@ public class BalancingHistogram {

private int total;

public static BalancingHistogram create(double min, double max, int expectedBins, double stiffness) {
return new BalancingHistogram(new Node[expectedBins], min, (max - min) / (expectedBins - 1), expectedBins, stiffness);
/**
 * Creates an empty histogram with equally wide bins spanning {@code min} to {@code max}.
 *
 * <p>Without snapping, the bin width is {@code (max - min) / (expectedBins - 1)}, so that
 * {@code max} itself falls into the last of {@code expectedBins} bins.
 *
 * <p>With snapping, {@code min} is rounded down and {@code max} is rounded up to whole
 * numbers, the width is rounded to a whole number of at least 1, and the number of bins
 * is recomputed so the snapped range is still covered.
 *
 * @param min          smallest value expected
 * @param max          largest value expected
 * @param expectedBins desired number of bins; recomputed when {@code snap} is set
 * @param stiffness    passed through to the histogram unchanged
 * @param snap         whether to align bounds and bin width to whole numbers
 * @return a new histogram without any entries
 */
public static BalancingHistogram create(double min, double max, int expectedBins, double stiffness, boolean snap) {
    double width = (max - min) / (expectedBins - 1);

    if (snap) {
        min = Math.floor(min);
        max = Math.ceil(max);

        // A snapped bin must be at least one unit wide: rounding alone may yield 0,
        // which would make the bin count below divide by zero.
        width = Math.max(1, Math.round(width));

        // Same convention as the unsnapped case (width = range / (bins - 1)):
        // one more bin than fits into the range, so that max lands in the last bin.
        // NOTE(review): assumes bins are indexed by (value - min) / width - confirm against add().
        expectedBins = (int) Math.floor((max - min) / width) + 1;
    }

    return new BalancingHistogram(new Node[expectedBins], min, width, expectedBins, stiffness);
}

public void add(double value) {
Expand Down Expand Up @@ -94,7 +109,7 @@ private List<Node> splitRight(List<Node> nodes) {

final Deque<Node> frontier = new ArrayDeque<>(nodes);

while(!frontier.isEmpty()) {
while (!frontier.isEmpty()) {
final Node node = frontier.pop();
if (node.getCount() <= (expectedBinSize * (1 + stiffness))) {
bins.add(node);
Expand All @@ -107,7 +122,8 @@ private List<Node> splitRight(List<Node> nodes) {
final Node higher = split.get(1);

// node has a heavy bias
if(Math.min(higher.getCount(), lower.getCount()) <= expectedBinSize * 0.1d /* This is not the merge threshold, just a sufficiently small number */){
if (Math.min(higher.getCount(), lower.getCount())
<= expectedBinSize * 0.1d /* This is not the merge threshold, just a sufficiently small number */) {
bins.add(node);
continue;
}
Expand All @@ -119,6 +135,57 @@ private List<Node> splitRight(List<Node> nodes) {
return bins;
}

/**
 * Aligns the boundaries of all existing nodes to whole numbers and moves entries that no
 * longer fit their node into the neighbouring node.
 *
 * <p>Nodes are visited in array order, empty ({@code null}) slots are skipped. Each node
 * starts where its predecessor ended; its upper bound is its rounded maximum, but never
 * below its lower bound. The first node starts at its minimum rounded down, the last
 * existing node ends at its maximum rounded up and keeps entries equal to that bound.
 *
 * <p>This mutates the nodes of this histogram.
 *
 * @return all nodes that still contain entries after snapping, in order
 */
public List<Node> snapped() {
    // Only existing nodes take part, so neighbours are never null and the
    // first/last node are the first/last ones that actually exist.
    final List<Node> present = Arrays.stream(nodes).filter(Objects::nonNull).toList();

    if (present.isEmpty()) {
        return List.of();
    }

    final int lastIndex = present.size() - 1;

    double min;
    double max = Math.floor(present.get(0).min);

    for (int index = 0; index <= lastIndex; index++) {
        final Node current = present.get(index);

        min = max;
        max = Math.max(min, Math.round(current.max));

        final boolean isLast = index == lastIndex;

        if (isLast) {
            // Round up instead, so the largest entries are not cut off.
            max = Math.ceil(current.max);
        }

        final Tuple2<DoubleList, DoubleList> spill = current.adjust(min, max);

        final DoubleList lower = spill.getV1();

        if (!lower.isEmpty()) {
            // The first node has no predecessor: keep the entries instead of failing.
            final Node target = index == 0 ? current : present.get(index - 1);
            lower.forEach(target::add);
        }

        final DoubleList higher = spill.getV2();

        if (!higher.isEmpty()) {
            // The last node has no successor: its upper bound is inclusive.
            final Node target = isLast ? current : present.get(index + 1);
            higher.forEach(target::add);
        }
    }

    return present.stream().filter(node -> node.getCount() > 0).toList();
}

/**
 * Lists the nodes of this histogram as they are, without balancing or snapping.
 *
 * @return an unmodifiable list of all existing (non-null) nodes in array order
 */
public List<Node> nodes() {
    final List<Node> present = new ArrayList<>(nodes.length);

    for (final Node candidate : nodes) {
        if (candidate != null) {
            present.add(candidate);
        }
    }

    return Collections.unmodifiableList(present);
}

@Data
public static final class Node {
@ToString.Exclude
Expand Down Expand Up @@ -166,6 +233,30 @@ public void add(double value) {
min = Math.min(min, value);
entries.add(value);
}

/**
 * Sets new bounds for this node and evicts every entry outside of {@code [min, max)}.
 *
 * @param min new lower bound (inclusive)
 * @param max new upper bound (exclusive)
 * @return the evicted entries: first the ones below {@code min}, second the ones at or
 *         above {@code max}, each in their original order
 */
public Tuple2<DoubleList, DoubleList> adjust(double min, double max) {
    final DoubleList below = new DoubleArrayList();
    final DoubleList above = new DoubleArrayList();

    this.min = min;
    this.max = max;

    for (final DoubleIterator cursor = entries.doubleIterator(); cursor.hasNext(); ) {
        final double candidate = cursor.nextDouble();

        final boolean tooLow = candidate < min;
        final boolean tooHigh = !tooLow && candidate >= max;

        if (!tooLow && !tooHigh) {
            // Still within bounds, stays in this node.
            continue;
        }

        (tooLow ? below : above).add(candidate);
        cursor.remove();
    }

    return Tuple.tuple(below, above);
}
}

}
Original file line number Diff line number Diff line change
Expand Up @@ -94,11 +94,11 @@ public ResultColumnStatistics describe() {

@NotNull
private List<StringColumnStatsCollector.ColumnDescription.Entry> createBins(int expectedBins) {
final BalancingHistogram histogram = BalancingHistogram.create(getStatistics().getMin(), getStatistics().getMax(), expectedBins, 0.8d);
final BalancingHistogram histogram = BalancingHistogram.create(getStatistics().getMin(), getStatistics().getMax(), expectedBins, 0.8d, false);

Arrays.stream(getStatistics().getValues()).forEach(histogram::add);

final List<BalancingHistogram.Node> balanced = histogram.balanced();
final List<BalancingHistogram.Node> balanced = histogram.nodes();


final List<StringColumnStatsCollector.ColumnDescription.Entry> entries = new ArrayList<>();
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,89 @@
package com.bakdata.conquery.models.query.statistics;

import java.util.List;
import java.util.Random;

import lombok.extern.slf4j.Slf4j;
import org.junit.jupiter.api.Test;

/**
 * Smoke tests for {@link BalancingHistogram}: each test fills the same seeded histogram
 * and logs one of its views. Nothing is asserted, the tests only fail on exceptions.
 */
@Slf4j
class BalancingHistogramTest {

    public static final int SEED = 0xD00F;
    private final int max = 250;

    @Test
    void balanced() {
        final List<BalancingHistogram.Node> result = populated().balanced();
        log.info("{}", result);
    }

    @Test
    void snapped() {
        final List<BalancingHistogram.Node> result = populated().snapped();
        log.info("{}", result);
    }

    @Test
    void plain() {
        final List<BalancingHistogram.Node> result = populated().nodes();
        log.info("{}", result);
    }

    /**
     * Builds the shared fixture: 1000 values spread over the whole range, 100 extra
     * values in the lowest fifth, 100 extra values starting at the middle, and the
     * maximum itself. The generator is seeded, so every call yields the same data.
     */
    private BalancingHistogram populated() {
        final BalancingHistogram histogram = BalancingHistogram.create(0, max, 15, 0.8d, false);
        final Random random = new Random(SEED);

        addUniform(histogram, random, 1000, 0, max);
        addUniform(histogram, random, 100, 0, max / 5d);
        addUniform(histogram, random, 100, max / 2d, max / 2d + max / 5d);

        histogram.add(max);

        return histogram;
    }

    /**
     * Adds {@code count} random values from {@code [origin, bound)} to the histogram.
     */
    private static void addUniform(BalancingHistogram histogram, Random random, int count, double origin, double bound) {
        for (int drawn = 0; drawn < count; drawn++) {
            histogram.add(random.nextDouble(origin, bound));
        }
    }
}

0 comments on commit eefe738

Please sign in to comment.