Skip to content

Commit

Permalink
Merge remote-tracking branch 'origin/feature/query-stats-followup' in…
Browse files Browse the repository at this point in the history
…to staging/query-statistics
  • Loading branch information
awildturtok committed Jan 2, 2024
2 parents ee019b7 + 8450262 commit d616ea1
Show file tree
Hide file tree
Showing 2 changed files with 261 additions and 307 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -4,24 +4,24 @@
import java.text.NumberFormat;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.Collections;
import java.util.Comparator;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
import java.util.concurrent.atomic.AtomicLong;
import java.util.stream.DoubleStream;

import c10n.C10N;
import com.bakdata.conquery.io.cps.CPSType;
import com.bakdata.conquery.models.i18n.I18n;
import com.bakdata.conquery.models.query.PrintSettings;
import com.bakdata.conquery.models.types.ResultType;
import com.dynatrace.dynahist.Histogram;
import com.dynatrace.dynahist.bin.Bin;
import com.dynatrace.dynahist.layout.CustomLayout;
import com.dynatrace.dynahist.layout.Layout;
import com.dynatrace.dynahist.layout.LogLinearLayout;
import lombok.Getter;
import lombok.ToString;
import org.apache.commons.math3.stat.descriptive.DescriptiveStatistics;
import org.jetbrains.annotations.NotNull;

Expand Down Expand Up @@ -89,99 +89,137 @@ public void consume(Number value) {
/**
 * Summarises the collected samples as a list of histogram bins plus a map of extra statistics.
 *
 * @return a description carrying the bins from {@code createBins} and the values from {@code getExtras};
 *         a description with no bins and no extras when no samples were collected
 */
public ResultColumnStatistics describe() {
    // If no real samples were collected, we short-circuit, as Statistics will throw an exception when empty.
    if (getStatistics().getN() == 0) {
        return new StringColumnStatsCollector.ColumnDescription(getName(), getLabel(), getDescription(), Collections.emptyList(), Collections.emptyMap());
    }

    // 15 is the bin count the merged histogram is aimed at.
    final List<StringColumnStatsCollector.ColumnDescription.Entry> bins = createBins(getStatistics().getN(), 15d);
    final Map<String, String> extras = getExtras();

    return new StringColumnStatsCollector.ColumnDescription(getName(), getLabel(), getDescription(), bins, extras);
}

@NotNull
private List<StringColumnStatsCollector.ColumnDescription.Entry> createBins() {
//TODO create logic for integral bins
//TODO if we have a long tail of small bins, consider merging them

Layout layout = LogLinearLayout.create(getStatistics().getStandardDeviation() / 2, 1 / 10d, getStatistics().getMin(), getStatistics().getMax());

private List<StringColumnStatsCollector.ColumnDescription.Entry> createBins(long total, double expectedBins) {
final Layout layout = getLayout();

final Histogram histogram = Histogram.createDynamic(layout);

Arrays.stream(getStatistics().getValues()).forEach(histogram::addValue);

final List<Count> bins = postProcessBins((double) total, expectedBins, histogram);


final List<StringColumnStatsCollector.ColumnDescription.Entry> bins = new ArrayList<>();
final List<StringColumnStatsCollector.ColumnDescription.Entry> entries = new ArrayList<>();

for (Bin bin : histogram.nonEmptyBinsAscending()) {
//TODO Do we need to handle under/overflow?

final String lower = printValue(bin.getLowerBound());
final String upper = printValue(bin.getUpperBound());
for (Count bin : bins) {
final String lower = printValue(bin.min());
final String upper = printValue(bin.max());

final String binLabel = String.format("%s - %s", lower, upper);


bins.add(new StringColumnStatsCollector.ColumnDescription.Entry(binLabel, bin.getBinCount()));
entries.add(new StringColumnStatsCollector.ColumnDescription.Entry(binLabel, bin.count()));
}
return bins;
return entries;
}

/**
 * Collects the summary statistics of the column as localized label to formatted value pairs.
 *
 * @return the statistics in a fixed display order: min, max, mean, p25, median, p75, std, sum, count, missing
 */
@NotNull
private Map<String, String> getExtras() {
    final StatisticsLabels labels = C10N.get(StatisticsLabels.class);

    // LinkedHashMap remembers insertion order
    final Map<String, String> out = new LinkedHashMap<>();

    out.put(labels.min(), printValue(getStatistics().getMin()));
    out.put(labels.max(), printValue(getStatistics().getMax()));
    out.put(labels.mean(), printValue(getStatistics().getMean()));

    out.put(labels.p25(), printValue(getStatistics().getPercentile(25)));
    out.put(labels.median(), printValue(getStatistics().getPercentile(50)));
    out.put(labels.p75(), printValue(getStatistics().getPercentile(75)));
    out.put(labels.std(), getPrintSettings().getDecimalFormat().format(getStatistics().getStandardDeviation()));

    out.put(labels.sum(), printValue(getStatistics().getSum()));

    out.put(labels.count(), getPrintSettings().getIntegerFormat().format(getStatistics().getN()));
    // Missing values are tracked in the nulls counter, not in the sample count.
    out.put(labels.missing(), getPrintSettings().getIntegerFormat().format(getNulls().get()));

    return out;
}

/**
 * Selects the histogram layout for this column.
 *
 * Integer columns are delegated to {@code getIntegerLayout}; every other type gets a
 * {@link LogLinearLayout} spanning the observed minimum to maximum.
 *
 * @return the layout to build the histogram with
 */
@NotNull
private Layout getLayout() {
    if (!(getType() instanceof ResultType.IntegerT)) {
        // NOTE(review): a standard deviation of 0 (e.g. a single sample) gives a width limit of 0 here
        // - confirm LogLinearLayout.create accepts that.
        final double absoluteWidthLimit = getStatistics().getStandardDeviation() / 2;
        final double relativeWidthLimit = 1 / 10d;

        return LogLinearLayout.create(absoluteWidthLimit, relativeWidthLimit, getStatistics().getMin(), getStatistics().getMax());
    }

    return getIntegerLayout();
}

/**
 * Merges undersized neighbouring histogram bins into larger ones.
 *
 * The non-empty bins are visited in descending order. A running bin keeps absorbing the next visited bin
 * for as long as its share of {@code total} is at most {@code 1 / expectedBins}. Once it is large enough
 * it is emitted and the bin just visited starts the next run.
 *
 * @param total        number of samples in the histogram, used to compute a bin's share
 * @param expectedBins number of bins aimed for; a bin holding at most {@code 1 / expectedBins} of all samples counts as too small
 * @param histogram    the histogram whose bins are merged
 * @return the merged bins in visiting (descending) order, never containing {@code null}; empty if the histogram has no non-empty bins
 */
@NotNull
private static List<Count> postProcessBins(double total, double expectedBins, Histogram histogram) {
    final List<Count> bins = new ArrayList<>();
    final double minimumShare = 1 / expectedBins;

    Count prior = null;

    for (Bin bin : histogram.nonEmptyBinsDescending()) {
        final Count current = Count.fromBin(bin);

        if (prior == null) {
            prior = current;
            continue;
        }

        // If the running bin is too small, it absorbs the current one.
        if ((double) prior.count() / total <= minimumShare) {
            prior = prior.merge(current);
            continue;
        }

        // The running bin is large enough: emit it and continue with the current bin.
        // (Resetting to null here would silently drop the current bin and its samples.)
        bins.add(prior);
        prior = current;
    }

    // Emit the last run; there is none when the histogram had no non-empty bins.
    if (prior != null) {
        bins.add(prior);
    }

    return bins;
}

/**
 * Renders a number with this collector's {@code formatter}.
 *
 * @param value the number to print; it is converted to {@code double} before formatting
 * @return the formatted text
 */
private String printValue(Number value) {
    final double asDouble = value.doubleValue();
    return formatter.format(asDouble);
}

@Getter
@CPSType(id = "DESCRIPTIVE", base = ResultColumnStatistics.class)
@ToString(callSuper = true)
public static class ColumnDescription extends ColumnStatsCollector.ResultColumnStatistics {

private final int count;
private final int nullValues;
private final double mean;
private final double median;
private final double stdDev;
private final Number min;
private final Number max;
private final Number sum;

private final Collection<? extends Number> samples;

public ColumnDescription(String name, String label, String description, String type, int count, int nullValues, double mean, double median, double stdDev, Number min, Number max, Number sum, Collection<? extends Number> samples) {
super(name, label, description, type);
this.count = count;
this.nullValues = nullValues;
this.mean = mean;
this.median = median;
this.stdDev = stdDev;
this.min = min;
this.max = max;
this.sum = sum;
this.samples = samples;
/**
 * Creates a layout for integer columns whose bin boundaries fall on whole numbers.
 *
 * Boundaries start at the rounded minimum and advance by a whole-numbered width, chosen so that
 * at most {@code expectedBins} boundaries are produced, up to the rounded maximum.
 *
 * @return a {@link CustomLayout} with at least one boundary (the rounded minimum)
 */
private Layout getIntegerLayout() {
    final int expectedBins = 15;

    // Kept as long: Math.round(double) returns long, narrowing to int could overflow for large values.
    final long min = Math.round(getStatistics().getMin());
    final long max = Math.round(getStatistics().getMax());

    // Number of distinct integers in [min, max]; computed in double so the subtraction cannot overflow.
    final double valueCount = (double) max - (double) min + 1;

    final double nBins = Math.min(valueCount, expectedBins);

    // valueCount >= nBins >= 1, so the width is always at least 1 and the iteration below terminates.
    final double width = Math.ceil(valueCount / nBins);

    return CustomLayout.create(DoubleStream.iterate(min, cur -> cur <= max, cur -> cur + width).toArray());
}

/**
 * A value range together with the number of samples that fell into it.
 *
 * @param min   lower bound of the range
 * @param max   upper bound of the range
 * @param count number of samples in the range
 */
private record Count(double min, double max, long count) {

    /**
     * Copies the bounds and the sample count of a histogram bin.
     */
    public static Count fromBin(Bin bin) {
        final double lower = bin.getLowerBound();
        final double upper = bin.getUpperBound();
        final long samples = bin.getBinCount();

        return new Count(lower, upper, samples);
    }

    /**
     * Combines this range with {@code other}: the result spans both ranges and holds the sum of both counts.
     */
    public Count merge(Count other) {
        final double lowest = Math.min(min, other.min);
        final double highest = Math.max(max, other.max);

        return new Count(lowest, highest, count + other.count);
    }

}

}
Loading

0 comments on commit d616ea1

Please sign in to comment.