From 06042ec385567083860acf992d3cecc89bd1c1f3 Mon Sep 17 00:00:00 2001 From: awildturtok <1553491+awildturtok@users.noreply.github.com> Date: Tue, 5 Mar 2024 15:28:28 +0100 Subject: [PATCH] Query stats followup (#3255) Followup for `statistics` endpoint. Major rework of Histogram logic. --- .../conquery/apiv1/QueryProcessor.java | 108 ++------- .../ResultHeadersC10n.java | 4 + .../io/result/arrow/ResultArrowProcessor.java | 43 ++-- .../io/result/csv/ResultCsvProcessor.java | 5 +- .../io/result/excel/ExcelRenderer.java | 7 +- .../io/result/excel/ResultExcelProcessor.java | 5 +- .../parquet/ResultParquetProcessor.java | 5 +- .../models/config/FrontendConfig.java | 30 ++- .../forms/managed/ManagedInternalForm.java | 5 +- .../conquery/models/query/ManagedQuery.java | 18 +- .../conquery/models/query/PrintSettings.java | 14 +- .../models/query/SingleTableResult.java | 6 +- .../query/preview/EntityPreviewExecution.java | 9 +- .../models/query/results/EntityResult.java | 8 +- .../query/results/MultilineEntityResult.java | 5 + .../query/results/SinglelineEntityResult.java | 5 + .../BooleanColumnStatsCollector.java | 50 +---- .../statistics/ColumnStatsCollector.java | 42 ++-- .../statistics/DateColumnStatsCollector.java | 60 ++--- .../models/query/statistics/Histogram.java | 211 ++++++++++++++++++ .../HistogramColumnDescription.java | 25 +++ .../statistics/ListColumnStatsCollector.java | 13 +- .../NumberColumnStatsCollector.java | 191 +++++++++++----- .../query/statistics/ResultStatistics.java | 125 +++++++++++ .../query/statistics/StatisticsLabels.java | 56 +++++ .../StringColumnStatsCollector.java | 60 +++-- .../resources/api/ResultArrowResource.java | 13 +- .../resources/api/ResultCsvResource.java | 7 +- .../resources/api/ResultExcelResource.java | 6 +- .../resources/api/ResultParquetResource.java | 6 +- .../sql/conquery/SqlManagedQuery.java | 15 +- .../sql/execution/SqlEntityResult.java | 9 +- .../json/AbstractQueryEngineTest.java | 5 +- 
.../conquery/integration/json/FormTest.java | 5 +- .../tests/QueryStatisticsTest.java | 180 +++++++-------- .../conquery/io/result/ResultTestUtil.java | 3 +- .../arrow/ArrowResultGenerationTest.java | 5 +- .../result/csv/CsvResultGenerationTest.java | 3 +- .../result/excel/ExcelResultRenderTest.java | 6 +- .../parquet/ParquetResultGenerationTest.java | 3 +- .../query/statistics/HistogramTest.java | 150 +++++++++++++ .../SIMPLE_TREECONCEPT_Query.json | 14 +- .../query/QUERY_STATISTICS_TESTS/content.csv | 16 +- 43 files changed, 1096 insertions(+), 460 deletions(-) create mode 100644 backend/src/main/java/com/bakdata/conquery/models/query/statistics/Histogram.java create mode 100644 backend/src/main/java/com/bakdata/conquery/models/query/statistics/HistogramColumnDescription.java create mode 100644 backend/src/main/java/com/bakdata/conquery/models/query/statistics/StatisticsLabels.java create mode 100644 backend/src/test/java/com/bakdata/conquery/models/query/statistics/HistogramTest.java diff --git a/backend/src/main/java/com/bakdata/conquery/apiv1/QueryProcessor.java b/backend/src/main/java/com/bakdata/conquery/apiv1/QueryProcessor.java index 5aa1182101..e296826a31 100644 --- a/backend/src/main/java/com/bakdata/conquery/apiv1/QueryProcessor.java +++ b/backend/src/main/java/com/bakdata/conquery/apiv1/QueryProcessor.java @@ -2,24 +2,22 @@ import java.net.MalformedURLException; import java.net.URISyntaxException; +import java.text.NumberFormat; import java.time.LocalDate; import java.util.ArrayList; import java.util.Collection; import java.util.HashMap; import java.util.HashSet; import java.util.List; +import java.util.Locale; import java.util.Map; import java.util.Objects; import java.util.Optional; -import java.util.Random; +import java.util.OptionalLong; import java.util.Set; import java.util.concurrent.TimeUnit; -import java.util.concurrent.atomic.AtomicInteger; -import java.util.concurrent.atomic.AtomicReference; -import java.util.function.BooleanSupplier; import 
java.util.function.Consumer; import java.util.function.Predicate; -import java.util.random.RandomGenerator; import java.util.stream.Collectors; import java.util.stream.IntStream; import java.util.stream.Stream; @@ -57,9 +55,7 @@ import com.bakdata.conquery.models.auth.entities.User; import com.bakdata.conquery.models.auth.permissions.Ability; import com.bakdata.conquery.models.auth.permissions.ConqueryPermission; -import com.bakdata.conquery.models.common.CDateSet; import com.bakdata.conquery.models.common.Range; -import com.bakdata.conquery.models.common.daterange.CDateRange; import com.bakdata.conquery.models.config.ColumnConfig; import com.bakdata.conquery.models.config.ConqueryConfig; import com.bakdata.conquery.models.datasets.Dataset; @@ -83,11 +79,8 @@ import com.bakdata.conquery.models.query.queryplan.DateAggregationAction; import com.bakdata.conquery.models.query.resultinfo.ResultInfo; import com.bakdata.conquery.models.query.resultinfo.UniqueNamer; -import com.bakdata.conquery.models.query.results.EntityResult; -import com.bakdata.conquery.models.query.statistics.ColumnStatsCollector; import com.bakdata.conquery.models.query.statistics.ResultStatistics; import com.bakdata.conquery.models.query.visitor.QueryVisitor; -import com.bakdata.conquery.models.types.ResultType; import com.bakdata.conquery.models.types.SemanticType; import com.bakdata.conquery.models.worker.DatasetRegistry; import com.bakdata.conquery.models.worker.Namespace; @@ -96,8 +89,6 @@ import com.bakdata.conquery.util.io.IdColumnUtil; import com.google.common.collect.ClassToInstanceMap; import com.google.common.collect.MutableClassToInstanceMap; -import it.unimi.dsi.fastutil.ints.IntOpenHashSet; -import it.unimi.dsi.fastutil.ints.IntSet; import lombok.AllArgsConstructor; import lombok.NoArgsConstructor; import lombok.extern.slf4j.Slf4j; @@ -114,29 +105,7 @@ public class QueryProcessor { @Inject private ConqueryConfig config; - private static CDateSet extractValidityDate(ResultType dateType, 
Object dateValue) { - if (dateType instanceof ResultType.DateRangeT) { - return CDateSet.create(CDateRange.fromList((List) dateValue)); - } - - if (dateType instanceof ResultType.DateT) { - return CDateSet.create(CDateRange.exactly((Integer) dateValue)); - } - - if (dateType instanceof ResultType.ListT listT) { - final CDateSet out = CDateSet.createEmpty(); - - for (Object date : ((List) dateValue)) { - out.addAll(extractValidityDate(listT.getElementType(), date)); - } - - // since they are ordered, we can be sure this is always the correct span - return out; - } - - throw new IllegalStateException("Unexpected date Type %s".formatted(dateType)); - } public Stream getAllQueries(Dataset dataset, HttpServletRequest req, Subject subject, boolean allProviders) { final Collection allQueries = storage.getAllExecutions(); @@ -567,7 +536,7 @@ public Stream> resolveEntities(Subject subject, List Id-Value } - return result.streamResults() + return result.streamResults(OptionalLong.empty()) .map(printer::createId) .map(entityPrintId -> { final Map out = new HashMap<>(); @@ -590,72 +559,25 @@ public ResultStatistics getResultStatistics(ManagedQuery managedQuery) { final Query query = managedQuery.getQuery(); final List resultInfos = query.getResultInfos(); - final RandomGenerator random = new Random(); - final int requiredSamples = config.getFrontend().getVisualisationSamples(); - + final Optional + dateInfo = + query.getResultInfos().stream().filter(info -> info.getSemantics().contains(new SemanticType.EventDateT())).findFirst(); - final int totalSamples = managedQuery.getLastResultCount().intValue(); + final int dateIndex = dateInfo.map(resultInfos::indexOf).orElse(0 /*Discarded if dateInfo is not present*/); - //We collect about $requiredSamples values as samples for visualisation, while streaming the values. - // Note that nextInt produces values > 0 and < totalSamples. This is equivalent to `P(k) = $requiredSamples/$totalSamples` but terser. 
- final BooleanSupplier samplePicker; + final Locale locale = I18n.LOCALE.get(); + final NumberFormat decimalFormat = NumberFormat.getNumberInstance(locale); + decimalFormat.setMaximumFractionDigits(2); - if (totalSamples <= requiredSamples) { - samplePicker = () -> true; - } - else { - samplePicker = () -> random.nextInt(totalSamples) < requiredSamples; - } + final NumberFormat integerFormat = NumberFormat.getNumberInstance(locale); - final boolean hasValidityDates = resultInfos.get(0).getSemantics().contains(new SemanticType.EventDateT()); - final ResultType dateType = resultInfos.get(0).getType(); - final PrintSettings printSettings = new PrintSettings(false, I18n.LOCALE.get(), managedQuery.getNamespace(), config, null); + final PrintSettings printSettings = + new PrintSettings(true, locale, managedQuery.getNamespace(), config, null, null, decimalFormat, integerFormat); final UniqueNamer uniqueNamer = new UniqueNamer(printSettings); - final List statsCollectors = resultInfos.stream() - .map(info -> ColumnStatsCollector.getStatsCollector(info, printSettings, samplePicker, info.getType(), uniqueNamer)) - .collect(Collectors.toList()); - - final IntSet entities = new IntOpenHashSet(); - final AtomicInteger lines = new AtomicInteger(); - - final AtomicReference span = new AtomicReference<>(null); - - - managedQuery.streamResults() - .peek(result -> entities.add(result.getEntityId())) - .map(EntityResult::listResultLines) - .flatMap(List::stream) - .forEach(line -> { - - if (hasValidityDates) { - final CDateSet dateSet = extractValidityDate(dateType, line[0]); - span.getAndAccumulate(dateSet.span(), (old, incoming) -> incoming.spanClosed(old)); - } - - lines.incrementAndGet(); - - for (int col = 0; col < line.length; col++) { - final ColumnStatsCollector collector = statsCollectors.get(col); - if (collector == null) { - continue; - } - - collector.consume(line[col]); - } - }); - - return new ResultStatistics( - entities.size(), - lines.get(), - 
statsCollectors.stream() - .filter(Objects::nonNull) // Not all columns produces stats - .map(ColumnStatsCollector::describe) - .toList(), - span.get().toSimpleRange() - ); + return ResultStatistics.collectResultStatistics(managedQuery, resultInfos, dateInfo, dateIndex, printSettings, uniqueNamer, config); } } diff --git a/backend/src/main/java/com/bakdata/conquery/internationalization/ResultHeadersC10n.java b/backend/src/main/java/com/bakdata/conquery/internationalization/ResultHeadersC10n.java index b2e6709f60..1385fa764b 100644 --- a/backend/src/main/java/com/bakdata/conquery/internationalization/ResultHeadersC10n.java +++ b/backend/src/main/java/com/bakdata/conquery/internationalization/ResultHeadersC10n.java @@ -41,5 +41,9 @@ public interface ResultHeadersC10n { @De("Nachbeobachtungszeitraum") String outcomeDateRange(); + @En("{0} others") + @De("{0} andere") + String others(long count); + } diff --git a/backend/src/main/java/com/bakdata/conquery/io/result/arrow/ResultArrowProcessor.java b/backend/src/main/java/com/bakdata/conquery/io/result/arrow/ResultArrowProcessor.java index 6e9c535fc9..944333e4b9 100644 --- a/backend/src/main/java/com/bakdata/conquery/io/result/arrow/ResultArrowProcessor.java +++ b/backend/src/main/java/com/bakdata/conquery/io/result/arrow/ResultArrowProcessor.java @@ -9,6 +9,7 @@ import java.nio.channels.Channels; import java.util.List; import java.util.Locale; +import java.util.OptionalLong; import java.util.function.Function; import javax.inject.Inject; @@ -53,7 +54,7 @@ public class ResultArrowProcessor { private final ArrowConfig arrowConfig; - public Response createResultFile(Subject subject, ManagedExecution exec, boolean pretty) { + public Response createResultFile(Subject subject, ManagedExecution exec, boolean pretty, OptionalLong limit) { return getArrowResult( (output) -> (root) -> new ArrowFileWriter(root, new DictionaryProvider.MapDictionaryProvider(), Channels.newChannel(output)), subject, @@ -63,21 +64,8 @@ public Response 
createResultFile(Subject subject, ManagedExecution exec, boolean FILE_EXTENTION_ARROW_FILE, FILE_MEDIA_TYPE, conqueryConfig, - arrowConfig - ); - } - - public Response createResultStream(Subject subject, ManagedExecution exec, boolean pretty) { - return getArrowResult( - (output) -> (root) -> new ArrowStreamWriter(root, new DictionaryProvider.MapDictionaryProvider(), output), - subject, - ((ManagedExecution & SingleTableResult) exec), - datasetRegistry, - pretty, - FILE_EXTENTION_ARROW_STREAM, - STREAM_MEDIA_TYPE, - conqueryConfig, - arrowConfig + arrowConfig, + limit ); } @@ -90,7 +78,9 @@ public static Response getArrow String fileExtension, MediaType mediaType, ConqueryConfig config, - ArrowConfig arrowConfig) { + ArrowConfig arrowConfig, + OptionalLong limit + ) { ConqueryMDC.setLocation(subject.getName()); @@ -102,10 +92,10 @@ public static Response getArrow // Get the locale extracted by the LocaleFilter - final Namespace namespace = datasetRegistry.get(dataset.getId()); IdPrinter idPrinter = IdColumnUtil.getIdPrinter(subject, exec, namespace, config.getIdColumns().getIds()); final Locale locale = I18n.LOCALE.get(); + PrintSettings settings = new PrintSettings( pretty, locale, @@ -127,7 +117,7 @@ public static Response getArrow arrowConfig, resultInfosId, resultInfosExec, - exec.streamResults() + exec.streamResults(limit) ); } finally { @@ -138,5 +128,20 @@ public static Response getArrow return makeResponseWithFileName(Response.ok(out), String.join(".", exec.getLabelWithoutAutoLabelSuffix(), fileExtension), mediaType, ResultUtil.ContentDispositionOption.ATTACHMENT); } + public Response createResultStream(Subject subject, ManagedExecution exec, boolean pretty, OptionalLong limit) { + return getArrowResult( + (output) -> (root) -> new ArrowStreamWriter(root, new DictionaryProvider.MapDictionaryProvider(), output), + subject, + ((ManagedExecution & SingleTableResult) exec), + datasetRegistry, + pretty, + FILE_EXTENTION_ARROW_STREAM, + STREAM_MEDIA_TYPE, + 
conqueryConfig, + arrowConfig, + limit + ); + } + } diff --git a/backend/src/main/java/com/bakdata/conquery/io/result/csv/ResultCsvProcessor.java b/backend/src/main/java/com/bakdata/conquery/io/result/csv/ResultCsvProcessor.java index 543cf3a195..8a7f4fc024 100644 --- a/backend/src/main/java/com/bakdata/conquery/io/result/csv/ResultCsvProcessor.java +++ b/backend/src/main/java/com/bakdata/conquery/io/result/csv/ResultCsvProcessor.java @@ -7,6 +7,7 @@ import java.io.OutputStreamWriter; import java.nio.charset.Charset; import java.util.Locale; +import java.util.OptionalLong; import javax.inject.Inject; import javax.ws.rs.WebApplicationException; @@ -39,7 +40,7 @@ public class ResultCsvProcessor { private final ConqueryConfig config; private final DatasetRegistry datasetRegistry; - public Response createResult(Subject subject, E exec, boolean pretty, Charset charset) { + public Response createResult(Subject subject, E exec, boolean pretty, Charset charset, OptionalLong limit) { final Dataset dataset = exec.getDataset(); @@ -62,7 +63,7 @@ public Response createResult(Su final StreamingOutput out = os -> { try (BufferedWriter writer = new BufferedWriter(new OutputStreamWriter(os, charset))) { final CsvRenderer renderer = new CsvRenderer(config.getCsv().createWriter(writer), settings); - renderer.toCSV(config.getIdColumns().getIdResultInfos(), exec.getResultInfos(), exec.streamResults()); + renderer.toCSV(config.getIdColumns().getIdResultInfos(), exec.getResultInfos(), exec.streamResults(limit)); } catch (EofException e) { log.trace("User canceled download"); diff --git a/backend/src/main/java/com/bakdata/conquery/io/result/excel/ExcelRenderer.java b/backend/src/main/java/com/bakdata/conquery/io/result/excel/ExcelRenderer.java index 2708a04796..b38c9d4735 100644 --- a/backend/src/main/java/com/bakdata/conquery/io/result/excel/ExcelRenderer.java +++ b/backend/src/main/java/com/bakdata/conquery/io/result/excel/ExcelRenderer.java @@ -5,6 +5,7 @@ import java.math.BigDecimal; 
import java.util.List; import java.util.Map; +import java.util.OptionalLong; import java.util.concurrent.atomic.AtomicInteger; import java.util.stream.Stream; @@ -40,6 +41,8 @@ public class ExcelRenderer { + public static final int MAX_LINES = 1_048_576; + private static final Map, TypeWriter> TYPE_WRITER_MAP = Map.of( ResultType.DateT.class, ExcelRenderer::writeDateCell, ResultType.IntegerT.class, ExcelRenderer::writeIntegerCell, @@ -71,7 +74,7 @@ private interface TypeWriter { public void renderToStream( List idHeaders, E exec, - OutputStream outputStream) throws IOException { + OutputStream outputStream, OptionalLong limit) throws IOException { final List resultInfosExec = exec.getResultInfos(); setMetaData(exec); @@ -85,7 +88,7 @@ public void renderToStream( writeHeader(sheet, idHeaders, resultInfosExec, table); - int writtenLines = writeBody(sheet, resultInfosExec, exec.streamResults()); + int writtenLines = writeBody(sheet, resultInfosExec, exec.streamResults(OptionalLong.of(limit.orElse(MAX_LINES)))); postProcessTable(sheet, table, writtenLines, idHeaders.size()); diff --git a/backend/src/main/java/com/bakdata/conquery/io/result/excel/ResultExcelProcessor.java b/backend/src/main/java/com/bakdata/conquery/io/result/excel/ResultExcelProcessor.java index 12c586b644..29f6baaa00 100644 --- a/backend/src/main/java/com/bakdata/conquery/io/result/excel/ResultExcelProcessor.java +++ b/backend/src/main/java/com/bakdata/conquery/io/result/excel/ResultExcelProcessor.java @@ -3,6 +3,7 @@ import static com.bakdata.conquery.io.result.ResultUtil.makeResponseWithFileName; import java.util.Locale; +import java.util.OptionalLong; import javax.inject.Inject; import javax.ws.rs.core.MediaType; @@ -38,7 +39,7 @@ public class ResultExcelProcessor { private final ExcelConfig excelConfig; - public Response createResult(Subject subject, E exec, boolean pretty) { + public Response createResult(Subject subject, E exec, boolean pretty, OptionalLong limit) { 
ConqueryMDC.setLocation(subject.getName()); @@ -57,7 +58,7 @@ public Response createResult(Su final ExcelRenderer excelRenderer = new ExcelRenderer(excelConfig, settings); final StreamingOutput out = output -> { - excelRenderer.renderToStream(conqueryConfig.getIdColumns().getIdResultInfos(), exec, output); + excelRenderer.renderToStream(conqueryConfig.getIdColumns().getIdResultInfos(), exec, output, limit); log.trace("FINISHED downloading {}", exec.getId()); }; diff --git a/backend/src/main/java/com/bakdata/conquery/io/result/parquet/ResultParquetProcessor.java b/backend/src/main/java/com/bakdata/conquery/io/result/parquet/ResultParquetProcessor.java index 6676d0b14e..389746e35a 100644 --- a/backend/src/main/java/com/bakdata/conquery/io/result/parquet/ResultParquetProcessor.java +++ b/backend/src/main/java/com/bakdata/conquery/io/result/parquet/ResultParquetProcessor.java @@ -3,6 +3,7 @@ import static com.bakdata.conquery.io.result.ResultUtil.makeResponseWithFileName; import java.util.Locale; +import java.util.OptionalLong; import javax.inject.Inject; import javax.ws.rs.core.MediaType; @@ -35,7 +36,7 @@ public class ResultParquetProcessor { private final DatasetRegistry datasetRegistry; private final ConqueryConfig config; - public Response createResultFile(Subject subject, ManagedExecution exec, boolean pretty) { + public Response createResultFile(Subject subject, ManagedExecution exec, boolean pretty, OptionalLong limit) { ConqueryMDC.setLocation(subject.getName()); @@ -68,7 +69,7 @@ public Response createResultFile(Subject subject, ManagedExecution exec, boolean config.getIdColumns().getIdResultInfos(), singleTableResult.getResultInfos(), settings, - singleTableResult.streamResults() + singleTableResult.streamResults(limit) ); }; diff --git a/backend/src/main/java/com/bakdata/conquery/models/config/FrontendConfig.java b/backend/src/main/java/com/bakdata/conquery/models/config/FrontendConfig.java index 531f08d82a..96065200d0 100644 --- 
a/backend/src/main/java/com/bakdata/conquery/models/config/FrontendConfig.java +++ b/backend/src/main/java/com/bakdata/conquery/models/config/FrontendConfig.java @@ -11,6 +11,9 @@ import com.bakdata.conquery.models.forms.frontendconfiguration.FormScanner; import com.fasterxml.jackson.annotation.JsonAlias; +import com.fasterxml.jackson.annotation.JsonIgnore; +import com.google.common.collect.Range; +import io.dropwizard.validation.ValidationMethod; import lombok.AllArgsConstructor; import lombok.Data; import lombok.NoArgsConstructor; @@ -34,13 +37,13 @@ public class FrontendConfig { private int observationPeriodYears = 6; /** - * Query preview uses real samples for display, this parameter controls how many we try to sample. - * - * Note, that we use stochastic sampling, to avoid keeping the entire rendered query in memory, so this number is not exact. + * Limit to number of histogram entries. + * Note, that zero and out of bounds values are tracked in separate bins, so you can have three additional bins. */ @Min(0) - private int visualisationSamples = 1000; + private int visualisationsHistogramLimit = 10; + private Range visualisationPercentiles = Range.closed(15, 85); /** * The url that points a manual. This is also used by the {@link FormScanner} * as the base url for forms that specify a relative url. Internally {@link URI#resolve(URI)} @@ -63,16 +66,31 @@ public class FrontendConfig { */ @Nullable private URL manualUrl; - @Nullable @Email private String contactEmail; - /** * If true, users are always allowed to add custom values into SelectFilter input fields. 
*/ private boolean alwaysAllowCreateValue = false; + @ValidationMethod(message = "Percentiles must be concrete and within 0 - 100") + @JsonIgnore + public boolean isValidPercentiles() { + if(!visualisationPercentiles.hasLowerBound() || !visualisationPercentiles.hasUpperBound()){ + return false; + } + + if (visualisationPercentiles.lowerEndpoint() < 0) { + return false; + } + + if (visualisationPercentiles.upperEndpoint() > 100) { + return false; + } + + return true; + } @Data public static class CurrencyConfig { diff --git a/backend/src/main/java/com/bakdata/conquery/models/forms/managed/ManagedInternalForm.java b/backend/src/main/java/com/bakdata/conquery/models/forms/managed/ManagedInternalForm.java index a27cb4c09a..405c8e68de 100644 --- a/backend/src/main/java/com/bakdata/conquery/models/forms/managed/ManagedInternalForm.java +++ b/backend/src/main/java/com/bakdata/conquery/models/forms/managed/ManagedInternalForm.java @@ -2,6 +2,7 @@ import java.util.List; import java.util.Map; +import java.util.OptionalLong; import java.util.stream.Collectors; import java.util.stream.Stream; @@ -143,12 +144,12 @@ public List getResultInfos() { } @Override - public Stream streamResults() { + public Stream streamResults(OptionalLong limit) { if (subQueries.size() != 1) { // Get the query, only if there is only one query set in the whole execution throw new UnsupportedOperationException("Cannot return the result query of a multi query form"); } - return subQueries.values().iterator().next().streamResults(); + return subQueries.values().iterator().next().streamResults(limit); } @Override diff --git a/backend/src/main/java/com/bakdata/conquery/models/query/ManagedQuery.java b/backend/src/main/java/com/bakdata/conquery/models/query/ManagedQuery.java index 2302d72d72..6a1638f9a8 100644 --- a/backend/src/main/java/com/bakdata/conquery/models/query/ManagedQuery.java +++ b/backend/src/main/java/com/bakdata/conquery/models/query/ManagedQuery.java @@ -2,7 +2,9 @@ import 
java.util.Collections; import java.util.List; +import java.util.OptionalLong; import java.util.Set; +import java.util.concurrent.atomic.AtomicLong; import java.util.function.Consumer; import java.util.stream.Collectors; import java.util.stream.Stream; @@ -98,13 +100,23 @@ public void addResult(ShardResult result) { @Override protected void finish(ExecutionState executionState) { - lastResultCount = query.countResults(streamResults()); + lastResultCount = query.countResults(streamResults(OptionalLong.empty())); super.finish(executionState); } - public Stream streamResults() { - return getNamespace().getExecutionManager().streamQueryResults(this); + + public Stream streamResults(OptionalLong maybeLimit) { + final Stream results = getNamespace().getExecutionManager().streamQueryResults(this); + + if(maybeLimit.isEmpty()){ + return results; + } + + final long limit = maybeLimit.getAsLong(); + final AtomicLong consumed = new AtomicLong(); + + return results.takeWhile(line -> consumed.addAndGet(line.length()) < limit); } @Override diff --git a/backend/src/main/java/com/bakdata/conquery/models/query/PrintSettings.java b/backend/src/main/java/com/bakdata/conquery/models/query/PrintSettings.java index 132d3b435e..6698caa81f 100644 --- a/backend/src/main/java/com/bakdata/conquery/models/query/PrintSettings.java +++ b/backend/src/main/java/com/bakdata/conquery/models/query/PrintSettings.java @@ -1,5 +1,6 @@ package com.bakdata.conquery.models.query; +import java.text.DecimalFormat; import java.text.NumberFormat; import java.time.format.DateTimeFormatter; import java.util.Currency; @@ -41,6 +42,7 @@ public class PrintSettings { private final DateTimeFormatter dateFormatter; private final NumberFormat decimalFormat; private final NumberFormat integerFormat; + private final NumberFormat currencyFormat; private final Currency currency; /** @@ -57,6 +59,10 @@ public class PrintSettings { private final PrintIdMapper idMapper; public PrintSettings(boolean prettyPrint, Locale locale, 
Namespace namespace, ConqueryConfig config, PrintIdMapper idMapper, Function columnNamer) { + this(prettyPrint, locale, namespace, config, idMapper, columnNamer, DECIMAL_FORMAT.apply(locale), NUMBER_FORMAT.apply(locale)); + } + + public PrintSettings(boolean prettyPrint, Locale locale, Namespace namespace, ConqueryConfig config, PrintIdMapper idMapper, Function columnNamer, NumberFormat decimalFormat, NumberFormat numberFormat) { this.prettyPrint = prettyPrint; this.locale = locale; this.namespace = namespace; @@ -64,8 +70,8 @@ public PrintSettings(boolean prettyPrint, Locale locale, Namespace namespace, Co this.columnNamer = columnNamer; this.idMapper = idMapper; - this.integerFormat = NUMBER_FORMAT.apply(locale); - this.decimalFormat = DECIMAL_FORMAT.apply(locale); + this.integerFormat = numberFormat; + this.decimalFormat = decimalFormat; this.listFormat = prettyPrint ? config.getLocale().getListFormats().get(0) : UNPRETTY_LIST_FORMAT; this.dateRangeSeparator = prettyPrint ? config.getLocale().findDateRangeSeparator(locale) : UNPRETTY_DATERANGE_SEPERATOR; @@ -73,6 +79,10 @@ public PrintSettings(boolean prettyPrint, Locale locale, Namespace namespace, Co this.dateFormat = config.getLocale().findDateFormat(locale); this.dateFormatter = prettyPrint ? 
DateTimeFormatter.ofPattern(dateFormat) : UNPRETTY_DATEFORMATTER; + this.currencyFormat = DecimalFormat.getCurrencyInstance(locale); + currencyFormat.setCurrency(currency); + currencyFormat.setMaximumFractionDigits(currency.getDefaultFractionDigits()); + } public PrintSettings(boolean prettyPrint, Locale locale, Namespace namespace, ConqueryConfig config, PrintIdMapper idMapper) { diff --git a/backend/src/main/java/com/bakdata/conquery/models/query/SingleTableResult.java b/backend/src/main/java/com/bakdata/conquery/models/query/SingleTableResult.java index 9802198819..64fd65be1d 100644 --- a/backend/src/main/java/com/bakdata/conquery/models/query/SingleTableResult.java +++ b/backend/src/main/java/com/bakdata/conquery/models/query/SingleTableResult.java @@ -3,6 +3,7 @@ import java.util.ArrayList; import java.util.List; import java.util.Locale; +import java.util.OptionalLong; import java.util.stream.Stream; import com.bakdata.conquery.models.config.ConqueryConfig; @@ -44,7 +45,10 @@ default List generateColumnDescriptions(boolean isInitialized, @JsonIgnore List getResultInfos(); - Stream streamResults(); + /** + * @param limit Optionally limits how many lines are emitted. 
+ */ + Stream streamResults(OptionalLong limit); @JsonIgnore long resultRowCount(); diff --git a/backend/src/main/java/com/bakdata/conquery/models/query/preview/EntityPreviewExecution.java b/backend/src/main/java/com/bakdata/conquery/models/query/preview/EntityPreviewExecution.java index 00ecb07465..40eb592659 100644 --- a/backend/src/main/java/com/bakdata/conquery/models/query/preview/EntityPreviewExecution.java +++ b/backend/src/main/java/com/bakdata/conquery/models/query/preview/EntityPreviewExecution.java @@ -8,6 +8,7 @@ import java.util.HashMap; import java.util.List; import java.util.Map; +import java.util.OptionalLong; import java.util.function.Function; import java.util.stream.Collectors; import java.util.stream.Stream; @@ -293,7 +294,7 @@ private List transformQueryResultToInfos(ManagedQuery // Submitted Query is a single line of an AbsoluteFormQuery => MultilineEntityResult with a single line. - final MultilineEntityResult result = (MultilineEntityResult) infoCardExecution.streamResults().collect(MoreCollectors.onlyElement()); + final MultilineEntityResult result = (MultilineEntityResult) infoCardExecution.streamResults(OptionalLong.empty()).collect(MoreCollectors.onlyElement()); final Object[] values = result.getValues().get(0); final List extraInfos = new ArrayList<>(values.length); @@ -329,7 +330,7 @@ private List toChronoInfos(PreviewConfi for (PreviewConfig.TimeStratifiedSelects description : previewConfig.getTimeStratifiedSelects()) { final ManagedQuery query = subQueries.get(description.label()); - final EntityResult entityResult = query.streamResults().collect(MoreCollectors.onlyElement()); + final EntityResult entityResult = query.streamResults(OptionalLong.empty()).collect(MoreCollectors.onlyElement()); final Map select2desc = description.selects().stream() @@ -447,8 +448,8 @@ public List getResultInfos() { } @Override - public Stream streamResults() { - return getValuesQuery().streamResults(); + public Stream streamResults(OptionalLong limit) { 
+ return getValuesQuery().streamResults(limit); } @Override diff --git a/backend/src/main/java/com/bakdata/conquery/models/query/results/EntityResult.java b/backend/src/main/java/com/bakdata/conquery/models/query/results/EntityResult.java index bf3e364a69..516578c520 100644 --- a/backend/src/main/java/com/bakdata/conquery/models/query/results/EntityResult.java +++ b/backend/src/main/java/com/bakdata/conquery/models/query/results/EntityResult.java @@ -1,11 +1,11 @@ package com.bakdata.conquery.models.query.results; -import com.bakdata.conquery.io.cps.CPSBase; -import com.fasterxml.jackson.annotation.JsonTypeInfo; - import java.util.List; import java.util.function.UnaryOperator; import java.util.stream.Stream; + +import com.bakdata.conquery.io.cps.CPSBase; +import com.fasterxml.jackson.annotation.JsonTypeInfo; @JsonTypeInfo(use=JsonTypeInfo.Id.CUSTOM, property="type") @CPSBase public interface EntityResult { @@ -21,6 +21,8 @@ public interface EntityResult { * Returns a list of the computed result line for this entity on the query. */ List listResultLines(); + + int length(); /** * Allows to modify the underlying result directly. The return value of the line modifier is the new line. 
diff --git a/backend/src/main/java/com/bakdata/conquery/models/query/results/MultilineEntityResult.java b/backend/src/main/java/com/bakdata/conquery/models/query/results/MultilineEntityResult.java index ecacf8481d..69fd03d190 100644 --- a/backend/src/main/java/com/bakdata/conquery/models/query/results/MultilineEntityResult.java +++ b/backend/src/main/java/com/bakdata/conquery/models/query/results/MultilineEntityResult.java @@ -49,4 +49,9 @@ public void modifyResultLinesInplace(UnaryOperator lineModifier) { public List listResultLines() { return values; } + + @Override + public int length() { + return values.size(); + } } diff --git a/backend/src/main/java/com/bakdata/conquery/models/query/results/SinglelineEntityResult.java b/backend/src/main/java/com/bakdata/conquery/models/query/results/SinglelineEntityResult.java index fec7137ad5..a7766bc84c 100644 --- a/backend/src/main/java/com/bakdata/conquery/models/query/results/SinglelineEntityResult.java +++ b/backend/src/main/java/com/bakdata/conquery/models/query/results/SinglelineEntityResult.java @@ -39,4 +39,9 @@ public void modifyResultLinesInplace(UnaryOperator lineModifier) { public List listResultLines() { return Collections.singletonList(values); } + + @Override + public int length() { + return 1; + } } diff --git a/backend/src/main/java/com/bakdata/conquery/models/query/statistics/BooleanColumnStatsCollector.java b/backend/src/main/java/com/bakdata/conquery/models/query/statistics/BooleanColumnStatsCollector.java index 619371e0f9..a131d6d026 100644 --- a/backend/src/main/java/com/bakdata/conquery/models/query/statistics/BooleanColumnStatsCollector.java +++ b/backend/src/main/java/com/bakdata/conquery/models/query/statistics/BooleanColumnStatsCollector.java @@ -1,60 +1,28 @@ package com.bakdata.conquery.models.query.statistics; -import java.util.concurrent.atomic.AtomicLong; - -import com.bakdata.conquery.io.cps.CPSType; import com.bakdata.conquery.models.query.PrintSettings; import 
com.bakdata.conquery.models.types.ResultType; import lombok.Getter; -import lombok.ToString; @Getter -public class BooleanColumnStatsCollector extends ColumnStatsCollector { +public class BooleanColumnStatsCollector extends ColumnStatsCollector { - private final AtomicLong trues = new AtomicLong(); - private final AtomicLong falses = new AtomicLong(); - private final AtomicLong nulls = new AtomicLong(0); + private final StringColumnStatsCollector delegate; - public BooleanColumnStatsCollector(String name, String label, String description, ResultType type, PrintSettings printSettings) { - super(name, label, description, type, printSettings); + public BooleanColumnStatsCollector(String name, String label, String description, PrintSettings printSettings) { + super(name, label, description, printSettings); + delegate = new StringColumnStatsCollector(name, label, description, ResultType.StringT.INSTANCE, printSettings, Integer.MAX_VALUE); } @Override - public void consume(Boolean value) { - if (value == null) { - nulls.incrementAndGet(); - return; - } - - if (value){ - trues.incrementAndGet(); - } - else { - falses.incrementAndGet(); - } + public void consume(Object value) { + final String printed = value == null ? 
null : ResultType.BooleanT.INSTANCE.printNullable(getPrintSettings(), value); + delegate.consume(printed); } @Override public ResultColumnStatistics describe() { - - return new ColumnDescription(getName(), getLabel(), getDescription(), getType().toString(), trues.get(), falses.get(), nulls.get(), nulls.get() + trues.get() + falses.get()); + return delegate.describe(); } - @Getter - @CPSType(id = "BOOLEAN", base = ResultColumnStatistics.class) - @ToString(callSuper = true) - public static class ColumnDescription extends ResultColumnStatistics { - private final long trues; - private final long falses; - private final long nulls; - private final long total; - - public ColumnDescription(String name, String label, String description, String type, long trues, long falses, long nulls, long total) { - super(name, label, description, type); - this.trues = trues; - this.falses = falses; - this.nulls = nulls; - this.total = total; - } - } } diff --git a/backend/src/main/java/com/bakdata/conquery/models/query/statistics/ColumnStatsCollector.java b/backend/src/main/java/com/bakdata/conquery/models/query/statistics/ColumnStatsCollector.java index 28599443c1..a9f4e30316 100644 --- a/backend/src/main/java/com/bakdata/conquery/models/query/statistics/ColumnStatsCollector.java +++ b/backend/src/main/java/com/bakdata/conquery/models/query/statistics/ColumnStatsCollector.java @@ -1,10 +1,9 @@ package com.bakdata.conquery.models.query.statistics; -import java.util.function.BooleanSupplier; - import javax.annotation.Nullable; import com.bakdata.conquery.io.cps.CPSBase; +import com.bakdata.conquery.models.config.FrontendConfig; import com.bakdata.conquery.models.query.PrintSettings; import com.bakdata.conquery.models.query.resultinfo.ResultInfo; import com.bakdata.conquery.models.query.resultinfo.UniqueNamer; @@ -14,60 +13,51 @@ import lombok.Data; @Data -public abstract class ColumnStatsCollector { +public abstract class ColumnStatsCollector { private final String name; private final 
String label; private final String description; - private final ResultType type; @JsonIgnore private final PrintSettings printSettings; - public static ColumnStatsCollector getStatsCollector(ResultInfo info, final PrintSettings printSettings, BooleanSupplier samplePicker, ResultType type, UniqueNamer uniqueNamer) { + public static ColumnStatsCollector getStatsCollector(ResultInfo info, final PrintSettings printSettings, ResultType type, UniqueNamer uniqueNamer, FrontendConfig config) { // List recursion must be done before assigning uniqueNames - if (type instanceof ResultType.ListT listT){ - final ColumnStatsCollector columnStatsCollector = getStatsCollector(info, printSettings, samplePicker, listT.getElementType(), uniqueNamer); + if (type instanceof ResultType.ListT listT) { + final ColumnStatsCollector columnStatsCollector = getStatsCollector(info, printSettings, listT.getElementType(), uniqueNamer, config); // name label type are discarded when using ListColumnStatsCollector - return new ListColumnStatsCollector<>(null, null, null, type, columnStatsCollector, printSettings); + return new ListColumnStatsCollector(null, null, null, columnStatsCollector, printSettings); } - final String name = uniqueNamer.getUniqueName(info); final String label = info.defaultColumnName(printSettings); - if (type instanceof ResultType.IntegerT) { - return new NumberColumnStatsCollector(name, label, info.getDescription(), type, samplePicker, printSettings); + if (type instanceof ResultType.NumericT || type instanceof ResultType.MoneyT || type instanceof ResultType.IntegerT) { + return new NumberColumnStatsCollector(name, label, info.getDescription(), type, printSettings, config.getVisualisationsHistogramLimit(), config.getVisualisationPercentiles().lowerEndpoint(), config.getVisualisationPercentiles().upperEndpoint()); } - if (type instanceof ResultType.NumericT) { - return new NumberColumnStatsCollector(name, label, info.getDescription(), type, samplePicker, printSettings); - } - 
- if (type instanceof ResultType.MoneyT) { - return new NumberColumnStatsCollector(name, label, info.getDescription(), type, samplePicker, printSettings); - } - if (type instanceof ResultType.StringT) { - return new StringColumnStatsCollector(name, label, info.getDescription(), type, printSettings); + if (type instanceof ResultType.StringT stringT) { + return new StringColumnStatsCollector(name, label, info.getDescription(), stringT, printSettings, config.getVisualisationsHistogramLimit()); } if (type instanceof ResultType.BooleanT) { - return new BooleanColumnStatsCollector(name, label, info.getDescription(), type, printSettings); + return new BooleanColumnStatsCollector(name, label, info.getDescription(), printSettings); } if (type instanceof ResultType.DateT) { - return new DateColumnStatsCollector(name, label, info.getDescription(), samplePicker, type, printSettings); + return new DateColumnStatsCollector(name, label, info.getDescription(), type, printSettings); } if (type instanceof ResultType.DateRangeT) { - return new DateColumnStatsCollector(name, label, info.getDescription(), samplePicker, type, printSettings); + return new DateColumnStatsCollector(name, label, info.getDescription(), type, printSettings); } throw new IllegalArgumentException("Don't know how to describe column of type %s".formatted(type)); } - public abstract void consume(@Nullable T value); + public abstract void consume(@Nullable Object value); public abstract ResultColumnStatistics describe(); @@ -78,13 +68,11 @@ public abstract static class ResultColumnStatistics { private final String name; private final String label; private final String description; - private final String type; - protected ResultColumnStatistics(String name, String label, String description, String type) { + protected ResultColumnStatistics(String name, String label, String description) { this.name = name; this.label = label; this.description = description; - this.type = type; } } } diff --git 
a/backend/src/main/java/com/bakdata/conquery/models/query/statistics/DateColumnStatsCollector.java b/backend/src/main/java/com/bakdata/conquery/models/query/statistics/DateColumnStatsCollector.java index 2f072b7e02..0e24def8d1 100644 --- a/backend/src/main/java/com/bakdata/conquery/models/query/statistics/DateColumnStatsCollector.java +++ b/backend/src/main/java/com/bakdata/conquery/models/query/statistics/DateColumnStatsCollector.java @@ -2,14 +2,12 @@ import java.time.LocalDate; import java.time.temporal.IsoFields; -import java.util.ArrayList; -import java.util.Collection; import java.util.List; import java.util.SortedMap; import java.util.TreeMap; import java.util.concurrent.atomic.AtomicInteger; import java.util.concurrent.atomic.AtomicLong; -import java.util.function.BooleanSupplier; +import java.util.function.Function; import com.bakdata.conquery.io.cps.CPSType; import com.bakdata.conquery.models.common.CDate; @@ -21,20 +19,32 @@ import lombok.ToString; @Getter -public class DateColumnStatsCollector extends ColumnStatsCollector { +public class DateColumnStatsCollector extends ColumnStatsCollector { private final SortedMap quarterCounts = new TreeMap<>(); private final SortedMap monthCounts = new TreeMap<>(); private final AtomicInteger totalCount = new AtomicInteger(); private final AtomicLong nulls = new AtomicLong(0); - private final List samples = new ArrayList<>(); - private final BooleanSupplier samplePicker; + private final Function dateExtractor; private CDateRange span = null; - public DateColumnStatsCollector(String name, String label, String description, BooleanSupplier samplePicker, ResultType type, PrintSettings printSettings) { - super(name, label, description, type, printSettings); - this.samplePicker = samplePicker; + + public DateColumnStatsCollector(String name, String label, String description, ResultType type, PrintSettings printSettings) { + super(name, label, description, printSettings); + dateExtractor = getDateExtractor(type); + } + + 
private static Function getDateExtractor(ResultType dateType) { + if (dateType instanceof ResultType.DateRangeT) { + return dateValue -> CDateRange.fromList((List) dateValue); + } + + if (dateType instanceof ResultType.DateT) { + return dateValue -> CDateRange.exactly((Integer) dateValue); + } + + throw new IllegalStateException("Unexpected type %s".formatted(dateType)); } @Override @@ -46,8 +56,7 @@ public void consume(Object value) { return; } - - final CDateRange dateRange = extractDateRange(getType(), value); + final CDateRange dateRange = dateExtractor.apply(value); span = dateRange.spanClosed(span); if (dateRange.isOpen()) { @@ -60,20 +69,6 @@ public void consume(Object value) { } - private static CDateRange extractDateRange(ResultType dateType, Object dateValue) { - if (dateType instanceof ResultType.DateRangeT) { - return CDateRange.fromList((List) dateValue); - - } - - if (dateType instanceof ResultType.DateT) { - return CDateRange.exactly((Integer) dateValue); - } - - - throw new IllegalStateException("Unexpected type %s".formatted(dateType)); - } - private void handleDay(int day) { final LocalDate date = CDate.toLocalDate(day); final int year = date.getYear(); @@ -88,23 +83,17 @@ private void handleDay(int day) { quarterCounts.compute(yearQuarter, (ignored, current) -> current == null ? 1 : current + 1); monthCounts.compute(yearMonth, (ignored, current) -> current == null ? 1 : current + 1); - - if (samplePicker.getAsBoolean()) { - samples.add(CDate.toLocalDate(day)); - } } @Override public ResultColumnStatistics describe() { - samples.sort(LocalDate::compareTo); return new ColumnDescription(getName(), getLabel(), getDescription(), totalCount.get(), getNulls().intValue(), quarterCounts, monthCounts, - span.toSimpleRange(), - samples + span == null ? 
CDateRange.all().toSimpleRange() : span.toSimpleRange() ); } @@ -120,16 +109,13 @@ public static class ColumnDescription extends ResultColumnStatistics { private final Range span; - private final Collection samples; - - public ColumnDescription(String name, String label, String description, int count, int nullValues, SortedMap quarterCounts, SortedMap monthCounts, Range span, Collection samples) { - super(name, label, description, "DATES"); + public ColumnDescription(String name, String label, String description, int count, int nullValues, SortedMap quarterCounts, SortedMap monthCounts, Range span) { + super(name, label, description); this.count = count; this.nullValues = nullValues; this.quarterCounts = quarterCounts; this.monthCounts = monthCounts; this.span = span; - this.samples = samples; } } } diff --git a/backend/src/main/java/com/bakdata/conquery/models/query/statistics/Histogram.java b/backend/src/main/java/com/bakdata/conquery/models/query/statistics/Histogram.java new file mode 100644 index 0000000000..cd19f27b60 --- /dev/null +++ b/backend/src/main/java/com/bakdata/conquery/models/query/statistics/Histogram.java @@ -0,0 +1,211 @@ +package com.bakdata.conquery.models.query.statistics; + +import java.util.Comparator; +import java.util.List; +import java.util.function.Function; +import java.util.stream.IntStream; +import java.util.stream.Stream; + +import it.unimi.dsi.fastutil.doubles.Double2ObjectFunction; +import lombok.Data; +import lombok.ToString; +import lombok.extern.slf4j.Slf4j; + +/** + * Basic implementation of a histogram. + * + * Of note: + * * We have over and underflow bins, if values exceed our assumed value-range. + * * The bins are always aligned such that zero is its own separate bin, as we assume it is a special value. + * - This means, bin limits will be slightly adjusted left to have equal-spaced bins, that have zero not intersect them. 
+ */ +@Data +@Slf4j +public class Histogram { + + + private final Node[] nodes; + private final Node zeroNode; + private final Node underflowNode; + private final Node overflowNode; + private final double lower; + private final double upper; + private final double width; + + private int total; + + private static Histogram rounded(double lower, double upper, double absMin, double absMax, int expectedBins) { + // adjust lower/upper to start on rounded edges. + final double adjLower = Math.max(Math.floor(absMin), Math.floor(lower)); + final double adjUpper = Math.min(Math.ceil(absMax), Math.ceil(upper)); + + final double width = (double) Math.max(1, Math.round((adjUpper - adjLower) / expectedBins)); + + final double newLower; + + if (adjLower == 0d) { + newLower = 0; + } + else if (adjLower < 0) { + // We adjust slightly downward so that we have even sized bins, that meet exactly at zero (which is tracked separately) + newLower = -width * Math.ceil(Math.abs(adjLower) / width); + } + else { + newLower = adjLower; + } + + final double newUpper = newLower + width * expectedBins; + + final Node[] nodes = IntStream.range(0, expectedBins) + // Note, using multiplication is important to avoid floating-point imprecision when wanting to arrive exactly around 0 etc. 
+ .mapToObj(index -> new Node(newLower + width * index, newLower + width * (index + 1))) + .toArray(Node[]::new); + + + return new Histogram(nodes, + new Node(0, 0), + new Node(Math.min(absMin, newLower), newLower), + new Node(newUpper, Math.max(absMax, newUpper), true), + newLower, newUpper, + width); + } + + private static Histogram unrounded(double lower, double upper, double absMin, double absMax, int expectedBins) { + + final double width = (upper - lower) / expectedBins; + + final double adjLower; + + // We have to adjust left if we have a zero-crossing, to ensure partitioning out the zero-bin + if (lower < 0 && absMax > 0) { + adjLower = -Math.ceil(Math.abs(lower) / width) * width; + } + else { + adjLower = lower; + } + + final double newUpper = adjLower + width * expectedBins; + + final Node[] nodes = IntStream.range(0, expectedBins) + .mapToObj(index -> new Node(adjLower + width * index, adjLower + width * (index + 1))) + .toArray(Node[]::new); + + return new Histogram(nodes, new Node(0, 0), new Node(absMin, lower), new Node(newUpper, absMax, true), adjLower, newUpper, width); + } + + public static Histogram zeroCentered(double lower, double upper, double absMin, double absMax, int expectedBins, boolean roundWidth) { + if (lower == upper) { + // Short circuit for degenerate cases + return new Histogram(new Node[0], + new Node(0, 0), + new Node(absMin, lower), + new Node(upper, absMax, true), + lower, upper, + 0 + ); + } + + if(roundWidth){ + return rounded(lower, upper, absMin, absMax, expectedBins); + } + else { + return unrounded(lower, upper, absMin, absMax, expectedBins); + } + } + + public void add(double value) { + total++; + + if (value == 0d) { + zeroNode.add(); + return; + } + + if (value < lower) { + underflowNode.add(); + return; + } + + if (value >= upper) { + overflowNode.add(); + return; + } + + final int index = (int) Math.floor((value - lower) / width); + nodes[index].add(); + } + + public List nodes() { + return Stream.of( + 
+ Stream.of(underflowNode, overflowNode, zeroNode).filter(node -> node.getCount() > 0), + Stream.of(nodes) + ) + .flatMap(Function.identity()) + // We compare by Max as well to fix zeroNode and underflowNode sorting when absMin >= 0 + .sorted(Comparator.comparingDouble(Node::getMin).thenComparingDouble(Node::getMax)) + .toList(); + } + + @Data + public static final class Node { + /** + * This is an en-dash. + */ + private static final String FROM_TO = " – "; + + private int hits; + + private final double min, max; + + @ToString.Exclude + private final boolean overflow; + + public Node(double min, double max, boolean overflow) { + this.min = min; + this.max = max; + this.overflow = overflow; + } + + public Node(double min, double max){ + this(min, max, false); + } + + public int getCount() { + return hits; + } + + public void add() { + hits++; + } + + + String createLabel(Double2ObjectFunction printer, boolean isInteger) { + final String lower = printer.apply(getMin()); + + if(getMin() == getMax()){ + return lower; + } + + if(isInteger){ + + if (getMax() - getMin() <= 1){ + return lower; + } + + // Integers allow us to forfeit the brace notation by closing the range (unless we are the overflow bin which tracks real values) + final String upper = printer.apply(getMax() - (isOverflow() ? 0 : 1)); + + return lower + FROM_TO + upper; + } + + final String upper = printer.apply(getMax()); + + final String startBrackets = getMin() == 0 ? 
"(" : "["; + final String endBrackets = ")"; + + return startBrackets + lower + FROM_TO + upper + endBrackets; + } + + } + +} diff --git a/backend/src/main/java/com/bakdata/conquery/models/query/statistics/HistogramColumnDescription.java b/backend/src/main/java/com/bakdata/conquery/models/query/statistics/HistogramColumnDescription.java new file mode 100644 index 0000000000..708550bee6 --- /dev/null +++ b/backend/src/main/java/com/bakdata/conquery/models/query/statistics/HistogramColumnDescription.java @@ -0,0 +1,25 @@ +package com.bakdata.conquery.models.query.statistics; + +import java.util.List; +import java.util.Map; + +import com.bakdata.conquery.io.cps.CPSType; +import lombok.Getter; +import lombok.ToString; + +@Getter +@CPSType(id = "HISTO", base = ColumnStatsCollector.ResultColumnStatistics.class) +@ToString(callSuper = true) +public class HistogramColumnDescription extends ColumnStatsCollector.ResultColumnStatistics { + + public static record Entry(String label, long value) {}; + private final List entries; + + private final Map extras; + + public HistogramColumnDescription(String name, String label, String description, List histogram, Map extras) { + super(name, label, description); + this.entries = histogram; + this.extras = extras; + } +} diff --git a/backend/src/main/java/com/bakdata/conquery/models/query/statistics/ListColumnStatsCollector.java b/backend/src/main/java/com/bakdata/conquery/models/query/statistics/ListColumnStatsCollector.java index f59aca58ef..cf62bf54be 100644 --- a/backend/src/main/java/com/bakdata/conquery/models/query/statistics/ListColumnStatsCollector.java +++ b/backend/src/main/java/com/bakdata/conquery/models/query/statistics/ListColumnStatsCollector.java @@ -3,7 +3,6 @@ import java.util.Collection; import com.bakdata.conquery.models.query.PrintSettings; -import com.bakdata.conquery.models.types.ResultType; import lombok.Getter; import lombok.ToString; import org.jetbrains.annotations.Nullable; @@ -11,22 +10,22 @@ @ToString 
@Getter -public class ListColumnStatsCollector extends ColumnStatsCollector>{ +public class ListColumnStatsCollector extends ColumnStatsCollector { - private final ColumnStatsCollector underlying; + private final ColumnStatsCollector underlying; - public ListColumnStatsCollector(String name, String label, String description, ResultType type, ColumnStatsCollector underlying, PrintSettings printSettings) { - super(name, label, description, type, printSettings); + public ListColumnStatsCollector(String name, String label, String description, ColumnStatsCollector underlying, PrintSettings printSettings) { + super(name, label, description, printSettings); this.underlying = underlying; } @Override - public void consume(@Nullable Collection values) { + public void consume(@Nullable Object values) { if(values == null){ return; } - for (T value : values) { + for (Object value : (Collection) values) { underlying.consume(value); } } diff --git a/backend/src/main/java/com/bakdata/conquery/models/query/statistics/NumberColumnStatsCollector.java b/backend/src/main/java/com/bakdata/conquery/models/query/statistics/NumberColumnStatsCollector.java index bcb4569d5b..302bf7db59 100644 --- a/backend/src/main/java/com/bakdata/conquery/models/query/statistics/NumberColumnStatsCollector.java +++ b/backend/src/main/java/com/bakdata/conquery/models/query/statistics/NumberColumnStatsCollector.java @@ -1,36 +1,65 @@ package com.bakdata.conquery.models.query.statistics; -import java.util.ArrayList; -import java.util.Collection; +import java.text.DecimalFormat; +import java.text.NumberFormat; +import java.util.Arrays; import java.util.Collections; import java.util.Comparator; +import java.util.LinkedHashMap; import java.util.List; +import java.util.Map; import java.util.concurrent.atomic.AtomicLong; -import java.util.function.BooleanSupplier; -import com.bakdata.conquery.io.cps.CPSType; +import c10n.C10N; import com.bakdata.conquery.models.query.PrintSettings; import 
com.bakdata.conquery.models.types.ResultType; +import com.google.common.collect.Range; import lombok.Getter; -import lombok.ToString; +import lombok.extern.slf4j.Slf4j; import org.apache.commons.math3.stat.descriptive.DescriptiveStatistics; +import org.jetbrains.annotations.NotNull; @Getter -public class NumberColumnStatsCollector> extends ColumnStatsCollector { +@Slf4j +public class NumberColumnStatsCollector> extends ColumnStatsCollector { + + private final ResultType type; private final DescriptiveStatistics statistics = new DescriptiveStatistics(); private final AtomicLong nulls = new AtomicLong(0); - private final List samples = new ArrayList<>(); + private final Comparator comparator; - private final BooleanSupplier samplePicker; + private final NumberFormat formatter; + private final int expectedBins; + private final double upperPercentile; + private final double lowerPercentile; - private final Comparator comparator; + public NumberColumnStatsCollector(String name, String label, String description, ResultType type, PrintSettings printSettings, int expectedBins, double lowerPercentile, double upperPercentile) { + super(name, label, description, printSettings); + + this.type = type; + + comparator = selectComparator(type); + + // We have to clone, as NumberFormat is not thread-safe and shared. 
+ formatter = selectFormatter(type, printSettings); - public NumberColumnStatsCollector(String name, String label, String description, ResultType type, BooleanSupplier samplePicker, PrintSettings printSettings) { - super(name, label, description, type, printSettings); - this.samplePicker = samplePicker; - this.comparator = selectComparator(type); + this.expectedBins = expectedBins; + this.upperPercentile = upperPercentile; + this.lowerPercentile = lowerPercentile; + } + + private NumberFormat selectFormatter(ResultType type, PrintSettings printSettings) { + if (type instanceof ResultType.MoneyT) { + return ((DecimalFormat) printSettings.getCurrencyFormat().clone()); + } + else if (type instanceof ResultType.IntegerT) { + return ((NumberFormat) printSettings.getIntegerFormat().clone()); + } + else { + return ((NumberFormat) printSettings.getDecimalFormat().clone()); + } } private Comparator selectComparator(ResultType resultType) { @@ -50,73 +79,121 @@ private Comparator selectComparator(ResultType resultType) { throw new IllegalArgumentException("Cannot handle result type %s".formatted(resultType.toString())); } + /** + * If distance between bounds is less than expectedBins, we expand our bounds along percentiles. + */ + private static Range expandBounds(double lower, double upper, int expectedBins, DescriptiveStatistics statistics, double by) { + assert by > 0; + + // limitation of DescriptiveStatistics#getPercentile: crashes if lower==0, so we short circuit. + final boolean underflow = lower <= 1.d; + final boolean overflow = upper >= 99; + + final double min = underflow ? statistics.getMin() : statistics.getPercentile(lower); + final double max = overflow ? statistics.getMax() : statistics.getPercentile(upper); + + // No need to walk further, if we are already at the limits. 
+ if (underflow && overflow) { + return Range.closed(min, max); + } + + if (max - min < expectedBins) { + return expandBounds(Math.max(0, lower - by), Math.min(100, upper + by), expectedBins, statistics, by); + } + + return Range.closed(min, max); + } + @Override - public void consume(Number value) { + public void consume(Object value) { if (value == null) { nulls.incrementAndGet(); return; } + Number number = (Number) value; + // TODO this feels like a pretty borked abstraction if (getType() instanceof ResultType.MoneyT moneyT) { - value = moneyT.readIntermediateValue(getPrintSettings(), value); + number = moneyT.readIntermediateValue(getPrintSettings(), number); } - statistics.addValue(value.doubleValue()); + statistics.addValue(number.doubleValue()); - if (samplePicker.getAsBoolean()) { - samples.add((TYPE) value); - } } @Override public ResultColumnStatistics describe() { // If no real samples were collected, we short-circuit, as Statistics will throw an exception when empty. if (getStatistics().getN() == 0) { - return new ColumnDescription( - getName(), getLabel(), getDescription(), getType().toString(), - getNulls().intValue(), getNulls().intValue(), - Double.NaN, Double.NaN, Double.NaN, Double.NaN, Double.NaN, Collections.emptyList() - ); + return new HistogramColumnDescription(getName(), getLabel(), getDescription(), Collections.emptyList(), Collections.emptyMap()); } - final double p99 = getStatistics().getPercentile(99d); - final double maybeP01 = getStatistics().getPercentile(1d); + final List bins = createBins(); + final Map extras = getExtras(); - // If min is basically 0, we don't prune for it, as those are usually relevant values. - final double p01 = (Math.abs(maybeP01) < 2 * Double.MIN_VALUE) ? 
Double.MIN_VALUE : maybeP01; + return new HistogramColumnDescription(getName(), getLabel(), getDescription(), bins, extras); - return new ColumnDescription(getName(), getLabel(), getDescription(), getType().toString(), (int) (getStatistics().getN() + getNulls().intValue()), getNulls().intValue(), getStatistics().getMean(), getStatistics().getPercentile(50d /*This is the median.*/), getStatistics().getStandardDeviation(), (int) getStatistics().getMin(), (int) getStatistics().getMax(), - // We cull extremes, as that can cause distortions when displayed. - samples.stream().filter(val -> val.doubleValue() >= p01 && val.doubleValue() <= p99).sorted(comparator).toList() - ); } - @Getter - @CPSType(id = "DESCRIPTIVE", base = ResultColumnStatistics.class) - @ToString(callSuper = true) - public static class ColumnDescription extends ColumnStatsCollector.ResultColumnStatistics { - - private final int count; - private final int nullValues; - private final double mean; - private final double median; - private final double stdDev; - private final Number min; - private final Number max; - - private final Collection samples; - - public ColumnDescription(String name, String label, String description, String type, int count, int nullValues, double mean, double median, double stdDev, Number min, Number max, Collection samples) { - super(name, label, description, type); - this.count = count; - this.nullValues = nullValues; - this.mean = mean; - this.median = median; - this.stdDev = stdDev; - this.min = min; - this.max = max; - this.samples = samples; + @NotNull + private List createBins() { + + final Range bounds = expandBounds(lowerPercentile, upperPercentile, expectedBins, statistics, 5); + + log.trace("Creating Histogram for {} with params inner=({}, {}), bounds=({},{}) bins={}", getLabel(), bounds.lowerEndpoint(), bounds.upperEndpoint(), getStatistics().getMin(), getStatistics().getMax(), expectedBins); + + final Histogram histogram = + 
Histogram.zeroCentered(bounds.lowerEndpoint(), bounds.upperEndpoint(), getStatistics().getMin(), getStatistics().getMax(), expectedBins, bounds.upperEndpoint() - bounds.lowerEndpoint() > 1); + + Arrays.stream(getStatistics().getValues()).forEach(histogram::add); + + return histogram.nodes() + .stream() + .map(bin -> { + final String binLabel = bin.createLabel(this::printValue, getType() instanceof ResultType.IntegerT); + + return new HistogramColumnDescription.Entry(binLabel, bin.getCount()); + }) + .toList(); + } + + @NotNull + private Map getExtras() { + final StatisticsLabels labels = C10N.get(StatisticsLabels.class, getPrintSettings().getLocale()); + + + // LinkedHashMap remembers insertion order + final LinkedHashMap out = new LinkedHashMap<>(); + + out.put(labels.min(), printValue(getStatistics().getMin())); + out.put(labels.max(), printValue(getStatistics().getMax())); + + // mean is always a decimal number, therefore integer needs special handling + if(getType() instanceof ResultType.IntegerT){ + out.put(labels.mean(), getPrintSettings().getDecimalFormat().format(getStatistics().getMean())); + } + else { + out.put(labels.mean(), printValue(getStatistics().getMean())); } + + out.put(labels.p25(), printValue(getStatistics().getPercentile(25))); + out.put(labels.median(), printValue(getStatistics().getPercentile(50))); + out.put(labels.p75(), printValue(getStatistics().getPercentile(75))); + + out.put(labels.std(), getPrintSettings().getDecimalFormat().format(getStatistics().getStandardDeviation())); + + out.put(labels.sum(), printValue(getStatistics().getSum())); + out.put(labels.count(), getPrintSettings().getIntegerFormat().format(getStatistics().getN())); + out.put(labels.missing(), getPrintSettings().getIntegerFormat().format(getNulls().get())); + + return out; } + + + private String printValue(Number value) { + return formatter.format(value.doubleValue()); + } + + } diff --git 
a/backend/src/main/java/com/bakdata/conquery/models/query/statistics/ResultStatistics.java b/backend/src/main/java/com/bakdata/conquery/models/query/statistics/ResultStatistics.java index 43c0976ed3..fdd7369abe 100644 --- a/backend/src/main/java/com/bakdata/conquery/models/query/statistics/ResultStatistics.java +++ b/backend/src/main/java/com/bakdata/conquery/models/query/statistics/ResultStatistics.java @@ -2,8 +2,133 @@ import java.time.LocalDate; import java.util.List; +import java.util.Optional; +import java.util.OptionalLong; +import java.util.concurrent.Callable; +import java.util.concurrent.Executors; +import java.util.concurrent.atomic.AtomicReference; +import java.util.function.BiConsumer; +import java.util.function.Consumer; +import java.util.stream.IntStream; import com.bakdata.conquery.models.common.Range; +import com.bakdata.conquery.models.common.daterange.CDateRange; +import com.bakdata.conquery.models.config.ConqueryConfig; +import com.bakdata.conquery.models.query.ManagedQuery; +import com.bakdata.conquery.models.query.PrintSettings; +import com.bakdata.conquery.models.query.SingleTableResult; +import com.bakdata.conquery.models.query.resultinfo.ResultInfo; +import com.bakdata.conquery.models.query.resultinfo.UniqueNamer; +import com.bakdata.conquery.models.query.results.EntityResult; +import com.bakdata.conquery.models.types.ResultType; +import com.google.common.util.concurrent.Futures; +import com.google.common.util.concurrent.ListenableFuture; +import com.google.common.util.concurrent.ListeningExecutorService; +import com.google.common.util.concurrent.MoreExecutors; +import lombok.SneakyThrows; +import lombok.extern.slf4j.Slf4j; +import org.apache.commons.lang3.time.StopWatch; +import org.jetbrains.annotations.NotNull; +@Slf4j public record ResultStatistics(int entities, int total, List statistics, Range dateRange) { + @SneakyThrows + @NotNull + public static ResultStatistics collectResultStatistics(ManagedQuery managedQuery, List resultInfos, 
Optional dateInfo, int dateIndex, PrintSettings printSettings, UniqueNamer uniqueNamer, ConqueryConfig conqueryConfig) { + + //TODO pull inner executor service from ManagerNode + final ListeningExecutorService executorService = MoreExecutors.listeningDecorator(Executors.newFixedThreadPool(Runtime.getRuntime().availableProcessors() - 1)); + + // Yes, we are actually iterating the result for every job. + + // Span date-column + final ListenableFuture> futureSpan = executorService.submit(() -> calculateDateSpan(managedQuery, dateInfo, dateIndex)); + + // Count result lines (may differ in case of form or SecondaryIdQuery) + final ListenableFuture + futureLines = + executorService.submit(() -> managedQuery.streamResults(OptionalLong.empty()).mapToInt(result -> result.listResultLines().size()).sum()); + + // compute ResultColumnStatistics for each column + final List> + futureDescriptions = + IntStream.range(0, resultInfos.size()).mapToObj(col -> (Callable) () -> { + final StopWatch started = StopWatch.createStarted(); + + final ResultInfo info = resultInfos.get(col); + final ColumnStatsCollector statsCollector = + ColumnStatsCollector.getStatsCollector(info, printSettings, info.getType(), uniqueNamer, conqueryConfig.getFrontend()); + + log.trace("BEGIN stats collection for {}", info); + + managedQuery.streamResults(OptionalLong.empty()).map(EntityResult::listResultLines).flatMap(List::stream).forEach(line -> statsCollector.consume(line[col])); + + log.trace("DONE collecting values for {}, in {}", info, started); + + final ColumnStatsCollector.ResultColumnStatistics description = statsCollector.describe(); + + log.debug("DONE description for {}, in {}", info, started); + + return description; + }) + .map(executorService::submit) + .toList(); + + final Range span = futureSpan.get(); + final List descriptions = Futures.allAsList(futureDescriptions).get(); + final int lines = futureLines.get(); + return new ResultStatistics(managedQuery.getLastResultCount().intValue(), 
lines, descriptions, span); + } + + private static Range calculateDateSpan(SingleTableResult managedQuery, Optional dateInfo, int dateIndex) { + if (dateInfo.isEmpty()) { + return CDateRange.all().toSimpleRange(); + } + + final AtomicReference spanRef = new AtomicReference<>(null); + final Consumer dateAggregator = getDateSpanner(dateInfo.get(), dateIndex, spanRef); + + managedQuery.streamResults(OptionalLong.empty()).flatMap(EntityResult::streamValues).forEach(dateAggregator); + + final CDateRange span = spanRef.get(); + + if (span == null) { + return CDateRange.all().toSimpleRange(); + } + + return span.toSimpleRange(); + } + + /** + * If not dateInfo is given, don't try to span values. otherwise takes values from line at dateIndex, and handles them according to dateInfo. + */ + private static Consumer getDateSpanner(ResultInfo dateInfo, int dateIndex, AtomicReference spanRef) { + + final Consumer spanner = date -> spanRef.getAndAccumulate(date, (old, incoming) -> incoming.spanClosed(old)); + + final BiConsumer> extractor = validityDateExtractor(dateInfo.getType()); + + return line -> extractor.accept(line[dateIndex], spanner); + + } + + public static BiConsumer> validityDateExtractor(ResultType dateType) { + if (dateType instanceof ResultType.DateRangeT) { + return (obj, con) -> con.accept(CDateRange.fromList((List) obj)); + } + + + if (dateType instanceof ResultType.DateT) { + return (obj, con) -> con.accept(CDateRange.exactly((Integer) obj)); + } + + if (dateType instanceof ResultType.ListT listT) { + final BiConsumer> extractor = validityDateExtractor(listT.getElementType()); + return (obj, con) -> ((List) obj).forEach(date -> extractor.accept(date, con)); + } + + throw new IllegalStateException("Unexpected date Type %s".formatted(dateType)); + } + + } diff --git a/backend/src/main/java/com/bakdata/conquery/models/query/statistics/StatisticsLabels.java b/backend/src/main/java/com/bakdata/conquery/models/query/statistics/StatisticsLabels.java new file mode 
100644 index 0000000000..95dc4f1665 --- /dev/null +++ b/backend/src/main/java/com/bakdata/conquery/models/query/statistics/StatisticsLabels.java @@ -0,0 +1,56 @@ +package com.bakdata.conquery.models.query.statistics; + +import c10n.annotations.De; +import c10n.annotations.En; + +public interface StatisticsLabels { + @En("Minimum") + @De("Minimum") + public String min(); + + @En("Maximum") + @De("Maximum") + public String max(); + + @En("Mean") + @De("Mittelwert") + public String mean(); + + @En("Median") + @De("Median") + public String median(); + + @En("Sum") + @De("Summe") + public String sum(); + + @En("Standard Deviation") + @De("Standardabweichung") + public String std(); + + @En("3rd Quartile") + @De("3. Quartil") + public String p75(); + + @En("1st Quartile") + @De("1. Quartil") + public String p25(); + + @En("Total") + @De("Anzahl") + public String count(); + + @En("Missing") + @De("Fehlende Einträge") + public String missing(); + + @En("{0} additional Values") + @De("{0} weitere Werte") + public String remainingValues(long count); + + @En("{0} entries") + @De("{0} Einträge") + public String remainingEntries(long count); + + +} diff --git a/backend/src/main/java/com/bakdata/conquery/models/query/statistics/StringColumnStatsCollector.java b/backend/src/main/java/com/bakdata/conquery/models/query/statistics/StringColumnStatsCollector.java index c43d93c8b5..ab469df184 100644 --- a/backend/src/main/java/com/bakdata/conquery/models/query/statistics/StringColumnStatsCollector.java +++ b/backend/src/main/java/com/bakdata/conquery/models/query/statistics/StringColumnStatsCollector.java @@ -1,29 +1,37 @@ package com.bakdata.conquery.models.query.statistics; +import java.util.ArrayList; +import java.util.Collections; +import java.util.List; import java.util.Map; import java.util.concurrent.atomic.AtomicLong; -import java.util.stream.Collectors; import java.util.stream.StreamSupport; -import com.bakdata.conquery.io.cps.CPSType; +import c10n.C10N; import 
com.bakdata.conquery.models.query.PrintSettings; import com.bakdata.conquery.models.types.ResultType; import lombok.Getter; -import lombok.ToString; +import lombok.extern.slf4j.Slf4j; import org.apache.commons.math3.stat.Frequency; @Getter -public class StringColumnStatsCollector extends ColumnStatsCollector { +@Slf4j +public class StringColumnStatsCollector extends ColumnStatsCollector { private final Frequency frequencies = new Frequency(); private final AtomicLong nulls = new AtomicLong(0); + private final long limit; - public StringColumnStatsCollector(String name, String label, String description, ResultType type, PrintSettings printSettings) { - super(name, label, description, type, printSettings); + private final ResultType.StringT type; + + public StringColumnStatsCollector(String name, String label, String description, ResultType.StringT type, PrintSettings printSettings, long limit) { + super(name, label, description, printSettings); + this.limit = limit; + this.type = type; } @Override - public void consume(String value) { + public void consume(Object value) { if (value == null) { nulls.incrementAndGet(); return; @@ -36,23 +44,37 @@ public void consume(String value) { @Override public ResultColumnStatistics describe() { - final Map repr = + final List, Long>> entriesSorted = StreamSupport.stream(((Iterable, Long>>) frequencies::entrySetIterator).spliterator(), false) - .collect(Collectors.toMap(entry -> (String) entry.getKey(), Map.Entry::getValue)); + .sorted(Map.Entry., Long>comparingByValue().reversed()) + .toList(); + final long end = Math.min(limit, entriesSorted.size()); - return new ColumnDescription(getName(), getLabel(), getDescription(), repr); - } + final List head = new ArrayList<>(); + long shownTotal = 0; + + for (int i = 0; i < end; i++) { + final Map.Entry, Long> counts = entriesSorted.get(i); + + final HistogramColumnDescription.Entry entry = new HistogramColumnDescription.Entry(((String) counts.getKey()), counts.getValue()); + 
head.add(entry); - @Getter - @CPSType(id = "HISTO", base = ResultColumnStatistics.class) - @ToString(callSuper = true) - public static class ColumnDescription extends ResultColumnStatistics { - private final Map histogram; + shownTotal += counts.getValue(); - public ColumnDescription(String name, String label, String description, Map histogram) { - super(name, label, description, "STRING"); - this.histogram = histogram; } + + final StatisticsLabels statisticsLabels = C10N.get(StatisticsLabels.class, getPrintSettings().getLocale()); + + final Map extras = + entriesSorted.size() <= limit + ? Collections.emptyMap() + : Map.of( + statisticsLabels.remainingValues(entriesSorted.size() - limit), + statisticsLabels.remainingEntries(frequencies.getSumFreq() - shownTotal) + ); + + return new HistogramColumnDescription(getName(), getLabel(), getDescription(), head, extras); } + } diff --git a/backend/src/main/java/com/bakdata/conquery/resources/api/ResultArrowResource.java b/backend/src/main/java/com/bakdata/conquery/resources/api/ResultArrowResource.java index 51eefb75e2..5678bd9b98 100644 --- a/backend/src/main/java/com/bakdata/conquery/resources/api/ResultArrowResource.java +++ b/backend/src/main/java/com/bakdata/conquery/resources/api/ResultArrowResource.java @@ -6,6 +6,7 @@ import java.net.MalformedURLException; import java.net.URL; import java.util.Optional; +import java.util.OptionalLong; import javax.inject.Inject; import javax.ws.rs.DefaultValue; @@ -43,11 +44,13 @@ public Response getFile( @Auth Subject subject, @PathParam(QUERY) ManagedExecution query, @HeaderParam(HttpHeaders.USER_AGENT) String userAgent, - @QueryParam("pretty") @DefaultValue("false") boolean pretty) { + @QueryParam("pretty") @DefaultValue("false") boolean pretty, + @QueryParam("limit") OptionalLong limit + ) { checkSingleTableResult(query); log.info("Result for {} download on dataset {} by subject {} ({}).", query.getId(), query.getDataset().getId(), subject.getId(), subject.getName()); - return 
processor.createResultFile(subject, query, pretty); + return processor.createResultFile(subject, query, pretty, limit); } public static URL getFileDownloadURL(UriBuilder uriBuilder, E exec) throws MalformedURLException { @@ -78,9 +81,11 @@ public Response getStream( @Auth Subject subject, @PathParam(QUERY) ManagedExecution execution, @HeaderParam(HttpHeaders.USER_AGENT) String userAgent, - @QueryParam("pretty") Optional pretty) { + @QueryParam("pretty") Optional pretty, + @QueryParam("limit") OptionalLong limit + ) { checkSingleTableResult(execution); log.info("Result for {} download on dataset {} by subject {} ({}).", execution, execution.getDataset().getId(), subject.getId(), subject.getName()); - return processor.createResultStream(subject, execution, pretty.orElse(false)); + return processor.createResultStream(subject, execution, pretty.orElse(false), limit); } } diff --git a/backend/src/main/java/com/bakdata/conquery/resources/api/ResultCsvResource.java b/backend/src/main/java/com/bakdata/conquery/resources/api/ResultCsvResource.java index 70134250e0..99cf6fcfdb 100644 --- a/backend/src/main/java/com/bakdata/conquery/resources/api/ResultCsvResource.java +++ b/backend/src/main/java/com/bakdata/conquery/resources/api/ResultCsvResource.java @@ -6,6 +6,7 @@ import java.net.MalformedURLException; import java.net.URL; +import java.util.OptionalLong; import javax.inject.Inject; import javax.ws.rs.DefaultValue; @@ -54,11 +55,13 @@ public Response getAsCsv( @PathParam(QUERY) ManagedExecution execution, @HeaderParam(HttpHeaders.USER_AGENT) String userAgent, @QueryParam("charset") String queryCharset, - @QueryParam("pretty") @DefaultValue("true") boolean pretty) { + @QueryParam("pretty") @DefaultValue("true") boolean pretty, + @QueryParam("limit") OptionalLong limit + ) { checkSingleTableResult(execution); log.info("Result for {} download on dataset {} by subject {} ({}).", execution, execution.getDataset().getId(), subject.getId(), subject.getName()); - return 
processor.createResult(subject, (E) execution, pretty, determineCharset(userAgent, queryCharset)); + return processor.createResult(subject, (E) execution, pretty, determineCharset(userAgent, queryCharset), limit); } } diff --git a/backend/src/main/java/com/bakdata/conquery/resources/api/ResultExcelResource.java b/backend/src/main/java/com/bakdata/conquery/resources/api/ResultExcelResource.java index 67a2339e75..4cfe62791c 100644 --- a/backend/src/main/java/com/bakdata/conquery/resources/api/ResultExcelResource.java +++ b/backend/src/main/java/com/bakdata/conquery/resources/api/ResultExcelResource.java @@ -5,6 +5,7 @@ import java.net.MalformedURLException; import java.net.URL; +import java.util.OptionalLong; import javax.inject.Inject; import javax.ws.rs.DefaultValue; @@ -44,11 +45,12 @@ public Response get( @Auth Subject subject, @PathParam(QUERY) ManagedExecution execution, @HeaderParam(HttpHeaders.USER_AGENT) String userAgent, - @QueryParam("pretty") @DefaultValue("true") boolean pretty) { + @QueryParam("pretty") @DefaultValue("true") boolean pretty, + @QueryParam("limit") OptionalLong limit) { checkSingleTableResult(execution); log.info("Result for {} download on dataset {} by subject {} ({}).", execution.getId(), execution.getDataset() .getId(), subject.getId(), subject.getName()); - return processor.createResult(subject, (E) execution, pretty); + return processor.createResult(subject, (E) execution, pretty, limit); } public static URL getDownloadURL(UriBuilder uriBuilder, E exec) throws MalformedURLException { diff --git a/backend/src/main/java/com/bakdata/conquery/resources/api/ResultParquetResource.java b/backend/src/main/java/com/bakdata/conquery/resources/api/ResultParquetResource.java index 44f0bdf5b7..3b85eb1d90 100644 --- a/backend/src/main/java/com/bakdata/conquery/resources/api/ResultParquetResource.java +++ b/backend/src/main/java/com/bakdata/conquery/resources/api/ResultParquetResource.java @@ -6,6 +6,7 @@ import java.net.MalformedURLException; 
import java.net.URL; +import java.util.OptionalLong; import javax.inject.Inject; import javax.ws.rs.DefaultValue; @@ -54,11 +55,12 @@ public Response getFile( @Auth Subject subject, @PathParam(QUERY) ManagedExecution execution, @HeaderParam(HttpHeaders.USER_AGENT) String userAgent, - @QueryParam("pretty") @DefaultValue("false") boolean pretty) { + @QueryParam("pretty") @DefaultValue("false") boolean pretty, + @QueryParam("limit") OptionalLong limit) { checkSingleTableResult(execution); log.info("Result for {} download on dataset {} by subject {} ({}).", execution.getId(), execution.getDataset().getId(), subject.getId(), subject.getName()); - return processor.createResultFile(subject, execution, pretty); + return processor.createResultFile(subject, execution, pretty, limit); } } diff --git a/backend/src/main/java/com/bakdata/conquery/sql/conquery/SqlManagedQuery.java b/backend/src/main/java/com/bakdata/conquery/sql/conquery/SqlManagedQuery.java index 74552cab97..3e6d6e7a4a 100644 --- a/backend/src/main/java/com/bakdata/conquery/sql/conquery/SqlManagedQuery.java +++ b/backend/src/main/java/com/bakdata/conquery/sql/conquery/SqlManagedQuery.java @@ -1,6 +1,8 @@ package com.bakdata.conquery.sql.conquery; import java.util.List; +import java.util.OptionalLong; +import java.util.concurrent.atomic.AtomicLong; import java.util.function.Consumer; import java.util.stream.Stream; @@ -79,8 +81,17 @@ public List getResultInfos() { } @Override - public Stream streamResults() { - return result.getTable().stream(); + public Stream streamResults(OptionalLong maybeLimit) { + final Stream results = result.getTable().stream(); + + if(maybeLimit.isEmpty()){ + return results; + } + + final long limit = maybeLimit.getAsLong(); + final AtomicLong consumed = new AtomicLong(); + + return results.takeWhile(line -> consumed.addAndGet(line.length()) < limit); } @Override diff --git a/backend/src/main/java/com/bakdata/conquery/sql/execution/SqlEntityResult.java 
b/backend/src/main/java/com/bakdata/conquery/sql/execution/SqlEntityResult.java index 9fbe011230..91c7a0d1b2 100644 --- a/backend/src/main/java/com/bakdata/conquery/sql/execution/SqlEntityResult.java +++ b/backend/src/main/java/com/bakdata/conquery/sql/execution/SqlEntityResult.java @@ -24,10 +24,6 @@ public class SqlEntityResult implements EntityResult { private final String id; private Object[] values; - public String getId() { - return id; - } - @Override public int getEntityId() { return entityId; @@ -43,6 +39,11 @@ public int columnCount() { return values.length; } + @Override + public int length() { + return 1; + } + @Override public void modifyResultLinesInplace(UnaryOperator lineModifier) { values = lineModifier.apply(values); diff --git a/backend/src/test/java/com/bakdata/conquery/integration/json/AbstractQueryEngineTest.java b/backend/src/test/java/com/bakdata/conquery/integration/json/AbstractQueryEngineTest.java index be9726a5d4..8f64443f01 100644 --- a/backend/src/test/java/com/bakdata/conquery/integration/json/AbstractQueryEngineTest.java +++ b/backend/src/test/java/com/bakdata/conquery/integration/json/AbstractQueryEngineTest.java @@ -9,6 +9,7 @@ import java.util.List; import java.util.Locale; import java.util.Map; +import java.util.OptionalLong; import javax.validation.UnexpectedTypeException; import javax.ws.rs.core.Response; @@ -57,7 +58,7 @@ public void executeTest(StandaloneSupport standaloneSupport) throws IOException //check result info size List resultInfos = executionResult.getResultInfos(); - assertThat(executionResult.streamResults().flatMap(EntityResult::streamValues)) + assertThat(executionResult.streamResults(OptionalLong.empty()).flatMap(EntityResult::streamValues)) .as("Should have same size as result infos") .allSatisfy(v -> assertThat(v).hasSameSizeAs(resultInfos)); @@ -86,7 +87,7 @@ public void executeTest(StandaloneSupport standaloneSupport) throws IOException .containsExactlyInAnyOrderElementsOf(expected); // check that 
getLastResultCount returns the correct size - if (executionResult.streamResults().noneMatch(MultilineEntityResult.class::isInstance)) { + if (executionResult.streamResults(OptionalLong.empty()).noneMatch(MultilineEntityResult.class::isInstance)) { long lastResultCount; if (executionResult instanceof EditorQuery editorQuery) { lastResultCount = editorQuery.getLastResultCount(); diff --git a/backend/src/test/java/com/bakdata/conquery/integration/json/FormTest.java b/backend/src/test/java/com/bakdata/conquery/integration/json/FormTest.java index 1d16d4486b..1744813710 100644 --- a/backend/src/test/java/com/bakdata/conquery/integration/json/FormTest.java +++ b/backend/src/test/java/com/bakdata/conquery/integration/json/FormTest.java @@ -9,6 +9,7 @@ import java.util.List; import java.util.Locale; import java.util.Map; +import java.util.OptionalLong; import java.util.concurrent.TimeUnit; import javax.validation.Valid; @@ -163,7 +164,7 @@ private void checkMultipleResult(Map> managedMapping, resultInfos, managed.getValue() .stream() - .flatMap(ManagedQuery::streamResults) + .flatMap(managedQuery -> managedQuery.streamResults(OptionalLong.empty())) ); writer.close(); @@ -196,7 +197,7 @@ private void checkSingleResult(F man renderer.toCSV( config.getIdColumns().getIdResultInfos(), managedForm.getResultInfos(), - managedForm.streamResults() + managedForm.streamResults(OptionalLong.empty()) ); writer.close(); diff --git a/backend/src/test/java/com/bakdata/conquery/integration/tests/QueryStatisticsTest.java b/backend/src/test/java/com/bakdata/conquery/integration/tests/QueryStatisticsTest.java index aad0493fb5..2258fb9c34 100644 --- a/backend/src/test/java/com/bakdata/conquery/integration/tests/QueryStatisticsTest.java +++ b/backend/src/test/java/com/bakdata/conquery/integration/tests/QueryStatisticsTest.java @@ -2,7 +2,6 @@ import static org.assertj.core.api.Assertions.assertThat; -import java.math.BigDecimal; import java.time.LocalDate; import java.util.List; import 
java.util.Locale; @@ -11,6 +10,7 @@ import javax.ws.rs.core.UriBuilder; +import c10n.C10N; import com.bakdata.conquery.integration.common.IntegrationUtils; import com.bakdata.conquery.integration.json.JsonIntegrationTest; import com.bakdata.conquery.integration.json.QueryTest; @@ -20,9 +20,9 @@ import com.bakdata.conquery.models.execution.ExecutionState; import com.bakdata.conquery.models.identifiable.ids.specific.ManagedExecutionId; import com.bakdata.conquery.models.query.statistics.DateColumnStatsCollector; -import com.bakdata.conquery.models.query.statistics.NumberColumnStatsCollector; +import com.bakdata.conquery.models.query.statistics.HistogramColumnDescription; import com.bakdata.conquery.models.query.statistics.ResultStatistics; -import com.bakdata.conquery.models.query.statistics.StringColumnStatsCollector; +import com.bakdata.conquery.models.query.statistics.StatisticsLabels; import com.bakdata.conquery.resources.ResourceConstants; import com.bakdata.conquery.resources.api.QueryResource; import com.bakdata.conquery.resources.hierarchies.HierarchyHelper; @@ -56,12 +56,12 @@ public void execute(String name, TestConquery testConquery) throws Exception { final UriBuilder uriBuilder = HierarchyHelper.hierarchicalPath(conquery.defaultApiURIBuilder(), QueryResource.class, "getDescription"); - final ResultStatistics results = conquery.getClient().target( - uriBuilder.buildFromMap(Map.of(ResourceConstants.QUERY, executionId.toString()))) + final ResultStatistics results = conquery.getClient().target(uriBuilder.buildFromMap(Map.of(ResourceConstants.QUERY, executionId.toString()))) .request() .acceptLanguage(Locale.ENGLISH) .get(ResultStatistics.class); + final StatisticsLabels labels = C10N.get(StatisticsLabels.class, Locale.ENGLISH); // We are using TreeMaps for Maps that have a defined order. 
final ResultStatistics expected = new ResultStatistics( @@ -84,14 +84,6 @@ public void execute(String name, TestConquery testConquery) throws Exception { )), Range.of( LocalDate.of(2021, 1, 1), LocalDate.of(2021, 10, 1) - ), - List.of( - LocalDate.of(2021, 1, 1), - LocalDate.of(2021, 1, 1), - LocalDate.of(2021, 1, 1), - LocalDate.of(2021, 1, 11), - LocalDate.of(2021, 1, 12), - LocalDate.of(2021, 10, 1) ) ), new DateColumnStatsCollector.ColumnDescription( @@ -111,117 +103,115 @@ public void execute(String name, TestConquery testConquery) throws Exception { )), Range.of( LocalDate.of(2021, 1, 1), LocalDate.of(2021, 10, 1) - ), - List.of( - LocalDate.of(2021, 1, 1), - LocalDate.of(2021, 1, 1), - LocalDate.of(2021, 1, 1), - LocalDate.of(2021, 1, 11), - LocalDate.of(2021, 1, 12), - LocalDate.of(2021, 10, 1) ) ), - new StringColumnStatsCollector.ColumnDescription( + new HistogramColumnDescription( "concept string", "concept string", null, - Map.of( - "a", 1L, - "b", 1L, - "c", 2L, - "d", 1L - ) + List.of( + new HistogramColumnDescription.Entry("c", 2), + new HistogramColumnDescription.Entry("a", 1), + new HistogramColumnDescription.Entry("b", 1), + new HistogramColumnDescription.Entry("d", 1) + ), + Map.of() ), - new StringColumnStatsCollector.ColumnDescription( + new HistogramColumnDescription( "concept mapped", "concept mapped", null, - Map.of( - "BEH", 1L, - "d", 1L, - "AH", 1L, - "CEH", 2L - ) + List.of( + new HistogramColumnDescription.Entry("CEH", 2), + new HistogramColumnDescription.Entry("AH", 1), + new HistogramColumnDescription.Entry("BEH", 1), + new HistogramColumnDescription.Entry("d", 1) + ), + Map.of() ), - new NumberColumnStatsCollector.ColumnDescription( + new HistogramColumnDescription( "concept int", "concept int", null, - "INTEGER", - 6, - 1, - 2.6, - 3.0, - 1.140175425099138, - 1, - 4, List.of( - 1, - 2, - 3, - 3, - 4 - ) + new HistogramColumnDescription.Entry("1", 1), + new HistogramColumnDescription.Entry("2", 1), + new 
HistogramColumnDescription.Entry("3", 2), + new HistogramColumnDescription.Entry("4", 1), + new HistogramColumnDescription.Entry("5", 0), + new HistogramColumnDescription.Entry("6", 0), + new HistogramColumnDescription.Entry("7", 0), + new HistogramColumnDescription.Entry("8", 0), + new HistogramColumnDescription.Entry("9", 0), + new HistogramColumnDescription.Entry("10", 0) + ), + Map.of(labels.p25(), "1.5", labels.p75(), "3.5", labels.max(), "4", labels.mean(), "2.6", labels.median(), "3", labels.min(), "1", labels.missing(), "1", labels.std(), "1.14", labels.sum(), "13", labels.count(), "5") ), - new NumberColumnStatsCollector.ColumnDescription( + new HistogramColumnDescription( "concept real", "concept real", null, - "NUMERIC", - 6, - 1, - 2.6, - 3.0, - 1.140175425099138, - 1, - 4, List.of( - new BigDecimal("1.0"), - new BigDecimal("2.0"), - new BigDecimal("3.0"), - new BigDecimal("3.0"), - new BigDecimal("4.0") - ) + new HistogramColumnDescription.Entry("0", 1), + new HistogramColumnDescription.Entry("(0 – 1)", 1), + new HistogramColumnDescription.Entry("[1 – 2)", 1), + new HistogramColumnDescription.Entry("[2 – 3)", 1), + new HistogramColumnDescription.Entry("[3 – 4)", 2), + new HistogramColumnDescription.Entry("[4 – 5)", 1), + new HistogramColumnDescription.Entry("[5 – 6)", 0), + new HistogramColumnDescription.Entry("[6 – 7)", 0), + new HistogramColumnDescription.Entry("[7 – 8)", 0), + new HistogramColumnDescription.Entry("[8 – 9)", 0), + new HistogramColumnDescription.Entry("[9 – 10)", 0) + ), + Map.of(labels.p25(), "0.1", labels.p75(), "3", labels.max(), "4", labels.mean(), "1.87", labels.median(), "2", labels.min(), "0", labels.missing(), "0", labels.std(), "1.55", labels.sum(), "13.1", labels.count(), "7") ), - new NumberColumnStatsCollector.ColumnDescription( + new HistogramColumnDescription( "concept decimal", "concept decimal", null, - "NUMERIC", - 6, - 1, - 2.6, - 3.0, - 1.140175425099138, - 1, - 4, List.of( - 1, - 2, - 3, - 3, - 4 - ) + new 
HistogramColumnDescription.Entry("[1 – 2)", 1), + new HistogramColumnDescription.Entry("[2 – 3)", 1), + new HistogramColumnDescription.Entry("[3 – 4)", 2), + new HistogramColumnDescription.Entry("[4 – 5)", 1), + new HistogramColumnDescription.Entry("[5 – 6)", 0), + new HistogramColumnDescription.Entry("[6 – 7)", 0), + new HistogramColumnDescription.Entry("[7 – 8)", 0), + new HistogramColumnDescription.Entry("[8 – 9)", 0), + new HistogramColumnDescription.Entry("[9 – 10)", 0), + new HistogramColumnDescription.Entry("[10 – 11)", 0) + ), + Map.of(labels.p25(), "1.5", labels.p75(), "3.5", labels.max(), "4", labels.mean(), "2.6", labels.median(), "3", labels.min(), "1", labels.missing(), "1", labels.std(), "1.14", labels.sum(), "13", labels.count(), "5") ), - new NumberColumnStatsCollector.ColumnDescription( + new HistogramColumnDescription( "concept money", "concept money", null, - "MONEY", - 6, - 1, - 26.0, - 30, - 11.40175425099138, - 10, - 40, List.of( - new BigDecimal("10.00"), - new BigDecimal("20.00"), - new BigDecimal("30.00"), - new BigDecimal("30.00"), - new BigDecimal("40.00") - ) + new HistogramColumnDescription.Entry("[€10.00 – €13.00)", 1), + new HistogramColumnDescription.Entry("[€13.00 – €16.00)", 0), + new HistogramColumnDescription.Entry("[€16.00 – €19.00)", 0), + new HistogramColumnDescription.Entry("[€19.00 – €22.00)", 1), + new HistogramColumnDescription.Entry("[€22.00 – €25.00)", 0), + new HistogramColumnDescription.Entry("[€25.00 – €28.00)", 0), + new HistogramColumnDescription.Entry("[€28.00 – €31.00)", 2), + new HistogramColumnDescription.Entry("[€31.00 – €34.00)", 0), + new HistogramColumnDescription.Entry("[€34.00 – €37.00)", 0), + new HistogramColumnDescription.Entry("[€37.00 – €40.00)", 0), + new HistogramColumnDescription.Entry("€40.00", 1) + ), + Map.of(labels.p25(), "€15.00", labels.p75(), "€35.00", labels.max(), "€40.00", labels.mean(), "€26.00", labels.median(), "€30.00", labels.min(), "€10.00", labels.missing(), "1", labels.std(), 
"11.4", labels.sum(), "€130.00", labels.count(), "5") + ), + new HistogramColumnDescription( + "concept boolean", + "concept boolean", + null, + List.of( + new HistogramColumnDescription.Entry("Yes", 4), + new HistogramColumnDescription.Entry("No", 1) + ), + Map.of() ) ), Range.of(LocalDate.of(2021, 1, 1), LocalDate.of(2021, 10, 1)) diff --git a/backend/src/test/java/com/bakdata/conquery/io/result/ResultTestUtil.java b/backend/src/test/java/com/bakdata/conquery/io/result/ResultTestUtil.java index 7127e8d27c..161c93b063 100644 --- a/backend/src/test/java/com/bakdata/conquery/io/result/ResultTestUtil.java +++ b/backend/src/test/java/com/bakdata/conquery/io/result/ResultTestUtil.java @@ -2,6 +2,7 @@ import java.util.Collections; import java.util.List; +import java.util.OptionalLong; import java.util.Set; import java.util.stream.Collectors; import java.util.stream.Stream; @@ -77,7 +78,7 @@ public List getResultInfos() { } @Override - public Stream streamResults() { + public Stream streamResults(OptionalLong maybeLimit) { return getTestEntityResults().stream(); } }; diff --git a/backend/src/test/java/com/bakdata/conquery/io/result/arrow/ArrowResultGenerationTest.java b/backend/src/test/java/com/bakdata/conquery/io/result/arrow/ArrowResultGenerationTest.java index 04ee93d866..3c641e37a8 100644 --- a/backend/src/test/java/com/bakdata/conquery/io/result/arrow/ArrowResultGenerationTest.java +++ b/backend/src/test/java/com/bakdata/conquery/io/result/arrow/ArrowResultGenerationTest.java @@ -1,9 +1,9 @@ package com.bakdata.conquery.io.result.arrow; import static com.bakdata.conquery.io.result.ResultTestUtil.*; -import static com.bakdata.conquery.io.result.arrow.ArrowUtil.generateFields; import static com.bakdata.conquery.io.result.arrow.ArrowRenderer.renderToStream; import static com.bakdata.conquery.io.result.arrow.ArrowUtil.ROOT_ALLOCATOR; +import static com.bakdata.conquery.io.result.arrow.ArrowUtil.generateFields; import static org.assertj.core.api.Assertions.assertThat; 
import java.io.ByteArrayInputStream; @@ -15,6 +15,7 @@ import java.util.List; import java.util.Locale; import java.util.Objects; +import java.util.OptionalLong; import java.util.StringJoiner; import java.util.stream.Collectors; import java.util.stream.Stream; @@ -135,7 +136,7 @@ void writeAndRead() throws IOException { new ArrowConfig(BATCH_SIZE), ResultTestUtil.ID_FIELDS, mquery.getResultInfos(), - mquery.streamResults() + mquery.streamResults(OptionalLong.empty()) ); InputStream inputStream = new ByteArrayInputStream(output.toByteArray()); diff --git a/backend/src/test/java/com/bakdata/conquery/io/result/csv/CsvResultGenerationTest.java b/backend/src/test/java/com/bakdata/conquery/io/result/csv/CsvResultGenerationTest.java index d6d25c2b71..e7d9e89183 100644 --- a/backend/src/test/java/com/bakdata/conquery/io/result/csv/CsvResultGenerationTest.java +++ b/backend/src/test/java/com/bakdata/conquery/io/result/csv/CsvResultGenerationTest.java @@ -8,6 +8,7 @@ import java.util.ArrayList; import java.util.List; import java.util.Locale; +import java.util.OptionalLong; import java.util.StringJoiner; import java.util.stream.Collectors; @@ -56,7 +57,7 @@ void writeAndRead() throws IOException { StringWriter writer = new StringWriter(); CsvRenderer renderer = new CsvRenderer(CONFIG.getCsv().createWriter(writer), printSettings); - renderer.toCSV(ResultTestUtil.ID_FIELDS, mquery.getResultInfos(), mquery.streamResults()); + renderer.toCSV(ResultTestUtil.ID_FIELDS, mquery.getResultInfos(), mquery.streamResults(OptionalLong.empty())); String computed = writer.toString(); diff --git a/backend/src/test/java/com/bakdata/conquery/io/result/excel/ExcelResultRenderTest.java b/backend/src/test/java/com/bakdata/conquery/io/result/excel/ExcelResultRenderTest.java index 76f6d97f02..b19db6dfc5 100644 --- a/backend/src/test/java/com/bakdata/conquery/io/result/excel/ExcelResultRenderTest.java +++ b/backend/src/test/java/com/bakdata/conquery/io/result/excel/ExcelResultRenderTest.java @@ -11,6 
+11,7 @@ import java.util.ArrayList; import java.util.List; import java.util.Locale; +import java.util.OptionalLong; import java.util.StringJoiner; import java.util.stream.Collectors; import java.util.stream.Stream; @@ -73,7 +74,7 @@ public List getResultInfos() { } @Override - public Stream streamResults() { + public Stream streamResults(OptionalLong maybeLimit) { return results.stream(); } }; @@ -86,7 +87,8 @@ public Stream streamResults() { renderer.renderToStream( ResultTestUtil.ID_FIELDS, mquery, - output); + output, OptionalLong.empty() + ); InputStream inputStream = new ByteArrayInputStream(output.toByteArray()); diff --git a/backend/src/test/java/com/bakdata/conquery/io/result/parquet/ParquetResultGenerationTest.java b/backend/src/test/java/com/bakdata/conquery/io/result/parquet/ParquetResultGenerationTest.java index 6055c03642..f991afa214 100644 --- a/backend/src/test/java/com/bakdata/conquery/io/result/parquet/ParquetResultGenerationTest.java +++ b/backend/src/test/java/com/bakdata/conquery/io/result/parquet/ParquetResultGenerationTest.java @@ -10,6 +10,7 @@ import java.io.IOException; import java.util.List; import java.util.Locale; +import java.util.OptionalLong; import java.util.StringJoiner; import java.util.stream.Collectors; @@ -114,7 +115,7 @@ void writeAndRead() throws IOException { // First we write to the buffer, than we read from it and parse it as TSV ByteArrayOutputStream output = new ByteArrayOutputStream(); - ParquetRenderer.writeToStream(output, ResultTestUtil.ID_FIELDS, managedQuery.getResultInfos(), printSettings, managedQuery.streamResults()); + ParquetRenderer.writeToStream(output, ResultTestUtil.ID_FIELDS, managedQuery.getResultInfos(), printSettings, managedQuery.streamResults(OptionalLong.empty())); final byte[] buf = output.toByteArray(); diff --git a/backend/src/test/java/com/bakdata/conquery/models/query/statistics/HistogramTest.java b/backend/src/test/java/com/bakdata/conquery/models/query/statistics/HistogramTest.java new file 
mode 100644
index 0000000000..052c74096a
--- /dev/null
+++ b/backend/src/test/java/com/bakdata/conquery/models/query/statistics/HistogramTest.java
@@ -0,0 +1,132 @@
+package com.bakdata.conquery.models.query.statistics;
+
+import static org.assertj.core.api.Assertions.assertThat;
+
+import java.util.List;
+import java.util.Random;
+
+import lombok.extern.slf4j.Slf4j;
+import org.assertj.core.data.Offset;
+import org.junit.jupiter.api.Test;
+
+/**
+ * Tests for the nodes reported by histograms created via {@link Histogram#zeroCentered}.
+ */
+@Slf4j
+class HistogramTest {
+
+    /** Seed for {@link Random}, fixed so the asserted counts are reproducible. */
+    public static final int SEED = 0xD00F;
+
+    /**
+     * Adds zeros and random samples from four ranges, then checks the node count and
+     * the min/max of the first and last node.
+     */
+    @Test
+    void plain() {
+        final Histogram histogram = Histogram.zeroCentered(0, 10, -2, 15, 10, false);
+        final Random random = new Random(SEED);
+
+        addZeros(histogram, 10);
+        addUniform(histogram, random, 100, -2, -1);
+        addUniform(histogram, random, 100, 0, 5);
+        addUniform(histogram, random, 100, 7, 10);
+        addUniform(histogram, random, 100, 10, 15);
+
+        final List<Histogram.Node> nodes = histogram.nodes();
+
+        log.info("{}", nodes);
+
+        assertThat(nodes).hasSize(13);
+
+        final Histogram.Node first = nodes.get(0);
+        assertThat(first.getMin()).isLessThanOrEqualTo(-1);
+
+        // NOTE(review): disabled in the original; the reason is not visible here.
+        // assertThat(nodes.get(7).getCount()).isZero();
+        // assertThat(nodes.get(8).getCount()).isZero();
+
+        final Histogram.Node last = nodes.get(nodes.size() - 1);
+        assertThat(last.getMin()).isCloseTo(10, Offset.offset(0.2d));
+        assertThat(last.getMax()).isGreaterThanOrEqualTo(11);
+
+        assertMinNotAboveMax(nodes);
+    }
+
+    /**
+     * Same checks as {@link #plain()} with different histogram arguments and a single
+     * sample range, plus the counts of the nodes at index 7 and 8.
+     */
+    @Test
+    void weird() {
+        final Histogram histogram = Histogram.zeroCentered(-35, 27, -40, 28, 12, false);
+        final Random random = new Random(SEED);
+
+        addZeros(histogram, 10);
+        // NOTE(review): samples go up to 38 while the largest value passed above is 28 - confirm this is intended.
+        addUniform(histogram, random, 100, -40, 38);
+
+        final List<Histogram.Node> nodes = histogram.nodes();
+
+        log.info("{}", nodes);
+
+        assertThat(nodes).hasSize(15 /*12 + lower, zero, upper*/);
+
+        final Histogram.Node first = nodes.get(0);
+        assertThat(first.getMin()).isLessThanOrEqualTo(-1);
+
+        assertThat(nodes.get(7).getCount()).isEqualTo(5);
+        assertThat(nodes.get(8).getCount()).isEqualTo(10);
+
+        final Histogram.Node last = nodes.get(nodes.size() - 1);
+        assertThat(last.getMin()).isCloseTo(26, Offset.offset(0.2d));
+        assertThat(last.getMax()).isGreaterThanOrEqualTo(28);
+
+        assertMinNotAboveMax(nodes);
+    }
+
+    /**
+     * Creates a histogram whose first two arguments are both 0.0 and expects exactly three nodes.
+     */
+    @Test
+    void zeroWidth() {
+        final Histogram histogram = Histogram.zeroCentered(0.0, 0.0, -1.0, 1.0, 10, true);
+        // Should be only zero-bin, under and overflow bins
+        histogram.add(0);
+        histogram.add(-0.5);
+        histogram.add(0.5);
+
+        assertThat(histogram.nodes()).hasSize(3);
+    }
+
+    /** Adds {@code count} zero values to {@code histogram}. */
+    private static void addZeros(Histogram histogram, int count) {
+        for (int it = 0; it < count; it++) {
+            histogram.add(0);
+        }
+    }
+
+    /** Adds {@code count} samples drawn via {@code random.nextDouble(origin, bound)} to {@code histogram}. */
+    private static void addUniform(Histogram histogram, Random random, int count, double origin, double bound) {
+        for (int it = 0; it < count; it++) {
+            histogram.add(random.nextDouble(origin, bound));
+        }
+    }
+
+    /**
+     * Asserts for every node that its minimum does not exceed its maximum.
+     *
+     * NOTE(review): the original loops also compared {@code getMin()} and {@code getMax()}
+     * with themselves, which can never fail; those checks were presumably meant to compare
+     * against the bin bounds or the neighbouring nodes - TODO confirm and add real assertions.
+     */
+    private static void assertMinNotAboveMax(List<Histogram.Node> nodes) {
+        for (int i = 0; i < nodes.size(); i++) {
+            final Histogram.Node node = nodes.get(i);
+
+            assertThat(node.getMin())
+                    .as("[%d]=%s lower < upper", i, node)
+                    .isLessThanOrEqualTo(node.getMax());
+        }
+    }
+}
\ No newline at end of file
diff --git a/backend/src/test/resources/tests/query/QUERY_STATISTICS_TESTS/SIMPLE_TREECONCEPT_Query.json b/backend/src/test/resources/tests/query/QUERY_STATISTICS_TESTS/SIMPLE_TREECONCEPT_Query.json
index ab7d79373b..cad17eed49 100644
---
a/backend/src/test/resources/tests/query/QUERY_STATISTICS_TESTS/SIMPLE_TREECONCEPT_Query.json +++ b/backend/src/test/resources/tests/query/QUERY_STATISTICS_TESTS/SIMPLE_TREECONCEPT_Query.json @@ -20,7 +20,8 @@ "concept.connector.int", "concept.connector.real", "concept.connector.decimal", - "concept.connector.money" + "concept.connector.money", + "concept.connector.boolean" ] } ] @@ -71,7 +72,7 @@ { "name": "real", "column": "table.real", - "type": "SUM" + "type": "DISTINCT" }, { "name": "decimal", @@ -82,6 +83,11 @@ "name": "money", "column": "table.money", "type": "SUM" + }, + { + "name": "boolean", + "column": "table.boolean", + "type": "FIRST" } ], "filters": [ @@ -131,6 +137,10 @@ { "name": "money", "type": "MONEY" + }, + { + "name": "boolean", + "type": "BOOLEAN" } ] } diff --git a/backend/src/test/resources/tests/query/QUERY_STATISTICS_TESTS/content.csv b/backend/src/test/resources/tests/query/QUERY_STATISTICS_TESTS/content.csv index 21ddd2a870..ca12ac91c1 100644 --- a/backend/src/test/resources/tests/query/QUERY_STATISTICS_TESTS/content.csv +++ b/backend/src/test/resources/tests/query/QUERY_STATISTICS_TESTS/content.csv @@ -1,7 +1,9 @@ -id,date,string,mapped,int,real,decimal,money -1,2021-01-01,a,0,1,1,1,10 -2,2021-10-01,b,0,2,2,2,20 -3,2021-01-01,c,0,3,3,3,30 -4,2021-01-01,c,1,3,3,3,30 -5,2021-01-11,d,1,4,4,4,40 -6,2021-01-12,,,,,, +id,date,string,mapped,int,real,decimal,money,boolean +1,2021-01-01,,,,0,,, +1,2021-01-01,,,,0.1,,, +1,2021-01-01,a,0,1,1,1,10,true +2,2021-10-01,b,0,2,2,2,20,true +3,2021-01-01,c,0,3,3,3,30,true +4,2021-01-01,c,1,3,3,3,30,true +5,2021-01-11,d,1,4,4,4,40,false +6,2021-01-12,,,,,,,