From c7575bfe75a792047046f4b18f192f7b210d3dc7 Mon Sep 17 00:00:00 2001
From: 51-code <146736881+51-code@users.noreply.github.com>
Date: Mon, 4 Dec 2023 13:01:52 +0200
Subject: [PATCH] Fix: Sort strings as numbers in auto sort if the strings are numerical (#7)

---
 .../functions/dpf_02/SortOperation.java       | 26 ++++++++++++++++---
 1 file changed, 23 insertions(+), 3 deletions(-)

diff --git a/src/main/java/com/teragrep/functions/dpf_02/SortOperation.java b/src/main/java/com/teragrep/functions/dpf_02/SortOperation.java
index 3f33fa3..fd21dfa 100644
--- a/src/main/java/com/teragrep/functions/dpf_02/SortOperation.java
+++ b/src/main/java/com/teragrep/functions/dpf_02/SortOperation.java
@@ -83,7 +83,7 @@ public Dataset<Row> orderDatasetByGivenColumns(Dataset<Row> ds) {
         if (this.listOfSortByClauses != null && this.listOfSortByClauses.size() > 0) {
             for (SortByClause sbc : listOfSortByClauses) {
                 if (sbc.getSortAsType() == SortByClause.Type.AUTOMATIC) {
-                    SortByClause.Type autoType = detectSortByType(ds.schema().fields(), sbc.getFieldName());
+                    SortByClause.Type autoType = detectSortByType(ds, sbc.getFieldName());
                     ds = orderDatasetBySortByClause(ss, ds, sbc, autoType);
                 }
                 else {
@@ -131,12 +131,13 @@ private Dataset<Row> orderDatasetBySortByClause(final SparkSession ss, final Dat
     }
 
     // detect sorting type if auto() was used in sort
-    private SortByClause.Type detectSortByType(final StructField[] fields, final String fieldName) {
+    private SortByClause.Type detectSortByType(final Dataset<Row> ds, final String fieldName) {
+        StructField[] fields = ds.schema().fields();
         for (StructField field : fields) {
             if (field.name().equals(fieldName)) {
                 switch (field.dataType().typeName()) {
                     case "string": // ip address?
-                        return SortByClause.Type.STRING;
+                        return numericalStringCheck(ds, fieldName);
                     case "long":
                     case "integer":
                     case "float":
@@ -151,4 +152,23 @@ private SortByClause.Type detectSortByType(final StructField[] fields, final Str
         }
         return SortByClause.Type.DEFAULT;
     }
+
+    /**
+     * Checks whether a string-typed column holds only values that can be cast to Double.
+     * @param dataset dataset to check
+     * @param fieldName name of the column
+     * @return NUMERIC if no row's cast to Double comes back null, otherwise STRING
+     */
+    private SortByClause.Type numericalStringCheck(Dataset<Row> dataset, String fieldName) {
+        // A value counts as numerical when casting it to Double does not yield null
+        Dataset<Row> tempDataset = dataset.withColumn("isNumerical", functions.col(fieldName).cast(DataTypes.DoubleType).isNotNull());
+
+        // isNumerical is a boolean column: negate it directly rather than string-matching it via contains(false)
+        if (tempDataset.filter(functions.not(tempDataset.col("isNumerical"))).isEmpty()) {
+            // no non-numerical values found
+            return SortByClause.Type.NUMERIC;
+        } else {
+            return SortByClause.Type.STRING;
+        }
+    }
 }
\ No newline at end of file