Skip to content

Commit

Permalink
Fix: Sort strings as numbers in auto sort if the strings are numerical (
Browse files Browse the repository at this point in the history
  • Loading branch information
51-code authored Dec 4, 2023
1 parent a5da6f1 commit c7575bf
Showing 1 changed file with 23 additions and 3 deletions.
26 changes: 23 additions & 3 deletions src/main/java/com/teragrep/functions/dpf_02/SortOperation.java
Original file line number Diff line number Diff line change
Expand Up @@ -83,7 +83,7 @@ public Dataset<Row> orderDatasetByGivenColumns(Dataset<Row> ds) {
if (this.listOfSortByClauses != null && this.listOfSortByClauses.size() > 0) {
for (SortByClause sbc : listOfSortByClauses) {
if (sbc.getSortAsType() == SortByClause.Type.AUTOMATIC) {
SortByClause.Type autoType = detectSortByType(ds.schema().fields(), sbc.getFieldName());
SortByClause.Type autoType = detectSortByType(ds, sbc.getFieldName());
ds = orderDatasetBySortByClause(ss, ds, sbc, autoType);
}
else {
Expand Down Expand Up @@ -131,12 +131,13 @@ private Dataset<Row> orderDatasetBySortByClause(final SparkSession ss, final Dat
}

// detect sorting type if auto() was used in sort
private SortByClause.Type detectSortByType(final StructField[] fields, final String fieldName) {
private SortByClause.Type detectSortByType(final Dataset<Row> ds, final String fieldName) {
StructField[] fields = ds.schema().fields();
for (StructField field : fields) {
if (field.name().equals(fieldName)) {
switch (field.dataType().typeName()) {
case "string": // ip address?
return SortByClause.Type.STRING;
return numericalStringCheck(ds, fieldName);
case "long":
case "integer":
case "float":
Expand All @@ -151,4 +152,23 @@ private SortByClause.Type detectSortByType(final StructField[] fields, final Str
}
return SortByClause.Type.DEFAULT;
}

/**
 * Decides whether a column that is typed as string should be sorted as numbers.
 * <p>
 * A value is treated as numerical when casting it to Double produces a non-null result
 * (this relies on the cast yielding null, rather than failing, for non-numerical input).
 * The column is reported as NUMERIC only when no row produces a null cast result, so:
 * <ul>
 *   <li>a single non-numerical value makes the whole column STRING;</li>
 *   <li>a null value in the column also casts to null and therefore makes the column STRING;</li>
 *   <li>a dataset without rows is reported as NUMERIC.</li>
 * </ul>
 * Calling this method evaluates the dataset, because {@code isEmpty()} has to look for a row.
 *
 * @param dataset dataset to check
 * @param fieldName name of the column
 * @return Numeric or String SortByClause.Type
 */
private SortByClause.Type numericalStringCheck(Dataset<Row> dataset, String fieldName) {
    // Filter directly on the cast result: a null here marks a value that is not castable
    // to Double. This avoids adding a scratch column and avoids Column.contains(), which is
    // a substring test on strings and only matched booleans through an implicit string cast.
    final Dataset<Row> nonNumericalRows = dataset
            .filter(functions.col(fieldName).cast(DataTypes.DoubleType).isNull());

    // No offending rows found means every value could be read as a number.
    return nonNumericalRows.isEmpty() ? SortByClause.Type.NUMERIC : SortByClause.Type.STRING;
}
}

0 comments on commit c7575bf

Please sign in to comment.