Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add BETWEEN support to NumericalFilterOptimizer #14163

Merged
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -24,42 +24,51 @@
import org.apache.pinot.common.request.Expression;
import org.apache.pinot.common.request.ExpressionType;
import org.apache.pinot.common.request.Function;
import org.apache.pinot.common.request.Literal;
import org.apache.pinot.spi.data.FieldSpec;
import org.apache.pinot.spi.data.FieldSpec.DataType;
import org.apache.pinot.spi.data.Schema;
import org.apache.pinot.sql.FilterKind;


/**
* Numerical expressions of form "column <operator> literal", where operator can be '=', '!=', '>', '>=', '<', or '<=',
* can compare a column of one datatype (say INT) with a literal of different datatype (say DOUBLE). These expressions
* can not be evaluated on the Server. Hence, we rewrite such expressions into an equivalent expression whose LHS and
* RHS are of the same datatype.
*
* Numerical expressions of the form "column [operator] literal", where operator can be '=', '!=', '>', '>=', '<', '<=',
* or 'BETWEEN' can compare a column of one datatype (say INT) with a literal of different datatype (say DOUBLE). These
* expressions can not be evaluated on the Server. Hence, we rewrite such expressions into an equivalent expression
* whose LHS and RHS are of the same datatype.
* <p>
* Simple predicate examples:
* 1) WHERE "intColumn = 5.0" gets rewritten to "WHERE intColumn = 5"
* 2) WHERE "intColumn != 5.0" gets rewritten to "WHERE intColumn != 5"
* 3) WHERE "intColumn = 5.5" gets rewritten to "WHERE false" because INT values can not match 5.5.
* 4) WHERE "intColumn = 3000000000" gets rewritten to "WHERE false" because INT values can not match 3000000000.
* 5) WHERE "intColumn != 3000000000" gets rewritten to "WHERE true" because INT values always not equal to 3000000000.
* 6) WHERE "intColumn < 5.1" gets rewritten to "WHERE intColumn <= 5"
* 7) WHERE "intColumn > -3E9" gets rewritten to "WHERE true" because int values are always greater than -3E9.
*
* <ol>
* <li> "WHERE intColumn = 5.0" gets rewritten to "WHERE intColumn = 5"
* <li> "WHERE intColumn != 5.0" gets rewritten to "WHERE intColumn != 5"
* <li> "WHERE intColumn = 5.5" gets rewritten to "WHERE false" because INT values can not match 5.5.
* <li> "WHERE intColumn = 3000000000" gets rewritten to "WHERE false" because INT values can not match 3000000000.
* <li> "WHERE intColumn != 3000000000" gets rewritten to "WHERE true" because INT values always not equal to
* 3000000000.
* <li> "WHERE intColumn < 5.1" gets rewritten to "WHERE intColumn <= 5"
* <li> "WHERE intColumn > -3E9" gets rewritten to "WHERE true" because int values are always greater than -3E9.
* <li> "WHERE intColumn BETWEEN 2.5 AND 7.5" gets rewritten to "WHERE intColumn BETWEEN 3 AND 7"
* <li> "WHERE intColumn BETWEEN 5.5 AND 3000000000" gets rewritten to "WHERE intColumn BETWEEN 6 AND 2147483647" since
* 3000000000 is greater than Integer.MAX_VALUE.
* <li> "WHERE intColumn BETWEEN 10 AND 0" gets rewritten to "WHERE false" because lower bound is greater than upper
* bound.
* </ol>
* <p>
* Compound predicate examples:
* 8) WHERE "intColumn1 = 5.5 AND intColumn2 = intColumn3"
* <ol>
* <li> "WHERE intColumn1 = 5.5 AND intColumn2 = intColumn3"
* rewrite to "WHERE false AND intColumn2 = intColumn3"
* rewrite to "WHERE intColumn2 = intColumn3"
* 9) WHERE "intColumn1 != 5.5 OR intColumn2 = 5000000000" (5000000000 is out of bounds for integer column)
* <li> "WHERE intColumn1 != 5.5 OR intColumn2 = 5000000000" (5000000000 is out of bounds for integer column)
* rewrite to "WHERE true OR false"
* rewrite to "WHERE true"
* rewrite to query without any WHERE clause.
*
* </ol>
* <p>
* When entire predicate gets rewritten to false (Example 3 above), the query will not return any data. Hence, it is
* better for the Broker itself to return an empty response rather than sending the query to servers for further
* evaluation.
*
* TODO: Add support for BETWEEN, IN, and NOT IN operators.
* <p>
* TODO: Add support for IN, and NOT IN operators.
*/
public class NumericalFilterOptimizer extends BaseAndOrBooleanFilterOptimizer {

Expand All @@ -74,33 +83,39 @@ boolean canBeOptimized(Expression filterExpression, @Nullable Schema schema) {
Expression optimizeChild(Expression filterExpression, @Nullable Schema schema) {
Function function = filterExpression.getFunctionCall();
FilterKind kind = FilterKind.valueOf(function.getOperator());

if (!kind.isRange() && kind != FilterKind.EQUALS && kind != FilterKind.NOT_EQUALS) {
return filterExpression;
}

List<Expression> operands = function.getOperands();
// Verify that LHS is a numeric column and RHS is a literal before rewriting.
Expression lhs = operands.get(0);
Expression rhs = operands.get(1);

DataType dataType = getDataType(lhs, schema);
if (dataType == null || !dataType.isNumeric() || !rhs.isSetLiteral()) {
// No rewrite here
return filterExpression;
}

switch (kind) {
case IS_NULL:
case IS_NOT_NULL:
// No need to try to optimize IS_NULL and IS_NOT_NULL operations on numerical columns.
break;
default:
List<Expression> operands = function.getOperands();
// Verify that LHS is a numeric column and RHS is a numeric literal before rewriting.
Expression lhs = operands.get(0);
Expression rhs = operands.get(1);
if (isNumericLiteral(rhs)) {
DataType dataType = getDataType(lhs, schema);
if (dataType != null && dataType.isNumeric()) {
switch (kind) {
case EQUALS:
case NOT_EQUALS:
return rewriteEqualsExpression(filterExpression, kind, dataType, rhs);
case GREATER_THAN:
case GREATER_THAN_OR_EQUAL:
case LESS_THAN:
case LESS_THAN_OR_EQUAL:
return rewriteRangeExpression(filterExpression, kind, dataType, rhs);
default:
break;
}
}
case BETWEEN: {
return rewriteBetweenExpression(filterExpression, dataType);
}
case EQUALS:
case NOT_EQUALS:
case GREATER_THAN:
case GREATER_THAN_OR_EQUAL:
case LESS_THAN:
case LESS_THAN_OR_EQUAL: {
if (kind.isRange()) {
return rewriteRangeExpression(filterExpression, kind, dataType, rhs);
} else {
return rewriteEqualsExpression(filterExpression, kind, dataType, rhs);
}
}
default:
break;
}
return filterExpression;
Expand Down Expand Up @@ -346,6 +361,159 @@ private static Expression rewriteRangeExpression(Expression range, FilterKind ki
return range;
}

/**
* Rewrite expressions of the form "column BETWEEN lower AND upper" to ensure that lower and upper bounds are the same
* datatype as the column (or can be cast to the same datatype in the server).
*/
private static Expression rewriteBetweenExpression(Expression between, DataType dataType) {
// TODO: Consider unifying logic with rewriteRangeExpression
List<Expression> operands = between.getFunctionCall().getOperands();
Expression lower = operands.get(1);
Expression upper = operands.get(2);

// The BETWEEN filter predicate currently only supports literals as lower and upper bounds, but we're still checking
// here just in case.
if (lower.isSetLiteral()) {
switch (lower.getLiteral().getSetField()) {
case LONG_VALUE: {
long actual = lower.getLiteral().getLongValue();
// Other data types can be converted on the server side.
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Seems like this is a reduced version of the existing rewriteRangeExpression(). To keep the behavior consistent, are we able to extract out the logic of rewriting lower bound and upper bound, then use the common logic for both rewriteRangeExpression() and rewriteBetweenExpression()?

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I did consider consolidating the two but the major difference is that in rewriteRangeExpression we're rewriting the operators too (and avoiding manually modifying the literal value other than the cast) which we don't want to do for BETWEEN (as discussed in #14111). The other alternative would be to change rewriteRangeExpression to avoid rewriting the operators but I thought it'd be better not to change the behavior of the existing optimizer?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This part I still don't follow. Do you mean we should not rewrite BETWEEN the same way as other range filters, or is it too complicated? As long as we computed lower and upper bound, we should be able to assemble it back to a BETWEEN.
It is also fine to do it separately

Copy link
Collaborator Author

@yashmayya yashmayya Oct 9, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Do you mean we should not rewrite BETWEEN the same way as other range filters

Yeah, basically this. For instance, taking intCol >= 2.5 as an example. 2.5 is cast to 2 (int), and then the >= is rewritten to > because actual - converted > 0 resulting in intCol > 2. For BETWEEN, we want to instead rewrite intCol BETWEEN 2.5 AND y to intCol BETWEEN 3 AND y. We could change the logic for regular range filter to rewrite intCol >= 2.5 to intCol >= 3 instead to match the BETWEEN rewrite logic - is that what you're suggesting? There are some other differences too though. For instance, floatCol < longLiteral can be rewritten to floatCol <= castedFloatLiteral depending on the comparison between longLiteral and castedFloatLiteral. We can't do the same for BETWEEN though, and we simply skip any conversion in these cases, allowing the server to do the cast. Given these differences, it seemed better overall to keep these rewrites separate, what do you think?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Thanks for the detailed explanation. Ideally we want the behavior to be consistent across all range filters. Given the difficulties and the scope of this PR, let's add a TODO to revisit in the future

if (dataType == DataType.INT) {
if (actual > Integer.MAX_VALUE) {
// Lower bound literal value is greater than the bounds of INT.
return getExpressionFromBoolean(false);
}
if (actual < Integer.MIN_VALUE) {
lower.getLiteral().setIntValue(Integer.MIN_VALUE);
}
}
break;
}
case DOUBLE_VALUE: {
double actual = lower.getLiteral().getDoubleValue();

switch (dataType) {
case INT: {
if (actual > Integer.MAX_VALUE) {
// Lower bound literal value is greater than the bounds of INT.
return getExpressionFromBoolean(false);
}
if (actual < Integer.MIN_VALUE) {
lower.getLiteral().setIntValue(Integer.MIN_VALUE);
} else {
// Double value is in int range
int converted = (int) actual;
int comparison = BigDecimal.valueOf(converted).compareTo(BigDecimal.valueOf(actual));
if (comparison >= 0) {
lower.getLiteral().setIntValue(converted);
} else {
lower.getLiteral().setIntValue(converted + 1);
}
}
break;
}
case LONG: {
if (actual > Long.MAX_VALUE) {
// Lower bound literal value is greater than the bounds of LONG.
return getExpressionFromBoolean(false);
}
if (actual < Long.MIN_VALUE) {
lower.getLiteral().setLongValue(Long.MIN_VALUE);
} else {
// Double value is in long range
long converted = (long) actual;
int comparison = BigDecimal.valueOf(converted).compareTo(BigDecimal.valueOf(actual));
if (comparison >= 0) {
lower.getLiteral().setLongValue(converted);
} else {
lower.getLiteral().setLongValue(converted + 1);
}
}
break;
}
default:
// For other numeric data types, the double literal can be converted on the server side.
break;
}
break;
}
default:
break;
}
}

if (upper.isSetLiteral()) {
switch (upper.getLiteral().getSetField()) {
case LONG_VALUE: {
long actual = upper.getLiteral().getLongValue();
// Other data types can be converted on the server side.
if (dataType == DataType.INT) {
if (actual < Integer.MIN_VALUE) {
// Upper bound literal value is lesser than the bounds of INT.
return getExpressionFromBoolean(false);
}
if (actual > Integer.MAX_VALUE) {
upper.getLiteral().setIntValue(Integer.MAX_VALUE);
}
}
break;
}
case DOUBLE_VALUE: {
double actual = upper.getLiteral().getDoubleValue();

switch (dataType) {
case INT: {
if (actual < Integer.MIN_VALUE) {
// Upper bound literal value is lesser than the bounds of INT.
return getExpressionFromBoolean(false);
}
if (actual > Integer.MAX_VALUE) {
upper.getLiteral().setIntValue(Integer.MAX_VALUE);
} else {
// Double value is in int range
int converted = (int) actual;
int comparison = BigDecimal.valueOf(converted).compareTo(BigDecimal.valueOf(actual));
if (comparison <= 0) {
upper.getLiteral().setIntValue(converted);
} else {
upper.getLiteral().setIntValue(converted - 1);
}
}
break;
}
case LONG: {
if (actual < Long.MIN_VALUE) {
// Upper bound literal value is lesser than the bounds of LONG.
return getExpressionFromBoolean(false);
}
if (actual > Long.MAX_VALUE) {
upper.getLiteral().setLongValue(Long.MAX_VALUE);
} else {
// Double value is in long range
long converted = (long) actual;
int comparison = BigDecimal.valueOf(converted).compareTo(BigDecimal.valueOf(actual));
if (comparison <= 0) {
upper.getLiteral().setLongValue(converted);
} else {
upper.getLiteral().setLongValue(converted - 1);
}
}
break;
}
default:
// For other numeric data types, the double literal can be converted on the server side.
break;
}
break;
}
default:
break;
}
}

return between;
}

/**
* Helper function to rewrite range operator of a range expression.
* @param range Range expression.
Expand All @@ -356,9 +524,9 @@ private static void rewriteRangeOperator(Expression range, FilterKind kind, int
if (comparison > 0) {
// Literal value is greater than the converted value, so rewrite:
// "column > literal" to "column > converted"
// "column >= literal" to "column >= converted"
// "column >= literal" to "column > converted"
// "column < literal" to "column <= converted"
// "column <= literal" to "column < converted"
// "column <= literal" to "column <= converted"
if (kind == FilterKind.GREATER_THAN || kind == FilterKind.GREATER_THAN_OR_EQUAL) {
range.getFunctionCall().setOperator(FilterKind.GREATER_THAN.name());
} else if (kind == FilterKind.LESS_THAN || kind == FilterKind.LESS_THAN_OR_EQUAL) {
Expand Down Expand Up @@ -413,21 +581,4 @@ private static DataType getDataType(Expression expression, Schema schema) {
}
return null;
}

/** @return true if expression is a numeric literal; otherwise, false. */
private static boolean isNumericLiteral(Expression expression) {
if (expression.getType() == ExpressionType.LITERAL) {
Literal._Fields type = expression.getLiteral().getSetField();
switch (type) {
case INT_VALUE:
case LONG_VALUE:
case FLOAT_VALUE:
case DOUBLE_VALUE:
return true;
default:
break;
}
}
return false;
}
}
Loading
Loading