Skip to content

Commit

Permalink
First/Last reduction and cleanup of agg APIs (#839)
Browse files Browse the repository at this point in the history
Signed-off-by: Robert (Bobby) Evans <bobby@apache.org>
  • Loading branch information
revans2 authored Sep 23, 2020
1 parent ee8a778 commit 1a2b17e
Show file tree
Hide file tree
Showing 5 changed files with 95 additions and 97 deletions.
48 changes: 48 additions & 0 deletions integration_tests/src/main/python/hash_aggregate_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -348,3 +348,51 @@ def test_count_distinct_with_nan_floats(data_gen):

# TODO: Literal tests
# TODO: First and Last tests

# REDUCTIONS

non_nan_all_basic_gens = [byte_gen, short_gen, int_gen, long_gen,
# nans and -0.0 cannot work because of nan support in min/max, -0.0 == 0.0 in cudf for distinct and
# https://github.com/NVIDIA/spark-rapids/issues/84 in the ordering
FloatGen(no_nans=True, special_cases=[]), DoubleGen(no_nans=True, special_cases=[]),
string_gen, boolean_gen, date_gen, timestamp_gen]


@pytest.mark.parametrize('data_gen', non_nan_all_basic_gens, ids=idfn)
def test_generic_reductions(data_gen):
assert_gpu_and_cpu_are_equal_collect(
# Coalesce and sort are to make sure that first and last, which are non-deterministic
# become deterministic
lambda spark : binary_op_df(spark, data_gen)\
.coalesce(1)\
.sortWithinPartitions('b').selectExpr(
'min(a)',
'max(a)',
'first(a)',
'last(a)',
'count(a)',
'count(1)'),
conf = _no_nans_float_conf)

@pytest.mark.parametrize('data_gen', non_nan_all_basic_gens, ids=idfn)
def test_distinct_count_reductions(data_gen):
assert_gpu_and_cpu_are_equal_collect(
lambda spark : binary_op_df(spark, data_gen).selectExpr(
'count(DISTINCT a)'))

@pytest.mark.xfail(reason='https://github.com/NVIDIA/spark-rapids/issues/837')
@pytest.mark.parametrize('data_gen', [float_gen, double_gen], ids=idfn)
def test_distinct_float_count_reductions(data_gen):
assert_gpu_and_cpu_are_equal_collect(
lambda spark : binary_op_df(spark, data_gen).selectExpr(
'count(DISTINCT a)'))

@approximate_float
@pytest.mark.parametrize('data_gen', numeric_gens, ids=idfn)
def test_arithmetic_reductions(data_gen):
assert_gpu_and_cpu_are_equal_collect(
lambda spark : unary_op_df(spark, data_gen).selectExpr(
'sum(a)',
'avg(a)'),
conf = _no_nans_float_conf)

Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@

package com.nvidia.spark.rapids

import ai.rapids.cudf.{DType, Table, WindowAggregate, WindowOptions}
import ai.rapids.cudf.{Aggregation, AggregationOverWindow, DType, Table, WindowOptions}
import com.nvidia.spark.rapids.GpuOverrides.wrapExpr

import org.apache.spark.sql.catalyst.analysis.TypeCheckResult
Expand Down Expand Up @@ -243,11 +243,11 @@ object GpuWindowExpression {
def getRowBasedWindowFrame(columnIndex : Int,
aggExpression : Expression,
windowSpec : GpuSpecifiedWindowFrame)
: WindowAggregate = {
: AggregationOverWindow = {

// FIXME: Currently, only negative or 0 values are supported.
var lower = getBoundaryValue(windowSpec.lower)
if(lower > 0) {
if (lower > 0) {
throw new IllegalStateException(
s"Lower-bounds ahead of current row is not supported. Found $lower")
}
Expand All @@ -272,32 +272,33 @@ object GpuWindowExpression {
val windowOption = WindowOptions.builder().minPeriods(1)
.window(lower, upper).build()

aggExpression match {
val agg: Aggregation = aggExpression match {
case gpuAggregateExpression : GpuAggregateExpression =>
gpuAggregateExpression.aggregateFunction match {
case GpuCount(_) => WindowAggregate.count(columnIndex, windowOption)
case GpuSum(_) => WindowAggregate.sum(columnIndex, windowOption)
case GpuMin(_) => WindowAggregate.min(columnIndex, windowOption)
case GpuMax(_) => WindowAggregate.max(columnIndex, windowOption)
case GpuCount(_) => Aggregation.count()
case GpuSum(_) => Aggregation.sum()
case GpuMin(_) => Aggregation.min()
case GpuMax(_) => Aggregation.max()
case anythingElse =>
throw new UnsupportedOperationException(
s"Unsupported aggregation: ${anythingElse.prettyName}")
}
case _: GpuRowNumber =>
// ROW_NUMBER does not depend on input column values.
WindowAggregate.row_number(0, windowOption)
// ROW_NUMBER does not depend on input column values, but it still should be fine
Aggregation.rowNumber()
case anythingElse =>
throw new UnsupportedOperationException(
s"Unsupported window aggregation: ${anythingElse.prettyName}")
}
agg.onColumn(columnIndex).overWindow(windowOption)
}

def getRangeBasedWindowFrame(aggColumnIndex : Int,
timeColumnIndex : Int,
aggExpression : Expression,
windowSpec : GpuSpecifiedWindowFrame,
timestampIsAscending : Boolean)
: WindowAggregate = {
: AggregationOverWindow = {

// FIXME: Currently, only negative or 0 values are supported.
var lower = getBoundaryValue(windowSpec.lower)
Expand Down Expand Up @@ -332,12 +333,12 @@ object GpuWindowExpression {

val windowOption = windowOptionBuilder.build()

aggExpression match {
val agg: Aggregation = aggExpression match {
case gpuAggExpression : GpuAggregateExpression => gpuAggExpression.aggregateFunction match {
case GpuCount(_) => WindowAggregate.count(aggColumnIndex, windowOption)
case GpuSum(_) => WindowAggregate.sum(aggColumnIndex, windowOption)
case GpuMin(_) => WindowAggregate.min(aggColumnIndex, windowOption)
case GpuMax(_) => WindowAggregate.max(aggColumnIndex, windowOption)
case GpuCount(_) => Aggregation.count()
case GpuSum(_) => Aggregation.sum()
case GpuMin(_) => Aggregation.min()
case GpuMax(_) => Aggregation.max()
case anythingElse =>
throw new UnsupportedOperationException(
s"Unsupported aggregation: ${anythingElse.prettyName}")
Expand All @@ -346,6 +347,7 @@ object GpuWindowExpression {
throw new UnsupportedOperationException(
s"Unsupported window aggregation: ${anythingElse.prettyName}")
}
agg.onColumn(aggColumnIndex).overWindow(windowOption)
}

def getBoundaryValue(boundary : Expression) : Int = boundary match {
Expand Down
18 changes: 2 additions & 16 deletions sql-plugin/src/main/scala/com/nvidia/spark/rapids/aggregate.scala
Original file line number Diff line number Diff line change
Expand Up @@ -98,13 +98,6 @@ class GpuHashAggregateMeta(
resultExpressions

override def tagPlanForGpu(): Unit = {
if (agg.groupingExpressions.isEmpty) {
// first/last reductions not supported yet
if (agg.aggregateExpressions.exists(e => e.aggregateFunction.isInstanceOf[First] ||
e.aggregateFunction.isInstanceOf[Last])) {
willNotWorkOnGpu("First/Last reductions are not supported on GPU")
}
}
if (agg.resultExpressions.isEmpty) {
willNotWorkOnGpu("result expressions is empty")
}
Expand Down Expand Up @@ -192,13 +185,6 @@ class GpuSortAggregateMeta(
resultExpressions

override def tagPlanForGpu(): Unit = {
if (agg.groupingExpressions.isEmpty) {
// first/last reductions not supported yet
if (agg.aggregateExpressions.exists(e => e.aggregateFunction.isInstanceOf[First] ||
e.aggregateFunction.isInstanceOf[Last])) {
willNotWorkOnGpu("First/Last reductions are not supported on GPU")
}
}
if (GpuOverrides.isAnyStringLit(agg.groupingExpressions)) {
willNotWorkOnGpu("string literal values are not supported in a hash aggregate")
}
Expand Down Expand Up @@ -842,9 +828,9 @@ case class GpuHashAggregateExec(
val aggregates = aggModeCudfAggregates.flatMap(_._2)
val cudfAggregates = aggModeCudfAggregates.flatMap { case (mode, aggregates) =>
if ((mode == Partial || mode == Complete) && !merge) {
aggregates.map(a => a.updateAggregate)
aggregates.map(a => a.updateAggregate.onColumn(a.getOrdinal(a.ref)))
} else {
aggregates.map(a => a.mergeAggregate)
aggregates.map(a => a.mergeAggregate.onColumn(a.getOrdinal(a.ref)))
}
}
tbl = new cudf.Table(toAggregateCvs.map(_.getBase): _*)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@
package org.apache.spark.sql.rapids

import ai.rapids.cudf
import ai.rapids.cudf.Aggregation
import com.nvidia.spark.rapids._

import org.apache.spark.sql.catalyst.analysis.TypeCheckResult
Expand Down Expand Up @@ -111,7 +112,7 @@ case class GpuAggregateExpression(origAggregateFunction: GpuAggregateFunction,
resultId: ExprId)
extends GpuExpression with GpuUnevaluable {

val aggregateFunction = if (filter.isDefined) {
val aggregateFunction: GpuAggregateFunction = if (filter.isDefined) {
WrappedAggFunction(origAggregateFunction, filter.get)
} else {
origAggregateFunction
Expand Down Expand Up @@ -170,8 +171,8 @@ abstract case class CudfAggregate(ref: Expression) extends GpuUnevaluable {
def getOrdinal(ref: Expression): Int = ref.asInstanceOf[GpuBoundReference].ordinal
val updateReductionAggregate: cudf.ColumnVector => cudf.Scalar
val mergeReductionAggregate: cudf.ColumnVector => cudf.Scalar
val updateAggregate: cudf.Aggregate
val mergeAggregate: cudf.Aggregate
val updateAggregate: Aggregation
val mergeAggregate: Aggregation

def dataType: DataType = ref.dataType
def nullable: Boolean = ref.nullable
Expand All @@ -185,9 +186,8 @@ class CudfCount(ref: Expression) extends CudfAggregate(ref) {
(col: cudf.ColumnVector) => cudf.Scalar.fromLong(col.getRowCount - col.getNullCount)
override val mergeReductionAggregate: cudf.ColumnVector => cudf.Scalar =
(col: cudf.ColumnVector) => col.sum
override lazy val updateAggregate: cudf.Aggregate =
cudf.Table.count(getOrdinal(ref), includeNulls)
override lazy val mergeAggregate: cudf.Aggregate = cudf.Table.sum(getOrdinal(ref))
override lazy val updateAggregate: Aggregation = Aggregation.count(includeNulls)
override lazy val mergeAggregate: Aggregation = Aggregation.sum()
override def toString(): String = "CudfCount"
}

Expand All @@ -196,8 +196,8 @@ class CudfSum(ref: Expression) extends CudfAggregate(ref) {
(col: cudf.ColumnVector) => col.sum
override val mergeReductionAggregate: cudf.ColumnVector => cudf.Scalar =
(col: cudf.ColumnVector) => col.sum
override lazy val updateAggregate: cudf.Aggregate = cudf.Table.sum(getOrdinal(ref))
override lazy val mergeAggregate: cudf.Aggregate = cudf.Table.sum(getOrdinal(ref))
override lazy val updateAggregate: Aggregation = Aggregation.sum()
override lazy val mergeAggregate: Aggregation = Aggregation.sum()
override def toString(): String = "CudfSum"
}

Expand All @@ -206,8 +206,8 @@ class CudfMax(ref: Expression) extends CudfAggregate(ref) {
(col: cudf.ColumnVector) => col.max
override val mergeReductionAggregate: cudf.ColumnVector => cudf.Scalar =
(col: cudf.ColumnVector) => col.max
override lazy val updateAggregate: cudf.Aggregate = cudf.Table.max(getOrdinal(ref))
override lazy val mergeAggregate: cudf.Aggregate = cudf.Table.max(getOrdinal(ref))
override lazy val updateAggregate: Aggregation = Aggregation.max()
override lazy val mergeAggregate: Aggregation = Aggregation.max()
override def toString(): String = "CudfMax"
}

Expand All @@ -216,48 +216,41 @@ class CudfMin(ref: Expression) extends CudfAggregate(ref) {
(col: cudf.ColumnVector) => col.min
override val mergeReductionAggregate: cudf.ColumnVector => cudf.Scalar =
(col: cudf.ColumnVector) => col.min
override lazy val updateAggregate: cudf.Aggregate = cudf.Table.min(getOrdinal(ref))
override lazy val mergeAggregate: cudf.Aggregate = cudf.Table.min(getOrdinal(ref))
override lazy val updateAggregate: Aggregation = Aggregation.min()
override lazy val mergeAggregate: Aggregation = Aggregation.min()
override def toString(): String = "CudfMin"
}

abstract class CudfFirstLastBase(ref: Expression) extends CudfAggregate(ref) {
val includeNulls: Boolean
val offset: Int

override val updateReductionAggregate: cudf.ColumnVector => cudf.Scalar =
(col: cudf.ColumnVector) =>
throw new UnsupportedOperationException("first/last reduction not supported on GPU")
(col: cudf.ColumnVector) => col.reduce(Aggregation.nth(offset, includeNulls))
override val mergeReductionAggregate: cudf.ColumnVector => cudf.Scalar =
(col: cudf.ColumnVector) =>
throw new UnsupportedOperationException("first/last reduction not supported on GPU")
(col: cudf.ColumnVector) => col.reduce(Aggregation.nth(offset, includeNulls))
override lazy val updateAggregate: Aggregation = Aggregation.nth(offset, includeNulls)
override lazy val mergeAggregate: Aggregation = Aggregation.nth(offset, includeNulls)
}

class CudfFirstIncludeNulls(ref: Expression) extends CudfFirstLastBase(ref) {
val includeNulls = true
override lazy val updateAggregate: cudf.Aggregate =
cudf.Table.first(getOrdinal(ref), includeNulls)
override lazy val mergeAggregate: cudf.Aggregate =
cudf.Table.first(getOrdinal(ref), includeNulls)
override val includeNulls: Boolean = true
override val offset: Int = 0
}

class CudfFirstExcludeNulls(ref: Expression) extends CudfFirstLastBase(ref) {
val includeNulls = false
override lazy val updateAggregate: cudf.Aggregate =
cudf.Table.first(getOrdinal(ref), includeNulls)
override lazy val mergeAggregate: cudf.Aggregate =
cudf.Table.first(getOrdinal(ref), includeNulls)
override val includeNulls: Boolean = false
override val offset: Int = 0
}

class CudfLastIncludeNulls(ref: Expression) extends CudfFirstLastBase(ref) {
val includeNulls = true
override lazy val updateAggregate: cudf.Aggregate =
cudf.Table.last(getOrdinal(ref), includeNulls)
override lazy val mergeAggregate: cudf.Aggregate = cudf.Table.last(getOrdinal(ref), includeNulls)
override val includeNulls: Boolean = true
override val offset: Int = -1
}

class CudfLastExcludeNulls(ref: Expression) extends CudfFirstLastBase(ref) {
val includeNulls = false
override lazy val updateAggregate: cudf.Aggregate =
cudf.Table.last(getOrdinal(ref), includeNulls)
override lazy val mergeAggregate: cudf.Aggregate = cudf.Table.last(getOrdinal(ref), includeNulls)
override val includeNulls: Boolean = false
override val offset: Int = -1
}

abstract class GpuDeclarativeAggregate extends GpuAggregateFunction with GpuUnevaluable {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -90,23 +90,6 @@ class HashAggregatesSuite extends SparkQueryCompareTestSuite {
.agg(first(col("c0"), ignoreNulls = true), last(col("c0"), ignoreNulls = true))
}

testExpectedExceptionStartsWith("test unsorted agg with first and last no grouping",
classOf[IllegalArgumentException],
"Part of the plan is not columnar", firstDf, repart = 2) {
frame => frame
.coalesce(1)
.agg(first(col("c0"), ignoreNulls = true), last(col("c0"), ignoreNulls = true))
}

testExpectedExceptionStartsWith("test sorted agg with first and last no grouping",
classOf[IllegalArgumentException],
"Part of the plan is not columnar", firstDf, repart = 2) {
frame => frame
.coalesce(1)
.sort(col("c2").asc, col("c0").asc) // force deterministic use case
.agg(first(col("c0"), ignoreNulls = true), last(col("c0"), ignoreNulls = true))
}

IGNORE_ORDER_testSparkResultsAreEqualWithCapture(
"nullable aggregate with not null filter",
firstDf,
Expand Down Expand Up @@ -733,20 +716,6 @@ class HashAggregatesSuite extends SparkQueryCompareTestSuite {
frame => frame.groupBy(col("more_longs") + col("longs")).agg(min("longs"))
}

testExpectedExceptionStartsWith("first without grouping",
classOf[IllegalArgumentException],
"Part of the plan is not columnar",
intCsvDf) {
frame => frame.agg(first("ints", false))
}

testExpectedExceptionStartsWith("last without grouping",
classOf[IllegalArgumentException],
"Part of the plan is not columnar",
intCsvDf) {
frame => frame.agg(first("ints", false))
}

IGNORE_ORDER_testSparkResultsAreEqual("first ignoreNulls=false", intCsvDf) {
frame => frame.groupBy(col("more_ints")).agg(first("ints", false))
}
Expand Down

0 comments on commit 1a2b17e

Please sign in to comment.