#374 Incremental Ingestion #487

Merged: 55 commits, merged on Oct 29, 2024

Changes from all commits (55 commits)
f5211b4
#374 Add a table in bookeeping for storing offsets.
yruslan Sep 6, 2024
d6d1a13
#374 Add bookeeping interfaces for offset management.
yruslan Sep 6, 2024
9b09f15
#374 Implement offset management DB operations.
yruslan Sep 9, 2024
5bda4c5
#374 Improve offset management DB operations and add test suites.
yruslan Sep 10, 2024
45fec57
#374 Bump up the minor version number since breaking changes are to b…
yruslan Sep 10, 2024
9d79f40
#374 Add the notion of 'batchId' and 'getCurrentBatch' for the metast…
yruslan Sep 10, 2024
86df4b5
#374 Remove parenthesis of several get methods.
yruslan Sep 11, 2024
71201ac
#374 Add interfaces for sources to fetch data based on offsets.
yruslan Sep 11, 2024
533bd93
#374 Add table reader interfaces for incremental processing.
yruslan Sep 11, 2024
27ab3cd
#374 Add an end to end test for incremental processing.
yruslan Sep 12, 2024
7329801
Fixup
yruslan Sep 12, 2024
ba63e16
#374 Add initial support for incremental transformers.
yruslan Sep 12, 2024
0232e9c
#374 Make normal transformers compatible with incremental ingestion, …
yruslan Sep 12, 2024
239e1f1
#374 Add support for reruns for incremental ingestion with offsets an…
yruslan Sep 13, 2024
307c0f1
#374 Add support for historical runs, and for re-committing uncommitt…
yruslan Sep 16, 2024
a8a77dc
#374 Implement the offset type: 'datetime' for incremental ingestion.
yruslan Sep 17, 2024
d6fa73c
Update Jacoco.
yruslan Sep 18, 2024
fe246e5
#374 Add integrations tests missing from the last fixup.
yruslan Sep 18, 2024
91fb48a
#374 Another fixup from compile warnings.
yruslan Sep 18, 2024
1f1d7a9
#374 Fixed Spark 2.4.8 support in integration tests.
yruslan Sep 18, 2024
edd90b3
Update Jacoco report version
yruslan Sep 19, 2024
78a0675
Make Jacoco take into account integration tests by including them in …
yruslan Sep 19, 2024
567973a
#374 Add offset configuration for JDBC sources.
yruslan Sep 19, 2024
3f59329
Update Scala, Spark versions for sbt builds.
yruslan Sep 19, 2024
537b85d
#374 Fix a corner case of running incremental ingestion out of order …
yruslan Sep 20, 2024
83b3c7a
#374 Simplify the incremental ingestion interface for data sources.
yruslan Sep 20, 2024
8fe53f1
#374 Implement offset queries generation in SQL generators.
yruslan Sep 20, 2024
5b8983c
#374 Add implementation for incremental ingestion from JDBC, and adde…
yruslan Sep 20, 2024
8b4bfb1
#374 Add server timezone to JDBC reader configuration to accommodate …
yruslan Sep 23, 2024
8b3ed71
#374 Fix the logic of incremental ingestion when information date is …
yruslan Sep 23, 2024
858138f
#374 Improve email notifications for incremental operations.
yruslan Sep 27, 2024
7da2627
#374 Calculate throughput based on appended records for incremental j…
yruslan Sep 27, 2024
6f13121
#374 Fix the way retrospective updates are determined.
yruslan Sep 27, 2024
451f556
#374 Add number of appended records to journal.
yruslan Sep 27, 2024
1ec4946
Improve performance of querying timestamp fields for PostgreSQL and M…
yruslan Sep 27, 2024
aeac279
Fix the check for retrospective updates.
yruslan Sep 27, 2024
64e47b1
#421 Allow transient non-cached jobs not return record count.
yruslan Sep 30, 2024
f861bf9
#374 Improve the description of the getCurrentBatch() metastore inter…
yruslan Sep 30, 2024
f3704b8
#374 Update README with the new feature.
yruslan Oct 1, 2024
f8d9d6e
#374 Add more tests for SQL generation related to offsets.
yruslan Oct 1, 2024
7bce016
#374 Implement offset management based on inclusive intervals.
yruslan Oct 4, 2024
99b3689
#374 Fix new incremental ingestion and integration tests to match inc…
yruslan Oct 4, 2024
fdea270
#374 Fix new incremental ingestion and integration tests to match inc…
yruslan Oct 7, 2024
745315b
Fix a timing dependency of a unit test.
yruslan Oct 7, 2024
30ee28b
#374 Remove minimum values for offset types.
yruslan Oct 8, 2024
d44a8db
#374 Fix a scenario when uncommitted offsets are not properly handled.
yruslan Oct 9, 2024
b3f91b1
#374 Refactor validation for the incremental ingestion job.
yruslan Oct 9, 2024
7df79cd
Add unit tests for the incremental scheduling strategy
yruslan Oct 24, 2024
9e86134
Remove the nasty 'var' and possible error related to it.
yruslan Oct 24, 2024
f6676f2
Update pramen/api/src/main/scala/za/co/absa/pramen/api/Source.scala
yruslan Oct 24, 2024
d5f3267
Update pramen/core/src/test/scala/za/co/absa/pramen/core/tests/utils/…
yruslan Oct 24, 2024
f3a8b35
Apply suggestions from code review
yruslan Oct 24, 2024
df88ce5
Fix PR suggestions.
yruslan Oct 25, 2024
aecd5e5
Fix imports removed by IDE.
yruslan Oct 25, 2024
ba5e5de
Fix more PR suggestions regarding splitting complex identifiers.
yruslan Oct 25, 2024
3 changes: 2 additions & 1 deletion .github/workflows/jacoco.yml
Expand Up @@ -48,7 +48,7 @@ jobs:
run: sbt -DSPARK_VERSION=${{matrix.spark}} ++${{matrix.scala}} jacoco
- name: Add coverage to PR
id: jacoco
uses: madrapps/jacoco-report@v1.3
uses: madrapps/jacoco-report@v1.7.1
with:
paths: >
${{ github.workspace }}/pramen/core/target/scala-${{ matrix.scala_short }}/jacoco/report/jacoco.xml,
Expand All @@ -58,6 +58,7 @@ jobs:
min-coverage-changed-files: ${{ matrix.changed }}
title: Unit Test Coverage
update-comment: true
#debug-mode: true
- name: Get the Coverage info
run: |
echo "Total coverage ${{ steps.jacoco.outputs.coverage-overall }}"
26 changes: 20 additions & 6 deletions .github/workflows/scala.yml
Expand Up @@ -2,12 +2,16 @@ name: ScalaCI

on:
push:
branches: [ main ]
branches:
- "main"
- "support/*"
paths:
- "pramen/**"
- ".github/workflows/scala.yml"
pull_request:
branches: [ main ]
branches:
- "main"
- "support/*"
paths:
- "pramen/**"
- ".github/workflows/scala.yml"
Expand All @@ -18,7 +22,7 @@ jobs:
strategy:
fail-fast: false
matrix:
scala: [2.11.12, 2.12.19, 2.13.13]
scala: [2.11.12, 2.12.20, 2.13.14]
spark: [2.4.8, 3.3.4, 3.4.2, 3.5.1]
exclude:
- scala: 2.11.12
Expand All @@ -27,9 +31,9 @@ jobs:
spark: 3.4.2
- scala: 2.11.12
spark: 3.5.1
- scala: 2.12.19
- scala: 2.12.20
spark: 2.4.8
- scala: 2.13.13
- scala: 2.13.14
spark: 2.4.8
name: Test Spark ${{matrix.spark}} on Scala ${{matrix.scala}}
steps:
Expand All @@ -42,9 +46,19 @@ jobs:
distribution: temurin
java-version: 8
cache: sbt
- name: Install sbt
run: |
sudo apt-get update
sudo apt-get install apt-transport-https curl gnupg -yqq
echo "deb https://repo.scala-sbt.org/scalasbt/debian all main" | sudo tee /etc/apt/sources.list.d/sbt.list
echo "deb https://repo.scala-sbt.org/scalasbt/debian /" | sudo tee /etc/apt/sources.list.d/sbt_old.list
curl -sL "https://keyserver.ubuntu.com/pks/lookup?op=get&search=0x2EE0EA64E40A89B84B2DF73499E82A75642AC823" | sudo -H gpg --no-default-keyring --keyring gnupg-ring:/etc/apt/trusted.gpg.d/scalasbt-release.gpg --import
sudo chmod 644 /etc/apt/trusted.gpg.d/scalasbt-release.gpg
sudo apt-get update
sudo apt-get install sbt
- name: Build and run unit tests
working-directory: ./pramen
run: sbt ++${{matrix.scala}} test -DSPARK_VERSION=${{matrix.spark}}
run: sbt ++${{matrix.scala}} unit:test -DSPARK_VERSION=${{matrix.spark}}
- name: Run integration tests
working-directory: ./pramen
run: sbt ++${{matrix.scala}} integration:test -DSPARK_VERSION=${{matrix.spark}}
58 changes: 54 additions & 4 deletions README.md
Expand Up @@ -115,6 +115,24 @@ In addition to basic error notification, typical operational warnings are genera

Pramen is built using SBT.

**Note:** By default, `sbt test` runs both unit tests and integration tests. To run just the unit tests, use the
`sbt t` alias.

- `sbt +t` - runs unit tests only, for all Scala versions
- `sbt test` - runs all tests (unit and integration)
- `sbt unit:test` - runs unit tests only
- `sbt integration:test` - runs integration tests only

Install locally for `sbt` projects:
```
sbt +publishLocal
```

Install locally for `Maven` projects:
```
sbt +publishM2
```

## Project structure
Pramen consists of a few components:
- `pramen-api` - contains traits (interfaces) for defining custom transformations, sources and sinks.
Expand Down Expand Up @@ -188,8 +206,8 @@ dependencies in an uber jar that you can build for your Scala version. You can d
Creating an uber jar for Pramen is very easy. Just clone the repository and run one of the following commands:
```sh
sbt ++2.11.12 assembly
sbt ++2.12.18 assembly
sbt ++2.13.12 assembly
sbt ++2.12.20 assembly
sbt ++2.13.14 assembly
```

You can collect the uber jar of Pramen either at
Expand All @@ -201,8 +219,8 @@ Spark distributions. This makes the runner independent of Spark version. But if
in your bundle, use one of example commands specifying your Spark version:
```sh
sbt -DSPARK_VERSION="2.4.8" -Dassembly.features="includeDelta" ++2.11.12 assembly
sbt -DSPARK_VERSION="3.3.3" -Dassembly.features="includeDelta" ++2.12.18 assembly
sbt -DSPARK_VERSION="3.4.1" -Dassembly.features="includeDelta" ++2.13.12 assembly
sbt -DSPARK_VERSION="3.3.4" -Dassembly.features="includeDelta" ++2.12.20 assembly
sbt -DSPARK_VERSION="3.5.2" -Dassembly.features="includeDelta" ++2.13.14 assembly
```

Then, run `spark-shell` or `spark-submit` adding the fat jar as the option.
Expand Down Expand Up @@ -602,6 +620,10 @@ is determined by the pipeline configuration.
# Specifies the maximum number of records to fetch. Good for testing purposes.
#limit.records = 100

# Specify the timezone of the database server, if it is different from the default timezone.
# It is needed for incremental ingestion based on an offset field that has a timestamp or datetime data type.
#server.timezone = "Africa/Johannesburg"

# Optionally, you can specify a class for a custom SQL generator for your RDBMS engine.
# The class should extend 'za.co.absa.pramen.api.sql.SqlGenerator'
#sql.generator.class = "com.example.MySqlGenerator"
Expand Down Expand Up @@ -786,6 +808,34 @@ pramen.operations = [
]
```

### Incremental Ingestion (experimental)
Pramen version `1.10` introduces the concept of incremental ingestion. It allows running a pipeline multiple times a day
without reprocessing data that has already been processed. To enable it, use the `incremental` schedule when defining your
ingestion operation:
```hocon
schedule = "incremental"
```

For incremental ingestion to work, you need to define a monotonically increasing field, called an offset.
Usually this field is a counter or a record creation timestamp. The offset field is defined in
your source, and the source must support incremental ingestion in order to use this mode.
```hocon
offset.column {
name = "created_at"
type = "datetime"
}
```

Offset types available at the moment:

| Type | Description |
|----------|--------------------------------------------|
| integral | Any integral type (`short`, `int`, `long`) |
| datetime | A `datetime` or `timestamp` field          |
| string   | Only `string` / `varchar(n)` types         |

Only ingestion jobs support the incremental schedule at the moment. Incremental transformations and sinks are planned to be
available soon. A combined configuration sketch is shown below.
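
The sketch below combines the two snippets above into a single configuration. Only `schedule = "incremental"` and the
`offset.column` block are taken from this documentation; the surrounding keys (the source and operation wrappers, names,
and the hints about other settings) are illustrative placeholders and may differ from the actual configuration layout:
```hocon
# Illustrative sketch only: wrapper keys and names below are placeholders.
pramen.sources = [
  {
    name = "my_jdbc_source"          # placeholder source name
    # ... the usual source settings (connection details, etc.) go here ...

    # Documented above: the offset column that drives incremental ingestion
    offset.column {
      name = "created_at"
      type = "datetime"
    }
  }
]

pramen.operations = [
  {
    name = "My incremental sourcing" # placeholder operation name
    type = "ingestion"               # placeholder
    schedule = "incremental"         # documented above
    source = "my_jdbc_source"        # placeholder reference to the source above
    # ... table mappings go here ...
  }
]
```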

### Sinks
Sinks define a way data needs to be sent to a target system. Built-in sinks include:
Expand Down
pramen/api/src/main/scala/za/co/absa/pramen/api/MetaTableDef.scala
Expand Up @@ -26,6 +26,7 @@ import java.time.LocalDate
* @param format The format of the table.
* @param infoDateColumn The name of the column that contains the information date (partitioned by).
* @param infoDateFormat The format of the information date.
* @param batchIdColumn The name of the column that contains the batch id.
* @param hiveTable The name of the Hive table.
* @param hivePath The path of the Hive table (if it differs from the path in the underlying format).
* @param infoDateStart The start date of the information date.
Expand All @@ -38,6 +39,7 @@ case class MetaTableDef(
format: DataFormat,
infoDateColumn: String,
infoDateFormat: String,
batchIdColumn: String,
hiveTable: Option[String],
hivePath: Option[String],
infoDateStart: LocalDate,
pramen/api/src/main/scala/za/co/absa/pramen/api/MetastoreReader.scala
Expand Up @@ -17,6 +17,8 @@
package za.co.absa.pramen.api

import org.apache.spark.sql.DataFrame
import za.co.absa.pramen.api.offset.DataOffset
import za.co.absa.pramen.api.status.TaskRunReason

import java.time.LocalDate

Expand All @@ -27,7 +29,7 @@ import java.time.LocalDate
trait MetastoreReader {

/**
* Reads a table given th range of information dates, and returns back the dataframe.
* Reads a table given the range of information dates, and returns back the dataframe.
*
* In order to read a table it is not sufficient the table to be registered in the metastore. It also
* should be defined as input tables of the job. Otherwise, a runtime exception will be thrown.
Expand All @@ -41,6 +43,29 @@ trait MetastoreReader {
infoDateFrom: Option[LocalDate] = None,
infoDateTo: Option[LocalDate] = None): DataFrame

/**
* Reads the 'current batch' of the table to be processed incrementally.
*
* For incremental processing this method returns the current chunk being processed.
* It may include multiple chunks of not-yet-processed data if the transformer has failed previously.
*
* For non-incremental processing the call to this method is equivalent to:
* {{{
* val df = getTable(tableName)
* }}}
*
* which returns all data for the current information date being processed.
*
* This is the method to use for transformers that run on the 'incremental' schedule.
*
* In order to read a table it is not sufficient for the table to be registered in the metastore. It also
* should be defined as an input table of the job. Otherwise, a runtime exception will be thrown.
*
* @param tableName The name of the table to read.
* @return The dataframe containing data from the table.
*/
def getCurrentBatch(tableName: String): DataFrame

/**
* Reads the latest partition of a given table.
*
Expand All @@ -66,7 +91,6 @@ trait MetastoreReader {
*/
def getLatestAvailableDate(tableName: String, until: Option[LocalDate] = None): Option[LocalDate]


/**
* Returns true if data for the specified table is available for the specified range.
*
Expand All @@ -79,6 +103,15 @@ trait MetastoreReader {
*/
def isDataAvailable(tableName: String, from: Option[LocalDate], until: Option[LocalDate]): Boolean

/**
* Returns offsets for an information date (both committed and uncommitted).
*
* This info can be used by transformers and sinks to decide if actions need to be taken depending on the
* current micro batch. For example, adding partitions to Hive needs to happen only once per info date,
* so a sink that does this can check whether micro-batches have already run for the current day.
*/
def getOffsets(table: String, infoDate: LocalDate): Array[DataOffset]

/**
* Gets definition of a metastore table. Please, use with caution and do not write to the underlying path
* from transformers.
Expand All @@ -99,6 +132,12 @@ trait MetastoreReader {
*/
def getTableRunInfo(tableName: String, infoDate: LocalDate): Option[MetaTableRunInfo]

/**
* Returns the reason for running the task. This helps transformers and sinks determine their logic based on whether
* the run is a normal run or a forced re-run.
*/
def getRunReason: TaskRunReason

/**
* Returns an object that allows accessing metadata of metastore tables.
*/
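
For orientation, here is a brief sketch (not part of this pull request) of how a transformer running on the incremental
schedule could combine the new reader methods above; the table name, the `amount` column, and the helper itself are
illustrative placeholders:
```scala
import org.apache.spark.sql.DataFrame
import org.apache.spark.sql.functions.col
import za.co.absa.pramen.api.MetastoreReader

import java.time.LocalDate

object IncrementalTransformerSketch {
  // Sketch only: 'my_table' and the transformation below are placeholders.
  def transformIncrementally(metastore: MetastoreReader, infoDate: LocalDate): DataFrame = {
    // For incremental processing, only the current micro-batch (plus any previously
    // failed, uncommitted chunks) is returned.
    val batchDf = metastore.getCurrentBatch("my_table")

    // Offsets already registered for this information date (committed and uncommitted).
    // An empty result suggests this is the first micro-batch of the day, so once-per-day
    // actions (e.g. adding Hive partitions) can be triggered here.
    val offsetsSoFar = metastore.getOffsets("my_table", infoDate)
    if (offsetsSoFar.isEmpty) {
      // perform once-per-day setup here
    }

    batchDf.filter(col("amount") > 0) // placeholder transformation
  }
}
```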
54 changes: 54 additions & 0 deletions pramen/api/src/main/scala/za/co/absa/pramen/api/Source.scala
Expand Up @@ -16,6 +16,8 @@

package za.co.absa.pramen.api

import za.co.absa.pramen.api.offset.{OffsetInfo, OffsetValue}

import java.time.LocalDate

/**
Expand All @@ -42,6 +44,13 @@ trait Source extends ExternalChannel {
*/
def hasInfoDateColumn(query: Query): Boolean = true

/**
* If non-empty, the source is configured for incremental ingestion, and the returned value describes the offset column used for it.
*
* If empty, the source can't be used for incremental ingestion.
*/
def getOffsetInfo: Option[OffsetInfo] = None

/**
* Validates if the source is okay and the ingestion can proceed.
*/
Expand All @@ -57,6 +66,51 @@ trait Source extends ExternalChannel {
*/
def getData(query: Query, infoDateBegin: LocalDate, infoDateEnd: LocalDate, columns: Seq[String]): SourceResult

/**
* Returns the incremental data between specified offsets. The offset intervals could be half open,
* e.g. only offsetFrom or offsetTo is specified.
*
* If an information date is provided and available at the source, the query will be limited to that date.
*
* <ul>
* <li> When both `offsetFrom` and `offsetTo` are passed the source should return offsets using an inclusive interval
* (offsetFrom <= offset <= offsetTo) </li>
* <li> When only `offsetFrom` is present the source should return offsets using an exclusive interval
* (offset > offsetFrom)</li>
[Review comment by a contributor]
Why is offsetFrom an exclusive interval if offsetTo is not given, but inclusive if it is also given?

[Reply by the author]
When we want to query new data, the caller would specify only offsetFrom, and the query is going to look like:

SELECT * FROM table WHERE offset > offsetFrom

(exclusive)

When we want to do a rerun for a day, we get the minimum and maximum offsets for the day and run:

SELECT * FROM table WHERE offset >= offsetFrom AND offset <= offsetTo

(inclusive)

The last case, when only offsetTo is available, might not be used in practice; it was added for completeness. Potentially it can be used to query the old database for all data that was already loaded, for example for reconciliation. In this case only an inclusive interval makes sense for such queries:

SELECT * FROM table WHERE offset <= offsetTo

(inclusive)

I could have added a flag to make the caller choose, or split this method in two just for the top two cases, but then the implementer of a source would have to implement two very similar methods.

This reasoning has been added to the method documentation.

* <li> When only `offsetTo` is present the source should return offsets using an inclusive interval
* (offset <= offsetTo)</li>
*</ul>
*
* The method is used in incremental ingestion as follows. When the framework queries new data, the caller would
* specify only `offsetFrom`, and the query is going to look like:
*
* {{{
* SELECT * FROM table WHERE offset > offsetFrom
* (exclusive)
* }}}
*
* When a rerun is happening for a day, the caller provides both minimum and maximum offsets for that day and runs:
*
* {{{
* SELECT * FROM table WHERE offset >= offsetFrom AND offset <= offsetTo
* (inclusive)
* }}}
*
* The last case, when only `offsetTo` is available, might not be used in practice; it was added for completeness.
* Potentially it can be used to query the old database for all data that was already loaded:
*
* {{{
* SELECT * FROM table WHERE offset <= offsetTo
* (inclusive)
* }}}
*
* @param offsetFromOpt This is an exclusive parameter; the query will be SELECT ... WHERE offset_col > min_offset
* @param offsetToOpt This is an inclusive parameter; the query will be SELECT ... WHERE offset_col <= max_offset
* @param onlyForInfoDate An information date to get data for. Can be empty if the source table doesn't have such a column.
* @param columns Select only specified columns. Selects all if an empty Seq is passed.
*/
def getDataIncremental(query: Query, onlyForInfoDate: Option[LocalDate], offsetFromOpt: Option[OffsetValue], offsetToOpt: Option[OffsetValue], columns: Seq[String]): SourceResult

/**
* This method is called after the ingestion is finished. You can query the output table for the output information
* date and the data should be there.
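
To make the interval semantics above concrete, here is a small illustrative sketch (not from this pull request) of how a
source implementation could derive its WHERE condition from the two optional offsets; it assumes the offset values have
already been rendered to SQL literals by some helper:
```scala
object OffsetSqlSketch {
  // Builds the WHERE condition following the documented offset interval semantics.
  // The incoming strings are assumed to be already-rendered SQL literals.
  def offsetCondition(offsetColumn: String,
                      offsetFromOpt: Option[String],
                      offsetToOpt: Option[String]): String =
    (offsetFromOpt, offsetToOpt) match {
      case (Some(from), Some(to)) =>
        s"$offsetColumn >= $from AND $offsetColumn <= $to" // rerun: inclusive on both sides
      case (Some(from), None) =>
        s"$offsetColumn > $from"                           // normal incremental run: exclusive lower bound
      case (None, Some(to)) =>
        s"$offsetColumn <= $to"                            // rarely used: inclusive upper bound only
      case (None, None) =>
        "1 = 1"                                            // no offset restriction
    }
}
```
For example, `OffsetSqlSketch.offsetCondition("created_at", Some("'2024-10-01 00:00:00'"), None)` yields
`created_at > '2024-10-01 00:00:00'`, matching the "query new data" case described above.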
pramen/api/src/main/scala/za/co/absa/pramen/api/TableReader.scala
Expand Up @@ -17,11 +17,14 @@
package za.co.absa.pramen.api

import org.apache.spark.sql.DataFrame
import za.co.absa.pramen.api.offset.OffsetValue

import java.time.LocalDate

trait TableReader {
def getRecordCount(query: Query, infoDateBegin: LocalDate, infoDateEnd: LocalDate): Long

def getData(query: Query, infoDateBegin: LocalDate, infoDateEnd: LocalDate, columns: Seq[String]): DataFrame

def getIncrementalData(query: Query, onlyForInfoDate: Option[LocalDate], offsetFromOpt: Option[OffsetValue], offsetToOpt: Option[OffsetValue], columns: Seq[String]): DataFrame
}